| author | David Woodhouse <dwmw2@infradead.org> | 2007-01-17 18:34:51 -0500 |
|---|---|---|
| committer | David Woodhouse <dwmw2@infradead.org> | 2007-01-17 18:34:51 -0500 |
| commit | 9cdf083f981b8d37b3212400a359368661385099 (patch) | |
| tree | aa15a6a08ad87e650dea40fb59b3180bef0d345b /kernel | |
| parent | e499e01d234a31d59679b7b1e1cf628d917ba49a (diff) | |
| parent | a8b3485287731978899ced11f24628c927890e78 (diff) | |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Diffstat (limited to 'kernel')
63 files changed, 3078 insertions, 2826 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 248e1c396f8b..4af15802ccd4 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
| @@ -7,7 +7,7 @@ choice | |||
| 7 | default HZ_250 | 7 | default HZ_250 |
| 8 | help | 8 | help |
| 9 | Allows the configuration of the timer frequency. It is customary | 9 | Allows the configuration of the timer frequency. It is customary |
| 10 | to have the timer interrupt run at 1000 HZ but 100 HZ may be more | 10 | to have the timer interrupt run at 1000 Hz but 100 Hz may be more |
| 11 | beneficial for servers and NUMA systems that do not need to have | 11 | beneficial for servers and NUMA systems that do not need to have |
| 12 | a fast response for user interaction and that may experience bus | 12 | a fast response for user interaction and that may experience bus |
| 13 | contention and cacheline bounces as a result of timer interrupts. | 13 | contention and cacheline bounces as a result of timer interrupts. |
| @@ -19,21 +19,30 @@ choice | |||
| 19 | config HZ_100 | 19 | config HZ_100 |
| 20 | bool "100 HZ" | 20 | bool "100 HZ" |
| 21 | help | 21 | help |
| 22 | 100 HZ is a typical choice for servers, SMP and NUMA systems | 22 | 100 Hz is a typical choice for servers, SMP and NUMA systems |
| 23 | with lots of processors that may show reduced performance if | 23 | with lots of processors that may show reduced performance if |
| 24 | too many timer interrupts are occurring. | 24 | too many timer interrupts are occurring. |
| 25 | 25 | ||
| 26 | config HZ_250 | 26 | config HZ_250 |
| 27 | bool "250 HZ" | 27 | bool "250 HZ" |
| 28 | help | 28 | help |
| 29 | 250 HZ is a good compromise choice allowing server performance | 29 | 250 Hz is a good compromise choice allowing server performance |
| 30 | while also showing good interactive responsiveness even | 30 | while also showing good interactive responsiveness even |
| 31 | on SMP and NUMA systems. | 31 | on SMP and NUMA systems. If you are going to be using NTSC video |
| 32 | or multimedia, selected 300Hz instead. | ||
| 33 | |||
| 34 | config HZ_300 | ||
| 35 | bool "300 HZ" | ||
| 36 | help | ||
| 37 | 300 Hz is a good compromise choice allowing server performance | ||
| 38 | while also showing good interactive responsiveness even | ||
| 39 | on SMP and NUMA systems and exactly dividing by both PAL and | ||
| 40 | NTSC frame rates for video and multimedia work. | ||
| 32 | 41 | ||
| 33 | config HZ_1000 | 42 | config HZ_1000 |
| 34 | bool "1000 HZ" | 43 | bool "1000 HZ" |
| 35 | help | 44 | help |
| 36 | 1000 HZ is the preferred choice for desktop systems and other | 45 | 1000 Hz is the preferred choice for desktop systems and other |
| 37 | systems requiring fast interactive responses to events. | 46 | systems requiring fast interactive responses to events. |
| 38 | 47 | ||
| 39 | endchoice | 48 | endchoice |
| @@ -42,5 +51,6 @@ config HZ | |||
| 42 | int | 51 | int |
| 43 | default 100 if HZ_100 | 52 | default 100 if HZ_100 |
| 44 | default 250 if HZ_250 | 53 | default 250 if HZ_250 |
| 54 | default 300 if HZ_300 | ||
| 45 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
| 46 | 56 | ||
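
The HZ_300 help text added above reasons that 300 Hz divides evenly into both PAL (25 fps) and nominal NTSC (30 fps) frame rates. As a quick check of that arithmetic, here is a minimal userspace sketch (not part of the patch) that prints the tick period and divisibility for each HZ choice in this hunk:

```c
#include <stdio.h>

int main(void)
{
	/* The HZ choices offered by kernel/Kconfig.hz after this hunk. */
	const int hz[] = { 100, 250, 300, 1000 };
	const int pal_fps = 25, ntsc_fps = 30;	/* nominal rates; NTSC is really ~29.97 */

	for (unsigned int i = 0; i < sizeof(hz) / sizeof(hz[0]); i++)
		printf("HZ=%4d  tick=%6.2f ms  divides PAL: %-3s  divides NTSC: %s\n",
		       hz[i], 1000.0 / hz[i],
		       hz[i] % pal_fps == 0 ? "yes" : "no",
		       hz[i] % ntsc_fps == 0 ? "yes" : "no");
	return 0;
}
```

Only HZ=300 divides both rates, which is the trade-off the new help text describes.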
diff --git a/kernel/Makefile b/kernel/Makefile
index 5e3f3b75563a..14f4d45e0ae9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -31,7 +31,6 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o | |||
| 31 | obj-$(CONFIG_UID16) += uid16.o | 31 | obj-$(CONFIG_UID16) += uid16.o |
| 32 | obj-$(CONFIG_MODULES) += module.o | 32 | obj-$(CONFIG_MODULES) += module.o |
| 33 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 33 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
| 34 | obj-$(CONFIG_STACK_UNWIND) += unwind.o | ||
| 35 | obj-$(CONFIG_PM) += power/ | 34 | obj-$(CONFIG_PM) += power/ |
| 36 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 35 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
| 37 | obj-$(CONFIG_KEXEC) += kexec.o | 36 | obj-$(CONFIG_KEXEC) += kexec.o |
diff --git a/kernel/acct.c b/kernel/acct.c
index 0aad5ca36a81..70d0d88e5554 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -89,7 +89,8 @@ struct acct_glbs { | |||
| 89 | struct timer_list timer; | 89 | struct timer_list timer; |
| 90 | }; | 90 | }; |
| 91 | 91 | ||
| 92 | static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; | 92 | static struct acct_glbs acct_globals __cacheline_aligned = |
| 93 | {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; | ||
| 93 | 94 | ||
| 94 | /* | 95 | /* |
| 95 | * Called whenever the timer says to check the free space. | 96 | * Called whenever the timer says to check the free space. |
| @@ -117,7 +118,7 @@ static int check_free_space(struct file *file) | |||
| 117 | spin_unlock(&acct_globals.lock); | 118 | spin_unlock(&acct_globals.lock); |
| 118 | 119 | ||
| 119 | /* May block */ | 120 | /* May block */ |
| 120 | if (vfs_statfs(file->f_dentry, &sbuf)) | 121 | if (vfs_statfs(file->f_path.dentry, &sbuf)) |
| 121 | return res; | 122 | return res; |
| 122 | suspend = sbuf.f_blocks * SUSPEND; | 123 | suspend = sbuf.f_blocks * SUSPEND; |
| 123 | resume = sbuf.f_blocks * RESUME; | 124 | resume = sbuf.f_blocks * RESUME; |
| @@ -193,7 +194,7 @@ static void acct_file_reopen(struct file *file) | |||
| 193 | add_timer(&acct_globals.timer); | 194 | add_timer(&acct_globals.timer); |
| 194 | } | 195 | } |
| 195 | if (old_acct) { | 196 | if (old_acct) { |
| 196 | mnt_unpin(old_acct->f_vfsmnt); | 197 | mnt_unpin(old_acct->f_path.mnt); |
| 197 | spin_unlock(&acct_globals.lock); | 198 | spin_unlock(&acct_globals.lock); |
| 198 | do_acct_process(old_acct); | 199 | do_acct_process(old_acct); |
| 199 | filp_close(old_acct, NULL); | 200 | filp_close(old_acct, NULL); |
| @@ -211,7 +212,7 @@ static int acct_on(char *name) | |||
| 211 | if (IS_ERR(file)) | 212 | if (IS_ERR(file)) |
| 212 | return PTR_ERR(file); | 213 | return PTR_ERR(file); |
| 213 | 214 | ||
| 214 | if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { | 215 | if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { |
| 215 | filp_close(file, NULL); | 216 | filp_close(file, NULL); |
| 216 | return -EACCES; | 217 | return -EACCES; |
| 217 | } | 218 | } |
| @@ -228,11 +229,11 @@ static int acct_on(char *name) | |||
| 228 | } | 229 | } |
| 229 | 230 | ||
| 230 | spin_lock(&acct_globals.lock); | 231 | spin_lock(&acct_globals.lock); |
| 231 | mnt_pin(file->f_vfsmnt); | 232 | mnt_pin(file->f_path.mnt); |
| 232 | acct_file_reopen(file); | 233 | acct_file_reopen(file); |
| 233 | spin_unlock(&acct_globals.lock); | 234 | spin_unlock(&acct_globals.lock); |
| 234 | 235 | ||
| 235 | mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */ | 236 | mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ |
| 236 | 237 | ||
| 237 | return 0; | 238 | return 0; |
| 238 | } | 239 | } |
| @@ -282,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name) | |||
| 282 | void acct_auto_close_mnt(struct vfsmount *m) | 283 | void acct_auto_close_mnt(struct vfsmount *m) |
| 283 | { | 284 | { |
| 284 | spin_lock(&acct_globals.lock); | 285 | spin_lock(&acct_globals.lock); |
| 285 | if (acct_globals.file && acct_globals.file->f_vfsmnt == m) | 286 | if (acct_globals.file && acct_globals.file->f_path.mnt == m) |
| 286 | acct_file_reopen(NULL); | 287 | acct_file_reopen(NULL); |
| 287 | spin_unlock(&acct_globals.lock); | 288 | spin_unlock(&acct_globals.lock); |
| 288 | } | 289 | } |
| @@ -298,7 +299,7 @@ void acct_auto_close(struct super_block *sb) | |||
| 298 | { | 299 | { |
| 299 | spin_lock(&acct_globals.lock); | 300 | spin_lock(&acct_globals.lock); |
| 300 | if (acct_globals.file && | 301 | if (acct_globals.file && |
| 301 | acct_globals.file->f_vfsmnt->mnt_sb == sb) { | 302 | acct_globals.file->f_path.mnt->mnt_sb == sb) { |
| 302 | acct_file_reopen(NULL); | 303 | acct_file_reopen(NULL); |
| 303 | } | 304 | } |
| 304 | spin_unlock(&acct_globals.lock); | 305 | spin_unlock(&acct_globals.lock); |
| @@ -427,6 +428,7 @@ static void do_acct_process(struct file *file) | |||
| 427 | u64 elapsed; | 428 | u64 elapsed; |
| 428 | u64 run_time; | 429 | u64 run_time; |
| 429 | struct timespec uptime; | 430 | struct timespec uptime; |
| 431 | struct tty_struct *tty; | ||
| 430 | 432 | ||
| 431 | /* | 433 | /* |
| 432 | * First check to see if there is enough free_space to continue | 434 | * First check to see if there is enough free_space to continue |
| @@ -483,16 +485,9 @@ static void do_acct_process(struct file *file) | |||
| 483 | ac.ac_ppid = current->parent->tgid; | 485 | ac.ac_ppid = current->parent->tgid; |
| 484 | #endif | 486 | #endif |
| 485 | 487 | ||
| 486 | mutex_lock(&tty_mutex); | ||
| 487 | /* FIXME: Whoever is responsible for current->signal locking needs | ||
| 488 | to use the same locking all over the kernel and document it */ | ||
| 489 | read_lock(&tasklist_lock); | ||
| 490 | ac.ac_tty = current->signal->tty ? | ||
| 491 | old_encode_dev(tty_devnum(current->signal->tty)) : 0; | ||
| 492 | read_unlock(&tasklist_lock); | ||
| 493 | mutex_unlock(&tty_mutex); | ||
| 494 | |||
| 495 | spin_lock_irq(¤t->sighand->siglock); | 488 | spin_lock_irq(¤t->sighand->siglock); |
| 489 | tty = current->signal->tty; | ||
| 490 | ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; | ||
| 496 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); | 491 | ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); |
| 497 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); | 492 | ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); |
| 498 | ac.ac_flag = pacct->ac_flag; | 493 | ac.ac_flag = pacct->ac_flag; |
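
The acct.c hunks above are mostly the mechanical conversion from file->f_dentry / file->f_vfsmnt to the embedded file->f_path.dentry / file->f_path.mnt pair, plus the lockdep-aware __SPIN_LOCK_UNLOCKED() static initializer and moving the tty lookup under siglock. A minimal userspace mock of the struct-layout change behind the rename — stand-in types chosen for illustration, not the real kernel definitions:

```c
#include <stdio.h>

/* Simplified stand-ins for the kernel structures. */
struct dentry   { const char *d_name; };
struct vfsmount { const char *mnt_devname; };

struct path {
	struct vfsmount *mnt;
	struct dentry   *dentry;
};

struct file {
	struct path f_path;	/* replaces the old f_dentry / f_vfsmnt pair */
};

int main(void)
{
	struct dentry d   = { "pacct" };
	struct vfsmount m = { "/dev/sda1" };
	struct file f     = { { &m, &d } };

	/* Old code read f.f_dentry and f.f_vfsmnt; new code goes via f_path. */
	printf("dentry=%s mnt=%s\n",
	       f.f_path.dentry->d_name, f.f_path.mnt->mnt_devname);
	return 0;
}
```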
diff --git a/kernel/audit.c b/kernel/audit.c
index 98106f6078b0..d9b690ac684b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -57,6 +57,7 @@ | |||
| 57 | #include <linux/netlink.h> | 57 | #include <linux/netlink.h> |
| 58 | #include <linux/selinux.h> | 58 | #include <linux/selinux.h> |
| 59 | #include <linux/inotify.h> | 59 | #include <linux/inotify.h> |
| 60 | #include <linux/freezer.h> | ||
| 60 | 61 | ||
| 61 | #include "audit.h" | 62 | #include "audit.h" |
| 62 | 63 | ||
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4f40d923af8e..9c8c23227c7f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -636,10 +636,9 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) | |||
| 636 | struct audit_rule *rule; | 636 | struct audit_rule *rule; |
| 637 | int i; | 637 | int i; |
| 638 | 638 | ||
| 639 | rule = kmalloc(sizeof(*rule), GFP_KERNEL); | 639 | rule = kzalloc(sizeof(*rule), GFP_KERNEL); |
| 640 | if (unlikely(!rule)) | 640 | if (unlikely(!rule)) |
| 641 | return NULL; | 641 | return NULL; |
| 642 | memset(rule, 0, sizeof(*rule)); | ||
| 643 | 642 | ||
| 644 | rule->flags = krule->flags | krule->listnr; | 643 | rule->flags = krule->flags | krule->listnr; |
| 645 | rule->action = krule->action; | 644 | rule->action = krule->action; |
| @@ -801,8 +800,8 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, | |||
| 801 | 800 | ||
| 802 | /* our own copy of se_str */ | 801 | /* our own copy of se_str */ |
| 803 | se_str = kstrdup(sf->se_str, GFP_KERNEL); | 802 | se_str = kstrdup(sf->se_str, GFP_KERNEL); |
| 804 | if (unlikely(IS_ERR(se_str))) | 803 | if (unlikely(!se_str)) |
| 805 | return -ENOMEM; | 804 | return -ENOMEM; |
| 806 | df->se_str = se_str; | 805 | df->se_str = se_str; |
| 807 | 806 | ||
| 808 | /* our own (refreshed) copy of se_rule */ | 807 | /* our own (refreshed) copy of se_rule */ |
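
Two small fixes above: the kmalloc()+memset() pair collapses into a single kzalloc(), and the kstrdup() failure check changes from IS_ERR() to a plain NULL test — kstrdup() returns NULL on allocation failure rather than an ERR_PTR, so the old check could never fire. A userspace analogue of the allocation change (illustrative names only, not the kernel API):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct rule { unsigned int flags; unsigned int action; char name[32]; };

/* Before: allocate, then zero in a second step (the pattern removed above). */
static struct rule *rule_alloc_old(void)
{
	struct rule *r = malloc(sizeof(*r));

	if (!r)
		return NULL;
	memset(r, 0, sizeof(*r));
	return r;
}

/* After: one zeroing allocation, the userspace analogue of kzalloc(). */
static struct rule *rule_alloc_new(void)
{
	return calloc(1, sizeof(struct rule));
}

int main(void)
{
	struct rule *a = rule_alloc_old();
	struct rule *b = rule_alloc_new();

	printf("both zeroed: %d\n", a && b && a->flags == 0 && b->flags == 0);
	free(a);
	free(b);
	return 0;
}
```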
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 42f2f1179711..298897559ca4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -64,6 +64,7 @@ | |||
| 64 | #include <linux/tty.h> | 64 | #include <linux/tty.h> |
| 65 | #include <linux/selinux.h> | 65 | #include <linux/selinux.h> |
| 66 | #include <linux/binfmts.h> | 66 | #include <linux/binfmts.h> |
| 67 | #include <linux/highmem.h> | ||
| 67 | #include <linux/syscalls.h> | 68 | #include <linux/syscalls.h> |
| 68 | 69 | ||
| 69 | #include "audit.h" | 70 | #include "audit.h" |
| @@ -730,7 +731,7 @@ static inline void audit_free_context(struct audit_context *context) | |||
| 730 | printk(KERN_ERR "audit: freed %d contexts\n", count); | 731 | printk(KERN_ERR "audit: freed %d contexts\n", count); |
| 731 | } | 732 | } |
| 732 | 733 | ||
| 733 | static void audit_log_task_context(struct audit_buffer *ab) | 734 | void audit_log_task_context(struct audit_buffer *ab) |
| 734 | { | 735 | { |
| 735 | char *ctx = NULL; | 736 | char *ctx = NULL; |
| 736 | ssize_t len = 0; | 737 | ssize_t len = 0; |
| @@ -759,6 +760,8 @@ error_path: | |||
| 759 | return; | 760 | return; |
| 760 | } | 761 | } |
| 761 | 762 | ||
| 763 | EXPORT_SYMBOL(audit_log_task_context); | ||
| 764 | |||
| 762 | static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | 765 | static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) |
| 763 | { | 766 | { |
| 764 | char name[sizeof(tsk->comm)]; | 767 | char name[sizeof(tsk->comm)]; |
| @@ -778,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk | |||
| 778 | if ((vma->vm_flags & VM_EXECUTABLE) && | 781 | if ((vma->vm_flags & VM_EXECUTABLE) && |
| 779 | vma->vm_file) { | 782 | vma->vm_file) { |
| 780 | audit_log_d_path(ab, "exe=", | 783 | audit_log_d_path(ab, "exe=", |
| 781 | vma->vm_file->f_dentry, | 784 | vma->vm_file->f_path.dentry, |
| 782 | vma->vm_file->f_vfsmnt); | 785 | vma->vm_file->f_path.mnt); |
| 783 | break; | 786 | break; |
| 784 | } | 787 | } |
| 785 | vma = vma->vm_next; | 788 | vma = vma->vm_next; |
| @@ -823,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 823 | context->return_code); | 826 | context->return_code); |
| 824 | 827 | ||
| 825 | mutex_lock(&tty_mutex); | 828 | mutex_lock(&tty_mutex); |
| 829 | read_lock(&tasklist_lock); | ||
| 826 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | 830 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) |
| 827 | tty = tsk->signal->tty->name; | 831 | tty = tsk->signal->tty->name; |
| 828 | else | 832 | else |
| 829 | tty = "(none)"; | 833 | tty = "(none)"; |
| 834 | read_unlock(&tasklist_lock); | ||
| 830 | audit_log_format(ab, | 835 | audit_log_format(ab, |
| 831 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 836 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
| 832 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" | 837 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" |
| @@ -1487,6 +1492,8 @@ uid_t audit_get_loginuid(struct audit_context *ctx) | |||
| 1487 | return ctx ? ctx->loginuid : -1; | 1492 | return ctx ? ctx->loginuid : -1; |
| 1488 | } | 1493 | } |
| 1489 | 1494 | ||
| 1495 | EXPORT_SYMBOL(audit_get_loginuid); | ||
| 1496 | |||
| 1490 | /** | 1497 | /** |
| 1491 | * __audit_mq_open - record audit data for a POSIX MQ open | 1498 | * __audit_mq_open - record audit data for a POSIX MQ open |
| 1492 | * @oflag: open flag | 1499 | * @oflag: open flag |
diff --git a/kernel/configs.c b/kernel/configs.c
index f9e31974f4ad..8fa1fb28f8a7 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
| @@ -75,7 +75,7 @@ ikconfig_read_current(struct file *file, char __user *buf, | |||
| 75 | return count; | 75 | return count; |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | static struct file_operations ikconfig_file_ops = { | 78 | static const struct file_operations ikconfig_file_ops = { |
| 79 | .owner = THIS_MODULE, | 79 | .owner = THIS_MODULE, |
| 80 | .read = ikconfig_read_current, | 80 | .read = ikconfig_read_current, |
| 81 | }; | 81 | }; |
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 272254f20d97..7406fe6966f9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
| @@ -204,7 +204,7 @@ int cpu_down(unsigned int cpu) | |||
| 204 | #endif /*CONFIG_HOTPLUG_CPU*/ | 204 | #endif /*CONFIG_HOTPLUG_CPU*/ |
| 205 | 205 | ||
| 206 | /* Requires cpu_add_remove_lock to be held */ | 206 | /* Requires cpu_add_remove_lock to be held */ |
| 207 | static int __devinit _cpu_up(unsigned int cpu) | 207 | static int __cpuinit _cpu_up(unsigned int cpu) |
| 208 | { | 208 | { |
| 209 | int ret; | 209 | int ret; |
| 210 | void *hcpu = (void *)(long)cpu; | 210 | void *hcpu = (void *)(long)cpu; |
| @@ -239,7 +239,7 @@ out_notify: | |||
| 239 | return ret; | 239 | return ret; |
| 240 | } | 240 | } |
| 241 | 241 | ||
| 242 | int __devinit cpu_up(unsigned int cpu) | 242 | int __cpuinit cpu_up(unsigned int cpu) |
| 243 | { | 243 | { |
| 244 | int err = 0; | 244 | int err = 0; |
| 245 | 245 | ||
| @@ -258,7 +258,7 @@ static cpumask_t frozen_cpus; | |||
| 258 | 258 | ||
| 259 | int disable_nonboot_cpus(void) | 259 | int disable_nonboot_cpus(void) |
| 260 | { | 260 | { |
| 261 | int cpu, first_cpu, error; | 261 | int cpu, first_cpu, error = 0; |
| 262 | 262 | ||
| 263 | mutex_lock(&cpu_add_remove_lock); | 263 | mutex_lock(&cpu_add_remove_lock); |
| 264 | first_cpu = first_cpu(cpu_present_map); | 264 | first_cpu = first_cpu(cpu_present_map); |
| @@ -270,11 +270,7 @@ int disable_nonboot_cpus(void) | |||
| 270 | goto out; | 270 | goto out; |
| 271 | } | 271 | } |
| 272 | } | 272 | } |
| 273 | error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu)); | 273 | |
| 274 | if (error) { | ||
| 275 | printk(KERN_ERR "Could not run on CPU%d\n", first_cpu); | ||
| 276 | goto out; | ||
| 277 | } | ||
| 278 | /* We take down all of the non-boot CPUs in one shot to avoid races | 274 | /* We take down all of the non-boot CPUs in one shot to avoid races |
| 279 | * with the userspace trying to use the CPU hotplug at the same time | 275 | * with the userspace trying to use the CPU hotplug at the same time |
| 280 | */ | 276 | */ |
| @@ -298,7 +294,7 @@ int disable_nonboot_cpus(void) | |||
| 298 | /* Make sure the CPUs won't be enabled by someone else */ | 294 | /* Make sure the CPUs won't be enabled by someone else */ |
| 299 | cpu_hotplug_disabled = 1; | 295 | cpu_hotplug_disabled = 1; |
| 300 | } else { | 296 | } else { |
| 301 | printk(KERN_ERR "Non-boot CPUs are not disabled"); | 297 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); |
| 302 | } | 298 | } |
| 303 | out: | 299 | out: |
| 304 | mutex_unlock(&cpu_add_remove_lock); | 300 | mutex_unlock(&cpu_add_remove_lock); |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6313c38c930e..6b05dc69c959 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
| @@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = { | |||
| 413 | * | 413 | * |
| 414 | * | 414 | * |
| 415 | * When reading/writing to a file: | 415 | * When reading/writing to a file: |
| 416 | * - the cpuset to use in file->f_dentry->d_parent->d_fsdata | 416 | * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata |
| 417 | * - the 'cftype' of the file is file->f_dentry->d_fsdata | 417 | * - the 'cftype' of the file is file->f_path.dentry->d_fsdata |
| 418 | */ | 418 | */ |
| 419 | 419 | ||
| 420 | struct cftype { | 420 | struct cftype { |
| @@ -729,9 +729,11 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
| 729 | } | 729 | } |
| 730 | 730 | ||
| 731 | /* Remaining checks don't apply to root cpuset */ | 731 | /* Remaining checks don't apply to root cpuset */ |
| 732 | if ((par = cur->parent) == NULL) | 732 | if (cur == &top_cpuset) |
| 733 | return 0; | 733 | return 0; |
| 734 | 734 | ||
| 735 | par = cur->parent; | ||
| 736 | |||
| 735 | /* We must be a subset of our parent cpuset */ | 737 | /* We must be a subset of our parent cpuset */ |
| 736 | if (!is_cpuset_subset(trial, par)) | 738 | if (!is_cpuset_subset(trial, par)) |
| 737 | return -EACCES; | 739 | return -EACCES; |
| @@ -1060,10 +1062,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
| 1060 | cpu_exclusive_changed = | 1062 | cpu_exclusive_changed = |
| 1061 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); | 1063 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
| 1062 | mutex_lock(&callback_mutex); | 1064 | mutex_lock(&callback_mutex); |
| 1063 | if (turning_on) | 1065 | cs->flags = trialcs.flags; |
| 1064 | set_bit(bit, &cs->flags); | ||
| 1065 | else | ||
| 1066 | clear_bit(bit, &cs->flags); | ||
| 1067 | mutex_unlock(&callback_mutex); | 1066 | mutex_unlock(&callback_mutex); |
| 1068 | 1067 | ||
| 1069 | if (cpu_exclusive_changed) | 1068 | if (cpu_exclusive_changed) |
| @@ -1281,18 +1280,19 @@ typedef enum { | |||
| 1281 | FILE_TASKLIST, | 1280 | FILE_TASKLIST, |
| 1282 | } cpuset_filetype_t; | 1281 | } cpuset_filetype_t; |
| 1283 | 1282 | ||
| 1284 | static ssize_t cpuset_common_file_write(struct file *file, const char __user *userbuf, | 1283 | static ssize_t cpuset_common_file_write(struct file *file, |
| 1284 | const char __user *userbuf, | ||
| 1285 | size_t nbytes, loff_t *unused_ppos) | 1285 | size_t nbytes, loff_t *unused_ppos) |
| 1286 | { | 1286 | { |
| 1287 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1287 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
| 1288 | struct cftype *cft = __d_cft(file->f_dentry); | 1288 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1289 | cpuset_filetype_t type = cft->private; | 1289 | cpuset_filetype_t type = cft->private; |
| 1290 | char *buffer; | 1290 | char *buffer; |
| 1291 | char *pathbuf = NULL; | 1291 | char *pathbuf = NULL; |
| 1292 | int retval = 0; | 1292 | int retval = 0; |
| 1293 | 1293 | ||
| 1294 | /* Crude upper limit on largest legitimate cpulist user might write. */ | 1294 | /* Crude upper limit on largest legitimate cpulist user might write. */ |
| 1295 | if (nbytes > 100 + 6 * NR_CPUS) | 1295 | if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) |
| 1296 | return -E2BIG; | 1296 | return -E2BIG; |
| 1297 | 1297 | ||
| 1298 | /* +1 for nul-terminator */ | 1298 | /* +1 for nul-terminator */ |
| @@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf, | |||
| 1367 | size_t nbytes, loff_t *ppos) | 1367 | size_t nbytes, loff_t *ppos) |
| 1368 | { | 1368 | { |
| 1369 | ssize_t retval = 0; | 1369 | ssize_t retval = 0; |
| 1370 | struct cftype *cft = __d_cft(file->f_dentry); | 1370 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1371 | if (!cft) | 1371 | if (!cft) |
| 1372 | return -ENODEV; | 1372 | return -ENODEV; |
| 1373 | 1373 | ||
| @@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
| 1417 | static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | 1417 | static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, |
| 1418 | size_t nbytes, loff_t *ppos) | 1418 | size_t nbytes, loff_t *ppos) |
| 1419 | { | 1419 | { |
| 1420 | struct cftype *cft = __d_cft(file->f_dentry); | 1420 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1421 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1421 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
| 1422 | cpuset_filetype_t type = cft->private; | 1422 | cpuset_filetype_t type = cft->private; |
| 1423 | char *page; | 1423 | char *page; |
| 1424 | ssize_t retval = 0; | 1424 | ssize_t retval = 0; |
| @@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt | |||
| 1476 | loff_t *ppos) | 1476 | loff_t *ppos) |
| 1477 | { | 1477 | { |
| 1478 | ssize_t retval = 0; | 1478 | ssize_t retval = 0; |
| 1479 | struct cftype *cft = __d_cft(file->f_dentry); | 1479 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1480 | if (!cft) | 1480 | if (!cft) |
| 1481 | return -ENODEV; | 1481 | return -ENODEV; |
| 1482 | 1482 | ||
| @@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) | |||
| 1498 | if (err) | 1498 | if (err) |
| 1499 | return err; | 1499 | return err; |
| 1500 | 1500 | ||
| 1501 | cft = __d_cft(file->f_dentry); | 1501 | cft = __d_cft(file->f_path.dentry); |
| 1502 | if (!cft) | 1502 | if (!cft) |
| 1503 | return -ENODEV; | 1503 | return -ENODEV; |
| 1504 | if (cft->open) | 1504 | if (cft->open) |
| @@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file) | |||
| 1511 | 1511 | ||
| 1512 | static int cpuset_file_release(struct inode *inode, struct file *file) | 1512 | static int cpuset_file_release(struct inode *inode, struct file *file) |
| 1513 | { | 1513 | { |
| 1514 | struct cftype *cft = __d_cft(file->f_dentry); | 1514 | struct cftype *cft = __d_cft(file->f_path.dentry); |
| 1515 | if (cft->release) | 1515 | if (cft->release) |
| 1516 | return cft->release(inode, file); | 1516 | return cft->release(inode, file); |
| 1517 | return 0; | 1517 | return 0; |
| @@ -1532,7 +1532,7 @@ static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 1532 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 1532 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); |
| 1533 | } | 1533 | } |
| 1534 | 1534 | ||
| 1535 | static struct file_operations cpuset_file_operations = { | 1535 | static const struct file_operations cpuset_file_operations = { |
| 1536 | .read = cpuset_file_read, | 1536 | .read = cpuset_file_read, |
| 1537 | .write = cpuset_file_write, | 1537 | .write = cpuset_file_write, |
| 1538 | .llseek = generic_file_llseek, | 1538 | .llseek = generic_file_llseek, |
| @@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) | |||
| 1700 | */ | 1700 | */ |
| 1701 | static int cpuset_tasks_open(struct inode *unused, struct file *file) | 1701 | static int cpuset_tasks_open(struct inode *unused, struct file *file) |
| 1702 | { | 1702 | { |
| 1703 | struct cpuset *cs = __d_cs(file->f_dentry->d_parent); | 1703 | struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent); |
| 1704 | struct ctr_struct *ctr; | 1704 | struct ctr_struct *ctr; |
| 1705 | pid_t *pidarray; | 1705 | pid_t *pidarray; |
| 1706 | int npids; | 1706 | int npids; |
| @@ -2045,7 +2045,6 @@ out: | |||
| 2045 | return err; | 2045 | return err; |
| 2046 | } | 2046 | } |
| 2047 | 2047 | ||
| 2048 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG) | ||
| 2049 | /* | 2048 | /* |
| 2050 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 2049 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs |
| 2051 | * or memory nodes, we need to walk over the cpuset hierarchy, | 2050 | * or memory nodes, we need to walk over the cpuset hierarchy, |
| @@ -2109,9 +2108,7 @@ static void common_cpu_mem_hotplug_unplug(void) | |||
| 2109 | mutex_unlock(&callback_mutex); | 2108 | mutex_unlock(&callback_mutex); |
| 2110 | mutex_unlock(&manage_mutex); | 2109 | mutex_unlock(&manage_mutex); |
| 2111 | } | 2110 | } |
| 2112 | #endif | ||
| 2113 | 2111 | ||
| 2114 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 2115 | /* | 2112 | /* |
| 2116 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 2113 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
| 2117 | * period. This is necessary in order to make cpusets transparent | 2114 | * period. This is necessary in order to make cpusets transparent |
| @@ -2128,7 +2125,6 @@ static int cpuset_handle_cpuhp(struct notifier_block *nb, | |||
| 2128 | common_cpu_mem_hotplug_unplug(); | 2125 | common_cpu_mem_hotplug_unplug(); |
| 2129 | return 0; | 2126 | return 0; |
| 2130 | } | 2127 | } |
| 2131 | #endif | ||
| 2132 | 2128 | ||
| 2133 | #ifdef CONFIG_MEMORY_HOTPLUG | 2129 | #ifdef CONFIG_MEMORY_HOTPLUG |
| 2134 | /* | 2130 | /* |
| @@ -2346,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 2346 | } | 2342 | } |
| 2347 | 2343 | ||
| 2348 | /** | 2344 | /** |
| 2349 | * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? | 2345 | * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node? |
| 2350 | * @z: is this zone on an allowed node? | 2346 | * @z: is this zone on an allowed node? |
| 2351 | * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) | 2347 | * @gfp_mask: memory allocation flags |
| 2352 | * | 2348 | * |
| 2353 | * If we're in interrupt, yes, we can always allocate. If zone | 2349 | * If we're in interrupt, yes, we can always allocate. If |
| 2350 | * __GFP_THISNODE is set, yes, we can always allocate. If zone | ||
| 2354 | * z's node is in our tasks mems_allowed, yes. If it's not a | 2351 | * z's node is in our tasks mems_allowed, yes. If it's not a |
| 2355 | * __GFP_HARDWALL request and this zone's nodes is in the nearest | 2352 | * __GFP_HARDWALL request and this zone's nodes is in the nearest |
| 2356 | * mem_exclusive cpuset ancestor to this tasks cpuset, yes. | 2353 | * mem_exclusive cpuset ancestor to this tasks cpuset, yes. |
| 2357 | * Otherwise, no. | 2354 | * Otherwise, no. |
| 2358 | * | 2355 | * |
| 2356 | * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall() | ||
| 2357 | * reduces to cpuset_zone_allowed_hardwall(). Otherwise, | ||
| 2358 | * cpuset_zone_allowed_softwall() might sleep, and might allow a zone | ||
| 2359 | * from an enclosing cpuset. | ||
| 2360 | * | ||
| 2361 | * cpuset_zone_allowed_hardwall() only handles the simpler case of | ||
| 2362 | * hardwall cpusets, and never sleeps. | ||
| 2363 | * | ||
| 2364 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2365 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2366 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2367 | * any node on the zonelist except the first. By the time any such | ||
| 2368 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2369 | * | ||
| 2359 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | 2370 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, |
| 2360 | * and do not allow allocations outside the current tasks cpuset. | 2371 | * and do not allow allocations outside the current tasks cpuset. |
| 2361 | * GFP_KERNEL allocations are not so marked, so can escape to the | 2372 | * GFP_KERNEL allocations are not so marked, so can escape to the |
| 2362 | * nearest mem_exclusive ancestor cpuset. | 2373 | * nearest enclosing mem_exclusive ancestor cpuset. |
| 2363 | * | 2374 | * |
| 2364 | * Scanning up parent cpusets requires callback_mutex. The __alloc_pages() | 2375 | * Scanning up parent cpusets requires callback_mutex. The |
| 2365 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | 2376 | * __alloc_pages() routine only calls here with __GFP_HARDWALL bit |
| 2366 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | 2377 | * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the |
| 2367 | * mems_allowed came up empty on the first pass over the zonelist. | 2378 | * current tasks mems_allowed came up empty on the first pass over |
| 2368 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | 2379 | * the zonelist. So only GFP_KERNEL allocations, if all nodes in the |
| 2369 | * short of memory, might require taking the callback_mutex mutex. | 2380 | * cpuset are short of memory, might require taking the callback_mutex |
| 2381 | * mutex. | ||
| 2370 | * | 2382 | * |
| 2371 | * The first call here from mm/page_alloc:get_page_from_freelist() | 2383 | * The first call here from mm/page_alloc:get_page_from_freelist() |
| 2372 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so | 2384 | * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, |
| 2373 | * no allocation on a node outside the cpuset is allowed (unless in | 2385 | * so no allocation on a node outside the cpuset is allowed (unless |
| 2374 | * interrupt, of course). | 2386 | * in interrupt, of course). |
| 2375 | * | 2387 | * |
| 2376 | * The second pass through get_page_from_freelist() doesn't even call | 2388 | * The second pass through get_page_from_freelist() doesn't even call |
| 2377 | * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() | 2389 | * here for GFP_ATOMIC calls. For those calls, the __alloc_pages() |
| @@ -2384,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
| 2384 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2396 | * GFP_USER - only nodes in current tasks mems allowed ok. |
| 2385 | * | 2397 | * |
| 2386 | * Rule: | 2398 | * Rule: |
| 2387 | * Don't call cpuset_zone_allowed() if you can't sleep, unless you | 2399 | * Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you |
| 2388 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables | 2400 | * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables |
| 2389 | * the code that might scan up ancestor cpusets and sleep. | 2401 | * the code that might scan up ancestor cpusets and sleep. |
| 2390 | **/ | 2402 | */ |
| 2391 | 2403 | ||
| 2392 | int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | 2404 | int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask) |
| 2393 | { | 2405 | { |
| 2394 | int node; /* node that zone z is on */ | 2406 | int node; /* node that zone z is on */ |
| 2395 | const struct cpuset *cs; /* current cpuset ancestors */ | 2407 | const struct cpuset *cs; /* current cpuset ancestors */ |
| @@ -2419,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | |||
| 2419 | return allowed; | 2431 | return allowed; |
| 2420 | } | 2432 | } |
| 2421 | 2433 | ||
| 2434 | /* | ||
| 2435 | * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node? | ||
| 2436 | * @z: is this zone on an allowed node? | ||
| 2437 | * @gfp_mask: memory allocation flags | ||
| 2438 | * | ||
| 2439 | * If we're in interrupt, yes, we can always allocate. | ||
| 2440 | * If __GFP_THISNODE is set, yes, we can always allocate. If zone | ||
| 2441 | * z's node is in our tasks mems_allowed, yes. Otherwise, no. | ||
| 2442 | * | ||
| 2443 | * The __GFP_THISNODE placement logic is really handled elsewhere, | ||
| 2444 | * by forcibly using a zonelist starting at a specified node, and by | ||
| 2445 | * (in get_page_from_freelist()) refusing to consider the zones for | ||
| 2446 | * any node on the zonelist except the first. By the time any such | ||
| 2447 | * calls get to this routine, we should just shut up and say 'yes'. | ||
| 2448 | * | ||
| 2449 | * Unlike the cpuset_zone_allowed_softwall() variant, above, | ||
| 2450 | * this variant requires that the zone be in the current tasks | ||
| 2451 | * mems_allowed or that we're in interrupt. It does not scan up the | ||
| 2452 | * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. | ||
| 2453 | * It never sleeps. | ||
| 2454 | */ | ||
| 2455 | |||
| 2456 | int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask) | ||
| 2457 | { | ||
| 2458 | int node; /* node that zone z is on */ | ||
| 2459 | |||
| 2460 | if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) | ||
| 2461 | return 1; | ||
| 2462 | node = zone_to_nid(z); | ||
| 2463 | if (node_isset(node, current->mems_allowed)) | ||
| 2464 | return 1; | ||
| 2465 | return 0; | ||
| 2466 | } | ||
| 2467 | |||
| 2422 | /** | 2468 | /** |
| 2423 | * cpuset_lock - lock out any changes to cpuset structures | 2469 | * cpuset_lock - lock out any changes to cpuset structures |
| 2424 | * | 2470 | * |
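
The cpuset.c rework above splits cpuset_zone_allowed() into a sleeping softwall variant and a never-sleeping hardwall variant, with the new kernel-doc spelling out the rules. A userspace model of the hardwall decision (simplified types and a made-up flag constant, not the kernel API):

```c
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for __GFP_THISNODE; value chosen only for this illustration. */
#define GFP_THISNODE_BIT (1u << 0)

/* Model of the hardwall check: allow iff we're "in interrupt", the caller
 * forced node-local placement, or the zone's node is set in the task's
 * allowed-node mask (the node_isset() analogue). */
static bool zone_allowed_hardwall(int node, unsigned int gfp_mask,
				  unsigned long mems_allowed, bool in_interrupt)
{
	if (in_interrupt || (gfp_mask & GFP_THISNODE_BIT))
		return true;
	return (mems_allowed >> node) & 1;
}

int main(void)
{
	unsigned long mems = 0x3;	/* task may allocate on nodes 0 and 1 */

	printf("node 1, plain gfp:      %d\n", zone_allowed_hardwall(1, 0, mems, false));
	printf("node 3, plain gfp:      %d\n", zone_allowed_hardwall(3, 0, mems, false));
	printf("node 3, __GFP_THISNODE: %d\n", zone_allowed_hardwall(3, GFP_THISNODE_BIT, mems, false));
	printf("node 3, in interrupt:   %d\n", zone_allowed_hardwall(3, 0, mems, true));
	return 0;
}
```

The softwall variant adds one more step the model omits: scanning up to the nearest mem_exclusive ancestor cpuset under callback_mutex, which is why it may sleep.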
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 66a0ea48751d..766d5912b26a 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
| @@ -20,7 +20,7 @@ | |||
| 20 | #include <linux/delayacct.h> | 20 | #include <linux/delayacct.h> |
| 21 | 21 | ||
| 22 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ | 22 | int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ |
| 23 | kmem_cache_t *delayacct_cache; | 23 | struct kmem_cache *delayacct_cache; |
| 24 | 24 | ||
| 25 | static int __init delayacct_setup_disable(char *str) | 25 | static int __init delayacct_setup_disable(char *str) |
| 26 | { | 26 | { |
| @@ -41,7 +41,7 @@ void delayacct_init(void) | |||
| 41 | 41 | ||
| 42 | void __delayacct_tsk_init(struct task_struct *tsk) | 42 | void __delayacct_tsk_init(struct task_struct *tsk) |
| 43 | { | 43 | { |
| 44 | tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); | 44 | tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); |
| 45 | if (tsk->delays) | 45 | if (tsk->delays) |
| 46 | spin_lock_init(&tsk->delays->lock); | 46 | spin_lock_init(&tsk->delays->lock); |
| 47 | } | 47 | } |
diff --git a/kernel/dma.c b/kernel/dma.c
index 2020644c938a..937b13ca33ba 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
| @@ -140,7 +140,7 @@ static int proc_dma_open(struct inode *inode, struct file *file) | |||
| 140 | return single_open(file, proc_dma_show, NULL); | 140 | return single_open(file, proc_dma_show, NULL); |
| 141 | } | 141 | } |
| 142 | 142 | ||
| 143 | static struct file_operations proc_dma_operations = { | 143 | static const struct file_operations proc_dma_operations = { |
| 144 | .open = proc_dma_open, | 144 | .open = proc_dma_open, |
| 145 | .read = seq_read, | 145 | .read = seq_read, |
| 146 | .llseek = seq_lseek, | 146 | .llseek = seq_lseek, |
diff --git a/kernel/exit.c b/kernel/exit.c
index 06de6c4e8ca3..35401720635b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
| @@ -13,7 +13,7 @@ | |||
| 13 | #include <linux/completion.h> | 13 | #include <linux/completion.h> |
| 14 | #include <linux/personality.h> | 14 | #include <linux/personality.h> |
| 15 | #include <linux/tty.h> | 15 | #include <linux/tty.h> |
| 16 | #include <linux/namespace.h> | 16 | #include <linux/mnt_namespace.h> |
| 17 | #include <linux/key.h> | 17 | #include <linux/key.h> |
| 18 | #include <linux/security.h> | 18 | #include <linux/security.h> |
| 19 | #include <linux/cpu.h> | 19 | #include <linux/cpu.h> |
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/file.h> | 22 | #include <linux/file.h> |
| 23 | #include <linux/binfmts.h> | 23 | #include <linux/binfmts.h> |
| 24 | #include <linux/nsproxy.h> | 24 | #include <linux/nsproxy.h> |
| 25 | #include <linux/pid_namespace.h> | ||
| 25 | #include <linux/ptrace.h> | 26 | #include <linux/ptrace.h> |
| 26 | #include <linux/profile.h> | 27 | #include <linux/profile.h> |
| 27 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
| @@ -48,7 +49,6 @@ | |||
| 48 | #include <asm/mmu_context.h> | 49 | #include <asm/mmu_context.h> |
| 49 | 50 | ||
| 50 | extern void sem_exit (void); | 51 | extern void sem_exit (void); |
| 51 | extern struct task_struct *child_reaper; | ||
| 52 | 52 | ||
| 53 | static void exit_mm(struct task_struct * tsk); | 53 | static void exit_mm(struct task_struct * tsk); |
| 54 | 54 | ||
| @@ -189,21 +189,18 @@ repeat: | |||
| 189 | int session_of_pgrp(int pgrp) | 189 | int session_of_pgrp(int pgrp) |
| 190 | { | 190 | { |
| 191 | struct task_struct *p; | 191 | struct task_struct *p; |
| 192 | int sid = -1; | 192 | int sid = 0; |
| 193 | 193 | ||
| 194 | read_lock(&tasklist_lock); | 194 | read_lock(&tasklist_lock); |
| 195 | do_each_task_pid(pgrp, PIDTYPE_PGID, p) { | 195 | |
| 196 | if (p->signal->session > 0) { | 196 | p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); |
| 197 | sid = p->signal->session; | 197 | if (p == NULL) |
| 198 | goto out; | 198 | p = find_task_by_pid(pgrp); |
| 199 | } | 199 | if (p != NULL) |
| 200 | } while_each_task_pid(pgrp, PIDTYPE_PGID, p); | 200 | sid = process_session(p); |
| 201 | p = find_task_by_pid(pgrp); | 201 | |
| 202 | if (p) | ||
| 203 | sid = p->signal->session; | ||
| 204 | out: | ||
| 205 | read_unlock(&tasklist_lock); | 202 | read_unlock(&tasklist_lock); |
| 206 | 203 | ||
| 207 | return sid; | 204 | return sid; |
| 208 | } | 205 | } |
| 209 | 206 | ||
| @@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) | |||
| 225 | || p->exit_state | 222 | || p->exit_state |
| 226 | || is_init(p->real_parent)) | 223 | || is_init(p->real_parent)) |
| 227 | continue; | 224 | continue; |
| 228 | if (process_group(p->real_parent) != pgrp | 225 | if (process_group(p->real_parent) != pgrp && |
| 229 | && p->real_parent->signal->session == p->signal->session) { | 226 | process_session(p->real_parent) == process_session(p)) { |
| 230 | ret = 0; | 227 | ret = 0; |
| 231 | break; | 228 | break; |
| 232 | } | 229 | } |
| @@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp) | |||
| 260 | } | 257 | } |
| 261 | 258 | ||
| 262 | /** | 259 | /** |
| 263 | * reparent_to_init - Reparent the calling kernel thread to the init task. | 260 | * reparent_to_init - Reparent the calling kernel thread to the init task |
| 261 | * of the pid space that the thread belongs to. | ||
| 264 | * | 262 | * |
| 265 | * If a kernel thread is launched as a result of a system call, or if | 263 | * If a kernel thread is launched as a result of a system call, or if |
| 266 | * it ever exits, it should generally reparent itself to init so that | 264 | * it ever exits, it should generally reparent itself to init so that |
| @@ -278,8 +276,8 @@ static void reparent_to_init(void) | |||
| 278 | ptrace_unlink(current); | 276 | ptrace_unlink(current); |
| 279 | /* Reparent to init */ | 277 | /* Reparent to init */ |
| 280 | remove_parent(current); | 278 | remove_parent(current); |
| 281 | current->parent = child_reaper; | 279 | current->parent = child_reaper(current); |
| 282 | current->real_parent = child_reaper; | 280 | current->real_parent = child_reaper(current); |
| 283 | add_parent(current); | 281 | add_parent(current); |
| 284 | 282 | ||
| 285 | /* Set the exit signal to SIGCHLD so we signal init on exit */ | 283 | /* Set the exit signal to SIGCHLD so we signal init on exit */ |
| @@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp) | |||
| 302 | { | 300 | { |
| 303 | struct task_struct *curr = current->group_leader; | 301 | struct task_struct *curr = current->group_leader; |
| 304 | 302 | ||
| 305 | if (curr->signal->session != session) { | 303 | if (process_session(curr) != session) { |
| 306 | detach_pid(curr, PIDTYPE_SID); | 304 | detach_pid(curr, PIDTYPE_SID); |
| 307 | curr->signal->session = session; | 305 | set_signal_session(curr->signal, session); |
| 308 | attach_pid(curr, PIDTYPE_SID, session); | 306 | attach_pid(curr, PIDTYPE_SID, session); |
| 309 | } | 307 | } |
| 310 | if (process_group(curr) != pgrp) { | 308 | if (process_group(curr) != pgrp) { |
| @@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp) | |||
| 314 | } | 312 | } |
| 315 | } | 313 | } |
| 316 | 314 | ||
| 317 | void set_special_pids(pid_t session, pid_t pgrp) | 315 | static void set_special_pids(pid_t session, pid_t pgrp) |
| 318 | { | 316 | { |
| 319 | write_lock_irq(&tasklist_lock); | 317 | write_lock_irq(&tasklist_lock); |
| 320 | __set_special_pids(session, pgrp); | 318 | __set_special_pids(session, pgrp); |
| @@ -384,9 +382,7 @@ void daemonize(const char *name, ...) | |||
| 384 | exit_mm(current); | 382 | exit_mm(current); |
| 385 | 383 | ||
| 386 | set_special_pids(1, 1); | 384 | set_special_pids(1, 1); |
| 387 | mutex_lock(&tty_mutex); | 385 | proc_clear_tty(current); |
| 388 | current->signal->tty = NULL; | ||
| 389 | mutex_unlock(&tty_mutex); | ||
| 390 | 386 | ||
| 391 | /* Block and flush all signals */ | 387 | /* Block and flush all signals */ |
| 392 | sigfillset(&blocked); | 388 | sigfillset(&blocked); |
| @@ -429,7 +425,7 @@ static void close_files(struct files_struct * files) | |||
| 429 | for (;;) { | 425 | for (;;) { |
| 430 | unsigned long set; | 426 | unsigned long set; |
| 431 | i = j * __NFDBITS; | 427 | i = j * __NFDBITS; |
| 432 | if (i >= fdt->max_fdset || i >= fdt->max_fds) | 428 | if (i >= fdt->max_fds) |
| 433 | break; | 429 | break; |
| 434 | set = fdt->open_fds->fds_bits[j++]; | 430 | set = fdt->open_fds->fds_bits[j++]; |
| 435 | while (set) { | 431 | while (set) { |
| @@ -470,9 +466,7 @@ void fastcall put_files_struct(struct files_struct *files) | |||
| 470 | * you can free files immediately. | 466 | * you can free files immediately. |
| 471 | */ | 467 | */ |
| 472 | fdt = files_fdtable(files); | 468 | fdt = files_fdtable(files); |
| 473 | if (fdt == &files->fdtab) | 469 | if (fdt != &files->fdtab) |
| 474 | fdt->free_files = files; | ||
| 475 | else | ||
| 476 | kmem_cache_free(files_cachep, files); | 470 | kmem_cache_free(files_cachep, files); |
| 477 | free_fdtable(fdt); | 471 | free_fdtable(fdt); |
| 478 | } | 472 | } |
| @@ -603,10 +597,6 @@ choose_new_parent(struct task_struct *p, struct task_struct *reaper) | |||
| 603 | static void | 597 | static void |
| 604 | reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | 598 | reparent_thread(struct task_struct *p, struct task_struct *father, int traced) |
| 605 | { | 599 | { |
| 606 | /* We don't want people slaying init. */ | ||
| 607 | if (p->exit_signal != -1) | ||
| 608 | p->exit_signal = SIGCHLD; | ||
| 609 | |||
| 610 | if (p->pdeath_signal) | 600 | if (p->pdeath_signal) |
| 611 | /* We already hold the tasklist_lock here. */ | 601 | /* We already hold the tasklist_lock here. */ |
| 612 | group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); | 602 | group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); |
| @@ -626,13 +616,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 626 | p->parent = p->real_parent; | 616 | p->parent = p->real_parent; |
| 627 | add_parent(p); | 617 | add_parent(p); |
| 628 | 618 | ||
| 629 | /* If we'd notified the old parent about this child's death, | 619 | if (p->state == TASK_TRACED) { |
| 630 | * also notify the new parent. | ||
| 631 | */ | ||
| 632 | if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 && | ||
| 633 | thread_group_empty(p)) | ||
| 634 | do_notify_parent(p, p->exit_signal); | ||
| 635 | else if (p->state == TASK_TRACED) { | ||
| 636 | /* | 620 | /* |
| 637 | * If it was at a trace stop, turn it into | 621 | * If it was at a trace stop, turn it into |
| 638 | * a normal stop since it's no longer being | 622 | * a normal stop since it's no longer being |
| @@ -642,6 +626,23 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 642 | } | 626 | } |
| 643 | } | 627 | } |
| 644 | 628 | ||
| 629 | /* If this is a threaded reparent there is no need to | ||
| 630 | * notify anyone anything has happened. | ||
| 631 | */ | ||
| 632 | if (p->real_parent->group_leader == father->group_leader) | ||
| 633 | return; | ||
| 634 | |||
| 635 | /* We don't want people slaying init. */ | ||
| 636 | if (p->exit_signal != -1) | ||
| 637 | p->exit_signal = SIGCHLD; | ||
| 638 | |||
| 639 | /* If we'd notified the old parent about this child's death, | ||
| 640 | * also notify the new parent. | ||
| 641 | */ | ||
| 642 | if (!traced && p->exit_state == EXIT_ZOMBIE && | ||
| 643 | p->exit_signal != -1 && thread_group_empty(p)) | ||
| 644 | do_notify_parent(p, p->exit_signal); | ||
| 645 | |||
| 645 | /* | 646 | /* |
| 646 | * process group orphan check | 647 | * process group orphan check |
| 647 | * Case ii: Our child is in a different pgrp | 648 | * Case ii: Our child is in a different pgrp |
| @@ -649,10 +650,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 649 | * outside, so the child pgrp is now orphaned. | 650 | * outside, so the child pgrp is now orphaned. |
| 650 | */ | 651 | */ |
| 651 | if ((process_group(p) != process_group(father)) && | 652 | if ((process_group(p) != process_group(father)) && |
| 652 | (p->signal->session == father->signal->session)) { | 653 | (process_session(p) == process_session(father))) { |
| 653 | int pgrp = process_group(p); | 654 | int pgrp = process_group(p); |
| 654 | 655 | ||
| 655 | if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { | 656 | if (will_become_orphaned_pgrp(pgrp, NULL) && |
| 657 | has_stopped_jobs(pgrp)) { | ||
| 656 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); | 658 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); |
| 657 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); | 659 | __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); |
| 658 | } | 660 | } |
| @@ -663,7 +665,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
| 663 | * When we die, we re-parent all our children. | 665 | * When we die, we re-parent all our children. |
| 664 | * Try to give them to another thread in our thread | 666 | * Try to give them to another thread in our thread |
| 665 | * group, and if no such member exists, give it to | 667 | * group, and if no such member exists, give it to |
| 666 | * the global child reaper process (ie "init") | 668 | * the child reaper process (ie "init") in our pid |
| 669 | * space. | ||
| 667 | */ | 670 | */ |
| 668 | static void | 671 | static void |
| 669 | forget_original_parent(struct task_struct *father, struct list_head *to_release) | 672 | forget_original_parent(struct task_struct *father, struct list_head *to_release) |
| @@ -674,7 +677,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release) | |||
| 674 | do { | 677 | do { |
| 675 | reaper = next_thread(reaper); | 678 | reaper = next_thread(reaper); |
| 676 | if (reaper == father) { | 679 | if (reaper == father) { |
| 677 | reaper = child_reaper; | 680 | reaper = child_reaper(father); |
| 678 | break; | 681 | break; |
| 679 | } | 682 | } |
| 680 | } while (reaper->exit_state); | 683 | } while (reaper->exit_state); |
| @@ -786,7 +789,7 @@ static void exit_notify(struct task_struct *tsk) | |||
| 786 | t = tsk->real_parent; | 789 | t = tsk->real_parent; |
| 787 | 790 | ||
| 788 | if ((process_group(t) != process_group(tsk)) && | 791 | if ((process_group(t) != process_group(tsk)) && |
| 789 | (t->signal->session == tsk->signal->session) && | 792 | (process_session(t) == process_session(tsk)) && |
| 790 | will_become_orphaned_pgrp(process_group(tsk), tsk) && | 793 | will_become_orphaned_pgrp(process_group(tsk), tsk) && |
| 791 | has_stopped_jobs(process_group(tsk))) { | 794 | has_stopped_jobs(process_group(tsk))) { |
| 792 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); | 795 | __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); |
| @@ -850,9 +853,7 @@ static void exit_notify(struct task_struct *tsk) | |||
| 850 | fastcall NORET_TYPE void do_exit(long code) | 853 | fastcall NORET_TYPE void do_exit(long code) |
| 851 | { | 854 | { |
| 852 | struct task_struct *tsk = current; | 855 | struct task_struct *tsk = current; |
| 853 | struct taskstats *tidstats; | ||
| 854 | int group_dead; | 856 | int group_dead; |
| 855 | unsigned int mycpu; | ||
| 856 | 857 | ||
| 857 | profile_task_exit(tsk); | 858 | profile_task_exit(tsk); |
| 858 | 859 | ||
| @@ -862,8 +863,13 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 862 | panic("Aiee, killing interrupt handler!"); | 863 | panic("Aiee, killing interrupt handler!"); |
| 863 | if (unlikely(!tsk->pid)) | 864 | if (unlikely(!tsk->pid)) |
| 864 | panic("Attempted to kill the idle task!"); | 865 | panic("Attempted to kill the idle task!"); |
| 865 | if (unlikely(tsk == child_reaper)) | 866 | if (unlikely(tsk == child_reaper(tsk))) { |
| 866 | panic("Attempted to kill init!"); | 867 | if (tsk->nsproxy->pid_ns != &init_pid_ns) |
| 868 | tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper; | ||
| 869 | else | ||
| 870 | panic("Attempted to kill init!"); | ||
| 871 | } | ||
| 872 | |||
| 867 | 873 | ||
| 868 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { | 874 | if (unlikely(current->ptrace & PT_TRACE_EXIT)) { |
| 869 | current->ptrace_message = code; | 875 | current->ptrace_message = code; |
| @@ -890,8 +896,6 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 890 | current->comm, current->pid, | 896 | current->comm, current->pid, |
| 891 | preempt_count()); | 897 | preempt_count()); |
| 892 | 898 | ||
| 893 | taskstats_exit_alloc(&tidstats, &mycpu); | ||
| 894 | |||
| 895 | acct_update_integrals(tsk); | 899 | acct_update_integrals(tsk); |
| 896 | if (tsk->mm) { | 900 | if (tsk->mm) { |
| 897 | update_hiwater_rss(tsk->mm); | 901 | update_hiwater_rss(tsk->mm); |
| @@ -911,8 +915,8 @@ fastcall NORET_TYPE void do_exit(long code) | |||
| 911 | #endif | 915 | #endif |
| 912 | if (unlikely(tsk->audit_context)) | 916 | if (unlikely(tsk->audit_context)) |
| 913 | audit_free(tsk); | 917 | audit_free(tsk); |
| 914 | taskstats_exit_send(tsk, tidstats, group_dead, mycpu); | 918 | |
| 915 | taskstats_exit_free(tidstats); | 919 | taskstats_exit(tsk, group_dead); |
| 916 | 920 | ||
| 917 | exit_mm(tsk); | 921 | exit_mm(tsk); |
| 918 | 922 | ||
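
In exit.c the single global child_reaper gives way to a per-pid-namespace reaper: do_exit() now hands a dying non-initial namespace reaper's orphans to the init namespace's reaper instead of panicking, and session comparisons go through process_session(). A toy model of that fallback decision (hypothetical helper, not kernel code):

```c
#include <stdio.h>

/* Toy model of the fallback added to do_exit() above; names and layout are
 * simplified and not the kernel's. */
struct pid_ns { const char *name; int reaper_pid; };

static struct pid_ns init_pid_ns = { "init_pid_ns", 1 };

static int pick_new_reaper(struct pid_ns *ns, int exiting_pid)
{
	if (exiting_pid != ns->reaper_pid)
		return ns->reaper_pid;		/* not the reaper: nothing changes */
	if (ns != &init_pid_ns)
		return init_pid_ns.reaper_pid;	/* nested ns: fall back to global init */
	return -1;				/* kernel would panic("Attempted to kill init!") */
}

int main(void)
{
	struct pid_ns nested = { "nested_ns", 4242 };

	printf("nested reaper exits -> orphans go to pid %d\n",
	       pick_new_reaper(&nested, 4242));
	printf("global init exits   -> %d (panic in the kernel)\n",
	       pick_new_reaper(&init_pid_ns, 1));
	return 0;
}
```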
diff --git a/kernel/fork.c b/kernel/fork.c
index 8cdd3e72ba55..fc723e595cd5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
| @@ -18,7 +18,7 @@ | |||
| 18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
| 19 | #include <linux/vmalloc.h> | 19 | #include <linux/vmalloc.h> |
| 20 | #include <linux/completion.h> | 20 | #include <linux/completion.h> |
| 21 | #include <linux/namespace.h> | 21 | #include <linux/mnt_namespace.h> |
| 22 | #include <linux/personality.h> | 22 | #include <linux/personality.h> |
| 23 | #include <linux/mempolicy.h> | 23 | #include <linux/mempolicy.h> |
| 24 | #include <linux/sem.h> | 24 | #include <linux/sem.h> |
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <linux/syscalls.h> | 36 | #include <linux/syscalls.h> |
| 37 | #include <linux/jiffies.h> | 37 | #include <linux/jiffies.h> |
| 38 | #include <linux/futex.h> | 38 | #include <linux/futex.h> |
| 39 | #include <linux/task_io_accounting_ops.h> | ||
| 39 | #include <linux/rcupdate.h> | 40 | #include <linux/rcupdate.h> |
| 40 | #include <linux/ptrace.h> | 41 | #include <linux/ptrace.h> |
| 41 | #include <linux/mount.h> | 42 | #include <linux/mount.h> |
| @@ -82,26 +83,26 @@ int nr_processes(void) | |||
| 82 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR | 83 | #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR |
| 83 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) | 84 | # define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL) |
| 84 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) | 85 | # define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk)) |
| 85 | static kmem_cache_t *task_struct_cachep; | 86 | static struct kmem_cache *task_struct_cachep; |
| 86 | #endif | 87 | #endif |
| 87 | 88 | ||
| 88 | /* SLAB cache for signal_struct structures (tsk->signal) */ | 89 | /* SLAB cache for signal_struct structures (tsk->signal) */ |
| 89 | static kmem_cache_t *signal_cachep; | 90 | static struct kmem_cache *signal_cachep; |
| 90 | 91 | ||
| 91 | /* SLAB cache for sighand_struct structures (tsk->sighand) */ | 92 | /* SLAB cache for sighand_struct structures (tsk->sighand) */ |
| 92 | kmem_cache_t *sighand_cachep; | 93 | struct kmem_cache *sighand_cachep; |
| 93 | 94 | ||
| 94 | /* SLAB cache for files_struct structures (tsk->files) */ | 95 | /* SLAB cache for files_struct structures (tsk->files) */ |
| 95 | kmem_cache_t *files_cachep; | 96 | struct kmem_cache *files_cachep; |
| 96 | 97 | ||
| 97 | /* SLAB cache for fs_struct structures (tsk->fs) */ | 98 | /* SLAB cache for fs_struct structures (tsk->fs) */ |
| 98 | kmem_cache_t *fs_cachep; | 99 | struct kmem_cache *fs_cachep; |
| 99 | 100 | ||
| 100 | /* SLAB cache for vm_area_struct structures */ | 101 | /* SLAB cache for vm_area_struct structures */ |
| 101 | kmem_cache_t *vm_area_cachep; | 102 | struct kmem_cache *vm_area_cachep; |
| 102 | 103 | ||
| 103 | /* SLAB cache for mm_struct structures (tsk->mm) */ | 104 | /* SLAB cache for mm_struct structures (tsk->mm) */ |
| 104 | static kmem_cache_t *mm_cachep; | 105 | static struct kmem_cache *mm_cachep; |
| 105 | 106 | ||
| 106 | void free_task(struct task_struct *tsk) | 107 | void free_task(struct task_struct *tsk) |
| 107 | { | 108 | { |
| @@ -202,7 +203,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 202 | struct mempolicy *pol; | 203 | struct mempolicy *pol; |
| 203 | 204 | ||
| 204 | down_write(&oldmm->mmap_sem); | 205 | down_write(&oldmm->mmap_sem); |
| 205 | flush_cache_mm(oldmm); | 206 | flush_cache_dup_mm(oldmm); |
| 206 | /* | 207 | /* |
| 207 | * Not linked in yet - no deadlock potential: | 208 | * Not linked in yet - no deadlock potential: |
| 208 | */ | 209 | */ |
| @@ -237,7 +238,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 237 | goto fail_nomem; | 238 | goto fail_nomem; |
| 238 | charge = len; | 239 | charge = len; |
| 239 | } | 240 | } |
| 240 | tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | 241 | tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
| 241 | if (!tmp) | 242 | if (!tmp) |
| 242 | goto fail_nomem; | 243 | goto fail_nomem; |
| 243 | *tmp = *mpnt; | 244 | *tmp = *mpnt; |
| @@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 252 | anon_vma_link(tmp); | 253 | anon_vma_link(tmp); |
| 253 | file = tmp->vm_file; | 254 | file = tmp->vm_file; |
| 254 | if (file) { | 255 | if (file) { |
| 255 | struct inode *inode = file->f_dentry->d_inode; | 256 | struct inode *inode = file->f_path.dentry->d_inode; |
| 256 | get_file(file); | 257 | get_file(file); |
| 257 | if (tmp->vm_flags & VM_DENYWRITE) | 258 | if (tmp->vm_flags & VM_DENYWRITE) |
| 258 | atomic_dec(&inode->i_writecount); | 259 | atomic_dec(&inode->i_writecount); |
| @@ -319,7 +320,7 @@ static inline void mm_free_pgd(struct mm_struct * mm) | |||
| 319 | 320 | ||
| 320 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); | 321 | __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); |
| 321 | 322 | ||
| 322 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) | 323 | #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) |
| 323 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | 324 | #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) |
| 324 | 325 | ||
| 325 | #include <linux/init_task.h> | 326 | #include <linux/init_task.h> |
| @@ -448,7 +449,16 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) | |||
| 448 | tsk->vfork_done = NULL; | 449 | tsk->vfork_done = NULL; |
| 449 | complete(vfork_done); | 450 | complete(vfork_done); |
| 450 | } | 451 | } |
| 451 | if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) { | 452 | |
| 453 | /* | ||
| 454 | * If we're exiting normally, clear a user-space tid field if | ||
| 455 | * requested. We leave this alone when dying by signal, to leave | ||
| 456 | * the value intact in a core dump, and to save the unnecessary | ||
| 457 | * trouble otherwise. Userland only wants this done for a sys_exit. | ||
| 458 | */ | ||
| 459 | if (tsk->clear_child_tid | ||
| 460 | && !(tsk->flags & PF_SIGNALED) | ||
| 461 | && atomic_read(&mm->mm_users) > 1) { | ||
| 452 | u32 __user * tidptr = tsk->clear_child_tid; | 462 | u32 __user * tidptr = tsk->clear_child_tid; |
| 453 | tsk->clear_child_tid = NULL; | 463 | tsk->clear_child_tid = NULL; |
| 454 | 464 | ||
| @@ -479,6 +489,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 479 | 489 | ||
| 480 | memcpy(mm, oldmm, sizeof(*mm)); | 490 | memcpy(mm, oldmm, sizeof(*mm)); |
| 481 | 491 | ||
| 492 | /* Initializing for Swap token stuff */ | ||
| 493 | mm->token_priority = 0; | ||
| 494 | mm->last_interval = 0; | ||
| 495 | |||
| 482 | if (!mm_init(mm)) | 496 | if (!mm_init(mm)) |
| 483 | goto fail_nomem; | 497 | goto fail_nomem; |
| 484 | 498 | ||
| @@ -542,6 +556,10 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |||
| 542 | goto fail_nomem; | 556 | goto fail_nomem; |
| 543 | 557 | ||
| 544 | good_mm: | 558 | good_mm: |
| 559 | /* Initializing for Swap token stuff */ | ||
| 560 | mm->token_priority = 0; | ||
| 561 | mm->last_interval = 0; | ||
| 562 | |||
| 545 | tsk->mm = mm; | 563 | tsk->mm = mm; |
| 546 | tsk->active_mm = mm; | 564 | tsk->active_mm = mm; |
| 547 | return 0; | 565 | return 0; |
| @@ -596,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | |||
| 596 | 614 | ||
| 597 | static int count_open_files(struct fdtable *fdt) | 615 | static int count_open_files(struct fdtable *fdt) |
| 598 | { | 616 | { |
| 599 | int size = fdt->max_fdset; | 617 | int size = fdt->max_fds; |
| 600 | int i; | 618 | int i; |
| 601 | 619 | ||
| 602 | /* Find the last open fd */ | 620 | /* Find the last open fd */ |
| @@ -613,7 +631,7 @@ static struct files_struct *alloc_files(void) | |||
| 613 | struct files_struct *newf; | 631 | struct files_struct *newf; |
| 614 | struct fdtable *fdt; | 632 | struct fdtable *fdt; |
| 615 | 633 | ||
| 616 | newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | 634 | newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); |
| 617 | if (!newf) | 635 | if (!newf) |
| 618 | goto out; | 636 | goto out; |
| 619 | 637 | ||
| @@ -623,12 +641,10 @@ static struct files_struct *alloc_files(void) | |||
| 623 | newf->next_fd = 0; | 641 | newf->next_fd = 0; |
| 624 | fdt = &newf->fdtab; | 642 | fdt = &newf->fdtab; |
| 625 | fdt->max_fds = NR_OPEN_DEFAULT; | 643 | fdt->max_fds = NR_OPEN_DEFAULT; |
| 626 | fdt->max_fdset = EMBEDDED_FD_SET_SIZE; | ||
| 627 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; | 644 | fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; |
| 628 | fdt->open_fds = (fd_set *)&newf->open_fds_init; | 645 | fdt->open_fds = (fd_set *)&newf->open_fds_init; |
| 629 | fdt->fd = &newf->fd_array[0]; | 646 | fdt->fd = &newf->fd_array[0]; |
| 630 | INIT_RCU_HEAD(&fdt->rcu); | 647 | INIT_RCU_HEAD(&fdt->rcu); |
| 631 | fdt->free_files = NULL; | ||
| 632 | fdt->next = NULL; | 648 | fdt->next = NULL; |
| 633 | rcu_assign_pointer(newf->fdt, fdt); | 649 | rcu_assign_pointer(newf->fdt, fdt); |
| 634 | out: | 650 | out: |
| @@ -644,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 644 | { | 660 | { |
| 645 | struct files_struct *newf; | 661 | struct files_struct *newf; |
| 646 | struct file **old_fds, **new_fds; | 662 | struct file **old_fds, **new_fds; |
| 647 | int open_files, size, i, expand; | 663 | int open_files, size, i; |
| 648 | struct fdtable *old_fdt, *new_fdt; | 664 | struct fdtable *old_fdt, *new_fdt; |
| 649 | 665 | ||
| 650 | *errorp = -ENOMEM; | 666 | *errorp = -ENOMEM; |
| @@ -655,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 655 | spin_lock(&oldf->file_lock); | 671 | spin_lock(&oldf->file_lock); |
| 656 | old_fdt = files_fdtable(oldf); | 672 | old_fdt = files_fdtable(oldf); |
| 657 | new_fdt = files_fdtable(newf); | 673 | new_fdt = files_fdtable(newf); |
| 658 | size = old_fdt->max_fdset; | ||
| 659 | open_files = count_open_files(old_fdt); | 674 | open_files = count_open_files(old_fdt); |
| 660 | expand = 0; | ||
| 661 | 675 | ||
| 662 | /* | 676 | /* |
| 663 | * Check whether we need to allocate a larger fd array or fd set. | 677 | * Check whether we need to allocate a larger fd array and fd set. |
| 664 | * Note: we're not a clone task, so the open count won't change. | 678 | * Note: we're not a clone task, so the open count won't change. |
| 665 | */ | 679 | */ |
| 666 | if (open_files > new_fdt->max_fdset) { | ||
| 667 | new_fdt->max_fdset = 0; | ||
| 668 | expand = 1; | ||
| 669 | } | ||
| 670 | if (open_files > new_fdt->max_fds) { | 680 | if (open_files > new_fdt->max_fds) { |
| 671 | new_fdt->max_fds = 0; | 681 | new_fdt->max_fds = 0; |
| 672 | expand = 1; | ||
| 673 | } | ||
| 674 | |||
| 675 | /* if the old fdset gets grown now, we'll only copy up to "size" fds */ | ||
| 676 | if (expand) { | ||
| 677 | spin_unlock(&oldf->file_lock); | 682 | spin_unlock(&oldf->file_lock); |
| 678 | spin_lock(&newf->file_lock); | 683 | spin_lock(&newf->file_lock); |
| 679 | *errorp = expand_files(newf, open_files-1); | 684 | *errorp = expand_files(newf, open_files-1); |
| @@ -693,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 693 | old_fds = old_fdt->fd; | 698 | old_fds = old_fdt->fd; |
| 694 | new_fds = new_fdt->fd; | 699 | new_fds = new_fdt->fd; |
| 695 | 700 | ||
| 696 | memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); | 701 | memcpy(new_fdt->open_fds->fds_bits, |
| 697 | memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); | 702 | old_fdt->open_fds->fds_bits, open_files/8); |
| 703 | memcpy(new_fdt->close_on_exec->fds_bits, | ||
| 704 | old_fdt->close_on_exec->fds_bits, open_files/8); | ||
| 698 | 705 | ||
| 699 | for (i = open_files; i != 0; i--) { | 706 | for (i = open_files; i != 0; i--) { |
| 700 | struct file *f = *old_fds++; | 707 | struct file *f = *old_fds++; |
| @@ -719,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) | |||
| 719 | /* This is long word aligned thus could use an optimized version */ | 726 | /* This is long word aligned thus could use an optimized version */ |
| 720 | memset(new_fds, 0, size); | 727 | memset(new_fds, 0, size); |
| 721 | 728 | ||
| 722 | if (new_fdt->max_fdset > open_files) { | 729 | if (new_fdt->max_fds > open_files) { |
| 723 | int left = (new_fdt->max_fdset-open_files)/8; | 730 | int left = (new_fdt->max_fds-open_files)/8; |
| 724 | int start = open_files / (8 * sizeof(unsigned long)); | 731 | int start = open_files / (8 * sizeof(unsigned long)); |
| 725 | 732 | ||
| 726 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); | 733 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); |
| 727 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); | 734 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); |
| 728 | } | 735 | } |
| 729 | 736 | ||
| 730 | out: | ||
| 731 | return newf; | 737 | return newf; |
| 732 | 738 | ||
| 733 | out_release: | 739 | out_release: |
| 734 | free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); | ||
| 735 | free_fdset (new_fdt->open_fds, new_fdt->max_fdset); | ||
| 736 | free_fd_array(new_fdt->fd, new_fdt->max_fds); | ||
| 737 | kmem_cache_free(files_cachep, newf); | 740 | kmem_cache_free(files_cachep, newf); |
| 741 | out: | ||
| 738 | return NULL; | 742 | return NULL; |
| 739 | } | 743 | } |
| 740 | 744 | ||
| @@ -830,7 +834,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
| 830 | if (clone_flags & CLONE_THREAD) { | 834 | if (clone_flags & CLONE_THREAD) { |
| 831 | atomic_inc(¤t->signal->count); | 835 | atomic_inc(¤t->signal->count); |
| 832 | atomic_inc(¤t->signal->live); | 836 | atomic_inc(¤t->signal->live); |
| 833 | taskstats_tgid_alloc(current); | ||
| 834 | return 0; | 837 | return 0; |
| 835 | } | 838 | } |
| 836 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); | 839 | sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); |
| @@ -1039,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1039 | p->wchar = 0; /* I/O counter: bytes written */ | 1042 | p->wchar = 0; /* I/O counter: bytes written */ |
| 1040 | p->syscr = 0; /* I/O counter: read syscalls */ | 1043 | p->syscr = 0; /* I/O counter: read syscalls */ |
| 1041 | p->syscw = 0; /* I/O counter: write syscalls */ | 1044 | p->syscw = 0; /* I/O counter: write syscalls */ |
| 1045 | task_io_accounting_init(p); | ||
| 1042 | acct_clear_integrals(p); | 1046 | acct_clear_integrals(p); |
| 1043 | 1047 | ||
| 1044 | p->it_virt_expires = cputime_zero; | 1048 | p->it_virt_expires = cputime_zero; |
| @@ -1243,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1243 | if (thread_group_leader(p)) { | 1247 | if (thread_group_leader(p)) { |
| 1244 | p->signal->tty = current->signal->tty; | 1248 | p->signal->tty = current->signal->tty; |
| 1245 | p->signal->pgrp = process_group(current); | 1249 | p->signal->pgrp = process_group(current); |
| 1246 | p->signal->session = current->signal->session; | 1250 | set_signal_session(p->signal, process_session(current)); |
| 1247 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | 1251 | attach_pid(p, PIDTYPE_PGID, process_group(p)); |
| 1248 | attach_pid(p, PIDTYPE_SID, p->signal->session); | 1252 | attach_pid(p, PIDTYPE_SID, process_session(p)); |
| 1249 | 1253 | ||
| 1250 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1254 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
| 1251 | __get_cpu_var(process_counts)++; | 1255 | __get_cpu_var(process_counts)++; |
| @@ -1303,7 +1307,7 @@ fork_out: | |||
| 1303 | return ERR_PTR(retval); | 1307 | return ERR_PTR(retval); |
| 1304 | } | 1308 | } |
| 1305 | 1309 | ||
| 1306 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | 1310 | noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) |
| 1307 | { | 1311 | { |
| 1308 | memset(regs, 0, sizeof(struct pt_regs)); | 1312 | memset(regs, 0, sizeof(struct pt_regs)); |
| 1309 | return regs; | 1313 | return regs; |
| @@ -1413,7 +1417,7 @@ long do_fork(unsigned long clone_flags, | |||
| 1413 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 | 1417 | #define ARCH_MIN_MMSTRUCT_ALIGN 0 |
| 1414 | #endif | 1418 | #endif |
| 1415 | 1419 | ||
| 1416 | static void sighand_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) | 1420 | static void sighand_ctor(void *data, struct kmem_cache *cachep, unsigned long flags) |
| 1417 | { | 1421 | { |
| 1418 | struct sighand_struct *sighand = data; | 1422 | struct sighand_struct *sighand = data; |
| 1419 | 1423 | ||
| @@ -1509,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) | |||
| 1509 | } | 1513 | } |
| 1510 | 1514 | ||
| 1511 | /* | 1515 | /* |
| 1512 | * Unshare the namespace structure if it is being shared | 1516 | * Unshare the mnt_namespace structure if it is being shared |
| 1513 | */ | 1517 | */ |
| 1514 | static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) | 1518 | static int unshare_mnt_namespace(unsigned long unshare_flags, |
| 1519 | struct mnt_namespace **new_nsp, struct fs_struct *new_fs) | ||
| 1515 | { | 1520 | { |
| 1516 | struct namespace *ns = current->nsproxy->namespace; | 1521 | struct mnt_namespace *ns = current->nsproxy->mnt_ns; |
| 1517 | 1522 | ||
| 1518 | if ((unshare_flags & CLONE_NEWNS) && ns) { | 1523 | if ((unshare_flags & CLONE_NEWNS) && ns) { |
| 1519 | if (!capable(CAP_SYS_ADMIN)) | 1524 | if (!capable(CAP_SYS_ADMIN)) |
| 1520 | return -EPERM; | 1525 | return -EPERM; |
| 1521 | 1526 | ||
| 1522 | *new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs); | 1527 | *new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs); |
| 1523 | if (!*new_nsp) | 1528 | if (!*new_nsp) |
| 1524 | return -ENOMEM; | 1529 | return -ENOMEM; |
| 1525 | } | 1530 | } |
| @@ -1528,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new | |||
| 1528 | } | 1533 | } |
| 1529 | 1534 | ||
| 1530 | /* | 1535 | /* |
| 1531 | * Unsharing of sighand for tasks created with CLONE_SIGHAND is not | 1536 | * Unsharing of sighand is not supported yet |
| 1532 | * supported yet | ||
| 1533 | */ | 1537 | */ |
| 1534 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) | 1538 | static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp) |
| 1535 | { | 1539 | { |
| 1536 | struct sighand_struct *sigh = current->sighand; | 1540 | struct sighand_struct *sigh = current->sighand; |
| 1537 | 1541 | ||
| 1538 | if ((unshare_flags & CLONE_SIGHAND) && | 1542 | if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1) |
| 1539 | (sigh && atomic_read(&sigh->count) > 1)) | ||
| 1540 | return -EINVAL; | 1543 | return -EINVAL; |
| 1541 | else | 1544 | else |
| 1542 | return 0; | 1545 | return 0; |
| @@ -1609,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1609 | { | 1612 | { |
| 1610 | int err = 0; | 1613 | int err = 0; |
| 1611 | struct fs_struct *fs, *new_fs = NULL; | 1614 | struct fs_struct *fs, *new_fs = NULL; |
| 1612 | struct namespace *ns, *new_ns = NULL; | 1615 | struct mnt_namespace *ns, *new_ns = NULL; |
| 1613 | struct sighand_struct *sigh, *new_sigh = NULL; | 1616 | struct sighand_struct *new_sigh = NULL; |
| 1614 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; | 1617 | struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; |
| 1615 | struct files_struct *fd, *new_fd = NULL; | 1618 | struct files_struct *fd, *new_fd = NULL; |
| 1616 | struct sem_undo_list *new_ulist = NULL; | 1619 | struct sem_undo_list *new_ulist = NULL; |
| @@ -1631,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1631 | goto bad_unshare_out; | 1634 | goto bad_unshare_out; |
| 1632 | if ((err = unshare_fs(unshare_flags, &new_fs))) | 1635 | if ((err = unshare_fs(unshare_flags, &new_fs))) |
| 1633 | goto bad_unshare_cleanup_thread; | 1636 | goto bad_unshare_cleanup_thread; |
| 1634 | if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs))) | 1637 | if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs))) |
| 1635 | goto bad_unshare_cleanup_fs; | 1638 | goto bad_unshare_cleanup_fs; |
| 1636 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) | 1639 | if ((err = unshare_sighand(unshare_flags, &new_sigh))) |
| 1637 | goto bad_unshare_cleanup_ns; | 1640 | goto bad_unshare_cleanup_ns; |
| @@ -1655,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1655 | } | 1658 | } |
| 1656 | } | 1659 | } |
| 1657 | 1660 | ||
| 1658 | if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || | 1661 | if (new_fs || new_ns || new_mm || new_fd || new_ulist || |
| 1659 | new_uts || new_ipc) { | 1662 | new_uts || new_ipc) { |
| 1660 | 1663 | ||
| 1661 | task_lock(current); | 1664 | task_lock(current); |
| @@ -1672,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) | |||
| 1672 | } | 1675 | } |
| 1673 | 1676 | ||
| 1674 | if (new_ns) { | 1677 | if (new_ns) { |
| 1675 | ns = current->nsproxy->namespace; | 1678 | ns = current->nsproxy->mnt_ns; |
| 1676 | current->nsproxy->namespace = new_ns; | 1679 | current->nsproxy->mnt_ns = new_ns; |
| 1677 | new_ns = ns; | 1680 | new_ns = ns; |
| 1678 | } | 1681 | } |
| 1679 | 1682 | ||
| 1680 | if (new_sigh) { | ||
| 1681 | sigh = current->sighand; | ||
| 1682 | rcu_assign_pointer(current->sighand, new_sigh); | ||
| 1683 | new_sigh = sigh; | ||
| 1684 | } | ||
| 1685 | |||
| 1686 | if (new_mm) { | 1683 | if (new_mm) { |
| 1687 | mm = current->mm; | 1684 | mm = current->mm; |
| 1688 | active_mm = current->active_mm; | 1685 | active_mm = current->active_mm; |
| @@ -1740,7 +1737,7 @@ bad_unshare_cleanup_sigh: | |||
| 1740 | 1737 | ||
| 1741 | bad_unshare_cleanup_ns: | 1738 | bad_unshare_cleanup_ns: |
| 1742 | if (new_ns) | 1739 | if (new_ns) |
| 1743 | put_namespace(new_ns); | 1740 | put_mnt_ns(new_ns); |
| 1744 | 1741 | ||
| 1745 | bad_unshare_cleanup_fs: | 1742 | bad_unshare_cleanup_fs: |
| 1746 | if (new_fs) | 1743 | if (new_fs) |
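The kernel/fork.c hunks above fold several cleanups into one file: kmem_cache_t gives way to struct kmem_cache, the fdtable loses its separate max_fdset bookkeeping, and the mount-namespace type is renamed from struct namespace to struct mnt_namespace, with sys_unshare() rewired accordingly. As a reminder of what that last path serves, here is a minimal user-space sketch, not from the patch itself, that drives unshare_mnt_namespace() through the unshare() system call; it assumes glibc's unshare() wrapper and needs CAP_SYS_ADMIN, matching the capable() check in the hunk.

```c
/*
 * Minimal user-space sketch, not from the patch: exercises the
 * CLONE_NEWNS path that sys_unshare() -> unshare_mnt_namespace()
 * implements above. Requires CAP_SYS_ADMIN; without it unshare()
 * fails with EPERM, matching the capable() check in the hunk.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	if (unshare(CLONE_NEWNS) == -1) {
		perror("unshare(CLONE_NEWNS)");
		return 1;
	}
	/* Mount changes made from here on stay in this process's private
	 * copy of the mount namespace (dup_mnt_ns() in the hunk above). */
	printf("now in a private mount namespace\n");
	return 0;
}
```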
diff --git a/kernel/futex.c b/kernel/futex.c index 93ef30ba209f..5a737de857d3 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
| 166 | /* | 166 | /* |
| 167 | * Get parameters which are the keys for a futex. | 167 | * Get parameters which are the keys for a futex. |
| 168 | * | 168 | * |
| 169 | * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, | 169 | * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode, |
| 170 | * offset_within_page). For private mappings, it's (uaddr, current->mm). | 170 | * offset_within_page). For private mappings, it's (uaddr, current->mm). |
| 171 | * We can usually work out the index without swapping in the page. | 171 | * We can usually work out the index without swapping in the page. |
| 172 | * | 172 | * |
| @@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key) | |||
| 223 | /* | 223 | /* |
| 224 | * Linear file mappings are also simple. | 224 | * Linear file mappings are also simple. |
| 225 | */ | 225 | */ |
| 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_path.dentry->d_inode; |
| 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
| 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
| 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
| @@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from) | |||
| 282 | { | 282 | { |
| 283 | int ret; | 283 | int ret; |
| 284 | 284 | ||
| 285 | inc_preempt_count(); | 285 | pagefault_disable(); |
| 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
| 287 | dec_preempt_count(); | 287 | pagefault_enable(); |
| 288 | 288 | ||
| 289 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; |
| 290 | } | 290 | } |
| @@ -324,12 +324,11 @@ static int refill_pi_state_cache(void) | |||
| 324 | if (likely(current->pi_state_cache)) | 324 | if (likely(current->pi_state_cache)) |
| 325 | return 0; | 325 | return 0; |
| 326 | 326 | ||
| 327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | 327 | pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); |
| 328 | 328 | ||
| 329 | if (!pi_state) | 329 | if (!pi_state) |
| 330 | return -ENOMEM; | 330 | return -ENOMEM; |
| 331 | 331 | ||
| 332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
| 333 | INIT_LIST_HEAD(&pi_state->list); | 332 | INIT_LIST_HEAD(&pi_state->list); |
| 334 | /* pi_mutex gets initialized later */ | 333 | /* pi_mutex gets initialized later */ |
| 335 | pi_state->owner = NULL; | 334 | pi_state->owner = NULL; |
| @@ -553,7 +552,7 @@ static void wake_futex(struct futex_q *q) | |||
| 553 | * at the end of wake_up_all() does not prevent this store from | 552 | * at the end of wake_up_all() does not prevent this store from |
| 554 | * moving. | 553 | * moving. |
| 555 | */ | 554 | */ |
| 556 | wmb(); | 555 | smp_wmb(); |
| 557 | q->lock_ptr = NULL; | 556 | q->lock_ptr = NULL; |
| 558 | } | 557 | } |
| 559 | 558 | ||
| @@ -585,9 +584,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
| 585 | if (!(uval & FUTEX_OWNER_DIED)) { | 584 | if (!(uval & FUTEX_OWNER_DIED)) { |
| 586 | newval = FUTEX_WAITERS | new_owner->pid; | 585 | newval = FUTEX_WAITERS | new_owner->pid; |
| 587 | 586 | ||
| 588 | inc_preempt_count(); | 587 | pagefault_disable(); |
| 589 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 588 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); |
| 590 | dec_preempt_count(); | 589 | pagefault_enable(); |
| 591 | if (curval == -EFAULT) | 590 | if (curval == -EFAULT) |
| 592 | return -EFAULT; | 591 | return -EFAULT; |
| 593 | if (curval != uval) | 592 | if (curval != uval) |
| @@ -618,9 +617,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | |||
| 618 | * There is no waiter, so we unlock the futex. The owner died | 617 | * There is no waiter, so we unlock the futex. The owner died |
| 619 | * bit has not to be preserved here. We are the owner: | 618 | * bit has not to be preserved here. We are the owner: |
| 620 | */ | 619 | */ |
| 621 | inc_preempt_count(); | 620 | pagefault_disable(); |
| 622 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | 621 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); |
| 623 | dec_preempt_count(); | 622 | pagefault_enable(); |
| 624 | 623 | ||
| 625 | if (oldval == -EFAULT) | 624 | if (oldval == -EFAULT) |
| 626 | return oldval; | 625 | return oldval; |
| @@ -1158,9 +1157,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | |||
| 1158 | */ | 1157 | */ |
| 1159 | newval = current->pid; | 1158 | newval = current->pid; |
| 1160 | 1159 | ||
| 1161 | inc_preempt_count(); | 1160 | pagefault_disable(); |
| 1162 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | 1161 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); |
| 1163 | dec_preempt_count(); | 1162 | pagefault_enable(); |
| 1164 | 1163 | ||
| 1165 | if (unlikely(curval == -EFAULT)) | 1164 | if (unlikely(curval == -EFAULT)) |
| 1166 | goto uaddr_faulted; | 1165 | goto uaddr_faulted; |
| @@ -1183,9 +1182,9 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | |||
| 1183 | uval = curval; | 1182 | uval = curval; |
| 1184 | newval = uval | FUTEX_WAITERS; | 1183 | newval = uval | FUTEX_WAITERS; |
| 1185 | 1184 | ||
| 1186 | inc_preempt_count(); | 1185 | pagefault_disable(); |
| 1187 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | 1186 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); |
| 1188 | dec_preempt_count(); | 1187 | pagefault_enable(); |
| 1189 | 1188 | ||
| 1190 | if (unlikely(curval == -EFAULT)) | 1189 | if (unlikely(curval == -EFAULT)) |
| 1191 | goto uaddr_faulted; | 1190 | goto uaddr_faulted; |
| @@ -1215,10 +1214,10 @@ static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | |||
| 1215 | newval = current->pid | | 1214 | newval = current->pid | |
| 1216 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | 1215 | FUTEX_OWNER_DIED | FUTEX_WAITERS; |
| 1217 | 1216 | ||
| 1218 | inc_preempt_count(); | 1217 | pagefault_disable(); |
| 1219 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | 1218 | curval = futex_atomic_cmpxchg_inatomic(uaddr, |
| 1220 | uval, newval); | 1219 | uval, newval); |
| 1221 | dec_preempt_count(); | 1220 | pagefault_enable(); |
| 1222 | 1221 | ||
| 1223 | if (unlikely(curval == -EFAULT)) | 1222 | if (unlikely(curval == -EFAULT)) |
| 1224 | goto uaddr_faulted; | 1223 | goto uaddr_faulted; |
| @@ -1390,9 +1389,9 @@ retry_locked: | |||
| 1390 | * anyone else up: | 1389 | * anyone else up: |
| 1391 | */ | 1390 | */ |
| 1392 | if (!(uval & FUTEX_OWNER_DIED)) { | 1391 | if (!(uval & FUTEX_OWNER_DIED)) { |
| 1393 | inc_preempt_count(); | 1392 | pagefault_disable(); |
| 1394 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | 1393 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); |
| 1395 | dec_preempt_count(); | 1394 | pagefault_enable(); |
| 1396 | } | 1395 | } |
| 1397 | 1396 | ||
| 1398 | if (unlikely(uval == -EFAULT)) | 1397 | if (unlikely(uval == -EFAULT)) |
| @@ -1493,7 +1492,7 @@ static unsigned int futex_poll(struct file *filp, | |||
| 1493 | return ret; | 1492 | return ret; |
| 1494 | } | 1493 | } |
| 1495 | 1494 | ||
| 1496 | static struct file_operations futex_fops = { | 1495 | static const struct file_operations futex_fops = { |
| 1497 | .release = futex_close, | 1496 | .release = futex_close, |
| 1498 | .poll = futex_poll, | 1497 | .poll = futex_poll, |
| 1499 | }; | 1498 | }; |
| @@ -1529,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal) | |||
| 1529 | goto out; | 1528 | goto out; |
| 1530 | } | 1529 | } |
| 1531 | filp->f_op = &futex_fops; | 1530 | filp->f_op = &futex_fops; |
| 1532 | filp->f_vfsmnt = mntget(futex_mnt); | 1531 | filp->f_path.mnt = mntget(futex_mnt); |
| 1533 | filp->f_dentry = dget(futex_mnt->mnt_root); | 1532 | filp->f_path.dentry = dget(futex_mnt->mnt_root); |
| 1534 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; | 1533 | filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; |
| 1535 | 1534 | ||
| 1536 | if (signal) { | 1535 | if (signal) { |
| 1537 | err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); | 1536 | err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); |
| @@ -1858,10 +1857,16 @@ static struct file_system_type futex_fs_type = { | |||
| 1858 | 1857 | ||
| 1859 | static int __init init(void) | 1858 | static int __init init(void) |
| 1860 | { | 1859 | { |
| 1861 | unsigned int i; | 1860 | int i = register_filesystem(&futex_fs_type); |
| 1861 | |||
| 1862 | if (i) | ||
| 1863 | return i; | ||
| 1862 | 1864 | ||
| 1863 | register_filesystem(&futex_fs_type); | ||
| 1864 | futex_mnt = kern_mount(&futex_fs_type); | 1865 | futex_mnt = kern_mount(&futex_fs_type); |
| 1866 | if (IS_ERR(futex_mnt)) { | ||
| 1867 | unregister_filesystem(&futex_fs_type); | ||
| 1868 | return PTR_ERR(futex_mnt); | ||
| 1869 | } | ||
| 1865 | 1870 | ||
| 1866 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { | 1871 | for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { |
| 1867 | INIT_LIST_HEAD(&futex_queues[i].chain); | 1872 | INIT_LIST_HEAD(&futex_queues[i].chain); |
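Beyond the f_path and kzalloc() conversions, the futex.c hunks replace the bare inc_preempt_count()/dec_preempt_count() pairs around the in-atomic user-space accesses with pagefault_disable()/pagefault_enable(), and make init() unwind cleanly if registering or mounting futexfs fails. For context, here is a small user-space demo of the FUTEX_WAIT/FUTEX_WAKE operations those paths implement; it is a sketch, not from the patch, and assumes the raw futex(2) syscall via syscall(SYS_futex, ...).

```c
/*
 * User-space sketch, not from the patch: a parent/child pair using
 * FUTEX_WAIT and FUTEX_WAKE, the operations the futex.c code above
 * implements. Uses the raw futex syscall; glibc has no wrapper.
 */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

static long futex(int *uaddr, int op, int val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

int main(void)
{
	int *flag = mmap(NULL, sizeof(*flag), PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	*flag = 0;
	if (fork() == 0) {
		while (*flag == 0)		/* recheck: wakeups may be spurious */
			futex(flag, FUTEX_WAIT, 0);	/* sleeps in futex_wait() */
		_exit(0);
	}
	sleep(1);
	*flag = 1;
	futex(flag, FUTEX_WAKE, 1);		/* wake_futex() path above */
	wait(NULL);
	return 0;
}
```

The flag lives in a MAP_SHARED mapping so both processes operate on the same word and the kernel keys the futex the shared, inode-based way described in the get_futex_key() comment above.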
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ebfd24a41858..d27b25855743 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -517,10 +517,9 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, | |||
| 517 | 517 | ||
| 518 | if (!handle) | 518 | if (!handle) |
| 519 | handle = handle_bad_irq; | 519 | handle = handle_bad_irq; |
| 520 | 520 | else if (desc->chip == &no_irq_chip) { | |
| 521 | if (desc->chip == &no_irq_chip) { | ||
| 522 | printk(KERN_WARNING "Trying to install %sinterrupt handler " | 521 | printk(KERN_WARNING "Trying to install %sinterrupt handler " |
| 523 | "for IRQ%d\n", is_chained ? "chained " : " ", irq); | 522 | "for IRQ%d\n", is_chained ? "chained " : "", irq); |
| 524 | /* | 523 | /* |
| 525 | * Some ARM implementations install a handler for really dumb | 524 | * Some ARM implementations install a handler for really dumb |
| 526 | * interrupt hardware without setting an irq_chip. This worked | 525 | * interrupt hardware without setting an irq_chip. This worked |
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a681912bc89a..aff1f0fabb0d 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
| @@ -54,7 +54,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = { | |||
| 54 | .chip = &no_irq_chip, | 54 | .chip = &no_irq_chip, |
| 55 | .handle_irq = handle_bad_irq, | 55 | .handle_irq = handle_bad_irq, |
| 56 | .depth = 1, | 56 | .depth = 1, |
| 57 | .lock = SPIN_LOCK_UNLOCKED, | 57 | .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), |
| 58 | #ifdef CONFIG_SMP | 58 | #ifdef CONFIG_SMP |
| 59 | .affinity = CPU_MASK_ALL | 59 | .affinity = CPU_MASK_ALL |
| 60 | #endif | 60 | #endif |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9a352667007c..61f5c717a8f5 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
| @@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
| 54 | unsigned int irq = (int)(long)data, full_count = count, err; | 54 | unsigned int irq = (int)(long)data, full_count = count, err; |
| 55 | cpumask_t new_value, tmp; | 55 | cpumask_t new_value, tmp; |
| 56 | 56 | ||
| 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity) | 57 | if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || |
| 58 | CHECK_IRQ_PER_CPU(irq_desc[irq].status)) | ||
| 58 | return -EIO; | 59 | return -EIO; |
| 59 | 60 | ||
| 60 | err = cpumask_parse_user(buffer, count, new_value); | 61 | err = cpumask_parse_user(buffer, count, new_value); |
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 543ea2e5ad93..9d8c79b48823 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
| @@ -176,7 +176,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, | |||
| 176 | 176 | ||
| 177 | int noirqdebug __read_mostly; | 177 | int noirqdebug __read_mostly; |
| 178 | 178 | ||
| 179 | int __init noirqdebug_setup(char *str) | 179 | int noirqdebug_setup(char *str) |
| 180 | { | 180 | { |
| 181 | noirqdebug = 1; | 181 | noirqdebug = 1; |
| 182 | printk(KERN_INFO "IRQ lockup detection disabled\n"); | 182 | printk(KERN_INFO "IRQ lockup detection disabled\n"); |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index eeac3e313b2b..6f294ff4f9ee 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
| 21 | #include <linux/sched.h> /* for cond_resched */ | 21 | #include <linux/sched.h> /* for cond_resched */ |
| 22 | #include <linux/mm.h> | 22 | #include <linux/mm.h> |
| 23 | #include <linux/ctype.h> | ||
| 23 | 24 | ||
| 24 | #include <asm/sections.h> | 25 | #include <asm/sections.h> |
| 25 | 26 | ||
| @@ -30,14 +31,14 @@ | |||
| 30 | #endif | 31 | #endif |
| 31 | 32 | ||
| 32 | /* These will be re-linked against their real values during the second link stage */ | 33 | /* These will be re-linked against their real values during the second link stage */ |
| 33 | extern unsigned long kallsyms_addresses[] __attribute__((weak)); | 34 | extern const unsigned long kallsyms_addresses[] __attribute__((weak)); |
| 34 | extern unsigned long kallsyms_num_syms __attribute__((weak,section("data"))); | 35 | extern const unsigned long kallsyms_num_syms __attribute__((weak)); |
| 35 | extern u8 kallsyms_names[] __attribute__((weak)); | 36 | extern const u8 kallsyms_names[] __attribute__((weak)); |
| 36 | 37 | ||
| 37 | extern u8 kallsyms_token_table[] __attribute__((weak)); | 38 | extern const u8 kallsyms_token_table[] __attribute__((weak)); |
| 38 | extern u16 kallsyms_token_index[] __attribute__((weak)); | 39 | extern const u16 kallsyms_token_index[] __attribute__((weak)); |
| 39 | 40 | ||
| 40 | extern unsigned long kallsyms_markers[] __attribute__((weak)); | 41 | extern const unsigned long kallsyms_markers[] __attribute__((weak)); |
| 41 | 42 | ||
| 42 | static inline int is_kernel_inittext(unsigned long addr) | 43 | static inline int is_kernel_inittext(unsigned long addr) |
| 43 | { | 44 | { |
| @@ -83,7 +84,7 @@ static int is_ksym_addr(unsigned long addr) | |||
| 83 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) | 84 | static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) |
| 84 | { | 85 | { |
| 85 | int len, skipped_first = 0; | 86 | int len, skipped_first = 0; |
| 86 | u8 *tptr, *data; | 87 | const u8 *tptr, *data; |
| 87 | 88 | ||
| 88 | /* get the compressed symbol length from the first symbol byte */ | 89 | /* get the compressed symbol length from the first symbol byte */ |
| 89 | data = &kallsyms_names[off]; | 90 | data = &kallsyms_names[off]; |
| @@ -131,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off) | |||
| 131 | * kallsyms array */ | 132 | * kallsyms array */ |
| 132 | static unsigned int get_symbol_offset(unsigned long pos) | 133 | static unsigned int get_symbol_offset(unsigned long pos) |
| 133 | { | 134 | { |
| 134 | u8 *name; | 135 | const u8 *name; |
| 135 | int i; | 136 | int i; |
| 136 | 137 | ||
| 137 | /* use the closest marker we have. We have markers every 256 positions, | 138 | /* use the closest marker we have. We have markers every 256 positions, |
| @@ -301,13 +302,6 @@ struct kallsym_iter | |||
| 301 | char name[KSYM_NAME_LEN+1]; | 302 | char name[KSYM_NAME_LEN+1]; |
| 302 | }; | 303 | }; |
| 303 | 304 | ||
| 304 | /* Only label it "global" if it is exported. */ | ||
| 305 | static void upcase_if_global(struct kallsym_iter *iter) | ||
| 306 | { | ||
| 307 | if (is_exported(iter->name, iter->owner)) | ||
| 308 | iter->type += 'A' - 'a'; | ||
| 309 | } | ||
| 310 | |||
| 311 | static int get_ksymbol_mod(struct kallsym_iter *iter) | 305 | static int get_ksymbol_mod(struct kallsym_iter *iter) |
| 312 | { | 306 | { |
| 313 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, | 307 | iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, |
| @@ -316,7 +310,10 @@ static int get_ksymbol_mod(struct kallsym_iter *iter) | |||
| 316 | if (iter->owner == NULL) | 310 | if (iter->owner == NULL) |
| 317 | return 0; | 311 | return 0; |
| 318 | 312 | ||
| 319 | upcase_if_global(iter); | 313 | /* Label it "global" if it is exported, "local" if not exported. */ |
| 314 | iter->type = is_exported(iter->name, iter->owner) | ||
| 315 | ? toupper(iter->type) : tolower(iter->type); | ||
| 316 | |||
| 320 | return 1; | 317 | return 1; |
| 321 | } | 318 | } |
| 322 | 319 | ||
| @@ -401,7 +398,7 @@ static int s_show(struct seq_file *m, void *p) | |||
| 401 | return 0; | 398 | return 0; |
| 402 | } | 399 | } |
| 403 | 400 | ||
| 404 | static struct seq_operations kallsyms_op = { | 401 | static const struct seq_operations kallsyms_op = { |
| 405 | .start = s_start, | 402 | .start = s_start, |
| 406 | .next = s_next, | 403 | .next = s_next, |
| 407 | .stop = s_stop, | 404 | .stop = s_stop, |
| @@ -436,7 +433,7 @@ static int kallsyms_release(struct inode *inode, struct file *file) | |||
| 436 | return seq_release(inode, file); | 433 | return seq_release(inode, file); |
| 437 | } | 434 | } |
| 438 | 435 | ||
| 439 | static struct file_operations kallsyms_operations = { | 436 | static const struct file_operations kallsyms_operations = { |
| 440 | .open = kallsyms_open, | 437 | .open = kallsyms_open, |
| 441 | .read = seq_read, | 438 | .read = seq_read, |
| 442 | .llseek = seq_lseek, | 439 | .llseek = seq_lseek, |
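The kallsyms.c hunks constify the generated symbol tables and replace upcase_if_global() with an explicit toupper()/tolower() pair, so module symbols are now actively lower-cased when they are not exported rather than left untouched. The result is what /proc/kallsyms, served by the seq_file operations above, reports as the symbol type; the short user-space reader below, offered only as an illustration, tallies the two cases.

```c
/*
 * Illustrative user-space reader, not part of the patch: counts how many
 * /proc/kallsyms entries carry an upper-case (exported, "global") type
 * versus a lower-case (unexported, "local") one.
 */
#include <ctype.h>
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/kallsyms", "r");
	unsigned long long addr;
	char type, name[256];
	long exported = 0, local = 0;

	if (!f) {
		perror("/proc/kallsyms");
		return 1;
	}
	while (fscanf(f, "%llx %c %255s%*[^\n]", &addr, &type, name) == 3) {
		if (isupper((unsigned char)type))
			exported++;
		else
			local++;
	}
	printf("exported: %ld, local: %ld\n", exported, local);
	fclose(f);
	return 0;
}
```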
diff --git a/kernel/kexec.c b/kernel/kexec.c index fcdd5d2bc3f4..2a59c8a01ae0 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -20,6 +20,8 @@ | |||
| 20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
| 21 | #include <linux/ioport.h> | 21 | #include <linux/ioport.h> |
| 22 | #include <linux/hardirq.h> | 22 | #include <linux/hardirq.h> |
| 23 | #include <linux/elf.h> | ||
| 24 | #include <linux/elfcore.h> | ||
| 23 | 25 | ||
| 24 | #include <asm/page.h> | 26 | #include <asm/page.h> |
| 25 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
| @@ -108,11 +110,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, | |||
| 108 | 110 | ||
| 109 | /* Allocate a controlling structure */ | 111 | /* Allocate a controlling structure */ |
| 110 | result = -ENOMEM; | 112 | result = -ENOMEM; |
| 111 | image = kmalloc(sizeof(*image), GFP_KERNEL); | 113 | image = kzalloc(sizeof(*image), GFP_KERNEL); |
| 112 | if (!image) | 114 | if (!image) |
| 113 | goto out; | 115 | goto out; |
| 114 | 116 | ||
| 115 | memset(image, 0, sizeof(*image)); | ||
| 116 | image->head = 0; | 117 | image->head = 0; |
| 117 | image->entry = &image->head; | 118 | image->entry = &image->head; |
| 118 | image->last_entry = &image->head; | 119 | image->last_entry = &image->head; |
| @@ -851,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image, | |||
| 851 | memset(ptr + uchunk, 0, mchunk - uchunk); | 852 | memset(ptr + uchunk, 0, mchunk - uchunk); |
| 852 | } | 853 | } |
| 853 | result = copy_from_user(ptr, buf, uchunk); | 854 | result = copy_from_user(ptr, buf, uchunk); |
| 855 | kexec_flush_icache_page(page); | ||
| 854 | kunmap(page); | 856 | kunmap(page); |
| 855 | if (result) { | 857 | if (result) { |
| 856 | result = (result < 0) ? result : -EIO; | 858 | result = (result < 0) ? result : -EIO; |
| @@ -1067,6 +1069,60 @@ void crash_kexec(struct pt_regs *regs) | |||
| 1067 | } | 1069 | } |
| 1068 | } | 1070 | } |
| 1069 | 1071 | ||
| 1072 | static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, | ||
| 1073 | size_t data_len) | ||
| 1074 | { | ||
| 1075 | struct elf_note note; | ||
| 1076 | |||
| 1077 | note.n_namesz = strlen(name) + 1; | ||
| 1078 | note.n_descsz = data_len; | ||
| 1079 | note.n_type = type; | ||
| 1080 | memcpy(buf, ¬e, sizeof(note)); | ||
| 1081 | buf += (sizeof(note) + 3)/4; | ||
| 1082 | memcpy(buf, name, note.n_namesz); | ||
| 1083 | buf += (note.n_namesz + 3)/4; | ||
| 1084 | memcpy(buf, data, note.n_descsz); | ||
| 1085 | buf += (note.n_descsz + 3)/4; | ||
| 1086 | |||
| 1087 | return buf; | ||
| 1088 | } | ||
| 1089 | |||
| 1090 | static void final_note(u32 *buf) | ||
| 1091 | { | ||
| 1092 | struct elf_note note; | ||
| 1093 | |||
| 1094 | note.n_namesz = 0; | ||
| 1095 | note.n_descsz = 0; | ||
| 1096 | note.n_type = 0; | ||
| 1097 | memcpy(buf, ¬e, sizeof(note)); | ||
| 1098 | } | ||
| 1099 | |||
| 1100 | void crash_save_cpu(struct pt_regs *regs, int cpu) | ||
| 1101 | { | ||
| 1102 | struct elf_prstatus prstatus; | ||
| 1103 | u32 *buf; | ||
| 1104 | |||
| 1105 | if ((cpu < 0) || (cpu >= NR_CPUS)) | ||
| 1106 | return; | ||
| 1107 | |||
| 1108 | /* Using ELF notes here is opportunistic. | ||
| 1109 | * I need a well defined structure format | ||
| 1110 | * for the data I pass, and I need tags | ||
| 1111 | * on the data to indicate what information I have | ||
| 1112 | * squirrelled away. ELF notes happen to provide | ||
| 1113 | * all of that, so there is no need to invent something new. | ||
| 1114 | */ | ||
| 1115 | buf = (u32*)per_cpu_ptr(crash_notes, cpu); | ||
| 1116 | if (!buf) | ||
| 1117 | return; | ||
| 1118 | memset(&prstatus, 0, sizeof(prstatus)); | ||
| 1119 | prstatus.pr_pid = current->pid; | ||
| 1120 | elf_core_copy_regs(&prstatus.pr_reg, regs); | ||
| 1121 | buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, | ||
| 1122 | sizeof(prstatus)); | ||
| 1123 | final_note(buf); | ||
| 1124 | } | ||
| 1125 | |||
| 1070 | static int __init crash_notes_memory_init(void) | 1126 | static int __init crash_notes_memory_init(void) |
| 1071 | { | 1127 | { |
| 1072 | /* Allocate memory for saving cpu registers. */ | 1128 | /* Allocate memory for saving cpu registers. */ |
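The new append_elf_note(), final_note() and crash_save_cpu() helpers in kexec.c serialize each CPU's register state into the per-CPU crash_notes buffer as ordinary ELF notes, advancing a u32 pointer, which is why every component is rounded up to a whole number of 4-byte words. The following user-space sketch reproduces that size arithmetic; it is illustrative only, and the 336-byte payload is just an example length in the ballpark of an x86-64 elf_prstatus.

```c
/*
 * User-space sketch of append_elf_note()'s size arithmetic (illustrative
 * only; the 336-byte payload is an example length). Each piece is rounded
 * up to whole 32-bit words because the kernel code advances a u32 *buf.
 */
#include <elf.h>
#include <stdio.h>
#include <string.h>

static size_t note_words(const char *name, size_t data_len)
{
	size_t namesz = strlen(name) + 1;		/* NUL included */

	return (sizeof(Elf64_Nhdr) + 3) / 4		/* header       */
	     + (namesz + 3) / 4				/* name, padded */
	     + (data_len + 3) / 4;			/* desc, padded */
}

int main(void)
{
	printf("\"CORE\"/NT_PRSTATUS note: %zu words (%zu bytes)\n",
	       note_words("CORE", 336), 4 * note_words("CORE", 336));
	return 0;
}
```

final_note() then terminates the buffer with an all-zero note header, which is how consumers such as the vmcore code recognise the end of the list.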
diff --git a/kernel/kmod.c b/kernel/kmod.c index 2b76dee28496..3a7379aa31ca 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
| @@ -25,7 +25,7 @@ | |||
| 25 | #include <linux/kmod.h> | 25 | #include <linux/kmod.h> |
| 26 | #include <linux/smp_lock.h> | 26 | #include <linux/smp_lock.h> |
| 27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
| 28 | #include <linux/namespace.h> | 28 | #include <linux/mnt_namespace.h> |
| 29 | #include <linux/completion.h> | 29 | #include <linux/completion.h> |
| 30 | #include <linux/file.h> | 30 | #include <linux/file.h> |
| 31 | #include <linux/workqueue.h> | 31 | #include <linux/workqueue.h> |
| @@ -114,6 +114,7 @@ EXPORT_SYMBOL(request_module); | |||
| 114 | #endif /* CONFIG_KMOD */ | 114 | #endif /* CONFIG_KMOD */ |
| 115 | 115 | ||
| 116 | struct subprocess_info { | 116 | struct subprocess_info { |
| 117 | struct work_struct work; | ||
| 117 | struct completion *complete; | 118 | struct completion *complete; |
| 118 | char *path; | 119 | char *path; |
| 119 | char **argv; | 120 | char **argv; |
| @@ -221,9 +222,10 @@ static int wait_for_helper(void *data) | |||
| 221 | } | 222 | } |
| 222 | 223 | ||
| 223 | /* This is run by khelper thread */ | 224 | /* This is run by khelper thread */ |
| 224 | static void __call_usermodehelper(void *data) | 225 | static void __call_usermodehelper(struct work_struct *work) |
| 225 | { | 226 | { |
| 226 | struct subprocess_info *sub_info = data; | 227 | struct subprocess_info *sub_info = |
| 228 | container_of(work, struct subprocess_info, work); | ||
| 227 | pid_t pid; | 229 | pid_t pid; |
| 228 | int wait = sub_info->wait; | 230 | int wait = sub_info->wait; |
| 229 | 231 | ||
| @@ -264,6 +266,8 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
| 264 | { | 266 | { |
| 265 | DECLARE_COMPLETION_ONSTACK(done); | 267 | DECLARE_COMPLETION_ONSTACK(done); |
| 266 | struct subprocess_info sub_info = { | 268 | struct subprocess_info sub_info = { |
| 269 | .work = __WORK_INITIALIZER(sub_info.work, | ||
| 270 | __call_usermodehelper), | ||
| 267 | .complete = &done, | 271 | .complete = &done, |
| 268 | .path = path, | 272 | .path = path, |
| 269 | .argv = argv, | 273 | .argv = argv, |
| @@ -272,7 +276,6 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
| 272 | .wait = wait, | 276 | .wait = wait, |
| 273 | .retval = 0, | 277 | .retval = 0, |
| 274 | }; | 278 | }; |
| 275 | DECLARE_WORK(work, __call_usermodehelper, &sub_info); | ||
| 276 | 279 | ||
| 277 | if (!khelper_wq) | 280 | if (!khelper_wq) |
| 278 | return -EBUSY; | 281 | return -EBUSY; |
| @@ -280,7 +283,7 @@ int call_usermodehelper_keys(char *path, char **argv, char **envp, | |||
| 280 | if (path[0] == '\0') | 283 | if (path[0] == '\0') |
| 281 | return 0; | 284 | return 0; |
| 282 | 285 | ||
| 283 | queue_work(khelper_wq, &work); | 286 | queue_work(khelper_wq, &sub_info.work); |
| 284 | wait_for_completion(&done); | 287 | wait_for_completion(&done); |
| 285 | return sub_info.retval; | 288 | return sub_info.retval; |
| 286 | } | 289 | } |
| @@ -291,6 +294,8 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, | |||
| 291 | { | 294 | { |
| 292 | DECLARE_COMPLETION(done); | 295 | DECLARE_COMPLETION(done); |
| 293 | struct subprocess_info sub_info = { | 296 | struct subprocess_info sub_info = { |
| 297 | .work = __WORK_INITIALIZER(sub_info.work, | ||
| 298 | __call_usermodehelper), | ||
| 294 | .complete = &done, | 299 | .complete = &done, |
| 295 | .path = path, | 300 | .path = path, |
| 296 | .argv = argv, | 301 | .argv = argv, |
| @@ -298,7 +303,6 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, | |||
| 298 | .retval = 0, | 303 | .retval = 0, |
| 299 | }; | 304 | }; |
| 300 | struct file *f; | 305 | struct file *f; |
| 301 | DECLARE_WORK(work, __call_usermodehelper, &sub_info); | ||
| 302 | 306 | ||
| 303 | if (!khelper_wq) | 307 | if (!khelper_wq) |
| 304 | return -EBUSY; | 308 | return -EBUSY; |
| @@ -318,7 +322,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp, | |||
| 318 | } | 322 | } |
| 319 | sub_info.stdin = f; | 323 | sub_info.stdin = f; |
| 320 | 324 | ||
| 321 | queue_work(khelper_wq, &work); | 325 | queue_work(khelper_wq, &sub_info.work); |
| 322 | wait_for_completion(&done); | 326 | wait_for_completion(&done); |
| 323 | return sub_info.retval; | 327 | return sub_info.retval; |
| 324 | } | 328 | } |
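The kmod.c conversion (and the kthread.c one further down) follows the then-new workqueue API: the work_struct is embedded in the structure it operates on, and the handler recovers its container with container_of() instead of receiving a void *data cookie. Below is a stand-alone plain-C re-creation of that idiom, with a stub work_struct standing in for the kernel type.

```c
/*
 * Stand-alone illustration (not kernel code) of the pattern the hunks
 * above switch to: embed the work item in its owning structure and use
 * container_of() in the callback instead of a void *data cookie.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct {			/* stand-in for the kernel's type */
	void (*func)(struct work_struct *work);
};

struct subprocess_info {
	struct work_struct work;	/* embedded, as in kmod.c */
	const char *path;
};

static void call_usermodehelper_cb(struct work_struct *work)
{
	struct subprocess_info *info =
		container_of(work, struct subprocess_info, work);

	printf("would exec %s\n", info->path);
}

int main(void)
{
	struct subprocess_info info = {
		.work = { .func = call_usermodehelper_cb },
		.path = "/sbin/modprobe",
	};

	info.work.func(&info.work);	/* a workqueue thread does this */
	return 0;
}
```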
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 610c837ad9e0..17ec4afb0994 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
| 39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
| 40 | #include <linux/kallsyms.h> | 40 | #include <linux/kallsyms.h> |
| 41 | #include <linux/freezer.h> | ||
| 41 | #include <asm-generic/sections.h> | 42 | #include <asm-generic/sections.h> |
| 42 | #include <asm/cacheflush.h> | 43 | #include <asm/cacheflush.h> |
| 43 | #include <asm/errno.h> | 44 | #include <asm/errno.h> |
| @@ -83,9 +84,36 @@ struct kprobe_insn_page { | |||
| 83 | kprobe_opcode_t *insns; /* Page of instruction slots */ | 84 | kprobe_opcode_t *insns; /* Page of instruction slots */ |
| 84 | char slot_used[INSNS_PER_PAGE]; | 85 | char slot_used[INSNS_PER_PAGE]; |
| 85 | int nused; | 86 | int nused; |
| 87 | int ngarbage; | ||
| 86 | }; | 88 | }; |
| 87 | 89 | ||
| 88 | static struct hlist_head kprobe_insn_pages; | 90 | static struct hlist_head kprobe_insn_pages; |
| 91 | static int kprobe_garbage_slots; | ||
| 92 | static int collect_garbage_slots(void); | ||
| 93 | |||
| 94 | static int __kprobes check_safety(void) | ||
| 95 | { | ||
| 96 | int ret = 0; | ||
| 97 | #if defined(CONFIG_PREEMPT) && defined(CONFIG_PM) | ||
| 98 | ret = freeze_processes(); | ||
| 99 | if (ret == 0) { | ||
| 100 | struct task_struct *p, *q; | ||
| 101 | do_each_thread(p, q) { | ||
| 102 | if (p != current && p->state == TASK_RUNNING && | ||
| 103 | p->pid != 0) { | ||
| 104 | printk("Check failed: %s is running\n",p->comm); | ||
| 105 | ret = -1; | ||
| 106 | goto loop_end; | ||
| 107 | } | ||
| 108 | } while_each_thread(p, q); | ||
| 109 | } | ||
| 110 | loop_end: | ||
| 111 | thaw_processes(); | ||
| 112 | #else | ||
| 113 | synchronize_sched(); | ||
| 114 | #endif | ||
| 115 | return ret; | ||
| 116 | } | ||
| 89 | 117 | ||
| 90 | /** | 118 | /** |
| 91 | * get_insn_slot() - Find a slot on an executable page for an instruction. | 119 | * get_insn_slot() - Find a slot on an executable page for an instruction. |
| @@ -96,6 +124,7 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
| 96 | struct kprobe_insn_page *kip; | 124 | struct kprobe_insn_page *kip; |
| 97 | struct hlist_node *pos; | 125 | struct hlist_node *pos; |
| 98 | 126 | ||
| 127 | retry: | ||
| 99 | hlist_for_each(pos, &kprobe_insn_pages) { | 128 | hlist_for_each(pos, &kprobe_insn_pages) { |
| 100 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | 129 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); |
| 101 | if (kip->nused < INSNS_PER_PAGE) { | 130 | if (kip->nused < INSNS_PER_PAGE) { |
| @@ -112,7 +141,11 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
| 112 | } | 141 | } |
| 113 | } | 142 | } |
| 114 | 143 | ||
| 115 | /* All out of space. Need to allocate a new page. Use slot 0.*/ | 144 | /* If there are any garbage slots, collect it and try again. */ |
| 145 | if (kprobe_garbage_slots && collect_garbage_slots() == 0) { | ||
| 146 | goto retry; | ||
| 147 | } | ||
| 148 | /* All out of space. Need to allocate a new page. Use slot 0. */ | ||
| 116 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); | 149 | kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); |
| 117 | if (!kip) { | 150 | if (!kip) { |
| 118 | return NULL; | 151 | return NULL; |
| @@ -133,10 +166,62 @@ kprobe_opcode_t __kprobes *get_insn_slot(void) | |||
| 133 | memset(kip->slot_used, 0, INSNS_PER_PAGE); | 166 | memset(kip->slot_used, 0, INSNS_PER_PAGE); |
| 134 | kip->slot_used[0] = 1; | 167 | kip->slot_used[0] = 1; |
| 135 | kip->nused = 1; | 168 | kip->nused = 1; |
| 169 | kip->ngarbage = 0; | ||
| 136 | return kip->insns; | 170 | return kip->insns; |
| 137 | } | 171 | } |
| 138 | 172 | ||
| 139 | void __kprobes free_insn_slot(kprobe_opcode_t *slot) | 173 | /* Return 1 if all garbages are collected, otherwise 0. */ |
| 174 | static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) | ||
| 175 | { | ||
| 176 | kip->slot_used[idx] = 0; | ||
| 177 | kip->nused--; | ||
| 178 | if (kip->nused == 0) { | ||
| 179 | /* | ||
| 180 | * Page is no longer in use. Free it unless | ||
| 181 | * it's the last one. We keep the last one | ||
| 182 | * so as not to have to set it up again the | ||
| 183 | * next time somebody inserts a probe. | ||
| 184 | */ | ||
| 185 | hlist_del(&kip->hlist); | ||
| 186 | if (hlist_empty(&kprobe_insn_pages)) { | ||
| 187 | INIT_HLIST_NODE(&kip->hlist); | ||
| 188 | hlist_add_head(&kip->hlist, | ||
| 189 | &kprobe_insn_pages); | ||
| 190 | } else { | ||
| 191 | module_free(NULL, kip->insns); | ||
| 192 | kfree(kip); | ||
| 193 | } | ||
| 194 | return 1; | ||
| 195 | } | ||
| 196 | return 0; | ||
| 197 | } | ||
| 198 | |||
| 199 | static int __kprobes collect_garbage_slots(void) | ||
| 200 | { | ||
| 201 | struct kprobe_insn_page *kip; | ||
| 202 | struct hlist_node *pos, *next; | ||
| 203 | |||
| 204 | /* Ensure no one is preempted on the garbage slots */ | ||
| 205 | if (check_safety() != 0) | ||
| 206 | return -EAGAIN; | ||
| 207 | |||
| 208 | hlist_for_each_safe(pos, next, &kprobe_insn_pages) { | ||
| 209 | int i; | ||
| 210 | kip = hlist_entry(pos, struct kprobe_insn_page, hlist); | ||
| 211 | if (kip->ngarbage == 0) | ||
| 212 | continue; | ||
| 213 | kip->ngarbage = 0; /* we will collect all garbages */ | ||
| 214 | for (i = 0; i < INSNS_PER_PAGE; i++) { | ||
| 215 | if (kip->slot_used[i] == -1 && | ||
| 216 | collect_one_slot(kip, i)) | ||
| 217 | break; | ||
| 218 | } | ||
| 219 | } | ||
| 220 | kprobe_garbage_slots = 0; | ||
| 221 | return 0; | ||
| 222 | } | ||
| 223 | |||
| 224 | void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) | ||
| 140 | { | 225 | { |
| 141 | struct kprobe_insn_page *kip; | 226 | struct kprobe_insn_page *kip; |
| 142 | struct hlist_node *pos; | 227 | struct hlist_node *pos; |
| @@ -146,28 +231,18 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot) | |||
| 146 | if (kip->insns <= slot && | 231 | if (kip->insns <= slot && |
| 147 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { | 232 | slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { |
| 148 | int i = (slot - kip->insns) / MAX_INSN_SIZE; | 233 | int i = (slot - kip->insns) / MAX_INSN_SIZE; |
| 149 | kip->slot_used[i] = 0; | 234 | if (dirty) { |
| 150 | kip->nused--; | 235 | kip->slot_used[i] = -1; |
| 151 | if (kip->nused == 0) { | 236 | kip->ngarbage++; |
| 152 | /* | 237 | } else { |
| 153 | * Page is no longer in use. Free it unless | 238 | collect_one_slot(kip, i); |
| 154 | * it's the last one. We keep the last one | ||
| 155 | * so as not to have to set it up again the | ||
| 156 | * next time somebody inserts a probe. | ||
| 157 | */ | ||
| 158 | hlist_del(&kip->hlist); | ||
| 159 | if (hlist_empty(&kprobe_insn_pages)) { | ||
| 160 | INIT_HLIST_NODE(&kip->hlist); | ||
| 161 | hlist_add_head(&kip->hlist, | ||
| 162 | &kprobe_insn_pages); | ||
| 163 | } else { | ||
| 164 | module_free(NULL, kip->insns); | ||
| 165 | kfree(kip); | ||
| 166 | } | ||
| 167 | } | 239 | } |
| 168 | return; | 240 | break; |
| 169 | } | 241 | } |
| 170 | } | 242 | } |
| 243 | if (dirty && (++kprobe_garbage_slots > INSNS_PER_PAGE)) { | ||
| 244 | collect_garbage_slots(); | ||
| 245 | } | ||
| 171 | } | 246 | } |
| 172 | #endif | 247 | #endif |
| 173 | 248 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index 4f9c60ef95e8..1db8c72d0d38 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -31,6 +31,8 @@ struct kthread_create_info | |||
| 31 | /* Result passed back to kthread_create() from keventd. */ | 31 | /* Result passed back to kthread_create() from keventd. */ |
| 32 | struct task_struct *result; | 32 | struct task_struct *result; |
| 33 | struct completion done; | 33 | struct completion done; |
| 34 | |||
| 35 | struct work_struct work; | ||
| 34 | }; | 36 | }; |
| 35 | 37 | ||
| 36 | struct kthread_stop_info | 38 | struct kthread_stop_info |
| @@ -111,9 +113,10 @@ static int kthread(void *_create) | |||
| 111 | } | 113 | } |
| 112 | 114 | ||
| 113 | /* We are keventd: create a thread. */ | 115 | /* We are keventd: create a thread. */ |
| 114 | static void keventd_create_kthread(void *_create) | 116 | static void keventd_create_kthread(struct work_struct *work) |
| 115 | { | 117 | { |
| 116 | struct kthread_create_info *create = _create; | 118 | struct kthread_create_info *create = |
| 119 | container_of(work, struct kthread_create_info, work); | ||
| 117 | int pid; | 120 | int pid; |
| 118 | 121 | ||
| 119 | /* We want our own signal handler (we take no signals by default). */ | 122 | /* We want our own signal handler (we take no signals by default). */ |
| @@ -154,20 +157,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), | |||
| 154 | ...) | 157 | ...) |
| 155 | { | 158 | { |
| 156 | struct kthread_create_info create; | 159 | struct kthread_create_info create; |
| 157 | DECLARE_WORK(work, keventd_create_kthread, &create); | ||
| 158 | 160 | ||
| 159 | create.threadfn = threadfn; | 161 | create.threadfn = threadfn; |
| 160 | create.data = data; | 162 | create.data = data; |
| 161 | init_completion(&create.started); | 163 | init_completion(&create.started); |
| 162 | init_completion(&create.done); | 164 | init_completion(&create.done); |
| 165 | INIT_WORK(&create.work, keventd_create_kthread); | ||
| 163 | 166 | ||
| 164 | /* | 167 | /* |
| 165 | * The workqueue needs to start up first: | 168 | * The workqueue needs to start up first: |
| 166 | */ | 169 | */ |
| 167 | if (!helper_wq) | 170 | if (!helper_wq) |
| 168 | work.func(work.data); | 171 | create.work.func(&create.work); |
| 169 | else { | 172 | else { |
| 170 | queue_work(helper_wq, &work); | 173 | queue_work(helper_wq, &create.work); |
| 171 | wait_for_completion(&create.done); | 174 | wait_for_completion(&create.done); |
| 172 | } | 175 | } |
| 173 | if (!IS_ERR(create.result)) { | 176 | if (!IS_ERR(create.result)) { |
diff --git a/kernel/latency.c b/kernel/latency.c index 258f2555abbc..e63fcacb61a7 100644 --- a/kernel/latency.c +++ b/kernel/latency.c | |||
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <linux/slab.h> | 36 | #include <linux/slab.h> |
| 37 | #include <linux/module.h> | 37 | #include <linux/module.h> |
| 38 | #include <linux/notifier.h> | 38 | #include <linux/notifier.h> |
| 39 | #include <linux/jiffies.h> | ||
| 39 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
| 40 | 41 | ||
| 41 | struct latency_info { | 42 | struct latency_info { |
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c9fefdb1a7db..509efd49540f 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -43,13 +43,49 @@ | |||
| 43 | #include "lockdep_internals.h" | 43 | #include "lockdep_internals.h" |
| 44 | 44 | ||
| 45 | /* | 45 | /* |
| 46 | * hash_lock: protects the lockdep hashes and class/list/hash allocators. | 46 | * lockdep_lock: protects the lockdep graph, the hashes and the |
| 47 | * class/list/hash allocators. | ||
| 47 | * | 48 | * |
| 48 | * This is one of the rare exceptions where it's justified | 49 | * This is one of the rare exceptions where it's justified |
| 49 | * to use a raw spinlock - we really dont want the spinlock | 50 | * to use a raw spinlock - we really dont want the spinlock |
| 50 | * code to recurse back into the lockdep code. | 51 | * code to recurse back into the lockdep code... |
| 51 | */ | 52 | */ |
| 52 | static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; | 53 | static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; |
| 54 | |||
| 55 | static int graph_lock(void) | ||
| 56 | { | ||
| 57 | __raw_spin_lock(&lockdep_lock); | ||
| 58 | /* | ||
| 59 | * Make sure that if another CPU detected a bug while | ||
| 60 | * walking the graph we dont change it (while the other | ||
| 61 | * CPU is busy printing out stuff with the graph lock | ||
| 62 | * dropped already) | ||
| 63 | */ | ||
| 64 | if (!debug_locks) { | ||
| 65 | __raw_spin_unlock(&lockdep_lock); | ||
| 66 | return 0; | ||
| 67 | } | ||
| 68 | return 1; | ||
| 69 | } | ||
| 70 | |||
| 71 | static inline int graph_unlock(void) | ||
| 72 | { | ||
| 73 | __raw_spin_unlock(&lockdep_lock); | ||
| 74 | return 0; | ||
| 75 | } | ||
| 76 | |||
| 77 | /* | ||
| 78 | * Turn lock debugging off and return with 0 if it was off already, | ||
| 79 | * and also release the graph lock: | ||
| 80 | */ | ||
| 81 | static inline int debug_locks_off_graph_unlock(void) | ||
| 82 | { | ||
| 83 | int ret = debug_locks_off(); | ||
| 84 | |||
| 85 | __raw_spin_unlock(&lockdep_lock); | ||
| 86 | |||
| 87 | return ret; | ||
| 88 | } | ||
| 53 | 89 | ||
| 54 | static int lockdep_initialized; | 90 | static int lockdep_initialized; |
| 55 | 91 | ||
| @@ -57,14 +93,15 @@ unsigned long nr_list_entries; | |||
| 57 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; | 93 | static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; |
| 58 | 94 | ||
| 59 | /* | 95 | /* |
| 60 | * Allocate a lockdep entry. (assumes hash_lock held, returns | 96 | * Allocate a lockdep entry. (assumes the graph_lock held, returns |
| 61 | * with NULL on failure) | 97 | * with NULL on failure) |
| 62 | */ | 98 | */ |
| 63 | static struct lock_list *alloc_list_entry(void) | 99 | static struct lock_list *alloc_list_entry(void) |
| 64 | { | 100 | { |
| 65 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { | 101 | if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { |
| 66 | __raw_spin_unlock(&hash_lock); | 102 | if (!debug_locks_off_graph_unlock()) |
| 67 | debug_locks_off(); | 103 | return NULL; |
| 104 | |||
| 68 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); | 105 | printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); |
| 69 | printk("turning off the locking correctness validator.\n"); | 106 | printk("turning off the locking correctness validator.\n"); |
| 70 | return NULL; | 107 | return NULL; |
| @@ -140,21 +177,12 @@ void lockdep_on(void) | |||
| 140 | 177 | ||
| 141 | EXPORT_SYMBOL(lockdep_on); | 178 | EXPORT_SYMBOL(lockdep_on); |
| 142 | 179 | ||
| 143 | int lockdep_internal(void) | ||
| 144 | { | ||
| 145 | return current->lockdep_recursion != 0; | ||
| 146 | } | ||
| 147 | |||
| 148 | EXPORT_SYMBOL(lockdep_internal); | ||
| 149 | |||
| 150 | /* | 180 | /* |
| 151 | * Debugging switches: | 181 | * Debugging switches: |
| 152 | */ | 182 | */ |
| 153 | 183 | ||
| 154 | #define VERBOSE 0 | 184 | #define VERBOSE 0 |
| 155 | #ifdef VERBOSE | 185 | #define VERY_VERBOSE 0 |
| 156 | # define VERY_VERBOSE 0 | ||
| 157 | #endif | ||
| 158 | 186 | ||
| 159 | #if VERBOSE | 187 | #if VERBOSE |
| 160 | # define HARDIRQ_VERBOSE 1 | 188 | # define HARDIRQ_VERBOSE 1 |
| @@ -179,8 +207,8 @@ static int class_filter(struct lock_class *class) | |||
| 179 | !strcmp(class->name, "&struct->lockfield")) | 207 | !strcmp(class->name, "&struct->lockfield")) |
| 180 | return 1; | 208 | return 1; |
| 181 | #endif | 209 | #endif |
| 182 | /* Allow everything else. 0 would be filter everything else */ | 210 | /* Filter everything else. 1 would be to allow everything else */ |
| 183 | return 1; | 211 | return 0; |
| 184 | } | 212 | } |
| 185 | #endif | 213 | #endif |
| 186 | 214 | ||
| @@ -214,7 +242,7 @@ static int softirq_verbose(struct lock_class *class) | |||
| 214 | 242 | ||
| 215 | /* | 243 | /* |
| 216 | * Stack-trace: tightly packed array of stack backtrace | 244 | * Stack-trace: tightly packed array of stack backtrace |
| 217 | * addresses. Protected by the hash_lock. | 245 | * addresses. Protected by the graph_lock. |
| 218 | */ | 246 | */ |
| 219 | unsigned long nr_stack_trace_entries; | 247 | unsigned long nr_stack_trace_entries; |
| 220 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; | 248 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; |
| @@ -228,25 +256,20 @@ static int save_trace(struct stack_trace *trace) | |||
| 228 | trace->skip = 3; | 256 | trace->skip = 3; |
| 229 | trace->all_contexts = 0; | 257 | trace->all_contexts = 0; |
| 230 | 258 | ||
| 231 | /* Make sure to not recurse in case the unwinder needs to | ||
| 232 | take locks. */ | ||
| 233 | lockdep_off(); | ||
| 234 | save_stack_trace(trace, NULL); | 259 | save_stack_trace(trace, NULL); |
| 235 | lockdep_on(); | ||
| 236 | 260 | ||
| 237 | trace->max_entries = trace->nr_entries; | 261 | trace->max_entries = trace->nr_entries; |
| 238 | 262 | ||
| 239 | nr_stack_trace_entries += trace->nr_entries; | 263 | nr_stack_trace_entries += trace->nr_entries; |
| 240 | if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES)) | ||
| 241 | return 0; | ||
| 242 | 264 | ||
| 243 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { | 265 | if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) { |
| 244 | __raw_spin_unlock(&hash_lock); | 266 | if (!debug_locks_off_graph_unlock()) |
| 245 | if (debug_locks_off()) { | 267 | return 0; |
| 246 | printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); | 268 | |
| 247 | printk("turning off the locking correctness validator.\n"); | 269 | printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); |
| 248 | dump_stack(); | 270 | printk("turning off the locking correctness validator.\n"); |
| 249 | } | 271 | dump_stack(); |
| 272 | |||
| 250 | return 0; | 273 | return 0; |
| 251 | } | 274 | } |
| 252 | 275 | ||
| @@ -357,7 +380,7 @@ get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4 | |||
| 357 | 380 | ||
| 358 | static void print_lock_name(struct lock_class *class) | 381 | static void print_lock_name(struct lock_class *class) |
| 359 | { | 382 | { |
| 360 | char str[128], c1, c2, c3, c4; | 383 | char str[KSYM_NAME_LEN + 1], c1, c2, c3, c4; |
| 361 | const char *name; | 384 | const char *name; |
| 362 | 385 | ||
| 363 | get_usage_chars(class, &c1, &c2, &c3, &c4); | 386 | get_usage_chars(class, &c1, &c2, &c3, &c4); |
| @@ -379,7 +402,7 @@ static void print_lock_name(struct lock_class *class) | |||
| 379 | static void print_lockdep_cache(struct lockdep_map *lock) | 402 | static void print_lockdep_cache(struct lockdep_map *lock) |
| 380 | { | 403 | { |
| 381 | const char *name; | 404 | const char *name; |
| 382 | char str[128]; | 405 | char str[KSYM_NAME_LEN + 1]; |
| 383 | 406 | ||
| 384 | name = lock->name; | 407 | name = lock->name; |
| 385 | if (!name) | 408 | if (!name) |
| @@ -449,7 +472,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth) | |||
| 449 | print_lock_class_header(class, depth); | 472 | print_lock_class_header(class, depth); |
| 450 | 473 | ||
| 451 | list_for_each_entry(entry, &class->locks_after, entry) { | 474 | list_for_each_entry(entry, &class->locks_after, entry) { |
| 452 | DEBUG_LOCKS_WARN_ON(!entry->class); | 475 | if (DEBUG_LOCKS_WARN_ON(!entry->class)) |
| 476 | return; | ||
| 477 | |||
| 453 | print_lock_dependencies(entry->class, depth + 1); | 478 | print_lock_dependencies(entry->class, depth + 1); |
| 454 | 479 | ||
| 455 | printk("%*s ... acquired at:\n",depth,""); | 480 | printk("%*s ... acquired at:\n",depth,""); |
| @@ -474,7 +499,8 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, | |||
| 474 | return 0; | 499 | return 0; |
| 475 | 500 | ||
| 476 | entry->class = this; | 501 | entry->class = this; |
| 477 | save_trace(&entry->trace); | 502 | if (!save_trace(&entry->trace)) |
| 503 | return 0; | ||
| 478 | 504 | ||
| 479 | /* | 505 | /* |
| 480 | * Since we never remove from the dependency list, the list can | 506 | * Since we never remove from the dependency list, the list can |
| @@ -532,9 +558,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth) | |||
| 532 | { | 558 | { |
| 533 | struct task_struct *curr = current; | 559 | struct task_struct *curr = current; |
| 534 | 560 | ||
| 535 | __raw_spin_unlock(&hash_lock); | 561 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 536 | debug_locks_off(); | ||
| 537 | if (debug_locks_silent) | ||
| 538 | return 0; | 562 | return 0; |
| 539 | 563 | ||
| 540 | printk("\n=======================================================\n"); | 564 | printk("\n=======================================================\n"); |
| @@ -563,7 +587,9 @@ static noinline int print_circular_bug_tail(void) | |||
| 563 | return 0; | 587 | return 0; |
| 564 | 588 | ||
| 565 | this.class = check_source->class; | 589 | this.class = check_source->class; |
| 566 | save_trace(&this.trace); | 590 | if (!save_trace(&this.trace)) |
| 591 | return 0; | ||
| 592 | |||
| 567 | print_circular_bug_entry(&this, 0); | 593 | print_circular_bug_entry(&this, 0); |
| 568 | 594 | ||
| 569 | printk("\nother info that might help us debug this:\n\n"); | 595 | printk("\nother info that might help us debug this:\n\n"); |
| @@ -579,8 +605,10 @@ static noinline int print_circular_bug_tail(void) | |||
| 579 | 605 | ||
| 580 | static int noinline print_infinite_recursion_bug(void) | 606 | static int noinline print_infinite_recursion_bug(void) |
| 581 | { | 607 | { |
| 582 | __raw_spin_unlock(&hash_lock); | 608 | if (!debug_locks_off_graph_unlock()) |
| 583 | DEBUG_LOCKS_WARN_ON(1); | 609 | return 0; |
| 610 | |||
| 611 | WARN_ON(1); | ||
| 584 | 612 | ||
| 585 | return 0; | 613 | return 0; |
| 586 | } | 614 | } |
| @@ -715,9 +743,7 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 715 | enum lock_usage_bit bit2, | 743 | enum lock_usage_bit bit2, |
| 716 | const char *irqclass) | 744 | const char *irqclass) |
| 717 | { | 745 | { |
| 718 | __raw_spin_unlock(&hash_lock); | 746 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 719 | debug_locks_off(); | ||
| 720 | if (debug_locks_silent) | ||
| 721 | return 0; | 747 | return 0; |
| 722 | 748 | ||
| 723 | printk("\n======================================================\n"); | 749 | printk("\n======================================================\n"); |
| @@ -798,9 +824,7 @@ static int | |||
| 798 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | 824 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, |
| 799 | struct held_lock *next) | 825 | struct held_lock *next) |
| 800 | { | 826 | { |
| 801 | debug_locks_off(); | 827 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 802 | __raw_spin_unlock(&hash_lock); | ||
| 803 | if (debug_locks_silent) | ||
| 804 | return 0; | 828 | return 0; |
| 805 | 829 | ||
| 806 | printk("\n=============================================\n"); | 830 | printk("\n=============================================\n"); |
| @@ -966,27 +990,24 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
| 966 | &prev->class->locks_after, next->acquire_ip); | 990 | &prev->class->locks_after, next->acquire_ip); |
| 967 | if (!ret) | 991 | if (!ret) |
| 968 | return 0; | 992 | return 0; |
| 969 | /* | 993 | |
| 970 | * Return value of 2 signals 'dependency already added', | ||
| 971 | * in that case we dont have to add the backlink either. | ||
| 972 | */ | ||
| 973 | if (ret == 2) | ||
| 974 | return 2; | ||
| 975 | ret = add_lock_to_list(next->class, prev->class, | 994 | ret = add_lock_to_list(next->class, prev->class, |
| 976 | &next->class->locks_before, next->acquire_ip); | 995 | &next->class->locks_before, next->acquire_ip); |
| 996 | if (!ret) | ||
| 997 | return 0; | ||
| 977 | 998 | ||
| 978 | /* | 999 | /* |
| 979 | * Debugging printouts: | 1000 | * Debugging printouts: |
| 980 | */ | 1001 | */ |
| 981 | if (verbose(prev->class) || verbose(next->class)) { | 1002 | if (verbose(prev->class) || verbose(next->class)) { |
| 982 | __raw_spin_unlock(&hash_lock); | 1003 | graph_unlock(); |
| 983 | printk("\n new dependency: "); | 1004 | printk("\n new dependency: "); |
| 984 | print_lock_name(prev->class); | 1005 | print_lock_name(prev->class); |
| 985 | printk(" => "); | 1006 | printk(" => "); |
| 986 | print_lock_name(next->class); | 1007 | print_lock_name(next->class); |
| 987 | printk("\n"); | 1008 | printk("\n"); |
| 988 | dump_stack(); | 1009 | dump_stack(); |
| 989 | __raw_spin_lock(&hash_lock); | 1010 | return graph_lock(); |
| 990 | } | 1011 | } |
| 991 | return 1; | 1012 | return 1; |
| 992 | } | 1013 | } |
| @@ -1025,7 +1046,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
| 1025 | * added: | 1046 | * added: |
| 1026 | */ | 1047 | */ |
| 1027 | if (hlock->read != 2) { | 1048 | if (hlock->read != 2) { |
| 1028 | check_prev_add(curr, hlock, next); | 1049 | if (!check_prev_add(curr, hlock, next)) |
| 1050 | return 0; | ||
| 1029 | /* | 1051 | /* |
| 1030 | * Stop after the first non-trylock entry, | 1052 | * Stop after the first non-trylock entry, |
| 1031 | * as non-trylock entries have added their | 1053 | * as non-trylock entries have added their |
| @@ -1050,8 +1072,10 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) | |||
| 1050 | } | 1072 | } |
| 1051 | return 1; | 1073 | return 1; |
| 1052 | out_bug: | 1074 | out_bug: |
| 1053 | __raw_spin_unlock(&hash_lock); | 1075 | if (!debug_locks_off_graph_unlock()) |
| 1054 | DEBUG_LOCKS_WARN_ON(1); | 1076 | return 0; |
| 1077 | |||
| 1078 | WARN_ON(1); | ||
| 1055 | 1079 | ||
| 1056 | return 0; | 1080 | return 0; |
| 1057 | } | 1081 | } |
| @@ -1182,6 +1206,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1182 | struct lockdep_subclass_key *key; | 1206 | struct lockdep_subclass_key *key; |
| 1183 | struct list_head *hash_head; | 1207 | struct list_head *hash_head; |
| 1184 | struct lock_class *class; | 1208 | struct lock_class *class; |
| 1209 | unsigned long flags; | ||
| 1185 | 1210 | ||
| 1186 | class = look_up_lock_class(lock, subclass); | 1211 | class = look_up_lock_class(lock, subclass); |
| 1187 | if (likely(class)) | 1212 | if (likely(class)) |
| @@ -1203,7 +1228,11 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1203 | key = lock->key->subkeys + subclass; | 1228 | key = lock->key->subkeys + subclass; |
| 1204 | hash_head = classhashentry(key); | 1229 | hash_head = classhashentry(key); |
| 1205 | 1230 | ||
| 1206 | __raw_spin_lock(&hash_lock); | 1231 | raw_local_irq_save(flags); |
| 1232 | if (!graph_lock()) { | ||
| 1233 | raw_local_irq_restore(flags); | ||
| 1234 | return NULL; | ||
| 1235 | } | ||
| 1207 | /* | 1236 | /* |
| 1208 | * We have to do the hash-walk again, to avoid races | 1237 | * We have to do the hash-walk again, to avoid races |
| 1209 | * with another CPU: | 1238 | * with another CPU: |
| @@ -1216,8 +1245,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1216 | * the hash: | 1245 | * the hash: |
| 1217 | */ | 1246 | */ |
| 1218 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { | 1247 | if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { |
| 1219 | __raw_spin_unlock(&hash_lock); | 1248 | if (!debug_locks_off_graph_unlock()) { |
| 1220 | debug_locks_off(); | 1249 | raw_local_irq_restore(flags); |
| 1250 | return NULL; | ||
| 1251 | } | ||
| 1252 | raw_local_irq_restore(flags); | ||
| 1253 | |||
| 1221 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); | 1254 | printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); |
| 1222 | printk("turning off the locking correctness validator.\n"); | 1255 | printk("turning off the locking correctness validator.\n"); |
| 1223 | return NULL; | 1256 | return NULL; |
| @@ -1238,16 +1271,24 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1238 | list_add_tail_rcu(&class->hash_entry, hash_head); | 1271 | list_add_tail_rcu(&class->hash_entry, hash_head); |
| 1239 | 1272 | ||
| 1240 | if (verbose(class)) { | 1273 | if (verbose(class)) { |
| 1241 | __raw_spin_unlock(&hash_lock); | 1274 | graph_unlock(); |
| 1275 | raw_local_irq_restore(flags); | ||
| 1276 | |||
| 1242 | printk("\nnew class %p: %s", class->key, class->name); | 1277 | printk("\nnew class %p: %s", class->key, class->name); |
| 1243 | if (class->name_version > 1) | 1278 | if (class->name_version > 1) |
| 1244 | printk("#%d", class->name_version); | 1279 | printk("#%d", class->name_version); |
| 1245 | printk("\n"); | 1280 | printk("\n"); |
| 1246 | dump_stack(); | 1281 | dump_stack(); |
| 1247 | __raw_spin_lock(&hash_lock); | 1282 | |
| 1283 | raw_local_irq_save(flags); | ||
| 1284 | if (!graph_lock()) { | ||
| 1285 | raw_local_irq_restore(flags); | ||
| 1286 | return NULL; | ||
| 1287 | } | ||
| 1248 | } | 1288 | } |
| 1249 | out_unlock_set: | 1289 | out_unlock_set: |
| 1250 | __raw_spin_unlock(&hash_lock); | 1290 | graph_unlock(); |
| 1291 | raw_local_irq_restore(flags); | ||
| 1251 | 1292 | ||
| 1252 | if (!subclass || force) | 1293 | if (!subclass || force) |
| 1253 | lock->class_cache = class; | 1294 | lock->class_cache = class; |
| @@ -1262,7 +1303,7 @@ out_unlock_set: | |||
| 1262 | * add it and return 0 - in this case the new dependency chain is | 1303 | * add it and return 0 - in this case the new dependency chain is |
| 1263 | * validated. If the key is already hashed, return 1. | 1304 | * validated. If the key is already hashed, return 1. |
| 1264 | */ | 1305 | */ |
| 1265 | static inline int lookup_chain_cache(u64 chain_key) | 1306 | static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) |
| 1266 | { | 1307 | { |
| 1267 | struct list_head *hash_head = chainhashentry(chain_key); | 1308 | struct list_head *hash_head = chainhashentry(chain_key); |
| 1268 | struct lock_chain *chain; | 1309 | struct lock_chain *chain; |
| @@ -1276,34 +1317,36 @@ static inline int lookup_chain_cache(u64 chain_key) | |||
| 1276 | if (chain->chain_key == chain_key) { | 1317 | if (chain->chain_key == chain_key) { |
| 1277 | cache_hit: | 1318 | cache_hit: |
| 1278 | debug_atomic_inc(&chain_lookup_hits); | 1319 | debug_atomic_inc(&chain_lookup_hits); |
| 1279 | /* | 1320 | if (very_verbose(class)) |
| 1280 | * In the debugging case, force redundant checking | 1321 | printk("\nhash chain already cached, key: " |
| 1281 | * by returning 1: | 1322 | "%016Lx tail class: [%p] %s\n", |
| 1282 | */ | 1323 | (unsigned long long)chain_key, |
| 1283 | #ifdef CONFIG_DEBUG_LOCKDEP | 1324 | class->key, class->name); |
| 1284 | __raw_spin_lock(&hash_lock); | ||
| 1285 | return 1; | ||
| 1286 | #endif | ||
| 1287 | return 0; | 1325 | return 0; |
| 1288 | } | 1326 | } |
| 1289 | } | 1327 | } |
| 1328 | if (very_verbose(class)) | ||
| 1329 | printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n", | ||
| 1330 | (unsigned long long)chain_key, class->key, class->name); | ||
| 1290 | /* | 1331 | /* |
| 1291 | * Allocate a new chain entry from the static array, and add | 1332 | * Allocate a new chain entry from the static array, and add |
| 1292 | * it to the hash: | 1333 | * it to the hash: |
| 1293 | */ | 1334 | */ |
| 1294 | __raw_spin_lock(&hash_lock); | 1335 | if (!graph_lock()) |
| 1336 | return 0; | ||
| 1295 | /* | 1337 | /* |
| 1296 | * We have to walk the chain again locked - to avoid duplicates: | 1338 | * We have to walk the chain again locked - to avoid duplicates: |
| 1297 | */ | 1339 | */ |
| 1298 | list_for_each_entry(chain, hash_head, entry) { | 1340 | list_for_each_entry(chain, hash_head, entry) { |
| 1299 | if (chain->chain_key == chain_key) { | 1341 | if (chain->chain_key == chain_key) { |
| 1300 | __raw_spin_unlock(&hash_lock); | 1342 | graph_unlock(); |
| 1301 | goto cache_hit; | 1343 | goto cache_hit; |
| 1302 | } | 1344 | } |
| 1303 | } | 1345 | } |
| 1304 | if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { | 1346 | if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) { |
| 1305 | __raw_spin_unlock(&hash_lock); | 1347 | if (!debug_locks_off_graph_unlock()) |
| 1306 | debug_locks_off(); | 1348 | return 0; |
| 1349 | |||
| 1307 | printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); | 1350 | printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); |
| 1308 | printk("turning off the locking correctness validator.\n"); | 1351 | printk("turning off the locking correctness validator.\n"); |
| 1309 | return 0; | 1352 | return 0; |
| @@ -1379,9 +1422,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other, | |||
| 1379 | struct held_lock *this, int forwards, | 1422 | struct held_lock *this, int forwards, |
| 1380 | const char *irqclass) | 1423 | const char *irqclass) |
| 1381 | { | 1424 | { |
| 1382 | __raw_spin_unlock(&hash_lock); | 1425 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1383 | debug_locks_off(); | ||
| 1384 | if (debug_locks_silent) | ||
| 1385 | return 0; | 1426 | return 0; |
| 1386 | 1427 | ||
| 1387 | printk("\n=========================================================\n"); | 1428 | printk("\n=========================================================\n"); |
| @@ -1451,7 +1492,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
| 1451 | return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); | 1492 | return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass); |
| 1452 | } | 1493 | } |
| 1453 | 1494 | ||
| 1454 | static inline void print_irqtrace_events(struct task_struct *curr) | 1495 | void print_irqtrace_events(struct task_struct *curr) |
| 1455 | { | 1496 | { |
| 1456 | printk("irq event stamp: %u\n", curr->irq_events); | 1497 | printk("irq event stamp: %u\n", curr->irq_events); |
| 1457 | printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); | 1498 | printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); |
| @@ -1464,19 +1505,13 @@ static inline void print_irqtrace_events(struct task_struct *curr) | |||
| 1464 | print_ip_sym(curr->softirq_disable_ip); | 1505 | print_ip_sym(curr->softirq_disable_ip); |
| 1465 | } | 1506 | } |
| 1466 | 1507 | ||
| 1467 | #else | ||
| 1468 | static inline void print_irqtrace_events(struct task_struct *curr) | ||
| 1469 | { | ||
| 1470 | } | ||
| 1471 | #endif | 1508 | #endif |
| 1472 | 1509 | ||
| 1473 | static int | 1510 | static int |
| 1474 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | 1511 | print_usage_bug(struct task_struct *curr, struct held_lock *this, |
| 1475 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | 1512 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) |
| 1476 | { | 1513 | { |
| 1477 | __raw_spin_unlock(&hash_lock); | 1514 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1478 | debug_locks_off(); | ||
| 1479 | if (debug_locks_silent) | ||
| 1480 | return 0; | 1515 | return 0; |
| 1481 | 1516 | ||
| 1482 | printk("\n=================================\n"); | 1517 | printk("\n=================================\n"); |
| @@ -1537,12 +1572,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 1537 | if (likely(this->class->usage_mask & new_mask)) | 1572 | if (likely(this->class->usage_mask & new_mask)) |
| 1538 | return 1; | 1573 | return 1; |
| 1539 | 1574 | ||
| 1540 | __raw_spin_lock(&hash_lock); | 1575 | if (!graph_lock()) |
| 1576 | return 0; | ||
| 1541 | /* | 1577 | /* |
| 1542 | * Make sure we didnt race: | 1578 | * Make sure we didnt race: |
| 1543 | */ | 1579 | */ |
| 1544 | if (unlikely(this->class->usage_mask & new_mask)) { | 1580 | if (unlikely(this->class->usage_mask & new_mask)) { |
| 1545 | __raw_spin_unlock(&hash_lock); | 1581 | graph_unlock(); |
| 1546 | return 1; | 1582 | return 1; |
| 1547 | } | 1583 | } |
| 1548 | 1584 | ||
| @@ -1728,15 +1764,16 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 1728 | debug_atomic_dec(&nr_unused_locks); | 1764 | debug_atomic_dec(&nr_unused_locks); |
| 1729 | break; | 1765 | break; |
| 1730 | default: | 1766 | default: |
| 1731 | debug_locks_off(); | 1767 | if (!debug_locks_off_graph_unlock()) |
| 1768 | return 0; | ||
| 1732 | WARN_ON(1); | 1769 | WARN_ON(1); |
| 1733 | return 0; | 1770 | return 0; |
| 1734 | } | 1771 | } |
| 1735 | 1772 | ||
| 1736 | __raw_spin_unlock(&hash_lock); | 1773 | graph_unlock(); |
| 1737 | 1774 | ||
| 1738 | /* | 1775 | /* |
| 1739 | * We must printk outside of the hash_lock: | 1776 | * We must printk outside of the graph_lock: |
| 1740 | */ | 1777 | */ |
| 1741 | if (ret == 2) { | 1778 | if (ret == 2) { |
| 1742 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); | 1779 | printk("\nmarked lock as {%s}:\n", usage_str[new_bit]); |
| @@ -2134,9 +2171,9 @@ out_calc_hash: | |||
| 2134 | * We look up the chain_key and do the O(N^2) check and update of | 2171 | * We look up the chain_key and do the O(N^2) check and update of |
| 2135 | * the dependencies only if this is a new dependency chain. | 2172 | * the dependencies only if this is a new dependency chain. |
| 2136 | * (If lookup_chain_cache() returns with 1 it acquires | 2173 | * (If lookup_chain_cache() returns with 1 it acquires |
| 2137 | * hash_lock for us) | 2174 | * graph_lock for us) |
| 2138 | */ | 2175 | */ |
| 2139 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { | 2176 | if (!trylock && (check == 2) && lookup_chain_cache(chain_key, class)) { |
| 2140 | /* | 2177 | /* |
| 2141 | * Check whether last held lock: | 2178 | * Check whether last held lock: |
| 2142 | * | 2179 | * |
| @@ -2167,7 +2204,7 @@ out_calc_hash: | |||
| 2167 | if (!chain_head && ret != 2) | 2204 | if (!chain_head && ret != 2) |
| 2168 | if (!check_prevs_add(curr, hlock)) | 2205 | if (!check_prevs_add(curr, hlock)) |
| 2169 | return 0; | 2206 | return 0; |
| 2170 | __raw_spin_unlock(&hash_lock); | 2207 | graph_unlock(); |
| 2171 | } | 2208 | } |
| 2172 | curr->lockdep_depth++; | 2209 | curr->lockdep_depth++; |
| 2173 | check_chain_key(curr); | 2210 | check_chain_key(curr); |
| @@ -2430,6 +2467,7 @@ EXPORT_SYMBOL_GPL(lock_release); | |||
| 2430 | void lockdep_reset(void) | 2467 | void lockdep_reset(void) |
| 2431 | { | 2468 | { |
| 2432 | unsigned long flags; | 2469 | unsigned long flags; |
| 2470 | int i; | ||
| 2433 | 2471 | ||
| 2434 | raw_local_irq_save(flags); | 2472 | raw_local_irq_save(flags); |
| 2435 | current->curr_chain_key = 0; | 2473 | current->curr_chain_key = 0; |
| @@ -2440,6 +2478,8 @@ void lockdep_reset(void) | |||
| 2440 | nr_softirq_chains = 0; | 2478 | nr_softirq_chains = 0; |
| 2441 | nr_process_chains = 0; | 2479 | nr_process_chains = 0; |
| 2442 | debug_locks = 1; | 2480 | debug_locks = 1; |
| 2481 | for (i = 0; i < CHAINHASH_SIZE; i++) | ||
| 2482 | INIT_LIST_HEAD(chainhash_table + i); | ||
| 2443 | raw_local_irq_restore(flags); | 2483 | raw_local_irq_restore(flags); |
| 2444 | } | 2484 | } |
| 2445 | 2485 | ||
| @@ -2476,7 +2516,7 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 2476 | int i; | 2516 | int i; |
| 2477 | 2517 | ||
| 2478 | raw_local_irq_save(flags); | 2518 | raw_local_irq_save(flags); |
| 2479 | __raw_spin_lock(&hash_lock); | 2519 | graph_lock(); |
| 2480 | 2520 | ||
| 2481 | /* | 2521 | /* |
| 2482 | * Unhash all classes that were created by this module: | 2522 | * Unhash all classes that were created by this module: |
| @@ -2490,7 +2530,7 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
| 2490 | zap_class(class); | 2530 | zap_class(class); |
| 2491 | } | 2531 | } |
| 2492 | 2532 | ||
| 2493 | __raw_spin_unlock(&hash_lock); | 2533 | graph_unlock(); |
| 2494 | raw_local_irq_restore(flags); | 2534 | raw_local_irq_restore(flags); |
| 2495 | } | 2535 | } |
| 2496 | 2536 | ||
| @@ -2518,20 +2558,20 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
| 2518 | * Debug check: in the end all mapped classes should | 2558 | * Debug check: in the end all mapped classes should |
| 2519 | * be gone. | 2559 | * be gone. |
| 2520 | */ | 2560 | */ |
| 2521 | __raw_spin_lock(&hash_lock); | 2561 | graph_lock(); |
| 2522 | for (i = 0; i < CLASSHASH_SIZE; i++) { | 2562 | for (i = 0; i < CLASSHASH_SIZE; i++) { |
| 2523 | head = classhash_table + i; | 2563 | head = classhash_table + i; |
| 2524 | if (list_empty(head)) | 2564 | if (list_empty(head)) |
| 2525 | continue; | 2565 | continue; |
| 2526 | list_for_each_entry_safe(class, next, head, hash_entry) { | 2566 | list_for_each_entry_safe(class, next, head, hash_entry) { |
| 2527 | if (unlikely(class == lock->class_cache)) { | 2567 | if (unlikely(class == lock->class_cache)) { |
| 2528 | __raw_spin_unlock(&hash_lock); | 2568 | if (debug_locks_off_graph_unlock()) |
| 2529 | DEBUG_LOCKS_WARN_ON(1); | 2569 | WARN_ON(1); |
| 2530 | goto out_restore; | 2570 | goto out_restore; |
| 2531 | } | 2571 | } |
| 2532 | } | 2572 | } |
| 2533 | } | 2573 | } |
| 2534 | __raw_spin_unlock(&hash_lock); | 2574 | graph_unlock(); |
| 2535 | 2575 | ||
| 2536 | out_restore: | 2576 | out_restore: |
| 2537 | raw_local_irq_restore(flags); | 2577 | raw_local_irq_restore(flags); |
| @@ -2645,6 +2685,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | |||
| 2645 | } | 2685 | } |
| 2646 | local_irq_restore(flags); | 2686 | local_irq_restore(flags); |
| 2647 | } | 2687 | } |
| 2688 | EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); | ||
| 2648 | 2689 | ||
| 2649 | static void print_held_locks_bug(struct task_struct *curr) | 2690 | static void print_held_locks_bug(struct task_struct *curr) |
| 2650 | { | 2691 | { |
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h index eab043c83bb2..8ce09bc4613d 100644 --- a/kernel/lockdep_internals.h +++ b/kernel/lockdep_internals.h | |||
| @@ -20,7 +20,7 @@ | |||
| 20 | #define MAX_LOCKDEP_KEYS_BITS 11 | 20 | #define MAX_LOCKDEP_KEYS_BITS 11 |
| 21 | #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) | 21 | #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) |
| 22 | 22 | ||
| 23 | #define MAX_LOCKDEP_CHAINS_BITS 13 | 23 | #define MAX_LOCKDEP_CHAINS_BITS 14 |
| 24 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) | 24 | #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) |
| 25 | 25 | ||
| 26 | /* | 26 | /* |
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index f6e72eaab3fa..b554b40a4aa6 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c | |||
| @@ -113,7 +113,7 @@ static int l_show(struct seq_file *m, void *v) | |||
| 113 | return 0; | 113 | return 0; |
| 114 | } | 114 | } |
| 115 | 115 | ||
| 116 | static struct seq_operations lockdep_ops = { | 116 | static const struct seq_operations lockdep_ops = { |
| 117 | .start = l_start, | 117 | .start = l_start, |
| 118 | .next = l_next, | 118 | .next = l_next, |
| 119 | .stop = l_stop, | 119 | .stop = l_stop, |
| @@ -135,7 +135,7 @@ static int lockdep_open(struct inode *inode, struct file *file) | |||
| 135 | return res; | 135 | return res; |
| 136 | } | 136 | } |
| 137 | 137 | ||
| 138 | static struct file_operations proc_lockdep_operations = { | 138 | static const struct file_operations proc_lockdep_operations = { |
| 139 | .open = lockdep_open, | 139 | .open = lockdep_open, |
| 140 | .read = seq_read, | 140 | .read = seq_read, |
| 141 | .llseek = seq_lseek, | 141 | .llseek = seq_lseek, |
| @@ -319,7 +319,7 @@ static int lockdep_stats_open(struct inode *inode, struct file *file) | |||
| 319 | return single_open(file, lockdep_stats_show, NULL); | 319 | return single_open(file, lockdep_stats_show, NULL); |
| 320 | } | 320 | } |
| 321 | 321 | ||
| 322 | static struct file_operations proc_lockdep_stats_operations = { | 322 | static const struct file_operations proc_lockdep_stats_operations = { |
| 323 | .open = lockdep_stats_open, | 323 | .open = lockdep_stats_open, |
| 324 | .read = seq_read, | 324 | .read = seq_read, |
| 325 | .llseek = seq_lseek, | 325 | .llseek = seq_lseek, |
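
The lockdep_proc.c change only adds const to the seq_file and file operation tables: they are filled in at compile time and never written afterwards, so they can live in read-only data. The same idiom, reduced to a minimal single_open()-based interface, looks roughly as follows; the names are invented and registration of the proc entry is omitted.

    /* Minimal sketch of a const ops table for a read-only seq_file interface. */
    static int example_show(struct seq_file *m, void *v)
    {
        seq_printf(m, "value: %d\n", 42);
        return 0;
    }

    static int example_open(struct inode *inode, struct file *file)
    {
        return single_open(file, example_show, NULL);
    }

    static const struct file_operations example_fops = {
        .open    = example_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
    };
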
diff --git a/kernel/module.c b/kernel/module.c index f0166563c602..d0f2260a0210 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -34,10 +34,10 @@ | |||
| 34 | #include <linux/err.h> | 34 | #include <linux/err.h> |
| 35 | #include <linux/vermagic.h> | 35 | #include <linux/vermagic.h> |
| 36 | #include <linux/notifier.h> | 36 | #include <linux/notifier.h> |
| 37 | #include <linux/sched.h> | ||
| 37 | #include <linux/stop_machine.h> | 38 | #include <linux/stop_machine.h> |
| 38 | #include <linux/device.h> | 39 | #include <linux/device.h> |
| 39 | #include <linux/string.h> | 40 | #include <linux/string.h> |
| 40 | #include <linux/sched.h> | ||
| 41 | #include <linux/mutex.h> | 41 | #include <linux/mutex.h> |
| 42 | #include <linux/unwind.h> | 42 | #include <linux/unwind.h> |
| 43 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
| @@ -790,6 +790,19 @@ static struct module_attribute refcnt = { | |||
| 790 | .show = show_refcnt, | 790 | .show = show_refcnt, |
| 791 | }; | 791 | }; |
| 792 | 792 | ||
| 793 | void module_put(struct module *module) | ||
| 794 | { | ||
| 795 | if (module) { | ||
| 796 | unsigned int cpu = get_cpu(); | ||
| 797 | local_dec(&module->ref[cpu].count); | ||
| 798 | /* Maybe they're waiting for us to drop reference? */ | ||
| 799 | if (unlikely(!module_is_live(module))) | ||
| 800 | wake_up_process(module->waiter); | ||
| 801 | put_cpu(); | ||
| 802 | } | ||
| 803 | } | ||
| 804 | EXPORT_SYMBOL(module_put); | ||
| 805 | |||
| 793 | #else /* !CONFIG_MODULE_UNLOAD */ | 806 | #else /* !CONFIG_MODULE_UNLOAD */ |
| 794 | static void print_unload_info(struct seq_file *m, struct module *mod) | 807 | static void print_unload_info(struct seq_file *m, struct module *mod) |
| 795 | { | 808 | { |
| @@ -811,9 +824,34 @@ static inline void module_unload_init(struct module *mod) | |||
| 811 | } | 824 | } |
| 812 | #endif /* CONFIG_MODULE_UNLOAD */ | 825 | #endif /* CONFIG_MODULE_UNLOAD */ |
| 813 | 826 | ||
| 827 | static ssize_t show_initstate(struct module_attribute *mattr, | ||
| 828 | struct module *mod, char *buffer) | ||
| 829 | { | ||
| 830 | const char *state = "unknown"; | ||
| 831 | |||
| 832 | switch (mod->state) { | ||
| 833 | case MODULE_STATE_LIVE: | ||
| 834 | state = "live"; | ||
| 835 | break; | ||
| 836 | case MODULE_STATE_COMING: | ||
| 837 | state = "coming"; | ||
| 838 | break; | ||
| 839 | case MODULE_STATE_GOING: | ||
| 840 | state = "going"; | ||
| 841 | break; | ||
| 842 | } | ||
| 843 | return sprintf(buffer, "%s\n", state); | ||
| 844 | } | ||
| 845 | |||
| 846 | static struct module_attribute initstate = { | ||
| 847 | .attr = { .name = "initstate", .mode = 0444, .owner = THIS_MODULE }, | ||
| 848 | .show = show_initstate, | ||
| 849 | }; | ||
| 850 | |||
| 814 | static struct module_attribute *modinfo_attrs[] = { | 851 | static struct module_attribute *modinfo_attrs[] = { |
| 815 | &modinfo_version, | 852 | &modinfo_version, |
| 816 | &modinfo_srcversion, | 853 | &modinfo_srcversion, |
| 854 | &initstate, | ||
| 817 | #ifdef CONFIG_MODULE_UNLOAD | 855 | #ifdef CONFIG_MODULE_UNLOAD |
| 818 | &refcnt, | 856 | &refcnt, |
| 819 | #endif | 857 | #endif |
| @@ -1086,22 +1124,37 @@ static int mod_sysfs_setup(struct module *mod, | |||
| 1086 | goto out; | 1124 | goto out; |
| 1087 | kobj_set_kset_s(&mod->mkobj, module_subsys); | 1125 | kobj_set_kset_s(&mod->mkobj, module_subsys); |
| 1088 | mod->mkobj.mod = mod; | 1126 | mod->mkobj.mod = mod; |
| 1089 | err = kobject_register(&mod->mkobj.kobj); | 1127 | |
| 1128 | /* delay uevent until full sysfs population */ | ||
| 1129 | kobject_init(&mod->mkobj.kobj); | ||
| 1130 | err = kobject_add(&mod->mkobj.kobj); | ||
| 1090 | if (err) | 1131 | if (err) |
| 1091 | goto out; | 1132 | goto out; |
| 1092 | 1133 | ||
| 1134 | mod->drivers_dir = kobject_add_dir(&mod->mkobj.kobj, "drivers"); | ||
| 1135 | if (!mod->drivers_dir) { | ||
| 1136 | err = -ENOMEM; | ||
| 1137 | goto out_unreg; | ||
| 1138 | } | ||
| 1139 | |||
| 1093 | err = module_param_sysfs_setup(mod, kparam, num_params); | 1140 | err = module_param_sysfs_setup(mod, kparam, num_params); |
| 1094 | if (err) | 1141 | if (err) |
| 1095 | goto out_unreg; | 1142 | goto out_unreg_drivers; |
| 1096 | 1143 | ||
| 1097 | err = module_add_modinfo_attrs(mod); | 1144 | err = module_add_modinfo_attrs(mod); |
| 1098 | if (err) | 1145 | if (err) |
| 1099 | goto out_unreg; | 1146 | goto out_unreg_param; |
| 1100 | 1147 | ||
| 1148 | kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); | ||
| 1101 | return 0; | 1149 | return 0; |
| 1102 | 1150 | ||
| 1151 | out_unreg_param: | ||
| 1152 | module_param_sysfs_remove(mod); | ||
| 1153 | out_unreg_drivers: | ||
| 1154 | kobject_unregister(mod->drivers_dir); | ||
| 1103 | out_unreg: | 1155 | out_unreg: |
| 1104 | kobject_unregister(&mod->mkobj.kobj); | 1156 | kobject_del(&mod->mkobj.kobj); |
| 1157 | kobject_put(&mod->mkobj.kobj); | ||
| 1105 | out: | 1158 | out: |
| 1106 | return err; | 1159 | return err; |
| 1107 | } | 1160 | } |
| @@ -1110,6 +1163,7 @@ static void mod_kobject_remove(struct module *mod) | |||
| 1110 | { | 1163 | { |
| 1111 | module_remove_modinfo_attrs(mod); | 1164 | module_remove_modinfo_attrs(mod); |
| 1112 | module_param_sysfs_remove(mod); | 1165 | module_param_sysfs_remove(mod); |
| 1166 | kobject_unregister(mod->drivers_dir); | ||
| 1113 | 1167 | ||
| 1114 | kobject_unregister(&mod->mkobj.kobj); | 1168 | kobject_unregister(&mod->mkobj.kobj); |
| 1115 | } | 1169 | } |
| @@ -2182,7 +2236,7 @@ static int m_show(struct seq_file *m, void *p) | |||
| 2182 | Where refcount is a number or -, and deps is a comma-separated list | 2236 | Where refcount is a number or -, and deps is a comma-separated list |
| 2183 | of depends or -. | 2237 | of depends or -. |
| 2184 | */ | 2238 | */ |
| 2185 | struct seq_operations modules_op = { | 2239 | const struct seq_operations modules_op = { |
| 2186 | .start = m_start, | 2240 | .start = m_start, |
| 2187 | .next = m_next, | 2241 | .next = m_next, |
| 2188 | .stop = m_stop, | 2242 | .stop = m_stop, |
| @@ -2273,21 +2327,54 @@ void print_modules(void) | |||
| 2273 | printk("\n"); | 2327 | printk("\n"); |
| 2274 | } | 2328 | } |
| 2275 | 2329 | ||
| 2330 | static char *make_driver_name(struct device_driver *drv) | ||
| 2331 | { | ||
| 2332 | char *driver_name; | ||
| 2333 | |||
| 2334 | driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2, | ||
| 2335 | GFP_KERNEL); | ||
| 2336 | if (!driver_name) | ||
| 2337 | return NULL; | ||
| 2338 | |||
| 2339 | sprintf(driver_name, "%s:%s", drv->bus->name, drv->name); | ||
| 2340 | return driver_name; | ||
| 2341 | } | ||
| 2342 | |||
| 2276 | void module_add_driver(struct module *mod, struct device_driver *drv) | 2343 | void module_add_driver(struct module *mod, struct device_driver *drv) |
| 2277 | { | 2344 | { |
| 2345 | char *driver_name; | ||
| 2346 | int no_warn; | ||
| 2347 | |||
| 2278 | if (!mod || !drv) | 2348 | if (!mod || !drv) |
| 2279 | return; | 2349 | return; |
| 2280 | 2350 | ||
| 2281 | /* Don't check return code; this call is idempotent */ | 2351 | /* Don't check return codes; these calls are idempotent */ |
| 2282 | sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); | 2352 | no_warn = sysfs_create_link(&drv->kobj, &mod->mkobj.kobj, "module"); |
| 2353 | driver_name = make_driver_name(drv); | ||
| 2354 | if (driver_name) { | ||
| 2355 | no_warn = sysfs_create_link(mod->drivers_dir, &drv->kobj, | ||
| 2356 | driver_name); | ||
| 2357 | kfree(driver_name); | ||
| 2358 | } | ||
| 2283 | } | 2359 | } |
| 2284 | EXPORT_SYMBOL(module_add_driver); | 2360 | EXPORT_SYMBOL(module_add_driver); |
| 2285 | 2361 | ||
| 2286 | void module_remove_driver(struct device_driver *drv) | 2362 | void module_remove_driver(struct device_driver *drv) |
| 2287 | { | 2363 | { |
| 2364 | char *driver_name; | ||
| 2365 | |||
| 2288 | if (!drv) | 2366 | if (!drv) |
| 2289 | return; | 2367 | return; |
| 2368 | |||
| 2290 | sysfs_remove_link(&drv->kobj, "module"); | 2369 | sysfs_remove_link(&drv->kobj, "module"); |
| 2370 | if (drv->owner && drv->owner->drivers_dir) { | ||
| 2371 | driver_name = make_driver_name(drv); | ||
| 2372 | if (driver_name) { | ||
| 2373 | sysfs_remove_link(drv->owner->drivers_dir, | ||
| 2374 | driver_name); | ||
| 2375 | kfree(driver_name); | ||
| 2376 | } | ||
| 2377 | } | ||
| 2291 | } | 2378 | } |
| 2292 | EXPORT_SYMBOL(module_remove_driver); | 2379 | EXPORT_SYMBOL(module_remove_driver); |
| 2293 | 2380 | ||
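
Besides the sysfs additions (the initstate attribute and the per-module drivers/ directory of bus:driver symlinks), module.c now carries an out-of-line module_put() that wakes a waiter when the last reference drops. Its usual pairing with try_module_get() is sketched below; struct backend_ops and its fields are assumptions standing in for any ops table that records the implementing module in ->owner.

    #include <linux/module.h>
    #include <linux/errno.h>

    /* Hypothetical ops table; many subsystems record the providing module like this. */
    struct backend_ops {
        struct module *owner;
        int (*do_something)(void);
    };

    static int call_into_backend(struct backend_ops *ops)
    {
        int ret;

        if (!try_module_get(ops->owner))    /* fails while the module is unloading */
            return -ENODEV;

        ret = ops->do_something();

        module_put(ops->owner);             /* last put may wake delete_module()'s waiter */
        return ret;
    }
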
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 18651641a7b5..841539d72c55 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
| @@ -77,6 +77,9 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
| 77 | 77 | ||
| 78 | void debug_mutex_unlock(struct mutex *lock) | 78 | void debug_mutex_unlock(struct mutex *lock) |
| 79 | { | 79 | { |
| 80 | if (unlikely(!debug_locks)) | ||
| 81 | return; | ||
| 82 | |||
| 80 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); | 83 | DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); |
| 81 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); | 84 | DEBUG_LOCKS_WARN_ON(lock->magic != lock); |
| 82 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); | 85 | DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 8c71cf72a497..e7cbbb82765b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
| @@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) | |||
| 206 | } | 206 | } |
| 207 | 207 | ||
| 208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 208 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
| 209 | |||
| 210 | int __sched | ||
| 211 | mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | ||
| 212 | { | ||
| 213 | might_sleep(); | ||
| 214 | return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass); | ||
| 215 | } | ||
| 216 | |||
| 217 | EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); | ||
| 209 | #endif | 218 | #endif |
| 210 | 219 | ||
| 211 | /* | 220 | /* |
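
mutex.c gains mutex_lock_interruptible_nested(), the interruptible counterpart of mutex_lock_nested() for code that legitimately holds two mutexes of the same lockdep class. A hedged usage sketch follows; the parent/child pairing is invented for illustration.

    #include <linux/mutex.h>

    /* Take two same-class mutexes in a fixed order; annotate the second
     * acquisition so lockdep does not report it as recursive locking.
     * Returns 0 on success or -EINTR if a signal interrupted the sleep. */
    static int lock_pair(struct mutex *parent, struct mutex *child)
    {
        int err;

        err = mutex_lock_interruptible(parent);
        if (err)
            return err;

        err = mutex_lock_interruptible_nested(child, SINGLE_DEPTH_NESTING);
        if (err) {
            mutex_unlock(parent);
            return err;
        }
        return 0;
    }
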
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 674aceb7335a..f5b9ee6f6bbb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -17,8 +17,9 @@ | |||
| 17 | #include <linux/version.h> | 17 | #include <linux/version.h> |
| 18 | #include <linux/nsproxy.h> | 18 | #include <linux/nsproxy.h> |
| 19 | #include <linux/init_task.h> | 19 | #include <linux/init_task.h> |
| 20 | #include <linux/namespace.h> | 20 | #include <linux/mnt_namespace.h> |
| 21 | #include <linux/utsname.h> | 21 | #include <linux/utsname.h> |
| 22 | #include <linux/pid_namespace.h> | ||
| 22 | 23 | ||
| 23 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); | 24 | struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); |
| 24 | 25 | ||
| @@ -60,12 +61,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig) | |||
| 60 | struct nsproxy *ns = clone_namespaces(orig); | 61 | struct nsproxy *ns = clone_namespaces(orig); |
| 61 | 62 | ||
| 62 | if (ns) { | 63 | if (ns) { |
| 63 | if (ns->namespace) | 64 | if (ns->mnt_ns) |
| 64 | get_namespace(ns->namespace); | 65 | get_mnt_ns(ns->mnt_ns); |
| 65 | if (ns->uts_ns) | 66 | if (ns->uts_ns) |
| 66 | get_uts_ns(ns->uts_ns); | 67 | get_uts_ns(ns->uts_ns); |
| 67 | if (ns->ipc_ns) | 68 | if (ns->ipc_ns) |
| 68 | get_ipc_ns(ns->ipc_ns); | 69 | get_ipc_ns(ns->ipc_ns); |
| 70 | if (ns->pid_ns) | ||
| 71 | get_pid_ns(ns->pid_ns); | ||
| 69 | } | 72 | } |
| 70 | 73 | ||
| 71 | return ns; | 74 | return ns; |
| @@ -97,7 +100,7 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
| 97 | 100 | ||
| 98 | tsk->nsproxy = new_ns; | 101 | tsk->nsproxy = new_ns; |
| 99 | 102 | ||
| 100 | err = copy_namespace(flags, tsk); | 103 | err = copy_mnt_ns(flags, tsk); |
| 101 | if (err) | 104 | if (err) |
| 102 | goto out_ns; | 105 | goto out_ns; |
| 103 | 106 | ||
| @@ -109,16 +112,23 @@ int copy_namespaces(int flags, struct task_struct *tsk) | |||
| 109 | if (err) | 112 | if (err) |
| 110 | goto out_ipc; | 113 | goto out_ipc; |
| 111 | 114 | ||
| 115 | err = copy_pid_ns(flags, tsk); | ||
| 116 | if (err) | ||
| 117 | goto out_pid; | ||
| 118 | |||
| 112 | out: | 119 | out: |
| 113 | put_nsproxy(old_ns); | 120 | put_nsproxy(old_ns); |
| 114 | return err; | 121 | return err; |
| 115 | 122 | ||
| 123 | out_pid: | ||
| 124 | if (new_ns->ipc_ns) | ||
| 125 | put_ipc_ns(new_ns->ipc_ns); | ||
| 116 | out_ipc: | 126 | out_ipc: |
| 117 | if (new_ns->uts_ns) | 127 | if (new_ns->uts_ns) |
| 118 | put_uts_ns(new_ns->uts_ns); | 128 | put_uts_ns(new_ns->uts_ns); |
| 119 | out_uts: | 129 | out_uts: |
| 120 | if (new_ns->namespace) | 130 | if (new_ns->mnt_ns) |
| 121 | put_namespace(new_ns->namespace); | 131 | put_mnt_ns(new_ns->mnt_ns); |
| 122 | out_ns: | 132 | out_ns: |
| 123 | tsk->nsproxy = old_ns; | 133 | tsk->nsproxy = old_ns; |
| 124 | kfree(new_ns); | 134 | kfree(new_ns); |
| @@ -127,11 +137,13 @@ out_ns: | |||
| 127 | 137 | ||
| 128 | void free_nsproxy(struct nsproxy *ns) | 138 | void free_nsproxy(struct nsproxy *ns) |
| 129 | { | 139 | { |
| 130 | if (ns->namespace) | 140 | if (ns->mnt_ns) |
| 131 | put_namespace(ns->namespace); | 141 | put_mnt_ns(ns->mnt_ns); |
| 132 | if (ns->uts_ns) | 142 | if (ns->uts_ns) |
| 133 | put_uts_ns(ns->uts_ns); | 143 | put_uts_ns(ns->uts_ns); |
| 134 | if (ns->ipc_ns) | 144 | if (ns->ipc_ns) |
| 135 | put_ipc_ns(ns->ipc_ns); | 145 | put_ipc_ns(ns->ipc_ns); |
| 136 | kfree(ns); | 146 | if (ns->pid_ns) |
| 147 | put_pid_ns(ns->pid_ns); | ||
| 148 | kfree(ns); | ||
| 137 | } | 149 | } |
diff --git a/kernel/params.c b/kernel/params.c index f406655d6653..718945da8f58 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -143,9 +143,15 @@ int parse_args(const char *name, | |||
| 143 | 143 | ||
| 144 | while (*args) { | 144 | while (*args) { |
| 145 | int ret; | 145 | int ret; |
| 146 | int irq_was_disabled; | ||
| 146 | 147 | ||
| 147 | args = next_arg(args, ¶m, &val); | 148 | args = next_arg(args, ¶m, &val); |
| 149 | irq_was_disabled = irqs_disabled(); | ||
| 148 | ret = parse_one(param, val, params, num, unknown); | 150 | ret = parse_one(param, val, params, num, unknown); |
| 151 | if (irq_was_disabled && !irqs_disabled()) { | ||
| 152 | printk(KERN_WARNING "parse_args(): option '%s' enabled " | ||
| 153 | "irq's!\n", param); | ||
| 154 | } | ||
| 149 | switch (ret) { | 155 | switch (ret) { |
| 150 | case -ENOENT: | 156 | case -ENOENT: |
| 151 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", | 157 | printk(KERN_ERR "%s: Unknown parameter `%s'\n", |
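
The params.c hunk adds a sanity check: if interrupts were disabled before an option handler ran and are enabled afterwards, parse_args() names the offending parameter. Below is a hypothetical early handler that would trip it by using the unconditional _irq lock variants during early boot; the option name and lock are invented.

    #include <linux/init.h>
    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);

    /* Hypothetical handler: spin_unlock_irq() unconditionally re-enables
     * interrupts, but early parameter parsing runs with them disabled. */
    static int __init example_setup(char *str)
    {
        spin_lock_irq(&example_lock);
        /* ... record the option ... */
        spin_unlock_irq(&example_lock);  /* should use spin_lock_irqsave/irqrestore */
        return 1;
    }
    __setup("example_opt=", example_setup);
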
diff --git a/kernel/pid.c b/kernel/pid.c index b914392085f9..2efe9d8d367b 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -26,12 +26,12 @@ | |||
| 26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| 27 | #include <linux/bootmem.h> | 27 | #include <linux/bootmem.h> |
| 28 | #include <linux/hash.h> | 28 | #include <linux/hash.h> |
| 29 | #include <linux/pspace.h> | 29 | #include <linux/pid_namespace.h> |
| 30 | 30 | ||
| 31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) | 31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) |
| 32 | static struct hlist_head *pid_hash; | 32 | static struct hlist_head *pid_hash; |
| 33 | static int pidhash_shift; | 33 | static int pidhash_shift; |
| 34 | static kmem_cache_t *pid_cachep; | 34 | static struct kmem_cache *pid_cachep; |
| 35 | 35 | ||
| 36 | int pid_max = PID_MAX_DEFAULT; | 36 | int pid_max = PID_MAX_DEFAULT; |
| 37 | 37 | ||
| @@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT; | |||
| 43 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 43 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
| 44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) | 44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) |
| 45 | 45 | ||
| 46 | static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | 46 | static inline int mk_pid(struct pid_namespace *pid_ns, |
| 47 | struct pidmap *map, int off) | ||
| 47 | { | 48 | { |
| 48 | return (map - pspace->pidmap)*BITS_PER_PAGE + off; | 49 | return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; |
| 49 | } | 50 | } |
| 50 | 51 | ||
| 51 | #define find_next_offset(map, off) \ | 52 | #define find_next_offset(map, off) \ |
| @@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | |||
| 57 | * value does not cause lots of bitmaps to be allocated, but | 58 | * value does not cause lots of bitmaps to be allocated, but |
| 58 | * the scheme scales to up to 4 million PIDs, runtime. | 59 | * the scheme scales to up to 4 million PIDs, runtime. |
| 59 | */ | 60 | */ |
| 60 | struct pspace init_pspace = { | 61 | struct pid_namespace init_pid_ns = { |
| 62 | .kref = { | ||
| 63 | .refcount = ATOMIC_INIT(2), | ||
| 64 | }, | ||
| 61 | .pidmap = { | 65 | .pidmap = { |
| 62 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } | 66 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } |
| 63 | }, | 67 | }, |
| 64 | .last_pid = 0 | 68 | .last_pid = 0, |
| 69 | .child_reaper = &init_task | ||
| 65 | }; | 70 | }; |
| 66 | 71 | ||
| 67 | /* | 72 | /* |
| @@ -80,25 +85,25 @@ struct pspace init_pspace = { | |||
| 80 | 85 | ||
| 81 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | 86 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); |
| 82 | 87 | ||
| 83 | static fastcall void free_pidmap(struct pspace *pspace, int pid) | 88 | static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) |
| 84 | { | 89 | { |
| 85 | struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; | 90 | struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; |
| 86 | int offset = pid & BITS_PER_PAGE_MASK; | 91 | int offset = pid & BITS_PER_PAGE_MASK; |
| 87 | 92 | ||
| 88 | clear_bit(offset, map->page); | 93 | clear_bit(offset, map->page); |
| 89 | atomic_inc(&map->nr_free); | 94 | atomic_inc(&map->nr_free); |
| 90 | } | 95 | } |
| 91 | 96 | ||
| 92 | static int alloc_pidmap(struct pspace *pspace) | 97 | static int alloc_pidmap(struct pid_namespace *pid_ns) |
| 93 | { | 98 | { |
| 94 | int i, offset, max_scan, pid, last = pspace->last_pid; | 99 | int i, offset, max_scan, pid, last = pid_ns->last_pid; |
| 95 | struct pidmap *map; | 100 | struct pidmap *map; |
| 96 | 101 | ||
| 97 | pid = last + 1; | 102 | pid = last + 1; |
| 98 | if (pid >= pid_max) | 103 | if (pid >= pid_max) |
| 99 | pid = RESERVED_PIDS; | 104 | pid = RESERVED_PIDS; |
| 100 | offset = pid & BITS_PER_PAGE_MASK; | 105 | offset = pid & BITS_PER_PAGE_MASK; |
| 101 | map = &pspace->pidmap[pid/BITS_PER_PAGE]; | 106 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; |
| 102 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; | 107 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; |
| 103 | for (i = 0; i <= max_scan; ++i) { | 108 | for (i = 0; i <= max_scan; ++i) { |
| 104 | if (unlikely(!map->page)) { | 109 | if (unlikely(!map->page)) { |
| @@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace) | |||
| 120 | do { | 125 | do { |
| 121 | if (!test_and_set_bit(offset, map->page)) { | 126 | if (!test_and_set_bit(offset, map->page)) { |
| 122 | atomic_dec(&map->nr_free); | 127 | atomic_dec(&map->nr_free); |
| 123 | pspace->last_pid = pid; | 128 | pid_ns->last_pid = pid; |
| 124 | return pid; | 129 | return pid; |
| 125 | } | 130 | } |
| 126 | offset = find_next_offset(map, offset); | 131 | offset = find_next_offset(map, offset); |
| 127 | pid = mk_pid(pspace, map, offset); | 132 | pid = mk_pid(pid_ns, map, offset); |
| 128 | /* | 133 | /* |
| 129 | * find_next_offset() found a bit, the pid from it | 134 | * find_next_offset() found a bit, the pid from it |
| 130 | * is in-bounds, and if we fell back to the last | 135 | * is in-bounds, and if we fell back to the last |
| @@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace) | |||
| 135 | (i != max_scan || pid < last || | 140 | (i != max_scan || pid < last || |
| 136 | !((last+1) & BITS_PER_PAGE_MASK))); | 141 | !((last+1) & BITS_PER_PAGE_MASK))); |
| 137 | } | 142 | } |
| 138 | if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 143 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
| 139 | ++map; | 144 | ++map; |
| 140 | offset = 0; | 145 | offset = 0; |
| 141 | } else { | 146 | } else { |
| 142 | map = &pspace->pidmap[0]; | 147 | map = &pid_ns->pidmap[0]; |
| 143 | offset = RESERVED_PIDS; | 148 | offset = RESERVED_PIDS; |
| 144 | if (unlikely(last == offset)) | 149 | if (unlikely(last == offset)) |
| 145 | break; | 150 | break; |
| 146 | } | 151 | } |
| 147 | pid = mk_pid(pspace, map, offset); | 152 | pid = mk_pid(pid_ns, map, offset); |
| 148 | } | 153 | } |
| 149 | return -1; | 154 | return -1; |
| 150 | } | 155 | } |
| 151 | 156 | ||
| 152 | static int next_pidmap(struct pspace *pspace, int last) | 157 | static int next_pidmap(struct pid_namespace *pid_ns, int last) |
| 153 | { | 158 | { |
| 154 | int offset; | 159 | int offset; |
| 155 | struct pidmap *map, *end; | 160 | struct pidmap *map, *end; |
| 156 | 161 | ||
| 157 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 162 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
| 158 | map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; | 163 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
| 159 | end = &pspace->pidmap[PIDMAP_ENTRIES]; | 164 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
| 160 | for (; map < end; map++, offset = 0) { | 165 | for (; map < end; map++, offset = 0) { |
| 161 | if (unlikely(!map->page)) | 166 | if (unlikely(!map->page)) |
| 162 | continue; | 167 | continue; |
| 163 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); | 168 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); |
| 164 | if (offset < BITS_PER_PAGE) | 169 | if (offset < BITS_PER_PAGE) |
| 165 | return mk_pid(pspace, map, offset); | 170 | return mk_pid(pid_ns, map, offset); |
| 166 | } | 171 | } |
| 167 | return -1; | 172 | return -1; |
| 168 | } | 173 | } |
| @@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid) | |||
| 192 | hlist_del_rcu(&pid->pid_chain); | 197 | hlist_del_rcu(&pid->pid_chain); |
| 193 | spin_unlock_irqrestore(&pidmap_lock, flags); | 198 | spin_unlock_irqrestore(&pidmap_lock, flags); |
| 194 | 199 | ||
| 195 | free_pidmap(&init_pspace, pid->nr); | 200 | free_pidmap(current->nsproxy->pid_ns, pid->nr); |
| 196 | call_rcu(&pid->rcu, delayed_put_pid); | 201 | call_rcu(&pid->rcu, delayed_put_pid); |
| 197 | } | 202 | } |
| 198 | 203 | ||
| @@ -206,7 +211,7 @@ struct pid *alloc_pid(void) | |||
| 206 | if (!pid) | 211 | if (!pid) |
| 207 | goto out; | 212 | goto out; |
| 208 | 213 | ||
| 209 | nr = alloc_pidmap(&init_pspace); | 214 | nr = alloc_pidmap(current->nsproxy->pid_ns); |
| 210 | if (nr < 0) | 215 | if (nr < 0) |
| 211 | goto out_free; | 216 | goto out_free; |
| 212 | 217 | ||
| @@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr) | |||
| 348 | pid = find_pid(nr); | 353 | pid = find_pid(nr); |
| 349 | if (pid) | 354 | if (pid) |
| 350 | break; | 355 | break; |
| 351 | nr = next_pidmap(&init_pspace, nr); | 356 | nr = next_pidmap(current->nsproxy->pid_ns, nr); |
| 352 | } while (nr > 0); | 357 | } while (nr > 0); |
| 353 | 358 | ||
| 354 | return pid; | 359 | return pid; |
| 355 | } | 360 | } |
| 356 | EXPORT_SYMBOL_GPL(find_get_pid); | 361 | EXPORT_SYMBOL_GPL(find_get_pid); |
| 357 | 362 | ||
| 363 | int copy_pid_ns(int flags, struct task_struct *tsk) | ||
| 364 | { | ||
| 365 | struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; | ||
| 366 | int err = 0; | ||
| 367 | |||
| 368 | if (!old_ns) | ||
| 369 | return 0; | ||
| 370 | |||
| 371 | get_pid_ns(old_ns); | ||
| 372 | return err; | ||
| 373 | } | ||
| 374 | |||
| 375 | void free_pid_ns(struct kref *kref) | ||
| 376 | { | ||
| 377 | struct pid_namespace *ns; | ||
| 378 | |||
| 379 | ns = container_of(kref, struct pid_namespace, kref); | ||
| 380 | kfree(ns); | ||
| 381 | } | ||
| 382 | |||
| 358 | /* | 383 | /* |
| 359 | * The pid hash table is scaled according to the amount of memory in the | 384 | * The pid hash table is scaled according to the amount of memory in the |
| 360 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or | 385 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or |
| @@ -382,10 +407,10 @@ void __init pidhash_init(void) | |||
| 382 | 407 | ||
| 383 | void __init pidmap_init(void) | 408 | void __init pidmap_init(void) |
| 384 | { | 409 | { |
| 385 | init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 410 | init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
| 386 | /* Reserve PID 0. We never call free_pidmap(0) */ | 411 | /* Reserve PID 0. We never call free_pidmap(0) */ |
| 387 | set_bit(0, init_pspace.pidmap[0].page); | 412 | set_bit(0, init_pid_ns.pidmap[0].page); |
| 388 | atomic_dec(&init_pspace.pidmap[0].nr_free); | 413 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
| 389 | 414 | ||
| 390 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), | 415 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), |
| 391 | __alignof__(struct pid), | 416 | __alignof__(struct pid), |
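The new copy_pid_ns()/free_pid_ns() pair in kernel/pid.c follows the usual kref lifetime pattern: cloning takes a reference on the parent's namespace, and the release callback handed to kref_put() frees the structure once the last reference drops. A minimal sketch of that pattern with a hypothetical my_ns type (not the kernel's actual pid_namespace helpers):

```c
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct my_ns {
	struct kref kref;
	/* namespace-private data would live here */
};

/* release callback: invoked by kref_put() when the count reaches zero */
static void my_ns_release(struct kref *kref)
{
	struct my_ns *ns = container_of(kref, struct my_ns, kref);

	kfree(ns);
}

static inline struct my_ns *get_my_ns(struct my_ns *ns)
{
	kref_get(&ns->kref);		/* e.g. on clone/unshare */
	return ns;
}

static inline void put_my_ns(struct my_ns *ns)
{
	kref_put(&ns->kref, my_ns_release);
}
```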
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 9cbb5d1be06f..5fe87de10ff0 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
| @@ -70,7 +70,7 @@ | |||
| 70 | /* | 70 | /* |
| 71 | * Lets keep our timers in a slab cache :-) | 71 | * Lets keep our timers in a slab cache :-) |
| 72 | */ | 72 | */ |
| 73 | static kmem_cache_t *posix_timers_cache; | 73 | static struct kmem_cache *posix_timers_cache; |
| 74 | static struct idr posix_timers_id; | 74 | static struct idr posix_timers_id; |
| 75 | static DEFINE_SPINLOCK(idr_lock); | 75 | static DEFINE_SPINLOCK(idr_lock); |
| 76 | 76 | ||
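The posix-timers hunk is purely a rename from the old kmem_cache_t typedef, which was being phased out, to the underlying struct kmem_cache; code that uses a slab cache is otherwise unchanged. A minimal sketch with the new spelling (hypothetical names, cache creation assumed to happen elsewhere at init time):

```c
#include <linux/slab.h>

struct my_timer {
	int id;
};

static struct kmem_cache *my_timer_cachep;	/* was: kmem_cache_t * */

static struct my_timer *my_timer_alloc(void)
{
	/* allocation and free take the cache pointer exactly as before */
	return kmem_cache_alloc(my_timer_cachep, GFP_KERNEL);
}

static void my_timer_free(struct my_timer *t)
{
	kmem_cache_free(my_timer_cachep, t);
}
```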
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 825068ca3479..ed296225dcd4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -20,13 +20,14 @@ config PM | |||
| 20 | sending the processor to sleep and saving power. | 20 | sending the processor to sleep and saving power. |
| 21 | 21 | ||
| 22 | config PM_LEGACY | 22 | config PM_LEGACY |
| 23 | bool "Legacy Power Management API" | 23 | bool "Legacy Power Management API (DEPRECATED)" |
| 24 | depends on PM | 24 | depends on PM |
| 25 | default y | 25 | default n |
| 26 | ---help--- | 26 | ---help--- |
| 27 | Support for pm_register() and friends. | 27 | Support for pm_register() and friends. This old API is obsoleted |
| 28 | by the driver model. | ||
| 28 | 29 | ||
| 29 | If unsure, say Y. | 30 | If unsure, say N. |
| 30 | 31 | ||
| 31 | config PM_DEBUG | 32 | config PM_DEBUG |
| 32 | bool "Power Management Debug Support" | 33 | bool "Power Management Debug Support" |
| @@ -78,7 +79,7 @@ config PM_SYSFS_DEPRECATED | |||
| 78 | 79 | ||
| 79 | config SOFTWARE_SUSPEND | 80 | config SOFTWARE_SUSPEND |
| 80 | bool "Software Suspend" | 81 | bool "Software Suspend" |
| 81 | depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP)) | 82 | depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)) |
| 82 | ---help--- | 83 | ---help--- |
| 83 | Enable the possibility of suspending the machine. | 84 | Enable the possibility of suspending the machine. |
| 84 | It doesn't need ACPI or APM. | 85 | It doesn't need ACPI or APM. |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index b1fb7866b0b3..88fc5d7ac737 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
| @@ -20,6 +20,7 @@ | |||
| 20 | #include <linux/pm.h> | 20 | #include <linux/pm.h> |
| 21 | #include <linux/console.h> | 21 | #include <linux/console.h> |
| 22 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
| 23 | #include <linux/freezer.h> | ||
| 23 | 24 | ||
| 24 | #include "power.h" | 25 | #include "power.h" |
| 25 | 26 | ||
| @@ -27,6 +28,23 @@ | |||
| 27 | static int noresume = 0; | 28 | static int noresume = 0; |
| 28 | char resume_file[256] = CONFIG_PM_STD_PARTITION; | 29 | char resume_file[256] = CONFIG_PM_STD_PARTITION; |
| 29 | dev_t swsusp_resume_device; | 30 | dev_t swsusp_resume_device; |
| 31 | sector_t swsusp_resume_block; | ||
| 32 | |||
| 33 | /** | ||
| 34 | * platform_prepare - prepare the machine for hibernation using the | ||
| 35 | * platform driver if so configured and return an error code if it fails | ||
| 36 | */ | ||
| 37 | |||
| 38 | static inline int platform_prepare(void) | ||
| 39 | { | ||
| 40 | int error = 0; | ||
| 41 | |||
| 42 | if (pm_disk_mode == PM_DISK_PLATFORM) { | ||
| 43 | if (pm_ops && pm_ops->prepare) | ||
| 44 | error = pm_ops->prepare(PM_SUSPEND_DISK); | ||
| 45 | } | ||
| 46 | return error; | ||
| 47 | } | ||
| 30 | 48 | ||
| 31 | /** | 49 | /** |
| 32 | * power_down - Shut machine down for hibernate. | 50 | * power_down - Shut machine down for hibernate. |
| @@ -40,13 +58,13 @@ dev_t swsusp_resume_device; | |||
| 40 | 58 | ||
| 41 | static void power_down(suspend_disk_method_t mode) | 59 | static void power_down(suspend_disk_method_t mode) |
| 42 | { | 60 | { |
| 43 | int error = 0; | ||
| 44 | |||
| 45 | switch(mode) { | 61 | switch(mode) { |
| 46 | case PM_DISK_PLATFORM: | 62 | case PM_DISK_PLATFORM: |
| 47 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | 63 | if (pm_ops && pm_ops->enter) { |
| 48 | error = pm_ops->enter(PM_SUSPEND_DISK); | 64 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); |
| 49 | break; | 65 | pm_ops->enter(PM_SUSPEND_DISK); |
| 66 | break; | ||
| 67 | } | ||
| 50 | case PM_DISK_SHUTDOWN: | 68 | case PM_DISK_SHUTDOWN: |
| 51 | kernel_power_off(); | 69 | kernel_power_off(); |
| 52 | break; | 70 | break; |
| @@ -90,12 +108,18 @@ static int prepare_processes(void) | |||
| 90 | goto thaw; | 108 | goto thaw; |
| 91 | } | 109 | } |
| 92 | 110 | ||
| 111 | error = platform_prepare(); | ||
| 112 | if (error) | ||
| 113 | goto thaw; | ||
| 114 | |||
| 93 | /* Free memory before shutting down devices. */ | 115 | /* Free memory before shutting down devices. */ |
| 94 | if (!(error = swsusp_shrink_memory())) | 116 | if (!(error = swsusp_shrink_memory())) |
| 95 | return 0; | 117 | return 0; |
| 96 | thaw: | 118 | |
| 119 | platform_finish(); | ||
| 120 | thaw: | ||
| 97 | thaw_processes(); | 121 | thaw_processes(); |
| 98 | enable_cpus: | 122 | enable_cpus: |
| 99 | enable_nonboot_cpus(); | 123 | enable_nonboot_cpus(); |
| 100 | pm_restore_console(); | 124 | pm_restore_console(); |
| 101 | return error; | 125 | return error; |
| @@ -127,7 +151,7 @@ int pm_suspend_disk(void) | |||
| 127 | return error; | 151 | return error; |
| 128 | 152 | ||
| 129 | if (pm_disk_mode == PM_DISK_TESTPROC) | 153 | if (pm_disk_mode == PM_DISK_TESTPROC) |
| 130 | goto Thaw; | 154 | return 0; |
| 131 | 155 | ||
| 132 | suspend_console(); | 156 | suspend_console(); |
| 133 | error = device_suspend(PMSG_FREEZE); | 157 | error = device_suspend(PMSG_FREEZE); |
| @@ -189,10 +213,10 @@ static int software_resume(void) | |||
| 189 | { | 213 | { |
| 190 | int error; | 214 | int error; |
| 191 | 215 | ||
| 192 | down(&pm_sem); | 216 | mutex_lock(&pm_mutex); |
| 193 | if (!swsusp_resume_device) { | 217 | if (!swsusp_resume_device) { |
| 194 | if (!strlen(resume_file)) { | 218 | if (!strlen(resume_file)) { |
| 195 | up(&pm_sem); | 219 | mutex_unlock(&pm_mutex); |
| 196 | return -ENOENT; | 220 | return -ENOENT; |
| 197 | } | 221 | } |
| 198 | swsusp_resume_device = name_to_dev_t(resume_file); | 222 | swsusp_resume_device = name_to_dev_t(resume_file); |
| @@ -207,7 +231,7 @@ static int software_resume(void) | |||
| 207 | * FIXME: If noresume is specified, we need to find the partition | 231 | * FIXME: If noresume is specified, we need to find the partition |
| 208 | * and reset it back to normal swap space. | 232 | * and reset it back to normal swap space. |
| 209 | */ | 233 | */ |
| 210 | up(&pm_sem); | 234 | mutex_unlock(&pm_mutex); |
| 211 | return 0; | 235 | return 0; |
| 212 | } | 236 | } |
| 213 | 237 | ||
| @@ -251,7 +275,7 @@ static int software_resume(void) | |||
| 251 | unprepare_processes(); | 275 | unprepare_processes(); |
| 252 | Done: | 276 | Done: |
| 253 | /* For success case, the suspend path will release the lock */ | 277 | /* For success case, the suspend path will release the lock */ |
| 254 | up(&pm_sem); | 278 | mutex_unlock(&pm_mutex); |
| 255 | pr_debug("PM: Resume from disk failed.\n"); | 279 | pr_debug("PM: Resume from disk failed.\n"); |
| 256 | return 0; | 280 | return 0; |
| 257 | } | 281 | } |
| @@ -312,7 +336,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) | |||
| 312 | p = memchr(buf, '\n', n); | 336 | p = memchr(buf, '\n', n); |
| 313 | len = p ? p - buf : n; | 337 | len = p ? p - buf : n; |
| 314 | 338 | ||
| 315 | down(&pm_sem); | 339 | mutex_lock(&pm_mutex); |
| 316 | for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { | 340 | for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) { |
| 317 | if (!strncmp(buf, pm_disk_modes[i], len)) { | 341 | if (!strncmp(buf, pm_disk_modes[i], len)) { |
| 318 | mode = i; | 342 | mode = i; |
| @@ -336,7 +360,7 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n) | |||
| 336 | 360 | ||
| 337 | pr_debug("PM: suspend-to-disk mode set to '%s'\n", | 361 | pr_debug("PM: suspend-to-disk mode set to '%s'\n", |
| 338 | pm_disk_modes[mode]); | 362 | pm_disk_modes[mode]); |
| 339 | up(&pm_sem); | 363 | mutex_unlock(&pm_mutex); |
| 340 | return error ? error : n; | 364 | return error ? error : n; |
| 341 | } | 365 | } |
| 342 | 366 | ||
| @@ -361,14 +385,14 @@ static ssize_t resume_store(struct subsystem *subsys, const char *buf, size_t n) | |||
| 361 | if (maj != MAJOR(res) || min != MINOR(res)) | 385 | if (maj != MAJOR(res) || min != MINOR(res)) |
| 362 | goto out; | 386 | goto out; |
| 363 | 387 | ||
| 364 | down(&pm_sem); | 388 | mutex_lock(&pm_mutex); |
| 365 | swsusp_resume_device = res; | 389 | swsusp_resume_device = res; |
| 366 | up(&pm_sem); | 390 | mutex_unlock(&pm_mutex); |
| 367 | printk("Attempting manual resume\n"); | 391 | printk("Attempting manual resume\n"); |
| 368 | noresume = 0; | 392 | noresume = 0; |
| 369 | software_resume(); | 393 | software_resume(); |
| 370 | ret = n; | 394 | ret = n; |
| 371 | out: | 395 | out: |
| 372 | return ret; | 396 | return ret; |
| 373 | } | 397 | } |
| 374 | 398 | ||
| @@ -423,6 +447,19 @@ static int __init resume_setup(char *str) | |||
| 423 | return 1; | 447 | return 1; |
| 424 | } | 448 | } |
| 425 | 449 | ||
| 450 | static int __init resume_offset_setup(char *str) | ||
| 451 | { | ||
| 452 | unsigned long long offset; | ||
| 453 | |||
| 454 | if (noresume) | ||
| 455 | return 1; | ||
| 456 | |||
| 457 | if (sscanf(str, "%llu", &offset) == 1) | ||
| 458 | swsusp_resume_block = offset; | ||
| 459 | |||
| 460 | return 1; | ||
| 461 | } | ||
| 462 | |||
| 426 | static int __init noresume_setup(char *str) | 463 | static int __init noresume_setup(char *str) |
| 427 | { | 464 | { |
| 428 | noresume = 1; | 465 | noresume = 1; |
| @@ -430,4 +467,5 @@ static int __init noresume_setup(char *str) | |||
| 430 | } | 467 | } |
| 431 | 468 | ||
| 432 | __setup("noresume", noresume_setup); | 469 | __setup("noresume", noresume_setup); |
| 470 | __setup("resume_offset=", resume_offset_setup); | ||
| 433 | __setup("resume=", resume_setup); | 471 | __setup("resume=", resume_setup); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 873228c71dab..ff3a6182f5f0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | * | 8 | * |
| 9 | */ | 9 | */ |
| 10 | 10 | ||
| 11 | #include <linux/module.h> | ||
| 11 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
| 12 | #include <linux/kobject.h> | 13 | #include <linux/kobject.h> |
| 13 | #include <linux/string.h> | 14 | #include <linux/string.h> |
| @@ -18,16 +19,17 @@ | |||
| 18 | #include <linux/console.h> | 19 | #include <linux/console.h> |
| 19 | #include <linux/cpu.h> | 20 | #include <linux/cpu.h> |
| 20 | #include <linux/resume-trace.h> | 21 | #include <linux/resume-trace.h> |
| 22 | #include <linux/freezer.h> | ||
| 21 | 23 | ||
| 22 | #include "power.h" | 24 | #include "power.h" |
| 23 | 25 | ||
| 24 | /*This is just an arbitrary number */ | 26 | /*This is just an arbitrary number */ |
| 25 | #define FREE_PAGE_NUMBER (100) | 27 | #define FREE_PAGE_NUMBER (100) |
| 26 | 28 | ||
| 27 | DECLARE_MUTEX(pm_sem); | 29 | DEFINE_MUTEX(pm_mutex); |
| 28 | 30 | ||
| 29 | struct pm_ops *pm_ops; | 31 | struct pm_ops *pm_ops; |
| 30 | suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | 32 | suspend_disk_method_t pm_disk_mode = PM_DISK_PLATFORM; |
| 31 | 33 | ||
| 32 | /** | 34 | /** |
| 33 | * pm_set_ops - Set the global power method table. | 35 | * pm_set_ops - Set the global power method table. |
| @@ -36,9 +38,9 @@ suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN; | |||
| 36 | 38 | ||
| 37 | void pm_set_ops(struct pm_ops * ops) | 39 | void pm_set_ops(struct pm_ops * ops) |
| 38 | { | 40 | { |
| 39 | down(&pm_sem); | 41 | mutex_lock(&pm_mutex); |
| 40 | pm_ops = ops; | 42 | pm_ops = ops; |
| 41 | up(&pm_sem); | 43 | mutex_unlock(&pm_mutex); |
| 42 | } | 44 | } |
| 43 | 45 | ||
| 44 | 46 | ||
| @@ -182,7 +184,7 @@ static int enter_state(suspend_state_t state) | |||
| 182 | 184 | ||
| 183 | if (!valid_state(state)) | 185 | if (!valid_state(state)) |
| 184 | return -ENODEV; | 186 | return -ENODEV; |
| 185 | if (down_trylock(&pm_sem)) | 187 | if (!mutex_trylock(&pm_mutex)) |
| 186 | return -EBUSY; | 188 | return -EBUSY; |
| 187 | 189 | ||
| 188 | if (state == PM_SUSPEND_DISK) { | 190 | if (state == PM_SUSPEND_DISK) { |
| @@ -200,7 +202,7 @@ static int enter_state(suspend_state_t state) | |||
| 200 | pr_debug("PM: Finishing wakeup.\n"); | 202 | pr_debug("PM: Finishing wakeup.\n"); |
| 201 | suspend_finish(state); | 203 | suspend_finish(state); |
| 202 | Unlock: | 204 | Unlock: |
| 203 | up(&pm_sem); | 205 | mutex_unlock(&pm_mutex); |
| 204 | return error; | 206 | return error; |
| 205 | } | 207 | } |
| 206 | 208 | ||
| @@ -229,7 +231,7 @@ int pm_suspend(suspend_state_t state) | |||
| 229 | return -EINVAL; | 231 | return -EINVAL; |
| 230 | } | 232 | } |
| 231 | 233 | ||
| 232 | 234 | EXPORT_SYMBOL(pm_suspend); | |
| 233 | 235 | ||
| 234 | decl_subsys(power,NULL,NULL); | 236 | decl_subsys(power,NULL,NULL); |
| 235 | 237 | ||
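Across the power code a semaphore used as a mutex (pm_sem, taken with down()/up()) is converted to a real struct mutex, which is lighter and gains lock debugging support. A minimal sketch of the new locking pattern, including the trylock used by enter_state() (hypothetical names):

```c
#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(my_lock);		/* replaces DECLARE_MUTEX(my_sem) */

static int my_enter(void)
{
	/* mutex_trylock() returns 1 on success, unlike down_trylock(),
	 * which returns 0 on success -- hence the inverted test above */
	if (!mutex_trylock(&my_lock))
		return -EBUSY;

	/* ... critical section ... */

	mutex_unlock(&my_lock);
	return 0;
}

static void my_set_state(int state)
{
	mutex_lock(&my_lock);
	/* ... update shared state from 'state' ... */
	mutex_unlock(&my_lock);
}
```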
diff --git a/kernel/power/power.h b/kernel/power/power.h index bfe999f7b272..eb461b816bf4 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -22,7 +22,9 @@ static inline int pm_suspend_disk(void) | |||
| 22 | return -EPERM; | 22 | return -EPERM; |
| 23 | } | 23 | } |
| 24 | #endif | 24 | #endif |
| 25 | extern struct semaphore pm_sem; | 25 | |
| 26 | extern struct mutex pm_mutex; | ||
| 27 | |||
| 26 | #define power_attr(_name) \ | 28 | #define power_attr(_name) \ |
| 27 | static struct subsys_attribute _name##_attr = { \ | 29 | static struct subsys_attribute _name##_attr = { \ |
| 28 | .attr = { \ | 30 | .attr = { \ |
| @@ -42,6 +44,7 @@ extern const void __nosave_begin, __nosave_end; | |||
| 42 | extern unsigned long image_size; | 44 | extern unsigned long image_size; |
| 43 | extern int in_suspend; | 45 | extern int in_suspend; |
| 44 | extern dev_t swsusp_resume_device; | 46 | extern dev_t swsusp_resume_device; |
| 47 | extern sector_t swsusp_resume_block; | ||
| 45 | 48 | ||
| 46 | extern asmlinkage int swsusp_arch_suspend(void); | 49 | extern asmlinkage int swsusp_arch_suspend(void); |
| 47 | extern asmlinkage int swsusp_arch_resume(void); | 50 | extern asmlinkage int swsusp_arch_resume(void); |
| @@ -102,8 +105,18 @@ struct snapshot_handle { | |||
| 102 | extern unsigned int snapshot_additional_pages(struct zone *zone); | 105 | extern unsigned int snapshot_additional_pages(struct zone *zone); |
| 103 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); | 106 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); |
| 104 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); | 107 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); |
| 108 | extern void snapshot_write_finalize(struct snapshot_handle *handle); | ||
| 105 | extern int snapshot_image_loaded(struct snapshot_handle *handle); | 109 | extern int snapshot_image_loaded(struct snapshot_handle *handle); |
| 106 | extern void snapshot_free_unused_memory(struct snapshot_handle *handle); | 110 | |
| 111 | /* | ||
| 112 | * This structure is used to pass the values needed for the identification | ||
| 113 | * of the resume swap area from a user space to the kernel via the | ||
| 114 | * SNAPSHOT_SET_SWAP_AREA ioctl | ||
| 115 | */ | ||
| 116 | struct resume_swap_area { | ||
| 117 | loff_t offset; | ||
| 118 | u_int32_t dev; | ||
| 119 | } __attribute__((packed)); | ||
| 107 | 120 | ||
| 108 | #define SNAPSHOT_IOC_MAGIC '3' | 121 | #define SNAPSHOT_IOC_MAGIC '3' |
| 109 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) | 122 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) |
| @@ -117,7 +130,14 @@ extern void snapshot_free_unused_memory(struct snapshot_handle *handle); | |||
| 117 | #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) | 130 | #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) |
| 118 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | 131 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) |
| 119 | #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) | 132 | #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) |
| 120 | #define SNAPSHOT_IOC_MAXNR 11 | 133 | #define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) |
| 134 | #define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \ | ||
| 135 | struct resume_swap_area) | ||
| 136 | #define SNAPSHOT_IOC_MAXNR 13 | ||
| 137 | |||
| 138 | #define PMOPS_PREPARE 1 | ||
| 139 | #define PMOPS_ENTER 2 | ||
| 140 | #define PMOPS_FINISH 3 | ||
| 121 | 141 | ||
| 122 | /** | 142 | /** |
| 123 | * The bitmap is used for tracing allocated swap pages | 143 | * The bitmap is used for tracing allocated swap pages |
| @@ -141,7 +161,7 @@ struct bitmap_page { | |||
| 141 | 161 | ||
| 142 | extern void free_bitmap(struct bitmap_page *bitmap); | 162 | extern void free_bitmap(struct bitmap_page *bitmap); |
| 143 | extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); | 163 | extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits); |
| 144 | extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); | 164 | extern sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap); |
| 145 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); | 165 | extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); |
| 146 | 166 | ||
| 147 | extern int swsusp_check(void); | 167 | extern int swsusp_check(void); |
| @@ -153,3 +173,7 @@ extern int swsusp_read(void); | |||
| 153 | extern int swsusp_write(void); | 173 | extern int swsusp_write(void); |
| 154 | extern void swsusp_close(void); | 174 | extern void swsusp_close(void); |
| 155 | extern int suspend_enter(suspend_state_t state); | 175 | extern int suspend_enter(suspend_state_t state); |
| 176 | |||
| 177 | struct timeval; | ||
| 178 | extern void swsusp_show_speed(struct timeval *, struct timeval *, | ||
| 179 | unsigned int, char *); | ||
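The new SNAPSHOT_SET_SWAP_AREA ioctl and the packed resume_swap_area structure let a user-space resume tool tell the kernel which swap device and offset to use for the image. A hedged user-space sketch of how such a tool might issue the call; the /dev/snapshot node, the open mode and the zero offset/device values are illustrative only, and the structure and ioctl number are copied here because a user program cannot include kernel/power/power.h directly:

```c
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* mirrors the kernel definitions shown above */
struct resume_swap_area {
	uint64_t offset;	/* loff_t in the kernel header */
	uint32_t dev;		/* u_int32_t in the kernel header */
} __attribute__((packed));

#define SNAPSHOT_IOC_MAGIC	'3'
#define SNAPSHOT_SET_SWAP_AREA	_IOW(SNAPSHOT_IOC_MAGIC, 13, \
					struct resume_swap_area)

int main(void)
{
	struct resume_swap_area area = { .offset = 0, .dev = 0 };
	int fd = open("/dev/snapshot", O_RDWR);

	if (fd < 0 || ioctl(fd, SNAPSHOT_SET_SWAP_AREA, &area) < 0) {
		perror("SNAPSHOT_SET_SWAP_AREA");
		return 1;
	}
	close(fd);
	return 0;
}
```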
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index f1f900ac3164..678ec736076b 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
| @@ -16,12 +16,12 @@ | |||
| 16 | * callback we use. | 16 | * callback we use. |
| 17 | */ | 17 | */ |
| 18 | 18 | ||
| 19 | static void do_poweroff(void *dummy) | 19 | static void do_poweroff(struct work_struct *dummy) |
| 20 | { | 20 | { |
| 21 | kernel_power_off(); | 21 | kernel_power_off(); |
| 22 | } | 22 | } |
| 23 | 23 | ||
| 24 | static DECLARE_WORK(poweroff_work, do_poweroff, NULL); | 24 | static DECLARE_WORK(poweroff_work, do_poweroff); |
| 25 | 25 | ||
| 26 | static void handle_poweroff(int key, struct tty_struct *tty) | 26 | static void handle_poweroff(int key, struct tty_struct *tty) |
| 27 | { | 27 | { |
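do_poweroff() is updated for the workqueue API rework: work handlers now receive a pointer to their struct work_struct instead of an opaque void *, and DECLARE_WORK() takes only the name and the function. A minimal sketch of the new convention (hypothetical names), including the container_of() idiom used when the work item is embedded in a larger object:

```c
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct my_device {
	int pending_events;
	struct work_struct event_work;
};

/* new-style handler: gets the work_struct, not a void *data argument */
static void my_event_work_fn(struct work_struct *work)
{
	struct my_device *dev = container_of(work, struct my_device,
					     event_work);

	dev->pending_events = 0;
}

static void my_queue_event(struct my_device *dev)
{
	/* event_work is assumed initialized elsewhere with
	 * INIT_WORK(&dev->event_work, my_event_work_fn); standalone
	 * items use DECLARE_WORK(name, func) as in the hunk above */
	schedule_work(&dev->event_work);
}
```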
diff --git a/kernel/power/process.c b/kernel/power/process.c index 72e72d2c61e6..6d566bf7085c 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -13,20 +13,22 @@ | |||
| 13 | #include <linux/suspend.h> | 13 | #include <linux/suspend.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/syscalls.h> | 15 | #include <linux/syscalls.h> |
| 16 | #include <linux/freezer.h> | ||
| 16 | 17 | ||
| 17 | /* | 18 | /* |
| 18 | * Timeout for stopping processes | 19 | * Timeout for stopping processes |
| 19 | */ | 20 | */ |
| 20 | #define TIMEOUT (20 * HZ) | 21 | #define TIMEOUT (20 * HZ) |
| 21 | 22 | ||
| 23 | #define FREEZER_KERNEL_THREADS 0 | ||
| 24 | #define FREEZER_USER_SPACE 1 | ||
| 22 | 25 | ||
| 23 | static inline int freezeable(struct task_struct * p) | 26 | static inline int freezeable(struct task_struct * p) |
| 24 | { | 27 | { |
| 25 | if ((p == current) || | 28 | if ((p == current) || |
| 26 | (p->flags & PF_NOFREEZE) || | 29 | (p->flags & PF_NOFREEZE) || |
| 27 | (p->exit_state == EXIT_ZOMBIE) || | 30 | (p->exit_state == EXIT_ZOMBIE) || |
| 28 | (p->exit_state == EXIT_DEAD) || | 31 | (p->exit_state == EXIT_DEAD)) |
| 29 | (p->state == TASK_STOPPED)) | ||
| 30 | return 0; | 32 | return 0; |
| 31 | return 1; | 33 | return 1; |
| 32 | } | 34 | } |
| @@ -39,7 +41,6 @@ void refrigerator(void) | |||
| 39 | long save; | 41 | long save; |
| 40 | save = current->state; | 42 | save = current->state; |
| 41 | pr_debug("%s entered refrigerator\n", current->comm); | 43 | pr_debug("%s entered refrigerator\n", current->comm); |
| 42 | printk("="); | ||
| 43 | 44 | ||
| 44 | frozen_process(current); | 45 | frozen_process(current); |
| 45 | spin_lock_irq(&current->sighand->siglock); | 46 | spin_lock_irq(&current->sighand->siglock); |
| @@ -59,10 +60,16 @@ static inline void freeze_process(struct task_struct *p) | |||
| 59 | unsigned long flags; | 60 | unsigned long flags; |
| 60 | 61 | ||
| 61 | if (!freezing(p)) { | 62 | if (!freezing(p)) { |
| 62 | freeze(p); | 63 | rmb(); |
| 63 | spin_lock_irqsave(&p->sighand->siglock, flags); | 64 | if (!frozen(p)) { |
| 64 | signal_wake_up(p, 0); | 65 | if (p->state == TASK_STOPPED) |
| 65 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 66 | force_sig_specific(SIGSTOP, p); |
| 67 | |||
| 68 | freeze(p); | ||
| 69 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
| 70 | signal_wake_up(p, p->state == TASK_STOPPED); | ||
| 71 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
| 72 | } | ||
| 66 | } | 73 | } |
| 67 | } | 74 | } |
| 68 | 75 | ||
| @@ -79,96 +86,134 @@ static void cancel_freezing(struct task_struct *p) | |||
| 79 | } | 86 | } |
| 80 | } | 87 | } |
| 81 | 88 | ||
| 82 | /* 0 = success, else # of processes that we failed to stop */ | 89 | static inline int is_user_space(struct task_struct *p) |
| 83 | int freeze_processes(void) | 90 | { |
| 91 | return p->mm && !(p->flags & PF_BORROWED_MM); | ||
| 92 | } | ||
| 93 | |||
| 94 | static unsigned int try_to_freeze_tasks(int freeze_user_space) | ||
| 84 | { | 95 | { |
| 85 | int todo, nr_user, user_frozen; | ||
| 86 | unsigned long start_time; | ||
| 87 | struct task_struct *g, *p; | 96 | struct task_struct *g, *p; |
| 97 | unsigned long end_time; | ||
| 98 | unsigned int todo; | ||
| 88 | 99 | ||
| 89 | printk( "Stopping tasks: " ); | 100 | end_time = jiffies + TIMEOUT; |
| 90 | start_time = jiffies; | ||
| 91 | user_frozen = 0; | ||
| 92 | do { | 101 | do { |
| 93 | nr_user = todo = 0; | 102 | todo = 0; |
| 94 | read_lock(&tasklist_lock); | 103 | read_lock(&tasklist_lock); |
| 95 | do_each_thread(g, p) { | 104 | do_each_thread(g, p) { |
| 96 | if (!freezeable(p)) | 105 | if (!freezeable(p)) |
| 97 | continue; | 106 | continue; |
| 107 | |||
| 98 | if (frozen(p)) | 108 | if (frozen(p)) |
| 99 | continue; | 109 | continue; |
| 110 | |||
| 100 | if (p->state == TASK_TRACED && frozen(p->parent)) { | 111 | if (p->state == TASK_TRACED && frozen(p->parent)) { |
| 101 | cancel_freezing(p); | 112 | cancel_freezing(p); |
| 102 | continue; | 113 | continue; |
| 103 | } | 114 | } |
| 104 | if (p->mm && !(p->flags & PF_BORROWED_MM)) { | 115 | if (is_user_space(p)) { |
| 105 | /* The task is a user-space one. | 116 | if (!freeze_user_space) |
| 106 | * Freeze it unless there's a vfork completion | 117 | continue; |
| 107 | * pending | 118 | |
| 119 | /* Freeze the task unless there is a vfork | ||
| 120 | * completion pending | ||
| 108 | */ | 121 | */ |
| 109 | if (!p->vfork_done) | 122 | if (!p->vfork_done) |
| 110 | freeze_process(p); | 123 | freeze_process(p); |
| 111 | nr_user++; | ||
| 112 | } else { | 124 | } else { |
| 113 | /* Freeze only if the user space is frozen */ | 125 | if (freeze_user_space) |
| 114 | if (user_frozen) | 126 | continue; |
| 115 | freeze_process(p); | 127 | |
| 116 | todo++; | 128 | freeze_process(p); |
| 117 | } | 129 | } |
| 130 | todo++; | ||
| 118 | } while_each_thread(g, p); | 131 | } while_each_thread(g, p); |
| 119 | read_unlock(&tasklist_lock); | 132 | read_unlock(&tasklist_lock); |
| 120 | todo += nr_user; | ||
| 121 | if (!user_frozen && !nr_user) { | ||
| 122 | sys_sync(); | ||
| 123 | start_time = jiffies; | ||
| 124 | } | ||
| 125 | user_frozen = !nr_user; | ||
| 126 | yield(); /* Yield is okay here */ | 133 | yield(); /* Yield is okay here */ |
| 127 | if (todo && time_after(jiffies, start_time + TIMEOUT)) | 134 | if (todo && time_after(jiffies, end_time)) |
| 128 | break; | 135 | break; |
| 129 | } while(todo); | 136 | } while (todo); |
| 130 | 137 | ||
| 131 | /* This does not unfreeze processes that are already frozen | ||
| 132 | * (we have slightly ugly calling convention in that respect, | ||
| 133 | * and caller must call thaw_processes() if something fails), | ||
| 134 | * but it cleans up leftover PF_FREEZE requests. | ||
| 135 | */ | ||
| 136 | if (todo) { | 138 | if (todo) { |
| 137 | printk( "\n" ); | 139 | /* This does not unfreeze processes that are already frozen |
| 138 | printk(KERN_ERR " stopping tasks timed out " | 140 | * (we have slightly ugly calling convention in that respect, |
| 139 | "after %d seconds (%d tasks remaining):\n", | 141 | * and caller must call thaw_processes() if something fails), |
| 140 | TIMEOUT / HZ, todo); | 142 | * but it cleans up leftover PF_FREEZE requests. |
| 143 | */ | ||
| 144 | printk("\n"); | ||
| 145 | printk(KERN_ERR "Stopping %s timed out after %d seconds " | ||
| 146 | "(%d tasks refusing to freeze):\n", | ||
| 147 | freeze_user_space ? "user space processes" : | ||
| 148 | "kernel threads", | ||
| 149 | TIMEOUT / HZ, todo); | ||
| 141 | read_lock(&tasklist_lock); | 150 | read_lock(&tasklist_lock); |
| 142 | do_each_thread(g, p) { | 151 | do_each_thread(g, p) { |
| 152 | if (is_user_space(p) == !freeze_user_space) | ||
| 153 | continue; | ||
| 154 | |||
| 143 | if (freezeable(p) && !frozen(p)) | 155 | if (freezeable(p) && !frozen(p)) |
| 144 | printk(KERN_ERR " %s\n", p->comm); | 156 | printk(KERN_ERR " %s\n", p->comm); |
| 157 | |||
| 145 | cancel_freezing(p); | 158 | cancel_freezing(p); |
| 146 | } while_each_thread(g, p); | 159 | } while_each_thread(g, p); |
| 147 | read_unlock(&tasklist_lock); | 160 | read_unlock(&tasklist_lock); |
| 148 | return todo; | ||
| 149 | } | 161 | } |
| 150 | 162 | ||
| 151 | printk( "|\n" ); | 163 | return todo; |
| 164 | } | ||
| 165 | |||
| 166 | /** | ||
| 167 | * freeze_processes - tell processes to enter the refrigerator | ||
| 168 | * | ||
| 169 | * Returns 0 on success, or the number of processes that didn't freeze, | ||
| 170 | * although they were told to. | ||
| 171 | */ | ||
| 172 | int freeze_processes(void) | ||
| 173 | { | ||
| 174 | unsigned int nr_unfrozen; | ||
| 175 | |||
| 176 | printk("Stopping tasks ... "); | ||
| 177 | nr_unfrozen = try_to_freeze_tasks(FREEZER_USER_SPACE); | ||
| 178 | if (nr_unfrozen) | ||
| 179 | return nr_unfrozen; | ||
| 180 | |||
| 181 | sys_sync(); | ||
| 182 | nr_unfrozen = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); | ||
| 183 | if (nr_unfrozen) | ||
| 184 | return nr_unfrozen; | ||
| 185 | |||
| 186 | printk("done.\n"); | ||
| 152 | BUG_ON(in_atomic()); | 187 | BUG_ON(in_atomic()); |
| 153 | return 0; | 188 | return 0; |
| 154 | } | 189 | } |
| 155 | 190 | ||
| 156 | void thaw_processes(void) | 191 | static void thaw_tasks(int thaw_user_space) |
| 157 | { | 192 | { |
| 158 | struct task_struct *g, *p; | 193 | struct task_struct *g, *p; |
| 159 | 194 | ||
| 160 | printk( "Restarting tasks..." ); | ||
| 161 | read_lock(&tasklist_lock); | 195 | read_lock(&tasklist_lock); |
| 162 | do_each_thread(g, p) { | 196 | do_each_thread(g, p) { |
| 163 | if (!freezeable(p)) | 197 | if (!freezeable(p)) |
| 164 | continue; | 198 | continue; |
| 199 | |||
| 200 | if (is_user_space(p) == !thaw_user_space) | ||
| 201 | continue; | ||
| 202 | |||
| 165 | if (!thaw_process(p)) | 203 | if (!thaw_process(p)) |
| 166 | printk(KERN_INFO " Strange, %s not stopped\n", p->comm ); | 204 | printk(KERN_WARNING " Strange, %s not stopped\n", |
| 205 | p->comm ); | ||
| 167 | } while_each_thread(g, p); | 206 | } while_each_thread(g, p); |
| 168 | |||
| 169 | read_unlock(&tasklist_lock); | 207 | read_unlock(&tasklist_lock); |
| 208 | } | ||
| 209 | |||
| 210 | void thaw_processes(void) | ||
| 211 | { | ||
| 212 | printk("Restarting tasks ... "); | ||
| 213 | thaw_tasks(FREEZER_KERNEL_THREADS); | ||
| 214 | thaw_tasks(FREEZER_USER_SPACE); | ||
| 170 | schedule(); | 215 | schedule(); |
| 171 | printk( " done\n" ); | 216 | printk("done.\n"); |
| 172 | } | 217 | } |
| 173 | 218 | ||
| 174 | EXPORT_SYMBOL(refrigerator); | 219 | EXPORT_SYMBOL(refrigerator); |
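freeze_processes() now stops user-space tasks first, syncs, and only then freezes kernel threads, with the per-task helpers coming from the new <linux/freezer.h>. Kernel threads still have to opt in by periodically calling try_to_freeze() (or be marked PF_NOFREEZE); a minimal sketch of a freezable kernel-thread loop (hypothetical thread, not from this patch):

```c
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/sched.h>

static int my_worker_thread(void *unused)
{
	while (!kthread_should_stop()) {
		/* enters the refrigerator if a freeze was requested and
		 * returns once tasks are thawed again */
		try_to_freeze();

		/* ... do one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
```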
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 99f9b7d177d6..c024606221c4 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1,15 +1,15 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * linux/kernel/power/snapshot.c | 2 | * linux/kernel/power/snapshot.c |
| 3 | * | 3 | * |
| 4 | * This file provide system snapshot/restore functionality. | 4 | * This file provides system snapshot/restore functionality for swsusp. |
| 5 | * | 5 | * |
| 6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> | 6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> |
| 7 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | ||
| 7 | * | 8 | * |
| 8 | * This file is released under the GPLv2, and is based on swsusp.c. | 9 | * This file is released under the GPLv2. |
| 9 | * | 10 | * |
| 10 | */ | 11 | */ |
| 11 | 12 | ||
| 12 | |||
| 13 | #include <linux/version.h> | 13 | #include <linux/version.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
| @@ -34,137 +34,24 @@ | |||
| 34 | 34 | ||
| 35 | #include "power.h" | 35 | #include "power.h" |
| 36 | 36 | ||
| 37 | /* List of PBEs used for creating and restoring the suspend image */ | 37 | /* List of PBEs needed for restoring the pages that were allocated before |
| 38 | * the suspend and included in the suspend image, but have also been | ||
| 39 | * allocated by the "resume" kernel, so their contents cannot be written | ||
| 40 | * directly to their "original" page frames. | ||
| 41 | */ | ||
| 38 | struct pbe *restore_pblist; | 42 | struct pbe *restore_pblist; |
| 39 | 43 | ||
| 40 | static unsigned int nr_copy_pages; | 44 | /* Pointer to an auxiliary buffer (1 page) */ |
| 41 | static unsigned int nr_meta_pages; | ||
| 42 | static void *buffer; | 45 | static void *buffer; |
| 43 | 46 | ||
| 44 | #ifdef CONFIG_HIGHMEM | ||
| 45 | unsigned int count_highmem_pages(void) | ||
| 46 | { | ||
| 47 | struct zone *zone; | ||
| 48 | unsigned long zone_pfn; | ||
| 49 | unsigned int n = 0; | ||
| 50 | |||
| 51 | for_each_zone (zone) | ||
| 52 | if (is_highmem(zone)) { | ||
| 53 | mark_free_pages(zone); | ||
| 54 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; zone_pfn++) { | ||
| 55 | struct page *page; | ||
| 56 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
| 57 | if (!pfn_valid(pfn)) | ||
| 58 | continue; | ||
| 59 | page = pfn_to_page(pfn); | ||
| 60 | if (PageReserved(page)) | ||
| 61 | continue; | ||
| 62 | if (PageNosaveFree(page)) | ||
| 63 | continue; | ||
| 64 | n++; | ||
| 65 | } | ||
| 66 | } | ||
| 67 | return n; | ||
| 68 | } | ||
| 69 | |||
| 70 | struct highmem_page { | ||
| 71 | char *data; | ||
| 72 | struct page *page; | ||
| 73 | struct highmem_page *next; | ||
| 74 | }; | ||
| 75 | |||
| 76 | static struct highmem_page *highmem_copy; | ||
| 77 | |||
| 78 | static int save_highmem_zone(struct zone *zone) | ||
| 79 | { | ||
| 80 | unsigned long zone_pfn; | ||
| 81 | mark_free_pages(zone); | ||
| 82 | for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { | ||
| 83 | struct page *page; | ||
| 84 | struct highmem_page *save; | ||
| 85 | void *kaddr; | ||
| 86 | unsigned long pfn = zone_pfn + zone->zone_start_pfn; | ||
| 87 | |||
| 88 | if (!(pfn%10000)) | ||
| 89 | printk("."); | ||
| 90 | if (!pfn_valid(pfn)) | ||
| 91 | continue; | ||
| 92 | page = pfn_to_page(pfn); | ||
| 93 | /* | ||
| 94 | * This condition results from rvmalloc() sans vmalloc_32() | ||
| 95 | * and architectural memory reservations. This should be | ||
| 96 | * corrected eventually when the cases giving rise to this | ||
| 97 | * are better understood. | ||
| 98 | */ | ||
| 99 | if (PageReserved(page)) | ||
| 100 | continue; | ||
| 101 | BUG_ON(PageNosave(page)); | ||
| 102 | if (PageNosaveFree(page)) | ||
| 103 | continue; | ||
| 104 | save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC); | ||
| 105 | if (!save) | ||
| 106 | return -ENOMEM; | ||
| 107 | save->next = highmem_copy; | ||
| 108 | save->page = page; | ||
| 109 | save->data = (void *) get_zeroed_page(GFP_ATOMIC); | ||
| 110 | if (!save->data) { | ||
| 111 | kfree(save); | ||
| 112 | return -ENOMEM; | ||
| 113 | } | ||
| 114 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 115 | memcpy(save->data, kaddr, PAGE_SIZE); | ||
| 116 | kunmap_atomic(kaddr, KM_USER0); | ||
| 117 | highmem_copy = save; | ||
| 118 | } | ||
| 119 | return 0; | ||
| 120 | } | ||
| 121 | |||
| 122 | int save_highmem(void) | ||
| 123 | { | ||
| 124 | struct zone *zone; | ||
| 125 | int res = 0; | ||
| 126 | |||
| 127 | pr_debug("swsusp: Saving Highmem"); | ||
| 128 | drain_local_pages(); | ||
| 129 | for_each_zone (zone) { | ||
| 130 | if (is_highmem(zone)) | ||
| 131 | res = save_highmem_zone(zone); | ||
| 132 | if (res) | ||
| 133 | return res; | ||
| 134 | } | ||
| 135 | printk("\n"); | ||
| 136 | return 0; | ||
| 137 | } | ||
| 138 | |||
| 139 | int restore_highmem(void) | ||
| 140 | { | ||
| 141 | printk("swsusp: Restoring Highmem\n"); | ||
| 142 | while (highmem_copy) { | ||
| 143 | struct highmem_page *save = highmem_copy; | ||
| 144 | void *kaddr; | ||
| 145 | highmem_copy = save->next; | ||
| 146 | |||
| 147 | kaddr = kmap_atomic(save->page, KM_USER0); | ||
| 148 | memcpy(kaddr, save->data, PAGE_SIZE); | ||
| 149 | kunmap_atomic(kaddr, KM_USER0); | ||
| 150 | free_page((long) save->data); | ||
| 151 | kfree(save); | ||
| 152 | } | ||
| 153 | return 0; | ||
| 154 | } | ||
| 155 | #else | ||
| 156 | static inline unsigned int count_highmem_pages(void) {return 0;} | ||
| 157 | static inline int save_highmem(void) {return 0;} | ||
| 158 | static inline int restore_highmem(void) {return 0;} | ||
| 159 | #endif | ||
| 160 | |||
| 161 | /** | 47 | /** |
| 162 | * @safe_needed - on resume, for storing the PBE list and the image, | 48 | * @safe_needed - on resume, for storing the PBE list and the image, |
| 163 | * we can only use memory pages that do not conflict with the pages | 49 | * we can only use memory pages that do not conflict with the pages |
| 164 | * used before suspend. | 50 | * used before suspend. The unsafe pages have PageNosaveFree set |
| 51 | * and we count them using unsafe_pages. | ||
| 165 | * | 52 | * |
| 166 | * The unsafe pages are marked with the PG_nosave_free flag | 53 | * Each allocated image page is marked as PageNosave and PageNosaveFree |
| 167 | * and we count them using unsafe_pages | 54 | * so that swsusp_free() can release it. |
| 168 | */ | 55 | */ |
| 169 | 56 | ||
| 170 | #define PG_ANY 0 | 57 | #define PG_ANY 0 |
| @@ -174,7 +61,7 @@ static inline int restore_highmem(void) {return 0;} | |||
| 174 | 61 | ||
| 175 | static unsigned int allocated_unsafe_pages; | 62 | static unsigned int allocated_unsafe_pages; |
| 176 | 63 | ||
| 177 | static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | 64 | static void *get_image_page(gfp_t gfp_mask, int safe_needed) |
| 178 | { | 65 | { |
| 179 | void *res; | 66 | void *res; |
| 180 | 67 | ||
| @@ -195,20 +82,39 @@ static void *alloc_image_page(gfp_t gfp_mask, int safe_needed) | |||
| 195 | 82 | ||
| 196 | unsigned long get_safe_page(gfp_t gfp_mask) | 83 | unsigned long get_safe_page(gfp_t gfp_mask) |
| 197 | { | 84 | { |
| 198 | return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE); | 85 | return (unsigned long)get_image_page(gfp_mask, PG_SAFE); |
| 86 | } | ||
| 87 | |||
| 88 | static struct page *alloc_image_page(gfp_t gfp_mask) | ||
| 89 | { | ||
| 90 | struct page *page; | ||
| 91 | |||
| 92 | page = alloc_page(gfp_mask); | ||
| 93 | if (page) { | ||
| 94 | SetPageNosave(page); | ||
| 95 | SetPageNosaveFree(page); | ||
| 96 | } | ||
| 97 | return page; | ||
| 199 | } | 98 | } |
| 200 | 99 | ||
| 201 | /** | 100 | /** |
| 202 | * free_image_page - free page represented by @addr, allocated with | 101 | * free_image_page - free page represented by @addr, allocated with |
| 203 | * alloc_image_page (page flags set by it must be cleared) | 102 | * get_image_page (page flags set by it must be cleared) |
| 204 | */ | 103 | */ |
| 205 | 104 | ||
| 206 | static inline void free_image_page(void *addr, int clear_nosave_free) | 105 | static inline void free_image_page(void *addr, int clear_nosave_free) |
| 207 | { | 106 | { |
| 208 | ClearPageNosave(virt_to_page(addr)); | 107 | struct page *page; |
| 108 | |||
| 109 | BUG_ON(!virt_addr_valid(addr)); | ||
| 110 | |||
| 111 | page = virt_to_page(addr); | ||
| 112 | |||
| 113 | ClearPageNosave(page); | ||
| 209 | if (clear_nosave_free) | 114 | if (clear_nosave_free) |
| 210 | ClearPageNosaveFree(virt_to_page(addr)); | 115 | ClearPageNosaveFree(page); |
| 211 | free_page((unsigned long)addr); | 116 | |
| 117 | __free_page(page); | ||
| 212 | } | 118 | } |
| 213 | 119 | ||
| 214 | /* struct linked_page is used to build chains of pages */ | 120 | /* struct linked_page is used to build chains of pages */ |
| @@ -269,7 +175,7 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) | |||
| 269 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { | 175 | if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { |
| 270 | struct linked_page *lp; | 176 | struct linked_page *lp; |
| 271 | 177 | ||
| 272 | lp = alloc_image_page(ca->gfp_mask, ca->safe_needed); | 178 | lp = get_image_page(ca->gfp_mask, ca->safe_needed); |
| 273 | if (!lp) | 179 | if (!lp) |
| 274 | return NULL; | 180 | return NULL; |
| 275 | 181 | ||
| @@ -446,8 +352,8 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
| 446 | 352 | ||
| 447 | /* Compute the number of zones */ | 353 | /* Compute the number of zones */ |
| 448 | nr = 0; | 354 | nr = 0; |
| 449 | for_each_zone (zone) | 355 | for_each_zone(zone) |
| 450 | if (populated_zone(zone) && !is_highmem(zone)) | 356 | if (populated_zone(zone)) |
| 451 | nr++; | 357 | nr++; |
| 452 | 358 | ||
| 453 | /* Allocate the list of zones bitmap objects */ | 359 | /* Allocate the list of zones bitmap objects */ |
| @@ -459,10 +365,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
| 459 | } | 365 | } |
| 460 | 366 | ||
| 461 | /* Initialize the zone bitmap objects */ | 367 | /* Initialize the zone bitmap objects */ |
| 462 | for_each_zone (zone) { | 368 | for_each_zone(zone) { |
| 463 | unsigned long pfn; | 369 | unsigned long pfn; |
| 464 | 370 | ||
| 465 | if (!populated_zone(zone) || is_highmem(zone)) | 371 | if (!populated_zone(zone)) |
| 466 | continue; | 372 | continue; |
| 467 | 373 | ||
| 468 | zone_bm->start_pfn = zone->zone_start_pfn; | 374 | zone_bm->start_pfn = zone->zone_start_pfn; |
| @@ -481,7 +387,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
| 481 | while (bb) { | 387 | while (bb) { |
| 482 | unsigned long *ptr; | 388 | unsigned long *ptr; |
| 483 | 389 | ||
| 484 | ptr = alloc_image_page(gfp_mask, safe_needed); | 390 | ptr = get_image_page(gfp_mask, safe_needed); |
| 485 | bb->data = ptr; | 391 | bb->data = ptr; |
| 486 | if (!ptr) | 392 | if (!ptr) |
| 487 | goto Free; | 393 | goto Free; |
| @@ -505,7 +411,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) | |||
| 505 | memory_bm_position_reset(bm); | 411 | memory_bm_position_reset(bm); |
| 506 | return 0; | 412 | return 0; |
| 507 | 413 | ||
| 508 | Free: | 414 | Free: |
| 509 | bm->p_list = ca.chain; | 415 | bm->p_list = ca.chain; |
| 510 | memory_bm_free(bm, PG_UNSAFE_CLEAR); | 416 | memory_bm_free(bm, PG_UNSAFE_CLEAR); |
| 511 | return -ENOMEM; | 417 | return -ENOMEM; |
| @@ -651,7 +557,7 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) | |||
| 651 | memory_bm_position_reset(bm); | 557 | memory_bm_position_reset(bm); |
| 652 | return BM_END_OF_MAP; | 558 | return BM_END_OF_MAP; |
| 653 | 559 | ||
| 654 | Return_pfn: | 560 | Return_pfn: |
| 655 | bm->cur.chunk = chunk; | 561 | bm->cur.chunk = chunk; |
| 656 | bm->cur.bit = bit; | 562 | bm->cur.bit = bit; |
| 657 | return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; | 563 | return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit; |
| @@ -669,10 +575,82 @@ unsigned int snapshot_additional_pages(struct zone *zone) | |||
| 669 | 575 | ||
| 670 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); | 576 | res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); |
| 671 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); | 577 | res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); |
| 672 | return res; | 578 | return 2 * res; |
| 579 | } | ||
| 580 | |||
| 581 | #ifdef CONFIG_HIGHMEM | ||
| 582 | /** | ||
| 583 | * count_free_highmem_pages - compute the total number of free highmem | ||
| 584 | * pages, system-wide. | ||
| 585 | */ | ||
| 586 | |||
| 587 | static unsigned int count_free_highmem_pages(void) | ||
| 588 | { | ||
| 589 | struct zone *zone; | ||
| 590 | unsigned int cnt = 0; | ||
| 591 | |||
| 592 | for_each_zone(zone) | ||
| 593 | if (populated_zone(zone) && is_highmem(zone)) | ||
| 594 | cnt += zone->free_pages; | ||
| 595 | |||
| 596 | return cnt; | ||
| 597 | } | ||
| 598 | |||
| 599 | /** | ||
| 600 | * saveable_highmem_page - Determine whether a highmem page should be | ||
| 601 | * included in the suspend image. | ||
| 602 | * | ||
| 603 | * We should save the page if it isn't Nosave or NosaveFree, or Reserved, | ||
| 604 | * and it isn't a part of a free chunk of pages. | ||
| 605 | */ | ||
| 606 | |||
| 607 | static struct page *saveable_highmem_page(unsigned long pfn) | ||
| 608 | { | ||
| 609 | struct page *page; | ||
| 610 | |||
| 611 | if (!pfn_valid(pfn)) | ||
| 612 | return NULL; | ||
| 613 | |||
| 614 | page = pfn_to_page(pfn); | ||
| 615 | |||
| 616 | BUG_ON(!PageHighMem(page)); | ||
| 617 | |||
| 618 | if (PageNosave(page) || PageReserved(page) || PageNosaveFree(page)) | ||
| 619 | return NULL; | ||
| 620 | |||
| 621 | return page; | ||
| 673 | } | 622 | } |
| 674 | 623 | ||
| 675 | /** | 624 | /** |
| 625 | * count_highmem_pages - compute the total number of saveable highmem | ||
| 626 | * pages. | ||
| 627 | */ | ||
| 628 | |||
| 629 | unsigned int count_highmem_pages(void) | ||
| 630 | { | ||
| 631 | struct zone *zone; | ||
| 632 | unsigned int n = 0; | ||
| 633 | |||
| 634 | for_each_zone(zone) { | ||
| 635 | unsigned long pfn, max_zone_pfn; | ||
| 636 | |||
| 637 | if (!is_highmem(zone)) | ||
| 638 | continue; | ||
| 639 | |||
| 640 | mark_free_pages(zone); | ||
| 641 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
| 642 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | ||
| 643 | if (saveable_highmem_page(pfn)) | ||
| 644 | n++; | ||
| 645 | } | ||
| 646 | return n; | ||
| 647 | } | ||
| 648 | #else | ||
| 649 | static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } | ||
| 650 | static inline unsigned int count_highmem_pages(void) { return 0; } | ||
| 651 | #endif /* CONFIG_HIGHMEM */ | ||
| 652 | |||
| 653 | /** | ||
| 676 | * pfn_is_nosave - check if given pfn is in the 'nosave' section | 654 | * pfn_is_nosave - check if given pfn is in the 'nosave' section |
| 677 | */ | 655 | */ |
| 678 | 656 | ||
| @@ -684,12 +662,12 @@ static inline int pfn_is_nosave(unsigned long pfn) | |||
| 684 | } | 662 | } |
| 685 | 663 | ||
| 686 | /** | 664 | /** |
| 687 | * saveable - Determine whether a page should be cloned or not. | 665 | * saveable - Determine whether a non-highmem page should be included in |
| 688 | * @pfn: The page | 666 | * the suspend image. |
| 689 | * | 667 | * |
| 690 | * We save a page if it isn't Nosave, and is not in the range of pages | 668 | * We should save the page if it isn't Nosave, and is not in the range |
| 691 | * statically defined as 'unsaveable', and it | 669 | * of pages statically defined as 'unsaveable', and it isn't a part of |
| 692 | * isn't a part of a free chunk of pages. | 670 | * a free chunk of pages. |
| 693 | */ | 671 | */ |
| 694 | 672 | ||
| 695 | static struct page *saveable_page(unsigned long pfn) | 673 | static struct page *saveable_page(unsigned long pfn) |
| @@ -701,76 +679,130 @@ static struct page *saveable_page(unsigned long pfn) | |||
| 701 | 679 | ||
| 702 | page = pfn_to_page(pfn); | 680 | page = pfn_to_page(pfn); |
| 703 | 681 | ||
| 704 | if (PageNosave(page)) | 682 | BUG_ON(PageHighMem(page)); |
| 683 | |||
| 684 | if (PageNosave(page) || PageNosaveFree(page)) | ||
| 705 | return NULL; | 685 | return NULL; |
| 686 | |||
| 706 | if (PageReserved(page) && pfn_is_nosave(pfn)) | 687 | if (PageReserved(page) && pfn_is_nosave(pfn)) |
| 707 | return NULL; | 688 | return NULL; |
| 708 | if (PageNosaveFree(page)) | ||
| 709 | return NULL; | ||
| 710 | 689 | ||
| 711 | return page; | 690 | return page; |
| 712 | } | 691 | } |
| 713 | 692 | ||
| 693 | /** | ||
| 694 | * count_data_pages - compute the total number of saveable non-highmem | ||
| 695 | * pages. | ||
| 696 | */ | ||
| 697 | |||
| 714 | unsigned int count_data_pages(void) | 698 | unsigned int count_data_pages(void) |
| 715 | { | 699 | { |
| 716 | struct zone *zone; | 700 | struct zone *zone; |
| 717 | unsigned long pfn, max_zone_pfn; | 701 | unsigned long pfn, max_zone_pfn; |
| 718 | unsigned int n = 0; | 702 | unsigned int n = 0; |
| 719 | 703 | ||
| 720 | for_each_zone (zone) { | 704 | for_each_zone(zone) { |
| 721 | if (is_highmem(zone)) | 705 | if (is_highmem(zone)) |
| 722 | continue; | 706 | continue; |
| 707 | |||
| 723 | mark_free_pages(zone); | 708 | mark_free_pages(zone); |
| 724 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 709 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
| 725 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 710 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 726 | n += !!saveable_page(pfn); | 711 | if(saveable_page(pfn)) |
| 712 | n++; | ||
| 727 | } | 713 | } |
| 728 | return n; | 714 | return n; |
| 729 | } | 715 | } |
| 730 | 716 | ||
| 731 | static inline void copy_data_page(long *dst, long *src) | 717 | /* This is needed, because copy_page and memcpy are not usable for copying |
| 718 | * task structs. | ||
| 719 | */ | ||
| 720 | static inline void do_copy_page(long *dst, long *src) | ||
| 732 | { | 721 | { |
| 733 | int n; | 722 | int n; |
| 734 | 723 | ||
| 735 | /* copy_page and memcpy are not usable for copying task structs. */ | ||
| 736 | for (n = PAGE_SIZE / sizeof(long); n; n--) | 724 | for (n = PAGE_SIZE / sizeof(long); n; n--) |
| 737 | *dst++ = *src++; | 725 | *dst++ = *src++; |
| 738 | } | 726 | } |
| 739 | 727 | ||
| 728 | #ifdef CONFIG_HIGHMEM | ||
| 729 | static inline struct page * | ||
| 730 | page_is_saveable(struct zone *zone, unsigned long pfn) | ||
| 731 | { | ||
| 732 | return is_highmem(zone) ? | ||
| 733 | saveable_highmem_page(pfn) : saveable_page(pfn); | ||
| 734 | } | ||
| 735 | |||
| 736 | static inline void | ||
| 737 | copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | ||
| 738 | { | ||
| 739 | struct page *s_page, *d_page; | ||
| 740 | void *src, *dst; | ||
| 741 | |||
| 742 | s_page = pfn_to_page(src_pfn); | ||
| 743 | d_page = pfn_to_page(dst_pfn); | ||
| 744 | if (PageHighMem(s_page)) { | ||
| 745 | src = kmap_atomic(s_page, KM_USER0); | ||
| 746 | dst = kmap_atomic(d_page, KM_USER1); | ||
| 747 | do_copy_page(dst, src); | ||
| 748 | kunmap_atomic(src, KM_USER0); | ||
| 749 | kunmap_atomic(dst, KM_USER1); | ||
| 750 | } else { | ||
| 751 | src = page_address(s_page); | ||
| 752 | if (PageHighMem(d_page)) { | ||
| 753 | /* Page pointed to by src may contain some kernel | ||
| 754 | * data modified by kmap_atomic() | ||
| 755 | */ | ||
| 756 | do_copy_page(buffer, src); | ||
| 757 | dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); | ||
| 758 | memcpy(dst, buffer, PAGE_SIZE); | ||
| 759 | kunmap_atomic(dst, KM_USER0); | ||
| 760 | } else { | ||
| 761 | dst = page_address(d_page); | ||
| 762 | do_copy_page(dst, src); | ||
| 763 | } | ||
| 764 | } | ||
| 765 | } | ||
| 766 | #else | ||
| 767 | #define page_is_saveable(zone, pfn) saveable_page(pfn) | ||
| 768 | |||
| 769 | static inline void | ||
| 770 | copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) | ||
| 771 | { | ||
| 772 | do_copy_page(page_address(pfn_to_page(dst_pfn)), | ||
| 773 | page_address(pfn_to_page(src_pfn))); | ||
| 774 | } | ||
| 775 | #endif /* CONFIG_HIGHMEM */ | ||
| 776 | |||
| 740 | static void | 777 | static void |
| 741 | copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) | 778 | copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) |
| 742 | { | 779 | { |
| 743 | struct zone *zone; | 780 | struct zone *zone; |
| 744 | unsigned long pfn; | 781 | unsigned long pfn; |
| 745 | 782 | ||
| 746 | for_each_zone (zone) { | 783 | for_each_zone(zone) { |
| 747 | unsigned long max_zone_pfn; | 784 | unsigned long max_zone_pfn; |
| 748 | 785 | ||
| 749 | if (is_highmem(zone)) | ||
| 750 | continue; | ||
| 751 | |||
| 752 | mark_free_pages(zone); | 786 | mark_free_pages(zone); |
| 753 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 787 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
| 754 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 788 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 755 | if (saveable_page(pfn)) | 789 | if (page_is_saveable(zone, pfn)) |
| 756 | memory_bm_set_bit(orig_bm, pfn); | 790 | memory_bm_set_bit(orig_bm, pfn); |
| 757 | } | 791 | } |
| 758 | memory_bm_position_reset(orig_bm); | 792 | memory_bm_position_reset(orig_bm); |
| 759 | memory_bm_position_reset(copy_bm); | 793 | memory_bm_position_reset(copy_bm); |
| 760 | do { | 794 | do { |
| 761 | pfn = memory_bm_next_pfn(orig_bm); | 795 | pfn = memory_bm_next_pfn(orig_bm); |
| 762 | if (likely(pfn != BM_END_OF_MAP)) { | 796 | if (likely(pfn != BM_END_OF_MAP)) |
| 763 | struct page *page; | 797 | copy_data_page(memory_bm_next_pfn(copy_bm), pfn); |
| 764 | void *src; | ||
| 765 | |||
| 766 | page = pfn_to_page(pfn); | ||
| 767 | src = page_address(page); | ||
| 768 | page = pfn_to_page(memory_bm_next_pfn(copy_bm)); | ||
| 769 | copy_data_page(page_address(page), src); | ||
| 770 | } | ||
| 771 | } while (pfn != BM_END_OF_MAP); | 798 | } while (pfn != BM_END_OF_MAP); |
| 772 | } | 799 | } |
| 773 | 800 | ||
| 801 | /* Total number of image pages */ | ||
| 802 | static unsigned int nr_copy_pages; | ||
| 803 | /* Number of pages needed for saving the original pfns of the image pages */ | ||
| 804 | static unsigned int nr_meta_pages; | ||
| 805 | |||
| 774 | /** | 806 | /** |
| 775 | * swsusp_free - free pages allocated for the suspend. | 807 | * swsusp_free - free pages allocated for the suspend. |
| 776 | * | 808 | * |
| @@ -792,7 +824,7 @@ void swsusp_free(void) | |||
| 792 | if (PageNosave(page) && PageNosaveFree(page)) { | 824 | if (PageNosave(page) && PageNosaveFree(page)) { |
| 793 | ClearPageNosave(page); | 825 | ClearPageNosave(page); |
| 794 | ClearPageNosaveFree(page); | 826 | ClearPageNosaveFree(page); |
| 795 | free_page((long) page_address(page)); | 827 | __free_page(page); |
| 796 | } | 828 | } |
| 797 | } | 829 | } |
| 798 | } | 830 | } |
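The hunks above extend the snapshot code to handle highmem: pages without a permanent kernel mapping are copied through short-lived kmap_atomic() mappings, and swsusp_free() now releases image pages with __free_page(), which takes the struct page itself rather than a virtual address that a highmem page may not have. A minimal sketch of both points, using hypothetical helper names:

```c
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/string.h>

/* copy a (possibly highmem) page into a lowmem buffer via a temporary
 * atomic mapping, as the highmem branch of copy_data_page() does */
static void copy_page_to_buffer(struct page *page, void *buf)
{
	void *kaddr = kmap_atomic(page, KM_USER0);

	memcpy(buf, kaddr, PAGE_SIZE);
	kunmap_atomic(kaddr, KM_USER0);
}

/* release an image page; __free_page() is safe for highmem pages,
 * unlike free_page(), which needs a kernel virtual address */
static void release_image_page(struct page *page)
{
	ClearPageNosave(page);
	ClearPageNosaveFree(page);
	__free_page(page);
}
```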
| @@ -802,34 +834,108 @@ void swsusp_free(void) | |||
| 802 | buffer = NULL; | 834 | buffer = NULL; |
| 803 | } | 835 | } |
| 804 | 836 | ||
| 837 | #ifdef CONFIG_HIGHMEM | ||
| 838 | /** | ||
| 839 | * count_pages_for_highmem - compute the number of non-highmem pages | ||
| 840 | * that will be necessary for creating copies of highmem pages. | ||
| 841 | */ | ||
| 842 | |||
| 843 | static unsigned int count_pages_for_highmem(unsigned int nr_highmem) | ||
| 844 | { | ||
| 845 | unsigned int free_highmem = count_free_highmem_pages(); | ||
| 846 | |||
| 847 | if (free_highmem >= nr_highmem) | ||
| 848 | nr_highmem = 0; | ||
| 849 | else | ||
| 850 | nr_highmem -= free_highmem; | ||
| 851 | |||
| 852 | return nr_highmem; | ||
| 853 | } | ||
| 854 | #else | ||
| 855 | static unsigned int | ||
| 856 | count_pages_for_highmem(unsigned int nr_highmem) { return 0; } | ||
| 857 | #endif /* CONFIG_HIGHMEM */ | ||
| 805 | 858 | ||
| 806 | /** | 859 | /** |
| 807 | * enough_free_mem - Make sure we enough free memory to snapshot. | 860 | * enough_free_mem - Make sure we have enough free memory for the |
| 808 | * | 861 | * snapshot image. |
| 809 | * Returns TRUE or FALSE after checking the number of available | ||
| 810 | * free pages. | ||
| 811 | */ | 862 | */ |
| 812 | 863 | ||
| 813 | static int enough_free_mem(unsigned int nr_pages) | 864 | static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) |
| 814 | { | 865 | { |
| 815 | struct zone *zone; | 866 | struct zone *zone; |
| 816 | unsigned int free = 0, meta = 0; | 867 | unsigned int free = 0, meta = 0; |
| 817 | 868 | ||
| 818 | for_each_zone (zone) | 869 | for_each_zone(zone) { |
| 819 | if (!is_highmem(zone)) { | 870 | meta += snapshot_additional_pages(zone); |
| 871 | if (!is_highmem(zone)) | ||
| 820 | free += zone->free_pages; | 872 | free += zone->free_pages; |
| 821 | meta += snapshot_additional_pages(zone); | 873 | } |
| 822 | } | ||
| 823 | 874 | ||
| 824 | pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n", | 875 | nr_pages += count_pages_for_highmem(nr_highmem); |
| 876 | pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n", | ||
| 825 | nr_pages, PAGES_FOR_IO, meta, free); | 877 | nr_pages, PAGES_FOR_IO, meta, free); |
| 826 | 878 | ||
| 827 | return free > nr_pages + PAGES_FOR_IO + meta; | 879 | return free > nr_pages + PAGES_FOR_IO + meta; |
| 828 | } | 880 | } |
| 829 | 881 | ||
| 882 | #ifdef CONFIG_HIGHMEM | ||
| 883 | /** | ||
| 884 | * get_highmem_buffer - if there are some highmem pages in the suspend | ||
| 885 | * image, we may need the buffer to copy them and/or load their data. | ||
| 886 | */ | ||
| 887 | |||
| 888 | static inline int get_highmem_buffer(int safe_needed) | ||
| 889 | { | ||
| 890 | buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); | ||
| 891 | return buffer ? 0 : -ENOMEM; | ||
| 892 | } | ||
| 893 | |||
| 894 | /** | ||
| 895 | * alloc_highmem_image_pages - allocate some highmem pages for the image. | ||
| 896 | * Try to allocate as many pages as needed, but if the number of free | ||
| 897 | * highmem pages is lesser than that, allocate them all. | ||
| 898 | */ | ||
| 899 | |||
| 900 | static inline unsigned int | ||
| 901 | alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int nr_highmem) | ||
| 902 | { | ||
| 903 | unsigned int to_alloc = count_free_highmem_pages(); | ||
| 904 | |||
| 905 | if (to_alloc > nr_highmem) | ||
| 906 | to_alloc = nr_highmem; | ||
| 907 | |||
| 908 | nr_highmem -= to_alloc; | ||
| 909 | while (to_alloc-- > 0) { | ||
| 910 | struct page *page; | ||
| 911 | |||
| 912 | page = alloc_image_page(__GFP_HIGHMEM); | ||
| 913 | memory_bm_set_bit(bm, page_to_pfn(page)); | ||
| 914 | } | ||
| 915 | return nr_highmem; | ||
| 916 | } | ||
| 917 | #else | ||
| 918 | static inline int get_highmem_buffer(int safe_needed) { return 0; } | ||
| 919 | |||
| 920 | static inline unsigned int | ||
| 921 | alloc_highmem_image_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } | ||
| 922 | #endif /* CONFIG_HIGHMEM */ | ||
| 923 | |||
| 924 | /** | ||
| 925 | * swsusp_alloc - allocate memory for the suspend image | ||
| 926 | * | ||
| 927 | * We first try to allocate as many highmem pages as there are | ||
| 928 | * saveable highmem pages in the system. If that fails, we allocate | ||
| 929 | * non-highmem pages for the copies of the remaining highmem ones. | ||
| 930 | * | ||
| 931 | * In this approach it is likely that the copies of highmem pages will | ||
| 932 | * also be located in the high memory, because of the way in which | ||
| 933 | * copy_data_pages() works. | ||
| 934 | */ | ||
| 935 | |||
| 830 | static int | 936 | static int |
| 831 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | 937 | swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, |
| 832 | unsigned int nr_pages) | 938 | unsigned int nr_pages, unsigned int nr_highmem) |
| 833 | { | 939 | { |
| 834 | int error; | 940 | int error; |
| 835 | 941 | ||
| @@ -841,46 +947,61 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, | |||
| 841 | if (error) | 947 | if (error) |
| 842 | goto Free; | 948 | goto Free; |
| 843 | 949 | ||
| 950 | if (nr_highmem > 0) { | ||
| 951 | error = get_highmem_buffer(PG_ANY); | ||
| 952 | if (error) | ||
| 953 | goto Free; | ||
| 954 | |||
| 955 | nr_pages += alloc_highmem_image_pages(copy_bm, nr_highmem); | ||
| 956 | } | ||
| 844 | while (nr_pages-- > 0) { | 957 | while (nr_pages-- > 0) { |
| 845 | struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD); | 958 | struct page *page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); |
| 959 | |||
| 846 | if (!page) | 960 | if (!page) |
| 847 | goto Free; | 961 | goto Free; |
| 848 | 962 | ||
| 849 | SetPageNosave(page); | ||
| 850 | SetPageNosaveFree(page); | ||
| 851 | memory_bm_set_bit(copy_bm, page_to_pfn(page)); | 963 | memory_bm_set_bit(copy_bm, page_to_pfn(page)); |
| 852 | } | 964 | } |
| 853 | return 0; | 965 | return 0; |
| 854 | 966 | ||
| 855 | Free: | 967 | Free: |
| 856 | swsusp_free(); | 968 | swsusp_free(); |
| 857 | return -ENOMEM; | 969 | return -ENOMEM; |
| 858 | } | 970 | } |
| 859 | 971 | ||
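
The reworked swsusp_alloc() allocates highmem copy pages first and folds whatever could not be satisfied into the normal-page loop. A hedged sketch of that control flow; alloc_high()/alloc_low() and the free-page count are hypothetical stand-ins for alloc_image_page(__GFP_HIGHMEM) and alloc_image_page(GFP_ATOMIC | __GFP_COLD), not the kernel APIs:

```c
/* Sketch of the two-phase allocation in swsusp_alloc(): highmem first,
 * then the shortfall spills into the normal-page loop.
 */
#include <stdio.h>

static unsigned int free_highmem = 300;  /* pretend count_free_highmem_pages() */

static int alloc_high(void) { if (!free_highmem) return -1; free_highmem--; return 0; }
static int alloc_low(void)  { return 0; }   /* assume lowmem allocation succeeds */

static int swsusp_alloc_model(unsigned int nr_pages, unsigned int nr_highmem)
{
    /* Phase 1: take at most nr_highmem pages from highmem. */
    unsigned int to_alloc = free_highmem < nr_highmem ? free_highmem : nr_highmem;

    nr_highmem -= to_alloc;
    while (to_alloc-- > 0)
        if (alloc_high())
            return -1;

    /* Phase 2: the highmem shortfall is copied into normal pages. */
    nr_pages += nr_highmem;
    while (nr_pages-- > 0)
        if (alloc_low())
            return -1;
    return 0;
}

int main(void)
{
    /* 500 highmem image pages but only 300 free highmem pages:
     * 200 of the copies end up in normal memory.               */
    printf("%d\n", swsusp_alloc_model(1000, 500));
    return 0;
}
```
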
| 860 | /* Memory bitmap used for marking saveable pages */ | 972 | /* Memory bitmap used for marking saveable pages (during suspend) or the |
| 973 | * suspend image pages (during resume) | ||
| 974 | */ | ||
| 861 | static struct memory_bitmap orig_bm; | 975 | static struct memory_bitmap orig_bm; |
| 862 | /* Memory bitmap used for marking allocated pages that will contain the copies | 976 | /* Memory bitmap used on suspend for marking allocated pages that will contain |
| 863 | * of saveable pages | 977 | * the copies of saveable pages. During resume it is initially used for |
| 978 | * marking the suspend image pages, but then its set bits are duplicated in | ||
| 979 | * @orig_bm and it is released. Next, on systems with high memory, it may be | ||
| 980 | * used for marking "safe" highmem pages, but it has to be reinitialized for | ||
| 981 | * this purpose. | ||
| 864 | */ | 982 | */ |
| 865 | static struct memory_bitmap copy_bm; | 983 | static struct memory_bitmap copy_bm; |
| 866 | 984 | ||
| 867 | asmlinkage int swsusp_save(void) | 985 | asmlinkage int swsusp_save(void) |
| 868 | { | 986 | { |
| 869 | unsigned int nr_pages; | 987 | unsigned int nr_pages, nr_highmem; |
| 870 | 988 | ||
| 871 | pr_debug("swsusp: critical section: \n"); | 989 | printk("swsusp: critical section: \n"); |
| 872 | 990 | ||
| 873 | drain_local_pages(); | 991 | drain_local_pages(); |
| 874 | nr_pages = count_data_pages(); | 992 | nr_pages = count_data_pages(); |
| 875 | printk("swsusp: Need to copy %u pages\n", nr_pages); | 993 | nr_highmem = count_highmem_pages(); |
| 994 | printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem); | ||
| 876 | 995 | ||
| 877 | if (!enough_free_mem(nr_pages)) { | 996 | if (!enough_free_mem(nr_pages, nr_highmem)) { |
| 878 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | 997 | printk(KERN_ERR "swsusp: Not enough free memory\n"); |
| 879 | return -ENOMEM; | 998 | return -ENOMEM; |
| 880 | } | 999 | } |
| 881 | 1000 | ||
| 882 | if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages)) | 1001 | if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) { |
| 1002 | printk(KERN_ERR "swsusp: Memory allocation failed\n"); | ||
| 883 | return -ENOMEM; | 1003 | return -ENOMEM; |
| 1004 | } | ||
| 884 | 1005 | ||
| 885 | /* During allocating of suspend pagedir, new cold pages may appear. | 1006 | /* During allocating of suspend pagedir, new cold pages may appear. |
| 886 | * Kill them. | 1007 | * Kill them. |
| @@ -894,10 +1015,12 @@ asmlinkage int swsusp_save(void) | |||
| 894 | * touch swap space! Except we must write out our image of course. | 1015 | * touch swap space! Except we must write out our image of course. |
| 895 | */ | 1016 | */ |
| 896 | 1017 | ||
| 1018 | nr_pages += nr_highmem; | ||
| 897 | nr_copy_pages = nr_pages; | 1019 | nr_copy_pages = nr_pages; |
| 898 | nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1020 | nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); |
| 899 | 1021 | ||
| 900 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); | 1022 | printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages); |
| 1023 | |||
| 901 | return 0; | 1024 | return 0; |
| 902 | } | 1025 | } |
| 903 | 1026 | ||
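
The new nr_meta_pages formula stores one PFN per image page, so each meta page holds PAGE_SIZE / sizeof(long) entries. A quick worked example, assuming the usual 64-bit values (4096-byte pages, 8-byte long):

```c
/* Worked example for nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE).
 * The 4096/8 sizes are x86_64 assumptions, not universal.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned long nr_pages = 100000;                          /* hypothetical image */
    unsigned long pfns_per_page = PAGE_SIZE / sizeof(long);   /* 512 PFNs per page  */
    unsigned long nr_meta = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);

    printf("%lu PFNs per meta page, %lu meta pages for %lu image pages\n",
           pfns_per_page, nr_meta, nr_pages);                 /* 512, 196, 100000   */
    return 0;
}
```
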
| @@ -960,7 +1083,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) | |||
| 960 | 1083 | ||
| 961 | if (!buffer) { | 1084 | if (!buffer) { |
| 962 | /* This makes the buffer be freed by swsusp_free() */ | 1085 | /* This makes the buffer be freed by swsusp_free() */ |
| 963 | buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); | 1086 | buffer = get_image_page(GFP_ATOMIC, PG_ANY); |
| 964 | if (!buffer) | 1087 | if (!buffer) |
| 965 | return -ENOMEM; | 1088 | return -ENOMEM; |
| 966 | } | 1089 | } |
| @@ -975,9 +1098,23 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count) | |||
| 975 | memset(buffer, 0, PAGE_SIZE); | 1098 | memset(buffer, 0, PAGE_SIZE); |
| 976 | pack_pfns(buffer, &orig_bm); | 1099 | pack_pfns(buffer, &orig_bm); |
| 977 | } else { | 1100 | } else { |
| 978 | unsigned long pfn = memory_bm_next_pfn(©_bm); | 1101 | struct page *page; |
| 979 | 1102 | ||
| 980 | handle->buffer = page_address(pfn_to_page(pfn)); | 1103 | page = pfn_to_page(memory_bm_next_pfn(©_bm)); |
| 1104 | if (PageHighMem(page)) { | ||
| 1105 | /* Highmem pages are copied to the buffer, | ||
| 1106 | * because we can't return with a kmapped | ||
| 1107 | * highmem page (we may not be called again). | ||
| 1108 | */ | ||
| 1109 | void *kaddr; | ||
| 1110 | |||
| 1111 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 1112 | memcpy(buffer, kaddr, PAGE_SIZE); | ||
| 1113 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1114 | handle->buffer = buffer; | ||
| 1115 | } else { | ||
| 1116 | handle->buffer = page_address(page); | ||
| 1117 | } | ||
| 981 | } | 1118 | } |
| 982 | handle->prev = handle->cur; | 1119 | handle->prev = handle->cur; |
| 983 | } | 1120 | } |
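
The highmem branch added to snapshot_read_next() cannot hand back a kmap_atomic() address, since the mapping would not survive until the next call, so the page is bounced through the preallocated buffer. A minimal userspace model of that copy-out-before-returning pattern, with memcpy standing in for the kmap/copy/kunmap sequence:

```c
/* Model of the bounce-buffer rule: data reachable only through a temporary
 * mapping is copied into a long-lived buffer before the pointer is returned.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096            /* assumption */

static char bounce[PAGE_SIZE];    /* plays the role of the static 'buffer' */

/* 'mapped' stands in for the kmap_atomic() address of a highmem page and is
 * only valid inside this function.                                         */
static const void *read_next(const char *mapped, int is_highmem,
                             const char *lowmem_page)
{
    if (is_highmem) {
        memcpy(bounce, mapped, PAGE_SIZE);  /* copy while the mapping is live */
        return bounce;                      /* caller gets the stable copy    */
    }
    return lowmem_page;                     /* lowmem pages are always mapped */
}

int main(void)
{
    char high[PAGE_SIZE] = "highmem page contents";
    char low[PAGE_SIZE]  = "lowmem page contents";

    printf("%s\n", (const char *)read_next(high, 1, low));
    printf("%s\n", (const char *)read_next(low, 0, low));
    return 0;
}
```
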
| @@ -1005,7 +1142,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) | |||
| 1005 | unsigned long pfn, max_zone_pfn; | 1142 | unsigned long pfn, max_zone_pfn; |
| 1006 | 1143 | ||
| 1007 | /* Clear page flags */ | 1144 | /* Clear page flags */ |
| 1008 | for_each_zone (zone) { | 1145 | for_each_zone(zone) { |
| 1009 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; | 1146 | max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; |
| 1010 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) | 1147 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) |
| 1011 | if (pfn_valid(pfn)) | 1148 | if (pfn_valid(pfn)) |
| @@ -1101,6 +1238,218 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
| 1101 | } | 1238 | } |
| 1102 | } | 1239 | } |
| 1103 | 1240 | ||
| 1241 | /* List of "safe" pages that may be used to store data loaded from the suspend | ||
| 1242 | * image | ||
| 1243 | */ | ||
| 1244 | static struct linked_page *safe_pages_list; | ||
| 1245 | |||
| 1246 | #ifdef CONFIG_HIGHMEM | ||
| 1247 | /* struct highmem_pbe is used for creating the list of highmem pages that | ||
| 1248 | * should be restored atomically during the resume from disk, because the page | ||
| 1249 | * frames they have occupied before the suspend are in use. | ||
| 1250 | */ | ||
| 1251 | struct highmem_pbe { | ||
| 1252 | struct page *copy_page; /* data is here now */ | ||
| 1253 | struct page *orig_page; /* data was here before the suspend */ | ||
| 1254 | struct highmem_pbe *next; | ||
| 1255 | }; | ||
| 1256 | |||
| 1257 | /* List of highmem PBEs needed for restoring the highmem pages that were | ||
| 1258 | * allocated before the suspend and included in the suspend image, but have | ||
| 1259 | * also been allocated by the "resume" kernel, so their contents cannot be | ||
| 1260 | * written directly to their "original" page frames. | ||
| 1261 | */ | ||
| 1262 | static struct highmem_pbe *highmem_pblist; | ||
| 1263 | |||
| 1264 | /** | ||
| 1265 | * count_highmem_image_pages - compute the number of highmem pages in the | ||
| 1266 | * suspend image. The bits in the memory bitmap @bm that correspond to the | ||
| 1267 | * image pages are assumed to be set. | ||
| 1268 | */ | ||
| 1269 | |||
| 1270 | static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) | ||
| 1271 | { | ||
| 1272 | unsigned long pfn; | ||
| 1273 | unsigned int cnt = 0; | ||
| 1274 | |||
| 1275 | memory_bm_position_reset(bm); | ||
| 1276 | pfn = memory_bm_next_pfn(bm); | ||
| 1277 | while (pfn != BM_END_OF_MAP) { | ||
| 1278 | if (PageHighMem(pfn_to_page(pfn))) | ||
| 1279 | cnt++; | ||
| 1280 | |||
| 1281 | pfn = memory_bm_next_pfn(bm); | ||
| 1282 | } | ||
| 1283 | return cnt; | ||
| 1284 | } | ||
| 1285 | |||
| 1286 | /** | ||
| 1287 | * prepare_highmem_image - try to allocate as many highmem pages as | ||
| 1288 | * there are highmem image pages (@nr_highmem_p points to the variable | ||
| 1289 | * containing the number of highmem image pages). The pages that are | ||
| 1290 | * "safe" (ie. will not be overwritten when the suspend image is | ||
| 1291 | * restored) have the corresponding bits set in @bm (it must be | ||
| 1292 | * uninitialized). | ||
| 1293 | * | ||
| 1294 | * NOTE: This function should not be called if there are no highmem | ||
| 1295 | * image pages. | ||
| 1296 | */ | ||
| 1297 | |||
| 1298 | static unsigned int safe_highmem_pages; | ||
| 1299 | |||
| 1300 | static struct memory_bitmap *safe_highmem_bm; | ||
| 1301 | |||
| 1302 | static int | ||
| 1303 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | ||
| 1304 | { | ||
| 1305 | unsigned int to_alloc; | ||
| 1306 | |||
| 1307 | if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) | ||
| 1308 | return -ENOMEM; | ||
| 1309 | |||
| 1310 | if (get_highmem_buffer(PG_SAFE)) | ||
| 1311 | return -ENOMEM; | ||
| 1312 | |||
| 1313 | to_alloc = count_free_highmem_pages(); | ||
| 1314 | if (to_alloc > *nr_highmem_p) | ||
| 1315 | to_alloc = *nr_highmem_p; | ||
| 1316 | else | ||
| 1317 | *nr_highmem_p = to_alloc; | ||
| 1318 | |||
| 1319 | safe_highmem_pages = 0; | ||
| 1320 | while (to_alloc-- > 0) { | ||
| 1321 | struct page *page; | ||
| 1322 | |||
| 1323 | page = alloc_page(__GFP_HIGHMEM); | ||
| 1324 | if (!PageNosaveFree(page)) { | ||
| 1325 | /* The page is "safe", set its bit in the bitmap */ | ||
| 1326 | memory_bm_set_bit(bm, page_to_pfn(page)); | ||
| 1327 | safe_highmem_pages++; | ||
| 1328 | } | ||
| 1329 | /* Mark the page as allocated */ | ||
| 1330 | SetPageNosave(page); | ||
| 1331 | SetPageNosaveFree(page); | ||
| 1332 | } | ||
| 1333 | memory_bm_position_reset(bm); | ||
| 1334 | safe_highmem_bm = bm; | ||
| 1335 | return 0; | ||
| 1336 | } | ||
| 1337 | |||
| 1338 | /** | ||
| 1339 | * get_highmem_page_buffer - for given highmem image page find the buffer | ||
| 1340 | * that suspend_write_next() should set for its caller to write to. | ||
| 1341 | * | ||
| 1342 | * If the page is to be saved to its "original" page frame or a copy of | ||
| 1343 | * the page is to be made in the highmem, @buffer is returned. Otherwise, | ||
| 1344 | * the copy of the page is to be made in normal memory, so the address of | ||
| 1345 | * the copy is returned. | ||
| 1346 | * | ||
| 1347 | * If @buffer is returned, the caller of suspend_write_next() will write | ||
| 1348 | * the page's contents to @buffer, so they will have to be copied to the | ||
| 1349 | * right location on the next call to suspend_write_next() and it is done | ||
| 1350 | * with the help of copy_last_highmem_page(). For this purpose, if | ||
| 1351 | * @buffer is returned, @last_highmem_page is set to the page to which | ||
| 1352 | * the data will have to be copied from @buffer. | ||
| 1353 | */ | ||
| 1354 | |||
| 1355 | static struct page *last_highmem_page; | ||
| 1356 | |||
| 1357 | static void * | ||
| 1358 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | ||
| 1359 | { | ||
| 1360 | struct highmem_pbe *pbe; | ||
| 1361 | void *kaddr; | ||
| 1362 | |||
| 1363 | if (PageNosave(page) && PageNosaveFree(page)) { | ||
| 1364 | /* We have allocated the "original" page frame and we can | ||
| 1365 | * use it directly to store the loaded page. | ||
| 1366 | */ | ||
| 1367 | last_highmem_page = page; | ||
| 1368 | return buffer; | ||
| 1369 | } | ||
| 1370 | /* The "original" page frame has not been allocated and we have to | ||
| 1371 | * use a "safe" page frame to store the loaded page. | ||
| 1372 | */ | ||
| 1373 | pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); | ||
| 1374 | if (!pbe) { | ||
| 1375 | swsusp_free(); | ||
| 1376 | return NULL; | ||
| 1377 | } | ||
| 1378 | pbe->orig_page = page; | ||
| 1379 | if (safe_highmem_pages > 0) { | ||
| 1380 | struct page *tmp; | ||
| 1381 | |||
| 1382 | /* Copy of the page will be stored in high memory */ | ||
| 1383 | kaddr = buffer; | ||
| 1384 | tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm)); | ||
| 1385 | safe_highmem_pages--; | ||
| 1386 | last_highmem_page = tmp; | ||
| 1387 | pbe->copy_page = tmp; | ||
| 1388 | } else { | ||
| 1389 | /* Copy of the page will be stored in normal memory */ | ||
| 1390 | kaddr = safe_pages_list; | ||
| 1391 | safe_pages_list = safe_pages_list->next; | ||
| 1392 | pbe->copy_page = virt_to_page(kaddr); | ||
| 1393 | } | ||
| 1394 | pbe->next = highmem_pblist; | ||
| 1395 | highmem_pblist = pbe; | ||
| 1396 | return kaddr; | ||
| 1397 | } | ||
| 1398 | |||
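
get_highmem_page_buffer() above picks one of three destinations for an incoming highmem image page. The sketch below only restates that decision with hypothetical names; the flag stands in for PageNosave && PageNosaveFree and the counter for safe_highmem_pages:

```c
/* Decision sketch: where does the data of a loaded highmem page go? */
#include <stdio.h>

enum dest {
    DEST_ORIGINAL_FRAME,   /* the original frame is ours: write via 'buffer',
                              copy_last_highmem_page() moves it in place      */
    DEST_SAFE_HIGHMEM,     /* write via 'buffer', then copy into a reserved
                              highmem page recorded in a highmem_pbe          */
    DEST_SAFE_LOWMEM,      /* write directly into a reserved lowmem page
                              taken from safe_pages_list                      */
};

static enum dest pick_destination(int orig_frame_allocated_by_us,
                                  unsigned int safe_highmem_pages)
{
    if (orig_frame_allocated_by_us)
        return DEST_ORIGINAL_FRAME;
    if (safe_highmem_pages > 0)
        return DEST_SAFE_HIGHMEM;
    return DEST_SAFE_LOWMEM;
}

int main(void)
{
    printf("%d %d %d\n",
           pick_destination(1, 5),    /* 0: original frame      */
           pick_destination(0, 5),    /* 1: safe highmem copy   */
           pick_destination(0, 0));   /* 2: safe lowmem copy    */
    return 0;
}
```
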
| 1399 | /** | ||
| 1400 | * copy_last_highmem_page - copy the contents of a highmem image page from | ||
| 1401 | * @buffer, where the caller of snapshot_write_next() has placed them, | ||
| 1402 | * to the right location represented by @last_highmem_page. | ||
| 1403 | */ | ||
| 1404 | |||
| 1405 | static void copy_last_highmem_page(void) | ||
| 1406 | { | ||
| 1407 | if (last_highmem_page) { | ||
| 1408 | void *dst; | ||
| 1409 | |||
| 1410 | dst = kmap_atomic(last_highmem_page, KM_USER0); | ||
| 1411 | memcpy(dst, buffer, PAGE_SIZE); | ||
| 1412 | kunmap_atomic(dst, KM_USER0); | ||
| 1413 | last_highmem_page = NULL; | ||
| 1414 | } | ||
| 1415 | } | ||
| 1416 | |||
| 1417 | static inline int last_highmem_page_copied(void) | ||
| 1418 | { | ||
| 1419 | return !last_highmem_page; | ||
| 1420 | } | ||
| 1421 | |||
| 1422 | static inline void free_highmem_data(void) | ||
| 1423 | { | ||
| 1424 | if (safe_highmem_bm) | ||
| 1425 | memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR); | ||
| 1426 | |||
| 1427 | if (buffer) | ||
| 1428 | free_image_page(buffer, PG_UNSAFE_CLEAR); | ||
| 1429 | } | ||
| 1430 | #else | ||
| 1431 | static inline int get_safe_write_buffer(void) { return 0; } | ||
| 1432 | |||
| 1433 | static unsigned int | ||
| 1434 | count_highmem_image_pages(struct memory_bitmap *bm) { return 0; } | ||
| 1435 | |||
| 1436 | static inline int | ||
| 1437 | prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) | ||
| 1438 | { | ||
| 1439 | return 0; | ||
| 1440 | } | ||
| 1441 | |||
| 1442 | static inline void * | ||
| 1443 | get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) | ||
| 1444 | { | ||
| 1445 | return NULL; | ||
| 1446 | } | ||
| 1447 | |||
| 1448 | static inline void copy_last_highmem_page(void) {} | ||
| 1449 | static inline int last_highmem_page_copied(void) { return 1; } | ||
| 1450 | static inline void free_highmem_data(void) {} | ||
| 1451 | #endif /* CONFIG_HIGHMEM */ | ||
| 1452 | |||
| 1104 | /** | 1453 | /** |
| 1105 | * prepare_image - use the memory bitmap @bm to mark the pages that will | 1454 | * prepare_image - use the memory bitmap @bm to mark the pages that will |
| 1106 | * be overwritten in the process of restoring the system memory state | 1455 | * be overwritten in the process of restoring the system memory state |
| @@ -1110,20 +1459,25 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) | |||
| 1110 | * The idea is to allocate a new memory bitmap first and then allocate | 1459 | * The idea is to allocate a new memory bitmap first and then allocate |
| 1111 | * as many pages as needed for the image data, but not to assign these | 1460 | * as many pages as needed for the image data, but not to assign these |
| 1112 | * pages to specific tasks initially. Instead, we just mark them as | 1461 | * pages to specific tasks initially. Instead, we just mark them as |
| 1113 | * allocated and create a list of "safe" pages that will be used later. | 1462 | * allocated and create a list of "safe" pages that will be used |
| 1463 | * later. On systems with high memory a list of "safe" highmem pages is | ||
| 1464 | * also created. | ||
| 1114 | */ | 1465 | */ |
| 1115 | 1466 | ||
| 1116 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) | 1467 | #define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe)) |
| 1117 | 1468 | ||
| 1118 | static struct linked_page *safe_pages_list; | ||
| 1119 | |||
| 1120 | static int | 1469 | static int |
| 1121 | prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | 1470 | prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) |
| 1122 | { | 1471 | { |
| 1123 | unsigned int nr_pages; | 1472 | unsigned int nr_pages, nr_highmem; |
| 1124 | struct linked_page *sp_list, *lp; | 1473 | struct linked_page *sp_list, *lp; |
| 1125 | int error; | 1474 | int error; |
| 1126 | 1475 | ||
| 1476 | /* If there is no highmem, the buffer will not be necessary */ | ||
| 1477 | free_image_page(buffer, PG_UNSAFE_CLEAR); | ||
| 1478 | buffer = NULL; | ||
| 1479 | |||
| 1480 | nr_highmem = count_highmem_image_pages(bm); | ||
| 1127 | error = mark_unsafe_pages(bm); | 1481 | error = mark_unsafe_pages(bm); |
| 1128 | if (error) | 1482 | if (error) |
| 1129 | goto Free; | 1483 | goto Free; |
| @@ -1134,6 +1488,11 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 1134 | 1488 | ||
| 1135 | duplicate_memory_bitmap(new_bm, bm); | 1489 | duplicate_memory_bitmap(new_bm, bm); |
| 1136 | memory_bm_free(bm, PG_UNSAFE_KEEP); | 1490 | memory_bm_free(bm, PG_UNSAFE_KEEP); |
| 1491 | if (nr_highmem > 0) { | ||
| 1492 | error = prepare_highmem_image(bm, &nr_highmem); | ||
| 1493 | if (error) | ||
| 1494 | goto Free; | ||
| 1495 | } | ||
| 1137 | /* Reserve some safe pages for potential later use. | 1496 | /* Reserve some safe pages for potential later use. |
| 1138 | * | 1497 | * |
| 1139 | * NOTE: This way we make sure there will be enough safe pages for the | 1498 | * NOTE: This way we make sure there will be enough safe pages for the |
| @@ -1142,10 +1501,10 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 1142 | */ | 1501 | */ |
| 1143 | sp_list = NULL; | 1502 | sp_list = NULL; |
| 1144 | /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ | 1503 | /* nr_copy_pages cannot be lesser than allocated_unsafe_pages */ |
| 1145 | nr_pages = nr_copy_pages - allocated_unsafe_pages; | 1504 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
| 1146 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); | 1505 | nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE); |
| 1147 | while (nr_pages > 0) { | 1506 | while (nr_pages > 0) { |
| 1148 | lp = alloc_image_page(GFP_ATOMIC, PG_SAFE); | 1507 | lp = get_image_page(GFP_ATOMIC, PG_SAFE); |
| 1149 | if (!lp) { | 1508 | if (!lp) { |
| 1150 | error = -ENOMEM; | 1509 | error = -ENOMEM; |
| 1151 | goto Free; | 1510 | goto Free; |
| @@ -1156,7 +1515,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 1156 | } | 1515 | } |
| 1157 | /* Preallocate memory for the image */ | 1516 | /* Preallocate memory for the image */ |
| 1158 | safe_pages_list = NULL; | 1517 | safe_pages_list = NULL; |
| 1159 | nr_pages = nr_copy_pages - allocated_unsafe_pages; | 1518 | nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages; |
| 1160 | while (nr_pages > 0) { | 1519 | while (nr_pages > 0) { |
| 1161 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); | 1520 | lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC); |
| 1162 | if (!lp) { | 1521 | if (!lp) { |
| @@ -1181,7 +1540,7 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) | |||
| 1181 | } | 1540 | } |
| 1182 | return 0; | 1541 | return 0; |
| 1183 | 1542 | ||
| 1184 | Free: | 1543 | Free: |
| 1185 | swsusp_free(); | 1544 | swsusp_free(); |
| 1186 | return error; | 1545 | return error; |
| 1187 | } | 1546 | } |
| @@ -1196,6 +1555,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
| 1196 | struct pbe *pbe; | 1555 | struct pbe *pbe; |
| 1197 | struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); | 1556 | struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); |
| 1198 | 1557 | ||
| 1558 | if (PageHighMem(page)) | ||
| 1559 | return get_highmem_page_buffer(page, ca); | ||
| 1560 | |||
| 1199 | if (PageNosave(page) && PageNosaveFree(page)) | 1561 | if (PageNosave(page) && PageNosaveFree(page)) |
| 1200 | /* We have allocated the "original" page frame and we can | 1562 | /* We have allocated the "original" page frame and we can |
| 1201 | * use it directly to store the loaded page. | 1563 | * use it directly to store the loaded page. |
| @@ -1210,12 +1572,12 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) | |||
| 1210 | swsusp_free(); | 1572 | swsusp_free(); |
| 1211 | return NULL; | 1573 | return NULL; |
| 1212 | } | 1574 | } |
| 1213 | pbe->orig_address = (unsigned long)page_address(page); | 1575 | pbe->orig_address = page_address(page); |
| 1214 | pbe->address = (unsigned long)safe_pages_list; | 1576 | pbe->address = safe_pages_list; |
| 1215 | safe_pages_list = safe_pages_list->next; | 1577 | safe_pages_list = safe_pages_list->next; |
| 1216 | pbe->next = restore_pblist; | 1578 | pbe->next = restore_pblist; |
| 1217 | restore_pblist = pbe; | 1579 | restore_pblist = pbe; |
| 1218 | return (void *)pbe->address; | 1580 | return pbe->address; |
| 1219 | } | 1581 | } |
| 1220 | 1582 | ||
| 1221 | /** | 1583 | /** |
| @@ -1249,14 +1611,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
| 1249 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) | 1611 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) |
| 1250 | return 0; | 1612 | return 0; |
| 1251 | 1613 | ||
| 1252 | if (!buffer) { | 1614 | if (handle->offset == 0) { |
| 1253 | /* This makes the buffer be freed by swsusp_free() */ | 1615 | if (!buffer) |
| 1254 | buffer = alloc_image_page(GFP_ATOMIC, PG_ANY); | 1616 | /* This makes the buffer be freed by swsusp_free() */ |
| 1617 | buffer = get_image_page(GFP_ATOMIC, PG_ANY); | ||
| 1618 | |||
| 1255 | if (!buffer) | 1619 | if (!buffer) |
| 1256 | return -ENOMEM; | 1620 | return -ENOMEM; |
| 1257 | } | 1621 | |
| 1258 | if (!handle->offset) | ||
| 1259 | handle->buffer = buffer; | 1622 | handle->buffer = buffer; |
| 1623 | } | ||
| 1260 | handle->sync_read = 1; | 1624 | handle->sync_read = 1; |
| 1261 | if (handle->prev < handle->cur) { | 1625 | if (handle->prev < handle->cur) { |
| 1262 | if (handle->prev == 0) { | 1626 | if (handle->prev == 0) { |
| @@ -1284,8 +1648,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
| 1284 | return -ENOMEM; | 1648 | return -ENOMEM; |
| 1285 | } | 1649 | } |
| 1286 | } else { | 1650 | } else { |
| 1651 | copy_last_highmem_page(); | ||
| 1287 | handle->buffer = get_buffer(&orig_bm, &ca); | 1652 | handle->buffer = get_buffer(&orig_bm, &ca); |
| 1288 | handle->sync_read = 0; | 1653 | if (handle->buffer != buffer) |
| 1654 | handle->sync_read = 0; | ||
| 1289 | } | 1655 | } |
| 1290 | handle->prev = handle->cur; | 1656 | handle->prev = handle->cur; |
| 1291 | } | 1657 | } |
| @@ -1301,15 +1667,73 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) | |||
| 1301 | return count; | 1667 | return count; |
| 1302 | } | 1668 | } |
| 1303 | 1669 | ||
| 1670 | /** | ||
| 1671 | * snapshot_write_finalize - must be called after the last call to | ||
| 1672 | * snapshot_write_next() in case the last page in the image happens | ||
| 1673 | * to be a highmem page and its contents should be stored in the | ||
| 1674 | * highmem. Additionally, it releases the memory that will not be | ||
| 1675 | * used any more. | ||
| 1676 | */ | ||
| 1677 | |||
| 1678 | void snapshot_write_finalize(struct snapshot_handle *handle) | ||
| 1679 | { | ||
| 1680 | copy_last_highmem_page(); | ||
| 1681 | /* Free only if we have loaded the image entirely */ | ||
| 1682 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { | ||
| 1683 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | ||
| 1684 | free_highmem_data(); | ||
| 1685 | } | ||
| 1686 | } | ||
| 1687 | |||
| 1304 | int snapshot_image_loaded(struct snapshot_handle *handle) | 1688 | int snapshot_image_loaded(struct snapshot_handle *handle) |
| 1305 | { | 1689 | { |
| 1306 | return !(!nr_copy_pages || | 1690 | return !(!nr_copy_pages || !last_highmem_page_copied() || |
| 1307 | handle->cur <= nr_meta_pages + nr_copy_pages); | 1691 | handle->cur <= nr_meta_pages + nr_copy_pages); |
| 1308 | } | 1692 | } |
| 1309 | 1693 | ||
| 1310 | void snapshot_free_unused_memory(struct snapshot_handle *handle) | 1694 | #ifdef CONFIG_HIGHMEM |
| 1695 | /* Assumes that @buf is ready and points to a "safe" page */ | ||
| 1696 | static inline void | ||
| 1697 | swap_two_pages_data(struct page *p1, struct page *p2, void *buf) | ||
| 1311 | { | 1698 | { |
| 1312 | /* Free only if we have loaded the image entirely */ | 1699 | void *kaddr1, *kaddr2; |
| 1313 | if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) | 1700 | |
| 1314 | memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); | 1701 | kaddr1 = kmap_atomic(p1, KM_USER0); |
| 1702 | kaddr2 = kmap_atomic(p2, KM_USER1); | ||
| 1703 | memcpy(buf, kaddr1, PAGE_SIZE); | ||
| 1704 | memcpy(kaddr1, kaddr2, PAGE_SIZE); | ||
| 1705 | memcpy(kaddr2, buf, PAGE_SIZE); | ||
| 1706 | kunmap_atomic(kaddr1, KM_USER0); | ||
| 1707 | kunmap_atomic(kaddr2, KM_USER1); | ||
| 1708 | } | ||
| 1709 | |||
| 1710 | /** | ||
| 1711 | * restore_highmem - for each highmem page that was allocated before | ||
| 1712 | * the suspend and included in the suspend image, and also has been | ||
| 1713 | * allocated by the "resume" kernel, swap its current (ie. "before | ||
| 1714 | * resume") contents with the previous (ie. "before suspend") one. | ||
| 1715 | * | ||
| 1716 | * If the resume eventually fails, we can call this function once | ||
| 1717 | * again and restore the "before resume" highmem state. | ||
| 1718 | */ | ||
| 1719 | |||
| 1720 | int restore_highmem(void) | ||
| 1721 | { | ||
| 1722 | struct highmem_pbe *pbe = highmem_pblist; | ||
| 1723 | void *buf; | ||
| 1724 | |||
| 1725 | if (!pbe) | ||
| 1726 | return 0; | ||
| 1727 | |||
| 1728 | buf = get_image_page(GFP_ATOMIC, PG_SAFE); | ||
| 1729 | if (!buf) | ||
| 1730 | return -ENOMEM; | ||
| 1731 | |||
| 1732 | while (pbe) { | ||
| 1733 | swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf); | ||
| 1734 | pbe = pbe->next; | ||
| 1735 | } | ||
| 1736 | free_image_page(buf, PG_UNSAFE_CLEAR); | ||
| 1737 | return 0; | ||
| 1315 | } | 1738 | } |
| 1739 | #endif /* CONFIG_HIGHMEM */ | ||
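
restore_highmem() walks highmem_pblist and exchanges the "before resume" and "before suspend" contents of each pair through one bounce page, so running it a second time undoes the first pass, which is what the failure path relies on. The exchange itself is the classic three-copy swap; a standalone version on plain heap buffers instead of kmapped highmem pages:

```c
/* The three-memcpy exchange used by swap_two_pages_data().  Applying the
 * swap twice restores the original contents, which is why a failed resume
 * can call restore_highmem() again to undo itself.
 */
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096   /* assumption */

static void swap_pages(char *p1, char *p2, char *buf)
{
    memcpy(buf, p1, PAGE_SIZE);   /* buf <- p1      */
    memcpy(p1, p2, PAGE_SIZE);    /* p1  <- p2      */
    memcpy(p2, buf, PAGE_SIZE);   /* p2  <- old p1  */
}

int main(void)
{
    char a[PAGE_SIZE] = "before suspend";
    char b[PAGE_SIZE] = "before resume";
    char tmp[PAGE_SIZE];

    swap_pages(a, b, tmp);
    printf("a=\"%s\" b=\"%s\"\n", a, b);   /* swapped       */
    swap_pages(a, b, tmp);
    printf("a=\"%s\" b=\"%s\"\n", a, b);   /* back to start */
    return 0;
}
```
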
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 1a3b0dd2c3fc..3581f8f86acd 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -34,34 +34,123 @@ extern char resume_file[]; | |||
| 34 | #define SWSUSP_SIG "S1SUSPEND" | 34 | #define SWSUSP_SIG "S1SUSPEND" |
| 35 | 35 | ||
| 36 | static struct swsusp_header { | 36 | static struct swsusp_header { |
| 37 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | 37 | char reserved[PAGE_SIZE - 20 - sizeof(sector_t)]; |
| 38 | swp_entry_t image; | 38 | sector_t image; |
| 39 | char orig_sig[10]; | 39 | char orig_sig[10]; |
| 40 | char sig[10]; | 40 | char sig[10]; |
| 41 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; | 41 | } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header; |
| 42 | 42 | ||
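
The reserved[] sizing keeps the on-disk header exactly one page: the two 10-byte signatures stay at the end and the image pointer sits just before them, so switching it from swp_entry_t to sector_t only changes the padding. A quick layout check, assuming 4096-byte pages and an 8-byte sector_t (typical for a 64-bit build):

```c
/* Layout check: reserved + image + two 10-byte signatures must add up to
 * exactly one page.  The 4096/8 sizes are assumptions, not guaranteed.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
typedef uint64_t sector_t;   /* stand-in for the kernel type */

struct swsusp_header_model {
    char reserved[PAGE_SIZE - 20 - sizeof(sector_t)];
    sector_t image;
    char orig_sig[10];
    char sig[10];
} __attribute__((packed));

int main(void)
{
    printf("reserved=%zu total=%zu\n",
           sizeof(((struct swsusp_header_model *)0)->reserved),
           sizeof(struct swsusp_header_model));    /* 4068, 4096 */
    return 0;
}
```
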
| 43 | /* | 43 | /* |
| 44 | * Saving part... | 44 | * General things |
| 45 | */ | 45 | */ |
| 46 | 46 | ||
| 47 | static unsigned short root_swap = 0xffff; | 47 | static unsigned short root_swap = 0xffff; |
| 48 | static struct block_device *resume_bdev; | ||
| 49 | |||
| 50 | /** | ||
| 51 | * submit - submit BIO request. | ||
| 52 | * @rw: READ or WRITE. | ||
| 53 | * @off: physical offset of page. | ||
| 54 | * @page: page we're reading or writing. | ||
| 55 | * @bio_chain: list of pending biod (for async reading) | ||
| 56 | * | ||
| 57 | * Straight from the textbook - allocate and initialize the bio. | ||
| 58 | * If we're reading, make sure the page is marked as dirty. | ||
| 59 | * Then submit it and, if @bio_chain == NULL, wait. | ||
| 60 | */ | ||
| 61 | static int submit(int rw, pgoff_t page_off, struct page *page, | ||
| 62 | struct bio **bio_chain) | ||
| 63 | { | ||
| 64 | struct bio *bio; | ||
| 65 | |||
| 66 | bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); | ||
| 67 | if (!bio) | ||
| 68 | return -ENOMEM; | ||
| 69 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
| 70 | bio->bi_bdev = resume_bdev; | ||
| 71 | bio->bi_end_io = end_swap_bio_read; | ||
| 72 | |||
| 73 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { | ||
| 74 | printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); | ||
| 75 | bio_put(bio); | ||
| 76 | return -EFAULT; | ||
| 77 | } | ||
| 78 | |||
| 79 | lock_page(page); | ||
| 80 | bio_get(bio); | ||
| 81 | |||
| 82 | if (bio_chain == NULL) { | ||
| 83 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
| 84 | wait_on_page_locked(page); | ||
| 85 | if (rw == READ) | ||
| 86 | bio_set_pages_dirty(bio); | ||
| 87 | bio_put(bio); | ||
| 88 | } else { | ||
| 89 | if (rw == READ) | ||
| 90 | get_page(page); /* These pages are freed later */ | ||
| 91 | bio->bi_private = *bio_chain; | ||
| 92 | *bio_chain = bio; | ||
| 93 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
| 94 | } | ||
| 95 | return 0; | ||
| 96 | } | ||
| 97 | |||
| 98 | static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) | ||
| 99 | { | ||
| 100 | return submit(READ, page_off, virt_to_page(addr), bio_chain); | ||
| 101 | } | ||
| 102 | |||
| 103 | static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain) | ||
| 104 | { | ||
| 105 | return submit(WRITE, page_off, virt_to_page(addr), bio_chain); | ||
| 106 | } | ||
| 107 | |||
| 108 | static int wait_on_bio_chain(struct bio **bio_chain) | ||
| 109 | { | ||
| 110 | struct bio *bio; | ||
| 111 | struct bio *next_bio; | ||
| 112 | int ret = 0; | ||
| 113 | |||
| 114 | if (bio_chain == NULL) | ||
| 115 | return 0; | ||
| 116 | |||
| 117 | bio = *bio_chain; | ||
| 118 | if (bio == NULL) | ||
| 119 | return 0; | ||
| 120 | while (bio) { | ||
| 121 | struct page *page; | ||
| 122 | |||
| 123 | next_bio = bio->bi_private; | ||
| 124 | page = bio->bi_io_vec[0].bv_page; | ||
| 125 | wait_on_page_locked(page); | ||
| 126 | if (!PageUptodate(page) || PageError(page)) | ||
| 127 | ret = -EIO; | ||
| 128 | put_page(page); | ||
| 129 | bio_put(bio); | ||
| 130 | bio = next_bio; | ||
| 131 | } | ||
| 132 | *bio_chain = NULL; | ||
| 133 | return ret; | ||
| 134 | } | ||
| 135 | |||
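
submit() either waits for the I/O on the spot (bio_chain == NULL) or pushes the bio onto a singly linked list threaded through bi_private, which wait_on_bio_chain() later drains while collecting any error. A hedged userspace model of that chain-and-drain pattern, with a plain struct standing in for struct bio:

```c
/* Model of the bio chaining: asynchronous requests are pushed onto a singly
 * linked list and a single drain pass waits for all of them, remembering
 * whether any failed.
 */
#include <stdio.h>
#include <stdlib.h>

struct request {                 /* stand-in for struct bio          */
    int failed;                  /* stand-in for !PageUptodate/error */
    struct request *next;        /* stand-in for bio->bi_private     */
};

static void submit_async(struct request **chain, int failed)
{
    struct request *req = malloc(sizeof(*req));

    if (!req)
        exit(1);
    req->failed = failed;
    req->next = *chain;          /* push onto the pending chain */
    *chain = req;
}

static int wait_on_chain(struct request **chain)
{
    int ret = 0;

    while (*chain) {
        struct request *req = *chain;

        *chain = req->next;      /* "wait", then release the request */
        if (req->failed)
            ret = -5;            /* mirrors ret = -EIO */
        free(req);
    }
    return ret;
}

int main(void)
{
    struct request *chain = NULL;

    submit_async(&chain, 0);
    submit_async(&chain, 1);     /* one failure taints the whole batch */
    submit_async(&chain, 0);
    printf("drain -> %d\n", wait_on_chain(&chain));   /* -5 */
    return 0;
}
```
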
| 136 | /* | ||
| 137 | * Saving part | ||
| 138 | */ | ||
| 48 | 139 | ||
| 49 | static int mark_swapfiles(swp_entry_t start) | 140 | static int mark_swapfiles(sector_t start) |
| 50 | { | 141 | { |
| 51 | int error; | 142 | int error; |
| 52 | 143 | ||
| 53 | rw_swap_page_sync(READ, swp_entry(root_swap, 0), | 144 | bio_read_page(swsusp_resume_block, &swsusp_header, NULL); |
| 54 | virt_to_page((unsigned long)&swsusp_header), NULL); | ||
| 55 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || | 145 | if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || |
| 56 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | 146 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { |
| 57 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 147 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
| 58 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 148 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
| 59 | swsusp_header.image = start; | 149 | swsusp_header.image = start; |
| 60 | error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), | 150 | error = bio_write_page(swsusp_resume_block, |
| 61 | virt_to_page((unsigned long)&swsusp_header), | 151 | &swsusp_header, NULL); |
| 62 | NULL); | ||
| 63 | } else { | 152 | } else { |
| 64 | pr_debug("swsusp: Partition is not swap space.\n"); | 153 | printk(KERN_ERR "swsusp: Swap header not found!\n"); |
| 65 | error = -ENODEV; | 154 | error = -ENODEV; |
| 66 | } | 155 | } |
| 67 | return error; | 156 | return error; |
| @@ -74,12 +163,22 @@ static int mark_swapfiles(swp_entry_t start) | |||
| 74 | 163 | ||
| 75 | static int swsusp_swap_check(void) /* This is called before saving image */ | 164 | static int swsusp_swap_check(void) /* This is called before saving image */ |
| 76 | { | 165 | { |
| 77 | int res = swap_type_of(swsusp_resume_device); | 166 | int res; |
| 167 | |||
| 168 | res = swap_type_of(swsusp_resume_device, swsusp_resume_block, | ||
| 169 | &resume_bdev); | ||
| 170 | if (res < 0) | ||
| 171 | return res; | ||
| 172 | |||
| 173 | root_swap = res; | ||
| 174 | res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); | ||
| 175 | if (res) | ||
| 176 | return res; | ||
| 177 | |||
| 178 | res = set_blocksize(resume_bdev, PAGE_SIZE); | ||
| 179 | if (res < 0) | ||
| 180 | blkdev_put(resume_bdev); | ||
| 78 | 181 | ||
| 79 | if (res >= 0) { | ||
| 80 | root_swap = res; | ||
| 81 | return 0; | ||
| 82 | } | ||
| 83 | return res; | 182 | return res; |
| 84 | } | 183 | } |
| 85 | 184 | ||
| @@ -90,36 +189,26 @@ static int swsusp_swap_check(void) /* This is called before saving image */ | |||
| 90 | * @bio_chain: Link the next write BIO here | 189 | * @bio_chain: Link the next write BIO here |
| 91 | */ | 190 | */ |
| 92 | 191 | ||
| 93 | static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) | 192 | static int write_page(void *buf, sector_t offset, struct bio **bio_chain) |
| 94 | { | 193 | { |
| 95 | swp_entry_t entry; | 194 | void *src; |
| 96 | int error = -ENOSPC; | 195 | |
| 97 | 196 | if (!offset) | |
| 98 | if (offset) { | 197 | return -ENOSPC; |
| 99 | struct page *page = virt_to_page(buf); | 198 | |
| 100 | 199 | if (bio_chain) { | |
| 101 | if (bio_chain) { | 200 | src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); |
| 102 | /* | 201 | if (src) { |
| 103 | * Whether or not we successfully allocated a copy page, | 202 | memcpy(src, buf, PAGE_SIZE); |
| 104 | * we take a ref on the page here. It gets undone in | 203 | } else { |
| 105 | * wait_on_bio_chain(). | 204 | WARN_ON_ONCE(1); |
| 106 | */ | 205 | bio_chain = NULL; /* Go synchronous */ |
| 107 | struct page *page_copy; | 206 | src = buf; |
| 108 | page_copy = alloc_page(GFP_ATOMIC); | ||
| 109 | if (page_copy == NULL) { | ||
| 110 | WARN_ON_ONCE(1); | ||
| 111 | bio_chain = NULL; /* Go synchronous */ | ||
| 112 | get_page(page); | ||
| 113 | } else { | ||
| 114 | memcpy(page_address(page_copy), | ||
| 115 | page_address(page), PAGE_SIZE); | ||
| 116 | page = page_copy; | ||
| 117 | } | ||
| 118 | } | 207 | } |
| 119 | entry = swp_entry(root_swap, offset); | 208 | } else { |
| 120 | error = rw_swap_page_sync(WRITE, entry, page, bio_chain); | 209 | src = buf; |
| 121 | } | 210 | } |
| 122 | return error; | 211 | return bio_write_page(offset, src, bio_chain); |
| 123 | } | 212 | } |
| 124 | 213 | ||
| 125 | /* | 214 | /* |
| @@ -137,11 +226,11 @@ static int write_page(void *buf, unsigned long offset, struct bio **bio_chain) | |||
| 137 | * at a time. | 226 | * at a time. |
| 138 | */ | 227 | */ |
| 139 | 228 | ||
| 140 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(long) - 1) | 229 | #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) |
| 141 | 230 | ||
| 142 | struct swap_map_page { | 231 | struct swap_map_page { |
| 143 | unsigned long entries[MAP_PAGE_ENTRIES]; | 232 | sector_t entries[MAP_PAGE_ENTRIES]; |
| 144 | unsigned long next_swap; | 233 | sector_t next_swap; |
| 145 | }; | 234 | }; |
| 146 | 235 | ||
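
Each swap_map_page is an on-disk index page: MAP_PAGE_ENTRIES data sectors plus one link to the next map page, forming a linked list of index pages on the swap device. A quick capacity check, assuming 4096-byte pages and an 8-byte sector_t:

```c
/* Capacity of one swap_map_page under 4096-byte pages and an 8-byte
 * sector_t (assumptions for a typical 64-bit build).
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
typedef uint64_t sector_t;

#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)

int main(void)
{
    unsigned long image_pages = 100000;   /* hypothetical image size */
    unsigned long map_pages =
        (image_pages + MAP_PAGE_ENTRIES - 1) / MAP_PAGE_ENTRIES;

    printf("%zu entries per map page, %lu map pages for %lu image pages\n",
           MAP_PAGE_ENTRIES, map_pages, image_pages);   /* 511, 196, 100000 */
    return 0;
}
```
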
| 147 | /** | 236 | /** |
| @@ -151,7 +240,7 @@ struct swap_map_page { | |||
| 151 | 240 | ||
| 152 | struct swap_map_handle { | 241 | struct swap_map_handle { |
| 153 | struct swap_map_page *cur; | 242 | struct swap_map_page *cur; |
| 154 | unsigned long cur_swap; | 243 | sector_t cur_swap; |
| 155 | struct bitmap_page *bitmap; | 244 | struct bitmap_page *bitmap; |
| 156 | unsigned int k; | 245 | unsigned int k; |
| 157 | }; | 246 | }; |
| @@ -166,26 +255,6 @@ static void release_swap_writer(struct swap_map_handle *handle) | |||
| 166 | handle->bitmap = NULL; | 255 | handle->bitmap = NULL; |
| 167 | } | 256 | } |
| 168 | 257 | ||
| 169 | static void show_speed(struct timeval *start, struct timeval *stop, | ||
| 170 | unsigned nr_pages, char *msg) | ||
| 171 | { | ||
| 172 | s64 elapsed_centisecs64; | ||
| 173 | int centisecs; | ||
| 174 | int k; | ||
| 175 | int kps; | ||
| 176 | |||
| 177 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | ||
| 178 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | ||
| 179 | centisecs = elapsed_centisecs64; | ||
| 180 | if (centisecs == 0) | ||
| 181 | centisecs = 1; /* avoid div-by-zero */ | ||
| 182 | k = nr_pages * (PAGE_SIZE / 1024); | ||
| 183 | kps = (k * 100) / centisecs; | ||
| 184 | printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, | ||
| 185 | centisecs / 100, centisecs % 100, | ||
| 186 | kps / 1000, (kps % 1000) / 10); | ||
| 187 | } | ||
| 188 | |||
| 189 | static int get_swap_writer(struct swap_map_handle *handle) | 258 | static int get_swap_writer(struct swap_map_handle *handle) |
| 190 | { | 259 | { |
| 191 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); | 260 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); |
| @@ -196,7 +265,7 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
| 196 | release_swap_writer(handle); | 265 | release_swap_writer(handle); |
| 197 | return -ENOMEM; | 266 | return -ENOMEM; |
| 198 | } | 267 | } |
| 199 | handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap); | 268 | handle->cur_swap = alloc_swapdev_block(root_swap, handle->bitmap); |
| 200 | if (!handle->cur_swap) { | 269 | if (!handle->cur_swap) { |
| 201 | release_swap_writer(handle); | 270 | release_swap_writer(handle); |
| 202 | return -ENOSPC; | 271 | return -ENOSPC; |
| @@ -205,43 +274,15 @@ static int get_swap_writer(struct swap_map_handle *handle) | |||
| 205 | return 0; | 274 | return 0; |
| 206 | } | 275 | } |
| 207 | 276 | ||
| 208 | static int wait_on_bio_chain(struct bio **bio_chain) | ||
| 209 | { | ||
| 210 | struct bio *bio; | ||
| 211 | struct bio *next_bio; | ||
| 212 | int ret = 0; | ||
| 213 | |||
| 214 | if (bio_chain == NULL) | ||
| 215 | return 0; | ||
| 216 | |||
| 217 | bio = *bio_chain; | ||
| 218 | if (bio == NULL) | ||
| 219 | return 0; | ||
| 220 | while (bio) { | ||
| 221 | struct page *page; | ||
| 222 | |||
| 223 | next_bio = bio->bi_private; | ||
| 224 | page = bio->bi_io_vec[0].bv_page; | ||
| 225 | wait_on_page_locked(page); | ||
| 226 | if (!PageUptodate(page) || PageError(page)) | ||
| 227 | ret = -EIO; | ||
| 228 | put_page(page); | ||
| 229 | bio_put(bio); | ||
| 230 | bio = next_bio; | ||
| 231 | } | ||
| 232 | *bio_chain = NULL; | ||
| 233 | return ret; | ||
| 234 | } | ||
| 235 | |||
| 236 | static int swap_write_page(struct swap_map_handle *handle, void *buf, | 277 | static int swap_write_page(struct swap_map_handle *handle, void *buf, |
| 237 | struct bio **bio_chain) | 278 | struct bio **bio_chain) |
| 238 | { | 279 | { |
| 239 | int error = 0; | 280 | int error = 0; |
| 240 | unsigned long offset; | 281 | sector_t offset; |
| 241 | 282 | ||
| 242 | if (!handle->cur) | 283 | if (!handle->cur) |
| 243 | return -EINVAL; | 284 | return -EINVAL; |
| 244 | offset = alloc_swap_page(root_swap, handle->bitmap); | 285 | offset = alloc_swapdev_block(root_swap, handle->bitmap); |
| 245 | error = write_page(buf, offset, bio_chain); | 286 | error = write_page(buf, offset, bio_chain); |
| 246 | if (error) | 287 | if (error) |
| 247 | return error; | 288 | return error; |
| @@ -250,7 +291,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
| 250 | error = wait_on_bio_chain(bio_chain); | 291 | error = wait_on_bio_chain(bio_chain); |
| 251 | if (error) | 292 | if (error) |
| 252 | goto out; | 293 | goto out; |
| 253 | offset = alloc_swap_page(root_swap, handle->bitmap); | 294 | offset = alloc_swapdev_block(root_swap, handle->bitmap); |
| 254 | if (!offset) | 295 | if (!offset) |
| 255 | return -ENOSPC; | 296 | return -ENOSPC; |
| 256 | handle->cur->next_swap = offset; | 297 | handle->cur->next_swap = offset; |
| @@ -261,7 +302,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, | |||
| 261 | handle->cur_swap = offset; | 302 | handle->cur_swap = offset; |
| 262 | handle->k = 0; | 303 | handle->k = 0; |
| 263 | } | 304 | } |
| 264 | out: | 305 | out: |
| 265 | return error; | 306 | return error; |
| 266 | } | 307 | } |
| 267 | 308 | ||
| @@ -315,7 +356,7 @@ static int save_image(struct swap_map_handle *handle, | |||
| 315 | error = err2; | 356 | error = err2; |
| 316 | if (!error) | 357 | if (!error) |
| 317 | printk("\b\b\b\bdone\n"); | 358 | printk("\b\b\b\bdone\n"); |
| 318 | show_speed(&start, &stop, nr_to_write, "Wrote"); | 359 | swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); |
| 319 | return error; | 360 | return error; |
| 320 | } | 361 | } |
| 321 | 362 | ||
| @@ -350,100 +391,50 @@ int swsusp_write(void) | |||
| 350 | struct swsusp_info *header; | 391 | struct swsusp_info *header; |
| 351 | int error; | 392 | int error; |
| 352 | 393 | ||
| 353 | if ((error = swsusp_swap_check())) { | 394 | error = swsusp_swap_check(); |
| 395 | if (error) { | ||
| 354 | printk(KERN_ERR "swsusp: Cannot find swap device, try " | 396 | printk(KERN_ERR "swsusp: Cannot find swap device, try " |
| 355 | "swapon -a.\n"); | 397 | "swapon -a.\n"); |
| 356 | return error; | 398 | return error; |
| 357 | } | 399 | } |
| 358 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); | 400 | memset(&snapshot, 0, sizeof(struct snapshot_handle)); |
| 359 | error = snapshot_read_next(&snapshot, PAGE_SIZE); | 401 | error = snapshot_read_next(&snapshot, PAGE_SIZE); |
| 360 | if (error < PAGE_SIZE) | 402 | if (error < PAGE_SIZE) { |
| 361 | return error < 0 ? error : -EFAULT; | 403 | if (error >= 0) |
| 404 | error = -EFAULT; | ||
| 405 | |||
| 406 | goto out; | ||
| 407 | } | ||
| 362 | header = (struct swsusp_info *)data_of(snapshot); | 408 | header = (struct swsusp_info *)data_of(snapshot); |
| 363 | if (!enough_swap(header->pages)) { | 409 | if (!enough_swap(header->pages)) { |
| 364 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | 410 | printk(KERN_ERR "swsusp: Not enough free swap\n"); |
| 365 | return -ENOSPC; | 411 | error = -ENOSPC; |
| 412 | goto out; | ||
| 366 | } | 413 | } |
| 367 | error = get_swap_writer(&handle); | 414 | error = get_swap_writer(&handle); |
| 368 | if (!error) { | 415 | if (!error) { |
| 369 | unsigned long start = handle.cur_swap; | 416 | sector_t start = handle.cur_swap; |
| 417 | |||
| 370 | error = swap_write_page(&handle, header, NULL); | 418 | error = swap_write_page(&handle, header, NULL); |
| 371 | if (!error) | 419 | if (!error) |
| 372 | error = save_image(&handle, &snapshot, | 420 | error = save_image(&handle, &snapshot, |
| 373 | header->pages - 1); | 421 | header->pages - 1); |
| 422 | |||
| 374 | if (!error) { | 423 | if (!error) { |
| 375 | flush_swap_writer(&handle); | 424 | flush_swap_writer(&handle); |
| 376 | printk("S"); | 425 | printk("S"); |
| 377 | error = mark_swapfiles(swp_entry(root_swap, start)); | 426 | error = mark_swapfiles(start); |
| 378 | printk("|\n"); | 427 | printk("|\n"); |
| 379 | } | 428 | } |
| 380 | } | 429 | } |
| 381 | if (error) | 430 | if (error) |
| 382 | free_all_swap_pages(root_swap, handle.bitmap); | 431 | free_all_swap_pages(root_swap, handle.bitmap); |
| 383 | release_swap_writer(&handle); | 432 | release_swap_writer(&handle); |
| 433 | out: | ||
| 434 | swsusp_close(); | ||
| 384 | return error; | 435 | return error; |
| 385 | } | 436 | } |
| 386 | 437 | ||
| 387 | static struct block_device *resume_bdev; | ||
| 388 | |||
| 389 | /** | ||
| 390 | * submit - submit BIO request. | ||
| 391 | * @rw: READ or WRITE. | ||
| 392 | * @off physical offset of page. | ||
| 393 | * @page: page we're reading or writing. | ||
| 394 | * @bio_chain: list of pending biod (for async reading) | ||
| 395 | * | ||
| 396 | * Straight from the textbook - allocate and initialize the bio. | ||
| 397 | * If we're reading, make sure the page is marked as dirty. | ||
| 398 | * Then submit it and, if @bio_chain == NULL, wait. | ||
| 399 | */ | ||
| 400 | static int submit(int rw, pgoff_t page_off, struct page *page, | ||
| 401 | struct bio **bio_chain) | ||
| 402 | { | ||
| 403 | struct bio *bio; | ||
| 404 | |||
| 405 | bio = bio_alloc(GFP_ATOMIC, 1); | ||
| 406 | if (!bio) | ||
| 407 | return -ENOMEM; | ||
| 408 | bio->bi_sector = page_off * (PAGE_SIZE >> 9); | ||
| 409 | bio->bi_bdev = resume_bdev; | ||
| 410 | bio->bi_end_io = end_swap_bio_read; | ||
| 411 | |||
| 412 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { | ||
| 413 | printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); | ||
| 414 | bio_put(bio); | ||
| 415 | return -EFAULT; | ||
| 416 | } | ||
| 417 | |||
| 418 | lock_page(page); | ||
| 419 | bio_get(bio); | ||
| 420 | |||
| 421 | if (bio_chain == NULL) { | ||
| 422 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
| 423 | wait_on_page_locked(page); | ||
| 424 | if (rw == READ) | ||
| 425 | bio_set_pages_dirty(bio); | ||
| 426 | bio_put(bio); | ||
| 427 | } else { | ||
| 428 | if (rw == READ) | ||
| 429 | get_page(page); /* These pages are freed later */ | ||
| 430 | bio->bi_private = *bio_chain; | ||
| 431 | *bio_chain = bio; | ||
| 432 | submit_bio(rw | (1 << BIO_RW_SYNC), bio); | ||
| 433 | } | ||
| 434 | return 0; | ||
| 435 | } | ||
| 436 | |||
| 437 | static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain) | ||
| 438 | { | ||
| 439 | return submit(READ, page_off, virt_to_page(addr), bio_chain); | ||
| 440 | } | ||
| 441 | |||
| 442 | static int bio_write_page(pgoff_t page_off, void *addr) | ||
| 443 | { | ||
| 444 | return submit(WRITE, page_off, virt_to_page(addr), NULL); | ||
| 445 | } | ||
| 446 | |||
| 447 | /** | 438 | /** |
| 448 | * The following functions allow us to read data using a swap map | 439 | * The following functions allow us to read data using a swap map |
| 449 | * in a file-alike way | 440 | * in a file-alike way |
| @@ -456,17 +447,18 @@ static void release_swap_reader(struct swap_map_handle *handle) | |||
| 456 | handle->cur = NULL; | 447 | handle->cur = NULL; |
| 457 | } | 448 | } |
| 458 | 449 | ||
| 459 | static int get_swap_reader(struct swap_map_handle *handle, | 450 | static int get_swap_reader(struct swap_map_handle *handle, sector_t start) |
| 460 | swp_entry_t start) | ||
| 461 | { | 451 | { |
| 462 | int error; | 452 | int error; |
| 463 | 453 | ||
| 464 | if (!swp_offset(start)) | 454 | if (!start) |
| 465 | return -EINVAL; | 455 | return -EINVAL; |
| 466 | handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); | 456 | |
| 457 | handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); | ||
| 467 | if (!handle->cur) | 458 | if (!handle->cur) |
| 468 | return -ENOMEM; | 459 | return -ENOMEM; |
| 469 | error = bio_read_page(swp_offset(start), handle->cur, NULL); | 460 | |
| 461 | error = bio_read_page(start, handle->cur, NULL); | ||
| 470 | if (error) { | 462 | if (error) { |
| 471 | release_swap_reader(handle); | 463 | release_swap_reader(handle); |
| 472 | return error; | 464 | return error; |
| @@ -478,7 +470,7 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
| 478 | static int swap_read_page(struct swap_map_handle *handle, void *buf, | 470 | static int swap_read_page(struct swap_map_handle *handle, void *buf, |
| 479 | struct bio **bio_chain) | 471 | struct bio **bio_chain) |
| 480 | { | 472 | { |
| 481 | unsigned long offset; | 473 | sector_t offset; |
| 482 | int error; | 474 | int error; |
| 483 | 475 | ||
| 484 | if (!handle->cur) | 476 | if (!handle->cur) |
| @@ -547,11 +539,11 @@ static int load_image(struct swap_map_handle *handle, | |||
| 547 | error = err2; | 539 | error = err2; |
| 548 | if (!error) { | 540 | if (!error) { |
| 549 | printk("\b\b\b\bdone\n"); | 541 | printk("\b\b\b\bdone\n"); |
| 550 | snapshot_free_unused_memory(snapshot); | 542 | snapshot_write_finalize(snapshot); |
| 551 | if (!snapshot_image_loaded(snapshot)) | 543 | if (!snapshot_image_loaded(snapshot)) |
| 552 | error = -ENODATA; | 544 | error = -ENODATA; |
| 553 | } | 545 | } |
| 554 | show_speed(&start, &stop, nr_to_read, "Read"); | 546 | swsusp_show_speed(&start, &stop, nr_to_read, "Read"); |
| 555 | return error; | 547 | return error; |
| 556 | } | 548 | } |
| 557 | 549 | ||
| @@ -600,12 +592,16 @@ int swsusp_check(void) | |||
| 600 | if (!IS_ERR(resume_bdev)) { | 592 | if (!IS_ERR(resume_bdev)) { |
| 601 | set_blocksize(resume_bdev, PAGE_SIZE); | 593 | set_blocksize(resume_bdev, PAGE_SIZE); |
| 602 | memset(&swsusp_header, 0, sizeof(swsusp_header)); | 594 | memset(&swsusp_header, 0, sizeof(swsusp_header)); |
| 603 | if ((error = bio_read_page(0, &swsusp_header, NULL))) | 595 | error = bio_read_page(swsusp_resume_block, |
| 596 | &swsusp_header, NULL); | ||
| 597 | if (error) | ||
| 604 | return error; | 598 | return error; |
| 599 | |||
| 605 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | 600 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { |
| 606 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | 601 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); |
| 607 | /* Reset swap signature now */ | 602 | /* Reset swap signature now */ |
| 608 | error = bio_write_page(0, &swsusp_header); | 603 | error = bio_write_page(swsusp_resume_block, |
| 604 | &swsusp_header, NULL); | ||
| 609 | } else { | 605 | } else { |
| 610 | return -EINVAL; | 606 | return -EINVAL; |
| 611 | } | 607 | } |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 0b66659dc516..31aa0390c777 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
| @@ -49,6 +49,7 @@ | |||
| 49 | #include <linux/bootmem.h> | 49 | #include <linux/bootmem.h> |
| 50 | #include <linux/syscalls.h> | 50 | #include <linux/syscalls.h> |
| 51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
| 52 | #include <linux/time.h> | ||
| 52 | 53 | ||
| 53 | #include "power.h" | 54 | #include "power.h" |
| 54 | 55 | ||
| @@ -64,10 +65,8 @@ int in_suspend __nosavedata = 0; | |||
| 64 | 65 | ||
| 65 | #ifdef CONFIG_HIGHMEM | 66 | #ifdef CONFIG_HIGHMEM |
| 66 | unsigned int count_highmem_pages(void); | 67 | unsigned int count_highmem_pages(void); |
| 67 | int save_highmem(void); | ||
| 68 | int restore_highmem(void); | 68 | int restore_highmem(void); |
| 69 | #else | 69 | #else |
| 70 | static inline int save_highmem(void) { return 0; } | ||
| 71 | static inline int restore_highmem(void) { return 0; } | 70 | static inline int restore_highmem(void) { return 0; } |
| 72 | static inline unsigned int count_highmem_pages(void) { return 0; } | 71 | static inline unsigned int count_highmem_pages(void) { return 0; } |
| 73 | #endif | 72 | #endif |
| @@ -134,18 +133,18 @@ static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit) | |||
| 134 | return 0; | 133 | return 0; |
| 135 | } | 134 | } |
| 136 | 135 | ||
| 137 | unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap) | 136 | sector_t alloc_swapdev_block(int swap, struct bitmap_page *bitmap) |
| 138 | { | 137 | { |
| 139 | unsigned long offset; | 138 | unsigned long offset; |
| 140 | 139 | ||
| 141 | offset = swp_offset(get_swap_page_of_type(swap)); | 140 | offset = swp_offset(get_swap_page_of_type(swap)); |
| 142 | if (offset) { | 141 | if (offset) { |
| 143 | if (bitmap_set(bitmap, offset)) { | 142 | if (bitmap_set(bitmap, offset)) |
| 144 | swap_free(swp_entry(swap, offset)); | 143 | swap_free(swp_entry(swap, offset)); |
| 145 | offset = 0; | 144 | else |
| 146 | } | 145 | return swapdev_block(swap, offset); |
| 147 | } | 146 | } |
| 148 | return offset; | 147 | return 0; |
| 149 | } | 148 | } |
| 150 | 149 | ||
| 151 | void free_all_swap_pages(int swap, struct bitmap_page *bitmap) | 150 | void free_all_swap_pages(int swap, struct bitmap_page *bitmap) |
| @@ -166,6 +165,34 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) | |||
| 166 | } | 165 | } |
| 167 | 166 | ||
| 168 | /** | 167 | /** |
| 168 | * swsusp_show_speed - print the time elapsed between two events represented by | ||
| 169 | * @start and @stop | ||
| 170 | * | ||
| 171 | * @nr_pages - number of pages processed between @start and @stop | ||
| 172 | * @msg - introductory message to print | ||
| 173 | */ | ||
| 174 | |||
| 175 | void swsusp_show_speed(struct timeval *start, struct timeval *stop, | ||
| 176 | unsigned nr_pages, char *msg) | ||
| 177 | { | ||
| 178 | s64 elapsed_centisecs64; | ||
| 179 | int centisecs; | ||
| 180 | int k; | ||
| 181 | int kps; | ||
| 182 | |||
| 183 | elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); | ||
| 184 | do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); | ||
| 185 | centisecs = elapsed_centisecs64; | ||
| 186 | if (centisecs == 0) | ||
| 187 | centisecs = 1; /* avoid div-by-zero */ | ||
| 188 | k = nr_pages * (PAGE_SIZE / 1024); | ||
| 189 | kps = (k * 100) / centisecs; | ||
| 190 | printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, | ||
| 191 | centisecs / 100, centisecs % 100, | ||
| 192 | kps / 1000, (kps % 1000) / 10); | ||
| 193 | } | ||
| 194 | |||
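
swsusp_show_speed(), moved here from swap.c, works in centiseconds and kilobytes so everything stays in integer arithmetic. A worked example under the usual 4 KB page assumption:

```c
/* Worked example of the swsusp_show_speed() arithmetic: 25600 pages of 4 KB
 * written in 2.00 s print as "Wrote 102400 kbytes in 2.00 seconds (51.20 MB/s)".
 * The page size and timing are hypothetical.
 */
#include <stdio.h>

int main(void)
{
    int page_kb = 4096 / 1024;          /* PAGE_SIZE / 1024           */
    unsigned nr_pages = 25600;
    int centisecs = 200;                /* elapsed time: 2.00 seconds */

    int k = nr_pages * page_kb;         /* 102400 kbytes              */
    int kps = (k * 100) / centisecs;    /* 51200 kbytes per second    */

    printf("Wrote %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
           k, centisecs / 100, centisecs % 100,
           kps / 1000, (kps % 1000) / 10);
    return 0;
}
```
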
| 195 | /** | ||
| 169 | * swsusp_shrink_memory - Try to free as much memory as needed | 196 | * swsusp_shrink_memory - Try to free as much memory as needed |
| 170 | * | 197 | * |
| 171 | * ... but do not OOM-kill anyone | 198 | * ... but do not OOM-kill anyone |
| @@ -184,23 +211,37 @@ static inline unsigned long __shrink_memory(long tmp) | |||
| 184 | 211 | ||
| 185 | int swsusp_shrink_memory(void) | 212 | int swsusp_shrink_memory(void) |
| 186 | { | 213 | { |
| 187 | long size, tmp; | 214 | long tmp; |
| 188 | struct zone *zone; | 215 | struct zone *zone; |
| 189 | unsigned long pages = 0; | 216 | unsigned long pages = 0; |
| 190 | unsigned int i = 0; | 217 | unsigned int i = 0; |
| 191 | char *p = "-\\|/"; | 218 | char *p = "-\\|/"; |
| 219 | struct timeval start, stop; | ||
| 192 | 220 | ||
| 193 | printk("Shrinking memory... "); | 221 | printk("Shrinking memory... "); |
| 222 | do_gettimeofday(&start); | ||
| 194 | do { | 223 | do { |
| 195 | size = 2 * count_highmem_pages(); | 224 | long size, highmem_size; |
| 196 | size += size / 50 + count_data_pages() + PAGES_FOR_IO; | 225 | |
| 226 | highmem_size = count_highmem_pages(); | ||
| 227 | size = count_data_pages() + PAGES_FOR_IO; | ||
| 197 | tmp = size; | 228 | tmp = size; |
| 229 | size += highmem_size; | ||
| 198 | for_each_zone (zone) | 230 | for_each_zone (zone) |
| 199 | if (!is_highmem(zone) && populated_zone(zone)) { | 231 | if (populated_zone(zone)) { |
| 200 | tmp -= zone->free_pages; | 232 | if (is_highmem(zone)) { |
| 201 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | 233 | highmem_size -= zone->free_pages; |
| 202 | tmp += snapshot_additional_pages(zone); | 234 | } else { |
| 235 | tmp -= zone->free_pages; | ||
| 236 | tmp += zone->lowmem_reserve[ZONE_NORMAL]; | ||
| 237 | tmp += snapshot_additional_pages(zone); | ||
| 238 | } | ||
| 203 | } | 239 | } |
| 240 | |||
| 241 | if (highmem_size < 0) | ||
| 242 | highmem_size = 0; | ||
| 243 | |||
| 244 | tmp += highmem_size; | ||
| 204 | if (tmp > 0) { | 245 | if (tmp > 0) { |
| 205 | tmp = __shrink_memory(tmp); | 246 | tmp = __shrink_memory(tmp); |
| 206 | if (!tmp) | 247 | if (!tmp) |
| @@ -212,7 +253,9 @@ int swsusp_shrink_memory(void) | |||
| 212 | } | 253 | } |
| 213 | printk("\b%c", p[i++%4]); | 254 | printk("\b%c", p[i++%4]); |
| 214 | } while (tmp > 0); | 255 | } while (tmp > 0); |
| 256 | do_gettimeofday(&stop); | ||
| 215 | printk("\bdone (%lu pages freed)\n", pages); | 257 | printk("\bdone (%lu pages freed)\n", pages); |
| 258 | swsusp_show_speed(&start, &stop, pages, "Freed"); | ||
| 216 | 259 | ||
| 217 | return 0; | 260 | return 0; |
| 218 | } | 261 | } |
| @@ -223,6 +266,7 @@ int swsusp_suspend(void) | |||
| 223 | 266 | ||
| 224 | if ((error = arch_prepare_suspend())) | 267 | if ((error = arch_prepare_suspend())) |
| 225 | return error; | 268 | return error; |
| 269 | |||
| 226 | local_irq_disable(); | 270 | local_irq_disable(); |
| 227 | /* At this point, device_suspend() has been called, but *not* | 271 | /* At this point, device_suspend() has been called, but *not* |
| 228 | * device_power_down(). We *must* device_power_down() now. | 272 | * device_power_down(). We *must* device_power_down() now. |
| @@ -235,23 +279,16 @@ int swsusp_suspend(void) | |||
| 235 | goto Enable_irqs; | 279 | goto Enable_irqs; |
| 236 | } | 280 | } |
| 237 | 281 | ||
| 238 | if ((error = save_highmem())) { | ||
| 239 | printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); | ||
| 240 | goto Restore_highmem; | ||
| 241 | } | ||
| 242 | |||
| 243 | save_processor_state(); | 282 | save_processor_state(); |
| 244 | if ((error = swsusp_arch_suspend())) | 283 | if ((error = swsusp_arch_suspend())) |
| 245 | printk(KERN_ERR "Error %d suspending\n", error); | 284 | printk(KERN_ERR "Error %d suspending\n", error); |
| 246 | /* Restore control flow magically appears here */ | 285 | /* Restore control flow magically appears here */ |
| 247 | restore_processor_state(); | 286 | restore_processor_state(); |
| 248 | Restore_highmem: | ||
| 249 | restore_highmem(); | ||
| 250 | /* NOTE: device_power_up() is just a resume() for devices | 287 | /* NOTE: device_power_up() is just a resume() for devices |
| 251 | * that suspended with irqs off ... no overall powerup. | 288 | * that suspended with irqs off ... no overall powerup. |
| 252 | */ | 289 | */ |
| 253 | device_power_up(); | 290 | device_power_up(); |
| 254 | Enable_irqs: | 291 | Enable_irqs: |
| 255 | local_irq_enable(); | 292 | local_irq_enable(); |
| 256 | return error; | 293 | return error; |
| 257 | } | 294 | } |
| @@ -268,18 +305,23 @@ int swsusp_resume(void) | |||
| 268 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); | 305 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); |
| 269 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | 306 | /* We'll ignore saved state, but this gets preempt count (etc) right */ |
| 270 | save_processor_state(); | 307 | save_processor_state(); |
| 271 | error = swsusp_arch_resume(); | 308 | error = restore_highmem(); |
| 272 | /* Code below is only ever reached in case of failure. Otherwise | 309 | if (!error) { |
| 273 | * execution continues at place where swsusp_arch_suspend was called | 310 | error = swsusp_arch_resume(); |
| 274 | */ | 311 | /* The code below is only ever reached in case of a failure. |
| 275 | BUG_ON(!error); | 312 | * Otherwise execution continues at place where |
| 313 | * swsusp_arch_suspend() was called | ||
| 314 | */ | ||
| 315 | BUG_ON(!error); | ||
| 316 | /* This call to restore_highmem() undoes the previous one */ | ||
| 317 | restore_highmem(); | ||
| 318 | } | ||
| 276 | /* The only reason why swsusp_arch_resume() can fail is memory being | 319 | /* The only reason why swsusp_arch_resume() can fail is memory being |
| 277 | * very tight, so we have to free it as soon as we can to avoid | 320 | * very tight, so we have to free it as soon as we can to avoid |
| 278 | * subsequent failures | 321 | * subsequent failures |
| 279 | */ | 322 | */ |
| 280 | swsusp_free(); | 323 | swsusp_free(); |
| 281 | restore_processor_state(); | 324 | restore_processor_state(); |
| 282 | restore_highmem(); | ||
| 283 | touch_softlockup_watchdog(); | 325 | touch_softlockup_watchdog(); |
| 284 | device_power_up(); | 326 | device_power_up(); |
| 285 | local_irq_enable(); | 327 | local_irq_enable(); |
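The swsusp.c hunk above adds swsusp_show_speed(), which converts the elapsed nanoseconds into centiseconds and derives a kB/s figure from the page count. A minimal userspace sketch of the same arithmetic, assuming 4096-byte pages and using a timespec pair in place of do_gettimeofday():

/* Sketch of the swsusp_show_speed() arithmetic in userspace C.
 * Assumptions: 4096-byte pages, timespec deltas instead of do_gettimeofday(). */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define PAGE_SIZE 4096

static void show_speed(const struct timespec *start, const struct timespec *stop,
		       unsigned nr_pages, const char *msg)
{
	int64_t elapsed_ns = (int64_t)(stop->tv_sec - start->tv_sec) * 1000000000LL
			     + (stop->tv_nsec - start->tv_nsec);
	int centisecs = (int)(elapsed_ns / (1000000000LL / 100));
	int k, kps;

	if (centisecs == 0)
		centisecs = 1;		/* avoid div-by-zero, as in the kernel code */
	k = nr_pages * (PAGE_SIZE / 1024);
	kps = (k * 100) / centisecs;
	printf("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
	       centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10);
}

int main(void)
{
	struct timespec start = { 0, 0 }, stop = { 2, 500000000 };	/* 2.5 s */

	/* 102400 kB in 2.5 s -> 40960 kB/s, printed as 40.96 MB/s */
	show_speed(&start, &stop, 25600, "Freed");
	return 0;
}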
diff --git a/kernel/power/user.c b/kernel/power/user.c index d991d3b0e5a4..f7b7a785a5c6 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | 11 | ||
| 12 | #include <linux/suspend.h> | 12 | #include <linux/suspend.h> |
| 13 | #include <linux/syscalls.h> | 13 | #include <linux/syscalls.h> |
| 14 | #include <linux/reboot.h> | ||
| 14 | #include <linux/string.h> | 15 | #include <linux/string.h> |
| 15 | #include <linux/device.h> | 16 | #include <linux/device.h> |
| 16 | #include <linux/miscdevice.h> | 17 | #include <linux/miscdevice.h> |
| @@ -21,6 +22,7 @@ | |||
| 21 | #include <linux/fs.h> | 22 | #include <linux/fs.h> |
| 22 | #include <linux/console.h> | 23 | #include <linux/console.h> |
| 23 | #include <linux/cpu.h> | 24 | #include <linux/cpu.h> |
| 25 | #include <linux/freezer.h> | ||
| 24 | 26 | ||
| 25 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
| 26 | 28 | ||
| @@ -54,7 +56,8 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
| 54 | filp->private_data = data; | 56 | filp->private_data = data; |
| 55 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); | 57 | memset(&data->handle, 0, sizeof(struct snapshot_handle)); |
| 56 | if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { | 58 | if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { |
| 57 | data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1; | 59 | data->swap = swsusp_resume_device ? |
| 60 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; | ||
| 58 | data->mode = O_RDONLY; | 61 | data->mode = O_RDONLY; |
| 59 | } else { | 62 | } else { |
| 60 | data->swap = -1; | 63 | data->swap = -1; |
| @@ -76,10 +79,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
| 76 | free_all_swap_pages(data->swap, data->bitmap); | 79 | free_all_swap_pages(data->swap, data->bitmap); |
| 77 | free_bitmap(data->bitmap); | 80 | free_bitmap(data->bitmap); |
| 78 | if (data->frozen) { | 81 | if (data->frozen) { |
| 79 | down(&pm_sem); | 82 | mutex_lock(&pm_mutex); |
| 80 | thaw_processes(); | 83 | thaw_processes(); |
| 81 | enable_nonboot_cpus(); | 84 | enable_nonboot_cpus(); |
| 82 | up(&pm_sem); | 85 | mutex_unlock(&pm_mutex); |
| 83 | } | 86 | } |
| 84 | atomic_inc(&device_available); | 87 | atomic_inc(&device_available); |
| 85 | return 0; | 88 | return 0; |
| @@ -124,7 +127,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 124 | { | 127 | { |
| 125 | int error = 0; | 128 | int error = 0; |
| 126 | struct snapshot_data *data; | 129 | struct snapshot_data *data; |
| 127 | loff_t offset, avail; | 130 | loff_t avail; |
| 131 | sector_t offset; | ||
| 128 | 132 | ||
| 129 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) | 133 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) |
| 130 | return -ENOTTY; | 134 | return -ENOTTY; |
| @@ -140,7 +144,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 140 | case SNAPSHOT_FREEZE: | 144 | case SNAPSHOT_FREEZE: |
| 141 | if (data->frozen) | 145 | if (data->frozen) |
| 142 | break; | 146 | break; |
| 143 | down(&pm_sem); | 147 | mutex_lock(&pm_mutex); |
| 144 | error = disable_nonboot_cpus(); | 148 | error = disable_nonboot_cpus(); |
| 145 | if (!error) { | 149 | if (!error) { |
| 146 | error = freeze_processes(); | 150 | error = freeze_processes(); |
| @@ -150,7 +154,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 150 | error = -EBUSY; | 154 | error = -EBUSY; |
| 151 | } | 155 | } |
| 152 | } | 156 | } |
| 153 | up(&pm_sem); | 157 | mutex_unlock(&pm_mutex); |
| 154 | if (!error) | 158 | if (!error) |
| 155 | data->frozen = 1; | 159 | data->frozen = 1; |
| 156 | break; | 160 | break; |
| @@ -158,10 +162,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 158 | case SNAPSHOT_UNFREEZE: | 162 | case SNAPSHOT_UNFREEZE: |
| 159 | if (!data->frozen) | 163 | if (!data->frozen) |
| 160 | break; | 164 | break; |
| 161 | down(&pm_sem); | 165 | mutex_lock(&pm_mutex); |
| 162 | thaw_processes(); | 166 | thaw_processes(); |
| 163 | enable_nonboot_cpus(); | 167 | enable_nonboot_cpus(); |
| 164 | up(&pm_sem); | 168 | mutex_unlock(&pm_mutex); |
| 165 | data->frozen = 0; | 169 | data->frozen = 0; |
| 166 | break; | 170 | break; |
| 167 | 171 | ||
| @@ -170,7 +174,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 170 | error = -EPERM; | 174 | error = -EPERM; |
| 171 | break; | 175 | break; |
| 172 | } | 176 | } |
| 173 | down(&pm_sem); | 177 | mutex_lock(&pm_mutex); |
| 174 | /* Free memory before shutting down devices. */ | 178 | /* Free memory before shutting down devices. */ |
| 175 | error = swsusp_shrink_memory(); | 179 | error = swsusp_shrink_memory(); |
| 176 | if (!error) { | 180 | if (!error) { |
| @@ -183,7 +187,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 183 | } | 187 | } |
| 184 | resume_console(); | 188 | resume_console(); |
| 185 | } | 189 | } |
| 186 | up(&pm_sem); | 190 | mutex_unlock(&pm_mutex); |
| 187 | if (!error) | 191 | if (!error) |
| 188 | error = put_user(in_suspend, (unsigned int __user *)arg); | 192 | error = put_user(in_suspend, (unsigned int __user *)arg); |
| 189 | if (!error) | 193 | if (!error) |
| @@ -191,13 +195,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 191 | break; | 195 | break; |
| 192 | 196 | ||
| 193 | case SNAPSHOT_ATOMIC_RESTORE: | 197 | case SNAPSHOT_ATOMIC_RESTORE: |
| 198 | snapshot_write_finalize(&data->handle); | ||
| 194 | if (data->mode != O_WRONLY || !data->frozen || | 199 | if (data->mode != O_WRONLY || !data->frozen || |
| 195 | !snapshot_image_loaded(&data->handle)) { | 200 | !snapshot_image_loaded(&data->handle)) { |
| 196 | error = -EPERM; | 201 | error = -EPERM; |
| 197 | break; | 202 | break; |
| 198 | } | 203 | } |
| 199 | snapshot_free_unused_memory(&data->handle); | 204 | mutex_lock(&pm_mutex); |
| 200 | down(&pm_sem); | ||
| 201 | pm_prepare_console(); | 205 | pm_prepare_console(); |
| 202 | suspend_console(); | 206 | suspend_console(); |
| 203 | error = device_suspend(PMSG_PRETHAW); | 207 | error = device_suspend(PMSG_PRETHAW); |
| @@ -207,7 +211,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 207 | } | 211 | } |
| 208 | resume_console(); | 212 | resume_console(); |
| 209 | pm_restore_console(); | 213 | pm_restore_console(); |
| 210 | up(&pm_sem); | 214 | mutex_unlock(&pm_mutex); |
| 211 | break; | 215 | break; |
| 212 | 216 | ||
| 213 | case SNAPSHOT_FREE: | 217 | case SNAPSHOT_FREE: |
| @@ -238,10 +242,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 238 | break; | 242 | break; |
| 239 | } | 243 | } |
| 240 | } | 244 | } |
| 241 | offset = alloc_swap_page(data->swap, data->bitmap); | 245 | offset = alloc_swapdev_block(data->swap, data->bitmap); |
| 242 | if (offset) { | 246 | if (offset) { |
| 243 | offset <<= PAGE_SHIFT; | 247 | offset <<= PAGE_SHIFT; |
| 244 | error = put_user(offset, (loff_t __user *)arg); | 248 | error = put_user(offset, (sector_t __user *)arg); |
| 245 | } else { | 249 | } else { |
| 246 | error = -ENOSPC; | 250 | error = -ENOSPC; |
| 247 | } | 251 | } |
| @@ -264,7 +268,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 264 | * so we need to recode them | 268 | * so we need to recode them |
| 265 | */ | 269 | */ |
| 266 | if (old_decode_dev(arg)) { | 270 | if (old_decode_dev(arg)) { |
| 267 | data->swap = swap_type_of(old_decode_dev(arg)); | 271 | data->swap = swap_type_of(old_decode_dev(arg), |
| 272 | 0, NULL); | ||
| 268 | if (data->swap < 0) | 273 | if (data->swap < 0) |
| 269 | error = -ENODEV; | 274 | error = -ENODEV; |
| 270 | } else { | 275 | } else { |
| @@ -282,7 +287,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 282 | break; | 287 | break; |
| 283 | } | 288 | } |
| 284 | 289 | ||
| 285 | if (down_trylock(&pm_sem)) { | 290 | if (!mutex_trylock(&pm_mutex)) { |
| 286 | error = -EBUSY; | 291 | error = -EBUSY; |
| 287 | break; | 292 | break; |
| 288 | } | 293 | } |
| @@ -309,8 +314,66 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
| 309 | if (pm_ops->finish) | 314 | if (pm_ops->finish) |
| 310 | pm_ops->finish(PM_SUSPEND_MEM); | 315 | pm_ops->finish(PM_SUSPEND_MEM); |
| 311 | 316 | ||
| 312 | OutS3: | 317 | OutS3: |
| 313 | up(&pm_sem); | 318 | mutex_unlock(&pm_mutex); |
| 319 | break; | ||
| 320 | |||
| 321 | case SNAPSHOT_PMOPS: | ||
| 322 | switch (arg) { | ||
| 323 | |||
| 324 | case PMOPS_PREPARE: | ||
| 325 | if (pm_ops->prepare) { | ||
| 326 | error = pm_ops->prepare(PM_SUSPEND_DISK); | ||
| 327 | } | ||
| 328 | break; | ||
| 329 | |||
| 330 | case PMOPS_ENTER: | ||
| 331 | kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); | ||
| 332 | error = pm_ops->enter(PM_SUSPEND_DISK); | ||
| 333 | break; | ||
| 334 | |||
| 335 | case PMOPS_FINISH: | ||
| 336 | if (pm_ops && pm_ops->finish) { | ||
| 337 | pm_ops->finish(PM_SUSPEND_DISK); | ||
| 338 | } | ||
| 339 | break; | ||
| 340 | |||
| 341 | default: | ||
| 342 | printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); | ||
| 343 | error = -EINVAL; | ||
| 344 | |||
| 345 | } | ||
| 346 | break; | ||
| 347 | |||
| 348 | case SNAPSHOT_SET_SWAP_AREA: | ||
| 349 | if (data->bitmap) { | ||
| 350 | error = -EPERM; | ||
| 351 | } else { | ||
| 352 | struct resume_swap_area swap_area; | ||
| 353 | dev_t swdev; | ||
| 354 | |||
| 355 | error = copy_from_user(&swap_area, (void __user *)arg, | ||
| 356 | sizeof(struct resume_swap_area)); | ||
| 357 | if (error) { | ||
| 358 | error = -EFAULT; | ||
| 359 | break; | ||
| 360 | } | ||
| 361 | |||
| 362 | /* | ||
| 363 | * User space encodes device types as two-byte values, | ||
| 364 | * so we need to recode them | ||
| 365 | */ | ||
| 366 | swdev = old_decode_dev(swap_area.dev); | ||
| 367 | if (swdev) { | ||
| 368 | offset = swap_area.offset; | ||
| 369 | data->swap = swap_type_of(swdev, offset, NULL); | ||
| 370 | if (data->swap < 0) | ||
| 371 | error = -ENODEV; | ||
| 372 | } else { | ||
| 373 | data->swap = -1; | ||
| 374 | error = -EINVAL; | ||
| 375 | } | ||
| 376 | } | ||
| 314 | break; | 377 | break; |
| 315 | 378 | ||
| 316 | default: | 379 | default: |
| @@ -321,7 +384,7 @@ OutS3: | |||
| 321 | return error; | 384 | return error; |
| 322 | } | 385 | } |
| 323 | 386 | ||
| 324 | static struct file_operations snapshot_fops = { | 387 | static const struct file_operations snapshot_fops = { |
| 325 | .open = snapshot_open, | 388 | .open = snapshot_open, |
| 326 | .release = snapshot_release, | 389 | .release = snapshot_release, |
| 327 | .read = snapshot_read, | 390 | .read = snapshot_read, |
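The user.c hunks convert pm_sem to pm_mutex, and the trylock test flips polarity: down_trylock() returns nonzero when the lock cannot be taken, while mutex_trylock() returns 1 on success, hence `if (down_trylock(...))` becoming `if (!mutex_trylock(...))`. A small illustrative sketch with stub lock functions that only mimic those documented return conventions (the stubs are not the kernel implementations):

/* Illustrative stubs mimicking the documented return conventions:
 * down_trylock()  -> 0 on success, nonzero if the lock is contended
 * mutex_trylock() -> 1 on success, 0 if the lock is contended */
#include <stdio.h>

struct fake_lock { int held; };

static int down_trylock(struct fake_lock *l)
{
	if (l->held)
		return 1;	/* failed to acquire */
	l->held = 1;
	return 0;		/* acquired */
}

static int mutex_trylock(struct fake_lock *l)
{
	if (l->held)
		return 0;	/* failed to acquire */
	l->held = 1;
	return 1;		/* acquired */
}

int main(void)
{
	struct fake_lock sem = { 0 }, mtx = { 0 };

	if (down_trylock(&sem))		/* old style: nonzero means busy */
		printf("semaphore busy\n");
	else
		printf("semaphore taken\n");

	if (!mutex_trylock(&mtx))	/* new style: zero means busy */
		printf("mutex busy\n");
	else
		printf("mutex taken\n");
	return 0;
}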
diff --git a/kernel/printk.c b/kernel/printk.c index 66426552fbfe..c770e1a4e882 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
| @@ -53,8 +53,6 @@ int console_printk[4] = { | |||
| 53 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ | 53 | DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ |
| 54 | }; | 54 | }; |
| 55 | 55 | ||
| 56 | EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */ | ||
| 57 | |||
| 58 | /* | 56 | /* |
| 59 | * Low level drivers may need that to know if they can schedule in | 57 | * Low level drivers may need that to know if they can schedule in |
| 60 | * their unblank() callback or not. So let's export it. | 58 | * their unblank() callback or not. So let's export it. |
| @@ -335,13 +333,25 @@ static void __call_console_drivers(unsigned long start, unsigned long end) | |||
| 335 | } | 333 | } |
| 336 | } | 334 | } |
| 337 | 335 | ||
| 336 | static int __read_mostly ignore_loglevel; | ||
| 337 | |||
| 338 | static int __init ignore_loglevel_setup(char *str) | ||
| 339 | { | ||
| 340 | ignore_loglevel = 1; | ||
| 341 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | ||
| 342 | |||
| 343 | return 1; | ||
| 344 | } | ||
| 345 | |||
| 346 | __setup("ignore_loglevel", ignore_loglevel_setup); | ||
| 347 | |||
| 338 | /* | 348 | /* |
| 339 | * Write out chars from start to end - 1 inclusive | 349 | * Write out chars from start to end - 1 inclusive |
| 340 | */ | 350 | */ |
| 341 | static void _call_console_drivers(unsigned long start, | 351 | static void _call_console_drivers(unsigned long start, |
| 342 | unsigned long end, int msg_log_level) | 352 | unsigned long end, int msg_log_level) |
| 343 | { | 353 | { |
| 344 | if (msg_log_level < console_loglevel && | 354 | if ((msg_log_level < console_loglevel || ignore_loglevel) && |
| 345 | console_drivers && start != end) { | 355 | console_drivers && start != end) { |
| 346 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { | 356 | if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { |
| 347 | /* wrapped write */ | 357 | /* wrapped write */ |
| @@ -631,12 +641,7 @@ EXPORT_SYMBOL(vprintk); | |||
| 631 | 641 | ||
| 632 | asmlinkage long sys_syslog(int type, char __user *buf, int len) | 642 | asmlinkage long sys_syslog(int type, char __user *buf, int len) |
| 633 | { | 643 | { |
| 634 | return 0; | 644 | return -ENOSYS; |
| 635 | } | ||
| 636 | |||
| 637 | int do_syslog(int type, char __user *buf, int len) | ||
| 638 | { | ||
| 639 | return 0; | ||
| 640 | } | 645 | } |
| 641 | 646 | ||
| 642 | static void call_console_drivers(unsigned long start, unsigned long end) | 647 | static void call_console_drivers(unsigned long start, unsigned long end) |
| @@ -777,7 +782,6 @@ int is_console_locked(void) | |||
| 777 | { | 782 | { |
| 778 | return console_locked; | 783 | return console_locked; |
| 779 | } | 784 | } |
| 780 | EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */ | ||
| 781 | 785 | ||
| 782 | /** | 786 | /** |
| 783 | * release_console_sem - unlock the console system | 787 | * release_console_sem - unlock the console system |
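The printk.c hunk adds an ignore_loglevel boot parameter and folds it into the console filter, so a message reaches the console drivers when msg_log_level < console_loglevel or the flag is set. A minimal sketch of that filter decision in plain C (the kernel wires the flag up through __setup(); here it is just a variable):

/* Minimal sketch of the console loglevel filter with an ignore flag.
 * The level values are illustrative; the kernel sets ignore_loglevel
 * from the "ignore_loglevel" boot parameter via __setup(). */
#include <stdio.h>

static int console_loglevel = 7;	/* messages below this level are shown */
static int ignore_loglevel;		/* when set, show everything */

static void emit(int msg_log_level, const char *text)
{
	if (msg_log_level < console_loglevel || ignore_loglevel)
		printf("<%d> %s\n", msg_log_level, text);
}

int main(void)
{
	emit(3, "error: shown either way");
	emit(7, "debug: suppressed by default");
	ignore_loglevel = 1;
	emit(7, "debug: shown once ignore_loglevel is set");
	return 0;
}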
diff --git a/kernel/profile.c b/kernel/profile.c index f940b462eec9..a6574a18514e 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -40,7 +40,10 @@ int (*timer_hook)(struct pt_regs *) __read_mostly; | |||
| 40 | 40 | ||
| 41 | static atomic_t *prof_buffer; | 41 | static atomic_t *prof_buffer; |
| 42 | static unsigned long prof_len, prof_shift; | 42 | static unsigned long prof_len, prof_shift; |
| 43 | static int prof_on __read_mostly; | 43 | |
| 44 | int prof_on __read_mostly; | ||
| 45 | EXPORT_SYMBOL_GPL(prof_on); | ||
| 46 | |||
| 44 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; | 47 | static cpumask_t prof_cpu_mask = CPU_MASK_ALL; |
| 45 | #ifdef CONFIG_SMP | 48 | #ifdef CONFIG_SMP |
| 46 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); | 49 | static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); |
| @@ -51,9 +54,20 @@ static DEFINE_MUTEX(profile_flip_mutex); | |||
| 51 | static int __init profile_setup(char * str) | 54 | static int __init profile_setup(char * str) |
| 52 | { | 55 | { |
| 53 | static char __initdata schedstr[] = "schedule"; | 56 | static char __initdata schedstr[] = "schedule"; |
| 57 | static char __initdata sleepstr[] = "sleep"; | ||
| 58 | static char __initdata kvmstr[] = "kvm"; | ||
| 54 | int par; | 59 | int par; |
| 55 | 60 | ||
| 56 | if (!strncmp(str, schedstr, strlen(schedstr))) { | 61 | if (!strncmp(str, sleepstr, strlen(sleepstr))) { |
| 62 | prof_on = SLEEP_PROFILING; | ||
| 63 | if (str[strlen(sleepstr)] == ',') | ||
| 64 | str += strlen(sleepstr) + 1; | ||
| 65 | if (get_option(&str, &par)) | ||
| 66 | prof_shift = par; | ||
| 67 | printk(KERN_INFO | ||
| 68 | "kernel sleep profiling enabled (shift: %ld)\n", | ||
| 69 | prof_shift); | ||
| 70 | } else if (!strncmp(str, schedstr, strlen(schedstr))) { | ||
| 57 | prof_on = SCHED_PROFILING; | 71 | prof_on = SCHED_PROFILING; |
| 58 | if (str[strlen(schedstr)] == ',') | 72 | if (str[strlen(schedstr)] == ',') |
| 59 | str += strlen(schedstr) + 1; | 73 | str += strlen(schedstr) + 1; |
| @@ -62,6 +76,15 @@ static int __init profile_setup(char * str) | |||
| 62 | printk(KERN_INFO | 76 | printk(KERN_INFO |
| 63 | "kernel schedule profiling enabled (shift: %ld)\n", | 77 | "kernel schedule profiling enabled (shift: %ld)\n", |
| 64 | prof_shift); | 78 | prof_shift); |
| 79 | } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { | ||
| 80 | prof_on = KVM_PROFILING; | ||
| 81 | if (str[strlen(kvmstr)] == ',') | ||
| 82 | str += strlen(kvmstr) + 1; | ||
| 83 | if (get_option(&str, &par)) | ||
| 84 | prof_shift = par; | ||
| 85 | printk(KERN_INFO | ||
| 86 | "kernel KVM profiling enabled (shift: %ld)\n", | ||
| 87 | prof_shift); | ||
| 65 | } else if (get_option(&str, &par)) { | 88 | } else if (get_option(&str, &par)) { |
| 66 | prof_shift = par; | 89 | prof_shift = par; |
| 67 | prof_on = CPU_PROFILING; | 90 | prof_on = CPU_PROFILING; |
| @@ -204,7 +227,8 @@ EXPORT_SYMBOL_GPL(profile_event_unregister); | |||
| 204 | * positions to which hits are accounted during short intervals (e.g. | 227 | * positions to which hits are accounted during short intervals (e.g. |
| 205 | * several seconds) is usually very small. Exclusion from buffer | 228 | * several seconds) is usually very small. Exclusion from buffer |
| 206 | * flipping is provided by interrupt disablement (note that for | 229 | * flipping is provided by interrupt disablement (note that for |
| 207 | * SCHED_PROFILING profile_hit() may be called from process context). | 230 | * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from |
| 231 | * process context). | ||
| 208 | * The hash function is meant to be lightweight as opposed to strong, | 232 | * The hash function is meant to be lightweight as opposed to strong, |
| 209 | * and was vaguely inspired by ppc64 firmware-supported inverted | 233 | * and was vaguely inspired by ppc64 firmware-supported inverted |
| 210 | * pagetable hash functions, but uses a full hashtable full of finite | 234 | * pagetable hash functions, but uses a full hashtable full of finite |
| @@ -257,7 +281,7 @@ static void profile_discard_flip_buffers(void) | |||
| 257 | mutex_unlock(&profile_flip_mutex); | 281 | mutex_unlock(&profile_flip_mutex); |
| 258 | } | 282 | } |
| 259 | 283 | ||
| 260 | void profile_hit(int type, void *__pc) | 284 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
| 261 | { | 285 | { |
| 262 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; | 286 | unsigned long primary, secondary, flags, pc = (unsigned long)__pc; |
| 263 | int i, j, cpu; | 287 | int i, j, cpu; |
| @@ -274,21 +298,31 @@ void profile_hit(int type, void *__pc) | |||
| 274 | put_cpu(); | 298 | put_cpu(); |
| 275 | return; | 299 | return; |
| 276 | } | 300 | } |
| 301 | /* | ||
| 302 | * We buffer the global profiler buffer into a per-CPU | ||
| 303 | * queue and thus reduce the number of global (and possibly | ||
| 304 | * NUMA-alien) accesses. The write-queue is self-coalescing: | ||
| 305 | */ | ||
| 277 | local_irq_save(flags); | 306 | local_irq_save(flags); |
| 278 | do { | 307 | do { |
| 279 | for (j = 0; j < PROFILE_GRPSZ; ++j) { | 308 | for (j = 0; j < PROFILE_GRPSZ; ++j) { |
| 280 | if (hits[i + j].pc == pc) { | 309 | if (hits[i + j].pc == pc) { |
| 281 | hits[i + j].hits++; | 310 | hits[i + j].hits += nr_hits; |
| 282 | goto out; | 311 | goto out; |
| 283 | } else if (!hits[i + j].hits) { | 312 | } else if (!hits[i + j].hits) { |
| 284 | hits[i + j].pc = pc; | 313 | hits[i + j].pc = pc; |
| 285 | hits[i + j].hits = 1; | 314 | hits[i + j].hits = nr_hits; |
| 286 | goto out; | 315 | goto out; |
| 287 | } | 316 | } |
| 288 | } | 317 | } |
| 289 | i = (i + secondary) & (NR_PROFILE_HIT - 1); | 318 | i = (i + secondary) & (NR_PROFILE_HIT - 1); |
| 290 | } while (i != primary); | 319 | } while (i != primary); |
| 291 | atomic_inc(&prof_buffer[pc]); | 320 | |
| 321 | /* | ||
| 322 | * Add the current hit(s) and flush the write-queue out | ||
| 323 | * to the global buffer: | ||
| 324 | */ | ||
| 325 | atomic_add(nr_hits, &prof_buffer[pc]); | ||
| 292 | for (i = 0; i < NR_PROFILE_HIT; ++i) { | 326 | for (i = 0; i < NR_PROFILE_HIT; ++i) { |
| 293 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); | 327 | atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); |
| 294 | hits[i].pc = hits[i].hits = 0; | 328 | hits[i].pc = hits[i].hits = 0; |
| @@ -297,8 +331,8 @@ out: | |||
| 297 | local_irq_restore(flags); | 331 | local_irq_restore(flags); |
| 298 | put_cpu(); | 332 | put_cpu(); |
| 299 | } | 333 | } |
| 334 | EXPORT_SYMBOL_GPL(profile_hits); | ||
| 300 | 335 | ||
| 301 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 302 | static int __devinit profile_cpu_callback(struct notifier_block *info, | 336 | static int __devinit profile_cpu_callback(struct notifier_block *info, |
| 303 | unsigned long action, void *__cpu) | 337 | unsigned long action, void *__cpu) |
| 304 | { | 338 | { |
| @@ -351,19 +385,19 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
| 351 | } | 385 | } |
| 352 | return NOTIFY_OK; | 386 | return NOTIFY_OK; |
| 353 | } | 387 | } |
| 354 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 355 | #else /* !CONFIG_SMP */ | 388 | #else /* !CONFIG_SMP */ |
| 356 | #define profile_flip_buffers() do { } while (0) | 389 | #define profile_flip_buffers() do { } while (0) |
| 357 | #define profile_discard_flip_buffers() do { } while (0) | 390 | #define profile_discard_flip_buffers() do { } while (0) |
| 391 | #define profile_cpu_callback NULL | ||
| 358 | 392 | ||
| 359 | void profile_hit(int type, void *__pc) | 393 | void profile_hits(int type, void *__pc, unsigned int nr_hits) |
| 360 | { | 394 | { |
| 361 | unsigned long pc; | 395 | unsigned long pc; |
| 362 | 396 | ||
| 363 | if (prof_on != type || !prof_buffer) | 397 | if (prof_on != type || !prof_buffer) |
| 364 | return; | 398 | return; |
| 365 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; | 399 | pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; |
| 366 | atomic_inc(&prof_buffer[min(pc, prof_len - 1)]); | 400 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
| 367 | } | 401 | } |
| 368 | #endif /* !CONFIG_SMP */ | 402 | #endif /* !CONFIG_SMP */ |
| 369 | 403 | ||
| @@ -442,7 +476,8 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
| 442 | read = 0; | 476 | read = 0; |
| 443 | 477 | ||
| 444 | while (p < sizeof(unsigned int) && count > 0) { | 478 | while (p < sizeof(unsigned int) && count > 0) { |
| 445 | put_user(*((char *)(&sample_step)+p),buf); | 479 | if (put_user(*((char *)(&sample_step)+p),buf)) |
| 480 | return -EFAULT; | ||
| 446 | buf++; p++; count--; read++; | 481 | buf++; p++; count--; read++; |
| 447 | } | 482 | } |
| 448 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | 483 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); |
| @@ -480,7 +515,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
| 480 | return count; | 515 | return count; |
| 481 | } | 516 | } |
| 482 | 517 | ||
| 483 | static struct file_operations proc_profile_operations = { | 518 | static const struct file_operations proc_profile_operations = { |
| 484 | .read = read_profile, | 519 | .read = read_profile, |
| 485 | .write = write_profile, | 520 | .write = write_profile, |
| 486 | }; | 521 | }; |
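The profile.c changes turn profile_hit() into profile_hits(type, pc, nr_hits) and coalesce hits in a small per-CPU queue before touching the global buffer. A simplified, single-threaded sketch of the coalescing step (a linear slot scan instead of the kernel's hashed probing; per-CPU buffers, atomics and IRQ disabling are omitted):

/* Simplified sketch of the hit-coalescing queue used by profile_hits():
 * accumulate (pc, hits) pairs in a small table, then flush into a global
 * histogram once the table is full. */
#include <stdio.h>

#define NR_HITS   16
#define PROF_LEN  256

struct profile_hit { unsigned long pc; unsigned long hits; };

static struct profile_hit queue[NR_HITS];
static unsigned long prof_buffer[PROF_LEN];

static void queue_hits(unsigned long pc, unsigned int nr_hits)
{
	unsigned int i;

	for (i = 0; i < NR_HITS; i++) {
		if (queue[i].pc == pc) {		/* coalesce with existing entry */
			queue[i].hits += nr_hits;
			return;
		}
		if (!queue[i].hits) {			/* claim a free slot */
			queue[i].pc = pc;
			queue[i].hits = nr_hits;
			return;
		}
	}
	/* Queue full: account the current hits and flush, as the kernel does. */
	prof_buffer[pc % PROF_LEN] += nr_hits;
	for (i = 0; i < NR_HITS; i++) {
		prof_buffer[queue[i].pc % PROF_LEN] += queue[i].hits;
		queue[i].pc = queue[i].hits = 0;
	}
}

int main(void)
{
	queue_hits(0x1000, 1);
	queue_hits(0x1000, 3);		/* coalesces into the same slot */
	queue_hits(0x2000, 2);
	printf("queued: pc=0x%lx hits=%lu\n", queue[0].pc, queue[0].hits);
	return 0;
}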
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 26bb5ffe1ef1..3554b76da84c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -235,12 +235,14 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
| 235 | 235 | ||
| 236 | list = rdp->donelist; | 236 | list = rdp->donelist; |
| 237 | while (list) { | 237 | while (list) { |
| 238 | next = rdp->donelist = list->next; | 238 | next = list->next; |
| 239 | prefetch(next); | ||
| 239 | list->func(list); | 240 | list->func(list); |
| 240 | list = next; | 241 | list = next; |
| 241 | if (++count >= rdp->blimit) | 242 | if (++count >= rdp->blimit) |
| 242 | break; | 243 | break; |
| 243 | } | 244 | } |
| 245 | rdp->donelist = list; | ||
| 244 | 246 | ||
| 245 | local_irq_disable(); | 247 | local_irq_disable(); |
| 246 | rdp->qlen -= count; | 248 | rdp->qlen -= count; |
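The rcupdate.c hunk stops writing rdp->donelist back on every iteration: it walks the list through a local pointer, prefetches the next node, stops after blimit callbacks, and stores the remaining tail once at the end. A standalone sketch of that pattern over a plain singly linked list (the prefetch() hint is only noted in a comment):

/* Sketch of the batched callback walk from rcu_do_batch(): process at most
 * 'blimit' nodes, then store the unprocessed tail back once. */
#include <stdio.h>

struct cb { struct cb *next; void (*func)(struct cb *); };

static void do_batch(struct cb **donelist, int blimit)
{
	struct cb *list = *donelist, *next;
	int count = 0;

	while (list) {
		next = list->next;	/* kernel version: prefetch(next) here */
		list->func(list);
		list = next;
		if (++count >= blimit)
			break;
	}
	*donelist = list;		/* single store of the remaining tail */
}

static void say(struct cb *c) { printf("callback %p\n", (void *)c); }

int main(void)
{
	struct cb a = { NULL, say }, b = { &a, say }, c = { &b, say };
	struct cb *donelist = &c;

	do_batch(&donelist, 2);		/* runs c and b, leaves a queued */
	printf("remaining: %p (a is %p)\n", (void *)donelist, (void *)&a);
	return 0;
}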
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e2bda18f6f42..482b11ff65cb 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -401,7 +401,7 @@ static void srcu_torture_cleanup(void) | |||
| 401 | cleanup_srcu_struct(&srcu_ctl); | 401 | cleanup_srcu_struct(&srcu_ctl); |
| 402 | } | 402 | } |
| 403 | 403 | ||
| 404 | static int srcu_torture_read_lock(void) | 404 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) |
| 405 | { | 405 | { |
| 406 | return srcu_read_lock(&srcu_ctl); | 406 | return srcu_read_lock(&srcu_ctl); |
| 407 | } | 407 | } |
| @@ -419,7 +419,7 @@ static void srcu_read_delay(struct rcu_random_state *rrsp) | |||
| 419 | schedule_timeout_interruptible(longdelay); | 419 | schedule_timeout_interruptible(longdelay); |
| 420 | } | 420 | } |
| 421 | 421 | ||
| 422 | static void srcu_torture_read_unlock(int idx) | 422 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) |
| 423 | { | 423 | { |
| 424 | srcu_read_unlock(&srcu_ctl, idx); | 424 | srcu_read_unlock(&srcu_ctl, idx); |
| 425 | } | 425 | } |
| @@ -522,6 +522,7 @@ rcu_torture_writer(void *arg) | |||
| 522 | 522 | ||
| 523 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | 523 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); |
| 524 | set_user_nice(current, 19); | 524 | set_user_nice(current, 19); |
| 525 | current->flags |= PF_NOFREEZE; | ||
| 525 | 526 | ||
| 526 | do { | 527 | do { |
| 527 | schedule_timeout_uninterruptible(1); | 528 | schedule_timeout_uninterruptible(1); |
| @@ -561,6 +562,7 @@ rcu_torture_fakewriter(void *arg) | |||
| 561 | 562 | ||
| 562 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); | 563 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); |
| 563 | set_user_nice(current, 19); | 564 | set_user_nice(current, 19); |
| 565 | current->flags |= PF_NOFREEZE; | ||
| 564 | 566 | ||
| 565 | do { | 567 | do { |
| 566 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | 568 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); |
| @@ -591,6 +593,7 @@ rcu_torture_reader(void *arg) | |||
| 591 | 593 | ||
| 592 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | 594 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); |
| 593 | set_user_nice(current, 19); | 595 | set_user_nice(current, 19); |
| 596 | current->flags |= PF_NOFREEZE; | ||
| 594 | 597 | ||
| 595 | do { | 598 | do { |
| 596 | idx = cur_ops->readlock(); | 599 | idx = cur_ops->readlock(); |
diff --git a/kernel/relay.c b/kernel/relay.c index f04bbdb56ac2..284e2e8b4eed 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
| @@ -138,7 +138,7 @@ depopulate: | |||
| 138 | */ | 138 | */ |
| 139 | struct rchan_buf *relay_create_buf(struct rchan *chan) | 139 | struct rchan_buf *relay_create_buf(struct rchan *chan) |
| 140 | { | 140 | { |
| 141 | struct rchan_buf *buf = kcalloc(1, sizeof(struct rchan_buf), GFP_KERNEL); | 141 | struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); |
| 142 | if (!buf) | 142 | if (!buf) |
| 143 | return NULL; | 143 | return NULL; |
| 144 | 144 | ||
| @@ -302,15 +302,16 @@ static struct rchan_callbacks default_channel_callbacks = { | |||
| 302 | 302 | ||
| 303 | /** | 303 | /** |
| 304 | * wakeup_readers - wake up readers waiting on a channel | 304 | * wakeup_readers - wake up readers waiting on a channel |
| 305 | * @private: the channel buffer | 305 | * @work: work struct that contains the channel buffer |
| 306 | * | 306 | * |
| 307 | * This is the work function used to defer reader waking. The | 307 | * This is the work function used to defer reader waking. The |
| 308 | * reason waking is deferred is that calling directly from write | 308 | * reason waking is deferred is that calling directly from write |
| 309 | * causes problems if you're writing from say the scheduler. | 309 | * causes problems if you're writing from say the scheduler. |
| 310 | */ | 310 | */ |
| 311 | static void wakeup_readers(void *private) | 311 | static void wakeup_readers(struct work_struct *work) |
| 312 | { | 312 | { |
| 313 | struct rchan_buf *buf = private; | 313 | struct rchan_buf *buf = |
| 314 | container_of(work, struct rchan_buf, wake_readers.work); | ||
| 314 | wake_up_interruptible(&buf->read_wait); | 315 | wake_up_interruptible(&buf->read_wait); |
| 315 | } | 316 | } |
| 316 | 317 | ||
| @@ -321,14 +322,14 @@ static void wakeup_readers(void *private) | |||
| 321 | * | 322 | * |
| 322 | * See relay_reset for description of effect. | 323 | * See relay_reset for description of effect. |
| 323 | */ | 324 | */ |
| 324 | static inline void __relay_reset(struct rchan_buf *buf, unsigned int init) | 325 | static void __relay_reset(struct rchan_buf *buf, unsigned int init) |
| 325 | { | 326 | { |
| 326 | size_t i; | 327 | size_t i; |
| 327 | 328 | ||
| 328 | if (init) { | 329 | if (init) { |
| 329 | init_waitqueue_head(&buf->read_wait); | 330 | init_waitqueue_head(&buf->read_wait); |
| 330 | kref_init(&buf->kref); | 331 | kref_init(&buf->kref); |
| 331 | INIT_WORK(&buf->wake_readers, NULL, NULL); | 332 | INIT_DELAYED_WORK(&buf->wake_readers, NULL); |
| 332 | } else { | 333 | } else { |
| 333 | cancel_delayed_work(&buf->wake_readers); | 334 | cancel_delayed_work(&buf->wake_readers); |
| 334 | flush_scheduled_work(); | 335 | flush_scheduled_work(); |
| @@ -417,7 +418,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, | |||
| 417 | * The channel buffer and channel buffer data structure are then freed | 418 | * The channel buffer and channel buffer data structure are then freed |
| 418 | * automatically when the last reference is given up. | 419 | * automatically when the last reference is given up. |
| 419 | */ | 420 | */ |
| 420 | static inline void relay_close_buf(struct rchan_buf *buf) | 421 | static void relay_close_buf(struct rchan_buf *buf) |
| 421 | { | 422 | { |
| 422 | buf->finalized = 1; | 423 | buf->finalized = 1; |
| 423 | cancel_delayed_work(&buf->wake_readers); | 424 | cancel_delayed_work(&buf->wake_readers); |
| @@ -425,7 +426,7 @@ static inline void relay_close_buf(struct rchan_buf *buf) | |||
| 425 | kref_put(&buf->kref, relay_remove_buf); | 426 | kref_put(&buf->kref, relay_remove_buf); |
| 426 | } | 427 | } |
| 427 | 428 | ||
| 428 | static inline void setup_callbacks(struct rchan *chan, | 429 | static void setup_callbacks(struct rchan *chan, |
| 429 | struct rchan_callbacks *cb) | 430 | struct rchan_callbacks *cb) |
| 430 | { | 431 | { |
| 431 | if (!cb) { | 432 | if (!cb) { |
| @@ -478,7 +479,7 @@ struct rchan *relay_open(const char *base_filename, | |||
| 478 | if (!(subbuf_size && n_subbufs)) | 479 | if (!(subbuf_size && n_subbufs)) |
| 479 | return NULL; | 480 | return NULL; |
| 480 | 481 | ||
| 481 | chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL); | 482 | chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); |
| 482 | if (!chan) | 483 | if (!chan) |
| 483 | return NULL; | 484 | return NULL; |
| 484 | 485 | ||
| @@ -549,7 +550,8 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) | |||
| 549 | buf->padding[old_subbuf]; | 550 | buf->padding[old_subbuf]; |
| 550 | smp_mb(); | 551 | smp_mb(); |
| 551 | if (waitqueue_active(&buf->read_wait)) { | 552 | if (waitqueue_active(&buf->read_wait)) { |
| 552 | PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf); | 553 | PREPARE_DELAYED_WORK(&buf->wake_readers, |
| 554 | wakeup_readers); | ||
| 553 | schedule_delayed_work(&buf->wake_readers, 1); | 555 | schedule_delayed_work(&buf->wake_readers, 1); |
| 554 | } | 556 | } |
| 555 | } | 557 | } |
| @@ -944,11 +946,10 @@ typedef int (*subbuf_actor_t) (size_t read_start, | |||
| 944 | /* | 946 | /* |
| 945 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries | 947 | * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries |
| 946 | */ | 948 | */ |
| 947 | static inline ssize_t relay_file_read_subbufs(struct file *filp, | 949 | static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, |
| 948 | loff_t *ppos, | 950 | subbuf_actor_t subbuf_actor, |
| 949 | subbuf_actor_t subbuf_actor, | 951 | read_actor_t actor, |
| 950 | read_actor_t actor, | 952 | read_descriptor_t *desc) |
| 951 | read_descriptor_t *desc) | ||
| 952 | { | 953 | { |
| 953 | struct rchan_buf *buf = filp->private_data; | 954 | struct rchan_buf *buf = filp->private_data; |
| 954 | size_t read_start, avail; | 955 | size_t read_start, avail; |
| @@ -957,7 +958,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
| 957 | if (!desc->count) | 958 | if (!desc->count) |
| 958 | return 0; | 959 | return 0; |
| 959 | 960 | ||
| 960 | mutex_lock(&filp->f_dentry->d_inode->i_mutex); | 961 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); |
| 961 | do { | 962 | do { |
| 962 | if (!relay_file_read_avail(buf, *ppos)) | 963 | if (!relay_file_read_avail(buf, *ppos)) |
| 963 | break; | 964 | break; |
| @@ -977,7 +978,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
| 977 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 978 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
| 978 | } | 979 | } |
| 979 | } while (desc->count && ret); | 980 | } while (desc->count && ret); |
| 980 | mutex_unlock(&filp->f_dentry->d_inode->i_mutex); | 981 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); |
| 981 | 982 | ||
| 982 | return desc->written; | 983 | return desc->written; |
| 983 | } | 984 | } |
| @@ -1011,7 +1012,7 @@ static ssize_t relay_file_sendfile(struct file *filp, | |||
| 1011 | actor, &desc); | 1012 | actor, &desc); |
| 1012 | } | 1013 | } |
| 1013 | 1014 | ||
| 1014 | struct file_operations relay_file_operations = { | 1015 | const struct file_operations relay_file_operations = { |
| 1015 | .open = relay_file_open, | 1016 | .open = relay_file_open, |
| 1016 | .poll = relay_file_poll, | 1017 | .poll = relay_file_poll, |
| 1017 | .mmap = relay_file_mmap, | 1018 | .mmap = relay_file_mmap, |
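The relay.c hunks move wakeup_readers() to the new workqueue convention: the handler receives a struct work_struct * and recovers its rchan_buf with container_of() on the embedded wake_readers member. A self-contained sketch of the container_of() idiom, with the macro defined locally and the delayed_work wrapping simplified to a plain embedded member:

/* Sketch of the container_of() idiom used by the reworked wakeup_readers():
 * given a pointer to a member, recover the enclosing structure.  The macro
 * is defined locally for the example; the kernel provides its own. */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct work_struct { int pending; };

struct rchan_buf_like {
	const char *name;
	struct work_struct wake_readers;
};

static void wakeup_readers(struct work_struct *work)
{
	struct rchan_buf_like *buf =
		container_of(work, struct rchan_buf_like, wake_readers);

	printf("waking readers of %s\n", buf->name);
}

int main(void)
{
	struct rchan_buf_like buf = { "relay-cpu0", { 0 } };

	wakeup_readers(&buf.wake_readers);	/* handler sees only the member */
	return 0;
}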
diff --git a/kernel/resource.c b/kernel/resource.c index 6de60c12143e..7b9a497419d9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -88,7 +88,7 @@ static int r_show(struct seq_file *m, void *v) | |||
| 88 | return 0; | 88 | return 0; |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | static struct seq_operations resource_op = { | 91 | static const struct seq_operations resource_op = { |
| 92 | .start = r_start, | 92 | .start = r_start, |
| 93 | .next = r_next, | 93 | .next = r_next, |
| 94 | .stop = r_stop, | 94 | .stop = r_stop, |
| @@ -115,14 +115,14 @@ static int iomem_open(struct inode *inode, struct file *file) | |||
| 115 | return res; | 115 | return res; |
| 116 | } | 116 | } |
| 117 | 117 | ||
| 118 | static struct file_operations proc_ioports_operations = { | 118 | static const struct file_operations proc_ioports_operations = { |
| 119 | .open = ioports_open, | 119 | .open = ioports_open, |
| 120 | .read = seq_read, | 120 | .read = seq_read, |
| 121 | .llseek = seq_lseek, | 121 | .llseek = seq_lseek, |
| 122 | .release = seq_release, | 122 | .release = seq_release, |
| 123 | }; | 123 | }; |
| 124 | 124 | ||
| 125 | static struct file_operations proc_iomem_operations = { | 125 | static const struct file_operations proc_iomem_operations = { |
| 126 | .open = iomem_open, | 126 | .open = iomem_open, |
| 127 | .read = seq_read, | 127 | .read = seq_read, |
| 128 | .llseek = seq_lseek, | 128 | .llseek = seq_lseek, |
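resource.c, like several other files in this merge, marks its seq_operations and file_operations tables const so the function-pointer tables can live in read-only data. A minimal sketch of the pattern with an illustrative ops structure:

/* Minimal sketch of a const function-pointer table: with const, the table
 * can be placed in read-only data and cannot be retargeted at runtime. */
#include <stdio.h>

struct file_operations_like {
	int (*open)(const char *name);
	int (*release)(const char *name);
};

static int my_open(const char *name)    { printf("open %s\n", name); return 0; }
static int my_release(const char *name) { printf("release %s\n", name); return 0; }

static const struct file_operations_like proc_ops = {
	.open    = my_open,
	.release = my_release,
};

int main(void)
{
	proc_ops.open("iomem");
	proc_ops.release("iomem");
	/* proc_ops.open = NULL; would now fail to compile */
	return 0;
}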
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 6dcea9dd8c94..015fc633c96c 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
| 14 | #include <linux/sysdev.h> | 14 | #include <linux/sysdev.h> |
| 15 | #include <linux/timer.h> | 15 | #include <linux/timer.h> |
| 16 | #include <linux/freezer.h> | ||
| 16 | 17 | ||
| 17 | #include "rtmutex.h" | 18 | #include "rtmutex.h" |
| 18 | 19 | ||
diff --git a/kernel/sched.c b/kernel/sched.c index 3399701c680e..cca93cc0dd7d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
| @@ -34,7 +34,7 @@ | |||
| 34 | #include <linux/security.h> | 34 | #include <linux/security.h> |
| 35 | #include <linux/notifier.h> | 35 | #include <linux/notifier.h> |
| 36 | #include <linux/profile.h> | 36 | #include <linux/profile.h> |
| 37 | #include <linux/suspend.h> | 37 | #include <linux/freezer.h> |
| 38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
| 39 | #include <linux/blkdev.h> | 39 | #include <linux/blkdev.h> |
| 40 | #include <linux/delay.h> | 40 | #include <linux/delay.h> |
| @@ -225,8 +225,10 @@ struct rq { | |||
| 225 | unsigned long nr_uninterruptible; | 225 | unsigned long nr_uninterruptible; |
| 226 | 226 | ||
| 227 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
| 228 | unsigned long long timestamp_last_tick; | 228 | /* Cached timestamp set by update_cpu_clock() */ |
| 229 | unsigned long long most_recent_timestamp; | ||
| 229 | struct task_struct *curr, *idle; | 230 | struct task_struct *curr, *idle; |
| 231 | unsigned long next_balance; | ||
| 230 | struct mm_struct *prev_mm; | 232 | struct mm_struct *prev_mm; |
| 231 | struct prio_array *active, *expired, arrays[2]; | 233 | struct prio_array *active, *expired, arrays[2]; |
| 232 | int best_expired_prio; | 234 | int best_expired_prio; |
| @@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
| 426 | * bump this up when changing the output format or the meaning of an existing | 428 | * bump this up when changing the output format or the meaning of an existing |
| 427 | * format, so that tools can adapt (or abort) | 429 | * format, so that tools can adapt (or abort) |
| 428 | */ | 430 | */ |
| 429 | #define SCHEDSTAT_VERSION 12 | 431 | #define SCHEDSTAT_VERSION 14 |
| 430 | 432 | ||
| 431 | static int show_schedstat(struct seq_file *seq, void *v) | 433 | static int show_schedstat(struct seq_file *seq, void *v) |
| 432 | { | 434 | { |
| @@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 464 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 466 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); |
| 465 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | 467 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; |
| 466 | itype++) { | 468 | itype++) { |
| 467 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", | 469 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
| 470 | "%lu", | ||
| 468 | sd->lb_cnt[itype], | 471 | sd->lb_cnt[itype], |
| 469 | sd->lb_balanced[itype], | 472 | sd->lb_balanced[itype], |
| 470 | sd->lb_failed[itype], | 473 | sd->lb_failed[itype], |
| @@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
| 474 | sd->lb_nobusyq[itype], | 477 | sd->lb_nobusyq[itype], |
| 475 | sd->lb_nobusyg[itype]); | 478 | sd->lb_nobusyg[itype]); |
| 476 | } | 479 | } |
| 477 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", | 480 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
| 481 | " %lu %lu %lu\n", | ||
| 478 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 482 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
| 479 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 483 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
| 480 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 484 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, |
| 481 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 485 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
| 486 | sd->ttwu_move_balance); | ||
| 482 | } | 487 | } |
| 483 | preempt_enable(); | 488 | preempt_enable(); |
| 484 | #endif | 489 | #endif |
| @@ -505,7 +510,7 @@ static int schedstat_open(struct inode *inode, struct file *file) | |||
| 505 | return res; | 510 | return res; |
| 506 | } | 511 | } |
| 507 | 512 | ||
| 508 | struct file_operations proc_schedstat_operations = { | 513 | const struct file_operations proc_schedstat_operations = { |
| 509 | .open = schedstat_open, | 514 | .open = schedstat_open, |
| 510 | .read = seq_read, | 515 | .read = seq_read, |
| 511 | .llseek = seq_lseek, | 516 | .llseek = seq_lseek, |
| @@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | |||
| 547 | #endif | 552 | #endif |
| 548 | 553 | ||
| 549 | /* | 554 | /* |
| 550 | * rq_lock - lock a given runqueue and disable interrupts. | 555 | * this_rq_lock - lock this runqueue and disable interrupts. |
| 551 | */ | 556 | */ |
| 552 | static inline struct rq *this_rq_lock(void) | 557 | static inline struct rq *this_rq_lock(void) |
| 553 | __acquires(rq->lock) | 558 | __acquires(rq->lock) |
| @@ -938,18 +943,31 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
| 938 | { | 943 | { |
| 939 | unsigned long long now; | 944 | unsigned long long now; |
| 940 | 945 | ||
| 946 | if (rt_task(p)) | ||
| 947 | goto out; | ||
| 948 | |||
| 941 | now = sched_clock(); | 949 | now = sched_clock(); |
| 942 | #ifdef CONFIG_SMP | 950 | #ifdef CONFIG_SMP |
| 943 | if (!local) { | 951 | if (!local) { |
| 944 | /* Compensate for drifting sched_clock */ | 952 | /* Compensate for drifting sched_clock */ |
| 945 | struct rq *this_rq = this_rq(); | 953 | struct rq *this_rq = this_rq(); |
| 946 | now = (now - this_rq->timestamp_last_tick) | 954 | now = (now - this_rq->most_recent_timestamp) |
| 947 | + rq->timestamp_last_tick; | 955 | + rq->most_recent_timestamp; |
| 948 | } | 956 | } |
| 949 | #endif | 957 | #endif |
| 950 | 958 | ||
| 951 | if (!rt_task(p)) | 959 | /* |
| 952 | p->prio = recalc_task_prio(p, now); | 960 | * Sleep time is in units of nanosecs, so shift by 20 to get a |
| 961 | * milliseconds-range estimation of the amount of time that the task | ||
| 962 | * spent sleeping: | ||
| 963 | */ | ||
| 964 | if (unlikely(prof_on == SLEEP_PROFILING)) { | ||
| 965 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
| 966 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), | ||
| 967 | (now - p->timestamp) >> 20); | ||
| 968 | } | ||
| 969 | |||
| 970 | p->prio = recalc_task_prio(p, now); | ||
| 953 | 971 | ||
| 954 | /* | 972 | /* |
| 955 | * This checks to make sure it's not an uninterruptible task | 973 | * This checks to make sure it's not an uninterruptible task |
| @@ -974,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
| 974 | } | 992 | } |
| 975 | } | 993 | } |
| 976 | p->timestamp = now; | 994 | p->timestamp = now; |
| 977 | 995 | out: | |
| 978 | __activate_task(p, rq); | 996 | __activate_task(p, rq); |
| 979 | } | 997 | } |
| 980 | 998 | ||
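The activate_task() hunk above feeds sleep time into the profiler with profile_hits(SLEEP_PROFILING, get_wchan(p), (now - p->timestamp) >> 20); the >> 20 turns a nanosecond delta into a rough millisecond count (2^20 ns is about 1.05 ms), so the number of hits scales with how long the task slept. A short sketch of that conversion:

/* The >> 20 used for SLEEP_PROFILING: nanoseconds to an approximate
 * millisecond count (1 << 20 ns = 1048576 ns, about 1.05 ms). */
#include <stdio.h>

int main(void)
{
	unsigned long long slept_ns = 250ULL * 1000 * 1000;	/* 250 ms */
	unsigned int nr_hits = (unsigned int)(slept_ns >> 20);

	printf("%llu ns -> %u profiling hits (~ms)\n", slept_ns, nr_hits);
	return 0;
}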
| @@ -1439,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
| 1439 | 1457 | ||
| 1440 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1458 | if (this_sd->flags & SD_WAKE_AFFINE) { |
| 1441 | unsigned long tl = this_load; | 1459 | unsigned long tl = this_load; |
| 1442 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | 1460 | unsigned long tl_per_task; |
| 1461 | |||
| 1462 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
| 1443 | 1463 | ||
| 1444 | /* | 1464 | /* |
| 1445 | * If sync wakeup then subtract the (maximum possible) | 1465 | * If sync wakeup then subtract the (maximum possible) |
| @@ -1547,6 +1567,7 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) | |||
| 1547 | return try_to_wake_up(p, state, 0); | 1567 | return try_to_wake_up(p, state, 0); |
| 1548 | } | 1568 | } |
| 1549 | 1569 | ||
| 1570 | static void task_running_tick(struct rq *rq, struct task_struct *p); | ||
| 1550 | /* | 1571 | /* |
| 1551 | * Perform scheduler related setup for a newly forked process p. | 1572 | * Perform scheduler related setup for a newly forked process p. |
| 1552 | * p is forked by current. | 1573 | * p is forked by current. |
| @@ -1607,7 +1628,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) | |||
| 1607 | * runqueue lock is not a problem. | 1628 | * runqueue lock is not a problem. |
| 1608 | */ | 1629 | */ |
| 1609 | current->time_slice = 1; | 1630 | current->time_slice = 1; |
| 1610 | scheduler_tick(); | 1631 | task_running_tick(cpu_rq(cpu), current); |
| 1611 | } | 1632 | } |
| 1612 | local_irq_enable(); | 1633 | local_irq_enable(); |
| 1613 | put_cpu(); | 1634 | put_cpu(); |
| @@ -1677,8 +1698,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
| 1677 | * Not the local CPU - must adjust timestamp. This should | 1698 | * Not the local CPU - must adjust timestamp. This should |
| 1678 | * get optimised away in the !CONFIG_SMP case. | 1699 | * get optimised away in the !CONFIG_SMP case. |
| 1679 | */ | 1700 | */ |
| 1680 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | 1701 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) |
| 1681 | + rq->timestamp_last_tick; | 1702 | + rq->most_recent_timestamp; |
| 1682 | __activate_task(p, rq); | 1703 | __activate_task(p, rq); |
| 1683 | if (TASK_PREEMPTS_CURR(p, rq)) | 1704 | if (TASK_PREEMPTS_CURR(p, rq)) |
| 1684 | resched_task(rq->curr); | 1705 | resched_task(rq->curr); |
| @@ -1941,6 +1962,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
| 1941 | __acquires(rq1->lock) | 1962 | __acquires(rq1->lock) |
| 1942 | __acquires(rq2->lock) | 1963 | __acquires(rq2->lock) |
| 1943 | { | 1964 | { |
| 1965 | BUG_ON(!irqs_disabled()); | ||
| 1944 | if (rq1 == rq2) { | 1966 | if (rq1 == rq2) { |
| 1945 | spin_lock(&rq1->lock); | 1967 | spin_lock(&rq1->lock); |
| 1946 | __acquire(rq2->lock); /* Fake it out ;) */ | 1968 | __acquire(rq2->lock); /* Fake it out ;) */ |
| @@ -1980,6 +2002,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
| 1980 | __acquires(busiest->lock) | 2002 | __acquires(busiest->lock) |
| 1981 | __acquires(this_rq->lock) | 2003 | __acquires(this_rq->lock) |
| 1982 | { | 2004 | { |
| 2005 | if (unlikely(!irqs_disabled())) { | ||
| 2006 | /* printk() doesn't work well under rq->lock */ | ||
| 2007 | spin_unlock(&this_rq->lock); | ||
| 2008 | BUG_ON(1); | ||
| 2009 | } | ||
| 1983 | if (unlikely(!spin_trylock(&busiest->lock))) { | 2010 | if (unlikely(!spin_trylock(&busiest->lock))) { |
| 1984 | if (busiest < this_rq) { | 2011 | if (busiest < this_rq) { |
| 1985 | spin_unlock(&this_rq->lock); | 2012 | spin_unlock(&this_rq->lock); |
| @@ -2050,8 +2077,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, | |||
| 2050 | set_task_cpu(p, this_cpu); | 2077 | set_task_cpu(p, this_cpu); |
| 2051 | inc_nr_running(p, this_rq); | 2078 | inc_nr_running(p, this_rq); |
| 2052 | enqueue_task(p, this_array); | 2079 | enqueue_task(p, this_array); |
| 2053 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2080 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) |
| 2054 | + this_rq->timestamp_last_tick; | 2081 | + this_rq->most_recent_timestamp; |
| 2055 | /* | 2082 | /* |
| 2056 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2083 | * Note that idle threads have a prio of MAX_PRIO, for this test |
| 2057 | * to be always true for them. | 2084 | * to be always true for them. |
| @@ -2087,10 +2114,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
| 2087 | * 2) too many balance attempts have failed. | 2114 | * 2) too many balance attempts have failed. |
| 2088 | */ | 2115 | */ |
| 2089 | 2116 | ||
| 2090 | if (sd->nr_balance_failed > sd->cache_nice_tries) | 2117 | if (sd->nr_balance_failed > sd->cache_nice_tries) { |
| 2118 | #ifdef CONFIG_SCHEDSTATS | ||
| 2119 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
| 2120 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 2121 | #endif | ||
| 2091 | return 1; | 2122 | return 1; |
| 2123 | } | ||
| 2092 | 2124 | ||
| 2093 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 2125 | if (task_hot(p, rq->most_recent_timestamp, sd)) |
| 2094 | return 0; | 2126 | return 0; |
| 2095 | return 1; | 2127 | return 1; |
| 2096 | } | 2128 | } |
| @@ -2188,11 +2220,6 @@ skip_queue: | |||
| 2188 | goto skip_bitmap; | 2220 | goto skip_bitmap; |
| 2189 | } | 2221 | } |
| 2190 | 2222 | ||
| 2191 | #ifdef CONFIG_SCHEDSTATS | ||
| 2192 | if (task_hot(tmp, busiest->timestamp_last_tick, sd)) | ||
| 2193 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
| 2194 | #endif | ||
| 2195 | |||
| 2196 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2223 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
| 2197 | pulled++; | 2224 | pulled++; |
| 2198 | rem_load_move -= tmp->load_weight; | 2225 | rem_load_move -= tmp->load_weight; |
| @@ -2230,7 +2257,7 @@ out: | |||
| 2230 | static struct sched_group * | 2257 | static struct sched_group * |
| 2231 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2258 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
| 2232 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, | 2259 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, |
| 2233 | cpumask_t *cpus) | 2260 | cpumask_t *cpus, int *balance) |
| 2234 | { | 2261 | { |
| 2235 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2262 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
| 2236 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2263 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
| @@ -2259,10 +2286,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2259 | unsigned long load, group_capacity; | 2286 | unsigned long load, group_capacity; |
| 2260 | int local_group; | 2287 | int local_group; |
| 2261 | int i; | 2288 | int i; |
| 2289 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
| 2262 | unsigned long sum_nr_running, sum_weighted_load; | 2290 | unsigned long sum_nr_running, sum_weighted_load; |
| 2263 | 2291 | ||
| 2264 | local_group = cpu_isset(this_cpu, group->cpumask); | 2292 | local_group = cpu_isset(this_cpu, group->cpumask); |
| 2265 | 2293 | ||
| 2294 | if (local_group) | ||
| 2295 | balance_cpu = first_cpu(group->cpumask); | ||
| 2296 | |||
| 2266 | /* Tally up the load of all CPUs in the group */ | 2297 | /* Tally up the load of all CPUs in the group */ |
| 2267 | sum_weighted_load = sum_nr_running = avg_load = 0; | 2298 | sum_weighted_load = sum_nr_running = avg_load = 0; |
| 2268 | 2299 | ||
| @@ -2278,9 +2309,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2278 | *sd_idle = 0; | 2309 | *sd_idle = 0; |
| 2279 | 2310 | ||
| 2280 | /* Bias balancing toward cpus of our domain */ | 2311 | /* Bias balancing toward cpus of our domain */ |
| 2281 | if (local_group) | 2312 | if (local_group) { |
| 2313 | if (idle_cpu(i) && !first_idle_cpu) { | ||
| 2314 | first_idle_cpu = 1; | ||
| 2315 | balance_cpu = i; | ||
| 2316 | } | ||
| 2317 | |||
| 2282 | load = target_load(i, load_idx); | 2318 | load = target_load(i, load_idx); |
| 2283 | else | 2319 | } else |
| 2284 | load = source_load(i, load_idx); | 2320 | load = source_load(i, load_idx); |
| 2285 | 2321 | ||
| 2286 | avg_load += load; | 2322 | avg_load += load; |
| @@ -2288,6 +2324,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
| 2288 | sum_weighted_load += rq->raw_weighted_load; | 2324 | sum_weighted_load += rq->raw_weighted_load; |
| 2289 | } | 2325 | } |
| 2290 | 2326 | ||
| 2327 | /* | ||
| 2328 | * First idle cpu or the first cpu(busiest) in this sched group | ||
| 2329 | * is eligible for doing load balancing at this and above | ||
| 2330 | * domains. | ||
| 2331 | */ | ||
| 2332 | if (local_group && balance_cpu != this_cpu && balance) { | ||
| 2333 | *balance = 0; | ||
| 2334 | goto ret; | ||
| 2335 | } | ||
| 2336 | |||
| 2291 | total_load += avg_load; | 2337 | total_load += avg_load; |
| 2292 | total_pwr += group->cpu_power; | 2338 | total_pwr += group->cpu_power; |
| 2293 | 2339 | ||
| @@ -2447,18 +2493,21 @@ small_imbalance: | |||
| 2447 | pwr_now /= SCHED_LOAD_SCALE; | 2493 | pwr_now /= SCHED_LOAD_SCALE; |
| 2448 | 2494 | ||
| 2449 | /* Amount of load we'd subtract */ | 2495 | /* Amount of load we'd subtract */ |
| 2450 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; | 2496 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
| 2497 | busiest->cpu_power; | ||
| 2451 | if (max_load > tmp) | 2498 | if (max_load > tmp) |
| 2452 | pwr_move += busiest->cpu_power * | 2499 | pwr_move += busiest->cpu_power * |
| 2453 | min(busiest_load_per_task, max_load - tmp); | 2500 | min(busiest_load_per_task, max_load - tmp); |
| 2454 | 2501 | ||
| 2455 | /* Amount of load we'd add */ | 2502 | /* Amount of load we'd add */ |
| 2456 | if (max_load*busiest->cpu_power < | 2503 | if (max_load * busiest->cpu_power < |
| 2457 | busiest_load_per_task*SCHED_LOAD_SCALE) | 2504 | busiest_load_per_task * SCHED_LOAD_SCALE) |
| 2458 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2505 | tmp = max_load * busiest->cpu_power / this->cpu_power; |
| 2459 | else | 2506 | else |
| 2460 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; | 2507 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
| 2461 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); | 2508 | this->cpu_power; |
| 2509 | pwr_move += this->cpu_power * | ||
| 2510 | min(this_load_per_task, this_load + tmp); | ||
| 2462 | pwr_move /= SCHED_LOAD_SCALE; | 2511 | pwr_move /= SCHED_LOAD_SCALE; |
| 2463 | 2512 | ||
| 2464 | /* Move if we gain throughput */ | 2513 | /* Move if we gain throughput */ |
| @@ -2479,8 +2528,8 @@ out_balanced: | |||
| 2479 | *imbalance = min_load_per_task; | 2528 | *imbalance = min_load_per_task; |
| 2480 | return group_min; | 2529 | return group_min; |
| 2481 | } | 2530 | } |
| 2482 | ret: | ||
| 2483 | #endif | 2531 | #endif |
| 2532 | ret: | ||
| 2484 | *imbalance = 0; | 2533 | *imbalance = 0; |
| 2485 | return NULL; | 2534 | return NULL; |
| 2486 | } | 2535 | } |
| @@ -2529,17 +2578,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n) | |||
| 2529 | /* | 2578 | /* |
| 2530 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2579 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
| 2531 | * tasks if there is an imbalance. | 2580 | * tasks if there is an imbalance. |
| 2532 | * | ||
| 2533 | * Called with this_rq unlocked. | ||
| 2534 | */ | 2581 | */ |
| 2535 | static int load_balance(int this_cpu, struct rq *this_rq, | 2582 | static int load_balance(int this_cpu, struct rq *this_rq, |
| 2536 | struct sched_domain *sd, enum idle_type idle) | 2583 | struct sched_domain *sd, enum idle_type idle, |
| 2584 | int *balance) | ||
| 2537 | { | 2585 | { |
| 2538 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2586 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
| 2539 | struct sched_group *group; | 2587 | struct sched_group *group; |
| 2540 | unsigned long imbalance; | 2588 | unsigned long imbalance; |
| 2541 | struct rq *busiest; | 2589 | struct rq *busiest; |
| 2542 | cpumask_t cpus = CPU_MASK_ALL; | 2590 | cpumask_t cpus = CPU_MASK_ALL; |
| 2591 | unsigned long flags; | ||
| 2543 | 2592 | ||
| 2544 | /* | 2593 | /* |
| 2545 | * When power savings policy is enabled for the parent domain, idle | 2594 | * When power savings policy is enabled for the parent domain, idle |
| @@ -2555,7 +2604,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
| 2555 | 2604 | ||
| 2556 | redo: | 2605 | redo: |
| 2557 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2606 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
| 2558 | &cpus); | 2607 | &cpus, balance); |
| 2608 | |||
| 2609 | if (*balance == 0) | ||
| 2610 | goto out_balanced; | ||
| 2611 | |||
| 2559 | if (!group) { | 2612 | if (!group) { |
| 2560 | schedstat_inc(sd, lb_nobusyg[idle]); | 2613 | schedstat_inc(sd, lb_nobusyg[idle]); |
| 2561 | goto out_balanced; | 2614 | goto out_balanced; |
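load_balance() now receives an int *balance from its caller; find_busiest_group() clears it when this CPU is not the nominated balancer, and load_balance() then bails out before locking any runqueue. The sketch below only models that handshake; find_busiest_group_stub and designated_cpu are invented names for illustration, not the real API:

    #include <stdbool.h>
    #include <stddef.h>

    /* Stand-in: returns a "group" handle or NULL, and may clear *balance
     * to tell the caller that another CPU in its group owns balancing. */
    static void *find_busiest_group_stub(int this_cpu, int *balance)
    {
        int designated_cpu = 6;              /* assumed for the example */

        if (balance && this_cpu != designated_cpu) {
            *balance = 0;                    /* not our turn to balance */
            return NULL;
        }
        return (void *)1;                    /* pretend we found a busy group */
    }

    static bool load_balance_sketch(int this_cpu)
    {
        int balance = 1;
        void *group = find_busiest_group_stub(this_cpu, &balance);

        if (balance == 0)
            return false;    /* another CPU in the group will do the work */
        if (!group)
            return false;    /* nothing to pull */
        /* ... lock runqueues and move tasks here ... */
        return true;
    }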
| @@ -2579,11 +2632,13 @@ redo: | |||
| 2579 | * still unbalanced. nr_moved simply stays zero, so it is | 2632 | * still unbalanced. nr_moved simply stays zero, so it is |
| 2580 | * correctly treated as an imbalance. | 2633 | * correctly treated as an imbalance. |
| 2581 | */ | 2634 | */ |
| 2635 | local_irq_save(flags); | ||
| 2582 | double_rq_lock(this_rq, busiest); | 2636 | double_rq_lock(this_rq, busiest); |
| 2583 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2637 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
| 2584 | minus_1_or_zero(busiest->nr_running), | 2638 | minus_1_or_zero(busiest->nr_running), |
| 2585 | imbalance, sd, idle, &all_pinned); | 2639 | imbalance, sd, idle, &all_pinned); |
| 2586 | double_rq_unlock(this_rq, busiest); | 2640 | double_rq_unlock(this_rq, busiest); |
| 2641 | local_irq_restore(flags); | ||
| 2587 | 2642 | ||
| 2588 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2643 | /* All tasks on this runqueue were pinned by CPU affinity */ |
| 2589 | if (unlikely(all_pinned)) { | 2644 | if (unlikely(all_pinned)) { |
| @@ -2600,13 +2655,13 @@ redo: | |||
| 2600 | 2655 | ||
| 2601 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2656 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
| 2602 | 2657 | ||
| 2603 | spin_lock(&busiest->lock); | 2658 | spin_lock_irqsave(&busiest->lock, flags); |
| 2604 | 2659 | ||
| 2605 | /* don't kick the migration_thread, if the curr | 2660 | /* don't kick the migration_thread, if the curr |
| 2606 | * task on busiest cpu can't be moved to this_cpu | 2661 | * task on busiest cpu can't be moved to this_cpu |
| 2607 | */ | 2662 | */ |
| 2608 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 2663 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
| 2609 | spin_unlock(&busiest->lock); | 2664 | spin_unlock_irqrestore(&busiest->lock, flags); |
| 2610 | all_pinned = 1; | 2665 | all_pinned = 1; |
| 2611 | goto out_one_pinned; | 2666 | goto out_one_pinned; |
| 2612 | } | 2667 | } |
| @@ -2616,7 +2671,7 @@ redo: | |||
| 2616 | busiest->push_cpu = this_cpu; | 2671 | busiest->push_cpu = this_cpu; |
| 2617 | active_balance = 1; | 2672 | active_balance = 1; |
| 2618 | } | 2673 | } |
| 2619 | spin_unlock(&busiest->lock); | 2674 | spin_unlock_irqrestore(&busiest->lock, flags); |
| 2620 | if (active_balance) | 2675 | if (active_balance) |
| 2621 | wake_up_process(busiest->migration_thread); | 2676 | wake_up_process(busiest->migration_thread); |
| 2622 | 2677 | ||
| @@ -2695,7 +2750,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
| 2695 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2750 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
| 2696 | redo: | 2751 | redo: |
| 2697 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | 2752 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, |
| 2698 | &sd_idle, &cpus); | 2753 | &sd_idle, &cpus, NULL); |
| 2699 | if (!group) { | 2754 | if (!group) { |
| 2700 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2755 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
| 2701 | goto out_balanced; | 2756 | goto out_balanced; |
| @@ -2755,14 +2810,28 @@ out_balanced: | |||
| 2755 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2810 | static void idle_balance(int this_cpu, struct rq *this_rq) |
| 2756 | { | 2811 | { |
| 2757 | struct sched_domain *sd; | 2812 | struct sched_domain *sd; |
| 2813 | int pulled_task = 0; | ||
| 2814 | unsigned long next_balance = jiffies + 60 * HZ; | ||
| 2758 | 2815 | ||
| 2759 | for_each_domain(this_cpu, sd) { | 2816 | for_each_domain(this_cpu, sd) { |
| 2760 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2817 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
| 2761 | /* If we've pulled tasks over stop searching: */ | 2818 | /* If we've pulled tasks over stop searching: */ |
| 2762 | if (load_balance_newidle(this_cpu, this_rq, sd)) | 2819 | pulled_task = load_balance_newidle(this_cpu, |
| 2820 | this_rq, sd); | ||
| 2821 | if (time_after(next_balance, | ||
| 2822 | sd->last_balance + sd->balance_interval)) | ||
| 2823 | next_balance = sd->last_balance | ||
| 2824 | + sd->balance_interval; | ||
| 2825 | if (pulled_task) | ||
| 2763 | break; | 2826 | break; |
| 2764 | } | 2827 | } |
| 2765 | } | 2828 | } |
| 2829 | if (!pulled_task) | ||
| 2830 | /* | ||
| 2831 | * We are going idle. next_balance may be set based on | ||
| 2832 | * a busy processor. So reset next_balance. | ||
| 2833 | */ | ||
| 2834 | this_rq->next_balance = next_balance; | ||
| 2766 | } | 2835 | } |
| 2767 | 2836 | ||
| 2768 | /* | 2837 | /* |
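The newly-idle path above now also tracks the earliest upcoming balance time across the domains it visits and stores it in rq->next_balance when the CPU really goes idle, so the deadline is not left pointing at a value computed while the CPU was busy. A standalone sketch of that earliest-deadline computation, using a wraparound-safe comparison in the spirit of time_after() (the HZ=250 default is an assumption for the 60-second fallback):

    /* Wraparound-safe "a is after b" test, same idea as the kernel's time_after(). */
    static int after(unsigned long a, unsigned long b)
    {
        return (long)(b - a) < 0;
    }

    struct dom { unsigned long last_balance, interval; };

    /* Earliest deadline among the domains, starting from a far-away default. */
    static unsigned long earliest_next_balance(unsigned long now,
                                               const struct dom *d, int n)
    {
        unsigned long next = now + 60 * 250;     /* ~60 s at HZ=250, assumed */
        int i;

        for (i = 0; i < n; i++) {
            unsigned long candidate = d[i].last_balance + d[i].interval;
            if (after(next, candidate))
                next = candidate;                /* keep the earliest */
        }
        return next;
    }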
| @@ -2815,26 +2884,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
| 2815 | spin_unlock(&target_rq->lock); | 2884 | spin_unlock(&target_rq->lock); |
| 2816 | } | 2885 | } |
| 2817 | 2886 | ||
| 2818 | /* | 2887 | static void update_load(struct rq *this_rq) |
| 2819 | * rebalance_tick will get called every timer tick, on every CPU. | ||
| 2820 | * | ||
| 2821 | * It checks each scheduling domain to see if it is due to be balanced, | ||
| 2822 | * and initiates a balancing operation if so. | ||
| 2823 | * | ||
| 2824 | * Balancing parameters are set up in arch_init_sched_domains. | ||
| 2825 | */ | ||
| 2826 | |||
| 2827 | /* Don't have all balancing operations going off at once: */ | ||
| 2828 | static inline unsigned long cpu_offset(int cpu) | ||
| 2829 | { | ||
| 2830 | return jiffies + cpu * HZ / NR_CPUS; | ||
| 2831 | } | ||
| 2832 | |||
| 2833 | static void | ||
| 2834 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | ||
| 2835 | { | 2888 | { |
| 2836 | unsigned long this_load, interval, j = cpu_offset(this_cpu); | 2889 | unsigned long this_load; |
| 2837 | struct sched_domain *sd; | ||
| 2838 | int i, scale; | 2890 | int i, scale; |
| 2839 | 2891 | ||
| 2840 | this_load = this_rq->raw_weighted_load; | 2892 | this_load = this_rq->raw_weighted_load; |
| @@ -2854,6 +2906,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
| 2854 | new_load += scale-1; | 2906 | new_load += scale-1; |
| 2855 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | 2907 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; |
| 2856 | } | 2908 | } |
| 2909 | } | ||
| 2910 | |||
| 2911 | /* | ||
| 2912 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
| 2913 | * | ||
| 2914 | * It checks each scheduling domain to see if it is due to be balanced, | ||
| 2915 | * and initiates a balancing operation if so. | ||
| 2916 | * | ||
| 2917 | * Balancing parameters are set up in arch_init_sched_domains. | ||
| 2918 | */ | ||
| 2919 | static DEFINE_SPINLOCK(balancing); | ||
| 2920 | |||
| 2921 | static void run_rebalance_domains(struct softirq_action *h) | ||
| 2922 | { | ||
| 2923 | int this_cpu = smp_processor_id(), balance = 1; | ||
| 2924 | struct rq *this_rq = cpu_rq(this_cpu); | ||
| 2925 | unsigned long interval; | ||
| 2926 | struct sched_domain *sd; | ||
| 2927 | /* | ||
| 2928 | * We are idle if there are no processes running. This | ||
| 2929 | * is valid even if we are the idle process (SMT). | ||
| 2930 | */ | ||
| 2931 | enum idle_type idle = !this_rq->nr_running ? | ||
| 2932 | SCHED_IDLE : NOT_IDLE; | ||
| 2933 | /* Earliest time when we have to call run_rebalance_domains again */ | ||
| 2934 | unsigned long next_balance = jiffies + 60*HZ; | ||
| 2857 | 2935 | ||
| 2858 | for_each_domain(this_cpu, sd) { | 2936 | for_each_domain(this_cpu, sd) { |
| 2859 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2937 | if (!(sd->flags & SD_LOAD_BALANCE)) |
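With this change the per-tick rebalance body moves out of the timer tick: the tick only refreshes the cpu_load[] averages via update_load() and the domain walk happens in the new SCHED_SOFTIRQ handler. The visible update rule keeps a family of progressively slower exponential averages, roughly cpu_load[i] = (old * (2^i - 1) + new) / 2^i. A simplified standalone model follows; the conditional round-up on rising load is an assumption based on the surrounding kernel code, and NR_LOAD_IDX is illustrative:

    #define NR_LOAD_IDX 3    /* assumed; the real array size is defined elsewhere */

    /* Index i reacts with weight 1/2^i to new samples: index 0 tracks the
     * instantaneous load, higher indices decay more slowly. */
    static void update_load_sketch(unsigned long *cpu_load, unsigned long this_load)
    {
        unsigned long scale = 1;
        int i;

        for (i = 0; i < NR_LOAD_IDX; i++, scale += scale) {
            unsigned long old_load = cpu_load[i];
            unsigned long new_load = this_load;

            /* Round up so a rising load is never under-reported (assumed). */
            if (new_load > old_load)
                new_load += scale - 1;
            cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
        }
    }

For example, with cpu_load[2] = 400 and a new sample of 800, index 2 moves to (400*3 + 800 + 3) / 4 = 700, while index 0 jumps straight to 800.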
| @@ -2868,8 +2946,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
| 2868 | if (unlikely(!interval)) | 2946 | if (unlikely(!interval)) |
| 2869 | interval = 1; | 2947 | interval = 1; |
| 2870 | 2948 | ||
| 2871 | if (j - sd->last_balance >= interval) { | 2949 | if (sd->flags & SD_SERIALIZE) { |
| 2872 | if (load_balance(this_cpu, this_rq, sd, idle)) { | 2950 | if (!spin_trylock(&balancing)) |
| 2951 | goto out; | ||
| 2952 | } | ||
| 2953 | |||
| 2954 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
| 2955 | if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { | ||
| 2873 | /* | 2956 | /* |
| 2874 | * We've pulled tasks over so either we're no | 2957 | * We've pulled tasks over so either we're no |
| 2875 | * longer idle, or one of our SMT siblings is | 2958 | * longer idle, or one of our SMT siblings is |
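Domains flagged SD_SERIALIZE now go through a trylock on the static `balancing` spinlock introduced above, so only one CPU at a time walks those expensive (typically NUMA-level) domains and everyone else skips ahead to `out:`; the matching unlock and the `if (!balance) break` appear in the next hunk. A rough user-space analogue of the pattern using a pthread mutex, not the kernel locking API:

    #include <pthread.h>

    static pthread_mutex_t balancing = PTHREAD_MUTEX_INITIALIZER;

    /* Returns 1 if this caller performed the serialized balance pass,
     * 0 if another CPU already held the lock and the level was skipped. */
    static int serialized_balance_pass(void (*do_balance)(void))
    {
        if (pthread_mutex_trylock(&balancing) != 0)
            return 0;                  /* someone else is balancing; skip */
        do_balance();
        pthread_mutex_unlock(&balancing);
        return 1;
    }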
| @@ -2877,39 +2960,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
| 2877 | */ | 2960 | */ |
| 2878 | idle = NOT_IDLE; | 2961 | idle = NOT_IDLE; |
| 2879 | } | 2962 | } |
| 2880 | sd->last_balance += interval; | 2963 | sd->last_balance = jiffies; |
| 2881 | } | 2964 | } |
| 2965 | if (sd->flags & SD_SERIALIZE) | ||
| 2966 | spin_unlock(&balancing); | ||
| 2967 | out: | ||
| 2968 | if (time_after(next_balance, sd->last_balance + interval)) | ||
| 2969 | next_balance = sd->last_balance + interval; | ||
| 2970 | |||
| 2971 | /* | ||
| 2972 | * Stop the load balance at this level. There is another | ||
| 2973 | * CPU in our sched group which is doing load balancing more | ||
| 2974 | * actively. | ||
| 2975 | */ | ||
| 2976 | if (!balance) | ||
| 2977 | break; | ||
| 2882 | } | 2978 | } |
| 2979 | this_rq->next_balance = next_balance; | ||
| 2883 | } | 2980 | } |
| 2884 | #else | 2981 | #else |
| 2885 | /* | 2982 | /* |
| 2886 | * on UP we do not need to balance between CPUs: | 2983 | * on UP we do not need to balance between CPUs: |
| 2887 | */ | 2984 | */ |
| 2888 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) | ||
| 2889 | { | ||
| 2890 | } | ||
| 2891 | static inline void idle_balance(int cpu, struct rq *rq) | 2985 | static inline void idle_balance(int cpu, struct rq *rq) |
| 2892 | { | 2986 | { |
| 2893 | } | 2987 | } |
| 2894 | #endif | 2988 | #endif |
| 2895 | 2989 | ||
| 2896 | static inline int wake_priority_sleeper(struct rq *rq) | 2990 | static inline void wake_priority_sleeper(struct rq *rq) |
| 2897 | { | 2991 | { |
| 2898 | int ret = 0; | ||
| 2899 | |||
| 2900 | #ifdef CONFIG_SCHED_SMT | 2992 | #ifdef CONFIG_SCHED_SMT |
| 2993 | if (!rq->nr_running) | ||
| 2994 | return; | ||
| 2995 | |||
| 2901 | spin_lock(&rq->lock); | 2996 | spin_lock(&rq->lock); |
| 2902 | /* | 2997 | /* |
| 2903 | * If an SMT sibling task has been put to sleep for priority | 2998 | * If an SMT sibling task has been put to sleep for priority |
| 2904 | * reasons reschedule the idle task to see if it can now run. | 2999 | * reasons reschedule the idle task to see if it can now run. |
| 2905 | */ | 3000 | */ |
| 2906 | if (rq->nr_running) { | 3001 | if (rq->nr_running) |
| 2907 | resched_task(rq->idle); | 3002 | resched_task(rq->idle); |
| 2908 | ret = 1; | ||
| 2909 | } | ||
| 2910 | spin_unlock(&rq->lock); | 3003 | spin_unlock(&rq->lock); |
| 2911 | #endif | 3004 | #endif |
| 2912 | return ret; | ||
| 2913 | } | 3005 | } |
| 2914 | 3006 | ||
| 2915 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3007 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
| @@ -2923,7 +3015,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
| 2923 | static inline void | 3015 | static inline void |
| 2924 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) | 3016 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
| 2925 | { | 3017 | { |
| 2926 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); | 3018 | p->sched_time += now - p->last_ran; |
| 3019 | p->last_ran = rq->most_recent_timestamp = now; | ||
| 2927 | } | 3020 | } |
| 2928 | 3021 | ||
| 2929 | /* | 3022 | /* |
| @@ -2936,8 +3029,7 @@ unsigned long long current_sched_time(const struct task_struct *p) | |||
| 2936 | unsigned long flags; | 3029 | unsigned long flags; |
| 2937 | 3030 | ||
| 2938 | local_irq_save(flags); | 3031 | local_irq_save(flags); |
| 2939 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); | 3032 | ns = p->sched_time + sched_clock() - p->last_ran; |
| 2940 | ns = p->sched_time + sched_clock() - ns; | ||
| 2941 | local_irq_restore(flags); | 3033 | local_irq_restore(flags); |
| 2942 | 3034 | ||
| 2943 | return ns; | 3035 | return ns; |
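Task CPU-time accounting switches from max(p->timestamp, rq->timestamp_last_tick) to a per-task last_ran timestamp, so current_sched_time() reduces to the accumulated total plus the time since the task was last charged. A small standalone model with invented field names mirroring the patch:

    struct task_clock {
        unsigned long long sched_time;   /* total ns accumulated so far */
        unsigned long long last_ran;     /* ns timestamp when last charged */
    };

    /* Called from the tick (or on context switch) with the current clock. */
    static void update_cpu_clock_sketch(struct task_clock *t, unsigned long long now)
    {
        t->sched_time += now - t->last_ran;
        t->last_ran = now;
    }

    /* Total runtime including the interval since the last charge. */
    static unsigned long long current_sched_time_sketch(const struct task_clock *t,
                                                        unsigned long long now)
    {
        return t->sched_time + (now - t->last_ran);
    }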
| @@ -3037,35 +3129,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
| 3037 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3129 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
| 3038 | } | 3130 | } |
| 3039 | 3131 | ||
| 3040 | /* | 3132 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
| 3041 | * This function gets called by the timer code, with HZ frequency. | ||
| 3042 | * We call it with interrupts disabled. | ||
| 3043 | * | ||
| 3044 | * It also gets called by the fork code, when changing the parent's | ||
| 3045 | * timeslices. | ||
| 3046 | */ | ||
| 3047 | void scheduler_tick(void) | ||
| 3048 | { | 3133 | { |
| 3049 | unsigned long long now = sched_clock(); | ||
| 3050 | struct task_struct *p = current; | ||
| 3051 | int cpu = smp_processor_id(); | ||
| 3052 | struct rq *rq = cpu_rq(cpu); | ||
| 3053 | |||
| 3054 | update_cpu_clock(p, rq, now); | ||
| 3055 | |||
| 3056 | rq->timestamp_last_tick = now; | ||
| 3057 | |||
| 3058 | if (p == rq->idle) { | ||
| 3059 | if (wake_priority_sleeper(rq)) | ||
| 3060 | goto out; | ||
| 3061 | rebalance_tick(cpu, rq, SCHED_IDLE); | ||
| 3062 | return; | ||
| 3063 | } | ||
| 3064 | |||
| 3065 | /* Task might have expired already, but not scheduled off yet */ | ||
| 3066 | if (p->array != rq->active) { | 3134 | if (p->array != rq->active) { |
| 3135 | /* Task has expired but was not scheduled yet */ | ||
| 3067 | set_tsk_need_resched(p); | 3136 | set_tsk_need_resched(p); |
| 3068 | goto out; | 3137 | return; |
| 3069 | } | 3138 | } |
| 3070 | spin_lock(&rq->lock); | 3139 | spin_lock(&rq->lock); |
| 3071 | /* | 3140 | /* |
| @@ -3133,8 +3202,34 @@ void scheduler_tick(void) | |||
| 3133 | } | 3202 | } |
| 3134 | out_unlock: | 3203 | out_unlock: |
| 3135 | spin_unlock(&rq->lock); | 3204 | spin_unlock(&rq->lock); |
| 3136 | out: | 3205 | } |
| 3137 | rebalance_tick(cpu, rq, NOT_IDLE); | 3206 | |
| 3207 | /* | ||
| 3208 | * This function gets called by the timer code, with HZ frequency. | ||
| 3209 | * We call it with interrupts disabled. | ||
| 3210 | * | ||
| 3211 | * It also gets called by the fork code, when changing the parent's | ||
| 3212 | * timeslices. | ||
| 3213 | */ | ||
| 3214 | void scheduler_tick(void) | ||
| 3215 | { | ||
| 3216 | unsigned long long now = sched_clock(); | ||
| 3217 | struct task_struct *p = current; | ||
| 3218 | int cpu = smp_processor_id(); | ||
| 3219 | struct rq *rq = cpu_rq(cpu); | ||
| 3220 | |||
| 3221 | update_cpu_clock(p, rq, now); | ||
| 3222 | |||
| 3223 | if (p == rq->idle) | ||
| 3224 | /* Task on the idle queue */ | ||
| 3225 | wake_priority_sleeper(rq); | ||
| 3226 | else | ||
| 3227 | task_running_tick(rq, p); | ||
| 3228 | #ifdef CONFIG_SMP | ||
| 3229 | update_load(rq); | ||
| 3230 | if (time_after_eq(jiffies, rq->next_balance)) | ||
| 3231 | raise_softirq(SCHED_SOFTIRQ); | ||
| 3232 | #endif | ||
| 3138 | } | 3233 | } |
| 3139 | 3234 | ||
| 3140 | #ifdef CONFIG_SCHED_SMT | 3235 | #ifdef CONFIG_SCHED_SMT |
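scheduler_tick() is now a thin dispatcher: charge the clock, run either the idle-wakeup or the per-task tick work, and on SMP raise SCHED_SOFTIRQ once jiffies reaches rq->next_balance, leaving the domain walk to softirq context. The deadline test is the usual wraparound-safe comparison; a minimal sketch:

    /* time_after_eq-style test: true once a has reached or passed b. */
    static int after_eq(unsigned long a, unsigned long b)
    {
        return (long)(a - b) >= 0;
    }

    /* Tick-side decision: defer the expensive domain walk to softirq context
     * only when the precomputed deadline has passed. */
    static int should_raise_sched_softirq(unsigned long jiffies_now,
                                          unsigned long next_balance)
    {
        return after_eq(jiffies_now, next_balance);
    }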
| @@ -3280,7 +3375,8 @@ void fastcall add_preempt_count(int val) | |||
| 3280 | /* | 3375 | /* |
| 3281 | * Spinlock count overflowing soon? | 3376 | * Spinlock count overflowing soon? |
| 3282 | */ | 3377 | */ |
| 3283 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3378 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
| 3379 | PREEMPT_MASK - 10); | ||
| 3284 | } | 3380 | } |
| 3285 | EXPORT_SYMBOL(add_preempt_count); | 3381 | EXPORT_SYMBOL(add_preempt_count); |
| 3286 | 3382 | ||
| @@ -3333,6 +3429,9 @@ asmlinkage void __sched schedule(void) | |||
| 3333 | printk(KERN_ERR "BUG: scheduling while atomic: " | 3429 | printk(KERN_ERR "BUG: scheduling while atomic: " |
| 3334 | "%s/0x%08x/%d\n", | 3430 | "%s/0x%08x/%d\n", |
| 3335 | current->comm, preempt_count(), current->pid); | 3431 | current->comm, preempt_count(), current->pid); |
| 3432 | debug_show_held_locks(current); | ||
| 3433 | if (irqs_disabled()) | ||
| 3434 | print_irqtrace_events(current); | ||
| 3336 | dump_stack(); | 3435 | dump_stack(); |
| 3337 | } | 3436 | } |
| 3338 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); | 3437 | profile_hit(SCHED_PROFILING, __builtin_return_address(0)); |
| @@ -4518,15 +4617,6 @@ asmlinkage long sys_sched_yield(void) | |||
| 4518 | return 0; | 4617 | return 0; |
| 4519 | } | 4618 | } |
| 4520 | 4619 | ||
| 4521 | static inline int __resched_legal(int expected_preempt_count) | ||
| 4522 | { | ||
| 4523 | if (unlikely(preempt_count() != expected_preempt_count)) | ||
| 4524 | return 0; | ||
| 4525 | if (unlikely(system_state != SYSTEM_RUNNING)) | ||
| 4526 | return 0; | ||
| 4527 | return 1; | ||
| 4528 | } | ||
| 4529 | |||
| 4530 | static void __cond_resched(void) | 4620 | static void __cond_resched(void) |
| 4531 | { | 4621 | { |
| 4532 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 4622 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
| @@ -4546,7 +4636,8 @@ static void __cond_resched(void) | |||
| 4546 | 4636 | ||
| 4547 | int __sched cond_resched(void) | 4637 | int __sched cond_resched(void) |
| 4548 | { | 4638 | { |
| 4549 | if (need_resched() && __resched_legal(0)) { | 4639 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
| 4640 | system_state == SYSTEM_RUNNING) { | ||
| 4550 | __cond_resched(); | 4641 | __cond_resched(); |
| 4551 | return 1; | 4642 | return 1; |
| 4552 | } | 4643 | } |
| @@ -4572,7 +4663,7 @@ int cond_resched_lock(spinlock_t *lock) | |||
| 4572 | ret = 1; | 4663 | ret = 1; |
| 4573 | spin_lock(lock); | 4664 | spin_lock(lock); |
| 4574 | } | 4665 | } |
| 4575 | if (need_resched() && __resched_legal(1)) { | 4666 | if (need_resched() && system_state == SYSTEM_RUNNING) { |
| 4576 | spin_release(&lock->dep_map, 1, _THIS_IP_); | 4667 | spin_release(&lock->dep_map, 1, _THIS_IP_); |
| 4577 | _raw_spin_unlock(lock); | 4668 | _raw_spin_unlock(lock); |
| 4578 | preempt_enable_no_resched(); | 4669 | preempt_enable_no_resched(); |
| @@ -4588,7 +4679,7 @@ int __sched cond_resched_softirq(void) | |||
| 4588 | { | 4679 | { |
| 4589 | BUG_ON(!in_softirq()); | 4680 | BUG_ON(!in_softirq()); |
| 4590 | 4681 | ||
| 4591 | if (need_resched() && __resched_legal(0)) { | 4682 | if (need_resched() && system_state == SYSTEM_RUNNING) { |
| 4592 | raw_local_irq_disable(); | 4683 | raw_local_irq_disable(); |
| 4593 | _local_bh_enable(); | 4684 | _local_bh_enable(); |
| 4594 | raw_local_irq_enable(); | 4685 | raw_local_irq_enable(); |
| @@ -4804,18 +4895,18 @@ static void show_task(struct task_struct *p) | |||
| 4804 | show_stack(p, NULL); | 4895 | show_stack(p, NULL); |
| 4805 | } | 4896 | } |
| 4806 | 4897 | ||
| 4807 | void show_state(void) | 4898 | void show_state_filter(unsigned long state_filter) |
| 4808 | { | 4899 | { |
| 4809 | struct task_struct *g, *p; | 4900 | struct task_struct *g, *p; |
| 4810 | 4901 | ||
| 4811 | #if (BITS_PER_LONG == 32) | 4902 | #if (BITS_PER_LONG == 32) |
| 4812 | printk("\n" | 4903 | printk("\n" |
| 4813 | " sibling\n"); | 4904 | " free sibling\n"); |
| 4814 | printk(" task PC pid father child younger older\n"); | 4905 | printk(" task PC stack pid father child younger older\n"); |
| 4815 | #else | 4906 | #else |
| 4816 | printk("\n" | 4907 | printk("\n" |
| 4817 | " sibling\n"); | 4908 | " free sibling\n"); |
| 4818 | printk(" task PC pid father child younger older\n"); | 4909 | printk(" task PC stack pid father child younger older\n"); |
| 4819 | #endif | 4910 | #endif |
| 4820 | read_lock(&tasklist_lock); | 4911 | read_lock(&tasklist_lock); |
| 4821 | do_each_thread(g, p) { | 4912 | do_each_thread(g, p) { |
| @@ -4824,11 +4915,16 @@ void show_state(void) | |||
| 4824 | * console might take alot of time: | 4915 | * console might take alot of time: |
| 4825 | */ | 4916 | */ |
| 4826 | touch_nmi_watchdog(); | 4917 | touch_nmi_watchdog(); |
| 4827 | show_task(p); | 4918 | if (p->state & state_filter) |
| 4919 | show_task(p); | ||
| 4828 | } while_each_thread(g, p); | 4920 | } while_each_thread(g, p); |
| 4829 | 4921 | ||
| 4830 | read_unlock(&tasklist_lock); | 4922 | read_unlock(&tasklist_lock); |
| 4831 | debug_show_all_locks(); | 4923 | /* |
| 4924 | * Only show locks if all tasks are dumped: | ||
| 4925 | */ | ||
| 4926 | if (state_filter == -1) | ||
| 4927 | debug_show_all_locks(); | ||
| 4832 | } | 4928 | } |
| 4833 | 4929 | ||
| 4834 | /** | 4930 | /** |
| @@ -4973,8 +5069,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
| 4973 | * afterwards, and pretending it was a local activate. | 5069 | * afterwards, and pretending it was a local activate. |
| 4974 | * This way is cleaner and logically correct. | 5070 | * This way is cleaner and logically correct. |
| 4975 | */ | 5071 | */ |
| 4976 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 5072 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp |
| 4977 | + rq_dest->timestamp_last_tick; | 5073 | + rq_dest->most_recent_timestamp; |
| 4978 | deactivate_task(p, rq_src); | 5074 | deactivate_task(p, rq_src); |
| 4979 | __activate_task(p, rq_dest); | 5075 | __activate_task(p, rq_dest); |
| 4980 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 5076 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
| @@ -5050,7 +5146,10 @@ wait_to_die: | |||
| 5050 | } | 5146 | } |
| 5051 | 5147 | ||
| 5052 | #ifdef CONFIG_HOTPLUG_CPU | 5148 | #ifdef CONFIG_HOTPLUG_CPU |
| 5053 | /* Figure out where task on dead CPU should go, use force if necessary. */ | 5149 | /* |
| 5150 | * Figure out where task on dead CPU should go, use force if necessary. | ||

| 5151 | * NOTE: interrupts should be disabled by the caller | ||
| 5152 | */ | ||
| 5054 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5153 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
| 5055 | { | 5154 | { |
| 5056 | unsigned long flags; | 5155 | unsigned long flags; |
| @@ -5170,6 +5269,7 @@ void idle_task_exit(void) | |||
| 5170 | mmdrop(mm); | 5269 | mmdrop(mm); |
| 5171 | } | 5270 | } |
| 5172 | 5271 | ||
| 5272 | /* called under rq->lock with disabled interrupts */ | ||
| 5173 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | 5273 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
| 5174 | { | 5274 | { |
| 5175 | struct rq *rq = cpu_rq(dead_cpu); | 5275 | struct rq *rq = cpu_rq(dead_cpu); |
| @@ -5186,10 +5286,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
| 5186 | * Drop lock around migration; if someone else moves it, | 5286 | * Drop lock around migration; if someone else moves it, |
| 5187 | * that's OK. No task can be added to this CPU, so iteration is | 5287 | * that's OK. No task can be added to this CPU, so iteration is |
| 5188 | * fine. | 5288 | * fine. |
| 5289 | * NOTE: interrupts should be left disabled --dev@ | ||
| 5189 | */ | 5290 | */ |
| 5190 | spin_unlock_irq(&rq->lock); | 5291 | spin_unlock(&rq->lock); |
| 5191 | move_task_off_dead_cpu(dead_cpu, p); | 5292 | move_task_off_dead_cpu(dead_cpu, p); |
| 5192 | spin_lock_irq(&rq->lock); | 5293 | spin_lock(&rq->lock); |
| 5193 | 5294 | ||
| 5194 | put_task_struct(p); | 5295 | put_task_struct(p); |
| 5195 | } | 5296 | } |
| @@ -5342,16 +5443,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5342 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 5443 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
| 5343 | printk("does not load-balance\n"); | 5444 | printk("does not load-balance\n"); |
| 5344 | if (sd->parent) | 5445 | if (sd->parent) |
| 5345 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); | 5446 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
| 5447 | " has parent"); | ||
| 5346 | break; | 5448 | break; |
| 5347 | } | 5449 | } |
| 5348 | 5450 | ||
| 5349 | printk("span %s\n", str); | 5451 | printk("span %s\n", str); |
| 5350 | 5452 | ||
| 5351 | if (!cpu_isset(cpu, sd->span)) | 5453 | if (!cpu_isset(cpu, sd->span)) |
| 5352 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); | 5454 | printk(KERN_ERR "ERROR: domain->span does not contain " |
| 5455 | "CPU%d\n", cpu); | ||
| 5353 | if (!cpu_isset(cpu, group->cpumask)) | 5456 | if (!cpu_isset(cpu, group->cpumask)) |
| 5354 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); | 5457 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
| 5458 | " CPU%d\n", cpu); | ||
| 5355 | 5459 | ||
| 5356 | printk(KERN_DEBUG); | 5460 | printk(KERN_DEBUG); |
| 5357 | for (i = 0; i < level + 2; i++) | 5461 | for (i = 0; i < level + 2; i++) |
| @@ -5366,7 +5470,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5366 | 5470 | ||
| 5367 | if (!group->cpu_power) { | 5471 | if (!group->cpu_power) { |
| 5368 | printk("\n"); | 5472 | printk("\n"); |
| 5369 | printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); | 5473 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 5474 | "set\n"); | ||
| 5370 | } | 5475 | } |
| 5371 | 5476 | ||
| 5372 | if (!cpus_weight(group->cpumask)) { | 5477 | if (!cpus_weight(group->cpumask)) { |
| @@ -5389,15 +5494,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5389 | printk("\n"); | 5494 | printk("\n"); |
| 5390 | 5495 | ||
| 5391 | if (!cpus_equal(sd->span, groupmask)) | 5496 | if (!cpus_equal(sd->span, groupmask)) |
| 5392 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 5497 | printk(KERN_ERR "ERROR: groups don't span " |
| 5498 | "domain->span\n"); | ||
| 5393 | 5499 | ||
| 5394 | level++; | 5500 | level++; |
| 5395 | sd = sd->parent; | 5501 | sd = sd->parent; |
| 5502 | if (!sd) | ||
| 5503 | continue; | ||
| 5396 | 5504 | ||
| 5397 | if (sd) { | 5505 | if (!cpus_subset(groupmask, sd->span)) |
| 5398 | if (!cpus_subset(groupmask, sd->span)) | 5506 | printk(KERN_ERR "ERROR: parent span is not a superset " |
| 5399 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); | 5507 | "of domain->span\n"); |
| 5400 | } | ||
| 5401 | 5508 | ||
| 5402 | } while (sd); | 5509 | } while (sd); |
| 5403 | } | 5510 | } |
| @@ -5493,7 +5600,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
| 5493 | } | 5600 | } |
| 5494 | 5601 | ||
| 5495 | /* cpus with isolated domains */ | 5602 | /* cpus with isolated domains */ |
| 5496 | static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE; | 5603 | static cpumask_t cpu_isolated_map = CPU_MASK_NONE; |
| 5497 | 5604 | ||
| 5498 | /* Setup the mask of cpus configured for isolated domains */ | 5605 | /* Setup the mask of cpus configured for isolated domains */ |
| 5499 | static int __init isolated_cpu_setup(char *str) | 5606 | static int __init isolated_cpu_setup(char *str) |
| @@ -5511,28 +5618,27 @@ static int __init isolated_cpu_setup(char *str) | |||
| 5511 | __setup ("isolcpus=", isolated_cpu_setup); | 5618 | __setup ("isolcpus=", isolated_cpu_setup); |
| 5512 | 5619 | ||
| 5513 | /* | 5620 | /* |
| 5514 | * init_sched_build_groups takes an array of groups, the cpumask we wish | 5621 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
| 5515 | * to span, and a pointer to a function which identifies what group a CPU | 5622 | * to a function which identifies what group(along with sched group) a CPU |
| 5516 | * belongs to. The return value of group_fn must be a valid index into the | 5623 | * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS |
| 5517 | * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we | 5624 | * (due to the fact that we keep track of groups covered with a cpumask_t). |
| 5518 | * keep track of groups covered with a cpumask_t). | ||
| 5519 | * | 5625 | * |
| 5520 | * init_sched_build_groups will build a circular linked list of the groups | 5626 | * init_sched_build_groups will build a circular linked list of the groups |
| 5521 | * covered by the given span, and will set each group's ->cpumask correctly, | 5627 | * covered by the given span, and will set each group's ->cpumask correctly, |
| 5522 | * and ->cpu_power to 0. | 5628 | * and ->cpu_power to 0. |
| 5523 | */ | 5629 | */ |
| 5524 | static void | 5630 | static void |
| 5525 | init_sched_build_groups(struct sched_group groups[], cpumask_t span, | 5631 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, |
| 5526 | const cpumask_t *cpu_map, | 5632 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
| 5527 | int (*group_fn)(int cpu, const cpumask_t *cpu_map)) | 5633 | struct sched_group **sg)) |
| 5528 | { | 5634 | { |
| 5529 | struct sched_group *first = NULL, *last = NULL; | 5635 | struct sched_group *first = NULL, *last = NULL; |
| 5530 | cpumask_t covered = CPU_MASK_NONE; | 5636 | cpumask_t covered = CPU_MASK_NONE; |
| 5531 | int i; | 5637 | int i; |
| 5532 | 5638 | ||
| 5533 | for_each_cpu_mask(i, span) { | 5639 | for_each_cpu_mask(i, span) { |
| 5534 | int group = group_fn(i, cpu_map); | 5640 | struct sched_group *sg; |
| 5535 | struct sched_group *sg = &groups[group]; | 5641 | int group = group_fn(i, cpu_map, &sg); |
| 5536 | int j; | 5642 | int j; |
| 5537 | 5643 | ||
| 5538 | if (cpu_isset(i, covered)) | 5644 | if (cpu_isset(i, covered)) |
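The group_fn callback contract changes here: it still returns the group index, but when a non-NULL struct sched_group ** is supplied it also hands back the group object, which is what lets the NR_CPUS-sized static group arrays become per-CPU data later in the patch. A toy model of that callback shape (the cpu/2 pairing is an arbitrary example, not how real topologies are derived):

    struct group { int id; };

    #define NCPUS 8
    static struct group groups[NCPUS];   /* stand-in for the per-cpu sched groups */

    /* Mirror of the new callback shape: always return the group index,
     * optionally hand back the group object through *sg. */
    static int cpu_to_group(int cpu, struct group **sg)
    {
        int grp = cpu / 2;                /* assumed pairing, e.g. SMT siblings */

        if (sg)
            *sg = &groups[grp];
        return grp;
    }

Callers that only need the index (as in the covered-CPU scan above) pass NULL and skip the pointer assignment entirely.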
| @@ -5542,7 +5648,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
| 5542 | sg->cpu_power = 0; | 5648 | sg->cpu_power = 0; |
| 5543 | 5649 | ||
| 5544 | for_each_cpu_mask(j, span) { | 5650 | for_each_cpu_mask(j, span) { |
| 5545 | if (group_fn(j, cpu_map) != group) | 5651 | if (group_fn(j, cpu_map, NULL) != group) |
| 5546 | continue; | 5652 | continue; |
| 5547 | 5653 | ||
| 5548 | cpu_set(j, covered); | 5654 | cpu_set(j, covered); |
| @@ -5716,8 +5822,9 @@ __setup("max_cache_size=", setup_max_cache_size); | |||
| 5716 | */ | 5822 | */ |
| 5717 | static void touch_cache(void *__cache, unsigned long __size) | 5823 | static void touch_cache(void *__cache, unsigned long __size) |
| 5718 | { | 5824 | { |
| 5719 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | 5825 | unsigned long size = __size / sizeof(long); |
| 5720 | chunk2 = 2*size/3; | 5826 | unsigned long chunk1 = size / 3; |
| 5827 | unsigned long chunk2 = 2 * size / 3; | ||
| 5721 | unsigned long *cache = __cache; | 5828 | unsigned long *cache = __cache; |
| 5722 | int i; | 5829 | int i; |
| 5723 | 5830 | ||
| @@ -5826,11 +5933,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
| 5826 | */ | 5933 | */ |
| 5827 | measure_one(cache, size, cpu1, cpu2); | 5934 | measure_one(cache, size, cpu1, cpu2); |
| 5828 | for (i = 0; i < ITERATIONS; i++) | 5935 | for (i = 0; i < ITERATIONS; i++) |
| 5829 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | 5936 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); |
| 5830 | 5937 | ||
| 5831 | measure_one(cache, size, cpu2, cpu1); | 5938 | measure_one(cache, size, cpu2, cpu1); |
| 5832 | for (i = 0; i < ITERATIONS; i++) | 5939 | for (i = 0; i < ITERATIONS; i++) |
| 5833 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | 5940 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); |
| 5834 | 5941 | ||
| 5835 | /* | 5942 | /* |
| 5836 | * (We measure the non-migrating [cached] cost on both | 5943 | * (We measure the non-migrating [cached] cost on both |
| @@ -5840,17 +5947,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
| 5840 | 5947 | ||
| 5841 | measure_one(cache, size, cpu1, cpu1); | 5948 | measure_one(cache, size, cpu1, cpu1); |
| 5842 | for (i = 0; i < ITERATIONS; i++) | 5949 | for (i = 0; i < ITERATIONS; i++) |
| 5843 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | 5950 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); |
| 5844 | 5951 | ||
| 5845 | measure_one(cache, size, cpu2, cpu2); | 5952 | measure_one(cache, size, cpu2, cpu2); |
| 5846 | for (i = 0; i < ITERATIONS; i++) | 5953 | for (i = 0; i < ITERATIONS; i++) |
| 5847 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | 5954 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); |
| 5848 | 5955 | ||
| 5849 | /* | 5956 | /* |
| 5850 | * Get the per-iteration migration cost: | 5957 | * Get the per-iteration migration cost: |
| 5851 | */ | 5958 | */ |
| 5852 | do_div(cost1, 2*ITERATIONS); | 5959 | do_div(cost1, 2 * ITERATIONS); |
| 5853 | do_div(cost2, 2*ITERATIONS); | 5960 | do_div(cost2, 2 * ITERATIONS); |
| 5854 | 5961 | ||
| 5855 | return cost1 - cost2; | 5962 | return cost1 - cost2; |
| 5856 | } | 5963 | } |
| @@ -5888,7 +5995,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
| 5888 | */ | 5995 | */ |
| 5889 | cache = vmalloc(max_size); | 5996 | cache = vmalloc(max_size); |
| 5890 | if (!cache) { | 5997 | if (!cache) { |
| 5891 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 5998 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); |
| 5892 | return 1000000; /* return 1 msec on very small boxen */ | 5999 | return 1000000; /* return 1 msec on very small boxen */ |
| 5893 | } | 6000 | } |
| 5894 | 6001 | ||
| @@ -5913,7 +6020,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
| 5913 | avg_fluct = (avg_fluct + fluct)/2; | 6020 | avg_fluct = (avg_fluct + fluct)/2; |
| 5914 | 6021 | ||
| 5915 | if (migration_debug) | 6022 | if (migration_debug) |
| 5916 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | 6023 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " |
| 6024 | "(%8Ld %8Ld)\n", | ||
| 5917 | cpu1, cpu2, size, | 6025 | cpu1, cpu2, size, |
| 5918 | (long)cost / 1000000, | 6026 | (long)cost / 1000000, |
| 5919 | ((long)cost / 100000) % 10, | 6027 | ((long)cost / 100000) % 10, |
| @@ -6008,20 +6116,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) | |||
| 6008 | -1 | 6116 | -1 |
| 6009 | #endif | 6117 | #endif |
| 6010 | ); | 6118 | ); |
| 6011 | if (system_state == SYSTEM_BOOTING) { | 6119 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { |
| 6012 | if (num_online_cpus() > 1) { | 6120 | printk("migration_cost="); |
| 6013 | printk("migration_cost="); | 6121 | for (distance = 0; distance <= max_distance; distance++) { |
| 6014 | for (distance = 0; distance <= max_distance; distance++) { | 6122 | if (distance) |
| 6015 | if (distance) | 6123 | printk(","); |
| 6016 | printk(","); | 6124 | printk("%ld", (long)migration_cost[distance] / 1000); |
| 6017 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
| 6018 | } | ||
| 6019 | printk("\n"); | ||
| 6020 | } | 6125 | } |
| 6126 | printk("\n"); | ||
| 6021 | } | 6127 | } |
| 6022 | j1 = jiffies; | 6128 | j1 = jiffies; |
| 6023 | if (migration_debug) | 6129 | if (migration_debug) |
| 6024 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | 6130 | printk("migration: %ld seconds\n", (j1-j0) / HZ); |
| 6025 | 6131 | ||
| 6026 | /* | 6132 | /* |
| 6027 | * Move back to the original CPU. NUMA-Q gets confused | 6133 | * Move back to the original CPU. NUMA-Q gets confused |
| @@ -6118,10 +6224,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | |||
| 6118 | */ | 6224 | */ |
| 6119 | #ifdef CONFIG_SCHED_SMT | 6225 | #ifdef CONFIG_SCHED_SMT |
| 6120 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6226 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
| 6121 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6227 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
| 6122 | 6228 | ||
| 6123 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | 6229 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, |
| 6230 | struct sched_group **sg) | ||
| 6124 | { | 6231 | { |
| 6232 | if (sg) | ||
| 6233 | *sg = &per_cpu(sched_group_cpus, cpu); | ||
| 6125 | return cpu; | 6234 | return cpu; |
| 6126 | } | 6235 | } |
| 6127 | #endif | 6236 | #endif |
| @@ -6131,39 +6240,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | |||
| 6131 | */ | 6240 | */ |
| 6132 | #ifdef CONFIG_SCHED_MC | 6241 | #ifdef CONFIG_SCHED_MC |
| 6133 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6242 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
| 6134 | static struct sched_group sched_group_core[NR_CPUS]; | 6243 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
| 6135 | #endif | 6244 | #endif |
| 6136 | 6245 | ||
| 6137 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6246 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
| 6138 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6247 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
| 6248 | struct sched_group **sg) | ||
| 6139 | { | 6249 | { |
| 6250 | int group; | ||
| 6140 | cpumask_t mask = cpu_sibling_map[cpu]; | 6251 | cpumask_t mask = cpu_sibling_map[cpu]; |
| 6141 | cpus_and(mask, mask, *cpu_map); | 6252 | cpus_and(mask, mask, *cpu_map); |
| 6142 | return first_cpu(mask); | 6253 | group = first_cpu(mask); |
| 6254 | if (sg) | ||
| 6255 | *sg = &per_cpu(sched_group_core, group); | ||
| 6256 | return group; | ||
| 6143 | } | 6257 | } |
| 6144 | #elif defined(CONFIG_SCHED_MC) | 6258 | #elif defined(CONFIG_SCHED_MC) |
| 6145 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6259 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
| 6260 | struct sched_group **sg) | ||
| 6146 | { | 6261 | { |
| 6262 | if (sg) | ||
| 6263 | *sg = &per_cpu(sched_group_core, cpu); | ||
| 6147 | return cpu; | 6264 | return cpu; |
| 6148 | } | 6265 | } |
| 6149 | #endif | 6266 | #endif |
| 6150 | 6267 | ||
| 6151 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6268 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
| 6152 | static struct sched_group sched_group_phys[NR_CPUS]; | 6269 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
| 6153 | 6270 | ||
| 6154 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) | 6271 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, |
| 6272 | struct sched_group **sg) | ||
| 6155 | { | 6273 | { |
| 6274 | int group; | ||
| 6156 | #ifdef CONFIG_SCHED_MC | 6275 | #ifdef CONFIG_SCHED_MC |
| 6157 | cpumask_t mask = cpu_coregroup_map(cpu); | 6276 | cpumask_t mask = cpu_coregroup_map(cpu); |
| 6158 | cpus_and(mask, mask, *cpu_map); | 6277 | cpus_and(mask, mask, *cpu_map); |
| 6159 | return first_cpu(mask); | 6278 | group = first_cpu(mask); |
| 6160 | #elif defined(CONFIG_SCHED_SMT) | 6279 | #elif defined(CONFIG_SCHED_SMT) |
| 6161 | cpumask_t mask = cpu_sibling_map[cpu]; | 6280 | cpumask_t mask = cpu_sibling_map[cpu]; |
| 6162 | cpus_and(mask, mask, *cpu_map); | 6281 | cpus_and(mask, mask, *cpu_map); |
| 6163 | return first_cpu(mask); | 6282 | group = first_cpu(mask); |
| 6164 | #else | 6283 | #else |
| 6165 | return cpu; | 6284 | group = cpu; |
| 6166 | #endif | 6285 | #endif |
| 6286 | if (sg) | ||
| 6287 | *sg = &per_cpu(sched_group_phys, group); | ||
| 6288 | return group; | ||
| 6167 | } | 6289 | } |
| 6168 | 6290 | ||
| 6169 | #ifdef CONFIG_NUMA | 6291 | #ifdef CONFIG_NUMA |
| @@ -6176,12 +6298,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); | |||
| 6176 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 6298 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
| 6177 | 6299 | ||
| 6178 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 6300 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
| 6179 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; | 6301 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
| 6180 | 6302 | ||
| 6181 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) | 6303 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
| 6304 | struct sched_group **sg) | ||
| 6182 | { | 6305 | { |
| 6183 | return cpu_to_node(cpu); | 6306 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); |
| 6307 | int group; | ||
| 6308 | |||
| 6309 | cpus_and(nodemask, nodemask, *cpu_map); | ||
| 6310 | group = first_cpu(nodemask); | ||
| 6311 | |||
| 6312 | if (sg) | ||
| 6313 | *sg = &per_cpu(sched_group_allnodes, group); | ||
| 6314 | return group; | ||
| 6184 | } | 6315 | } |
| 6316 | |||
| 6185 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 6317 | static void init_numa_sched_groups_power(struct sched_group *group_head) |
| 6186 | { | 6318 | { |
| 6187 | struct sched_group *sg = group_head; | 6319 | struct sched_group *sg = group_head; |
| @@ -6217,16 +6349,9 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
| 6217 | int cpu, i; | 6349 | int cpu, i; |
| 6218 | 6350 | ||
| 6219 | for_each_cpu_mask(cpu, *cpu_map) { | 6351 | for_each_cpu_mask(cpu, *cpu_map) { |
| 6220 | struct sched_group *sched_group_allnodes | ||
| 6221 | = sched_group_allnodes_bycpu[cpu]; | ||
| 6222 | struct sched_group **sched_group_nodes | 6352 | struct sched_group **sched_group_nodes |
| 6223 | = sched_group_nodes_bycpu[cpu]; | 6353 | = sched_group_nodes_bycpu[cpu]; |
| 6224 | 6354 | ||
| 6225 | if (sched_group_allnodes) { | ||
| 6226 | kfree(sched_group_allnodes); | ||
| 6227 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
| 6228 | } | ||
| 6229 | |||
| 6230 | if (!sched_group_nodes) | 6355 | if (!sched_group_nodes) |
| 6231 | continue; | 6356 | continue; |
| 6232 | 6357 | ||
| @@ -6320,7 +6445,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6320 | struct sched_domain *sd; | 6445 | struct sched_domain *sd; |
| 6321 | #ifdef CONFIG_NUMA | 6446 | #ifdef CONFIG_NUMA |
| 6322 | struct sched_group **sched_group_nodes = NULL; | 6447 | struct sched_group **sched_group_nodes = NULL; |
| 6323 | struct sched_group *sched_group_allnodes = NULL; | 6448 | int sd_allnodes = 0; |
| 6324 | 6449 | ||
| 6325 | /* | 6450 | /* |
| 6326 | * Allocate the per-node list of sched groups | 6451 | * Allocate the per-node list of sched groups |
| @@ -6338,7 +6463,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6338 | * Set up domains for cpus specified by the cpu_map. | 6463 | * Set up domains for cpus specified by the cpu_map. |
| 6339 | */ | 6464 | */ |
| 6340 | for_each_cpu_mask(i, *cpu_map) { | 6465 | for_each_cpu_mask(i, *cpu_map) { |
| 6341 | int group; | ||
| 6342 | struct sched_domain *sd = NULL, *p; | 6466 | struct sched_domain *sd = NULL, *p; |
| 6343 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 6467 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
| 6344 | 6468 | ||
| @@ -6347,26 +6471,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6347 | #ifdef CONFIG_NUMA | 6471 | #ifdef CONFIG_NUMA |
| 6348 | if (cpus_weight(*cpu_map) | 6472 | if (cpus_weight(*cpu_map) |
| 6349 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 6473 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
| 6350 | if (!sched_group_allnodes) { | ||
| 6351 | sched_group_allnodes | ||
| 6352 | = kmalloc_node(sizeof(struct sched_group) | ||
| 6353 | * MAX_NUMNODES, | ||
| 6354 | GFP_KERNEL, | ||
| 6355 | cpu_to_node(i)); | ||
| 6356 | if (!sched_group_allnodes) { | ||
| 6357 | printk(KERN_WARNING | ||
| 6358 | "Can not alloc allnodes sched group\n"); | ||
| 6359 | goto error; | ||
| 6360 | } | ||
| 6361 | sched_group_allnodes_bycpu[i] | ||
| 6362 | = sched_group_allnodes; | ||
| 6363 | } | ||
| 6364 | sd = &per_cpu(allnodes_domains, i); | 6474 | sd = &per_cpu(allnodes_domains, i); |
| 6365 | *sd = SD_ALLNODES_INIT; | 6475 | *sd = SD_ALLNODES_INIT; |
| 6366 | sd->span = *cpu_map; | 6476 | sd->span = *cpu_map; |
| 6367 | group = cpu_to_allnodes_group(i, cpu_map); | 6477 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); |
| 6368 | sd->groups = &sched_group_allnodes[group]; | ||
| 6369 | p = sd; | 6478 | p = sd; |
| 6479 | sd_allnodes = 1; | ||
| 6370 | } else | 6480 | } else |
| 6371 | p = NULL; | 6481 | p = NULL; |
| 6372 | 6482 | ||
| @@ -6381,36 +6491,33 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6381 | 6491 | ||
| 6382 | p = sd; | 6492 | p = sd; |
| 6383 | sd = &per_cpu(phys_domains, i); | 6493 | sd = &per_cpu(phys_domains, i); |
| 6384 | group = cpu_to_phys_group(i, cpu_map); | ||
| 6385 | *sd = SD_CPU_INIT; | 6494 | *sd = SD_CPU_INIT; |
| 6386 | sd->span = nodemask; | 6495 | sd->span = nodemask; |
| 6387 | sd->parent = p; | 6496 | sd->parent = p; |
| 6388 | if (p) | 6497 | if (p) |
| 6389 | p->child = sd; | 6498 | p->child = sd; |
| 6390 | sd->groups = &sched_group_phys[group]; | 6499 | cpu_to_phys_group(i, cpu_map, &sd->groups); |
| 6391 | 6500 | ||
| 6392 | #ifdef CONFIG_SCHED_MC | 6501 | #ifdef CONFIG_SCHED_MC |
| 6393 | p = sd; | 6502 | p = sd; |
| 6394 | sd = &per_cpu(core_domains, i); | 6503 | sd = &per_cpu(core_domains, i); |
| 6395 | group = cpu_to_core_group(i, cpu_map); | ||
| 6396 | *sd = SD_MC_INIT; | 6504 | *sd = SD_MC_INIT; |
| 6397 | sd->span = cpu_coregroup_map(i); | 6505 | sd->span = cpu_coregroup_map(i); |
| 6398 | cpus_and(sd->span, sd->span, *cpu_map); | 6506 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6399 | sd->parent = p; | 6507 | sd->parent = p; |
| 6400 | p->child = sd; | 6508 | p->child = sd; |
| 6401 | sd->groups = &sched_group_core[group]; | 6509 | cpu_to_core_group(i, cpu_map, &sd->groups); |
| 6402 | #endif | 6510 | #endif |
| 6403 | 6511 | ||
| 6404 | #ifdef CONFIG_SCHED_SMT | 6512 | #ifdef CONFIG_SCHED_SMT |
| 6405 | p = sd; | 6513 | p = sd; |
| 6406 | sd = &per_cpu(cpu_domains, i); | 6514 | sd = &per_cpu(cpu_domains, i); |
| 6407 | group = cpu_to_cpu_group(i, cpu_map); | ||
| 6408 | *sd = SD_SIBLING_INIT; | 6515 | *sd = SD_SIBLING_INIT; |
| 6409 | sd->span = cpu_sibling_map[i]; | 6516 | sd->span = cpu_sibling_map[i]; |
| 6410 | cpus_and(sd->span, sd->span, *cpu_map); | 6517 | cpus_and(sd->span, sd->span, *cpu_map); |
| 6411 | sd->parent = p; | 6518 | sd->parent = p; |
| 6412 | p->child = sd; | 6519 | p->child = sd; |
| 6413 | sd->groups = &sched_group_cpus[group]; | 6520 | cpu_to_cpu_group(i, cpu_map, &sd->groups); |
| 6414 | #endif | 6521 | #endif |
| 6415 | } | 6522 | } |
| 6416 | 6523 | ||
| @@ -6422,8 +6529,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6422 | if (i != first_cpu(this_sibling_map)) | 6529 | if (i != first_cpu(this_sibling_map)) |
| 6423 | continue; | 6530 | continue; |
| 6424 | 6531 | ||
| 6425 | init_sched_build_groups(sched_group_cpus, this_sibling_map, | 6532 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); |
| 6426 | cpu_map, &cpu_to_cpu_group); | ||
| 6427 | } | 6533 | } |
| 6428 | #endif | 6534 | #endif |
| 6429 | 6535 | ||
| @@ -6434,8 +6540,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6434 | cpus_and(this_core_map, this_core_map, *cpu_map); | 6540 | cpus_and(this_core_map, this_core_map, *cpu_map); |
| 6435 | if (i != first_cpu(this_core_map)) | 6541 | if (i != first_cpu(this_core_map)) |
| 6436 | continue; | 6542 | continue; |
| 6437 | init_sched_build_groups(sched_group_core, this_core_map, | 6543 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); |
| 6438 | cpu_map, &cpu_to_core_group); | ||
| 6439 | } | 6544 | } |
| 6440 | #endif | 6545 | #endif |
| 6441 | 6546 | ||
| @@ -6448,15 +6553,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6448 | if (cpus_empty(nodemask)) | 6553 | if (cpus_empty(nodemask)) |
| 6449 | continue; | 6554 | continue; |
| 6450 | 6555 | ||
| 6451 | init_sched_build_groups(sched_group_phys, nodemask, | 6556 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); |
| 6452 | cpu_map, &cpu_to_phys_group); | ||
| 6453 | } | 6557 | } |
| 6454 | 6558 | ||
| 6455 | #ifdef CONFIG_NUMA | 6559 | #ifdef CONFIG_NUMA |
| 6456 | /* Set up node groups */ | 6560 | /* Set up node groups */ |
| 6457 | if (sched_group_allnodes) | 6561 | if (sd_allnodes) |
| 6458 | init_sched_build_groups(sched_group_allnodes, *cpu_map, | 6562 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); |
| 6459 | cpu_map, &cpu_to_allnodes_group); | ||
| 6460 | 6563 | ||
| 6461 | for (i = 0; i < MAX_NUMNODES; i++) { | 6564 | for (i = 0; i < MAX_NUMNODES; i++) { |
| 6462 | /* Set up node groups */ | 6565 | /* Set up node groups */ |
| @@ -6548,10 +6651,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
| 6548 | for (i = 0; i < MAX_NUMNODES; i++) | 6651 | for (i = 0; i < MAX_NUMNODES; i++) |
| 6549 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6652 | init_numa_sched_groups_power(sched_group_nodes[i]); |
| 6550 | 6653 | ||
| 6551 | if (sched_group_allnodes) { | 6654 | if (sd_allnodes) { |
| 6552 | int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); | 6655 | struct sched_group *sg; |
| 6553 | struct sched_group *sg = &sched_group_allnodes[group]; | ||
| 6554 | 6656 | ||
| 6657 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | ||
| 6555 | init_numa_sched_groups_power(sg); | 6658 | init_numa_sched_groups_power(sg); |
| 6556 | } | 6659 | } |
| 6557 | #endif | 6660 | #endif |
| @@ -6723,8 +6826,6 @@ SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | |||
| 6723 | sched_smt_power_savings_store); | 6826 | sched_smt_power_savings_store); |
| 6724 | #endif | 6827 | #endif |
| 6725 | 6828 | ||
| 6726 | |||
| 6727 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 6728 | /* | 6829 | /* |
| 6729 | * Force a reinitialization of the sched domains hierarchy. The domains | 6830 | * Force a reinitialization of the sched domains hierarchy. The domains |
| 6730 | * and groups cannot be updated in place without racing with the balancing | 6831 | * and groups cannot be updated in place without racing with the balancing |
| @@ -6757,7 +6858,6 @@ static int update_sched_domains(struct notifier_block *nfb, | |||
| 6757 | 6858 | ||
| 6758 | return NOTIFY_OK; | 6859 | return NOTIFY_OK; |
| 6759 | } | 6860 | } |
| 6760 | #endif | ||
| 6761 | 6861 | ||
| 6762 | void __init sched_init_smp(void) | 6862 | void __init sched_init_smp(void) |
| 6763 | { | 6863 | { |
| @@ -6765,7 +6865,7 @@ void __init sched_init_smp(void) | |||
| 6765 | 6865 | ||
| 6766 | lock_cpu_hotplug(); | 6866 | lock_cpu_hotplug(); |
| 6767 | arch_init_sched_domains(&cpu_online_map); | 6867 | arch_init_sched_domains(&cpu_online_map); |
| 6768 | cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map); | 6868 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
| 6769 | if (cpus_empty(non_isolated_cpus)) | 6869 | if (cpus_empty(non_isolated_cpus)) |
| 6770 | cpu_set(smp_processor_id(), non_isolated_cpus); | 6870 | cpu_set(smp_processor_id(), non_isolated_cpus); |
| 6771 | unlock_cpu_hotplug(); | 6871 | unlock_cpu_hotplug(); |
| @@ -6833,6 +6933,10 @@ void __init sched_init(void) | |||
| 6833 | 6933 | ||
| 6834 | set_load_weight(&init_task); | 6934 | set_load_weight(&init_task); |
| 6835 | 6935 | ||
| 6936 | #ifdef CONFIG_SMP | ||
| 6937 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | ||
| 6938 | #endif | ||
| 6939 | |||
| 6836 | #ifdef CONFIG_RT_MUTEXES | 6940 | #ifdef CONFIG_RT_MUTEXES |
| 6837 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 6941 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); |
| 6838 | #endif | 6942 | #endif |
| @@ -6867,6 +6971,9 @@ void __might_sleep(char *file, int line) | |||
| 6867 | " context at %s:%d\n", file, line); | 6971 | " context at %s:%d\n", file, line); |
| 6868 | printk("in_atomic():%d, irqs_disabled():%d\n", | 6972 | printk("in_atomic():%d, irqs_disabled():%d\n", |
| 6869 | in_atomic(), irqs_disabled()); | 6973 | in_atomic(), irqs_disabled()); |
| 6974 | debug_show_held_locks(current); | ||
| 6975 | if (irqs_disabled()) | ||
| 6976 | print_irqtrace_events(current); | ||
| 6870 | dump_stack(); | 6977 | dump_stack(); |
| 6871 | } | 6978 | } |
| 6872 | #endif | 6979 | #endif |
diff --git a/kernel/signal.c b/kernel/signal.c index df18c167a2a7..5630255d2e2a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -23,6 +23,10 @@ | |||
| 23 | #include <linux/ptrace.h> | 23 | #include <linux/ptrace.h> |
| 24 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
| 25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
| 26 | #include <linux/freezer.h> | ||
| 27 | #include <linux/pid_namespace.h> | ||
| 28 | #include <linux/nsproxy.h> | ||
| 29 | |||
| 26 | #include <asm/param.h> | 30 | #include <asm/param.h> |
| 27 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
| 28 | #include <asm/unistd.h> | 32 | #include <asm/unistd.h> |
| @@ -33,7 +37,7 @@ | |||
| 33 | * SLAB caches for signal bits. | 37 | * SLAB caches for signal bits. |
| 34 | */ | 38 | */ |
| 35 | 39 | ||
| 36 | static kmem_cache_t *sigqueue_cachep; | 40 | static struct kmem_cache *sigqueue_cachep; |
| 37 | 41 | ||
| 38 | /* | 42 | /* |
| 39 | * In POSIX a signal is sent either to a specific thread (Linux task) | 43 | * In POSIX a signal is sent either to a specific thread (Linux task) |
| @@ -582,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
| 582 | error = -EPERM; | 586 | error = -EPERM; |
| 583 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) | 587 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
| 584 | && ((sig != SIGCONT) || | 588 | && ((sig != SIGCONT) || |
| 585 | (current->signal->session != t->signal->session)) | 589 | (process_session(current) != process_session(t))) |
| 586 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 590 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
| 587 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | 591 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) |
| 588 | && !capable(CAP_KILL)) | 592 | && !capable(CAP_KILL)) |
| @@ -1133,8 +1137,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) | |||
| 1133 | return error; | 1137 | return error; |
| 1134 | } | 1138 | } |
| 1135 | 1139 | ||
| 1136 | int | 1140 | static int kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
| 1137 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | ||
| 1138 | { | 1141 | { |
| 1139 | int error; | 1142 | int error; |
| 1140 | rcu_read_lock(); | 1143 | rcu_read_lock(); |
| @@ -1702,7 +1705,9 @@ finish_stop(int stop_count) | |||
| 1702 | read_unlock(&tasklist_lock); | 1705 | read_unlock(&tasklist_lock); |
| 1703 | } | 1706 | } |
| 1704 | 1707 | ||
| 1705 | schedule(); | 1708 | do { |
| 1709 | schedule(); | ||
| 1710 | } while (try_to_freeze()); | ||
| 1706 | /* | 1711 | /* |
| 1707 | * Now we don't run again until continued. | 1712 | * Now we don't run again until continued. |
| 1708 | */ | 1713 | */ |
| @@ -1877,8 +1882,12 @@ relock: | |||
| 1877 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ | 1882 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ |
| 1878 | continue; | 1883 | continue; |
| 1879 | 1884 | ||
| 1880 | /* Init gets no signals it doesn't want. */ | 1885 | /* |
| 1881 | if (current == child_reaper) | 1886 | * Init of a pid space gets no signals it doesn't want from |
| 1887 | * within that pid space. It can of course get signals from | ||
| 1888 | * its parent pid space. | ||
| 1889 | */ | ||
| 1890 | if (current == child_reaper(current)) | ||
| 1882 | continue; | 1891 | continue; |
| 1883 | 1892 | ||
| 1884 | if (sig_kernel_stop(signr)) { | 1893 | if (sig_kernel_stop(signr)) { |
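
The signal.c hunks replace direct `->signal->session` reads with a process_session() accessor. The helper itself is not shown in this diff (it lives in include/linux/sched.h), so the following is only a plausible sketch of its shape and of how the converted call sites use it.

```c
/*
 * Illustrative sketch only -- the real process_session() is defined in
 * include/linux/sched.h and may differ in the exact field it reads.
 */
static inline pid_t process_session(struct task_struct *tsk)
{
	return tsk->signal->__session;
}

/* Usage mirroring the converted permission check above: */
static inline int same_session(struct task_struct *a, struct task_struct *b)
{
	return process_session(a) == process_session(b);
}
```
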
diff --git a/kernel/softirq.c b/kernel/softirq.c index bf25015dce16..918e52df090e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -574,8 +574,6 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
| 574 | 574 | ||
| 575 | switch (action) { | 575 | switch (action) { |
| 576 | case CPU_UP_PREPARE: | 576 | case CPU_UP_PREPARE: |
| 577 | BUG_ON(per_cpu(tasklet_vec, hotcpu).list); | ||
| 578 | BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list); | ||
| 579 | p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); | 577 | p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); |
| 580 | if (IS_ERR(p)) { | 578 | if (IS_ERR(p)) { |
| 581 | printk("ksoftirqd for %i failed\n", hotcpu); | 579 | printk("ksoftirqd for %i failed\n", hotcpu); |
diff --git a/kernel/sys.c b/kernel/sys.c index 98489d82801b..c7675c1bfdf2 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -880,7 +880,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
| 880 | return 0; | 880 | return 0; |
| 881 | } | 881 | } |
| 882 | 882 | ||
| 883 | static void deferred_cad(void *dummy) | 883 | static void deferred_cad(struct work_struct *dummy) |
| 884 | { | 884 | { |
| 885 | kernel_restart(NULL); | 885 | kernel_restart(NULL); |
| 886 | } | 886 | } |
| @@ -892,7 +892,7 @@ static void deferred_cad(void *dummy) | |||
| 892 | */ | 892 | */ |
| 893 | void ctrl_alt_del(void) | 893 | void ctrl_alt_del(void) |
| 894 | { | 894 | { |
| 895 | static DECLARE_WORK(cad_work, deferred_cad, NULL); | 895 | static DECLARE_WORK(cad_work, deferred_cad); |
| 896 | 896 | ||
| 897 | if (C_A_D) | 897 | if (C_A_D) |
| 898 | schedule_work(&cad_work); | 898 | schedule_work(&cad_work); |
| @@ -1102,14 +1102,14 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid) | |||
| 1102 | asmlinkage long sys_setuid(uid_t uid) | 1102 | asmlinkage long sys_setuid(uid_t uid) |
| 1103 | { | 1103 | { |
| 1104 | int old_euid = current->euid; | 1104 | int old_euid = current->euid; |
| 1105 | int old_ruid, old_suid, new_ruid, new_suid; | 1105 | int old_ruid, old_suid, new_suid; |
| 1106 | int retval; | 1106 | int retval; |
| 1107 | 1107 | ||
| 1108 | retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); | 1108 | retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID); |
| 1109 | if (retval) | 1109 | if (retval) |
| 1110 | return retval; | 1110 | return retval; |
| 1111 | 1111 | ||
| 1112 | old_ruid = new_ruid = current->uid; | 1112 | old_ruid = current->uid; |
| 1113 | old_suid = current->suid; | 1113 | old_suid = current->suid; |
| 1114 | new_suid = old_suid; | 1114 | new_suid = old_suid; |
| 1115 | 1115 | ||
| @@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
| 1381 | 1381 | ||
| 1382 | if (p->real_parent == group_leader) { | 1382 | if (p->real_parent == group_leader) { |
| 1383 | err = -EPERM; | 1383 | err = -EPERM; |
| 1384 | if (p->signal->session != group_leader->signal->session) | 1384 | if (process_session(p) != process_session(group_leader)) |
| 1385 | goto out; | 1385 | goto out; |
| 1386 | err = -EACCES; | 1386 | err = -EACCES; |
| 1387 | if (p->did_exec) | 1387 | if (p->did_exec) |
| @@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
| 1397 | goto out; | 1397 | goto out; |
| 1398 | 1398 | ||
| 1399 | if (pgid != pid) { | 1399 | if (pgid != pid) { |
| 1400 | struct task_struct *p; | 1400 | struct task_struct *g = |
| 1401 | find_task_by_pid_type(PIDTYPE_PGID, pgid); | ||
| 1401 | 1402 | ||
| 1402 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | 1403 | if (!g || process_session(g) != process_session(group_leader)) |
| 1403 | if (p->signal->session == group_leader->signal->session) | 1404 | goto out; |
| 1404 | goto ok_pgid; | ||
| 1405 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | ||
| 1406 | goto out; | ||
| 1407 | } | 1405 | } |
| 1408 | 1406 | ||
| 1409 | ok_pgid: | ||
| 1410 | err = security_task_setpgid(p, pgid); | 1407 | err = security_task_setpgid(p, pgid); |
| 1411 | if (err) | 1408 | if (err) |
| 1412 | goto out; | 1409 | goto out; |
| @@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void) | |||
| 1459 | asmlinkage long sys_getsid(pid_t pid) | 1456 | asmlinkage long sys_getsid(pid_t pid) |
| 1460 | { | 1457 | { |
| 1461 | if (!pid) | 1458 | if (!pid) |
| 1462 | return current->signal->session; | 1459 | return process_session(current); |
| 1463 | else { | 1460 | else { |
| 1464 | int retval; | 1461 | int retval; |
| 1465 | struct task_struct *p; | 1462 | struct task_struct *p; |
| @@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid) | |||
| 1471 | if (p) { | 1468 | if (p) { |
| 1472 | retval = security_task_getsid(p); | 1469 | retval = security_task_getsid(p); |
| 1473 | if (!retval) | 1470 | if (!retval) |
| 1474 | retval = p->signal->session; | 1471 | retval = process_session(p); |
| 1475 | } | 1472 | } |
| 1476 | read_unlock(&tasklist_lock); | 1473 | read_unlock(&tasklist_lock); |
| 1477 | return retval; | 1474 | return retval; |
| @@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void) | |||
| 1484 | pid_t session; | 1481 | pid_t session; |
| 1485 | int err = -EPERM; | 1482 | int err = -EPERM; |
| 1486 | 1483 | ||
| 1487 | mutex_lock(&tty_mutex); | ||
| 1488 | write_lock_irq(&tasklist_lock); | 1484 | write_lock_irq(&tasklist_lock); |
| 1489 | 1485 | ||
| 1490 | /* Fail if I am already a session leader */ | 1486 | /* Fail if I am already a session leader */ |
| @@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void) | |||
| 1504 | 1500 | ||
| 1505 | group_leader->signal->leader = 1; | 1501 | group_leader->signal->leader = 1; |
| 1506 | __set_special_pids(session, session); | 1502 | __set_special_pids(session, session); |
| 1503 | |||
| 1504 | spin_lock(&group_leader->sighand->siglock); | ||
| 1507 | group_leader->signal->tty = NULL; | 1505 | group_leader->signal->tty = NULL; |
| 1508 | group_leader->signal->tty_old_pgrp = 0; | 1506 | group_leader->signal->tty_old_pgrp = 0; |
| 1507 | spin_unlock(&group_leader->sighand->siglock); | ||
| 1508 | |||
| 1509 | err = process_group(group_leader); | 1509 | err = process_group(group_leader); |
| 1510 | out: | 1510 | out: |
| 1511 | write_unlock_irq(&tasklist_lock); | 1511 | write_unlock_irq(&tasklist_lock); |
| 1512 | mutex_unlock(&tty_mutex); | ||
| 1513 | return err; | 1512 | return err; |
| 1514 | } | 1513 | } |
| 1515 | 1514 | ||
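
The ctrl_alt_del() hunk reflects the workqueue API change in this tree: DECLARE_WORK() no longer carries a data pointer and the handler receives the struct work_struct itself. deferred_cad() needs no per-work data, but the usual idiom when data is needed is to embed the work item in a containing structure and recover it with container_of(). A hedged sketch (struct my_ctx, my_handler and the payload field are hypothetical):

```c
#include <linux/workqueue.h>

struct my_ctx {
	struct work_struct work;
	int payload;			/* hypothetical per-work data */
};

static void my_handler(struct work_struct *work)
{
	/* recover the containing object from the work_struct pointer */
	struct my_ctx *ctx = container_of(work, struct my_ctx, work);

	pr_debug("payload=%d\n", ctx->payload);
}

static struct my_ctx my_ctx;

static void my_kick(void)
{
	/* INIT_WORK() now takes only (work, func), matching the
	 * two-argument DECLARE_WORK() used by ctrl_alt_del() above */
	INIT_WORK(&my_ctx.work, my_handler);
	my_ctx.payload = 42;
	schedule_work(&my_ctx.work);
}
```
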
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 09e569f4792b..600b33358ded 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -54,6 +54,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, | |||
| 54 | 54 | ||
| 55 | #ifdef CONFIG_X86 | 55 | #ifdef CONFIG_X86 |
| 56 | #include <asm/nmi.h> | 56 | #include <asm/nmi.h> |
| 57 | #include <asm/stacktrace.h> | ||
| 57 | #endif | 58 | #endif |
| 58 | 59 | ||
| 59 | #if defined(CONFIG_SYSCTL) | 60 | #if defined(CONFIG_SYSCTL) |
| @@ -64,7 +65,6 @@ extern int sysctl_overcommit_memory; | |||
| 64 | extern int sysctl_overcommit_ratio; | 65 | extern int sysctl_overcommit_ratio; |
| 65 | extern int sysctl_panic_on_oom; | 66 | extern int sysctl_panic_on_oom; |
| 66 | extern int max_threads; | 67 | extern int max_threads; |
| 67 | extern int sysrq_enabled; | ||
| 68 | extern int core_uses_pid; | 68 | extern int core_uses_pid; |
| 69 | extern int suid_dumpable; | 69 | extern int suid_dumpable; |
| 70 | extern char core_pattern[]; | 70 | extern char core_pattern[]; |
| @@ -91,7 +91,9 @@ extern char modprobe_path[]; | |||
| 91 | extern int sg_big_buff; | 91 | extern int sg_big_buff; |
| 92 | #endif | 92 | #endif |
| 93 | #ifdef CONFIG_SYSVIPC | 93 | #ifdef CONFIG_SYSVIPC |
| 94 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 94 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
| 95 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
| 96 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
| 95 | void __user *buffer, size_t *lenp, loff_t *ppos); | 97 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 96 | #endif | 98 | #endif |
| 97 | 99 | ||
| @@ -130,12 +132,22 @@ extern int max_lock_depth; | |||
| 130 | 132 | ||
| 131 | #ifdef CONFIG_SYSCTL_SYSCALL | 133 | #ifdef CONFIG_SYSCTL_SYSCALL |
| 132 | static int parse_table(int __user *, int, void __user *, size_t __user *, | 134 | static int parse_table(int __user *, int, void __user *, size_t __user *, |
| 133 | void __user *, size_t, ctl_table *, void **); | 135 | void __user *, size_t, ctl_table *); |
| 134 | #endif | 136 | #endif |
| 135 | 137 | ||
| 136 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 138 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
| 137 | void __user *buffer, size_t *lenp, loff_t *ppos); | 139 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 138 | 140 | ||
| 141 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
| 142 | void __user *oldval, size_t __user *oldlenp, | ||
| 143 | void __user *newval, size_t newlen); | ||
| 144 | |||
| 145 | #ifdef CONFIG_SYSVIPC | ||
| 146 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
| 147 | void __user *oldval, size_t __user *oldlenp, | ||
| 148 | void __user *newval, size_t newlen); | ||
| 149 | #endif | ||
| 150 | |||
| 139 | #ifdef CONFIG_PROC_SYSCTL | 151 | #ifdef CONFIG_PROC_SYSCTL |
| 140 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 152 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
| 141 | void __user *buffer, size_t *lenp, loff_t *ppos); | 153 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| @@ -162,6 +174,40 @@ extern ctl_table inotify_table[]; | |||
| 162 | int sysctl_legacy_va_layout; | 174 | int sysctl_legacy_va_layout; |
| 163 | #endif | 175 | #endif |
| 164 | 176 | ||
| 177 | static void *get_uts(ctl_table *table, int write) | ||
| 178 | { | ||
| 179 | char *which = table->data; | ||
| 180 | #ifdef CONFIG_UTS_NS | ||
| 181 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
| 182 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
| 183 | #endif | ||
| 184 | if (!write) | ||
| 185 | down_read(&uts_sem); | ||
| 186 | else | ||
| 187 | down_write(&uts_sem); | ||
| 188 | return which; | ||
| 189 | } | ||
| 190 | |||
| 191 | static void put_uts(ctl_table *table, int write, void *which) | ||
| 192 | { | ||
| 193 | if (!write) | ||
| 194 | up_read(&uts_sem); | ||
| 195 | else | ||
| 196 | up_write(&uts_sem); | ||
| 197 | } | ||
| 198 | |||
| 199 | #ifdef CONFIG_SYSVIPC | ||
| 200 | static void *get_ipc(ctl_table *table, int write) | ||
| 201 | { | ||
| 202 | char *which = table->data; | ||
| 203 | struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; | ||
| 204 | which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; | ||
| 205 | return which; | ||
| 206 | } | ||
| 207 | #else | ||
| 208 | #define get_ipc(T,W) ((T)->data) | ||
| 209 | #endif | ||
| 210 | |||
| 165 | /* /proc declarations: */ | 211 | /* /proc declarations: */ |
| 166 | 212 | ||
| 167 | #ifdef CONFIG_PROC_SYSCTL | 213 | #ifdef CONFIG_PROC_SYSCTL |
| @@ -170,7 +216,7 @@ static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); | |||
| 170 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); | 216 | static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); |
| 171 | static int proc_opensys(struct inode *, struct file *); | 217 | static int proc_opensys(struct inode *, struct file *); |
| 172 | 218 | ||
| 173 | struct file_operations proc_sys_file_operations = { | 219 | const struct file_operations proc_sys_file_operations = { |
| 174 | .open = proc_opensys, | 220 | .open = proc_opensys, |
| 175 | .read = proc_readsys, | 221 | .read = proc_readsys, |
| 176 | .write = proc_writesys, | 222 | .write = proc_writesys, |
| @@ -228,7 +274,6 @@ static ctl_table root_table[] = { | |||
| 228 | }; | 274 | }; |
| 229 | 275 | ||
| 230 | static ctl_table kern_table[] = { | 276 | static ctl_table kern_table[] = { |
| 231 | #ifndef CONFIG_UTS_NS | ||
| 232 | { | 277 | { |
| 233 | .ctl_name = KERN_OSTYPE, | 278 | .ctl_name = KERN_OSTYPE, |
| 234 | .procname = "ostype", | 279 | .procname = "ostype", |
| @@ -236,7 +281,7 @@ static ctl_table kern_table[] = { | |||
| 236 | .maxlen = sizeof(init_uts_ns.name.sysname), | 281 | .maxlen = sizeof(init_uts_ns.name.sysname), |
| 237 | .mode = 0444, | 282 | .mode = 0444, |
| 238 | .proc_handler = &proc_do_uts_string, | 283 | .proc_handler = &proc_do_uts_string, |
| 239 | .strategy = &sysctl_string, | 284 | .strategy = &sysctl_uts_string, |
| 240 | }, | 285 | }, |
| 241 | { | 286 | { |
| 242 | .ctl_name = KERN_OSRELEASE, | 287 | .ctl_name = KERN_OSRELEASE, |
| @@ -245,7 +290,7 @@ static ctl_table kern_table[] = { | |||
| 245 | .maxlen = sizeof(init_uts_ns.name.release), | 290 | .maxlen = sizeof(init_uts_ns.name.release), |
| 246 | .mode = 0444, | 291 | .mode = 0444, |
| 247 | .proc_handler = &proc_do_uts_string, | 292 | .proc_handler = &proc_do_uts_string, |
| 248 | .strategy = &sysctl_string, | 293 | .strategy = &sysctl_uts_string, |
| 249 | }, | 294 | }, |
| 250 | { | 295 | { |
| 251 | .ctl_name = KERN_VERSION, | 296 | .ctl_name = KERN_VERSION, |
| @@ -254,7 +299,7 @@ static ctl_table kern_table[] = { | |||
| 254 | .maxlen = sizeof(init_uts_ns.name.version), | 299 | .maxlen = sizeof(init_uts_ns.name.version), |
| 255 | .mode = 0444, | 300 | .mode = 0444, |
| 256 | .proc_handler = &proc_do_uts_string, | 301 | .proc_handler = &proc_do_uts_string, |
| 257 | .strategy = &sysctl_string, | 302 | .strategy = &sysctl_uts_string, |
| 258 | }, | 303 | }, |
| 259 | { | 304 | { |
| 260 | .ctl_name = KERN_NODENAME, | 305 | .ctl_name = KERN_NODENAME, |
| @@ -263,7 +308,7 @@ static ctl_table kern_table[] = { | |||
| 263 | .maxlen = sizeof(init_uts_ns.name.nodename), | 308 | .maxlen = sizeof(init_uts_ns.name.nodename), |
| 264 | .mode = 0644, | 309 | .mode = 0644, |
| 265 | .proc_handler = &proc_do_uts_string, | 310 | .proc_handler = &proc_do_uts_string, |
| 266 | .strategy = &sysctl_string, | 311 | .strategy = &sysctl_uts_string, |
| 267 | }, | 312 | }, |
| 268 | { | 313 | { |
| 269 | .ctl_name = KERN_DOMAINNAME, | 314 | .ctl_name = KERN_DOMAINNAME, |
| @@ -272,57 +317,9 @@ static ctl_table kern_table[] = { | |||
| 272 | .maxlen = sizeof(init_uts_ns.name.domainname), | 317 | .maxlen = sizeof(init_uts_ns.name.domainname), |
| 273 | .mode = 0644, | 318 | .mode = 0644, |
| 274 | .proc_handler = &proc_do_uts_string, | 319 | .proc_handler = &proc_do_uts_string, |
| 275 | .strategy = &sysctl_string, | 320 | .strategy = &sysctl_uts_string, |
| 276 | }, | ||
| 277 | #else /* !CONFIG_UTS_NS */ | ||
| 278 | { | ||
| 279 | .ctl_name = KERN_OSTYPE, | ||
| 280 | .procname = "ostype", | ||
| 281 | .data = NULL, | ||
| 282 | /* could maybe use __NEW_UTS_LEN here? */ | ||
| 283 | .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), | ||
| 284 | .mode = 0444, | ||
| 285 | .proc_handler = &proc_do_uts_string, | ||
| 286 | .strategy = &sysctl_string, | ||
| 287 | }, | 321 | }, |
| 288 | { | 322 | { |
| 289 | .ctl_name = KERN_OSRELEASE, | ||
| 290 | .procname = "osrelease", | ||
| 291 | .data = NULL, | ||
| 292 | .maxlen = FIELD_SIZEOF(struct new_utsname, release), | ||
| 293 | .mode = 0444, | ||
| 294 | .proc_handler = &proc_do_uts_string, | ||
| 295 | .strategy = &sysctl_string, | ||
| 296 | }, | ||
| 297 | { | ||
| 298 | .ctl_name = KERN_VERSION, | ||
| 299 | .procname = "version", | ||
| 300 | .data = NULL, | ||
| 301 | .maxlen = FIELD_SIZEOF(struct new_utsname, version), | ||
| 302 | .mode = 0444, | ||
| 303 | .proc_handler = &proc_do_uts_string, | ||
| 304 | .strategy = &sysctl_string, | ||
| 305 | }, | ||
| 306 | { | ||
| 307 | .ctl_name = KERN_NODENAME, | ||
| 308 | .procname = "hostname", | ||
| 309 | .data = NULL, | ||
| 310 | .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), | ||
| 311 | .mode = 0644, | ||
| 312 | .proc_handler = &proc_do_uts_string, | ||
| 313 | .strategy = &sysctl_string, | ||
| 314 | }, | ||
| 315 | { | ||
| 316 | .ctl_name = KERN_DOMAINNAME, | ||
| 317 | .procname = "domainname", | ||
| 318 | .data = NULL, | ||
| 319 | .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), | ||
| 320 | .mode = 0644, | ||
| 321 | .proc_handler = &proc_do_uts_string, | ||
| 322 | .strategy = &sysctl_string, | ||
| 323 | }, | ||
| 324 | #endif /* !CONFIG_UTS_NS */ | ||
| 325 | { | ||
| 326 | .ctl_name = KERN_PANIC, | 323 | .ctl_name = KERN_PANIC, |
| 327 | .procname = "panic", | 324 | .procname = "panic", |
| 328 | .data = &panic_timeout, | 325 | .data = &panic_timeout, |
| @@ -480,65 +477,72 @@ static ctl_table kern_table[] = { | |||
| 480 | { | 477 | { |
| 481 | .ctl_name = KERN_SHMMAX, | 478 | .ctl_name = KERN_SHMMAX, |
| 482 | .procname = "shmmax", | 479 | .procname = "shmmax", |
| 483 | .data = NULL, | 480 | .data = &init_ipc_ns.shm_ctlmax, |
| 484 | .maxlen = sizeof (size_t), | 481 | .maxlen = sizeof (init_ipc_ns.shm_ctlmax), |
| 485 | .mode = 0644, | 482 | .mode = 0644, |
| 486 | .proc_handler = &proc_do_ipc_string, | 483 | .proc_handler = &proc_ipc_doulongvec_minmax, |
| 484 | .strategy = sysctl_ipc_data, | ||
| 487 | }, | 485 | }, |
| 488 | { | 486 | { |
| 489 | .ctl_name = KERN_SHMALL, | 487 | .ctl_name = KERN_SHMALL, |
| 490 | .procname = "shmall", | 488 | .procname = "shmall", |
| 491 | .data = NULL, | 489 | .data = &init_ipc_ns.shm_ctlall, |
| 492 | .maxlen = sizeof (size_t), | 490 | .maxlen = sizeof (init_ipc_ns.shm_ctlall), |
| 493 | .mode = 0644, | 491 | .mode = 0644, |
| 494 | .proc_handler = &proc_do_ipc_string, | 492 | .proc_handler = &proc_ipc_doulongvec_minmax, |
| 493 | .strategy = sysctl_ipc_data, | ||
| 495 | }, | 494 | }, |
| 496 | { | 495 | { |
| 497 | .ctl_name = KERN_SHMMNI, | 496 | .ctl_name = KERN_SHMMNI, |
| 498 | .procname = "shmmni", | 497 | .procname = "shmmni", |
| 499 | .data = NULL, | 498 | .data = &init_ipc_ns.shm_ctlmni, |
| 500 | .maxlen = sizeof (int), | 499 | .maxlen = sizeof (init_ipc_ns.shm_ctlmni), |
| 501 | .mode = 0644, | 500 | .mode = 0644, |
| 502 | .proc_handler = &proc_do_ipc_string, | 501 | .proc_handler = &proc_ipc_dointvec, |
| 502 | .strategy = sysctl_ipc_data, | ||
| 503 | }, | 503 | }, |
| 504 | { | 504 | { |
| 505 | .ctl_name = KERN_MSGMAX, | 505 | .ctl_name = KERN_MSGMAX, |
| 506 | .procname = "msgmax", | 506 | .procname = "msgmax", |
| 507 | .data = NULL, | 507 | .data = &init_ipc_ns.msg_ctlmax, |
| 508 | .maxlen = sizeof (int), | 508 | .maxlen = sizeof (init_ipc_ns.msg_ctlmax), |
| 509 | .mode = 0644, | 509 | .mode = 0644, |
| 510 | .proc_handler = &proc_do_ipc_string, | 510 | .proc_handler = &proc_ipc_dointvec, |
| 511 | .strategy = sysctl_ipc_data, | ||
| 511 | }, | 512 | }, |
| 512 | { | 513 | { |
| 513 | .ctl_name = KERN_MSGMNI, | 514 | .ctl_name = KERN_MSGMNI, |
| 514 | .procname = "msgmni", | 515 | .procname = "msgmni", |
| 515 | .data = NULL, | 516 | .data = &init_ipc_ns.msg_ctlmni, |
| 516 | .maxlen = sizeof (int), | 517 | .maxlen = sizeof (init_ipc_ns.msg_ctlmni), |
| 517 | .mode = 0644, | 518 | .mode = 0644, |
| 518 | .proc_handler = &proc_do_ipc_string, | 519 | .proc_handler = &proc_ipc_dointvec, |
| 520 | .strategy = sysctl_ipc_data, | ||
| 519 | }, | 521 | }, |
| 520 | { | 522 | { |
| 521 | .ctl_name = KERN_MSGMNB, | 523 | .ctl_name = KERN_MSGMNB, |
| 522 | .procname = "msgmnb", | 524 | .procname = "msgmnb", |
| 523 | .data = NULL, | 525 | .data = &init_ipc_ns.msg_ctlmnb, |
| 524 | .maxlen = sizeof (int), | 526 | .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), |
| 525 | .mode = 0644, | 527 | .mode = 0644, |
| 526 | .proc_handler = &proc_do_ipc_string, | 528 | .proc_handler = &proc_ipc_dointvec, |
| 529 | .strategy = sysctl_ipc_data, | ||
| 527 | }, | 530 | }, |
| 528 | { | 531 | { |
| 529 | .ctl_name = KERN_SEM, | 532 | .ctl_name = KERN_SEM, |
| 530 | .procname = "sem", | 533 | .procname = "sem", |
| 531 | .data = NULL, | 534 | .data = &init_ipc_ns.sem_ctls, |
| 532 | .maxlen = 4*sizeof (int), | 535 | .maxlen = 4*sizeof (int), |
| 533 | .mode = 0644, | 536 | .mode = 0644, |
| 534 | .proc_handler = &proc_do_ipc_string, | 537 | .proc_handler = &proc_ipc_dointvec, |
| 538 | .strategy = sysctl_ipc_data, | ||
| 535 | }, | 539 | }, |
| 536 | #endif | 540 | #endif |
| 537 | #ifdef CONFIG_MAGIC_SYSRQ | 541 | #ifdef CONFIG_MAGIC_SYSRQ |
| 538 | { | 542 | { |
| 539 | .ctl_name = KERN_SYSRQ, | 543 | .ctl_name = KERN_SYSRQ, |
| 540 | .procname = "sysrq", | 544 | .procname = "sysrq", |
| 541 | .data = &sysrq_enabled, | 545 | .data = &__sysrq_enabled, |
| 542 | .maxlen = sizeof (int), | 546 | .maxlen = sizeof (int), |
| 543 | .mode = 0644, | 547 | .mode = 0644, |
| 544 | .proc_handler = &proc_dointvec, | 548 | .proc_handler = &proc_dointvec, |
| @@ -707,6 +711,14 @@ static ctl_table kern_table[] = { | |||
| 707 | .mode = 0444, | 711 | .mode = 0444, |
| 708 | .proc_handler = &proc_dointvec, | 712 | .proc_handler = &proc_dointvec, |
| 709 | }, | 713 | }, |
| 714 | { | ||
| 715 | .ctl_name = CTL_UNNUMBERED, | ||
| 716 | .procname = "kstack_depth_to_print", | ||
| 717 | .data = &kstack_depth_to_print, | ||
| 718 | .maxlen = sizeof(int), | ||
| 719 | .mode = 0644, | ||
| 720 | .proc_handler = &proc_dointvec, | ||
| 721 | }, | ||
| 710 | #endif | 722 | #endif |
| 711 | #if defined(CONFIG_MMU) | 723 | #if defined(CONFIG_MMU) |
| 712 | { | 724 | { |
| @@ -977,17 +989,6 @@ static ctl_table vm_table[] = { | |||
| 977 | .extra1 = &zero, | 989 | .extra1 = &zero, |
| 978 | }, | 990 | }, |
| 979 | #endif | 991 | #endif |
| 980 | #ifdef CONFIG_SWAP | ||
| 981 | { | ||
| 982 | .ctl_name = VM_SWAP_TOKEN_TIMEOUT, | ||
| 983 | .procname = "swap_token_timeout", | ||
| 984 | .data = &swap_token_default_timeout, | ||
| 985 | .maxlen = sizeof(swap_token_default_timeout), | ||
| 986 | .mode = 0644, | ||
| 987 | .proc_handler = &proc_dointvec_jiffies, | ||
| 988 | .strategy = &sysctl_jiffies, | ||
| 989 | }, | ||
| 990 | #endif | ||
| 991 | #ifdef CONFIG_NUMA | 992 | #ifdef CONFIG_NUMA |
| 992 | { | 993 | { |
| 993 | .ctl_name = VM_ZONE_RECLAIM_MODE, | 994 | .ctl_name = VM_ZONE_RECLAIM_MODE, |
| @@ -1241,7 +1242,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
| 1241 | do { | 1242 | do { |
| 1242 | struct ctl_table_header *head = | 1243 | struct ctl_table_header *head = |
| 1243 | list_entry(tmp, struct ctl_table_header, ctl_entry); | 1244 | list_entry(tmp, struct ctl_table_header, ctl_entry); |
| 1244 | void *context = NULL; | ||
| 1245 | 1245 | ||
| 1246 | if (!use_table(head)) | 1246 | if (!use_table(head)) |
| 1247 | continue; | 1247 | continue; |
| @@ -1249,9 +1249,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
| 1249 | spin_unlock(&sysctl_lock); | 1249 | spin_unlock(&sysctl_lock); |
| 1250 | 1250 | ||
| 1251 | error = parse_table(name, nlen, oldval, oldlenp, | 1251 | error = parse_table(name, nlen, oldval, oldlenp, |
| 1252 | newval, newlen, head->ctl_table, | 1252 | newval, newlen, head->ctl_table); |
| 1253 | &context); | ||
| 1254 | kfree(context); | ||
| 1255 | 1253 | ||
| 1256 | spin_lock(&sysctl_lock); | 1254 | spin_lock(&sysctl_lock); |
| 1257 | unuse_table(head); | 1255 | unuse_table(head); |
| @@ -1307,7 +1305,7 @@ static inline int ctl_perm(ctl_table *table, int op) | |||
| 1307 | static int parse_table(int __user *name, int nlen, | 1305 | static int parse_table(int __user *name, int nlen, |
| 1308 | void __user *oldval, size_t __user *oldlenp, | 1306 | void __user *oldval, size_t __user *oldlenp, |
| 1309 | void __user *newval, size_t newlen, | 1307 | void __user *newval, size_t newlen, |
| 1310 | ctl_table *table, void **context) | 1308 | ctl_table *table) |
| 1311 | { | 1309 | { |
| 1312 | int n; | 1310 | int n; |
| 1313 | repeat: | 1311 | repeat: |
| @@ -1327,7 +1325,7 @@ repeat: | |||
| 1327 | error = table->strategy( | 1325 | error = table->strategy( |
| 1328 | table, name, nlen, | 1326 | table, name, nlen, |
| 1329 | oldval, oldlenp, | 1327 | oldval, oldlenp, |
| 1330 | newval, newlen, context); | 1328 | newval, newlen); |
| 1331 | if (error) | 1329 | if (error) |
| 1332 | return error; | 1330 | return error; |
| 1333 | } | 1331 | } |
| @@ -1338,7 +1336,7 @@ repeat: | |||
| 1338 | } | 1336 | } |
| 1339 | error = do_sysctl_strategy(table, name, nlen, | 1337 | error = do_sysctl_strategy(table, name, nlen, |
| 1340 | oldval, oldlenp, | 1338 | oldval, oldlenp, |
| 1341 | newval, newlen, context); | 1339 | newval, newlen); |
| 1342 | return error; | 1340 | return error; |
| 1343 | } | 1341 | } |
| 1344 | } | 1342 | } |
| @@ -1349,7 +1347,7 @@ repeat: | |||
| 1349 | int do_sysctl_strategy (ctl_table *table, | 1347 | int do_sysctl_strategy (ctl_table *table, |
| 1350 | int __user *name, int nlen, | 1348 | int __user *name, int nlen, |
| 1351 | void __user *oldval, size_t __user *oldlenp, | 1349 | void __user *oldval, size_t __user *oldlenp, |
| 1352 | void __user *newval, size_t newlen, void **context) | 1350 | void __user *newval, size_t newlen) |
| 1353 | { | 1351 | { |
| 1354 | int op = 0, rc; | 1352 | int op = 0, rc; |
| 1355 | size_t len; | 1353 | size_t len; |
| @@ -1363,7 +1361,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
| 1363 | 1361 | ||
| 1364 | if (table->strategy) { | 1362 | if (table->strategy) { |
| 1365 | rc = table->strategy(table, name, nlen, oldval, oldlenp, | 1363 | rc = table->strategy(table, name, nlen, oldval, oldlenp, |
| 1366 | newval, newlen, context); | 1364 | newval, newlen); |
| 1367 | if (rc < 0) | 1365 | if (rc < 0) |
| 1368 | return rc; | 1366 | return rc; |
| 1369 | if (rc > 0) | 1367 | if (rc > 0) |
| @@ -1616,7 +1614,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | |||
| 1616 | size_t count, loff_t *ppos) | 1614 | size_t count, loff_t *ppos) |
| 1617 | { | 1615 | { |
| 1618 | int op; | 1616 | int op; |
| 1619 | struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); | 1617 | struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); |
| 1620 | struct ctl_table *table; | 1618 | struct ctl_table *table; |
| 1621 | size_t res; | 1619 | size_t res; |
| 1622 | ssize_t error = -ENOTDIR; | 1620 | ssize_t error = -ENOTDIR; |
| @@ -1755,66 +1753,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
| 1755 | * Special case of dostring for the UTS structure. This has locks | 1753 | * Special case of dostring for the UTS structure. This has locks |
| 1756 | * to observe. Should this be in kernel/sys.c ???? | 1754 | * to observe. Should this be in kernel/sys.c ???? |
| 1757 | */ | 1755 | */ |
| 1758 | |||
| 1759 | #ifndef CONFIG_UTS_NS | ||
| 1760 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
| 1761 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 1762 | { | ||
| 1763 | int r; | ||
| 1764 | 1756 | ||
| 1765 | if (!write) { | ||
| 1766 | down_read(&uts_sem); | ||
| 1767 | r=proc_dostring(table,0,filp,buffer,lenp, ppos); | ||
| 1768 | up_read(&uts_sem); | ||
| 1769 | } else { | ||
| 1770 | down_write(&uts_sem); | ||
| 1771 | r=proc_dostring(table,1,filp,buffer,lenp, ppos); | ||
| 1772 | up_write(&uts_sem); | ||
| 1773 | } | ||
| 1774 | return r; | ||
| 1775 | } | ||
| 1776 | #else /* !CONFIG_UTS_NS */ | ||
| 1777 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 1757 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
| 1778 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1758 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 1779 | { | 1759 | { |
| 1780 | int r; | 1760 | int r; |
| 1781 | struct uts_namespace* uts_ns = current->nsproxy->uts_ns; | 1761 | void *which; |
| 1782 | char* which; | 1762 | which = get_uts(table, write); |
| 1783 | 1763 | r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); | |
| 1784 | switch (table->ctl_name) { | 1764 | put_uts(table, write, which); |
| 1785 | case KERN_OSTYPE: | ||
| 1786 | which = uts_ns->name.sysname; | ||
| 1787 | break; | ||
| 1788 | case KERN_NODENAME: | ||
| 1789 | which = uts_ns->name.nodename; | ||
| 1790 | break; | ||
| 1791 | case KERN_OSRELEASE: | ||
| 1792 | which = uts_ns->name.release; | ||
| 1793 | break; | ||
| 1794 | case KERN_VERSION: | ||
| 1795 | which = uts_ns->name.version; | ||
| 1796 | break; | ||
| 1797 | case KERN_DOMAINNAME: | ||
| 1798 | which = uts_ns->name.domainname; | ||
| 1799 | break; | ||
| 1800 | default: | ||
| 1801 | r = -EINVAL; | ||
| 1802 | goto out; | ||
| 1803 | } | ||
| 1804 | |||
| 1805 | if (!write) { | ||
| 1806 | down_read(&uts_sem); | ||
| 1807 | r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); | ||
| 1808 | up_read(&uts_sem); | ||
| 1809 | } else { | ||
| 1810 | down_write(&uts_sem); | ||
| 1811 | r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); | ||
| 1812 | up_write(&uts_sem); | ||
| 1813 | } | ||
| 1814 | out: | ||
| 1815 | return r; | 1765 | return r; |
| 1816 | } | 1766 | } |
| 1817 | #endif /* !CONFIG_UTS_NS */ | ||
| 1818 | 1767 | ||
| 1819 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, | 1768 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, |
| 1820 | int *valp, | 1769 | int *valp, |
| @@ -1886,7 +1835,7 @@ static int __do_proc_dointvec(void *tbl_data, ctl_table *table, | |||
| 1886 | p = buf; | 1835 | p = buf; |
| 1887 | if (*p == '-' && left > 1) { | 1836 | if (*p == '-' && left > 1) { |
| 1888 | neg = 1; | 1837 | neg = 1; |
| 1889 | left--, p++; | 1838 | p++; |
| 1890 | } | 1839 | } |
| 1891 | if (*p < '0' || *p > '9') | 1840 | if (*p < '0' || *p > '9') |
| 1892 | break; | 1841 | break; |
| @@ -1978,9 +1927,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, | |||
| 1978 | 1927 | ||
| 1979 | #define OP_SET 0 | 1928 | #define OP_SET 0 |
| 1980 | #define OP_AND 1 | 1929 | #define OP_AND 1 |
| 1981 | #define OP_OR 2 | ||
| 1982 | #define OP_MAX 3 | ||
| 1983 | #define OP_MIN 4 | ||
| 1984 | 1930 | ||
| 1985 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | 1931 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, |
| 1986 | int *valp, | 1932 | int *valp, |
| @@ -1992,13 +1938,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | |||
| 1992 | switch(op) { | 1938 | switch(op) { |
| 1993 | case OP_SET: *valp = val; break; | 1939 | case OP_SET: *valp = val; break; |
| 1994 | case OP_AND: *valp &= val; break; | 1940 | case OP_AND: *valp &= val; break; |
| 1995 | case OP_OR: *valp |= val; break; | ||
| 1996 | case OP_MAX: if(*valp < val) | ||
| 1997 | *valp = val; | ||
| 1998 | break; | ||
| 1999 | case OP_MIN: if(*valp > val) | ||
| 2000 | *valp = val; | ||
| 2001 | break; | ||
| 2002 | } | 1941 | } |
| 2003 | } else { | 1942 | } else { |
| 2004 | int val = *valp; | 1943 | int val = *valp; |
| @@ -2137,7 +2076,7 @@ static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, | |||
| 2137 | p = buf; | 2076 | p = buf; |
| 2138 | if (*p == '-' && left > 1) { | 2077 | if (*p == '-' && left > 1) { |
| 2139 | neg = 1; | 2078 | neg = 1; |
| 2140 | left--, p++; | 2079 | p++; |
| 2141 | } | 2080 | } |
| 2142 | if (*p < '0' || *p > '9') | 2081 | if (*p < '0' || *p > '9') |
| 2143 | break; | 2082 | break; |
| @@ -2393,46 +2332,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | |||
| 2393 | } | 2332 | } |
| 2394 | 2333 | ||
| 2395 | #ifdef CONFIG_SYSVIPC | 2334 | #ifdef CONFIG_SYSVIPC |
| 2396 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 2335 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
| 2397 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2336 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2398 | { | 2337 | { |
| 2399 | void *data; | 2338 | void *which; |
| 2400 | struct ipc_namespace *ns; | 2339 | which = get_ipc(table, write); |
| 2401 | 2340 | return __do_proc_dointvec(which, table, write, filp, buffer, | |
| 2402 | ns = current->nsproxy->ipc_ns; | ||
| 2403 | |||
| 2404 | switch (table->ctl_name) { | ||
| 2405 | case KERN_SHMMAX: | ||
| 2406 | data = &ns->shm_ctlmax; | ||
| 2407 | goto proc_minmax; | ||
| 2408 | case KERN_SHMALL: | ||
| 2409 | data = &ns->shm_ctlall; | ||
| 2410 | goto proc_minmax; | ||
| 2411 | case KERN_SHMMNI: | ||
| 2412 | data = &ns->shm_ctlmni; | ||
| 2413 | break; | ||
| 2414 | case KERN_MSGMAX: | ||
| 2415 | data = &ns->msg_ctlmax; | ||
| 2416 | break; | ||
| 2417 | case KERN_MSGMNI: | ||
| 2418 | data = &ns->msg_ctlmni; | ||
| 2419 | break; | ||
| 2420 | case KERN_MSGMNB: | ||
| 2421 | data = &ns->msg_ctlmnb; | ||
| 2422 | break; | ||
| 2423 | case KERN_SEM: | ||
| 2424 | data = &ns->sem_ctls; | ||
| 2425 | break; | ||
| 2426 | default: | ||
| 2427 | return -EINVAL; | ||
| 2428 | } | ||
| 2429 | |||
| 2430 | return __do_proc_dointvec(data, table, write, filp, buffer, | ||
| 2431 | lenp, ppos, NULL, NULL); | 2341 | lenp, ppos, NULL, NULL); |
| 2432 | proc_minmax: | 2342 | } |
| 2433 | return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, | 2343 | |
| 2344 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
| 2345 | struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2346 | { | ||
| 2347 | void *which; | ||
| 2348 | which = get_ipc(table, write); | ||
| 2349 | return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, | ||
| 2434 | lenp, ppos, 1l, 1l); | 2350 | lenp, ppos, 1l, 1l); |
| 2435 | } | 2351 | } |
| 2352 | |||
| 2436 | #endif | 2353 | #endif |
| 2437 | 2354 | ||
| 2438 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 2355 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
| @@ -2477,6 +2394,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | |||
| 2477 | { | 2394 | { |
| 2478 | return -ENOSYS; | 2395 | return -ENOSYS; |
| 2479 | } | 2396 | } |
| 2397 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
| 2398 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
| 2399 | { | ||
| 2400 | return -ENOSYS; | ||
| 2401 | } | ||
| 2402 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
| 2403 | struct file *filp, void __user *buffer, | ||
| 2404 | size_t *lenp, loff_t *ppos) | ||
| 2405 | { | ||
| 2406 | return -ENOSYS; | ||
| 2407 | } | ||
| 2480 | #endif | 2408 | #endif |
| 2481 | 2409 | ||
| 2482 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | 2410 | int proc_dointvec(ctl_table *table, int write, struct file *filp, |
| @@ -2541,7 +2469,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | |||
| 2541 | /* The generic string strategy routine: */ | 2469 | /* The generic string strategy routine: */ |
| 2542 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2470 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
| 2543 | void __user *oldval, size_t __user *oldlenp, | 2471 | void __user *oldval, size_t __user *oldlenp, |
| 2544 | void __user *newval, size_t newlen, void **context) | 2472 | void __user *newval, size_t newlen) |
| 2545 | { | 2473 | { |
| 2546 | if (!table->data || !table->maxlen) | 2474 | if (!table->data || !table->maxlen) |
| 2547 | return -ENOTDIR; | 2475 | return -ENOTDIR; |
| @@ -2587,7 +2515,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, | |||
| 2587 | */ | 2515 | */ |
| 2588 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2516 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
| 2589 | void __user *oldval, size_t __user *oldlenp, | 2517 | void __user *oldval, size_t __user *oldlenp, |
| 2590 | void __user *newval, size_t newlen, void **context) | 2518 | void __user *newval, size_t newlen) |
| 2591 | { | 2519 | { |
| 2592 | 2520 | ||
| 2593 | if (newval && newlen) { | 2521 | if (newval && newlen) { |
| @@ -2623,7 +2551,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | |||
| 2623 | /* Strategy function to convert jiffies to seconds */ | 2551 | /* Strategy function to convert jiffies to seconds */ |
| 2624 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2552 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2625 | void __user *oldval, size_t __user *oldlenp, | 2553 | void __user *oldval, size_t __user *oldlenp, |
| 2626 | void __user *newval, size_t newlen, void **context) | 2554 | void __user *newval, size_t newlen) |
| 2627 | { | 2555 | { |
| 2628 | if (oldval) { | 2556 | if (oldval) { |
| 2629 | size_t olen; | 2557 | size_t olen; |
| @@ -2651,7 +2579,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | |||
| 2651 | /* Strategy function to convert jiffies to seconds */ | 2579 | /* Strategy function to convert jiffies to seconds */ |
| 2652 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2580 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2653 | void __user *oldval, size_t __user *oldlenp, | 2581 | void __user *oldval, size_t __user *oldlenp, |
| 2654 | void __user *newval, size_t newlen, void **context) | 2582 | void __user *newval, size_t newlen) |
| 2655 | { | 2583 | { |
| 2656 | if (oldval) { | 2584 | if (oldval) { |
| 2657 | size_t olen; | 2585 | size_t olen; |
| @@ -2676,6 +2604,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
| 2676 | return 1; | 2604 | return 1; |
| 2677 | } | 2605 | } |
| 2678 | 2606 | ||
| 2607 | |||
| 2608 | /* The generic string strategy routine: */ | ||
| 2609 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
| 2610 | void __user *oldval, size_t __user *oldlenp, | ||
| 2611 | void __user *newval, size_t newlen) | ||
| 2612 | { | ||
| 2613 | struct ctl_table uts_table; | ||
| 2614 | int r, write; | ||
| 2615 | write = newval && newlen; | ||
| 2616 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
| 2617 | uts_table.data = get_uts(table, write); | ||
| 2618 | r = sysctl_string(&uts_table, name, nlen, | ||
| 2619 | oldval, oldlenp, newval, newlen); | ||
| 2620 | put_uts(table, write, uts_table.data); | ||
| 2621 | return r; | ||
| 2622 | } | ||
| 2623 | |||
| 2624 | #ifdef CONFIG_SYSVIPC | ||
| 2625 | /* The generic sysctl ipc data routine. */ | ||
| 2626 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
| 2627 | void __user *oldval, size_t __user *oldlenp, | ||
| 2628 | void __user *newval, size_t newlen) | ||
| 2629 | { | ||
| 2630 | size_t len; | ||
| 2631 | void *data; | ||
| 2632 | |||
| 2633 | /* Get out of I don't have a variable */ | ||
| 2634 | if (!table->data || !table->maxlen) | ||
| 2635 | return -ENOTDIR; | ||
| 2636 | |||
| 2637 | data = get_ipc(table, 1); | ||
| 2638 | if (!data) | ||
| 2639 | return -ENOTDIR; | ||
| 2640 | |||
| 2641 | if (oldval && oldlenp) { | ||
| 2642 | if (get_user(len, oldlenp)) | ||
| 2643 | return -EFAULT; | ||
| 2644 | if (len) { | ||
| 2645 | if (len > table->maxlen) | ||
| 2646 | len = table->maxlen; | ||
| 2647 | if (copy_to_user(oldval, data, len)) | ||
| 2648 | return -EFAULT; | ||
| 2649 | if (put_user(len, oldlenp)) | ||
| 2650 | return -EFAULT; | ||
| 2651 | } | ||
| 2652 | } | ||
| 2653 | |||
| 2654 | if (newval && newlen) { | ||
| 2655 | if (newlen > table->maxlen) | ||
| 2656 | newlen = table->maxlen; | ||
| 2657 | |||
| 2658 | if (copy_from_user(data, newval, newlen)) | ||
| 2659 | return -EFAULT; | ||
| 2660 | } | ||
| 2661 | return 1; | ||
| 2662 | } | ||
| 2663 | #endif | ||
| 2664 | |||
| 2679 | #else /* CONFIG_SYSCTL_SYSCALL */ | 2665 | #else /* CONFIG_SYSCTL_SYSCALL */ |
| 2680 | 2666 | ||
| 2681 | 2667 | ||
| @@ -2714,32 +2700,44 @@ out: | |||
| 2714 | 2700 | ||
| 2715 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2701 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
| 2716 | void __user *oldval, size_t __user *oldlenp, | 2702 | void __user *oldval, size_t __user *oldlenp, |
| 2717 | void __user *newval, size_t newlen, void **context) | 2703 | void __user *newval, size_t newlen) |
| 2718 | { | 2704 | { |
| 2719 | return -ENOSYS; | 2705 | return -ENOSYS; |
| 2720 | } | 2706 | } |
| 2721 | 2707 | ||
| 2722 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2708 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
| 2723 | void __user *oldval, size_t __user *oldlenp, | 2709 | void __user *oldval, size_t __user *oldlenp, |
| 2724 | void __user *newval, size_t newlen, void **context) | 2710 | void __user *newval, size_t newlen) |
| 2725 | { | 2711 | { |
| 2726 | return -ENOSYS; | 2712 | return -ENOSYS; |
| 2727 | } | 2713 | } |
| 2728 | 2714 | ||
| 2729 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2715 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2730 | void __user *oldval, size_t __user *oldlenp, | 2716 | void __user *oldval, size_t __user *oldlenp, |
| 2731 | void __user *newval, size_t newlen, void **context) | 2717 | void __user *newval, size_t newlen) |
| 2732 | { | 2718 | { |
| 2733 | return -ENOSYS; | 2719 | return -ENOSYS; |
| 2734 | } | 2720 | } |
| 2735 | 2721 | ||
| 2736 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2722 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
| 2737 | void __user *oldval, size_t __user *oldlenp, | 2723 | void __user *oldval, size_t __user *oldlenp, |
| 2738 | void __user *newval, size_t newlen, void **context) | 2724 | void __user *newval, size_t newlen) |
| 2739 | { | 2725 | { |
| 2740 | return -ENOSYS; | 2726 | return -ENOSYS; |
| 2741 | } | 2727 | } |
| 2742 | 2728 | ||
| 2729 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
| 2730 | void __user *oldval, size_t __user *oldlenp, | ||
| 2731 | void __user *newval, size_t newlen) | ||
| 2732 | { | ||
| 2733 | return -ENOSYS; | ||
| 2734 | } | ||
| 2735 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
| 2736 | void __user *oldval, size_t __user *oldlenp, | ||
| 2737 | void __user *newval, size_t newlen) | ||
| 2738 | { | ||
| 2739 | return -ENOSYS; | ||
| 2740 | } | ||
| 2743 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 2741 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
| 2744 | 2742 | ||
| 2745 | /* | 2743 | /* |
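
The new get_uts()/get_ipc() helpers in the sysctl.c diff rebase the table's default data pointer, which refers to a field of init_uts_ns or init_ipc_ns, so that it points at the same field inside the calling task's namespace. A minimal sketch of that pointer arithmetic, with hypothetical names:

```c
/*
 * Sketch of the rebasing done by get_uts()/get_ipc() above: take the
 * field's offset within the init namespace object and apply it to the
 * current task's namespace object. rebase_to_ns() is a hypothetical
 * helper, not part of this patch.
 */
static void *rebase_to_ns(void *field_in_init, void *init_ns, void *cur_ns)
{
	return (char *)cur_ns + ((char *)field_in_init - (char *)init_ns);
}

/* e.g. (hypothetically):
 *   nodename = rebase_to_ns(&init_uts_ns.name.nodename,
 *                           &init_uts_ns, current->nsproxy->uts_ns);
 */
```
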
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index f45c5e70773c..4c3476fa058d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -34,7 +34,7 @@ | |||
| 34 | 34 | ||
| 35 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; | 35 | static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; |
| 36 | static int family_registered; | 36 | static int family_registered; |
| 37 | kmem_cache_t *taskstats_cache; | 37 | struct kmem_cache *taskstats_cache; |
| 38 | 38 | ||
| 39 | static struct genl_family family = { | 39 | static struct genl_family family = { |
| 40 | .id = GENL_ID_GENERATE, | 40 | .id = GENL_ID_GENERATE, |
| @@ -69,7 +69,7 @@ enum actions { | |||
| 69 | }; | 69 | }; |
| 70 | 70 | ||
| 71 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | 71 | static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, |
| 72 | void **replyp, size_t size) | 72 | size_t size) |
| 73 | { | 73 | { |
| 74 | struct sk_buff *skb; | 74 | struct sk_buff *skb; |
| 75 | void *reply; | 75 | void *reply; |
| @@ -77,8 +77,7 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
| 77 | /* | 77 | /* |
| 78 | * If new attributes are added, please revisit this allocation | 78 | * If new attributes are added, please revisit this allocation |
| 79 | */ | 79 | */ |
| 80 | size = nlmsg_total_size(genlmsg_total_size(size)); | 80 | skb = genlmsg_new(size, GFP_KERNEL); |
| 81 | skb = nlmsg_new(size, GFP_KERNEL); | ||
| 82 | if (!skb) | 81 | if (!skb) |
| 83 | return -ENOMEM; | 82 | return -ENOMEM; |
| 84 | 83 | ||
| @@ -86,20 +85,15 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, | |||
| 86 | int seq = get_cpu_var(taskstats_seqnum)++; | 85 | int seq = get_cpu_var(taskstats_seqnum)++; |
| 87 | put_cpu_var(taskstats_seqnum); | 86 | put_cpu_var(taskstats_seqnum); |
| 88 | 87 | ||
| 89 | reply = genlmsg_put(skb, 0, seq, | 88 | reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); |
| 90 | family.id, 0, 0, | ||
| 91 | cmd, family.version); | ||
| 92 | } else | 89 | } else |
| 93 | reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, | 90 | reply = genlmsg_put_reply(skb, info, &family, 0, cmd); |
| 94 | family.id, 0, 0, | ||
| 95 | cmd, family.version); | ||
| 96 | if (reply == NULL) { | 91 | if (reply == NULL) { |
| 97 | nlmsg_free(skb); | 92 | nlmsg_free(skb); |
| 98 | return -EINVAL; | 93 | return -EINVAL; |
| 99 | } | 94 | } |
| 100 | 95 | ||
| 101 | *skbp = skb; | 96 | *skbp = skb; |
| 102 | *replyp = reply; | ||
| 103 | return 0; | 97 | return 0; |
| 104 | } | 98 | } |
| 105 | 99 | ||
| @@ -124,10 +118,10 @@ static int send_reply(struct sk_buff *skb, pid_t pid) | |||
| 124 | /* | 118 | /* |
| 125 | * Send taskstats data in @skb to listeners registered for @cpu's exit data | 119 | * Send taskstats data in @skb to listeners registered for @cpu's exit data |
| 126 | */ | 120 | */ |
| 127 | static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | 121 | static void send_cpu_listeners(struct sk_buff *skb, |
| 122 | struct listener_list *listeners) | ||
| 128 | { | 123 | { |
| 129 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); | 124 | struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); |
| 130 | struct listener_list *listeners; | ||
| 131 | struct listener *s, *tmp; | 125 | struct listener *s, *tmp; |
| 132 | struct sk_buff *skb_next, *skb_cur = skb; | 126 | struct sk_buff *skb_next, *skb_cur = skb; |
| 133 | void *reply = genlmsg_data(genlhdr); | 127 | void *reply = genlmsg_data(genlhdr); |
| @@ -140,7 +134,6 @@ static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) | |||
| 140 | } | 134 | } |
| 141 | 135 | ||
| 142 | rc = 0; | 136 | rc = 0; |
| 143 | listeners = &per_cpu(listener_array, cpu); | ||
| 144 | down_read(&listeners->sem); | 137 | down_read(&listeners->sem); |
| 145 | list_for_each_entry(s, &listeners->list, list) { | 138 | list_for_each_entry(s, &listeners->list, list) { |
| 146 | skb_next = NULL; | 139 | skb_next = NULL; |
| @@ -191,6 +184,7 @@ static int fill_pid(pid_t pid, struct task_struct *tsk, | |||
| 191 | } else | 184 | } else |
| 192 | get_task_struct(tsk); | 185 | get_task_struct(tsk); |
| 193 | 186 | ||
| 187 | memset(stats, 0, sizeof(*stats)); | ||
| 194 | /* | 188 | /* |
| 195 | * Each accounting subsystem adds calls to its functions to | 189 | * Each accounting subsystem adds calls to its functions to |
| 196 | * fill in relevant parts of struct taskstsats as follows | 190 | * fill in relevant parts of struct taskstsats as follows |
| @@ -233,6 +227,8 @@ static int fill_tgid(pid_t tgid, struct task_struct *first, | |||
| 233 | 227 | ||
| 234 | if (first->signal->stats) | 228 | if (first->signal->stats) |
| 235 | memcpy(stats, first->signal->stats, sizeof(*stats)); | 229 | memcpy(stats, first->signal->stats, sizeof(*stats)); |
| 230 | else | ||
| 231 | memset(stats, 0, sizeof(*stats)); | ||
| 236 | 232 | ||
| 237 | tsk = first; | 233 | tsk = first; |
| 238 | do { | 234 | do { |
| @@ -349,14 +345,36 @@ static int parse(struct nlattr *na, cpumask_t *mask) | |||
| 349 | return ret; | 345 | return ret; |
| 350 | } | 346 | } |
| 351 | 347 | ||
| 348 | static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) | ||
| 349 | { | ||
| 350 | struct nlattr *na, *ret; | ||
| 351 | int aggr; | ||
| 352 | |||
| 353 | aggr = (type == TASKSTATS_TYPE_PID) | ||
| 354 | ? TASKSTATS_TYPE_AGGR_PID | ||
| 355 | : TASKSTATS_TYPE_AGGR_TGID; | ||
| 356 | |||
| 357 | na = nla_nest_start(skb, aggr); | ||
| 358 | if (!na) | ||
| 359 | goto err; | ||
| 360 | if (nla_put(skb, type, sizeof(pid), &pid) < 0) | ||
| 361 | goto err; | ||
| 362 | ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); | ||
| 363 | if (!ret) | ||
| 364 | goto err; | ||
| 365 | nla_nest_end(skb, na); | ||
| 366 | |||
| 367 | return nla_data(ret); | ||
| 368 | err: | ||
| 369 | return NULL; | ||
| 370 | } | ||
| 371 | |||
| 352 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | 372 | static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) |
| 353 | { | 373 | { |
| 354 | int rc = 0; | 374 | int rc = 0; |
| 355 | struct sk_buff *rep_skb; | 375 | struct sk_buff *rep_skb; |
| 356 | struct taskstats stats; | 376 | struct taskstats *stats; |
| 357 | void *reply; | ||
| 358 | size_t size; | 377 | size_t size; |
| 359 | struct nlattr *na; | ||
| 360 | cpumask_t mask; | 378 | cpumask_t mask; |
| 361 | 379 | ||
| 362 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); | 380 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); |
| @@ -377,83 +395,71 @@ static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
| 377 | size = nla_total_size(sizeof(u32)) + | 395 | size = nla_total_size(sizeof(u32)) + |
| 378 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | 396 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); |
| 379 | 397 | ||
| 380 | memset(&stats, 0, sizeof(stats)); | 398 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); |
| 381 | rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | ||
| 382 | if (rc < 0) | 399 | if (rc < 0) |
| 383 | return rc; | 400 | return rc; |
| 384 | 401 | ||
| 402 | rc = -EINVAL; | ||
| 385 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { | 403 | if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { |
| 386 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); | 404 | u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); |
| 387 | rc = fill_pid(pid, NULL, &stats); | 405 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); |
| 388 | if (rc < 0) | 406 | if (!stats) |
| 389 | goto err; | 407 | goto err; |
| 390 | 408 | ||
| 391 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | 409 | rc = fill_pid(pid, NULL, stats); |
| 392 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); | 410 | if (rc < 0) |
| 393 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 411 | goto err; |
| 394 | stats); | ||
| 395 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { | 412 | } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { |
| 396 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); | 413 | u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); |
| 397 | rc = fill_tgid(tgid, NULL, &stats); | 414 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); |
| 398 | if (rc < 0) | 415 | if (!stats) |
| 399 | goto err; | 416 | goto err; |
| 400 | 417 | ||
| 401 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | 418 | rc = fill_tgid(tgid, NULL, stats); |
| 402 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); | 419 | if (rc < 0) |
| 403 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 420 | goto err; |
| 404 | stats); | 421 | } else |
| 405 | } else { | ||
| 406 | rc = -EINVAL; | ||
| 407 | goto err; | 422 | goto err; |
| 408 | } | ||
| 409 | |||
| 410 | nla_nest_end(rep_skb, na); | ||
| 411 | 423 | ||
| 412 | return send_reply(rep_skb, info->snd_pid); | 424 | return send_reply(rep_skb, info->snd_pid); |
| 413 | |||
| 414 | nla_put_failure: | ||
| 415 | rc = genlmsg_cancel(rep_skb, reply); | ||
| 416 | err: | 425 | err: |
| 417 | nlmsg_free(rep_skb); | 426 | nlmsg_free(rep_skb); |
| 418 | return rc; | 427 | return rc; |
| 419 | } | 428 | } |
| 420 | 429 | ||
| 421 | void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) | 430 | static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) |
| 422 | { | 431 | { |
| 423 | struct listener_list *listeners; | 432 | struct signal_struct *sig = tsk->signal; |
| 424 | struct taskstats *tmp; | 433 | struct taskstats *stats; |
| 425 | /* | ||
| 426 | * This is the cpu on which the task is exiting currently and will | ||
| 427 | * be the one for which the exit event is sent, even if the cpu | ||
| 428 | * on which this function is running changes later. | ||
| 429 | */ | ||
| 430 | *mycpu = raw_smp_processor_id(); | ||
| 431 | 434 | ||
| 432 | *ptidstats = NULL; | 435 | if (sig->stats || thread_group_empty(tsk)) |
| 433 | tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); | 436 | goto ret; |
| 434 | if (!tmp) | ||
| 435 | return; | ||
| 436 | 437 | ||
| 437 | listeners = &per_cpu(listener_array, *mycpu); | 438 | /* No problem if kmem_cache_zalloc() fails */ |
| 438 | down_read(&listeners->sem); | 439 | stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); |
| 439 | if (!list_empty(&listeners->list)) { | 440 | |
| 440 | *ptidstats = tmp; | 441 | spin_lock_irq(&tsk->sighand->siglock); |
| 441 | tmp = NULL; | 442 | if (!sig->stats) { |
| 443 | sig->stats = stats; | ||
| 444 | stats = NULL; | ||
| 442 | } | 445 | } |
| 443 | up_read(&listeners->sem); | 446 | spin_unlock_irq(&tsk->sighand->siglock); |
| 444 | kfree(tmp); | 447 | |
| 448 | if (stats) | ||
| 449 | kmem_cache_free(taskstats_cache, stats); | ||
| 450 | ret: | ||
| 451 | return sig->stats; | ||
| 445 | } | 452 | } |
| 446 | 453 | ||
| 447 | /* Send pid data out on exit */ | 454 | /* Send pid data out on exit */ |
| 448 | void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | 455 | void taskstats_exit(struct task_struct *tsk, int group_dead) |
| 449 | int group_dead, unsigned int mycpu) | ||
| 450 | { | 456 | { |
| 451 | int rc; | 457 | int rc; |
| 458 | struct listener_list *listeners; | ||
| 459 | struct taskstats *stats; | ||
| 452 | struct sk_buff *rep_skb; | 460 | struct sk_buff *rep_skb; |
| 453 | void *reply; | ||
| 454 | size_t size; | 461 | size_t size; |
| 455 | int is_thread_group; | 462 | int is_thread_group; |
| 456 | struct nlattr *na; | ||
| 457 | 463 | ||
| 458 | if (!family_registered) | 464 | if (!family_registered) |
| 459 | return; | 465 | return; |
| @@ -464,7 +470,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | |||
| 464 | size = nla_total_size(sizeof(u32)) + | 470 | size = nla_total_size(sizeof(u32)) + |
| 465 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); | 471 | nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); |
| 466 | 472 | ||
| 467 | is_thread_group = (tsk->signal->stats != NULL); | 473 | is_thread_group = !!taskstats_tgid_alloc(tsk); |
| 468 | if (is_thread_group) { | 474 | if (is_thread_group) { |
| 469 | /* PID + STATS + TGID + STATS */ | 475 | /* PID + STATS + TGID + STATS */ |
| 470 | size = 2 * size; | 476 | size = 2 * size; |
| @@ -472,49 +478,39 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, | |||
| 472 | fill_tgid_exit(tsk); | 478 | fill_tgid_exit(tsk); |
| 473 | } | 479 | } |
| 474 | 480 | ||
| 475 | if (!tidstats) | 481 | listeners = &__raw_get_cpu_var(listener_array); |
| 482 | if (list_empty(&listeners->list)) | ||
| 476 | return; | 483 | return; |
| 477 | 484 | ||
| 478 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); | 485 | rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); |
| 479 | if (rc < 0) | ||
| 480 | goto ret; | ||
| 481 | |||
| 482 | rc = fill_pid(tsk->pid, tsk, tidstats); | ||
| 483 | if (rc < 0) | 486 | if (rc < 0) |
| 484 | goto err_skb; | 487 | return; |
| 485 | 488 | ||
| 486 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); | 489 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); |
| 487 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); | 490 | if (!stats) |
| 488 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 491 | goto err; |
| 489 | *tidstats); | ||
| 490 | nla_nest_end(rep_skb, na); | ||
| 491 | 492 | ||
| 492 | if (!is_thread_group) | 493 | rc = fill_pid(tsk->pid, tsk, stats); |
| 493 | goto send; | 494 | if (rc < 0) |
| 495 | goto err; | ||
| 494 | 496 | ||
| 495 | /* | 497 | /* |
| 496 | * Doesn't matter if tsk is the leader or the last group member leaving | 498 | * Doesn't matter if tsk is the leader or the last group member leaving |
| 497 | */ | 499 | */ |
| 498 | if (!group_dead) | 500 | if (!is_thread_group || !group_dead) |
| 499 | goto send; | 501 | goto send; |
| 500 | 502 | ||
| 501 | na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); | 503 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); |
| 502 | NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); | 504 | if (!stats) |
| 503 | /* No locking needed for tsk->signal->stats since group is dead */ | 505 | goto err; |
| 504 | NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, | 506 | |
| 505 | *tsk->signal->stats); | 507 | memcpy(stats, tsk->signal->stats, sizeof(*stats)); |
| 506 | nla_nest_end(rep_skb, na); | ||
| 507 | 508 | ||
| 508 | send: | 509 | send: |
| 509 | send_cpu_listeners(rep_skb, mycpu); | 510 | send_cpu_listeners(rep_skb, listeners); |
| 510 | return; | 511 | return; |
| 511 | 512 | err: | |
| 512 | nla_put_failure: | ||
| 513 | genlmsg_cancel(rep_skb, reply); | ||
| 514 | err_skb: | ||
| 515 | nlmsg_free(rep_skb); | 513 | nlmsg_free(rep_skb); |
| 516 | ret: | ||
| 517 | return; | ||
| 518 | } | 514 | } |
| 519 | 515 | ||
| 520 | static struct genl_ops taskstats_ops = { | 516 | static struct genl_ops taskstats_ops = { |
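The new taskstats_tgid_alloc() above is a race-safe lazy initialization: the per-thread-group stats are allocated outside the lock, published under the sighand siglock only if no other exiting thread got there first, and the redundant allocation is freed otherwise. A minimal userspace sketch of the same idiom (a pthread mutex standing in for siglock, all names invented):

```c
/*
 * Minimal userspace sketch (not kernel code) of the idiom used by
 * taskstats_tgid_alloc(): allocate outside the lock, publish the
 * pointer under the lock only if nobody raced us, and free the
 * extra copy otherwise.  All names here are illustrative.
 */
#include <pthread.h>
#include <stdlib.h>

struct group_stats { long counters[8]; };

struct taskgroup {
	pthread_mutex_t lock;		/* stand-in for ->siglock */
	struct group_stats *stats;	/* NULL until first exiting member */
};

static struct group_stats *group_stats_alloc(struct taskgroup *grp)
{
	struct group_stats *stats;

	if (grp->stats)			/* fast path: already installed */
		return grp->stats;

	/* No problem if the allocation fails; callers cope with NULL. */
	stats = calloc(1, sizeof(*stats));

	pthread_mutex_lock(&grp->lock);
	if (!grp->stats) {		/* we won the race: publish it */
		grp->stats = stats;
		stats = NULL;
	}
	pthread_mutex_unlock(&grp->lock);

	free(stats);			/* loser drops its redundant copy */
	return grp->stats;
}

int main(void)
{
	struct taskgroup grp = { PTHREAD_MUTEX_INITIALIZER, NULL };

	return group_stats_alloc(&grp) ? 0 : 1;
}
```

Keeping the allocation outside the locked region means the lock is only ever held for a pointer test and a store, while every caller still ends up agreeing on a single stats object.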
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca5939bd9..22504afc0d34 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) | |||
| 156 | /* check if clocksource is already registered */ | 156 | /* check if clocksource is already registered */ |
| 157 | if (is_registered_source(c)) { | 157 | if (is_registered_source(c)) { |
| 158 | printk("register_clocksource: Cannot register %s. " | 158 | printk("register_clocksource: Cannot register %s. " |
| 159 | "Already registered!", c->name); | 159 | "Already registered!", c->name); |
| 160 | ret = -EBUSY; | 160 | ret = -EBUSY; |
| 161 | } else { | 161 | } else { |
| 162 | /* register it */ | 162 | /* register it */ |
| @@ -186,6 +186,7 @@ void clocksource_reselect(void) | |||
| 186 | } | 186 | } |
| 187 | EXPORT_SYMBOL(clocksource_reselect); | 187 | EXPORT_SYMBOL(clocksource_reselect); |
| 188 | 188 | ||
| 189 | #ifdef CONFIG_SYSFS | ||
| 189 | /** | 190 | /** |
| 190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 191 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
| 191 | * @dev: unused | 192 | * @dev: unused |
| @@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | |||
| 275 | * Sysfs setup bits: | 276 | * Sysfs setup bits: |
| 276 | */ | 277 | */ |
| 277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | 278 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, |
| 278 | sysfs_override_clocksource); | 279 | sysfs_override_clocksource); |
| 279 | 280 | ||
| 280 | static SYSDEV_ATTR(available_clocksource, 0600, | 281 | static SYSDEV_ATTR(available_clocksource, 0600, |
| 281 | sysfs_show_available_clocksources, NULL); | 282 | sysfs_show_available_clocksources, NULL); |
| 282 | 283 | ||
| 283 | static struct sysdev_class clocksource_sysclass = { | 284 | static struct sysdev_class clocksource_sysclass = { |
| 284 | set_kset_name("clocksource"), | 285 | set_kset_name("clocksource"), |
| @@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) | |||
| 307 | } | 308 | } |
| 308 | 309 | ||
| 309 | device_initcall(init_clocksource_sysfs); | 310 | device_initcall(init_clocksource_sysfs); |
| 311 | #endif /* CONFIG_SYSFS */ | ||
| 310 | 312 | ||
| 311 | /** | 313 | /** |
| 312 | * boot_override_clocksource - boot clock override | 314 | * boot_override_clocksource - boot clock override |
diff --git a/kernel/timer.c b/kernel/timer.c index c1c7fbcffec1..c2a8ccfc2882 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases; | |||
| 80 | EXPORT_SYMBOL(boot_tvec_bases); | 80 | EXPORT_SYMBOL(boot_tvec_bases); |
| 81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; |
| 82 | 82 | ||
| 83 | /** | ||
| 84 | * __round_jiffies - function to round jiffies to a full second | ||
| 85 | * @j: the time in (absolute) jiffies that should be rounded | ||
| 86 | * @cpu: the processor number on which the timeout will happen | ||
| 87 | * | ||
| 88 | * __round_jiffies rounds an absolute time in the future (in jiffies) | ||
| 89 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 90 | * for which the exact time they fire does not matter too much, as long as | ||
| 91 | * they fire approximately every X seconds. | ||
| 92 | * | ||
| 93 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 94 | * at the same time, rather than at various times spread out. The goal | ||
| 95 | * of this is to have the CPU wake up less, which saves power. | ||
| 96 | * | ||
| 97 | * The exact rounding is skewed for each processor to avoid all | ||
| 98 | * processors firing at the exact same time, which could lead | ||
| 99 | * to lock contention or spurious cache line bouncing. | ||
| 100 | * | ||
| 101 | * The return value is the rounded version of the "j" parameter. | ||
| 102 | */ | ||
| 103 | unsigned long __round_jiffies(unsigned long j, int cpu) | ||
| 104 | { | ||
| 105 | int rem; | ||
| 106 | unsigned long original = j; | ||
| 107 | |||
| 108 | /* | ||
| 109 | * We don't want all cpus firing their timers at once hitting the | ||
| 110 | * same lock or cachelines, so we skew each extra cpu with an extra | ||
| 111 | * 3 jiffies. This 3 jiffies came originally from the mm/ code which | ||
| 112 | * already did this. | ||
| 113 | * The skew is done by adding 3*cpunr, then round, then subtract this | ||
| 114 | * extra offset again. | ||
| 115 | */ | ||
| 116 | j += cpu * 3; | ||
| 117 | |||
| 118 | rem = j % HZ; | ||
| 119 | |||
| 120 | /* | ||
| 121 | * If the target jiffie is just after a whole second (which can happen | ||
| 122 | * due to delays of the timer irq, long irq off times etc etc) then | ||
| 123 | * we should round down to the whole second, not up. Use 1/4th second | ||
| 124 | * as cutoff for this rounding as an extreme upper bound for this. | ||
| 125 | */ | ||
| 126 | if (rem < HZ/4) /* round down */ | ||
| 127 | j = j - rem; | ||
| 128 | else /* round up */ | ||
| 129 | j = j - rem + HZ; | ||
| 130 | |||
| 131 | /* now that we have rounded, subtract the extra skew again */ | ||
| 132 | j -= cpu * 3; | ||
| 133 | |||
| 134 | if (j <= jiffies) /* rounding ate our timeout entirely; */ | ||
| 135 | return original; | ||
| 136 | return j; | ||
| 137 | } | ||
| 138 | EXPORT_SYMBOL_GPL(__round_jiffies); | ||
| 139 | |||
| 140 | /** | ||
| 141 | * __round_jiffies_relative - function to round jiffies to a full second | ||
| 142 | * @j: the time in (relative) jiffies that should be rounded | ||
| 143 | * @cpu: the processor number on which the timeout will happen | ||
| 144 | * | ||
| 145 | * __round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
| 146 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 147 | * for which the exact time they fire does not matter too much, as long as | ||
| 148 | * they fire approximately every X seconds. | ||
| 149 | * | ||
| 150 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 151 | * at the same time, rather than at various times spread out. The goal | ||
| 152 | * of this is to have the CPU wake up less, which saves power. | ||
| 153 | * | ||
| 154 | * The exact rounding is skewed for each processor to avoid all | ||
| 155 | * processors firing at the exact same time, which could lead | ||
| 156 | * to lock contention or spurious cache line bouncing. | ||
| 157 | * | ||
| 158 | * The return value is the rounded version of the "j" parameter. | ||
| 159 | */ | ||
| 160 | unsigned long __round_jiffies_relative(unsigned long j, int cpu) | ||
| 161 | { | ||
| 162 | /* | ||
| 163 | * In theory the following code can skip a jiffy in case jiffies | ||
| 164 | * increments right between the addition and the later subtraction. | ||
| 165 | * However since the entire point of this function is to use approximate | ||
| 166 | * timeouts, it's entirely ok to not handle that. | ||
| 167 | */ | ||
| 168 | return __round_jiffies(j + jiffies, cpu) - jiffies; | ||
| 169 | } | ||
| 170 | EXPORT_SYMBOL_GPL(__round_jiffies_relative); | ||
| 171 | |||
| 172 | /** | ||
| 173 | * round_jiffies - function to round jiffies to a full second | ||
| 174 | * @j: the time in (absolute) jiffies that should be rounded | ||
| 175 | * | ||
| 176 | * round_jiffies rounds an absolute time in the future (in jiffies) | ||
| 177 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 178 | * for which the exact time they fire does not matter too much, as long as | ||
| 179 | * they fire approximately every X seconds. | ||
| 180 | * | ||
| 181 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 182 | * at the same time, rather than at various times spread out. The goal | ||
| 183 | * of this is to have the CPU wake up less, which saves power. | ||
| 184 | * | ||
| 185 | * The return value is the rounded version of the "j" parameter. | ||
| 186 | */ | ||
| 187 | unsigned long round_jiffies(unsigned long j) | ||
| 188 | { | ||
| 189 | return __round_jiffies(j, raw_smp_processor_id()); | ||
| 190 | } | ||
| 191 | EXPORT_SYMBOL_GPL(round_jiffies); | ||
| 192 | |||
| 193 | /** | ||
| 194 | * round_jiffies_relative - function to round jiffies to a full second | ||
| 195 | * @j: the time in (relative) jiffies that should be rounded | ||
| 196 | * | ||
| 197 | * round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
| 198 | * up or down to (approximately) full seconds. This is useful for timers | ||
| 199 | * for which the exact time they fire does not matter too much, as long as | ||
| 200 | * they fire approximately every X seconds. | ||
| 201 | * | ||
| 202 | * By rounding these timers to whole seconds, all such timers will fire | ||
| 203 | * at the same time, rather than at various times spread out. The goal | ||
| 204 | * of this is to have the CPU wake up less, which saves power. | ||
| 205 | * | ||
| 206 | * The return value is the rounded version of the "j" parameter. | ||
| 207 | */ | ||
| 208 | unsigned long round_jiffies_relative(unsigned long j) | ||
| 209 | { | ||
| 210 | return __round_jiffies_relative(j, raw_smp_processor_id()); | ||
| 211 | } | ||
| 212 | EXPORT_SYMBOL_GPL(round_jiffies_relative); | ||
| 213 | |||
| 214 | |||
| 83 | static inline void set_running_timer(tvec_base_t *base, | 215 | static inline void set_running_timer(tvec_base_t *base, |
| 84 | struct timer_list *timer) | 216 | struct timer_list *timer) |
| 85 | { | 217 | { |
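The kernel-doc above explains why timers are rounded to whole seconds and why each CPU gets a small skew; the arithmetic itself is easy to model outside the kernel. Below is a standalone sketch of that arithmetic only (HZ hard-coded to 250 and model_round_jiffies() an invented name, so this is illustrative rather than the kernel function):

```c
/*
 * Standalone model (not kernel code) of the __round_jiffies() math:
 * per-CPU skew of 3 jiffies, round down if just past a second
 * (remainder < HZ/4), otherwise round up to the next second.
 */
#include <stdio.h>

#define HZ 250				/* assumed tick rate for this demo */

static unsigned long model_round_jiffies(unsigned long j, int cpu,
					 unsigned long now)
{
	unsigned long original = j;
	int rem;

	j += cpu * 3;			/* skew each CPU by 3 jiffies */
	rem = j % HZ;
	if (rem < HZ / 4)		/* just past a second: round down */
		j -= rem;
	else				/* otherwise round up */
		j += HZ - rem;
	j -= cpu * 3;			/* undo the skew */

	return j <= now ? original : j;	/* never round into the past */
}

int main(void)
{
	unsigned long now = 10 * HZ;

	/* Two CPUs rounding the same ~1.5 s timeout land 3 jiffies apart. */
	printf("cpu0: %lu\n", model_round_jiffies(now + HZ + HZ / 2, 0, now));
	printf("cpu1: %lu\n", model_round_jiffies(now + HZ + HZ / 2, 1, now));
	return 0;
}
```

Both CPUs land on the same whole second (3000 vs. 2997 jiffies), so their wakeups batch onto one second without firing at the exact same instant and bouncing the same locks and cache lines.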
| @@ -714,7 +846,7 @@ static int change_clocksource(void) | |||
| 714 | clock = new; | 846 | clock = new; |
| 715 | clock->cycle_last = now; | 847 | clock->cycle_last = now; |
| 716 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | 848 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", |
| 717 | clock->name); | 849 | clock->name); |
| 718 | return 1; | 850 | return 1; |
| 719 | } else if (clock->update_callback) { | 851 | } else if (clock->update_callback) { |
| 720 | return clock->update_callback(); | 852 | return clock->update_callback(); |
| @@ -722,7 +854,10 @@ static int change_clocksource(void) | |||
| 722 | return 0; | 854 | return 0; |
| 723 | } | 855 | } |
| 724 | #else | 856 | #else |
| 725 | #define change_clocksource() (0) | 857 | static inline int change_clocksource(void) |
| 858 | { | ||
| 859 | return 0; | ||
| 860 | } | ||
| 726 | #endif | 861 | #endif |
| 727 | 862 | ||
| 728 | /** | 863 | /** |
| @@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device); | |||
| 820 | * If the error is already larger, we look ahead even further | 955 | * If the error is already larger, we look ahead even further |
| 821 | * to compensate for late or lost adjustments. | 956 | * to compensate for late or lost adjustments. |
| 822 | */ | 957 | */ |
| 823 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) | 958 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, |
| 959 | s64 *offset) | ||
| 824 | { | 960 | { |
| 825 | s64 tick_error, i; | 961 | s64 tick_error, i; |
| 826 | u32 look_ahead, adj; | 962 | u32 look_ahead, adj; |
| @@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * | |||
| 844 | * Now calculate the error in (1 << look_ahead) ticks, but first | 980 | * Now calculate the error in (1 << look_ahead) ticks, but first |
| 845 | * remove the single look ahead already included in the error. | 981 | * remove the single look ahead already included in the error. |
| 846 | */ | 982 | */ |
| 847 | tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | 983 | tick_error = current_tick_length() >> |
| 984 | (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
| 848 | tick_error -= clock->xtime_interval >> 1; | 985 | tick_error -= clock->xtime_interval >> 1; |
| 849 | error = ((error - tick_error) >> look_ahead) + tick_error; | 986 | error = ((error - tick_error) >> look_ahead) + tick_error; |
| 850 | 987 | ||
| @@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) | |||
| 896 | clock->mult += adj; | 1033 | clock->mult += adj; |
| 897 | clock->xtime_interval += interval; | 1034 | clock->xtime_interval += interval; |
| 898 | clock->xtime_nsec -= offset; | 1035 | clock->xtime_nsec -= offset; |
| 899 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | 1036 | clock->error -= (interval - offset) << |
| 1037 | (TICK_LENGTH_SHIFT - clock->shift); | ||
| 900 | } | 1038 | } |
| 901 | 1039 | ||
| 902 | /** | 1040 | /** |
| @@ -1008,11 +1146,15 @@ static inline void calc_load(unsigned long ticks) | |||
| 1008 | unsigned long active_tasks; /* fixed-point */ | 1146 | unsigned long active_tasks; /* fixed-point */ |
| 1009 | static int count = LOAD_FREQ; | 1147 | static int count = LOAD_FREQ; |
| 1010 | 1148 | ||
| 1011 | active_tasks = count_active_tasks(); | 1149 | count -= ticks; |
| 1012 | for (count -= ticks; count < 0; count += LOAD_FREQ) { | 1150 | if (unlikely(count < 0)) { |
| 1013 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); | 1151 | active_tasks = count_active_tasks(); |
| 1014 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | 1152 | do { |
| 1015 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | 1153 | CALC_LOAD(avenrun[0], EXP_1, active_tasks); |
| 1154 | CALC_LOAD(avenrun[1], EXP_5, active_tasks); | ||
| 1155 | CALC_LOAD(avenrun[2], EXP_15, active_tasks); | ||
| 1156 | count += LOAD_FREQ; | ||
| 1157 | } while (count < 0); | ||
| 1016 | } | 1158 | } |
| 1017 | } | 1159 | } |
| 1018 | 1160 | ||
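The rewritten calc_load() above now calls count_active_tasks() only once the LOAD_FREQ budget is exhausted rather than on every tick; the averaging itself is the familiar fixed-point exponential decay. The sketch below restates that update in userspace; FSHIFT, FIXED_1 and EXP_1 mirror the kernel's constants, but the driving loop and sample values are invented:

```c
/*
 * Userspace sketch of the fixed-point load-average update that the
 * hunk above reorganizes.  Constants mirror include/linux/sched.h;
 * the sample loop is purely illustrative.
 */
#include <stdio.h>

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* decay factor for the 1-min average */

/* avenrun-style exponentially decaying average */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun = 0;
	int i;

	/* Feed ten 5-second samples with 3 runnable tasks. */
	for (i = 0; i < 10; i++)
		avenrun = calc_load(avenrun, EXP_1, 3 * FIXED_1);

	printf("1-min load: %lu.%02lu\n", avenrun >> FSHIFT,
	       (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}
```

Ten 5-second samples at a steady three runnable tasks bring the 1-minute figure to roughly 1.7 on its way toward 3.00, which is the behaviour visible in /proc/loadavg.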
| @@ -1202,11 +1344,10 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
| 1202 | * should never happen anyway). You just have the printk() | 1344 | * should never happen anyway). You just have the printk() |
| 1203 | * that will tell you if something has gone wrong and where. | 1345 | * that will tell you if something has gone wrong and where. |
| 1204 | */ | 1346 | */ |
| 1205 | if (timeout < 0) | 1347 | if (timeout < 0) { |
| 1206 | { | ||
| 1207 | printk(KERN_ERR "schedule_timeout: wrong timeout " | 1348 | printk(KERN_ERR "schedule_timeout: wrong timeout " |
| 1208 | "value %lx from %p\n", timeout, | 1349 | "value %lx\n", timeout); |
| 1209 | __builtin_return_address(0)); | 1350 | dump_stack(); |
| 1210 | current->state = TASK_RUNNING; | 1351 | current->state = TASK_RUNNING; |
| 1211 | goto out; | 1352 | goto out; |
| 1212 | } | 1353 | } |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 96f77013d3f0..baacc3691415 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
| @@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
| 96 | stats->write_char = p->wchar; | 96 | stats->write_char = p->wchar; |
| 97 | stats->read_syscalls = p->syscr; | 97 | stats->read_syscalls = p->syscr; |
| 98 | stats->write_syscalls = p->syscw; | 98 | stats->write_syscalls = p->syscw; |
| 99 | #ifdef CONFIG_TASK_IO_ACCOUNTING | ||
| 100 | stats->read_bytes = p->ioac.read_bytes; | ||
| 101 | stats->write_bytes = p->ioac.write_bytes; | ||
| 102 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; | ||
| 103 | #else | ||
| 104 | stats->read_bytes = 0; | ||
| 105 | stats->write_bytes = 0; | ||
| 106 | stats->cancelled_write_bytes = 0; | ||
| 107 | #endif | ||
| 99 | } | 108 | } |
| 100 | #undef KB | 109 | #undef KB |
| 101 | #undef MB | 110 | #undef MB |
diff --git a/kernel/unwind.c b/kernel/unwind.c deleted file mode 100644 index ed0a21d4a902..000000000000 --- a/kernel/unwind.c +++ /dev/null | |||
| @@ -1,1182 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2002-2006 Novell, Inc. | ||
| 3 | * Jan Beulich <jbeulich@novell.com> | ||
| 4 | * This code is released under version 2 of the GNU GPL. | ||
| 5 | * | ||
| 6 | * A simple API for unwinding kernel stacks. This is used for | ||
| 7 | * debugging and error reporting purposes. The kernel doesn't need | ||
| 8 | * full-blown stack unwinding with all the bells and whistles, so there | ||
| 9 | * is not much point in implementing the full Dwarf2 unwind API. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/unwind.h> | ||
| 13 | #include <linux/module.h> | ||
| 14 | #include <linux/bootmem.h> | ||
| 15 | #include <linux/sort.h> | ||
| 16 | #include <linux/stop_machine.h> | ||
| 17 | #include <asm/sections.h> | ||
| 18 | #include <asm/uaccess.h> | ||
| 19 | #include <asm/unaligned.h> | ||
| 20 | |||
| 21 | extern char __start_unwind[], __end_unwind[]; | ||
| 22 | extern const u8 __start_unwind_hdr[], __end_unwind_hdr[]; | ||
| 23 | |||
| 24 | #define MAX_STACK_DEPTH 8 | ||
| 25 | |||
| 26 | #define EXTRA_INFO(f) { \ | ||
| 27 | BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ | ||
| 28 | % FIELD_SIZEOF(struct unwind_frame_info, f)) \ | ||
| 29 | + offsetof(struct unwind_frame_info, f) \ | ||
| 30 | / FIELD_SIZEOF(struct unwind_frame_info, f), \ | ||
| 31 | FIELD_SIZEOF(struct unwind_frame_info, f) \ | ||
| 32 | } | ||
| 33 | #define PTREGS_INFO(f) EXTRA_INFO(regs.f) | ||
| 34 | |||
| 35 | static const struct { | ||
| 36 | unsigned offs:BITS_PER_LONG / 2; | ||
| 37 | unsigned width:BITS_PER_LONG / 2; | ||
| 38 | } reg_info[] = { | ||
| 39 | UNW_REGISTER_INFO | ||
| 40 | }; | ||
| 41 | |||
| 42 | #undef PTREGS_INFO | ||
| 43 | #undef EXTRA_INFO | ||
| 44 | |||
| 45 | #ifndef REG_INVALID | ||
| 46 | #define REG_INVALID(r) (reg_info[r].width == 0) | ||
| 47 | #endif | ||
| 48 | |||
| 49 | #define DW_CFA_nop 0x00 | ||
| 50 | #define DW_CFA_set_loc 0x01 | ||
| 51 | #define DW_CFA_advance_loc1 0x02 | ||
| 52 | #define DW_CFA_advance_loc2 0x03 | ||
| 53 | #define DW_CFA_advance_loc4 0x04 | ||
| 54 | #define DW_CFA_offset_extended 0x05 | ||
| 55 | #define DW_CFA_restore_extended 0x06 | ||
| 56 | #define DW_CFA_undefined 0x07 | ||
| 57 | #define DW_CFA_same_value 0x08 | ||
| 58 | #define DW_CFA_register 0x09 | ||
| 59 | #define DW_CFA_remember_state 0x0a | ||
| 60 | #define DW_CFA_restore_state 0x0b | ||
| 61 | #define DW_CFA_def_cfa 0x0c | ||
| 62 | #define DW_CFA_def_cfa_register 0x0d | ||
| 63 | #define DW_CFA_def_cfa_offset 0x0e | ||
| 64 | #define DW_CFA_def_cfa_expression 0x0f | ||
| 65 | #define DW_CFA_expression 0x10 | ||
| 66 | #define DW_CFA_offset_extended_sf 0x11 | ||
| 67 | #define DW_CFA_def_cfa_sf 0x12 | ||
| 68 | #define DW_CFA_def_cfa_offset_sf 0x13 | ||
| 69 | #define DW_CFA_val_offset 0x14 | ||
| 70 | #define DW_CFA_val_offset_sf 0x15 | ||
| 71 | #define DW_CFA_val_expression 0x16 | ||
| 72 | #define DW_CFA_lo_user 0x1c | ||
| 73 | #define DW_CFA_GNU_window_save 0x2d | ||
| 74 | #define DW_CFA_GNU_args_size 0x2e | ||
| 75 | #define DW_CFA_GNU_negative_offset_extended 0x2f | ||
| 76 | #define DW_CFA_hi_user 0x3f | ||
| 77 | |||
| 78 | #define DW_EH_PE_FORM 0x07 | ||
| 79 | #define DW_EH_PE_native 0x00 | ||
| 80 | #define DW_EH_PE_leb128 0x01 | ||
| 81 | #define DW_EH_PE_data2 0x02 | ||
| 82 | #define DW_EH_PE_data4 0x03 | ||
| 83 | #define DW_EH_PE_data8 0x04 | ||
| 84 | #define DW_EH_PE_signed 0x08 | ||
| 85 | #define DW_EH_PE_ADJUST 0x70 | ||
| 86 | #define DW_EH_PE_abs 0x00 | ||
| 87 | #define DW_EH_PE_pcrel 0x10 | ||
| 88 | #define DW_EH_PE_textrel 0x20 | ||
| 89 | #define DW_EH_PE_datarel 0x30 | ||
| 90 | #define DW_EH_PE_funcrel 0x40 | ||
| 91 | #define DW_EH_PE_aligned 0x50 | ||
| 92 | #define DW_EH_PE_indirect 0x80 | ||
| 93 | #define DW_EH_PE_omit 0xff | ||
| 94 | |||
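The DW_EH_PE_* values above describe how a single pointer-encoding byte is split: the low bits select the value format, the 0x70 bits select how the value is applied (absolute, pc-relative, and so on), and bit 7 requests an extra indirection. A toy decode of one common encoding byte, purely for illustration:

```c
/*
 * Illustration of how a DW_EH_PE encoding byte splits into a value
 * format, a signedness flag, an application rule and an indirection
 * bit, using the same masks as the definitions above.
 */
#include <stdio.h>

#define DW_EH_PE_FORM		0x07
#define DW_EH_PE_signed		0x08
#define DW_EH_PE_ADJUST		0x70
#define DW_EH_PE_indirect	0x80

int main(void)
{
	/* DW_EH_PE_pcrel | DW_EH_PE_signed | DW_EH_PE_data4 */
	unsigned char enc = 0x1b;

	printf("form:     0x%02x\n", enc & DW_EH_PE_FORM);	/* 0x03: data4 */
	printf("signed:   %s\n", (enc & DW_EH_PE_signed) ? "yes" : "no");
	printf("adjust:   0x%02x\n", enc & DW_EH_PE_ADJUST);	/* 0x10: pcrel */
	printf("indirect: %s\n", (enc & DW_EH_PE_indirect) ? "yes" : "no");
	return 0;
}
```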
| 95 | typedef unsigned long uleb128_t; | ||
| 96 | typedef signed long sleb128_t; | ||
| 97 | |||
| 98 | static struct unwind_table { | ||
| 99 | struct { | ||
| 100 | unsigned long pc; | ||
| 101 | unsigned long range; | ||
| 102 | } core, init; | ||
| 103 | const void *address; | ||
| 104 | unsigned long size; | ||
| 105 | const unsigned char *header; | ||
| 106 | unsigned long hdrsz; | ||
| 107 | struct unwind_table *link; | ||
| 108 | const char *name; | ||
| 109 | } root_table; | ||
| 110 | |||
| 111 | struct unwind_item { | ||
| 112 | enum item_location { | ||
| 113 | Nowhere, | ||
| 114 | Memory, | ||
| 115 | Register, | ||
| 116 | Value | ||
| 117 | } where; | ||
| 118 | uleb128_t value; | ||
| 119 | }; | ||
| 120 | |||
| 121 | struct unwind_state { | ||
| 122 | uleb128_t loc, org; | ||
| 123 | const u8 *cieStart, *cieEnd; | ||
| 124 | uleb128_t codeAlign; | ||
| 125 | sleb128_t dataAlign; | ||
| 126 | struct cfa { | ||
| 127 | uleb128_t reg, offs; | ||
| 128 | } cfa; | ||
| 129 | struct unwind_item regs[ARRAY_SIZE(reg_info)]; | ||
| 130 | unsigned stackDepth:8; | ||
| 131 | unsigned version:8; | ||
| 132 | const u8 *label; | ||
| 133 | const u8 *stack[MAX_STACK_DEPTH]; | ||
| 134 | }; | ||
| 135 | |||
| 136 | static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; | ||
| 137 | |||
| 138 | static struct unwind_table *find_table(unsigned long pc) | ||
| 139 | { | ||
| 140 | struct unwind_table *table; | ||
| 141 | |||
| 142 | for (table = &root_table; table; table = table->link) | ||
| 143 | if ((pc >= table->core.pc | ||
| 144 | && pc < table->core.pc + table->core.range) | ||
| 145 | || (pc >= table->init.pc | ||
| 146 | && pc < table->init.pc + table->init.range)) | ||
| 147 | break; | ||
| 148 | |||
| 149 | return table; | ||
| 150 | } | ||
| 151 | |||
| 152 | static unsigned long read_pointer(const u8 **pLoc, | ||
| 153 | const void *end, | ||
| 154 | signed ptrType); | ||
| 155 | |||
| 156 | static void init_unwind_table(struct unwind_table *table, | ||
| 157 | const char *name, | ||
| 158 | const void *core_start, | ||
| 159 | unsigned long core_size, | ||
| 160 | const void *init_start, | ||
| 161 | unsigned long init_size, | ||
| 162 | const void *table_start, | ||
| 163 | unsigned long table_size, | ||
| 164 | const u8 *header_start, | ||
| 165 | unsigned long header_size) | ||
| 166 | { | ||
| 167 | const u8 *ptr = header_start + 4; | ||
| 168 | const u8 *end = header_start + header_size; | ||
| 169 | |||
| 170 | table->core.pc = (unsigned long)core_start; | ||
| 171 | table->core.range = core_size; | ||
| 172 | table->init.pc = (unsigned long)init_start; | ||
| 173 | table->init.range = init_size; | ||
| 174 | table->address = table_start; | ||
| 175 | table->size = table_size; | ||
| 176 | /* See if the linker provided table looks valid. */ | ||
| 177 | if (header_size <= 4 | ||
| 178 | || header_start[0] != 1 | ||
| 179 | || (void *)read_pointer(&ptr, end, header_start[1]) != table_start | ||
| 180 | || header_start[2] == DW_EH_PE_omit | ||
| 181 | || read_pointer(&ptr, end, header_start[2]) <= 0 | ||
| 182 | || header_start[3] == DW_EH_PE_omit) | ||
| 183 | header_start = NULL; | ||
| 184 | table->hdrsz = header_size; | ||
| 185 | smp_wmb(); | ||
| 186 | table->header = header_start; | ||
| 187 | table->link = NULL; | ||
| 188 | table->name = name; | ||
| 189 | } | ||
| 190 | |||
| 191 | void __init unwind_init(void) | ||
| 192 | { | ||
| 193 | init_unwind_table(&root_table, "kernel", | ||
| 194 | _text, _end - _text, | ||
| 195 | NULL, 0, | ||
| 196 | __start_unwind, __end_unwind - __start_unwind, | ||
| 197 | __start_unwind_hdr, __end_unwind_hdr - __start_unwind_hdr); | ||
| 198 | } | ||
| 199 | |||
| 200 | static const u32 bad_cie, not_fde; | ||
| 201 | static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *); | ||
| 202 | static signed fde_pointer_type(const u32 *cie); | ||
| 203 | |||
| 204 | struct eh_frame_hdr_table_entry { | ||
| 205 | unsigned long start, fde; | ||
| 206 | }; | ||
| 207 | |||
| 208 | static int cmp_eh_frame_hdr_table_entries(const void *p1, const void *p2) | ||
| 209 | { | ||
| 210 | const struct eh_frame_hdr_table_entry *e1 = p1; | ||
| 211 | const struct eh_frame_hdr_table_entry *e2 = p2; | ||
| 212 | |||
| 213 | return (e1->start > e2->start) - (e1->start < e2->start); | ||
| 214 | } | ||
| 215 | |||
| 216 | static void swap_eh_frame_hdr_table_entries(void *p1, void *p2, int size) | ||
| 217 | { | ||
| 218 | struct eh_frame_hdr_table_entry *e1 = p1; | ||
| 219 | struct eh_frame_hdr_table_entry *e2 = p2; | ||
| 220 | unsigned long v; | ||
| 221 | |||
| 222 | v = e1->start; | ||
| 223 | e1->start = e2->start; | ||
| 224 | e2->start = v; | ||
| 225 | v = e1->fde; | ||
| 226 | e1->fde = e2->fde; | ||
| 227 | e2->fde = v; | ||
| 228 | } | ||
| 229 | |||
| 230 | static void __init setup_unwind_table(struct unwind_table *table, | ||
| 231 | void *(*alloc)(unsigned long)) | ||
| 232 | { | ||
| 233 | const u8 *ptr; | ||
| 234 | unsigned long tableSize = table->size, hdrSize; | ||
| 235 | unsigned n; | ||
| 236 | const u32 *fde; | ||
| 237 | struct { | ||
| 238 | u8 version; | ||
| 239 | u8 eh_frame_ptr_enc; | ||
| 240 | u8 fde_count_enc; | ||
| 241 | u8 table_enc; | ||
| 242 | unsigned long eh_frame_ptr; | ||
| 243 | unsigned int fde_count; | ||
| 244 | struct eh_frame_hdr_table_entry table[]; | ||
| 245 | } __attribute__((__packed__)) *header; | ||
| 246 | |||
| 247 | if (table->header) | ||
| 248 | return; | ||
| 249 | |||
| 250 | if (table->hdrsz) | ||
| 251 | printk(KERN_WARNING ".eh_frame_hdr for '%s' present but unusable\n", | ||
| 252 | table->name); | ||
| 253 | |||
| 254 | if (tableSize & (sizeof(*fde) - 1)) | ||
| 255 | return; | ||
| 256 | |||
| 257 | for (fde = table->address, n = 0; | ||
| 258 | tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; | ||
| 259 | tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { | ||
| 260 | const u32 *cie = cie_for_fde(fde, table); | ||
| 261 | signed ptrType; | ||
| 262 | |||
| 263 | if (cie == ¬_fde) | ||
| 264 | continue; | ||
| 265 | if (cie == NULL | ||
| 266 | || cie == &bad_cie | ||
| 267 | || (ptrType = fde_pointer_type(cie)) < 0) | ||
| 268 | return; | ||
| 269 | ptr = (const u8 *)(fde + 2); | ||
| 270 | if (!read_pointer(&ptr, | ||
| 271 | (const u8 *)(fde + 1) + *fde, | ||
| 272 | ptrType)) | ||
| 273 | return; | ||
| 274 | ++n; | ||
| 275 | } | ||
| 276 | |||
| 277 | if (tableSize || !n) | ||
| 278 | return; | ||
| 279 | |||
| 280 | hdrSize = 4 + sizeof(unsigned long) + sizeof(unsigned int) | ||
| 281 | + 2 * n * sizeof(unsigned long); | ||
| 282 | header = alloc(hdrSize); | ||
| 283 | if (!header) | ||
| 284 | return; | ||
| 285 | header->version = 1; | ||
| 286 | header->eh_frame_ptr_enc = DW_EH_PE_abs|DW_EH_PE_native; | ||
| 287 | header->fde_count_enc = DW_EH_PE_abs|DW_EH_PE_data4; | ||
| 288 | header->table_enc = DW_EH_PE_abs|DW_EH_PE_native; | ||
| 289 | put_unaligned((unsigned long)table->address, &header->eh_frame_ptr); | ||
| 290 | BUILD_BUG_ON(offsetof(typeof(*header), fde_count) | ||
| 291 | % __alignof(typeof(header->fde_count))); | ||
| 292 | header->fde_count = n; | ||
| 293 | |||
| 294 | BUILD_BUG_ON(offsetof(typeof(*header), table) | ||
| 295 | % __alignof(typeof(*header->table))); | ||
| 296 | for (fde = table->address, tableSize = table->size, n = 0; | ||
| 297 | tableSize; | ||
| 298 | tableSize -= sizeof(*fde) + *fde, fde += 1 + *fde / sizeof(*fde)) { | ||
| 299 | const u32 *cie = fde + 1 - fde[1] / sizeof(*fde); | ||
| 300 | |||
| 301 | if (!fde[1]) | ||
| 302 | continue; /* this is a CIE */ | ||
| 303 | ptr = (const u8 *)(fde + 2); | ||
| 304 | header->table[n].start = read_pointer(&ptr, | ||
| 305 | (const u8 *)(fde + 1) + *fde, | ||
| 306 | fde_pointer_type(cie)); | ||
| 307 | header->table[n].fde = (unsigned long)fde; | ||
| 308 | ++n; | ||
| 309 | } | ||
| 310 | WARN_ON(n != header->fde_count); | ||
| 311 | |||
| 312 | sort(header->table, | ||
| 313 | n, | ||
| 314 | sizeof(*header->table), | ||
| 315 | cmp_eh_frame_hdr_table_entries, | ||
| 316 | swap_eh_frame_hdr_table_entries); | ||
| 317 | |||
| 318 | table->hdrsz = hdrSize; | ||
| 319 | smp_wmb(); | ||
| 320 | table->header = (const void *)header; | ||
| 321 | } | ||
| 322 | |||
| 323 | static void *__init balloc(unsigned long sz) | ||
| 324 | { | ||
| 325 | return __alloc_bootmem_nopanic(sz, | ||
| 326 | sizeof(unsigned int), | ||
| 327 | __pa(MAX_DMA_ADDRESS)); | ||
| 328 | } | ||
| 329 | |||
| 330 | void __init unwind_setup(void) | ||
| 331 | { | ||
| 332 | setup_unwind_table(&root_table, balloc); | ||
| 333 | } | ||
| 334 | |||
| 335 | #ifdef CONFIG_MODULES | ||
| 336 | |||
| 337 | static struct unwind_table *last_table; | ||
| 338 | |||
| 339 | /* Must be called with module_mutex held. */ | ||
| 340 | void *unwind_add_table(struct module *module, | ||
| 341 | const void *table_start, | ||
| 342 | unsigned long table_size) | ||
| 343 | { | ||
| 344 | struct unwind_table *table; | ||
| 345 | |||
| 346 | if (table_size <= 0) | ||
| 347 | return NULL; | ||
| 348 | |||
| 349 | table = kmalloc(sizeof(*table), GFP_KERNEL); | ||
| 350 | if (!table) | ||
| 351 | return NULL; | ||
| 352 | |||
| 353 | init_unwind_table(table, module->name, | ||
| 354 | module->module_core, module->core_size, | ||
| 355 | module->module_init, module->init_size, | ||
| 356 | table_start, table_size, | ||
| 357 | NULL, 0); | ||
| 358 | |||
| 359 | if (last_table) | ||
| 360 | last_table->link = table; | ||
| 361 | else | ||
| 362 | root_table.link = table; | ||
| 363 | last_table = table; | ||
| 364 | |||
| 365 | return table; | ||
| 366 | } | ||
| 367 | |||
| 368 | struct unlink_table_info | ||
| 369 | { | ||
| 370 | struct unwind_table *table; | ||
| 371 | int init_only; | ||
| 372 | }; | ||
| 373 | |||
| 374 | static int unlink_table(void *arg) | ||
| 375 | { | ||
| 376 | struct unlink_table_info *info = arg; | ||
| 377 | struct unwind_table *table = info->table, *prev; | ||
| 378 | |||
| 379 | for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) | ||
| 380 | ; | ||
| 381 | |||
| 382 | if (prev->link) { | ||
| 383 | if (info->init_only) { | ||
| 384 | table->init.pc = 0; | ||
| 385 | table->init.range = 0; | ||
| 386 | info->table = NULL; | ||
| 387 | } else { | ||
| 388 | prev->link = table->link; | ||
| 389 | if (!prev->link) | ||
| 390 | last_table = prev; | ||
| 391 | } | ||
| 392 | } else | ||
| 393 | info->table = NULL; | ||
| 394 | |||
| 395 | return 0; | ||
| 396 | } | ||
| 397 | |||
| 398 | /* Must be called with module_mutex held. */ | ||
| 399 | void unwind_remove_table(void *handle, int init_only) | ||
| 400 | { | ||
| 401 | struct unwind_table *table = handle; | ||
| 402 | struct unlink_table_info info; | ||
| 403 | |||
| 404 | if (!table || table == &root_table) | ||
| 405 | return; | ||
| 406 | |||
| 407 | if (init_only && table == last_table) { | ||
| 408 | table->init.pc = 0; | ||
| 409 | table->init.range = 0; | ||
| 410 | return; | ||
| 411 | } | ||
| 412 | |||
| 413 | info.table = table; | ||
| 414 | info.init_only = init_only; | ||
| 415 | stop_machine_run(unlink_table, &info, NR_CPUS); | ||
| 416 | |||
| 417 | if (info.table) | ||
| 418 | kfree(table); | ||
| 419 | } | ||
| 420 | |||
| 421 | #endif /* CONFIG_MODULES */ | ||
| 422 | |||
| 423 | static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) | ||
| 424 | { | ||
| 425 | const u8 *cur = *pcur; | ||
| 426 | uleb128_t value; | ||
| 427 | unsigned shift; | ||
| 428 | |||
| 429 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
| 430 | if (shift + 7 > 8 * sizeof(value) | ||
| 431 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
| 432 | cur = end + 1; | ||
| 433 | break; | ||
| 434 | } | ||
| 435 | value |= (uleb128_t)(*cur & 0x7f) << shift; | ||
| 436 | if (!(*cur++ & 0x80)) | ||
| 437 | break; | ||
| 438 | } | ||
| 439 | *pcur = cur; | ||
| 440 | |||
| 441 | return value; | ||
| 442 | } | ||
| 443 | |||
| 444 | static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) | ||
| 445 | { | ||
| 446 | const u8 *cur = *pcur; | ||
| 447 | sleb128_t value; | ||
| 448 | unsigned shift; | ||
| 449 | |||
| 450 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
| 451 | if (shift + 7 > 8 * sizeof(value) | ||
| 452 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
| 453 | cur = end + 1; | ||
| 454 | break; | ||
| 455 | } | ||
| 456 | value |= (sleb128_t)(*cur & 0x7f) << shift; | ||
| 457 | if (!(*cur & 0x80)) { | ||
| 458 | value |= -(*cur++ & 0x40) << shift; | ||
| 459 | break; | ||
| 460 | } | ||
| 461 | } | ||
| 462 | *pcur = cur; | ||
| 463 | |||
| 464 | return value; | ||
| 465 | } | ||
| 466 | |||
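get_uleb128() and get_sleb128() above decode DWARF's variable-length integers: seven payload bits per byte, the top bit as a continuation flag, and sign extension from bit 6 of the last byte for the signed form. A self-contained restatement without the bounds checking, fed the worked examples from the DWARF specification:

```c
/*
 * Simplified ULEB128/SLEB128 decoders (no bounds checks), fed the
 * classic example byte sequences from the DWARF specification.
 */
#include <stdio.h>

static unsigned long uleb128(const unsigned char **p)
{
	unsigned long value = 0;
	unsigned shift = 0;
	unsigned char byte;

	do {
		byte = *(*p)++;
		value |= (unsigned long)(byte & 0x7f) << shift;
		shift += 7;
	} while (byte & 0x80);		/* high bit set => more bytes follow */
	return value;
}

static long sleb128(const unsigned char **p)
{
	long value = 0;
	unsigned shift = 0;
	unsigned char byte;

	do {
		byte = *(*p)++;
		value |= (long)(byte & 0x7f) << shift;
		shift += 7;
	} while (byte & 0x80);
	if (shift < 8 * sizeof(value) && (byte & 0x40))
		value |= -(1L << shift);	/* sign-extend from bit 6 */
	return value;
}

int main(void)
{
	const unsigned char u[] = { 0xe5, 0x8e, 0x26 };	/* encodes 624485 */
	const unsigned char s[] = { 0x9b, 0xf1, 0x59 };	/* encodes -624485 */
	const unsigned char *p = u, *q = s;

	printf("uleb128: %lu\n", uleb128(&p));
	printf("sleb128: %ld\n", sleb128(&q));
	return 0;
}
```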
| 467 | static const u32 *cie_for_fde(const u32 *fde, const struct unwind_table *table) | ||
| 468 | { | ||
| 469 | const u32 *cie; | ||
| 470 | |||
| 471 | if (!*fde || (*fde & (sizeof(*fde) - 1))) | ||
| 472 | return &bad_cie; | ||
| 473 | if (!fde[1]) | ||
| 474 | return ¬_fde; /* this is a CIE */ | ||
| 475 | if ((fde[1] & (sizeof(*fde) - 1)) | ||
| 476 | || fde[1] > (unsigned long)(fde + 1) - (unsigned long)table->address) | ||
| 477 | return NULL; /* this is not a valid FDE */ | ||
| 478 | cie = fde + 1 - fde[1] / sizeof(*fde); | ||
| 479 | if (*cie <= sizeof(*cie) + 4 | ||
| 480 | || *cie >= fde[1] - sizeof(*fde) | ||
| 481 | || (*cie & (sizeof(*cie) - 1)) | ||
| 482 | || cie[1]) | ||
| 483 | return NULL; /* this is not a (valid) CIE */ | ||
| 484 | return cie; | ||
| 485 | } | ||
| 486 | |||
| 487 | static unsigned long read_pointer(const u8 **pLoc, | ||
| 488 | const void *end, | ||
| 489 | signed ptrType) | ||
| 490 | { | ||
| 491 | unsigned long value = 0; | ||
| 492 | union { | ||
| 493 | const u8 *p8; | ||
| 494 | const u16 *p16u; | ||
| 495 | const s16 *p16s; | ||
| 496 | const u32 *p32u; | ||
| 497 | const s32 *p32s; | ||
| 498 | const unsigned long *pul; | ||
| 499 | } ptr; | ||
| 500 | |||
| 501 | if (ptrType < 0 || ptrType == DW_EH_PE_omit) | ||
| 502 | return 0; | ||
| 503 | ptr.p8 = *pLoc; | ||
| 504 | switch(ptrType & DW_EH_PE_FORM) { | ||
| 505 | case DW_EH_PE_data2: | ||
| 506 | if (end < (const void *)(ptr.p16u + 1)) | ||
| 507 | return 0; | ||
| 508 | if(ptrType & DW_EH_PE_signed) | ||
| 509 | value = get_unaligned(ptr.p16s++); | ||
| 510 | else | ||
| 511 | value = get_unaligned(ptr.p16u++); | ||
| 512 | break; | ||
| 513 | case DW_EH_PE_data4: | ||
| 514 | #ifdef CONFIG_64BIT | ||
| 515 | if (end < (const void *)(ptr.p32u + 1)) | ||
| 516 | return 0; | ||
| 517 | if(ptrType & DW_EH_PE_signed) | ||
| 518 | value = get_unaligned(ptr.p32s++); | ||
| 519 | else | ||
| 520 | value = get_unaligned(ptr.p32u++); | ||
| 521 | break; | ||
| 522 | case DW_EH_PE_data8: | ||
| 523 | BUILD_BUG_ON(sizeof(u64) != sizeof(value)); | ||
| 524 | #else | ||
| 525 | BUILD_BUG_ON(sizeof(u32) != sizeof(value)); | ||
| 526 | #endif | ||
| 527 | case DW_EH_PE_native: | ||
| 528 | if (end < (const void *)(ptr.pul + 1)) | ||
| 529 | return 0; | ||
| 530 | value = get_unaligned(ptr.pul++); | ||
| 531 | break; | ||
| 532 | case DW_EH_PE_leb128: | ||
| 533 | BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); | ||
| 534 | value = ptrType & DW_EH_PE_signed | ||
| 535 | ? get_sleb128(&ptr.p8, end) | ||
| 536 | : get_uleb128(&ptr.p8, end); | ||
| 537 | if ((const void *)ptr.p8 > end) | ||
| 538 | return 0; | ||
| 539 | break; | ||
| 540 | default: | ||
| 541 | return 0; | ||
| 542 | } | ||
| 543 | switch(ptrType & DW_EH_PE_ADJUST) { | ||
| 544 | case DW_EH_PE_abs: | ||
| 545 | break; | ||
| 546 | case DW_EH_PE_pcrel: | ||
| 547 | value += (unsigned long)*pLoc; | ||
| 548 | break; | ||
| 549 | default: | ||
| 550 | return 0; | ||
| 551 | } | ||
| 552 | if ((ptrType & DW_EH_PE_indirect) | ||
| 553 | && __get_user(value, (unsigned long *)value)) | ||
| 554 | return 0; | ||
| 555 | *pLoc = ptr.p8; | ||
| 556 | |||
| 557 | return value; | ||
| 558 | } | ||
| 559 | |||
| 560 | static signed fde_pointer_type(const u32 *cie) | ||
| 561 | { | ||
| 562 | const u8 *ptr = (const u8 *)(cie + 2); | ||
| 563 | unsigned version = *ptr; | ||
| 564 | |||
| 565 | if (version != 1) | ||
| 566 | return -1; /* unsupported */ | ||
| 567 | if (*++ptr) { | ||
| 568 | const char *aug; | ||
| 569 | const u8 *end = (const u8 *)(cie + 1) + *cie; | ||
| 570 | uleb128_t len; | ||
| 571 | |||
| 572 | /* check if augmentation size is first (and thus present) */ | ||
| 573 | if (*ptr != 'z') | ||
| 574 | return -1; | ||
| 575 | /* check if augmentation string is nul-terminated */ | ||
| 576 | if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) | ||
| 577 | return -1; | ||
| 578 | ++ptr; /* skip terminator */ | ||
| 579 | get_uleb128(&ptr, end); /* skip code alignment */ | ||
| 580 | get_sleb128(&ptr, end); /* skip data alignment */ | ||
| 581 | /* skip return address column */ | ||
| 582 | version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end); | ||
| 583 | len = get_uleb128(&ptr, end); /* augmentation length */ | ||
| 584 | if (ptr + len < ptr || ptr + len > end) | ||
| 585 | return -1; | ||
| 586 | end = ptr + len; | ||
| 587 | while (*++aug) { | ||
| 588 | if (ptr >= end) | ||
| 589 | return -1; | ||
| 590 | switch(*aug) { | ||
| 591 | case 'L': | ||
| 592 | ++ptr; | ||
| 593 | break; | ||
| 594 | case 'P': { | ||
| 595 | signed ptrType = *ptr++; | ||
| 596 | |||
| 597 | if (!read_pointer(&ptr, end, ptrType) || ptr > end) | ||
| 598 | return -1; | ||
| 599 | } | ||
| 600 | break; | ||
| 601 | case 'R': | ||
| 602 | return *ptr; | ||
| 603 | default: | ||
| 604 | return -1; | ||
| 605 | } | ||
| 606 | } | ||
| 607 | } | ||
| 608 | return DW_EH_PE_native|DW_EH_PE_abs; | ||
| 609 | } | ||
| 610 | |||
| 611 | static int advance_loc(unsigned long delta, struct unwind_state *state) | ||
| 612 | { | ||
| 613 | state->loc += delta * state->codeAlign; | ||
| 614 | |||
| 615 | return delta > 0; | ||
| 616 | } | ||
| 617 | |||
| 618 | static void set_rule(uleb128_t reg, | ||
| 619 | enum item_location where, | ||
| 620 | uleb128_t value, | ||
| 621 | struct unwind_state *state) | ||
| 622 | { | ||
| 623 | if (reg < ARRAY_SIZE(state->regs)) { | ||
| 624 | state->regs[reg].where = where; | ||
| 625 | state->regs[reg].value = value; | ||
| 626 | } | ||
| 627 | } | ||
| 628 | |||
| 629 | static int processCFI(const u8 *start, | ||
| 630 | const u8 *end, | ||
| 631 | unsigned long targetLoc, | ||
| 632 | signed ptrType, | ||
| 633 | struct unwind_state *state) | ||
| 634 | { | ||
| 635 | union { | ||
| 636 | const u8 *p8; | ||
| 637 | const u16 *p16; | ||
| 638 | const u32 *p32; | ||
| 639 | } ptr; | ||
| 640 | int result = 1; | ||
| 641 | |||
| 642 | if (start != state->cieStart) { | ||
| 643 | state->loc = state->org; | ||
| 644 | result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); | ||
| 645 | if (targetLoc == 0 && state->label == NULL) | ||
| 646 | return result; | ||
| 647 | } | ||
| 648 | for (ptr.p8 = start; result && ptr.p8 < end; ) { | ||
| 649 | switch(*ptr.p8 >> 6) { | ||
| 650 | uleb128_t value; | ||
| 651 | |||
| 652 | case 0: | ||
| 653 | switch(*ptr.p8++) { | ||
| 654 | case DW_CFA_nop: | ||
| 655 | break; | ||
| 656 | case DW_CFA_set_loc: | ||
| 657 | if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) | ||
| 658 | result = 0; | ||
| 659 | break; | ||
| 660 | case DW_CFA_advance_loc1: | ||
| 661 | result = ptr.p8 < end && advance_loc(*ptr.p8++, state); | ||
| 662 | break; | ||
| 663 | case DW_CFA_advance_loc2: | ||
| 664 | result = ptr.p8 <= end + 2 | ||
| 665 | && advance_loc(*ptr.p16++, state); | ||
| 666 | break; | ||
| 667 | case DW_CFA_advance_loc4: | ||
| 668 | result = ptr.p8 <= end + 4 | ||
| 669 | && advance_loc(*ptr.p32++, state); | ||
| 670 | break; | ||
| 671 | case DW_CFA_offset_extended: | ||
| 672 | value = get_uleb128(&ptr.p8, end); | ||
| 673 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
| 674 | break; | ||
| 675 | case DW_CFA_val_offset: | ||
| 676 | value = get_uleb128(&ptr.p8, end); | ||
| 677 | set_rule(value, Value, get_uleb128(&ptr.p8, end), state); | ||
| 678 | break; | ||
| 679 | case DW_CFA_offset_extended_sf: | ||
| 680 | value = get_uleb128(&ptr.p8, end); | ||
| 681 | set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); | ||
| 682 | break; | ||
| 683 | case DW_CFA_val_offset_sf: | ||
| 684 | value = get_uleb128(&ptr.p8, end); | ||
| 685 | set_rule(value, Value, get_sleb128(&ptr.p8, end), state); | ||
| 686 | break; | ||
| 687 | case DW_CFA_restore_extended: | ||
| 688 | case DW_CFA_undefined: | ||
| 689 | case DW_CFA_same_value: | ||
| 690 | set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); | ||
| 691 | break; | ||
| 692 | case DW_CFA_register: | ||
| 693 | value = get_uleb128(&ptr.p8, end); | ||
| 694 | set_rule(value, | ||
| 695 | Register, | ||
| 696 | get_uleb128(&ptr.p8, end), state); | ||
| 697 | break; | ||
| 698 | case DW_CFA_remember_state: | ||
| 699 | if (ptr.p8 == state->label) { | ||
| 700 | state->label = NULL; | ||
| 701 | return 1; | ||
| 702 | } | ||
| 703 | if (state->stackDepth >= MAX_STACK_DEPTH) | ||
| 704 | return 0; | ||
| 705 | state->stack[state->stackDepth++] = ptr.p8; | ||
| 706 | break; | ||
| 707 | case DW_CFA_restore_state: | ||
| 708 | if (state->stackDepth) { | ||
| 709 | const uleb128_t loc = state->loc; | ||
| 710 | const u8 *label = state->label; | ||
| 711 | |||
| 712 | state->label = state->stack[state->stackDepth - 1]; | ||
| 713 | memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); | ||
| 714 | memset(state->regs, 0, sizeof(state->regs)); | ||
| 715 | state->stackDepth = 0; | ||
| 716 | result = processCFI(start, end, 0, ptrType, state); | ||
| 717 | state->loc = loc; | ||
| 718 | state->label = label; | ||
| 719 | } else | ||
| 720 | return 0; | ||
| 721 | break; | ||
| 722 | case DW_CFA_def_cfa: | ||
| 723 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
| 724 | /*nobreak*/ | ||
| 725 | case DW_CFA_def_cfa_offset: | ||
| 726 | state->cfa.offs = get_uleb128(&ptr.p8, end); | ||
| 727 | break; | ||
| 728 | case DW_CFA_def_cfa_sf: | ||
| 729 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
| 730 | /*nobreak*/ | ||
| 731 | case DW_CFA_def_cfa_offset_sf: | ||
| 732 | state->cfa.offs = get_sleb128(&ptr.p8, end) | ||
| 733 | * state->dataAlign; | ||
| 734 | break; | ||
| 735 | case DW_CFA_def_cfa_register: | ||
| 736 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
| 737 | break; | ||
| 738 | /*todo case DW_CFA_def_cfa_expression: */ | ||
| 739 | /*todo case DW_CFA_expression: */ | ||
| 740 | /*todo case DW_CFA_val_expression: */ | ||
| 741 | case DW_CFA_GNU_args_size: | ||
| 742 | get_uleb128(&ptr.p8, end); | ||
| 743 | break; | ||
| 744 | case DW_CFA_GNU_negative_offset_extended: | ||
| 745 | value = get_uleb128(&ptr.p8, end); | ||
| 746 | set_rule(value, | ||
| 747 | Memory, | ||
| 748 | (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); | ||
| 749 | break; | ||
| 750 | case DW_CFA_GNU_window_save: | ||
| 751 | default: | ||
| 752 | result = 0; | ||
| 753 | break; | ||
| 754 | } | ||
| 755 | break; | ||
| 756 | case 1: | ||
| 757 | result = advance_loc(*ptr.p8++ & 0x3f, state); | ||
| 758 | break; | ||
| 759 | case 2: | ||
| 760 | value = *ptr.p8++ & 0x3f; | ||
| 761 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
| 762 | break; | ||
| 763 | case 3: | ||
| 764 | set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); | ||
| 765 | break; | ||
| 766 | } | ||
| 767 | if (ptr.p8 > end) | ||
| 768 | result = 0; | ||
| 769 | if (result && targetLoc != 0 && targetLoc < state->loc) | ||
| 770 | return 1; | ||
| 771 | } | ||
| 772 | |||
| 773 | return result | ||
| 774 | && ptr.p8 == end | ||
| 775 | && (targetLoc == 0 | ||
| 776 | || (/*todo While in theory this should apply, gcc in practice omits | ||
| 777 | everything past the function prolog, and hence the location | ||
| 778 | never reaches the end of the function. | ||
| 779 | targetLoc < state->loc &&*/ state->label == NULL)); | ||
| 780 | } | ||
| 781 | |||
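processCFI() above interprets the DWARF call-frame bytecode: the top two bits of each opcode select the compact forms (advance_loc, offset, restore), while a zero selects an extended opcode in the low six bits, all of it accumulating a CFA rule plus per-register rules up to the target location. The toy interpreter below handles just three of those opcodes on an invented program, applying the data-alignment factor immediately for brevity:

```c
/*
 * Toy interpreter for a tiny subset of the CFI bytecode handled by
 * processCFI(): DW_CFA_def_cfa, the compact DW_CFA_offset form and
 * DW_CFA_advance_loc.  Program bytes and register numbers are invented.
 */
#include <stdio.h>

enum where { NOWHERE, MEMORY };
struct rule { enum where where; long offs; };

int main(void)
{
	const unsigned char prog[] = {
		0x0c, 7, 16,		/* DW_CFA_def_cfa: CFA = r7 + 16 */
		0x80 | 16, 1,		/* DW_CFA_offset: r16 at CFA - 1*8 */
		0x40 | 4,		/* DW_CFA_advance_loc: loc += 4 */
		0x80 | 6, 2,		/* DW_CFA_offset: r6 at CFA - 2*8 */
	};
	const int data_align = -8;	/* would come from the CIE */
	struct rule regs[32] = {{ NOWHERE, 0 }};
	unsigned long loc = 0, cfa_reg = 0, cfa_off = 0;
	unsigned i = 0;

	while (i < sizeof(prog)) {
		unsigned char op = prog[i++];

		if ((op >> 6) == 1) {			/* advance_loc */
			loc += op & 0x3f;
		} else if ((op >> 6) == 2) {		/* offset */
			regs[op & 0x3f].where = MEMORY;
			regs[op & 0x3f].offs = (long)prog[i++] * data_align;
		} else if (op == 0x0c) {		/* def_cfa */
			cfa_reg = prog[i++];
			cfa_off = prog[i++];
		}
	}
	printf("loc=%lu CFA=r%lu+%lu r16 at CFA%+ld r6 at CFA%+ld\n",
	       loc, cfa_reg, cfa_off, regs[16].offs, regs[6].offs);
	return 0;
}
```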
| 782 | /* Unwind to previous frame. Returns 0 if successful, negative | ||
| 783 | * number in case of an error. */ | ||
| 784 | int unwind(struct unwind_frame_info *frame) | ||
| 785 | { | ||
| 786 | #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) | ||
| 787 | const u32 *fde = NULL, *cie = NULL; | ||
| 788 | const u8 *ptr = NULL, *end = NULL; | ||
| 789 | unsigned long pc = UNW_PC(frame) - frame->call_frame; | ||
| 790 | unsigned long startLoc = 0, endLoc = 0, cfa; | ||
| 791 | unsigned i; | ||
| 792 | signed ptrType = -1; | ||
| 793 | uleb128_t retAddrReg = 0; | ||
| 794 | const struct unwind_table *table; | ||
| 795 | struct unwind_state state; | ||
| 796 | |||
| 797 | if (UNW_PC(frame) == 0) | ||
| 798 | return -EINVAL; | ||
| 799 | if ((table = find_table(pc)) != NULL | ||
| 800 | && !(table->size & (sizeof(*fde) - 1))) { | ||
| 801 | const u8 *hdr = table->header; | ||
| 802 | unsigned long tableSize; | ||
| 803 | |||
| 804 | smp_rmb(); | ||
| 805 | if (hdr && hdr[0] == 1) { | ||
| 806 | switch(hdr[3] & DW_EH_PE_FORM) { | ||
| 807 | case DW_EH_PE_native: tableSize = sizeof(unsigned long); break; | ||
| 808 | case DW_EH_PE_data2: tableSize = 2; break; | ||
| 809 | case DW_EH_PE_data4: tableSize = 4; break; | ||
| 810 | case DW_EH_PE_data8: tableSize = 8; break; | ||
| 811 | default: tableSize = 0; break; | ||
| 812 | } | ||
| 813 | ptr = hdr + 4; | ||
| 814 | end = hdr + table->hdrsz; | ||
| 815 | if (tableSize | ||
| 816 | && read_pointer(&ptr, end, hdr[1]) | ||
| 817 | == (unsigned long)table->address | ||
| 818 | && (i = read_pointer(&ptr, end, hdr[2])) > 0 | ||
| 819 | && i == (end - ptr) / (2 * tableSize) | ||
| 820 | && !((end - ptr) % (2 * tableSize))) { | ||
| 821 | do { | ||
| 822 | const u8 *cur = ptr + (i / 2) * (2 * tableSize); | ||
| 823 | |||
| 824 | startLoc = read_pointer(&cur, | ||
| 825 | cur + tableSize, | ||
| 826 | hdr[3]); | ||
| 827 | if (pc < startLoc) | ||
| 828 | i /= 2; | ||
| 829 | else { | ||
| 830 | ptr = cur - tableSize; | ||
| 831 | i = (i + 1) / 2; | ||
| 832 | } | ||
| 833 | } while (startLoc && i > 1); | ||
| 834 | if (i == 1 | ||
| 835 | && (startLoc = read_pointer(&ptr, | ||
| 836 | ptr + tableSize, | ||
| 837 | hdr[3])) != 0 | ||
| 838 | && pc >= startLoc) | ||
| 839 | fde = (void *)read_pointer(&ptr, | ||
| 840 | ptr + tableSize, | ||
| 841 | hdr[3]); | ||
| 842 | } | ||
| 843 | } | ||
| 844 | |||
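When a usable .eh_frame_hdr is present, the block above binary-searches its sorted (initial location, FDE pointer) pairs for the last entry whose start is at or below pc. A compact userspace sketch of that lookup; the table contents are invented and the index arithmetic is simplified relative to the kernel loop:

```c
/*
 * Userspace sketch of the eh_frame_hdr lookup: binary search over
 * (start, fde) pairs sorted by start, returning the last entry whose
 * start is <= pc.  Table contents are made up for illustration.
 */
#include <stdio.h>

struct hdr_entry { unsigned long start, fde; };

static const struct hdr_entry *lookup(const struct hdr_entry *tab,
				      unsigned n, unsigned long pc)
{
	while (n > 1) {
		unsigned half = n / 2;

		if (pc < tab[half].start)
			n = half;		/* keep the lower half */
		else {
			tab += half;		/* keep the upper half */
			n -= half;
		}
	}
	return (n && pc >= tab->start) ? tab : NULL;
}

int main(void)
{
	static const struct hdr_entry tab[] = {
		{ 0x1000, 0xa000 }, { 0x1400, 0xa040 }, { 0x2000, 0xa080 },
	};
	const struct hdr_entry *e = lookup(tab, 3, 0x1450);

	if (e)
		printf("pc 0x1450 -> FDE at 0x%lx\n", e->fde);
	return 0;
}
```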
| 845 | if (fde != NULL) { | ||
| 846 | cie = cie_for_fde(fde, table); | ||
| 847 | ptr = (const u8 *)(fde + 2); | ||
| 848 | if(cie != NULL | ||
| 849 | && cie != &bad_cie | ||
| 850 | && cie != ¬_fde | ||
| 851 | && (ptrType = fde_pointer_type(cie)) >= 0 | ||
| 852 | && read_pointer(&ptr, | ||
| 853 | (const u8 *)(fde + 1) + *fde, | ||
| 854 | ptrType) == startLoc) { | ||
| 855 | if (!(ptrType & DW_EH_PE_indirect)) | ||
| 856 | ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; | ||
| 857 | endLoc = startLoc | ||
| 858 | + read_pointer(&ptr, | ||
| 859 | (const u8 *)(fde + 1) + *fde, | ||
| 860 | ptrType); | ||
| 861 | if(pc >= endLoc) | ||
| 862 | fde = NULL; | ||
| 863 | } else | ||
| 864 | fde = NULL; | ||
| 865 | } | ||
| 866 | if (fde == NULL) { | ||
| 867 | for (fde = table->address, tableSize = table->size; | ||
| 868 | cie = NULL, tableSize > sizeof(*fde) | ||
| 869 | && tableSize - sizeof(*fde) >= *fde; | ||
| 870 | tableSize -= sizeof(*fde) + *fde, | ||
| 871 | fde += 1 + *fde / sizeof(*fde)) { | ||
| 872 | cie = cie_for_fde(fde, table); | ||
| 873 | if (cie == &bad_cie) { | ||
| 874 | cie = NULL; | ||
| 875 | break; | ||
| 876 | } | ||
| 877 | if (cie == NULL | ||
| 878 | || cie == ¬_fde | ||
| 879 | || (ptrType = fde_pointer_type(cie)) < 0) | ||
| 880 | continue; | ||
| 881 | ptr = (const u8 *)(fde + 2); | ||
| 882 | startLoc = read_pointer(&ptr, | ||
| 883 | (const u8 *)(fde + 1) + *fde, | ||
| 884 | ptrType); | ||
| 885 | if (!startLoc) | ||
| 886 | continue; | ||
| 887 | if (!(ptrType & DW_EH_PE_indirect)) | ||
| 888 | ptrType &= DW_EH_PE_FORM|DW_EH_PE_signed; | ||
| 889 | endLoc = startLoc | ||
| 890 | + read_pointer(&ptr, | ||
| 891 | (const u8 *)(fde + 1) + *fde, | ||
| 892 | ptrType); | ||
| 893 | if (pc >= startLoc && pc < endLoc) | ||
| 894 | break; | ||
| 895 | } | ||
| 896 | } | ||
| 897 | } | ||
| 898 | if (cie != NULL) { | ||
| 899 | memset(&state, 0, sizeof(state)); | ||
| 900 | state.cieEnd = ptr; /* keep here temporarily */ | ||
| 901 | ptr = (const u8 *)(cie + 2); | ||
| 902 | end = (const u8 *)(cie + 1) + *cie; | ||
| 903 | frame->call_frame = 1; | ||
| 904 | if ((state.version = *ptr) != 1) | ||
| 905 | cie = NULL; /* unsupported version */ | ||
| 906 | else if (*++ptr) { | ||
| 907 | /* check if augmentation size is first (and thus present) */ | ||
| 908 | if (*ptr == 'z') { | ||
| 909 | while (++ptr < end && *ptr) { | ||
| 910 | switch(*ptr) { | ||
| 911 | /* check for ignorable (or already handled) | ||
| 912 | * nul-terminated augmentation string */ | ||
| 913 | case 'L': | ||
| 914 | case 'P': | ||
| 915 | case 'R': | ||
| 916 | continue; | ||
| 917 | case 'S': | ||
| 918 | frame->call_frame = 0; | ||
| 919 | continue; | ||
| 920 | default: | ||
| 921 | break; | ||
| 922 | } | ||
| 923 | break; | ||
| 924 | } | ||
| 925 | } | ||
| 926 | if (ptr >= end || *ptr) | ||
| 927 | cie = NULL; | ||
| 928 | } | ||
| 929 | ++ptr; | ||
| 930 | } | ||
| 931 | if (cie != NULL) { | ||
| 932 | /* get code alignment factor */ | ||
| 933 | state.codeAlign = get_uleb128(&ptr, end); | ||
| 934 | /* get data alignment factor */ | ||
| 935 | state.dataAlign = get_sleb128(&ptr, end); | ||
| 936 | if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) | ||
| 937 | cie = NULL; | ||
| 938 | else { | ||
| 939 | retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); | ||
| 940 | /* skip augmentation */ | ||
| 941 | if (((const char *)(cie + 2))[1] == 'z') { | ||
| 942 | uleb128_t augSize = get_uleb128(&ptr, end); | ||
| 943 | |||
| 944 | ptr += augSize; | ||
| 945 | } | ||
| 946 | if (ptr > end | ||
| 947 | || retAddrReg >= ARRAY_SIZE(reg_info) | ||
| 948 | || REG_INVALID(retAddrReg) | ||
| 949 | || reg_info[retAddrReg].width != sizeof(unsigned long)) | ||
| 950 | cie = NULL; | ||
| 951 | } | ||
| 952 | } | ||
| 953 | if (cie != NULL) { | ||
| 954 | state.cieStart = ptr; | ||
| 955 | ptr = state.cieEnd; | ||
| 956 | state.cieEnd = end; | ||
| 957 | end = (const u8 *)(fde + 1) + *fde; | ||
| 958 | /* skip augmentation */ | ||
| 959 | if (((const char *)(cie + 2))[1] == 'z') { | ||
| 960 | uleb128_t augSize = get_uleb128(&ptr, end); | ||
| 961 | |||
| 962 | if ((ptr += augSize) > end) | ||
| 963 | fde = NULL; | ||
| 964 | } | ||
| 965 | } | ||
| 966 | if (cie == NULL || fde == NULL) { | ||
| 967 | #ifdef CONFIG_FRAME_POINTER | ||
| 968 | unsigned long top, bottom; | ||
| 969 | |||
| 970 | top = STACK_TOP(frame->task); | ||
| 971 | bottom = STACK_BOTTOM(frame->task); | ||
| 972 | # if FRAME_RETADDR_OFFSET < 0 | ||
| 973 | if (UNW_SP(frame) < top | ||
| 974 | && UNW_FP(frame) <= UNW_SP(frame) | ||
| 975 | && bottom < UNW_FP(frame) | ||
| 976 | # else | ||
| 977 | if (UNW_SP(frame) > top | ||
| 978 | && UNW_FP(frame) >= UNW_SP(frame) | ||
| 979 | && bottom > UNW_FP(frame) | ||
| 980 | # endif | ||
| 981 | && !((UNW_SP(frame) | UNW_FP(frame)) | ||
| 982 | & (sizeof(unsigned long) - 1))) { | ||
| 983 | unsigned long link; | ||
| 984 | |||
| 985 | if (!__get_user(link, | ||
| 986 | (unsigned long *)(UNW_FP(frame) | ||
| 987 | + FRAME_LINK_OFFSET)) | ||
| 988 | # if FRAME_RETADDR_OFFSET < 0 | ||
| 989 | && link > bottom && link < UNW_FP(frame) | ||
| 990 | # else | ||
| 991 | && link > UNW_FP(frame) && link < bottom | ||
| 992 | # endif | ||
| 993 | && !(link & (sizeof(link) - 1)) | ||
| 994 | && !__get_user(UNW_PC(frame), | ||
| 995 | (unsigned long *)(UNW_FP(frame) | ||
| 996 | + FRAME_RETADDR_OFFSET))) { | ||
| 997 | UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET | ||
| 998 | # if FRAME_RETADDR_OFFSET < 0 | ||
| 999 | - | ||
| 1000 | # else | ||
| 1001 | + | ||
| 1002 | # endif | ||
| 1003 | sizeof(UNW_PC(frame)); | ||
| 1004 | UNW_FP(frame) = link; | ||
| 1005 | return 0; | ||
| 1006 | } | ||
| 1007 | } | ||
| 1008 | #endif | ||
| 1009 | return -ENXIO; | ||
| 1010 | } | ||
| 1011 | state.org = startLoc; | ||
| 1012 | memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); | ||
| 1013 | /* process instructions */ | ||
| 1014 | if (!processCFI(ptr, end, pc, ptrType, &state) | ||
| 1015 | || state.loc > endLoc | ||
| 1016 | || state.regs[retAddrReg].where == Nowhere | ||
| 1017 | || state.cfa.reg >= ARRAY_SIZE(reg_info) | ||
| 1018 | || reg_info[state.cfa.reg].width != sizeof(unsigned long) | ||
| 1019 | || state.cfa.offs % sizeof(unsigned long)) | ||
| 1020 | return -EIO; | ||
| 1021 | /* update frame */ | ||
| 1022 | #ifndef CONFIG_AS_CFI_SIGNAL_FRAME | ||
| 1023 | if(frame->call_frame | ||
| 1024 | && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign)) | ||
| 1025 | frame->call_frame = 0; | ||
| 1026 | #endif | ||
| 1027 | cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; | ||
| 1028 | startLoc = min((unsigned long)UNW_SP(frame), cfa); | ||
| 1029 | endLoc = max((unsigned long)UNW_SP(frame), cfa); | ||
| 1030 | if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { | ||
| 1031 | startLoc = min(STACK_LIMIT(cfa), cfa); | ||
| 1032 | endLoc = max(STACK_LIMIT(cfa), cfa); | ||
| 1033 | } | ||
| 1034 | #ifndef CONFIG_64BIT | ||
| 1035 | # define CASES CASE(8); CASE(16); CASE(32) | ||
| 1036 | #else | ||
| 1037 | # define CASES CASE(8); CASE(16); CASE(32); CASE(64) | ||
| 1038 | #endif | ||
| 1039 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
| 1040 | if (REG_INVALID(i)) { | ||
| 1041 | if (state.regs[i].where == Nowhere) | ||
| 1042 | continue; | ||
| 1043 | return -EIO; | ||
| 1044 | } | ||
| 1045 | switch(state.regs[i].where) { | ||
| 1046 | default: | ||
| 1047 | break; | ||
| 1048 | case Register: | ||
| 1049 | if (state.regs[i].value >= ARRAY_SIZE(reg_info) | ||
| 1050 | || REG_INVALID(state.regs[i].value) | ||
| 1051 | || reg_info[i].width > reg_info[state.regs[i].value].width) | ||
| 1052 | return -EIO; | ||
| 1053 | switch(reg_info[state.regs[i].value].width) { | ||
| 1054 | #define CASE(n) \ | ||
| 1055 | case sizeof(u##n): \ | ||
| 1056 | state.regs[i].value = FRAME_REG(state.regs[i].value, \ | ||
| 1057 | const u##n); \ | ||
| 1058 | break | ||
| 1059 | CASES; | ||
| 1060 | #undef CASE | ||
| 1061 | default: | ||
| 1062 | return -EIO; | ||
| 1063 | } | ||
| 1064 | break; | ||
| 1065 | } | ||
| 1066 | } | ||
| 1067 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
| 1068 | if (REG_INVALID(i)) | ||
| 1069 | continue; | ||
| 1070 | switch(state.regs[i].where) { | ||
| 1071 | case Nowhere: | ||
| 1072 | if (reg_info[i].width != sizeof(UNW_SP(frame)) | ||
| 1073 | || &FRAME_REG(i, __typeof__(UNW_SP(frame))) | ||
| 1074 | != &UNW_SP(frame)) | ||
| 1075 | continue; | ||
| 1076 | UNW_SP(frame) = cfa; | ||
| 1077 | break; | ||
| 1078 | case Register: | ||
| 1079 | switch(reg_info[i].width) { | ||
| 1080 | #define CASE(n) case sizeof(u##n): \ | ||
| 1081 | FRAME_REG(i, u##n) = state.regs[i].value; \ | ||
| 1082 | break | ||
| 1083 | CASES; | ||
| 1084 | #undef CASE | ||
| 1085 | default: | ||
| 1086 | return -EIO; | ||
| 1087 | } | ||
| 1088 | break; | ||
| 1089 | case Value: | ||
| 1090 | if (reg_info[i].width != sizeof(unsigned long)) | ||
| 1091 | return -EIO; | ||
| 1092 | FRAME_REG(i, unsigned long) = cfa + state.regs[i].value | ||
| 1093 | * state.dataAlign; | ||
| 1094 | break; | ||
| 1095 | case Memory: { | ||
| 1096 | unsigned long addr = cfa + state.regs[i].value | ||
| 1097 | * state.dataAlign; | ||
| 1098 | |||
| 1099 | if ((state.regs[i].value * state.dataAlign) | ||
| 1100 | % sizeof(unsigned long) | ||
| 1101 | || addr < startLoc | ||
| 1102 | || addr + sizeof(unsigned long) < addr | ||
| 1103 | || addr + sizeof(unsigned long) > endLoc) | ||
| 1104 | return -EIO; | ||
| 1105 | switch(reg_info[i].width) { | ||
| 1106 | #define CASE(n) case sizeof(u##n): \ | ||
| 1107 | __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ | ||
| 1108 | break | ||
| 1109 | CASES; | ||
| 1110 | #undef CASE | ||
| 1111 | default: | ||
| 1112 | return -EIO; | ||
| 1113 | } | ||
| 1114 | } | ||
| 1115 | break; | ||
| 1116 | } | ||
| 1117 | } | ||
| 1118 | |||
| 1119 | return 0; | ||
| 1120 | #undef CASES | ||
| 1121 | #undef FRAME_REG | ||
| 1122 | } | ||
| 1123 | EXPORT_SYMBOL(unwind); | ||
| 1124 | |||
| 1125 | int unwind_init_frame_info(struct unwind_frame_info *info, | ||
| 1126 | struct task_struct *tsk, | ||
| 1127 | /*const*/ struct pt_regs *regs) | ||
| 1128 | { | ||
| 1129 | info->task = tsk; | ||
| 1130 | info->call_frame = 0; | ||
| 1131 | arch_unw_init_frame_info(info, regs); | ||
| 1132 | |||
| 1133 | return 0; | ||
| 1134 | } | ||
| 1135 | EXPORT_SYMBOL(unwind_init_frame_info); | ||
| 1136 | |||
| 1137 | /* | ||
| 1138 | * Prepare to unwind a blocked task. | ||
| 1139 | */ | ||
| 1140 | int unwind_init_blocked(struct unwind_frame_info *info, | ||
| 1141 | struct task_struct *tsk) | ||
| 1142 | { | ||
| 1143 | info->task = tsk; | ||
| 1144 | info->call_frame = 0; | ||
| 1145 | arch_unw_init_blocked(info); | ||
| 1146 | |||
| 1147 | return 0; | ||
| 1148 | } | ||
| 1149 | EXPORT_SYMBOL(unwind_init_blocked); | ||
| 1150 | |||
| 1151 | /* | ||
| 1152 | * Prepare to unwind the currently running thread. | ||
| 1153 | */ | ||
| 1154 | int unwind_init_running(struct unwind_frame_info *info, | ||
| 1155 | asmlinkage int (*callback)(struct unwind_frame_info *, | ||
| 1156 | void *arg), | ||
| 1157 | void *arg) | ||
| 1158 | { | ||
| 1159 | info->task = current; | ||
| 1160 | info->call_frame = 0; | ||
| 1161 | |||
| 1162 | return arch_unwind_init_running(info, callback, arg); | ||
| 1163 | } | ||
| 1164 | EXPORT_SYMBOL(unwind_init_running); | ||
| 1165 | |||
| 1166 | /* | ||
| 1167 | * Unwind until the return pointer is in user-land (or until an error | ||
| 1168 | * occurs). Returns 0 if successful, negative number in case of | ||
| 1169 | * error. | ||
| 1170 | */ | ||
| 1171 | int unwind_to_user(struct unwind_frame_info *info) | ||
| 1172 | { | ||
| 1173 | while (!arch_unw_user_mode(info)) { | ||
| 1174 | int err = unwind(info); | ||
| 1175 | |||
| 1176 | if (err < 0) | ||
| 1177 | return err; | ||
| 1178 | } | ||
| 1179 | |||
| 1180 | return 0; | ||
| 1181 | } | ||
| 1182 | EXPORT_SYMBOL(unwind_to_user); | ||
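For context, the exported entry points above (unwind_init_blocked(), unwind(), unwind_to_user()) are normally driven by an arch backtrace helper. The following is a minimal sketch only, not code from this commit: dump_one_task() is a made-up helper, and it assumes the UNW_PC()/UNW_SP() accessors and arch_unw_user_mode() provided by the arch's asm/unwind.h, as already used in the file above.

    /* Illustrative sketch: walk a blocked task's stack with the unwinder.
     * dump_one_task() is hypothetical; it stops on the -EIO/-ENXIO errors
     * returned by unwind() above, or when user mode is reached.
     */
    static void dump_one_task(struct task_struct *tsk)
    {
            struct unwind_frame_info info;

            if (unwind_init_blocked(&info, tsk))
                    return;

            while (!arch_unw_user_mode(&info)) {
                    printk(KERN_DEBUG " [<%p>] sp=%p\n",
                           (void *)UNW_PC(&info), (void *)UNW_SP(&info));
                    if (unwind(&info) < 0)
                            break;
            }
    }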
diff --git a/kernel/user.c b/kernel/user.c index 220e586127a0..4869563080e9 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -26,7 +26,7 @@ | |||
| 26 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) | 26 | #define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) |
| 27 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) | 27 | #define uidhashentry(uid) (uidhash_table + __uidhashfn((uid))) |
| 28 | 28 | ||
| 29 | static kmem_cache_t *uid_cachep; | 29 | static struct kmem_cache *uid_cachep; |
| 30 | static struct list_head uidhash_table[UIDHASH_SZ]; | 30 | static struct list_head uidhash_table[UIDHASH_SZ]; |
| 31 | 31 | ||
| 32 | /* | 32 | /* |
| @@ -132,7 +132,7 @@ struct user_struct * alloc_uid(uid_t uid) | |||
| 132 | if (!up) { | 132 | if (!up) { |
| 133 | struct user_struct *new; | 133 | struct user_struct *new; |
| 134 | 134 | ||
| 135 | new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL); | 135 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); |
| 136 | if (!new) | 136 | if (!new) |
| 137 | return NULL; | 137 | return NULL; |
| 138 | new->uid = uid; | 138 | new->uid = uid; |
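The two kernel/user.c hunks are part of the tree-wide switch from the kmem_cache_t typedef and the SLAB_* allocation flags to struct kmem_cache and the ordinary GFP_* flags. A hedged sketch of the post-conversion pattern follows; the cache name, struct my_obj, and the helper names are invented for illustration, and the six-argument kmem_cache_create() signature is assumed from the slab API of this kernel era.

    /* Illustrative post-conversion slab usage: struct kmem_cache instead of
     * kmem_cache_t, GFP_KERNEL instead of SLAB_KERNEL.
     */
    struct my_obj {
            unsigned int id;
    };

    static struct kmem_cache *my_cachep;

    static int __init my_cache_init(void)
    {
            my_cachep = kmem_cache_create("my_cache", sizeof(struct my_obj),
                                          0, SLAB_HWCACHE_ALIGN, NULL, NULL);
            return my_cachep ? 0 : -ENOMEM;
    }

    static struct my_obj *my_obj_alloc(void)
    {
            return kmem_cache_alloc(my_cachep, GFP_KERNEL);
    }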
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 17c2f03d2c27..a3da07c5af28 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -29,6 +29,9 @@ | |||
| 29 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
| 30 | #include <linux/hardirq.h> | 30 | #include <linux/hardirq.h> |
| 31 | #include <linux/mempolicy.h> | 31 | #include <linux/mempolicy.h> |
| 32 | #include <linux/freezer.h> | ||
| 33 | #include <linux/kallsyms.h> | ||
| 34 | #include <linux/debug_locks.h> | ||
| 32 | 35 | ||
| 33 | /* | 36 | /* |
| 34 | * The per-CPU workqueue (if single thread, we always use the first | 37 | * The per-CPU workqueue (if single thread, we always use the first |
| @@ -55,6 +58,8 @@ struct cpu_workqueue_struct { | |||
| 55 | struct task_struct *thread; | 58 | struct task_struct *thread; |
| 56 | 59 | ||
| 57 | int run_depth; /* Detect run_workqueue() recursion depth */ | 60 | int run_depth; /* Detect run_workqueue() recursion depth */ |
| 61 | |||
| 62 | int freezeable; /* Freeze the thread during suspend */ | ||
| 58 | } ____cacheline_aligned; | 63 | } ____cacheline_aligned; |
| 59 | 64 | ||
| 60 | /* | 65 | /* |
| @@ -80,6 +85,99 @@ static inline int is_single_threaded(struct workqueue_struct *wq) | |||
| 80 | return list_empty(&wq->list); | 85 | return list_empty(&wq->list); |
| 81 | } | 86 | } |
| 82 | 87 | ||
| 88 | /* | ||
| 89 | * Set the workqueue on which a work item is to be run | ||
| 90 | * - Must *only* be called if the pending flag is set | ||
| 91 | */ | ||
| 92 | static inline void set_wq_data(struct work_struct *work, void *wq) | ||
| 93 | { | ||
| 94 | unsigned long new; | ||
| 95 | |||
| 96 | BUG_ON(!work_pending(work)); | ||
| 97 | |||
| 98 | new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); | ||
| 99 | new |= WORK_STRUCT_FLAG_MASK & *work_data_bits(work); | ||
| 100 | atomic_long_set(&work->data, new); | ||
| 101 | } | ||
| 102 | |||
| 103 | static inline void *get_wq_data(struct work_struct *work) | ||
| 104 | { | ||
| 105 | return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK); | ||
| 106 | } | ||
| 107 | |||
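set_wq_data()/get_wq_data() rely on the reworked struct work_struct, whose atomic_long_t data field multiplexes the owning cpu_workqueue pointer with the low status bits (WORK_STRUCT_PENDING and friends). The pointer can share the word only because cpu_workqueue_struct is cacheline aligned, so its low bits are known to be zero. The sketch below only illustrates that invariant; is_work_pending_on() is a made-up helper, while the masks and bit names come from the code above.

    /* Illustrative only: how the packed word is interpreted.
     * Low bits carry status flags, the remaining bits carry the
     * cpu_workqueue_struct pointer.
     */
    static bool is_work_pending_on(struct work_struct *work,
                                   struct cpu_workqueue_struct *cwq)
    {
            unsigned long data = atomic_long_read(&work->data);

            return (data & (1UL << WORK_STRUCT_PENDING)) &&
                   (void *)(data & WORK_STRUCT_WQ_DATA_MASK) == cwq;
    }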
| 108 | static int __run_work(struct cpu_workqueue_struct *cwq, struct work_struct *work) | ||
| 109 | { | ||
| 110 | int ret = 0; | ||
| 111 | unsigned long flags; | ||
| 112 | |||
| 113 | spin_lock_irqsave(&cwq->lock, flags); | ||
| 114 | /* | ||
| 115 | * We need to re-validate the work info after we've gotten | ||
| 116 | * the cpu_workqueue lock. We can run the work now iff: | ||
| 117 | * | ||
| 118 | * - the wq_data still matches the cpu_workqueue_struct | ||
| 119 | * - AND the work is still marked pending | ||
| 120 | * - AND the work is still on a list (which will be this | ||
| 121 | * workqueue_struct list) | ||
| 122 | * | ||
| 123 | * All these conditions are important, because we | ||
| 124 | * need to protect against the work being run right | ||
| 125 | * now on another CPU (all but the last one might be | ||
| 126 | * true if it's currently running and has not been | ||
| 127 | * released yet, for example). | ||
| 128 | */ | ||
| 129 | if (get_wq_data(work) == cwq | ||
| 130 | && work_pending(work) | ||
| 131 | && !list_empty(&work->entry)) { | ||
| 132 | work_func_t f = work->func; | ||
| 133 | list_del_init(&work->entry); | ||
| 134 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
| 135 | |||
| 136 | if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) | ||
| 137 | work_release(work); | ||
| 138 | f(work); | ||
| 139 | |||
| 140 | spin_lock_irqsave(&cwq->lock, flags); | ||
| 141 | cwq->remove_sequence++; | ||
| 142 | wake_up(&cwq->work_done); | ||
| 143 | ret = 1; | ||
| 144 | } | ||
| 145 | spin_unlock_irqrestore(&cwq->lock, flags); | ||
| 146 | return ret; | ||
| 147 | } | ||
| 148 | |||
| 149 | /** | ||
| 150 | * run_scheduled_work - run scheduled work synchronously | ||
| 151 | * @work: work to run | ||
| 152 | * | ||
| 153 | * This checks if the work was pending, and runs it | ||
| 154 | * synchronously if so. It returns a boolean to indicate | ||
| 155 | * whether it had any scheduled work to run or not. | ||
| 156 | * | ||
| 157 | * NOTE! This _only_ works for normal work_structs. You | ||
| 158 | * CANNOT use this for delayed work, because the wq data | ||
| 159 | * for delayed work will not point properly to the per- | ||
| 160 | * CPU workqueue struct, but will change! | ||
| 161 | */ | ||
| 162 | int fastcall run_scheduled_work(struct work_struct *work) | ||
| 163 | { | ||
| 164 | for (;;) { | ||
| 165 | struct cpu_workqueue_struct *cwq; | ||
| 166 | |||
| 167 | if (!work_pending(work)) | ||
| 168 | return 0; | ||
| 169 | if (list_empty(&work->entry)) | ||
| 170 | return 0; | ||
| 171 | /* NOTE! This depends intimately on __queue_work! */ | ||
| 172 | cwq = get_wq_data(work); | ||
| 173 | if (!cwq) | ||
| 174 | return 0; | ||
| 175 | if (__run_work(cwq, work)) | ||
| 176 | return 1; | ||
| 177 | } | ||
| 178 | } | ||
| 179 | EXPORT_SYMBOL(run_scheduled_work); | ||
| 180 | |||
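run_scheduled_work() gives callers a way to pull one specific pending item off its queue and run it synchronously, instead of flushing the whole workqueue. A hedged usage sketch, using only calls visible in this diff (my_sync_work and the handler are hypothetical, and per the note above this only works for plain work_structs, never struct delayed_work):

    /* Illustrative caller: force a previously scheduled item to run now. */
    static struct work_struct my_sync_work;

    static void my_sync_handler(struct work_struct *work)
    {
            /* ... the deferred processing ... */
    }

    static void kick_and_wait(void)
    {
            INIT_WORK(&my_sync_work, my_sync_handler);
            schedule_work(&my_sync_work);

            if (!run_scheduled_work(&my_sync_work))
                    flush_scheduled_work();  /* it already ran elsewhere */
    }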
| 83 | /* Preempt must be disabled. */ | 181 | /* Preempt must be disabled. */ |
| 84 | static void __queue_work(struct cpu_workqueue_struct *cwq, | 182 | static void __queue_work(struct cpu_workqueue_struct *cwq, |
| 85 | struct work_struct *work) | 183 | struct work_struct *work) |
| @@ -87,7 +185,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq, | |||
| 87 | unsigned long flags; | 185 | unsigned long flags; |
| 88 | 186 | ||
| 89 | spin_lock_irqsave(&cwq->lock, flags); | 187 | spin_lock_irqsave(&cwq->lock, flags); |
| 90 | work->wq_data = cwq; | 188 | set_wq_data(work, cwq); |
| 91 | list_add_tail(&work->entry, &cwq->worklist); | 189 | list_add_tail(&work->entry, &cwq->worklist); |
| 92 | cwq->insert_sequence++; | 190 | cwq->insert_sequence++; |
| 93 | wake_up(&cwq->more_work); | 191 | wake_up(&cwq->more_work); |
| @@ -108,7 +206,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
| 108 | { | 206 | { |
| 109 | int ret = 0, cpu = get_cpu(); | 207 | int ret = 0, cpu = get_cpu(); |
| 110 | 208 | ||
| 111 | if (!test_and_set_bit(0, &work->pending)) { | 209 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { |
| 112 | if (unlikely(is_single_threaded(wq))) | 210 | if (unlikely(is_single_threaded(wq))) |
| 113 | cpu = singlethread_cpu; | 211 | cpu = singlethread_cpu; |
| 114 | BUG_ON(!list_empty(&work->entry)); | 212 | BUG_ON(!list_empty(&work->entry)); |
| @@ -122,38 +220,42 @@ EXPORT_SYMBOL_GPL(queue_work); | |||
| 122 | 220 | ||
| 123 | static void delayed_work_timer_fn(unsigned long __data) | 221 | static void delayed_work_timer_fn(unsigned long __data) |
| 124 | { | 222 | { |
| 125 | struct work_struct *work = (struct work_struct *)__data; | 223 | struct delayed_work *dwork = (struct delayed_work *)__data; |
| 126 | struct workqueue_struct *wq = work->wq_data; | 224 | struct workqueue_struct *wq = get_wq_data(&dwork->work); |
| 127 | int cpu = smp_processor_id(); | 225 | int cpu = smp_processor_id(); |
| 128 | 226 | ||
| 129 | if (unlikely(is_single_threaded(wq))) | 227 | if (unlikely(is_single_threaded(wq))) |
| 130 | cpu = singlethread_cpu; | 228 | cpu = singlethread_cpu; |
| 131 | 229 | ||
| 132 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 230 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), &dwork->work); |
| 133 | } | 231 | } |
| 134 | 232 | ||
| 135 | /** | 233 | /** |
| 136 | * queue_delayed_work - queue work on a workqueue after delay | 234 | * queue_delayed_work - queue work on a workqueue after delay |
| 137 | * @wq: workqueue to use | 235 | * @wq: workqueue to use |
| 138 | * @work: work to queue | 236 | * @dwork: delayable work to queue |
| 139 | * @delay: number of jiffies to wait before queueing | 237 | * @delay: number of jiffies to wait before queueing |
| 140 | * | 238 | * |
| 141 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 239 | * Returns 0 if @work was already on a queue, non-zero otherwise. |
| 142 | */ | 240 | */ |
| 143 | int fastcall queue_delayed_work(struct workqueue_struct *wq, | 241 | int fastcall queue_delayed_work(struct workqueue_struct *wq, |
| 144 | struct work_struct *work, unsigned long delay) | 242 | struct delayed_work *dwork, unsigned long delay) |
| 145 | { | 243 | { |
| 146 | int ret = 0; | 244 | int ret = 0; |
| 147 | struct timer_list *timer = &work->timer; | 245 | struct timer_list *timer = &dwork->timer; |
| 246 | struct work_struct *work = &dwork->work; | ||
| 148 | 247 | ||
| 149 | if (!test_and_set_bit(0, &work->pending)) { | 248 | if (delay == 0) |
| 249 | return queue_work(wq, work); | ||
| 250 | |||
| 251 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { | ||
| 150 | BUG_ON(timer_pending(timer)); | 252 | BUG_ON(timer_pending(timer)); |
| 151 | BUG_ON(!list_empty(&work->entry)); | 253 | BUG_ON(!list_empty(&work->entry)); |
| 152 | 254 | ||
| 153 | /* This stores wq for the moment, for the timer_fn */ | 255 | /* This stores wq for the moment, for the timer_fn */ |
| 154 | work->wq_data = wq; | 256 | set_wq_data(work, wq); |
| 155 | timer->expires = jiffies + delay; | 257 | timer->expires = jiffies + delay; |
| 156 | timer->data = (unsigned long)work; | 258 | timer->data = (unsigned long)dwork; |
| 157 | timer->function = delayed_work_timer_fn; | 259 | timer->function = delayed_work_timer_fn; |
| 158 | add_timer(timer); | 260 | add_timer(timer); |
| 159 | ret = 1; | 261 | ret = 1; |
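The delayed paths now operate on struct delayed_work, which bundles the work_struct with its timer; the timer callback receives the delayed_work and re-derives the workqueue from the packed wq data. A sketch of the caller-side pattern under the reworked API follows; the names are hypothetical, and INIT_DELAYED_WORK() is assumed to be the companion initializer from <linux/workqueue.h>.

    /* Illustrative self-rearming poller using the delayed_work container.
     * The handler receives the embedded work_struct and uses container_of()
     * to get back to the delayed_work.
     */
    static struct delayed_work my_poll;

    static void my_poll_fn(struct work_struct *work)
    {
            struct delayed_work *dwork =
                    container_of(work, struct delayed_work, work);

            /* ... poll the hardware ... */
            schedule_delayed_work(dwork, HZ);       /* rearm in one second */
    }

    static void my_poll_start(void)
    {
            INIT_DELAYED_WORK(&my_poll, my_poll_fn);
            schedule_delayed_work(&my_poll, HZ);
    }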
| @@ -166,25 +268,26 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); | |||
| 166 | * queue_delayed_work_on - queue work on specific CPU after delay | 268 | * queue_delayed_work_on - queue work on specific CPU after delay |
| 167 | * @cpu: CPU number to execute work on | 269 | * @cpu: CPU number to execute work on |
| 168 | * @wq: workqueue to use | 270 | * @wq: workqueue to use |
| 169 | * @work: work to queue | 271 | * @dwork: work to queue |
| 170 | * @delay: number of jiffies to wait before queueing | 272 | * @delay: number of jiffies to wait before queueing |
| 171 | * | 273 | * |
| 172 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 274 | * Returns 0 if @work was already on a queue, non-zero otherwise. |
| 173 | */ | 275 | */ |
| 174 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | 276 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, |
| 175 | struct work_struct *work, unsigned long delay) | 277 | struct delayed_work *dwork, unsigned long delay) |
| 176 | { | 278 | { |
| 177 | int ret = 0; | 279 | int ret = 0; |
| 178 | struct timer_list *timer = &work->timer; | 280 | struct timer_list *timer = &dwork->timer; |
| 281 | struct work_struct *work = &dwork->work; | ||
| 179 | 282 | ||
| 180 | if (!test_and_set_bit(0, &work->pending)) { | 283 | if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { |
| 181 | BUG_ON(timer_pending(timer)); | 284 | BUG_ON(timer_pending(timer)); |
| 182 | BUG_ON(!list_empty(&work->entry)); | 285 | BUG_ON(!list_empty(&work->entry)); |
| 183 | 286 | ||
| 184 | /* This stores wq for the moment, for the timer_fn */ | 287 | /* This stores wq for the moment, for the timer_fn */ |
| 185 | work->wq_data = wq; | 288 | set_wq_data(work, wq); |
| 186 | timer->expires = jiffies + delay; | 289 | timer->expires = jiffies + delay; |
| 187 | timer->data = (unsigned long)work; | 290 | timer->data = (unsigned long)dwork; |
| 188 | timer->function = delayed_work_timer_fn; | 291 | timer->function = delayed_work_timer_fn; |
| 189 | add_timer_on(timer, cpu); | 292 | add_timer_on(timer, cpu); |
| 190 | ret = 1; | 293 | ret = 1; |
| @@ -212,15 +315,26 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq) | |||
| 212 | while (!list_empty(&cwq->worklist)) { | 315 | while (!list_empty(&cwq->worklist)) { |
| 213 | struct work_struct *work = list_entry(cwq->worklist.next, | 316 | struct work_struct *work = list_entry(cwq->worklist.next, |
| 214 | struct work_struct, entry); | 317 | struct work_struct, entry); |
| 215 | void (*f) (void *) = work->func; | 318 | work_func_t f = work->func; |
| 216 | void *data = work->data; | ||
| 217 | 319 | ||
| 218 | list_del_init(cwq->worklist.next); | 320 | list_del_init(cwq->worklist.next); |
| 219 | spin_unlock_irqrestore(&cwq->lock, flags); | 321 | spin_unlock_irqrestore(&cwq->lock, flags); |
| 220 | 322 | ||
| 221 | BUG_ON(work->wq_data != cwq); | 323 | BUG_ON(get_wq_data(work) != cwq); |
| 222 | clear_bit(0, &work->pending); | 324 | if (!test_bit(WORK_STRUCT_NOAUTOREL, work_data_bits(work))) |
| 223 | f(data); | 325 | work_release(work); |
| 326 | f(work); | ||
| 327 | |||
| 328 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | ||
| 329 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | ||
| 330 | "%s/0x%08x/%d\n", | ||
| 331 | current->comm, preempt_count(), | ||
| 332 | current->pid); | ||
| 333 | printk(KERN_ERR " last function: "); | ||
| 334 | print_symbol("%s\n", (unsigned long)f); | ||
| 335 | debug_show_held_locks(current); | ||
| 336 | dump_stack(); | ||
| 337 | } | ||
| 224 | 338 | ||
| 225 | spin_lock_irqsave(&cwq->lock, flags); | 339 | spin_lock_irqsave(&cwq->lock, flags); |
| 226 | cwq->remove_sequence++; | 340 | cwq->remove_sequence++; |
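The new check after f(work) catches handlers that return while still holding a lock or with an unbalanced preempt/atomic count, and prints the offending function. A deliberately broken sketch of the kind of handler it is meant to flag (my_lock and broken_handler are invented; whether it actually trips depends on preemption/lockdep being configured):

    /* Deliberately buggy example: returns with the spinlock held, so the
     * run_workqueue() diagnostic above can report
     * "BUG: workqueue leaked lock or atomic" and print this function.
     */
    static DEFINE_SPINLOCK(my_lock);

    static void broken_handler(struct work_struct *work)
    {
            spin_lock(&my_lock);
            /* ... forgets spin_unlock(&my_lock) before returning ... */
    }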
| @@ -237,7 +351,8 @@ static int worker_thread(void *__cwq) | |||
| 237 | struct k_sigaction sa; | 351 | struct k_sigaction sa; |
| 238 | sigset_t blocked; | 352 | sigset_t blocked; |
| 239 | 353 | ||
| 240 | current->flags |= PF_NOFREEZE; | 354 | if (!cwq->freezeable) |
| 355 | current->flags |= PF_NOFREEZE; | ||
| 241 | 356 | ||
| 242 | set_user_nice(current, -5); | 357 | set_user_nice(current, -5); |
| 243 | 358 | ||
| @@ -260,6 +375,9 @@ static int worker_thread(void *__cwq) | |||
| 260 | 375 | ||
| 261 | set_current_state(TASK_INTERRUPTIBLE); | 376 | set_current_state(TASK_INTERRUPTIBLE); |
| 262 | while (!kthread_should_stop()) { | 377 | while (!kthread_should_stop()) { |
| 378 | if (cwq->freezeable) | ||
| 379 | try_to_freeze(); | ||
| 380 | |||
| 263 | add_wait_queue(&cwq->more_work, &wait); | 381 | add_wait_queue(&cwq->more_work, &wait); |
| 264 | if (list_empty(&cwq->worklist)) | 382 | if (list_empty(&cwq->worklist)) |
| 265 | schedule(); | 383 | schedule(); |
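worker_thread() now leaves PF_NOFREEZE clear and calls try_to_freeze() for freezeable queues, so those kthreads park in the refrigerator across suspend. A sketch of creating such a queue through the extended constructor; "my_fs_wq" is a made-up name, the raw __create_workqueue() signature is taken from the hunks below, and any create_freezeable_workqueue() convenience wrapper is assumed rather than shown in this diff.

    /* Illustrative: a multi-threaded, freezeable workqueue. */
    static struct workqueue_struct *my_wq;

    static int my_wq_init(void)
    {
            my_wq = __create_workqueue("my_fs_wq",
                                       0 /* singlethread */,
                                       1 /* freezeable */);
            return my_wq ? 0 : -ENOMEM;
    }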
| @@ -336,7 +454,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
| 336 | EXPORT_SYMBOL_GPL(flush_workqueue); | 454 | EXPORT_SYMBOL_GPL(flush_workqueue); |
| 337 | 455 | ||
| 338 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | 456 | static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, |
| 339 | int cpu) | 457 | int cpu, int freezeable) |
| 340 | { | 458 | { |
| 341 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 459 | struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
| 342 | struct task_struct *p; | 460 | struct task_struct *p; |
| @@ -346,6 +464,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | |||
| 346 | cwq->thread = NULL; | 464 | cwq->thread = NULL; |
| 347 | cwq->insert_sequence = 0; | 465 | cwq->insert_sequence = 0; |
| 348 | cwq->remove_sequence = 0; | 466 | cwq->remove_sequence = 0; |
| 467 | cwq->freezeable = freezeable; | ||
| 349 | INIT_LIST_HEAD(&cwq->worklist); | 468 | INIT_LIST_HEAD(&cwq->worklist); |
| 350 | init_waitqueue_head(&cwq->more_work); | 469 | init_waitqueue_head(&cwq->more_work); |
| 351 | init_waitqueue_head(&cwq->work_done); | 470 | init_waitqueue_head(&cwq->work_done); |
| @@ -361,7 +480,7 @@ static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, | |||
| 361 | } | 480 | } |
| 362 | 481 | ||
| 363 | struct workqueue_struct *__create_workqueue(const char *name, | 482 | struct workqueue_struct *__create_workqueue(const char *name, |
| 364 | int singlethread) | 483 | int singlethread, int freezeable) |
| 365 | { | 484 | { |
| 366 | int cpu, destroy = 0; | 485 | int cpu, destroy = 0; |
| 367 | struct workqueue_struct *wq; | 486 | struct workqueue_struct *wq; |
| @@ -381,7 +500,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 381 | mutex_lock(&workqueue_mutex); | 500 | mutex_lock(&workqueue_mutex); |
| 382 | if (singlethread) { | 501 | if (singlethread) { |
| 383 | INIT_LIST_HEAD(&wq->list); | 502 | INIT_LIST_HEAD(&wq->list); |
| 384 | p = create_workqueue_thread(wq, singlethread_cpu); | 503 | p = create_workqueue_thread(wq, singlethread_cpu, freezeable); |
| 385 | if (!p) | 504 | if (!p) |
| 386 | destroy = 1; | 505 | destroy = 1; |
| 387 | else | 506 | else |
| @@ -389,7 +508,7 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
| 389 | } else { | 508 | } else { |
| 390 | list_add(&wq->list, &workqueues); | 509 | list_add(&wq->list, &workqueues); |
| 391 | for_each_online_cpu(cpu) { | 510 | for_each_online_cpu(cpu) { |
| 392 | p = create_workqueue_thread(wq, cpu); | 511 | p = create_workqueue_thread(wq, cpu, freezeable); |
| 393 | if (p) { | 512 | if (p) { |
| 394 | kthread_bind(p, cpu); | 513 | kthread_bind(p, cpu); |
| 395 | wake_up_process(p); | 514 | wake_up_process(p); |
| @@ -468,38 +587,37 @@ EXPORT_SYMBOL(schedule_work); | |||
| 468 | 587 | ||
| 469 | /** | 588 | /** |
| 470 | * schedule_delayed_work - put work task in global workqueue after delay | 589 | * schedule_delayed_work - put work task in global workqueue after delay |
| 471 | * @work: job to be done | 590 | * @dwork: job to be done |
| 472 | * @delay: number of jiffies to wait | 591 | * @delay: number of jiffies to wait or 0 for immediate execution |
| 473 | * | 592 | * |
| 474 | * After waiting for a given time this puts a job in the kernel-global | 593 | * After waiting for a given time this puts a job in the kernel-global |
| 475 | * workqueue. | 594 | * workqueue. |
| 476 | */ | 595 | */ |
| 477 | int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) | 596 | int fastcall schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) |
| 478 | { | 597 | { |
| 479 | return queue_delayed_work(keventd_wq, work, delay); | 598 | return queue_delayed_work(keventd_wq, dwork, delay); |
| 480 | } | 599 | } |
| 481 | EXPORT_SYMBOL(schedule_delayed_work); | 600 | EXPORT_SYMBOL(schedule_delayed_work); |
| 482 | 601 | ||
| 483 | /** | 602 | /** |
| 484 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | 603 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay |
| 485 | * @cpu: cpu to use | 604 | * @cpu: cpu to use |
| 486 | * @work: job to be done | 605 | * @dwork: job to be done |
| 487 | * @delay: number of jiffies to wait | 606 | * @delay: number of jiffies to wait |
| 488 | * | 607 | * |
| 489 | * After waiting for a given time this puts a job in the kernel-global | 608 | * After waiting for a given time this puts a job in the kernel-global |
| 490 | * workqueue on the specified CPU. | 609 | * workqueue on the specified CPU. |
| 491 | */ | 610 | */ |
| 492 | int schedule_delayed_work_on(int cpu, | 611 | int schedule_delayed_work_on(int cpu, |
| 493 | struct work_struct *work, unsigned long delay) | 612 | struct delayed_work *dwork, unsigned long delay) |
| 494 | { | 613 | { |
| 495 | return queue_delayed_work_on(cpu, keventd_wq, work, delay); | 614 | return queue_delayed_work_on(cpu, keventd_wq, dwork, delay); |
| 496 | } | 615 | } |
| 497 | EXPORT_SYMBOL(schedule_delayed_work_on); | 616 | EXPORT_SYMBOL(schedule_delayed_work_on); |
| 498 | 617 | ||
| 499 | /** | 618 | /** |
| 500 | * schedule_on_each_cpu - call a function on each online CPU from keventd | 619 | * schedule_on_each_cpu - call a function on each online CPU from keventd |
| 501 | * @func: the function to call | 620 | * @func: the function to call |
| 502 | * @info: a pointer to pass to func() | ||
| 503 | * | 621 | * |
| 504 | * Returns zero on success. | 622 | * Returns zero on success. |
| 505 | * Returns -ve errno on failure. | 623 | * Returns -ve errno on failure. |
| @@ -508,7 +626,7 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
| 508 | * | 626 | * |
| 509 | * schedule_on_each_cpu() is very slow. | 627 | * schedule_on_each_cpu() is very slow. |
| 510 | */ | 628 | */ |
| 511 | int schedule_on_each_cpu(void (*func)(void *info), void *info) | 629 | int schedule_on_each_cpu(work_func_t func) |
| 512 | { | 630 | { |
| 513 | int cpu; | 631 | int cpu; |
| 514 | struct work_struct *works; | 632 | struct work_struct *works; |
| @@ -519,9 +637,11 @@ int schedule_on_each_cpu(void (*func)(void *info), void *info) | |||
| 519 | 637 | ||
| 520 | mutex_lock(&workqueue_mutex); | 638 | mutex_lock(&workqueue_mutex); |
| 521 | for_each_online_cpu(cpu) { | 639 | for_each_online_cpu(cpu) { |
| 522 | INIT_WORK(per_cpu_ptr(works, cpu), func, info); | 640 | struct work_struct *work = per_cpu_ptr(works, cpu); |
| 523 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | 641 | |
| 524 | per_cpu_ptr(works, cpu)); | 642 | INIT_WORK(work, func); |
| 643 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); | ||
| 644 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); | ||
| 525 | } | 645 | } |
| 526 | mutex_unlock(&workqueue_mutex); | 646 | mutex_unlock(&workqueue_mutex); |
| 527 | flush_workqueue(keventd_wq); | 647 | flush_workqueue(keventd_wq); |
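schedule_on_each_cpu() now takes a bare work_func_t, so per-CPU context has to come from the handler itself (for example per-CPU data) rather than from the removed info pointer. A hedged example; the per-CPU counter and helper names are invented.

    /* Illustrative: run a function once on every online CPU via keventd. */
    static DEFINE_PER_CPU(unsigned long, my_cpu_count);

    static void bump_local_counter(struct work_struct *unused)
    {
            /* runs in that CPU's keventd thread, so this touches the
             * local CPU's counter */
            __get_cpu_var(my_cpu_count)++;
    }

    static int bump_all_cpus(void)
    {
            return schedule_on_each_cpu(bump_local_counter);
    }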
| @@ -539,12 +659,12 @@ EXPORT_SYMBOL(flush_scheduled_work); | |||
| 539 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed | 659 | * cancel_rearming_delayed_workqueue - reliably kill off a delayed |
| 540 | * work whose handler rearms the delayed work. | 660 | * work whose handler rearms the delayed work. |
| 541 | * @wq: the controlling workqueue structure | 661 | * @wq: the controlling workqueue structure |
| 542 | * @work: the delayed work struct | 662 | * @dwork: the delayed work struct |
| 543 | */ | 663 | */ |
| 544 | void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, | 664 | void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, |
| 545 | struct work_struct *work) | 665 | struct delayed_work *dwork) |
| 546 | { | 666 | { |
| 547 | while (!cancel_delayed_work(work)) | 667 | while (!cancel_delayed_work(dwork)) |
| 548 | flush_workqueue(wq); | 668 | flush_workqueue(wq); |
| 549 | } | 669 | } |
| 550 | EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); | 670 | EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); |
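cancel_rearming_delayed_workqueue() exists because a handler that requeues itself can race a plain cancel_delayed_work(): the cancel may land in the window where the timer has fired but the work has not been requeued yet, hence the cancel-then-flush loop above. Teardown for a self-rearming item therefore looks roughly like this sketch (names are hypothetical, in the style of the earlier my_poll example):

    /* Illustrative teardown of a self-rearming delayed_work. */
    static void my_poll_stop(struct workqueue_struct *my_wq,
                             struct delayed_work *my_poll)
    {
            /* loops cancel + flush until the item is gone for good */
            cancel_rearming_delayed_workqueue(my_wq, my_poll);

            /* for keventd-scheduled items the short form is equivalent:
             * cancel_rearming_delayed_work(my_poll);
             */
    }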
| @@ -552,18 +672,17 @@ EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); | |||
| 552 | /** | 672 | /** |
| 553 | * cancel_rearming_delayed_work - reliably kill off a delayed keventd | 673 | * cancel_rearming_delayed_work - reliably kill off a delayed keventd |
| 554 | * work whose handler rearms the delayed work. | 674 | * work whose handler rearms the delayed work. |
| 555 | * @work: the delayed work struct | 675 | * @dwork: the delayed work struct |
| 556 | */ | 676 | */ |
| 557 | void cancel_rearming_delayed_work(struct work_struct *work) | 677 | void cancel_rearming_delayed_work(struct delayed_work *dwork) |
| 558 | { | 678 | { |
| 559 | cancel_rearming_delayed_workqueue(keventd_wq, work); | 679 | cancel_rearming_delayed_workqueue(keventd_wq, dwork); |
| 560 | } | 680 | } |
| 561 | EXPORT_SYMBOL(cancel_rearming_delayed_work); | 681 | EXPORT_SYMBOL(cancel_rearming_delayed_work); |
| 562 | 682 | ||
| 563 | /** | 683 | /** |
| 564 | * execute_in_process_context - reliably execute the routine with user context | 684 | * execute_in_process_context - reliably execute the routine with user context |
| 565 | * @fn: the function to execute | 685 | * @fn: the function to execute |
| 566 | * @data: data to pass to the function | ||
| 567 | * @ew: guaranteed storage for the execute work structure (must | 686 | * @ew: guaranteed storage for the execute work structure (must |
| 568 | * be available when the work executes) | 687 | * be available when the work executes) |
| 569 | * | 688 | * |
| @@ -573,15 +692,14 @@ EXPORT_SYMBOL(cancel_rearming_delayed_work); | |||
| 573 | * Returns: 0 - function was executed | 692 | * Returns: 0 - function was executed |
| 574 | * 1 - function was scheduled for execution | 693 | * 1 - function was scheduled for execution |
| 575 | */ | 694 | */ |
| 576 | int execute_in_process_context(void (*fn)(void *data), void *data, | 695 | int execute_in_process_context(work_func_t fn, struct execute_work *ew) |
| 577 | struct execute_work *ew) | ||
| 578 | { | 696 | { |
| 579 | if (!in_interrupt()) { | 697 | if (!in_interrupt()) { |
| 580 | fn(data); | 698 | fn(&ew->work); |
| 581 | return 0; | 699 | return 0; |
| 582 | } | 700 | } |
| 583 | 701 | ||
| 584 | INIT_WORK(&ew->work, fn, data); | 702 | INIT_WORK(&ew->work, fn); |
| 585 | schedule_work(&ew->work); | 703 | schedule_work(&ew->work); |
| 586 | 704 | ||
| 587 | return 1; | 705 | return 1; |
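execute_in_process_context() now follows the same convention: the callback receives the embedded work_struct, and any surrounding state is reached with container_of() on the execute_work holder. A hedged sketch of a release path that may be entered from interrupt context; struct my_dev and its helpers are invented for illustration.

    /* Illustrative: defer the final free to process context when the
     * release path may run in an interrupt.
     */
    struct my_dev {
            struct execute_work free_work;
            /* ... */
    };

    static void my_dev_free(struct work_struct *work)
    {
            struct execute_work *ew =
                    container_of(work, struct execute_work, work);
            struct my_dev *dev = container_of(ew, struct my_dev, free_work);

            kfree(dev);
    }

    static void my_dev_release(struct my_dev *dev)
    {
            /* runs my_dev_free() directly unless in_interrupt() */
            execute_in_process_context(my_dev_free, &dev->free_work);
    }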
| @@ -609,7 +727,6 @@ int current_is_keventd(void) | |||
| 609 | 727 | ||
| 610 | } | 728 | } |
| 611 | 729 | ||
| 612 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 613 | /* Take the work from this (downed) CPU. */ | 730 | /* Take the work from this (downed) CPU. */ |
| 614 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | 731 | static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) |
| 615 | { | 732 | { |
| @@ -642,7 +759,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 642 | mutex_lock(&workqueue_mutex); | 759 | mutex_lock(&workqueue_mutex); |
| 643 | /* Create a new workqueue thread for it. */ | 760 | /* Create a new workqueue thread for it. */ |
| 644 | list_for_each_entry(wq, &workqueues, list) { | 761 | list_for_each_entry(wq, &workqueues, list) { |
| 645 | if (!create_workqueue_thread(wq, hotcpu)) { | 762 | if (!create_workqueue_thread(wq, hotcpu, 0)) { |
| 646 | printk("workqueue for %i failed\n", hotcpu); | 763 | printk("workqueue for %i failed\n", hotcpu); |
| 647 | return NOTIFY_BAD; | 764 | return NOTIFY_BAD; |
| 648 | } | 765 | } |
| @@ -692,7 +809,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
| 692 | 809 | ||
| 693 | return NOTIFY_OK; | 810 | return NOTIFY_OK; |
| 694 | } | 811 | } |
| 695 | #endif | ||
| 696 | 812 | ||
| 697 | void init_workqueues(void) | 813 | void init_workqueues(void) |
| 698 | { | 814 | { |
