Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile             1
-rw-r--r--  kernel/acct.c               2
-rw-r--r--  kernel/cpuset.c           125
-rw-r--r--  kernel/fork.c               3
-rw-r--r--  kernel/futex.c            137
-rw-r--r--  kernel/intermodule.c        3
-rw-r--r--  kernel/irq/handle.c         2
-rw-r--r--  kernel/irq/manage.c         4
-rw-r--r--  kernel/irq/proc.c          14
-rw-r--r--  kernel/kprobes.c           94
-rw-r--r--  kernel/module.c            33
-rw-r--r--  kernel/params.c             4
-rw-r--r--  kernel/posix-timers.c      28
-rw-r--r--  kernel/power/Kconfig       14
-rw-r--r--  kernel/power/disk.c        55
-rw-r--r--  kernel/power/main.c         5
-rw-r--r--  kernel/power/pm.c           3
-rw-r--r--  kernel/power/process.c     29
-rw-r--r--  kernel/power/swsusp.c     202
-rw-r--r--  kernel/printk.c            13
-rw-r--r--  kernel/ptrace.c            41
-rw-r--r--  kernel/resource.c           3
-rw-r--r--  kernel/sched.c            340
-rw-r--r--  kernel/signal.c            83
-rw-r--r--  kernel/softlockup.c       151
-rw-r--r--  kernel/sys.c                6
-rw-r--r--  kernel/timer.c             18
-rw-r--r--  kernel/workqueue.c          5
28 files changed, 1128 insertions, 290 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..8d57a2f1226b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
28obj-$(CONFIG_KPROBES) += kprobes.o 28obj-$(CONFIG_KPROBES) += kprobes.o
29obj-$(CONFIG_SYSFS) += ksysfs.o 29obj-$(CONFIG_SYSFS) += ksysfs.o
30obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 31obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 32obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
32obj-$(CONFIG_SECCOMP) += seccomp.o 33obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..f70e6027cca9 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -220,7 +220,7 @@ asmlinkage long sys_acct(const char __user *name)
220 return (PTR_ERR(tmp)); 220 return (PTR_ERR(tmp));
221 } 221 }
222 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
223 file = filp_open(tmp, O_WRONLY|O_APPEND, 0); 223 file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
224 putname(tmp); 224 putname(tmp);
225 if (IS_ERR(file)) { 225 if (IS_ERR(file)) {
226 return (PTR_ERR(file)); 226 return (PTR_ERR(file));
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..1f06e7690106 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -628,13 +628,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
629 */ 629 */
630 630
631/*
632 * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
633 * Disable letting 'cpu_exclusive' cpusets define dynamic sched
634 * domains, until the sched domain can handle partial nodes.
635 * Remove this #if hackery when sched domains fixed.
636 */
637#if 0
638static void update_cpu_domains(struct cpuset *cur) 631static void update_cpu_domains(struct cpuset *cur)
639{ 632{
640 struct cpuset *c, *par = cur->parent; 633 struct cpuset *c, *par = cur->parent;
@@ -675,11 +668,6 @@ static void update_cpu_domains(struct cpuset *cur)
675 partition_sched_domains(&pspan, &cspan); 668 partition_sched_domains(&pspan, &cspan);
676 unlock_cpu_hotplug(); 669 unlock_cpu_hotplug();
677} 670}
678#else
679static void update_cpu_domains(struct cpuset *cur)
680{
681}
682#endif
683 671
684static int update_cpumask(struct cpuset *cs, char *buf) 672static int update_cpumask(struct cpuset *cs, char *buf)
685{ 673{
@@ -1611,17 +1599,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1611 return 0; 1599 return 0;
1612} 1600}
1613 1601
1602/*
1603 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1604 * ancestor to the specified cpuset. Call while holding cpuset_sem.
1605 * If no ancestor is mem_exclusive (an unusual configuration), then
1606 * returns the root cpuset.
1607 */
1608static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1609{
1610 while (!is_mem_exclusive(cs) && cs->parent)
1611 cs = cs->parent;
1612 return cs;
1613}
1614
1614/** 1615/**
1615 * cpuset_zone_allowed - is zone z allowed in current->mems_allowed 1616 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
1616 * @z: zone in question 1617 * @z: is this zone on an allowed node?
1618 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
1617 * 1619 *
1618 * Is zone z allowed in current->mems_allowed, or is 1620 * If we're in interrupt, yes, we can always allocate. If zone
1619 * the CPU in interrupt context? (zone is always allowed in this case) 1621 * z's node is in our tasks mems_allowed, yes. If it's not a
1620 */ 1622 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1621int cpuset_zone_allowed(struct zone *z) 1623 * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
1624 * Otherwise, no.
1625 *
1626 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
1627 * and do not allow allocations outside the current tasks cpuset.
1628 * GFP_KERNEL allocations are not so marked, so can escape to the
1629 * nearest mem_exclusive ancestor cpuset.
1630 *
1631 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
1632 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1633 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
1634 * mems_allowed came up empty on the first pass over the zonelist.
1635 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1636 * short of memory, might require taking the cpuset_sem semaphore.
1637 *
1638 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1639 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
1640 * hardwall cpusets - no allocation on a node outside the cpuset is
1641 * allowed (unless in interrupt, of course).
1642 *
1643 * The second loop doesn't even call here for GFP_ATOMIC requests
1644 * (if the __alloc_pages() local variable 'wait' is set). That check
1645 * and the checks below have the combined affect in the second loop of
1646 * the __alloc_pages() routine that:
1647 * in_interrupt - any node ok (current task context irrelevant)
1648 * GFP_ATOMIC - any node ok
1649 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
1650 * GFP_USER - only nodes in current tasks mems allowed ok.
1651 **/
1652
1653int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
1622{ 1654{
1623 return in_interrupt() || 1655 int node; /* node that zone z is on */
1624 node_isset(z->zone_pgdat->node_id, current->mems_allowed); 1656 const struct cpuset *cs; /* current cpuset ancestors */
1657 int allowed = 1; /* is allocation in zone z allowed? */
1658
1659 if (in_interrupt())
1660 return 1;
1661 node = z->zone_pgdat->node_id;
1662 if (node_isset(node, current->mems_allowed))
1663 return 1;
1664 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1665 return 0;
1666
1667 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1668 down(&cpuset_sem);
1669 cs = current->cpuset;
1670 if (!cs)
1671 goto done; /* current task exiting */
1672 cs = nearest_exclusive_ancestor(cs);
1673 allowed = node_isset(node, cs->mems_allowed);
1674done:
1675 up(&cpuset_sem);
1676 return allowed;
1677}
1678
1679/**
1680 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1681 * @p: pointer to task_struct of some other task.
1682 *
1683 * Description: Return true if the nearest mem_exclusive ancestor
1684 * cpusets of tasks @p and current overlap. Used by oom killer to
1685 * determine if task @p's memory usage might impact the memory
1686 * available to the current task.
1687 *
1688 * Acquires cpuset_sem - not suitable for calling from a fast path.
1689 **/
1690
1691int cpuset_excl_nodes_overlap(const struct task_struct *p)
1692{
1693 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1694 int overlap = 0; /* do cpusets overlap? */
1695
1696 down(&cpuset_sem);
1697 cs1 = current->cpuset;
1698 if (!cs1)
1699 goto done; /* current task exiting */
1700 cs2 = p->cpuset;
1701 if (!cs2)
1702 goto done; /* task p is exiting */
1703 cs1 = nearest_exclusive_ancestor(cs1);
1704 cs2 = nearest_exclusive_ancestor(cs2);
1705 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1706done:
1707 up(&cpuset_sem);
1708
1709 return overlap;
1625} 1710}
1626 1711
1627/* 1712/*
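The new cpuset_zone_allowed() above checks, in order: interrupt context, the task's own mems_allowed, the __GFP_HARDWALL bit, and finally the mems_allowed of the nearest mem_exclusive ancestor cpuset. Below is a minimal user-space sketch of that decision order only; the struct and helpers are simplified stand-ins (assumptions), not the kernel's cpuset types.

/*
 * User-space model of the cpuset_zone_allowed() decision order shown above.
 * All types and values here are simplified stand-ins, not kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

#define GFP_HARDWALL 0x1                /* stand-in for __GFP_HARDWALL */

struct cpuset_model {
        struct cpuset_model *parent;
        bool mem_exclusive;
        unsigned long mems_allowed;     /* bitmask of allowed memory nodes */
};

static const struct cpuset_model *
nearest_exclusive_ancestor(const struct cpuset_model *cs)
{
        while (!cs->mem_exclusive && cs->parent)
                cs = cs->parent;
        return cs;
}

static bool zone_allowed(const struct cpuset_model *task_cs, int node,
                         int gfp_mask, bool in_interrupt)
{
        if (in_interrupt)
                return true;                       /* interrupts: any node */
        if (task_cs->mems_allowed & (1UL << node))
                return true;                       /* node in mems_allowed */
        if (gfp_mask & GFP_HARDWALL)
                return false;                      /* GFP_USER-style: hard wall */
        /* GFP_KERNEL-style: may escape to the nearest mem_exclusive ancestor */
        return nearest_exclusive_ancestor(task_cs)->mems_allowed & (1UL << node);
}

int main(void)
{
        struct cpuset_model root  = { NULL,  true,  0xf };  /* nodes 0-3 */
        struct cpuset_model child = { &root, false, 0x1 };  /* node 0 only */

        printf("hardwall request, node 2: %d\n",
               zone_allowed(&child, 2, GFP_HARDWALL, false));
        printf("kernel request, node 2:   %d\n",
               zone_allowed(&child, 2, 0, false));
        return 0;
}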
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74e..7e1ead9a6ba4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -994,6 +994,9 @@ static task_t *copy_process(unsigned long clone_flags,
994 * of CLONE_PTRACE. 994 * of CLONE_PTRACE.
995 */ 995 */
996 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 996 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
997#ifdef TIF_SYSCALL_EMU
998 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
999#endif
997 1000
998 /* Our parent execution domain becomes current domain 1001 /* Our parent execution domain becomes current domain
999 These must match for thread signalling to apply */ 1002 These must match for thread signalling to apply */
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/syscalls.h> 41#include <linux/syscalls.h>
42#include <linux/signal.h> 42#include <linux/signal.h>
43#include <asm/futex.h>
43 44
44#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 45#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
45 46
@@ -327,6 +328,118 @@ out:
327} 328}
328 329
329/* 330/*
331 * Wake up all waiters hashed on the physical page that is mapped
332 * to this virtual address:
333 */
334static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
335{
336 union futex_key key1, key2;
337 struct futex_hash_bucket *bh1, *bh2;
338 struct list_head *head;
339 struct futex_q *this, *next;
340 int ret, op_ret, attempt = 0;
341
342retryfull:
343 down_read(&current->mm->mmap_sem);
344
345 ret = get_futex_key(uaddr1, &key1);
346 if (unlikely(ret != 0))
347 goto out;
348 ret = get_futex_key(uaddr2, &key2);
349 if (unlikely(ret != 0))
350 goto out;
351
352 bh1 = hash_futex(&key1);
353 bh2 = hash_futex(&key2);
354
355retry:
356 if (bh1 < bh2)
357 spin_lock(&bh1->lock);
358 spin_lock(&bh2->lock);
359 if (bh1 > bh2)
360 spin_lock(&bh1->lock);
361
362 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
363 if (unlikely(op_ret < 0)) {
364 int dummy;
365
366 spin_unlock(&bh1->lock);
367 if (bh1 != bh2)
368 spin_unlock(&bh2->lock);
369
370 /* futex_atomic_op_inuser needs to both read and write
371 * *(int __user *)uaddr2, but we can't modify it
372 * non-atomically. Therefore, if get_user below is not
373 * enough, we need to handle the fault ourselves, while
374 * still holding the mmap_sem. */
375 if (attempt++) {
376 struct vm_area_struct * vma;
377 struct mm_struct *mm = current->mm;
378
379 ret = -EFAULT;
380 if (attempt >= 2 ||
381 !(vma = find_vma(mm, uaddr2)) ||
382 vma->vm_start > uaddr2 ||
383 !(vma->vm_flags & VM_WRITE))
384 goto out;
385
386 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
387 case VM_FAULT_MINOR:
388 current->min_flt++;
389 break;
390 case VM_FAULT_MAJOR:
391 current->maj_flt++;
392 break;
393 default:
394 goto out;
395 }
396 goto retry;
397 }
398
399 /* If we would have faulted, release mmap_sem,
400 * fault it in and start all over again. */
401 up_read(&current->mm->mmap_sem);
402
403 ret = get_user(dummy, (int __user *)uaddr2);
404 if (ret)
405 return ret;
406
407 goto retryfull;
408 }
409
410 head = &bh1->chain;
411
412 list_for_each_entry_safe(this, next, head, list) {
413 if (match_futex (&this->key, &key1)) {
414 wake_futex(this);
415 if (++ret >= nr_wake)
416 break;
417 }
418 }
419
420 if (op_ret > 0) {
421 head = &bh2->chain;
422
423 op_ret = 0;
424 list_for_each_entry_safe(this, next, head, list) {
425 if (match_futex (&this->key, &key2)) {
426 wake_futex(this);
427 if (++op_ret >= nr_wake2)
428 break;
429 }
430 }
431 ret += op_ret;
432 }
433
434 spin_unlock(&bh1->lock);
435 if (bh1 != bh2)
436 spin_unlock(&bh2->lock);
437out:
438 up_read(&current->mm->mmap_sem);
439 return ret;
440}
441
442/*
330 * Requeue all waiters hashed on one physical page to another 443 * Requeue all waiters hashed on one physical page to another
331 * physical page. 444 * physical page.
332 */ 445 */
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
673 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 786 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
674 787
675 if (signal) { 788 if (signal) {
676 int err;
677 err = f_setown(filp, current->pid, 1); 789 err = f_setown(filp, current->pid, 1);
678 if (err < 0) { 790 if (err < 0) {
679 put_unused_fd(ret); 791 goto error;
680 put_filp(filp);
681 ret = err;
682 goto out;
683 } 792 }
684 filp->f_owner.signum = signal; 793 filp->f_owner.signum = signal;
685 } 794 }
686 795
687 q = kmalloc(sizeof(*q), GFP_KERNEL); 796 q = kmalloc(sizeof(*q), GFP_KERNEL);
688 if (!q) { 797 if (!q) {
689 put_unused_fd(ret); 798 err = -ENOMEM;
690 put_filp(filp); 799 goto error;
691 ret = -ENOMEM;
692 goto out;
693 } 800 }
694 801
695 down_read(&current->mm->mmap_sem); 802 down_read(&current->mm->mmap_sem);
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
697 804
698 if (unlikely(err != 0)) { 805 if (unlikely(err != 0)) {
699 up_read(&current->mm->mmap_sem); 806 up_read(&current->mm->mmap_sem);
700 put_unused_fd(ret);
701 put_filp(filp);
702 kfree(q); 807 kfree(q);
703 return err; 808 goto error;
704 } 809 }
705 810
706 /* 811 /*
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
716 fd_install(ret, filp); 821 fd_install(ret, filp);
717out: 822out:
718 return ret; 823 return ret;
824error:
825 put_unused_fd(ret);
826 put_filp(filp);
827 ret = err;
828 goto out;
719} 829}
720 830
721long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 831long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
740 case FUTEX_CMP_REQUEUE: 850 case FUTEX_CMP_REQUEUE:
741 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 851 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
742 break; 852 break;
853 case FUTEX_WAKE_OP:
854 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
855 break;
743 default: 856 default:
744 ret = -ENOSYS; 857 ret = -ENOSYS;
745 } 858 }
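FUTEX_WAKE_OP, added above, atomically applies an encoded operation to the word at uaddr2, always wakes up to nr_wake waiters on uaddr1, and additionally wakes up to nr_wake2 waiters on uaddr2 when the comparison against the old value of *uaddr2 succeeds. The sketch below shows how user space would drive it through the raw syscall; it assumes <linux/futex.h> exports FUTEX_WAKE_OP and the FUTEX_OP() encoding macros (headers contemporary with this patch may not yet carry them), and there is no libc wrapper.

/*
 * Hedged user-space sketch of FUTEX_WAKE_OP.  Assumes <linux/futex.h>
 * provides FUTEX_WAKE_OP, FUTEX_OP(), FUTEX_OP_ADD and FUTEX_OP_CMP_GE.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

static int futex1, futex2;

int main(void)
{
        /*
         * Atomically do "futex2 += 1", wake up to 1 waiter on futex1 and,
         * if the old value of futex2 was >= 0, up to 1 waiter on futex2.
         * The timeout argument slot carries nr_wake2 for this operation.
         */
        long woken = syscall(SYS_futex, &futex1, FUTEX_WAKE_OP,
                             1,                 /* nr_wake on futex1  */
                             (void *)1UL,       /* nr_wake2 on futex2 */
                             &futex2,
                             FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_GE, 0));

        printf("woke %ld waiter(s), futex2 is now %d\n", woken, futex2);
        return 0;
}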
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
39 struct list_head *tmp; 39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new; 40 struct inter_module_entry *ime, *ime_new;
41 41
42 if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { 42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */ 43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR 44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", 45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
47 kmalloc_failed = 1; 47 kmalloc_failed = 1;
48 return; 48 return;
49 } 49 }
50 memset(ime_new, 0, sizeof(*ime_new));
51 ime_new->im_name = im_name; 50 ime_new->im_name = im_name;
52 ime_new->owner = owner; 51 ime_new->owner = owner;
53 ime_new->userdata = userdata; 52 ime_new->userdata = userdata;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
111 unsigned int status; 111 unsigned int status;
112 112
113 kstat_this_cpu.irqs[irq]++; 113 kstat_this_cpu.irqs[irq]++;
114 if (desc->status & IRQ_PER_CPU) { 114 if (CHECK_IRQ_PER_CPU(desc->status)) {
115 irqreturn_t action_ret; 115 irqreturn_t action_ret;
116 116
117 /* 117 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; 19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20 20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
21/** 25/**
22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
23 * 27 *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
19 */ 19 */
20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; 20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21 21
22void __attribute__((weak)) 22#ifdef CONFIG_GENERIC_PENDING_IRQ
23proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 23void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{
25 /*
26 * Save these away for later use. Re-progam when the
27 * interrupt is pending
28 */
29 set_pending_irq(irq, mask_val);
30}
31#else
32void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{ 33{
25 irq_affinity[irq] = mask_val; 34 irq_affinity[irq] = mask_val;
26 irq_desc[irq].handler->set_affinity(irq, mask_val); 35 irq_desc[irq].handler->set_affinity(irq, mask_val);
27} 36}
37#endif
28 38
29static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
30 int count, int *eof, void *data) 40 int count, int *eof, void *data)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <asm-generic/sections.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
41#include <asm/errno.h> 42#include <asm/errno.h>
42#include <asm/kdebug.h> 43#include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
72 * get_insn_slot() - Find a slot on an executable page for an instruction. 73 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones. 74 * We allocate an executable page if there's no room on existing ones.
74 */ 75 */
75kprobe_opcode_t *get_insn_slot(void) 76kprobe_opcode_t __kprobes *get_insn_slot(void)
76{ 77{
77 struct kprobe_insn_page *kip; 78 struct kprobe_insn_page *kip;
78 struct hlist_node *pos; 79 struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
117 return kip->insns; 118 return kip->insns;
118} 119}
119 120
120void free_insn_slot(kprobe_opcode_t *slot) 121void __kprobes free_insn_slot(kprobe_opcode_t *slot)
121{ 122{
122 struct kprobe_insn_page *kip; 123 struct kprobe_insn_page *kip;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
152} 153}
153 154
154/* Locks kprobe: irqs must be disabled */ 155/* Locks kprobe: irqs must be disabled */
155void lock_kprobes(void) 156void __kprobes lock_kprobes(void)
156{ 157{
158 unsigned long flags = 0;
159
160 /* Avoiding local interrupts to happen right after we take the kprobe_lock
161 * and before we get a chance to update kprobe_cpu, this to prevent
162 * deadlock when we have a kprobe on ISR routine and a kprobe on task
163 * routine
164 */
165 local_irq_save(flags);
166
157 spin_lock(&kprobe_lock); 167 spin_lock(&kprobe_lock);
158 kprobe_cpu = smp_processor_id(); 168 kprobe_cpu = smp_processor_id();
169
170 local_irq_restore(flags);
159} 171}
160 172
161void unlock_kprobes(void) 173void __kprobes unlock_kprobes(void)
162{ 174{
175 unsigned long flags = 0;
176
177 /* Avoiding local interrupts to happen right after we update
 178 * kprobe_cpu and before we get a chance to release kprobe_lock,
179 * this to prevent deadlock when we have a kprobe on ISR routine and
180 * a kprobe on task routine
181 */
182 local_irq_save(flags);
183
163 kprobe_cpu = NR_CPUS; 184 kprobe_cpu = NR_CPUS;
164 spin_unlock(&kprobe_lock); 185 spin_unlock(&kprobe_lock);
186
187 local_irq_restore(flags);
165} 188}
166 189
167/* You have to be holding the kprobe_lock */ 190/* You have to be holding the kprobe_lock */
168struct kprobe *get_kprobe(void *addr) 191struct kprobe __kprobes *get_kprobe(void *addr)
169{ 192{
170 struct hlist_head *head; 193 struct hlist_head *head;
171 struct hlist_node *node; 194 struct hlist_node *node;
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr)
183 * Aggregate handlers for multiple kprobes support - these handlers 206 * Aggregate handlers for multiple kprobes support - these handlers
184 * take care of invoking the individual kprobe handlers on p->list 207 * take care of invoking the individual kprobe handlers on p->list
185 */ 208 */
186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 209static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
187{ 210{
188 struct kprobe *kp; 211 struct kprobe *kp;
189 212
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
198 return 0; 221 return 0;
199} 222}
200 223
201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 224static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
202 unsigned long flags) 225 unsigned long flags)
203{ 226{
204 struct kprobe *kp; 227 struct kprobe *kp;
205 228
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
213 return; 236 return;
214} 237}
215 238
216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 239static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr) 240 int trapnr)
218{ 241{
219 /* 242 /*
220 * if we faulted "during" the execution of a user specified 243 * if we faulted "during" the execution of a user specified
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
227 return 0; 250 return 0;
228} 251}
229 252
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 253static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{ 254{
232 struct kprobe *kp = curr_kprobe; 255 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) { 256 if (curr_kprobe && kp->break_handler) {
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
240 return 0; 263 return 0;
241} 264}
242 265
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) 266struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
244{ 267{
245 struct hlist_node *node; 268 struct hlist_node *node;
246 struct kretprobe_instance *ri; 269 struct kretprobe_instance *ri;
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
249 return NULL; 272 return NULL;
250} 273}
251 274
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) 275static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp)
253{ 277{
254 struct hlist_node *node; 278 struct hlist_node *node;
255 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
258 return NULL; 282 return NULL;
259} 283}
260 284
261void add_rp_inst(struct kretprobe_instance *ri) 285void __kprobes add_rp_inst(struct kretprobe_instance *ri)
262{ 286{
263 /* 287 /*
264 * Remove rp inst off the free list - 288 * Remove rp inst off the free list -
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 300 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277} 301}
278 302
279void recycle_rp_inst(struct kretprobe_instance *ri) 303void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
280{ 304{
281 /* remove rp inst off the rprobe_inst_table */ 305 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist); 306 hlist_del(&ri->hlist);
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
291 kfree(ri); 315 kfree(ri);
292} 316}
293 317
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) 318struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
295{ 319{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 320 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297} 321}
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
302 * instances associated with this task. These left over instances represent 326 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return. 327 * probed functions that have been called but will never return.
304 */ 328 */
305void kprobe_flush_task(struct task_struct *tk) 329void __kprobes kprobe_flush_task(struct task_struct *tk)
306{ 330{
307 struct kretprobe_instance *ri; 331 struct kretprobe_instance *ri;
308 struct hlist_head *head; 332 struct hlist_head *head;
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk)
322 * This kprobe pre_handler is registered with every kretprobe. When probe 346 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe. 347 * hits it will set up the return probe.
324 */ 348 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) 349static int __kprobes pre_handler_kretprobe(struct kprobe *p,
350 struct pt_regs *regs)
326{ 351{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 352 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328 353
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
353* Add the new probe to old_p->list. Fail if this is the 378* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist 379* second jprobe at the address - two jprobes can't coexist
355*/ 380*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 381static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{ 382{
358 struct kprobe *kp; 383 struct kprobe *kp;
359 384
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
395 * the intricacies 420 * the intricacies
396 * TODO: Move kcalloc outside the spinlock 421 * TODO: Move kcalloc outside the spinlock
397 */ 422 */
398static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) 423static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
424 struct kprobe *p)
399{ 425{
400 int ret = 0; 426 int ret = 0;
401 struct kprobe *ap; 427 struct kprobe *ap;
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
434 spin_unlock_irqrestore(&kprobe_lock, flags); 460 spin_unlock_irqrestore(&kprobe_lock, flags);
435} 461}
436 462
437int register_kprobe(struct kprobe *p) 463static int __kprobes in_kprobes_functions(unsigned long addr)
464{
465 if (addr >= (unsigned long)__kprobes_text_start
466 && addr < (unsigned long)__kprobes_text_end)
467 return -EINVAL;
468 return 0;
469}
470
471int __kprobes register_kprobe(struct kprobe *p)
438{ 472{
439 int ret = 0; 473 int ret = 0;
440 unsigned long flags = 0; 474 unsigned long flags = 0;
441 struct kprobe *old_p; 475 struct kprobe *old_p;
442 476
443 if ((ret = arch_prepare_kprobe(p)) != 0) { 477 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
478 return ret;
479 if ((ret = arch_prepare_kprobe(p)) != 0)
444 goto rm_kprobe; 480 goto rm_kprobe;
445 } 481
446 spin_lock_irqsave(&kprobe_lock, flags); 482 spin_lock_irqsave(&kprobe_lock, flags);
447 old_p = get_kprobe(p->addr); 483 old_p = get_kprobe(p->addr);
448 p->nmissed = 0; 484 p->nmissed = 0;
@@ -466,7 +502,7 @@ rm_kprobe:
466 return ret; 502 return ret;
467} 503}
468 504
469void unregister_kprobe(struct kprobe *p) 505void __kprobes unregister_kprobe(struct kprobe *p)
470{ 506{
471 unsigned long flags; 507 unsigned long flags;
472 struct kprobe *old_p; 508 struct kprobe *old_p;
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = {
487 .priority = 0x7fffffff /* we need to notified first */ 523 .priority = 0x7fffffff /* we need to notified first */
488}; 524};
489 525
490int register_jprobe(struct jprobe *jp) 526int __kprobes register_jprobe(struct jprobe *jp)
491{ 527{
492 /* Todo: Verify probepoint is a function entry point */ 528 /* Todo: Verify probepoint is a function entry point */
493 jp->kp.pre_handler = setjmp_pre_handler; 529 jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp)
496 return register_kprobe(&jp->kp); 532 return register_kprobe(&jp->kp);
497} 533}
498 534
499void unregister_jprobe(struct jprobe *jp) 535void __kprobes unregister_jprobe(struct jprobe *jp)
500{ 536{
501 unregister_kprobe(&jp->kp); 537 unregister_kprobe(&jp->kp);
502} 538}
503 539
504#ifdef ARCH_SUPPORTS_KRETPROBES 540#ifdef ARCH_SUPPORTS_KRETPROBES
505 541
506int register_kretprobe(struct kretprobe *rp) 542int __kprobes register_kretprobe(struct kretprobe *rp)
507{ 543{
508 int ret = 0; 544 int ret = 0;
509 struct kretprobe_instance *inst; 545 struct kretprobe_instance *inst;
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp)
540 576
541#else /* ARCH_SUPPORTS_KRETPROBES */ 577#else /* ARCH_SUPPORTS_KRETPROBES */
542 578
543int register_kretprobe(struct kretprobe *rp) 579int __kprobes register_kretprobe(struct kretprobe *rp)
544{ 580{
545 return -ENOSYS; 581 return -ENOSYS;
546} 582}
547 583
548#endif /* ARCH_SUPPORTS_KRETPROBES */ 584#endif /* ARCH_SUPPORTS_KRETPROBES */
549 585
550void unregister_kretprobe(struct kretprobe *rp) 586void __kprobes unregister_kretprobe(struct kretprobe *rp)
551{ 587{
552 unsigned long flags; 588 unsigned long flags;
553 struct kretprobe_instance *ri; 589 struct kretprobe_instance *ri;
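The __kprobes annotations above move the kprobes machinery itself into a dedicated text section, and register_kprobe() now rejects probes placed inside that section via in_kprobes_functions(). Below is a minimal module sketch of this era's registration API; the probed function lives in the module itself, and the name probed_target is hypothetical.

/*
 * Minimal kprobe registration sketch for the API shown in the diff above.
 * probed_target() is a hypothetical function in this module; noinline keeps
 * an out-of-line copy so the probe actually fires when it is called.
 */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static noinline void probed_target(void)
{
        printk(KERN_INFO "probed_target() ran\n");
}

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe hit at %p\n", p->addr);
        return 0;               /* continue with the probed instruction */
}

static struct kprobe kp = {
        .pre_handler = my_pre_handler,
};

static int __init probe_init(void)
{
        int ret;

        kp.addr = (kprobe_opcode_t *)probed_target;
        ret = register_kprobe(&kp);
        if (ret < 0)
                return ret;
        probed_target();        /* trigger the probe once */
        return 0;
}

static void __exit probe_exit(void)
{
        unregister_kprobe(&kp);
}

module_init(probe_init);
module_exit(probe_exit);
MODULE_LICENSE("GPL");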
diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..4b39d3793c72 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1509,6 +1509,7 @@ static struct module *load_module(void __user *umod,
1509 long err = 0; 1509 long err = 0;
1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1511 struct exception_table_entry *extable; 1511 struct exception_table_entry *extable;
1512 mm_segment_t old_fs;
1512 1513
1513 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1514 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1514 umod, len, uargs); 1515 umod, len, uargs);
@@ -1779,6 +1780,24 @@ static struct module *load_module(void __user *umod,
1779 if (err < 0) 1780 if (err < 0)
1780 goto cleanup; 1781 goto cleanup;
1781 1782
1783 /* flush the icache in correct context */
1784 old_fs = get_fs();
1785 set_fs(KERNEL_DS);
1786
1787 /*
1788 * Flush the instruction cache, since we've played with text.
1789 * Do it before processing of module parameters, so the module
1790 * can provide parameter accessor functions of its own.
1791 */
1792 if (mod->module_init)
1793 flush_icache_range((unsigned long)mod->module_init,
1794 (unsigned long)mod->module_init
1795 + mod->init_size);
1796 flush_icache_range((unsigned long)mod->module_core,
1797 (unsigned long)mod->module_core + mod->core_size);
1798
1799 set_fs(old_fs);
1800
1782 mod->args = args; 1801 mod->args = args;
1783 if (obsparmindex) { 1802 if (obsparmindex) {
1784 err = obsolete_params(mod->name, mod->args, 1803 err = obsolete_params(mod->name, mod->args,
@@ -1860,7 +1879,6 @@ sys_init_module(void __user *umod,
1860 const char __user *uargs) 1879 const char __user *uargs)
1861{ 1880{
1862 struct module *mod; 1881 struct module *mod;
1863 mm_segment_t old_fs = get_fs();
1864 int ret = 0; 1882 int ret = 0;
1865 1883
1866 /* Must have permission */ 1884 /* Must have permission */
@@ -1878,19 +1896,6 @@ sys_init_module(void __user *umod,
1878 return PTR_ERR(mod); 1896 return PTR_ERR(mod);
1879 } 1897 }
1880 1898
1881 /* flush the icache in correct context */
1882 set_fs(KERNEL_DS);
1883
1884 /* Flush the instruction cache, since we've played with text */
1885 if (mod->module_init)
1886 flush_icache_range((unsigned long)mod->module_init,
1887 (unsigned long)mod->module_init
1888 + mod->init_size);
1889 flush_icache_range((unsigned long)mod->module_core,
1890 (unsigned long)mod->module_core + mod->core_size);
1891
1892 set_fs(old_fs);
1893
1894 /* Now sew it into the lists. They won't access us, since 1899 /* Now sew it into the lists. They won't access us, since
1895 strong_try_module_get() will fail. */ 1900 strong_try_module_get() will fail. */
1896 stop_machine_run(__link_module, mod, NR_CPUS); 1901 stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..fbf173215fd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
542{ 542{
543 struct module_kobject *mk; 543 struct module_kobject *mk;
544 544
545 mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); 545 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
546 memset(mk, 0, sizeof(struct module_kobject)); 546 BUG_ON(!mk);
547 547
548 mk->mod = THIS_MODULE; 548 mk->mod = THIS_MODULE;
549 kobj_set_kset_s(mk, module_subsys); 549 kobj_set_kset_s(mk, module_subsys);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 38798a2ff994..b7b532acd9fc 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
427 timr->sigq->info.si_code = SI_TIMER; 427 timr->sigq->info.si_code = SI_TIMER;
428 timr->sigq->info.si_tid = timr->it_id; 428 timr->sigq->info.si_tid = timr->it_id;
429 timr->sigq->info.si_value = timr->it_sigev_value; 429 timr->sigq->info.si_value = timr->it_sigev_value;
430
430 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 431 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
431 if (unlikely(timr->it_process->flags & PF_EXITING)) { 432 struct task_struct *leader;
432 timr->it_sigev_notify = SIGEV_SIGNAL; 433 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
433 put_task_struct(timr->it_process); 434 timr->it_process);
434 timr->it_process = timr->it_process->group_leader; 435
435 goto group; 436 if (likely(ret >= 0))
436 } 437 return ret;
437 return send_sigqueue(timr->it_sigev_signo, timr->sigq, 438
438 timr->it_process); 439 timr->it_sigev_notify = SIGEV_SIGNAL;
439 } 440 leader = timr->it_process->group_leader;
440 else { 441 put_task_struct(timr->it_process);
441 group: 442 timr->it_process = leader;
442 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
443 timr->it_process);
444 } 443 }
444
445 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
446 timr->it_process);
445} 447}
446EXPORT_SYMBOL_GPL(posix_timer_event); 448EXPORT_SYMBOL_GPL(posix_timer_event);
447 449
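The posix-timers hunk above makes SIGEV_THREAD_ID delivery fall back to the thread group leader when send_sigqueue() reports that the target thread has gone away. The user-space sketch below arms such a per-thread timer; it assumes the glibc convention where sigev_notify_thread_id maps onto the sigevent union's tid member (the fallback #define covers libcs that do not expose the name), and timer_create() may need -lrt on older toolchains.

/*
 * User-space sketch of a SIGEV_THREAD_ID timer.  The sigev_notify_thread_id
 * fallback below is an assumption about the libc's sigevent layout.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef sigev_notify_thread_id
#define sigev_notify_thread_id _sigev_un._tid
#endif

static void on_timer(int sig)
{
        (void)sig;              /* only needed to interrupt pause() */
}

int main(void)
{
        struct sigevent sev;
        struct itimerspec its = { .it_value.tv_sec = 1 };
        timer_t timerid;

        signal(SIGRTMIN, on_timer);

        memset(&sev, 0, sizeof(sev));
        sev.sigev_notify = SIGEV_THREAD_ID;             /* one specific thread */
        sev.sigev_signo = SIGRTMIN;
        sev.sigev_notify_thread_id = syscall(SYS_gettid);

        if (timer_create(CLOCK_REALTIME, &sev, &timerid) == -1) {
                perror("timer_create");
                return 1;
        }
        timer_settime(timerid, 0, &its, NULL);
        pause();                        /* signal is queued to this thread */
        puts("timer signal delivered");
        return 0;
}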
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b99f61b82685..396c7873e804 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -29,7 +29,7 @@ config PM_DEBUG
29 29
30config SOFTWARE_SUSPEND 30config SOFTWARE_SUSPEND
31 bool "Software Suspend" 31 bool "Software Suspend"
32 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) 32 depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP))
33 ---help--- 33 ---help---
34 Enable the possibility of suspending the machine. 34 Enable the possibility of suspending the machine.
35 It doesn't need APM. 35 It doesn't need APM.
@@ -73,6 +73,18 @@ config PM_STD_PARTITION
73 suspended image to. It will simply pick the first available swap 73 suspended image to. It will simply pick the first available swap
74 device. 74 device.
75 75
76config SWSUSP_ENCRYPT
77 bool "Encrypt suspend image"
78 depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
79 default ""
80 ---help---
81 To prevent data gathering from swap after resume you can encrypt
82 the suspend image with a temporary key that is deleted on
83 resume.
84
85 Note that the temporary key is stored unencrypted on disk while the
86 system is suspended.
87
76config SUSPEND_SMP 88config SUSPEND_SMP
77 bool 89 bool
78 depends on HOTPLUG_CPU && X86 && PM 90 depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 664eb0469b6e..2d8bf054d036 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -112,24 +112,12 @@ static inline void platform_finish(void)
112 } 112 }
113} 113}
114 114
115static void finish(void)
116{
117 device_resume();
118 platform_finish();
119 thaw_processes();
120 enable_nonboot_cpus();
121 pm_restore_console();
122}
123
124
125static int prepare_processes(void) 115static int prepare_processes(void)
126{ 116{
127 int error; 117 int error;
128 118
129 pm_prepare_console(); 119 pm_prepare_console();
130
131 sys_sync(); 120 sys_sync();
132
133 disable_nonboot_cpus(); 121 disable_nonboot_cpus();
134 122
135 if (freeze_processes()) { 123 if (freeze_processes()) {
@@ -162,15 +150,6 @@ static void unprepare_processes(void)
162 pm_restore_console(); 150 pm_restore_console();
163} 151}
164 152
165static int prepare_devices(void)
166{
167 int error;
168
169 if ((error = device_suspend(PMSG_FREEZE)))
170 printk("Some devices failed to suspend\n");
171 return error;
172}
173
174/** 153/**
175 * pm_suspend_disk - The granpappy of power management. 154 * pm_suspend_disk - The granpappy of power management.
176 * 155 *
@@ -187,17 +166,14 @@ int pm_suspend_disk(void)
187 error = prepare_processes(); 166 error = prepare_processes();
188 if (error) 167 if (error)
189 return error; 168 return error;
190 error = prepare_devices();
191 169
170 error = device_suspend(PMSG_FREEZE);
192 if (error) { 171 if (error) {
172 printk("Some devices failed to suspend\n");
193 unprepare_processes(); 173 unprepare_processes();
194 return error; 174 return error;
195 } 175 }
196 176
197 pr_debug("PM: Attempting to suspend to disk.\n");
198 if (pm_disk_mode == PM_DISK_FIRMWARE)
199 return pm_ops->enter(PM_SUSPEND_DISK);
200
201 pr_debug("PM: snapshotting memory.\n"); 177 pr_debug("PM: snapshotting memory.\n");
202 in_suspend = 1; 178 in_suspend = 1;
203 if ((error = swsusp_suspend())) 179 if ((error = swsusp_suspend()))
@@ -208,11 +184,20 @@ int pm_suspend_disk(void)
208 error = swsusp_write(); 184 error = swsusp_write();
209 if (!error) 185 if (!error)
210 power_down(pm_disk_mode); 186 power_down(pm_disk_mode);
187 else {
188 /* swsusp_write can not fail in device_resume,
189 no need to do second device_resume */
190 swsusp_free();
191 unprepare_processes();
192 return error;
193 }
211 } else 194 } else
212 pr_debug("PM: Image restored successfully.\n"); 195 pr_debug("PM: Image restored successfully.\n");
196
213 swsusp_free(); 197 swsusp_free();
214 Done: 198 Done:
215 finish(); 199 device_resume();
200 unprepare_processes();
216 return error; 201 return error;
217} 202}
218 203
@@ -233,9 +218,12 @@ static int software_resume(void)
233{ 218{
234 int error; 219 int error;
235 220
221 down(&pm_sem);
236 if (!swsusp_resume_device) { 222 if (!swsusp_resume_device) {
237 if (!strlen(resume_file)) 223 if (!strlen(resume_file)) {
224 up(&pm_sem);
238 return -ENOENT; 225 return -ENOENT;
226 }
239 swsusp_resume_device = name_to_dev_t(resume_file); 227 swsusp_resume_device = name_to_dev_t(resume_file);
240 pr_debug("swsusp: Resume From Partition %s\n", resume_file); 228 pr_debug("swsusp: Resume From Partition %s\n", resume_file);
241 } else { 229 } else {
@@ -248,6 +236,7 @@ static int software_resume(void)
248 * FIXME: If noresume is specified, we need to find the partition 236 * FIXME: If noresume is specified, we need to find the partition
249 * and reset it back to normal swap space. 237 * and reset it back to normal swap space.
250 */ 238 */
239 up(&pm_sem);
251 return 0; 240 return 0;
252 } 241 }
253 242
@@ -270,20 +259,24 @@ static int software_resume(void)
270 259
271 pr_debug("PM: Preparing devices for restore.\n"); 260 pr_debug("PM: Preparing devices for restore.\n");
272 261
273 if ((error = prepare_devices())) 262 if ((error = device_suspend(PMSG_FREEZE))) {
263 printk("Some devices failed to suspend\n");
274 goto Free; 264 goto Free;
265 }
275 266
276 mb(); 267 mb();
277 268
278 pr_debug("PM: Restoring saved image.\n"); 269 pr_debug("PM: Restoring saved image.\n");
279 swsusp_resume(); 270 swsusp_resume();
280 pr_debug("PM: Restore failed, recovering.n"); 271 pr_debug("PM: Restore failed, recovering.n");
281 finish(); 272 device_resume();
282 Free: 273 Free:
283 swsusp_free(); 274 swsusp_free();
284 Cleanup: 275 Cleanup:
285 unprepare_processes(); 276 unprepare_processes();
286 Done: 277 Done:
278 /* For success case, the suspend path will release the lock */
279 up(&pm_sem);
287 pr_debug("PM: Resume from disk failed.\n"); 280 pr_debug("PM: Resume from disk failed.\n");
288 return 0; 281 return 0;
289} 282}
@@ -390,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t
390 if (sscanf(buf, "%u:%u", &maj, &min) == 2) { 383 if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
391 res = MKDEV(maj,min); 384 res = MKDEV(maj,min);
392 if (maj == MAJOR(res) && min == MINOR(res)) { 385 if (maj == MAJOR(res) && min == MINOR(res)) {
386 down(&pm_sem);
393 swsusp_resume_device = res; 387 swsusp_resume_device = res;
388 up(&pm_sem);
394 printk("Attempting manual resume\n"); 389 printk("Attempting manual resume\n");
395 noresume = 0; 390 noresume = 0;
396 software_resume(); 391 software_resume();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 71aa0fd22007..22bdc93cc038 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state)
143 143
144 144
145 145
146static char * pm_states[] = { 146static char *pm_states[PM_SUSPEND_MAX] = {
147 [PM_SUSPEND_STANDBY] = "standby", 147 [PM_SUSPEND_STANDBY] = "standby",
148 [PM_SUSPEND_MEM] = "mem", 148 [PM_SUSPEND_MEM] = "mem",
149#ifdef CONFIG_SOFTWARE_SUSPEND
149 [PM_SUSPEND_DISK] = "disk", 150 [PM_SUSPEND_DISK] = "disk",
150 NULL, 151#endif
151}; 152};
152 153
153 154
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id, 60 unsigned long id,
61 pm_callback callback) 61 pm_callback callback)
62{ 62{
63 struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); 63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) { 64 if (dev) {
65 memset(dev, 0, sizeof(*dev));
66 dev->type = type; 65 dev->type = type;
67 dev->id = id; 66 dev->id = id;
68 dev->callback = callback; 67 dev->callback = callback;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3bd0d261818f..28de118f7a0b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -38,7 +38,6 @@ void refrigerator(void)
38 processes around? */ 38 processes around? */
39 long save; 39 long save;
40 save = current->state; 40 save = current->state;
41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm); 41 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("="); 42 printk("=");
44 43
@@ -47,8 +46,10 @@ void refrigerator(void)
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 46 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 47 spin_unlock_irq(&current->sighand->siglock);
49 48
50 while (frozen(current)) 49 while (frozen(current)) {
50 current->state = TASK_UNINTERRUPTIBLE;
51 schedule(); 51 schedule();
52 }
52 pr_debug("%s left refrigerator\n", current->comm); 53 pr_debug("%s left refrigerator\n", current->comm);
53 current->state = save; 54 current->state = save;
54} 55}
@@ -80,13 +81,33 @@ int freeze_processes(void)
80 } while_each_thread(g, p); 81 } while_each_thread(g, p);
81 read_unlock(&tasklist_lock); 82 read_unlock(&tasklist_lock);
82 yield(); /* Yield is okay here */ 83 yield(); /* Yield is okay here */
83 if (time_after(jiffies, start_time + TIMEOUT)) { 84 if (todo && time_after(jiffies, start_time + TIMEOUT)) {
84 printk( "\n" ); 85 printk( "\n" );
85 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); 86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
86 return todo; 87 break;
87 } 88 }
88 } while(todo); 89 } while(todo);
89 90
91 /* This does not unfreeze processes that are already frozen
92 * (we have slightly ugly calling convention in that respect,
93 * and caller must call thaw_processes() if something fails),
94 * but it cleans up leftover PF_FREEZE requests.
95 */
96 if (todo) {
97 read_lock(&tasklist_lock);
98 do_each_thread(g, p)
99 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE;
102 spin_lock_irqsave(&p->sighand->siglock, flags);
103 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 }
106 while_each_thread(g, p);
107 read_unlock(&tasklist_lock);
108 return todo;
109 }
110
90 printk( "|\n" ); 111 printk( "|\n" );
91 BUG_ON(in_atomic()); 112 BUG_ON(in_atomic());
92 return 0; 113 return 0;
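The refrigerator change above re-asserts TASK_UNINTERRUPTIBLE on every iteration, so a task that is woken while still frozen goes straight back to sleep, and freeze_processes() now cleans up leftover PF_FREEZE requests when it times out instead of returning with them still pending. For reference, the general-purpose form of this sleep idiom sets the task state before re-testing the condition, so a wakeup that slips in between the test and schedule() is not lost; wait_condition() below is a hypothetical predicate.

/*
 * Generic sleep loop (not the refrigerator itself): setting the task state
 * before re-testing the condition means a wake_up() that races with the
 * test simply makes the following schedule() return immediately.
 */
#include <linux/sched.h>

static void wait_for(int (*wait_condition)(void))
{
        for (;;) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (wait_condition())
                        break;
                schedule();
        }
        set_current_state(TASK_RUNNING);
}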
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f2bc71b9fe8b..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,6 +31,9 @@
31 * Alex Badea <vampire@go.ro>: 31 * Alex Badea <vampire@go.ro>:
32 * Fixed runaway init 32 * Fixed runaway init
33 * 33 *
34 * Andreas Steinmetz <ast@domdv.de>:
35 * Added encrypted suspend option
36 *
34 * More state savers are welcome. Especially for the scsi layer... 37 * More state savers are welcome. Especially for the scsi layer...
35 * 38 *
36 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 39 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -71,8 +74,16 @@
71#include <asm/tlbflush.h> 74#include <asm/tlbflush.h>
72#include <asm/io.h> 75#include <asm/io.h>
73 76
77#include <linux/random.h>
78#include <linux/crypto.h>
79#include <asm/scatterlist.h>
80
74#include "power.h" 81#include "power.h"
75 82
83#define CIPHER "aes"
84#define MAXKEY 32
85#define MAXIV 32
86
76/* References to section boundaries */ 87/* References to section boundaries */
77extern const void __nosave_begin, __nosave_end; 88extern const void __nosave_begin, __nosave_end;
78 89
@@ -103,7 +114,8 @@ static suspend_pagedir_t *pagedir_save;
103#define SWSUSP_SIG "S1SUSPEND" 114#define SWSUSP_SIG "S1SUSPEND"
104 115
105static struct swsusp_header { 116static struct swsusp_header {
106 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 117 char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
118 u8 key_iv[MAXKEY+MAXIV];
107 swp_entry_t swsusp_info; 119 swp_entry_t swsusp_info;
108 char orig_sig[10]; 120 char orig_sig[10];
109 char sig[10]; 121 char sig[10];
@@ -129,6 +141,131 @@ static struct swsusp_info swsusp_info;
129static unsigned short swapfile_used[MAX_SWAPFILES]; 141static unsigned short swapfile_used[MAX_SWAPFILES];
130static unsigned short root_swap; 142static unsigned short root_swap;
131 143
144static int write_page(unsigned long addr, swp_entry_t * loc);
145static int bio_read_page(pgoff_t page_off, void * page);
146
147static u8 key_iv[MAXKEY+MAXIV];
148
149#ifdef CONFIG_SWSUSP_ENCRYPT
150
151static int crypto_init(int mode, void **mem)
152{
153 int error = 0;
154 int len;
155 char *modemsg;
156 struct crypto_tfm *tfm;
157
158 modemsg = mode ? "suspend not possible" : "resume not possible";
159
160 tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
161 if(!tfm) {
162 printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
163 error = -EINVAL;
164 goto out;
165 }
166
167 if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
168 printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
169 error = -ENOKEY;
170 goto fail;
171 }
172
173 if (mode)
174 get_random_bytes(key_iv, MAXKEY+MAXIV);
175
176 len = crypto_tfm_alg_max_keysize(tfm);
177 if (len > MAXKEY)
178 len = MAXKEY;
179
180 if (crypto_cipher_setkey(tfm, key_iv, len)) {
181 printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
182 error = -EKEYREJECTED;
183 goto fail;
184 }
185
186 len = crypto_tfm_alg_ivsize(tfm);
187
188 if (MAXIV < len) {
189 printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
190 error = -EOVERFLOW;
191 goto fail;
192 }
193
194 crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
195
196 *mem=(void *)tfm;
197
198 goto out;
199
200fail: crypto_free_tfm(tfm);
201out: return error;
202}
203
204static __inline__ void crypto_exit(void *mem)
205{
206 crypto_free_tfm((struct crypto_tfm *)mem);
207}
208
209static __inline__ int crypto_write(struct pbe *p, void *mem)
210{
211 int error = 0;
212 struct scatterlist src, dst;
213
214 src.page = virt_to_page(p->address);
215 src.offset = 0;
216 src.length = PAGE_SIZE;
217 dst.page = virt_to_page((void *)&swsusp_header);
218 dst.offset = 0;
219 dst.length = PAGE_SIZE;
220
221 error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
222 PAGE_SIZE);
223
224 if (!error)
225 error = write_page((unsigned long)&swsusp_header,
226 &(p->swap_address));
227 return error;
228}
229
230static __inline__ int crypto_read(struct pbe *p, void *mem)
231{
232 int error = 0;
233 struct scatterlist src, dst;
234
235 error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
236 if (!error) {
237 src.offset = 0;
238 src.length = PAGE_SIZE;
239 dst.offset = 0;
240 dst.length = PAGE_SIZE;
241 src.page = dst.page = virt_to_page((void *)p->address);
242
243 error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
244 &src, PAGE_SIZE);
245 }
246 return error;
247}
248#else
249static __inline__ int crypto_init(int mode, void *mem)
250{
251 return 0;
252}
253
254static __inline__ void crypto_exit(void *mem)
255{
256}
257
258static __inline__ int crypto_write(struct pbe *p, void *mem)
259{
260 return write_page(p->address, &(p->swap_address));
261}
262
263static __inline__ int crypto_read(struct pbe *p, void *mem)
264{
265 return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
266}
267#endif
268
132static int mark_swapfiles(swp_entry_t prev) 269static int mark_swapfiles(swp_entry_t prev)
133{ 270{
134 int error; 271 int error;
@@ -140,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev)
140 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 277 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
141 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 278 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
142 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 279 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
280 memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
143 swsusp_header.swsusp_info = prev; 281 swsusp_header.swsusp_info = prev;
144 error = rw_swap_page_sync(WRITE, 282 error = rw_swap_page_sync(WRITE,
145 swp_entry(root_swap, 0), 283 swp_entry(root_swap, 0),
@@ -179,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */
179 len=strlen(resume_file); 317 len=strlen(resume_file);
180 root_swap = 0xFFFF; 318 root_swap = 0xFFFF;
181 319
182 swap_list_lock(); 320 spin_lock(&swap_lock);
183 for (i=0; i<MAX_SWAPFILES; i++) { 321 for (i=0; i<MAX_SWAPFILES; i++) {
184 if (swap_info[i].flags == 0) { 322 if (!(swap_info[i].flags & SWP_WRITEOK)) {
185 swapfile_used[i]=SWAPFILE_UNUSED; 323 swapfile_used[i]=SWAPFILE_UNUSED;
186 } else { 324 } else {
187 if (!len) { 325 if (!len) {
@@ -202,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
202 } 340 }
203 } 341 }
204 } 342 }
205 swap_list_unlock(); 343 spin_unlock(&swap_lock);
206 return (root_swap != 0xffff) ? 0 : -ENODEV; 344 return (root_swap != 0xffff) ? 0 : -ENODEV;
207} 345}
208 346
@@ -216,12 +354,12 @@ static void lock_swapdevices(void)
216{ 354{
217 int i; 355 int i;
218 356
219 swap_list_lock(); 357 spin_lock(&swap_lock);
220 for (i = 0; i< MAX_SWAPFILES; i++) 358 for (i = 0; i< MAX_SWAPFILES; i++)
221 if (swapfile_used[i] == SWAPFILE_IGNORED) { 359 if (swapfile_used[i] == SWAPFILE_IGNORED) {
222 swap_info[i].flags ^= 0xFF; 360 swap_info[i].flags ^= SWP_WRITEOK;
223 } 361 }
224 swap_list_unlock(); 362 spin_unlock(&swap_lock);
225} 363}
226 364
227/** 365/**
@@ -286,6 +424,10 @@ static int data_write(void)
286 int error = 0, i = 0; 424 int error = 0, i = 0;
287 unsigned int mod = nr_copy_pages / 100; 425 unsigned int mod = nr_copy_pages / 100;
288 struct pbe *p; 426 struct pbe *p;
427 void *tfm;
428
429 if ((error = crypto_init(1, &tfm)))
430 return error;
289 431
290 if (!mod) 432 if (!mod)
291 mod = 1; 433 mod = 1;
@@ -294,11 +436,14 @@ static int data_write(void)
294 for_each_pbe (p, pagedir_nosave) { 436 for_each_pbe (p, pagedir_nosave) {
295 if (!(i%mod)) 437 if (!(i%mod))
296 printk( "\b\b\b\b%3d%%", i / mod ); 438 printk( "\b\b\b\b%3d%%", i / mod );
297 if ((error = write_page(p->address, &(p->swap_address)))) 439 if ((error = crypto_write(p, tfm))) {
440 crypto_exit(tfm);
298 return error; 441 return error;
442 }
299 i++; 443 i++;
300 } 444 }
301 printk("\b\b\b\bdone\n"); 445 printk("\b\b\b\bdone\n");
446 crypto_exit(tfm);
302 return error; 447 return error;
303} 448}
304 449
@@ -385,7 +530,6 @@ static int write_pagedir(void)
385 * write_suspend_image - Write entire image and metadata. 530 * write_suspend_image - Write entire image and metadata.
386 * 531 *
387 */ 532 */
388
389static int write_suspend_image(void) 533static int write_suspend_image(void)
390{ 534{
391 int error; 535 int error;
@@ -400,6 +544,7 @@ static int write_suspend_image(void)
400 if ((error = close_swap())) 544 if ((error = close_swap()))
401 goto FreePagedir; 545 goto FreePagedir;
402 Done: 546 Done:
547 memset(key_iv, 0, MAXKEY+MAXIV);
403 return error; 548 return error;
404 FreePagedir: 549 FreePagedir:
405 free_pagedir_entries(); 550 free_pagedir_entries();
@@ -591,18 +736,7 @@ static void copy_data_pages(void)
591 736
592static int calc_nr(int nr_copy) 737static int calc_nr(int nr_copy)
593{ 738{
594 int extra = 0; 739 return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
595 int mod = !!(nr_copy % PBES_PER_PAGE);
596 int diff = (nr_copy / PBES_PER_PAGE) + mod;
597
598 do {
599 extra += diff;
600 nr_copy += diff;
601 mod = !!(nr_copy % PBES_PER_PAGE);
602 diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
603 } while (diff > 0);
604
605 return nr_copy;
606} 740}
607 741
608/** 742/**
@@ -886,20 +1020,21 @@ int swsusp_suspend(void)
886 * at resume time, and evil weirdness ensues. 1020 * at resume time, and evil weirdness ensues.
887 */ 1021 */
888 if ((error = device_power_down(PMSG_FREEZE))) { 1022 if ((error = device_power_down(PMSG_FREEZE))) {
1023 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
889 local_irq_enable(); 1024 local_irq_enable();
890 return error; 1025 return error;
891 } 1026 }
892 1027
893 if ((error = swsusp_swap_check())) { 1028 if ((error = swsusp_swap_check())) {
894 printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " 1029 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
895 "swapon -a!\n"); 1030 device_power_up();
896 local_irq_enable(); 1031 local_irq_enable();
897 return error; 1032 return error;
898 } 1033 }
899 1034
900 save_processor_state(); 1035 save_processor_state();
901 if ((error = swsusp_arch_suspend())) 1036 if ((error = swsusp_arch_suspend()))
902 printk("Error %d suspending\n", error); 1037 printk(KERN_ERR "Error %d suspending\n", error);
903 /* Restore control flow magically appears here */ 1038 /* Restore control flow magically appears here */
904 restore_processor_state(); 1039 restore_processor_state();
905 BUG_ON (nr_copy_pages_check != nr_copy_pages); 1040 BUG_ON (nr_copy_pages_check != nr_copy_pages);
@@ -924,6 +1059,7 @@ int swsusp_resume(void)
924 BUG_ON(!error); 1059 BUG_ON(!error);
925 restore_processor_state(); 1060 restore_processor_state();
926 restore_highmem(); 1061 restore_highmem();
1062 touch_softlockup_watchdog();
927 device_power_up(); 1063 device_power_up();
928 local_irq_enable(); 1064 local_irq_enable();
929 return error; 1065 return error;
@@ -1179,7 +1315,8 @@ static const char * sanity_check(void)
1179 if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) 1315 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1180 return "machine"; 1316 return "machine";
1181#if 0 1317#if 0
1182 if(swsusp_info.cpus != num_online_cpus()) 1318 /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
1319 if (swsusp_info.cpus != num_possible_cpus())
1183 return "number of cpus"; 1320 return "number of cpus";
1184#endif 1321#endif
1185 return NULL; 1322 return NULL;
@@ -1212,13 +1349,14 @@ static int check_sig(void)
1212 return error; 1349 return error;
1213 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 1350 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1214 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 1351 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1352 memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
1353 memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
1215 1354
1216 /* 1355 /*
1217 * Reset swap signature now. 1356 * Reset swap signature now.
1218 */ 1357 */
1219 error = bio_write_page(0, &swsusp_header); 1358 error = bio_write_page(0, &swsusp_header);
1220 } else { 1359 } else {
1221 printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
1222 return -EINVAL; 1360 return -EINVAL;
1223 } 1361 }
1224 if (!error) 1362 if (!error)
@@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist)
1239 int error = 0; 1377 int error = 0;
1240 int i = 0; 1378 int i = 0;
1241 int mod = swsusp_info.image_pages / 100; 1379 int mod = swsusp_info.image_pages / 100;
1380 void *tfm;
1381
1382 if ((error = crypto_init(0, &tfm)))
1383 return error;
1242 1384
1243 if (!mod) 1385 if (!mod)
1244 mod = 1; 1386 mod = 1;
@@ -1250,14 +1392,15 @@ static int data_read(struct pbe *pblist)
1250 if (!(i % mod)) 1392 if (!(i % mod))
1251 printk("\b\b\b\b%3d%%", i / mod); 1393 printk("\b\b\b\b%3d%%", i / mod);
1252 1394
1253 error = bio_read_page(swp_offset(p->swap_address), 1395 if ((error = crypto_read(p, tfm))) {
1254 (void *)p->address); 1396 crypto_exit(tfm);
1255 if (error)
1256 return error; 1397 return error;
1398 }
1257 1399
1258 i++; 1400 i++;
1259 } 1401 }
1260 printk("\b\b\b\bdone\n"); 1402 printk("\b\b\b\bdone\n");
1403 crypto_exit(tfm);
1261 return error; 1404 return error;
1262} 1405}
1263 1406
@@ -1385,6 +1528,7 @@ int swsusp_read(void)
1385 1528
1386 error = read_suspend_image(); 1529 error = read_suspend_image();
1387 blkdev_put(resume_bdev); 1530 blkdev_put(resume_bdev);
1531 memset(key_iv, 0, MAXKEY+MAXIV);
1388 1532
1389 if (!error) 1533 if (!error)
1390 pr_debug("swsusp: Reading resume file was successful\n"); 1534 pr_debug("swsusp: Reading resume file was successful\n");
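The calc_nr() hunk above replaces an iterative fixed-point loop with a closed form: each page of pbe entries effectively provides PBES_PER_PAGE - 1 usable slots (the last entry links to the next page), so the metadata overhead is ceil(nr_copy / (PBES_PER_PAGE - 1)). A minimal user-space sketch that cross-checks the two computations; the PBES_PER_PAGE value here is illustrative, not the kernel's derived constant:

#include <assert.h>

#define PBES_PER_PAGE 341   /* illustrative; the kernel derives it from PAGE_SIZE */

/* the loop removed by the patch */
static int calc_nr_old(int nr_copy)
{
    int extra = 0;
    int mod = !!(nr_copy % PBES_PER_PAGE);
    int diff = (nr_copy / PBES_PER_PAGE) + mod;

    do {
        extra += diff;
        nr_copy += diff;
        mod = !!(nr_copy % PBES_PER_PAGE);
        diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
    } while (diff > 0);

    return nr_copy;
}

/* the closed form the patch introduces */
static int calc_nr_new(int nr_copy)
{
    return nr_copy + (nr_copy + PBES_PER_PAGE - 2) / (PBES_PER_PAGE - 1);
}

int main(void)
{
    int n;

    for (n = 0; n < 1000000; n++)
        assert(calc_nr_old(n) == calc_nr_new(n));
    return 0;
}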
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..a967605bc2e3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...)
514 return r; 514 return r;
515} 515}
516 516
517/* cpu currently holding logbuf_lock */
518static volatile unsigned int printk_cpu = UINT_MAX;
519
517asmlinkage int vprintk(const char *fmt, va_list args) 520asmlinkage int vprintk(const char *fmt, va_list args)
518{ 521{
519 unsigned long flags; 522 unsigned long flags;
@@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
522 static char printk_buf[1024]; 525 static char printk_buf[1024];
523 static int log_level_unknown = 1; 526 static int log_level_unknown = 1;
524 527
525 if (unlikely(oops_in_progress)) 528 preempt_disable();
529 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
530 /* If a crash is occurring during printk() on this CPU,
531 * make sure we can't deadlock */
526 zap_locks(); 532 zap_locks();
527 533
528 /* This stops the holder of console_sem just where we want him */ 534 /* This stops the holder of console_sem just where we want him */
529 spin_lock_irqsave(&logbuf_lock, flags); 535 spin_lock_irqsave(&logbuf_lock, flags);
536 printk_cpu = smp_processor_id();
530 537
531 /* Emit the output into the temporary buffer */ 538 /* Emit the output into the temporary buffer */
532 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 539 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 * CPU until it is officially up. We shouldn't be calling into 602 * CPU until it is officially up. We shouldn't be calling into
596 * random console drivers on a CPU which doesn't exist yet.. 603 * random console drivers on a CPU which doesn't exist yet..
597 */ 604 */
605 printk_cpu = UINT_MAX;
598 spin_unlock_irqrestore(&logbuf_lock, flags); 606 spin_unlock_irqrestore(&logbuf_lock, flags);
599 goto out; 607 goto out;
600 } 608 }
@@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
604 * We own the drivers. We can drop the spinlock and let 612 * We own the drivers. We can drop the spinlock and let
605 * release_console_sem() print the text 613 * release_console_sem() print the text
606 */ 614 */
615 printk_cpu = UINT_MAX;
607 spin_unlock_irqrestore(&logbuf_lock, flags); 616 spin_unlock_irqrestore(&logbuf_lock, flags);
608 console_may_schedule = 0; 617 console_may_schedule = 0;
609 release_console_sem(); 618 release_console_sem();
@@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
613 * allows the semaphore holder to proceed and to call the 622 * allows the semaphore holder to proceed and to call the
614 * console drivers with the output which we just produced. 623 * console drivers with the output which we just produced.
615 */ 624 */
625 printk_cpu = UINT_MAX;
616 spin_unlock_irqrestore(&logbuf_lock, flags); 626 spin_unlock_irqrestore(&logbuf_lock, flags);
617 } 627 }
618out: 628out:
629 preempt_enable();
619 return printed_len; 630 return printed_len;
620} 631}
621EXPORT_SYMBOL(printk); 632EXPORT_SYMBOL(printk);
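The vprintk() change above records which CPU currently holds logbuf_lock and only calls zap_locks() when an oops is in progress on that same CPU, so a crash on another CPU no longer re-initializes a lock that is in legitimate use. A small user-space sketch of just that guard condition (names mirror the patch; the locking itself is not modeled):

#include <assert.h>
#include <limits.h>
#include <stdbool.h>

#define NO_OWNER UINT_MAX

static unsigned int printk_cpu = NO_OWNER;   /* cpu holding logbuf_lock */

static bool should_zap_locks(bool oops_in_progress, unsigned int this_cpu)
{
    return oops_in_progress && printk_cpu == this_cpu;
}

int main(void)
{
    /* no oops in progress: never touch the locks */
    assert(!should_zap_locks(false, 0));

    /* oops on a CPU other than the logbuf_lock holder: leave it alone */
    printk_cpu = 1;
    assert(!should_zap_locks(true, 0));

    /* oops on the CPU that took logbuf_lock: break the self-deadlock */
    assert(should_zap_locks(true, 1));
    return 0;
}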
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
118 return ret; 118 return ret;
119} 119}
120 120
121static int may_attach(struct task_struct *task)
122{
123 if (!task->mm)
124 return -EPERM;
125 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) ||
127 (current->uid != task->uid) ||
128 (current->gid != task->egid) ||
129 (current->gid != task->sgid) ||
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM;
132 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM;
135
136 return security_ptrace(current, task);
137}
138
139int ptrace_may_attach(struct task_struct *task)
140{
141 int err;
142 task_lock(task);
143 err = may_attach(task);
144 task_unlock(task);
145 return !err;
146}
147
121int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
122{ 149{
123 int retval; 150 int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
127 goto bad; 154 goto bad;
128 if (task == current) 155 if (task == current)
129 goto bad; 156 goto bad;
130 if (!task->mm)
131 goto bad;
132 if(((current->uid != task->euid) ||
133 (current->uid != task->suid) ||
134 (current->uid != task->uid) ||
135 (current->gid != task->egid) ||
136 (current->gid != task->sgid) ||
137 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
138 goto bad;
139 smp_rmb();
140 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
141 goto bad;
142 /* the same process cannot be attached many times */ 157 /* the same process cannot be attached many times */
143 if (task->ptrace & PT_PTRACED) 158 if (task->ptrace & PT_PTRACED)
144 goto bad; 159 goto bad;
145 retval = security_ptrace(current, task); 160 retval = may_attach(task);
146 if (retval) 161 if (retval)
147 goto bad; 162 goto bad;
148 163
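The ptrace hunk above factors the permission checks out of ptrace_attach() into may_attach(), which keeps the kernel's 0 / -EPERM convention, and adds ptrace_may_attach(), which takes the task lock and flips the result into a plain boolean. A hedged stand-alone sketch of that wrapper pattern; the struct and checks below are simplified stand-ins, and the security hook and locking are elided:

#include <assert.h>
#include <errno.h>
#include <stdbool.h>

struct task { bool has_mm; bool dumpable; bool same_ids; bool cap_ptrace; };

static int may_attach(const struct task *t)
{
    if (!t->has_mm)
        return -EPERM;                /* kernel threads have no mm */
    if (!t->same_ids && !t->cap_ptrace)
        return -EPERM;                /* the uid/gid checks, collapsed to a flag */
    if (!t->dumpable && !t->cap_ptrace)
        return -EPERM;
    return 0;                         /* security_ptrace() would be called here */
}

static bool task_may_attach(struct task *t)
{
    /* task_lock()/task_unlock() elided in this user-space sketch */
    return may_attach(t) == 0;        /* same sense as the patch's "return !err" */
}

int main(void)
{
    struct task t = { .has_mm = true, .dumpable = true, .same_ids = true };

    assert(task_may_attach(&t));
    t.dumpable = false;               /* e.g. the task dropped privileges */
    assert(!task_may_attach(&t));
    t.cap_ptrace = true;              /* CAP_SYS_PTRACE overrides that */
    assert(task_may_attach(&t));
    return 0;
}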
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
430 */ 430 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
432{ 432{
433 struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); 433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 434
435 if (res) { 435 if (res) {
436 memset(res, 0, sizeof(*res));
437 res->name = name; 436 res->name = name;
438 res->start = start; 437 res->start = start;
439 res->end = start + n - 1; 438 res->end = start + n - 1;
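The resource.c hunk (and the workqueue.c hunk at the end of this diff) swaps kmalloc() plus an explicit memset() for kzalloc(), which returns already-zeroed memory in one call. The same idea sketched in user space, with malloc/calloc standing in for the kernel allocators:

#include <assert.h>
#include <stdlib.h>
#include <string.h>

struct resource_like { const char *name; long start, end, flags; };

int main(void)
{
    /* old pattern: allocate, then clear explicitly */
    struct resource_like *a = malloc(sizeof(*a));
    assert(a);
    memset(a, 0, sizeof(*a));

    /* new pattern: one call hands back zeroed memory */
    struct resource_like *b = calloc(1, sizeof(*b));
    assert(b);

    assert(memcmp(a, b, sizeof(*a)) == 0);
    free(a);
    free(b);
    return 0;
}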
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..18b95520a2e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1478,6 +1478,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1478 1478
1479/** 1479/**
1480 * finish_task_switch - clean up after a task-switch 1480 * finish_task_switch - clean up after a task-switch
1481 * @rq: runqueue associated with task-switch
1481 * @prev: the thread we just switched away from. 1482 * @prev: the thread we just switched away from.
1482 * 1483 *
1483 * finish_task_switch must be called after the context switch, paired 1484 * finish_task_switch must be called after the context switch, paired
@@ -4779,7 +4780,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4780 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4780 * hold the hotplug lock. 4781 * hold the hotplug lock.
4781 */ 4782 */
4782void cpu_attach_domain(struct sched_domain *sd, int cpu) 4783static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4783{ 4784{
4784 runqueue_t *rq = cpu_rq(cpu); 4785 runqueue_t *rq = cpu_rq(cpu);
4785 struct sched_domain *tmp; 4786 struct sched_domain *tmp;
@@ -4802,7 +4803,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
4802} 4803}
4803 4804
4804/* cpus with isolated domains */ 4805/* cpus with isolated domains */
4805cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4806static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4806 4807
4807/* Setup the mask of cpus configured for isolated domains */ 4808/* Setup the mask of cpus configured for isolated domains */
4808static int __init isolated_cpu_setup(char *str) 4809static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4831,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
4830 * covered by the given span, and will set each group's ->cpumask correctly, 4831 * covered by the given span, and will set each group's ->cpumask correctly,
4831 * and ->cpu_power to 0. 4832 * and ->cpu_power to 0.
4832 */ 4833 */
4833void init_sched_build_groups(struct sched_group groups[], 4834static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4834 cpumask_t span, int (*group_fn)(int cpu)) 4835 int (*group_fn)(int cpu))
4835{ 4836{
4836 struct sched_group *first = NULL, *last = NULL; 4837 struct sched_group *first = NULL, *last = NULL;
4837 cpumask_t covered = CPU_MASK_NONE; 4838 cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4865,85 @@ void init_sched_build_groups(struct sched_group groups[],
4864 last->next = first; 4865 last->next = first;
4865} 4866}
4866 4867
4868#define SD_NODES_PER_DOMAIN 16
4867 4869
4868#ifdef ARCH_HAS_SCHED_DOMAIN 4870#ifdef CONFIG_NUMA
4869extern void build_sched_domains(const cpumask_t *cpu_map); 4871/**
4870extern void arch_init_sched_domains(const cpumask_t *cpu_map); 4872 * find_next_best_node - find the next node to include in a sched_domain
4871extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); 4873 * @node: node whose sched_domain we're building
4872#else 4874 * @used_nodes: nodes already in the sched_domain
4875 *
4876 * Find the next node to include in a given scheduling domain. Simply
4877 * finds the closest node not already in the @used_nodes map.
4878 *
4879 * Should use nodemask_t.
4880 */
4881static int find_next_best_node(int node, unsigned long *used_nodes)
4882{
4883 int i, n, val, min_val, best_node = 0;
4884
4885 min_val = INT_MAX;
4886
4887 for (i = 0; i < MAX_NUMNODES; i++) {
4888 /* Start at @node */
4889 n = (node + i) % MAX_NUMNODES;
4890
4891 if (!nr_cpus_node(n))
4892 continue;
4893
4894 /* Skip already used nodes */
4895 if (test_bit(n, used_nodes))
4896 continue;
4897
4898 /* Simple min distance search */
4899 val = node_distance(node, n);
4900
4901 if (val < min_val) {
4902 min_val = val;
4903 best_node = n;
4904 }
4905 }
4906
4907 set_bit(best_node, used_nodes);
4908 return best_node;
4909}
4910
4911/**
4912 * sched_domain_node_span - get a cpumask for a node's sched_domain
4913 * @node: node whose cpumask we're constructing
4914 * @size: number of nodes to include in this span
4915 *
4916 * Given a node, construct a good cpumask for its sched_domain to span. It
4917 * should be one that prevents unnecessary balancing, but also spreads tasks
4918 * out optimally.
4919 */
4920static cpumask_t sched_domain_node_span(int node)
4921{
4922 int i;
4923 cpumask_t span, nodemask;
4924 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
4925
4926 cpus_clear(span);
4927 bitmap_zero(used_nodes, MAX_NUMNODES);
4928
4929 nodemask = node_to_cpumask(node);
4930 cpus_or(span, span, nodemask);
4931 set_bit(node, used_nodes);
4932
4933 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
4934 int next_node = find_next_best_node(node, used_nodes);
4935 nodemask = node_to_cpumask(next_node);
4936 cpus_or(span, span, nodemask);
4937 }
4938
4939 return span;
4940}
4941#endif
4942
4943/*
4944 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
4945 * can switch it on easily if needed.
4946 */
4873#ifdef CONFIG_SCHED_SMT 4947#ifdef CONFIG_SCHED_SMT
4874static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 4948static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4875static struct sched_group sched_group_cpus[NR_CPUS]; 4949static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +4965,20 @@ static int cpu_to_phys_group(int cpu)
4891} 4965}
4892 4966
4893#ifdef CONFIG_NUMA 4967#ifdef CONFIG_NUMA
4894
4895static DEFINE_PER_CPU(struct sched_domain, node_domains);
4896static struct sched_group sched_group_nodes[MAX_NUMNODES];
4897static int cpu_to_node_group(int cpu)
4898{
4899 return cpu_to_node(cpu);
4900}
4901#endif
4902
4903#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4904/* 4968/*
4905 * The domains setup code relies on siblings not spanning 4969 * The init_sched_build_groups can't handle what we want to do with node
4906 * multiple nodes. Make sure the architecture has a proper 4970 * groups, so roll our own. Now each node has its own list of groups which
4907 * siblings map: 4971 * gets dynamically allocated.
4908 */ 4972 */
4909static void check_sibling_maps(void) 4973static DEFINE_PER_CPU(struct sched_domain, node_domains);
4910{ 4974static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
4911 int i, j;
4912 4975
4913 for_each_online_cpu(i) { 4976static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
4914 for_each_cpu_mask(j, cpu_sibling_map[i]) { 4977static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
4915 if (cpu_to_node(i) != cpu_to_node(j)) { 4978
4916 printk(KERN_INFO "warning: CPU %d siblings map " 4979static int cpu_to_allnodes_group(int cpu)
4917 "to different node - isolating " 4980{
4918 "them.\n", i); 4981 return cpu_to_node(cpu);
4919 cpu_sibling_map[i] = cpumask_of_cpu(i);
4920 break;
4921 }
4922 }
4923 }
4924} 4982}
4925#endif 4983#endif
4926 4984
@@ -4928,9 +4986,24 @@ static void check_sibling_maps(void)
4928 * Build sched domains for a given set of cpus and attach the sched domains 4986 * Build sched domains for a given set of cpus and attach the sched domains
4929 * to the individual cpus 4987 * to the individual cpus
4930 */ 4988 */
4931static void build_sched_domains(const cpumask_t *cpu_map) 4989void build_sched_domains(const cpumask_t *cpu_map)
4932{ 4990{
4933 int i; 4991 int i;
4992#ifdef CONFIG_NUMA
4993 struct sched_group **sched_group_nodes = NULL;
4994 struct sched_group *sched_group_allnodes = NULL;
4995
4996 /*
4997 * Allocate the per-node list of sched groups
4998 */
4999 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5000 GFP_ATOMIC);
5001 if (!sched_group_nodes) {
5002 printk(KERN_WARNING "Can not alloc sched group node list\n");
5003 return;
5004 }
5005 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5006#endif
4934 5007
4935 /* 5008 /*
4936 * Set up domains for cpus specified by the cpu_map. 5009 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5016,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4943 cpus_and(nodemask, nodemask, *cpu_map); 5016 cpus_and(nodemask, nodemask, *cpu_map);
4944 5017
4945#ifdef CONFIG_NUMA 5018#ifdef CONFIG_NUMA
5019 if (cpus_weight(*cpu_map)
5020 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5021 if (!sched_group_allnodes) {
5022 sched_group_allnodes
5023 = kmalloc(sizeof(struct sched_group)
5024 * MAX_NUMNODES,
5025 GFP_KERNEL);
5026 if (!sched_group_allnodes) {
5027 printk(KERN_WARNING
5028 "Can not alloc allnodes sched group\n");
5029 break;
5030 }
5031 sched_group_allnodes_bycpu[i]
5032 = sched_group_allnodes;
5033 }
5034 sd = &per_cpu(allnodes_domains, i);
5035 *sd = SD_ALLNODES_INIT;
5036 sd->span = *cpu_map;
5037 group = cpu_to_allnodes_group(i);
5038 sd->groups = &sched_group_allnodes[group];
5039 p = sd;
5040 } else
5041 p = NULL;
5042
4946 sd = &per_cpu(node_domains, i); 5043 sd = &per_cpu(node_domains, i);
4947 group = cpu_to_node_group(i);
4948 *sd = SD_NODE_INIT; 5044 *sd = SD_NODE_INIT;
4949 sd->span = *cpu_map; 5045 sd->span = sched_domain_node_span(cpu_to_node(i));
4950 sd->groups = &sched_group_nodes[group]; 5046 sd->parent = p;
5047 cpus_and(sd->span, sd->span, *cpu_map);
4951#endif 5048#endif
4952 5049
4953 p = sd; 5050 p = sd;
@@ -4972,7 +5069,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4972 5069
4973#ifdef CONFIG_SCHED_SMT 5070#ifdef CONFIG_SCHED_SMT
4974 /* Set up CPU (sibling) groups */ 5071 /* Set up CPU (sibling) groups */
4975 for_each_online_cpu(i) { 5072 for_each_cpu_mask(i, *cpu_map) {
4976 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5073 cpumask_t this_sibling_map = cpu_sibling_map[i];
4977 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5074 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4978 if (i != first_cpu(this_sibling_map)) 5075 if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5094,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4997 5094
4998#ifdef CONFIG_NUMA 5095#ifdef CONFIG_NUMA
4999 /* Set up node groups */ 5096 /* Set up node groups */
5000 init_sched_build_groups(sched_group_nodes, *cpu_map, 5097 if (sched_group_allnodes)
5001 &cpu_to_node_group); 5098 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5099 &cpu_to_allnodes_group);
5100
5101 for (i = 0; i < MAX_NUMNODES; i++) {
5102 /* Set up node groups */
5103 struct sched_group *sg, *prev;
5104 cpumask_t nodemask = node_to_cpumask(i);
5105 cpumask_t domainspan;
5106 cpumask_t covered = CPU_MASK_NONE;
5107 int j;
5108
5109 cpus_and(nodemask, nodemask, *cpu_map);
5110 if (cpus_empty(nodemask)) {
5111 sched_group_nodes[i] = NULL;
5112 continue;
5113 }
5114
5115 domainspan = sched_domain_node_span(i);
5116 cpus_and(domainspan, domainspan, *cpu_map);
5117
5118 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5119 sched_group_nodes[i] = sg;
5120 for_each_cpu_mask(j, nodemask) {
5121 struct sched_domain *sd;
5122 sd = &per_cpu(node_domains, j);
5123 sd->groups = sg;
5124 if (sd->groups == NULL) {
5125 /* Turn off balancing if we have no groups */
5126 sd->flags = 0;
5127 }
5128 }
5129 if (!sg) {
5130 printk(KERN_WARNING
5131 "Can not alloc domain group for node %d\n", i);
5132 continue;
5133 }
5134 sg->cpu_power = 0;
5135 sg->cpumask = nodemask;
5136 cpus_or(covered, covered, nodemask);
5137 prev = sg;
5138
5139 for (j = 0; j < MAX_NUMNODES; j++) {
5140 cpumask_t tmp, notcovered;
5141 int n = (i + j) % MAX_NUMNODES;
5142
5143 cpus_complement(notcovered, covered);
5144 cpus_and(tmp, notcovered, *cpu_map);
5145 cpus_and(tmp, tmp, domainspan);
5146 if (cpus_empty(tmp))
5147 break;
5148
5149 nodemask = node_to_cpumask(n);
5150 cpus_and(tmp, tmp, nodemask);
5151 if (cpus_empty(tmp))
5152 continue;
5153
5154 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5155 if (!sg) {
5156 printk(KERN_WARNING
5157 "Can not alloc domain group for node %d\n", j);
5158 break;
5159 }
5160 sg->cpu_power = 0;
5161 sg->cpumask = tmp;
5162 cpus_or(covered, covered, tmp);
5163 prev->next = sg;
5164 prev = sg;
5165 }
5166 prev->next = sched_group_nodes[i];
5167 }
5002#endif 5168#endif
5003 5169
5004 /* Calculate CPU power for physical packages and nodes */ 5170 /* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5183,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5017 sd->groups->cpu_power = power; 5183 sd->groups->cpu_power = power;
5018 5184
5019#ifdef CONFIG_NUMA 5185#ifdef CONFIG_NUMA
5020 if (i == first_cpu(sd->groups->cpumask)) { 5186 sd = &per_cpu(allnodes_domains, i);
5021 /* Only add "power" once for each physical package. */ 5187 if (sd->groups) {
5022 sd = &per_cpu(node_domains, i); 5188 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5023 sd->groups->cpu_power += power; 5189 (cpus_weight(sd->groups->cpumask)-1) / 10;
5190 sd->groups->cpu_power = power;
5024 } 5191 }
5025#endif 5192#endif
5026 } 5193 }
5027 5194
5195#ifdef CONFIG_NUMA
5196 for (i = 0; i < MAX_NUMNODES; i++) {
5197 struct sched_group *sg = sched_group_nodes[i];
5198 int j;
5199
5200 if (sg == NULL)
5201 continue;
5202next_sg:
5203 for_each_cpu_mask(j, sg->cpumask) {
5204 struct sched_domain *sd;
5205 int power;
5206
5207 sd = &per_cpu(phys_domains, j);
5208 if (j != first_cpu(sd->groups->cpumask)) {
5209 /*
5210 * Only add "power" once for each
5211 * physical package.
5212 */
5213 continue;
5214 }
5215 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5216 (cpus_weight(sd->groups->cpumask)-1) / 10;
5217
5218 sg->cpu_power += power;
5219 }
5220 sg = sg->next;
5221 if (sg != sched_group_nodes[i])
5222 goto next_sg;
5223 }
5224#endif
5225
5028 /* Attach the domains */ 5226 /* Attach the domains */
5029 for_each_cpu_mask(i, *cpu_map) { 5227 for_each_cpu_mask(i, *cpu_map) {
5030 struct sched_domain *sd; 5228 struct sched_domain *sd;
@@ -5039,13 +5237,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5039/* 5237/*
5040 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5238 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5041 */ 5239 */
5042static void arch_init_sched_domains(cpumask_t *cpu_map) 5240static void arch_init_sched_domains(const cpumask_t *cpu_map)
5043{ 5241{
5044 cpumask_t cpu_default_map; 5242 cpumask_t cpu_default_map;
5045 5243
5046#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5047 check_sibling_maps();
5048#endif
5049 /* 5244 /*
5050 * Setup mask for cpus without special case scheduling requirements. 5245 * Setup mask for cpus without special case scheduling requirements.
5051 * For now this just excludes isolated cpus, but could be used to 5246 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5253,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
5058 5253
5059static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5254static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5060{ 5255{
5061 /* Do nothing: everything is statically allocated. */ 5256#ifdef CONFIG_NUMA
5062} 5257 int i;
5258 int cpu;
5259
5260 for_each_cpu_mask(cpu, *cpu_map) {
5261 struct sched_group *sched_group_allnodes
5262 = sched_group_allnodes_bycpu[cpu];
5263 struct sched_group **sched_group_nodes
5264 = sched_group_nodes_bycpu[cpu];
5265
5266 if (sched_group_allnodes) {
5267 kfree(sched_group_allnodes);
5268 sched_group_allnodes_bycpu[cpu] = NULL;
5269 }
5270
5271 if (!sched_group_nodes)
5272 continue;
5273
5274 for (i = 0; i < MAX_NUMNODES; i++) {
5275 cpumask_t nodemask = node_to_cpumask(i);
5276 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5063 5277
5064#endif /* ARCH_HAS_SCHED_DOMAIN */ 5278 cpus_and(nodemask, nodemask, *cpu_map);
5279 if (cpus_empty(nodemask))
5280 continue;
5281
5282 if (sg == NULL)
5283 continue;
5284 sg = sg->next;
5285next_sg:
5286 oldsg = sg;
5287 sg = sg->next;
5288 kfree(oldsg);
5289 if (oldsg != sched_group_nodes[i])
5290 goto next_sg;
5291 }
5292 kfree(sched_group_nodes);
5293 sched_group_nodes_bycpu[cpu] = NULL;
5294 }
5295#endif
5296}
5065 5297
5066/* 5298/*
5067 * Detach sched domains from a group of cpus specified in cpu_map 5299 * Detach sched domains from a group of cpus specified in cpu_map
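The NUMA scheduling-domain code added above grows each node's domain span greedily: find_next_best_node() picks the closest node not yet used, and sched_domain_node_span() repeats that until SD_NODES_PER_DOMAIN nodes are covered. A simplified user-space sketch of the greedy selection; the 4-node distance matrix, the domain size, and the omission of the empty-node and wrap-around handling are illustrative choices, not the kernel's:

#include <assert.h>
#include <limits.h>

#define MAX_NODES        4
#define NODES_PER_DOMAIN 3

static const int node_distance[MAX_NODES][MAX_NODES] = {
    { 10, 20, 40, 30 },
    { 20, 10, 30, 40 },
    { 40, 30, 10, 20 },
    { 30, 40, 20, 10 },
};

/* pick the nearest node to @node that is not yet in the span */
static int find_next_best_node(int node, int used[MAX_NODES])
{
    int n, min_val = INT_MAX, best = 0;

    for (n = 0; n < MAX_NODES; n++) {
        if (used[n])
            continue;
        if (node_distance[node][n] < min_val) {
            min_val = node_distance[node][n];
            best = n;
        }
    }
    used[best] = 1;
    return best;
}

int main(void)
{
    int used[MAX_NODES] = { 0 };
    int span[NODES_PER_DOMAIN];
    int i;

    span[0] = 0;            /* build the span for node 0 */
    used[0] = 1;
    for (i = 1; i < NODES_PER_DOMAIN; i++)
        span[i] = find_next_best_node(0, used);

    /* node 0's nearest neighbours are node 1 (distance 20), then node 3 (30) */
    assert(span[1] == 1 && span[2] == 3);
    return 0;
}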
diff --git a/kernel/signal.c b/kernel/signal.c
index d282fea81138..4980a073237f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
678 678
679/* forward decl */ 679/* forward decl */
680static void do_notify_parent_cldstop(struct task_struct *tsk, 680static void do_notify_parent_cldstop(struct task_struct *tsk,
681 struct task_struct *parent, 681 int to_self,
682 int why); 682 int why);
683 683
684/* 684/*
@@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
729 p->signal->group_stop_count = 0; 729 p->signal->group_stop_count = 0;
730 p->signal->flags = SIGNAL_STOP_CONTINUED; 730 p->signal->flags = SIGNAL_STOP_CONTINUED;
731 spin_unlock(&p->sighand->siglock); 731 spin_unlock(&p->sighand->siglock);
732 if (p->ptrace & PT_PTRACED) 732 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
733 do_notify_parent_cldstop(p, p->parent,
734 CLD_STOPPED);
735 else
736 do_notify_parent_cldstop(
737 p->group_leader,
738 p->group_leader->real_parent,
739 CLD_STOPPED);
740 spin_lock(&p->sighand->siglock); 733 spin_lock(&p->sighand->siglock);
741 } 734 }
742 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 735 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
777 p->signal->flags = SIGNAL_STOP_CONTINUED; 770 p->signal->flags = SIGNAL_STOP_CONTINUED;
778 p->signal->group_exit_code = 0; 771 p->signal->group_exit_code = 0;
779 spin_unlock(&p->sighand->siglock); 772 spin_unlock(&p->sighand->siglock);
780 if (p->ptrace & PT_PTRACED) 773 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
781 do_notify_parent_cldstop(p, p->parent,
782 CLD_CONTINUED);
783 else
784 do_notify_parent_cldstop(
785 p->group_leader,
786 p->group_leader->real_parent,
787 CLD_CONTINUED);
788 spin_lock(&p->sighand->siglock); 774 spin_lock(&p->sighand->siglock);
789 } else { 775 } else {
790 /* 776 /*
@@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1380 unsigned long flags; 1366 unsigned long flags;
1381 int ret = 0; 1367 int ret = 0;
1382 1368
1383 /*
1384 * We need the tasklist lock even for the specific
1385 * thread case (when we don't need to follow the group
1386 * lists) in order to avoid races with "p->sighand"
1387 * going away or changing from under us.
1388 */
1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1369 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1390 read_lock(&tasklist_lock); 1370 read_lock(&tasklist_lock);
1371
1372 if (unlikely(p->flags & PF_EXITING)) {
1373 ret = -1;
1374 goto out_err;
1375 }
1376
1391 spin_lock_irqsave(&p->sighand->siglock, flags); 1377 spin_lock_irqsave(&p->sighand->siglock, flags);
1392 1378
1393 if (unlikely(!list_empty(&q->list))) { 1379 if (unlikely(!list_empty(&q->list))) {
1394 /* 1380 /*
1395 * If an SI_TIMER entry is already queue just increment 1381 * If an SI_TIMER entry is already queue just increment
@@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1399 BUG(); 1385 BUG();
1400 q->info.si_overrun++; 1386 q->info.si_overrun++;
1401 goto out; 1387 goto out;
1402 } 1388 }
1403 /* Short-circuit ignored signals. */ 1389 /* Short-circuit ignored signals. */
1404 if (sig_ignored(p, sig)) { 1390 if (sig_ignored(p, sig)) {
1405 ret = 1; 1391 ret = 1;
@@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1414 1400
1415out: 1401out:
1416 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1402 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1403out_err:
1417 read_unlock(&tasklist_lock); 1404 read_unlock(&tasklist_lock);
1418 return(ret); 1405
1406 return ret;
1419} 1407}
1420 1408
1421int 1409int
@@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1542 spin_unlock_irqrestore(&psig->siglock, flags); 1530 spin_unlock_irqrestore(&psig->siglock, flags);
1543} 1531}
1544 1532
1545static void 1533static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
1546do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
1547 int why)
1548{ 1534{
1549 struct siginfo info; 1535 struct siginfo info;
1550 unsigned long flags; 1536 unsigned long flags;
1537 struct task_struct *parent;
1551 struct sighand_struct *sighand; 1538 struct sighand_struct *sighand;
1552 1539
1540 if (to_self)
1541 parent = tsk->parent;
1542 else {
1543 tsk = tsk->group_leader;
1544 parent = tsk->real_parent;
1545 }
1546
1553 info.si_signo = SIGCHLD; 1547 info.si_signo = SIGCHLD;
1554 info.si_errno = 0; 1548 info.si_errno = 0;
1555 info.si_pid = tsk->pid; 1549 info.si_pid = tsk->pid;
@@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1618 !(current->ptrace & PT_ATTACHED)) && 1612 !(current->ptrace & PT_ATTACHED)) &&
1619 (likely(current->parent->signal != current->signal) || 1613 (likely(current->parent->signal != current->signal) ||
1620 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1614 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1621 do_notify_parent_cldstop(current, current->parent, 1615 do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
1622 CLD_TRAPPED);
1623 read_unlock(&tasklist_lock); 1616 read_unlock(&tasklist_lock);
1624 schedule(); 1617 schedule();
1625 } else { 1618 } else {
@@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code)
1668static void 1661static void
1669finish_stop(int stop_count) 1662finish_stop(int stop_count)
1670{ 1663{
1664 int to_self;
1665
1671 /* 1666 /*
1672 * If there are no other threads in the group, or if there is 1667 * If there are no other threads in the group, or if there is
1673 * a group stop in progress and we are the last to stop, 1668 * a group stop in progress and we are the last to stop,
1674 * report to the parent. When ptraced, every thread reports itself. 1669 * report to the parent. When ptraced, every thread reports itself.
1675 */ 1670 */
1676 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { 1671 if (stop_count < 0 || (current->ptrace & PT_PTRACED))
1677 read_lock(&tasklist_lock); 1672 to_self = 1;
1678 do_notify_parent_cldstop(current, current->parent, 1673 else if (stop_count == 0)
1679 CLD_STOPPED); 1674 to_self = 0;
1680 read_unlock(&tasklist_lock); 1675 else
1681 } 1676 goto out;
1682 else if (stop_count == 0) {
1683 read_lock(&tasklist_lock);
1684 do_notify_parent_cldstop(current->group_leader,
1685 current->group_leader->real_parent,
1686 CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689 1677
1678 read_lock(&tasklist_lock);
1679 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1680 read_unlock(&tasklist_lock);
1681
1682out:
1690 schedule(); 1683 schedule();
1691 /* 1684 /*
1692 * Now we don't run again until continued. 1685 * Now we don't run again until continued.
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat
5 *
 6 * this code detects soft lockups: incidents in which, on a CPU,
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9
10#include <linux/mm.h>
11#include <linux/cpu.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/kthread.h>
15#include <linux/notifier.h>
16#include <linux/module.h>
17
18static DEFINE_SPINLOCK(print_lock);
19
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23
24static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event,
26 void *ptr)
27{
28 did_panic = 1;
29
30 return NOTIFY_DONE;
31}
32
33static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic,
35};
36
37void touch_softlockup_watchdog(void)
38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
40}
41EXPORT_SYMBOL(touch_softlockup_watchdog);
42
43/*
44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not:
46 */
47void softlockup_tick(struct pt_regs *regs)
48{
49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu);
51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp)
53 return;
54
55 /* Do not cause a second panic when there already was one */
56 if (did_panic)
57 return;
58
59 if (time_after(jiffies, timestamp + 10*HZ)) {
60 per_cpu(print_timestamp, this_cpu) = timestamp;
61
62 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu);
65 show_regs(regs);
66 spin_unlock(&print_lock);
67 }
68}
69
70/*
71 * The watchdog thread - runs every second and touches the timestamp.
72 */
73static int watchdog(void * __bind_cpu)
74{
75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79
80 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE;
82
83 set_current_state(TASK_INTERRUPTIBLE);
84
85 /*
86 * Run briefly once per second - if this gets delayed for
87 * more than 10 seconds then the debug-printout triggers
88 * in softlockup_tick():
89 */
90 while (!kthread_should_stop()) {
91 msleep_interruptible(1000);
92 touch_softlockup_watchdog();
93 }
94 __set_current_state(TASK_RUNNING);
95
96 return 0;
97}
98
99/*
100 * Create/destroy watchdog threads as CPUs come and go:
101 */
102static int __devinit
103cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
104{
105 int hotcpu = (unsigned long)hcpu;
106 struct task_struct *p;
107
108 switch (action) {
109 case CPU_UP_PREPARE:
110 BUG_ON(per_cpu(watchdog_task, hotcpu));
111 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
112 if (IS_ERR(p)) {
113 printk("watchdog for %i failed\n", hotcpu);
114 return NOTIFY_BAD;
115 }
116 per_cpu(watchdog_task, hotcpu) = p;
117 kthread_bind(p, hotcpu);
118 break;
119 case CPU_ONLINE:
120
121 wake_up_process(per_cpu(watchdog_task, hotcpu));
122 break;
123#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
127 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL;
130 kthread_stop(p);
131 break;
132#endif /* CONFIG_HOTPLUG_CPU */
133 }
134 return NOTIFY_OK;
135}
136
137static struct notifier_block __devinitdata cpu_nfb = {
138 .notifier_call = cpu_callback
139};
140
141__init void spawn_softlockup_task(void)
142{
143 void *cpu = (void *)(long)smp_processor_id();
144
145 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
146 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
147 register_cpu_notifier(&cpu_nfb);
148
149 notifier_chain_register(&panic_notifier_list, &panic_block);
150}
151
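kernel/softlockup.c above pairs a per-CPU timestamp, touched roughly once a second by a SCHED_FIFO watchdog thread, with a check in the timer interrupt that reports a lockup when the timestamp is more than 10*HZ ticks stale, and reports each stall only once. A user-space sketch of that check; jiffies and HZ are simulated, and the per-CPU state is collapsed to a single CPU:

#include <assert.h>

#define HZ 1000

static unsigned long jiffies;
static unsigned long timestamp;        /* last touch by the watchdog thread */
static unsigned long print_timestamp;  /* last timestamp we warned about */

/* wraparound-safe comparison, as in the kernel's time_after() */
static int time_after(unsigned long a, unsigned long b)
{
    return (long)(b - a) < 0;
}

static void touch_watchdog(void)
{
    timestamp = jiffies;
}

/* returns 1 when a soft lockup should be reported */
static int softlockup_check(void)
{
    if (print_timestamp == timestamp)
        return 0;                      /* this stall was already reported */
    if (time_after(jiffies, timestamp + 10 * HZ)) {
        print_timestamp = timestamp;
        return 1;
    }
    return 0;
}

int main(void)
{
    jiffies = 100 * HZ;                /* some arbitrary uptime */
    touch_watchdog();

    jiffies += 5 * HZ;                 /* 5s since the last touch: fine */
    assert(!softlockup_check());

    jiffies += 6 * HZ;                 /* 11s stale: report it, once */
    assert(softlockup_check());
    assert(!softlockup_check());

    touch_watchdog();                  /* the watchdog thread ran again */
    jiffies += 1 * HZ;
    assert(!softlockup_check());
    return 0;
}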
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bcaed6560ac..c80412be2302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1711 unsigned long arg4, unsigned long arg5) 1711 unsigned long arg4, unsigned long arg5)
1712{ 1712{
1713 long error; 1713 long error;
1714 int sig;
1715 1714
1716 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1715 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1717 if (error) 1716 if (error)
@@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1719 1718
1720 switch (option) { 1719 switch (option) {
1721 case PR_SET_PDEATHSIG: 1720 case PR_SET_PDEATHSIG:
1722 sig = arg2; 1721 if (!valid_signal(arg2)) {
1723 if (!valid_signal(sig)) {
1724 error = -EINVAL; 1722 error = -EINVAL;
1725 break; 1723 break;
1726 } 1724 }
1727 current->pdeath_signal = sig; 1725 current->pdeath_signal = arg2;
1728 break; 1726 break;
1729 case PR_GET_PDEATHSIG: 1727 case PR_GET_PDEATHSIG:
1730 error = put_user(current->pdeath_signal, (int __user *)arg2); 1728 error = put_user(current->pdeath_signal, (int __user *)arg2);
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..13e2b513be01 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
950{ 950{
951 jiffies_64++; 951 jiffies_64++;
952 update_times(); 952 update_times();
953 softlockup_tick(regs);
953} 954}
954 955
955#ifdef __ARCH_WANT_SYS_ALARM 956#ifdef __ARCH_WANT_SYS_ALARM
@@ -1428,7 +1429,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1428 } 1429 }
1429} 1430}
1430 1431
1431static inline u64 time_interpolator_get_counter(void) 1432static inline u64 time_interpolator_get_counter(int writelock)
1432{ 1433{
1433 unsigned int src = time_interpolator->source; 1434 unsigned int src = time_interpolator->source;
1434 1435
@@ -1442,6 +1443,15 @@ static inline u64 time_interpolator_get_counter(void)
1442 now = time_interpolator_get_cycles(src); 1443 now = time_interpolator_get_cycles(src);
1443 if (lcycle && time_after(lcycle, now)) 1444 if (lcycle && time_after(lcycle, now))
1444 return lcycle; 1445 return lcycle;
1446
1447 /* When holding the xtime write lock, there's no need
1448 * to add the overhead of the cmpxchg. Readers are
1449 * forced to retry until the write lock is released.
1450 */
1451 if (writelock) {
1452 time_interpolator->last_cycle = now;
1453 return now;
1454 }
1445 /* Keep track of the last timer value returned. The use of cmpxchg here 1455 /* Keep track of the last timer value returned. The use of cmpxchg here
1446 * will cause contention in an SMP environment. 1456 * will cause contention in an SMP environment.
1447 */ 1457 */
@@ -1455,7 +1465,7 @@ static inline u64 time_interpolator_get_counter(void)
1455void time_interpolator_reset(void) 1465void time_interpolator_reset(void)
1456{ 1466{
1457 time_interpolator->offset = 0; 1467 time_interpolator->offset = 0;
1458 time_interpolator->last_counter = time_interpolator_get_counter(); 1468 time_interpolator->last_counter = time_interpolator_get_counter(1);
1459} 1469}
1460 1470
1461#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1471#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
@@ -1467,7 +1477,7 @@ unsigned long time_interpolator_get_offset(void)
1467 return 0; 1477 return 0;
1468 1478
1469 return time_interpolator->offset + 1479 return time_interpolator->offset +
1470 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); 1480 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1471} 1481}
1472 1482
1473#define INTERPOLATOR_ADJUST 65536 1483#define INTERPOLATOR_ADJUST 65536
@@ -1490,7 +1500,7 @@ static void time_interpolator_update(long delta_nsec)
1490 * and the tuning logic insures that. 1500 * and the tuning logic insures that.
1491 */ 1501 */
1492 1502
1493 counter = time_interpolator_get_counter(); 1503 counter = time_interpolator_get_counter(1);
1494 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1504 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1495 1505
1496 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1506 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
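The time_interpolator change above adds a write-lock fast path: when the caller already holds the xtime write lock, readers are forced to retry anyway, so last_cycle can be updated with a plain store instead of the cmpxchg used on the reader path. A hedged sketch of that idea; the monotonic clamp is simplified, wraparound is ignored, and the GCC builtin stands in for the kernel's cmpxchg:

#include <assert.h>
#include <stdint.h>

static uint64_t last_cycle;

static uint64_t get_counter(uint64_t now, int writelock)
{
    uint64_t lcycle = last_cycle;

    if (lcycle && lcycle > now)        /* never let the clock step backwards */
        return lcycle;

    if (writelock) {                   /* xtime writers: readers retry anyway */
        last_cycle = now;
        return now;
    }
    /* readers: publish the new value atomically so concurrent readers agree */
    __sync_val_compare_and_swap(&last_cycle, lcycle, now);
    return now;
}

int main(void)
{
    assert(get_counter(100, 1) == 100);
    assert(get_counter(90, 0) == 100);  /* a stale hardware read is clamped */
    assert(get_counter(150, 0) == 150);
    return 0;
}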
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c7e36d4a70ca..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
308 struct workqueue_struct *wq; 308 struct workqueue_struct *wq;
309 struct task_struct *p; 309 struct task_struct *p;
310 310
311 wq = kmalloc(sizeof(*wq), GFP_KERNEL); 311 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
312 if (!wq) 312 if (!wq)
313 return NULL; 313 return NULL;
314 memset(wq, 0, sizeof(*wq));
315 314
316 wq->name = name; 315 wq->name = name;
317 /* We don't need the distraction of CPUs appearing and vanishing. */ 316 /* We don't need the distraction of CPUs appearing and vanishing. */
@@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
499 case CPU_UP_PREPARE: 498 case CPU_UP_PREPARE:
500 /* Create a new workqueue thread for it. */ 499 /* Create a new workqueue thread for it. */
501 list_for_each_entry(wq, &workqueues, list) { 500 list_for_each_entry(wq, &workqueues, list) {
502 if (create_workqueue_thread(wq, hotcpu) < 0) { 501 if (!create_workqueue_thread(wq, hotcpu)) {
503 printk("workqueue for %i failed\n", hotcpu); 502 printk("workqueue for %i failed\n", hotcpu);
504 return NOTIFY_BAD; 503 return NOTIFY_BAD;
505 } 504 }
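The final workqueue.c hunk fixes the hotplug callback's error check: create_workqueue_thread() hands back a pointer, so failure must be tested with !p; comparing the return value against 0 with "<" never triggers for a NULL result, so failures slipped through. A tiny sketch of the corrected test; the stub below is illustrative, not the kernel function:

#include <assert.h>
#include <stddef.h>

struct thread { int cpu; };

static struct thread *create_thread_stub(int cpu, int simulate_failure)
{
    static struct thread t;

    if (simulate_failure)
        return NULL;                /* pointer APIs report failure as NULL */
    t.cpu = cpu;
    return &t;
}

int main(void)
{
    struct thread *p;

    p = create_thread_stub(3, 1);
    assert(p == NULL);              /* the correct failure test, as in the fix */

    p = create_thread_stub(3, 0);
    assert(p != NULL && p->cpu == 3);
    return 0;
}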