path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile          |    2
-rw-r--r--  kernel/acct.c            |   45
-rw-r--r--  kernel/audit.c           |    3
-rw-r--r--  kernel/compat.c          |    9
-rw-r--r--  kernel/cpuset.c          |  226
-rw-r--r--  kernel/exit.c            |   26
-rw-r--r--  kernel/fork.c            |  104
-rw-r--r--  kernel/futex.c           |  137
-rw-r--r--  kernel/intermodule.c     |    3
-rw-r--r--  kernel/irq/handle.c      |    2
-rw-r--r--  kernel/irq/manage.c      |    4
-rw-r--r--  kernel/irq/proc.c        |   14
-rw-r--r--  kernel/kprobes.c         |   94
-rw-r--r--  kernel/module.c          |   44
-rw-r--r--  kernel/params.c          |    4
-rw-r--r--  kernel/posix-timers.c    |   28
-rw-r--r--  kernel/power/Kconfig     |   15
-rw-r--r--  kernel/power/disk.c      |   55
-rw-r--r--  kernel/power/main.c      |    5
-rw-r--r--  kernel/power/pm.c        |    3
-rw-r--r--  kernel/power/process.c   |   29
-rw-r--r--  kernel/power/swsusp.c    |  202
-rw-r--r--  kernel/printk.c          |   13
-rw-r--r--  kernel/ptrace.c          |   41
-rw-r--r--  kernel/rcupdate.c        |   14
-rw-r--r--  kernel/resource.c        |    3
-rw-r--r--  kernel/sched.c           |  617
-rw-r--r--  kernel/signal.c          |   86
-rw-r--r--  kernel/softirq.c         |    2
-rw-r--r--  kernel/softlockup.c      |  151
-rw-r--r--  kernel/spinlock.c        |   15
-rw-r--r--  kernel/sys.c             |    6
-rw-r--r--  kernel/sysctl.c          |    4
-rw-r--r--  kernel/timer.c           |   55
-rw-r--r--  kernel/workqueue.c       |    5
35 files changed, 1580 insertions(+), 486 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..ff4dc02ce170 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
12obj-$(CONFIG_FUTEX) += futex.o 12obj-$(CONFIG_FUTEX) += futex.o
13obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 13obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
14obj-$(CONFIG_SMP) += cpu.o spinlock.o 14obj-$(CONFIG_SMP) += cpu.o spinlock.o
15obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
15obj-$(CONFIG_UID16) += uid16.o 16obj-$(CONFIG_UID16) += uid16.o
16obj-$(CONFIG_MODULES) += module.o 17obj-$(CONFIG_MODULES) += module.o
17obj-$(CONFIG_KALLSYMS) += kallsyms.o 18obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o
27obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 28obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
28obj-$(CONFIG_KPROBES) += kprobes.o 29obj-$(CONFIG_KPROBES) += kprobes.o
29obj-$(CONFIG_SYSFS) += ksysfs.o 30obj-$(CONFIG_SYSFS) += ksysfs.o
31obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
30obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 32obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
31obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 33obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
32obj-$(CONFIG_SECCOMP) += seccomp.o 34obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..b756f527497e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out:
165} 165}
166 166
167/* 167/*
168 * Close the old accouting file (if currently open) and then replace 168 * Close the old accounting file (if currently open) and then replace
169 * it with file (if non-NULL). 169 * it with file (if non-NULL).
170 * 170 *
171 * NOTE: acct_globals.lock MUST be held on entry and exit. 171 * NOTE: acct_globals.lock MUST be held on entry and exit.
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file)
199 } 199 }
200} 200}
201 201
202/* 202/**
203 * sys_acct() is the only system call needed to implement process 203 * sys_acct - enable/disable process accounting
204 * accounting. It takes the name of the file where accounting records 204 * @name: file name for accounting records or NULL to shutdown accounting
205 * should be written. If the filename is NULL, accounting will be 205 *
206 * shutdown. 206 * Returns 0 for success or negative errno values for failure.
207 *
208 * sys_acct() is the only system call needed to implement process
209 * accounting. It takes the name of the file where accounting records
210 * should be written. If the filename is NULL, accounting will be
211 * shutdown.
207 */ 212 */
208asmlinkage long sys_acct(const char __user *name) 213asmlinkage long sys_acct(const char __user *name)
209{ 214{
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name)
220 return (PTR_ERR(tmp)); 225 return (PTR_ERR(tmp));
221 } 226 }
222 /* Difference from BSD - they don't do O_APPEND */ 227 /* Difference from BSD - they don't do O_APPEND */
223 file = filp_open(tmp, O_WRONLY|O_APPEND, 0); 228 file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
224 putname(tmp); 229 putname(tmp);
225 if (IS_ERR(file)) { 230 if (IS_ERR(file)) {
226 return (PTR_ERR(file)); 231 return (PTR_ERR(file));
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name)
250 return (0); 255 return (0);
251} 256}
252 257
253/* 258/**
254 * If the accouting is turned on for a file in the filesystem pointed 259 * acct_auto_close - turn off a filesystem's accounting if it is on
255 * to by sb, turn accouting off. 260 * @sb: super block for the filesystem
261 *
262 * If the accounting is turned on for a file in the filesystem pointed
263 * to by sb, turn accounting off.
256 */ 264 */
257void acct_auto_close(struct super_block *sb) 265void acct_auto_close(struct super_block *sb)
258{ 266{
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file)
503 set_fs(fs); 511 set_fs(fs);
504} 512}
505 513
506/* 514/**
507 * acct_process - now just a wrapper around do_acct_process 515 * acct_process - now just a wrapper around do_acct_process
516 * @exitcode: task exit code
517 *
518 * handles process accounting for an exiting task
508 */ 519 */
509void acct_process(long exitcode) 520void acct_process(long exitcode)
510{ 521{
@@ -530,9 +541,9 @@ void acct_process(long exitcode)
530} 541}
531 542
532 543
533/* 544/**
534 * acct_update_integrals 545 * acct_update_integrals - update mm integral fields in task_struct
535 * - update mm integral fields in task_struct 546 * @tsk: task_struct for accounting
536 */ 547 */
537void acct_update_integrals(struct task_struct *tsk) 548void acct_update_integrals(struct task_struct *tsk)
538{ 549{
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk)
547 } 558 }
548} 559}
549 560
550/* 561/**
551 * acct_clear_integrals 562 * acct_clear_integrals - clear the mm integral fields in task_struct
552 * - clear the mm integral fields in task_struct 563 * @tsk: task_struct whose accounting fields are cleared
553 */ 564 */
554void acct_clear_integrals(struct task_struct *tsk) 565void acct_clear_integrals(struct task_struct *tsk)
555{ 566{
diff --git a/kernel/audit.c b/kernel/audit.c
index 8376ec10cf24..83096b67510a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -513,7 +513,8 @@ static int __init audit_init(void)
513{ 513{
514 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 514 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
515 audit_default ? "enabled" : "disabled"); 515 audit_default ? "enabled" : "disabled");
516 audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); 516 audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
517 THIS_MODULE);
517 if (!audit_sock) 518 if (!audit_sock)
518 audit_panic("cannot initialize netlink socket"); 519 audit_panic("cannot initialize netlink socket");
519 520
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
48 if (!time_after(expire, now)) 48 if (!time_after(expire, now))
49 return 0; 49 return 0;
50 50
51 current->state = TASK_INTERRUPTIBLE; 51 expire = schedule_timeout_interruptible(expire - now);
52 expire = schedule_timeout(expire - now);
53 if (expire == 0) 52 if (expire == 0)
54 return 0; 53 return 0;
55 54
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp,
82 return -EINVAL; 81 return -EINVAL;
83 82
84 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 83 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
85 current->state = TASK_INTERRUPTIBLE; 84 expire = schedule_timeout_interruptible(expire);
86 expire = schedule_timeout(expire);
87 if (expire == 0) 85 if (expire == 0)
88 return 0; 86 return 0;
89 87
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
795 recalc_sigpending(); 793 recalc_sigpending();
796 spin_unlock_irq(&current->sighand->siglock); 794 spin_unlock_irq(&current->sighand->siglock);
797 795
798 current->state = TASK_INTERRUPTIBLE; 796 timeout = schedule_timeout_interruptible(timeout);
799 timeout = schedule_timeout(timeout);
800 797
801 spin_lock_irq(&current->sighand->siglock); 798 spin_lock_irq(&current->sighand->siglock);
802 sig = dequeue_signal(current, &s, &info); 799 sig = dequeue_signal(current, &s, &info);
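The three kernel/compat.c hunks above all replace the open-coded "set task state, then schedule_timeout()" pair with the new schedule_timeout_interruptible() helper (added to kernel/timer.c in this same series). A minimal sketch of what the helper amounts to, assuming the obvious implementation rather than quoting the real kernel/timer.c body:

    /* Sketch: roughly what schedule_timeout_interruptible() does.
     * It replaces the removed two-line pattern:
     *     current->state = TASK_INTERRUPTIBLE;
     *     expire = schedule_timeout(expire - now);
     */
    signed long schedule_timeout_interruptible_sketch(signed long timeout)
    {
            __set_current_state(TASK_INTERRUPTIBLE);   /* mark task sleepable */
            return schedule_timeout(timeout);          /* remaining jiffies   */
    }

Folding the state change into the helper keeps the state assignment and the sleep from drifting apart at the call sites.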
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..79866bc6b3a1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL;
180 */ 180 */
181 181
182static DECLARE_MUTEX(cpuset_sem); 182static DECLARE_MUTEX(cpuset_sem);
183static struct task_struct *cpuset_sem_owner;
184static int cpuset_sem_depth;
185
186/*
187 * The global cpuset semaphore cpuset_sem can be needed by the
188 * memory allocator to update a tasks mems_allowed (see the calls
189 * to cpuset_update_current_mems_allowed()) or to walk up the
190 * cpuset hierarchy to find a mem_exclusive cpuset see the calls
191 * to cpuset_excl_nodes_overlap()).
192 *
193 * But if the memory allocation is being done by cpuset.c code, it
194 * usually already holds cpuset_sem. Double tripping on a kernel
195 * semaphore deadlocks the current task, and any other task that
196 * subsequently tries to obtain the lock.
197 *
198 * Run all up's and down's on cpuset_sem through the following
199 * wrappers, which will detect this nested locking, and avoid
200 * deadlocking.
201 */
202
203static inline void cpuset_down(struct semaphore *psem)
204{
205 if (cpuset_sem_owner != current) {
206 down(psem);
207 cpuset_sem_owner = current;
208 }
209 cpuset_sem_depth++;
210}
211
212static inline void cpuset_up(struct semaphore *psem)
213{
214 if (--cpuset_sem_depth == 0) {
215 cpuset_sem_owner = NULL;
216 up(psem);
217 }
218}
183 219
184/* 220/*
185 * A couple of forward declarations required, due to cyclic reference loop: 221 * A couple of forward declarations required, due to cyclic reference loop:
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
522 * Refresh current tasks mems_allowed and mems_generation from 558 * Refresh current tasks mems_allowed and mems_generation from
523 * current tasks cpuset. Call with cpuset_sem held. 559 * current tasks cpuset. Call with cpuset_sem held.
524 * 560 *
525 * Be sure to call refresh_mems() on any cpuset operation which 561 * This routine is needed to update the per-task mems_allowed
526 * (1) holds cpuset_sem, and (2) might possibly alloc memory. 562 * data, within the tasks context, when it is trying to allocate
527 * Call after obtaining cpuset_sem lock, before any possible 563 * memory (in various mm/mempolicy.c routines) and notices
528 * allocation. Otherwise one risks trying to allocate memory 564 * that some other task has been modifying its cpuset.
529 * while the task cpuset_mems_generation is not the same as
530 * the mems_generation in its cpuset, which would deadlock on
531 * cpuset_sem in cpuset_update_current_mems_allowed().
532 *
533 * Since we hold cpuset_sem, once refresh_mems() is called, the
534 * test (current->cpuset_mems_generation != cs->mems_generation)
535 * in cpuset_update_current_mems_allowed() will remain false,
536 * until we drop cpuset_sem. Anyone else who would change our
537 * cpusets mems_generation needs to lock cpuset_sem first.
538 */ 565 */
539 566
540static void refresh_mems(void) 567static void refresh_mems(void)
@@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
628 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 655 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
629 */ 656 */
630 657
631/*
632 * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
633 * Disable letting 'cpu_exclusive' cpusets define dynamic sched
634 * domains, until the sched domain can handle partial nodes.
635 * Remove this #if hackery when sched domains fixed.
636 */
637#if 0
638static void update_cpu_domains(struct cpuset *cur) 658static void update_cpu_domains(struct cpuset *cur)
639{ 659{
640 struct cpuset *c, *par = cur->parent; 660 struct cpuset *c, *par = cur->parent;
@@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur)
675 partition_sched_domains(&pspan, &cspan); 695 partition_sched_domains(&pspan, &cspan);
676 unlock_cpu_hotplug(); 696 unlock_cpu_hotplug();
677} 697}
678#else
679static void update_cpu_domains(struct cpuset *cur)
680{
681}
682#endif
683 698
684static int update_cpumask(struct cpuset *cs, char *buf) 699static int update_cpumask(struct cpuset *cs, char *buf)
685{ 700{
@@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
852 } 867 }
853 buffer[nbytes] = 0; /* nul-terminate */ 868 buffer[nbytes] = 0; /* nul-terminate */
854 869
855 down(&cpuset_sem); 870 cpuset_down(&cpuset_sem);
856 871
857 if (is_removed(cs)) { 872 if (is_removed(cs)) {
858 retval = -ENODEV; 873 retval = -ENODEV;
@@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
886 if (retval == 0) 901 if (retval == 0)
887 retval = nbytes; 902 retval = nbytes;
888out2: 903out2:
889 up(&cpuset_sem); 904 cpuset_up(&cpuset_sem);
890 cpuset_release_agent(pathbuf); 905 cpuset_release_agent(pathbuf);
891out1: 906out1:
892 kfree(buffer); 907 kfree(buffer);
@@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
926{ 941{
927 cpumask_t mask; 942 cpumask_t mask;
928 943
929 down(&cpuset_sem); 944 cpuset_down(&cpuset_sem);
930 mask = cs->cpus_allowed; 945 mask = cs->cpus_allowed;
931 up(&cpuset_sem); 946 cpuset_up(&cpuset_sem);
932 947
933 return cpulist_scnprintf(page, PAGE_SIZE, mask); 948 return cpulist_scnprintf(page, PAGE_SIZE, mask);
934} 949}
@@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
937{ 952{
938 nodemask_t mask; 953 nodemask_t mask;
939 954
940 down(&cpuset_sem); 955 cpuset_down(&cpuset_sem);
941 mask = cs->mems_allowed; 956 mask = cs->mems_allowed;
942 up(&cpuset_sem); 957 cpuset_up(&cpuset_sem);
943 958
944 return nodelist_scnprintf(page, PAGE_SIZE, mask); 959 return nodelist_scnprintf(page, PAGE_SIZE, mask);
945} 960}
@@ -984,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
984 *s++ = '\n'; 999 *s++ = '\n';
985 *s = '\0'; 1000 *s = '\0';
986 1001
1002 /* Do nothing if *ppos is at the eof or beyond the eof. */
1003 if (s - page <= *ppos)
1004 return 0;
1005
987 start = page + *ppos; 1006 start = page + *ppos;
988 n = s - start; 1007 n = s - start;
989 retval = n - copy_to_user(buf, start, min(n, nbytes)); 1008 retval = n - copy_to_user(buf, start, min(n, nbytes));
@@ -1342,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1342 if (!cs) 1361 if (!cs)
1343 return -ENOMEM; 1362 return -ENOMEM;
1344 1363
1345 down(&cpuset_sem); 1364 cpuset_down(&cpuset_sem);
1346 refresh_mems();
1347 cs->flags = 0; 1365 cs->flags = 0;
1348 if (notify_on_release(parent)) 1366 if (notify_on_release(parent))
1349 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1367 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1368,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1368 * will down() this new directory's i_sem and if we race with 1386 * will down() this new directory's i_sem and if we race with
1369 * another mkdir, we might deadlock. 1387 * another mkdir, we might deadlock.
1370 */ 1388 */
1371 up(&cpuset_sem); 1389 cpuset_up(&cpuset_sem);
1372 1390
1373 err = cpuset_populate_dir(cs->dentry); 1391 err = cpuset_populate_dir(cs->dentry);
1374 /* If err < 0, we have a half-filled directory - oh well ;) */ 1392 /* If err < 0, we have a half-filled directory - oh well ;) */
1375 return 0; 1393 return 0;
1376err: 1394err:
1377 list_del(&cs->sibling); 1395 list_del(&cs->sibling);
1378 up(&cpuset_sem); 1396 cpuset_up(&cpuset_sem);
1379 kfree(cs); 1397 kfree(cs);
1380 return err; 1398 return err;
1381} 1399}
@@ -1397,14 +1415,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1397 1415
1398 /* the vfs holds both inode->i_sem already */ 1416 /* the vfs holds both inode->i_sem already */
1399 1417
1400 down(&cpuset_sem); 1418 cpuset_down(&cpuset_sem);
1401 refresh_mems();
1402 if (atomic_read(&cs->count) > 0) { 1419 if (atomic_read(&cs->count) > 0) {
1403 up(&cpuset_sem); 1420 cpuset_up(&cpuset_sem);
1404 return -EBUSY; 1421 return -EBUSY;
1405 } 1422 }
1406 if (!list_empty(&cs->children)) { 1423 if (!list_empty(&cs->children)) {
1407 up(&cpuset_sem); 1424 cpuset_up(&cpuset_sem);
1408 return -EBUSY; 1425 return -EBUSY;
1409 } 1426 }
1410 parent = cs->parent; 1427 parent = cs->parent;
@@ -1420,7 +1437,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1420 spin_unlock(&d->d_lock); 1437 spin_unlock(&d->d_lock);
1421 cpuset_d_remove_dir(d); 1438 cpuset_d_remove_dir(d);
1422 dput(d); 1439 dput(d);
1423 up(&cpuset_sem); 1440 cpuset_up(&cpuset_sem);
1424 cpuset_release_agent(pathbuf); 1441 cpuset_release_agent(pathbuf);
1425 return 0; 1442 return 0;
1426} 1443}
@@ -1523,10 +1540,10 @@ void cpuset_exit(struct task_struct *tsk)
1523 if (notify_on_release(cs)) { 1540 if (notify_on_release(cs)) {
1524 char *pathbuf = NULL; 1541 char *pathbuf = NULL;
1525 1542
1526 down(&cpuset_sem); 1543 cpuset_down(&cpuset_sem);
1527 if (atomic_dec_and_test(&cs->count)) 1544 if (atomic_dec_and_test(&cs->count))
1528 check_for_release(cs, &pathbuf); 1545 check_for_release(cs, &pathbuf);
1529 up(&cpuset_sem); 1546 cpuset_up(&cpuset_sem);
1530 cpuset_release_agent(pathbuf); 1547 cpuset_release_agent(pathbuf);
1531 } else { 1548 } else {
1532 atomic_dec(&cs->count); 1549 atomic_dec(&cs->count);
@@ -1547,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
1547{ 1564{
1548 cpumask_t mask; 1565 cpumask_t mask;
1549 1566
1550 down(&cpuset_sem); 1567 cpuset_down(&cpuset_sem);
1551 task_lock((struct task_struct *)tsk); 1568 task_lock((struct task_struct *)tsk);
1552 guarantee_online_cpus(tsk->cpuset, &mask); 1569 guarantee_online_cpus(tsk->cpuset, &mask);
1553 task_unlock((struct task_struct *)tsk); 1570 task_unlock((struct task_struct *)tsk);
1554 up(&cpuset_sem); 1571 cpuset_up(&cpuset_sem);
1555 1572
1556 return mask; 1573 return mask;
1557} 1574}
@@ -1576,9 +1593,9 @@ void cpuset_update_current_mems_allowed(void)
1576 if (!cs) 1593 if (!cs)
1577 return; /* task is exiting */ 1594 return; /* task is exiting */
1578 if (current->cpuset_mems_generation != cs->mems_generation) { 1595 if (current->cpuset_mems_generation != cs->mems_generation) {
1579 down(&cpuset_sem); 1596 cpuset_down(&cpuset_sem);
1580 refresh_mems(); 1597 refresh_mems();
1581 up(&cpuset_sem); 1598 cpuset_up(&cpuset_sem);
1582 } 1599 }
1583} 1600}
1584 1601
@@ -1611,17 +1628,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
1611 return 0; 1628 return 0;
1612} 1629}
1613 1630
1631/*
1632 * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
1633 * ancestor to the specified cpuset. Call while holding cpuset_sem.
1634 * If no ancestor is mem_exclusive (an unusual configuration), then
1635 * returns the root cpuset.
1636 */
1637static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1638{
1639 while (!is_mem_exclusive(cs) && cs->parent)
1640 cs = cs->parent;
1641 return cs;
1642}
1643
1614/** 1644/**
1615 * cpuset_zone_allowed - is zone z allowed in current->mems_allowed 1645 * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
1616 * @z: zone in question 1646 * @z: is this zone on an allowed node?
1647 * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
1617 * 1648 *
1618 * Is zone z allowed in current->mems_allowed, or is 1649 * If we're in interrupt, yes, we can always allocate. If zone
1619 * the CPU in interrupt context? (zone is always allowed in this case) 1650 * z's node is in our tasks mems_allowed, yes. If it's not a
1620 */ 1651 * __GFP_HARDWALL request and this zone's nodes is in the nearest
1621int cpuset_zone_allowed(struct zone *z) 1652 * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
1653 * Otherwise, no.
1654 *
1655 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
1656 * and do not allow allocations outside the current tasks cpuset.
1657 * GFP_KERNEL allocations are not so marked, so can escape to the
1658 * nearest mem_exclusive ancestor cpuset.
1659 *
1660 * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages()
1661 * routine only calls here with __GFP_HARDWALL bit _not_ set if
1662 * it's a GFP_KERNEL allocation, and all nodes in the current tasks
1663 * mems_allowed came up empty on the first pass over the zonelist.
1664 * So only GFP_KERNEL allocations, if all nodes in the cpuset are
1665 * short of memory, might require taking the cpuset_sem semaphore.
1666 *
1667 * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
1668 * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
1669 * hardwall cpusets - no allocation on a node outside the cpuset is
1670 * allowed (unless in interrupt, of course).
1671 *
1672 * The second loop doesn't even call here for GFP_ATOMIC requests
1673 * (if the __alloc_pages() local variable 'wait' is set). That check
1674 * and the checks below have the combined affect in the second loop of
1675 * the __alloc_pages() routine that:
1676 * in_interrupt - any node ok (current task context irrelevant)
1677 * GFP_ATOMIC - any node ok
1678 * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
1679 * GFP_USER - only nodes in current tasks mems allowed ok.
1680 **/
1681
1682int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
1683{
1684 int node; /* node that zone z is on */
1685 const struct cpuset *cs; /* current cpuset ancestors */
1686 int allowed = 1; /* is allocation in zone z allowed? */
1687
1688 if (in_interrupt())
1689 return 1;
1690 node = z->zone_pgdat->node_id;
1691 if (node_isset(node, current->mems_allowed))
1692 return 1;
1693 if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
1694 return 0;
1695
1696 /* Not hardwall and node outside mems_allowed: scan up cpusets */
1697 cpuset_down(&cpuset_sem);
1698 cs = current->cpuset;
1699 if (!cs)
1700 goto done; /* current task exiting */
1701 cs = nearest_exclusive_ancestor(cs);
1702 allowed = node_isset(node, cs->mems_allowed);
1703done:
1704 cpuset_up(&cpuset_sem);
1705 return allowed;
1706}
1707
1708/**
1709 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1710 * @p: pointer to task_struct of some other task.
1711 *
1712 * Description: Return true if the nearest mem_exclusive ancestor
1713 * cpusets of tasks @p and current overlap. Used by oom killer to
1714 * determine if task @p's memory usage might impact the memory
1715 * available to the current task.
1716 *
1717 * Acquires cpuset_sem - not suitable for calling from a fast path.
1718 **/
1719
1720int cpuset_excl_nodes_overlap(const struct task_struct *p)
1622{ 1721{
1623 return in_interrupt() || 1722 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1624 node_isset(z->zone_pgdat->node_id, current->mems_allowed); 1723 int overlap = 0; /* do cpusets overlap? */
1724
1725 cpuset_down(&cpuset_sem);
1726 cs1 = current->cpuset;
1727 if (!cs1)
1728 goto done; /* current task exiting */
1729 cs2 = p->cpuset;
1730 if (!cs2)
1731 goto done; /* task p is exiting */
1732 cs1 = nearest_exclusive_ancestor(cs1);
1733 cs2 = nearest_exclusive_ancestor(cs2);
1734 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1735done:
1736 cpuset_up(&cpuset_sem);
1737
1738 return overlap;
1625} 1739}
1626 1740
1627/* 1741/*
@@ -1642,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1642 return -ENOMEM; 1756 return -ENOMEM;
1643 1757
1644 tsk = m->private; 1758 tsk = m->private;
1645 down(&cpuset_sem); 1759 cpuset_down(&cpuset_sem);
1646 task_lock(tsk); 1760 task_lock(tsk);
1647 cs = tsk->cpuset; 1761 cs = tsk->cpuset;
1648 task_unlock(tsk); 1762 task_unlock(tsk);
@@ -1657,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
1657 seq_puts(m, buf); 1771 seq_puts(m, buf);
1658 seq_putc(m, '\n'); 1772 seq_putc(m, '\n');
1659out: 1773out:
1660 up(&cpuset_sem); 1774 cpuset_up(&cpuset_sem);
1661 kfree(buf); 1775 kfree(buf);
1662 return retval; 1776 return retval;
1663} 1777}
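The cpuset_down()/cpuset_up() wrappers introduced above exist because cpuset code that already holds cpuset_sem can wander back into the page allocator, which in turn calls cpuset_zone_allowed() and would otherwise do a second down() on the same semaphore and deadlock. A hedged sketch of the nesting the wrappers tolerate; the outer function here is hypothetical, only the wrappers come from the patch:

    /* Illustrative caller: shows the re-entry the depth counter absorbs. */
    static void cpuset_config_change_sketch(void)
    {
            cpuset_down(&cpuset_sem);       /* depth 1, owner = current       */

            /* A GFP_KERNEL allocation made while holding the lock may fall
             * back into cpuset_zone_allowed(), which also takes it:          */
            cpuset_down(&cpuset_sem);       /* same owner: depth 2, no down() */
            cpuset_up(&cpuset_sem);         /* depth 1, semaphore still held  */

            cpuset_up(&cpuset_sem);         /* depth 0: real up(), owner NULL */
    }

Note that this only protects against self-deadlock by the owning task; any other task still blocks in down() until the depth count returns to zero.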
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b0fb9f09f21..6d2089a1bce7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize);
368static inline void close_files(struct files_struct * files) 368static inline void close_files(struct files_struct * files)
369{ 369{
370 int i, j; 370 int i, j;
371 struct fdtable *fdt;
371 372
372 j = 0; 373 j = 0;
374 fdt = files_fdtable(files);
373 for (;;) { 375 for (;;) {
374 unsigned long set; 376 unsigned long set;
375 i = j * __NFDBITS; 377 i = j * __NFDBITS;
376 if (i >= files->max_fdset || i >= files->max_fds) 378 if (i >= fdt->max_fdset || i >= fdt->max_fds)
377 break; 379 break;
378 set = files->open_fds->fds_bits[j++]; 380 set = fdt->open_fds->fds_bits[j++];
379 while (set) { 381 while (set) {
380 if (set & 1) { 382 if (set & 1) {
381 struct file * file = xchg(&files->fd[i], NULL); 383 struct file * file = xchg(&fdt->fd[i], NULL);
382 if (file) 384 if (file)
383 filp_close(file, files); 385 filp_close(file, files);
384 } 386 }
@@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task)
403 405
404void fastcall put_files_struct(struct files_struct *files) 406void fastcall put_files_struct(struct files_struct *files)
405{ 407{
408 struct fdtable *fdt;
409
406 if (atomic_dec_and_test(&files->count)) { 410 if (atomic_dec_and_test(&files->count)) {
407 close_files(files); 411 close_files(files);
408 /* 412 /*
409 * Free the fd and fdset arrays if we expanded them. 413 * Free the fd and fdset arrays if we expanded them.
414 * If the fdtable was embedded, pass files for freeing
415 * at the end of the RCU grace period. Otherwise,
416 * you can free files immediately.
410 */ 417 */
411 if (files->fd != &files->fd_array[0]) 418 fdt = files_fdtable(files);
412 free_fd_array(files->fd, files->max_fds); 419 if (fdt == &files->fdtab)
413 if (files->max_fdset > __FD_SETSIZE) { 420 fdt->free_files = files;
414 free_fdset(files->open_fds, files->max_fdset); 421 else
415 free_fdset(files->close_on_exec, files->max_fdset); 422 kmem_cache_free(files_cachep, files);
416 } 423 free_fdtable(fdt);
417 kmem_cache_free(files_cachep, files);
418 } 424 }
419} 425}
420 426
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74e..8149f3602881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/jiffies.h> 36#include <linux/jiffies.h>
37#include <linux/futex.h> 37#include <linux/futex.h>
38#include <linux/rcupdate.h>
38#include <linux/ptrace.h> 39#include <linux/ptrace.h>
39#include <linux/mount.h> 40#include <linux/mount.h>
40#include <linux/audit.h> 41#include <linux/audit.h>
@@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
176 177
177 /* One for us, one for whoever does the "release_task()" (usually parent) */ 178 /* One for us, one for whoever does the "release_task()" (usually parent) */
178 atomic_set(&tsk->usage,2); 179 atomic_set(&tsk->usage,2);
180 atomic_set(&tsk->fs_excl, 0);
179 return tsk; 181 return tsk;
180} 182}
181 183
@@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
564 return 0; 566 return 0;
565} 567}
566 568
567static int count_open_files(struct files_struct *files, int size) 569static int count_open_files(struct fdtable *fdt)
568{ 570{
571 int size = fdt->max_fdset;
569 int i; 572 int i;
570 573
571 /* Find the last open fd */ 574 /* Find the last open fd */
572 for (i = size/(8*sizeof(long)); i > 0; ) { 575 for (i = size/(8*sizeof(long)); i > 0; ) {
573 if (files->open_fds->fds_bits[--i]) 576 if (fdt->open_fds->fds_bits[--i])
574 break; 577 break;
575 } 578 }
576 i = (i+1) * 8 * sizeof(long); 579 i = (i+1) * 8 * sizeof(long);
577 return i; 580 return i;
578} 581}
579 582
583static struct files_struct *alloc_files(void)
584{
585 struct files_struct *newf;
586 struct fdtable *fdt;
587
588 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
589 if (!newf)
590 goto out;
591
592 atomic_set(&newf->count, 1);
593
594 spin_lock_init(&newf->file_lock);
595 fdt = &newf->fdtab;
596 fdt->next_fd = 0;
597 fdt->max_fds = NR_OPEN_DEFAULT;
598 fdt->max_fdset = __FD_SETSIZE;
599 fdt->close_on_exec = &newf->close_on_exec_init;
600 fdt->open_fds = &newf->open_fds_init;
601 fdt->fd = &newf->fd_array[0];
602 INIT_RCU_HEAD(&fdt->rcu);
603 fdt->free_files = NULL;
604 fdt->next = NULL;
605 rcu_assign_pointer(newf->fdt, fdt);
606out:
607 return newf;
608}
609
580static int copy_files(unsigned long clone_flags, struct task_struct * tsk) 610static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
581{ 611{
582 struct files_struct *oldf, *newf; 612 struct files_struct *oldf, *newf;
583 struct file **old_fds, **new_fds; 613 struct file **old_fds, **new_fds;
584 int open_files, size, i, error = 0, expand; 614 int open_files, size, i, error = 0, expand;
615 struct fdtable *old_fdt, *new_fdt;
585 616
586 /* 617 /*
587 * A background process may not have any files ... 618 * A background process may not have any files ...
@@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
602 */ 633 */
603 tsk->files = NULL; 634 tsk->files = NULL;
604 error = -ENOMEM; 635 error = -ENOMEM;
605 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); 636 newf = alloc_files();
606 if (!newf) 637 if (!newf)
607 goto out; 638 goto out;
608 639
609 atomic_set(&newf->count, 1);
610
611 spin_lock_init(&newf->file_lock);
612 newf->next_fd = 0;
613 newf->max_fds = NR_OPEN_DEFAULT;
614 newf->max_fdset = __FD_SETSIZE;
615 newf->close_on_exec = &newf->close_on_exec_init;
616 newf->open_fds = &newf->open_fds_init;
617 newf->fd = &newf->fd_array[0];
618
619 spin_lock(&oldf->file_lock); 640 spin_lock(&oldf->file_lock);
620 641 old_fdt = files_fdtable(oldf);
621 open_files = count_open_files(oldf, oldf->max_fdset); 642 new_fdt = files_fdtable(newf);
643 size = old_fdt->max_fdset;
644 open_files = count_open_files(old_fdt);
622 expand = 0; 645 expand = 0;
623 646
624 /* 647 /*
625 * Check whether we need to allocate a larger fd array or fd set. 648 * Check whether we need to allocate a larger fd array or fd set.
626 * Note: we're not a clone task, so the open count won't change. 649 * Note: we're not a clone task, so the open count won't change.
627 */ 650 */
628 if (open_files > newf->max_fdset) { 651 if (open_files > new_fdt->max_fdset) {
629 newf->max_fdset = 0; 652 new_fdt->max_fdset = 0;
630 expand = 1; 653 expand = 1;
631 } 654 }
632 if (open_files > newf->max_fds) { 655 if (open_files > new_fdt->max_fds) {
633 newf->max_fds = 0; 656 new_fdt->max_fds = 0;
634 expand = 1; 657 expand = 1;
635 } 658 }
636 659
@@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
642 spin_unlock(&newf->file_lock); 665 spin_unlock(&newf->file_lock);
643 if (error < 0) 666 if (error < 0)
644 goto out_release; 667 goto out_release;
668 new_fdt = files_fdtable(newf);
669 /*
670 * Reacquire the oldf lock and a pointer to its fd table
671 * who knows it may have a new bigger fd table. We need
672 * the latest pointer.
673 */
645 spin_lock(&oldf->file_lock); 674 spin_lock(&oldf->file_lock);
675 old_fdt = files_fdtable(oldf);
646 } 676 }
647 677
648 old_fds = oldf->fd; 678 old_fds = old_fdt->fd;
649 new_fds = newf->fd; 679 new_fds = new_fdt->fd;
650 680
651 memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); 681 memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
652 memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); 682 memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
653 683
654 for (i = open_files; i != 0; i--) { 684 for (i = open_files; i != 0; i--) {
655 struct file *f = *old_fds++; 685 struct file *f = *old_fds++;
@@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
662 * is partway through open(). So make sure that this 692 * is partway through open(). So make sure that this
663 * fd is available to the new process. 693 * fd is available to the new process.
664 */ 694 */
665 FD_CLR(open_files - i, newf->open_fds); 695 FD_CLR(open_files - i, new_fdt->open_fds);
666 } 696 }
667 *new_fds++ = f; 697 rcu_assign_pointer(*new_fds++, f);
668 } 698 }
669 spin_unlock(&oldf->file_lock); 699 spin_unlock(&oldf->file_lock);
670 700
671 /* compute the remainder to be cleared */ 701 /* compute the remainder to be cleared */
672 size = (newf->max_fds - open_files) * sizeof(struct file *); 702 size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
673 703
674 /* This is long word aligned thus could use a optimized version */ 704 /* This is long word aligned thus could use a optimized version */
675 memset(new_fds, 0, size); 705 memset(new_fds, 0, size);
676 706
677 if (newf->max_fdset > open_files) { 707 if (new_fdt->max_fdset > open_files) {
678 int left = (newf->max_fdset-open_files)/8; 708 int left = (new_fdt->max_fdset-open_files)/8;
679 int start = open_files / (8 * sizeof(unsigned long)); 709 int start = open_files / (8 * sizeof(unsigned long));
680 710
681 memset(&newf->open_fds->fds_bits[start], 0, left); 711 memset(&new_fdt->open_fds->fds_bits[start], 0, left);
682 memset(&newf->close_on_exec->fds_bits[start], 0, left); 712 memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
683 } 713 }
684 714
685 tsk->files = newf; 715 tsk->files = newf;
@@ -688,9 +718,9 @@ out:
688 return error; 718 return error;
689 719
690out_release: 720out_release:
691 free_fdset (newf->close_on_exec, newf->max_fdset); 721 free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
692 free_fdset (newf->open_fds, newf->max_fdset); 722 free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
693 free_fd_array(newf->fd, newf->max_fds); 723 free_fd_array(new_fdt->fd, new_fdt->max_fds);
694 kmem_cache_free(files_cachep, newf); 724 kmem_cache_free(files_cachep, newf);
695 goto out; 725 goto out;
696} 726}
@@ -994,6 +1024,9 @@ static task_t *copy_process(unsigned long clone_flags,
994 * of CLONE_PTRACE. 1024 * of CLONE_PTRACE.
995 */ 1025 */
996 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1026 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1027#ifdef TIF_SYSCALL_EMU
1028 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1029#endif
997 1030
998 /* Our parent execution domain becomes current domain 1031 /* Our parent execution domain becomes current domain
999 These must match for thread signalling to apply */ 1032 These must match for thread signalling to apply */
@@ -1112,6 +1145,9 @@ static task_t *copy_process(unsigned long clone_flags,
1112 __get_cpu_var(process_counts)++; 1145 __get_cpu_var(process_counts)++;
1113 } 1146 }
1114 1147
1148 if (!current->signal->tty && p->signal->tty)
1149 p->signal->tty = NULL;
1150
1115 nr_threads++; 1151 nr_threads++;
1116 total_forks++; 1152 total_forks++;
1117 write_unlock_irq(&tasklist_lock); 1153 write_unlock_irq(&tasklist_lock);
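Both the kernel/exit.c and kernel/fork.c changes above are part of moving the fd arrays behind a separate struct fdtable: writers build a complete table and publish it with rcu_assign_pointer(), readers fetch it through files_fdtable(), and the old table (or the embedded files_struct itself) is freed only after an RCU grace period. A rough, illustrative sketch of the read side this enables; the function name is made up and the real lookup helpers live in the fs headers:

    /* Sketch: lock-free check of an fd slot against the published fdtable. */
    static int fd_is_open_sketch(struct files_struct *files, unsigned int fd)
    {
            struct fdtable *fdt;
            int open = 0;

            rcu_read_lock();
            fdt = files_fdtable(files);      /* dereferences files->fdt      */
            if (fd < fdt->max_fds && rcu_dereference(fdt->fd[fd]))
                    open = 1;
            rcu_read_unlock();               /* fdt is not used past here    */
            return open;
    }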
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/syscalls.h> 41#include <linux/syscalls.h>
42#include <linux/signal.h> 42#include <linux/signal.h>
43#include <asm/futex.h>
43 44
44#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 45#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
45 46
@@ -327,6 +328,118 @@ out:
327} 328}
328 329
329/* 330/*
331 * Wake up all waiters hashed on the physical page that is mapped
332 * to this virtual address:
333 */
334static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op)
335{
336 union futex_key key1, key2;
337 struct futex_hash_bucket *bh1, *bh2;
338 struct list_head *head;
339 struct futex_q *this, *next;
340 int ret, op_ret, attempt = 0;
341
342retryfull:
343 down_read(&current->mm->mmap_sem);
344
345 ret = get_futex_key(uaddr1, &key1);
346 if (unlikely(ret != 0))
347 goto out;
348 ret = get_futex_key(uaddr2, &key2);
349 if (unlikely(ret != 0))
350 goto out;
351
352 bh1 = hash_futex(&key1);
353 bh2 = hash_futex(&key2);
354
355retry:
356 if (bh1 < bh2)
357 spin_lock(&bh1->lock);
358 spin_lock(&bh2->lock);
359 if (bh1 > bh2)
360 spin_lock(&bh1->lock);
361
362 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2);
363 if (unlikely(op_ret < 0)) {
364 int dummy;
365
366 spin_unlock(&bh1->lock);
367 if (bh1 != bh2)
368 spin_unlock(&bh2->lock);
369
370 /* futex_atomic_op_inuser needs to both read and write
371 * *(int __user *)uaddr2, but we can't modify it
372 * non-atomically. Therefore, if get_user below is not
373 * enough, we need to handle the fault ourselves, while
374 * still holding the mmap_sem. */
375 if (attempt++) {
376 struct vm_area_struct * vma;
377 struct mm_struct *mm = current->mm;
378
379 ret = -EFAULT;
380 if (attempt >= 2 ||
381 !(vma = find_vma(mm, uaddr2)) ||
382 vma->vm_start > uaddr2 ||
383 !(vma->vm_flags & VM_WRITE))
384 goto out;
385
386 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
387 case VM_FAULT_MINOR:
388 current->min_flt++;
389 break;
390 case VM_FAULT_MAJOR:
391 current->maj_flt++;
392 break;
393 default:
394 goto out;
395 }
396 goto retry;
397 }
398
399 /* If we would have faulted, release mmap_sem,
400 * fault it in and start all over again. */
401 up_read(&current->mm->mmap_sem);
402
403 ret = get_user(dummy, (int __user *)uaddr2);
404 if (ret)
405 return ret;
406
407 goto retryfull;
408 }
409
410 head = &bh1->chain;
411
412 list_for_each_entry_safe(this, next, head, list) {
413 if (match_futex (&this->key, &key1)) {
414 wake_futex(this);
415 if (++ret >= nr_wake)
416 break;
417 }
418 }
419
420 if (op_ret > 0) {
421 head = &bh2->chain;
422
423 op_ret = 0;
424 list_for_each_entry_safe(this, next, head, list) {
425 if (match_futex (&this->key, &key2)) {
426 wake_futex(this);
427 if (++op_ret >= nr_wake2)
428 break;
429 }
430 }
431 ret += op_ret;
432 }
433
434 spin_unlock(&bh1->lock);
435 if (bh1 != bh2)
436 spin_unlock(&bh2->lock);
437out:
438 up_read(&current->mm->mmap_sem);
439 return ret;
440}
441
442/*
330 * Requeue all waiters hashed on one physical page to another 443 * Requeue all waiters hashed on one physical page to another
331 * physical page. 444 * physical page.
332 */ 445 */
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal)
673 filp->f_mapping = filp->f_dentry->d_inode->i_mapping; 786 filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
674 787
675 if (signal) { 788 if (signal) {
676 int err;
677 err = f_setown(filp, current->pid, 1); 789 err = f_setown(filp, current->pid, 1);
678 if (err < 0) { 790 if (err < 0) {
679 put_unused_fd(ret); 791 goto error;
680 put_filp(filp);
681 ret = err;
682 goto out;
683 } 792 }
684 filp->f_owner.signum = signal; 793 filp->f_owner.signum = signal;
685 } 794 }
686 795
687 q = kmalloc(sizeof(*q), GFP_KERNEL); 796 q = kmalloc(sizeof(*q), GFP_KERNEL);
688 if (!q) { 797 if (!q) {
689 put_unused_fd(ret); 798 err = -ENOMEM;
690 put_filp(filp); 799 goto error;
691 ret = -ENOMEM;
692 goto out;
693 } 800 }
694 801
695 down_read(&current->mm->mmap_sem); 802 down_read(&current->mm->mmap_sem);
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal)
697 804
698 if (unlikely(err != 0)) { 805 if (unlikely(err != 0)) {
699 up_read(&current->mm->mmap_sem); 806 up_read(&current->mm->mmap_sem);
700 put_unused_fd(ret);
701 put_filp(filp);
702 kfree(q); 807 kfree(q);
703 return err; 808 goto error;
704 } 809 }
705 810
706 /* 811 /*
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal)
716 fd_install(ret, filp); 821 fd_install(ret, filp);
717out: 822out:
718 return ret; 823 return ret;
824error:
825 put_unused_fd(ret);
826 put_filp(filp);
827 ret = err;
828 goto out;
719} 829}
720 830
721long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 831long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
740 case FUTEX_CMP_REQUEUE: 850 case FUTEX_CMP_REQUEUE:
741 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); 851 ret = futex_requeue(uaddr, uaddr2, val, val2, &val3);
742 break; 852 break;
853 case FUTEX_WAKE_OP:
854 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
855 break;
743 default: 856 default:
744 ret = -ENOSYS; 857 ret = -ENOSYS;
745 } 858 }
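futex_wake_op() above has to hold two futex hash-bucket locks at once, so it takes them in pointer order, and only once when both keys hash to the same bucket, to avoid an ABBA deadlock against another task operating on the same pair in the opposite order. A generic sketch of that idiom; the bucket type and function names are illustrative, not the futex.c ones:

    struct bucket_sketch { spinlock_t lock; struct list_head chain; };

    static void double_lock_sketch(struct bucket_sketch *a, struct bucket_sketch *b)
    {
            if (a < b)
                    spin_lock(&a->lock);
            spin_lock(&b->lock);
            if (a > b)
                    spin_lock(&a->lock);    /* a == b: locked exactly once */
    }

    static void double_unlock_sketch(struct bucket_sketch *a, struct bucket_sketch *b)
    {
            spin_unlock(&a->lock);
            if (a != b)
                    spin_unlock(&b->lock);
    }

The error path in the hunk also shows the usual mmap_sem dance: if the atomic user-space operation faults, both bucket locks are dropped, the page is faulted in (or the fault is handled directly while still holding mmap_sem on the second attempt), and the whole operation is retried from scratch.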
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void
39 struct list_head *tmp; 39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new; 40 struct inter_module_entry *ime, *ime_new;
41 41
42 if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { 42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */ 43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR 44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", 45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void
47 kmalloc_failed = 1; 47 kmalloc_failed = 1;
48 return; 48 return;
49 } 49 }
50 memset(ime_new, 0, sizeof(*ime_new));
51 ime_new->im_name = im_name; 50 ime_new->im_name = im_name;
52 ime_new->owner = owner; 51 ime_new->owner = owner;
53 ime_new->userdata = userdata; 52 ime_new->userdata = userdata;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
111 unsigned int status; 111 unsigned int status;
112 112
113 kstat_this_cpu.irqs[irq]++; 113 kstat_this_cpu.irqs[irq]++;
114 if (desc->status & IRQ_PER_CPU) { 114 if (CHECK_IRQ_PER_CPU(desc->status)) {
115 irqreturn_t action_ret; 115 irqreturn_t action_ret;
116 116
117 /* 117 /*
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; 19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20 20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
21/** 25/**
22 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
23 * 27 *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
19 */ 19 */
20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; 20static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
21 21
22void __attribute__((weak)) 22#ifdef CONFIG_GENERIC_PENDING_IRQ
23proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 23void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{
25 /*
26 * Save these away for later use. Re-progam when the
27 * interrupt is pending
28 */
29 set_pending_irq(irq, mask_val);
30}
31#else
32void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
24{ 33{
25 irq_affinity[irq] = mask_val; 34 irq_affinity[irq] = mask_val;
26 irq_desc[irq].handler->set_affinity(irq, mask_val); 35 irq_desc[irq].handler->set_affinity(irq, mask_val);
27} 36}
37#endif
28 38
29static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
30 int count, int *eof, void *data) 40 int count, int *eof, void *data)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@
37#include <linux/init.h> 37#include <linux/init.h>
38#include <linux/module.h> 38#include <linux/module.h>
39#include <linux/moduleloader.h> 39#include <linux/moduleloader.h>
40#include <asm-generic/sections.h>
40#include <asm/cacheflush.h> 41#include <asm/cacheflush.h>
41#include <asm/errno.h> 42#include <asm/errno.h>
42#include <asm/kdebug.h> 43#include <asm/kdebug.h>
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages;
72 * get_insn_slot() - Find a slot on an executable page for an instruction. 73 * get_insn_slot() - Find a slot on an executable page for an instruction.
73 * We allocate an executable page if there's no room on existing ones. 74 * We allocate an executable page if there's no room on existing ones.
74 */ 75 */
75kprobe_opcode_t *get_insn_slot(void) 76kprobe_opcode_t __kprobes *get_insn_slot(void)
76{ 77{
77 struct kprobe_insn_page *kip; 78 struct kprobe_insn_page *kip;
78 struct hlist_node *pos; 79 struct hlist_node *pos;
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void)
117 return kip->insns; 118 return kip->insns;
118} 119}
119 120
120void free_insn_slot(kprobe_opcode_t *slot) 121void __kprobes free_insn_slot(kprobe_opcode_t *slot)
121{ 122{
122 struct kprobe_insn_page *kip; 123 struct kprobe_insn_page *kip;
123 struct hlist_node *pos; 124 struct hlist_node *pos;
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot)
152} 153}
153 154
154/* Locks kprobe: irqs must be disabled */ 155/* Locks kprobe: irqs must be disabled */
155void lock_kprobes(void) 156void __kprobes lock_kprobes(void)
156{ 157{
158 unsigned long flags = 0;
159
160 /* Avoiding local interrupts to happen right after we take the kprobe_lock
161 * and before we get a chance to update kprobe_cpu, this to prevent
162 * deadlock when we have a kprobe on ISR routine and a kprobe on task
163 * routine
164 */
165 local_irq_save(flags);
166
157 spin_lock(&kprobe_lock); 167 spin_lock(&kprobe_lock);
158 kprobe_cpu = smp_processor_id(); 168 kprobe_cpu = smp_processor_id();
169
170 local_irq_restore(flags);
159} 171}
160 172
161void unlock_kprobes(void) 173void __kprobes unlock_kprobes(void)
162{ 174{
175 unsigned long flags = 0;
176
177 /* Avoiding local interrupts to happen right after we update
178 * kprobe_cpu and before we get a a chance to release kprobe_lock,
179 * this to prevent deadlock when we have a kprobe on ISR routine and
180 * a kprobe on task routine
181 */
182 local_irq_save(flags);
183
163 kprobe_cpu = NR_CPUS; 184 kprobe_cpu = NR_CPUS;
164 spin_unlock(&kprobe_lock); 185 spin_unlock(&kprobe_lock);
186
187 local_irq_restore(flags);
165} 188}
166 189
167/* You have to be holding the kprobe_lock */ 190/* You have to be holding the kprobe_lock */
168struct kprobe *get_kprobe(void *addr) 191struct kprobe __kprobes *get_kprobe(void *addr)
169{ 192{
170 struct hlist_head *head; 193 struct hlist_head *head;
171 struct hlist_node *node; 194 struct hlist_node *node;
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr)
183 * Aggregate handlers for multiple kprobes support - these handlers 206 * Aggregate handlers for multiple kprobes support - these handlers
184 * take care of invoking the individual kprobe handlers on p->list 207 * take care of invoking the individual kprobe handlers on p->list
185 */ 208 */
186static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) 209static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
187{ 210{
188 struct kprobe *kp; 211 struct kprobe *kp;
189 212
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
198 return 0; 221 return 0;
199} 222}
200 223
201static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, 224static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
202 unsigned long flags) 225 unsigned long flags)
203{ 226{
204 struct kprobe *kp; 227 struct kprobe *kp;
205 228
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
213 return; 236 return;
214} 237}
215 238
216static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, 239static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
217 int trapnr) 240 int trapnr)
218{ 241{
219 /* 242 /*
220 * if we faulted "during" the execution of a user specified 243 * if we faulted "during" the execution of a user specified
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
227 return 0; 250 return 0;
228} 251}
229 252
230static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) 253static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
231{ 254{
232 struct kprobe *kp = curr_kprobe; 255 struct kprobe *kp = curr_kprobe;
233 if (curr_kprobe && kp->break_handler) { 256 if (curr_kprobe && kp->break_handler) {
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
240 return 0; 263 return 0;
241} 264}
242 265
243struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) 266struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
244{ 267{
245 struct hlist_node *node; 268 struct hlist_node *node;
246 struct kretprobe_instance *ri; 269 struct kretprobe_instance *ri;
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
249 return NULL; 272 return NULL;
250} 273}
251 274
252static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) 275static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
276 *rp)
253{ 277{
254 struct hlist_node *node; 278 struct hlist_node *node;
255 struct kretprobe_instance *ri; 279 struct kretprobe_instance *ri;
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
258 return NULL; 282 return NULL;
259} 283}
260 284
261void add_rp_inst(struct kretprobe_instance *ri) 285void __kprobes add_rp_inst(struct kretprobe_instance *ri)
262{ 286{
263 /* 287 /*
264 * Remove rp inst off the free list - 288 * Remove rp inst off the free list -
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri)
276 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 300 hlist_add_head(&ri->uflist, &ri->rp->used_instances);
277} 301}
278 302
279void recycle_rp_inst(struct kretprobe_instance *ri) 303void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
280{ 304{
281 /* remove rp inst off the rprobe_inst_table */ 305 /* remove rp inst off the rprobe_inst_table */
282 hlist_del(&ri->hlist); 306 hlist_del(&ri->hlist);
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri)
291 kfree(ri); 315 kfree(ri);
292} 316}
293 317
294struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) 318struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk)
295{ 319{
296 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 320 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
297} 321}
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
302 * instances associated with this task. These left over instances represent 326 * instances associated with this task. These left over instances represent
303 * probed functions that have been called but will never return. 327 * probed functions that have been called but will never return.
304 */ 328 */
305void kprobe_flush_task(struct task_struct *tk) 329void __kprobes kprobe_flush_task(struct task_struct *tk)
306{ 330{
307 struct kretprobe_instance *ri; 331 struct kretprobe_instance *ri;
308 struct hlist_head *head; 332 struct hlist_head *head;
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk)
322 * This kprobe pre_handler is registered with every kretprobe. When probe 346 * This kprobe pre_handler is registered with every kretprobe. When probe
323 * hits it will set up the return probe. 347 * hits it will set up the return probe.
324 */ 348 */
325static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) 349static int __kprobes pre_handler_kretprobe(struct kprobe *p,
350 struct pt_regs *regs)
326{ 351{
327 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 352 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
328 353
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
353* Add the new probe to old_p->list. Fail if this is the 378* Add the new probe to old_p->list. Fail if this is the
354* second jprobe at the address - two jprobes can't coexist 379* second jprobe at the address - two jprobes can't coexist
355*/ 380*/
356static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 381static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
357{ 382{
358 struct kprobe *kp; 383 struct kprobe *kp;
359 384
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
395 * the intricacies 420 * the intricacies
396 * TODO: Move kcalloc outside the spinlock 421 * TODO: Move kcalloc outside the spinlock
397 */ 422 */
398static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) 423static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
424 struct kprobe *p)
399{ 425{
400 int ret = 0; 426 int ret = 0;
401 struct kprobe *ap; 427 struct kprobe *ap;
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
434 spin_unlock_irqrestore(&kprobe_lock, flags); 460 spin_unlock_irqrestore(&kprobe_lock, flags);
435} 461}
436 462
437int register_kprobe(struct kprobe *p) 463static int __kprobes in_kprobes_functions(unsigned long addr)
464{
465 if (addr >= (unsigned long)__kprobes_text_start
466 && addr < (unsigned long)__kprobes_text_end)
467 return -EINVAL;
468 return 0;
469}
470
471int __kprobes register_kprobe(struct kprobe *p)
438{ 472{
439 int ret = 0; 473 int ret = 0;
440 unsigned long flags = 0; 474 unsigned long flags = 0;
441 struct kprobe *old_p; 475 struct kprobe *old_p;
442 476
443 if ((ret = arch_prepare_kprobe(p)) != 0) { 477 if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0)
478 return ret;
479 if ((ret = arch_prepare_kprobe(p)) != 0)
444 goto rm_kprobe; 480 goto rm_kprobe;
445 } 481
446 spin_lock_irqsave(&kprobe_lock, flags); 482 spin_lock_irqsave(&kprobe_lock, flags);
447 old_p = get_kprobe(p->addr); 483 old_p = get_kprobe(p->addr);
448 p->nmissed = 0; 484 p->nmissed = 0;
@@ -466,7 +502,7 @@ rm_kprobe:
466 return ret; 502 return ret;
467} 503}
468 504
469void unregister_kprobe(struct kprobe *p) 505void __kprobes unregister_kprobe(struct kprobe *p)
470{ 506{
471 unsigned long flags; 507 unsigned long flags;
472 struct kprobe *old_p; 508 struct kprobe *old_p;
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = {
487 .priority = 0x7fffffff /* we need to notified first */ 523 .priority = 0x7fffffff /* we need to notified first */
488}; 524};
489 525
490int register_jprobe(struct jprobe *jp) 526int __kprobes register_jprobe(struct jprobe *jp)
491{ 527{
492 /* Todo: Verify probepoint is a function entry point */ 528 /* Todo: Verify probepoint is a function entry point */
493 jp->kp.pre_handler = setjmp_pre_handler; 529 jp->kp.pre_handler = setjmp_pre_handler;
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp)
496 return register_kprobe(&jp->kp); 532 return register_kprobe(&jp->kp);
497} 533}
498 534
499void unregister_jprobe(struct jprobe *jp) 535void __kprobes unregister_jprobe(struct jprobe *jp)
500{ 536{
501 unregister_kprobe(&jp->kp); 537 unregister_kprobe(&jp->kp);
502} 538}
503 539
504#ifdef ARCH_SUPPORTS_KRETPROBES 540#ifdef ARCH_SUPPORTS_KRETPROBES
505 541
506int register_kretprobe(struct kretprobe *rp) 542int __kprobes register_kretprobe(struct kretprobe *rp)
507{ 543{
508 int ret = 0; 544 int ret = 0;
509 struct kretprobe_instance *inst; 545 struct kretprobe_instance *inst;
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp)
540 576
541#else /* ARCH_SUPPORTS_KRETPROBES */ 577#else /* ARCH_SUPPORTS_KRETPROBES */
542 578
543int register_kretprobe(struct kretprobe *rp) 579int __kprobes register_kretprobe(struct kretprobe *rp)
544{ 580{
545 return -ENOSYS; 581 return -ENOSYS;
546} 582}
547 583
548#endif /* ARCH_SUPPORTS_KRETPROBES */ 584#endif /* ARCH_SUPPORTS_KRETPROBES */
549 585
550void unregister_kretprobe(struct kretprobe *rp) 586void __kprobes unregister_kretprobe(struct kretprobe *rp)
551{ 587{
552 unsigned long flags; 588 unsigned long flags;
553 struct kretprobe_instance *ri; 589 struct kretprobe_instance *ri;
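The kprobes hunks above do two related things: every function on the register/unregister path gains the __kprobes marker (which places it in a dedicated, protected text section), and register_kprobe() now calls in_kprobes_functions() to refuse a probe whose address lands between __kprobes_text_start and __kprobes_text_end, so the kprobes machinery can never end up probing itself. A minimal standalone sketch of that range check, with a static buffer standing in for the linker-provided section boundaries:

```c
#include <stdio.h>
#include <stdint.h>

/* Stand-in for the [__kprobes_text_start, __kprobes_text_end) range that
 * the linker provides in the kernel; here it is just a static buffer. */
static char kprobes_text[4096];

#define KPROBES_TEXT_START ((uintptr_t)kprobes_text)
#define KPROBES_TEXT_END   ((uintptr_t)(kprobes_text + sizeof(kprobes_text)))

/* Same shape as in_kprobes_functions(): refuse any address that falls
 * inside the protected range, accept everything else. */
static int in_kprobes_functions(uintptr_t addr)
{
    if (addr >= KPROBES_TEXT_START && addr < KPROBES_TEXT_END)
        return -22;            /* -EINVAL */
    return 0;
}

int main(void)
{
    printf("probe inside protected text : %d\n",
           in_kprobes_functions(KPROBES_TEXT_START + 8));
    printf("probe outside protected text: %d\n",
           in_kprobes_functions(KPROBES_TEXT_END + 8));
    return 0;
}
```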
diff --git a/kernel/module.c b/kernel/module.c
index c32995fbd8fd..ff5c500ab625 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,6 +20,7 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/moduleloader.h> 21#include <linux/moduleloader.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/kernel.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
24#include <linux/vmalloc.h> 25#include <linux/vmalloc.h>
25#include <linux/elf.h> 26#include <linux/elf.h>
@@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags)
498{ 499{
499 int ret = (flags & O_TRUNC); 500 int ret = (flags & O_TRUNC);
500 if (ret) 501 if (ret)
501 tainted |= TAINT_FORCED_MODULE; 502 add_taint(TAINT_FORCED_MODULE);
502 return ret; 503 return ret;
503} 504}
504#else 505#else
@@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs,
897 if (!(tainted & TAINT_FORCED_MODULE)) { 898 if (!(tainted & TAINT_FORCED_MODULE)) {
898 printk("%s: no version for \"%s\" found: kernel tainted.\n", 899 printk("%s: no version for \"%s\" found: kernel tainted.\n",
899 mod->name, symname); 900 mod->name, symname);
900 tainted |= TAINT_FORCED_MODULE; 901 add_taint(TAINT_FORCED_MODULE);
901 } 902 }
902 return 1; 903 return 1;
903} 904}
@@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license)
1352 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { 1353 if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) {
1353 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", 1354 printk(KERN_WARNING "%s: module license '%s' taints kernel.\n",
1354 mod->name, license); 1355 mod->name, license);
1355 tainted |= TAINT_PROPRIETARY_MODULE; 1356 add_taint(TAINT_PROPRIETARY_MODULE);
1356 } 1357 }
1357} 1358}
1358 1359
@@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod,
1509 long err = 0; 1510 long err = 0;
1510 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1511 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1511 struct exception_table_entry *extable; 1512 struct exception_table_entry *extable;
1513 mm_segment_t old_fs;
1512 1514
1513 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1515 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
1514 umod, len, uargs); 1516 umod, len, uargs);
@@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod,
1609 modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); 1611 modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
1610 /* This is allowed: modprobe --force will invalidate it. */ 1612 /* This is allowed: modprobe --force will invalidate it. */
1611 if (!modmagic) { 1613 if (!modmagic) {
1612 tainted |= TAINT_FORCED_MODULE; 1614 add_taint(TAINT_FORCED_MODULE);
1613 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", 1615 printk(KERN_WARNING "%s: no version magic, tainting kernel.\n",
1614 mod->name); 1616 mod->name);
1615 } else if (!same_magic(modmagic, vermagic)) { 1617 } else if (!same_magic(modmagic, vermagic)) {
@@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod,
1738 (mod->num_gpl_syms && !gplcrcindex)) { 1740 (mod->num_gpl_syms && !gplcrcindex)) {
1739 printk(KERN_WARNING "%s: No versions for exported symbols." 1741 printk(KERN_WARNING "%s: No versions for exported symbols."
1740 " Tainting kernel.\n", mod->name); 1742 " Tainting kernel.\n", mod->name);
1741 tainted |= TAINT_FORCED_MODULE; 1743 add_taint(TAINT_FORCED_MODULE);
1742 } 1744 }
1743#endif 1745#endif
1744 1746
@@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod,
1779 if (err < 0) 1781 if (err < 0)
1780 goto cleanup; 1782 goto cleanup;
1781 1783
1784 /* flush the icache in correct context */
1785 old_fs = get_fs();
1786 set_fs(KERNEL_DS);
1787
1788 /*
1789 * Flush the instruction cache, since we've played with text.
1790 * Do it before processing of module parameters, so the module
1791 * can provide parameter accessor functions of its own.
1792 */
1793 if (mod->module_init)
1794 flush_icache_range((unsigned long)mod->module_init,
1795 (unsigned long)mod->module_init
1796 + mod->init_size);
1797 flush_icache_range((unsigned long)mod->module_core,
1798 (unsigned long)mod->module_core + mod->core_size);
1799
1800 set_fs(old_fs);
1801
1782 mod->args = args; 1802 mod->args = args;
1783 if (obsparmindex) { 1803 if (obsparmindex) {
1784 err = obsolete_params(mod->name, mod->args, 1804 err = obsolete_params(mod->name, mod->args,
@@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod,
1860 const char __user *uargs) 1880 const char __user *uargs)
1861{ 1881{
1862 struct module *mod; 1882 struct module *mod;
1863 mm_segment_t old_fs = get_fs();
1864 int ret = 0; 1883 int ret = 0;
1865 1884
1866 /* Must have permission */ 1885 /* Must have permission */
@@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod,
1878 return PTR_ERR(mod); 1897 return PTR_ERR(mod);
1879 } 1898 }
1880 1899
1881 /* flush the icache in correct context */
1882 set_fs(KERNEL_DS);
1883
1884 /* Flush the instruction cache, since we've played with text */
1885 if (mod->module_init)
1886 flush_icache_range((unsigned long)mod->module_init,
1887 (unsigned long)mod->module_init
1888 + mod->init_size);
1889 flush_icache_range((unsigned long)mod->module_core,
1890 (unsigned long)mod->module_core + mod->core_size);
1891
1892 set_fs(old_fs);
1893
1894 /* Now sew it into the lists. They won't access us, since 1900 /* Now sew it into the lists. They won't access us, since
1895 strong_try_module_get() will fail. */ 1901 strong_try_module_get() will fail. */
1896 stop_machine_run(__link_module, mod, NR_CPUS); 1902 stop_machine_run(__link_module, mod, NR_CPUS);
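In module.c the icache flush moves out of sys_init_module() and into load_module(), wrapped in a get_fs()/set_fs(KERNEL_DS) bracket and placed before module parameters are processed, so that parameter accessor functions living in the freshly written module text execute against a coherent instruction cache. A loose userspace model of that save/override/restore ordering (the fs value and helpers below are toy stand-ins for mm_segment_t, get_fs() and set_fs(), not the kernel API):

```c
#include <stdio.h>

/* Toy stand-ins for mm_segment_t handling: 1 plays USER_DS, 0 plays
 * KERNEL_DS. Only the save/override/restore ordering matters here. */
static int current_fs = 1;
static int get_fs(void)    { return current_fs; }
static void set_fs(int fs) { current_fs = fs; }

static void flush_icache_range(unsigned long start, unsigned long end)
{
    printf("flush icache %#lx-%#lx (fs=%d)\n", start, end, current_fs);
}

static void parse_module_params(void)
{
    /* Parameter "set" hooks may live in the module's own text, so the
     * icache must already be coherent by the time this runs. */
    puts("process module parameters");
}

int main(void)
{
    unsigned long core = 0x100000, core_size = 0x2000;
    int old_fs = get_fs();

    set_fs(0);                              /* KERNEL_DS */
    flush_icache_range(core, core + core_size);
    set_fs(old_fs);

    parse_module_params();                  /* ordering the patch establishes */
    return 0;
}
```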
diff --git a/kernel/params.c b/kernel/params.c
index d586c35ef8fc..fbf173215fd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
542{ 542{
543 struct module_kobject *mk; 543 struct module_kobject *mk;
544 544
545 mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); 545 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
546 memset(mk, 0, sizeof(struct module_kobject)); 546 BUG_ON(!mk);
547 547
548 mk->mod = THIS_MODULE; 548 mk->mod = THIS_MODULE;
549 kobj_set_kset_s(mk, module_subsys); 549 kobj_set_kset_s(mk, module_subsys);
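The params.c change (and the matching ones in kernel/power/pm.c and kernel/resource.c further down) replaces the kmalloc()-then-memset() pair with kzalloc(), which returns zeroed memory in a single call; the new BUG_ON(!mk) keeps the code's existing assumption that this boot-time allocation cannot fail. A small userspace model of the same conversion, with calloc() standing in for kzalloc() and a made-up struct:

```c
#include <assert.h>
#include <stdlib.h>

/* Userspace stand-in for kzalloc(): one call that allocates and zeroes,
 * replacing the kmalloc()+memset() pair. */
static void *kzalloc_model(size_t size)
{
    return calloc(1, size);
}

struct module_kobject_model {   /* hypothetical struct, for illustration only */
    void *mod;
    int initialized;
};

int main(void)
{
    struct module_kobject_model *mk = kzalloc_model(sizeof(*mk));

    assert(mk != NULL);                     /* mirrors the BUG_ON(!mk) */
    assert(mk->mod == NULL && mk->initialized == 0);
    free(mk);
    return 0;
}
```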
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 38798a2ff994..b7b532acd9fc 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private)
427 timr->sigq->info.si_code = SI_TIMER; 427 timr->sigq->info.si_code = SI_TIMER;
428 timr->sigq->info.si_tid = timr->it_id; 428 timr->sigq->info.si_tid = timr->it_id;
429 timr->sigq->info.si_value = timr->it_sigev_value; 429 timr->sigq->info.si_value = timr->it_sigev_value;
430
430 if (timr->it_sigev_notify & SIGEV_THREAD_ID) { 431 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
431 if (unlikely(timr->it_process->flags & PF_EXITING)) { 432 struct task_struct *leader;
432 timr->it_sigev_notify = SIGEV_SIGNAL; 433 int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
433 put_task_struct(timr->it_process); 434 timr->it_process);
434 timr->it_process = timr->it_process->group_leader; 435
435 goto group; 436 if (likely(ret >= 0))
436 } 437 return ret;
437 return send_sigqueue(timr->it_sigev_signo, timr->sigq, 438
438 timr->it_process); 439 timr->it_sigev_notify = SIGEV_SIGNAL;
439 } 440 leader = timr->it_process->group_leader;
440 else { 441 put_task_struct(timr->it_process);
441 group: 442 timr->it_process = leader;
442 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
443 timr->it_process);
444 } 443 }
444
445 return send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
446 timr->it_process);
445} 447}
446EXPORT_SYMBOL_GPL(posix_timer_event); 448EXPORT_SYMBOL_GPL(posix_timer_event);
447 449
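The posix_timer_event() rewrite stops second-guessing the target thread's PF_EXITING flag: for SIGEV_THREAD_ID timers it simply attempts the thread-directed send_sigqueue() first, and only when that delivery fails does it switch the timer to SIGEV_SIGNAL, retarget it at the group leader, and fall through to send_group_sigqueue(). A standalone sketch of that try-then-fall-back control flow (the two send functions are stubs, not the kernel API):

```c
#include <stdio.h>

/* Stubs for send_sigqueue()/send_group_sigqueue(); a negative return
 * models the thread-directed delivery failing (target thread exiting). */
static int send_to_thread(int thread_alive) { return thread_alive ? 0 : -3; }
static int send_to_group(void)              { return 0; }

static int timer_event(int thread_directed, int thread_alive)
{
    if (thread_directed) {
        int ret = send_to_thread(thread_alive);

        if (ret >= 0)
            return ret;     /* delivered to the specific thread */
        /* delivery failed: retarget the timer at the group leader */
    }
    return send_to_group();
}

int main(void)
{
    printf("thread alive -> %d\n", timer_event(1, 1));
    printf("thread gone  -> %d\n", timer_event(1, 0));
    printf("group timer  -> %d\n", timer_event(0, 0));
    return 0;
}
```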
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 2c7121d9bff1..396c7873e804 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,5 +1,6 @@
1config PM 1config PM
2 bool "Power Management support" 2 bool "Power Management support"
3 depends on !IA64_HP_SIM
3 ---help--- 4 ---help---
4 "Power Management" means that parts of your computer are shut 5 "Power Management" means that parts of your computer are shut
5 off or put into a power conserving "sleep" mode if they are not 6 off or put into a power conserving "sleep" mode if they are not
@@ -28,7 +29,7 @@ config PM_DEBUG
28 29
29config SOFTWARE_SUSPEND 30config SOFTWARE_SUSPEND
30 bool "Software Suspend" 31 bool "Software Suspend"
31 depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) 32 depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP))
32 ---help--- 33 ---help---
33 Enable the possibility of suspending the machine. 34 Enable the possibility of suspending the machine.
34 It doesn't need APM. 35 It doesn't need APM.
@@ -72,6 +73,18 @@ config PM_STD_PARTITION
72 suspended image to. It will simply pick the first available swap 73 suspended image to. It will simply pick the first available swap
73 device. 74 device.
74 75
76config SWSUSP_ENCRYPT
77 bool "Encrypt suspend image"
78 depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
79 default ""
80 ---help---
81 To prevent data gathering from swap after resume you can encrypt
82 the suspend image with a temporary key that is deleted on
83 resume.
84
85 Note that the temporary key is stored unencrypted on disk while the
86 system is suspended.
87
75config SUSPEND_SMP 88config SUSPEND_SMP
76 bool 89 bool
77 depends on HOTPLUG_CPU && X86 && PM 90 depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 664eb0469b6e..2d8bf054d036 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -112,24 +112,12 @@ static inline void platform_finish(void)
112 } 112 }
113} 113}
114 114
115static void finish(void)
116{
117 device_resume();
118 platform_finish();
119 thaw_processes();
120 enable_nonboot_cpus();
121 pm_restore_console();
122}
123
124
125static int prepare_processes(void) 115static int prepare_processes(void)
126{ 116{
127 int error; 117 int error;
128 118
129 pm_prepare_console(); 119 pm_prepare_console();
130
131 sys_sync(); 120 sys_sync();
132
133 disable_nonboot_cpus(); 121 disable_nonboot_cpus();
134 122
135 if (freeze_processes()) { 123 if (freeze_processes()) {
@@ -162,15 +150,6 @@ static void unprepare_processes(void)
162 pm_restore_console(); 150 pm_restore_console();
163} 151}
164 152
165static int prepare_devices(void)
166{
167 int error;
168
169 if ((error = device_suspend(PMSG_FREEZE)))
170 printk("Some devices failed to suspend\n");
171 return error;
172}
173
174/** 153/**
175 * pm_suspend_disk - The granpappy of power management. 154 * pm_suspend_disk - The granpappy of power management.
176 * 155 *
@@ -187,17 +166,14 @@ int pm_suspend_disk(void)
187 error = prepare_processes(); 166 error = prepare_processes();
188 if (error) 167 if (error)
189 return error; 168 return error;
190 error = prepare_devices();
191 169
170 error = device_suspend(PMSG_FREEZE);
192 if (error) { 171 if (error) {
172 printk("Some devices failed to suspend\n");
193 unprepare_processes(); 173 unprepare_processes();
194 return error; 174 return error;
195 } 175 }
196 176
197 pr_debug("PM: Attempting to suspend to disk.\n");
198 if (pm_disk_mode == PM_DISK_FIRMWARE)
199 return pm_ops->enter(PM_SUSPEND_DISK);
200
201 pr_debug("PM: snapshotting memory.\n"); 177 pr_debug("PM: snapshotting memory.\n");
202 in_suspend = 1; 178 in_suspend = 1;
203 if ((error = swsusp_suspend())) 179 if ((error = swsusp_suspend()))
@@ -208,11 +184,20 @@ int pm_suspend_disk(void)
208 error = swsusp_write(); 184 error = swsusp_write();
209 if (!error) 185 if (!error)
210 power_down(pm_disk_mode); 186 power_down(pm_disk_mode);
187 else {
188 /* swsusp_write can not fail in device_resume,
189 no need to do second device_resume */
190 swsusp_free();
191 unprepare_processes();
192 return error;
193 }
211 } else 194 } else
212 pr_debug("PM: Image restored successfully.\n"); 195 pr_debug("PM: Image restored successfully.\n");
196
213 swsusp_free(); 197 swsusp_free();
214 Done: 198 Done:
215 finish(); 199 device_resume();
200 unprepare_processes();
216 return error; 201 return error;
217} 202}
218 203
@@ -233,9 +218,12 @@ static int software_resume(void)
233{ 218{
234 int error; 219 int error;
235 220
221 down(&pm_sem);
236 if (!swsusp_resume_device) { 222 if (!swsusp_resume_device) {
237 if (!strlen(resume_file)) 223 if (!strlen(resume_file)) {
224 up(&pm_sem);
238 return -ENOENT; 225 return -ENOENT;
226 }
239 swsusp_resume_device = name_to_dev_t(resume_file); 227 swsusp_resume_device = name_to_dev_t(resume_file);
240 pr_debug("swsusp: Resume From Partition %s\n", resume_file); 228 pr_debug("swsusp: Resume From Partition %s\n", resume_file);
241 } else { 229 } else {
@@ -248,6 +236,7 @@ static int software_resume(void)
248 * FIXME: If noresume is specified, we need to find the partition 236 * FIXME: If noresume is specified, we need to find the partition
249 * and reset it back to normal swap space. 237 * and reset it back to normal swap space.
250 */ 238 */
239 up(&pm_sem);
251 return 0; 240 return 0;
252 } 241 }
253 242
@@ -270,20 +259,24 @@ static int software_resume(void)
270 259
271 pr_debug("PM: Preparing devices for restore.\n"); 260 pr_debug("PM: Preparing devices for restore.\n");
272 261
273 if ((error = prepare_devices())) 262 if ((error = device_suspend(PMSG_FREEZE))) {
263 printk("Some devices failed to suspend\n");
274 goto Free; 264 goto Free;
265 }
275 266
276 mb(); 267 mb();
277 268
278 pr_debug("PM: Restoring saved image.\n"); 269 pr_debug("PM: Restoring saved image.\n");
279 swsusp_resume(); 270 swsusp_resume();
280 pr_debug("PM: Restore failed, recovering.\n"); 271 pr_debug("PM: Restore failed, recovering.\n");
281 finish(); 272 device_resume();
282 Free: 273 Free:
283 swsusp_free(); 274 swsusp_free();
284 Cleanup: 275 Cleanup:
285 unprepare_processes(); 276 unprepare_processes();
286 Done: 277 Done:
278 /* For success case, the suspend path will release the lock */
279 up(&pm_sem);
287 pr_debug("PM: Resume from disk failed.\n"); 280 pr_debug("PM: Resume from disk failed.\n");
288 return 0; 281 return 0;
289} 282}
@@ -390,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t
390 if (sscanf(buf, "%u:%u", &maj, &min) == 2) { 383 if (sscanf(buf, "%u:%u", &maj, &min) == 2) {
391 res = MKDEV(maj,min); 384 res = MKDEV(maj,min);
392 if (maj == MAJOR(res) && min == MINOR(res)) { 385 if (maj == MAJOR(res) && min == MINOR(res)) {
386 down(&pm_sem);
393 swsusp_resume_device = res; 387 swsusp_resume_device = res;
388 up(&pm_sem);
394 printk("Attempting manual resume\n"); 389 printk("Attempting manual resume\n");
395 noresume = 0; 390 noresume = 0;
396 software_resume(); 391 software_resume();
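The disk.c changes fold the old prepare_devices()/finish() helpers into their callers and serialise the resume paths with pm_sem. The practical effect on pm_suspend_disk() is a plain unwind ladder: a device_suspend(PMSG_FREEZE) failure undoes only prepare_processes(), a swsusp_write() failure frees the image and unprepares processes without a second device_resume(), and the normal path finishes with device_resume() plus unprepare_processes(). A compact sketch of that unwind-only-what-succeeded shape (all step names below are placeholders):

```c
#include <stdio.h>

static int prepare_ok = 1, suspend_ok = 0;  /* toggle to exercise the paths */

static int  prepare_processes(void)   { return prepare_ok ? 0 : -1; }
static void unprepare_processes(void) { puts("unprepare processes"); }
static int  suspend_devices(void)     { return suspend_ok ? 0 : -1; }
static void resume_devices(void)      { puts("resume devices"); }

/* Mirrors the shape pm_suspend_disk() ends up with: each failure point
 * undoes exactly the steps that already completed, nothing more. */
static int suspend_to_disk(void)
{
    int error = prepare_processes();
    if (error)
        return error;

    error = suspend_devices();
    if (error) {
        puts("some devices failed to suspend");
        unprepare_processes();
        return error;
    }

    puts("snapshot, write image, power down");
    resume_devices();
    unprepare_processes();
    return 0;
}

int main(void)
{
    printf("result = %d\n", suspend_to_disk());
    return 0;
}
```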
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 71aa0fd22007..22bdc93cc038 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state)
143 143
144 144
145 145
146static char * pm_states[] = { 146static char *pm_states[PM_SUSPEND_MAX] = {
147 [PM_SUSPEND_STANDBY] = "standby", 147 [PM_SUSPEND_STANDBY] = "standby",
148 [PM_SUSPEND_MEM] = "mem", 148 [PM_SUSPEND_MEM] = "mem",
149#ifdef CONFIG_SOFTWARE_SUSPEND
149 [PM_SUSPEND_DISK] = "disk", 150 [PM_SUSPEND_DISK] = "disk",
150 NULL, 151#endif
151}; 152};
152 153
153 154
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 61deda04e39e..159149321b3c 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type,
60 unsigned long id, 60 unsigned long id,
61 pm_callback callback) 61 pm_callback callback)
62{ 62{
63 struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); 63 struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL);
64 if (dev) { 64 if (dev) {
65 memset(dev, 0, sizeof(*dev));
66 dev->type = type; 65 dev->type = type;
67 dev->id = id; 66 dev->id = id;
68 dev->callback = callback; 67 dev->callback = callback;
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 3bd0d261818f..28de118f7a0b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -38,7 +38,6 @@ void refrigerator(void)
38 processes around? */ 38 processes around? */
39 long save; 39 long save;
40 save = current->state; 40 save = current->state;
41 current->state = TASK_UNINTERRUPTIBLE;
42 pr_debug("%s entered refrigerator\n", current->comm); 41 pr_debug("%s entered refrigerator\n", current->comm);
43 printk("="); 42 printk("=");
44 43
@@ -47,8 +46,10 @@ void refrigerator(void)
47 recalc_sigpending(); /* We sent fake signal, clean it up */ 46 recalc_sigpending(); /* We sent fake signal, clean it up */
48 spin_unlock_irq(&current->sighand->siglock); 47 spin_unlock_irq(&current->sighand->siglock);
49 48
50 while (frozen(current)) 49 while (frozen(current)) {
50 current->state = TASK_UNINTERRUPTIBLE;
51 schedule(); 51 schedule();
52 }
52 pr_debug("%s left refrigerator\n", current->comm); 53 pr_debug("%s left refrigerator\n", current->comm);
53 current->state = save; 54 current->state = save;
54} 55}
@@ -80,13 +81,33 @@ int freeze_processes(void)
80 } while_each_thread(g, p); 81 } while_each_thread(g, p);
81 read_unlock(&tasklist_lock); 82 read_unlock(&tasklist_lock);
82 yield(); /* Yield is okay here */ 83 yield(); /* Yield is okay here */
83 if (time_after(jiffies, start_time + TIMEOUT)) { 84 if (todo && time_after(jiffies, start_time + TIMEOUT)) {
84 printk( "\n" ); 85 printk( "\n" );
85 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); 86 printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo );
86 return todo; 87 break;
87 } 88 }
88 } while(todo); 89 } while(todo);
89 90
91 /* This does not unfreeze processes that are already frozen
92 * (we have slightly ugly calling convention in that respect,
93 * and caller must call thaw_processes() if something fails),
94 * but it cleans up leftover PF_FREEZE requests.
95 */
96 if (todo) {
97 read_lock(&tasklist_lock);
98 do_each_thread(g, p)
99 if (freezing(p)) {
100 pr_debug(" clean up: %s\n", p->comm);
101 p->flags &= ~PF_FREEZE;
102 spin_lock_irqsave(&p->sighand->siglock, flags);
103 recalc_sigpending_tsk(p);
104 spin_unlock_irqrestore(&p->sighand->siglock, flags);
105 }
106 while_each_thread(g, p);
107 read_unlock(&tasklist_lock);
108 return todo;
109 }
110
90 printk( "|\n" ); 111 printk( "|\n" );
91 BUG_ON(in_atomic()); 112 BUG_ON(in_atomic());
92 return 0; 113 return 0;
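Two behaviour changes sit in process.c: refrigerator() now sets TASK_UNINTERRUPTIBLE inside the while (frozen(...)) loop, immediately before each schedule(), which is the usual set-state/re-test/sleep idiom for not losing a wakeup that races with the test; and freeze_processes() no longer bails out on timeout but clears the leftover PF_FREEZE requests before returning the number of stragglers. A loose single-threaded model of why re-arming the state inside the loop is safe even if the thaw arrives at the worst possible moment (everything here is a stand-in, not kernel scheduling):

```c
#include <stdio.h>

enum state { RUNNING, SLEEPING };

static enum state task_state = RUNNING;
static int frozen = 1;
static int pending_thaw = 1;    /* a thaw will arrive at the worst moment */

/* Models the waker: clear the freeze flag, then make the task runnable. */
static void thaw(void)
{
    frozen = 0;
    task_state = RUNNING;
}

/* Models schedule(): only blocks if the task is still marked SLEEPING. */
static void schedule_model(void)
{
    if (task_state == SLEEPING)
        puts("task blocks until the next wakeup");
    else
        puts("wakeup already arrived, schedule() returns immediately");
}

int main(void)
{
    /* Patched refrigerator() loop: re-arm the sleep state on every
     * iteration, immediately before schedule(), and re-test frozen. */
    while (frozen) {
        task_state = SLEEPING;
        if (pending_thaw) {             /* race: thaw lands right here */
            pending_thaw = 0;
            thaw();
        }
        schedule_model();
    }
    puts("left refrigerator");
    return 0;
}
```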
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f2bc71b9fe8b..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -31,6 +31,9 @@
31 * Alex Badea <vampire@go.ro>: 31 * Alex Badea <vampire@go.ro>:
32 * Fixed runaway init 32 * Fixed runaway init
33 * 33 *
34 * Andreas Steinmetz <ast@domdv.de>:
35 * Added encrypted suspend option
36 *
34 * More state savers are welcome. Especially for the scsi layer... 37 * More state savers are welcome. Especially for the scsi layer...
35 * 38 *
36 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt 39 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
@@ -71,8 +74,16 @@
71#include <asm/tlbflush.h> 74#include <asm/tlbflush.h>
72#include <asm/io.h> 75#include <asm/io.h>
73 76
77#include <linux/random.h>
78#include <linux/crypto.h>
79#include <asm/scatterlist.h>
80
74#include "power.h" 81#include "power.h"
75 82
83#define CIPHER "aes"
84#define MAXKEY 32
85#define MAXIV 32
86
76/* References to section boundaries */ 87/* References to section boundaries */
77extern const void __nosave_begin, __nosave_end; 88extern const void __nosave_begin, __nosave_end;
78 89
@@ -103,7 +114,8 @@ static suspend_pagedir_t *pagedir_save;
103#define SWSUSP_SIG "S1SUSPEND" 114#define SWSUSP_SIG "S1SUSPEND"
104 115
105static struct swsusp_header { 116static struct swsusp_header {
106 char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; 117 char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)];
118 u8 key_iv[MAXKEY+MAXIV];
107 swp_entry_t swsusp_info; 119 swp_entry_t swsusp_info;
108 char orig_sig[10]; 120 char orig_sig[10];
109 char sig[10]; 121 char sig[10];
@@ -129,6 +141,131 @@ static struct swsusp_info swsusp_info;
129static unsigned short swapfile_used[MAX_SWAPFILES]; 141static unsigned short swapfile_used[MAX_SWAPFILES];
130static unsigned short root_swap; 142static unsigned short root_swap;
131 143
144static int write_page(unsigned long addr, swp_entry_t * loc);
145static int bio_read_page(pgoff_t page_off, void * page);
146
147static u8 key_iv[MAXKEY+MAXIV];
148
149#ifdef CONFIG_SWSUSP_ENCRYPT
150
151static int crypto_init(int mode, void **mem)
152{
153 int error = 0;
154 int len;
155 char *modemsg;
156 struct crypto_tfm *tfm;
157
158 modemsg = mode ? "suspend not possible" : "resume not possible";
159
160 tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC);
161 if(!tfm) {
162 printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg);
163 error = -EINVAL;
164 goto out;
165 }
166
167 if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) {
168 printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg);
169 error = -ENOKEY;
170 goto fail;
171 }
172
173 if (mode)
174 get_random_bytes(key_iv, MAXKEY+MAXIV);
175
176 len = crypto_tfm_alg_max_keysize(tfm);
177 if (len > MAXKEY)
178 len = MAXKEY;
179
180 if (crypto_cipher_setkey(tfm, key_iv, len)) {
181 printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg);
182 error = -EKEYREJECTED;
183 goto fail;
184 }
185
186 len = crypto_tfm_alg_ivsize(tfm);
187
188 if (MAXIV < len) {
189 printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg);
190 error = -EOVERFLOW;
191 goto fail;
192 }
193
194 crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len);
195
196 *mem=(void *)tfm;
197
198 goto out;
199
200fail: crypto_free_tfm(tfm);
201out: return error;
202}
203
204static __inline__ void crypto_exit(void *mem)
205{
206 crypto_free_tfm((struct crypto_tfm *)mem);
207}
208
209static __inline__ int crypto_write(struct pbe *p, void *mem)
210{
211 int error = 0;
212 struct scatterlist src, dst;
213
214 src.page = virt_to_page(p->address);
215 src.offset = 0;
216 src.length = PAGE_SIZE;
217 dst.page = virt_to_page((void *)&swsusp_header);
218 dst.offset = 0;
219 dst.length = PAGE_SIZE;
220
221 error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src,
222 PAGE_SIZE);
223
224 if (!error)
225 error = write_page((unsigned long)&swsusp_header,
226 &(p->swap_address));
227 return error;
228}
229
230static __inline__ int crypto_read(struct pbe *p, void *mem)
231{
232 int error = 0;
233 struct scatterlist src, dst;
234
235 error = bio_read_page(swp_offset(p->swap_address), (void *)p->address);
236 if (!error) {
237 src.offset = 0;
238 src.length = PAGE_SIZE;
239 dst.offset = 0;
240 dst.length = PAGE_SIZE;
241 src.page = dst.page = virt_to_page((void *)p->address);
242
243 error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst,
244 &src, PAGE_SIZE);
245 }
246 return error;
247}
248#else
249static __inline__ int crypto_init(int mode, void *mem)
250{
251 return 0;
252}
253
254static __inline__ void crypto_exit(void *mem)
255{
256}
257
258static __inline__ int crypto_write(struct pbe *p, void *mem)
259{
260 return write_page(p->address, &(p->swap_address));
261}
262
263static __inline__ int crypto_read(struct pbe *p, void *mem)
264{
265 return bio_read_page(swp_offset(p->swap_address), (void *)p->address);
266}
267#endif
268
132static int mark_swapfiles(swp_entry_t prev) 269static int mark_swapfiles(swp_entry_t prev)
133{ 270{
134 int error; 271 int error;
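The new crypto_init()/crypto_write()/crypto_read() wrappers give the image writer and reader a single call site whether or not CONFIG_SWSUSP_ENCRYPT is enabled: with encryption on, each page is run through an AES-CBC transform keyed from key_iv (freshly generated for suspend, copied into the swap header by mark_swapfiles(), then read back and wiped on resume), and with it off the wrappers collapse to plain write_page()/bio_read_page() calls. A toy model of that compile-time pass-through pattern, with a trivial XOR in place of the real cipher (build with -DMODEL_ENCRYPT to get the transforming variant):

```c
#include <stdio.h>
#include <string.h>

#define PAGE 16

#ifdef MODEL_ENCRYPT
/* Toy "cipher": XOR with a constant. The kernel patch uses the crypto
 * API (the "aes" cipher in CBC mode) at this point instead. */
static void transform(unsigned char *buf, size_t len)
{
    for (size_t i = 0; i < len; i++)
        buf[i] ^= 0x5a;
}
static void page_write(unsigned char *page)
{
    transform(page, PAGE);      /* encrypt in place, then write */
    puts("write encrypted page");
}
static void page_read(unsigned char *page)
{
    puts("read encrypted page");
    transform(page, PAGE);      /* decrypt in place after reading */
}
#else
/* With the option disabled the wrappers are plain pass-throughs. */
static void page_write(unsigned char *page) { (void)page; puts("write page"); }
static void page_read(unsigned char *page)  { (void)page; puts("read page"); }
#endif

int main(void)
{
    unsigned char page[PAGE] = "suspend image!!";

    page_write(page);   /* suspend path */
    page_read(page);    /* resume path */
    printf("round trip: %s\n", (char *)page);
    return 0;
}
```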
@@ -140,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev)
140 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 277 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
141 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 278 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
142 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 279 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
280 memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV);
143 swsusp_header.swsusp_info = prev; 281 swsusp_header.swsusp_info = prev;
144 error = rw_swap_page_sync(WRITE, 282 error = rw_swap_page_sync(WRITE,
145 swp_entry(root_swap, 0), 283 swp_entry(root_swap, 0),
@@ -179,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */
179 len=strlen(resume_file); 317 len=strlen(resume_file);
180 root_swap = 0xFFFF; 318 root_swap = 0xFFFF;
181 319
182 swap_list_lock(); 320 spin_lock(&swap_lock);
183 for (i=0; i<MAX_SWAPFILES; i++) { 321 for (i=0; i<MAX_SWAPFILES; i++) {
184 if (swap_info[i].flags == 0) { 322 if (!(swap_info[i].flags & SWP_WRITEOK)) {
185 swapfile_used[i]=SWAPFILE_UNUSED; 323 swapfile_used[i]=SWAPFILE_UNUSED;
186 } else { 324 } else {
187 if (!len) { 325 if (!len) {
@@ -202,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */
202 } 340 }
203 } 341 }
204 } 342 }
205 swap_list_unlock(); 343 spin_unlock(&swap_lock);
206 return (root_swap != 0xffff) ? 0 : -ENODEV; 344 return (root_swap != 0xffff) ? 0 : -ENODEV;
207} 345}
208 346
@@ -216,12 +354,12 @@ static void lock_swapdevices(void)
216{ 354{
217 int i; 355 int i;
218 356
219 swap_list_lock(); 357 spin_lock(&swap_lock);
220 for (i = 0; i< MAX_SWAPFILES; i++) 358 for (i = 0; i< MAX_SWAPFILES; i++)
221 if (swapfile_used[i] == SWAPFILE_IGNORED) { 359 if (swapfile_used[i] == SWAPFILE_IGNORED) {
222 swap_info[i].flags ^= 0xFF; 360 swap_info[i].flags ^= SWP_WRITEOK;
223 } 361 }
224 swap_list_unlock(); 362 spin_unlock(&swap_lock);
225} 363}
226 364
227/** 365/**
@@ -286,6 +424,10 @@ static int data_write(void)
286 int error = 0, i = 0; 424 int error = 0, i = 0;
287 unsigned int mod = nr_copy_pages / 100; 425 unsigned int mod = nr_copy_pages / 100;
288 struct pbe *p; 426 struct pbe *p;
427 void *tfm;
428
429 if ((error = crypto_init(1, &tfm)))
430 return error;
289 431
290 if (!mod) 432 if (!mod)
291 mod = 1; 433 mod = 1;
@@ -294,11 +436,14 @@ static int data_write(void)
294 for_each_pbe (p, pagedir_nosave) { 436 for_each_pbe (p, pagedir_nosave) {
295 if (!(i%mod)) 437 if (!(i%mod))
296 printk( "\b\b\b\b%3d%%", i / mod ); 438 printk( "\b\b\b\b%3d%%", i / mod );
297 if ((error = write_page(p->address, &(p->swap_address)))) 439 if ((error = crypto_write(p, tfm))) {
440 crypto_exit(tfm);
298 return error; 441 return error;
442 }
299 i++; 443 i++;
300 } 444 }
301 printk("\b\b\b\bdone\n"); 445 printk("\b\b\b\bdone\n");
446 crypto_exit(tfm);
302 return error; 447 return error;
303} 448}
304 449
@@ -385,7 +530,6 @@ static int write_pagedir(void)
385 * write_suspend_image - Write entire image and metadata. 530 * write_suspend_image - Write entire image and metadata.
386 * 531 *
387 */ 532 */
388
389static int write_suspend_image(void) 533static int write_suspend_image(void)
390{ 534{
391 int error; 535 int error;
@@ -400,6 +544,7 @@ static int write_suspend_image(void)
400 if ((error = close_swap())) 544 if ((error = close_swap()))
401 goto FreePagedir; 545 goto FreePagedir;
402 Done: 546 Done:
547 memset(key_iv, 0, MAXKEY+MAXIV);
403 return error; 548 return error;
404 FreePagedir: 549 FreePagedir:
405 free_pagedir_entries(); 550 free_pagedir_entries();
@@ -591,18 +736,7 @@ static void copy_data_pages(void)
591 736
592static int calc_nr(int nr_copy) 737static int calc_nr(int nr_copy)
593{ 738{
594 int extra = 0; 739 return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
595 int mod = !!(nr_copy % PBES_PER_PAGE);
596 int diff = (nr_copy / PBES_PER_PAGE) + mod;
597
598 do {
599 extra += diff;
600 nr_copy += diff;
601 mod = !!(nr_copy % PBES_PER_PAGE);
602 diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
603 } while (diff > 0);
604
605 return nr_copy;
606} 740}
607 741
608/** 742/**
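The calc_nr() rewrite replaces the iterative fixpoint with a closed form. Reading the old loop, it keeps growing the number of extra metadata pages e until e * PBES_PER_PAGE >= nr_copy + e, i.e. until e reaches nr_copy / (PBES_PER_PAGE - 1) rounded up; writing that ceiling with the usual (a + b - 1) / b integer idiom gives exactly nr_copy + (nr_copy + PBES_PER_PAGE - 2) / (PBES_PER_PAGE - 1). A standalone check that the two versions agree (PBES_PER_PAGE is set to an assumed example value here; in the kernel it derives from PAGE_SIZE and sizeof(struct pbe)):

```c
#include <stdio.h>

#define PBES_PER_PAGE 128   /* assumed example value, for the comparison only */

/* The pre-patch iterative version, kept as-is. */
static int calc_nr_old(int nr_copy)
{
    int extra = 0;
    int mod = !!(nr_copy % PBES_PER_PAGE);
    int diff = (nr_copy / PBES_PER_PAGE) + mod;

    do {
        extra += diff;
        nr_copy += diff;
        mod = !!(nr_copy % PBES_PER_PAGE);
        diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
    } while (diff > 0);

    return nr_copy;
}

/* The closed form the patch introduces. */
static int calc_nr_new(int nr_copy)
{
    return nr_copy + (nr_copy + PBES_PER_PAGE - 2) / (PBES_PER_PAGE - 1);
}

int main(void)
{
    for (int n = 1; n <= 100000; n++)
        if (calc_nr_old(n) != calc_nr_new(n)) {
            printf("mismatch at %d: %d vs %d\n",
                   n, calc_nr_old(n), calc_nr_new(n));
            return 1;
        }
    puts("iterative and closed-form versions agree");
    return 0;
}
```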
@@ -886,20 +1020,21 @@ int swsusp_suspend(void)
886 * at resume time, and evil weirdness ensues. 1020 * at resume time, and evil weirdness ensues.
887 */ 1021 */
888 if ((error = device_power_down(PMSG_FREEZE))) { 1022 if ((error = device_power_down(PMSG_FREEZE))) {
1023 printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
889 local_irq_enable(); 1024 local_irq_enable();
890 return error; 1025 return error;
891 } 1026 }
892 1027
893 if ((error = swsusp_swap_check())) { 1028 if ((error = swsusp_swap_check())) {
894 printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " 1029 printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
895 "swapon -a!\n"); 1030 device_power_up();
896 local_irq_enable(); 1031 local_irq_enable();
897 return error; 1032 return error;
898 } 1033 }
899 1034
900 save_processor_state(); 1035 save_processor_state();
901 if ((error = swsusp_arch_suspend())) 1036 if ((error = swsusp_arch_suspend()))
902 printk("Error %d suspending\n", error); 1037 printk(KERN_ERR "Error %d suspending\n", error);
903 /* Restore control flow magically appears here */ 1038 /* Restore control flow magically appears here */
904 restore_processor_state(); 1039 restore_processor_state();
905 BUG_ON (nr_copy_pages_check != nr_copy_pages); 1040 BUG_ON (nr_copy_pages_check != nr_copy_pages);
@@ -924,6 +1059,7 @@ int swsusp_resume(void)
924 BUG_ON(!error); 1059 BUG_ON(!error);
925 restore_processor_state(); 1060 restore_processor_state();
926 restore_highmem(); 1061 restore_highmem();
1062 touch_softlockup_watchdog();
927 device_power_up(); 1063 device_power_up();
928 local_irq_enable(); 1064 local_irq_enable();
929 return error; 1065 return error;
@@ -1179,7 +1315,8 @@ static const char * sanity_check(void)
1179 if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) 1315 if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
1180 return "machine"; 1316 return "machine";
1181#if 0 1317#if 0
1182 if(swsusp_info.cpus != num_online_cpus()) 1318 /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */
1319 if (swsusp_info.cpus != num_possible_cpus())
1183 return "number of cpus"; 1320 return "number of cpus";
1184#endif 1321#endif
1185 return NULL; 1322 return NULL;
@@ -1212,13 +1349,14 @@ static int check_sig(void)
1212 return error; 1349 return error;
1213 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 1350 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
1214 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 1351 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
1352 memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV);
1353 memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV);
1215 1354
1216 /* 1355 /*
1217 * Reset swap signature now. 1356 * Reset swap signature now.
1218 */ 1357 */
1219 error = bio_write_page(0, &swsusp_header); 1358 error = bio_write_page(0, &swsusp_header);
1220 } else { 1359 } else {
1221 printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n");
1222 return -EINVAL; 1360 return -EINVAL;
1223 } 1361 }
1224 if (!error) 1362 if (!error)
@@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist)
1239 int error = 0; 1377 int error = 0;
1240 int i = 0; 1378 int i = 0;
1241 int mod = swsusp_info.image_pages / 100; 1379 int mod = swsusp_info.image_pages / 100;
1380 void *tfm;
1381
1382 if ((error = crypto_init(0, &tfm)))
1383 return error;
1242 1384
1243 if (!mod) 1385 if (!mod)
1244 mod = 1; 1386 mod = 1;
@@ -1250,14 +1392,15 @@ static int data_read(struct pbe *pblist)
1250 if (!(i % mod)) 1392 if (!(i % mod))
1251 printk("\b\b\b\b%3d%%", i / mod); 1393 printk("\b\b\b\b%3d%%", i / mod);
1252 1394
1253 error = bio_read_page(swp_offset(p->swap_address), 1395 if ((error = crypto_read(p, tfm))) {
1254 (void *)p->address); 1396 crypto_exit(tfm);
1255 if (error)
1256 return error; 1397 return error;
1398 }
1257 1399
1258 i++; 1400 i++;
1259 } 1401 }
1260 printk("\b\b\b\bdone\n"); 1402 printk("\b\b\b\bdone\n");
1403 crypto_exit(tfm);
1261 return error; 1404 return error;
1262} 1405}
1263 1406
@@ -1385,6 +1528,7 @@ int swsusp_read(void)
1385 1528
1386 error = read_suspend_image(); 1529 error = read_suspend_image();
1387 blkdev_put(resume_bdev); 1530 blkdev_put(resume_bdev);
1531 memset(key_iv, 0, MAXKEY+MAXIV);
1388 1532
1389 if (!error) 1533 if (!error)
1390 pr_debug("swsusp: Reading resume file was successful\n"); 1534 pr_debug("swsusp: Reading resume file was successful\n");
diff --git a/kernel/printk.c b/kernel/printk.c
index 5092397fac29..a967605bc2e3 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...)
514 return r; 514 return r;
515} 515}
516 516
517/* cpu currently holding logbuf_lock */
518static volatile unsigned int printk_cpu = UINT_MAX;
519
517asmlinkage int vprintk(const char *fmt, va_list args) 520asmlinkage int vprintk(const char *fmt, va_list args)
518{ 521{
519 unsigned long flags; 522 unsigned long flags;
@@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args)
522 static char printk_buf[1024]; 525 static char printk_buf[1024];
523 static int log_level_unknown = 1; 526 static int log_level_unknown = 1;
524 527
525 if (unlikely(oops_in_progress)) 528 preempt_disable();
529 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
530 /* If a crash is occurring during printk() on this CPU,
531 * make sure we can't deadlock */
526 zap_locks(); 532 zap_locks();
527 533
528 /* This stops the holder of console_sem just where we want him */ 534 /* This stops the holder of console_sem just where we want him */
529 spin_lock_irqsave(&logbuf_lock, flags); 535 spin_lock_irqsave(&logbuf_lock, flags);
536 printk_cpu = smp_processor_id();
530 537
531 /* Emit the output into the temporary buffer */ 538 /* Emit the output into the temporary buffer */
532 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 539 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
595 * CPU until it is officially up. We shouldn't be calling into 602 * CPU until it is officially up. We shouldn't be calling into
596 * random console drivers on a CPU which doesn't exist yet.. 603 * random console drivers on a CPU which doesn't exist yet..
597 */ 604 */
605 printk_cpu = UINT_MAX;
598 spin_unlock_irqrestore(&logbuf_lock, flags); 606 spin_unlock_irqrestore(&logbuf_lock, flags);
599 goto out; 607 goto out;
600 } 608 }
@@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
604 * We own the drivers. We can drop the spinlock and let 612 * We own the drivers. We can drop the spinlock and let
605 * release_console_sem() print the text 613 * release_console_sem() print the text
606 */ 614 */
615 printk_cpu = UINT_MAX;
607 spin_unlock_irqrestore(&logbuf_lock, flags); 616 spin_unlock_irqrestore(&logbuf_lock, flags);
608 console_may_schedule = 0; 617 console_may_schedule = 0;
609 release_console_sem(); 618 release_console_sem();
@@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
613 * allows the semaphore holder to proceed and to call the 622 * allows the semaphore holder to proceed and to call the
614 * console drivers with the output which we just produced. 623 * console drivers with the output which we just produced.
615 */ 624 */
625 printk_cpu = UINT_MAX;
616 spin_unlock_irqrestore(&logbuf_lock, flags); 626 spin_unlock_irqrestore(&logbuf_lock, flags);
617 } 627 }
618out: 628out:
629 preempt_enable();
619 return printed_len; 630 return printed_len;
620} 631}
621EXPORT_SYMBOL(printk); 632EXPORT_SYMBOL(printk);
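vprintk() now records which CPU holds logbuf_lock in printk_cpu, resets it to UINT_MAX before every unlock, and only calls zap_locks() when an oops is in progress on that same CPU; preempt_disable()/preempt_enable() keep smp_processor_id() stable across the check. The point is to bust the console locks only when this CPU is the one that already holds them (printk() recursing from its own crash), not when some other CPU merely happens to be printing. A small standalone model of that test (identifiers are stand-ins):

```c
#include <stdio.h>
#include <limits.h>

static unsigned int lock_owner_cpu = UINT_MAX;  /* models printk_cpu */
static int oops_in_progress;

static void zap_locks(void) { puts("re-initialise console locks"); }

/* Only break the lock when the crashing CPU is the one that already
 * holds it; otherwise we would corrupt another CPU's critical section. */
static void maybe_zap(unsigned int this_cpu)
{
    if (oops_in_progress && lock_owner_cpu == this_cpu)
        zap_locks();
    else
        puts("take the lock normally");
}

int main(void)
{
    oops_in_progress = 1;

    lock_owner_cpu = 1;
    maybe_zap(0);   /* some other CPU holds it: wait, do not zap */
    maybe_zap(1);   /* we hold it ourselves: recursing, so zap */
    return 0;
}
```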
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8dcb8f6288bc..019e04ec065a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill)
118 return ret; 118 return ret;
119} 119}
120 120
121static int may_attach(struct task_struct *task)
122{
123 if (!task->mm)
124 return -EPERM;
125 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) ||
127 (current->uid != task->uid) ||
128 (current->gid != task->egid) ||
129 (current->gid != task->sgid) ||
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM;
132 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM;
135
136 return security_ptrace(current, task);
137}
138
139int ptrace_may_attach(struct task_struct *task)
140{
141 int err;
142 task_lock(task);
143 err = may_attach(task);
144 task_unlock(task);
145 return !err;
146}
147
121int ptrace_attach(struct task_struct *task) 148int ptrace_attach(struct task_struct *task)
122{ 149{
123 int retval; 150 int retval;
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task)
127 goto bad; 154 goto bad;
128 if (task == current) 155 if (task == current)
129 goto bad; 156 goto bad;
130 if (!task->mm)
131 goto bad;
132 if(((current->uid != task->euid) ||
133 (current->uid != task->suid) ||
134 (current->uid != task->uid) ||
135 (current->gid != task->egid) ||
136 (current->gid != task->sgid) ||
137 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
138 goto bad;
139 smp_rmb();
140 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE))
141 goto bad;
142 /* the same process cannot be attached many times */ 157 /* the same process cannot be attached many times */
143 if (task->ptrace & PT_PTRACED) 158 if (task->ptrace & PT_PTRACED)
144 goto bad; 159 goto bad;
145 retval = security_ptrace(current, task); 160 retval = may_attach(task);
146 if (retval) 161 if (retval)
147 goto bad; 162 goto bad;
148 163
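The ptrace change is a refactor plus one new entry point: the uid/gid, dumpable and security_ptrace() checks move from ptrace_attach() into may_attach(), and ptrace_may_attach() wraps them in task_lock()/task_unlock() and returns a boolean, so other code paths can ask "would ptrace be permitted here?" without duplicating the policy. A compressed standalone model of the one-predicate, two-callers shape (the real check also compares euid/suid/egid/sgid and consults CAP_SYS_PTRACE and the security hook):

```c
#include <stdio.h>

struct task { int uid; int dumpable; };

/* Single policy function, shared by every caller (models may_attach()). */
static int may_attach(const struct task *tracer, const struct task *target)
{
    if (tracer->uid != target->uid && tracer->uid != 0)
        return -1;      /* -EPERM */
    if (!target->dumpable && tracer->uid != 0)
        return -1;
    return 0;
}

/* Boolean wrapper, as ptrace_may_attach() is for permission queries. */
static int ptrace_may_attach(const struct task *tracer, const struct task *target)
{
    return may_attach(tracer, target) == 0;
}

int main(void)
{
    struct task user = { 1000, 1 }, other = { 1001, 1 }, root = { 0, 1 };

    printf("same uid : %d\n", ptrace_may_attach(&user, &user));
    printf("other uid: %d\n", ptrace_may_attach(&user, &other));
    printf("root     : %d\n", ptrace_may_attach(&root, &other));
    return 0;
}
```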
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f436993bd590..bef3b6901b76 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -45,6 +45,7 @@
45#include <linux/percpu.h> 45#include <linux/percpu.h>
46#include <linux/notifier.h> 46#include <linux/notifier.h>
47#include <linux/rcupdate.h> 47#include <linux/rcupdate.h>
48#include <linux/rcuref.h>
48#include <linux/cpu.h> 49#include <linux/cpu.h>
49 50
50/* Definition for rcupdate control block. */ 51/* Definition for rcupdate control block. */
@@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
72static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; 73static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
73static int maxbatch = 10; 74static int maxbatch = 10;
74 75
76#ifndef __HAVE_ARCH_CMPXCHG
77/*
78 * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
79 * 32 bit atomic_t implementations, and a hash function similar to that
80 * for our refcounting needs.
81 * Can't help multiprocessors which do not have cmpxchg :(
82 */
83
84spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
85 [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
86};
87#endif
88
75/** 89/**
76 * call_rcu - Queue an RCU callback for invocation after a grace period. 90 * call_rcu - Queue an RCU callback for invocation after a grace period.
77 * @head: structure to be used for queueing the RCU updates. 91 * @head: structure to be used for queueing the RCU updates.
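The rcupdate.c addition provides the fallback for architectures without a cmpxchg instruction: rcuref operations hash the counter's address into the __rcuref_hash array of RCUREF_HASH_SIZE spinlocks, so a given counter is always serialised by the same lock while unrelated counters rarely contend, the same trick the 32-bit sparc atomic_t code uses. A userspace sketch of the hashed-lock idea, with pthread mutexes standing in for spinlocks and an assumed table size:

```c
#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

#define HASH_SIZE 4     /* stands in for RCUREF_HASH_SIZE; real size is config-dependent */

static pthread_mutex_t hash_lock[HASH_SIZE] = {
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Pick a lock by hashing the counter's address: one counter always maps
 * to one lock, different counters usually map to different locks. */
static pthread_mutex_t *lock_for(void *addr)
{
    return &hash_lock[((uintptr_t)addr >> 4) % HASH_SIZE];
}

static void ref_inc(int *ref)
{
    pthread_mutex_lock(lock_for(ref));
    (*ref)++;
    pthread_mutex_unlock(lock_for(ref));
}

int main(void)
{
    int refcount = 1;

    ref_inc(&refcount);
    printf("refcount = %d\n", refcount);
    return 0;
}
```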
diff --git a/kernel/resource.c b/kernel/resource.c
index 26967e042201..92285d822de6 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource);
430 */ 430 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name)
432{ 432{
433 struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); 433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 434
435 if (res) { 435 if (res) {
436 memset(res, 0, sizeof(*res));
437 res->name = name; 436 res->name = name;
438 res->start = start; 437 res->start = start;
439 res->end = start + n - 1; 438 res->end = start + n - 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5f889d0cbfcc..81b3a96ed2d0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
875 * smp_call_function() if an IPI is sent by the same process we are 875 * smp_call_function() if an IPI is sent by the same process we are
876 * waiting to become inactive. 876 * waiting to become inactive.
877 */ 877 */
878void wait_task_inactive(task_t * p) 878void wait_task_inactive(task_t *p)
879{ 879{
880 unsigned long flags; 880 unsigned long flags;
881 runqueue_t *rq; 881 runqueue_t *rq;
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
966 int local_group; 966 int local_group;
967 int i; 967 int i;
968 968
969 /* Skip over this group if it has no CPUs allowed */
970 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
971 goto nextgroup;
972
969 local_group = cpu_isset(this_cpu, group->cpumask); 973 local_group = cpu_isset(this_cpu, group->cpumask);
970 /* XXX: put a cpus allowed check */
971 974
972 /* Tally up the load of all CPUs in the group */ 975 /* Tally up the load of all CPUs in the group */
973 avg_load = 0; 976 avg_load = 0;
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
992 min_load = avg_load; 995 min_load = avg_load;
993 idlest = group; 996 idlest = group;
994 } 997 }
998nextgroup:
995 group = group->next; 999 group = group->next;
996 } while (group != sd->groups); 1000 } while (group != sd->groups);
997 1001
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1003/* 1007/*
1004 * find_idlest_queue - find the idlest runqueue among the cpus in group. 1008 * find_idlest_queue - find the idlest runqueue among the cpus in group.
1005 */ 1009 */
1006static int find_idlest_cpu(struct sched_group *group, int this_cpu) 1010static int
1011find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1007{ 1012{
1013 cpumask_t tmp;
1008 unsigned long load, min_load = ULONG_MAX; 1014 unsigned long load, min_load = ULONG_MAX;
1009 int idlest = -1; 1015 int idlest = -1;
1010 int i; 1016 int i;
1011 1017
1012 for_each_cpu_mask(i, group->cpumask) { 1018 /* Traverse only the allowed CPUs */
1019 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1020
1021 for_each_cpu_mask(i, tmp) {
1013 load = source_load(i, 0); 1022 load = source_load(i, 0);
1014 1023
1015 if (load < min_load || (load == min_load && i == this_cpu)) { 1024 if (load < min_load || (load == min_load && i == this_cpu)) {
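These two hunks make fork/exec balancing honour p->cpus_allowed: find_idlest_group() skips any group whose cpumask does not intersect the task's allowed mask, and find_idlest_cpu() scans only the intersection computed by cpus_and(), so the chosen CPU can no longer be one the task is forbidden to run on. A standalone model of the masked scan, using a plain bitmask in place of cpumask_t:

```c
#include <stdio.h>

/* Bitmask stand-ins for cpumask_t; cpus_and() is just '&' here. */
static int pick_idlest(unsigned group_mask, unsigned allowed, const int load[])
{
    unsigned tmp = group_mask & allowed;    /* traverse only allowed CPUs */
    int best = -1, best_load = 1 << 30;

    for (int cpu = 0; cpu < 32; cpu++)
        if (((tmp >> cpu) & 1) && load[cpu] < best_load) {
            best_load = load[cpu];
            best = cpu;
        }
    return best;    /* -1 means the whole group was disallowed */
}

int main(void)
{
    int load[32] = { [0] = 10, [1] = 2, [2] = 5, [3] = 1 };

    /* CPU 3 is the idlest overall, but the task may only run on 0-1. */
    printf("chosen cpu = %d\n", pick_idlest(0xF, 0x3, load));
    return 0;
}
```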
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag)
1052 if (!group) 1061 if (!group)
1053 goto nextlevel; 1062 goto nextlevel;
1054 1063
1055 new_cpu = find_idlest_cpu(group, cpu); 1064 new_cpu = find_idlest_cpu(group, t, cpu);
1056 if (new_cpu == -1 || new_cpu == cpu) 1065 if (new_cpu == -1 || new_cpu == cpu)
1057 goto nextlevel; 1066 goto nextlevel;
1058 1067
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p)
1127 * 1136 *
1128 * returns failure only if the task is already active. 1137 * returns failure only if the task is already active.
1129 */ 1138 */
1130static int try_to_wake_up(task_t * p, unsigned int state, int sync) 1139static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1131{ 1140{
1132 int cpu, this_cpu, success = 0; 1141 int cpu, this_cpu, success = 0;
1133 unsigned long flags; 1142 unsigned long flags;
@@ -1252,6 +1261,16 @@ out_activate:
1252 } 1261 }
1253 1262
1254 /* 1263 /*
1264 * Tasks that have marked their sleep as noninteractive get
1265 * woken up without updating their sleep average. (i.e. their
1266 * sleep is handled in a priority-neutral manner, no priority
1267 * boost and no penalty.)
1268 */
1269 if (old_state & TASK_NONINTERACTIVE)
1270 __activate_task(p, rq);
1271 else
1272 activate_task(p, rq, cpu == this_cpu);
1273 /*
1255 * Sync wakeups (i.e. those types of wakeups where the waker 1274 * Sync wakeups (i.e. those types of wakeups where the waker
1256 * has indicated that it will leave the CPU in short order) 1275 * has indicated that it will leave the CPU in short order)
1257 * don't trigger a preemption, if the woken up task will run on 1276 * don't trigger a preemption, if the woken up task will run on
@@ -1259,7 +1278,6 @@ out_activate:
1259 * the waker guarantees that the freshly woken up task is going 1278 * the waker guarantees that the freshly woken up task is going
1260 * to be considered on this CPU.) 1279 * to be considered on this CPU.)
1261 */ 1280 */
1262 activate_task(p, rq, cpu == this_cpu);
1263 if (!sync || cpu != this_cpu) { 1281 if (!sync || cpu != this_cpu) {
1264 if (TASK_PREEMPTS_CURR(p, rq)) 1282 if (TASK_PREEMPTS_CURR(p, rq))
1265 resched_task(rq->curr); 1283 resched_task(rq->curr);
@@ -1274,7 +1292,7 @@ out:
1274 return success; 1292 return success;
1275} 1293}
1276 1294
1277int fastcall wake_up_process(task_t * p) 1295int fastcall wake_up_process(task_t *p)
1278{ 1296{
1279 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1297 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1280 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1298 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1353 * that must be done for every newly created context, then puts the task 1371 * that must be done for every newly created context, then puts the task
1354 * on the runqueue and wakes it. 1372 * on the runqueue and wakes it.
1355 */ 1373 */
1356void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) 1374void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1357{ 1375{
1358 unsigned long flags; 1376 unsigned long flags;
1359 int this_cpu, cpu; 1377 int this_cpu, cpu;
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
1436 * artificially, because any timeslice recovered here 1454 * artificially, because any timeslice recovered here
1437 * was given away by the parent in the first place.) 1455 * was given away by the parent in the first place.)
1438 */ 1456 */
1439void fastcall sched_exit(task_t * p) 1457void fastcall sched_exit(task_t *p)
1440{ 1458{
1441 unsigned long flags; 1459 unsigned long flags;
1442 runqueue_t *rq; 1460 runqueue_t *rq;
@@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1478 1496
1479/** 1497/**
1480 * finish_task_switch - clean up after a task-switch 1498 * finish_task_switch - clean up after a task-switch
1499 * @rq: runqueue associated with task-switch
1481 * @prev: the thread we just switched away from. 1500 * @prev: the thread we just switched away from.
1482 * 1501 *
1483 * finish_task_switch must be called after the context switch, paired 1502 * finish_task_switch must be called after the context switch, paired
@@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1510 * Manfred Spraul <manfred@colorfullife.com> 1529 * Manfred Spraul <manfred@colorfullife.com>
1511 */ 1530 */
1512 prev_task_flags = prev->flags; 1531 prev_task_flags = prev->flags;
1532#ifdef CONFIG_DEBUG_SPINLOCK
1533 /* this is a valid case when another task releases the spinlock */
1534 rq->lock.owner = current;
1535#endif
1513 finish_arch_switch(prev); 1536 finish_arch_switch(prev);
1514 finish_lock_switch(rq, prev); 1537 finish_lock_switch(rq, prev);
1515 if (mm) 1538 if (mm)
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1752 */ 1775 */
1753static inline 1776static inline
1754int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 1777int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1755 struct sched_domain *sd, enum idle_type idle, int *all_pinned) 1778 struct sched_domain *sd, enum idle_type idle,
1779 int *all_pinned)
1756{ 1780{
1757 /* 1781 /*
1758 * We do not migrate tasks that are: 1782 * We do not migrate tasks that are:
@@ -1882,10 +1906,11 @@ out:
1882 */ 1906 */
1883static struct sched_group * 1907static struct sched_group *
1884find_busiest_group(struct sched_domain *sd, int this_cpu, 1908find_busiest_group(struct sched_domain *sd, int this_cpu,
1885 unsigned long *imbalance, enum idle_type idle) 1909 unsigned long *imbalance, enum idle_type idle, int *sd_idle)
1886{ 1910{
1887 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 1911 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1888 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 1912 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1913 unsigned long max_pull;
1889 int load_idx; 1914 int load_idx;
1890 1915
1891 max_load = this_load = total_load = total_pwr = 0; 1916 max_load = this_load = total_load = total_pwr = 0;
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1907 avg_load = 0; 1932 avg_load = 0;
1908 1933
1909 for_each_cpu_mask(i, group->cpumask) { 1934 for_each_cpu_mask(i, group->cpumask) {
1935 if (*sd_idle && !idle_cpu(i))
1936 *sd_idle = 0;
1937
1910 /* Bias balancing toward cpus of our domain */ 1938 /* Bias balancing toward cpus of our domain */
1911 if (local_group) 1939 if (local_group)
1912 load = target_load(i, load_idx); 1940 load = target_load(i, load_idx);
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1932 group = group->next; 1960 group = group->next;
1933 } while (group != sd->groups); 1961 } while (group != sd->groups);
1934 1962
1935 if (!busiest || this_load >= max_load) 1963 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
1936 goto out_balanced; 1964 goto out_balanced;
1937 1965
1938 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 1966 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1952 * by pulling tasks to us. Be careful of negative numbers as they'll 1980 * by pulling tasks to us. Be careful of negative numbers as they'll
1953 * appear as very large values with unsigned longs. 1981 * appear as very large values with unsigned longs.
1954 */ 1982 */
1983
1984 /* Don't want to pull so many tasks that a group would go idle */
1985 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
1986
1955 /* How much load to actually move to equalise the imbalance */ 1987 /* How much load to actually move to equalise the imbalance */
1956 *imbalance = min((max_load - avg_load) * busiest->cpu_power, 1988 *imbalance = min(max_pull * busiest->cpu_power,
1957 (avg_load - this_load) * this->cpu_power) 1989 (avg_load - this_load) * this->cpu_power)
1958 / SCHED_LOAD_SCALE; 1990 / SCHED_LOAD_SCALE;
1959 1991
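The find_busiest_group() hunk caps the requested transfer: besides max_load - avg_load, the pull is now limited to max_load - SCHED_LOAD_SCALE (the new max_load <= SCHED_LOAD_SCALE test above already bails out before this point), so the balancer never asks for so much that the busiest group would be left with less than one CPU's worth of load and go idle in turn. A worked example with made-up load figures showing how the cap changes the computed imbalance:

```c
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL  /* one CPU's worth of load, as in the kernel */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

int main(void)
{
    /* Hypothetical numbers: the busiest group is barely above one CPU of load. */
    unsigned long max_load = 140, avg_load = 60, this_load = 20;
    unsigned long cpu_power = SCHED_LOAD_SCALE;

    unsigned long old_imbalance = min_ul((max_load - avg_load) * cpu_power,
                                         (avg_load - this_load) * cpu_power)
                                  / SCHED_LOAD_SCALE;

    unsigned long max_pull = min_ul(max_load - avg_load,
                                    max_load - SCHED_LOAD_SCALE);
    unsigned long new_imbalance = min_ul(max_pull * cpu_power,
                                         (avg_load - this_load) * cpu_power)
                                  / SCHED_LOAD_SCALE;

    /* Uncapped formula would pull 40; the capped one pulls only 12,
     * leaving the busiest group with at least SCHED_LOAD_SCALE of load. */
    printf("old=%lu new=%lu\n", old_imbalance, new_imbalance);
    return 0;
}
```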
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2050 unsigned long imbalance; 2082 unsigned long imbalance;
2051 int nr_moved, all_pinned = 0; 2083 int nr_moved, all_pinned = 0;
2052 int active_balance = 0; 2084 int active_balance = 0;
2085 int sd_idle = 0;
2086
2087 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
2088 sd_idle = 1;
2053 2089
2054 spin_lock(&this_rq->lock);
2055 schedstat_inc(sd, lb_cnt[idle]); 2090 schedstat_inc(sd, lb_cnt[idle]);
2056 2091
2057 group = find_busiest_group(sd, this_cpu, &imbalance, idle); 2092 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
2058 if (!group) { 2093 if (!group) {
2059 schedstat_inc(sd, lb_nobusyg[idle]); 2094 schedstat_inc(sd, lb_nobusyg[idle]);
2060 goto out_balanced; 2095 goto out_balanced;
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2078 * still unbalanced. nr_moved simply stays zero, so it is 2113 * still unbalanced. nr_moved simply stays zero, so it is
2079 * correctly treated as an imbalance. 2114 * correctly treated as an imbalance.
2080 */ 2115 */
2081 double_lock_balance(this_rq, busiest); 2116 double_rq_lock(this_rq, busiest);
2082 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2117 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2083 imbalance, sd, idle, 2118 imbalance, sd, idle, &all_pinned);
2084 &all_pinned); 2119 double_rq_unlock(this_rq, busiest);
2085 spin_unlock(&busiest->lock);
2086 2120
2087 /* All tasks on this runqueue were pinned by CPU affinity */ 2121 /* All tasks on this runqueue were pinned by CPU affinity */
2088 if (unlikely(all_pinned)) 2122 if (unlikely(all_pinned))
2089 goto out_balanced; 2123 goto out_balanced;
2090 } 2124 }
2091 2125
2092 spin_unlock(&this_rq->lock);
2093
2094 if (!nr_moved) { 2126 if (!nr_moved) {
2095 schedstat_inc(sd, lb_failed[idle]); 2127 schedstat_inc(sd, lb_failed[idle]);
2096 sd->nr_balance_failed++; 2128 sd->nr_balance_failed++;
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2098 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 2130 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
2099 2131
2100 spin_lock(&busiest->lock); 2132 spin_lock(&busiest->lock);
2133
2134 /* don't kick the migration_thread, if the curr
2135 * task on busiest cpu can't be moved to this_cpu
2136 */
2137 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
2138 spin_unlock(&busiest->lock);
2139 all_pinned = 1;
2140 goto out_one_pinned;
2141 }
2142
2101 if (!busiest->active_balance) { 2143 if (!busiest->active_balance) {
2102 busiest->active_balance = 1; 2144 busiest->active_balance = 1;
2103 busiest->push_cpu = this_cpu; 2145 busiest->push_cpu = this_cpu;
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2130 sd->balance_interval *= 2; 2172 sd->balance_interval *= 2;
2131 } 2173 }
2132 2174
2175 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2176 return -1;
2133 return nr_moved; 2177 return nr_moved;
2134 2178
2135out_balanced: 2179out_balanced:
2136 spin_unlock(&this_rq->lock);
2137
2138 schedstat_inc(sd, lb_balanced[idle]); 2180 schedstat_inc(sd, lb_balanced[idle]);
2139 2181
2140 sd->nr_balance_failed = 0; 2182 sd->nr_balance_failed = 0;
2183
2184out_one_pinned:
2141 /* tune up the balancing interval */ 2185 /* tune up the balancing interval */
2142 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 2186 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2143 (sd->balance_interval < sd->max_interval)) 2187 (sd->balance_interval < sd->max_interval))
2144 sd->balance_interval *= 2; 2188 sd->balance_interval *= 2;
2145 2189
2190 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2191 return -1;
2146 return 0; 2192 return 0;
2147} 2193}
2148 2194
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2160 runqueue_t *busiest = NULL; 2206 runqueue_t *busiest = NULL;
2161 unsigned long imbalance; 2207 unsigned long imbalance;
2162 int nr_moved = 0; 2208 int nr_moved = 0;
2209 int sd_idle = 0;
2210
2211 if (sd->flags & SD_SHARE_CPUPOWER)
2212 sd_idle = 1;
2163 2213
2164 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2214 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2165 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); 2215 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
2166 if (!group) { 2216 if (!group) {
2167 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2217 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2168 goto out_balanced; 2218 goto out_balanced;
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2176 2226
2177 BUG_ON(busiest == this_rq); 2227 BUG_ON(busiest == this_rq);
2178 2228
2179 /* Attempt to move tasks */
2180 double_lock_balance(this_rq, busiest);
2181
2182 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); 2229 schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
2183 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2230
2231 nr_moved = 0;
2232 if (busiest->nr_running > 1) {
2233 /* Attempt to move tasks */
2234 double_lock_balance(this_rq, busiest);
2235 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2184 imbalance, sd, NEWLY_IDLE, NULL); 2236 imbalance, sd, NEWLY_IDLE, NULL);
2185 if (!nr_moved) 2237 spin_unlock(&busiest->lock);
2238 }
2239
2240 if (!nr_moved) {
2186 schedstat_inc(sd, lb_failed[NEWLY_IDLE]); 2241 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
2187 else 2242 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2243 return -1;
2244 } else
2188 sd->nr_balance_failed = 0; 2245 sd->nr_balance_failed = 0;
2189 2246
2190 spin_unlock(&busiest->lock);
2191 return nr_moved; 2247 return nr_moved;
2192 2248
2193out_balanced: 2249out_balanced:
2194 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2250 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2251 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
2252 return -1;
2195 sd->nr_balance_failed = 0; 2253 sd->nr_balance_failed = 0;
2196 return 0; 2254 return 0;
2197} 2255}
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2316 2374
2317 if (j - sd->last_balance >= interval) { 2375 if (j - sd->last_balance >= interval) {
2318 if (load_balance(this_cpu, this_rq, sd, idle)) { 2376 if (load_balance(this_cpu, this_rq, sd, idle)) {
2319 /* We've pulled tasks over so no longer idle */ 2377 /*
2378 * We've pulled tasks over so either we're no
2379 * longer idle, or one of our SMT siblings is
2380 * not idle.
2381 */
2320 idle = NOT_IDLE; 2382 idle = NOT_IDLE;
2321 } 2383 }
2322 sd->last_balance += interval; 2384 sd->last_balance += interval;
@@ -2575,6 +2637,13 @@ out:
2575} 2637}
2576 2638
2577#ifdef CONFIG_SCHED_SMT 2639#ifdef CONFIG_SCHED_SMT
2640static inline void wakeup_busy_runqueue(runqueue_t *rq)
2641{
2642 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2643 if (rq->curr == rq->idle && rq->nr_running)
2644 resched_task(rq->idle);
2645}
2646
2578static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 2647static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2579{ 2648{
2580 struct sched_domain *tmp, *sd = NULL; 2649 struct sched_domain *tmp, *sd = NULL;
@@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2608 for_each_cpu_mask(i, sibling_map) { 2677 for_each_cpu_mask(i, sibling_map) {
2609 runqueue_t *smt_rq = cpu_rq(i); 2678 runqueue_t *smt_rq = cpu_rq(i);
2610 2679
2611 /* 2680 wakeup_busy_runqueue(smt_rq);
2612 * If an SMT sibling task is sleeping due to priority
2613 * reasons wake it up now.
2614 */
2615 if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
2616 resched_task(smt_rq->idle);
2617 } 2681 }
2618 2682
2619 for_each_cpu_mask(i, sibling_map) 2683 for_each_cpu_mask(i, sibling_map)
@@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2624 */ 2688 */
2625} 2689}
2626 2690
2691/*
2692 * number of 'lost' timeslices this task won't be able to fully
2693 * utilize, if another task runs on a sibling. This models the
2694 * slowdown effect of other tasks running on siblings:
2695 */
2696static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2697{
2698 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2699}
2700
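/*
 * A minimal user-space sketch of the smt_slice() arithmetic above, using
 * hypothetical values (per_cpu_gain = 25, a 100-tick timeslice); it only
 * models the formula, it is not kernel code.
 */
#include <stdio.h>

static unsigned long model_smt_slice(unsigned long time_slice, int per_cpu_gain)
{
	return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
	/* A sibling running concurrently is assumed to cost per_cpu_gain% of
	 * throughput, so a 100-tick slice is effectively worth 75 ticks. */
	printf("%lu\n", model_smt_slice(100, 25));	/* prints 75 */
	return 0;
}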
2627static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 2701static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2628{ 2702{
2629 struct sched_domain *tmp, *sd = NULL; 2703 struct sched_domain *tmp, *sd = NULL;
@@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2667 runqueue_t *smt_rq = cpu_rq(i); 2741 runqueue_t *smt_rq = cpu_rq(i);
2668 task_t *smt_curr = smt_rq->curr; 2742 task_t *smt_curr = smt_rq->curr;
2669 2743
2744 /* Kernel threads do not participate in dependent sleeping */
2745 if (!p->mm || !smt_curr->mm || rt_task(p))
2746 goto check_smt_task;
2747
2670 /* 2748 /*
2671 * If a user task with lower static priority than the 2749 * If a user task with lower static priority than the
2672 * running task on the SMT sibling is trying to schedule, 2750 * running task on the SMT sibling is trying to schedule,
@@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2675 * task from using an unfair proportion of the 2753 * task from using an unfair proportion of the
2676 * physical cpu's resources. -ck 2754 * physical cpu's resources. -ck
2677 */ 2755 */
2678 if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > 2756 if (rt_task(smt_curr)) {
2679 task_timeslice(p) || rt_task(smt_curr)) && 2757 /*
2680 p->mm && smt_curr->mm && !rt_task(p)) 2758 * With real time tasks we run non-rt tasks only
2681 ret = 1; 2759 * per_cpu_gain% of the time.
2760 */
2761 if ((jiffies % DEF_TIMESLICE) >
2762 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2763 ret = 1;
2764 } else
2765 if (smt_curr->static_prio < p->static_prio &&
2766 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2767 smt_slice(smt_curr, sd) > task_timeslice(p))
2768 ret = 1;
2769
2770check_smt_task:
2771 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2772 rt_task(smt_curr))
2773 continue;
2774 if (!p->mm) {
2775 wakeup_busy_runqueue(smt_rq);
2776 continue;
2777 }
2682 2778
2683 /* 2779 /*
2684 * Reschedule a lower priority task on the SMT sibling, 2780 * Reschedule a lower priority task on the SMT sibling for
2685 * or wake it up if it has been put to sleep for priority 2781 * it to be put to sleep, or wake it up if it has been put to
2686 * reasons. 2782 * sleep for priority reasons to see if it should run now.
2687 */ 2783 */
2688 if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > 2784 if (rt_task(p)) {
2689 task_timeslice(smt_curr) || rt_task(p)) && 2785 if ((jiffies % DEF_TIMESLICE) >
2690 smt_curr->mm && p->mm && !rt_task(smt_curr)) || 2786 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2691 (smt_curr == smt_rq->idle && smt_rq->nr_running)) 2787 resched_task(smt_curr);
2692 resched_task(smt_curr); 2788 } else {
2789 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2790 smt_slice(p, sd) > task_timeslice(smt_curr))
2791 resched_task(smt_curr);
2792 else
2793 wakeup_busy_runqueue(smt_rq);
2794 }
2693 } 2795 }
2694out_unlock: 2796out_unlock:
2695 for_each_cpu_mask(i, sibling_map) 2797 for_each_cpu_mask(i, sibling_map)
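/*
 * A small user-space model of the "run non-rt siblings only per_cpu_gain%
 * of the time" gate used in dependent_sleeper() above. DEF_TIMESLICE and
 * per_cpu_gain are stand-ins with hypothetical values (100 jiffies, 25),
 * not taken from the kernel headers.
 */
#include <stdio.h>

#define MODEL_TIMESLICE	100	/* stands in for DEF_TIMESLICE */

static int throttle_non_rt_sibling(unsigned long jiffies, int per_cpu_gain)
{
	/* Mirrors: (jiffies % DEF_TIMESLICE) > (per_cpu_gain * DEF_TIMESLICE / 100) */
	return (jiffies % MODEL_TIMESLICE) > (per_cpu_gain * MODEL_TIMESLICE / 100);
}

int main(void)
{
	unsigned long j;
	int allowed = 0;

	/* Over one 100-jiffy window the non-rt sibling is left alone for
	 * roughly the first quarter and asked to reschedule for the rest. */
	for (j = 0; j < MODEL_TIMESLICE; j++)
		if (!throttle_non_rt_sibling(j, 25))
			allowed++;
	printf("non-rt sibling runs for %d of %d jiffies\n", allowed, MODEL_TIMESLICE);
	return 0;
}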
@@ -2887,6 +2989,7 @@ switch_tasks:
2887 if (next == rq->idle) 2989 if (next == rq->idle)
2888 schedstat_inc(rq, sched_goidle); 2990 schedstat_inc(rq, sched_goidle);
2889 prefetch(next); 2991 prefetch(next);
2992 prefetch_stack(next);
2890 clear_tsk_need_resched(prev); 2993 clear_tsk_need_resched(prev);
2891 rcu_qsctr_inc(task_cpu(prev)); 2994 rcu_qsctr_inc(task_cpu(prev));
2892 2995
@@ -3014,7 +3117,8 @@ need_resched:
3014 3117
3015#endif /* CONFIG_PREEMPT */ 3118#endif /* CONFIG_PREEMPT */
3016 3119
3017int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) 3120int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3121 void *key)
3018{ 3122{
3019 task_t *p = curr->private; 3123 task_t *p = curr->private;
3020 return try_to_wake_up(p, mode, sync); 3124 return try_to_wake_up(p, mode, sync);
@@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3056 * @key: is directly passed to the wakeup function 3160 * @key: is directly passed to the wakeup function
3057 */ 3161 */
3058void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, 3162void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3059 int nr_exclusive, void *key) 3163 int nr_exclusive, void *key)
3060{ 3164{
3061 unsigned long flags; 3165 unsigned long flags;
3062 3166
@@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3088 * 3192 *
3089 * On UP it can prevent extra preemption. 3193 * On UP it can prevent extra preemption.
3090 */ 3194 */
3091void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 3195void fastcall
3196__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
3092{ 3197{
3093 unsigned long flags; 3198 unsigned long flags;
3094 int sync = 1; 3199 int sync = 1;
@@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3279 3384
3280EXPORT_SYMBOL(interruptible_sleep_on); 3385EXPORT_SYMBOL(interruptible_sleep_on);
3281 3386
3282long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) 3387long fastcall __sched
3388interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3283{ 3389{
3284 SLEEP_ON_VAR 3390 SLEEP_ON_VAR
3285 3391
@@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3498 * @policy: new policy. 3604 * @policy: new policy.
3499 * @param: structure containing the new RT priority. 3605 * @param: structure containing the new RT priority.
3500 */ 3606 */
3501int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) 3607int sched_setscheduler(struct task_struct *p, int policy,
3608 struct sched_param *param)
3502{ 3609{
3503 int retval; 3610 int retval;
3504 int oldprio, oldpolicy = -1; 3611 int oldprio, oldpolicy = -1;
@@ -3518,7 +3625,7 @@ recheck:
3518 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. 3625 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
3519 */ 3626 */
3520 if (param->sched_priority < 0 || 3627 if (param->sched_priority < 0 ||
3521 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 3628 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3522 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 3629 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3523 return -EINVAL; 3630 return -EINVAL;
3524 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) 3631 if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
@@ -3581,7 +3688,8 @@ recheck:
3581} 3688}
3582EXPORT_SYMBOL_GPL(sched_setscheduler); 3689EXPORT_SYMBOL_GPL(sched_setscheduler);
3583 3690
3584static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 3691static int
3692do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3585{ 3693{
3586 int retval; 3694 int retval;
3587 struct sched_param lparam; 3695 struct sched_param lparam;
@@ -3848,7 +3956,7 @@ asmlinkage long sys_sched_yield(void)
3848 if (rt_task(current)) 3956 if (rt_task(current))
3849 target = rq->active; 3957 target = rq->active;
3850 3958
3851 if (current->array->nr_active == 1) { 3959 if (array->nr_active == 1) {
3852 schedstat_inc(rq, yld_act_empty); 3960 schedstat_inc(rq, yld_act_empty);
3853 if (!rq->expired->nr_active) 3961 if (!rq->expired->nr_active)
3854 schedstat_inc(rq, yld_both_empty); 3962 schedstat_inc(rq, yld_both_empty);
@@ -3912,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched);
3912 * operations here to prevent schedule() from being called twice (once via 4020 * operations here to prevent schedule() from being called twice (once via
3913 * spin_unlock(), once by hand). 4021 * spin_unlock(), once by hand).
3914 */ 4022 */
3915int cond_resched_lock(spinlock_t * lock) 4023int cond_resched_lock(spinlock_t *lock)
3916{ 4024{
3917 int ret = 0; 4025 int ret = 0;
3918 4026
@@ -4095,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
4095 return list_entry(p->sibling.next,struct task_struct,sibling); 4203 return list_entry(p->sibling.next,struct task_struct,sibling);
4096} 4204}
4097 4205
4098static void show_task(task_t * p) 4206static void show_task(task_t *p)
4099{ 4207{
4100 task_t *relative; 4208 task_t *relative;
4101 unsigned state; 4209 unsigned state;
@@ -4121,7 +4229,7 @@ static void show_task(task_t * p)
4121#endif 4229#endif
4122#ifdef CONFIG_DEBUG_STACK_USAGE 4230#ifdef CONFIG_DEBUG_STACK_USAGE
4123 { 4231 {
4124 unsigned long * n = (unsigned long *) (p->thread_info+1); 4232 unsigned long *n = (unsigned long *) (p->thread_info+1);
4125 while (!*n) 4233 while (!*n)
4126 n++; 4234 n++;
4127 free = (unsigned long) n - (unsigned long)(p->thread_info+1); 4235 free = (unsigned long) n - (unsigned long)(p->thread_info+1);
@@ -4330,7 +4438,7 @@ out:
4330 * thread migration by bumping thread off CPU then 'pushing' onto 4438 * thread migration by bumping thread off CPU then 'pushing' onto
4331 * another runqueue. 4439 * another runqueue.
4332 */ 4440 */
4333static int migration_thread(void * data) 4441static int migration_thread(void *data)
4334{ 4442{
4335 runqueue_t *rq; 4443 runqueue_t *rq;
4336 int cpu = (long)data; 4444 int cpu = (long)data;
@@ -4779,7 +4887,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4779 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 4887 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
4780 * hold the hotplug lock. 4888 * hold the hotplug lock.
4781 */ 4889 */
4782void cpu_attach_domain(struct sched_domain *sd, int cpu) 4890static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4783{ 4891{
4784 runqueue_t *rq = cpu_rq(cpu); 4892 runqueue_t *rq = cpu_rq(cpu);
4785 struct sched_domain *tmp; 4893 struct sched_domain *tmp;
@@ -4802,7 +4910,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu)
4802} 4910}
4803 4911
4804/* cpus with isolated domains */ 4912/* cpus with isolated domains */
4805cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; 4913static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
4806 4914
4807/* Setup the mask of cpus configured for isolated domains */ 4915/* Setup the mask of cpus configured for isolated domains */
4808static int __init isolated_cpu_setup(char *str) 4916static int __init isolated_cpu_setup(char *str)
@@ -4830,8 +4938,8 @@ __setup ("isolcpus=", isolated_cpu_setup);
4830 * covered by the given span, and will set each group's ->cpumask correctly, 4938 * covered by the given span, and will set each group's ->cpumask correctly,
4831 * and ->cpu_power to 0. 4939 * and ->cpu_power to 0.
4832 */ 4940 */
4833void init_sched_build_groups(struct sched_group groups[], 4941static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
4834 cpumask_t span, int (*group_fn)(int cpu)) 4942 int (*group_fn)(int cpu))
4835{ 4943{
4836 struct sched_group *first = NULL, *last = NULL; 4944 struct sched_group *first = NULL, *last = NULL;
4837 cpumask_t covered = CPU_MASK_NONE; 4945 cpumask_t covered = CPU_MASK_NONE;
@@ -4864,12 +4972,85 @@ void init_sched_build_groups(struct sched_group groups[],
4864 last->next = first; 4972 last->next = first;
4865} 4973}
4866 4974
4975#define SD_NODES_PER_DOMAIN 16
4867 4976
4868#ifdef ARCH_HAS_SCHED_DOMAIN 4977#ifdef CONFIG_NUMA
4869extern void build_sched_domains(const cpumask_t *cpu_map); 4978/**
4870extern void arch_init_sched_domains(const cpumask_t *cpu_map); 4979 * find_next_best_node - find the next node to include in a sched_domain
4871extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); 4980 * @node: node whose sched_domain we're building
4872#else 4981 * @used_nodes: nodes already in the sched_domain
4982 *
4983 * Find the next node to include in a given scheduling domain. Simply
4984 * finds the closest node not already in the @used_nodes map.
4985 *
4986 * Should use nodemask_t.
4987 */
4988static int find_next_best_node(int node, unsigned long *used_nodes)
4989{
4990 int i, n, val, min_val, best_node = 0;
4991
4992 min_val = INT_MAX;
4993
4994 for (i = 0; i < MAX_NUMNODES; i++) {
4995 /* Start at @node */
4996 n = (node + i) % MAX_NUMNODES;
4997
4998 if (!nr_cpus_node(n))
4999 continue;
5000
5001 /* Skip already used nodes */
5002 if (test_bit(n, used_nodes))
5003 continue;
5004
5005 /* Simple min distance search */
5006 val = node_distance(node, n);
5007
5008 if (val < min_val) {
5009 min_val = val;
5010 best_node = n;
5011 }
5012 }
5013
5014 set_bit(best_node, used_nodes);
5015 return best_node;
5016}
5017
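/*
 * A user-space sketch of the greedy nearest-node search performed by
 * find_next_best_node() above, over a made-up 4-node distance matrix;
 * the distances are purely illustrative.
 */
#include <stdio.h>
#include <limits.h>

#define NODES 4

static const int node_dist[NODES][NODES] = {
	{ 10, 20, 40, 40 },
	{ 20, 10, 40, 40 },
	{ 40, 40, 10, 20 },
	{ 40, 40, 20, 10 },
};

static int next_best_node(int node, int used[NODES])
{
	int i, best = -1, min_val = INT_MAX;

	for (i = 0; i < NODES; i++) {
		int n = (node + i) % NODES;	/* start the scan at @node */

		if (used[n])
			continue;
		if (node_dist[node][n] < min_val) {
			min_val = node_dist[node][n];
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[NODES] = { 1, 0, 0, 0 };	/* building node 0's span */
	int i;

	printf("span order for node 0: 0");
	for (i = 1; i < NODES; i++)
		printf(" %d", next_best_node(0, used));
	printf("\n");	/* 0 1 2 3: closest nodes are added first */
	return 0;
}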
5018/**
5019 * sched_domain_node_span - get a cpumask for a node's sched_domain
5020 * @node: node whose cpumask we're constructing
5021 * @size: number of nodes to include in this span
5022 *
5023 * Given a node, construct a good cpumask for its sched_domain to span. It
5024 * should be one that prevents unnecessary balancing, but also spreads tasks
5025 * out optimally.
5026 */
5027static cpumask_t sched_domain_node_span(int node)
5028{
5029 int i;
5030 cpumask_t span, nodemask;
5031 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
5032
5033 cpus_clear(span);
5034 bitmap_zero(used_nodes, MAX_NUMNODES);
5035
5036 nodemask = node_to_cpumask(node);
5037 cpus_or(span, span, nodemask);
5038 set_bit(node, used_nodes);
5039
5040 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5041 int next_node = find_next_best_node(node, used_nodes);
5042 nodemask = node_to_cpumask(next_node);
5043 cpus_or(span, span, nodemask);
5044 }
5045
5046 return span;
5047}
5048#endif
5049
5050/*
5051 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5052 * can switch it on easily if needed.
5053 */
4873#ifdef CONFIG_SCHED_SMT 5054#ifdef CONFIG_SCHED_SMT
4874static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 5055static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
4875static struct sched_group sched_group_cpus[NR_CPUS]; 5056static struct sched_group sched_group_cpus[NR_CPUS];
@@ -4891,36 +5072,20 @@ static int cpu_to_phys_group(int cpu)
4891} 5072}
4892 5073
4893#ifdef CONFIG_NUMA 5074#ifdef CONFIG_NUMA
4894
4895static DEFINE_PER_CPU(struct sched_domain, node_domains);
4896static struct sched_group sched_group_nodes[MAX_NUMNODES];
4897static int cpu_to_node_group(int cpu)
4898{
4899 return cpu_to_node(cpu);
4900}
4901#endif
4902
4903#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
4904/* 5075/*
4905 * The domains setup code relies on siblings not spanning 5076 * The init_sched_build_groups can't handle what we want to do with node
4906 * multiple nodes. Make sure the architecture has a proper 5077 * groups, so roll our own. Now each node has its own list of groups which
4907 * siblings map: 5078 * gets dynamically allocated.
4908 */ 5079 */
4909static void check_sibling_maps(void) 5080static DEFINE_PER_CPU(struct sched_domain, node_domains);
4910{ 5081static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
4911 int i, j;
4912 5082
4913 for_each_online_cpu(i) { 5083static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
4914 for_each_cpu_mask(j, cpu_sibling_map[i]) { 5084static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
4915 if (cpu_to_node(i) != cpu_to_node(j)) { 5085
4916 printk(KERN_INFO "warning: CPU %d siblings map " 5086static int cpu_to_allnodes_group(int cpu)
4917 "to different node - isolating " 5087{
4918 "them.\n", i); 5088 return cpu_to_node(cpu);
4919 cpu_sibling_map[i] = cpumask_of_cpu(i);
4920 break;
4921 }
4922 }
4923 }
4924} 5089}
4925#endif 5090#endif
4926 5091
@@ -4928,9 +5093,24 @@ static void check_sibling_maps(void)
4928 * Build sched domains for a given set of cpus and attach the sched domains 5093 * Build sched domains for a given set of cpus and attach the sched domains
4929 * to the individual cpus 5094 * to the individual cpus
4930 */ 5095 */
4931static void build_sched_domains(const cpumask_t *cpu_map) 5096void build_sched_domains(const cpumask_t *cpu_map)
4932{ 5097{
4933 int i; 5098 int i;
5099#ifdef CONFIG_NUMA
5100 struct sched_group **sched_group_nodes = NULL;
5101 struct sched_group *sched_group_allnodes = NULL;
5102
5103 /*
5104 * Allocate the per-node list of sched groups
5105 */
5106 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5107 GFP_ATOMIC);
5108 if (!sched_group_nodes) {
5109 printk(KERN_WARNING "Can not alloc sched group node list\n");
5110 return;
5111 }
5112 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5113#endif
4934 5114
4935 /* 5115 /*
4936 * Set up domains for cpus specified by the cpu_map. 5116 * Set up domains for cpus specified by the cpu_map.
@@ -4943,11 +5123,35 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4943 cpus_and(nodemask, nodemask, *cpu_map); 5123 cpus_and(nodemask, nodemask, *cpu_map);
4944 5124
4945#ifdef CONFIG_NUMA 5125#ifdef CONFIG_NUMA
5126 if (cpus_weight(*cpu_map)
5127 > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
5128 if (!sched_group_allnodes) {
5129 sched_group_allnodes
5130 = kmalloc(sizeof(struct sched_group)
5131 * MAX_NUMNODES,
5132 GFP_KERNEL);
5133 if (!sched_group_allnodes) {
5134 printk(KERN_WARNING
5135 "Can not alloc allnodes sched group\n");
5136 break;
5137 }
5138 sched_group_allnodes_bycpu[i]
5139 = sched_group_allnodes;
5140 }
5141 sd = &per_cpu(allnodes_domains, i);
5142 *sd = SD_ALLNODES_INIT;
5143 sd->span = *cpu_map;
5144 group = cpu_to_allnodes_group(i);
5145 sd->groups = &sched_group_allnodes[group];
5146 p = sd;
5147 } else
5148 p = NULL;
5149
4946 sd = &per_cpu(node_domains, i); 5150 sd = &per_cpu(node_domains, i);
4947 group = cpu_to_node_group(i);
4948 *sd = SD_NODE_INIT; 5151 *sd = SD_NODE_INIT;
4949 sd->span = *cpu_map; 5152 sd->span = sched_domain_node_span(cpu_to_node(i));
4950 sd->groups = &sched_group_nodes[group]; 5153 sd->parent = p;
5154 cpus_and(sd->span, sd->span, *cpu_map);
4951#endif 5155#endif
4952 5156
4953 p = sd; 5157 p = sd;
@@ -4972,7 +5176,7 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4972 5176
4973#ifdef CONFIG_SCHED_SMT 5177#ifdef CONFIG_SCHED_SMT
4974 /* Set up CPU (sibling) groups */ 5178 /* Set up CPU (sibling) groups */
4975 for_each_online_cpu(i) { 5179 for_each_cpu_mask(i, *cpu_map) {
4976 cpumask_t this_sibling_map = cpu_sibling_map[i]; 5180 cpumask_t this_sibling_map = cpu_sibling_map[i];
4977 cpus_and(this_sibling_map, this_sibling_map, *cpu_map); 5181 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
4978 if (i != first_cpu(this_sibling_map)) 5182 if (i != first_cpu(this_sibling_map))
@@ -4997,8 +5201,77 @@ static void build_sched_domains(const cpumask_t *cpu_map)
4997 5201
4998#ifdef CONFIG_NUMA 5202#ifdef CONFIG_NUMA
4999 /* Set up node groups */ 5203 /* Set up node groups */
5000 init_sched_build_groups(sched_group_nodes, *cpu_map, 5204 if (sched_group_allnodes)
5001 &cpu_to_node_group); 5205 init_sched_build_groups(sched_group_allnodes, *cpu_map,
5206 &cpu_to_allnodes_group);
5207
5208 for (i = 0; i < MAX_NUMNODES; i++) {
5209 /* Set up node groups */
5210 struct sched_group *sg, *prev;
5211 cpumask_t nodemask = node_to_cpumask(i);
5212 cpumask_t domainspan;
5213 cpumask_t covered = CPU_MASK_NONE;
5214 int j;
5215
5216 cpus_and(nodemask, nodemask, *cpu_map);
5217 if (cpus_empty(nodemask)) {
5218 sched_group_nodes[i] = NULL;
5219 continue;
5220 }
5221
5222 domainspan = sched_domain_node_span(i);
5223 cpus_and(domainspan, domainspan, *cpu_map);
5224
5225 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5226 sched_group_nodes[i] = sg;
5227 for_each_cpu_mask(j, nodemask) {
5228 struct sched_domain *sd;
5229 sd = &per_cpu(node_domains, j);
5230 sd->groups = sg;
5231 if (sd->groups == NULL) {
5232 /* Turn off balancing if we have no groups */
5233 sd->flags = 0;
5234 }
5235 }
5236 if (!sg) {
5237 printk(KERN_WARNING
5238 "Can not alloc domain group for node %d\n", i);
5239 continue;
5240 }
5241 sg->cpu_power = 0;
5242 sg->cpumask = nodemask;
5243 cpus_or(covered, covered, nodemask);
5244 prev = sg;
5245
5246 for (j = 0; j < MAX_NUMNODES; j++) {
5247 cpumask_t tmp, notcovered;
5248 int n = (i + j) % MAX_NUMNODES;
5249
5250 cpus_complement(notcovered, covered);
5251 cpus_and(tmp, notcovered, *cpu_map);
5252 cpus_and(tmp, tmp, domainspan);
5253 if (cpus_empty(tmp))
5254 break;
5255
5256 nodemask = node_to_cpumask(n);
5257 cpus_and(tmp, tmp, nodemask);
5258 if (cpus_empty(tmp))
5259 continue;
5260
5261 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
5262 if (!sg) {
5263 printk(KERN_WARNING
5264 "Can not alloc domain group for node %d\n", j);
5265 break;
5266 }
5267 sg->cpu_power = 0;
5268 sg->cpumask = tmp;
5269 cpus_or(covered, covered, tmp);
5270 prev->next = sg;
5271 prev = sg;
5272 }
5273 prev->next = sched_group_nodes[i];
5274 }
5002#endif 5275#endif
5003 5276
5004 /* Calculate CPU power for physical packages and nodes */ 5277 /* Calculate CPU power for physical packages and nodes */
@@ -5017,14 +5290,46 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5017 sd->groups->cpu_power = power; 5290 sd->groups->cpu_power = power;
5018 5291
5019#ifdef CONFIG_NUMA 5292#ifdef CONFIG_NUMA
5020 if (i == first_cpu(sd->groups->cpumask)) { 5293 sd = &per_cpu(allnodes_domains, i);
5021 /* Only add "power" once for each physical package. */ 5294 if (sd->groups) {
5022 sd = &per_cpu(node_domains, i); 5295 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5023 sd->groups->cpu_power += power; 5296 (cpus_weight(sd->groups->cpumask)-1) / 10;
5297 sd->groups->cpu_power = power;
5024 } 5298 }
5025#endif 5299#endif
5026 } 5300 }
5027 5301
5302#ifdef CONFIG_NUMA
5303 for (i = 0; i < MAX_NUMNODES; i++) {
5304 struct sched_group *sg = sched_group_nodes[i];
5305 int j;
5306
5307 if (sg == NULL)
5308 continue;
5309next_sg:
5310 for_each_cpu_mask(j, sg->cpumask) {
5311 struct sched_domain *sd;
5312 int power;
5313
5314 sd = &per_cpu(phys_domains, j);
5315 if (j != first_cpu(sd->groups->cpumask)) {
5316 /*
5317 * Only add "power" once for each
5318 * physical package.
5319 */
5320 continue;
5321 }
5322 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
5323 (cpus_weight(sd->groups->cpumask)-1) / 10;
5324
5325 sg->cpu_power += power;
5326 }
5327 sg = sg->next;
5328 if (sg != sched_group_nodes[i])
5329 goto next_sg;
5330 }
5331#endif
5332
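/*
 * Worked example of the cpu_power formula used above:
 * power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (weight - 1) / 10.
 * SCHED_LOAD_SCALE is assumed to be 128 here and the group size is invented.
 */
#include <stdio.h>

#define MODEL_LOAD_SCALE 128	/* stands in for SCHED_LOAD_SCALE */

int main(void)
{
	int weight = 4;	/* cpus_weight(sd->groups->cpumask), hypothetical */
	int power = MODEL_LOAD_SCALE + MODEL_LOAD_SCALE * (weight - 1) / 10;

	printf("group cpu_power = %d\n", power);	/* 128 + 128*3/10 = 166 */
	return 0;
}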
5028 /* Attach the domains */ 5333 /* Attach the domains */
5029 for_each_cpu_mask(i, *cpu_map) { 5334 for_each_cpu_mask(i, *cpu_map) {
5030 struct sched_domain *sd; 5335 struct sched_domain *sd;
@@ -5039,13 +5344,10 @@ static void build_sched_domains(const cpumask_t *cpu_map)
5039/* 5344/*
5040 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 5345 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5041 */ 5346 */
5042static void arch_init_sched_domains(cpumask_t *cpu_map) 5347static void arch_init_sched_domains(const cpumask_t *cpu_map)
5043{ 5348{
5044 cpumask_t cpu_default_map; 5349 cpumask_t cpu_default_map;
5045 5350
5046#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
5047 check_sibling_maps();
5048#endif
5049 /* 5351 /*
5050 * Setup mask for cpus without special case scheduling requirements. 5352 * Setup mask for cpus without special case scheduling requirements.
5051 * For now this just excludes isolated cpus, but could be used to 5353 * For now this just excludes isolated cpus, but could be used to
@@ -5058,10 +5360,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map)
5058 5360
5059static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 5361static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5060{ 5362{
5061 /* Do nothing: everything is statically allocated. */ 5363#ifdef CONFIG_NUMA
5062} 5364 int i;
5365 int cpu;
5063 5366
5064#endif /* ARCH_HAS_SCHED_DOMAIN */ 5367 for_each_cpu_mask(cpu, *cpu_map) {
5368 struct sched_group *sched_group_allnodes
5369 = sched_group_allnodes_bycpu[cpu];
5370 struct sched_group **sched_group_nodes
5371 = sched_group_nodes_bycpu[cpu];
5372
5373 if (sched_group_allnodes) {
5374 kfree(sched_group_allnodes);
5375 sched_group_allnodes_bycpu[cpu] = NULL;
5376 }
5377
5378 if (!sched_group_nodes)
5379 continue;
5380
5381 for (i = 0; i < MAX_NUMNODES; i++) {
5382 cpumask_t nodemask = node_to_cpumask(i);
5383 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5384
5385 cpus_and(nodemask, nodemask, *cpu_map);
5386 if (cpus_empty(nodemask))
5387 continue;
5388
5389 if (sg == NULL)
5390 continue;
5391 sg = sg->next;
5392next_sg:
5393 oldsg = sg;
5394 sg = sg->next;
5395 kfree(oldsg);
5396 if (oldsg != sched_group_nodes[i])
5397 goto next_sg;
5398 }
5399 kfree(sched_group_nodes);
5400 sched_group_nodes_bycpu[cpu] = NULL;
5401 }
5402#endif
5403}
5065 5404
5066/* 5405/*
5067 * Detach sched domains from a group of cpus specified in cpu_map 5406 * Detach sched domains from a group of cpus specified in cpu_map
@@ -5263,3 +5602,47 @@ void normalize_rt_tasks(void)
5263} 5602}
5264 5603
5265#endif /* CONFIG_MAGIC_SYSRQ */ 5604#endif /* CONFIG_MAGIC_SYSRQ */
5605
5606#ifdef CONFIG_IA64
5607/*
5608 * These functions are only useful for the IA64 MCA handling.
5609 *
5610 * They can only be called when the whole system has been
5611 * stopped - every CPU needs to be quiescent, and no scheduling
5612 * activity can take place. Using them for anything else would
5613 * be a serious bug, and as a result, they aren't even visible
5614 * under any other configuration.
5615 */
5616
5617/**
5618 * curr_task - return the current task for a given cpu.
5619 * @cpu: the processor in question.
5620 *
5621 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5622 */
5623task_t *curr_task(int cpu)
5624{
5625 return cpu_curr(cpu);
5626}
5627
5628/**
5629 * set_curr_task - set the current task for a given cpu.
5630 * @cpu: the processor in question.
5631 * @p: the task pointer to set.
5632 *
5633 * Description: This function must only be used when non-maskable interrupts
5634 * are serviced on a separate stack. It allows the architecture to switch the
5635 * notion of the current task on a cpu in a non-blocking manner. This function
5636 * must be called with all CPUs synchronized and interrupts disabled; the
5637 * caller must save the original value of the current task (see
5638 * curr_task() above) and restore that value before reenabling interrupts and
5639 * re-starting the system.
5640 *
5641 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
5642 */
5643void set_curr_task(int cpu, task_t *p)
5644{
5645 cpu_curr(cpu) = p;
5646}
5647
5648#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index d282fea81138..b92c3c9f8b9a 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
678 678
679/* forward decl */ 679/* forward decl */
680static void do_notify_parent_cldstop(struct task_struct *tsk, 680static void do_notify_parent_cldstop(struct task_struct *tsk,
681 struct task_struct *parent, 681 int to_self,
682 int why); 682 int why);
683 683
684/* 684/*
@@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
729 p->signal->group_stop_count = 0; 729 p->signal->group_stop_count = 0;
730 p->signal->flags = SIGNAL_STOP_CONTINUED; 730 p->signal->flags = SIGNAL_STOP_CONTINUED;
731 spin_unlock(&p->sighand->siglock); 731 spin_unlock(&p->sighand->siglock);
732 if (p->ptrace & PT_PTRACED) 732 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED);
733 do_notify_parent_cldstop(p, p->parent,
734 CLD_STOPPED);
735 else
736 do_notify_parent_cldstop(
737 p->group_leader,
738 p->group_leader->real_parent,
739 CLD_STOPPED);
740 spin_lock(&p->sighand->siglock); 733 spin_lock(&p->sighand->siglock);
741 } 734 }
742 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); 735 rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending);
@@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
777 p->signal->flags = SIGNAL_STOP_CONTINUED; 770 p->signal->flags = SIGNAL_STOP_CONTINUED;
778 p->signal->group_exit_code = 0; 771 p->signal->group_exit_code = 0;
779 spin_unlock(&p->sighand->siglock); 772 spin_unlock(&p->sighand->siglock);
780 if (p->ptrace & PT_PTRACED) 773 do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED);
781 do_notify_parent_cldstop(p, p->parent,
782 CLD_CONTINUED);
783 else
784 do_notify_parent_cldstop(
785 p->group_leader,
786 p->group_leader->real_parent,
787 CLD_CONTINUED);
788 spin_lock(&p->sighand->siglock); 774 spin_lock(&p->sighand->siglock);
789 } else { 775 } else {
790 /* 776 /*
@@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1380 unsigned long flags; 1366 unsigned long flags;
1381 int ret = 0; 1367 int ret = 0;
1382 1368
1383 /*
1384 * We need the tasklist lock even for the specific
1385 * thread case (when we don't need to follow the group
1386 * lists) in order to avoid races with "p->sighand"
1387 * going away or changing from under us.
1388 */
1389 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1369 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1390 read_lock(&tasklist_lock); 1370 read_lock(&tasklist_lock);
1371
1372 if (unlikely(p->flags & PF_EXITING)) {
1373 ret = -1;
1374 goto out_err;
1375 }
1376
1391 spin_lock_irqsave(&p->sighand->siglock, flags); 1377 spin_lock_irqsave(&p->sighand->siglock, flags);
1392 1378
1393 if (unlikely(!list_empty(&q->list))) { 1379 if (unlikely(!list_empty(&q->list))) {
1394 /* 1380 /*
1395 * If an SI_TIMER entry is already queued just increment 1381 * If an SI_TIMER entry is already queued just increment
@@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1399 BUG(); 1385 BUG();
1400 q->info.si_overrun++; 1386 q->info.si_overrun++;
1401 goto out; 1387 goto out;
1402 } 1388 }
1403 /* Short-circuit ignored signals. */ 1389 /* Short-circuit ignored signals. */
1404 if (sig_ignored(p, sig)) { 1390 if (sig_ignored(p, sig)) {
1405 ret = 1; 1391 ret = 1;
@@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
1414 1400
1415out: 1401out:
1416 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1402 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1403out_err:
1417 read_unlock(&tasklist_lock); 1404 read_unlock(&tasklist_lock);
1418 return(ret); 1405
1406 return ret;
1419} 1407}
1420 1408
1421int 1409int
@@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1542 spin_unlock_irqrestore(&psig->siglock, flags); 1530 spin_unlock_irqrestore(&psig->siglock, flags);
1543} 1531}
1544 1532
1545static void 1533static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why)
1546do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent,
1547 int why)
1548{ 1534{
1549 struct siginfo info; 1535 struct siginfo info;
1550 unsigned long flags; 1536 unsigned long flags;
1537 struct task_struct *parent;
1551 struct sighand_struct *sighand; 1538 struct sighand_struct *sighand;
1552 1539
1540 if (to_self)
1541 parent = tsk->parent;
1542 else {
1543 tsk = tsk->group_leader;
1544 parent = tsk->real_parent;
1545 }
1546
1553 info.si_signo = SIGCHLD; 1547 info.si_signo = SIGCHLD;
1554 info.si_errno = 0; 1548 info.si_errno = 0;
1555 info.si_pid = tsk->pid; 1549 info.si_pid = tsk->pid;
@@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1618 !(current->ptrace & PT_ATTACHED)) && 1612 !(current->ptrace & PT_ATTACHED)) &&
1619 (likely(current->parent->signal != current->signal) || 1613 (likely(current->parent->signal != current->signal) ||
1620 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { 1614 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1621 do_notify_parent_cldstop(current, current->parent, 1615 do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
1622 CLD_TRAPPED);
1623 read_unlock(&tasklist_lock); 1616 read_unlock(&tasklist_lock);
1624 schedule(); 1617 schedule();
1625 } else { 1618 } else {
@@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code)
1668static void 1661static void
1669finish_stop(int stop_count) 1662finish_stop(int stop_count)
1670{ 1663{
1664 int to_self;
1665
1671 /* 1666 /*
1672 * If there are no other threads in the group, or if there is 1667 * If there are no other threads in the group, or if there is
1673 * a group stop in progress and we are the last to stop, 1668 * a group stop in progress and we are the last to stop,
1674 * report to the parent. When ptraced, every thread reports itself. 1669 * report to the parent. When ptraced, every thread reports itself.
1675 */ 1670 */
1676 if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { 1671 if (stop_count < 0 || (current->ptrace & PT_PTRACED))
1677 read_lock(&tasklist_lock); 1672 to_self = 1;
1678 do_notify_parent_cldstop(current, current->parent, 1673 else if (stop_count == 0)
1679 CLD_STOPPED); 1674 to_self = 0;
1680 read_unlock(&tasklist_lock); 1675 else
1681 } 1676 goto out;
1682 else if (stop_count == 0) {
1683 read_lock(&tasklist_lock);
1684 do_notify_parent_cldstop(current->group_leader,
1685 current->group_leader->real_parent,
1686 CLD_STOPPED);
1687 read_unlock(&tasklist_lock);
1688 }
1689 1677
1678 read_lock(&tasklist_lock);
1679 do_notify_parent_cldstop(current, to_self, CLD_STOPPED);
1680 read_unlock(&tasklist_lock);
1681
1682out:
1690 schedule(); 1683 schedule();
1691 /* 1684 /*
1692 * Now we don't run again until continued. 1685 * Now we don't run again until continued.
@@ -2228,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2228 recalc_sigpending(); 2221 recalc_sigpending();
2229 spin_unlock_irq(&current->sighand->siglock); 2222 spin_unlock_irq(&current->sighand->siglock);
2230 2223
2231 current->state = TASK_INTERRUPTIBLE; 2224 timeout = schedule_timeout_interruptible(timeout);
2232 timeout = schedule_timeout(timeout);
2233 2225
2234 try_to_freeze(); 2226 try_to_freeze();
2235 spin_lock_irq(&current->sighand->siglock); 2227 spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4ab6af1dea8..f766b2fc48be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void)
84 cpu = smp_processor_id(); 84 cpu = smp_processor_id();
85restart: 85restart:
86 /* Reset the pending bitmask before enabling irqs */ 86 /* Reset the pending bitmask before enabling irqs */
87 local_softirq_pending() = 0; 87 set_softirq_pending(0);
88 88
89 local_irq_enable(); 89 local_irq_enable();
90 90
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
1/*
2 * Detect Soft Lockups
3 *
4 * started by Ingo Molnar, (C) 2005, Red Hat
5 *
 6 * this code detects soft lockups: incidents where, on a CPU,
7 * the kernel does not reschedule for 10 seconds or more.
8 */
9
10#include <linux/mm.h>
11#include <linux/cpu.h>
12#include <linux/init.h>
13#include <linux/delay.h>
14#include <linux/kthread.h>
15#include <linux/notifier.h>
16#include <linux/module.h>
17
18static DEFINE_SPINLOCK(print_lock);
19
20static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
21static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
22static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
23
24static int did_panic = 0;
25static int softlock_panic(struct notifier_block *this, unsigned long event,
26 void *ptr)
27{
28 did_panic = 1;
29
30 return NOTIFY_DONE;
31}
32
33static struct notifier_block panic_block = {
34 .notifier_call = softlock_panic,
35};
36
37void touch_softlockup_watchdog(void)
38{
39 per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
40}
41EXPORT_SYMBOL(touch_softlockup_watchdog);
42
43/*
44 * This callback runs from the timer interrupt, and checks
45 * whether the watchdog thread has hung or not:
46 */
47void softlockup_tick(struct pt_regs *regs)
48{
49 int this_cpu = smp_processor_id();
50 unsigned long timestamp = per_cpu(timestamp, this_cpu);
51
52 if (per_cpu(print_timestamp, this_cpu) == timestamp)
53 return;
54
55 /* Do not cause a second panic when there already was one */
56 if (did_panic)
57 return;
58
59 if (time_after(jiffies, timestamp + 10*HZ)) {
60 per_cpu(print_timestamp, this_cpu) = timestamp;
61
62 spin_lock(&print_lock);
63 printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
64 this_cpu);
65 show_regs(regs);
66 spin_unlock(&print_lock);
67 }
68}
69
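/*
 * A user-space sketch of the threshold check in softlockup_tick() above:
 * the watchdog thread refreshes a per-cpu timestamp about once a second,
 * and the timer tick complains once jiffies runs more than 10*HZ ahead of
 * that timestamp. HZ and the sample values below are hypothetical.
 */
#include <stdio.h>

#define MODEL_HZ 1000

static int lockup_suspected(unsigned long jiffies, unsigned long timestamp)
{
	/* models time_after(jiffies, timestamp + 10*HZ) */
	return (long)(jiffies - (timestamp + 10 * MODEL_HZ)) > 0;
}

int main(void)
{
	unsigned long last_touch = 50000;

	printf("%d\n", lockup_suspected(last_touch +  9 * MODEL_HZ, last_touch)); /* 0 */
	printf("%d\n", lockup_suspected(last_touch + 11 * MODEL_HZ, last_touch)); /* 1 */
	return 0;
}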
70/*
71 * The watchdog thread - runs every second and touches the timestamp.
72 */
73static int watchdog(void * __bind_cpu)
74{
75 struct sched_param param = { .sched_priority = 99 };
76 int this_cpu = (long) __bind_cpu;
77
78 printk("softlockup thread %d started up.\n", this_cpu);
79
80 sched_setscheduler(current, SCHED_FIFO, &param);
81 current->flags |= PF_NOFREEZE;
82
83 set_current_state(TASK_INTERRUPTIBLE);
84
85 /*
86 * Run briefly once per second - if this gets delayed for
87 * more than 10 seconds then the debug-printout triggers
88 * in softlockup_tick():
89 */
90 while (!kthread_should_stop()) {
91 msleep_interruptible(1000);
92 touch_softlockup_watchdog();
93 }
94 __set_current_state(TASK_RUNNING);
95
96 return 0;
97}
98
99/*
100 * Create/destroy watchdog threads as CPUs come and go:
101 */
102static int __devinit
103cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
104{
105 int hotcpu = (unsigned long)hcpu;
106 struct task_struct *p;
107
108 switch (action) {
109 case CPU_UP_PREPARE:
110 BUG_ON(per_cpu(watchdog_task, hotcpu));
111 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
112 if (IS_ERR(p)) {
113 printk("watchdog for %i failed\n", hotcpu);
114 return NOTIFY_BAD;
115 }
116 per_cpu(watchdog_task, hotcpu) = p;
117 kthread_bind(p, hotcpu);
118 break;
119 case CPU_ONLINE:
120
121 wake_up_process(per_cpu(watchdog_task, hotcpu));
122 break;
123#ifdef CONFIG_HOTPLUG_CPU
124 case CPU_UP_CANCELED:
125 /* Unbind so it can run. Fall thru. */
126 kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
127 case CPU_DEAD:
128 p = per_cpu(watchdog_task, hotcpu);
129 per_cpu(watchdog_task, hotcpu) = NULL;
130 kthread_stop(p);
131 break;
132#endif /* CONFIG_HOTPLUG_CPU */
133 }
134 return NOTIFY_OK;
135}
136
137static struct notifier_block __devinitdata cpu_nfb = {
138 .notifier_call = cpu_callback
139};
140
141__init void spawn_softlockup_task(void)
142{
143 void *cpu = (void *)(long)smp_processor_id();
144
145 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
146 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
147 register_cpu_notifier(&cpu_nfb);
148
149 notifier_chain_register(&panic_notifier_list, &panic_block);
150}
151
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 0c3f9d8bbe17..0375fcd5921d 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -3,7 +3,10 @@
3 * 3 *
4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com> 4 * Author: Zwane Mwaikambo <zwane@fsmlabs.com>
5 * 5 *
6 * Copyright (2004) Ingo Molnar 6 * Copyright (2004, 2005) Ingo Molnar
7 *
8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
7 */ 10 */
8 11
9#include <linux/config.h> 12#include <linux/config.h>
@@ -17,12 +20,12 @@
17 * Generic declaration of the raw read_trylock() function, 20 * Generic declaration of the raw read_trylock() function,
18 * architectures are supposed to optimize this: 21 * architectures are supposed to optimize this:
19 */ 22 */
20int __lockfunc generic_raw_read_trylock(rwlock_t *lock) 23int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
21{ 24{
22 _raw_read_lock(lock); 25 __raw_read_lock(lock);
23 return 1; 26 return 1;
24} 27}
25EXPORT_SYMBOL(generic_raw_read_trylock); 28EXPORT_SYMBOL(generic__raw_read_trylock);
26 29
27int __lockfunc _spin_trylock(spinlock_t *lock) 30int __lockfunc _spin_trylock(spinlock_t *lock)
28{ 31{
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock)
57} 60}
58EXPORT_SYMBOL(_write_trylock); 61EXPORT_SYMBOL(_write_trylock);
59 62
60#ifndef CONFIG_PREEMPT 63#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
61 64
62void __lockfunc _read_lock(rwlock_t *lock) 65void __lockfunc _read_lock(rwlock_t *lock)
63{ 66{
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
72 75
73 local_irq_save(flags); 76 local_irq_save(flags);
74 preempt_disable(); 77 preempt_disable();
75 _raw_spin_lock_flags(lock, flags); 78 _raw_spin_lock_flags(lock, &flags);
76 return flags; 79 return flags;
77} 80}
78EXPORT_SYMBOL(_spin_lock_irqsave); 81EXPORT_SYMBOL(_spin_lock_irqsave);
diff --git a/kernel/sys.c b/kernel/sys.c
index 0bcaed6560ac..c80412be2302 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1711 unsigned long arg4, unsigned long arg5) 1711 unsigned long arg4, unsigned long arg5)
1712{ 1712{
1713 long error; 1713 long error;
1714 int sig;
1715 1714
1716 error = security_task_prctl(option, arg2, arg3, arg4, arg5); 1715 error = security_task_prctl(option, arg2, arg3, arg4, arg5);
1717 if (error) 1716 if (error)
@@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1719 1718
1720 switch (option) { 1719 switch (option) {
1721 case PR_SET_PDEATHSIG: 1720 case PR_SET_PDEATHSIG:
1722 sig = arg2; 1721 if (!valid_signal(arg2)) {
1723 if (!valid_signal(sig)) {
1724 error = -EINVAL; 1722 error = -EINVAL;
1725 break; 1723 break;
1726 } 1724 }
1727 current->pdeath_signal = sig; 1725 current->pdeath_signal = arg2;
1728 break; 1726 break;
1729 case PR_GET_PDEATHSIG: 1727 case PR_GET_PDEATHSIG:
1730 error = put_user(current->pdeath_signal, (int __user *)arg2); 1728 error = put_user(current->pdeath_signal, (int __user *)arg2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3e0bbee549ea..8e56e2495542 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
31#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/kernel.h> 33#include <linux/kernel.h>
34#include <linux/net.h>
34#include <linux/sysrq.h> 35#include <linux/sysrq.h>
35#include <linux/highuid.h> 36#include <linux/highuid.h>
36#include <linux/writeback.h> 37#include <linux/writeback.h>
@@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header =
136 137
137static ctl_table kern_table[]; 138static ctl_table kern_table[];
138static ctl_table vm_table[]; 139static ctl_table vm_table[];
139#ifdef CONFIG_NET
140extern ctl_table net_table[];
141#endif
142static ctl_table proc_table[]; 140static ctl_table proc_table[];
143static ctl_table fs_table[]; 141static ctl_table fs_table[];
144static ctl_table debug_table[]; 142static ctl_table debug_table[];
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..3ba10fa35b60 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
950{ 950{
951 jiffies_64++; 951 jiffies_64++;
952 update_times(); 952 update_times();
953 softlockup_tick(regs);
953} 954}
954 955
955#ifdef __ARCH_WANT_SYS_ALARM 956#ifdef __ARCH_WANT_SYS_ALARM
@@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
1150 out: 1151 out:
1151 return timeout < 0 ? 0 : timeout; 1152 return timeout < 0 ? 0 : timeout;
1152} 1153}
1153
1154EXPORT_SYMBOL(schedule_timeout); 1154EXPORT_SYMBOL(schedule_timeout);
1155 1155
1156/*
1157 * We can use __set_current_state() here because schedule_timeout() calls
1158 * schedule() unconditionally.
1159 */
1160signed long __sched schedule_timeout_interruptible(signed long timeout)
1161{
1162 __set_current_state(TASK_INTERRUPTIBLE);
1163 return schedule_timeout(timeout);
1164}
1165EXPORT_SYMBOL(schedule_timeout_interruptible);
1166
1167signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1168{
1169 __set_current_state(TASK_UNINTERRUPTIBLE);
1170 return schedule_timeout(timeout);
1171}
1172EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1173
1156/* Thread ID - the internal kernel "pid" */ 1174/* Thread ID - the internal kernel "pid" */
1157asmlinkage long sys_gettid(void) 1175asmlinkage long sys_gettid(void)
1158{ 1176{
@@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
1169 if (!time_after(expire, now)) 1187 if (!time_after(expire, now))
1170 return 0; 1188 return 0;
1171 1189
1172 current->state = TASK_INTERRUPTIBLE; 1190 expire = schedule_timeout_interruptible(expire - now);
1173 expire = schedule_timeout(expire - now);
1174 1191
1175 ret = 0; 1192 ret = 0;
1176 if (expire) { 1193 if (expire) {
@@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us
1198 return -EINVAL; 1215 return -EINVAL;
1199 1216
1200 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); 1217 expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1201 current->state = TASK_INTERRUPTIBLE; 1218 expire = schedule_timeout_interruptible(expire);
1202 expire = schedule_timeout(expire);
1203 1219
1204 ret = 0; 1220 ret = 0;
1205 if (expire) { 1221 if (expire) {
@@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src)
1428 } 1444 }
1429} 1445}
1430 1446
1431static inline u64 time_interpolator_get_counter(void) 1447static inline u64 time_interpolator_get_counter(int writelock)
1432{ 1448{
1433 unsigned int src = time_interpolator->source; 1449 unsigned int src = time_interpolator->source;
1434 1450
@@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void)
1442 now = time_interpolator_get_cycles(src); 1458 now = time_interpolator_get_cycles(src);
1443 if (lcycle && time_after(lcycle, now)) 1459 if (lcycle && time_after(lcycle, now))
1444 return lcycle; 1460 return lcycle;
1461
1462 /* When holding the xtime write lock, there's no need
1463 * to add the overhead of the cmpxchg. Readers are
1464 * forced to retry until the write lock is released.
1465 */
1466 if (writelock) {
1467 time_interpolator->last_cycle = now;
1468 return now;
1469 }
1445 /* Keep track of the last timer value returned. The use of cmpxchg here 1470 /* Keep track of the last timer value returned. The use of cmpxchg here
1446 * will cause contention in an SMP environment. 1471 * will cause contention in an SMP environment.
1447 */ 1472 */
@@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void)
1455void time_interpolator_reset(void) 1480void time_interpolator_reset(void)
1456{ 1481{
1457 time_interpolator->offset = 0; 1482 time_interpolator->offset = 0;
1458 time_interpolator->last_counter = time_interpolator_get_counter(); 1483 time_interpolator->last_counter = time_interpolator_get_counter(1);
1459} 1484}
1460 1485
1461#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) 1486#define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
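/*
 * Worked example of the GET_TI_NSECS() fixed-point conversion above:
 * nsec = ((count - last_counter) & mask) * nsec_per_cyc >> shift.
 * The numbers are hypothetical: a 500 MHz counter is 2 ns per cycle, which
 * with shift = 10 would be encoded as nsec_per_cyc = 2 << 10 = 2048.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long last_counter = 1000;
	unsigned long long count = last_counter + 500000;	/* 500000 cycles later */
	unsigned long long mask = ~0ULL;
	unsigned long long nsec_per_cyc = 2048;			/* 2 ns/cycle << 10 */
	unsigned int shift = 10;
	unsigned long long nsec;

	nsec = (((count - last_counter) & mask) * nsec_per_cyc) >> shift;
	printf("%llu ns\n", nsec);	/* 500000 * 2 = 1000000 ns, i.e. 1 ms */
	return 0;
}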
@@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void)
1467 return 0; 1492 return 0;
1468 1493
1469 return time_interpolator->offset + 1494 return time_interpolator->offset +
1470 GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); 1495 GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1471} 1496}
1472 1497
1473#define INTERPOLATOR_ADJUST 65536 1498#define INTERPOLATOR_ADJUST 65536
@@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec)
1490 * and the tuning logic ensures that. 1515 * and the tuning logic ensures that.
1491 */ 1516 */
1492 1517
1493 counter = time_interpolator_get_counter(); 1518 counter = time_interpolator_get_counter(1);
1494 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); 1519 offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1495 1520
1496 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) 1521 if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
@@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs)
1588{ 1613{
1589 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1614 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1590 1615
1591 while (timeout) { 1616 while (timeout)
1592 set_current_state(TASK_UNINTERRUPTIBLE); 1617 timeout = schedule_timeout_uninterruptible(timeout);
1593 timeout = schedule_timeout(timeout);
1594 }
1595} 1618}
1596 1619
1597EXPORT_SYMBOL(msleep); 1620EXPORT_SYMBOL(msleep);
@@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs)
1604{ 1627{
1605 unsigned long timeout = msecs_to_jiffies(msecs) + 1; 1628 unsigned long timeout = msecs_to_jiffies(msecs) + 1;
1606 1629
1607 while (timeout && !signal_pending(current)) { 1630 while (timeout && !signal_pending(current))
1608 set_current_state(TASK_INTERRUPTIBLE); 1631 timeout = schedule_timeout_interruptible(timeout);
1609 timeout = schedule_timeout(timeout);
1610 }
1611 return jiffies_to_msecs(timeout); 1632 return jiffies_to_msecs(timeout);
1612} 1633}
1613 1634
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c7e36d4a70ca..91bacb13a7e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name,
308 struct workqueue_struct *wq; 308 struct workqueue_struct *wq;
309 struct task_struct *p; 309 struct task_struct *p;
310 310
311 wq = kmalloc(sizeof(*wq), GFP_KERNEL); 311 wq = kzalloc(sizeof(*wq), GFP_KERNEL);
312 if (!wq) 312 if (!wq)
313 return NULL; 313 return NULL;
314 memset(wq, 0, sizeof(*wq));
315 314
316 wq->name = name; 315 wq->name = name;
317 /* We don't need the distraction of CPUs appearing and vanishing. */ 316 /* We don't need the distraction of CPUs appearing and vanishing. */
@@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
499 case CPU_UP_PREPARE: 498 case CPU_UP_PREPARE:
500 /* Create a new workqueue thread for it. */ 499 /* Create a new workqueue thread for it. */
501 list_for_each_entry(wq, &workqueues, list) { 500 list_for_each_entry(wq, &workqueues, list) {
502 if (create_workqueue_thread(wq, hotcpu) < 0) { 501 if (!create_workqueue_thread(wq, hotcpu)) {
503 printk("workqueue for %i failed\n", hotcpu); 502 printk("workqueue for %i failed\n", hotcpu);
504 return NOTIFY_BAD; 503 return NOTIFY_BAD;
505 } 504 }