Diffstat (limited to 'kernel'): 35 files changed, 1580 insertions, 486 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..ff4dc02ce170 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
12 | obj-$(CONFIG_FUTEX) += futex.o | 12 | obj-$(CONFIG_FUTEX) += futex.o |
13 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 13 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
14 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 14 | obj-$(CONFIG_SMP) += cpu.o spinlock.o |
15 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | ||
15 | obj-$(CONFIG_UID16) += uid16.o | 16 | obj-$(CONFIG_UID16) += uid16.o |
16 | obj-$(CONFIG_MODULES) += module.o | 17 | obj-$(CONFIG_MODULES) += module.o |
17 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 18 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDIT) += audit.o | |||
27 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 28 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
28 | obj-$(CONFIG_KPROBES) += kprobes.o | 29 | obj-$(CONFIG_KPROBES) += kprobes.o |
29 | obj-$(CONFIG_SYSFS) += ksysfs.o | 30 | obj-$(CONFIG_SYSFS) += ksysfs.o |
31 | obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o | ||
30 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 32 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
31 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o | 33 | obj-$(CONFIG_CRASH_DUMP) += crash_dump.o |
32 | obj-$(CONFIG_SECCOMP) += seccomp.o | 34 | obj-$(CONFIG_SECCOMP) += seccomp.o |
diff --git a/kernel/acct.c b/kernel/acct.c
index 4168f631868e..b756f527497e 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -165,7 +165,7 @@ out: | |||
165 | } | 165 | } |
166 | 166 | ||
167 | /* | 167 | /* |
168 | * Close the old accouting file (if currently open) and then replace | 168 | * Close the old accounting file (if currently open) and then replace |
169 | * it with file (if non-NULL). | 169 | * it with file (if non-NULL). |
170 | * | 170 | * |
171 | * NOTE: acct_globals.lock MUST be held on entry and exit. | 171 | * NOTE: acct_globals.lock MUST be held on entry and exit. |
@@ -199,11 +199,16 @@ static void acct_file_reopen(struct file *file) | |||
199 | } | 199 | } |
200 | } | 200 | } |
201 | 201 | ||
202 | /* | 202 | /** |
203 | * sys_acct() is the only system call needed to implement process | 203 | * sys_acct - enable/disable process accounting |
204 | * accounting. It takes the name of the file where accounting records | 204 | * @name: file name for accounting records or NULL to shutdown accounting |
205 | * should be written. If the filename is NULL, accounting will be | 205 | * |
206 | * shutdown. | 206 | * Returns 0 for success or negative errno values for failure. |
207 | * | ||
208 | * sys_acct() is the only system call needed to implement process | ||
209 | * accounting. It takes the name of the file where accounting records | ||
210 | * should be written. If the filename is NULL, accounting will be | ||
211 | * shutdown. | ||
207 | */ | 212 | */ |
208 | asmlinkage long sys_acct(const char __user *name) | 213 | asmlinkage long sys_acct(const char __user *name) |
209 | { | 214 | { |
@@ -220,7 +225,7 @@ asmlinkage long sys_acct(const char __user *name) | |||
220 | return (PTR_ERR(tmp)); | 225 | return (PTR_ERR(tmp)); |
221 | } | 226 | } |
222 | /* Difference from BSD - they don't do O_APPEND */ | 227 | /* Difference from BSD - they don't do O_APPEND */ |
223 | file = filp_open(tmp, O_WRONLY|O_APPEND, 0); | 228 | file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); |
224 | putname(tmp); | 229 | putname(tmp); |
225 | if (IS_ERR(file)) { | 230 | if (IS_ERR(file)) { |
226 | return (PTR_ERR(file)); | 231 | return (PTR_ERR(file)); |
@@ -250,9 +255,12 @@ asmlinkage long sys_acct(const char __user *name) | |||
250 | return (0); | 255 | return (0); |
251 | } | 256 | } |
252 | 257 | ||
253 | /* | 258 | /** |
254 | * If the accouting is turned on for a file in the filesystem pointed | 259 | * acct_auto_close - turn off a filesystem's accounting if it is on |
255 | * to by sb, turn accouting off. | 260 | * @sb: super block for the filesystem |
261 | * | ||
262 | * If the accounting is turned on for a file in the filesystem pointed | ||
263 | * to by sb, turn accounting off. | ||
256 | */ | 264 | */ |
257 | void acct_auto_close(struct super_block *sb) | 265 | void acct_auto_close(struct super_block *sb) |
258 | { | 266 | { |
@@ -503,8 +511,11 @@ static void do_acct_process(long exitcode, struct file *file) | |||
503 | set_fs(fs); | 511 | set_fs(fs); |
504 | } | 512 | } |
505 | 513 | ||
506 | /* | 514 | /** |
507 | * acct_process - now just a wrapper around do_acct_process | 515 | * acct_process - now just a wrapper around do_acct_process |
516 | * @exitcode: task exit code | ||
517 | * | ||
518 | * handles process accounting for an exiting task | ||
508 | */ | 519 | */ |
509 | void acct_process(long exitcode) | 520 | void acct_process(long exitcode) |
510 | { | 521 | { |
@@ -530,9 +541,9 @@ void acct_process(long exitcode) | |||
530 | } | 541 | } |
531 | 542 | ||
532 | 543 | ||
533 | /* | 544 | /** |
534 | * acct_update_integrals | 545 | * acct_update_integrals - update mm integral fields in task_struct |
535 | * - update mm integral fields in task_struct | 546 | * @tsk: task_struct for accounting |
536 | */ | 547 | */ |
537 | void acct_update_integrals(struct task_struct *tsk) | 548 | void acct_update_integrals(struct task_struct *tsk) |
538 | { | 549 | { |
@@ -547,9 +558,9 @@ void acct_update_integrals(struct task_struct *tsk) | |||
547 | } | 558 | } |
548 | } | 559 | } |
549 | 560 | ||
550 | /* | 561 | /** |
551 | * acct_clear_integrals | 562 | * acct_clear_integrals - clear the mm integral fields in task_struct |
552 | * - clear the mm integral fields in task_struct | 563 | * @tsk: task_struct whose accounting fields are cleared |
553 | */ | 564 | */ |
554 | void acct_clear_integrals(struct task_struct *tsk) | 565 | void acct_clear_integrals(struct task_struct *tsk) |
555 | { | 566 | { |
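
The reworked kerneldoc above describes sys_acct() as the single system call behind process accounting: pass a filename to start writing records, NULL to stop, and expect 0 or a negative errno. A minimal userspace sketch of that contract (the path is hypothetical, and the glibc acct() wrapper is assumed to report the kernel's negative return through errno):

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Hypothetical log path; the file must already exist and the
         * caller needs CAP_SYS_PACCT, otherwise acct() fails with errno set. */
        if (acct("/var/log/pacct") != 0) {
                fprintf(stderr, "acct on: %s\n", strerror(errno));
                return 1;
        }
        /* ... run a workload; a record is appended as each process exits ... */
        if (acct(NULL) != 0) {  /* NULL shuts accounting down */
                fprintf(stderr, "acct off: %s\n", strerror(errno));
                return 1;
        }
        return 0;
}
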
diff --git a/kernel/audit.c b/kernel/audit.c
index 8376ec10cf24..83096b67510a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -513,7 +513,8 @@ static int __init audit_init(void) | |||
513 | { | 513 | { |
514 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 514 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
515 | audit_default ? "enabled" : "disabled"); | 515 | audit_default ? "enabled" : "disabled"); |
516 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive); | 516 | audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, |
517 | THIS_MODULE); | ||
517 | if (!audit_sock) | 518 | if (!audit_sock) |
518 | audit_panic("cannot initialize netlink socket"); | 519 | audit_panic("cannot initialize netlink socket"); |
519 | 520 | ||
diff --git a/kernel/compat.c b/kernel/compat.c
index ddfcaaa86623..102296e21ea8 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -48,8 +48,7 @@ static long compat_nanosleep_restart(struct restart_block *restart) | |||
48 | if (!time_after(expire, now)) | 48 | if (!time_after(expire, now)) |
49 | return 0; | 49 | return 0; |
50 | 50 | ||
51 | current->state = TASK_INTERRUPTIBLE; | 51 | expire = schedule_timeout_interruptible(expire - now); |
52 | expire = schedule_timeout(expire - now); | ||
53 | if (expire == 0) | 52 | if (expire == 0) |
54 | return 0; | 53 | return 0; |
55 | 54 | ||
@@ -82,8 +81,7 @@ asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, | |||
82 | return -EINVAL; | 81 | return -EINVAL; |
83 | 82 | ||
84 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | 83 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); |
85 | current->state = TASK_INTERRUPTIBLE; | 84 | expire = schedule_timeout_interruptible(expire); |
86 | expire = schedule_timeout(expire); | ||
87 | if (expire == 0) | 85 | if (expire == 0) |
88 | return 0; | 86 | return 0; |
89 | 87 | ||
@@ -795,8 +793,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, | |||
795 | recalc_sigpending(); | 793 | recalc_sigpending(); |
796 | spin_unlock_irq(¤t->sighand->siglock); | 794 | spin_unlock_irq(¤t->sighand->siglock); |
797 | 795 | ||
798 | current->state = TASK_INTERRUPTIBLE; | 796 | timeout = schedule_timeout_interruptible(timeout); |
799 | timeout = schedule_timeout(timeout); | ||
800 | 797 | ||
801 | spin_lock_irq(¤t->sighand->siglock); | 798 | spin_lock_irq(¤t->sighand->siglock); |
802 | sig = dequeue_signal(current, &s, &info); | 799 | sig = dequeue_signal(current, &s, &info); |
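
Each hunk above folds the explicit `current->state = TASK_INTERRUPTIBLE;` plus schedule_timeout() pair into one schedule_timeout_interruptible() call. That helper is defined outside this diff (in kernel/timer.c); presumably it is little more than:

signed long schedule_timeout_interruptible(signed long timeout)
{
        __set_current_state(TASK_INTERRUPTIBLE);
        return schedule_timeout(timeout);
}

which keeps the compat paths behaviourally identical while removing the easy-to-forget state assignment.
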
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8ab1b4e518b8..79866bc6b3a1 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,6 +180,42 @@ static struct super_block *cpuset_sb = NULL; | |||
180 | */ | 180 | */ |
181 | 181 | ||
182 | static DECLARE_MUTEX(cpuset_sem); | 182 | static DECLARE_MUTEX(cpuset_sem); |
183 | static struct task_struct *cpuset_sem_owner; | ||
184 | static int cpuset_sem_depth; | ||
185 | |||
186 | /* | ||
187 | * The global cpuset semaphore cpuset_sem can be needed by the | ||
188 | * memory allocator to update a tasks mems_allowed (see the calls | ||
189 | * to cpuset_update_current_mems_allowed()) or to walk up the | ||
190 | * cpuset hierarchy to find a mem_exclusive cpuset see the calls | ||
191 | * to cpuset_excl_nodes_overlap()). | ||
192 | * | ||
193 | * But if the memory allocation is being done by cpuset.c code, it | ||
194 | * usually already holds cpuset_sem. Double tripping on a kernel | ||
195 | * semaphore deadlocks the current task, and any other task that | ||
196 | * subsequently tries to obtain the lock. | ||
197 | * | ||
198 | * Run all up's and down's on cpuset_sem through the following | ||
199 | * wrappers, which will detect this nested locking, and avoid | ||
200 | * deadlocking. | ||
201 | */ | ||
202 | |||
203 | static inline void cpuset_down(struct semaphore *psem) | ||
204 | { | ||
205 | if (cpuset_sem_owner != current) { | ||
206 | down(psem); | ||
207 | cpuset_sem_owner = current; | ||
208 | } | ||
209 | cpuset_sem_depth++; | ||
210 | } | ||
211 | |||
212 | static inline void cpuset_up(struct semaphore *psem) | ||
213 | { | ||
214 | if (--cpuset_sem_depth == 0) { | ||
215 | cpuset_sem_owner = NULL; | ||
216 | up(psem); | ||
217 | } | ||
218 | } | ||
183 | 219 | ||
184 | /* | 220 | /* |
185 | * A couple of forward declarations required, due to cyclic reference loop: | 221 | * A couple of forward declarations required, due to cyclic reference loop: |
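
The cpuset_down()/cpuset_up() wrappers above make re-acquisition by the owning task a pure depth count; only the outermost cpuset_up() touches the underlying semaphore. Purely as an illustration of that behaviour (not code from the patch):

        cpuset_down(&cpuset_sem);       /* depth 1: down() taken             */
        cpuset_down(&cpuset_sem);       /* depth 2: same owner, no deadlock  */
        cpuset_up(&cpuset_sem);         /* depth 1: semaphore still held     */
        cpuset_up(&cpuset_sem);         /* depth 0: owner cleared, up() done */

This is what later allows cpuset_zone_allowed() and cpuset_excl_nodes_overlap() (added further down) to take cpuset_sem even when the allocation that calls them was itself started under cpuset_sem.
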
@@ -522,19 +558,10 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
522 | * Refresh current tasks mems_allowed and mems_generation from | 558 | * Refresh current tasks mems_allowed and mems_generation from |
523 | * current tasks cpuset. Call with cpuset_sem held. | 559 | * current tasks cpuset. Call with cpuset_sem held. |
524 | * | 560 | * |
525 | * Be sure to call refresh_mems() on any cpuset operation which | 561 | * This routine is needed to update the per-task mems_allowed |
526 | * (1) holds cpuset_sem, and (2) might possibly alloc memory. | 562 | * data, within the tasks context, when it is trying to allocate |
527 | * Call after obtaining cpuset_sem lock, before any possible | 563 | * memory (in various mm/mempolicy.c routines) and notices |
528 | * allocation. Otherwise one risks trying to allocate memory | 564 | * that some other task has been modifying its cpuset. |
529 | * while the task cpuset_mems_generation is not the same as | ||
530 | * the mems_generation in its cpuset, which would deadlock on | ||
531 | * cpuset_sem in cpuset_update_current_mems_allowed(). | ||
532 | * | ||
533 | * Since we hold cpuset_sem, once refresh_mems() is called, the | ||
534 | * test (current->cpuset_mems_generation != cs->mems_generation) | ||
535 | * in cpuset_update_current_mems_allowed() will remain false, | ||
536 | * until we drop cpuset_sem. Anyone else who would change our | ||
537 | * cpusets mems_generation needs to lock cpuset_sem first. | ||
538 | */ | 565 | */ |
539 | 566 | ||
540 | static void refresh_mems(void) | 567 | static void refresh_mems(void) |
@@ -628,13 +655,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
628 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 655 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. |
629 | */ | 656 | */ |
630 | 657 | ||
631 | /* | ||
632 | * Hack to avoid 2.6.13 partial node dynamic sched domain bug. | ||
633 | * Disable letting 'cpu_exclusive' cpusets define dynamic sched | ||
634 | * domains, until the sched domain can handle partial nodes. | ||
635 | * Remove this #if hackery when sched domains fixed. | ||
636 | */ | ||
637 | #if 0 | ||
638 | static void update_cpu_domains(struct cpuset *cur) | 658 | static void update_cpu_domains(struct cpuset *cur) |
639 | { | 659 | { |
640 | struct cpuset *c, *par = cur->parent; | 660 | struct cpuset *c, *par = cur->parent; |
@@ -675,11 +695,6 @@ static void update_cpu_domains(struct cpuset *cur) | |||
675 | partition_sched_domains(&pspan, &cspan); | 695 | partition_sched_domains(&pspan, &cspan); |
676 | unlock_cpu_hotplug(); | 696 | unlock_cpu_hotplug(); |
677 | } | 697 | } |
678 | #else | ||
679 | static void update_cpu_domains(struct cpuset *cur) | ||
680 | { | ||
681 | } | ||
682 | #endif | ||
683 | 698 | ||
684 | static int update_cpumask(struct cpuset *cs, char *buf) | 699 | static int update_cpumask(struct cpuset *cs, char *buf) |
685 | { | 700 | { |
@@ -852,7 +867,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
852 | } | 867 | } |
853 | buffer[nbytes] = 0; /* nul-terminate */ | 868 | buffer[nbytes] = 0; /* nul-terminate */ |
854 | 869 | ||
855 | down(&cpuset_sem); | 870 | cpuset_down(&cpuset_sem); |
856 | 871 | ||
857 | if (is_removed(cs)) { | 872 | if (is_removed(cs)) { |
858 | retval = -ENODEV; | 873 | retval = -ENODEV; |
@@ -886,7 +901,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
886 | if (retval == 0) | 901 | if (retval == 0) |
887 | retval = nbytes; | 902 | retval = nbytes; |
888 | out2: | 903 | out2: |
889 | up(&cpuset_sem); | 904 | cpuset_up(&cpuset_sem); |
890 | cpuset_release_agent(pathbuf); | 905 | cpuset_release_agent(pathbuf); |
891 | out1: | 906 | out1: |
892 | kfree(buffer); | 907 | kfree(buffer); |
@@ -926,9 +941,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) | |||
926 | { | 941 | { |
927 | cpumask_t mask; | 942 | cpumask_t mask; |
928 | 943 | ||
929 | down(&cpuset_sem); | 944 | cpuset_down(&cpuset_sem); |
930 | mask = cs->cpus_allowed; | 945 | mask = cs->cpus_allowed; |
931 | up(&cpuset_sem); | 946 | cpuset_up(&cpuset_sem); |
932 | 947 | ||
933 | return cpulist_scnprintf(page, PAGE_SIZE, mask); | 948 | return cpulist_scnprintf(page, PAGE_SIZE, mask); |
934 | } | 949 | } |
@@ -937,9 +952,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) | |||
937 | { | 952 | { |
938 | nodemask_t mask; | 953 | nodemask_t mask; |
939 | 954 | ||
940 | down(&cpuset_sem); | 955 | cpuset_down(&cpuset_sem); |
941 | mask = cs->mems_allowed; | 956 | mask = cs->mems_allowed; |
942 | up(&cpuset_sem); | 957 | cpuset_up(&cpuset_sem); |
943 | 958 | ||
944 | return nodelist_scnprintf(page, PAGE_SIZE, mask); | 959 | return nodelist_scnprintf(page, PAGE_SIZE, mask); |
945 | } | 960 | } |
@@ -984,6 +999,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
984 | *s++ = '\n'; | 999 | *s++ = '\n'; |
985 | *s = '\0'; | 1000 | *s = '\0'; |
986 | 1001 | ||
1002 | /* Do nothing if *ppos is at the eof or beyond the eof. */ | ||
1003 | if (s - page <= *ppos) | ||
1004 | return 0; | ||
1005 | |||
987 | start = page + *ppos; | 1006 | start = page + *ppos; |
988 | n = s - start; | 1007 | n = s - start; |
989 | retval = n - copy_to_user(buf, start, min(n, nbytes)); | 1008 | retval = n - copy_to_user(buf, start, min(n, nbytes)); |
@@ -1342,8 +1361,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1342 | if (!cs) | 1361 | if (!cs) |
1343 | return -ENOMEM; | 1362 | return -ENOMEM; |
1344 | 1363 | ||
1345 | down(&cpuset_sem); | 1364 | cpuset_down(&cpuset_sem); |
1346 | refresh_mems(); | ||
1347 | cs->flags = 0; | 1365 | cs->flags = 0; |
1348 | if (notify_on_release(parent)) | 1366 | if (notify_on_release(parent)) |
1349 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1367 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1368,14 +1386,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1368 | * will down() this new directory's i_sem and if we race with | 1386 | * will down() this new directory's i_sem and if we race with |
1369 | * another mkdir, we might deadlock. | 1387 | * another mkdir, we might deadlock. |
1370 | */ | 1388 | */ |
1371 | up(&cpuset_sem); | 1389 | cpuset_up(&cpuset_sem); |
1372 | 1390 | ||
1373 | err = cpuset_populate_dir(cs->dentry); | 1391 | err = cpuset_populate_dir(cs->dentry); |
1374 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 1392 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
1375 | return 0; | 1393 | return 0; |
1376 | err: | 1394 | err: |
1377 | list_del(&cs->sibling); | 1395 | list_del(&cs->sibling); |
1378 | up(&cpuset_sem); | 1396 | cpuset_up(&cpuset_sem); |
1379 | kfree(cs); | 1397 | kfree(cs); |
1380 | return err; | 1398 | return err; |
1381 | } | 1399 | } |
@@ -1397,14 +1415,13 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1397 | 1415 | ||
1398 | /* the vfs holds both inode->i_sem already */ | 1416 | /* the vfs holds both inode->i_sem already */ |
1399 | 1417 | ||
1400 | down(&cpuset_sem); | 1418 | cpuset_down(&cpuset_sem); |
1401 | refresh_mems(); | ||
1402 | if (atomic_read(&cs->count) > 0) { | 1419 | if (atomic_read(&cs->count) > 0) { |
1403 | up(&cpuset_sem); | 1420 | cpuset_up(&cpuset_sem); |
1404 | return -EBUSY; | 1421 | return -EBUSY; |
1405 | } | 1422 | } |
1406 | if (!list_empty(&cs->children)) { | 1423 | if (!list_empty(&cs->children)) { |
1407 | up(&cpuset_sem); | 1424 | cpuset_up(&cpuset_sem); |
1408 | return -EBUSY; | 1425 | return -EBUSY; |
1409 | } | 1426 | } |
1410 | parent = cs->parent; | 1427 | parent = cs->parent; |
@@ -1420,7 +1437,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1420 | spin_unlock(&d->d_lock); | 1437 | spin_unlock(&d->d_lock); |
1421 | cpuset_d_remove_dir(d); | 1438 | cpuset_d_remove_dir(d); |
1422 | dput(d); | 1439 | dput(d); |
1423 | up(&cpuset_sem); | 1440 | cpuset_up(&cpuset_sem); |
1424 | cpuset_release_agent(pathbuf); | 1441 | cpuset_release_agent(pathbuf); |
1425 | return 0; | 1442 | return 0; |
1426 | } | 1443 | } |
@@ -1523,10 +1540,10 @@ void cpuset_exit(struct task_struct *tsk) | |||
1523 | if (notify_on_release(cs)) { | 1540 | if (notify_on_release(cs)) { |
1524 | char *pathbuf = NULL; | 1541 | char *pathbuf = NULL; |
1525 | 1542 | ||
1526 | down(&cpuset_sem); | 1543 | cpuset_down(&cpuset_sem); |
1527 | if (atomic_dec_and_test(&cs->count)) | 1544 | if (atomic_dec_and_test(&cs->count)) |
1528 | check_for_release(cs, &pathbuf); | 1545 | check_for_release(cs, &pathbuf); |
1529 | up(&cpuset_sem); | 1546 | cpuset_up(&cpuset_sem); |
1530 | cpuset_release_agent(pathbuf); | 1547 | cpuset_release_agent(pathbuf); |
1531 | } else { | 1548 | } else { |
1532 | atomic_dec(&cs->count); | 1549 | atomic_dec(&cs->count); |
@@ -1547,11 +1564,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | |||
1547 | { | 1564 | { |
1548 | cpumask_t mask; | 1565 | cpumask_t mask; |
1549 | 1566 | ||
1550 | down(&cpuset_sem); | 1567 | cpuset_down(&cpuset_sem); |
1551 | task_lock((struct task_struct *)tsk); | 1568 | task_lock((struct task_struct *)tsk); |
1552 | guarantee_online_cpus(tsk->cpuset, &mask); | 1569 | guarantee_online_cpus(tsk->cpuset, &mask); |
1553 | task_unlock((struct task_struct *)tsk); | 1570 | task_unlock((struct task_struct *)tsk); |
1554 | up(&cpuset_sem); | 1571 | cpuset_up(&cpuset_sem); |
1555 | 1572 | ||
1556 | return mask; | 1573 | return mask; |
1557 | } | 1574 | } |
@@ -1576,9 +1593,9 @@ void cpuset_update_current_mems_allowed(void) | |||
1576 | if (!cs) | 1593 | if (!cs) |
1577 | return; /* task is exiting */ | 1594 | return; /* task is exiting */ |
1578 | if (current->cpuset_mems_generation != cs->mems_generation) { | 1595 | if (current->cpuset_mems_generation != cs->mems_generation) { |
1579 | down(&cpuset_sem); | 1596 | cpuset_down(&cpuset_sem); |
1580 | refresh_mems(); | 1597 | refresh_mems(); |
1581 | up(&cpuset_sem); | 1598 | cpuset_up(&cpuset_sem); |
1582 | } | 1599 | } |
1583 | } | 1600 | } |
1584 | 1601 | ||
@@ -1611,17 +1628,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) | |||
1611 | return 0; | 1628 | return 0; |
1612 | } | 1629 | } |
1613 | 1630 | ||
1631 | /* | ||
1632 | * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive | ||
1633 | * ancestor to the specified cpuset. Call while holding cpuset_sem. | ||
1634 | * If no ancestor is mem_exclusive (an unusual configuration), then | ||
1635 | * returns the root cpuset. | ||
1636 | */ | ||
1637 | static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | ||
1638 | { | ||
1639 | while (!is_mem_exclusive(cs) && cs->parent) | ||
1640 | cs = cs->parent; | ||
1641 | return cs; | ||
1642 | } | ||
1643 | |||
1614 | /** | 1644 | /** |
1615 | * cpuset_zone_allowed - is zone z allowed in current->mems_allowed | 1645 | * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? |
1616 | * @z: zone in question | 1646 | * @z: is this zone on an allowed node? |
1647 | * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) | ||
1617 | * | 1648 | * |
1618 | * Is zone z allowed in current->mems_allowed, or is | 1649 | * If we're in interrupt, yes, we can always allocate. If zone |
1619 | * the CPU in interrupt context? (zone is always allowed in this case) | 1650 | * z's node is in our tasks mems_allowed, yes. If it's not a |
1620 | */ | 1651 | * __GFP_HARDWALL request and this zone's nodes is in the nearest |
1621 | int cpuset_zone_allowed(struct zone *z) | 1652 | * mem_exclusive cpuset ancestor to this tasks cpuset, yes. |
1653 | * Otherwise, no. | ||
1654 | * | ||
1655 | * GFP_USER allocations are marked with the __GFP_HARDWALL bit, | ||
1656 | * and do not allow allocations outside the current tasks cpuset. | ||
1657 | * GFP_KERNEL allocations are not so marked, so can escape to the | ||
1658 | * nearest mem_exclusive ancestor cpuset. | ||
1659 | * | ||
1660 | * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() | ||
1661 | * routine only calls here with __GFP_HARDWALL bit _not_ set if | ||
1662 | * it's a GFP_KERNEL allocation, and all nodes in the current tasks | ||
1663 | * mems_allowed came up empty on the first pass over the zonelist. | ||
1664 | * So only GFP_KERNEL allocations, if all nodes in the cpuset are | ||
1665 | * short of memory, might require taking the cpuset_sem semaphore. | ||
1666 | * | ||
1667 | * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() | ||
1668 | * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing | ||
1669 | * hardwall cpusets - no allocation on a node outside the cpuset is | ||
1670 | * allowed (unless in interrupt, of course). | ||
1671 | * | ||
1672 | * The second loop doesn't even call here for GFP_ATOMIC requests | ||
1673 | * (if the __alloc_pages() local variable 'wait' is set). That check | ||
1674 | * and the checks below have the combined affect in the second loop of | ||
1675 | * the __alloc_pages() routine that: | ||
1676 | * in_interrupt - any node ok (current task context irrelevant) | ||
1677 | * GFP_ATOMIC - any node ok | ||
1678 | * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok | ||
1679 | * GFP_USER - only nodes in current tasks mems allowed ok. | ||
1680 | **/ | ||
1681 | |||
1682 | int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) | ||
1683 | { | ||
1684 | int node; /* node that zone z is on */ | ||
1685 | const struct cpuset *cs; /* current cpuset ancestors */ | ||
1686 | int allowed = 1; /* is allocation in zone z allowed? */ | ||
1687 | |||
1688 | if (in_interrupt()) | ||
1689 | return 1; | ||
1690 | node = z->zone_pgdat->node_id; | ||
1691 | if (node_isset(node, current->mems_allowed)) | ||
1692 | return 1; | ||
1693 | if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ | ||
1694 | return 0; | ||
1695 | |||
1696 | /* Not hardwall and node outside mems_allowed: scan up cpusets */ | ||
1697 | cpuset_down(&cpuset_sem); | ||
1698 | cs = current->cpuset; | ||
1699 | if (!cs) | ||
1700 | goto done; /* current task exiting */ | ||
1701 | cs = nearest_exclusive_ancestor(cs); | ||
1702 | allowed = node_isset(node, cs->mems_allowed); | ||
1703 | done: | ||
1704 | cpuset_up(&cpuset_sem); | ||
1705 | return allowed; | ||
1706 | } | ||
1707 | |||
1708 | /** | ||
1709 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? | ||
1710 | * @p: pointer to task_struct of some other task. | ||
1711 | * | ||
1712 | * Description: Return true if the nearest mem_exclusive ancestor | ||
1713 | * cpusets of tasks @p and current overlap. Used by oom killer to | ||
1714 | * determine if task @p's memory usage might impact the memory | ||
1715 | * available to the current task. | ||
1716 | * | ||
1717 | * Acquires cpuset_sem - not suitable for calling from a fast path. | ||
1718 | **/ | ||
1719 | |||
1720 | int cpuset_excl_nodes_overlap(const struct task_struct *p) | ||
1622 | { | 1721 | { |
1623 | return in_interrupt() || | 1722 | const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ |
1624 | node_isset(z->zone_pgdat->node_id, current->mems_allowed); | 1723 | int overlap = 0; /* do cpusets overlap? */ |
1724 | |||
1725 | cpuset_down(&cpuset_sem); | ||
1726 | cs1 = current->cpuset; | ||
1727 | if (!cs1) | ||
1728 | goto done; /* current task exiting */ | ||
1729 | cs2 = p->cpuset; | ||
1730 | if (!cs2) | ||
1731 | goto done; /* task p is exiting */ | ||
1732 | cs1 = nearest_exclusive_ancestor(cs1); | ||
1733 | cs2 = nearest_exclusive_ancestor(cs2); | ||
1734 | overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); | ||
1735 | done: | ||
1736 | cpuset_up(&cpuset_sem); | ||
1737 | |||
1738 | return overlap; | ||
1625 | } | 1739 | } |
1626 | 1740 | ||
1627 | /* | 1741 | /* |
@@ -1642,7 +1756,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1642 | return -ENOMEM; | 1756 | return -ENOMEM; |
1643 | 1757 | ||
1644 | tsk = m->private; | 1758 | tsk = m->private; |
1645 | down(&cpuset_sem); | 1759 | cpuset_down(&cpuset_sem); |
1646 | task_lock(tsk); | 1760 | task_lock(tsk); |
1647 | cs = tsk->cpuset; | 1761 | cs = tsk->cpuset; |
1648 | task_unlock(tsk); | 1762 | task_unlock(tsk); |
@@ -1657,7 +1771,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v) | |||
1657 | seq_puts(m, buf); | 1771 | seq_puts(m, buf); |
1658 | seq_putc(m, '\n'); | 1772 | seq_putc(m, '\n'); |
1659 | out: | 1773 | out: |
1660 | up(&cpuset_sem); | 1774 | cpuset_up(&cpuset_sem); |
1661 | kfree(buf); | 1775 | kfree(buf); |
1662 | return retval; | 1776 | return retval; |
1663 | } | 1777 | } |
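
The cpuset_zone_allowed() comment above describes a two-pass allocator: a first scan with __GFP_HARDWALL forced on, then a second scan in which GFP_KERNEL requests may escape to the nearest mem_exclusive ancestor. The sketch below is illustrative only (it is not the real mm/page_alloc.c loop, and sketch_alloc() is a made-up name); it just shows how a caller would exercise those two modes:

static struct page *sketch_alloc(unsigned int gfp_mask, struct zonelist *zl)
{
        struct zone **z;

        /* pass 1: hardwall - stay inside current->mems_allowed */
        for (z = zl->zones; *z; z++)
                if (cpuset_zone_allowed(*z, gfp_mask | __GFP_HARDWALL)) {
                        /* ... try to allocate from *z ... */
                }

        /* pass 2: non-hardwall callers may use the mem_exclusive ancestor */
        for (z = zl->zones; *z; z++)
                if (cpuset_zone_allowed(*z, gfp_mask)) {
                        /* ... try harder to allocate from *z ... */
                }
        return NULL;
}
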
diff --git a/kernel/exit.c b/kernel/exit.c
index 5b0fb9f09f21..6d2089a1bce7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize); | |||
368 | static inline void close_files(struct files_struct * files) | 368 | static inline void close_files(struct files_struct * files) |
369 | { | 369 | { |
370 | int i, j; | 370 | int i, j; |
371 | struct fdtable *fdt; | ||
371 | 372 | ||
372 | j = 0; | 373 | j = 0; |
374 | fdt = files_fdtable(files); | ||
373 | for (;;) { | 375 | for (;;) { |
374 | unsigned long set; | 376 | unsigned long set; |
375 | i = j * __NFDBITS; | 377 | i = j * __NFDBITS; |
376 | if (i >= files->max_fdset || i >= files->max_fds) | 378 | if (i >= fdt->max_fdset || i >= fdt->max_fds) |
377 | break; | 379 | break; |
378 | set = files->open_fds->fds_bits[j++]; | 380 | set = fdt->open_fds->fds_bits[j++]; |
379 | while (set) { | 381 | while (set) { |
380 | if (set & 1) { | 382 | if (set & 1) { |
381 | struct file * file = xchg(&files->fd[i], NULL); | 383 | struct file * file = xchg(&fdt->fd[i], NULL); |
382 | if (file) | 384 | if (file) |
383 | filp_close(file, files); | 385 | filp_close(file, files); |
384 | } | 386 | } |
@@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task) | |||
403 | 405 | ||
404 | void fastcall put_files_struct(struct files_struct *files) | 406 | void fastcall put_files_struct(struct files_struct *files) |
405 | { | 407 | { |
408 | struct fdtable *fdt; | ||
409 | |||
406 | if (atomic_dec_and_test(&files->count)) { | 410 | if (atomic_dec_and_test(&files->count)) { |
407 | close_files(files); | 411 | close_files(files); |
408 | /* | 412 | /* |
409 | * Free the fd and fdset arrays if we expanded them. | 413 | * Free the fd and fdset arrays if we expanded them. |
414 | * If the fdtable was embedded, pass files for freeing | ||
415 | * at the end of the RCU grace period. Otherwise, | ||
416 | * you can free files immediately. | ||
410 | */ | 417 | */ |
411 | if (files->fd != &files->fd_array[0]) | 418 | fdt = files_fdtable(files); |
412 | free_fd_array(files->fd, files->max_fds); | 419 | if (fdt == &files->fdtab) |
413 | if (files->max_fdset > __FD_SETSIZE) { | 420 | fdt->free_files = files; |
414 | free_fdset(files->open_fds, files->max_fdset); | 421 | else |
415 | free_fdset(files->close_on_exec, files->max_fdset); | 422 | kmem_cache_free(files_cachep, files); |
416 | } | 423 | free_fdtable(fdt); |
417 | kmem_cache_free(files_cachep, files); | ||
418 | } | 424 | } |
419 | } | 425 | } |
420 | 426 | ||
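
put_files_struct() now hands the teardown to free_fdtable(), which lives in fs/file.c and is not part of this diff. Its presumed shape, consistent with the fdt->rcu head and fdt->free_files back-pointer set up in kernel/fork.c below, is simply to defer the freeing past an RCU grace period (free_fdtable_rcu is an assumed callback name):

void free_fdtable(struct fdtable *fdt)
{
        /* free the fd/fdset arrays - and, if fdt->free_files was set,
         * the embedded files_struct - only after RCU readers are done */
        call_rcu(&fdt->rcu, free_fdtable_rcu);
}
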
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74e..8149f3602881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/jiffies.h> | 36 | #include <linux/jiffies.h> |
37 | #include <linux/futex.h> | 37 | #include <linux/futex.h> |
38 | #include <linux/rcupdate.h> | ||
38 | #include <linux/ptrace.h> | 39 | #include <linux/ptrace.h> |
39 | #include <linux/mount.h> | 40 | #include <linux/mount.h> |
40 | #include <linux/audit.h> | 41 | #include <linux/audit.h> |
@@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
176 | 177 | ||
177 | /* One for us, one for whoever does the "release_task()" (usually parent) */ | 178 | /* One for us, one for whoever does the "release_task()" (usually parent) */ |
178 | atomic_set(&tsk->usage,2); | 179 | atomic_set(&tsk->usage,2); |
180 | atomic_set(&tsk->fs_excl, 0); | ||
179 | return tsk; | 181 | return tsk; |
180 | } | 182 | } |
181 | 183 | ||
@@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | |||
564 | return 0; | 566 | return 0; |
565 | } | 567 | } |
566 | 568 | ||
567 | static int count_open_files(struct files_struct *files, int size) | 569 | static int count_open_files(struct fdtable *fdt) |
568 | { | 570 | { |
571 | int size = fdt->max_fdset; | ||
569 | int i; | 572 | int i; |
570 | 573 | ||
571 | /* Find the last open fd */ | 574 | /* Find the last open fd */ |
572 | for (i = size/(8*sizeof(long)); i > 0; ) { | 575 | for (i = size/(8*sizeof(long)); i > 0; ) { |
573 | if (files->open_fds->fds_bits[--i]) | 576 | if (fdt->open_fds->fds_bits[--i]) |
574 | break; | 577 | break; |
575 | } | 578 | } |
576 | i = (i+1) * 8 * sizeof(long); | 579 | i = (i+1) * 8 * sizeof(long); |
577 | return i; | 580 | return i; |
578 | } | 581 | } |
579 | 582 | ||
583 | static struct files_struct *alloc_files(void) | ||
584 | { | ||
585 | struct files_struct *newf; | ||
586 | struct fdtable *fdt; | ||
587 | |||
588 | newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | ||
589 | if (!newf) | ||
590 | goto out; | ||
591 | |||
592 | atomic_set(&newf->count, 1); | ||
593 | |||
594 | spin_lock_init(&newf->file_lock); | ||
595 | fdt = &newf->fdtab; | ||
596 | fdt->next_fd = 0; | ||
597 | fdt->max_fds = NR_OPEN_DEFAULT; | ||
598 | fdt->max_fdset = __FD_SETSIZE; | ||
599 | fdt->close_on_exec = &newf->close_on_exec_init; | ||
600 | fdt->open_fds = &newf->open_fds_init; | ||
601 | fdt->fd = &newf->fd_array[0]; | ||
602 | INIT_RCU_HEAD(&fdt->rcu); | ||
603 | fdt->free_files = NULL; | ||
604 | fdt->next = NULL; | ||
605 | rcu_assign_pointer(newf->fdt, fdt); | ||
606 | out: | ||
607 | return newf; | ||
608 | } | ||
609 | |||
580 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | 610 | static int copy_files(unsigned long clone_flags, struct task_struct * tsk) |
581 | { | 611 | { |
582 | struct files_struct *oldf, *newf; | 612 | struct files_struct *oldf, *newf; |
583 | struct file **old_fds, **new_fds; | 613 | struct file **old_fds, **new_fds; |
584 | int open_files, size, i, error = 0, expand; | 614 | int open_files, size, i, error = 0, expand; |
615 | struct fdtable *old_fdt, *new_fdt; | ||
585 | 616 | ||
586 | /* | 617 | /* |
587 | * A background process may not have any files ... | 618 | * A background process may not have any files ... |
@@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
602 | */ | 633 | */ |
603 | tsk->files = NULL; | 634 | tsk->files = NULL; |
604 | error = -ENOMEM; | 635 | error = -ENOMEM; |
605 | newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | 636 | newf = alloc_files(); |
606 | if (!newf) | 637 | if (!newf) |
607 | goto out; | 638 | goto out; |
608 | 639 | ||
609 | atomic_set(&newf->count, 1); | ||
610 | |||
611 | spin_lock_init(&newf->file_lock); | ||
612 | newf->next_fd = 0; | ||
613 | newf->max_fds = NR_OPEN_DEFAULT; | ||
614 | newf->max_fdset = __FD_SETSIZE; | ||
615 | newf->close_on_exec = &newf->close_on_exec_init; | ||
616 | newf->open_fds = &newf->open_fds_init; | ||
617 | newf->fd = &newf->fd_array[0]; | ||
618 | |||
619 | spin_lock(&oldf->file_lock); | 640 | spin_lock(&oldf->file_lock); |
620 | 641 | old_fdt = files_fdtable(oldf); | |
621 | open_files = count_open_files(oldf, oldf->max_fdset); | 642 | new_fdt = files_fdtable(newf); |
643 | size = old_fdt->max_fdset; | ||
644 | open_files = count_open_files(old_fdt); | ||
622 | expand = 0; | 645 | expand = 0; |
623 | 646 | ||
624 | /* | 647 | /* |
625 | * Check whether we need to allocate a larger fd array or fd set. | 648 | * Check whether we need to allocate a larger fd array or fd set. |
626 | * Note: we're not a clone task, so the open count won't change. | 649 | * Note: we're not a clone task, so the open count won't change. |
627 | */ | 650 | */ |
628 | if (open_files > newf->max_fdset) { | 651 | if (open_files > new_fdt->max_fdset) { |
629 | newf->max_fdset = 0; | 652 | new_fdt->max_fdset = 0; |
630 | expand = 1; | 653 | expand = 1; |
631 | } | 654 | } |
632 | if (open_files > newf->max_fds) { | 655 | if (open_files > new_fdt->max_fds) { |
633 | newf->max_fds = 0; | 656 | new_fdt->max_fds = 0; |
634 | expand = 1; | 657 | expand = 1; |
635 | } | 658 | } |
636 | 659 | ||
@@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
642 | spin_unlock(&newf->file_lock); | 665 | spin_unlock(&newf->file_lock); |
643 | if (error < 0) | 666 | if (error < 0) |
644 | goto out_release; | 667 | goto out_release; |
668 | new_fdt = files_fdtable(newf); | ||
669 | /* | ||
670 | * Reacquire the oldf lock and a pointer to its fd table | ||
671 | * who knows it may have a new bigger fd table. We need | ||
672 | * the latest pointer. | ||
673 | */ | ||
645 | spin_lock(&oldf->file_lock); | 674 | spin_lock(&oldf->file_lock); |
675 | old_fdt = files_fdtable(oldf); | ||
646 | } | 676 | } |
647 | 677 | ||
648 | old_fds = oldf->fd; | 678 | old_fds = old_fdt->fd; |
649 | new_fds = newf->fd; | 679 | new_fds = new_fdt->fd; |
650 | 680 | ||
651 | memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); | 681 | memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); |
652 | memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); | 682 | memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); |
653 | 683 | ||
654 | for (i = open_files; i != 0; i--) { | 684 | for (i = open_files; i != 0; i--) { |
655 | struct file *f = *old_fds++; | 685 | struct file *f = *old_fds++; |
@@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |||
662 | * is partway through open(). So make sure that this | 692 | * is partway through open(). So make sure that this |
663 | * fd is available to the new process. | 693 | * fd is available to the new process. |
664 | */ | 694 | */ |
665 | FD_CLR(open_files - i, newf->open_fds); | 695 | FD_CLR(open_files - i, new_fdt->open_fds); |
666 | } | 696 | } |
667 | *new_fds++ = f; | 697 | rcu_assign_pointer(*new_fds++, f); |
668 | } | 698 | } |
669 | spin_unlock(&oldf->file_lock); | 699 | spin_unlock(&oldf->file_lock); |
670 | 700 | ||
671 | /* compute the remainder to be cleared */ | 701 | /* compute the remainder to be cleared */ |
672 | size = (newf->max_fds - open_files) * sizeof(struct file *); | 702 | size = (new_fdt->max_fds - open_files) * sizeof(struct file *); |
673 | 703 | ||
674 | /* This is long word aligned thus could use a optimized version */ | 704 | /* This is long word aligned thus could use a optimized version */ |
675 | memset(new_fds, 0, size); | 705 | memset(new_fds, 0, size); |
676 | 706 | ||
677 | if (newf->max_fdset > open_files) { | 707 | if (new_fdt->max_fdset > open_files) { |
678 | int left = (newf->max_fdset-open_files)/8; | 708 | int left = (new_fdt->max_fdset-open_files)/8; |
679 | int start = open_files / (8 * sizeof(unsigned long)); | 709 | int start = open_files / (8 * sizeof(unsigned long)); |
680 | 710 | ||
681 | memset(&newf->open_fds->fds_bits[start], 0, left); | 711 | memset(&new_fdt->open_fds->fds_bits[start], 0, left); |
682 | memset(&newf->close_on_exec->fds_bits[start], 0, left); | 712 | memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); |
683 | } | 713 | } |
684 | 714 | ||
685 | tsk->files = newf; | 715 | tsk->files = newf; |
@@ -688,9 +718,9 @@ out: | |||
688 | return error; | 718 | return error; |
689 | 719 | ||
690 | out_release: | 720 | out_release: |
691 | free_fdset (newf->close_on_exec, newf->max_fdset); | 721 | free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); |
692 | free_fdset (newf->open_fds, newf->max_fdset); | 722 | free_fdset (new_fdt->open_fds, new_fdt->max_fdset); |
693 | free_fd_array(newf->fd, newf->max_fds); | 723 | free_fd_array(new_fdt->fd, new_fdt->max_fds); |
694 | kmem_cache_free(files_cachep, newf); | 724 | kmem_cache_free(files_cachep, newf); |
695 | goto out; | 725 | goto out; |
696 | } | 726 | } |
@@ -994,6 +1024,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
994 | * of CLONE_PTRACE. | 1024 | * of CLONE_PTRACE. |
995 | */ | 1025 | */ |
996 | clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); | 1026 | clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); |
1027 | #ifdef TIF_SYSCALL_EMU | ||
1028 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); | ||
1029 | #endif | ||
997 | 1030 | ||
998 | /* Our parent execution domain becomes current domain | 1031 | /* Our parent execution domain becomes current domain |
999 | These must match for thread signalling to apply */ | 1032 | These must match for thread signalling to apply */ |
@@ -1112,6 +1145,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1112 | __get_cpu_var(process_counts)++; | 1145 | __get_cpu_var(process_counts)++; |
1113 | } | 1146 | } |
1114 | 1147 | ||
1148 | if (!current->signal->tty && p->signal->tty) | ||
1149 | p->signal->tty = NULL; | ||
1150 | |||
1115 | nr_threads++; | 1151 | nr_threads++; |
1116 | total_forks++; | 1152 | total_forks++; |
1117 | write_unlock_irq(&tasklist_lock); | 1153 | write_unlock_irq(&tasklist_lock); |
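
The fork path above publishes the new table with rcu_assign_pointer() and re-reads oldf's table after any expansion because a concurrent expand might have swapped in a bigger one. The payoff is on the read side, where fd lookups (much like the fcheck_files() helper this series converts elsewhere) can run under rcu_read_lock() instead of files->file_lock. A hedged sketch of such a reader (sketch_fcheck is an illustrative name, not code from this patch):

struct file *sketch_fcheck(struct files_struct *files, unsigned int fd)
{
        struct file *file = NULL;
        struct fdtable *fdt;

        rcu_read_lock();
        fdt = rcu_dereference(files->fdt);
        if (fd < fdt->max_fds)
                file = rcu_dereference(fdt->fd[fd]);
        rcu_read_unlock();
        return file;
}
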
diff --git a/kernel/futex.c b/kernel/futex.c
index c7130f86106c..ca05fe6a70b2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -40,6 +40,7 @@ | |||
40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
41 | #include <linux/syscalls.h> | 41 | #include <linux/syscalls.h> |
42 | #include <linux/signal.h> | 42 | #include <linux/signal.h> |
43 | #include <asm/futex.h> | ||
43 | 44 | ||
44 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 45 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
45 | 46 | ||
@@ -327,6 +328,118 @@ out: | |||
327 | } | 328 | } |
328 | 329 | ||
329 | /* | 330 | /* |
331 | * Wake up all waiters hashed on the physical page that is mapped | ||
332 | * to this virtual address: | ||
333 | */ | ||
334 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | ||
335 | { | ||
336 | union futex_key key1, key2; | ||
337 | struct futex_hash_bucket *bh1, *bh2; | ||
338 | struct list_head *head; | ||
339 | struct futex_q *this, *next; | ||
340 | int ret, op_ret, attempt = 0; | ||
341 | |||
342 | retryfull: | ||
343 | down_read(¤t->mm->mmap_sem); | ||
344 | |||
345 | ret = get_futex_key(uaddr1, &key1); | ||
346 | if (unlikely(ret != 0)) | ||
347 | goto out; | ||
348 | ret = get_futex_key(uaddr2, &key2); | ||
349 | if (unlikely(ret != 0)) | ||
350 | goto out; | ||
351 | |||
352 | bh1 = hash_futex(&key1); | ||
353 | bh2 = hash_futex(&key2); | ||
354 | |||
355 | retry: | ||
356 | if (bh1 < bh2) | ||
357 | spin_lock(&bh1->lock); | ||
358 | spin_lock(&bh2->lock); | ||
359 | if (bh1 > bh2) | ||
360 | spin_lock(&bh1->lock); | ||
361 | |||
362 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | ||
363 | if (unlikely(op_ret < 0)) { | ||
364 | int dummy; | ||
365 | |||
366 | spin_unlock(&bh1->lock); | ||
367 | if (bh1 != bh2) | ||
368 | spin_unlock(&bh2->lock); | ||
369 | |||
370 | /* futex_atomic_op_inuser needs to both read and write | ||
371 | * *(int __user *)uaddr2, but we can't modify it | ||
372 | * non-atomically. Therefore, if get_user below is not | ||
373 | * enough, we need to handle the fault ourselves, while | ||
374 | * still holding the mmap_sem. */ | ||
375 | if (attempt++) { | ||
376 | struct vm_area_struct * vma; | ||
377 | struct mm_struct *mm = current->mm; | ||
378 | |||
379 | ret = -EFAULT; | ||
380 | if (attempt >= 2 || | ||
381 | !(vma = find_vma(mm, uaddr2)) || | ||
382 | vma->vm_start > uaddr2 || | ||
383 | !(vma->vm_flags & VM_WRITE)) | ||
384 | goto out; | ||
385 | |||
386 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
387 | case VM_FAULT_MINOR: | ||
388 | current->min_flt++; | ||
389 | break; | ||
390 | case VM_FAULT_MAJOR: | ||
391 | current->maj_flt++; | ||
392 | break; | ||
393 | default: | ||
394 | goto out; | ||
395 | } | ||
396 | goto retry; | ||
397 | } | ||
398 | |||
399 | /* If we would have faulted, release mmap_sem, | ||
400 | * fault it in and start all over again. */ | ||
401 | up_read(¤t->mm->mmap_sem); | ||
402 | |||
403 | ret = get_user(dummy, (int __user *)uaddr2); | ||
404 | if (ret) | ||
405 | return ret; | ||
406 | |||
407 | goto retryfull; | ||
408 | } | ||
409 | |||
410 | head = &bh1->chain; | ||
411 | |||
412 | list_for_each_entry_safe(this, next, head, list) { | ||
413 | if (match_futex (&this->key, &key1)) { | ||
414 | wake_futex(this); | ||
415 | if (++ret >= nr_wake) | ||
416 | break; | ||
417 | } | ||
418 | } | ||
419 | |||
420 | if (op_ret > 0) { | ||
421 | head = &bh2->chain; | ||
422 | |||
423 | op_ret = 0; | ||
424 | list_for_each_entry_safe(this, next, head, list) { | ||
425 | if (match_futex (&this->key, &key2)) { | ||
426 | wake_futex(this); | ||
427 | if (++op_ret >= nr_wake2) | ||
428 | break; | ||
429 | } | ||
430 | } | ||
431 | ret += op_ret; | ||
432 | } | ||
433 | |||
434 | spin_unlock(&bh1->lock); | ||
435 | if (bh1 != bh2) | ||
436 | spin_unlock(&bh2->lock); | ||
437 | out: | ||
438 | up_read(¤t->mm->mmap_sem); | ||
439 | return ret; | ||
440 | } | ||
441 | |||
442 | /* | ||
330 | * Requeue all waiters hashed on one physical page to another | 443 | * Requeue all waiters hashed on one physical page to another |
331 | * physical page. | 444 | * physical page. |
332 | */ | 445 | */ |
@@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
673 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; | 786 | filp->f_mapping = filp->f_dentry->d_inode->i_mapping; |
674 | 787 | ||
675 | if (signal) { | 788 | if (signal) { |
676 | int err; | ||
677 | err = f_setown(filp, current->pid, 1); | 789 | err = f_setown(filp, current->pid, 1); |
678 | if (err < 0) { | 790 | if (err < 0) { |
679 | put_unused_fd(ret); | 791 | goto error; |
680 | put_filp(filp); | ||
681 | ret = err; | ||
682 | goto out; | ||
683 | } | 792 | } |
684 | filp->f_owner.signum = signal; | 793 | filp->f_owner.signum = signal; |
685 | } | 794 | } |
686 | 795 | ||
687 | q = kmalloc(sizeof(*q), GFP_KERNEL); | 796 | q = kmalloc(sizeof(*q), GFP_KERNEL); |
688 | if (!q) { | 797 | if (!q) { |
689 | put_unused_fd(ret); | 798 | err = -ENOMEM; |
690 | put_filp(filp); | 799 | goto error; |
691 | ret = -ENOMEM; | ||
692 | goto out; | ||
693 | } | 800 | } |
694 | 801 | ||
695 | down_read(¤t->mm->mmap_sem); | 802 | down_read(¤t->mm->mmap_sem); |
@@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
697 | 804 | ||
698 | if (unlikely(err != 0)) { | 805 | if (unlikely(err != 0)) { |
699 | up_read(¤t->mm->mmap_sem); | 806 | up_read(¤t->mm->mmap_sem); |
700 | put_unused_fd(ret); | ||
701 | put_filp(filp); | ||
702 | kfree(q); | 807 | kfree(q); |
703 | return err; | 808 | goto error; |
704 | } | 809 | } |
705 | 810 | ||
706 | /* | 811 | /* |
@@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
716 | fd_install(ret, filp); | 821 | fd_install(ret, filp); |
717 | out: | 822 | out: |
718 | return ret; | 823 | return ret; |
824 | error: | ||
825 | put_unused_fd(ret); | ||
826 | put_filp(filp); | ||
827 | ret = err; | ||
828 | goto out; | ||
719 | } | 829 | } |
720 | 830 | ||
721 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 831 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, |
@@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
740 | case FUTEX_CMP_REQUEUE: | 850 | case FUTEX_CMP_REQUEUE: |
741 | ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); | 851 | ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); |
742 | break; | 852 | break; |
853 | case FUTEX_WAKE_OP: | ||
854 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | ||
855 | break; | ||
743 | default: | 856 | default: |
744 | ret = -ENOSYS; | 857 | ret = -ENOSYS; |
745 | } | 858 | } |
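
With the FUTEX_WAKE_OP case wired into do_futex() above, userspace reaches the new operation through the ordinary futex system call multiplexer. The call below is a hypothetical illustration only: the wrapper name is made up, FUTEX_WAKE_OP's value is assumed from linux/futex.h, and nr_wake2 rides in the timeout slot because do_futex() takes it as val2:

#include <sys/syscall.h>
#include <unistd.h>

#ifndef FUTEX_WAKE_OP
#define FUTEX_WAKE_OP 5         /* assumed value from <linux/futex.h> */
#endif

/* Wake up to nr_wake1 waiters on uaddr1, atomically apply the encoded
 * 'op' to *uaddr2 and, if its comparison succeeds, wake up to nr_wake2
 * waiters on uaddr2 - all under both futex hash bucket locks. */
static long futex_wake_op_call(int *uaddr1, int *uaddr2,
                               int nr_wake1, int nr_wake2, int op)
{
        return syscall(SYS_futex, uaddr1, FUTEX_WAKE_OP, nr_wake1,
                       (unsigned long)nr_wake2, uaddr2, op);
}
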
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
index 388977f3e9b7..0cbe633420fb 100644
--- a/kernel/intermodule.c
+++ b/kernel/intermodule.c
@@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void | |||
39 | struct list_head *tmp; | 39 | struct list_head *tmp; |
40 | struct inter_module_entry *ime, *ime_new; | 40 | struct inter_module_entry *ime, *ime_new; |
41 | 41 | ||
42 | if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { | 42 | if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { |
43 | /* Overloaded kernel, not fatal */ | 43 | /* Overloaded kernel, not fatal */ |
44 | printk(KERN_ERR | 44 | printk(KERN_ERR |
45 | "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", | 45 | "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", |
@@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void | |||
47 | kmalloc_failed = 1; | 47 | kmalloc_failed = 1; |
48 | return; | 48 | return; |
49 | } | 49 | } |
50 | memset(ime_new, 0, sizeof(*ime_new)); | ||
51 | ime_new->im_name = im_name; | 50 | ime_new->im_name = im_name; |
52 | ime_new->owner = owner; | 51 | ime_new->owner = owner; |
53 | ime_new->userdata = userdata; | 52 | ime_new->userdata = userdata; |
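
For comparison, the lines removed above open-coded exactly what the new kzalloc() call does in one step:

        ime_new = kmalloc(sizeof(*ime_new), GFP_KERNEL);
        /* ... the error path returns early ... */
        memset(ime_new, 0, sizeof(*ime_new));

so the change is purely a simplification; the allocation flags and zeroing behaviour are unchanged.
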
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c29f83c16497..3ff7b925c387 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) | |||
111 | unsigned int status; | 111 | unsigned int status; |
112 | 112 | ||
113 | kstat_this_cpu.irqs[irq]++; | 113 | kstat_this_cpu.irqs[irq]++; |
114 | if (desc->status & IRQ_PER_CPU) { | 114 | if (CHECK_IRQ_PER_CPU(desc->status)) { |
115 | irqreturn_t action_ret; | 115 | irqreturn_t action_ret; |
116 | 116 | ||
117 | /* | 117 | /* |
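
The open-coded `desc->status & IRQ_PER_CPU` test becomes CHECK_IRQ_PER_CPU() so that architectures without per-CPU interrupts can compile the whole branch away. The macro lives in include/linux/irq.h, outside this diff; presumably it is shaped like:

#ifdef CONFIG_IRQ_PER_CPU
# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
#else
# define CHECK_IRQ_PER_CPU(var) 0
#endif
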
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@ | |||
18 | 18 | ||
19 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; | 19 | cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; |
20 | 20 | ||
21 | #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) | ||
22 | cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; | ||
23 | #endif | ||
24 | |||
21 | /** | 25 | /** |
22 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) | 26 | * synchronize_irq - wait for pending IRQ handlers (on other CPUs) |
23 | * | 27 | * |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | |||
19 | */ | 19 | */ |
20 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; | 20 | static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; |
21 | 21 | ||
22 | void __attribute__((weak)) | 22 | #ifdef CONFIG_GENERIC_PENDING_IRQ |
23 | proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | 23 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) |
24 | { | ||
25 | /* | ||
26 | * Save these away for later use. Re-progam when the | ||
27 | * interrupt is pending | ||
28 | */ | ||
29 | set_pending_irq(irq, mask_val); | ||
30 | } | ||
31 | #else | ||
32 | void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) | ||
24 | { | 33 | { |
25 | irq_affinity[irq] = mask_val; | 34 | irq_affinity[irq] = mask_val; |
26 | irq_desc[irq].handler->set_affinity(irq, mask_val); | 35 | irq_desc[irq].handler->set_affinity(irq, mask_val); |
27 | } | 36 | } |
37 | #endif | ||
28 | 38 | ||
29 | static int irq_affinity_read_proc(char *page, char **start, off_t off, | 39 | static int irq_affinity_read_proc(char *page, char **start, off_t off, |
30 | int count, int *eof, void *data) | 40 | int count, int *eof, void *data) |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b0237122b24e..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/init.h> | 37 | #include <linux/init.h> |
38 | #include <linux/module.h> | 38 | #include <linux/module.h> |
39 | #include <linux/moduleloader.h> | 39 | #include <linux/moduleloader.h> |
40 | #include <asm-generic/sections.h> | ||
40 | #include <asm/cacheflush.h> | 41 | #include <asm/cacheflush.h> |
41 | #include <asm/errno.h> | 42 | #include <asm/errno.h> |
42 | #include <asm/kdebug.h> | 43 | #include <asm/kdebug.h> |
@@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages; | |||
72 | * get_insn_slot() - Find a slot on an executable page for an instruction. | 73 | * get_insn_slot() - Find a slot on an executable page for an instruction. |
73 | * We allocate an executable page if there's no room on existing ones. | 74 | * We allocate an executable page if there's no room on existing ones. |
74 | */ | 75 | */ |
75 | kprobe_opcode_t *get_insn_slot(void) | 76 | kprobe_opcode_t __kprobes *get_insn_slot(void) |
76 | { | 77 | { |
77 | struct kprobe_insn_page *kip; | 78 | struct kprobe_insn_page *kip; |
78 | struct hlist_node *pos; | 79 | struct hlist_node *pos; |
@@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void) | |||
117 | return kip->insns; | 118 | return kip->insns; |
118 | } | 119 | } |
119 | 120 | ||
120 | void free_insn_slot(kprobe_opcode_t *slot) | 121 | void __kprobes free_insn_slot(kprobe_opcode_t *slot) |
121 | { | 122 | { |
122 | struct kprobe_insn_page *kip; | 123 | struct kprobe_insn_page *kip; |
123 | struct hlist_node *pos; | 124 | struct hlist_node *pos; |
@@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot) | |||
152 | } | 153 | } |
153 | 154 | ||
154 | /* Locks kprobe: irqs must be disabled */ | 155 | /* Locks kprobe: irqs must be disabled */ |
155 | void lock_kprobes(void) | 156 | void __kprobes lock_kprobes(void) |
156 | { | 157 | { |
158 | unsigned long flags = 0; | ||
159 | |||
160 | /* Avoiding local interrupts to happen right after we take the kprobe_lock | ||
161 | * and before we get a chance to update kprobe_cpu, this to prevent | ||
162 | * deadlock when we have a kprobe on ISR routine and a kprobe on task | ||
163 | * routine | ||
164 | */ | ||
165 | local_irq_save(flags); | ||
166 | |||
157 | spin_lock(&kprobe_lock); | 167 | spin_lock(&kprobe_lock); |
158 | kprobe_cpu = smp_processor_id(); | 168 | kprobe_cpu = smp_processor_id(); |
169 | |||
170 | local_irq_restore(flags); | ||
159 | } | 171 | } |
160 | 172 | ||
161 | void unlock_kprobes(void) | 173 | void __kprobes unlock_kprobes(void) |
162 | { | 174 | { |
175 | unsigned long flags = 0; | ||
176 | |||
177 | /* Avoiding local interrupts to happen right after we update | ||
178 | * kprobe_cpu and before we get a a chance to release kprobe_lock, | ||
179 | * this to prevent deadlock when we have a kprobe on ISR routine and | ||
180 | * a kprobe on task routine | ||
181 | */ | ||
182 | local_irq_save(flags); | ||
183 | |||
163 | kprobe_cpu = NR_CPUS; | 184 | kprobe_cpu = NR_CPUS; |
164 | spin_unlock(&kprobe_lock); | 185 | spin_unlock(&kprobe_lock); |
186 | |||
187 | local_irq_restore(flags); | ||
165 | } | 188 | } |
166 | 189 | ||
167 | /* You have to be holding the kprobe_lock */ | 190 | /* You have to be holding the kprobe_lock */ |
168 | struct kprobe *get_kprobe(void *addr) | 191 | struct kprobe __kprobes *get_kprobe(void *addr) |
169 | { | 192 | { |
170 | struct hlist_head *head; | 193 | struct hlist_head *head; |
171 | struct hlist_node *node; | 194 | struct hlist_node *node; |
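
Every function in the kprobes core gains a __kprobes annotation in this file, and <asm-generic/sections.h> is now included for the matching section boundary symbols. Neither definition is part of this hunk; presumably the annotation, and the guard built on it, look roughly like:

/* assumed definition from include/linux/kprobes.h */
#define __kprobes __attribute__((__section__(".kprobes.text")))

/* illustrative check against the <asm-generic/sections.h> symbols, so
 * that registration can refuse probes inside the kprobes machinery */
static int addr_in_kprobes_text(void *addr)
{
        return addr >= (void *)__kprobes_text_start &&
               addr <  (void *)__kprobes_text_end;
}
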
@@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr) | |||
183 | * Aggregate handlers for multiple kprobes support - these handlers | 206 | * Aggregate handlers for multiple kprobes support - these handlers |
184 | * take care of invoking the individual kprobe handlers on p->list | 207 | * take care of invoking the individual kprobe handlers on p->list |
185 | */ | 208 | */ |
186 | static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | 209 | static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) |
187 | { | 210 | { |
188 | struct kprobe *kp; | 211 | struct kprobe *kp; |
189 | 212 | ||
@@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) | |||
198 | return 0; | 221 | return 0; |
199 | } | 222 | } |
200 | 223 | ||
201 | static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | 224 | static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, |
202 | unsigned long flags) | 225 | unsigned long flags) |
203 | { | 226 | { |
204 | struct kprobe *kp; | 227 | struct kprobe *kp; |
205 | 228 | ||
@@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, | |||
213 | return; | 236 | return; |
214 | } | 237 | } |
215 | 238 | ||
216 | static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | 239 | static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, |
217 | int trapnr) | 240 | int trapnr) |
218 | { | 241 | { |
219 | /* | 242 | /* |
220 | * if we faulted "during" the execution of a user specified | 243 | * if we faulted "during" the execution of a user specified |
@@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, | |||
227 | return 0; | 250 | return 0; |
228 | } | 251 | } |
229 | 252 | ||
230 | static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | 253 | static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) |
231 | { | 254 | { |
232 | struct kprobe *kp = curr_kprobe; | 255 | struct kprobe *kp = curr_kprobe; |
233 | if (curr_kprobe && kp->break_handler) { | 256 | if (curr_kprobe && kp->break_handler) { |
@@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) | |||
240 | return 0; | 263 | return 0; |
241 | } | 264 | } |
242 | 265 | ||
243 | struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) | 266 | struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) |
244 | { | 267 | { |
245 | struct hlist_node *node; | 268 | struct hlist_node *node; |
246 | struct kretprobe_instance *ri; | 269 | struct kretprobe_instance *ri; |
@@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) | |||
249 | return NULL; | 272 | return NULL; |
250 | } | 273 | } |
251 | 274 | ||
252 | static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) | 275 | static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe |
276 | *rp) | ||
253 | { | 277 | { |
254 | struct hlist_node *node; | 278 | struct hlist_node *node; |
255 | struct kretprobe_instance *ri; | 279 | struct kretprobe_instance *ri; |
@@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) | |||
258 | return NULL; | 282 | return NULL; |
259 | } | 283 | } |
260 | 284 | ||
261 | void add_rp_inst(struct kretprobe_instance *ri) | 285 | void __kprobes add_rp_inst(struct kretprobe_instance *ri) |
262 | { | 286 | { |
263 | /* | 287 | /* |
264 | * Remove rp inst off the free list - | 288 | * Remove rp inst off the free list - |
@@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri) | |||
276 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); | 300 | hlist_add_head(&ri->uflist, &ri->rp->used_instances); |
277 | } | 301 | } |
278 | 302 | ||
279 | void recycle_rp_inst(struct kretprobe_instance *ri) | 303 | void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) |
280 | { | 304 | { |
281 | /* remove rp inst off the rprobe_inst_table */ | 305 | /* remove rp inst off the rprobe_inst_table */ |
282 | hlist_del(&ri->hlist); | 306 | hlist_del(&ri->hlist); |
@@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri) | |||
291 | kfree(ri); | 315 | kfree(ri); |
292 | } | 316 | } |
293 | 317 | ||
294 | struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) | 318 | struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) |
295 | { | 319 | { |
296 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; | 320 | return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; |
297 | } | 321 | } |
@@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) | |||
302 | * instances associated with this task. These left over instances represent | 326 | * instances associated with this task. These left over instances represent |
303 | * probed functions that have been called but will never return. | 327 | * probed functions that have been called but will never return. |
304 | */ | 328 | */ |
305 | void kprobe_flush_task(struct task_struct *tk) | 329 | void __kprobes kprobe_flush_task(struct task_struct *tk) |
306 | { | 330 | { |
307 | struct kretprobe_instance *ri; | 331 | struct kretprobe_instance *ri; |
308 | struct hlist_head *head; | 332 | struct hlist_head *head; |
@@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk) | |||
322 | * This kprobe pre_handler is registered with every kretprobe. When probe | 346 | * This kprobe pre_handler is registered with every kretprobe. When probe |
323 | * hits it will set up the return probe. | 347 | * hits it will set up the return probe. |
324 | */ | 348 | */ |
325 | static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) | 349 | static int __kprobes pre_handler_kretprobe(struct kprobe *p, |
350 | struct pt_regs *regs) | ||
326 | { | 351 | { |
327 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); | 352 | struct kretprobe *rp = container_of(p, struct kretprobe, kp); |
328 | 353 | ||
@@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
353 | * Add the new probe to old_p->list. Fail if this is the | 378 | * Add the new probe to old_p->list. Fail if this is the |
354 | * second jprobe at the address - two jprobes can't coexist | 379 | * second jprobe at the address - two jprobes can't coexist |
355 | */ | 380 | */ |
356 | static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | 381 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) |
357 | { | 382 | { |
358 | struct kprobe *kp; | 383 | struct kprobe *kp; |
359 | 384 | ||
@@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
395 | * the intricacies | 420 | * the intricacies |
396 | * TODO: Move kcalloc outside the spinlock | 421 | * TODO: Move kcalloc outside the spinlock |
397 | */ | 422 | */ |
398 | static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) | 423 | static int __kprobes register_aggr_kprobe(struct kprobe *old_p, |
424 | struct kprobe *p) | ||
399 | { | 425 | { |
400 | int ret = 0; | 426 | int ret = 0; |
401 | struct kprobe *ap; | 427 | struct kprobe *ap; |
@@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p, | |||
434 | spin_unlock_irqrestore(&kprobe_lock, flags); | 460 | spin_unlock_irqrestore(&kprobe_lock, flags); |
435 | } | 461 | } |
436 | 462 | ||
437 | int register_kprobe(struct kprobe *p) | 463 | static int __kprobes in_kprobes_functions(unsigned long addr) |
464 | { | ||
465 | if (addr >= (unsigned long)__kprobes_text_start | ||
466 | && addr < (unsigned long)__kprobes_text_end) | ||
467 | return -EINVAL; | ||
468 | return 0; | ||
469 | } | ||
470 | |||
471 | int __kprobes register_kprobe(struct kprobe *p) | ||
438 | { | 472 | { |
439 | int ret = 0; | 473 | int ret = 0; |
440 | unsigned long flags = 0; | 474 | unsigned long flags = 0; |
441 | struct kprobe *old_p; | 475 | struct kprobe *old_p; |
442 | 476 | ||
443 | if ((ret = arch_prepare_kprobe(p)) != 0) { | 477 | if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) |
478 | return ret; | ||
479 | if ((ret = arch_prepare_kprobe(p)) != 0) | ||
444 | goto rm_kprobe; | 480 | goto rm_kprobe; |
445 | } | 481 | |
446 | spin_lock_irqsave(&kprobe_lock, flags); | 482 | spin_lock_irqsave(&kprobe_lock, flags); |
447 | old_p = get_kprobe(p->addr); | 483 | old_p = get_kprobe(p->addr); |
448 | p->nmissed = 0; | 484 | p->nmissed = 0; |
@@ -466,7 +502,7 @@ rm_kprobe: | |||
466 | return ret; | 502 | return ret; |
467 | } | 503 | } |
468 | 504 | ||
469 | void unregister_kprobe(struct kprobe *p) | 505 | void __kprobes unregister_kprobe(struct kprobe *p) |
470 | { | 506 | { |
471 | unsigned long flags; | 507 | unsigned long flags; |
472 | struct kprobe *old_p; | 508 | struct kprobe *old_p; |
@@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = { | |||
487 | .priority = 0x7fffffff /* we need to be notified first */ | 523 |
488 | }; | 524 | }; |
489 | 525 | ||
490 | int register_jprobe(struct jprobe *jp) | 526 | int __kprobes register_jprobe(struct jprobe *jp) |
491 | { | 527 | { |
492 | /* Todo: Verify probepoint is a function entry point */ | 528 | /* Todo: Verify probepoint is a function entry point */ |
493 | jp->kp.pre_handler = setjmp_pre_handler; | 529 | jp->kp.pre_handler = setjmp_pre_handler; |
@@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp) | |||
496 | return register_kprobe(&jp->kp); | 532 | return register_kprobe(&jp->kp); |
497 | } | 533 | } |
498 | 534 | ||
499 | void unregister_jprobe(struct jprobe *jp) | 535 | void __kprobes unregister_jprobe(struct jprobe *jp) |
500 | { | 536 | { |
501 | unregister_kprobe(&jp->kp); | 537 | unregister_kprobe(&jp->kp); |
502 | } | 538 | } |
503 | 539 | ||
504 | #ifdef ARCH_SUPPORTS_KRETPROBES | 540 | #ifdef ARCH_SUPPORTS_KRETPROBES |
505 | 541 | ||
506 | int register_kretprobe(struct kretprobe *rp) | 542 | int __kprobes register_kretprobe(struct kretprobe *rp) |
507 | { | 543 | { |
508 | int ret = 0; | 544 | int ret = 0; |
509 | struct kretprobe_instance *inst; | 545 | struct kretprobe_instance *inst; |
@@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp) | |||
540 | 576 | ||
541 | #else /* ARCH_SUPPORTS_KRETPROBES */ | 577 | #else /* ARCH_SUPPORTS_KRETPROBES */ |
542 | 578 | ||
543 | int register_kretprobe(struct kretprobe *rp) | 579 | int __kprobes register_kretprobe(struct kretprobe *rp) |
544 | { | 580 | { |
545 | return -ENOSYS; | 581 | return -ENOSYS; |
546 | } | 582 | } |
547 | 583 | ||
548 | #endif /* ARCH_SUPPORTS_KRETPROBES */ | 584 | #endif /* ARCH_SUPPORTS_KRETPROBES */ |
549 | 585 | ||
550 | void unregister_kretprobe(struct kretprobe *rp) | 586 | void __kprobes unregister_kretprobe(struct kretprobe *rp) |
551 | { | 587 | { |
552 | unsigned long flags; | 588 | unsigned long flags; |
553 | struct kretprobe_instance *ri; | 589 | struct kretprobe_instance *ri; |
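The in_kprobes_functions() check introduced above works because the new __kprobes marker moves the kprobe infrastructure into its own text section, whose bounds the linker exports as __kprobes_text_start and __kprobes_text_end; register_kprobe() can then refuse any probe planted on the probe machinery itself. Below is a minimal userspace sketch of that range check only; the start/end values are stand-ins for the linker symbols and the error constant is redefined locally so the sketch compiles on its own.

#include <stdio.h>

#define EINVAL 22

/* stand-in bounds; the kernel takes these from the linker script as
 * __kprobes_text_start / __kprobes_text_end */
static unsigned long kprobes_text_start = 0x1000;
static unsigned long kprobes_text_end   = 0x2000;

static int in_kprobes_functions(unsigned long addr)
{
        if (addr >= kprobes_text_start && addr < kprobes_text_end)
                return -EINVAL;         /* refuse to probe the probe machinery */
        return 0;
}

int main(void)
{
        printf("%d\n", in_kprobes_functions(0x1800));   /* -22: inside the protected range */
        printf("%d\n", in_kprobes_functions(0x3000));   /*   0: safe to probe */
        return 0;
}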
diff --git a/kernel/module.c b/kernel/module.c index c32995fbd8fd..ff5c500ab625 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/module.h> | 20 | #include <linux/module.h> |
21 | #include <linux/moduleloader.h> | 21 | #include <linux/moduleloader.h> |
22 | #include <linux/init.h> | 22 | #include <linux/init.h> |
23 | #include <linux/kernel.h> | ||
23 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
24 | #include <linux/vmalloc.h> | 25 | #include <linux/vmalloc.h> |
25 | #include <linux/elf.h> | 26 | #include <linux/elf.h> |
@@ -498,7 +499,7 @@ static inline int try_force(unsigned int flags) | |||
498 | { | 499 | { |
499 | int ret = (flags & O_TRUNC); | 500 | int ret = (flags & O_TRUNC); |
500 | if (ret) | 501 | if (ret) |
501 | tainted |= TAINT_FORCED_MODULE; | 502 | add_taint(TAINT_FORCED_MODULE); |
502 | return ret; | 503 | return ret; |
503 | } | 504 | } |
504 | #else | 505 | #else |
@@ -897,7 +898,7 @@ static int check_version(Elf_Shdr *sechdrs, | |||
897 | if (!(tainted & TAINT_FORCED_MODULE)) { | 898 | if (!(tainted & TAINT_FORCED_MODULE)) { |
898 | printk("%s: no version for \"%s\" found: kernel tainted.\n", | 899 | printk("%s: no version for \"%s\" found: kernel tainted.\n", |
899 | mod->name, symname); | 900 | mod->name, symname); |
900 | tainted |= TAINT_FORCED_MODULE; | 901 | add_taint(TAINT_FORCED_MODULE); |
901 | } | 902 | } |
902 | return 1; | 903 | return 1; |
903 | } | 904 | } |
@@ -1352,7 +1353,7 @@ static void set_license(struct module *mod, const char *license) | |||
1352 | if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { | 1353 | if (!mod->license_gplok && !(tainted & TAINT_PROPRIETARY_MODULE)) { |
1353 | printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", | 1354 | printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", |
1354 | mod->name, license); | 1355 | mod->name, license); |
1355 | tainted |= TAINT_PROPRIETARY_MODULE; | 1356 | add_taint(TAINT_PROPRIETARY_MODULE); |
1356 | } | 1357 | } |
1357 | } | 1358 | } |
1358 | 1359 | ||
@@ -1509,6 +1510,7 @@ static struct module *load_module(void __user *umod, | |||
1509 | long err = 0; | 1510 | long err = 0; |
1510 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 1511 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ |
1511 | struct exception_table_entry *extable; | 1512 | struct exception_table_entry *extable; |
1513 | mm_segment_t old_fs; | ||
1512 | 1514 | ||
1513 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", | 1515 | DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", |
1514 | umod, len, uargs); | 1516 | umod, len, uargs); |
@@ -1609,7 +1611,7 @@ static struct module *load_module(void __user *umod, | |||
1609 | modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); | 1611 | modmagic = get_modinfo(sechdrs, infoindex, "vermagic"); |
1610 | /* This is allowed: modprobe --force will invalidate it. */ | 1612 | /* This is allowed: modprobe --force will invalidate it. */ |
1611 | if (!modmagic) { | 1613 | if (!modmagic) { |
1612 | tainted |= TAINT_FORCED_MODULE; | 1614 | add_taint(TAINT_FORCED_MODULE); |
1613 | printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", | 1615 | printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", |
1614 | mod->name); | 1616 | mod->name); |
1615 | } else if (!same_magic(modmagic, vermagic)) { | 1617 | } else if (!same_magic(modmagic, vermagic)) { |
@@ -1738,7 +1740,7 @@ static struct module *load_module(void __user *umod, | |||
1738 | (mod->num_gpl_syms && !gplcrcindex)) { | 1740 | (mod->num_gpl_syms && !gplcrcindex)) { |
1739 | printk(KERN_WARNING "%s: No versions for exported symbols." | 1741 | printk(KERN_WARNING "%s: No versions for exported symbols." |
1740 | " Tainting kernel.\n", mod->name); | 1742 | " Tainting kernel.\n", mod->name); |
1741 | tainted |= TAINT_FORCED_MODULE; | 1743 | add_taint(TAINT_FORCED_MODULE); |
1742 | } | 1744 | } |
1743 | #endif | 1745 | #endif |
1744 | 1746 | ||
@@ -1779,6 +1781,24 @@ static struct module *load_module(void __user *umod, | |||
1779 | if (err < 0) | 1781 | if (err < 0) |
1780 | goto cleanup; | 1782 | goto cleanup; |
1781 | 1783 | ||
1784 | /* flush the icache in correct context */ | ||
1785 | old_fs = get_fs(); | ||
1786 | set_fs(KERNEL_DS); | ||
1787 | |||
1788 | /* | ||
1789 | * Flush the instruction cache, since we've played with text. | ||
1790 | * Do it before processing module parameters, so the module | ||
1791 | * can provide parameter accessor functions of its own. | ||
1792 | */ | ||
1793 | if (mod->module_init) | ||
1794 | flush_icache_range((unsigned long)mod->module_init, | ||
1795 | (unsigned long)mod->module_init | ||
1796 | + mod->init_size); | ||
1797 | flush_icache_range((unsigned long)mod->module_core, | ||
1798 | (unsigned long)mod->module_core + mod->core_size); | ||
1799 | |||
1800 | set_fs(old_fs); | ||
1801 | |||
1782 | mod->args = args; | 1802 | mod->args = args; |
1783 | if (obsparmindex) { | 1803 | if (obsparmindex) { |
1784 | err = obsolete_params(mod->name, mod->args, | 1804 | err = obsolete_params(mod->name, mod->args, |
@@ -1860,7 +1880,6 @@ sys_init_module(void __user *umod, | |||
1860 | const char __user *uargs) | 1880 | const char __user *uargs) |
1861 | { | 1881 | { |
1862 | struct module *mod; | 1882 | struct module *mod; |
1863 | mm_segment_t old_fs = get_fs(); | ||
1864 | int ret = 0; | 1883 | int ret = 0; |
1865 | 1884 | ||
1866 | /* Must have permission */ | 1885 | /* Must have permission */ |
@@ -1878,19 +1897,6 @@ sys_init_module(void __user *umod, | |||
1878 | return PTR_ERR(mod); | 1897 | return PTR_ERR(mod); |
1879 | } | 1898 | } |
1880 | 1899 | ||
1881 | /* flush the icache in correct context */ | ||
1882 | set_fs(KERNEL_DS); | ||
1883 | |||
1884 | /* Flush the instruction cache, since we've played with text */ | ||
1885 | if (mod->module_init) | ||
1886 | flush_icache_range((unsigned long)mod->module_init, | ||
1887 | (unsigned long)mod->module_init | ||
1888 | + mod->init_size); | ||
1889 | flush_icache_range((unsigned long)mod->module_core, | ||
1890 | (unsigned long)mod->module_core + mod->core_size); | ||
1891 | |||
1892 | set_fs(old_fs); | ||
1893 | |||
1894 | /* Now sew it into the lists. They won't access us, since | 1900 | /* Now sew it into the lists. They won't access us, since |
1895 | strong_try_module_get() will fail. */ | 1901 | strong_try_module_get() will fail. */ |
1896 | stop_machine_run(__link_module, mod, NR_CPUS); | 1902 | stop_machine_run(__link_module, mod, NR_CPUS); |
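Several hunks in this file replace open-coded "tainted |= TAINT_*" updates with the add_taint() helper, which keeps the taint bookkeeping in one place. A standalone sketch of that helper pattern follows; the flag values are restated locally for the sketch rather than taken from the kernel headers.

#include <stdio.h>

#define TAINT_PROPRIETARY_MODULE (1 << 0)
#define TAINT_FORCED_MODULE      (1 << 1)

static unsigned int tainted;

/* one accessor instead of "tainted |= FLAG" scattered over the call sites */
static void add_taint(unsigned int flag)
{
        tainted |= flag;
}

int main(void)
{
        add_taint(TAINT_FORCED_MODULE);
        add_taint(TAINT_PROPRIETARY_MODULE);
        printf("tainted mask: 0x%x\n", tainted);
        return 0;
}

Funneling the update through one function also gives a single place to hook later changes, such as locking or logging, without touching every caller.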
diff --git a/kernel/params.c b/kernel/params.c index d586c35ef8fc..fbf173215fd2 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
542 | { | 542 | { |
543 | struct module_kobject *mk; | 543 | struct module_kobject *mk; |
544 | 544 | ||
545 | mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); | 545 | mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); |
546 | memset(mk, 0, sizeof(struct module_kobject)); | 546 | BUG_ON(!mk); |
547 | 547 | ||
548 | mk->mod = THIS_MODULE; | 548 | mk->mod = THIS_MODULE; |
549 | kobj_set_kset_s(mk, module_subsys); | 549 | kobj_set_kset_s(mk, module_subsys); |
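This hunk, like the pm.c and resource.c hunks further down, folds a kmalloc() followed by memset(..., 0, ...) into a single kzalloc() call. A userspace analog of the same simplification, using calloc() in place of kzalloc() and a stand-in struct:

#include <stdlib.h>
#include <string.h>

struct module_kobject { int placeholder; };     /* stand-in for the real struct */

/* before: allocate, then clear by hand */
static struct module_kobject *alloc_old(void)
{
        struct module_kobject *mk = malloc(sizeof(*mk));

        if (mk)
                memset(mk, 0, sizeof(*mk));
        return mk;
}

/* after: one call that hands back zeroed memory
 * (kzalloc in the kernel, calloc in this sketch) */
static struct module_kobject *alloc_new(void)
{
        return calloc(1, sizeof(struct module_kobject));
}

int main(void)
{
        free(alloc_old());
        free(alloc_new());
        return 0;
}

Note that the params.c hunk also adds a BUG_ON(!mk) instead of silently continuing with a NULL pointer.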
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 38798a2ff994..b7b532acd9fc 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c | |||
@@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private) | |||
427 | timr->sigq->info.si_code = SI_TIMER; | 427 | timr->sigq->info.si_code = SI_TIMER; |
428 | timr->sigq->info.si_tid = timr->it_id; | 428 | timr->sigq->info.si_tid = timr->it_id; |
429 | timr->sigq->info.si_value = timr->it_sigev_value; | 429 | timr->sigq->info.si_value = timr->it_sigev_value; |
430 | |||
430 | if (timr->it_sigev_notify & SIGEV_THREAD_ID) { | 431 | if (timr->it_sigev_notify & SIGEV_THREAD_ID) { |
431 | if (unlikely(timr->it_process->flags & PF_EXITING)) { | 432 | struct task_struct *leader; |
432 | timr->it_sigev_notify = SIGEV_SIGNAL; | 433 | int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, |
433 | put_task_struct(timr->it_process); | 434 | timr->it_process); |
434 | timr->it_process = timr->it_process->group_leader; | 435 | |
435 | goto group; | 436 | if (likely(ret >= 0)) |
436 | } | 437 | return ret; |
437 | return send_sigqueue(timr->it_sigev_signo, timr->sigq, | 438 | |
438 | timr->it_process); | 439 | timr->it_sigev_notify = SIGEV_SIGNAL; |
439 | } | 440 | leader = timr->it_process->group_leader; |
440 | else { | 441 | put_task_struct(timr->it_process); |
441 | group: | 442 | timr->it_process = leader; |
442 | return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, | ||
443 | timr->it_process); | ||
444 | } | 443 | } |
444 | |||
445 | return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, | ||
446 | timr->it_process); | ||
445 | } | 447 | } |
446 | EXPORT_SYMBOL_GPL(posix_timer_event); | 448 | EXPORT_SYMBOL_GPL(posix_timer_event); |
447 | 449 | ||
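The reworked posix_timer_event() now tries thread-directed delivery first and only retargets the timer at the group leader when that delivery reports failure, instead of checking PF_EXITING up front. A control-flow sketch of that fallback, with stub delivery functions that merely model success and failure in place of send_sigqueue()/send_group_sigqueue():

#include <stdio.h>

/* stand-ins for send_sigqueue()/send_group_sigqueue(): 0 = delivered,
 * negative = delivery failed */
static int deliver_to_thread(int thread_alive)
{
        return thread_alive ? 0 : -1;
}

static int deliver_to_group(void)
{
        return 0;
}

static int posix_timer_event(int thread_directed, int thread_alive)
{
        if (thread_directed) {
                int ret = deliver_to_thread(thread_alive);

                if (ret >= 0)
                        return ret;     /* delivered to the chosen thread */
                /* thread is going away: retarget to the group leader and
                 * fall through to group delivery */
        }
        return deliver_to_group();
}

int main(void)
{
        printf("%d\n", posix_timer_event(1, 1));        /* thread delivery works */
        printf("%d\n", posix_timer_event(1, 0));        /* falls back to the group */
        printf("%d\n", posix_timer_event(0, 0));        /* plain group delivery */
        return 0;
}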
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2c7121d9bff1..396c7873e804 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -1,5 +1,6 @@ | |||
1 | config PM | 1 | config PM |
2 | bool "Power Management support" | 2 | bool "Power Management support" |
3 | depends on !IA64_HP_SIM | ||
3 | ---help--- | 4 | ---help--- |
4 | "Power Management" means that parts of your computer are shut | 5 | "Power Management" means that parts of your computer are shut |
5 | off or put into a power conserving "sleep" mode if they are not | 6 | off or put into a power conserving "sleep" mode if they are not |
@@ -28,7 +29,7 @@ config PM_DEBUG | |||
28 | 29 | ||
29 | config SOFTWARE_SUSPEND | 30 | config SOFTWARE_SUSPEND |
30 | bool "Software Suspend" | 31 | bool "Software Suspend" |
31 | depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) | 32 | depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) |
32 | ---help--- | 33 | ---help--- |
33 | Enable the possibility of suspending the machine. | 34 | Enable the possibility of suspending the machine. |
34 | It doesn't need APM. | 35 | It doesn't need APM. |
@@ -72,6 +73,18 @@ config PM_STD_PARTITION | |||
72 | suspended image to. It will simply pick the first available swap | 73 | suspended image to. It will simply pick the first available swap |
73 | device. | 74 | device. |
74 | 75 | ||
76 | config SWSUSP_ENCRYPT | ||
77 | bool "Encrypt suspend image" | ||
78 | depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) | ||
79 | default "" | ||
80 | ---help--- | ||
81 | To prevent data gathering from swap after resume, you can encrypt | ||
82 | the suspend image with a temporary key that is deleted on | ||
83 | resume. | ||
84 | |||
85 | Note that the temporary key is stored unencrypted on disk while the | ||
86 | system is suspended. | ||
87 | |||
75 | config SUSPEND_SMP | 88 | config SUSPEND_SMP |
76 | bool | 89 | bool |
77 | depends on HOTPLUG_CPU && X86 && PM | 90 | depends on HOTPLUG_CPU && X86 && PM |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 664eb0469b6e..2d8bf054d036 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -112,24 +112,12 @@ static inline void platform_finish(void) | |||
112 | } | 112 | } |
113 | } | 113 | } |
114 | 114 | ||
115 | static void finish(void) | ||
116 | { | ||
117 | device_resume(); | ||
118 | platform_finish(); | ||
119 | thaw_processes(); | ||
120 | enable_nonboot_cpus(); | ||
121 | pm_restore_console(); | ||
122 | } | ||
123 | |||
124 | |||
125 | static int prepare_processes(void) | 115 | static int prepare_processes(void) |
126 | { | 116 | { |
127 | int error; | 117 | int error; |
128 | 118 | ||
129 | pm_prepare_console(); | 119 | pm_prepare_console(); |
130 | |||
131 | sys_sync(); | 120 | sys_sync(); |
132 | |||
133 | disable_nonboot_cpus(); | 121 | disable_nonboot_cpus(); |
134 | 122 | ||
135 | if (freeze_processes()) { | 123 | if (freeze_processes()) { |
@@ -162,15 +150,6 @@ static void unprepare_processes(void) | |||
162 | pm_restore_console(); | 150 | pm_restore_console(); |
163 | } | 151 | } |
164 | 152 | ||
165 | static int prepare_devices(void) | ||
166 | { | ||
167 | int error; | ||
168 | |||
169 | if ((error = device_suspend(PMSG_FREEZE))) | ||
170 | printk("Some devices failed to suspend\n"); | ||
171 | return error; | ||
172 | } | ||
173 | |||
174 | /** | 153 | /** |
175 | * pm_suspend_disk - The granpappy of power management. | 154 | * pm_suspend_disk - The granpappy of power management. |
176 | * | 155 | * |
@@ -187,17 +166,14 @@ int pm_suspend_disk(void) | |||
187 | error = prepare_processes(); | 166 | error = prepare_processes(); |
188 | if (error) | 167 | if (error) |
189 | return error; | 168 | return error; |
190 | error = prepare_devices(); | ||
191 | 169 | ||
170 | error = device_suspend(PMSG_FREEZE); | ||
192 | if (error) { | 171 | if (error) { |
172 | printk("Some devices failed to suspend\n"); | ||
193 | unprepare_processes(); | 173 | unprepare_processes(); |
194 | return error; | 174 | return error; |
195 | } | 175 | } |
196 | 176 | ||
197 | pr_debug("PM: Attempting to suspend to disk.\n"); | ||
198 | if (pm_disk_mode == PM_DISK_FIRMWARE) | ||
199 | return pm_ops->enter(PM_SUSPEND_DISK); | ||
200 | |||
201 | pr_debug("PM: snapshotting memory.\n"); | 177 | pr_debug("PM: snapshotting memory.\n"); |
202 | in_suspend = 1; | 178 | in_suspend = 1; |
203 | if ((error = swsusp_suspend())) | 179 | if ((error = swsusp_suspend())) |
@@ -208,11 +184,20 @@ int pm_suspend_disk(void) | |||
208 | error = swsusp_write(); | 184 | error = swsusp_write(); |
209 | if (!error) | 185 | if (!error) |
210 | power_down(pm_disk_mode); | 186 | power_down(pm_disk_mode); |
187 | else { | ||
188 | /* swsusp_write cannot fail in device_resume, | ||
189 | so there is no need to do a second device_resume */ | ||
190 | swsusp_free(); | ||
191 | unprepare_processes(); | ||
192 | return error; | ||
193 | } | ||
211 | } else | 194 | } else |
212 | pr_debug("PM: Image restored successfully.\n"); | 195 | pr_debug("PM: Image restored successfully.\n"); |
196 | |||
213 | swsusp_free(); | 197 | swsusp_free(); |
214 | Done: | 198 | Done: |
215 | finish(); | 199 | device_resume(); |
200 | unprepare_processes(); | ||
216 | return error; | 201 | return error; |
217 | } | 202 | } |
218 | 203 | ||
@@ -233,9 +218,12 @@ static int software_resume(void) | |||
233 | { | 218 | { |
234 | int error; | 219 | int error; |
235 | 220 | ||
221 | down(&pm_sem); | ||
236 | if (!swsusp_resume_device) { | 222 | if (!swsusp_resume_device) { |
237 | if (!strlen(resume_file)) | 223 | if (!strlen(resume_file)) { |
224 | up(&pm_sem); | ||
238 | return -ENOENT; | 225 | return -ENOENT; |
226 | } | ||
239 | swsusp_resume_device = name_to_dev_t(resume_file); | 227 | swsusp_resume_device = name_to_dev_t(resume_file); |
240 | pr_debug("swsusp: Resume From Partition %s\n", resume_file); | 228 | pr_debug("swsusp: Resume From Partition %s\n", resume_file); |
241 | } else { | 229 | } else { |
@@ -248,6 +236,7 @@ static int software_resume(void) | |||
248 | * FIXME: If noresume is specified, we need to find the partition | 236 | * FIXME: If noresume is specified, we need to find the partition |
249 | * and reset it back to normal swap space. | 237 | * and reset it back to normal swap space. |
250 | */ | 238 | */ |
239 | up(&pm_sem); | ||
251 | return 0; | 240 | return 0; |
252 | } | 241 | } |
253 | 242 | ||
@@ -270,20 +259,24 @@ static int software_resume(void) | |||
270 | 259 | ||
271 | pr_debug("PM: Preparing devices for restore.\n"); | 260 | pr_debug("PM: Preparing devices for restore.\n"); |
272 | 261 | ||
273 | if ((error = prepare_devices())) | 262 | if ((error = device_suspend(PMSG_FREEZE))) { |
263 | printk("Some devices failed to suspend\n"); | ||
274 | goto Free; | 264 | goto Free; |
265 | } | ||
275 | 266 | ||
276 | mb(); | 267 | mb(); |
277 | 268 | ||
278 | pr_debug("PM: Restoring saved image.\n"); | 269 | pr_debug("PM: Restoring saved image.\n"); |
279 | swsusp_resume(); | 270 | swsusp_resume(); |
280 | pr_debug("PM: Restore failed, recovering.n"); | 271 | pr_debug("PM: Restore failed, recovering.n"); |
281 | finish(); | 272 | device_resume(); |
282 | Free: | 273 | Free: |
283 | swsusp_free(); | 274 | swsusp_free(); |
284 | Cleanup: | 275 | Cleanup: |
285 | unprepare_processes(); | 276 | unprepare_processes(); |
286 | Done: | 277 | Done: |
278 | /* For success case, the suspend path will release the lock */ | ||
279 | up(&pm_sem); | ||
287 | pr_debug("PM: Resume from disk failed.\n"); | 280 | pr_debug("PM: Resume from disk failed.\n"); |
288 | return 0; | 281 | return 0; |
289 | } | 282 | } |
@@ -390,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t | |||
390 | if (sscanf(buf, "%u:%u", &maj, &min) == 2) { | 383 | if (sscanf(buf, "%u:%u", &maj, &min) == 2) { |
391 | res = MKDEV(maj,min); | 384 | res = MKDEV(maj,min); |
392 | if (maj == MAJOR(res) && min == MINOR(res)) { | 385 | if (maj == MAJOR(res) && min == MINOR(res)) { |
386 | down(&pm_sem); | ||
393 | swsusp_resume_device = res; | 387 | swsusp_resume_device = res; |
388 | up(&pm_sem); | ||
394 | printk("Attempting manual resume\n"); | 389 | printk("Attempting manual resume\n"); |
395 | noresume = 0; | 390 | noresume = 0; |
396 | software_resume(); | 391 | software_resume(); |
diff --git a/kernel/power/main.c b/kernel/power/main.c index 71aa0fd22007..22bdc93cc038 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state) | |||
143 | 143 | ||
144 | 144 | ||
145 | 145 | ||
146 | static char * pm_states[] = { | 146 | static char *pm_states[PM_SUSPEND_MAX] = { |
147 | [PM_SUSPEND_STANDBY] = "standby", | 147 | [PM_SUSPEND_STANDBY] = "standby", |
148 | [PM_SUSPEND_MEM] = "mem", | 148 | [PM_SUSPEND_MEM] = "mem", |
149 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
149 | [PM_SUSPEND_DISK] = "disk", | 150 | [PM_SUSPEND_DISK] = "disk", |
150 | NULL, | 151 | #endif |
151 | }; | 152 | }; |
152 | 153 | ||
153 | 154 | ||
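Sizing pm_states[] by PM_SUSPEND_MAX and relying on designated initializers means any slot that is not named, including the "disk" entry when CONFIG_SOFTWARE_SUSPEND is off, stays NULL, so lookup code can test for NULL without a trailing sentinel. A small standalone illustration; the enum values are stand-ins shaped like the suspend_state_t constants rather than taken from the kernel headers:

#include <stdio.h>

enum {
        PM_SUSPEND_STANDBY = 1,
        PM_SUSPEND_MEM,
        PM_SUSPEND_DISK,
        PM_SUSPEND_MAX
};

static const char *pm_states[PM_SUSPEND_MAX] = {
        [PM_SUSPEND_STANDBY] = "standby",
        [PM_SUSPEND_MEM]     = "mem",
        /* [PM_SUSPEND_DISK] left out, as when CONFIG_SOFTWARE_SUSPEND=n */
};

int main(void)
{
        for (int i = 0; i < PM_SUSPEND_MAX; i++)
                printf("state %d -> %s\n", i, pm_states[i] ? pm_states[i] : "(unset)");
        return 0;
}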
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 61deda04e39e..159149321b3c 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
@@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type, | |||
60 | unsigned long id, | 60 | unsigned long id, |
61 | pm_callback callback) | 61 | pm_callback callback) |
62 | { | 62 | { |
63 | struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); | 63 | struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); |
64 | if (dev) { | 64 | if (dev) { |
65 | memset(dev, 0, sizeof(*dev)); | ||
66 | dev->type = type; | 65 | dev->type = type; |
67 | dev->id = id; | 66 | dev->id = id; |
68 | dev->callback = callback; | 67 | dev->callback = callback; |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 3bd0d261818f..28de118f7a0b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -38,7 +38,6 @@ void refrigerator(void) | |||
38 | processes around? */ | 38 | processes around? */ |
39 | long save; | 39 | long save; |
40 | save = current->state; | 40 | save = current->state; |
41 | current->state = TASK_UNINTERRUPTIBLE; | ||
42 | pr_debug("%s entered refrigerator\n", current->comm); | 41 | pr_debug("%s entered refrigerator\n", current->comm); |
43 | printk("="); | 42 | printk("="); |
44 | 43 | ||
@@ -47,8 +46,10 @@ void refrigerator(void) | |||
47 | recalc_sigpending(); /* We sent fake signal, clean it up */ | 46 | recalc_sigpending(); /* We sent fake signal, clean it up */ |
48 | spin_unlock_irq(¤t->sighand->siglock); | 47 | spin_unlock_irq(¤t->sighand->siglock); |
49 | 48 | ||
50 | while (frozen(current)) | 49 | while (frozen(current)) { |
50 | current->state = TASK_UNINTERRUPTIBLE; | ||
51 | schedule(); | 51 | schedule(); |
52 | } | ||
52 | pr_debug("%s left refrigerator\n", current->comm); | 53 | pr_debug("%s left refrigerator\n", current->comm); |
53 | current->state = save; | 54 | current->state = save; |
54 | } | 55 | } |
@@ -80,13 +81,33 @@ int freeze_processes(void) | |||
80 | } while_each_thread(g, p); | 81 | } while_each_thread(g, p); |
81 | read_unlock(&tasklist_lock); | 82 | read_unlock(&tasklist_lock); |
82 | yield(); /* Yield is okay here */ | 83 | yield(); /* Yield is okay here */ |
83 | if (time_after(jiffies, start_time + TIMEOUT)) { | 84 | if (todo && time_after(jiffies, start_time + TIMEOUT)) { |
84 | printk( "\n" ); | 85 | printk( "\n" ); |
85 | printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); | 86 | printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); |
86 | return todo; | 87 | break; |
87 | } | 88 | } |
88 | } while(todo); | 89 | } while(todo); |
89 | 90 | ||
91 | /* This does not unfreeze processes that are already frozen | ||
92 | * (we have slightly ugly calling convention in that respect, | ||
93 | * and caller must call thaw_processes() if something fails), | ||
94 | * but it cleans up leftover PF_FREEZE requests. | ||
95 | */ | ||
96 | if (todo) { | ||
97 | read_lock(&tasklist_lock); | ||
98 | do_each_thread(g, p) | ||
99 | if (freezing(p)) { | ||
100 | pr_debug(" clean up: %s\n", p->comm); | ||
101 | p->flags &= ~PF_FREEZE; | ||
102 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
103 | recalc_sigpending_tsk(p); | ||
104 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
105 | } | ||
106 | while_each_thread(g, p); | ||
107 | read_unlock(&tasklist_lock); | ||
108 | return todo; | ||
109 | } | ||
110 | |||
90 | printk( "|\n" ); | 111 | printk( "|\n" ); |
91 | BUG_ON(in_atomic()); | 112 | BUG_ON(in_atomic()); |
92 | return 0; | 113 | return 0; |
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f2bc71b9fe8b..d967e875ee82 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -31,6 +31,9 @@ | |||
31 | * Alex Badea <vampire@go.ro>: | 31 | * Alex Badea <vampire@go.ro>: |
32 | * Fixed runaway init | 32 | * Fixed runaway init |
33 | * | 33 | * |
34 | * Andreas Steinmetz <ast@domdv.de>: | ||
35 | * Added encrypted suspend option | ||
36 | * | ||
34 | * More state savers are welcome. Especially for the scsi layer... | 37 | * More state savers are welcome. Especially for the scsi layer... |
35 | * | 38 | * |
36 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt | 39 | * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt |
@@ -71,8 +74,16 @@ | |||
71 | #include <asm/tlbflush.h> | 74 | #include <asm/tlbflush.h> |
72 | #include <asm/io.h> | 75 | #include <asm/io.h> |
73 | 76 | ||
77 | #include <linux/random.h> | ||
78 | #include <linux/crypto.h> | ||
79 | #include <asm/scatterlist.h> | ||
80 | |||
74 | #include "power.h" | 81 | #include "power.h" |
75 | 82 | ||
83 | #define CIPHER "aes" | ||
84 | #define MAXKEY 32 | ||
85 | #define MAXIV 32 | ||
86 | |||
76 | /* References to section boundaries */ | 87 | /* References to section boundaries */ |
77 | extern const void __nosave_begin, __nosave_end; | 88 | extern const void __nosave_begin, __nosave_end; |
78 | 89 | ||
@@ -103,7 +114,8 @@ static suspend_pagedir_t *pagedir_save; | |||
103 | #define SWSUSP_SIG "S1SUSPEND" | 114 | #define SWSUSP_SIG "S1SUSPEND" |
104 | 115 | ||
105 | static struct swsusp_header { | 116 | static struct swsusp_header { |
106 | char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; | 117 | char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; |
118 | u8 key_iv[MAXKEY+MAXIV]; | ||
107 | swp_entry_t swsusp_info; | 119 | swp_entry_t swsusp_info; |
108 | char orig_sig[10]; | 120 | char orig_sig[10]; |
109 | char sig[10]; | 121 | char sig[10]; |
@@ -129,6 +141,131 @@ static struct swsusp_info swsusp_info; | |||
129 | static unsigned short swapfile_used[MAX_SWAPFILES]; | 141 | static unsigned short swapfile_used[MAX_SWAPFILES]; |
130 | static unsigned short root_swap; | 142 | static unsigned short root_swap; |
131 | 143 | ||
144 | static int write_page(unsigned long addr, swp_entry_t * loc); | ||
145 | static int bio_read_page(pgoff_t page_off, void * page); | ||
146 | |||
147 | static u8 key_iv[MAXKEY+MAXIV]; | ||
148 | |||
149 | #ifdef CONFIG_SWSUSP_ENCRYPT | ||
150 | |||
151 | static int crypto_init(int mode, void **mem) | ||
152 | { | ||
153 | int error = 0; | ||
154 | int len; | ||
155 | char *modemsg; | ||
156 | struct crypto_tfm *tfm; | ||
157 | |||
158 | modemsg = mode ? "suspend not possible" : "resume not possible"; | ||
159 | |||
160 | tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); | ||
161 | if(!tfm) { | ||
162 | printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); | ||
163 | error = -EINVAL; | ||
164 | goto out; | ||
165 | } | ||
166 | |||
167 | if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { | ||
168 | printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); | ||
169 | error = -ENOKEY; | ||
170 | goto fail; | ||
171 | } | ||
172 | |||
173 | if (mode) | ||
174 | get_random_bytes(key_iv, MAXKEY+MAXIV); | ||
175 | |||
176 | len = crypto_tfm_alg_max_keysize(tfm); | ||
177 | if (len > MAXKEY) | ||
178 | len = MAXKEY; | ||
179 | |||
180 | if (crypto_cipher_setkey(tfm, key_iv, len)) { | ||
181 | printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); | ||
182 | error = -EKEYREJECTED; | ||
183 | goto fail; | ||
184 | } | ||
185 | |||
186 | len = crypto_tfm_alg_ivsize(tfm); | ||
187 | |||
188 | if (MAXIV < len) { | ||
189 | printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); | ||
190 | error = -EOVERFLOW; | ||
191 | goto fail; | ||
192 | } | ||
193 | |||
194 | crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); | ||
195 | |||
196 | *mem=(void *)tfm; | ||
197 | |||
198 | goto out; | ||
199 | |||
200 | fail: crypto_free_tfm(tfm); | ||
201 | out: return error; | ||
202 | } | ||
203 | |||
204 | static __inline__ void crypto_exit(void *mem) | ||
205 | { | ||
206 | crypto_free_tfm((struct crypto_tfm *)mem); | ||
207 | } | ||
208 | |||
209 | static __inline__ int crypto_write(struct pbe *p, void *mem) | ||
210 | { | ||
211 | int error = 0; | ||
212 | struct scatterlist src, dst; | ||
213 | |||
214 | src.page = virt_to_page(p->address); | ||
215 | src.offset = 0; | ||
216 | src.length = PAGE_SIZE; | ||
217 | dst.page = virt_to_page((void *)&swsusp_header); | ||
218 | dst.offset = 0; | ||
219 | dst.length = PAGE_SIZE; | ||
220 | |||
221 | error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, | ||
222 | PAGE_SIZE); | ||
223 | |||
224 | if (!error) | ||
225 | error = write_page((unsigned long)&swsusp_header, | ||
226 | &(p->swap_address)); | ||
227 | return error; | ||
228 | } | ||
229 | |||
230 | static __inline__ int crypto_read(struct pbe *p, void *mem) | ||
231 | { | ||
232 | int error = 0; | ||
233 | struct scatterlist src, dst; | ||
234 | |||
235 | error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); | ||
236 | if (!error) { | ||
237 | src.offset = 0; | ||
238 | src.length = PAGE_SIZE; | ||
239 | dst.offset = 0; | ||
240 | dst.length = PAGE_SIZE; | ||
241 | src.page = dst.page = virt_to_page((void *)p->address); | ||
242 | |||
243 | error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, | ||
244 | &src, PAGE_SIZE); | ||
245 | } | ||
246 | return error; | ||
247 | } | ||
248 | #else | ||
249 | static __inline__ int crypto_init(int mode, void *mem) | ||
250 | { | ||
251 | return 0; | ||
252 | } | ||
253 | |||
254 | static __inline__ void crypto_exit(void *mem) | ||
255 | { | ||
256 | } | ||
257 | |||
258 | static __inline__ int crypto_write(struct pbe *p, void *mem) | ||
259 | { | ||
260 | return write_page(p->address, &(p->swap_address)); | ||
261 | } | ||
262 | |||
263 | static __inline__ int crypto_read(struct pbe *p, void *mem) | ||
264 | { | ||
265 | return bio_read_page(swp_offset(p->swap_address), (void *)p->address); | ||
266 | } | ||
267 | #endif | ||
268 | |||
132 | static int mark_swapfiles(swp_entry_t prev) | 269 | static int mark_swapfiles(swp_entry_t prev) |
133 | { | 270 | { |
134 | int error; | 271 | int error; |
@@ -140,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev) | |||
140 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { | 277 | !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { |
141 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); | 278 | memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); |
142 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); | 279 | memcpy(swsusp_header.sig,SWSUSP_SIG, 10); |
280 | memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); | ||
143 | swsusp_header.swsusp_info = prev; | 281 | swsusp_header.swsusp_info = prev; |
144 | error = rw_swap_page_sync(WRITE, | 282 | error = rw_swap_page_sync(WRITE, |
145 | swp_entry(root_swap, 0), | 283 | swp_entry(root_swap, 0), |
@@ -179,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */ | |||
179 | len=strlen(resume_file); | 317 | len=strlen(resume_file); |
180 | root_swap = 0xFFFF; | 318 | root_swap = 0xFFFF; |
181 | 319 | ||
182 | swap_list_lock(); | 320 | spin_lock(&swap_lock); |
183 | for (i=0; i<MAX_SWAPFILES; i++) { | 321 | for (i=0; i<MAX_SWAPFILES; i++) { |
184 | if (swap_info[i].flags == 0) { | 322 | if (!(swap_info[i].flags & SWP_WRITEOK)) { |
185 | swapfile_used[i]=SWAPFILE_UNUSED; | 323 | swapfile_used[i]=SWAPFILE_UNUSED; |
186 | } else { | 324 | } else { |
187 | if (!len) { | 325 | if (!len) { |
@@ -202,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ | |||
202 | } | 340 | } |
203 | } | 341 | } |
204 | } | 342 | } |
205 | swap_list_unlock(); | 343 | spin_unlock(&swap_lock); |
206 | return (root_swap != 0xffff) ? 0 : -ENODEV; | 344 | return (root_swap != 0xffff) ? 0 : -ENODEV; |
207 | } | 345 | } |
208 | 346 | ||
@@ -216,12 +354,12 @@ static void lock_swapdevices(void) | |||
216 | { | 354 | { |
217 | int i; | 355 | int i; |
218 | 356 | ||
219 | swap_list_lock(); | 357 | spin_lock(&swap_lock); |
220 | for (i = 0; i< MAX_SWAPFILES; i++) | 358 | for (i = 0; i< MAX_SWAPFILES; i++) |
221 | if (swapfile_used[i] == SWAPFILE_IGNORED) { | 359 | if (swapfile_used[i] == SWAPFILE_IGNORED) { |
222 | swap_info[i].flags ^= 0xFF; | 360 | swap_info[i].flags ^= SWP_WRITEOK; |
223 | } | 361 | } |
224 | swap_list_unlock(); | 362 | spin_unlock(&swap_lock); |
225 | } | 363 | } |
226 | 364 | ||
227 | /** | 365 | /** |
@@ -286,6 +424,10 @@ static int data_write(void) | |||
286 | int error = 0, i = 0; | 424 | int error = 0, i = 0; |
287 | unsigned int mod = nr_copy_pages / 100; | 425 | unsigned int mod = nr_copy_pages / 100; |
288 | struct pbe *p; | 426 | struct pbe *p; |
427 | void *tfm; | ||
428 | |||
429 | if ((error = crypto_init(1, &tfm))) | ||
430 | return error; | ||
289 | 431 | ||
290 | if (!mod) | 432 | if (!mod) |
291 | mod = 1; | 433 | mod = 1; |
@@ -294,11 +436,14 @@ static int data_write(void) | |||
294 | for_each_pbe (p, pagedir_nosave) { | 436 | for_each_pbe (p, pagedir_nosave) { |
295 | if (!(i%mod)) | 437 | if (!(i%mod)) |
296 | printk( "\b\b\b\b%3d%%", i / mod ); | 438 | printk( "\b\b\b\b%3d%%", i / mod ); |
297 | if ((error = write_page(p->address, &(p->swap_address)))) | 439 | if ((error = crypto_write(p, tfm))) { |
440 | crypto_exit(tfm); | ||
298 | return error; | 441 | return error; |
442 | } | ||
299 | i++; | 443 | i++; |
300 | } | 444 | } |
301 | printk("\b\b\b\bdone\n"); | 445 | printk("\b\b\b\bdone\n"); |
446 | crypto_exit(tfm); | ||
302 | return error; | 447 | return error; |
303 | } | 448 | } |
304 | 449 | ||
@@ -385,7 +530,6 @@ static int write_pagedir(void) | |||
385 | * write_suspend_image - Write entire image and metadata. | 530 | * write_suspend_image - Write entire image and metadata. |
386 | * | 531 | * |
387 | */ | 532 | */ |
388 | |||
389 | static int write_suspend_image(void) | 533 | static int write_suspend_image(void) |
390 | { | 534 | { |
391 | int error; | 535 | int error; |
@@ -400,6 +544,7 @@ static int write_suspend_image(void) | |||
400 | if ((error = close_swap())) | 544 | if ((error = close_swap())) |
401 | goto FreePagedir; | 545 | goto FreePagedir; |
402 | Done: | 546 | Done: |
547 | memset(key_iv, 0, MAXKEY+MAXIV); | ||
403 | return error; | 548 | return error; |
404 | FreePagedir: | 549 | FreePagedir: |
405 | free_pagedir_entries(); | 550 | free_pagedir_entries(); |
@@ -591,18 +736,7 @@ static void copy_data_pages(void) | |||
591 | 736 | ||
592 | static int calc_nr(int nr_copy) | 737 | static int calc_nr(int nr_copy) |
593 | { | 738 | { |
594 | int extra = 0; | 739 | return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); |
595 | int mod = !!(nr_copy % PBES_PER_PAGE); | ||
596 | int diff = (nr_copy / PBES_PER_PAGE) + mod; | ||
597 | |||
598 | do { | ||
599 | extra += diff; | ||
600 | nr_copy += diff; | ||
601 | mod = !!(nr_copy % PBES_PER_PAGE); | ||
602 | diff = (nr_copy / PBES_PER_PAGE) + mod - extra; | ||
603 | } while (diff > 0); | ||
604 | |||
605 | return nr_copy; | ||
606 | } | 740 | } |
607 | 741 | ||
608 | /** | 742 | /** |
@@ -886,20 +1020,21 @@ int swsusp_suspend(void) | |||
886 | * at resume time, and evil weirdness ensues. | 1020 | * at resume time, and evil weirdness ensues. |
887 | */ | 1021 | */ |
888 | if ((error = device_power_down(PMSG_FREEZE))) { | 1022 | if ((error = device_power_down(PMSG_FREEZE))) { |
1023 | printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); | ||
889 | local_irq_enable(); | 1024 | local_irq_enable(); |
890 | return error; | 1025 | return error; |
891 | } | 1026 | } |
892 | 1027 | ||
893 | if ((error = swsusp_swap_check())) { | 1028 | if ((error = swsusp_swap_check())) { |
894 | printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " | 1029 | printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); |
895 | "swapon -a!\n"); | 1030 | device_power_up(); |
896 | local_irq_enable(); | 1031 | local_irq_enable(); |
897 | return error; | 1032 | return error; |
898 | } | 1033 | } |
899 | 1034 | ||
900 | save_processor_state(); | 1035 | save_processor_state(); |
901 | if ((error = swsusp_arch_suspend())) | 1036 | if ((error = swsusp_arch_suspend())) |
902 | printk("Error %d suspending\n", error); | 1037 | printk(KERN_ERR "Error %d suspending\n", error); |
903 | /* Restore control flow magically appears here */ | 1038 | /* Restore control flow magically appears here */ |
904 | restore_processor_state(); | 1039 | restore_processor_state(); |
905 | BUG_ON (nr_copy_pages_check != nr_copy_pages); | 1040 | BUG_ON (nr_copy_pages_check != nr_copy_pages); |
@@ -924,6 +1059,7 @@ int swsusp_resume(void) | |||
924 | BUG_ON(!error); | 1059 | BUG_ON(!error); |
925 | restore_processor_state(); | 1060 | restore_processor_state(); |
926 | restore_highmem(); | 1061 | restore_highmem(); |
1062 | touch_softlockup_watchdog(); | ||
927 | device_power_up(); | 1063 | device_power_up(); |
928 | local_irq_enable(); | 1064 | local_irq_enable(); |
929 | return error; | 1065 | return error; |
@@ -1179,7 +1315,8 @@ static const char * sanity_check(void) | |||
1179 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) | 1315 | if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) |
1180 | return "machine"; | 1316 | return "machine"; |
1181 | #if 0 | 1317 | #if 0 |
1182 | if(swsusp_info.cpus != num_online_cpus()) | 1318 | /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ |
1319 | if (swsusp_info.cpus != num_possible_cpus()) | ||
1183 | return "number of cpus"; | 1320 | return "number of cpus"; |
1184 | #endif | 1321 | #endif |
1185 | return NULL; | 1322 | return NULL; |
@@ -1212,13 +1349,14 @@ static int check_sig(void) | |||
1212 | return error; | 1349 | return error; |
1213 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { | 1350 | if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { |
1214 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); | 1351 | memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); |
1352 | memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); | ||
1353 | memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); | ||
1215 | 1354 | ||
1216 | /* | 1355 | /* |
1217 | * Reset swap signature now. | 1356 | * Reset swap signature now. |
1218 | */ | 1357 | */ |
1219 | error = bio_write_page(0, &swsusp_header); | 1358 | error = bio_write_page(0, &swsusp_header); |
1220 | } else { | 1359 | } else { |
1221 | printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); | ||
1222 | return -EINVAL; | 1360 | return -EINVAL; |
1223 | } | 1361 | } |
1224 | if (!error) | 1362 | if (!error) |
@@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist) | |||
1239 | int error = 0; | 1377 | int error = 0; |
1240 | int i = 0; | 1378 | int i = 0; |
1241 | int mod = swsusp_info.image_pages / 100; | 1379 | int mod = swsusp_info.image_pages / 100; |
1380 | void *tfm; | ||
1381 | |||
1382 | if ((error = crypto_init(0, &tfm))) | ||
1383 | return error; | ||
1242 | 1384 | ||
1243 | if (!mod) | 1385 | if (!mod) |
1244 | mod = 1; | 1386 | mod = 1; |
@@ -1250,14 +1392,15 @@ static int data_read(struct pbe *pblist) | |||
1250 | if (!(i % mod)) | 1392 | if (!(i % mod)) |
1251 | printk("\b\b\b\b%3d%%", i / mod); | 1393 | printk("\b\b\b\b%3d%%", i / mod); |
1252 | 1394 | ||
1253 | error = bio_read_page(swp_offset(p->swap_address), | 1395 | if ((error = crypto_read(p, tfm))) { |
1254 | (void *)p->address); | 1396 | crypto_exit(tfm); |
1255 | if (error) | ||
1256 | return error; | 1397 | return error; |
1398 | } | ||
1257 | 1399 | ||
1258 | i++; | 1400 | i++; |
1259 | } | 1401 | } |
1260 | printk("\b\b\b\bdone\n"); | 1402 | printk("\b\b\b\bdone\n"); |
1403 | crypto_exit(tfm); | ||
1261 | return error; | 1404 | return error; |
1262 | } | 1405 | } |
1263 | 1406 | ||
@@ -1385,6 +1528,7 @@ int swsusp_read(void) | |||
1385 | 1528 | ||
1386 | error = read_suspend_image(); | 1529 | error = read_suspend_image(); |
1387 | blkdev_put(resume_bdev); | 1530 | blkdev_put(resume_bdev); |
1531 | memset(key_iv, 0, MAXKEY+MAXIV); | ||
1388 | 1532 | ||
1389 | if (!error) | 1533 | if (!error) |
1390 | pr_debug("swsusp: Reading resume file was successful\n"); | 1534 | pr_debug("swsusp: Reading resume file was successful\n"); |
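The calc_nr() rewrite above replaces the iterative fixed-point search with a closed form. With P page-backup entries per pagedir page, the loop was looking for the smallest number of extra metadata pages e such that e >= ceil((n + e) / P), which works out to e = ceil(n / (P - 1)), and (n + P - 2) / (P - 1) computes exactly that ceiling in integer arithmetic. A quick userspace check that the two computations agree; PBES_PER_PAGE is an illustrative value here, not the kernel's.

#include <assert.h>
#include <stdio.h>

#define PBES_PER_PAGE 127       /* illustrative value */

/* the old iterative computation, copied shape-for-shape */
static int calc_nr_old(int nr_copy)
{
        int extra = 0;
        int mod = !!(nr_copy % PBES_PER_PAGE);
        int diff = (nr_copy / PBES_PER_PAGE) + mod;

        do {
                extra += diff;
                nr_copy += diff;
                mod = !!(nr_copy % PBES_PER_PAGE);
                diff = (nr_copy / PBES_PER_PAGE) + mod - extra;
        } while (diff > 0);

        return nr_copy;
}

/* the new closed form */
static int calc_nr_new(int nr_copy)
{
        return nr_copy + (nr_copy + PBES_PER_PAGE - 2) / (PBES_PER_PAGE - 1);
}

int main(void)
{
        for (int n = 0; n < 100000; n++)
                assert(calc_nr_old(n) == calc_nr_new(n));
        printf("closed form matches the loop for all tested sizes\n");
        return 0;
}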
diff --git a/kernel/printk.c b/kernel/printk.c index 5092397fac29..a967605bc2e3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...) | |||
514 | return r; | 514 | return r; |
515 | } | 515 | } |
516 | 516 | ||
517 | /* cpu currently holding logbuf_lock */ | ||
518 | static volatile unsigned int printk_cpu = UINT_MAX; | ||
519 | |||
517 | asmlinkage int vprintk(const char *fmt, va_list args) | 520 | asmlinkage int vprintk(const char *fmt, va_list args) |
518 | { | 521 | { |
519 | unsigned long flags; | 522 | unsigned long flags; |
@@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
522 | static char printk_buf[1024]; | 525 | static char printk_buf[1024]; |
523 | static int log_level_unknown = 1; | 526 | static int log_level_unknown = 1; |
524 | 527 | ||
525 | if (unlikely(oops_in_progress)) | 528 | preempt_disable(); |
529 | if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) | ||
530 | /* If a crash is occurring during printk() on this CPU, | ||
531 | * make sure we can't deadlock */ | ||
526 | zap_locks(); | 532 | zap_locks(); |
527 | 533 | ||
528 | /* This stops the holder of console_sem just where we want him */ | 534 | /* This stops the holder of console_sem just where we want him */ |
529 | spin_lock_irqsave(&logbuf_lock, flags); | 535 | spin_lock_irqsave(&logbuf_lock, flags); |
536 | printk_cpu = smp_processor_id(); | ||
530 | 537 | ||
531 | /* Emit the output into the temporary buffer */ | 538 | /* Emit the output into the temporary buffer */ |
532 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | 539 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); |
@@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
595 | * CPU until it is officially up. We shouldn't be calling into | 602 | * CPU until it is officially up. We shouldn't be calling into |
596 | * random console drivers on a CPU which doesn't exist yet.. | 603 | * random console drivers on a CPU which doesn't exist yet.. |
597 | */ | 604 | */ |
605 | printk_cpu = UINT_MAX; | ||
598 | spin_unlock_irqrestore(&logbuf_lock, flags); | 606 | spin_unlock_irqrestore(&logbuf_lock, flags); |
599 | goto out; | 607 | goto out; |
600 | } | 608 | } |
@@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
604 | * We own the drivers. We can drop the spinlock and let | 612 | * We own the drivers. We can drop the spinlock and let |
605 | * release_console_sem() print the text | 613 | * release_console_sem() print the text |
606 | */ | 614 | */ |
615 | printk_cpu = UINT_MAX; | ||
607 | spin_unlock_irqrestore(&logbuf_lock, flags); | 616 | spin_unlock_irqrestore(&logbuf_lock, flags); |
608 | console_may_schedule = 0; | 617 | console_may_schedule = 0; |
609 | release_console_sem(); | 618 | release_console_sem(); |
@@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
613 | * allows the semaphore holder to proceed and to call the | 622 | * allows the semaphore holder to proceed and to call the |
614 | * console drivers with the output which we just produced. | 623 | * console drivers with the output which we just produced. |
615 | */ | 624 | */ |
625 | printk_cpu = UINT_MAX; | ||
616 | spin_unlock_irqrestore(&logbuf_lock, flags); | 626 | spin_unlock_irqrestore(&logbuf_lock, flags); |
617 | } | 627 | } |
618 | out: | 628 | out: |
629 | preempt_enable(); | ||
619 | return printed_len; | 630 | return printed_len; |
620 | } | 631 | } |
621 | EXPORT_SYMBOL(printk); | 632 | EXPORT_SYMBOL(printk); |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8dcb8f6288bc..019e04ec065a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
118 | return ret; | 118 | return ret; |
119 | } | 119 | } |
120 | 120 | ||
121 | static int may_attach(struct task_struct *task) | ||
122 | { | ||
123 | if (!task->mm) | ||
124 | return -EPERM; | ||
125 | if (((current->uid != task->euid) || | ||
126 | (current->uid != task->suid) || | ||
127 | (current->uid != task->uid) || | ||
128 | (current->gid != task->egid) || | ||
129 | (current->gid != task->sgid) || | ||
130 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | ||
131 | return -EPERM; | ||
132 | smp_rmb(); | ||
133 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | ||
134 | return -EPERM; | ||
135 | |||
136 | return security_ptrace(current, task); | ||
137 | } | ||
138 | |||
139 | int ptrace_may_attach(struct task_struct *task) | ||
140 | { | ||
141 | int err; | ||
142 | task_lock(task); | ||
143 | err = may_attach(task); | ||
144 | task_unlock(task); | ||
145 | return !err; | ||
146 | } | ||
147 | |||
121 | int ptrace_attach(struct task_struct *task) | 148 | int ptrace_attach(struct task_struct *task) |
122 | { | 149 | { |
123 | int retval; | 150 | int retval; |
@@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task) | |||
127 | goto bad; | 154 | goto bad; |
128 | if (task == current) | 155 | if (task == current) |
129 | goto bad; | 156 | goto bad; |
130 | if (!task->mm) | ||
131 | goto bad; | ||
132 | if(((current->uid != task->euid) || | ||
133 | (current->uid != task->suid) || | ||
134 | (current->uid != task->uid) || | ||
135 | (current->gid != task->egid) || | ||
136 | (current->gid != task->sgid) || | ||
137 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | ||
138 | goto bad; | ||
139 | smp_rmb(); | ||
140 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | ||
141 | goto bad; | ||
142 | /* the same process cannot be attached many times */ | 157 | /* the same process cannot be attached many times */ |
143 | if (task->ptrace & PT_PTRACED) | 158 | if (task->ptrace & PT_PTRACED) |
144 | goto bad; | 159 | goto bad; |
145 | retval = security_ptrace(current, task); | 160 | retval = may_attach(task); |
146 | if (retval) | 161 | if (retval) |
147 | goto bad; | 162 | goto bad; |
148 | 163 | ||
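The new may_attach() helper gathers the credential checks that ptrace_attach() used to open-code, and ptrace_may_attach() runs them under task_lock() so other callers can reuse the same test. A userspace sketch of the uid/gid portion of that check only; the structs are local stand-ins, and the dumpable and security_ptrace() checks are not modeled:

#include <stdio.h>

struct cred { unsigned int uid, euid, suid, gid, egid, sgid; };

static int may_attach(const struct cred *tracer, const struct cred *target,
                      int cap_sys_ptrace)
{
        /* every id of the tracer must match every id of the target,
         * unless the tracer holds CAP_SYS_PTRACE */
        if ((tracer->uid != target->euid ||
             tracer->uid != target->suid ||
             tracer->uid != target->uid  ||
             tracer->gid != target->egid ||
             tracer->gid != target->sgid ||
             tracer->gid != target->gid) && !cap_sys_ptrace)
                return -1;      /* -EPERM in the kernel */
        return 0;
}

int main(void)
{
        struct cred me            = { 1000, 1000, 1000, 1000, 1000, 1000 };
        struct cred plain_target  = { 1000, 1000, 1000, 1000, 1000, 1000 };
        struct cred setuid_target = { 1000, 0, 0, 1000, 1000, 1000 };

        printf("%d\n", may_attach(&me, &plain_target, 0));     /*  0: allowed */
        printf("%d\n", may_attach(&me, &setuid_target, 0));    /* -1: denied  */
        printf("%d\n", may_attach(&me, &setuid_target, 1));    /*  0: CAP_SYS_PTRACE */
        return 0;
}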
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f436993bd590..bef3b6901b76 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/percpu.h> | 45 | #include <linux/percpu.h> |
46 | #include <linux/notifier.h> | 46 | #include <linux/notifier.h> |
47 | #include <linux/rcupdate.h> | 47 | #include <linux/rcupdate.h> |
48 | #include <linux/rcuref.h> | ||
48 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
49 | 50 | ||
50 | /* Definition for rcupdate control block. */ | 51 | /* Definition for rcupdate control block. */ |
@@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
72 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
73 | static int maxbatch = 10; | 74 | static int maxbatch = 10; |
74 | 75 | ||
76 | #ifndef __HAVE_ARCH_CMPXCHG | ||
77 | /* | ||
78 | * We use an array of spinlocks for the rcurefs -- similar to ones in sparc | ||
79 | * 32 bit atomic_t implementations, and a hash function similar to that | ||
80 | * for our refcounting needs. | ||
81 | * Can't help multiprocessors which do not have cmpxchg :( | ||
82 | */ | ||
83 | |||
84 | spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { | ||
85 | [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED | ||
86 | }; | ||
87 | #endif | ||
88 | |||
75 | /** | 89 | /** |
76 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 90 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
77 | * @head: structure to be used for queueing the RCU updates. | 91 | * @head: structure to be used for queueing the RCU updates. |
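On architectures without cmpxchg, the rcuref code above falls back to a small hash-indexed array of spinlocks: the lock protecting a given refcount is chosen from the object's address, so unrelated objects rarely contend while the array stays tiny. A userspace sketch of that idea with pthread mutexes; the array size and the pointer hash are illustrative choices standing in for RCUREF_HASH_SIZE and hash_ptr():

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define RCUREF_HASH_SIZE 4      /* illustrative size */

static pthread_mutex_t rcuref_hash[RCUREF_HASH_SIZE] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* crude pointer hash standing in for the kernel's hash_ptr() */
static pthread_mutex_t *rcuref_lock(const void *obj)
{
        return &rcuref_hash[((uintptr_t)obj >> 4) % RCUREF_HASH_SIZE];
}

struct counted { int refcount; };

static void rcuref_inc(struct counted *c)
{
        pthread_mutex_t *lock = rcuref_lock(c);

        pthread_mutex_lock(lock);
        c->refcount++;          /* "atomic" only because the lock serializes it */
        pthread_mutex_unlock(lock);
}

int main(void)
{
        struct counted c = { 1 };

        rcuref_inc(&c);
        printf("refcount = %d\n", c.refcount);
        return 0;
}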
diff --git a/kernel/resource.c b/kernel/resource.c index 26967e042201..92285d822de6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource); | |||
430 | */ | 430 | */ |
431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) | 431 | struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) |
432 | { | 432 | { |
433 | struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); | 433 | struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); |
434 | 434 | ||
435 | if (res) { | 435 | if (res) { |
436 | memset(res, 0, sizeof(*res)); | ||
437 | res->name = name; | 436 | res->name = name; |
438 | res->start = start; | 437 | res->start = start; |
439 | res->end = start + n - 1; | 438 | res->end = start + n - 1; |
diff --git a/kernel/sched.c b/kernel/sched.c index 5f889d0cbfcc..81b3a96ed2d0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -875,7 +875,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) | |||
875 | * smp_call_function() if an IPI is sent by the same process we are | 875 | * smp_call_function() if an IPI is sent by the same process we are |
876 | * waiting to become inactive. | 876 | * waiting to become inactive. |
877 | */ | 877 | */ |
878 | void wait_task_inactive(task_t * p) | 878 | void wait_task_inactive(task_t *p) |
879 | { | 879 | { |
880 | unsigned long flags; | 880 | unsigned long flags; |
881 | runqueue_t *rq; | 881 | runqueue_t *rq; |
@@ -966,8 +966,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
966 | int local_group; | 966 | int local_group; |
967 | int i; | 967 | int i; |
968 | 968 | ||
969 | /* Skip over this group if it has no CPUs allowed */ | ||
970 | if (!cpus_intersects(group->cpumask, p->cpus_allowed)) | ||
971 | goto nextgroup; | ||
972 | |||
969 | local_group = cpu_isset(this_cpu, group->cpumask); | 973 | local_group = cpu_isset(this_cpu, group->cpumask); |
970 | /* XXX: put a cpus allowed check */ | ||
971 | 974 | ||
972 | /* Tally up the load of all CPUs in the group */ | 975 | /* Tally up the load of all CPUs in the group */ |
973 | avg_load = 0; | 976 | avg_load = 0; |
@@ -992,6 +995,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
992 | min_load = avg_load; | 995 | min_load = avg_load; |
993 | idlest = group; | 996 | idlest = group; |
994 | } | 997 | } |
998 | nextgroup: | ||
995 | group = group->next; | 999 | group = group->next; |
996 | } while (group != sd->groups); | 1000 | } while (group != sd->groups); |
997 | 1001 | ||
@@ -1003,13 +1007,18 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | |||
1003 | /* | 1007 | /* |
1004 | * find_idlest_queue - find the idlest runqueue among the cpus in group. | 1008 | * find_idlest_queue - find the idlest runqueue among the cpus in group. |
1005 | */ | 1009 | */ |
1006 | static int find_idlest_cpu(struct sched_group *group, int this_cpu) | 1010 | static int |
1011 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
1007 | { | 1012 | { |
1013 | cpumask_t tmp; | ||
1008 | unsigned long load, min_load = ULONG_MAX; | 1014 | unsigned long load, min_load = ULONG_MAX; |
1009 | int idlest = -1; | 1015 | int idlest = -1; |
1010 | int i; | 1016 | int i; |
1011 | 1017 | ||
1012 | for_each_cpu_mask(i, group->cpumask) { | 1018 | /* Traverse only the allowed CPUs */ |
1019 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | ||
1020 | |||
1021 | for_each_cpu_mask(i, tmp) { | ||
1013 | load = source_load(i, 0); | 1022 | load = source_load(i, 0); |
1014 | 1023 | ||
1015 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1024 | if (load < min_load || (load == min_load && i == this_cpu)) { |
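find_idlest_cpu() now takes the task so the scan can be limited to CPUs in p->cpus_allowed, mirroring the new group-level check in find_idlest_group(). The same pattern as a runnable sketch, with plain 64-bit bitmasks standing in for cpumask_t and invented names throughout:

#include <stdint.h>
#include <stdio.h>

/* Intersect the group's CPU mask with the task's affinity first, then
 * scan only the surviving CPUs for the least-loaded one. */
static int pick_idlest_cpu(uint64_t group_mask, uint64_t cpus_allowed,
                           const unsigned long *load, int this_cpu)
{
        uint64_t candidates = group_mask & cpus_allowed;
        unsigned long min_load = (unsigned long)-1;
        int idlest = -1;

        for (int cpu = 0; cpu < 64; cpu++) {
                if (!(candidates & (1ULL << cpu)))
                        continue;
                if (load[cpu] < min_load ||
                    (load[cpu] == min_load && cpu == this_cpu)) {
                        min_load = load[cpu];
                        idlest = cpu;
                }
        }
        return idlest;
}

int main(void)
{
        unsigned long load[64] = { [0] = 10, [1] = 2, [2] = 1, [3] = 7 };

        /* CPU 2 is the idlest in the group, but the task may not run there,
         * so the intersection steers the choice to CPU 1. */
        printf("%d\n", pick_idlest_cpu(0xf, 0xb, load, 0));
        return 0;
}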
@@ -1052,7 +1061,7 @@ static int sched_balance_self(int cpu, int flag) | |||
1052 | if (!group) | 1061 | if (!group) |
1053 | goto nextlevel; | 1062 | goto nextlevel; |
1054 | 1063 | ||
1055 | new_cpu = find_idlest_cpu(group, cpu); | 1064 | new_cpu = find_idlest_cpu(group, t, cpu); |
1056 | if (new_cpu == -1 || new_cpu == cpu) | 1065 | if (new_cpu == -1 || new_cpu == cpu) |
1057 | goto nextlevel; | 1066 | goto nextlevel; |
1058 | 1067 | ||
@@ -1127,7 +1136,7 @@ static inline int wake_idle(int cpu, task_t *p) | |||
1127 | * | 1136 | * |
1128 | * returns failure only if the task is already active. | 1137 | * returns failure only if the task is already active. |
1129 | */ | 1138 | */ |
1130 | static int try_to_wake_up(task_t * p, unsigned int state, int sync) | 1139 | static int try_to_wake_up(task_t *p, unsigned int state, int sync) |
1131 | { | 1140 | { |
1132 | int cpu, this_cpu, success = 0; | 1141 | int cpu, this_cpu, success = 0; |
1133 | unsigned long flags; | 1142 | unsigned long flags; |
@@ -1252,6 +1261,16 @@ out_activate: | |||
1252 | } | 1261 | } |
1253 | 1262 | ||
1254 | /* | 1263 | /* |
1264 | * Tasks that have marked their sleep as noninteractive get | ||
1265 | * woken up without updating their sleep average. (i.e. their | ||
1266 | * sleep is handled in a priority-neutral manner, no priority | ||
1267 | * boost and no penalty.) | ||
1268 | */ | ||
1269 | if (old_state & TASK_NONINTERACTIVE) | ||
1270 | __activate_task(p, rq); | ||
1271 | else | ||
1272 | activate_task(p, rq, cpu == this_cpu); | ||
1273 | /* | ||
1255 | * Sync wakeups (i.e. those types of wakeups where the waker | 1274 | * Sync wakeups (i.e. those types of wakeups where the waker |
1256 | * has indicated that it will leave the CPU in short order) | 1275 | * has indicated that it will leave the CPU in short order) |
1257 | * don't trigger a preemption, if the woken up task will run on | 1276 | * don't trigger a preemption, if the woken up task will run on |
@@ -1259,7 +1278,6 @@ out_activate: | |||
1259 | * the waker guarantees that the freshly woken up task is going | 1278 | * the waker guarantees that the freshly woken up task is going |
1260 | * to be considered on this CPU.) | 1279 | * to be considered on this CPU.) |
1261 | */ | 1280 | */ |
1262 | activate_task(p, rq, cpu == this_cpu); | ||
1263 | if (!sync || cpu != this_cpu) { | 1281 | if (!sync || cpu != this_cpu) { |
1264 | if (TASK_PREEMPTS_CURR(p, rq)) | 1282 | if (TASK_PREEMPTS_CURR(p, rq)) |
1265 | resched_task(rq->curr); | 1283 | resched_task(rq->curr); |
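A sleep marked TASK_NONINTERACTIVE is now requeued through __activate_task(), skipping the sleep-average credit, so the sleeper gets neither an interactivity boost nor a penalty. A toy model of that branch; the struct, flag value and helper below are invented for illustration:

#include <stdio.h>

#define TOY_NONINTERACTIVE 0x40         /* invented flag value */

struct toy_task {
        unsigned int state;
        unsigned int sleep_avg;         /* feeds the interactivity bonus */
};

static void toy_wake(struct toy_task *p, unsigned int slept)
{
        if (!(p->state & TOY_NONINTERACTIVE))
                p->sleep_avg += slept;  /* normal wakeup: credit the sleep */
        /* either way the task goes back on the runqueue at this point */
        p->state = 0;
}

int main(void)
{
        struct toy_task a = { 0, 0 }, b = { TOY_NONINTERACTIVE, 0 };

        toy_wake(&a, 50);
        toy_wake(&b, 50);
        printf("interactive sleeper: %u, noninteractive sleeper: %u\n",
               a.sleep_avg, b.sleep_avg);       /* 50 vs 0 */
        return 0;
}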
@@ -1274,7 +1292,7 @@ out: | |||
1274 | return success; | 1292 | return success; |
1275 | } | 1293 | } |
1276 | 1294 | ||
1277 | int fastcall wake_up_process(task_t * p) | 1295 | int fastcall wake_up_process(task_t *p) |
1278 | { | 1296 | { |
1279 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1297 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | |
1280 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | 1298 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); |
@@ -1353,7 +1371,7 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1353 | * that must be done for every newly created context, then puts the task | 1371 | * that must be done for every newly created context, then puts the task |
1354 | * on the runqueue and wakes it. | 1372 | * on the runqueue and wakes it. |
1355 | */ | 1373 | */ |
1356 | void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | 1374 | void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) |
1357 | { | 1375 | { |
1358 | unsigned long flags; | 1376 | unsigned long flags; |
1359 | int this_cpu, cpu; | 1377 | int this_cpu, cpu; |
@@ -1436,7 +1454,7 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) | |||
1436 | * artificially, because any timeslice recovered here | 1454 | * artificially, because any timeslice recovered here |
1437 | * was given away by the parent in the first place.) | 1455 | * was given away by the parent in the first place.) |
1438 | */ | 1456 | */ |
1439 | void fastcall sched_exit(task_t * p) | 1457 | void fastcall sched_exit(task_t *p) |
1440 | { | 1458 | { |
1441 | unsigned long flags; | 1459 | unsigned long flags; |
1442 | runqueue_t *rq; | 1460 | runqueue_t *rq; |
@@ -1478,6 +1496,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) | |||
1478 | 1496 | ||
1479 | /** | 1497 | /** |
1480 | * finish_task_switch - clean up after a task-switch | 1498 | * finish_task_switch - clean up after a task-switch |
1499 | * @rq: runqueue associated with task-switch | ||
1481 | * @prev: the thread we just switched away from. | 1500 | * @prev: the thread we just switched away from. |
1482 | * | 1501 | * |
1483 | * finish_task_switch must be called after the context switch, paired | 1502 | * finish_task_switch must be called after the context switch, paired |
@@ -1510,6 +1529,10 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev) | |||
1510 | * Manfred Spraul <manfred@colorfullife.com> | 1529 | * Manfred Spraul <manfred@colorfullife.com> |
1511 | */ | 1530 | */ |
1512 | prev_task_flags = prev->flags; | 1531 | prev_task_flags = prev->flags; |
1532 | #ifdef CONFIG_DEBUG_SPINLOCK | ||
1533 | /* this is a valid case when another task releases the spinlock */ | ||
1534 | rq->lock.owner = current; | ||
1535 | #endif | ||
1513 | finish_arch_switch(prev); | 1536 | finish_arch_switch(prev); |
1514 | finish_lock_switch(rq, prev); | 1537 | finish_lock_switch(rq, prev); |
1515 | if (mm) | 1538 | if (mm) |
@@ -1752,7 +1775,8 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1752 | */ | 1775 | */ |
1753 | static inline | 1776 | static inline |
1754 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | 1777 | int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, |
1755 | struct sched_domain *sd, enum idle_type idle, int *all_pinned) | 1778 | struct sched_domain *sd, enum idle_type idle, |
1779 | int *all_pinned) | ||
1756 | { | 1780 | { |
1757 | /* | 1781 | /* |
1758 | * We do not migrate tasks that are: | 1782 | * We do not migrate tasks that are: |
@@ -1882,10 +1906,11 @@ out: | |||
1882 | */ | 1906 | */ |
1883 | static struct sched_group * | 1907 | static struct sched_group * |
1884 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 1908 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
1885 | unsigned long *imbalance, enum idle_type idle) | 1909 | unsigned long *imbalance, enum idle_type idle, int *sd_idle) |
1886 | { | 1910 | { |
1887 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 1911 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1888 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 1912 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1913 | unsigned long max_pull; | ||
1889 | int load_idx; | 1914 | int load_idx; |
1890 | 1915 | ||
1891 | max_load = this_load = total_load = total_pwr = 0; | 1916 | max_load = this_load = total_load = total_pwr = 0; |
@@ -1907,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1907 | avg_load = 0; | 1932 | avg_load = 0; |
1908 | 1933 | ||
1909 | for_each_cpu_mask(i, group->cpumask) { | 1934 | for_each_cpu_mask(i, group->cpumask) { |
1935 | if (*sd_idle && !idle_cpu(i)) | ||
1936 | *sd_idle = 0; | ||
1937 | |||
1910 | /* Bias balancing toward cpus of our domain */ | 1938 | /* Bias balancing toward cpus of our domain */ |
1911 | if (local_group) | 1939 | if (local_group) |
1912 | load = target_load(i, load_idx); | 1940 | load = target_load(i, load_idx); |
@@ -1932,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1932 | group = group->next; | 1960 | group = group->next; |
1933 | } while (group != sd->groups); | 1961 | } while (group != sd->groups); |
1934 | 1962 | ||
1935 | if (!busiest || this_load >= max_load) | 1963 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) |
1936 | goto out_balanced; | 1964 | goto out_balanced; |
1937 | 1965 | ||
1938 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 1966 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
@@ -1952,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1952 | * by pulling tasks to us. Be careful of negative numbers as they'll | 1980 | * by pulling tasks to us. Be careful of negative numbers as they'll |
1953 | * appear as very large values with unsigned longs. | 1981 | * appear as very large values with unsigned longs. |
1954 | */ | 1982 | */ |
1983 | |||
1984 | /* Don't want to pull so many tasks that a group would go idle */ | ||
1985 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | ||
1986 | |||
1955 | /* How much load to actually move to equalise the imbalance */ | 1987 | /* How much load to actually move to equalise the imbalance */ |
1956 | *imbalance = min((max_load - avg_load) * busiest->cpu_power, | 1988 | *imbalance = min(max_pull * busiest->cpu_power, |
1957 | (avg_load - this_load) * this->cpu_power) | 1989 | (avg_load - this_load) * this->cpu_power) |
1958 | / SCHED_LOAD_SCALE; | 1990 | / SCHED_LOAD_SCALE; |
1959 | 1991 | ||
@@ -2050,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2050 | unsigned long imbalance; | 2082 | unsigned long imbalance; |
2051 | int nr_moved, all_pinned = 0; | 2083 | int nr_moved, all_pinned = 0; |
2052 | int active_balance = 0; | 2084 | int active_balance = 0; |
2085 | int sd_idle = 0; | ||
2086 | |||
2087 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | ||
2088 | sd_idle = 1; | ||
2053 | 2089 | ||
2054 | spin_lock(&this_rq->lock); | ||
2055 | schedstat_inc(sd, lb_cnt[idle]); | 2090 | schedstat_inc(sd, lb_cnt[idle]); |
2056 | 2091 | ||
2057 | group = find_busiest_group(sd, this_cpu, &imbalance, idle); | 2092 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); |
2058 | if (!group) { | 2093 | if (!group) { |
2059 | schedstat_inc(sd, lb_nobusyg[idle]); | 2094 | schedstat_inc(sd, lb_nobusyg[idle]); |
2060 | goto out_balanced; | 2095 | goto out_balanced; |
@@ -2078,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2078 | * still unbalanced. nr_moved simply stays zero, so it is | 2113 | * still unbalanced. nr_moved simply stays zero, so it is |
2079 | * correctly treated as an imbalance. | 2114 | * correctly treated as an imbalance. |
2080 | */ | 2115 | */ |
2081 | double_lock_balance(this_rq, busiest); | 2116 | double_rq_lock(this_rq, busiest); |
2082 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2117 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2083 | imbalance, sd, idle, | 2118 | imbalance, sd, idle, &all_pinned); |
2084 | &all_pinned); | 2119 | double_rq_unlock(this_rq, busiest); |
2085 | spin_unlock(&busiest->lock); | ||
2086 | 2120 | ||
2087 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2121 | /* All tasks on this runqueue were pinned by CPU affinity */ |
2088 | if (unlikely(all_pinned)) | 2122 | if (unlikely(all_pinned)) |
2089 | goto out_balanced; | 2123 | goto out_balanced; |
2090 | } | 2124 | } |
2091 | 2125 | ||
2092 | spin_unlock(&this_rq->lock); | ||
2093 | |||
2094 | if (!nr_moved) { | 2126 | if (!nr_moved) { |
2095 | schedstat_inc(sd, lb_failed[idle]); | 2127 | schedstat_inc(sd, lb_failed[idle]); |
2096 | sd->nr_balance_failed++; | 2128 | sd->nr_balance_failed++; |
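Switching to double_rq_lock()/double_rq_unlock() keeps the two runqueue locks acquired in one fixed order, which is what prevents two balancers working on the same pair from deadlocking. The ordering idea in plain pthread terms, not the kernel's implementation:

#include <pthread.h>

/* Always take the two locks in address order so two threads locking the
 * same pair can never deadlock on each other. */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
        } else if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        static pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t y = PTHREAD_MUTEX_INITIALIZER;

        double_lock(&x, &y);            /* same order however it is called */
        double_unlock(&x, &y);
        return 0;
}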
@@ -2098,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2098 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2130 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
2099 | 2131 | ||
2100 | spin_lock(&busiest->lock); | 2132 | spin_lock(&busiest->lock); |
2133 | |||
2134 | /* don't kick the migration_thread if the curr | ||
2135 | * task on the busiest cpu can't be moved to this_cpu | ||
2136 | */ | ||
2137 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | ||
2138 | spin_unlock(&busiest->lock); | ||
2139 | all_pinned = 1; | ||
2140 | goto out_one_pinned; | ||
2141 | } | ||
2142 | |||
2101 | if (!busiest->active_balance) { | 2143 | if (!busiest->active_balance) { |
2102 | busiest->active_balance = 1; | 2144 | busiest->active_balance = 1; |
2103 | busiest->push_cpu = this_cpu; | 2145 | busiest->push_cpu = this_cpu; |
@@ -2130,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2130 | sd->balance_interval *= 2; | 2172 | sd->balance_interval *= 2; |
2131 | } | 2173 | } |
2132 | 2174 | ||
2175 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | ||
2176 | return -1; | ||
2133 | return nr_moved; | 2177 | return nr_moved; |
2134 | 2178 | ||
2135 | out_balanced: | 2179 | out_balanced: |
2136 | spin_unlock(&this_rq->lock); | ||
2137 | |||
2138 | schedstat_inc(sd, lb_balanced[idle]); | 2180 | schedstat_inc(sd, lb_balanced[idle]); |
2139 | 2181 | ||
2140 | sd->nr_balance_failed = 0; | 2182 | sd->nr_balance_failed = 0; |
2183 | |||
2184 | out_one_pinned: | ||
2141 | /* tune up the balancing interval */ | 2185 | /* tune up the balancing interval */ |
2142 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || | 2186 | if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || |
2143 | (sd->balance_interval < sd->max_interval)) | 2187 | (sd->balance_interval < sd->max_interval)) |
2144 | sd->balance_interval *= 2; | 2188 | sd->balance_interval *= 2; |
2145 | 2189 | ||
2190 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | ||
2191 | return -1; | ||
2146 | return 0; | 2192 | return 0; |
2147 | } | 2193 | } |
2148 | 2194 | ||
@@ -2160,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2160 | runqueue_t *busiest = NULL; | 2206 | runqueue_t *busiest = NULL; |
2161 | unsigned long imbalance; | 2207 | unsigned long imbalance; |
2162 | int nr_moved = 0; | 2208 | int nr_moved = 0; |
2209 | int sd_idle = 0; | ||
2210 | |||
2211 | if (sd->flags & SD_SHARE_CPUPOWER) | ||
2212 | sd_idle = 1; | ||
2163 | 2213 | ||
2164 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2214 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
2165 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); | 2215 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); |
2166 | if (!group) { | 2216 | if (!group) { |
2167 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2217 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
2168 | goto out_balanced; | 2218 | goto out_balanced; |
@@ -2176,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2176 | 2226 | ||
2177 | BUG_ON(busiest == this_rq); | 2227 | BUG_ON(busiest == this_rq); |
2178 | 2228 | ||
2179 | /* Attempt to move tasks */ | ||
2180 | double_lock_balance(this_rq, busiest); | ||
2181 | |||
2182 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); | 2229 | schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); |
2183 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2230 | |
2231 | nr_moved = 0; | ||
2232 | if (busiest->nr_running > 1) { | ||
2233 | /* Attempt to move tasks */ | ||
2234 | double_lock_balance(this_rq, busiest); | ||
2235 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | ||
2184 | imbalance, sd, NEWLY_IDLE, NULL); | 2236 | imbalance, sd, NEWLY_IDLE, NULL); |
2185 | if (!nr_moved) | 2237 | spin_unlock(&busiest->lock); |
2238 | } | ||
2239 | |||
2240 | if (!nr_moved) { | ||
2186 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); | 2241 | schedstat_inc(sd, lb_failed[NEWLY_IDLE]); |
2187 | else | 2242 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) |
2243 | return -1; | ||
2244 | } else | ||
2188 | sd->nr_balance_failed = 0; | 2245 | sd->nr_balance_failed = 0; |
2189 | 2246 | ||
2190 | spin_unlock(&busiest->lock); | ||
2191 | return nr_moved; | 2247 | return nr_moved; |
2192 | 2248 | ||
2193 | out_balanced: | 2249 | out_balanced: |
2194 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2250 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2251 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | ||
2252 | return -1; | ||
2195 | sd->nr_balance_failed = 0; | 2253 | sd->nr_balance_failed = 0; |
2196 | return 0; | 2254 | return 0; |
2197 | } | 2255 | } |
@@ -2316,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2316 | 2374 | ||
2317 | if (j - sd->last_balance >= interval) { | 2375 | if (j - sd->last_balance >= interval) { |
2318 | if (load_balance(this_cpu, this_rq, sd, idle)) { | 2376 | if (load_balance(this_cpu, this_rq, sd, idle)) { |
2319 | /* We've pulled tasks over so no longer idle */ | 2377 | /* |
2378 | * We've pulled tasks over so either we're no | ||
2379 | * longer idle, or one of our SMT siblings is | ||
2380 | * not idle. | ||
2381 | */ | ||
2320 | idle = NOT_IDLE; | 2382 | idle = NOT_IDLE; |
2321 | } | 2383 | } |
2322 | sd->last_balance += interval; | 2384 | sd->last_balance += interval; |
@@ -2575,6 +2637,13 @@ out: | |||
2575 | } | 2637 | } |
2576 | 2638 | ||
2577 | #ifdef CONFIG_SCHED_SMT | 2639 | #ifdef CONFIG_SCHED_SMT |
2640 | static inline void wakeup_busy_runqueue(runqueue_t *rq) | ||
2641 | { | ||
2642 | /* If an SMT runqueue is sleeping due to priority reasons wake it up */ | ||
2643 | if (rq->curr == rq->idle && rq->nr_running) | ||
2644 | resched_task(rq->idle); | ||
2645 | } | ||
2646 | |||
2578 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 2647 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) |
2579 | { | 2648 | { |
2580 | struct sched_domain *tmp, *sd = NULL; | 2649 | struct sched_domain *tmp, *sd = NULL; |
@@ -2608,12 +2677,7 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
2608 | for_each_cpu_mask(i, sibling_map) { | 2677 | for_each_cpu_mask(i, sibling_map) { |
2609 | runqueue_t *smt_rq = cpu_rq(i); | 2678 | runqueue_t *smt_rq = cpu_rq(i); |
2610 | 2679 | ||
2611 | /* | 2680 | wakeup_busy_runqueue(smt_rq); |
2612 | * If an SMT sibling task is sleeping due to priority | ||
2613 | * reasons wake it up now. | ||
2614 | */ | ||
2615 | if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) | ||
2616 | resched_task(smt_rq->idle); | ||
2617 | } | 2681 | } |
2618 | 2682 | ||
2619 | for_each_cpu_mask(i, sibling_map) | 2683 | for_each_cpu_mask(i, sibling_map) |
@@ -2624,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | |||
2624 | */ | 2688 | */ |
2625 | } | 2689 | } |
2626 | 2690 | ||
2691 | /* | ||
2692 | * number of 'lost' timeslices this task won't be able to fully | ||
2693 | * utilize, if another task runs on a sibling. This models the | ||
2694 | * slowdown effect of other tasks running on siblings: | ||
2695 | */ | ||
2696 | static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | ||
2697 | { | ||
2698 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | ||
2699 | } | ||
2700 | |||
2627 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 2701 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) |
2628 | { | 2702 | { |
2629 | struct sched_domain *tmp, *sd = NULL; | 2703 | struct sched_domain *tmp, *sd = NULL; |
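smt_slice() estimates how much of a task's timeslice survives when an SMT sibling is also busy, by scaling it down by (100 - per_cpu_gain)%. The same arithmetic as a quick standalone check; a per_cpu_gain of 25 is only an example value:

#include <stdio.h>

/* Scale a timeslice by the share of the physical CPU this thread keeps
 * when its SMT sibling is busy. */
static unsigned long smt_slice(unsigned long time_slice, int per_cpu_gain)
{
        return time_slice * (100 - per_cpu_gain) / 100;
}

int main(void)
{
        /* A 100-tick slice with per_cpu_gain = 25 -> 75 ticks of effective time. */
        printf("%lu\n", smt_slice(100, 25));
        return 0;
}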
@@ -2667,6 +2741,10 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
2667 | runqueue_t *smt_rq = cpu_rq(i); | 2741 | runqueue_t *smt_rq = cpu_rq(i); |
2668 | task_t *smt_curr = smt_rq->curr; | 2742 | task_t *smt_curr = smt_rq->curr; |
2669 | 2743 | ||
2744 | /* Kernel threads do not participate in dependent sleeping */ | ||
2745 | if (!p->mm || !smt_curr->mm || rt_task(p)) | ||
2746 | goto check_smt_task; | ||
2747 | |||
2670 | /* | 2748 | /* |
2671 | * If a user task with lower static priority than the | 2749 | * If a user task with lower static priority than the |
2672 | * running task on the SMT sibling is trying to schedule, | 2750 | * running task on the SMT sibling is trying to schedule, |
@@ -2675,21 +2753,45 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
2675 | * task from using an unfair proportion of the | 2753 | * task from using an unfair proportion of the |
2676 | * physical cpu's resources. -ck | 2754 | * physical cpu's resources. -ck |
2677 | */ | 2755 | */ |
2678 | if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > | 2756 | if (rt_task(smt_curr)) { |
2679 | task_timeslice(p) || rt_task(smt_curr)) && | 2757 | /* |
2680 | p->mm && smt_curr->mm && !rt_task(p)) | 2758 | * With real time tasks we run non-rt tasks only |
2681 | ret = 1; | 2759 | * per_cpu_gain% of the time. |
2760 | */ | ||
2761 | if ((jiffies % DEF_TIMESLICE) > | ||
2762 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
2763 | ret = 1; | ||
2764 | } else | ||
2765 | if (smt_curr->static_prio < p->static_prio && | ||
2766 | !TASK_PREEMPTS_CURR(p, smt_rq) && | ||
2767 | smt_slice(smt_curr, sd) > task_timeslice(p)) | ||
2768 | ret = 1; | ||
2769 | |||
2770 | check_smt_task: | ||
2771 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
2772 | rt_task(smt_curr)) | ||
2773 | continue; | ||
2774 | if (!p->mm) { | ||
2775 | wakeup_busy_runqueue(smt_rq); | ||
2776 | continue; | ||
2777 | } | ||
2682 | 2778 | ||
2683 | /* | 2779 | /* |
2684 | * Reschedule a lower priority task on the SMT sibling, | 2780 | * Reschedule a lower priority task on the SMT sibling for |
2685 | * or wake it up if it has been put to sleep for priority | 2781 | * it to be put to sleep, or wake it up if it has been put to |
2686 | * reasons. | 2782 | * sleep for priority reasons to see if it should run now. |
2687 | */ | 2783 | */ |
2688 | if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > | 2784 | if (rt_task(p)) { |
2689 | task_timeslice(smt_curr) || rt_task(p)) && | 2785 | if ((jiffies % DEF_TIMESLICE) > |
2690 | smt_curr->mm && p->mm && !rt_task(smt_curr)) || | 2786 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2691 | (smt_curr == smt_rq->idle && smt_rq->nr_running)) | 2787 | resched_task(smt_curr); |
2692 | resched_task(smt_curr); | 2788 | } else { |
2789 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
2790 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
2791 | resched_task(smt_curr); | ||
2792 | else | ||
2793 | wakeup_busy_runqueue(smt_rq); | ||
2794 | } | ||
2693 | } | 2795 | } |
2694 | out_unlock: | 2796 | out_unlock: |
2695 | for_each_cpu_mask(i, sibling_map) | 2797 | for_each_cpu_mask(i, sibling_map) |
@@ -2887,6 +2989,7 @@ switch_tasks: | |||
2887 | if (next == rq->idle) | 2989 | if (next == rq->idle) |
2888 | schedstat_inc(rq, sched_goidle); | 2990 | schedstat_inc(rq, sched_goidle); |
2889 | prefetch(next); | 2991 | prefetch(next); |
2992 | prefetch_stack(next); | ||
2890 | clear_tsk_need_resched(prev); | 2993 | clear_tsk_need_resched(prev); |
2891 | rcu_qsctr_inc(task_cpu(prev)); | 2994 | rcu_qsctr_inc(task_cpu(prev)); |
2892 | 2995 | ||
@@ -3014,7 +3117,8 @@ need_resched: | |||
3014 | 3117 | ||
3015 | #endif /* CONFIG_PREEMPT */ | 3118 | #endif /* CONFIG_PREEMPT */ |
3016 | 3119 | ||
3017 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) | 3120 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, |
3121 | void *key) | ||
3018 | { | 3122 | { |
3019 | task_t *p = curr->private; | 3123 | task_t *p = curr->private; |
3020 | return try_to_wake_up(p, mode, sync); | 3124 | return try_to_wake_up(p, mode, sync); |
@@ -3056,7 +3160,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | |||
3056 | * @key: is directly passed to the wakeup function | 3160 | * @key: is directly passed to the wakeup function |
3057 | */ | 3161 | */ |
3058 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, | 3162 | void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, |
3059 | int nr_exclusive, void *key) | 3163 | int nr_exclusive, void *key) |
3060 | { | 3164 | { |
3061 | unsigned long flags; | 3165 | unsigned long flags; |
3062 | 3166 | ||
@@ -3088,7 +3192,8 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) | |||
3088 | * | 3192 | * |
3089 | * On UP it can prevent extra preemption. | 3193 | * On UP it can prevent extra preemption. |
3090 | */ | 3194 | */ |
3091 | void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | 3195 | void fastcall |
3196 | __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) | ||
3092 | { | 3197 | { |
3093 | unsigned long flags; | 3198 | unsigned long flags; |
3094 | int sync = 1; | 3199 | int sync = 1; |
@@ -3279,7 +3384,8 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) | |||
3279 | 3384 | ||
3280 | EXPORT_SYMBOL(interruptible_sleep_on); | 3385 | EXPORT_SYMBOL(interruptible_sleep_on); |
3281 | 3386 | ||
3282 | long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | 3387 | long fastcall __sched |
3388 | interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | ||
3283 | { | 3389 | { |
3284 | SLEEP_ON_VAR | 3390 | SLEEP_ON_VAR |
3285 | 3391 | ||
@@ -3498,7 +3604,8 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3498 | * @policy: new policy. | 3604 | * @policy: new policy. |
3499 | * @param: structure containing the new RT priority. | 3605 | * @param: structure containing the new RT priority. |
3500 | */ | 3606 | */ |
3501 | int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) | 3607 | int sched_setscheduler(struct task_struct *p, int policy, |
3608 | struct sched_param *param) | ||
3502 | { | 3609 | { |
3503 | int retval; | 3610 | int retval; |
3504 | int oldprio, oldpolicy = -1; | 3611 | int oldprio, oldpolicy = -1; |
@@ -3518,7 +3625,7 @@ recheck: | |||
3518 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. | 3625 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. |
3519 | */ | 3626 | */ |
3520 | if (param->sched_priority < 0 || | 3627 | if (param->sched_priority < 0 || |
3521 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || | 3628 | (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || |
3522 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) | 3629 | (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) |
3523 | return -EINVAL; | 3630 | return -EINVAL; |
3524 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) | 3631 | if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) |
@@ -3581,7 +3688,8 @@ recheck: | |||
3581 | } | 3688 | } |
3582 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 3689 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
3583 | 3690 | ||
3584 | static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | 3691 | static int |
3692 | do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | ||
3585 | { | 3693 | { |
3586 | int retval; | 3694 | int retval; |
3587 | struct sched_param lparam; | 3695 | struct sched_param lparam; |
@@ -3848,7 +3956,7 @@ asmlinkage long sys_sched_yield(void) | |||
3848 | if (rt_task(current)) | 3956 | if (rt_task(current)) |
3849 | target = rq->active; | 3957 | target = rq->active; |
3850 | 3958 | ||
3851 | if (current->array->nr_active == 1) { | 3959 | if (array->nr_active == 1) { |
3852 | schedstat_inc(rq, yld_act_empty); | 3960 | schedstat_inc(rq, yld_act_empty); |
3853 | if (!rq->expired->nr_active) | 3961 | if (!rq->expired->nr_active) |
3854 | schedstat_inc(rq, yld_both_empty); | 3962 | schedstat_inc(rq, yld_both_empty); |
@@ -3912,7 +4020,7 @@ EXPORT_SYMBOL(cond_resched); | |||
3912 | * operations here to prevent schedule() from being called twice (once via | 4020 | * operations here to prevent schedule() from being called twice (once via |
3913 | * spin_unlock(), once by hand). | 4021 | * spin_unlock(), once by hand). |
3914 | */ | 4022 | */ |
3915 | int cond_resched_lock(spinlock_t * lock) | 4023 | int cond_resched_lock(spinlock_t *lock) |
3916 | { | 4024 | { |
3917 | int ret = 0; | 4025 | int ret = 0; |
3918 | 4026 | ||
@@ -4095,7 +4203,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p) | |||
4095 | return list_entry(p->sibling.next,struct task_struct,sibling); | 4203 | return list_entry(p->sibling.next,struct task_struct,sibling); |
4096 | } | 4204 | } |
4097 | 4205 | ||
4098 | static void show_task(task_t * p) | 4206 | static void show_task(task_t *p) |
4099 | { | 4207 | { |
4100 | task_t *relative; | 4208 | task_t *relative; |
4101 | unsigned state; | 4209 | unsigned state; |
@@ -4121,7 +4229,7 @@ static void show_task(task_t * p) | |||
4121 | #endif | 4229 | #endif |
4122 | #ifdef CONFIG_DEBUG_STACK_USAGE | 4230 | #ifdef CONFIG_DEBUG_STACK_USAGE |
4123 | { | 4231 | { |
4124 | unsigned long * n = (unsigned long *) (p->thread_info+1); | 4232 | unsigned long *n = (unsigned long *) (p->thread_info+1); |
4125 | while (!*n) | 4233 | while (!*n) |
4126 | n++; | 4234 | n++; |
4127 | free = (unsigned long) n - (unsigned long)(p->thread_info+1); | 4235 | free = (unsigned long) n - (unsigned long)(p->thread_info+1); |
@@ -4330,7 +4438,7 @@ out: | |||
4330 | * thread migration by bumping thread off CPU then 'pushing' onto | 4438 | * thread migration by bumping thread off CPU then 'pushing' onto |
4331 | * another runqueue. | 4439 | * another runqueue. |
4332 | */ | 4440 | */ |
4333 | static int migration_thread(void * data) | 4441 | static int migration_thread(void *data) |
4334 | { | 4442 | { |
4335 | runqueue_t *rq; | 4443 | runqueue_t *rq; |
4336 | int cpu = (long)data; | 4444 | int cpu = (long)data; |
@@ -4779,7 +4887,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, | |||
4779 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 4887 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
4780 | * hold the hotplug lock. | 4888 | * hold the hotplug lock. |
4781 | */ | 4889 | */ |
4782 | void cpu_attach_domain(struct sched_domain *sd, int cpu) | 4890 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) |
4783 | { | 4891 | { |
4784 | runqueue_t *rq = cpu_rq(cpu); | 4892 | runqueue_t *rq = cpu_rq(cpu); |
4785 | struct sched_domain *tmp; | 4893 | struct sched_domain *tmp; |
@@ -4802,7 +4910,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
4802 | } | 4910 | } |
4803 | 4911 | ||
4804 | /* cpus with isolated domains */ | 4912 | /* cpus with isolated domains */ |
4805 | cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; | 4913 | static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; |
4806 | 4914 | ||
4807 | /* Setup the mask of cpus configured for isolated domains */ | 4915 | /* Setup the mask of cpus configured for isolated domains */ |
4808 | static int __init isolated_cpu_setup(char *str) | 4916 | static int __init isolated_cpu_setup(char *str) |
@@ -4830,8 +4938,8 @@ __setup ("isolcpus=", isolated_cpu_setup); | |||
4830 | * covered by the given span, and will set each group's ->cpumask correctly, | 4938 | * covered by the given span, and will set each group's ->cpumask correctly, |
4831 | * and ->cpu_power to 0. | 4939 | * and ->cpu_power to 0. |
4832 | */ | 4940 | */ |
4833 | void init_sched_build_groups(struct sched_group groups[], | 4941 | static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, |
4834 | cpumask_t span, int (*group_fn)(int cpu)) | 4942 | int (*group_fn)(int cpu)) |
4835 | { | 4943 | { |
4836 | struct sched_group *first = NULL, *last = NULL; | 4944 | struct sched_group *first = NULL, *last = NULL; |
4837 | cpumask_t covered = CPU_MASK_NONE; | 4945 | cpumask_t covered = CPU_MASK_NONE; |
@@ -4864,12 +4972,85 @@ void init_sched_build_groups(struct sched_group groups[], | |||
4864 | last->next = first; | 4972 | last->next = first; |
4865 | } | 4973 | } |
4866 | 4974 | ||
4975 | #define SD_NODES_PER_DOMAIN 16 | ||
4867 | 4976 | ||
4868 | #ifdef ARCH_HAS_SCHED_DOMAIN | 4977 | #ifdef CONFIG_NUMA |
4869 | extern void build_sched_domains(const cpumask_t *cpu_map); | 4978 | /** |
4870 | extern void arch_init_sched_domains(const cpumask_t *cpu_map); | 4979 | * find_next_best_node - find the next node to include in a sched_domain |
4871 | extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); | 4980 | * @node: node whose sched_domain we're building |
4872 | #else | 4981 | * @used_nodes: nodes already in the sched_domain |
4982 | * | ||
4983 | * Find the next node to include in a given scheduling domain. Simply | ||
4984 | * finds the closest node not already in the @used_nodes map. | ||
4985 | * | ||
4986 | * Should use nodemask_t. | ||
4987 | */ | ||
4988 | static int find_next_best_node(int node, unsigned long *used_nodes) | ||
4989 | { | ||
4990 | int i, n, val, min_val, best_node = 0; | ||
4991 | |||
4992 | min_val = INT_MAX; | ||
4993 | |||
4994 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
4995 | /* Start at @node */ | ||
4996 | n = (node + i) % MAX_NUMNODES; | ||
4997 | |||
4998 | if (!nr_cpus_node(n)) | ||
4999 | continue; | ||
5000 | |||
5001 | /* Skip already used nodes */ | ||
5002 | if (test_bit(n, used_nodes)) | ||
5003 | continue; | ||
5004 | |||
5005 | /* Simple min distance search */ | ||
5006 | val = node_distance(node, n); | ||
5007 | |||
5008 | if (val < min_val) { | ||
5009 | min_val = val; | ||
5010 | best_node = n; | ||
5011 | } | ||
5012 | } | ||
5013 | |||
5014 | set_bit(best_node, used_nodes); | ||
5015 | return best_node; | ||
5016 | } | ||
5017 | |||
5018 | /** | ||
5019 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5020 | * @node: node whose cpumask we're constructing | ||
5021 | * @size: number of nodes to include in this span | ||
5022 | * | ||
5023 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5024 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5025 | * out optimally. | ||
5026 | */ | ||
5027 | static cpumask_t sched_domain_node_span(int node) | ||
5028 | { | ||
5029 | int i; | ||
5030 | cpumask_t span, nodemask; | ||
5031 | DECLARE_BITMAP(used_nodes, MAX_NUMNODES); | ||
5032 | |||
5033 | cpus_clear(span); | ||
5034 | bitmap_zero(used_nodes, MAX_NUMNODES); | ||
5035 | |||
5036 | nodemask = node_to_cpumask(node); | ||
5037 | cpus_or(span, span, nodemask); | ||
5038 | set_bit(node, used_nodes); | ||
5039 | |||
5040 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5041 | int next_node = find_next_best_node(node, used_nodes); | ||
5042 | nodemask = node_to_cpumask(next_node); | ||
5043 | cpus_or(span, span, nodemask); | ||
5044 | } | ||
5045 | |||
5046 | return span; | ||
5047 | } | ||
5048 | #endif | ||
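find_next_best_node() grows a node's scheduling span greedily, always adding the closest node not yet included, and sched_domain_node_span() repeats that until SD_NODES_PER_DOMAIN nodes are covered. A runnable model of the greedy step, using a made-up 4-node distance table in place of node_distance() and skipping the nr_cpus_node() check:

#include <limits.h>
#include <stdio.h>

#define NODES 4

/* Pick the closest node not yet in the span and mark it used. */
static int next_best_node(int node, const int dist[NODES][NODES],
                          unsigned int *used)
{
        int best = 0, min_val = INT_MAX;

        for (int i = 0; i < NODES; i++) {
                int n = (node + i) % NODES;

                if (*used & (1u << n))
                        continue;
                if (dist[node][n] < min_val) {
                        min_val = dist[node][n];
                        best = n;
                }
        }
        *used |= 1u << best;
        return best;
}

int main(void)
{
        const int dist[NODES][NODES] = {
                { 10, 20, 40, 30 },
                { 20, 10, 30, 40 },
                { 40, 30, 10, 20 },
                { 30, 40, 20, 10 },
        };
        unsigned int used = 1u << 0;    /* node 0 already spans itself */

        /* Grows node 0's span in distance order: prints 1, 3, 2. */
        for (int i = 0; i < 3; i++)
                printf("next: %d\n", next_best_node(0, dist, &used));
        return 0;
}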
5049 | |||
5050 | /* | ||
5051 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | ||
5052 | * can switch it on easily if needed. | ||
5053 | */ | ||
4873 | #ifdef CONFIG_SCHED_SMT | 5054 | #ifdef CONFIG_SCHED_SMT |
4874 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 5055 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
4875 | static struct sched_group sched_group_cpus[NR_CPUS]; | 5056 | static struct sched_group sched_group_cpus[NR_CPUS]; |
@@ -4891,36 +5072,20 @@ static int cpu_to_phys_group(int cpu) | |||
4891 | } | 5072 | } |
4892 | 5073 | ||
4893 | #ifdef CONFIG_NUMA | 5074 | #ifdef CONFIG_NUMA |
4894 | |||
4895 | static DEFINE_PER_CPU(struct sched_domain, node_domains); | ||
4896 | static struct sched_group sched_group_nodes[MAX_NUMNODES]; | ||
4897 | static int cpu_to_node_group(int cpu) | ||
4898 | { | ||
4899 | return cpu_to_node(cpu); | ||
4900 | } | ||
4901 | #endif | ||
4902 | |||
4903 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
4904 | /* | 5075 | /* |
4905 | * The domains setup code relies on siblings not spanning | 5076 | * The init_sched_build_groups can't handle what we want to do with node |
4906 | * multiple nodes. Make sure the architecture has a proper | 5077 | * groups, so roll our own. Now each node has its own list of groups which |
4907 | * siblings map: | 5078 | * gets dynamically allocated. |
4908 | */ | 5079 | */ |
4909 | static void check_sibling_maps(void) | 5080 | static DEFINE_PER_CPU(struct sched_domain, node_domains); |
4910 | { | 5081 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
4911 | int i, j; | ||
4912 | 5082 | ||
4913 | for_each_online_cpu(i) { | 5083 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
4914 | for_each_cpu_mask(j, cpu_sibling_map[i]) { | 5084 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; |
4915 | if (cpu_to_node(i) != cpu_to_node(j)) { | 5085 | |
4916 | printk(KERN_INFO "warning: CPU %d siblings map " | 5086 | static int cpu_to_allnodes_group(int cpu) |
4917 | "to different node - isolating " | 5087 | { |
4918 | "them.\n", i); | 5088 | return cpu_to_node(cpu); |
4919 | cpu_sibling_map[i] = cpumask_of_cpu(i); | ||
4920 | break; | ||
4921 | } | ||
4922 | } | ||
4923 | } | ||
4924 | } | 5089 | } |
4925 | #endif | 5090 | #endif |
4926 | 5091 | ||
@@ -4928,9 +5093,24 @@ static void check_sibling_maps(void) | |||
4928 | * Build sched domains for a given set of cpus and attach the sched domains | 5093 | * Build sched domains for a given set of cpus and attach the sched domains |
4929 | * to the individual cpus | 5094 | * to the individual cpus |
4930 | */ | 5095 | */ |
4931 | static void build_sched_domains(const cpumask_t *cpu_map) | 5096 | void build_sched_domains(const cpumask_t *cpu_map) |
4932 | { | 5097 | { |
4933 | int i; | 5098 | int i; |
5099 | #ifdef CONFIG_NUMA | ||
5100 | struct sched_group **sched_group_nodes = NULL; | ||
5101 | struct sched_group *sched_group_allnodes = NULL; | ||
5102 | |||
5103 | /* | ||
5104 | * Allocate the per-node list of sched groups | ||
5105 | */ | ||
5106 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | ||
5107 | GFP_ATOMIC); | ||
5108 | if (!sched_group_nodes) { | ||
5109 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | ||
5110 | return; | ||
5111 | } | ||
5112 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | ||
5113 | #endif | ||
4934 | 5114 | ||
4935 | /* | 5115 | /* |
4936 | * Set up domains for cpus specified by the cpu_map. | 5116 | * Set up domains for cpus specified by the cpu_map. |
@@ -4943,11 +5123,35 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
4943 | cpus_and(nodemask, nodemask, *cpu_map); | 5123 | cpus_and(nodemask, nodemask, *cpu_map); |
4944 | 5124 | ||
4945 | #ifdef CONFIG_NUMA | 5125 | #ifdef CONFIG_NUMA |
5126 | if (cpus_weight(*cpu_map) | ||
5127 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | ||
5128 | if (!sched_group_allnodes) { | ||
5129 | sched_group_allnodes | ||
5130 | = kmalloc(sizeof(struct sched_group) | ||
5131 | * MAX_NUMNODES, | ||
5132 | GFP_KERNEL); | ||
5133 | if (!sched_group_allnodes) { | ||
5134 | printk(KERN_WARNING | ||
5135 | "Can not alloc allnodes sched group\n"); | ||
5136 | break; | ||
5137 | } | ||
5138 | sched_group_allnodes_bycpu[i] | ||
5139 | = sched_group_allnodes; | ||
5140 | } | ||
5141 | sd = &per_cpu(allnodes_domains, i); | ||
5142 | *sd = SD_ALLNODES_INIT; | ||
5143 | sd->span = *cpu_map; | ||
5144 | group = cpu_to_allnodes_group(i); | ||
5145 | sd->groups = &sched_group_allnodes[group]; | ||
5146 | p = sd; | ||
5147 | } else | ||
5148 | p = NULL; | ||
5149 | |||
4946 | sd = &per_cpu(node_domains, i); | 5150 | sd = &per_cpu(node_domains, i); |
4947 | group = cpu_to_node_group(i); | ||
4948 | *sd = SD_NODE_INIT; | 5151 | *sd = SD_NODE_INIT; |
4949 | sd->span = *cpu_map; | 5152 | sd->span = sched_domain_node_span(cpu_to_node(i)); |
4950 | sd->groups = &sched_group_nodes[group]; | 5153 | sd->parent = p; |
5154 | cpus_and(sd->span, sd->span, *cpu_map); | ||
4951 | #endif | 5155 | #endif |
4952 | 5156 | ||
4953 | p = sd; | 5157 | p = sd; |
@@ -4972,7 +5176,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
4972 | 5176 | ||
4973 | #ifdef CONFIG_SCHED_SMT | 5177 | #ifdef CONFIG_SCHED_SMT |
4974 | /* Set up CPU (sibling) groups */ | 5178 | /* Set up CPU (sibling) groups */ |
4975 | for_each_online_cpu(i) { | 5179 | for_each_cpu_mask(i, *cpu_map) { |
4976 | cpumask_t this_sibling_map = cpu_sibling_map[i]; | 5180 | cpumask_t this_sibling_map = cpu_sibling_map[i]; |
4977 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); | 5181 | cpus_and(this_sibling_map, this_sibling_map, *cpu_map); |
4978 | if (i != first_cpu(this_sibling_map)) | 5182 | if (i != first_cpu(this_sibling_map)) |
@@ -4997,8 +5201,77 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
4997 | 5201 | ||
4998 | #ifdef CONFIG_NUMA | 5202 | #ifdef CONFIG_NUMA |
4999 | /* Set up node groups */ | 5203 | /* Set up node groups */ |
5000 | init_sched_build_groups(sched_group_nodes, *cpu_map, | 5204 | if (sched_group_allnodes) |
5001 | &cpu_to_node_group); | 5205 | init_sched_build_groups(sched_group_allnodes, *cpu_map, |
5206 | &cpu_to_allnodes_group); | ||
5207 | |||
5208 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5209 | /* Set up node groups */ | ||
5210 | struct sched_group *sg, *prev; | ||
5211 | cpumask_t nodemask = node_to_cpumask(i); | ||
5212 | cpumask_t domainspan; | ||
5213 | cpumask_t covered = CPU_MASK_NONE; | ||
5214 | int j; | ||
5215 | |||
5216 | cpus_and(nodemask, nodemask, *cpu_map); | ||
5217 | if (cpus_empty(nodemask)) { | ||
5218 | sched_group_nodes[i] = NULL; | ||
5219 | continue; | ||
5220 | } | ||
5221 | |||
5222 | domainspan = sched_domain_node_span(i); | ||
5223 | cpus_and(domainspan, domainspan, *cpu_map); | ||
5224 | |||
5225 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
5226 | sched_group_nodes[i] = sg; | ||
5227 | for_each_cpu_mask(j, nodemask) { | ||
5228 | struct sched_domain *sd; | ||
5229 | sd = &per_cpu(node_domains, j); | ||
5230 | sd->groups = sg; | ||
5231 | if (sd->groups == NULL) { | ||
5232 | /* Turn off balancing if we have no groups */ | ||
5233 | sd->flags = 0; | ||
5234 | } | ||
5235 | } | ||
5236 | if (!sg) { | ||
5237 | printk(KERN_WARNING | ||
5238 | "Can not alloc domain group for node %d\n", i); | ||
5239 | continue; | ||
5240 | } | ||
5241 | sg->cpu_power = 0; | ||
5242 | sg->cpumask = nodemask; | ||
5243 | cpus_or(covered, covered, nodemask); | ||
5244 | prev = sg; | ||
5245 | |||
5246 | for (j = 0; j < MAX_NUMNODES; j++) { | ||
5247 | cpumask_t tmp, notcovered; | ||
5248 | int n = (i + j) % MAX_NUMNODES; | ||
5249 | |||
5250 | cpus_complement(notcovered, covered); | ||
5251 | cpus_and(tmp, notcovered, *cpu_map); | ||
5252 | cpus_and(tmp, tmp, domainspan); | ||
5253 | if (cpus_empty(tmp)) | ||
5254 | break; | ||
5255 | |||
5256 | nodemask = node_to_cpumask(n); | ||
5257 | cpus_and(tmp, tmp, nodemask); | ||
5258 | if (cpus_empty(tmp)) | ||
5259 | continue; | ||
5260 | |||
5261 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | ||
5262 | if (!sg) { | ||
5263 | printk(KERN_WARNING | ||
5264 | "Can not alloc domain group for node %d\n", j); | ||
5265 | break; | ||
5266 | } | ||
5267 | sg->cpu_power = 0; | ||
5268 | sg->cpumask = tmp; | ||
5269 | cpus_or(covered, covered, tmp); | ||
5270 | prev->next = sg; | ||
5271 | prev = sg; | ||
5272 | } | ||
5273 | prev->next = sched_group_nodes[i]; | ||
5274 | } | ||
5002 | #endif | 5275 | #endif |
5003 | 5276 | ||
5004 | /* Calculate CPU power for physical packages and nodes */ | 5277 | /* Calculate CPU power for physical packages and nodes */ |
@@ -5017,14 +5290,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
5017 | sd->groups->cpu_power = power; | 5290 | sd->groups->cpu_power = power; |
5018 | 5291 | ||
5019 | #ifdef CONFIG_NUMA | 5292 | #ifdef CONFIG_NUMA |
5020 | if (i == first_cpu(sd->groups->cpumask)) { | 5293 | sd = &per_cpu(allnodes_domains, i); |
5021 | /* Only add "power" once for each physical package. */ | 5294 | if (sd->groups) { |
5022 | sd = &per_cpu(node_domains, i); | 5295 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * |
5023 | sd->groups->cpu_power += power; | 5296 | (cpus_weight(sd->groups->cpumask)-1) / 10; |
5297 | sd->groups->cpu_power = power; | ||
5024 | } | 5298 | } |
5025 | #endif | 5299 | #endif |
5026 | } | 5300 | } |
5027 | 5301 | ||
5302 | #ifdef CONFIG_NUMA | ||
5303 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5304 | struct sched_group *sg = sched_group_nodes[i]; | ||
5305 | int j; | ||
5306 | |||
5307 | if (sg == NULL) | ||
5308 | continue; | ||
5309 | next_sg: | ||
5310 | for_each_cpu_mask(j, sg->cpumask) { | ||
5311 | struct sched_domain *sd; | ||
5312 | int power; | ||
5313 | |||
5314 | sd = &per_cpu(phys_domains, j); | ||
5315 | if (j != first_cpu(sd->groups->cpumask)) { | ||
5316 | /* | ||
5317 | * Only add "power" once for each | ||
5318 | * physical package. | ||
5319 | */ | ||
5320 | continue; | ||
5321 | } | ||
5322 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | ||
5323 | (cpus_weight(sd->groups->cpumask)-1) / 10; | ||
5324 | |||
5325 | sg->cpu_power += power; | ||
5326 | } | ||
5327 | sg = sg->next; | ||
5328 | if (sg != sched_group_nodes[i]) | ||
5329 | goto next_sg; | ||
5330 | } | ||
5331 | #endif | ||
5332 | |||
5028 | /* Attach the domains */ | 5333 | /* Attach the domains */ |
5029 | for_each_cpu_mask(i, *cpu_map) { | 5334 | for_each_cpu_mask(i, *cpu_map) { |
5030 | struct sched_domain *sd; | 5335 | struct sched_domain *sd; |
@@ -5039,13 +5344,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) | |||
5039 | /* | 5344 | /* |
5040 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 5345 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5041 | */ | 5346 | */ |
5042 | static void arch_init_sched_domains(cpumask_t *cpu_map) | 5347 | static void arch_init_sched_domains(const cpumask_t *cpu_map) |
5043 | { | 5348 | { |
5044 | cpumask_t cpu_default_map; | 5349 | cpumask_t cpu_default_map; |
5045 | 5350 | ||
5046 | #if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) | ||
5047 | check_sibling_maps(); | ||
5048 | #endif | ||
5049 | /* | 5351 | /* |
5050 | * Setup mask for cpus without special case scheduling requirements. | 5352 | * Setup mask for cpus without special case scheduling requirements. |
5051 | * For now this just excludes isolated cpus, but could be used to | 5353 | * For now this just excludes isolated cpus, but could be used to |
@@ -5058,10 +5360,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) | |||
5058 | 5360 | ||
5059 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 5361 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5060 | { | 5362 | { |
5061 | /* Do nothing: everything is statically allocated. */ | 5363 | #ifdef CONFIG_NUMA |
5062 | } | 5364 | int i; |
5365 | int cpu; | ||
5063 | 5366 | ||
5064 | #endif /* ARCH_HAS_SCHED_DOMAIN */ | 5367 | for_each_cpu_mask(cpu, *cpu_map) { |
5368 | struct sched_group *sched_group_allnodes | ||
5369 | = sched_group_allnodes_bycpu[cpu]; | ||
5370 | struct sched_group **sched_group_nodes | ||
5371 | = sched_group_nodes_bycpu[cpu]; | ||
5372 | |||
5373 | if (sched_group_allnodes) { | ||
5374 | kfree(sched_group_allnodes); | ||
5375 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
5376 | } | ||
5377 | |||
5378 | if (!sched_group_nodes) | ||
5379 | continue; | ||
5380 | |||
5381 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5382 | cpumask_t nodemask = node_to_cpumask(i); | ||
5383 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
5384 | |||
5385 | cpus_and(nodemask, nodemask, *cpu_map); | ||
5386 | if (cpus_empty(nodemask)) | ||
5387 | continue; | ||
5388 | |||
5389 | if (sg == NULL) | ||
5390 | continue; | ||
5391 | sg = sg->next; | ||
5392 | next_sg: | ||
5393 | oldsg = sg; | ||
5394 | sg = sg->next; | ||
5395 | kfree(oldsg); | ||
5396 | if (oldsg != sched_group_nodes[i]) | ||
5397 | goto next_sg; | ||
5398 | } | ||
5399 | kfree(sched_group_nodes); | ||
5400 | sched_group_nodes_bycpu[cpu] = NULL; | ||
5401 | } | ||
5402 | #endif | ||
5403 | } | ||
5065 | 5404 | ||
5066 | /* | 5405 | /* |
5067 | * Detach sched domains from a group of cpus specified in cpu_map | 5406 | * Detach sched domains from a group of cpus specified in cpu_map |
@@ -5263,3 +5602,47 @@ void normalize_rt_tasks(void) | |||
5263 | } | 5602 | } |
5264 | 5603 | ||
5265 | #endif /* CONFIG_MAGIC_SYSRQ */ | 5604 | #endif /* CONFIG_MAGIC_SYSRQ */ |
5605 | |||
5606 | #ifdef CONFIG_IA64 | ||
5607 | /* | ||
5608 | * These functions are only useful for the IA64 MCA handling. | ||
5609 | * | ||
5610 | * They can only be called when the whole system has been | ||
5611 | * stopped - every CPU needs to be quiescent, and no scheduling | ||
5612 | * activity can take place. Using them for anything else would | ||
5613 | * be a serious bug, and as a result, they aren't even visible | ||
5614 | * under any other configuration. | ||
5615 | */ | ||
5616 | |||
5617 | /** | ||
5618 | * curr_task - return the current task for a given cpu. | ||
5619 | * @cpu: the processor in question. | ||
5620 | * | ||
5621 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | ||
5622 | */ | ||
5623 | task_t *curr_task(int cpu) | ||
5624 | { | ||
5625 | return cpu_curr(cpu); | ||
5626 | } | ||
5627 | |||
5628 | /** | ||
5629 | * set_curr_task - set the current task for a given cpu. | ||
5630 | * @cpu: the processor in question. | ||
5631 | * @p: the task pointer to set. | ||
5632 | * | ||
5633 | * Description: This function must only be used when non-maskable interrupts | ||
5634 | * are serviced on a separate stack. It allows the architecture to switch the | ||
5635 | * notion of the current task on a cpu in a non-blocking manner. This function | ||
5636 | * must be called with all CPUs synchronized and interrupts disabled; the | ||
5637 | * caller must save the original value of the current task (see | ||
5638 | * curr_task() above) and restore that value before reenabling interrupts and | ||
5639 | * re-starting the system. | ||
5640 | * | ||
5641 | * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! | ||
5642 | */ | ||
5643 | void set_curr_task(int cpu, task_t *p) | ||
5644 | { | ||
5645 | cpu_curr(cpu) = p; | ||
5646 | } | ||
5647 | |||
5648 | #endif | ||
diff --git a/kernel/signal.c b/kernel/signal.c index d282fea81138..b92c3c9f8b9a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
678 | 678 | ||
679 | /* forward decl */ | 679 | /* forward decl */ |
680 | static void do_notify_parent_cldstop(struct task_struct *tsk, | 680 | static void do_notify_parent_cldstop(struct task_struct *tsk, |
681 | struct task_struct *parent, | 681 | int to_self, |
682 | int why); | 682 | int why); |
683 | 683 | ||
684 | /* | 684 | /* |
@@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
729 | p->signal->group_stop_count = 0; | 729 | p->signal->group_stop_count = 0; |
730 | p->signal->flags = SIGNAL_STOP_CONTINUED; | 730 | p->signal->flags = SIGNAL_STOP_CONTINUED; |
731 | spin_unlock(&p->sighand->siglock); | 731 | spin_unlock(&p->sighand->siglock); |
732 | if (p->ptrace & PT_PTRACED) | 732 | do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); |
733 | do_notify_parent_cldstop(p, p->parent, | ||
734 | CLD_STOPPED); | ||
735 | else | ||
736 | do_notify_parent_cldstop( | ||
737 | p->group_leader, | ||
738 | p->group_leader->real_parent, | ||
739 | CLD_STOPPED); | ||
740 | spin_lock(&p->sighand->siglock); | 733 | spin_lock(&p->sighand->siglock); |
741 | } | 734 | } |
742 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); | 735 | rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); |
@@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
777 | p->signal->flags = SIGNAL_STOP_CONTINUED; | 770 | p->signal->flags = SIGNAL_STOP_CONTINUED; |
778 | p->signal->group_exit_code = 0; | 771 | p->signal->group_exit_code = 0; |
779 | spin_unlock(&p->sighand->siglock); | 772 | spin_unlock(&p->sighand->siglock); |
780 | if (p->ptrace & PT_PTRACED) | 773 | do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); |
781 | do_notify_parent_cldstop(p, p->parent, | ||
782 | CLD_CONTINUED); | ||
783 | else | ||
784 | do_notify_parent_cldstop( | ||
785 | p->group_leader, | ||
786 | p->group_leader->real_parent, | ||
787 | CLD_CONTINUED); | ||
788 | spin_lock(&p->sighand->siglock); | 774 | spin_lock(&p->sighand->siglock); |
789 | } else { | 775 | } else { |
790 | /* | 776 | /* |
@@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1380 | unsigned long flags; | 1366 | unsigned long flags; |
1381 | int ret = 0; | 1367 | int ret = 0; |
1382 | 1368 | ||
1383 | /* | ||
1384 | * We need the tasklist lock even for the specific | ||
1385 | * thread case (when we don't need to follow the group | ||
1386 | * lists) in order to avoid races with "p->sighand" | ||
1387 | * going away or changing from under us. | ||
1388 | */ | ||
1389 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1369 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1390 | read_lock(&tasklist_lock); | 1370 | read_lock(&tasklist_lock); |
1371 | |||
1372 | if (unlikely(p->flags & PF_EXITING)) { | ||
1373 | ret = -1; | ||
1374 | goto out_err; | ||
1375 | } | ||
1376 | |||
1391 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1377 | spin_lock_irqsave(&p->sighand->siglock, flags); |
1392 | 1378 | ||
1393 | if (unlikely(!list_empty(&q->list))) { | 1379 | if (unlikely(!list_empty(&q->list))) { |
1394 | /* | 1380 | /* |
1395 | * If an SI_TIMER entry is already queued, just increment | 1381 | * If an SI_TIMER entry is already queued, just increment |
@@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1399 | BUG(); | 1385 | BUG(); |
1400 | q->info.si_overrun++; | 1386 | q->info.si_overrun++; |
1401 | goto out; | 1387 | goto out; |
1402 | } | 1388 | } |
1403 | /* Short-circuit ignored signals. */ | 1389 | /* Short-circuit ignored signals. */ |
1404 | if (sig_ignored(p, sig)) { | 1390 | if (sig_ignored(p, sig)) { |
1405 | ret = 1; | 1391 | ret = 1; |
@@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1414 | 1400 | ||
1415 | out: | 1401 | out: |
1416 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1402 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
1403 | out_err: | ||
1417 | read_unlock(&tasklist_lock); | 1404 | read_unlock(&tasklist_lock); |
1418 | return(ret); | 1405 | |
1406 | return ret; | ||
1419 | } | 1407 | } |
1420 | 1408 | ||
1421 | int | 1409 | int |
@@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig) | |||
1542 | spin_unlock_irqrestore(&psig->siglock, flags); | 1530 | spin_unlock_irqrestore(&psig->siglock, flags); |
1543 | } | 1531 | } |
1544 | 1532 | ||
1545 | static void | 1533 | static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) |
1546 | do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, | ||
1547 | int why) | ||
1548 | { | 1534 | { |
1549 | struct siginfo info; | 1535 | struct siginfo info; |
1550 | unsigned long flags; | 1536 | unsigned long flags; |
1537 | struct task_struct *parent; | ||
1551 | struct sighand_struct *sighand; | 1538 | struct sighand_struct *sighand; |
1552 | 1539 | ||
1540 | if (to_self) | ||
1541 | parent = tsk->parent; | ||
1542 | else { | ||
1543 | tsk = tsk->group_leader; | ||
1544 | parent = tsk->real_parent; | ||
1545 | } | ||
1546 | |||
1553 | info.si_signo = SIGCHLD; | 1547 | info.si_signo = SIGCHLD; |
1554 | info.si_errno = 0; | 1548 | info.si_errno = 0; |
1555 | info.si_pid = tsk->pid; | 1549 | info.si_pid = tsk->pid; |
@@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | |||
1618 | !(current->ptrace & PT_ATTACHED)) && | 1612 | !(current->ptrace & PT_ATTACHED)) && |
1619 | (likely(current->parent->signal != current->signal) || | 1613 | (likely(current->parent->signal != current->signal) || |
1620 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | 1614 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { |
1621 | do_notify_parent_cldstop(current, current->parent, | 1615 | do_notify_parent_cldstop(current, 1, CLD_TRAPPED); |
1622 | CLD_TRAPPED); | ||
1623 | read_unlock(&tasklist_lock); | 1616 | read_unlock(&tasklist_lock); |
1624 | schedule(); | 1617 | schedule(); |
1625 | } else { | 1618 | } else { |
@@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code) | |||
1668 | static void | 1661 | static void |
1669 | finish_stop(int stop_count) | 1662 | finish_stop(int stop_count) |
1670 | { | 1663 | { |
1664 | int to_self; | ||
1665 | |||
1671 | /* | 1666 | /* |
1672 | * If there are no other threads in the group, or if there is | 1667 | * If there are no other threads in the group, or if there is |
1673 | * a group stop in progress and we are the last to stop, | 1668 | * a group stop in progress and we are the last to stop, |
1674 | * report to the parent. When ptraced, every thread reports itself. | 1669 | * report to the parent. When ptraced, every thread reports itself. |
1675 | */ | 1670 | */ |
1676 | if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { | 1671 | if (stop_count < 0 || (current->ptrace & PT_PTRACED)) |
1677 | read_lock(&tasklist_lock); | 1672 | to_self = 1; |
1678 | do_notify_parent_cldstop(current, current->parent, | 1673 | else if (stop_count == 0) |
1679 | CLD_STOPPED); | 1674 | to_self = 0; |
1680 | read_unlock(&tasklist_lock); | 1675 | else |
1681 | } | 1676 | goto out; |
1682 | else if (stop_count == 0) { | ||
1683 | read_lock(&tasklist_lock); | ||
1684 | do_notify_parent_cldstop(current->group_leader, | ||
1685 | current->group_leader->real_parent, | ||
1686 | CLD_STOPPED); | ||
1687 | read_unlock(&tasklist_lock); | ||
1688 | } | ||
1689 | 1677 | ||
1678 | read_lock(&tasklist_lock); | ||
1679 | do_notify_parent_cldstop(current, to_self, CLD_STOPPED); | ||
1680 | read_unlock(&tasklist_lock); | ||
1681 | |||
1682 | out: | ||
1690 | schedule(); | 1683 | schedule(); |
1691 | /* | 1684 | /* |
1692 | * Now we don't run again until continued. | 1685 | * Now we don't run again until continued. |
@@ -2228,8 +2221,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese, | |||
2228 | recalc_sigpending(); | 2221 | recalc_sigpending(); |
2229 | spin_unlock_irq(¤t->sighand->siglock); | 2222 | spin_unlock_irq(¤t->sighand->siglock); |
2230 | 2223 | ||
2231 | current->state = TASK_INTERRUPTIBLE; | 2224 | timeout = schedule_timeout_interruptible(timeout); |
2232 | timeout = schedule_timeout(timeout); | ||
2233 | 2225 | ||
2234 | try_to_freeze(); | 2226 | try_to_freeze(); |
2235 | spin_lock_irq(¤t->sighand->siglock); | 2227 | spin_lock_irq(¤t->sighand->siglock); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..f766b2fc48be 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -84,7 +84,7 @@ asmlinkage void __do_softirq(void) | |||
84 | cpu = smp_processor_id(); | 84 | cpu = smp_processor_id(); |
85 | restart: | 85 | restart: |
86 | /* Reset the pending bitmask before enabling irqs */ | 86 | /* Reset the pending bitmask before enabling irqs */ |
87 | local_softirq_pending() = 0; | 87 | set_softirq_pending(0); |
88 | 88 | ||
89 | local_irq_enable(); | 89 | local_irq_enable(); |
90 | 90 | ||
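
The softirq.c hunk stops assigning through local_softirq_pending() as if it were an lvalue and routes the store through a setter instead, which gives architectures a hook to override how the pending mask is cleared. A guess at the generic fallback such a setter would have (the real definition lives in the irq headers and may differ per architecture):

    /* Assumed generic fallback; architectures with a hardware-assisted
     * pending mask can override this instead of the old lvalue assignment. */
    #define set_softirq_pending(x)  (local_softirq_pending() = (x))
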
diff --git a/kernel/softlockup.c b/kernel/softlockup.c new file mode 100644 index 000000000000..75976209cea7 --- /dev/null +++ b/kernel/softlockup.c | |||
@@ -0,0 +1,151 @@ | |||
1 | /* | ||
2 | * Detect Soft Lockups | ||
3 | * | ||
4 | * started by Ingo Molnar, (C) 2005, Red Hat | ||
5 | * | ||
6 | * this code detects soft lockups: incidents where the kernel | ||
7 | * does not reschedule on a CPU for 10 seconds or more. | ||
8 | */ | ||
9 | |||
10 | #include <linux/mm.h> | ||
11 | #include <linux/cpu.h> | ||
12 | #include <linux/init.h> | ||
13 | #include <linux/delay.h> | ||
14 | #include <linux/kthread.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/module.h> | ||
17 | |||
18 | static DEFINE_SPINLOCK(print_lock); | ||
19 | |||
20 | static DEFINE_PER_CPU(unsigned long, timestamp) = 0; | ||
21 | static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; | ||
22 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | ||
23 | |||
24 | static int did_panic = 0; | ||
25 | static int softlock_panic(struct notifier_block *this, unsigned long event, | ||
26 | void *ptr) | ||
27 | { | ||
28 | did_panic = 1; | ||
29 | |||
30 | return NOTIFY_DONE; | ||
31 | } | ||
32 | |||
33 | static struct notifier_block panic_block = { | ||
34 | .notifier_call = softlock_panic, | ||
35 | }; | ||
36 | |||
37 | void touch_softlockup_watchdog(void) | ||
38 | { | ||
39 | per_cpu(timestamp, raw_smp_processor_id()) = jiffies; | ||
40 | } | ||
41 | EXPORT_SYMBOL(touch_softlockup_watchdog); | ||
42 | |||
43 | /* | ||
44 | * This callback runs from the timer interrupt, and checks | ||
45 | * whether the watchdog thread has hung or not: | ||
46 | */ | ||
47 | void softlockup_tick(struct pt_regs *regs) | ||
48 | { | ||
49 | int this_cpu = smp_processor_id(); | ||
50 | unsigned long timestamp = per_cpu(timestamp, this_cpu); | ||
51 | |||
52 | if (per_cpu(print_timestamp, this_cpu) == timestamp) | ||
53 | return; | ||
54 | |||
55 | /* Do not cause a second panic when there already was one */ | ||
56 | if (did_panic) | ||
57 | return; | ||
58 | |||
59 | if (time_after(jiffies, timestamp + 10*HZ)) { | ||
60 | per_cpu(print_timestamp, this_cpu) = timestamp; | ||
61 | |||
62 | spin_lock(&print_lock); | ||
63 | printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", | ||
64 | this_cpu); | ||
65 | show_regs(regs); | ||
66 | spin_unlock(&print_lock); | ||
67 | } | ||
68 | } | ||
69 | |||
70 | /* | ||
71 | * The watchdog thread - runs every second and touches the timestamp. | ||
72 | */ | ||
73 | static int watchdog(void * __bind_cpu) | ||
74 | { | ||
75 | struct sched_param param = { .sched_priority = 99 }; | ||
76 | int this_cpu = (long) __bind_cpu; | ||
77 | |||
78 | printk("softlockup thread %d started up.\n", this_cpu); | ||
79 | |||
80 | sched_setscheduler(current, SCHED_FIFO, ¶m); | ||
81 | current->flags |= PF_NOFREEZE; | ||
82 | |||
83 | set_current_state(TASK_INTERRUPTIBLE); | ||
84 | |||
85 | /* | ||
86 | * Run briefly once per second - if this gets delayed for | ||
87 | * more than 10 seconds then the debug-printout triggers | ||
88 | * in softlockup_tick(): | ||
89 | */ | ||
90 | while (!kthread_should_stop()) { | ||
91 | msleep_interruptible(1000); | ||
92 | touch_softlockup_watchdog(); | ||
93 | } | ||
94 | __set_current_state(TASK_RUNNING); | ||
95 | |||
96 | return 0; | ||
97 | } | ||
98 | |||
99 | /* | ||
100 | * Create/destroy watchdog threads as CPUs come and go: | ||
101 | */ | ||
102 | static int __devinit | ||
103 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | ||
104 | { | ||
105 | int hotcpu = (unsigned long)hcpu; | ||
106 | struct task_struct *p; | ||
107 | |||
108 | switch (action) { | ||
109 | case CPU_UP_PREPARE: | ||
110 | BUG_ON(per_cpu(watchdog_task, hotcpu)); | ||
111 | p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); | ||
112 | if (IS_ERR(p)) { | ||
113 | printk("watchdog for %i failed\n", hotcpu); | ||
114 | return NOTIFY_BAD; | ||
115 | } | ||
116 | per_cpu(watchdog_task, hotcpu) = p; | ||
117 | kthread_bind(p, hotcpu); | ||
118 | break; | ||
119 | case CPU_ONLINE: | ||
120 | |||
121 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | ||
122 | break; | ||
123 | #ifdef CONFIG_HOTPLUG_CPU | ||
124 | case CPU_UP_CANCELED: | ||
125 | /* Unbind so it can run. Fall thru. */ | ||
126 | kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); | ||
127 | case CPU_DEAD: | ||
128 | p = per_cpu(watchdog_task, hotcpu); | ||
129 | per_cpu(watchdog_task, hotcpu) = NULL; | ||
130 | kthread_stop(p); | ||
131 | break; | ||
132 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
133 | } | ||
134 | return NOTIFY_OK; | ||
135 | } | ||
136 | |||
137 | static struct notifier_block __devinitdata cpu_nfb = { | ||
138 | .notifier_call = cpu_callback | ||
139 | }; | ||
140 | |||
141 | __init void spawn_softlockup_task(void) | ||
142 | { | ||
143 | void *cpu = (void *)(long)smp_processor_id(); | ||
144 | |||
145 | cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
146 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
147 | register_cpu_notifier(&cpu_nfb); | ||
148 | |||
149 | notifier_chain_register(&panic_notifier_list, &panic_block); | ||
150 | } | ||
151 | |||
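
A usage note on the detector added above: the watchdog kthread refreshes the per-CPU timestamp once a second, and softlockup_tick() complains when that timestamp goes stale for more than 10*HZ. Any code that legitimately monopolizes a CPU without scheduling for that long is therefore expected to call the exported touch_softlockup_watchdog(). A minimal sketch, with poll_device_ready() being a made-up placeholder:

    /* Hypothetical long polling loop that cannot sleep; touching the
     * watchdog keeps softlockup_tick() from reporting a false positive. */
    while (!poll_device_ready()) {          /* poll_device_ready(): illustrative only */
            cpu_relax();
            touch_softlockup_watchdog();    /* refreshes per_cpu(timestamp, cpu) */
    }
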
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 0c3f9d8bbe17..0375fcd5921d 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -3,7 +3,10 @@ | |||
3 | * | 3 | * |
4 | * Author: Zwane Mwaikambo <zwane@fsmlabs.com> | 4 | * Author: Zwane Mwaikambo <zwane@fsmlabs.com> |
5 | * | 5 | * |
6 | * Copyright (2004) Ingo Molnar | 6 | * Copyright (2004, 2005) Ingo Molnar |
7 | * | ||
8 | * This file contains the spinlock/rwlock implementations for the | ||
9 | * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) | ||
7 | */ | 10 | */ |
8 | 11 | ||
9 | #include <linux/config.h> | 12 | #include <linux/config.h> |
@@ -17,12 +20,12 @@ | |||
17 | * Generic declaration of the raw read_trylock() function, | 20 | * Generic declaration of the raw read_trylock() function, |
18 | * architectures are supposed to optimize this: | 21 | * architectures are supposed to optimize this: |
19 | */ | 22 | */ |
20 | int __lockfunc generic_raw_read_trylock(rwlock_t *lock) | 23 | int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock) |
21 | { | 24 | { |
22 | _raw_read_lock(lock); | 25 | __raw_read_lock(lock); |
23 | return 1; | 26 | return 1; |
24 | } | 27 | } |
25 | EXPORT_SYMBOL(generic_raw_read_trylock); | 28 | EXPORT_SYMBOL(generic__raw_read_trylock); |
26 | 29 | ||
27 | int __lockfunc _spin_trylock(spinlock_t *lock) | 30 | int __lockfunc _spin_trylock(spinlock_t *lock) |
28 | { | 31 | { |
@@ -57,7 +60,7 @@ int __lockfunc _write_trylock(rwlock_t *lock) | |||
57 | } | 60 | } |
58 | EXPORT_SYMBOL(_write_trylock); | 61 | EXPORT_SYMBOL(_write_trylock); |
59 | 62 | ||
60 | #ifndef CONFIG_PREEMPT | 63 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) |
61 | 64 | ||
62 | void __lockfunc _read_lock(rwlock_t *lock) | 65 | void __lockfunc _read_lock(rwlock_t *lock) |
63 | { | 66 | { |
@@ -72,7 +75,7 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) | |||
72 | 75 | ||
73 | local_irq_save(flags); | 76 | local_irq_save(flags); |
74 | preempt_disable(); | 77 | preempt_disable(); |
75 | _raw_spin_lock_flags(lock, flags); | 78 | _raw_spin_lock_flags(lock, &flags); |
76 | return flags; | 79 | return flags; |
77 | } | 80 | } |
78 | EXPORT_SYMBOL(_spin_lock_irqsave); | 81 | EXPORT_SYMBOL(_spin_lock_irqsave); |
diff --git a/kernel/sys.c b/kernel/sys.c index 0bcaed6560ac..c80412be2302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1711 | unsigned long arg4, unsigned long arg5) | 1711 | unsigned long arg4, unsigned long arg5) |
1712 | { | 1712 | { |
1713 | long error; | 1713 | long error; |
1714 | int sig; | ||
1715 | 1714 | ||
1716 | error = security_task_prctl(option, arg2, arg3, arg4, arg5); | 1715 | error = security_task_prctl(option, arg2, arg3, arg4, arg5); |
1717 | if (error) | 1716 | if (error) |
@@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1719 | 1718 | ||
1720 | switch (option) { | 1719 | switch (option) { |
1721 | case PR_SET_PDEATHSIG: | 1720 | case PR_SET_PDEATHSIG: |
1722 | sig = arg2; | 1721 | if (!valid_signal(arg2)) { |
1723 | if (!valid_signal(sig)) { | ||
1724 | error = -EINVAL; | 1722 | error = -EINVAL; |
1725 | break; | 1723 | break; |
1726 | } | 1724 | } |
1727 | current->pdeath_signal = sig; | 1725 | current->pdeath_signal = arg2; |
1728 | break; | 1726 | break; |
1729 | case PR_GET_PDEATHSIG: | 1727 | case PR_GET_PDEATHSIG: |
1730 | error = put_user(current->pdeath_signal, (int __user *)arg2); | 1728 | error = put_user(current->pdeath_signal, (int __user *)arg2); |
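
The sys.c hunk only drops a redundant local variable; the PR_SET_PDEATHSIG semantics are unchanged. For context, a minimal user-space sketch of how the option is normally consumed (standard prctl(2) usage, not code from this patch):

    #include <signal.h>
    #include <stdio.h>
    #include <sys/prctl.h>

    static void arm_parent_death_signal(void)
    {
            /* Ask the kernel to deliver SIGTERM to this process when its parent dies. */
            if (prctl(PR_SET_PDEATHSIG, SIGTERM) == -1)
                    perror("prctl(PR_SET_PDEATHSIG)");
    }
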
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3e0bbee549ea..8e56e2495542 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/smp_lock.h> | 31 | #include <linux/smp_lock.h> |
32 | #include <linux/init.h> | 32 | #include <linux/init.h> |
33 | #include <linux/kernel.h> | 33 | #include <linux/kernel.h> |
34 | #include <linux/net.h> | ||
34 | #include <linux/sysrq.h> | 35 | #include <linux/sysrq.h> |
35 | #include <linux/highuid.h> | 36 | #include <linux/highuid.h> |
36 | #include <linux/writeback.h> | 37 | #include <linux/writeback.h> |
@@ -136,9 +137,6 @@ static struct ctl_table_header root_table_header = | |||
136 | 137 | ||
137 | static ctl_table kern_table[]; | 138 | static ctl_table kern_table[]; |
138 | static ctl_table vm_table[]; | 139 | static ctl_table vm_table[]; |
139 | #ifdef CONFIG_NET | ||
140 | extern ctl_table net_table[]; | ||
141 | #endif | ||
142 | static ctl_table proc_table[]; | 140 | static ctl_table proc_table[]; |
143 | static ctl_table fs_table[]; | 141 | static ctl_table fs_table[]; |
144 | static ctl_table debug_table[]; | 142 | static ctl_table debug_table[]; |
diff --git a/kernel/timer.c b/kernel/timer.c index 5377f40723ff..3ba10fa35b60 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs) | |||
950 | { | 950 | { |
951 | jiffies_64++; | 951 | jiffies_64++; |
952 | update_times(); | 952 | update_times(); |
953 | softlockup_tick(regs); | ||
953 | } | 954 | } |
954 | 955 | ||
955 | #ifdef __ARCH_WANT_SYS_ALARM | 956 | #ifdef __ARCH_WANT_SYS_ALARM |
@@ -1150,9 +1151,26 @@ fastcall signed long __sched schedule_timeout(signed long timeout) | |||
1150 | out: | 1151 | out: |
1151 | return timeout < 0 ? 0 : timeout; | 1152 | return timeout < 0 ? 0 : timeout; |
1152 | } | 1153 | } |
1153 | |||
1154 | EXPORT_SYMBOL(schedule_timeout); | 1154 | EXPORT_SYMBOL(schedule_timeout); |
1155 | 1155 | ||
1156 | /* | ||
1157 | * We can use __set_current_state() here because schedule_timeout() calls | ||
1158 | * schedule() unconditionally. | ||
1159 | */ | ||
1160 | signed long __sched schedule_timeout_interruptible(signed long timeout) | ||
1161 | { | ||
1162 | __set_current_state(TASK_INTERRUPTIBLE); | ||
1163 | return schedule_timeout(timeout); | ||
1164 | } | ||
1165 | EXPORT_SYMBOL(schedule_timeout_interruptible); | ||
1166 | |||
1167 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | ||
1168 | { | ||
1169 | __set_current_state(TASK_UNINTERRUPTIBLE); | ||
1170 | return schedule_timeout(timeout); | ||
1171 | } | ||
1172 | EXPORT_SYMBOL(schedule_timeout_uninterruptible); | ||
1173 | |||
1156 | /* Thread ID - the internal kernel "pid" */ | 1174 | /* Thread ID - the internal kernel "pid" */ |
1157 | asmlinkage long sys_gettid(void) | 1175 | asmlinkage long sys_gettid(void) |
1158 | { | 1176 | { |
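
The two wrappers added in the hunk above encapsulate the usual "set task state, then schedule_timeout()" pair; the later nanosleep and msleep()/msleep_interruptible() hunks in this file, and the sys_rt_sigtimedwait() hunk earlier, are converted to them. The pattern, sketched out of context (remaining/timeout are placeholder variables):

    /* Old open-coded form (the state flip is easy to forget or misplace): */
    set_current_state(TASK_INTERRUPTIBLE);
    remaining = schedule_timeout(timeout);

    /* New helper: same behaviour in one call; the return value is the
     * number of jiffies left if the sleep ended early. */
    remaining = schedule_timeout_interruptible(timeout);
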
@@ -1169,8 +1187,7 @@ static long __sched nanosleep_restart(struct restart_block *restart) | |||
1169 | if (!time_after(expire, now)) | 1187 | if (!time_after(expire, now)) |
1170 | return 0; | 1188 | return 0; |
1171 | 1189 | ||
1172 | current->state = TASK_INTERRUPTIBLE; | 1190 | expire = schedule_timeout_interruptible(expire - now); |
1173 | expire = schedule_timeout(expire - now); | ||
1174 | 1191 | ||
1175 | ret = 0; | 1192 | ret = 0; |
1176 | if (expire) { | 1193 | if (expire) { |
@@ -1198,8 +1215,7 @@ asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __us | |||
1198 | return -EINVAL; | 1215 | return -EINVAL; |
1199 | 1216 | ||
1200 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); | 1217 | expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec); |
1201 | current->state = TASK_INTERRUPTIBLE; | 1218 | expire = schedule_timeout_interruptible(expire); |
1202 | expire = schedule_timeout(expire); | ||
1203 | 1219 | ||
1204 | ret = 0; | 1220 | ret = 0; |
1205 | if (expire) { | 1221 | if (expire) { |
@@ -1428,7 +1444,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src) | |||
1428 | } | 1444 | } |
1429 | } | 1445 | } |
1430 | 1446 | ||
1431 | static inline u64 time_interpolator_get_counter(void) | 1447 | static inline u64 time_interpolator_get_counter(int writelock) |
1432 | { | 1448 | { |
1433 | unsigned int src = time_interpolator->source; | 1449 | unsigned int src = time_interpolator->source; |
1434 | 1450 | ||
@@ -1442,6 +1458,15 @@ static inline u64 time_interpolator_get_counter(void) | |||
1442 | now = time_interpolator_get_cycles(src); | 1458 | now = time_interpolator_get_cycles(src); |
1443 | if (lcycle && time_after(lcycle, now)) | 1459 | if (lcycle && time_after(lcycle, now)) |
1444 | return lcycle; | 1460 | return lcycle; |
1461 | |||
1462 | /* When holding the xtime write lock, there's no need | ||
1463 | * to add the overhead of the cmpxchg. Readers are | ||
1464 | * forced to retry until the write lock is released. | ||
1465 | */ | ||
1466 | if (writelock) { | ||
1467 | time_interpolator->last_cycle = now; | ||
1468 | return now; | ||
1469 | } | ||
1445 | /* Keep track of the last timer value returned. The use of cmpxchg here | 1470 | /* Keep track of the last timer value returned. The use of cmpxchg here |
1446 | * will cause contention in an SMP environment. | 1471 | * will cause contention in an SMP environment. |
1447 | */ | 1472 | */ |
@@ -1455,7 +1480,7 @@ static inline u64 time_interpolator_get_counter(void) | |||
1455 | void time_interpolator_reset(void) | 1480 | void time_interpolator_reset(void) |
1456 | { | 1481 | { |
1457 | time_interpolator->offset = 0; | 1482 | time_interpolator->offset = 0; |
1458 | time_interpolator->last_counter = time_interpolator_get_counter(); | 1483 | time_interpolator->last_counter = time_interpolator_get_counter(1); |
1459 | } | 1484 | } |
1460 | 1485 | ||
1461 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) | 1486 | #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) |
@@ -1467,7 +1492,7 @@ unsigned long time_interpolator_get_offset(void) | |||
1467 | return 0; | 1492 | return 0; |
1468 | 1493 | ||
1469 | return time_interpolator->offset + | 1494 | return time_interpolator->offset + |
1470 | GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); | 1495 | GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); |
1471 | } | 1496 | } |
1472 | 1497 | ||
1473 | #define INTERPOLATOR_ADJUST 65536 | 1498 | #define INTERPOLATOR_ADJUST 65536 |
@@ -1490,7 +1515,7 @@ static void time_interpolator_update(long delta_nsec) | |||
1490 | * and the tuning logic ensures that. | 1515 | * and the tuning logic ensures that.
1491 | */ | 1516 | */ |
1492 | 1517 | ||
1493 | counter = time_interpolator_get_counter(); | 1518 | counter = time_interpolator_get_counter(1); |
1494 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); | 1519 | offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); |
1495 | 1520 | ||
1496 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) | 1521 | if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) |
@@ -1588,10 +1613,8 @@ void msleep(unsigned int msecs) | |||
1588 | { | 1613 | { |
1589 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1614 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
1590 | 1615 | ||
1591 | while (timeout) { | 1616 | while (timeout) |
1592 | set_current_state(TASK_UNINTERRUPTIBLE); | 1617 | timeout = schedule_timeout_uninterruptible(timeout); |
1593 | timeout = schedule_timeout(timeout); | ||
1594 | } | ||
1595 | } | 1618 | } |
1596 | 1619 | ||
1597 | EXPORT_SYMBOL(msleep); | 1620 | EXPORT_SYMBOL(msleep); |
@@ -1604,10 +1627,8 @@ unsigned long msleep_interruptible(unsigned int msecs) | |||
1604 | { | 1627 | { |
1605 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; | 1628 | unsigned long timeout = msecs_to_jiffies(msecs) + 1; |
1606 | 1629 | ||
1607 | while (timeout && !signal_pending(current)) { | 1630 | while (timeout && !signal_pending(current)) |
1608 | set_current_state(TASK_INTERRUPTIBLE); | 1631 | timeout = schedule_timeout_interruptible(timeout); |
1609 | timeout = schedule_timeout(timeout); | ||
1610 | } | ||
1611 | return jiffies_to_msecs(timeout); | 1632 | return jiffies_to_msecs(timeout); |
1612 | } | 1633 | } |
1613 | 1634 | ||
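
The time_interpolator hunks add a writelock parameter so that the update/reset paths, which already hold the xtime write lock exclusively, can store last_cycle directly, while lockless readers keep using cmpxchg so the published counter never moves backwards. The lockless half of that idiom in an illustrative, standalone form (not the kernel code; a GCC __sync builtin stands in for cmpxchg):

    #include <stdio.h>

    /* "Never go backwards" latch: concurrent callers race to publish the
     * newest value they have seen; the compare-and-swap keeps it monotonic. */
    static unsigned long latch_monotonic(unsigned long *last, unsigned long now)
    {
            unsigned long prev = *last;

            while (now > prev) {
                    unsigned long seen = __sync_val_compare_and_swap(last, prev, now);
                    if (seen == prev)
                            return now;     /* we published the newer value */
                    prev = seen;            /* lost the race; retry against it */
            }
            return prev;                    /* an equal or newer value is already there */
    }

    int main(void)
    {
            unsigned long last = 100;

            printf("%lu\n", latch_monotonic(&last, 150));   /* 150 */
            printf("%lu\n", latch_monotonic(&last, 120));   /* 150: no regression */
            return 0;
    }
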
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c7e36d4a70ca..91bacb13a7e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
308 | struct workqueue_struct *wq; | 308 | struct workqueue_struct *wq; |
309 | struct task_struct *p; | 309 | struct task_struct *p; |
310 | 310 | ||
311 | wq = kmalloc(sizeof(*wq), GFP_KERNEL); | 311 | wq = kzalloc(sizeof(*wq), GFP_KERNEL); |
312 | if (!wq) | 312 | if (!wq) |
313 | return NULL; | 313 | return NULL; |
314 | memset(wq, 0, sizeof(*wq)); | ||
315 | 314 | ||
316 | wq->name = name; | 315 | wq->name = name; |
317 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 316 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
@@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
499 | case CPU_UP_PREPARE: | 498 | case CPU_UP_PREPARE: |
500 | /* Create a new workqueue thread for it. */ | 499 | /* Create a new workqueue thread for it. */ |
501 | list_for_each_entry(wq, &workqueues, list) { | 500 | list_for_each_entry(wq, &workqueues, list) { |
502 | if (create_workqueue_thread(wq, hotcpu) < 0) { | 501 | if (!create_workqueue_thread(wq, hotcpu)) { |
503 | printk("workqueue for %i failed\n", hotcpu); | 502 | printk("workqueue for %i failed\n", hotcpu); |
504 | return NOTIFY_BAD; | 503 | return NOTIFY_BAD; |
505 | } | 504 | } |
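
The workqueue.c hunks are cleanups: kzalloc() replaces the kmalloc()+memset() pair, and the hotplug callback's error test becomes a NULL check, which suggests create_workqueue_thread() now reports failure by returning a NULL pointer rather than a negative value (its new signature is not visible in this section). The allocation idiom in isolation:

    /* kzalloc() = kmalloc() + zeroing in one call; sizeof(*wq) keeps the
     * request tied to the pointee type if that type ever changes. */
    wq = kzalloc(sizeof(*wq), GFP_KERNEL);
    if (!wq)
            return NULL;
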