diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 537 | ||||
-rw-r--r-- | kernel/crash_dump.c | 3 | ||||
-rw-r--r-- | kernel/exit.c | 4 | ||||
-rw-r--r-- | kernel/fork.c | 33 | ||||
-rw-r--r-- | kernel/irq/proc.c | 2 | ||||
-rw-r--r-- | kernel/module.c | 56 | ||||
-rw-r--r-- | kernel/pid.c | 22 | ||||
-rw-r--r-- | kernel/printk.c | 6 | ||||
-rw-r--r-- | kernel/ptrace.c | 77 | ||||
-rw-r--r-- | kernel/rcupdate.c | 49 | ||||
-rw-r--r-- | kernel/rcutorture.c | 99 | ||||
-rw-r--r-- | kernel/sched.c | 7 | ||||
-rw-r--r-- | kernel/signal.c | 137 | ||||
-rw-r--r-- | kernel/sys.c | 62 | ||||
-rw-r--r-- | kernel/sys_ni.c | 24 | ||||
-rw-r--r-- | kernel/sysctl.c | 22 | ||||
-rw-r--r-- | kernel/timer.c | 1 | ||||
-rw-r--r-- | kernel/workqueue.c | 40 |
19 files changed, 943 insertions, 240 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 32fa03ad1984..d13ab7d2d899 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -267,7 +267,7 @@ static int audit_set_failure(int state, uid_t loginuid) | |||
267 | return old; | 267 | return old; |
268 | } | 268 | } |
269 | 269 | ||
270 | int kauditd_thread(void *dummy) | 270 | static int kauditd_thread(void *dummy) |
271 | { | 271 | { |
272 | struct sk_buff *skb; | 272 | struct sk_buff *skb; |
273 | 273 | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7430640f9816..eab64e23bcae 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/namei.h> | 39 | #include <linux/namei.h> |
40 | #include <linux/pagemap.h> | 40 | #include <linux/pagemap.h> |
41 | #include <linux/proc_fs.h> | 41 | #include <linux/proc_fs.h> |
42 | #include <linux/rcupdate.h> | ||
42 | #include <linux/sched.h> | 43 | #include <linux/sched.h> |
43 | #include <linux/seq_file.h> | 44 | #include <linux/seq_file.h> |
44 | #include <linux/slab.h> | 45 | #include <linux/slab.h> |
@@ -54,7 +55,23 @@ | |||
54 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
55 | #include <asm/semaphore.h> | 56 | #include <asm/semaphore.h> |
56 | 57 | ||
57 | #define CPUSET_SUPER_MAGIC 0x27e0eb | 58 | #define CPUSET_SUPER_MAGIC 0x27e0eb |
59 | |||
60 | /* | ||
61 | * Tracks how many cpusets are currently defined in system. | ||
62 | * When there is only one cpuset (the root cpuset) we can | ||
63 | * short circuit some hooks. | ||
64 | */ | ||
65 | int number_of_cpusets __read_mostly; | ||
66 | |||
67 | /* See "Frequency meter" comments, below. */ | ||
68 | |||
69 | struct fmeter { | ||
70 | int cnt; /* unprocessed events count */ | ||
71 | int val; /* most recent output value */ | ||
72 | time_t time; /* clock (secs) when val computed */ | ||
73 | spinlock_t lock; /* guards read or write of above */ | ||
74 | }; | ||
58 | 75 | ||
59 | struct cpuset { | 76 | struct cpuset { |
60 | unsigned long flags; /* "unsigned long" so bitops work */ | 77 | unsigned long flags; /* "unsigned long" so bitops work */ |
@@ -80,13 +97,16 @@ struct cpuset { | |||
80 | * Copy of global cpuset_mems_generation as of the most | 97 | * Copy of global cpuset_mems_generation as of the most |
81 | * recent time this cpuset changed its mems_allowed. | 98 | * recent time this cpuset changed its mems_allowed. |
82 | */ | 99 | */ |
83 | int mems_generation; | 100 | int mems_generation; |
101 | |||
102 | struct fmeter fmeter; /* memory_pressure filter */ | ||
84 | }; | 103 | }; |
85 | 104 | ||
86 | /* bits in struct cpuset flags field */ | 105 | /* bits in struct cpuset flags field */ |
87 | typedef enum { | 106 | typedef enum { |
88 | CS_CPU_EXCLUSIVE, | 107 | CS_CPU_EXCLUSIVE, |
89 | CS_MEM_EXCLUSIVE, | 108 | CS_MEM_EXCLUSIVE, |
109 | CS_MEMORY_MIGRATE, | ||
90 | CS_REMOVED, | 110 | CS_REMOVED, |
91 | CS_NOTIFY_ON_RELEASE | 111 | CS_NOTIFY_ON_RELEASE |
92 | } cpuset_flagbits_t; | 112 | } cpuset_flagbits_t; |
@@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs) | |||
112 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 132 | return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
113 | } | 133 | } |
114 | 134 | ||
135 | static inline int is_memory_migrate(const struct cpuset *cs) | ||
136 | { | ||
137 | return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags); | ||
138 | } | ||
139 | |||
115 | /* | 140 | /* |
116 | * Increment this atomic integer every time any cpuset changes its | 141 | * Increment this atomic integer every time any cpuset changes its |
117 | * mems_allowed value. Users of cpusets can track this generation | 142 | * mems_allowed value. Users of cpusets can track this generation |
@@ -137,13 +162,10 @@ static struct cpuset top_cpuset = { | |||
137 | .count = ATOMIC_INIT(0), | 162 | .count = ATOMIC_INIT(0), |
138 | .sibling = LIST_HEAD_INIT(top_cpuset.sibling), | 163 | .sibling = LIST_HEAD_INIT(top_cpuset.sibling), |
139 | .children = LIST_HEAD_INIT(top_cpuset.children), | 164 | .children = LIST_HEAD_INIT(top_cpuset.children), |
140 | .parent = NULL, | ||
141 | .dentry = NULL, | ||
142 | .mems_generation = 0, | ||
143 | }; | 165 | }; |
144 | 166 | ||
145 | static struct vfsmount *cpuset_mount; | 167 | static struct vfsmount *cpuset_mount; |
146 | static struct super_block *cpuset_sb = NULL; | 168 | static struct super_block *cpuset_sb; |
147 | 169 | ||
148 | /* | 170 | /* |
149 | * We have two global cpuset semaphores below. They can nest. | 171 | * We have two global cpuset semaphores below. They can nest. |
@@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL; | |||
227 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock | 249 | * a tasks cpuset pointer we use task_lock(), which acts on a spinlock |
228 | * (task->alloc_lock) already in the task_struct routinely used for | 250 | * (task->alloc_lock) already in the task_struct routinely used for |
229 | * such matters. | 251 | * such matters. |
252 | * | ||
253 | * P.S. One more locking exception. RCU is used to guard the | ||
254 | * update of a tasks cpuset pointer by attach_task() and the | ||
255 | * access of task->cpuset->mems_generation via that pointer in | ||
256 | * the routine cpuset_update_task_memory_state(). | ||
230 | */ | 257 | */ |
231 | 258 | ||
232 | static DECLARE_MUTEX(manage_sem); | 259 | static DECLARE_MUTEX(manage_sem); |
@@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) | |||
304 | spin_lock(&dcache_lock); | 331 | spin_lock(&dcache_lock); |
305 | node = dentry->d_subdirs.next; | 332 | node = dentry->d_subdirs.next; |
306 | while (node != &dentry->d_subdirs) { | 333 | while (node != &dentry->d_subdirs) { |
307 | struct dentry *d = list_entry(node, struct dentry, d_child); | 334 | struct dentry *d = list_entry(node, struct dentry, d_u.d_child); |
308 | list_del_init(node); | 335 | list_del_init(node); |
309 | if (d->d_inode) { | 336 | if (d->d_inode) { |
310 | d = dget_locked(d); | 337 | d = dget_locked(d); |
@@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) | |||
316 | } | 343 | } |
317 | node = dentry->d_subdirs.next; | 344 | node = dentry->d_subdirs.next; |
318 | } | 345 | } |
319 | list_del_init(&dentry->d_child); | 346 | list_del_init(&dentry->d_u.d_child); |
320 | spin_unlock(&dcache_lock); | 347 | spin_unlock(&dcache_lock); |
321 | remove_dir(dentry); | 348 | remove_dir(dentry); |
322 | } | 349 | } |
@@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
570 | BUG_ON(!nodes_intersects(*pmask, node_online_map)); | 597 | BUG_ON(!nodes_intersects(*pmask, node_online_map)); |
571 | } | 598 | } |
572 | 599 | ||
573 | /* | 600 | /** |
574 | * Refresh current tasks mems_allowed and mems_generation from current | 601 | * cpuset_update_task_memory_state - update task memory placement |
575 | * tasks cpuset. | ||
576 | * | 602 | * |
577 | * Call without callback_sem or task_lock() held. May be called with | 603 | * If the current tasks cpusets mems_allowed changed behind our |
578 | * or without manage_sem held. Will acquire task_lock() and might | 604 | * backs, update current->mems_allowed, mems_generation and task NUMA |
579 | * acquire callback_sem during call. | 605 | * mempolicy to the new value. |
606 | * | ||
607 | * Task mempolicy is updated by rebinding it relative to the | ||
608 | * current->cpuset if a task has its memory placement changed. | ||
609 | * Do not call this routine if in_interrupt(). | ||
580 | * | 610 | * |
581 | * The task_lock() is required to dereference current->cpuset safely. | 611 | * Call without callback_sem or task_lock() held. May be called |
582 | * Without it, we could pick up the pointer value of current->cpuset | 612 | * with or without manage_sem held. Doesn't need task_lock to guard |
583 | * in one instruction, and then attach_task could give us a different | 613 | * against another task changing a non-NULL cpuset pointer to NULL, |
584 | * cpuset, and then the cpuset we had could be removed and freed, | 614 | * as that is only done by a task on itself, and if the current task |
585 | * and then on our next instruction, we could dereference a no longer | 615 | * is here, it is not simultaneously in the exit code NULL'ing its |
586 | * valid cpuset pointer to get its mems_generation field. | 616 | * cpuset pointer. This routine also might acquire callback_sem and |
617 | * current->mm->mmap_sem during call. | ||
618 | * | ||
619 | * Reading current->cpuset->mems_generation doesn't need task_lock | ||
620 | * to guard the current->cpuset dereference, because it is guarded | |
621 | * from concurrent freeing of current->cpuset by attach_task(), | ||
622 | * using RCU. | ||
623 | * | ||
624 | * The rcu_dereference() is technically probably not needed, | ||
625 | * as I don't actually mind if I see a new cpuset pointer but | ||
626 | * an old value of mems_generation. However this really only | ||
627 | * matters on alpha systems using cpusets heavily. If I dropped | ||
628 | * that rcu_dereference(), it would save them a memory barrier. | ||
629 | * For all other arch's, rcu_dereference is a no-op anyway, and for | ||
630 | * alpha systems not using cpusets, another planned optimization, | ||
631 | * avoiding the rcu critical section for tasks in the root cpuset | ||
632 | * which is statically allocated, so can't vanish, will make this | ||
633 | * irrelevant. Better to use RCU as intended, than to engage in | ||
634 | * some cute trick to save a memory barrier that is impossible to | ||
635 | * test, for alpha systems using cpusets heavily, which might not | ||
636 | * even exist. | ||
587 | * | 637 | * |
588 | * This routine is needed to update the per-task mems_allowed data, | 638 | * This routine is needed to update the per-task mems_allowed data, |
589 | * within the tasks context, when it is trying to allocate memory | 639 | * within the tasks context, when it is trying to allocate memory |
@@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) | |||
591 | * task has been modifying its cpuset. | 641 | * task has been modifying its cpuset. |
592 | */ | 642 | */ |
593 | 643 | ||
594 | static void refresh_mems(void) | 644 | void cpuset_update_task_memory_state() |
595 | { | 645 | { |
596 | int my_cpusets_mem_gen; | 646 | int my_cpusets_mem_gen; |
647 | struct task_struct *tsk = current; | ||
648 | struct cpuset *cs; | ||
597 | 649 | ||
598 | task_lock(current); | 650 | if (tsk->cpuset == &top_cpuset) { |
599 | my_cpusets_mem_gen = current->cpuset->mems_generation; | 651 | /* Don't need rcu for top_cpuset. It's never freed. */ |
600 | task_unlock(current); | 652 | my_cpusets_mem_gen = top_cpuset.mems_generation; |
601 | 653 | } else { | |
602 | if (current->cpuset_mems_generation != my_cpusets_mem_gen) { | 654 | rcu_read_lock(); |
603 | struct cpuset *cs; | 655 | cs = rcu_dereference(tsk->cpuset); |
604 | nodemask_t oldmem = current->mems_allowed; | 656 | my_cpusets_mem_gen = cs->mems_generation; |
657 | rcu_read_unlock(); | ||
658 | } | ||
605 | 659 | ||
660 | if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { | ||
606 | down(&callback_sem); | 661 | down(&callback_sem); |
607 | task_lock(current); | 662 | task_lock(tsk); |
608 | cs = current->cpuset; | 663 | cs = tsk->cpuset; /* Maybe changed when task not locked */ |
609 | guarantee_online_mems(cs, &current->mems_allowed); | 664 | guarantee_online_mems(cs, &tsk->mems_allowed); |
610 | current->cpuset_mems_generation = cs->mems_generation; | 665 | tsk->cpuset_mems_generation = cs->mems_generation; |
611 | task_unlock(current); | 666 | task_unlock(tsk); |
612 | up(&callback_sem); | 667 | up(&callback_sem); |
613 | if (!nodes_equal(oldmem, current->mems_allowed)) | 668 | mpol_rebind_task(tsk, &tsk->mems_allowed); |
614 | numa_policy_rebind(&oldmem, &current->mems_allowed); | |
615 | } | 669 | } |
616 | } | 670 | } |
617 | 671 | ||
@@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
766 | } | 820 | } |
767 | 821 | ||
768 | /* | 822 | /* |
823 | * Handle user request to change the 'mems' memory placement | ||
824 | * of a cpuset. Needs to validate the request, update the | ||
825 | * cpusets mems_allowed and mems_generation, and for each | ||
826 | * task in the cpuset, rebind any vma mempolicies and if | ||
827 | * the cpuset is marked 'memory_migrate', migrate the tasks | ||
828 | * pages to the new memory. | ||
829 | * | ||
769 | * Call with manage_sem held. May take callback_sem during call. | 830 | * Call with manage_sem held. May take callback_sem during call. |
831 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
832 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
833 | * their mempolicies to the cpusets new mems_allowed. | ||
770 | */ | 834 | */ |
771 | 835 | ||
772 | static int update_nodemask(struct cpuset *cs, char *buf) | 836 | static int update_nodemask(struct cpuset *cs, char *buf) |
773 | { | 837 | { |
774 | struct cpuset trialcs; | 838 | struct cpuset trialcs; |
839 | nodemask_t oldmem; | ||
840 | struct task_struct *g, *p; | ||
841 | struct mm_struct **mmarray; | ||
842 | int i, n, ntasks; | ||
843 | int migrate; | ||
844 | int fudge; | ||
775 | int retval; | 845 | int retval; |
776 | 846 | ||
777 | trialcs = *cs; | 847 | trialcs = *cs; |
778 | retval = nodelist_parse(buf, trialcs.mems_allowed); | 848 | retval = nodelist_parse(buf, trialcs.mems_allowed); |
779 | if (retval < 0) | 849 | if (retval < 0) |
780 | return retval; | 850 | goto done; |
781 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); | 851 | nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); |
782 | if (nodes_empty(trialcs.mems_allowed)) | 852 | oldmem = cs->mems_allowed; |
783 | return -ENOSPC; | 853 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { |
854 | retval = 0; /* Too easy - nothing to do */ | ||
855 | goto done; | ||
856 | } | ||
857 | if (nodes_empty(trialcs.mems_allowed)) { | ||
858 | retval = -ENOSPC; | ||
859 | goto done; | ||
860 | } | ||
784 | retval = validate_change(cs, &trialcs); | 861 | retval = validate_change(cs, &trialcs); |
785 | if (retval == 0) { | 862 | if (retval < 0) |
786 | down(&callback_sem); | 863 | goto done; |
787 | cs->mems_allowed = trialcs.mems_allowed; | 864 | |
788 | atomic_inc(&cpuset_mems_generation); | 865 | down(&callback_sem); |
789 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 866 | cs->mems_allowed = trialcs.mems_allowed; |
790 | up(&callback_sem); | 867 | atomic_inc(&cpuset_mems_generation); |
868 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | ||
869 | up(&callback_sem); | ||
870 | |||
871 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ | ||
872 | |||
873 | fudge = 10; /* spare mmarray[] slots */ | ||
874 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ | ||
875 | retval = -ENOMEM; | ||
876 | |||
877 | /* | ||
878 | * Allocate mmarray[] to hold mm reference for each task | ||
879 | * in cpuset cs. Can't kmalloc GFP_KERNEL while holding | ||
880 | * tasklist_lock. We could use GFP_ATOMIC, but with a | ||
881 | * few more lines of code, we can retry until we get a big | ||
882 | * enough mmarray[] w/o using GFP_ATOMIC. | ||
883 | */ | ||
884 | while (1) { | ||
885 | ntasks = atomic_read(&cs->count); /* guess */ | ||
886 | ntasks += fudge; | ||
887 | mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); | ||
888 | if (!mmarray) | ||
889 | goto done; | ||
890 | write_lock_irq(&tasklist_lock); /* block fork */ | ||
891 | if (atomic_read(&cs->count) <= ntasks) | ||
892 | break; /* got enough */ | ||
893 | write_unlock_irq(&tasklist_lock); /* try again */ | ||
894 | kfree(mmarray); | ||
791 | } | 895 | } |
896 | |||
897 | n = 0; | ||
898 | |||
899 | /* Load up mmarray[] with mm reference for each task in cpuset. */ | ||
900 | do_each_thread(g, p) { | ||
901 | struct mm_struct *mm; | ||
902 | |||
903 | if (n >= ntasks) { | ||
904 | printk(KERN_WARNING | ||
905 | "Cpuset mempolicy rebind incomplete.\n"); | ||
906 | continue; | ||
907 | } | ||
908 | if (p->cpuset != cs) | ||
909 | continue; | ||
910 | mm = get_task_mm(p); | ||
911 | if (!mm) | ||
912 | continue; | ||
913 | mmarray[n++] = mm; | ||
914 | } while_each_thread(g, p); | ||
915 | write_unlock_irq(&tasklist_lock); | ||
916 | |||
917 | /* | ||
918 | * Now that we've dropped the tasklist spinlock, we can | ||
919 | * rebind the vma mempolicies of each mm in mmarray[] to their | ||
920 | * new cpuset, and release that mm. The mpol_rebind_mm() | ||
921 | * call takes mmap_sem, which we couldn't take while holding | ||
922 | * tasklist_lock. Forks can happen again now - the mpol_copy() | ||
923 | * cpuset_being_rebound check will catch such forks, and rebind | ||
924 | * their vma mempolicies too. Because we still hold the global | ||
925 | * cpuset manage_sem, we know that no other rebind effort will | ||
926 | * be contending for the global variable cpuset_being_rebound. | ||
927 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | ||
928 | * is idempotent. Also migrate pages in each mm to new nodes. | ||
929 | */ | ||
930 | migrate = is_memory_migrate(cs); | ||
931 | for (i = 0; i < n; i++) { | ||
932 | struct mm_struct *mm = mmarray[i]; | ||
933 | |||
934 | mpol_rebind_mm(mm, &cs->mems_allowed); | ||
935 | if (migrate) { | ||
936 | do_migrate_pages(mm, &oldmem, &cs->mems_allowed, | ||
937 | MPOL_MF_MOVE_ALL); | ||
938 | } | ||
939 | mmput(mm); | ||
940 | } | ||
941 | |||
942 | /* We're done rebinding vma's to this cpusets new mems_allowed. */ | ||
943 | kfree(mmarray); | ||
944 | set_cpuset_being_rebound(NULL); | ||
945 | retval = 0; | ||
946 | done: | ||
792 | return retval; | 947 | return retval; |
793 | } | 948 | } |
794 | 949 | ||
795 | /* | 950 | /* |
951 | * Call with manage_sem held. | ||
952 | */ | ||
953 | |||
954 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | ||
955 | { | ||
956 | if (simple_strtoul(buf, NULL, 10) != 0) | ||
957 | cpuset_memory_pressure_enabled = 1; | ||
958 | else | ||
959 | cpuset_memory_pressure_enabled = 0; | ||
960 | return 0; | ||
961 | } | ||
962 | |||
963 | /* | ||
796 | * update_flag - read a 0 or a 1 in a file and update associated flag | 964 | * update_flag - read a 0 or a 1 in a file and update associated flag |
797 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 965 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
798 | * CS_NOTIFY_ON_RELEASE) | 966 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) |
799 | * cs: the cpuset to update | 967 | * cs: the cpuset to update |
800 | * buf: the buffer where we read the 0 or 1 | 968 | * buf: the buffer where we read the 0 or 1 |
801 | * | 969 | * |
@@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
834 | } | 1002 | } |
835 | 1003 | ||
836 | /* | 1004 | /* |
1005 | * Frequency meter - How fast is some event occurring? | |
1006 | * | ||
1007 | * These routines manage a digitally filtered, constant time based, | ||
1008 | * event frequency meter. There are four routines: | ||
1009 | * fmeter_init() - initialize a frequency meter. | ||
1010 | * fmeter_markevent() - called each time the event happens. | ||
1011 | * fmeter_getrate() - returns the recent rate of such events. | ||
1012 | * fmeter_update() - internal routine used to update fmeter. | ||
1013 | * | ||
1014 | * A common data structure is passed to each of these routines, | ||
1015 | * which is used to keep track of the state required to manage the | ||
1016 | * frequency meter and its digital filter. | ||
1017 | * | ||
1018 | * The filter works on the number of events marked per unit time. | ||
1019 | * The filter is single-pole low-pass recursive (IIR). The time unit | ||
1020 | * is 1 second. Arithmetic is done using 32-bit integers scaled to | ||
1021 | * simulate 3 decimal digits of precision (multiplied by 1000). | ||
1022 | * | ||
1023 | * With an FM_COEF of 933, and a time base of 1 second, the filter | ||
1024 | * has a half-life of 10 seconds, meaning that if the events quit | ||
1025 | * happening, then the rate returned from the fmeter_getrate() | ||
1026 | * will be cut in half each 10 seconds, until it converges to zero. | ||
1027 | * | ||
1028 | * It is not worth doing a real infinitely recursive filter. If more | ||
1029 | * than FM_MAXTICKS ticks have elapsed since the last filter event, | ||
1030 | * just compute FM_MAXTICKS ticks worth, by which point the level | ||
1031 | * will be stable. | ||
1032 | * | ||
1033 | * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid | ||
1034 | * arithmetic overflow in the fmeter_update() routine. | ||
1035 | * | ||
1036 | * Given the simple 32 bit integer arithmetic used, this meter works | ||
1037 | * best for reporting rates between one per millisecond (msec) and | ||
1038 | * one per 32 (approx) seconds. At constant rates faster than one | ||
1039 | * per msec it maxes out at values just under 1,000,000. At constant | ||
1040 | * rates between one per msec, and one per second it will stabilize | ||
1041 | * to a value N*1000, where N is the rate of events per second. | ||
1042 | * At constant rates between one per second and one per 32 seconds, | ||
1043 | * it will be choppy, moving up on the seconds that have an event, | ||
1044 | * and then decaying until the next event. At rates slower than | ||
1045 | * about one in 32 seconds, it decays all the way back to zero between | ||
1046 | * each event. | ||
1047 | */ | ||
1048 | |||
1049 | #define FM_COEF 933 /* coefficient for half-life of 10 secs */ | ||
1050 | #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ | ||
1051 | #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ | ||
1052 | #define FM_SCALE 1000 /* faux fixed point scale */ | ||
1053 | |||
1054 | /* Initialize a frequency meter */ | ||
1055 | static void fmeter_init(struct fmeter *fmp) | ||
1056 | { | ||
1057 | fmp->cnt = 0; | ||
1058 | fmp->val = 0; | ||
1059 | fmp->time = 0; | ||
1060 | spin_lock_init(&fmp->lock); | ||
1061 | } | ||
1062 | |||
1063 | /* Internal meter update - process cnt events and update value */ | ||
1064 | static void fmeter_update(struct fmeter *fmp) | ||
1065 | { | ||
1066 | time_t now = get_seconds(); | ||
1067 | time_t ticks = now - fmp->time; | ||
1068 | |||
1069 | if (ticks == 0) | ||
1070 | return; | ||
1071 | |||
1072 | ticks = min(FM_MAXTICKS, ticks); | ||
1073 | while (ticks-- > 0) | ||
1074 | fmp->val = (FM_COEF * fmp->val) / FM_SCALE; | ||
1075 | fmp->time = now; | ||
1076 | |||
1077 | fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; | ||
1078 | fmp->cnt = 0; | ||
1079 | } | ||
1080 | |||
1081 | /* Process any previous ticks, then bump cnt by one (times scale). */ | ||
1082 | static void fmeter_markevent(struct fmeter *fmp) | ||
1083 | { | ||
1084 | spin_lock(&fmp->lock); | ||
1085 | fmeter_update(fmp); | ||
1086 | fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); | ||
1087 | spin_unlock(&fmp->lock); | ||
1088 | } | ||
1089 | |||
1090 | /* Process any previous ticks, then return current value. */ | ||
1091 | static int fmeter_getrate(struct fmeter *fmp) | ||
1092 | { | ||
1093 | int val; | ||
1094 | |||
1095 | spin_lock(&fmp->lock); | ||
1096 | fmeter_update(fmp); | ||
1097 | val = fmp->val; | ||
1098 | spin_unlock(&fmp->lock); | ||
1099 | return val; | ||
1100 | } | ||
1101 | |||
1102 | /* | ||
837 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | 1103 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly |
838 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | 1104 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be |
839 | * notified on release. | 1105 | * notified on release. |
@@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
848 | struct task_struct *tsk; | 1114 | struct task_struct *tsk; |
849 | struct cpuset *oldcs; | 1115 | struct cpuset *oldcs; |
850 | cpumask_t cpus; | 1116 | cpumask_t cpus; |
1117 | nodemask_t from, to; | ||
1118 | struct mm_struct *mm; | ||
851 | 1119 | ||
852 | if (sscanf(pidbuf, "%d", &pid) != 1) | 1120 | if (sscanf(pidbuf, "%d", &pid) != 1) |
853 | return -EIO; | 1121 | return -EIO; |
@@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
887 | return -ESRCH; | 1155 | return -ESRCH; |
888 | } | 1156 | } |
889 | atomic_inc(&cs->count); | 1157 | atomic_inc(&cs->count); |
890 | tsk->cpuset = cs; | 1158 | rcu_assign_pointer(tsk->cpuset, cs); |
891 | task_unlock(tsk); | 1159 | task_unlock(tsk); |
892 | 1160 | ||
893 | guarantee_online_cpus(cs, &cpus); | 1161 | guarantee_online_cpus(cs, &cpus); |
894 | set_cpus_allowed(tsk, cpus); | 1162 | set_cpus_allowed(tsk, cpus); |
895 | 1163 | ||
1164 | from = oldcs->mems_allowed; | ||
1165 | to = cs->mems_allowed; | ||
1166 | |||
896 | up(&callback_sem); | 1167 | up(&callback_sem); |
1168 | |||
1169 | mm = get_task_mm(tsk); | ||
1170 | if (mm) { | ||
1171 | mpol_rebind_mm(mm, &to); | ||
1172 | mmput(mm); | ||
1173 | } | ||
1174 | |||
1175 | if (is_memory_migrate(cs)) | ||
1176 | do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); | ||
897 | put_task_struct(tsk); | 1177 | put_task_struct(tsk); |
1178 | synchronize_rcu(); | ||
898 | if (atomic_dec_and_test(&oldcs->count)) | 1179 | if (atomic_dec_and_test(&oldcs->count)) |
899 | check_for_release(oldcs, ppathbuf); | 1180 | check_for_release(oldcs, ppathbuf); |
900 | return 0; | 1181 | return 0; |
@@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
905 | typedef enum { | 1186 | typedef enum { |
906 | FILE_ROOT, | 1187 | FILE_ROOT, |
907 | FILE_DIR, | 1188 | FILE_DIR, |
1189 | FILE_MEMORY_MIGRATE, | ||
908 | FILE_CPULIST, | 1190 | FILE_CPULIST, |
909 | FILE_MEMLIST, | 1191 | FILE_MEMLIST, |
910 | FILE_CPU_EXCLUSIVE, | 1192 | FILE_CPU_EXCLUSIVE, |
911 | FILE_MEM_EXCLUSIVE, | 1193 | FILE_MEM_EXCLUSIVE, |
912 | FILE_NOTIFY_ON_RELEASE, | 1194 | FILE_NOTIFY_ON_RELEASE, |
1195 | FILE_MEMORY_PRESSURE_ENABLED, | ||
1196 | FILE_MEMORY_PRESSURE, | ||
913 | FILE_TASKLIST, | 1197 | FILE_TASKLIST, |
914 | } cpuset_filetype_t; | 1198 | } cpuset_filetype_t; |
915 | 1199 | ||
@@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
960 | case FILE_NOTIFY_ON_RELEASE: | 1244 | case FILE_NOTIFY_ON_RELEASE: |
961 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); | 1245 | retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); |
962 | break; | 1246 | break; |
1247 | case FILE_MEMORY_MIGRATE: | ||
1248 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | ||
1249 | break; | ||
1250 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
1251 | retval = update_memory_pressure_enabled(cs, buffer); | ||
1252 | break; | ||
1253 | case FILE_MEMORY_PRESSURE: | ||
1254 | retval = -EACCES; | ||
1255 | break; | ||
963 | case FILE_TASKLIST: | 1256 | case FILE_TASKLIST: |
964 | retval = attach_task(cs, buffer, &pathbuf); | 1257 | retval = attach_task(cs, buffer, &pathbuf); |
965 | break; | 1258 | break; |
@@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
1060 | case FILE_NOTIFY_ON_RELEASE: | 1353 | case FILE_NOTIFY_ON_RELEASE: |
1061 | *s++ = notify_on_release(cs) ? '1' : '0'; | 1354 | *s++ = notify_on_release(cs) ? '1' : '0'; |
1062 | break; | 1355 | break; |
1356 | case FILE_MEMORY_MIGRATE: | ||
1357 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | ||
1358 | break; | ||
1359 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
1360 | *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; | ||
1361 | break; | ||
1362 | case FILE_MEMORY_PRESSURE: | ||
1363 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); | ||
1364 | break; | ||
1063 | default: | 1365 | default: |
1064 | retval = -EINVAL; | 1366 | retval = -EINVAL; |
1065 | goto out; | 1367 | goto out; |
@@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode) | |||
1178 | 1480 | ||
1179 | /* | 1481 | /* |
1180 | * cpuset_create_dir - create a directory for an object. | 1482 | * cpuset_create_dir - create a directory for an object. |
1181 | * cs: the cpuset we create the directory for. | 1483 | * cs: the cpuset we create the directory for. |
1182 | * It must have a valid ->parent field | 1484 | * It must have a valid ->parent field |
1183 | * And we are going to fill its ->dentry field. | 1485 | * And we are going to fill its ->dentry field. |
1184 | * name: The name to give to the cpuset directory. Will be copied. | 1486 | * name: The name to give to the cpuset directory. Will be copied. |
@@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = { | |||
1408 | .private = FILE_NOTIFY_ON_RELEASE, | 1710 | .private = FILE_NOTIFY_ON_RELEASE, |
1409 | }; | 1711 | }; |
1410 | 1712 | ||
1713 | static struct cftype cft_memory_migrate = { | ||
1714 | .name = "memory_migrate", | ||
1715 | .private = FILE_MEMORY_MIGRATE, | ||
1716 | }; | ||
1717 | |||
1718 | static struct cftype cft_memory_pressure_enabled = { | ||
1719 | .name = "memory_pressure_enabled", | ||
1720 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1721 | }; | ||
1722 | |||
1723 | static struct cftype cft_memory_pressure = { | ||
1724 | .name = "memory_pressure", | ||
1725 | .private = FILE_MEMORY_PRESSURE, | ||
1726 | }; | ||
1727 | |||
1411 | static int cpuset_populate_dir(struct dentry *cs_dentry) | 1728 | static int cpuset_populate_dir(struct dentry *cs_dentry) |
1412 | { | 1729 | { |
1413 | int err; | 1730 | int err; |
@@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
1422 | return err; | 1739 | return err; |
1423 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) | 1740 | if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) |
1424 | return err; | 1741 | return err; |
1742 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) | ||
1743 | return err; | ||
1744 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) | ||
1745 | return err; | ||
1425 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | 1746 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) |
1426 | return err; | 1747 | return err; |
1427 | return 0; | 1748 | return 0; |
@@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1446 | return -ENOMEM; | 1767 | return -ENOMEM; |
1447 | 1768 | ||
1448 | down(&manage_sem); | 1769 | down(&manage_sem); |
1449 | refresh_mems(); | 1770 | cpuset_update_task_memory_state(); |
1450 | cs->flags = 0; | 1771 | cs->flags = 0; |
1451 | if (notify_on_release(parent)) | 1772 | if (notify_on_release(parent)) |
1452 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1773 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
@@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1457 | INIT_LIST_HEAD(&cs->children); | 1778 | INIT_LIST_HEAD(&cs->children); |
1458 | atomic_inc(&cpuset_mems_generation); | 1779 | atomic_inc(&cpuset_mems_generation); |
1459 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 1780 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
1781 | fmeter_init(&cs->fmeter); | ||
1460 | 1782 | ||
1461 | cs->parent = parent; | 1783 | cs->parent = parent; |
1462 | 1784 | ||
1463 | down(&callback_sem); | 1785 | down(&callback_sem); |
1464 | list_add(&cs->sibling, &cs->parent->children); | 1786 | list_add(&cs->sibling, &cs->parent->children); |
1787 | number_of_cpusets++; | ||
1465 | up(&callback_sem); | 1788 | up(&callback_sem); |
1466 | 1789 | ||
1467 | err = cpuset_create_dir(cs, name, mode); | 1790 | err = cpuset_create_dir(cs, name, mode); |
@@ -1503,7 +1826,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1503 | /* the vfs holds both inode->i_sem already */ | 1826 | /* the vfs holds both inode->i_sem already */ |
1504 | 1827 | ||
1505 | down(&manage_sem); | 1828 | down(&manage_sem); |
1506 | refresh_mems(); | 1829 | cpuset_update_task_memory_state(); |
1507 | if (atomic_read(&cs->count) > 0) { | 1830 | if (atomic_read(&cs->count) > 0) { |
1508 | up(&manage_sem); | 1831 | up(&manage_sem); |
1509 | return -EBUSY; | 1832 | return -EBUSY; |
@@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1524 | spin_unlock(&d->d_lock); | 1847 | spin_unlock(&d->d_lock); |
1525 | cpuset_d_remove_dir(d); | 1848 | cpuset_d_remove_dir(d); |
1526 | dput(d); | 1849 | dput(d); |
1850 | number_of_cpusets--; | ||
1527 | up(&callback_sem); | 1851 | up(&callback_sem); |
1528 | if (list_empty(&parent->children)) | 1852 | if (list_empty(&parent->children)) |
1529 | check_for_release(parent, &pathbuf); | 1853 | check_for_release(parent, &pathbuf); |
@@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1532 | return 0; | 1856 | return 0; |
1533 | } | 1857 | } |
1534 | 1858 | ||
1859 | /* | ||
1860 | * cpuset_init_early - just enough so that the calls to | ||
1861 | * cpuset_update_task_memory_state() in early init code | ||
1862 | * are harmless. | ||
1863 | */ | ||
1864 | |||
1865 | int __init cpuset_init_early(void) | ||
1866 | { | ||
1867 | struct task_struct *tsk = current; | ||
1868 | |||
1869 | tsk->cpuset = &top_cpuset; | ||
1870 | tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); | ||
1871 | return 0; | ||
1872 | } | ||
1873 | |||
1535 | /** | 1874 | /** |
1536 | * cpuset_init - initialize cpusets at system boot | 1875 | * cpuset_init - initialize cpusets at system boot |
1537 | * | 1876 | * |
@@ -1546,6 +1885,7 @@ int __init cpuset_init(void) | |||
1546 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1885 | top_cpuset.cpus_allowed = CPU_MASK_ALL; |
1547 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1886 | top_cpuset.mems_allowed = NODE_MASK_ALL; |
1548 | 1887 | ||
1888 | fmeter_init(&top_cpuset.fmeter); | ||
1549 | atomic_inc(&cpuset_mems_generation); | 1889 | atomic_inc(&cpuset_mems_generation); |
1550 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); | 1890 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); |
1551 | 1891 | ||
@@ -1566,7 +1906,11 @@ int __init cpuset_init(void) | |||
1566 | root->d_inode->i_nlink++; | 1906 | root->d_inode->i_nlink++; |
1567 | top_cpuset.dentry = root; | 1907 | top_cpuset.dentry = root; |
1568 | root->d_inode->i_op = &cpuset_dir_inode_operations; | 1908 | root->d_inode->i_op = &cpuset_dir_inode_operations; |
1909 | number_of_cpusets = 1; | ||
1569 | err = cpuset_populate_dir(root); | 1910 | err = cpuset_populate_dir(root); |
1911 | /* memory_pressure_enabled is in root cpuset only */ | ||
1912 | if (err == 0) | ||
1913 | err = cpuset_add_file(root, &cft_memory_pressure_enabled); | ||
1570 | out: | 1914 | out: |
1571 | return err; | 1915 | return err; |
1572 | } | 1916 | } |
@@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child) | |||
1632 | * | 1976 | * |
1633 | * We don't need to task_lock() this reference to tsk->cpuset, | 1977 | * We don't need to task_lock() this reference to tsk->cpuset, |
1634 | * because tsk is already marked PF_EXITING, so attach_task() won't | 1978 | * because tsk is already marked PF_EXITING, so attach_task() won't |
1635 | * mess with it. | 1979 | * mess with it, or task is a failed fork, never visible to attach_task. |
1636 | **/ | 1980 | **/ |
1637 | 1981 | ||
1638 | void cpuset_exit(struct task_struct *tsk) | 1982 | void cpuset_exit(struct task_struct *tsk) |
1639 | { | 1983 | { |
1640 | struct cpuset *cs; | 1984 | struct cpuset *cs; |
1641 | 1985 | ||
1642 | BUG_ON(!(tsk->flags & PF_EXITING)); | ||
1643 | |||
1644 | cs = tsk->cpuset; | 1986 | cs = tsk->cpuset; |
1645 | tsk->cpuset = NULL; | 1987 | tsk->cpuset = NULL; |
1646 | 1988 | ||
@@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk) | |||
1667 | * tasks cpuset. | 2009 | * tasks cpuset. |
1668 | **/ | 2010 | **/ |
1669 | 2011 | ||
1670 | cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) | 2012 | cpumask_t cpuset_cpus_allowed(struct task_struct *tsk) |
1671 | { | 2013 | { |
1672 | cpumask_t mask; | 2014 | cpumask_t mask; |
1673 | 2015 | ||
1674 | down(&callback_sem); | 2016 | down(&callback_sem); |
1675 | task_lock((struct task_struct *)tsk); | 2017 | task_lock(tsk); |
1676 | guarantee_online_cpus(tsk->cpuset, &mask); | 2018 | guarantee_online_cpus(tsk->cpuset, &mask); |
1677 | task_unlock((struct task_struct *)tsk); | 2019 | task_unlock(tsk); |
1678 | up(&callback_sem); | 2020 | up(&callback_sem); |
1679 | 2021 | ||
1680 | return mask; | 2022 | return mask; |
@@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void) | |||
1686 | } | 2028 | } |
1687 | 2029 | ||
1688 | /** | 2030 | /** |
1689 | * cpuset_update_current_mems_allowed - update mems parameters to new values | 2031 | * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset. |
1690 | * | 2032 | * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed. |
1691 | * If the current tasks cpusets mems_allowed changed behind our backs, | ||
1692 | * update current->mems_allowed and mems_generation to the new value. | ||
1693 | * Do not call this routine if in_interrupt(). | ||
1694 | * | 2033 | * |
1695 | * Call without callback_sem or task_lock() held. May be called | 2034 | * Description: Returns the nodemask_t mems_allowed of the cpuset |
1696 | * with or without manage_sem held. Unless exiting, it will acquire | 2035 | * attached to the specified @tsk. Guaranteed to return some non-empty |
1697 | * task_lock(). Also might acquire callback_sem during call to | 2036 | * subset of node_online_map, even if this means going outside the |
1698 | * refresh_mems(). | 2037 | * tasks cpuset. |
1699 | */ | 2038 | **/ |
1700 | 2039 | ||
1701 | void cpuset_update_current_mems_allowed(void) | 2040 | nodemask_t cpuset_mems_allowed(struct task_struct *tsk) |
1702 | { | 2041 | { |
1703 | struct cpuset *cs; | 2042 | nodemask_t mask; |
1704 | int need_to_refresh = 0; | ||
1705 | 2043 | ||
1706 | task_lock(current); | 2044 | down(&callback_sem); |
1707 | cs = current->cpuset; | 2045 | task_lock(tsk); |
1708 | if (!cs) | 2046 | guarantee_online_mems(tsk->cpuset, &mask); |
1709 | goto done; | 2047 | task_unlock(tsk); |
1710 | if (current->cpuset_mems_generation != cs->mems_generation) | 2048 | up(&callback_sem); |
1711 | need_to_refresh = 1; | ||
1712 | done: | ||
1713 | task_unlock(current); | ||
1714 | if (need_to_refresh) | ||
1715 | refresh_mems(); | ||
1716 | } | ||
1717 | 2049 | ||
1718 | /** | 2050 | return mask; |
1719 | * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed | ||
1720 | * @nodes: pointer to a node bitmap that is and-ed with mems_allowed | ||
1721 | */ | ||
1722 | void cpuset_restrict_to_mems_allowed(unsigned long *nodes) | ||
1723 | { | ||
1724 | bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), | ||
1725 | MAX_NUMNODES); | ||
1726 | } | 2051 | } |
1727 | 2052 | ||
1728 | /** | 2053 | /** |
@@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) | |||
1795 | * GFP_USER - only nodes in current tasks mems allowed ok. | 2120 | * GFP_USER - only nodes in current tasks mems allowed ok. |
1796 | **/ | 2121 | **/ |
1797 | 2122 | ||
1798 | int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) | 2123 | int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) |
1799 | { | 2124 | { |
1800 | int node; /* node that zone z is on */ | 2125 | int node; /* node that zone z is on */ |
1801 | const struct cpuset *cs; /* current cpuset ancestors */ | 2126 | const struct cpuset *cs; /* current cpuset ancestors */ |
@@ -1867,6 +2192,42 @@ done: | |||
1867 | } | 2192 | } |
1868 | 2193 | ||
1869 | /* | 2194 | /* |
2195 | * Collection of memory_pressure is suppressed unless | ||
2196 | * this flag is enabled by writing "1" to the special | ||
2197 | * cpuset file 'memory_pressure_enabled' in the root cpuset. | ||
2198 | */ | ||
2199 | |||
2200 | int cpuset_memory_pressure_enabled __read_mostly; | ||
2201 | |||
2202 | /** | ||
2203 | * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. | ||
2204 | * | ||
2205 | * Keep a running average of the rate of synchronous (direct) | ||
2206 | * page reclaim efforts initiated by tasks in each cpuset. | ||
2207 | * | ||
2208 | * This represents the rate at which some task in the cpuset | ||
2209 | * ran low on memory on all nodes it was allowed to use, and | ||
2210 | * had to enter the kernels page reclaim code in an effort to | ||
2211 | * create more free memory by tossing clean pages or swapping | ||
2212 | * or writing dirty pages. | ||
2213 | * | ||
2214 | * Display to user space in the per-cpuset read-only file | ||
2215 | * "memory_pressure". Value displayed is an integer | ||
2216 | * representing the recent rate of entry into the synchronous | ||
2217 | * (direct) page reclaim by any task attached to the cpuset. | ||
2218 | **/ | ||
2219 | |||
2220 | void __cpuset_memory_pressure_bump(void) | ||
2221 | { | ||
2222 | struct cpuset *cs; | ||
2223 | |||
2224 | task_lock(current); | ||
2225 | cs = current->cpuset; | ||
2226 | fmeter_markevent(&cs->fmeter); | ||
2227 | task_unlock(current); | ||
2228 | } | ||
2229 | |||
2230 | /* | ||
1870 | * proc_cpuset_show() | 2231 | * proc_cpuset_show() |
1871 | * - Print tasks cpuset path into seq_file. | 2232 | * - Print tasks cpuset path into seq_file. |
1872 | * - Used for /proc/<pid>/cpuset. | 2233 | * - Used for /proc/<pid>/cpuset. |
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 334c37f5218a..fccb27dbc623 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c | |||
@@ -14,10 +14,12 @@ | |||
14 | 14 | ||
15 | #include <asm/io.h> | 15 | #include <asm/io.h> |
16 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
17 | #include <asm/kexec.h> | ||
17 | 18 | ||
18 | /* Stores the physical address of elf header of crash image. */ | 19 | /* Stores the physical address of elf header of crash image. */ |
19 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; | 20 | unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; |
20 | 21 | ||
22 | #ifndef HAVE_ARCH_COPY_OLDMEM_PAGE | ||
21 | /** | 23 | /** |
22 | * copy_oldmem_page - copy one page from "oldmem" | 24 | * copy_oldmem_page - copy one page from "oldmem" |
23 | * @pfn: page frame number to be copied | 25 | * @pfn: page frame number to be copied |
@@ -59,3 +61,4 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, | |||
59 | kfree(page); | 61 | kfree(page); |
60 | return csize; | 62 | return csize; |
61 | } | 63 | } |
64 | #endif | ||
diff --git a/kernel/exit.c b/kernel/exit.c index ee515683b92d..caceabf3f230 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -72,7 +72,6 @@ repeat: | |||
72 | __ptrace_unlink(p); | 72 | __ptrace_unlink(p); |
73 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 73 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
74 | __exit_signal(p); | 74 | __exit_signal(p); |
75 | __exit_sighand(p); | ||
76 | /* | 75 | /* |
77 | * Note that the fastpath in sys_times depends on __exit_signal having | 76 | * Note that the fastpath in sys_times depends on __exit_signal having |
78 | * updated the counters before a task is removed from the tasklist of | 77 | * updated the counters before a task is removed from the tasklist of |
@@ -258,7 +257,7 @@ static inline void reparent_to_init(void) | |||
258 | 257 | ||
259 | void __set_special_pids(pid_t session, pid_t pgrp) | 258 | void __set_special_pids(pid_t session, pid_t pgrp) |
260 | { | 259 | { |
261 | struct task_struct *curr = current; | 260 | struct task_struct *curr = current->group_leader; |
262 | 261 | ||
263 | if (curr->signal->session != session) { | 262 | if (curr->signal->session != session) { |
264 | detach_pid(curr, PIDTYPE_SID); | 263 | detach_pid(curr, PIDTYPE_SID); |
@@ -926,7 +925,6 @@ do_group_exit(int exit_code) | |||
926 | /* Another thread got here before we took the lock. */ | 925 | /* Another thread got here before we took the lock. */ |
927 | exit_code = sig->group_exit_code; | 926 | exit_code = sig->group_exit_code; |
928 | else { | 927 | else { |
929 | sig->flags = SIGNAL_GROUP_EXIT; | ||
930 | sig->group_exit_code = exit_code; | 928 | sig->group_exit_code = exit_code; |
931 | zap_other_threads(current); | 929 | zap_other_threads(current); |
932 | } | 930 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index fb8572a42297..72e3252c6763 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -743,6 +743,14 @@ int unshare_files(void) | |||
743 | 743 | ||
744 | EXPORT_SYMBOL(unshare_files); | 744 | EXPORT_SYMBOL(unshare_files); |
745 | 745 | ||
746 | void sighand_free_cb(struct rcu_head *rhp) | ||
747 | { | ||
748 | struct sighand_struct *sp; | ||
749 | |||
750 | sp = container_of(rhp, struct sighand_struct, rcu); | ||
751 | kmem_cache_free(sighand_cachep, sp); | ||
752 | } | ||
753 | |||
746 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) | 754 | static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) |
747 | { | 755 | { |
748 | struct sighand_struct *sig; | 756 | struct sighand_struct *sig; |
@@ -752,7 +760,7 @@ static inline int copy_sighand(unsigned long clone_flags, struct task_struct * t | |||
752 | return 0; | 760 | return 0; |
753 | } | 761 | } |
754 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); | 762 | sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); |
755 | tsk->sighand = sig; | 763 | rcu_assign_pointer(tsk->sighand, sig); |
756 | if (!sig) | 764 | if (!sig) |
757 | return -ENOMEM; | 765 | return -ENOMEM; |
758 | spin_lock_init(&sig->siglock); | 766 | spin_lock_init(&sig->siglock); |
@@ -803,9 +811,6 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts | |||
803 | sig->it_prof_expires = cputime_zero; | 811 | sig->it_prof_expires = cputime_zero; |
804 | sig->it_prof_incr = cputime_zero; | 812 | sig->it_prof_incr = cputime_zero; |
805 | 813 | ||
806 | sig->tty = current->signal->tty; | ||
807 | sig->pgrp = process_group(current); | ||
808 | sig->session = current->signal->session; | ||
809 | sig->leader = 0; /* session leadership doesn't inherit */ | 814 | sig->leader = 0; /* session leadership doesn't inherit */ |
810 | sig->tty_old_pgrp = 0; | 815 | sig->tty_old_pgrp = 0; |
811 | 816 | ||
@@ -964,12 +969,13 @@ static task_t *copy_process(unsigned long clone_flags, | |||
964 | p->io_context = NULL; | 969 | p->io_context = NULL; |
965 | p->io_wait = NULL; | 970 | p->io_wait = NULL; |
966 | p->audit_context = NULL; | 971 | p->audit_context = NULL; |
972 | cpuset_fork(p); | ||
967 | #ifdef CONFIG_NUMA | 973 | #ifdef CONFIG_NUMA |
968 | p->mempolicy = mpol_copy(p->mempolicy); | 974 | p->mempolicy = mpol_copy(p->mempolicy); |
969 | if (IS_ERR(p->mempolicy)) { | 975 | if (IS_ERR(p->mempolicy)) { |
970 | retval = PTR_ERR(p->mempolicy); | 976 | retval = PTR_ERR(p->mempolicy); |
971 | p->mempolicy = NULL; | 977 | p->mempolicy = NULL; |
972 | goto bad_fork_cleanup; | 978 | goto bad_fork_cleanup_cpuset; |
973 | } | 979 | } |
974 | #endif | 980 | #endif |
975 | 981 | ||
@@ -1127,25 +1133,19 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1127 | attach_pid(p, PIDTYPE_PID, p->pid); | 1133 | attach_pid(p, PIDTYPE_PID, p->pid); |
1128 | attach_pid(p, PIDTYPE_TGID, p->tgid); | 1134 | attach_pid(p, PIDTYPE_TGID, p->tgid); |
1129 | if (thread_group_leader(p)) { | 1135 | if (thread_group_leader(p)) { |
1136 | p->signal->tty = current->signal->tty; | ||
1137 | p->signal->pgrp = process_group(current); | ||
1138 | p->signal->session = current->signal->session; | ||
1130 | attach_pid(p, PIDTYPE_PGID, process_group(p)); | 1139 | attach_pid(p, PIDTYPE_PGID, process_group(p)); |
1131 | attach_pid(p, PIDTYPE_SID, p->signal->session); | 1140 | attach_pid(p, PIDTYPE_SID, p->signal->session); |
1132 | if (p->pid) | 1141 | if (p->pid) |
1133 | __get_cpu_var(process_counts)++; | 1142 | __get_cpu_var(process_counts)++; |
1134 | } | 1143 | } |
1135 | 1144 | ||
1136 | if (!current->signal->tty && p->signal->tty) | ||
1137 | p->signal->tty = NULL; | ||
1138 | |||
1139 | nr_threads++; | 1145 | nr_threads++; |
1140 | total_forks++; | 1146 | total_forks++; |
1141 | write_unlock_irq(&tasklist_lock); | 1147 | write_unlock_irq(&tasklist_lock); |
1142 | proc_fork_connector(p); | 1148 | proc_fork_connector(p); |
1143 | cpuset_fork(p); | ||
1144 | retval = 0; | ||
1145 | |||
1146 | fork_out: | ||
1147 | if (retval) | ||
1148 | return ERR_PTR(retval); | ||
1149 | return p; | 1149 | return p; |
1150 | 1150 | ||
1151 | bad_fork_cleanup_namespace: | 1151 | bad_fork_cleanup_namespace: |
@@ -1172,7 +1172,9 @@ bad_fork_cleanup_security: | |||
1172 | bad_fork_cleanup_policy: | 1172 | bad_fork_cleanup_policy: |
1173 | #ifdef CONFIG_NUMA | 1173 | #ifdef CONFIG_NUMA |
1174 | mpol_free(p->mempolicy); | 1174 | mpol_free(p->mempolicy); |
1175 | bad_fork_cleanup_cpuset: | ||
1175 | #endif | 1176 | #endif |
1177 | cpuset_exit(p); | ||
1176 | bad_fork_cleanup: | 1178 | bad_fork_cleanup: |
1177 | if (p->binfmt) | 1179 | if (p->binfmt) |
1178 | module_put(p->binfmt->module); | 1180 | module_put(p->binfmt->module); |
@@ -1184,7 +1186,8 @@ bad_fork_cleanup_count: | |||
1184 | free_uid(p->user); | 1186 | free_uid(p->user); |
1185 | bad_fork_free: | 1187 | bad_fork_free: |
1186 | free_task(p); | 1188 | free_task(p); |
1187 | goto fork_out; | 1189 | fork_out: |
1190 | return ERR_PTR(retval); | ||
1188 | } | 1191 | } |
1189 | 1192 | ||
1190 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) | 1193 | struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) |
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 8a64a4844cde..d03b5eef8ce0 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -10,6 +10,8 @@ | |||
10 | #include <linux/proc_fs.h> | 10 | #include <linux/proc_fs.h> |
11 | #include <linux/interrupt.h> | 11 | #include <linux/interrupt.h> |
12 | 12 | ||
13 | #include "internals.h" | ||
14 | |||
13 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; | 15 | static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; |
14 | 16 | ||
15 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
diff --git a/kernel/module.c b/kernel/module.c index 4b06bbad49c2..e4276046a1b6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -496,15 +496,15 @@ static void module_unload_free(struct module *mod) | |||
496 | } | 496 | } |
497 | 497 | ||
498 | #ifdef CONFIG_MODULE_FORCE_UNLOAD | 498 | #ifdef CONFIG_MODULE_FORCE_UNLOAD |
499 | static inline int try_force(unsigned int flags) | 499 | static inline int try_force_unload(unsigned int flags) |
500 | { | 500 | { |
501 | int ret = (flags & O_TRUNC); | 501 | int ret = (flags & O_TRUNC); |
502 | if (ret) | 502 | if (ret) |
503 | add_taint(TAINT_FORCED_MODULE); | 503 | add_taint(TAINT_FORCED_RMMOD); |
504 | return ret; | 504 | return ret; |
505 | } | 505 | } |
506 | #else | 506 | #else |
507 | static inline int try_force(unsigned int flags) | 507 | static inline int try_force_unload(unsigned int flags) |
508 | { | 508 | { |
509 | return 0; | 509 | return 0; |
510 | } | 510 | } |
@@ -524,7 +524,7 @@ static int __try_stop_module(void *_sref) | |||
524 | 524 | ||
525 | /* If it's not unused, quit unless we are told to block. */ | 525 | /* If it's not unused, quit unless we are told to block. */ |
526 | if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { | 526 | if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { |
527 | if (!(*sref->forced = try_force(sref->flags))) | 527 | if (!(*sref->forced = try_force_unload(sref->flags))) |
528 | return -EWOULDBLOCK; | 528 | return -EWOULDBLOCK; |
529 | } | 529 | } |
530 | 530 | ||
@@ -609,7 +609,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
609 | /* If it has an init func, it must have an exit func to unload */ | 609 | /* If it has an init func, it must have an exit func to unload */ |
610 | if ((mod->init != NULL && mod->exit == NULL) | 610 | if ((mod->init != NULL && mod->exit == NULL) |
611 | || mod->unsafe) { | 611 | || mod->unsafe) { |
612 | forced = try_force(flags); | 612 | forced = try_force_unload(flags); |
613 | if (!forced) { | 613 | if (!forced) { |
614 | /* This module can't be removed */ | 614 | /* This module can't be removed */ |
615 | ret = -EBUSY; | 615 | ret = -EBUSY; |
@@ -958,7 +958,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
958 | unsigned long ret; | 958 | unsigned long ret; |
959 | const unsigned long *crc; | 959 | const unsigned long *crc; |
960 | 960 | ||
961 | spin_lock_irq(&modlist_lock); | ||
962 | ret = __find_symbol(name, &owner, &crc, mod->license_gplok); | 961 | ret = __find_symbol(name, &owner, &crc, mod->license_gplok); |
963 | if (ret) { | 962 | if (ret) { |
964 | /* use_module can fail due to OOM, or module unloading */ | 963 | /* use_module can fail due to OOM, or module unloading */ |
@@ -966,7 +965,6 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
966 | !use_module(mod, owner)) | 965 | !use_module(mod, owner)) |
967 | ret = 0; | 966 | ret = 0; |
968 | } | 967 | } |
969 | spin_unlock_irq(&modlist_lock); | ||
970 | return ret; | 968 | return ret; |
971 | } | 969 | } |
972 | 970 | ||
@@ -1204,6 +1202,39 @@ void *__symbol_get(const char *symbol) | |||
1204 | } | 1202 | } |
1205 | EXPORT_SYMBOL_GPL(__symbol_get); | 1203 | EXPORT_SYMBOL_GPL(__symbol_get); |
1206 | 1204 | ||
1205 | /* | ||
1206 | * Ensure that an exported symbol [global namespace] does not already exist | ||
1207 | * in the Kernel or in some other modules exported symbol table. | ||
1208 | */ | ||
1209 | static int verify_export_symbols(struct module *mod) | ||
1210 | { | ||
1211 | const char *name = NULL; | ||
1212 | unsigned long i, ret = 0; | ||
1213 | struct module *owner; | ||
1214 | const unsigned long *crc; | ||
1215 | |||
1216 | for (i = 0; i < mod->num_syms; i++) | ||
1217 | if (__find_symbol(mod->syms[i].name, &owner, &crc, 1)) { | ||
1218 | name = mod->syms[i].name; | ||
1219 | ret = -ENOEXEC; | ||
1220 | goto dup; | ||
1221 | } | ||
1222 | |||
1223 | for (i = 0; i < mod->num_gpl_syms; i++) | ||
1224 | if (__find_symbol(mod->gpl_syms[i].name, &owner, &crc, 1)) { | ||
1225 | name = mod->gpl_syms[i].name; | ||
1226 | ret = -ENOEXEC; | ||
1227 | goto dup; | ||
1228 | } | ||
1229 | |||
1230 | dup: | ||
1231 | if (ret) | ||
1232 | printk(KERN_ERR "%s: exports duplicate symbol %s (owned by %s)\n", | ||
1233 | mod->name, name, module_name(owner)); | ||
1234 | |||
1235 | return ret; | ||
1236 | } | ||
1237 | |||
1207 | /* Change all symbols so that sh_value encodes the pointer directly. */ | 1238 | /* Change all symbols so that sh_value encodes the pointer directly. */ |
1208 | static int simplify_symbols(Elf_Shdr *sechdrs, | 1239 | static int simplify_symbols(Elf_Shdr *sechdrs, |
1209 | unsigned int symindex, | 1240 | unsigned int symindex, |
@@ -1715,6 +1746,11 @@ static struct module *load_module(void __user *umod, | |||
1715 | /* Set up license info based on the info section */ | 1746 | /* Set up license info based on the info section */ |
1716 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1747 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1717 | 1748 | ||
1749 | if (strcmp(mod->name, "ndiswrapper") == 0) | ||
1750 | add_taint(TAINT_PROPRIETARY_MODULE); | ||
1751 | if (strcmp(mod->name, "driverloader") == 0) | ||
1752 | add_taint(TAINT_PROPRIETARY_MODULE); | ||
1753 | |||
1718 | #ifdef CONFIG_MODULE_UNLOAD | 1754 | #ifdef CONFIG_MODULE_UNLOAD |
1719 | /* Set up MODINFO_ATTR fields */ | 1755 | /* Set up MODINFO_ATTR fields */ |
1720 | setup_modinfo(mod, sechdrs, infoindex); | 1756 | setup_modinfo(mod, sechdrs, infoindex); |
@@ -1767,6 +1803,12 @@ static struct module *load_module(void __user *umod, | |||
1767 | goto cleanup; | 1803 | goto cleanup; |
1768 | } | 1804 | } |
1769 | 1805 | ||
1806 | /* Find duplicate symbols */ | ||
1807 | err = verify_export_symbols(mod); | ||
1808 | |||
1809 | if (err < 0) | ||
1810 | goto cleanup; | ||
1811 | |||
1770 | /* Set up and sort exception table */ | 1812 | /* Set up and sort exception table */ |
1771 | mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); | 1813 | mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); |
1772 | mod->extable = extable = (void *)sechdrs[exindex].sh_addr; | 1814 | mod->extable = extable = (void *)sechdrs[exindex].sh_addr; |
diff --git a/kernel/pid.c b/kernel/pid.c index edba31c681ac..1acc07246991 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_type type, int nr) | |||
136 | struct hlist_node *elem; | 136 | struct hlist_node *elem; |
137 | struct pid *pid; | 137 | struct pid *pid; |
138 | 138 | ||
139 | hlist_for_each_entry(pid, elem, | 139 | hlist_for_each_entry_rcu(pid, elem, |
140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { | 140 | &pid_hash[type][pid_hashfn(nr)], pid_chain) { |
141 | if (pid->nr == nr) | 141 | if (pid->nr == nr) |
142 | return pid; | 142 | return pid; |
@@ -150,15 +150,15 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr) | |||
150 | 150 | ||
151 | task_pid = &task->pids[type]; | 151 | task_pid = &task->pids[type]; |
152 | pid = find_pid(type, nr); | 152 | pid = find_pid(type, nr); |
153 | task_pid->nr = nr; | ||
153 | if (pid == NULL) { | 154 | if (pid == NULL) { |
154 | hlist_add_head(&task_pid->pid_chain, | ||
155 | &pid_hash[type][pid_hashfn(nr)]); | ||
156 | INIT_LIST_HEAD(&task_pid->pid_list); | 155 | INIT_LIST_HEAD(&task_pid->pid_list); |
156 | hlist_add_head_rcu(&task_pid->pid_chain, | ||
157 | &pid_hash[type][pid_hashfn(nr)]); | ||
157 | } else { | 158 | } else { |
158 | INIT_HLIST_NODE(&task_pid->pid_chain); | 159 | INIT_HLIST_NODE(&task_pid->pid_chain); |
159 | list_add_tail(&task_pid->pid_list, &pid->pid_list); | 160 | list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list); |
160 | } | 161 | } |
161 | task_pid->nr = nr; | ||
162 | 162 | ||
163 | return 0; | 163 | return 0; |
164 | } | 164 | } |
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t *task, enum pid_type type) | |||
170 | 170 | ||
171 | pid = &task->pids[type]; | 171 | pid = &task->pids[type]; |
172 | if (!hlist_unhashed(&pid->pid_chain)) { | 172 | if (!hlist_unhashed(&pid->pid_chain)) { |
173 | hlist_del(&pid->pid_chain); | ||
174 | 173 | ||
175 | if (list_empty(&pid->pid_list)) | 174 | if (list_empty(&pid->pid_list)) { |
176 | nr = pid->nr; | 175 | nr = pid->nr; |
177 | else { | 176 | hlist_del_rcu(&pid->pid_chain); |
177 | } else { | ||
178 | pid_next = list_entry(pid->pid_list.next, | 178 | pid_next = list_entry(pid->pid_list.next, |
179 | struct pid, pid_list); | 179 | struct pid, pid_list); |
180 | /* insert next pid from pid_list to hash */ | 180 | /* insert next pid from pid_list to hash */ |
181 | hlist_add_head(&pid_next->pid_chain, | 181 | hlist_replace_rcu(&pid->pid_chain, |
182 | &pid_hash[type][pid_hashfn(pid_next->nr)]); | 182 | &pid_next->pid_chain); |
183 | } | 183 | } |
184 | } | 184 | } |
185 | 185 | ||
186 | list_del(&pid->pid_list); | 186 | list_del_rcu(&pid->pid_list); |
187 | pid->nr = 0; | 187 | pid->nr = 0; |
188 | 188 | ||
189 | return nr; | 189 | return nr; |
diff --git a/kernel/printk.c b/kernel/printk.c index 5287be83e3e7..2251be80cd22 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -569,7 +569,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
569 | p[1] <= '7' && p[2] == '>') { | 569 | p[1] <= '7' && p[2] == '>') { |
570 | loglev_char = p[1]; | 570 | loglev_char = p[1]; |
571 | p += 3; | 571 | p += 3; |
572 | printed_len += 3; | 572 | printed_len -= 3; |
573 | } else { | 573 | } else { |
574 | loglev_char = default_message_loglevel | 574 | loglev_char = default_message_loglevel |
575 | + '0'; | 575 | + '0'; |
@@ -584,7 +584,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
584 | 584 | ||
585 | for (tp = tbuf; tp < tbuf + tlen; tp++) | 585 | for (tp = tbuf; tp < tbuf + tlen; tp++) |
586 | emit_log_char(*tp); | 586 | emit_log_char(*tp); |
587 | printed_len += tlen - 3; | 587 | printed_len += tlen; |
588 | } else { | 588 | } else { |
589 | if (p[0] != '<' || p[1] < '0' || | 589 | if (p[0] != '<' || p[1] < '0' || |
590 | p[1] > '7' || p[2] != '>') { | 590 | p[1] > '7' || p[2] != '>') { |
@@ -592,8 +592,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
592 | emit_log_char(default_message_loglevel | 592 | emit_log_char(default_message_loglevel |
593 | + '0'); | 593 | + '0'); |
594 | emit_log_char('>'); | 594 | emit_log_char('>'); |
595 | printed_len += 3; | ||
595 | } | 596 | } |
596 | printed_len += 3; | ||
597 | } | 597 | } |
598 | log_level_unknown = 0; | 598 | log_level_unknown = 0; |
599 | if (!*p) | 599 | if (!*p) |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 656476eedb1b..cceaf09ac413 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -408,54 +408,62 @@ int ptrace_request(struct task_struct *child, long request, | |||
408 | return ret; | 408 | return ret; |
409 | } | 409 | } |
410 | 410 | ||
411 | #ifndef __ARCH_SYS_PTRACE | 411 | /** |
412 | static int ptrace_get_task_struct(long request, long pid, | 412 | * ptrace_traceme -- helper for PTRACE_TRACEME |
413 | struct task_struct **childp) | 413 | * |
414 | * Performs checks and sets PT_PTRACED. | ||
415 | * Should be used by all ptrace implementations for PTRACE_TRACEME. | ||
416 | */ | ||
417 | int ptrace_traceme(void) | ||
414 | { | 418 | { |
415 | struct task_struct *child; | ||
416 | int ret; | 419 | int ret; |
417 | 420 | ||
418 | /* | 421 | /* |
419 | * Callers use child == NULL as an indication to exit early even | 422 | * Are we already being traced? |
420 | * when the return value is 0, so make sure it is non-NULL here. | 423 | */ |
424 | if (current->ptrace & PT_PTRACED) | ||
425 | return -EPERM; | ||
426 | ret = security_ptrace(current->parent, current); | ||
427 | if (ret) | ||
428 | return -EPERM; | ||
429 | /* | ||
430 | * Set the ptrace bit in the process ptrace flags. | ||
421 | */ | 431 | */ |
422 | *childp = NULL; | 432 | current->ptrace |= PT_PTRACED; |
433 | return 0; | ||
434 | } | ||
423 | 435 | ||
424 | if (request == PTRACE_TRACEME) { | 436 | /** |
425 | /* | 437 | * ptrace_get_task_struct -- grab a task struct reference for ptrace |
426 | * Are we already being traced? | 438 | * @pid: process id to grab a task_struct reference of |
427 | */ | 439 | * |
428 | if (current->ptrace & PT_PTRACED) | 440 | * This function is a helper for ptrace implementations. It checks |
429 | return -EPERM; | 441 | * permissions and then grabs a task struct for use of the actual |
430 | ret = security_ptrace(current->parent, current); | 442 | * ptrace implementation. |
431 | if (ret) | 443 | * |
432 | return -EPERM; | 444 | * Returns the task_struct for @pid or an ERR_PTR() on failure. |
433 | /* | 445 | */ |
434 | * Set the ptrace bit in the process ptrace flags. | 446 | struct task_struct *ptrace_get_task_struct(pid_t pid) |
435 | */ | 447 | { |
436 | current->ptrace |= PT_PTRACED; | 448 | struct task_struct *child; |
437 | return 0; | ||
438 | } | ||
439 | 449 | ||
440 | /* | 450 | /* |
441 | * You may not mess with init | 451 | * Tracing init is not allowed. |
442 | */ | 452 | */ |
443 | if (pid == 1) | 453 | if (pid == 1) |
444 | return -EPERM; | 454 | return ERR_PTR(-EPERM); |
445 | 455 | ||
446 | ret = -ESRCH; | ||
447 | read_lock(&tasklist_lock); | 456 | read_lock(&tasklist_lock); |
448 | child = find_task_by_pid(pid); | 457 | child = find_task_by_pid(pid); |
449 | if (child) | 458 | if (child) |
450 | get_task_struct(child); | 459 | get_task_struct(child); |
451 | read_unlock(&tasklist_lock); | 460 | read_unlock(&tasklist_lock); |
452 | if (!child) | 461 | if (!child) |
453 | return -ESRCH; | 462 | return ERR_PTR(-ESRCH); |
454 | 463 | return child; | |
455 | *childp = child; | ||
456 | return 0; | ||
457 | } | 464 | } |
458 | 465 | ||
466 | #ifndef __ARCH_SYS_PTRACE | ||
459 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | 467 | asmlinkage long sys_ptrace(long request, long pid, long addr, long data) |
460 | { | 468 | { |
461 | struct task_struct *child; | 469 | struct task_struct *child; |
@@ -465,9 +473,16 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
465 | * This lock_kernel fixes a subtle race with suid exec | 473 | * This lock_kernel fixes a subtle race with suid exec |
466 | */ | 474 | */ |
467 | lock_kernel(); | 475 | lock_kernel(); |
468 | ret = ptrace_get_task_struct(request, pid, &child); | 476 | if (request == PTRACE_TRACEME) { |
469 | if (!child) | 477 | ret = ptrace_traceme(); |
470 | goto out; | 478 | goto out; |
479 | } | ||
480 | |||
481 | child = ptrace_get_task_struct(pid); | ||
482 | if (IS_ERR(child)) { | ||
483 | ret = PTR_ERR(child); | ||
484 | goto out; | ||
485 | } | ||
471 | 486 | ||
472 | if (request == PTRACE_ATTACH) { | 487 | if (request == PTRACE_ATTACH) { |
473 | ret = ptrace_attach(child); | 488 | ret = ptrace_attach(child); |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 48d3bce465b8..ccc45d49ce71 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/rcupdate.h> | ||
38 | #include <linux/interrupt.h> | 39 | #include <linux/interrupt.h> |
39 | #include <linux/sched.h> | 40 | #include <linux/sched.h> |
40 | #include <asm/atomic.h> | 41 | #include <asm/atomic.h> |
@@ -45,7 +46,6 @@ | |||
45 | #include <linux/percpu.h> | 46 | #include <linux/percpu.h> |
46 | #include <linux/notifier.h> | 47 | #include <linux/notifier.h> |
47 | #include <linux/rcupdate.h> | 48 | #include <linux/rcupdate.h> |
48 | #include <linux/rcuref.h> | ||
49 | #include <linux/cpu.h> | 49 | #include <linux/cpu.h> |
50 | 50 | ||
51 | /* Definition for rcupdate control block. */ | 51 | /* Definition for rcupdate control block. */ |
@@ -61,9 +61,9 @@ struct rcu_state { | |||
61 | /* for current batch to proceed. */ | 61 | /* for current batch to proceed. */ |
62 | }; | 62 | }; |
63 | 63 | ||
64 | static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp = | 64 | static struct rcu_state rcu_state ____cacheline_internodealigned_in_smp = |
65 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | 65 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; |
66 | static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp = | 66 | static struct rcu_state rcu_bh_state ____cacheline_internodealigned_in_smp = |
67 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; | 67 | {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE }; |
68 | 68 | ||
69 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 69 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; |
@@ -73,19 +73,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | |||
73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | 73 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; |
74 | static int maxbatch = 10000; | 74 | static int maxbatch = 10000; |
75 | 75 | ||
76 | #ifndef __HAVE_ARCH_CMPXCHG | ||
77 | /* | ||
78 | * We use an array of spinlocks for the rcurefs -- similar to ones in sparc | ||
79 | * 32 bit atomic_t implementations, and a hash function similar to that | ||
80 | * for our refcounting needs. | ||
81 | * Can't help multiprocessors which donot have cmpxchg :( | ||
82 | */ | ||
83 | |||
84 | spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { | ||
85 | [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED | ||
86 | }; | ||
87 | #endif | ||
88 | |||
89 | /** | 76 | /** |
90 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 77 | * call_rcu - Queue an RCU callback for invocation after a grace period. |
91 | * @head: structure to be used for queueing the RCU updates. | 78 | * @head: structure to be used for queueing the RCU updates. |
@@ -442,6 +429,36 @@ static void rcu_process_callbacks(unsigned long unused) | |||
442 | &__get_cpu_var(rcu_bh_data)); | 429 | &__get_cpu_var(rcu_bh_data)); |
443 | } | 430 | } |
444 | 431 | ||
432 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
433 | { | ||
434 | /* This cpu has pending rcu entries and the grace period | ||
435 | * for them has completed. | ||
436 | */ | ||
437 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
438 | return 1; | ||
439 | |||
440 | /* This cpu has no pending entries, but there are new entries */ | ||
441 | if (!rdp->curlist && rdp->nxtlist) | ||
442 | return 1; | ||
443 | |||
444 | /* This cpu has finished callbacks to invoke */ | ||
445 | if (rdp->donelist) | ||
446 | return 1; | ||
447 | |||
448 | /* The rcu core waits for a quiescent state from the cpu */ | ||
449 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
450 | return 1; | ||
451 | |||
452 | /* nothing to do */ | ||
453 | return 0; | ||
454 | } | ||
455 | |||
456 | int rcu_pending(int cpu) | ||
457 | { | ||
458 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
459 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
460 | } | ||
461 | |||
445 | void rcu_check_callbacks(int cpu, int user) | 462 | void rcu_check_callbacks(int cpu, int user) |
446 | { | 463 | { |
447 | if (user || | 464 | if (user || |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 49fbbeff201c..773219907dd8 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -39,7 +39,6 @@ | |||
39 | #include <linux/moduleparam.h> | 39 | #include <linux/moduleparam.h> |
40 | #include <linux/percpu.h> | 40 | #include <linux/percpu.h> |
41 | #include <linux/notifier.h> | 41 | #include <linux/notifier.h> |
42 | #include <linux/rcuref.h> | ||
43 | #include <linux/cpu.h> | 42 | #include <linux/cpu.h> |
44 | #include <linux/random.h> | 43 | #include <linux/random.h> |
45 | #include <linux/delay.h> | 44 | #include <linux/delay.h> |
@@ -49,9 +48,11 @@ | |||
49 | MODULE_LICENSE("GPL"); | 48 | MODULE_LICENSE("GPL"); |
50 | 49 | ||
51 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ | 50 | static int nreaders = -1; /* # reader threads, defaults to 4*ncpus */ |
52 | static int stat_interval = 0; /* Interval between stats, in seconds. */ | 51 | static int stat_interval; /* Interval between stats, in seconds. */ |
53 | /* Defaults to "only at end of test". */ | 52 | /* Defaults to "only at end of test". */ |
54 | static int verbose = 0; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | ||
55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | ||
55 | 56 | ||
56 | MODULE_PARM(nreaders, "i"); | 57 | MODULE_PARM(nreaders, "i"); |
57 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
@@ -59,6 +60,10 @@ MODULE_PARM(stat_interval, "i"); | |||
59 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | 60 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); |
60 | MODULE_PARM(verbose, "i"); | 61 | MODULE_PARM(verbose, "i"); |
61 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | 62 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); |
63 | MODULE_PARM(test_no_idle_hz, "i"); | ||
64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
65 | MODULE_PARM(shuffle_interval, "i"); | ||
66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
62 | #define TORTURE_FLAG "rcutorture: " | 67 | #define TORTURE_FLAG "rcutorture: " |
63 | #define PRINTK_STRING(s) \ | 68 | #define PRINTK_STRING(s) \ |
64 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) |
@@ -73,6 +78,7 @@ static int nrealreaders; | |||
73 | static struct task_struct *writer_task; | 78 | static struct task_struct *writer_task; |
74 | static struct task_struct **reader_tasks; | 79 | static struct task_struct **reader_tasks; |
75 | static struct task_struct *stats_task; | 80 | static struct task_struct *stats_task; |
81 | static struct task_struct *shuffler_task; | ||
76 | 82 | ||
77 | #define RCU_TORTURE_PIPE_LEN 10 | 83 | #define RCU_TORTURE_PIPE_LEN 10 |
78 | 84 | ||
@@ -103,7 +109,7 @@ atomic_t n_rcu_torture_error; | |||
103 | /* | 109 | /* |
104 | * Allocate an element from the rcu_tortures pool. | 110 | * Allocate an element from the rcu_tortures pool. |
105 | */ | 111 | */ |
106 | struct rcu_torture * | 112 | static struct rcu_torture * |
107 | rcu_torture_alloc(void) | 113 | rcu_torture_alloc(void) |
108 | { | 114 | { |
109 | struct list_head *p; | 115 | struct list_head *p; |
@@ -376,12 +382,77 @@ rcu_torture_stats(void *arg) | |||
376 | return 0; | 382 | return 0; |
377 | } | 383 | } |
378 | 384 | ||
385 | static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | ||
386 | |||
387 | /* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case | ||
388 | * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. | ||
389 | */ | ||
390 | void rcu_torture_shuffle_tasks(void) | ||
391 | { | ||
392 | cpumask_t tmp_mask = CPU_MASK_ALL; | ||
393 | int i; | ||
394 | |||
395 | lock_cpu_hotplug(); | ||
396 | |||
397 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | ||
398 | if (num_online_cpus() == 1) { | ||
399 | unlock_cpu_hotplug(); | ||
400 | return; | ||
401 | } | ||
402 | |||
403 | if (rcu_idle_cpu != -1) | ||
404 | cpu_clear(rcu_idle_cpu, tmp_mask); | ||
405 | |||
406 | set_cpus_allowed(current, tmp_mask); | ||
407 | |||
408 | if (reader_tasks != NULL) { | ||
409 | for (i = 0; i < nrealreaders; i++) | ||
410 | if (reader_tasks[i]) | ||
411 | set_cpus_allowed(reader_tasks[i], tmp_mask); | ||
412 | } | ||
413 | |||
414 | if (writer_task) | ||
415 | set_cpus_allowed(writer_task, tmp_mask); | ||
416 | |||
417 | if (stats_task) | ||
418 | set_cpus_allowed(stats_task, tmp_mask); | ||
419 | |||
420 | if (rcu_idle_cpu == -1) | ||
421 | rcu_idle_cpu = num_online_cpus() - 1; | ||
422 | else | ||
423 | rcu_idle_cpu--; | ||
424 | |||
425 | unlock_cpu_hotplug(); | ||
426 | } | ||
427 | |||
428 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | ||
429 | * system to become idle at a time and cut off its timer ticks. This is meant | ||
430 | * to test the support for such tickless idle CPU in RCU. | ||
431 | */ | ||
432 | static int | ||
433 | rcu_torture_shuffle(void *arg) | ||
434 | { | ||
435 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); | ||
436 | do { | ||
437 | schedule_timeout_interruptible(shuffle_interval * HZ); | ||
438 | rcu_torture_shuffle_tasks(); | ||
439 | } while (!kthread_should_stop()); | ||
440 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); | ||
441 | return 0; | ||
442 | } | ||
443 | |||
379 | static void | 444 | static void |
380 | rcu_torture_cleanup(void) | 445 | rcu_torture_cleanup(void) |
381 | { | 446 | { |
382 | int i; | 447 | int i; |
383 | 448 | ||
384 | fullstop = 1; | 449 | fullstop = 1; |
450 | if (shuffler_task != NULL) { | ||
451 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); | ||
452 | kthread_stop(shuffler_task); | ||
453 | } | ||
454 | shuffler_task = NULL; | ||
455 | |||
385 | if (writer_task != NULL) { | 456 | if (writer_task != NULL) { |
386 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | 457 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); |
387 | kthread_stop(writer_task); | 458 | kthread_stop(writer_task); |
@@ -430,9 +501,11 @@ rcu_torture_init(void) | |||
430 | nrealreaders = nreaders; | 501 | nrealreaders = nreaders; |
431 | else | 502 | else |
432 | nrealreaders = 2 * num_online_cpus(); | 503 | nrealreaders = 2 * num_online_cpus(); |
433 | printk(KERN_ALERT TORTURE_FLAG | 504 | printk(KERN_ALERT TORTURE_FLAG "--- Start of test: nreaders=%d " |
434 | "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n", | 505 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
435 | nrealreaders, stat_interval, verbose); | 506 | "shuffle_interval = %d\n", |
507 | nrealreaders, stat_interval, verbose, test_no_idle_hz, | ||
508 | shuffle_interval); | ||
436 | fullstop = 0; | 509 | fullstop = 0; |
437 | 510 | ||
438 | /* Set up the freelist. */ | 511 | /* Set up the freelist. */ |
@@ -502,6 +575,18 @@ rcu_torture_init(void) | |||
502 | goto unwind; | 575 | goto unwind; |
503 | } | 576 | } |
504 | } | 577 | } |
578 | if (test_no_idle_hz) { | ||
579 | rcu_idle_cpu = num_online_cpus() - 1; | ||
580 | /* Create the shuffler thread */ | ||
581 | shuffler_task = kthread_run(rcu_torture_shuffle, NULL, | ||
582 | "rcu_torture_shuffle"); | ||
583 | if (IS_ERR(shuffler_task)) { | ||
584 | firsterr = PTR_ERR(shuffler_task); | ||
585 | VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); | ||
586 | shuffler_task = NULL; | ||
587 | goto unwind; | ||
588 | } | ||
589 | } | ||
505 | return 0; | 590 | return 0; |
506 | 591 | ||
507 | unwind: | 592 | unwind: |
diff --git a/kernel/sched.c b/kernel/sched.c index 6f46c94cc29e..92733091154c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -176,6 +176,13 @@ static unsigned int task_timeslice(task_t *p) | |||
176 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 176 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
177 | < (long long) (sd)->cache_hot_time) | 177 | < (long long) (sd)->cache_hot_time) |
178 | 178 | ||
179 | void __put_task_struct_cb(struct rcu_head *rhp) | ||
180 | { | ||
181 | __put_task_struct(container_of(rhp, struct task_struct, rcu)); | ||
182 | } | ||
183 | |||
184 | EXPORT_SYMBOL_GPL(__put_task_struct_cb); | ||
185 | |||
179 | /* | 186 | /* |
180 | * These are the runqueue data structures: | 187 | * These are the runqueue data structures: |
181 | */ | 188 | */ |
diff --git a/kernel/signal.c b/kernel/signal.c index d7611f189ef7..08aa5b263f36 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -329,13 +329,20 @@ void __exit_sighand(struct task_struct *tsk) | |||
329 | /* Ok, we're done with the signal handlers */ | 329 | /* Ok, we're done with the signal handlers */ |
330 | tsk->sighand = NULL; | 330 | tsk->sighand = NULL; |
331 | if (atomic_dec_and_test(&sighand->count)) | 331 | if (atomic_dec_and_test(&sighand->count)) |
332 | kmem_cache_free(sighand_cachep, sighand); | 332 | sighand_free(sighand); |
333 | } | 333 | } |
334 | 334 | ||
335 | void exit_sighand(struct task_struct *tsk) | 335 | void exit_sighand(struct task_struct *tsk) |
336 | { | 336 | { |
337 | write_lock_irq(&tasklist_lock); | 337 | write_lock_irq(&tasklist_lock); |
338 | __exit_sighand(tsk); | 338 | rcu_read_lock(); |
339 | if (tsk->sighand != NULL) { | ||
340 | struct sighand_struct *sighand = rcu_dereference(tsk->sighand); | ||
341 | spin_lock(&sighand->siglock); | ||
342 | __exit_sighand(tsk); | ||
343 | spin_unlock(&sighand->siglock); | ||
344 | } | ||
345 | rcu_read_unlock(); | ||
339 | write_unlock_irq(&tasklist_lock); | 346 | write_unlock_irq(&tasklist_lock); |
340 | } | 347 | } |
341 | 348 | ||
@@ -345,19 +352,20 @@ void exit_sighand(struct task_struct *tsk) | |||
345 | void __exit_signal(struct task_struct *tsk) | 352 | void __exit_signal(struct task_struct *tsk) |
346 | { | 353 | { |
347 | struct signal_struct * sig = tsk->signal; | 354 | struct signal_struct * sig = tsk->signal; |
348 | struct sighand_struct * sighand = tsk->sighand; | 355 | struct sighand_struct * sighand; |
349 | 356 | ||
350 | if (!sig) | 357 | if (!sig) |
351 | BUG(); | 358 | BUG(); |
352 | if (!atomic_read(&sig->count)) | 359 | if (!atomic_read(&sig->count)) |
353 | BUG(); | 360 | BUG(); |
361 | rcu_read_lock(); | ||
362 | sighand = rcu_dereference(tsk->sighand); | ||
354 | spin_lock(&sighand->siglock); | 363 | spin_lock(&sighand->siglock); |
355 | posix_cpu_timers_exit(tsk); | 364 | posix_cpu_timers_exit(tsk); |
356 | if (atomic_dec_and_test(&sig->count)) { | 365 | if (atomic_dec_and_test(&sig->count)) { |
357 | posix_cpu_timers_exit_group(tsk); | 366 | posix_cpu_timers_exit_group(tsk); |
358 | if (tsk == sig->curr_target) | ||
359 | sig->curr_target = next_thread(tsk); | ||
360 | tsk->signal = NULL; | 367 | tsk->signal = NULL; |
368 | __exit_sighand(tsk); | ||
361 | spin_unlock(&sighand->siglock); | 369 | spin_unlock(&sighand->siglock); |
362 | flush_sigqueue(&sig->shared_pending); | 370 | flush_sigqueue(&sig->shared_pending); |
363 | } else { | 371 | } else { |
@@ -389,9 +397,11 @@ void __exit_signal(struct task_struct *tsk) | |||
389 | sig->nvcsw += tsk->nvcsw; | 397 | sig->nvcsw += tsk->nvcsw; |
390 | sig->nivcsw += tsk->nivcsw; | 398 | sig->nivcsw += tsk->nivcsw; |
391 | sig->sched_time += tsk->sched_time; | 399 | sig->sched_time += tsk->sched_time; |
400 | __exit_sighand(tsk); | ||
392 | spin_unlock(&sighand->siglock); | 401 | spin_unlock(&sighand->siglock); |
393 | sig = NULL; /* Marker for below. */ | 402 | sig = NULL; /* Marker for below. */ |
394 | } | 403 | } |
404 | rcu_read_unlock(); | ||
395 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); | 405 | clear_tsk_thread_flag(tsk,TIF_SIGPENDING); |
396 | flush_sigqueue(&tsk->pending); | 406 | flush_sigqueue(&tsk->pending); |
397 | if (sig) { | 407 | if (sig) { |
@@ -613,6 +623,33 @@ void signal_wake_up(struct task_struct *t, int resume) | |||
613 | * Returns 1 if any signals were found. | 623 | * Returns 1 if any signals were found. |
614 | * | 624 | * |
615 | * All callers must be holding the siglock. | 625 | * All callers must be holding the siglock. |
626 | * | ||
627 | * This version takes a sigset mask and looks at all signals, | ||
628 | * not just those in the first mask word. | ||
629 | */ | ||
630 | static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) | ||
631 | { | ||
632 | struct sigqueue *q, *n; | ||
633 | sigset_t m; | ||
634 | |||
635 | sigandsets(&m, mask, &s->signal); | ||
636 | if (sigisemptyset(&m)) | ||
637 | return 0; | ||
638 | |||
639 | signandsets(&s->signal, &s->signal, mask); | ||
640 | list_for_each_entry_safe(q, n, &s->list, list) { | ||
641 | if (sigismember(mask, q->info.si_signo)) { | ||
642 | list_del_init(&q->list); | ||
643 | __sigqueue_free(q); | ||
644 | } | ||
645 | } | ||
646 | return 1; | ||
647 | } | ||
648 | /* | ||
649 | * Remove signals in mask from the pending set and queue. | ||
650 | * Returns 1 if any signals were found. | ||
651 | * | ||
652 | * All callers must be holding the siglock. | ||
616 | */ | 653 | */ |
617 | static int rm_from_queue(unsigned long mask, struct sigpending *s) | 654 | static int rm_from_queue(unsigned long mask, struct sigpending *s) |
618 | { | 655 | { |
@@ -1080,18 +1117,29 @@ void zap_other_threads(struct task_struct *p) | |||
1080 | } | 1117 | } |
1081 | 1118 | ||
1082 | /* | 1119 | /* |
1083 | * Must be called with the tasklist_lock held for reading! | 1120 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
1084 | */ | 1121 | */ |
1085 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) | 1122 | int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) |
1086 | { | 1123 | { |
1087 | unsigned long flags; | 1124 | unsigned long flags; |
1125 | struct sighand_struct *sp; | ||
1088 | int ret; | 1126 | int ret; |
1089 | 1127 | ||
1128 | retry: | ||
1090 | ret = check_kill_permission(sig, info, p); | 1129 | ret = check_kill_permission(sig, info, p); |
1091 | if (!ret && sig && p->sighand) { | 1130 | if (!ret && sig && (sp = rcu_dereference(p->sighand))) { |
1092 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1131 | spin_lock_irqsave(&sp->siglock, flags); |
1132 | if (p->sighand != sp) { | ||
1133 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
1134 | goto retry; | ||
1135 | } | ||
1136 | if ((atomic_read(&sp->count) == 0) || | ||
1137 | (atomic_read(&p->usage) == 0)) { | ||
1138 | spin_unlock_irqrestore(&sp->siglock, flags); | ||
1139 | return -ESRCH; | ||
1140 | } | ||
1093 | ret = __group_send_sig_info(sig, info, p); | 1141 | ret = __group_send_sig_info(sig, info, p); |
1094 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1142 | spin_unlock_irqrestore(&sp->siglock, flags); |
1095 | } | 1143 | } |
1096 | 1144 | ||
1097 | return ret; | 1145 | return ret; |
@@ -1136,14 +1184,21 @@ int | |||
1136 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) | 1184 | kill_proc_info(int sig, struct siginfo *info, pid_t pid) |
1137 | { | 1185 | { |
1138 | int error; | 1186 | int error; |
1187 | int acquired_tasklist_lock = 0; | ||
1139 | struct task_struct *p; | 1188 | struct task_struct *p; |
1140 | 1189 | ||
1141 | read_lock(&tasklist_lock); | 1190 | rcu_read_lock(); |
1191 | if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) { | ||
1192 | read_lock(&tasklist_lock); | ||
1193 | acquired_tasklist_lock = 1; | ||
1194 | } | ||
1142 | p = find_task_by_pid(pid); | 1195 | p = find_task_by_pid(pid); |
1143 | error = -ESRCH; | 1196 | error = -ESRCH; |
1144 | if (p) | 1197 | if (p) |
1145 | error = group_send_sig_info(sig, info, p); | 1198 | error = group_send_sig_info(sig, info, p); |
1146 | read_unlock(&tasklist_lock); | 1199 | if (unlikely(acquired_tasklist_lock)) |
1200 | read_unlock(&tasklist_lock); | ||
1201 | rcu_read_unlock(); | ||
1147 | return error; | 1202 | return error; |
1148 | } | 1203 | } |
1149 | 1204 | ||
@@ -1163,8 +1218,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, | |||
1163 | ret = -ESRCH; | 1218 | ret = -ESRCH; |
1164 | goto out_unlock; | 1219 | goto out_unlock; |
1165 | } | 1220 | } |
1166 | if ((!info || ((unsigned long)info != 1 && | 1221 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
1167 | (unsigned long)info != 2 && SI_FROMUSER(info))) | ||
1168 | && (euid != p->suid) && (euid != p->uid) | 1222 | && (euid != p->suid) && (euid != p->uid) |
1169 | && (uid != p->suid) && (uid != p->uid)) { | 1223 | && (uid != p->suid) && (uid != p->uid)) { |
1170 | ret = -EPERM; | 1224 | ret = -EPERM; |
@@ -1355,16 +1409,54 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1355 | { | 1409 | { |
1356 | unsigned long flags; | 1410 | unsigned long flags; |
1357 | int ret = 0; | 1411 | int ret = 0; |
1412 | struct sighand_struct *sh; | ||
1358 | 1413 | ||
1359 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1414 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1360 | read_lock(&tasklist_lock); | 1415 | |
1416 | /* | ||
1417 | * The rcu based delayed sighand destroy makes it possible to | ||
1418 | * run this without tasklist lock held. The task struct itself | ||
1419 | * cannot go away as create_timer did get_task_struct(). | ||
1420 | * | ||
1421 | * We return -1, when the task is marked exiting, so | ||
1422 | * posix_timer_event can redirect it to the group leader | ||
1423 | */ | ||
1424 | rcu_read_lock(); | ||
1361 | 1425 | ||
1362 | if (unlikely(p->flags & PF_EXITING)) { | 1426 | if (unlikely(p->flags & PF_EXITING)) { |
1363 | ret = -1; | 1427 | ret = -1; |
1364 | goto out_err; | 1428 | goto out_err; |
1365 | } | 1429 | } |
1366 | 1430 | ||
1367 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1431 | retry: |
1432 | sh = rcu_dereference(p->sighand); | ||
1433 | |||
1434 | spin_lock_irqsave(&sh->siglock, flags); | ||
1435 | if (p->sighand != sh) { | ||
1436 | /* We raced with exec() in a multithreaded process... */ | ||
1437 | spin_unlock_irqrestore(&sh->siglock, flags); | ||
1438 | goto retry; | ||
1439 | } | ||
1440 | |||
1441 | /* | ||
1442 | * We do the check here again to handle the following scenario: | ||
1443 | * | ||
1444 | * CPU 0 CPU 1 | ||
1445 | * send_sigqueue | ||
1446 | * check PF_EXITING | ||
1447 | * interrupt exit code running | ||
1448 | * __exit_signal | ||
1449 | * lock sighand->siglock | ||
1450 | * unlock sighand->siglock | ||
1451 | * lock sh->siglock | ||
1452 | * add(tsk->pending) flush_sigqueue(tsk->pending) | ||
1453 | * | ||
1454 | */ | ||
1455 | |||
1456 | if (unlikely(p->flags & PF_EXITING)) { | ||
1457 | ret = -1; | ||
1458 | goto out; | ||
1459 | } | ||
1368 | 1460 | ||
1369 | if (unlikely(!list_empty(&q->list))) { | 1461 | if (unlikely(!list_empty(&q->list))) { |
1370 | /* | 1462 | /* |
@@ -1388,9 +1480,9 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1388 | signal_wake_up(p, sig == SIGKILL); | 1480 | signal_wake_up(p, sig == SIGKILL); |
1389 | 1481 | ||
1390 | out: | 1482 | out: |
1391 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1483 | spin_unlock_irqrestore(&sh->siglock, flags); |
1392 | out_err: | 1484 | out_err: |
1393 | read_unlock(&tasklist_lock); | 1485 | rcu_read_unlock(); |
1394 | 1486 | ||
1395 | return ret; | 1487 | return ret; |
1396 | } | 1488 | } |
@@ -1402,7 +1494,9 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1402 | int ret = 0; | 1494 | int ret = 0; |
1403 | 1495 | ||
1404 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); | 1496 | BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); |
1497 | |||
1405 | read_lock(&tasklist_lock); | 1498 | read_lock(&tasklist_lock); |
1499 | /* Since it_lock is held, p->sighand cannot be NULL. */ | ||
1406 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1500 | spin_lock_irqsave(&p->sighand->siglock, flags); |
1407 | handle_stop_signal(sig, p); | 1501 | handle_stop_signal(sig, p); |
1408 | 1502 | ||
@@ -1436,7 +1530,7 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) | |||
1436 | out: | 1530 | out: |
1437 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1531 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
1438 | read_unlock(&tasklist_lock); | 1532 | read_unlock(&tasklist_lock); |
1439 | return(ret); | 1533 | return ret; |
1440 | } | 1534 | } |
1441 | 1535 | ||
1442 | /* | 1536 | /* |
@@ -2338,6 +2432,7 @@ int | |||
2338 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | 2432 | do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) |
2339 | { | 2433 | { |
2340 | struct k_sigaction *k; | 2434 | struct k_sigaction *k; |
2435 | sigset_t mask; | ||
2341 | 2436 | ||
2342 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) | 2437 | if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) |
2343 | return -EINVAL; | 2438 | return -EINVAL; |
@@ -2385,9 +2480,11 @@ do_sigaction(int sig, const struct k_sigaction *act, struct k_sigaction *oact) | |||
2385 | *k = *act; | 2480 | *k = *act; |
2386 | sigdelsetmask(&k->sa.sa_mask, | 2481 | sigdelsetmask(&k->sa.sa_mask, |
2387 | sigmask(SIGKILL) | sigmask(SIGSTOP)); | 2482 | sigmask(SIGKILL) | sigmask(SIGSTOP)); |
2388 | rm_from_queue(sigmask(sig), &t->signal->shared_pending); | 2483 | sigemptyset(&mask); |
2484 | sigaddset(&mask, sig); | ||
2485 | rm_from_queue_full(&mask, &t->signal->shared_pending); | ||
2389 | do { | 2486 | do { |
2390 | rm_from_queue(sigmask(sig), &t->pending); | 2487 | rm_from_queue_full(&mask, &t->pending); |
2391 | recalc_sigpending_tsk(t); | 2488 | recalc_sigpending_tsk(t); |
2392 | t = next_thread(t); | 2489 | t = next_thread(t); |
2393 | } while (t != current); | 2490 | } while (t != current); |
diff --git a/kernel/sys.c b/kernel/sys.c index eecf84526afe..b6941e06d5d5 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -489,6 +489,12 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user | |||
489 | magic2 != LINUX_REBOOT_MAGIC2C)) | 489 | magic2 != LINUX_REBOOT_MAGIC2C)) |
490 | return -EINVAL; | 490 | return -EINVAL; |
491 | 491 | ||
492 | /* Instead of trying to make the power_off code look like | ||
493 | * halt when pm_power_off is not set do it the easy way. | ||
494 | */ | ||
495 | if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) | ||
496 | cmd = LINUX_REBOOT_CMD_HALT; | ||
497 | |||
492 | lock_kernel(); | 498 | lock_kernel(); |
493 | switch (cmd) { | 499 | switch (cmd) { |
494 | case LINUX_REBOOT_CMD_RESTART: | 500 | case LINUX_REBOOT_CMD_RESTART: |
@@ -1084,10 +1090,11 @@ asmlinkage long sys_times(struct tms __user * tbuf) | |||
1084 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | 1090 | asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) |
1085 | { | 1091 | { |
1086 | struct task_struct *p; | 1092 | struct task_struct *p; |
1093 | struct task_struct *group_leader = current->group_leader; | ||
1087 | int err = -EINVAL; | 1094 | int err = -EINVAL; |
1088 | 1095 | ||
1089 | if (!pid) | 1096 | if (!pid) |
1090 | pid = current->pid; | 1097 | pid = group_leader->pid; |
1091 | if (!pgid) | 1098 | if (!pgid) |
1092 | pgid = pid; | 1099 | pgid = pid; |
1093 | if (pgid < 0) | 1100 | if (pgid < 0) |
@@ -1107,16 +1114,16 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1107 | if (!thread_group_leader(p)) | 1114 | if (!thread_group_leader(p)) |
1108 | goto out; | 1115 | goto out; |
1109 | 1116 | ||
1110 | if (p->parent == current || p->real_parent == current) { | 1117 | if (p->real_parent == group_leader) { |
1111 | err = -EPERM; | 1118 | err = -EPERM; |
1112 | if (p->signal->session != current->signal->session) | 1119 | if (p->signal->session != group_leader->signal->session) |
1113 | goto out; | 1120 | goto out; |
1114 | err = -EACCES; | 1121 | err = -EACCES; |
1115 | if (p->did_exec) | 1122 | if (p->did_exec) |
1116 | goto out; | 1123 | goto out; |
1117 | } else { | 1124 | } else { |
1118 | err = -ESRCH; | 1125 | err = -ESRCH; |
1119 | if (p != current) | 1126 | if (p != group_leader) |
1120 | goto out; | 1127 | goto out; |
1121 | } | 1128 | } |
1122 | 1129 | ||
@@ -1128,7 +1135,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1128 | struct task_struct *p; | 1135 | struct task_struct *p; |
1129 | 1136 | ||
1130 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | 1137 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { |
1131 | if (p->signal->session == current->signal->session) | 1138 | if (p->signal->session == group_leader->signal->session) |
1132 | goto ok_pgid; | 1139 | goto ok_pgid; |
1133 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | 1140 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); |
1134 | goto out; | 1141 | goto out; |
@@ -1208,24 +1215,22 @@ asmlinkage long sys_getsid(pid_t pid) | |||
1208 | 1215 | ||
1209 | asmlinkage long sys_setsid(void) | 1216 | asmlinkage long sys_setsid(void) |
1210 | { | 1217 | { |
1218 | struct task_struct *group_leader = current->group_leader; | ||
1211 | struct pid *pid; | 1219 | struct pid *pid; |
1212 | int err = -EPERM; | 1220 | int err = -EPERM; |
1213 | 1221 | ||
1214 | if (!thread_group_leader(current)) | ||
1215 | return -EINVAL; | ||
1216 | |||
1217 | down(&tty_sem); | 1222 | down(&tty_sem); |
1218 | write_lock_irq(&tasklist_lock); | 1223 | write_lock_irq(&tasklist_lock); |
1219 | 1224 | ||
1220 | pid = find_pid(PIDTYPE_PGID, current->pid); | 1225 | pid = find_pid(PIDTYPE_PGID, group_leader->pid); |
1221 | if (pid) | 1226 | if (pid) |
1222 | goto out; | 1227 | goto out; |
1223 | 1228 | ||
1224 | current->signal->leader = 1; | 1229 | group_leader->signal->leader = 1; |
1225 | __set_special_pids(current->pid, current->pid); | 1230 | __set_special_pids(group_leader->pid, group_leader->pid); |
1226 | current->signal->tty = NULL; | 1231 | group_leader->signal->tty = NULL; |
1227 | current->signal->tty_old_pgrp = 0; | 1232 | group_leader->signal->tty_old_pgrp = 0; |
1228 | err = process_group(current); | 1233 | err = process_group(group_leader); |
1229 | out: | 1234 | out: |
1230 | write_unlock_irq(&tasklist_lock); | 1235 | write_unlock_irq(&tasklist_lock); |
1231 | up(&tty_sem); | 1236 | up(&tty_sem); |
@@ -1687,7 +1692,10 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1687 | if (unlikely(!p->signal)) | 1692 | if (unlikely(!p->signal)) |
1688 | return; | 1693 | return; |
1689 | 1694 | ||
1695 | utime = stime = cputime_zero; | ||
1696 | |||
1690 | switch (who) { | 1697 | switch (who) { |
1698 | case RUSAGE_BOTH: | ||
1691 | case RUSAGE_CHILDREN: | 1699 | case RUSAGE_CHILDREN: |
1692 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1700 | spin_lock_irqsave(&p->sighand->siglock, flags); |
1693 | utime = p->signal->cutime; | 1701 | utime = p->signal->cutime; |
@@ -1697,22 +1705,11 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1697 | r->ru_minflt = p->signal->cmin_flt; | 1705 | r->ru_minflt = p->signal->cmin_flt; |
1698 | r->ru_majflt = p->signal->cmaj_flt; | 1706 | r->ru_majflt = p->signal->cmaj_flt; |
1699 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1707 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
1700 | cputime_to_timeval(utime, &r->ru_utime); | 1708 | |
1701 | cputime_to_timeval(stime, &r->ru_stime); | 1709 | if (who == RUSAGE_CHILDREN) |
1702 | break; | 1710 | break; |
1711 | |||
1703 | case RUSAGE_SELF: | 1712 | case RUSAGE_SELF: |
1704 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1705 | utime = stime = cputime_zero; | ||
1706 | goto sum_group; | ||
1707 | case RUSAGE_BOTH: | ||
1708 | spin_lock_irqsave(&p->sighand->siglock, flags); | ||
1709 | utime = p->signal->cutime; | ||
1710 | stime = p->signal->cstime; | ||
1711 | r->ru_nvcsw = p->signal->cnvcsw; | ||
1712 | r->ru_nivcsw = p->signal->cnivcsw; | ||
1713 | r->ru_minflt = p->signal->cmin_flt; | ||
1714 | r->ru_majflt = p->signal->cmaj_flt; | ||
1715 | sum_group: | ||
1716 | utime = cputime_add(utime, p->signal->utime); | 1713 | utime = cputime_add(utime, p->signal->utime); |
1717 | stime = cputime_add(stime, p->signal->stime); | 1714 | stime = cputime_add(stime, p->signal->stime); |
1718 | r->ru_nvcsw += p->signal->nvcsw; | 1715 | r->ru_nvcsw += p->signal->nvcsw; |
@@ -1729,13 +1726,14 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1729 | r->ru_majflt += t->maj_flt; | 1726 | r->ru_majflt += t->maj_flt; |
1730 | t = next_thread(t); | 1727 | t = next_thread(t); |
1731 | } while (t != p); | 1728 | } while (t != p); |
1732 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | ||
1733 | cputime_to_timeval(utime, &r->ru_utime); | ||
1734 | cputime_to_timeval(stime, &r->ru_stime); | ||
1735 | break; | 1729 | break; |
1730 | |||
1736 | default: | 1731 | default: |
1737 | BUG(); | 1732 | BUG(); |
1738 | } | 1733 | } |
1734 | |||
1735 | cputime_to_timeval(utime, &r->ru_utime); | ||
1736 | cputime_to_timeval(stime, &r->ru_stime); | ||
1739 | } | 1737 | } |
1740 | 1738 | ||
1741 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1739 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 1ab2370e2efa..17313b99e53d 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -82,6 +82,28 @@ cond_syscall(compat_sys_socketcall); | |||
82 | cond_syscall(sys_inotify_init); | 82 | cond_syscall(sys_inotify_init); |
83 | cond_syscall(sys_inotify_add_watch); | 83 | cond_syscall(sys_inotify_add_watch); |
84 | cond_syscall(sys_inotify_rm_watch); | 84 | cond_syscall(sys_inotify_rm_watch); |
85 | cond_syscall(sys_migrate_pages); | ||
86 | cond_syscall(sys_chown16); | ||
87 | cond_syscall(sys_fchown16); | ||
88 | cond_syscall(sys_getegid16); | ||
89 | cond_syscall(sys_geteuid16); | ||
90 | cond_syscall(sys_getgid16); | ||
91 | cond_syscall(sys_getgroups16); | ||
92 | cond_syscall(sys_getresgid16); | ||
93 | cond_syscall(sys_getresuid16); | ||
94 | cond_syscall(sys_getuid16); | ||
95 | cond_syscall(sys_lchown16); | ||
96 | cond_syscall(sys_setfsgid16); | ||
97 | cond_syscall(sys_setfsuid16); | ||
98 | cond_syscall(sys_setgid16); | ||
99 | cond_syscall(sys_setgroups16); | ||
100 | cond_syscall(sys_setregid16); | ||
101 | cond_syscall(sys_setresgid16); | ||
102 | cond_syscall(sys_setresuid16); | ||
103 | cond_syscall(sys_setreuid16); | ||
104 | cond_syscall(sys_setuid16); | ||
105 | cond_syscall(sys_vm86old); | ||
106 | cond_syscall(sys_vm86); | ||
85 | 107 | ||
86 | /* arch-specific weak syscall entries */ | 108 | /* arch-specific weak syscall entries */ |
87 | cond_syscall(sys_pciconfig_read); | 109 | cond_syscall(sys_pciconfig_read); |
@@ -90,3 +112,5 @@ cond_syscall(sys_pciconfig_iobase); | |||
90 | cond_syscall(sys32_ipc); | 112 | cond_syscall(sys32_ipc); |
91 | cond_syscall(sys32_sysctl); | 113 | cond_syscall(sys32_sysctl); |
92 | cond_syscall(ppc_rtas); | 114 | cond_syscall(ppc_rtas); |
115 | cond_syscall(sys_spu_run); | ||
116 | cond_syscall(sys_spu_create); | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a85047bb5739..03b0598f2369 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -68,6 +68,8 @@ extern int min_free_kbytes; | |||
68 | extern int printk_ratelimit_jiffies; | 68 | extern int printk_ratelimit_jiffies; |
69 | extern int printk_ratelimit_burst; | 69 | extern int printk_ratelimit_burst; |
70 | extern int pid_max_min, pid_max_max; | 70 | extern int pid_max_min, pid_max_max; |
71 | extern int sysctl_drop_caches; | ||
72 | extern int percpu_pagelist_fraction; | ||
71 | 73 | ||
72 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 74 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
73 | int unknown_nmi_panic; | 75 | int unknown_nmi_panic; |
@@ -78,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *, | |||
78 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ | 80 | /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ |
79 | static int maxolduid = 65535; | 81 | static int maxolduid = 65535; |
80 | static int minolduid; | 82 | static int minolduid; |
83 | static int min_percpu_pagelist_fract = 8; | ||
81 | 84 | ||
82 | static int ngroups_max = NGROUPS_MAX; | 85 | static int ngroups_max = NGROUPS_MAX; |
83 | 86 | ||
@@ -775,6 +778,15 @@ static ctl_table vm_table[] = { | |||
775 | .strategy = &sysctl_intvec, | 778 | .strategy = &sysctl_intvec, |
776 | }, | 779 | }, |
777 | { | 780 | { |
781 | .ctl_name = VM_DROP_PAGECACHE, | ||
782 | .procname = "drop_caches", | ||
783 | .data = &sysctl_drop_caches, | ||
784 | .maxlen = sizeof(int), | ||
785 | .mode = 0644, | ||
786 | .proc_handler = drop_caches_sysctl_handler, | ||
787 | .strategy = &sysctl_intvec, | ||
788 | }, | ||
789 | { | ||
778 | .ctl_name = VM_MIN_FREE_KBYTES, | 790 | .ctl_name = VM_MIN_FREE_KBYTES, |
779 | .procname = "min_free_kbytes", | 791 | .procname = "min_free_kbytes", |
780 | .data = &min_free_kbytes, | 792 | .data = &min_free_kbytes, |
@@ -784,6 +796,16 @@ static ctl_table vm_table[] = { | |||
784 | .strategy = &sysctl_intvec, | 796 | .strategy = &sysctl_intvec, |
785 | .extra1 = &zero, | 797 | .extra1 = &zero, |
786 | }, | 798 | }, |
799 | { | ||
800 | .ctl_name = VM_PERCPU_PAGELIST_FRACTION, | ||
801 | .procname = "percpu_pagelist_fraction", | ||
802 | .data = &percpu_pagelist_fraction, | ||
803 | .maxlen = sizeof(percpu_pagelist_fraction), | ||
804 | .mode = 0644, | ||
805 | .proc_handler = &percpu_pagelist_fraction_sysctl_handler, | ||
806 | .strategy = &sysctl_intvec, | ||
807 | .extra1 = &min_percpu_pagelist_fract, | ||
808 | }, | ||
787 | #ifdef CONFIG_MMU | 809 | #ifdef CONFIG_MMU |
788 | { | 810 | { |
789 | .ctl_name = VM_MAX_MAP_COUNT, | 811 | .ctl_name = VM_MAX_MAP_COUNT, |
diff --git a/kernel/timer.c b/kernel/timer.c index fd74268d8663..074b4bd5cfd8 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/posix-timers.h> | 33 | #include <linux/posix-timers.h> |
34 | #include <linux/cpu.h> | 34 | #include <linux/cpu.h> |
35 | #include <linux/syscalls.h> | 35 | #include <linux/syscalls.h> |
36 | #include <linux/delay.h> | ||
36 | 37 | ||
37 | #include <asm/uaccess.h> | 38 | #include <asm/uaccess.h> |
38 | #include <asm/unistd.h> | 39 | #include <asm/unistd.h> |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2bd5aee1c736..82c4fa70595c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -29,7 +29,8 @@ | |||
29 | #include <linux/kthread.h> | 29 | #include <linux/kthread.h> |
30 | 30 | ||
31 | /* | 31 | /* |
32 | * The per-CPU workqueue (if single thread, we always use cpu 0's). | 32 | * The per-CPU workqueue (if single thread, we always use the first |
33 | * possible cpu). | ||
33 | * | 34 | * |
34 | * The sequence counters are for flush_scheduled_work(). It wants to wait | 35 | * The sequence counters are for flush_scheduled_work(). It wants to wait |
35 | * until until all currently-scheduled works are completed, but it doesn't | 36 | * until until all currently-scheduled works are completed, but it doesn't |
@@ -69,6 +70,8 @@ struct workqueue_struct { | |||
69 | static DEFINE_SPINLOCK(workqueue_lock); | 70 | static DEFINE_SPINLOCK(workqueue_lock); |
70 | static LIST_HEAD(workqueues); | 71 | static LIST_HEAD(workqueues); |
71 | 72 | ||
73 | static int singlethread_cpu; | ||
74 | |||
72 | /* If it's single threaded, it isn't in the list of workqueues. */ | 75 | /* If it's single threaded, it isn't in the list of workqueues. */ |
73 | static inline int is_single_threaded(struct workqueue_struct *wq) | 76 | static inline int is_single_threaded(struct workqueue_struct *wq) |
74 | { | 77 | { |
@@ -102,7 +105,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work) | |||
102 | 105 | ||
103 | if (!test_and_set_bit(0, &work->pending)) { | 106 | if (!test_and_set_bit(0, &work->pending)) { |
104 | if (unlikely(is_single_threaded(wq))) | 107 | if (unlikely(is_single_threaded(wq))) |
105 | cpu = any_online_cpu(cpu_online_map); | 108 | cpu = singlethread_cpu; |
106 | BUG_ON(!list_empty(&work->entry)); | 109 | BUG_ON(!list_empty(&work->entry)); |
107 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 110 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
108 | ret = 1; | 111 | ret = 1; |
@@ -118,7 +121,7 @@ static void delayed_work_timer_fn(unsigned long __data) | |||
118 | int cpu = smp_processor_id(); | 121 | int cpu = smp_processor_id(); |
119 | 122 | ||
120 | if (unlikely(is_single_threaded(wq))) | 123 | if (unlikely(is_single_threaded(wq))) |
121 | cpu = any_online_cpu(cpu_online_map); | 124 | cpu = singlethread_cpu; |
122 | 125 | ||
123 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); | 126 | __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); |
124 | } | 127 | } |
@@ -267,7 +270,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq) | |||
267 | 270 | ||
268 | if (is_single_threaded(wq)) { | 271 | if (is_single_threaded(wq)) { |
269 | /* Always use first cpu's area. */ | 272 | /* Always use first cpu's area. */ |
270 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map))); | 273 | flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, singlethread_cpu)); |
271 | } else { | 274 | } else { |
272 | int cpu; | 275 | int cpu; |
273 | 276 | ||
@@ -315,12 +318,17 @@ struct workqueue_struct *__create_workqueue(const char *name, | |||
315 | return NULL; | 318 | return NULL; |
316 | 319 | ||
317 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); | 320 | wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct); |
321 | if (!wq->cpu_wq) { | ||
322 | kfree(wq); | ||
323 | return NULL; | ||
324 | } | ||
325 | |||
318 | wq->name = name; | 326 | wq->name = name; |
319 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 327 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
320 | lock_cpu_hotplug(); | 328 | lock_cpu_hotplug(); |
321 | if (singlethread) { | 329 | if (singlethread) { |
322 | INIT_LIST_HEAD(&wq->list); | 330 | INIT_LIST_HEAD(&wq->list); |
323 | p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map)); | 331 | p = create_workqueue_thread(wq, singlethread_cpu); |
324 | if (!p) | 332 | if (!p) |
325 | destroy = 1; | 333 | destroy = 1; |
326 | else | 334 | else |
@@ -374,7 +382,7 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
374 | /* We don't need the distraction of CPUs appearing and vanishing. */ | 382 | /* We don't need the distraction of CPUs appearing and vanishing. */ |
375 | lock_cpu_hotplug(); | 383 | lock_cpu_hotplug(); |
376 | if (is_single_threaded(wq)) | 384 | if (is_single_threaded(wq)) |
377 | cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map)); | 385 | cleanup_workqueue_thread(wq, singlethread_cpu); |
378 | else { | 386 | else { |
379 | for_each_online_cpu(cpu) | 387 | for_each_online_cpu(cpu) |
380 | cleanup_workqueue_thread(wq, cpu); | 388 | cleanup_workqueue_thread(wq, cpu); |
@@ -419,6 +427,25 @@ int schedule_delayed_work_on(int cpu, | |||
419 | return ret; | 427 | return ret; |
420 | } | 428 | } |
421 | 429 | ||
430 | int schedule_on_each_cpu(void (*func) (void *info), void *info) | ||
431 | { | ||
432 | int cpu; | ||
433 | struct work_struct *work; | ||
434 | |||
435 | work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); | ||
436 | |||
437 | if (!work) | ||
438 | return -ENOMEM; | ||
439 | for_each_online_cpu(cpu) { | ||
440 | INIT_WORK(work + cpu, func, info); | ||
441 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), | ||
442 | work + cpu); | ||
443 | } | ||
444 | flush_workqueue(keventd_wq); | ||
445 | kfree(work); | ||
446 | return 0; | ||
447 | } | ||
448 | |||
422 | void flush_scheduled_work(void) | 449 | void flush_scheduled_work(void) |
423 | { | 450 | { |
424 | flush_workqueue(keventd_wq); | 451 | flush_workqueue(keventd_wq); |
@@ -543,6 +570,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
543 | 570 | ||
544 | void init_workqueues(void) | 571 | void init_workqueues(void) |
545 | { | 572 | { |
573 | singlethread_cpu = first_cpu(cpu_possible_map); | ||
546 | hotcpu_notifier(workqueue_cpu_callback, 0); | 574 | hotcpu_notifier(workqueue_cpu_callback, 0); |
547 | keventd_wq = create_workqueue("events"); | 575 | keventd_wq = create_workqueue("events"); |
548 | BUG_ON(!keventd_wq); | 576 | BUG_ON(!keventd_wq); |