Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c              |  26
-rw-r--r--  kernel/auditsc.c           |   6
-rw-r--r--  kernel/cpuset.c            |  22
-rw-r--r--  kernel/exit.c              |  75
-rw-r--r--  kernel/fork.c              |  81
-rw-r--r--  kernel/futex.c             |  10
-rw-r--r--  kernel/irq/proc.c          |   3
-rw-r--r--  kernel/kallsyms.c          |  16
-rw-r--r--  kernel/kexec.c             |   1
-rw-r--r--  kernel/kmod.c              |   2
-rw-r--r--  kernel/mutex.c             |   9
-rw-r--r--  kernel/nsproxy.c           |  42
-rw-r--r--  kernel/pid.c               |  75
-rw-r--r--  kernel/relay.c             |   4
-rw-r--r--  kernel/sched.c             | 511
-rw-r--r--  kernel/signal.c            |  13
-rw-r--r--  kernel/sys.c               |  23
-rw-r--r--  kernel/sysctl.c            | 387
-rw-r--r--  kernel/time/clocksource.c  |   8
-rw-r--r--  kernel/timer.c             | 148
-rw-r--r--  kernel/tsacct.c            |   9
-rw-r--r--  kernel/workqueue.c         |  21
22 files changed, 884 insertions(+), 608 deletions(-)
diff --git a/kernel/acct.c b/kernel/acct.c
index dc12db8600..70d0d88e55 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);

 	/* May block */
-	if (vfs_statfs(file->f_dentry, &sbuf))
+	if (vfs_statfs(file->f_path.dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
@@ -194,7 +194,7 @@ static void acct_file_reopen(struct file *file)
 		add_timer(&acct_globals.timer);
 	}
 	if (old_acct) {
-		mnt_unpin(old_acct->f_vfsmnt);
+		mnt_unpin(old_acct->f_path.mnt);
 		spin_unlock(&acct_globals.lock);
 		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
@@ -212,7 +212,7 @@ static int acct_on(char *name)
 	if (IS_ERR(file))
 		return PTR_ERR(file);

-	if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
 		filp_close(file, NULL);
 		return -EACCES;
 	}
@@ -229,11 +229,11 @@ static int acct_on(char *name)
 	}

 	spin_lock(&acct_globals.lock);
-	mnt_pin(file->f_vfsmnt);
+	mnt_pin(file->f_path.mnt);
 	acct_file_reopen(file);
 	spin_unlock(&acct_globals.lock);

-	mntput(file->f_vfsmnt); /* it's pinned, now give up active reference */
+	mntput(file->f_path.mnt); /* it's pinned, now give up active reference */

 	return 0;
 }
@@ -283,7 +283,7 @@ asmlinkage long sys_acct(const char __user *name)
 void acct_auto_close_mnt(struct vfsmount *m)
 {
 	spin_lock(&acct_globals.lock);
-	if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
+	if (acct_globals.file && acct_globals.file->f_path.mnt == m)
 		acct_file_reopen(NULL);
 	spin_unlock(&acct_globals.lock);
 }
@@ -299,7 +299,7 @@ void acct_auto_close(struct super_block *sb)
 {
 	spin_lock(&acct_globals.lock);
 	if (acct_globals.file &&
-	    acct_globals.file->f_vfsmnt->mnt_sb == sb) {
+	    acct_globals.file->f_path.mnt->mnt_sb == sb) {
 		acct_file_reopen(NULL);
 	}
 	spin_unlock(&acct_globals.lock);
@@ -428,6 +428,7 @@ static void do_acct_process(struct file *file)
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
+	struct tty_struct *tty;

 	/*
 	 * First check to see if there is enough free_space to continue
@@ -484,16 +485,9 @@ static void do_acct_process(struct file *file)
 	ac.ac_ppid = current->parent->tgid;
 #endif

-	mutex_lock(&tty_mutex);
-	/* FIXME: Whoever is responsible for current->signal locking needs
-	   to use the same locking all over the kernel and document it */
-	read_lock(&tasklist_lock);
-	ac.ac_tty = current->signal->tty ?
-		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
-	read_unlock(&tasklist_lock);
-	mutex_unlock(&tty_mutex);
-
 	spin_lock_irq(&current->sighand->siglock);
+	tty = current->signal->tty;
+	ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 	ac.ac_flag = pacct->ac_flag;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 40722e26de..298897559c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -781,8 +781,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
 		if ((vma->vm_flags & VM_EXECUTABLE) &&
 		    vma->vm_file) {
 			audit_log_d_path(ab, "exe=",
-					 vma->vm_file->f_dentry,
-					 vma->vm_file->f_vfsmnt);
+					 vma->vm_file->f_path.dentry,
+					 vma->vm_file->f_path.mnt);
 			break;
 		}
 		vma = vma->vm_next;
@@ -826,10 +826,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		  context->return_code);

 	mutex_lock(&tty_mutex);
+	read_lock(&tasklist_lock);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
 		tty = "(none)";
+	read_unlock(&tasklist_lock);
 	audit_log_format(ab,
 		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
 		  " ppid=%d pid=%d auid=%u uid=%u gid=%u"
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0a6b4d89f9..2c3b443147 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -413,8 +413,8 @@ static struct file_system_type cpuset_fs_type = {
 *
 *
 * When reading/writing to a file:
- * - the cpuset to use in file->f_dentry->d_parent->d_fsdata
- * - the 'cftype' of the file is file->f_dentry->d_fsdata
+ * - the cpuset to use in file->f_path.dentry->d_parent->d_fsdata
+ * - the 'cftype' of the file is file->f_path.dentry->d_fsdata
 */

 struct cftype {
@@ -1284,8 +1284,8 @@ static ssize_t cpuset_common_file_write(struct file *file,
 					const char __user *userbuf,
 					size_t nbytes, loff_t *unused_ppos)
 {
-	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	cpuset_filetype_t type = cft->private;
 	char *buffer;
 	char *pathbuf = NULL;
@@ -1367,7 +1367,7 @@ static ssize_t cpuset_file_write(struct file *file, const char __user *buf,
 						size_t nbytes, loff_t *ppos)
 {
 	ssize_t retval = 0;
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	if (!cft)
 		return -ENODEV;

@@ -1417,8 +1417,8 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 				size_t nbytes, loff_t *ppos)
 {
-	struct cftype *cft = __d_cft(file->f_dentry);
-	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
+	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
 	cpuset_filetype_t type = cft->private;
 	char *page;
 	ssize_t retval = 0;
@@ -1476,7 +1476,7 @@ static ssize_t cpuset_file_read(struct file *file, char __user *buf, size_t nbyt
 						loff_t *ppos)
 {
 	ssize_t retval = 0;
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	if (!cft)
 		return -ENODEV;

@@ -1498,7 +1498,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)
 	if (err)
 		return err;

-	cft = __d_cft(file->f_dentry);
+	cft = __d_cft(file->f_path.dentry);
 	if (!cft)
 		return -ENODEV;
 	if (cft->open)
@@ -1511,7 +1511,7 @@ static int cpuset_file_open(struct inode *inode, struct file *file)

 static int cpuset_file_release(struct inode *inode, struct file *file)
 {
-	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cftype *cft = __d_cft(file->f_path.dentry);
 	if (cft->release)
 		return cft->release(inode, file);
 	return 0;
@@ -1700,7 +1700,7 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
 */
 static int cpuset_tasks_open(struct inode *unused, struct file *file)
 {
-	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
+	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
 	struct ctr_struct *ctr;
 	pid_t *pidarray;
 	int npids;
diff --git a/kernel/exit.c b/kernel/exit.c
index 4e3f919edc..122fadb972 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,7 +13,7 @@
 #include <linux/completion.h>
 #include <linux/personality.h>
 #include <linux/tty.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
@@ -22,6 +22,7 @@
 #include <linux/file.h>
 #include <linux/binfmts.h>
 #include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/profile.h>
 #include <linux/mount.h>
@@ -48,7 +49,6 @@
 #include <asm/mmu_context.h>

 extern void sem_exit (void);
-extern struct task_struct *child_reaper;

 static void exit_mm(struct task_struct * tsk);

@@ -189,21 +189,18 @@ repeat:
 int session_of_pgrp(int pgrp)
 {
 	struct task_struct *p;
-	int sid = -1;
+	int sid = 0;

 	read_lock(&tasklist_lock);
-	do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
-		if (p->signal->session > 0) {
-			sid = p->signal->session;
-			goto out;
-		}
-	} while_each_task_pid(pgrp, PIDTYPE_PGID, p);
-	p = find_task_by_pid(pgrp);
-	if (p)
-		sid = p->signal->session;
-out:
+
+	p = find_task_by_pid_type(PIDTYPE_PGID, pgrp);
+	if (p == NULL)
+		p = find_task_by_pid(pgrp);
+	if (p != NULL)
+		sid = process_session(p);
+
 	read_unlock(&tasklist_lock);

 	return sid;
 }

@@ -225,8 +222,8 @@ static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
 				|| p->exit_state
 				|| is_init(p->real_parent))
 			continue;
-		if (process_group(p->real_parent) != pgrp
-			&& p->real_parent->signal->session == p->signal->session) {
+		if (process_group(p->real_parent) != pgrp &&
+		    process_session(p->real_parent) == process_session(p)) {
 			ret = 0;
 			break;
 		}
@@ -260,7 +257,8 @@ static int has_stopped_jobs(int pgrp)
 }

 /**
- * reparent_to_init - Reparent the calling kernel thread to the init task.
+ * reparent_to_init - Reparent the calling kernel thread to the init task
+ * of the pid space that the thread belongs to.
 *
 * If a kernel thread is launched as a result of a system call, or if
 * it ever exits, it should generally reparent itself to init so that
@@ -278,8 +276,8 @@ static void reparent_to_init(void)
 	ptrace_unlink(current);
 	/* Reparent to init */
 	remove_parent(current);
-	current->parent = child_reaper;
-	current->real_parent = child_reaper;
+	current->parent = child_reaper(current);
+	current->real_parent = child_reaper(current);
 	add_parent(current);

 	/* Set the exit signal to SIGCHLD so we signal init on exit */
@@ -302,9 +300,9 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 {
 	struct task_struct *curr = current->group_leader;

-	if (curr->signal->session != session) {
+	if (process_session(curr) != session) {
 		detach_pid(curr, PIDTYPE_SID);
-		curr->signal->session = session;
+		set_signal_session(curr->signal, session);
 		attach_pid(curr, PIDTYPE_SID, session);
 	}
 	if (process_group(curr) != pgrp) {
@@ -314,7 +312,7 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 	}
 }

-void set_special_pids(pid_t session, pid_t pgrp)
+static void set_special_pids(pid_t session, pid_t pgrp)
 {
 	write_lock_irq(&tasklist_lock);
 	__set_special_pids(session, pgrp);
@@ -384,9 +382,7 @@ void daemonize(const char *name, ...)
 	exit_mm(current);

 	set_special_pids(1, 1);
-	mutex_lock(&tty_mutex);
-	current->signal->tty = NULL;
-	mutex_unlock(&tty_mutex);
+	proc_clear_tty(current);

 	/* Block and flush all signals */
 	sigfillset(&blocked);
@@ -429,7 +425,7 @@ static void close_files(struct files_struct * files)
 	for (;;) {
 		unsigned long set;
 		i = j * __NFDBITS;
-		if (i >= fdt->max_fdset || i >= fdt->max_fds)
+		if (i >= fdt->max_fds)
 			break;
 		set = fdt->open_fds->fds_bits[j++];
 		while (set) {
@@ -470,11 +466,9 @@ void fastcall put_files_struct(struct files_struct *files)
 		 * you can free files immediately.
 		 */
 		fdt = files_fdtable(files);
-		if (fdt == &files->fdtab)
-			fdt->free_files = files;
-		else
+		if (fdt != &files->fdtab)
 			kmem_cache_free(files_cachep, files);
-		free_fdtable(fdt);
+		call_rcu(&fdt->rcu, free_fdtable_rcu);
 	}
 }

@@ -649,10 +643,11 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	 * outside, so the child pgrp is now orphaned.
 	 */
 	if ((process_group(p) != process_group(father)) &&
-	    (p->signal->session == father->signal->session)) {
+	    (process_session(p) == process_session(father))) {
 		int pgrp = process_group(p);

-		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
+		if (will_become_orphaned_pgrp(pgrp, NULL) &&
+		    has_stopped_jobs(pgrp)) {
 			__kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
 			__kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 		}
@@ -663,7 +658,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 * When we die, we re-parent all our children.
 * Try to give them to another thread in our thread
 * group, and if no such member exists, give it to
- * the global child reaper process (ie "init")
+ * the child reaper process (ie "init") in our pid
+ * space.
 */
 static void
 forget_original_parent(struct task_struct *father, struct list_head *to_release)
@@ -674,7 +670,7 @@ forget_original_parent(struct task_struct *father, struct list_head *to_release)
 	do {
 		reaper = next_thread(reaper);
 		if (reaper == father) {
-			reaper = child_reaper;
+			reaper = child_reaper(father);
 			break;
 		}
 	} while (reaper->exit_state);
@@ -786,7 +782,7 @@ static void exit_notify(struct task_struct *tsk)
 	t = tsk->real_parent;

 	if ((process_group(t) != process_group(tsk)) &&
-	    (t->signal->session == tsk->signal->session) &&
+	    (process_session(t) == process_session(tsk)) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
 		__kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
@@ -860,8 +856,13 @@ fastcall NORET_TYPE void do_exit(long code)
 		panic("Aiee, killing interrupt handler!");
 	if (unlikely(!tsk->pid))
 		panic("Attempted to kill the idle task!");
-	if (unlikely(tsk == child_reaper))
-		panic("Attempted to kill init!");
+	if (unlikely(tsk == child_reaper(tsk))) {
+		if (tsk->nsproxy->pid_ns != &init_pid_ns)
+			tsk->nsproxy->pid_ns->child_reaper = init_pid_ns.child_reaper;
+		else
+			panic("Attempted to kill init!");
+	}
+

 	if (unlikely(current->ptrace & PT_TRACE_EXIT)) {
 		current->ptrace_message = code;
diff --git a/kernel/fork.c b/kernel/fork.c
index 7f2e31ba33..d16c566eb6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,7 +18,7 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/completion.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/personality.h>
 #include <linux/mempolicy.h>
 #include <linux/sem.h>
@@ -36,6 +36,7 @@
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
 #include <linux/futex.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
@@ -252,7 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		anon_vma_link(tmp);
 		file = tmp->vm_file;
 		if (file) {
-			struct inode *inode = file->f_dentry->d_inode;
+			struct inode *inode = file->f_path.dentry->d_inode;
 			get_file(file);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
@@ -613,7 +614,7 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)

 static int count_open_files(struct fdtable *fdt)
 {
-	int size = fdt->max_fdset;
+	int size = fdt->max_fds;
 	int i;

 	/* Find the last open fd */
@@ -640,12 +641,10 @@ static struct files_struct *alloc_files(void)
 	newf->next_fd = 0;
 	fdt = &newf->fdtab;
 	fdt->max_fds = NR_OPEN_DEFAULT;
-	fdt->max_fdset = EMBEDDED_FD_SET_SIZE;
 	fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init;
 	fdt->open_fds = (fd_set *)&newf->open_fds_init;
 	fdt->fd = &newf->fd_array[0];
 	INIT_RCU_HEAD(&fdt->rcu);
-	fdt->free_files = NULL;
 	fdt->next = NULL;
 	rcu_assign_pointer(newf->fdt, fdt);
 out:
@@ -661,7 +660,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 {
 	struct files_struct *newf;
 	struct file **old_fds, **new_fds;
-	int open_files, size, i, expand;
+	int open_files, size, i;
 	struct fdtable *old_fdt, *new_fdt;

 	*errorp = -ENOMEM;
@@ -672,25 +671,14 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	spin_lock(&oldf->file_lock);
 	old_fdt = files_fdtable(oldf);
 	new_fdt = files_fdtable(newf);
-	size = old_fdt->max_fdset;
 	open_files = count_open_files(old_fdt);
-	expand = 0;

 	/*
-	 * Check whether we need to allocate a larger fd array or fd set.
+	 * Check whether we need to allocate a larger fd array and fd set.
 	 * Note: we're not a clone task, so the open count won't change.
 	 */
-	if (open_files > new_fdt->max_fdset) {
-		new_fdt->max_fdset = 0;
-		expand = 1;
-	}
 	if (open_files > new_fdt->max_fds) {
 		new_fdt->max_fds = 0;
-		expand = 1;
-	}
-
-	/* if the old fdset gets grown now, we'll only copy up to "size" fds */
-	if (expand) {
 		spin_unlock(&oldf->file_lock);
 		spin_lock(&newf->file_lock);
 		*errorp = expand_files(newf, open_files-1);
@@ -710,8 +698,10 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	old_fds = old_fdt->fd;
 	new_fds = new_fdt->fd;

-	memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8);
-	memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8);
+	memcpy(new_fdt->open_fds->fds_bits,
+		old_fdt->open_fds->fds_bits, open_files/8);
+	memcpy(new_fdt->close_on_exec->fds_bits,
+		old_fdt->close_on_exec->fds_bits, open_files/8);

 	for (i = open_files; i != 0; i--) {
 		struct file *f = *old_fds++;
@@ -736,22 +726,19 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
 	/* This is long word aligned thus could use a optimized version */
 	memset(new_fds, 0, size);

-	if (new_fdt->max_fdset > open_files) {
-		int left = (new_fdt->max_fdset-open_files)/8;
+	if (new_fdt->max_fds > open_files) {
+		int left = (new_fdt->max_fds-open_files)/8;
 		int start = open_files / (8 * sizeof(unsigned long));

 		memset(&new_fdt->open_fds->fds_bits[start], 0, left);
 		memset(&new_fdt->close_on_exec->fds_bits[start], 0, left);
 	}

-out:
 	return newf;

 out_release:
-	free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset);
-	free_fdset (new_fdt->open_fds, new_fdt->max_fdset);
-	free_fd_array(new_fdt->fd, new_fdt->max_fds);
 	kmem_cache_free(files_cachep, newf);
+out:
 	return NULL;
 }

@@ -1055,6 +1042,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->wchar = 0;	/* I/O counter: bytes written */
 	p->syscr = 0;	/* I/O counter: read syscalls */
 	p->syscw = 0;	/* I/O counter: write syscalls */
+	task_io_accounting_init(p);
 	acct_clear_integrals(p);

 	p->it_virt_expires = cputime_zero;
@@ -1259,9 +1247,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (thread_group_leader(p)) {
 		p->signal->tty = current->signal->tty;
 		p->signal->pgrp = process_group(current);
-		p->signal->session = current->signal->session;
+		set_signal_session(p->signal, process_session(current));
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->signal->session);
+		attach_pid(p, PIDTYPE_SID, process_session(p));

 		list_add_tail_rcu(&p->tasks, &init_task.tasks);
 		__get_cpu_var(process_counts)++;
@@ -1525,17 +1513,18 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 }

 /*
- * Unshare the namespace structure if it is being shared
+ * Unshare the mnt_namespace structure if it is being shared
 */
-static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs)
+static int unshare_mnt_namespace(unsigned long unshare_flags,
+		struct mnt_namespace **new_nsp, struct fs_struct *new_fs)
 {
-	struct namespace *ns = current->nsproxy->namespace;
+	struct mnt_namespace *ns = current->nsproxy->mnt_ns;

 	if ((unshare_flags & CLONE_NEWNS) && ns) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;

-		*new_nsp = dup_namespace(current, new_fs ? new_fs : current->fs);
+		*new_nsp = dup_mnt_ns(current, new_fs ? new_fs : current->fs);
 		if (!*new_nsp)
 			return -ENOMEM;
 	}
@@ -1544,15 +1533,13 @@ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new
 }

 /*
- * Unsharing of sighand for tasks created with CLONE_SIGHAND is not
- * supported yet
+ * Unsharing of sighand is not supported yet
 */
 static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
 {
 	struct sighand_struct *sigh = current->sighand;

-	if ((unshare_flags & CLONE_SIGHAND) &&
-		(sigh && atomic_read(&sigh->count) > 1))
+	if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
 		return -EINVAL;
 	else
 		return 0;
@@ -1625,8 +1612,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 {
 	int err = 0;
 	struct fs_struct *fs, *new_fs = NULL;
-	struct namespace *ns, *new_ns = NULL;
-	struct sighand_struct *sigh, *new_sigh = NULL;
+	struct mnt_namespace *ns, *new_ns = NULL;
+	struct sighand_struct *new_sigh = NULL;
 	struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
 	struct files_struct *fd, *new_fd = NULL;
 	struct sem_undo_list *new_ulist = NULL;
@@ -1647,7 +1634,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		goto bad_unshare_out;
 	if ((err = unshare_fs(unshare_flags, &new_fs)))
 		goto bad_unshare_cleanup_thread;
-	if ((err = unshare_namespace(unshare_flags, &new_ns, new_fs)))
+	if ((err = unshare_mnt_namespace(unshare_flags, &new_ns, new_fs)))
 		goto bad_unshare_cleanup_fs;
 	if ((err = unshare_sighand(unshare_flags, &new_sigh)))
 		goto bad_unshare_cleanup_ns;
@@ -1671,7 +1658,7 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		}
 	}

-	if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist ||
+	if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
 				new_uts || new_ipc) {

 		task_lock(current);
@@ -1688,17 +1675,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags)
 		}

 		if (new_ns) {
-			ns = current->nsproxy->namespace;
-			current->nsproxy->namespace = new_ns;
+			ns = current->nsproxy->mnt_ns;
+			current->nsproxy->mnt_ns = new_ns;
 			new_ns = ns;
 		}

-		if (new_sigh) {
-			sigh = current->sighand;
-			rcu_assign_pointer(current->sighand, new_sigh);
-			new_sigh = sigh;
-		}
-
 		if (new_mm) {
 			mm = current->mm;
 			active_mm = current->active_mm;
@@ -1756,7 +1737,7 @@ bad_unshare_cleanup_sigh:

 bad_unshare_cleanup_ns:
 	if (new_ns)
-		put_namespace(new_ns);
+		put_mnt_ns(new_ns);

 bad_unshare_cleanup_fs:
 	if (new_fs)
diff --git a/kernel/futex.c b/kernel/futex.c
index 95989a3b41..5a737de857 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 /*
 * Get parameters which are the keys for a futex.
 *
- * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
 * offset_within_page). For private mappings, it's (uaddr, current->mm).
 * We can usually work out the index without swapping in the page.
 *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
 	/*
 	 * Linear file mappings are also simple.
 	 */
-	key->shared.inode = vma->vm_file->f_dentry->d_inode;
+	key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
 	key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
 	if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
 		key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -1528,9 +1528,9 @@ static int futex_fd(u32 __user *uaddr, int signal)
 		goto out;
 	}
 	filp->f_op = &futex_fops;
-	filp->f_vfsmnt = mntget(futex_mnt);
-	filp->f_dentry = dget(futex_mnt->mnt_root);
-	filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+	filp->f_path.mnt = mntget(futex_mnt);
+	filp->f_path.dentry = dget(futex_mnt->mnt_root);
+	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;

 	if (signal) {
 		err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 9a35266700..61f5c717a8 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -54,7 +54,8 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
 	unsigned int irq = (int)(long)data, full_count = count, err;
 	cpumask_t new_value, tmp;

-	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
+	if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
+	    CHECK_IRQ_PER_CPU(irq_desc[irq].status))
 		return -EIO;

 	err = cpumask_parse_user(buffer, count, new_value);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index ab63cfc429..6f294ff4f9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -31,14 +31,14 @@
 #endif

 /* These will be re-linked against their real values during the second link stage */
-extern unsigned long kallsyms_addresses[] __attribute__((weak));
-extern unsigned long kallsyms_num_syms __attribute__((weak,section("data")));
-extern u8 kallsyms_names[] __attribute__((weak));
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const unsigned long kallsyms_num_syms __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));

-extern u8 kallsyms_token_table[] __attribute__((weak));
-extern u16 kallsyms_token_index[] __attribute__((weak));
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));

-extern unsigned long kallsyms_markers[] __attribute__((weak));
+extern const unsigned long kallsyms_markers[] __attribute__((weak));

 static inline int is_kernel_inittext(unsigned long addr)
 {
@@ -84,7 +84,7 @@ static int is_ksym_addr(unsigned long addr)
 static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
 {
 	int len, skipped_first = 0;
-	u8 *tptr, *data;
+	const u8 *tptr, *data;

 	/* get the compressed symbol length from the first symbol byte */
 	data = &kallsyms_names[off];
@@ -132,7 +132,7 @@ static char kallsyms_get_symbol_type(unsigned int off)
 * kallsyms array */
 static unsigned int get_symbol_offset(unsigned long pos)
 {
-	u8 *name;
+	const u8 *name;
 	int i;

 	/* use the closest marker we have. We have markers every 256 positions,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index afbbbe981b..2a59c8a01a 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -852,6 +852,7 @@ static int kimage_load_crash_segment(struct kimage *image,
 			memset(ptr + uchunk, 0, mchunk - uchunk);
 		}
 		result = copy_from_user(ptr, buf, uchunk);
+		kexec_flush_icache_page(page);
 		kunmap(page);
 		if (result) {
 			result = (result < 0) ? result : -EIO;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8d2bea09a4..3a7379aa31 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,7 +25,7 @@
 #include <linux/kmod.h>
 #include <linux/smp_lock.h>
 #include <linux/slab.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/completion.h>
 #include <linux/file.h>
 #include <linux/workqueue.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 8c71cf72a4..e7cbbb8276 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -206,6 +206,15 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
 }

 EXPORT_SYMBOL_GPL(mutex_lock_nested);
+
+int __sched
+mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
+{
+	might_sleep();
+	return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass);
+}
+
+EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
 #endif

 /*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 674aceb733..e2ce748e96 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -17,8 +17,9 @@
 #include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/init_task.h>
-#include <linux/namespace.h>
+#include <linux/mnt_namespace.h>
 #include <linux/utsname.h>
+#include <linux/pid_namespace.h>

 struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy);

@@ -45,8 +46,10 @@ static inline struct nsproxy *clone_namespaces(struct nsproxy *orig)
 	struct nsproxy *ns;

 	ns = kmemdup(orig, sizeof(struct nsproxy), GFP_KERNEL);
-	if (ns)
+	if (ns) {
 		atomic_set(&ns->count, 1);
+		ns->id = -1;
+	}
 	return ns;
 }

@@ -60,12 +63,14 @@ struct nsproxy *dup_namespaces(struct nsproxy *orig)
 	struct nsproxy *ns = clone_namespaces(orig);

 	if (ns) {
-		if (ns->namespace)
-			get_namespace(ns->namespace);
+		if (ns->mnt_ns)
+			get_mnt_ns(ns->mnt_ns);
 		if (ns->uts_ns)
 			get_uts_ns(ns->uts_ns);
 		if (ns->ipc_ns)
 			get_ipc_ns(ns->ipc_ns);
+		if (ns->pid_ns)
+			get_pid_ns(ns->pid_ns);
 	}

 	return ns;
@@ -97,7 +102,7 @@ int copy_namespaces(int flags, struct task_struct *tsk)

 	tsk->nsproxy = new_ns;

-	err = copy_namespace(flags, tsk);
+	err = copy_mnt_ns(flags, tsk);
 	if (err)
 		goto out_ns;

@@ -109,16 +114,23 @@ int copy_namespaces(int flags, struct task_struct *tsk)
 	if (err)
 		goto out_ipc;

+	err = copy_pid_ns(flags, tsk);
+	if (err)
+		goto out_pid;
+
 out:
 	put_nsproxy(old_ns);
 	return err;

+out_pid:
+	if (new_ns->ipc_ns)
+		put_ipc_ns(new_ns->ipc_ns);
 out_ipc:
 	if (new_ns->uts_ns)
 		put_uts_ns(new_ns->uts_ns);
 out_uts:
-	if (new_ns->namespace)
-		put_namespace(new_ns->namespace);
+	if (new_ns->mnt_ns)
+		put_mnt_ns(new_ns->mnt_ns);
 out_ns:
 	tsk->nsproxy = old_ns;
 	kfree(new_ns);
@@ -127,11 +139,13 @@ out_ns:

 void free_nsproxy(struct nsproxy *ns)
 {
-	if (ns->namespace)
-		put_namespace(ns->namespace);
+	if (ns->mnt_ns)
+		put_mnt_ns(ns->mnt_ns);
 	if (ns->uts_ns)
 		put_uts_ns(ns->uts_ns);
 	if (ns->ipc_ns)
 		put_ipc_ns(ns->ipc_ns);
-	kfree(ns);
+	if (ns->pid_ns)
+		put_pid_ns(ns->pid_ns);
+	kfree(ns);
 }
diff --git a/kernel/pid.c b/kernel/pid.c index a48879b0b9..2efe9d8d36 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -26,7 +26,7 @@ | |||
26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
27 | #include <linux/bootmem.h> | 27 | #include <linux/bootmem.h> |
28 | #include <linux/hash.h> | 28 | #include <linux/hash.h> |
29 | #include <linux/pspace.h> | 29 | #include <linux/pid_namespace.h> |
30 | 30 | ||
31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) | 31 | #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) |
32 | static struct hlist_head *pid_hash; | 32 | static struct hlist_head *pid_hash; |
@@ -43,9 +43,10 @@ int pid_max_max = PID_MAX_LIMIT; | |||
43 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 43 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) | 44 | #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) |
45 | 45 | ||
46 | static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | 46 | static inline int mk_pid(struct pid_namespace *pid_ns, |
47 | struct pidmap *map, int off) | ||
47 | { | 48 | { |
48 | return (map - pspace->pidmap)*BITS_PER_PAGE + off; | 49 | return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; |
49 | } | 50 | } |
50 | 51 | ||
51 | #define find_next_offset(map, off) \ | 52 | #define find_next_offset(map, off) \ |
@@ -57,11 +58,15 @@ static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) | |||
57 | * value does not cause lots of bitmaps to be allocated, but | 58 | * value does not cause lots of bitmaps to be allocated, but |
58 | * the scheme scales to up to 4 million PIDs, runtime. | 59 | * the scheme scales to up to 4 million PIDs, runtime. |
59 | */ | 60 | */ |
60 | struct pspace init_pspace = { | 61 | struct pid_namespace init_pid_ns = { |
62 | .kref = { | ||
63 | .refcount = ATOMIC_INIT(2), | ||
64 | }, | ||
61 | .pidmap = { | 65 | .pidmap = { |
62 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } | 66 | [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } |
63 | }, | 67 | }, |
64 | .last_pid = 0 | 68 | .last_pid = 0, |
69 | .child_reaper = &init_task | ||
65 | }; | 70 | }; |
66 | 71 | ||
67 | /* | 72 | /* |
@@ -80,25 +85,25 @@ struct pspace init_pspace = { | |||
80 | 85 | ||
81 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); | 86 | static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); |
82 | 87 | ||
83 | static fastcall void free_pidmap(struct pspace *pspace, int pid) | 88 | static fastcall void free_pidmap(struct pid_namespace *pid_ns, int pid) |
84 | { | 89 | { |
85 | struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; | 90 | struct pidmap *map = pid_ns->pidmap + pid / BITS_PER_PAGE; |
86 | int offset = pid & BITS_PER_PAGE_MASK; | 91 | int offset = pid & BITS_PER_PAGE_MASK; |
87 | 92 | ||
88 | clear_bit(offset, map->page); | 93 | clear_bit(offset, map->page); |
89 | atomic_inc(&map->nr_free); | 94 | atomic_inc(&map->nr_free); |
90 | } | 95 | } |
91 | 96 | ||
92 | static int alloc_pidmap(struct pspace *pspace) | 97 | static int alloc_pidmap(struct pid_namespace *pid_ns) |
93 | { | 98 | { |
94 | int i, offset, max_scan, pid, last = pspace->last_pid; | 99 | int i, offset, max_scan, pid, last = pid_ns->last_pid; |
95 | struct pidmap *map; | 100 | struct pidmap *map; |
96 | 101 | ||
97 | pid = last + 1; | 102 | pid = last + 1; |
98 | if (pid >= pid_max) | 103 | if (pid >= pid_max) |
99 | pid = RESERVED_PIDS; | 104 | pid = RESERVED_PIDS; |
100 | offset = pid & BITS_PER_PAGE_MASK; | 105 | offset = pid & BITS_PER_PAGE_MASK; |
101 | map = &pspace->pidmap[pid/BITS_PER_PAGE]; | 106 | map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; |
102 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; | 107 | max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; |
103 | for (i = 0; i <= max_scan; ++i) { | 108 | for (i = 0; i <= max_scan; ++i) { |
104 | if (unlikely(!map->page)) { | 109 | if (unlikely(!map->page)) { |
@@ -120,11 +125,11 @@ static int alloc_pidmap(struct pspace *pspace) | |||
120 | do { | 125 | do { |
121 | if (!test_and_set_bit(offset, map->page)) { | 126 | if (!test_and_set_bit(offset, map->page)) { |
122 | atomic_dec(&map->nr_free); | 127 | atomic_dec(&map->nr_free); |
123 | pspace->last_pid = pid; | 128 | pid_ns->last_pid = pid; |
124 | return pid; | 129 | return pid; |
125 | } | 130 | } |
126 | offset = find_next_offset(map, offset); | 131 | offset = find_next_offset(map, offset); |
127 | pid = mk_pid(pspace, map, offset); | 132 | pid = mk_pid(pid_ns, map, offset); |
128 | /* | 133 | /* |
129 | * find_next_offset() found a bit, the pid from it | 134 | * find_next_offset() found a bit, the pid from it |
130 | * is in-bounds, and if we fell back to the last | 135 | * is in-bounds, and if we fell back to the last |
@@ -135,34 +140,34 @@ static int alloc_pidmap(struct pspace *pspace) | |||
135 | (i != max_scan || pid < last || | 140 | (i != max_scan || pid < last || |
136 | !((last+1) & BITS_PER_PAGE_MASK))); | 141 | !((last+1) & BITS_PER_PAGE_MASK))); |
137 | } | 142 | } |
138 | if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { | 143 | if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { |
139 | ++map; | 144 | ++map; |
140 | offset = 0; | 145 | offset = 0; |
141 | } else { | 146 | } else { |
142 | map = &pspace->pidmap[0]; | 147 | map = &pid_ns->pidmap[0]; |
143 | offset = RESERVED_PIDS; | 148 | offset = RESERVED_PIDS; |
144 | if (unlikely(last == offset)) | 149 | if (unlikely(last == offset)) |
145 | break; | 150 | break; |
146 | } | 151 | } |
147 | pid = mk_pid(pspace, map, offset); | 152 | pid = mk_pid(pid_ns, map, offset); |
148 | } | 153 | } |
149 | return -1; | 154 | return -1; |
150 | } | 155 | } |
151 | 156 | ||
152 | static int next_pidmap(struct pspace *pspace, int last) | 157 | static int next_pidmap(struct pid_namespace *pid_ns, int last) |
153 | { | 158 | { |
154 | int offset; | 159 | int offset; |
155 | struct pidmap *map, *end; | 160 | struct pidmap *map, *end; |
156 | 161 | ||
157 | offset = (last + 1) & BITS_PER_PAGE_MASK; | 162 | offset = (last + 1) & BITS_PER_PAGE_MASK; |
158 | map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; | 163 | map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; |
159 | end = &pspace->pidmap[PIDMAP_ENTRIES]; | 164 | end = &pid_ns->pidmap[PIDMAP_ENTRIES]; |
160 | for (; map < end; map++, offset = 0) { | 165 | for (; map < end; map++, offset = 0) { |
161 | if (unlikely(!map->page)) | 166 | if (unlikely(!map->page)) |
162 | continue; | 167 | continue; |
163 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); | 168 | offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); |
164 | if (offset < BITS_PER_PAGE) | 169 | if (offset < BITS_PER_PAGE) |
165 | return mk_pid(pspace, map, offset); | 170 | return mk_pid(pid_ns, map, offset); |
166 | } | 171 | } |
167 | return -1; | 172 | return -1; |
168 | } | 173 | } |
@@ -192,7 +197,7 @@ fastcall void free_pid(struct pid *pid) | |||
192 | hlist_del_rcu(&pid->pid_chain); | 197 | hlist_del_rcu(&pid->pid_chain); |
193 | spin_unlock_irqrestore(&pidmap_lock, flags); | 198 | spin_unlock_irqrestore(&pidmap_lock, flags); |
194 | 199 | ||
195 | free_pidmap(&init_pspace, pid->nr); | 200 | free_pidmap(current->nsproxy->pid_ns, pid->nr); |
196 | call_rcu(&pid->rcu, delayed_put_pid); | 201 | call_rcu(&pid->rcu, delayed_put_pid); |
197 | } | 202 | } |
198 | 203 | ||
@@ -206,7 +211,7 @@ struct pid *alloc_pid(void) | |||
206 | if (!pid) | 211 | if (!pid) |
207 | goto out; | 212 | goto out; |
208 | 213 | ||
209 | nr = alloc_pidmap(&init_pspace); | 214 | nr = alloc_pidmap(current->nsproxy->pid_ns); |
210 | if (nr < 0) | 215 | if (nr < 0) |
211 | goto out_free; | 216 | goto out_free; |
212 | 217 | ||
@@ -348,13 +353,33 @@ struct pid *find_ge_pid(int nr) | |||
348 | pid = find_pid(nr); | 353 | pid = find_pid(nr); |
349 | if (pid) | 354 | if (pid) |
350 | break; | 355 | break; |
351 | nr = next_pidmap(&init_pspace, nr); | 356 | nr = next_pidmap(current->nsproxy->pid_ns, nr); |
352 | } while (nr > 0); | 357 | } while (nr > 0); |
353 | 358 | ||
354 | return pid; | 359 | return pid; |
355 | } | 360 | } |
356 | EXPORT_SYMBOL_GPL(find_get_pid); | 361 | EXPORT_SYMBOL_GPL(find_get_pid); |
357 | 362 | ||
363 | int copy_pid_ns(int flags, struct task_struct *tsk) | ||
364 | { | ||
365 | struct pid_namespace *old_ns = tsk->nsproxy->pid_ns; | ||
366 | int err = 0; | ||
367 | |||
368 | if (!old_ns) | ||
369 | return 0; | ||
370 | |||
371 | get_pid_ns(old_ns); | ||
372 | return err; | ||
373 | } | ||
374 | |||
375 | void free_pid_ns(struct kref *kref) | ||
376 | { | ||
377 | struct pid_namespace *ns; | ||
378 | |||
379 | ns = container_of(kref, struct pid_namespace, kref); | ||
380 | kfree(ns); | ||
381 | } | ||
382 | |||
358 | /* | 383 | /* |
359 | * The pid hash table is scaled according to the amount of memory in the | 384 | * The pid hash table is scaled according to the amount of memory in the |
360 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or | 385 | * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or |
@@ -382,10 +407,10 @@ void __init pidhash_init(void) | |||
382 | 407 | ||
383 | void __init pidmap_init(void) | 408 | void __init pidmap_init(void) |
384 | { | 409 | { |
385 | init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); | 410 | init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); |
386 | /* Reserve PID 0. We never call free_pidmap(0) */ | 411 | /* Reserve PID 0. We never call free_pidmap(0) */ |
387 | set_bit(0, init_pspace.pidmap[0].page); | 412 | set_bit(0, init_pid_ns.pidmap[0].page); |
388 | atomic_dec(&init_pspace.pidmap[0].nr_free); | 413 | atomic_dec(&init_pid_ns.pidmap[0].nr_free); |
389 | 414 | ||
390 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), | 415 | pid_cachep = kmem_cache_create("pid", sizeof(struct pid), |
391 | __alignof__(struct pid), | 416 | __alignof__(struct pid), |
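
The new copy_pid_ns()/free_pid_ns() above only pin and release the parent's pid namespace: copy_pid_ns() takes a reference and returns, free_pid_ns() frees the structure once the last reference is gone. A minimal sketch of that kref-style lifetime rule, with simplified non-atomic reference counting (the kernel uses struct kref):

#include <stdlib.h>

struct pid_ns_sketch {
        int refcount;                           /* kref.refcount in the kernel */
        /* pidmap pages, last_pid, ... */
};

static struct pid_ns_sketch *get_ns(struct pid_ns_sketch *ns)
{
        ns->refcount++;                         /* get_pid_ns() */
        return ns;
}

static void put_ns(struct pid_ns_sketch *ns)
{
        if (--ns->refcount == 0)                /* kref_put() release callback */
                free(ns);                       /* free_pid_ns() body */
}

/* clone() without a pid-namespace flag: the child shares the parent's ns. */
static struct pid_ns_sketch *copy_ns(struct pid_ns_sketch *parent)
{
        return get_ns(parent);
}
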
diff --git a/kernel/relay.c b/kernel/relay.c index 75a3a9a7ef..818e514729 100644 --- a/kernel/relay.c +++ b/kernel/relay.c | |||
@@ -959,7 +959,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
959 | if (!desc->count) | 959 | if (!desc->count) |
960 | return 0; | 960 | return 0; |
961 | 961 | ||
962 | mutex_lock(&filp->f_dentry->d_inode->i_mutex); | 962 | mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); |
963 | do { | 963 | do { |
964 | if (!relay_file_read_avail(buf, *ppos)) | 964 | if (!relay_file_read_avail(buf, *ppos)) |
965 | break; | 965 | break; |
@@ -979,7 +979,7 @@ static inline ssize_t relay_file_read_subbufs(struct file *filp, | |||
979 | *ppos = relay_file_read_end_pos(buf, read_start, ret); | 979 | *ppos = relay_file_read_end_pos(buf, read_start, ret); |
980 | } | 980 | } |
981 | } while (desc->count && ret); | 981 | } while (desc->count && ret); |
982 | mutex_unlock(&filp->f_dentry->d_inode->i_mutex); | 982 | mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); |
983 | 983 | ||
984 | return desc->written; | 984 | return desc->written; |
985 | } | 985 | } |
diff --git a/kernel/sched.c b/kernel/sched.c index f385eff468..8a0afb97af 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -225,8 +225,10 @@ struct rq { | |||
225 | unsigned long nr_uninterruptible; | 225 | unsigned long nr_uninterruptible; |
226 | 226 | ||
227 | unsigned long expired_timestamp; | 227 | unsigned long expired_timestamp; |
228 | unsigned long long timestamp_last_tick; | 228 | /* Cached timestamp set by update_cpu_clock() */ |
229 | unsigned long long most_recent_timestamp; | ||
229 | struct task_struct *curr, *idle; | 230 | struct task_struct *curr, *idle; |
231 | unsigned long next_balance; | ||
230 | struct mm_struct *prev_mm; | 232 | struct mm_struct *prev_mm; |
231 | struct prio_array *active, *expired, arrays[2]; | 233 | struct prio_array *active, *expired, arrays[2]; |
232 | int best_expired_prio; | 234 | int best_expired_prio; |
@@ -426,7 +428,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) | |||
426 | * bump this up when changing the output format or the meaning of an existing | 428 | * bump this up when changing the output format or the meaning of an existing |
427 | * format, so that tools can adapt (or abort) | 429 | * format, so that tools can adapt (or abort) |
428 | */ | 430 | */ |
429 | #define SCHEDSTAT_VERSION 12 | 431 | #define SCHEDSTAT_VERSION 14 |
430 | 432 | ||
431 | static int show_schedstat(struct seq_file *seq, void *v) | 433 | static int show_schedstat(struct seq_file *seq, void *v) |
432 | { | 434 | { |
@@ -464,7 +466,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
464 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); | 466 | seq_printf(seq, "domain%d %s", dcnt++, mask_str); |
465 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; | 467 | for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; |
466 | itype++) { | 468 | itype++) { |
467 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", | 469 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " |
470 | "%lu", | ||
468 | sd->lb_cnt[itype], | 471 | sd->lb_cnt[itype], |
469 | sd->lb_balanced[itype], | 472 | sd->lb_balanced[itype], |
470 | sd->lb_failed[itype], | 473 | sd->lb_failed[itype], |
@@ -474,11 +477,13 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
474 | sd->lb_nobusyq[itype], | 477 | sd->lb_nobusyq[itype], |
475 | sd->lb_nobusyg[itype]); | 478 | sd->lb_nobusyg[itype]); |
476 | } | 479 | } |
477 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", | 480 | seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" |
481 | " %lu %lu %lu\n", | ||
478 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, | 482 | sd->alb_cnt, sd->alb_failed, sd->alb_pushed, |
479 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, | 483 | sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, |
480 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, | 484 | sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, |
481 | sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); | 485 | sd->ttwu_wake_remote, sd->ttwu_move_affine, |
486 | sd->ttwu_move_balance); | ||
482 | } | 487 | } |
483 | preempt_enable(); | 488 | preempt_enable(); |
484 | #endif | 489 | #endif |
@@ -547,7 +552,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) | |||
547 | #endif | 552 | #endif |
548 | 553 | ||
549 | /* | 554 | /* |
550 | * rq_lock - lock a given runqueue and disable interrupts. | 555 | * this_rq_lock - lock this runqueue and disable interrupts. |
551 | */ | 556 | */ |
552 | static inline struct rq *this_rq_lock(void) | 557 | static inline struct rq *this_rq_lock(void) |
553 | __acquires(rq->lock) | 558 | __acquires(rq->lock) |
@@ -938,13 +943,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
938 | { | 943 | { |
939 | unsigned long long now; | 944 | unsigned long long now; |
940 | 945 | ||
946 | if (rt_task(p)) | ||
947 | goto out; | ||
948 | |||
941 | now = sched_clock(); | 949 | now = sched_clock(); |
942 | #ifdef CONFIG_SMP | 950 | #ifdef CONFIG_SMP |
943 | if (!local) { | 951 | if (!local) { |
944 | /* Compensate for drifting sched_clock */ | 952 | /* Compensate for drifting sched_clock */ |
945 | struct rq *this_rq = this_rq(); | 953 | struct rq *this_rq = this_rq(); |
946 | now = (now - this_rq->timestamp_last_tick) | 954 | now = (now - this_rq->most_recent_timestamp) |
947 | + rq->timestamp_last_tick; | 955 | + rq->most_recent_timestamp; |
948 | } | 956 | } |
949 | #endif | 957 | #endif |
950 | 958 | ||
@@ -959,8 +967,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
959 | (now - p->timestamp) >> 20); | 967 | (now - p->timestamp) >> 20); |
960 | } | 968 | } |
961 | 969 | ||
962 | if (!rt_task(p)) | 970 | p->prio = recalc_task_prio(p, now); |
963 | p->prio = recalc_task_prio(p, now); | ||
964 | 971 | ||
965 | /* | 972 | /* |
966 | * This checks to make sure it's not an uninterruptible task | 973 | * This checks to make sure it's not an uninterruptible task |
@@ -985,7 +992,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) | |||
985 | } | 992 | } |
986 | } | 993 | } |
987 | p->timestamp = now; | 994 | p->timestamp = now; |
988 | 995 | out: | |
989 | __activate_task(p, rq); | 996 | __activate_task(p, rq); |
990 | } | 997 | } |
991 | 998 | ||
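
The activate_task() change re-bases a timestamp taken on one CPU into another CPU's sched_clock() domain: subtract the remote runqueue's most_recent_timestamp and add the local one, so only the skew between the two clocks is corrected. As a one-function sketch (hypothetical helper name):

/*
 * Convert a timestamp recorded against src's clock into dst's clock.
 * src_recent and dst_recent are the most recent timestamps cached on
 * the two runqueues; their difference approximates the clock skew.
 */
static unsigned long long
rebase_timestamp(unsigned long long ts,
                 unsigned long long src_recent,
                 unsigned long long dst_recent)
{
        return ts - src_recent + dst_recent;
}

The same rebasing appears below in wake_up_new_task(), pull_task() and __migrate_task(), all now using most_recent_timestamp instead of the old per-tick value.
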
@@ -1450,7 +1457,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1450 | 1457 | ||
1451 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1458 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1452 | unsigned long tl = this_load; | 1459 | unsigned long tl = this_load; |
1453 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | 1460 | unsigned long tl_per_task; |
1461 | |||
1462 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1454 | 1463 | ||
1455 | /* | 1464 | /* |
1456 | * If sync wakeup then subtract the (maximum possible) | 1465 | * If sync wakeup then subtract the (maximum possible) |
@@ -1688,8 +1697,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1688 | * Not the local CPU - must adjust timestamp. This should | 1697 | * Not the local CPU - must adjust timestamp. This should |
1689 | * get optimised away in the !CONFIG_SMP case. | 1698 | * get optimised away in the !CONFIG_SMP case. |
1690 | */ | 1699 | */ |
1691 | p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) | 1700 | p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) |
1692 | + rq->timestamp_last_tick; | 1701 | + rq->most_recent_timestamp; |
1693 | __activate_task(p, rq); | 1702 | __activate_task(p, rq); |
1694 | if (TASK_PREEMPTS_CURR(p, rq)) | 1703 | if (TASK_PREEMPTS_CURR(p, rq)) |
1695 | resched_task(rq->curr); | 1704 | resched_task(rq->curr); |
@@ -1952,6 +1961,7 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) | |||
1952 | __acquires(rq1->lock) | 1961 | __acquires(rq1->lock) |
1953 | __acquires(rq2->lock) | 1962 | __acquires(rq2->lock) |
1954 | { | 1963 | { |
1964 | BUG_ON(!irqs_disabled()); | ||
1955 | if (rq1 == rq2) { | 1965 | if (rq1 == rq2) { |
1956 | spin_lock(&rq1->lock); | 1966 | spin_lock(&rq1->lock); |
1957 | __acquire(rq2->lock); /* Fake it out ;) */ | 1967 | __acquire(rq2->lock); /* Fake it out ;) */ |
@@ -1991,6 +2001,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1991 | __acquires(busiest->lock) | 2001 | __acquires(busiest->lock) |
1992 | __acquires(this_rq->lock) | 2002 | __acquires(this_rq->lock) |
1993 | { | 2003 | { |
2004 | if (unlikely(!irqs_disabled())) { | ||
2005 | /* printk() doesn't work well under rq->lock */ | ||
2006 | spin_unlock(&this_rq->lock); | ||
2007 | BUG_ON(1); | ||
2008 | } | ||
1994 | if (unlikely(!spin_trylock(&busiest->lock))) { | 2009 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1995 | if (busiest < this_rq) { | 2010 | if (busiest < this_rq) { |
1996 | spin_unlock(&this_rq->lock); | 2011 | spin_unlock(&this_rq->lock); |
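
double_lock_balance() above takes the two runqueue locks in a fixed order (lower address first) when the trylock fails, which is the standard way to avoid ABBA deadlock; the new BUG_ON additionally insists interrupts are already off. A generic sketch of that ordering rule with pthread mutexes (a simplification; no interrupt state is modeled):

#include <pthread.h>
#include <stdint.h>

/* Acquire two locks in a global order (by address) to avoid deadlock. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if ((uintptr_t)a < (uintptr_t)b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}
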
@@ -2061,8 +2076,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, | |||
2061 | set_task_cpu(p, this_cpu); | 2076 | set_task_cpu(p, this_cpu); |
2062 | inc_nr_running(p, this_rq); | 2077 | inc_nr_running(p, this_rq); |
2063 | enqueue_task(p, this_array); | 2078 | enqueue_task(p, this_array); |
2064 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 2079 | p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) |
2065 | + this_rq->timestamp_last_tick; | 2080 | + this_rq->most_recent_timestamp; |
2066 | /* | 2081 | /* |
2067 | * Note that idle threads have a prio of MAX_PRIO, for this test | 2082 | * Note that idle threads have a prio of MAX_PRIO, for this test |
2068 | * to be always true for them. | 2083 | * to be always true for them. |
@@ -2098,10 +2113,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, | |||
2098 | * 2) too many balance attempts have failed. | 2113 | * 2) too many balance attempts have failed. |
2099 | */ | 2114 | */ |
2100 | 2115 | ||
2101 | if (sd->nr_balance_failed > sd->cache_nice_tries) | 2116 | if (sd->nr_balance_failed > sd->cache_nice_tries) { |
2117 | #ifdef CONFIG_SCHEDSTATS | ||
2118 | if (task_hot(p, rq->most_recent_timestamp, sd)) | ||
2119 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2120 | #endif | ||
2102 | return 1; | 2121 | return 1; |
2122 | } | ||
2103 | 2123 | ||
2104 | if (task_hot(p, rq->timestamp_last_tick, sd)) | 2124 | if (task_hot(p, rq->most_recent_timestamp, sd)) |
2105 | return 0; | 2125 | return 0; |
2106 | return 1; | 2126 | return 1; |
2107 | } | 2127 | } |
@@ -2199,11 +2219,6 @@ skip_queue: | |||
2199 | goto skip_bitmap; | 2219 | goto skip_bitmap; |
2200 | } | 2220 | } |
2201 | 2221 | ||
2202 | #ifdef CONFIG_SCHEDSTATS | ||
2203 | if (task_hot(tmp, busiest->timestamp_last_tick, sd)) | ||
2204 | schedstat_inc(sd, lb_hot_gained[idle]); | ||
2205 | #endif | ||
2206 | |||
2207 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2222 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
2208 | pulled++; | 2223 | pulled++; |
2209 | rem_load_move -= tmp->load_weight; | 2224 | rem_load_move -= tmp->load_weight; |
@@ -2241,7 +2256,7 @@ out: | |||
2241 | static struct sched_group * | 2256 | static struct sched_group * |
2242 | find_busiest_group(struct sched_domain *sd, int this_cpu, | 2257 | find_busiest_group(struct sched_domain *sd, int this_cpu, |
2243 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, | 2258 | unsigned long *imbalance, enum idle_type idle, int *sd_idle, |
2244 | cpumask_t *cpus) | 2259 | cpumask_t *cpus, int *balance) |
2245 | { | 2260 | { |
2246 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2261 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
2247 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2262 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
@@ -2270,10 +2285,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2270 | unsigned long load, group_capacity; | 2285 | unsigned long load, group_capacity; |
2271 | int local_group; | 2286 | int local_group; |
2272 | int i; | 2287 | int i; |
2288 | unsigned int balance_cpu = -1, first_idle_cpu = 0; | ||
2273 | unsigned long sum_nr_running, sum_weighted_load; | 2289 | unsigned long sum_nr_running, sum_weighted_load; |
2274 | 2290 | ||
2275 | local_group = cpu_isset(this_cpu, group->cpumask); | 2291 | local_group = cpu_isset(this_cpu, group->cpumask); |
2276 | 2292 | ||
2293 | if (local_group) | ||
2294 | balance_cpu = first_cpu(group->cpumask); | ||
2295 | |||
2277 | /* Tally up the load of all CPUs in the group */ | 2296 | /* Tally up the load of all CPUs in the group */ |
2278 | sum_weighted_load = sum_nr_running = avg_load = 0; | 2297 | sum_weighted_load = sum_nr_running = avg_load = 0; |
2279 | 2298 | ||
@@ -2289,9 +2308,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2289 | *sd_idle = 0; | 2308 | *sd_idle = 0; |
2290 | 2309 | ||
2291 | /* Bias balancing toward cpus of our domain */ | 2310 | /* Bias balancing toward cpus of our domain */ |
2292 | if (local_group) | 2311 | if (local_group) { |
2312 | if (idle_cpu(i) && !first_idle_cpu) { | ||
2313 | first_idle_cpu = 1; | ||
2314 | balance_cpu = i; | ||
2315 | } | ||
2316 | |||
2293 | load = target_load(i, load_idx); | 2317 | load = target_load(i, load_idx); |
2294 | else | 2318 | } else |
2295 | load = source_load(i, load_idx); | 2319 | load = source_load(i, load_idx); |
2296 | 2320 | ||
2297 | avg_load += load; | 2321 | avg_load += load; |
@@ -2299,6 +2323,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2299 | sum_weighted_load += rq->raw_weighted_load; | 2323 | sum_weighted_load += rq->raw_weighted_load; |
2300 | } | 2324 | } |
2301 | 2325 | ||
2326 | /* | ||
2327 | * First idle cpu or the first cpu(busiest) in this sched group | ||
2328 | * is eligible for doing load balancing at this and above | ||
2329 | * domains. | ||
2330 | */ | ||
2331 | if (local_group && balance_cpu != this_cpu && balance) { | ||
2332 | *balance = 0; | ||
2333 | goto ret; | ||
2334 | } | ||
2335 | |||
2302 | total_load += avg_load; | 2336 | total_load += avg_load; |
2303 | total_pwr += group->cpu_power; | 2337 | total_pwr += group->cpu_power; |
2304 | 2338 | ||
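
With the hunks above, only one CPU per local sched group proceeds with balancing at a given level: the first idle CPU seen in the group, or the group's first CPU if none is idle; everyone else sets *balance = 0 and bails out. A compact, free-standing sketch of that election (illustrative arrays, not the kernel's cpumask types):

/*
 * Decide whether this_cpu should do the balancing for its group:
 * prefer the first idle CPU in the group, else the group's first CPU.
 */
static int should_balance(const int *group_cpus, int ncpus,
                          const int *cpu_is_idle, int this_cpu)
{
        int balance_cpu = group_cpus[0];
        int i;

        for (i = 0; i < ncpus; i++) {
                if (cpu_is_idle[group_cpus[i]]) {
                        balance_cpu = group_cpus[i];
                        break;
                }
        }
        return balance_cpu == this_cpu;
}
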
@@ -2458,18 +2492,21 @@ small_imbalance: | |||
2458 | pwr_now /= SCHED_LOAD_SCALE; | 2492 | pwr_now /= SCHED_LOAD_SCALE; |
2459 | 2493 | ||
2460 | /* Amount of load we'd subtract */ | 2494 | /* Amount of load we'd subtract */ |
2461 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; | 2495 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
2496 | busiest->cpu_power; | ||
2462 | if (max_load > tmp) | 2497 | if (max_load > tmp) |
2463 | pwr_move += busiest->cpu_power * | 2498 | pwr_move += busiest->cpu_power * |
2464 | min(busiest_load_per_task, max_load - tmp); | 2499 | min(busiest_load_per_task, max_load - tmp); |
2465 | 2500 | ||
2466 | /* Amount of load we'd add */ | 2501 | /* Amount of load we'd add */ |
2467 | if (max_load*busiest->cpu_power < | 2502 | if (max_load * busiest->cpu_power < |
2468 | busiest_load_per_task*SCHED_LOAD_SCALE) | 2503 | busiest_load_per_task * SCHED_LOAD_SCALE) |
2469 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2504 | tmp = max_load * busiest->cpu_power / this->cpu_power; |
2470 | else | 2505 | else |
2471 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; | 2506 | tmp = busiest_load_per_task * SCHED_LOAD_SCALE / |
2472 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); | 2507 | this->cpu_power; |
2508 | pwr_move += this->cpu_power * | ||
2509 | min(this_load_per_task, this_load + tmp); | ||
2473 | pwr_move /= SCHED_LOAD_SCALE; | 2510 | pwr_move /= SCHED_LOAD_SCALE; |
2474 | 2511 | ||
2475 | /* Move if we gain throughput */ | 2512 | /* Move if we gain throughput */ |
@@ -2490,8 +2527,8 @@ out_balanced: | |||
2490 | *imbalance = min_load_per_task; | 2527 | *imbalance = min_load_per_task; |
2491 | return group_min; | 2528 | return group_min; |
2492 | } | 2529 | } |
2493 | ret: | ||
2494 | #endif | 2530 | #endif |
2531 | ret: | ||
2495 | *imbalance = 0; | 2532 | *imbalance = 0; |
2496 | return NULL; | 2533 | return NULL; |
2497 | } | 2534 | } |
@@ -2540,17 +2577,17 @@ static inline unsigned long minus_1_or_zero(unsigned long n) | |||
2540 | /* | 2577 | /* |
2541 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2578 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2542 | * tasks if there is an imbalance. | 2579 | * tasks if there is an imbalance. |
2543 | * | ||
2544 | * Called with this_rq unlocked. | ||
2545 | */ | 2580 | */ |
2546 | static int load_balance(int this_cpu, struct rq *this_rq, | 2581 | static int load_balance(int this_cpu, struct rq *this_rq, |
2547 | struct sched_domain *sd, enum idle_type idle) | 2582 | struct sched_domain *sd, enum idle_type idle, |
2583 | int *balance) | ||
2548 | { | 2584 | { |
2549 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; | 2585 | int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; |
2550 | struct sched_group *group; | 2586 | struct sched_group *group; |
2551 | unsigned long imbalance; | 2587 | unsigned long imbalance; |
2552 | struct rq *busiest; | 2588 | struct rq *busiest; |
2553 | cpumask_t cpus = CPU_MASK_ALL; | 2589 | cpumask_t cpus = CPU_MASK_ALL; |
2590 | unsigned long flags; | ||
2554 | 2591 | ||
2555 | /* | 2592 | /* |
2556 | * When power savings policy is enabled for the parent domain, idle | 2593 | * When power savings policy is enabled for the parent domain, idle |
@@ -2566,7 +2603,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
2566 | 2603 | ||
2567 | redo: | 2604 | redo: |
2568 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 2605 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
2569 | &cpus); | 2606 | &cpus, balance); |
2607 | |||
2608 | if (*balance == 0) | ||
2609 | goto out_balanced; | ||
2610 | |||
2570 | if (!group) { | 2611 | if (!group) { |
2571 | schedstat_inc(sd, lb_nobusyg[idle]); | 2612 | schedstat_inc(sd, lb_nobusyg[idle]); |
2572 | goto out_balanced; | 2613 | goto out_balanced; |
@@ -2590,11 +2631,13 @@ redo: | |||
2590 | * still unbalanced. nr_moved simply stays zero, so it is | 2631 | * still unbalanced. nr_moved simply stays zero, so it is |
2591 | * correctly treated as an imbalance. | 2632 | * correctly treated as an imbalance. |
2592 | */ | 2633 | */ |
2634 | local_irq_save(flags); | ||
2593 | double_rq_lock(this_rq, busiest); | 2635 | double_rq_lock(this_rq, busiest); |
2594 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2636 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2595 | minus_1_or_zero(busiest->nr_running), | 2637 | minus_1_or_zero(busiest->nr_running), |
2596 | imbalance, sd, idle, &all_pinned); | 2638 | imbalance, sd, idle, &all_pinned); |
2597 | double_rq_unlock(this_rq, busiest); | 2639 | double_rq_unlock(this_rq, busiest); |
2640 | local_irq_restore(flags); | ||
2598 | 2641 | ||
2599 | /* All tasks on this runqueue were pinned by CPU affinity */ | 2642 | /* All tasks on this runqueue were pinned by CPU affinity */ |
2600 | if (unlikely(all_pinned)) { | 2643 | if (unlikely(all_pinned)) { |
@@ -2611,13 +2654,13 @@ redo: | |||
2611 | 2654 | ||
2612 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { | 2655 | if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { |
2613 | 2656 | ||
2614 | spin_lock(&busiest->lock); | 2657 | spin_lock_irqsave(&busiest->lock, flags); |
2615 | 2658 | ||
2616 | /* don't kick the migration_thread, if the curr | 2659 | /* don't kick the migration_thread, if the curr |
2617 | * task on busiest cpu can't be moved to this_cpu | 2660 | * task on busiest cpu can't be moved to this_cpu |
2618 | */ | 2661 | */ |
2619 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { | 2662 | if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { |
2620 | spin_unlock(&busiest->lock); | 2663 | spin_unlock_irqrestore(&busiest->lock, flags); |
2621 | all_pinned = 1; | 2664 | all_pinned = 1; |
2622 | goto out_one_pinned; | 2665 | goto out_one_pinned; |
2623 | } | 2666 | } |
@@ -2627,7 +2670,7 @@ redo: | |||
2627 | busiest->push_cpu = this_cpu; | 2670 | busiest->push_cpu = this_cpu; |
2628 | active_balance = 1; | 2671 | active_balance = 1; |
2629 | } | 2672 | } |
2630 | spin_unlock(&busiest->lock); | 2673 | spin_unlock_irqrestore(&busiest->lock, flags); |
2631 | if (active_balance) | 2674 | if (active_balance) |
2632 | wake_up_process(busiest->migration_thread); | 2675 | wake_up_process(busiest->migration_thread); |
2633 | 2676 | ||
@@ -2706,7 +2749,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) | |||
2706 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2749 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
2707 | redo: | 2750 | redo: |
2708 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, | 2751 | group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, |
2709 | &sd_idle, &cpus); | 2752 | &sd_idle, &cpus, NULL); |
2710 | if (!group) { | 2753 | if (!group) { |
2711 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); | 2754 | schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); |
2712 | goto out_balanced; | 2755 | goto out_balanced; |
@@ -2766,14 +2809,28 @@ out_balanced: | |||
2766 | static void idle_balance(int this_cpu, struct rq *this_rq) | 2809 | static void idle_balance(int this_cpu, struct rq *this_rq) |
2767 | { | 2810 | { |
2768 | struct sched_domain *sd; | 2811 | struct sched_domain *sd; |
2812 | int pulled_task = 0; | ||
2813 | unsigned long next_balance = jiffies + 60 * HZ; | ||
2769 | 2814 | ||
2770 | for_each_domain(this_cpu, sd) { | 2815 | for_each_domain(this_cpu, sd) { |
2771 | if (sd->flags & SD_BALANCE_NEWIDLE) { | 2816 | if (sd->flags & SD_BALANCE_NEWIDLE) { |
2772 | /* If we've pulled tasks over stop searching: */ | 2817 | /* If we've pulled tasks over stop searching: */ |
2773 | if (load_balance_newidle(this_cpu, this_rq, sd)) | 2818 | pulled_task = load_balance_newidle(this_cpu, |
2819 | this_rq, sd); | ||
2820 | if (time_after(next_balance, | ||
2821 | sd->last_balance + sd->balance_interval)) | ||
2822 | next_balance = sd->last_balance | ||
2823 | + sd->balance_interval; | ||
2824 | if (pulled_task) | ||
2774 | break; | 2825 | break; |
2775 | } | 2826 | } |
2776 | } | 2827 | } |
2828 | if (!pulled_task) | ||
2829 | /* | ||
2830 | * We are going idle. next_balance may be set based on | ||
2831 | * a busy processor. So reset next_balance. | ||
2832 | */ | ||
2833 | this_rq->next_balance = next_balance; | ||
2777 | } | 2834 | } |
2778 | 2835 | ||
2779 | /* | 2836 | /* |
@@ -2826,26 +2883,9 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
2826 | spin_unlock(&target_rq->lock); | 2883 | spin_unlock(&target_rq->lock); |
2827 | } | 2884 | } |
2828 | 2885 | ||
2829 | /* | 2886 | static void update_load(struct rq *this_rq) |
2830 | * rebalance_tick will get called every timer tick, on every CPU. | ||
2831 | * | ||
2832 | * It checks each scheduling domain to see if it is due to be balanced, | ||
2833 | * and initiates a balancing operation if so. | ||
2834 | * | ||
2835 | * Balancing parameters are set up in arch_init_sched_domains. | ||
2836 | */ | ||
2837 | |||
2838 | /* Don't have all balancing operations going off at once: */ | ||
2839 | static inline unsigned long cpu_offset(int cpu) | ||
2840 | { | 2887 | { |
2841 | return jiffies + cpu * HZ / NR_CPUS; | 2888 | unsigned long this_load; |
2842 | } | ||
2843 | |||
2844 | static void | ||
2845 | rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | ||
2846 | { | ||
2847 | unsigned long this_load, interval, j = cpu_offset(this_cpu); | ||
2848 | struct sched_domain *sd; | ||
2849 | int i, scale; | 2889 | int i, scale; |
2850 | 2890 | ||
2851 | this_load = this_rq->raw_weighted_load; | 2891 | this_load = this_rq->raw_weighted_load; |
@@ -2865,6 +2905,32 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
2865 | new_load += scale-1; | 2905 | new_load += scale-1; |
2866 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; | 2906 | this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; |
2867 | } | 2907 | } |
2908 | } | ||
2909 | |||
2910 | /* | ||
2911 | * run_rebalance_domains is triggered when needed from the scheduler tick. | ||
2912 | * | ||
2913 | * It checks each scheduling domain to see if it is due to be balanced, | ||
2914 | * and initiates a balancing operation if so. | ||
2915 | * | ||
2916 | * Balancing parameters are set up in arch_init_sched_domains. | ||
2917 | */ | ||
2918 | static DEFINE_SPINLOCK(balancing); | ||
2919 | |||
2920 | static void run_rebalance_domains(struct softirq_action *h) | ||
2921 | { | ||
2922 | int this_cpu = smp_processor_id(), balance = 1; | ||
2923 | struct rq *this_rq = cpu_rq(this_cpu); | ||
2924 | unsigned long interval; | ||
2925 | struct sched_domain *sd; | ||
2926 | /* | ||
2927 | * We are idle if there are no processes running. This | ||
2928 | * is valid even if we are the idle process (SMT). | ||
2929 | */ | ||
2930 | enum idle_type idle = !this_rq->nr_running ? | ||
2931 | SCHED_IDLE : NOT_IDLE; | ||
2932 | /* Earliest time when we have to call run_rebalance_domains again */ | ||
2933 | unsigned long next_balance = jiffies + 60*HZ; | ||
2868 | 2934 | ||
2869 | for_each_domain(this_cpu, sd) { | 2935 | for_each_domain(this_cpu, sd) { |
2870 | if (!(sd->flags & SD_LOAD_BALANCE)) | 2936 | if (!(sd->flags & SD_LOAD_BALANCE)) |
@@ -2879,8 +2945,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
2879 | if (unlikely(!interval)) | 2945 | if (unlikely(!interval)) |
2880 | interval = 1; | 2946 | interval = 1; |
2881 | 2947 | ||
2882 | if (j - sd->last_balance >= interval) { | 2948 | if (sd->flags & SD_SERIALIZE) { |
2883 | if (load_balance(this_cpu, this_rq, sd, idle)) { | 2949 | if (!spin_trylock(&balancing)) |
2950 | goto out; | ||
2951 | } | ||
2952 | |||
2953 | if (time_after_eq(jiffies, sd->last_balance + interval)) { | ||
2954 | if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { | ||
2884 | /* | 2955 | /* |
2885 | * We've pulled tasks over so either we're no | 2956 | * We've pulled tasks over so either we're no |
2886 | * longer idle, or one of our SMT siblings is | 2957 | * longer idle, or one of our SMT siblings is |
@@ -2888,39 +2959,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) | |||
2888 | */ | 2959 | */ |
2889 | idle = NOT_IDLE; | 2960 | idle = NOT_IDLE; |
2890 | } | 2961 | } |
2891 | sd->last_balance += interval; | 2962 | sd->last_balance = jiffies; |
2892 | } | 2963 | } |
2964 | if (sd->flags & SD_SERIALIZE) | ||
2965 | spin_unlock(&balancing); | ||
2966 | out: | ||
2967 | if (time_after(next_balance, sd->last_balance + interval)) | ||
2968 | next_balance = sd->last_balance + interval; | ||
2969 | |||
2970 | /* | ||
2971 | * Stop the load balance at this level. There is another | ||
2972 | * CPU in our sched group which is doing load balancing more | ||
2973 | * actively. | ||
2974 | */ | ||
2975 | if (!balance) | ||
2976 | break; | ||
2893 | } | 2977 | } |
2978 | this_rq->next_balance = next_balance; | ||
2894 | } | 2979 | } |
2895 | #else | 2980 | #else |
2896 | /* | 2981 | /* |
2897 | * on UP we do not need to balance between CPUs: | 2982 | * on UP we do not need to balance between CPUs: |
2898 | */ | 2983 | */ |
2899 | static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) | ||
2900 | { | ||
2901 | } | ||
2902 | static inline void idle_balance(int cpu, struct rq *rq) | 2984 | static inline void idle_balance(int cpu, struct rq *rq) |
2903 | { | 2985 | { |
2904 | } | 2986 | } |
2905 | #endif | 2987 | #endif |
2906 | 2988 | ||
2907 | static inline int wake_priority_sleeper(struct rq *rq) | 2989 | static inline void wake_priority_sleeper(struct rq *rq) |
2908 | { | 2990 | { |
2909 | int ret = 0; | ||
2910 | |||
2911 | #ifdef CONFIG_SCHED_SMT | 2991 | #ifdef CONFIG_SCHED_SMT |
2992 | if (!rq->nr_running) | ||
2993 | return; | ||
2994 | |||
2912 | spin_lock(&rq->lock); | 2995 | spin_lock(&rq->lock); |
2913 | /* | 2996 | /* |
2914 | * If an SMT sibling task has been put to sleep for priority | 2997 | * If an SMT sibling task has been put to sleep for priority |
2915 | * reasons reschedule the idle task to see if it can now run. | 2998 | * reasons reschedule the idle task to see if it can now run. |
2916 | */ | 2999 | */ |
2917 | if (rq->nr_running) { | 3000 | if (rq->nr_running) |
2918 | resched_task(rq->idle); | 3001 | resched_task(rq->idle); |
2919 | ret = 1; | ||
2920 | } | ||
2921 | spin_unlock(&rq->lock); | 3002 | spin_unlock(&rq->lock); |
2922 | #endif | 3003 | #endif |
2923 | return ret; | ||
2924 | } | 3004 | } |
2925 | 3005 | ||
2926 | DEFINE_PER_CPU(struct kernel_stat, kstat); | 3006 | DEFINE_PER_CPU(struct kernel_stat, kstat); |
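
Taken together, the rebalancing hunks above move the periodic work out of the tick: each domain is balanced once jiffies passes last_balance + interval, domains flagged SD_SERIALIZE are guarded by a global trylock so only one CPU runs the expensive cross-node pass at a time, and the earliest due time of any domain is written back as rq->next_balance. A simplified, single-threaded sketch of that loop (the bool stands in for spin_trylock(&balancing); types are illustrative):

#include <stdbool.h>

struct dom {
        struct dom *parent;
        unsigned long last_balance;     /* jiffies of the last attempt */
        unsigned long interval;         /* balance interval in jiffies */
        bool serialize;                 /* SD_SERIALIZE */
};

static bool balancing_held;             /* stand-in for the global spinlock */

static unsigned long
rebalance_walk(struct dom *sd, unsigned long now,
               void (*try_balance)(struct dom *))
{
        unsigned long next = now + 60 * 100;    /* "far away"; assumes HZ=100 */

        for (; sd; sd = sd->parent) {
                bool skip = false;

                if (sd->serialize) {
                        if (balancing_held)
                                skip = true;    /* another CPU is balancing */
                        else
                                balancing_held = true;
                }
                if (!skip && now - sd->last_balance >= sd->interval) {
                        try_balance(sd);
                        sd->last_balance = now;
                }
                if (sd->serialize && !skip)
                        balancing_held = false;
                if (sd->last_balance + sd->interval < next)
                        next = sd->last_balance + sd->interval;
        }
        return next;    /* caller stores this as rq->next_balance */
}
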
@@ -2934,7 +3014,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
2934 | static inline void | 3014 | static inline void |
2935 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) | 3015 | update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) |
2936 | { | 3016 | { |
2937 | p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); | 3017 | p->sched_time += now - p->last_ran; |
3018 | p->last_ran = rq->most_recent_timestamp = now; | ||
2938 | } | 3019 | } |
2939 | 3020 | ||
2940 | /* | 3021 | /* |
@@ -2947,8 +3028,7 @@ unsigned long long current_sched_time(const struct task_struct *p) | |||
2947 | unsigned long flags; | 3028 | unsigned long flags; |
2948 | 3029 | ||
2949 | local_irq_save(flags); | 3030 | local_irq_save(flags); |
2950 | ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); | 3031 | ns = p->sched_time + sched_clock() - p->last_ran; |
2951 | ns = p->sched_time + sched_clock() - ns; | ||
2952 | local_irq_restore(flags); | 3032 | local_irq_restore(flags); |
2953 | 3033 | ||
2954 | return ns; | 3034 | return ns; |
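
With timestamp_last_tick gone, per-task CPU time becomes an accumulator plus the time of the last bookkeeping point (p->last_ran): update_cpu_clock() folds in the elapsed time on every tick, and current_sched_time() adds whatever has elapsed since. A tiny sketch of that accumulator pattern:

struct task_clock {
        unsigned long long sched_time;  /* accumulated nanoseconds */
        unsigned long long last_ran;    /* clock value at the last update */
};

/* Per-tick bookkeeping: fold the elapsed time into the accumulator. */
static void task_clock_tick(struct task_clock *tc, unsigned long long now)
{
        tc->sched_time += now - tc->last_ran;
        tc->last_ran = now;
}

/* Read an up-to-date total without waiting for the next tick. */
static unsigned long long task_clock_read(const struct task_clock *tc,
                                          unsigned long long now)
{
        return tc->sched_time + (now - tc->last_ran);
}
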
@@ -3048,35 +3128,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
3048 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3128 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3049 | } | 3129 | } |
3050 | 3130 | ||
3051 | /* | 3131 | static void task_running_tick(struct rq *rq, struct task_struct *p) |
3052 | * This function gets called by the timer code, with HZ frequency. | ||
3053 | * We call it with interrupts disabled. | ||
3054 | * | ||
3055 | * It also gets called by the fork code, when changing the parent's | ||
3056 | * timeslices. | ||
3057 | */ | ||
3058 | void scheduler_tick(void) | ||
3059 | { | 3132 | { |
3060 | unsigned long long now = sched_clock(); | ||
3061 | struct task_struct *p = current; | ||
3062 | int cpu = smp_processor_id(); | ||
3063 | struct rq *rq = cpu_rq(cpu); | ||
3064 | |||
3065 | update_cpu_clock(p, rq, now); | ||
3066 | |||
3067 | rq->timestamp_last_tick = now; | ||
3068 | |||
3069 | if (p == rq->idle) { | ||
3070 | if (wake_priority_sleeper(rq)) | ||
3071 | goto out; | ||
3072 | rebalance_tick(cpu, rq, SCHED_IDLE); | ||
3073 | return; | ||
3074 | } | ||
3075 | |||
3076 | /* Task might have expired already, but not scheduled off yet */ | ||
3077 | if (p->array != rq->active) { | 3133 | if (p->array != rq->active) { |
3134 | /* Task has expired but was not scheduled yet */ | ||
3078 | set_tsk_need_resched(p); | 3135 | set_tsk_need_resched(p); |
3079 | goto out; | 3136 | return; |
3080 | } | 3137 | } |
3081 | spin_lock(&rq->lock); | 3138 | spin_lock(&rq->lock); |
3082 | /* | 3139 | /* |
@@ -3144,8 +3201,34 @@ void scheduler_tick(void) | |||
3144 | } | 3201 | } |
3145 | out_unlock: | 3202 | out_unlock: |
3146 | spin_unlock(&rq->lock); | 3203 | spin_unlock(&rq->lock); |
3147 | out: | 3204 | } |
3148 | rebalance_tick(cpu, rq, NOT_IDLE); | 3205 | |
3206 | /* | ||
3207 | * This function gets called by the timer code, with HZ frequency. | ||
3208 | * We call it with interrupts disabled. | ||
3209 | * | ||
3210 | * It also gets called by the fork code, when changing the parent's | ||
3211 | * timeslices. | ||
3212 | */ | ||
3213 | void scheduler_tick(void) | ||
3214 | { | ||
3215 | unsigned long long now = sched_clock(); | ||
3216 | struct task_struct *p = current; | ||
3217 | int cpu = smp_processor_id(); | ||
3218 | struct rq *rq = cpu_rq(cpu); | ||
3219 | |||
3220 | update_cpu_clock(p, rq, now); | ||
3221 | |||
3222 | if (p == rq->idle) | ||
3223 | /* Task on the idle queue */ | ||
3224 | wake_priority_sleeper(rq); | ||
3225 | else | ||
3226 | task_running_tick(rq, p); | ||
3227 | #ifdef CONFIG_SMP | ||
3228 | update_load(rq); | ||
3229 | if (time_after_eq(jiffies, rq->next_balance)) | ||
3230 | raise_softirq(SCHED_SOFTIRQ); | ||
3231 | #endif | ||
3149 | } | 3232 | } |
3150 | 3233 | ||
3151 | #ifdef CONFIG_SCHED_SMT | 3234 | #ifdef CONFIG_SCHED_SMT |
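
After this refactoring, scheduler_tick() is a thin dispatcher: update the clock, do SMT wakeup bookkeeping if the idle task is current or per-task timeslice work otherwise, then on SMP raise SCHED_SOFTIRQ once rq->next_balance is due, so the heavy balancing runs later in softirq context. A self-contained schematic of that flow (stub helpers; not the kernel's signatures):

struct rq_sketch {
        int idle_is_current;
        unsigned long next_balance;
};

static void task_running_tick_stub(struct rq_sketch *rq)    { (void)rq; }
static void wake_priority_sleeper_stub(struct rq_sketch *rq) { (void)rq; }
static void raise_rebalance_softirq_stub(void)              { }

static void tick_sketch(struct rq_sketch *rq, unsigned long now_jiffies)
{
        /* update_cpu_clock() would run here and record the timestamp */

        if (rq->idle_is_current)
                wake_priority_sleeper_stub(rq);         /* SMT bookkeeping */
        else
                task_running_tick_stub(rq);             /* timeslice work */

        if (now_jiffies >= rq->next_balance)
                raise_rebalance_softirq_stub();         /* defer balancing */
}
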
@@ -3291,7 +3374,8 @@ void fastcall add_preempt_count(int val) | |||
3291 | /* | 3374 | /* |
3292 | * Spinlock count overflowing soon? | 3375 | * Spinlock count overflowing soon? |
3293 | */ | 3376 | */ |
3294 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); | 3377 | DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= |
3378 | PREEMPT_MASK - 10); | ||
3295 | } | 3379 | } |
3296 | EXPORT_SYMBOL(add_preempt_count); | 3380 | EXPORT_SYMBOL(add_preempt_count); |
3297 | 3381 | ||
@@ -4990,8 +5074,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4990 | * afterwards, and pretending it was a local activate. | 5074 | * afterwards, and pretending it was a local activate. |
4991 | * This way is cleaner and logically correct. | 5075 | * This way is cleaner and logically correct. |
4992 | */ | 5076 | */ |
4993 | p->timestamp = p->timestamp - rq_src->timestamp_last_tick | 5077 | p->timestamp = p->timestamp - rq_src->most_recent_timestamp |
4994 | + rq_dest->timestamp_last_tick; | 5078 | + rq_dest->most_recent_timestamp; |
4995 | deactivate_task(p, rq_src); | 5079 | deactivate_task(p, rq_src); |
4996 | __activate_task(p, rq_dest); | 5080 | __activate_task(p, rq_dest); |
4997 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 5081 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
@@ -5067,7 +5151,10 @@ wait_to_die: | |||
5067 | } | 5151 | } |
5068 | 5152 | ||
5069 | #ifdef CONFIG_HOTPLUG_CPU | 5153 | #ifdef CONFIG_HOTPLUG_CPU |
5070 | /* Figure out where task on dead CPU should go, use force if neccessary. */ | 5154 | /* |
5155 | * Figure out where task on dead CPU should go, use force if necessary. | ||
5156 | * NOTE: interrupts should be disabled by the caller | ||
5157 | */ | ||
5071 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5158 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) |
5072 | { | 5159 | { |
5073 | unsigned long flags; | 5160 | unsigned long flags; |
@@ -5187,6 +5274,7 @@ void idle_task_exit(void) | |||
5187 | mmdrop(mm); | 5274 | mmdrop(mm); |
5188 | } | 5275 | } |
5189 | 5276 | ||
5277 | /* called under rq->lock with disabled interrupts */ | ||
5190 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | 5278 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) |
5191 | { | 5279 | { |
5192 | struct rq *rq = cpu_rq(dead_cpu); | 5280 | struct rq *rq = cpu_rq(dead_cpu); |
@@ -5203,10 +5291,11 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
5203 | * Drop lock around migration; if someone else moves it, | 5291 | * Drop lock around migration; if someone else moves it, |
5204 | * that's OK. No task can be added to this CPU, so iteration is | 5292 | * that's OK. No task can be added to this CPU, so iteration is |
5205 | * fine. | 5293 | * fine. |
5294 | * NOTE: interrupts should be left disabled --dev@ | ||
5206 | */ | 5295 | */ |
5207 | spin_unlock_irq(&rq->lock); | 5296 | spin_unlock(&rq->lock); |
5208 | move_task_off_dead_cpu(dead_cpu, p); | 5297 | move_task_off_dead_cpu(dead_cpu, p); |
5209 | spin_lock_irq(&rq->lock); | 5298 | spin_lock(&rq->lock); |
5210 | 5299 | ||
5211 | put_task_struct(p); | 5300 | put_task_struct(p); |
5212 | } | 5301 | } |
@@ -5359,16 +5448,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5359 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 5448 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
5360 | printk("does not load-balance\n"); | 5449 | printk("does not load-balance\n"); |
5361 | if (sd->parent) | 5450 | if (sd->parent) |
5362 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); | 5451 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" |
5452 | " has parent"); | ||
5363 | break; | 5453 | break; |
5364 | } | 5454 | } |
5365 | 5455 | ||
5366 | printk("span %s\n", str); | 5456 | printk("span %s\n", str); |
5367 | 5457 | ||
5368 | if (!cpu_isset(cpu, sd->span)) | 5458 | if (!cpu_isset(cpu, sd->span)) |
5369 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); | 5459 | printk(KERN_ERR "ERROR: domain->span does not contain " |
5460 | "CPU%d\n", cpu); | ||
5370 | if (!cpu_isset(cpu, group->cpumask)) | 5461 | if (!cpu_isset(cpu, group->cpumask)) |
5371 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); | 5462 | printk(KERN_ERR "ERROR: domain->groups does not contain" |
5463 | " CPU%d\n", cpu); | ||
5372 | 5464 | ||
5373 | printk(KERN_DEBUG); | 5465 | printk(KERN_DEBUG); |
5374 | for (i = 0; i < level + 2; i++) | 5466 | for (i = 0; i < level + 2; i++) |
@@ -5383,7 +5475,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5383 | 5475 | ||
5384 | if (!group->cpu_power) { | 5476 | if (!group->cpu_power) { |
5385 | printk("\n"); | 5477 | printk("\n"); |
5386 | printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); | 5478 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5479 | "set\n"); | ||
5387 | } | 5480 | } |
5388 | 5481 | ||
5389 | if (!cpus_weight(group->cpumask)) { | 5482 | if (!cpus_weight(group->cpumask)) { |
@@ -5406,15 +5499,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5406 | printk("\n"); | 5499 | printk("\n"); |
5407 | 5500 | ||
5408 | if (!cpus_equal(sd->span, groupmask)) | 5501 | if (!cpus_equal(sd->span, groupmask)) |
5409 | printk(KERN_ERR "ERROR: groups don't span domain->span\n"); | 5502 | printk(KERN_ERR "ERROR: groups don't span " |
5503 | "domain->span\n"); | ||
5410 | 5504 | ||
5411 | level++; | 5505 | level++; |
5412 | sd = sd->parent; | 5506 | sd = sd->parent; |
5507 | if (!sd) | ||
5508 | continue; | ||
5413 | 5509 | ||
5414 | if (sd) { | 5510 | if (!cpus_subset(groupmask, sd->span)) |
5415 | if (!cpus_subset(groupmask, sd->span)) | 5511 | printk(KERN_ERR "ERROR: parent span is not a superset " |
5416 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); | 5512 | "of domain->span\n"); |
5417 | } | ||
5418 | 5513 | ||
5419 | } while (sd); | 5514 | } while (sd); |
5420 | } | 5515 | } |
@@ -5528,28 +5623,27 @@ static int __init isolated_cpu_setup(char *str) | |||
5528 | __setup ("isolcpus=", isolated_cpu_setup); | 5623 | __setup ("isolcpus=", isolated_cpu_setup); |
5529 | 5624 | ||
5530 | /* | 5625 | /* |
5531 | * init_sched_build_groups takes an array of groups, the cpumask we wish | 5626 | * init_sched_build_groups takes the cpumask we wish to span, and a pointer |
5532 | * to span, and a pointer to a function which identifies what group a CPU | 5627 | * to a function which identifies what group (along with its sched_group) a CPU |
5533 | * belongs to. The return value of group_fn must be a valid index into the | 5628 | * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS |
5534 | * groups[] array, and must be >= 0 and < NR_CPUS (due to the fact that we | 5629 | * (due to the fact that we keep track of groups covered with a cpumask_t). |
5535 | * keep track of groups covered with a cpumask_t). | ||
5536 | * | 5630 | * |
5537 | * init_sched_build_groups will build a circular linked list of the groups | 5631 | * init_sched_build_groups will build a circular linked list of the groups |
5538 | * covered by the given span, and will set each group's ->cpumask correctly, | 5632 | * covered by the given span, and will set each group's ->cpumask correctly, |
5539 | * and ->cpu_power to 0. | 5633 | * and ->cpu_power to 0. |
5540 | */ | 5634 | */ |
5541 | static void | 5635 | static void |
5542 | init_sched_build_groups(struct sched_group groups[], cpumask_t span, | 5636 | init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, |
5543 | const cpumask_t *cpu_map, | 5637 | int (*group_fn)(int cpu, const cpumask_t *cpu_map, |
5544 | int (*group_fn)(int cpu, const cpumask_t *cpu_map)) | 5638 | struct sched_group **sg)) |
5545 | { | 5639 | { |
5546 | struct sched_group *first = NULL, *last = NULL; | 5640 | struct sched_group *first = NULL, *last = NULL; |
5547 | cpumask_t covered = CPU_MASK_NONE; | 5641 | cpumask_t covered = CPU_MASK_NONE; |
5548 | int i; | 5642 | int i; |
5549 | 5643 | ||
5550 | for_each_cpu_mask(i, span) { | 5644 | for_each_cpu_mask(i, span) { |
5551 | int group = group_fn(i, cpu_map); | 5645 | struct sched_group *sg; |
5552 | struct sched_group *sg = &groups[group]; | 5646 | int group = group_fn(i, cpu_map, &sg); |
5553 | int j; | 5647 | int j; |
5554 | 5648 | ||
5555 | if (cpu_isset(i, covered)) | 5649 | if (cpu_isset(i, covered)) |
@@ -5559,7 +5653,7 @@ init_sched_build_groups(struct sched_group groups[], cpumask_t span, | |||
5559 | sg->cpu_power = 0; | 5653 | sg->cpu_power = 0; |
5560 | 5654 | ||
5561 | for_each_cpu_mask(j, span) { | 5655 | for_each_cpu_mask(j, span) { |
5562 | if (group_fn(j, cpu_map) != group) | 5656 | if (group_fn(j, cpu_map, NULL) != group) |
5563 | continue; | 5657 | continue; |
5564 | 5658 | ||
5565 | cpu_set(j, covered); | 5659 | cpu_set(j, covered); |
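
init_sched_build_groups(), touched above, walks the span once, fills each distinct group's cpumask, and links the groups into a circular singly linked list. A small free-standing sketch of building such a ring from a grouping function (a bitmask stands in for cpumask_t; groups[] must be indexable by every id group_of() returns):

#include <stddef.h>

#define NCPUS 8

struct group_sketch {
        struct group_sketch *next;      /* circular list */
        unsigned int mask;              /* bit i set => CPU i is a member */
};

static struct group_sketch *
build_group_ring(struct group_sketch *groups, int (*group_of)(int cpu))
{
        struct group_sketch *first = NULL, *last = NULL;
        unsigned int covered = 0;
        int i, j;

        for (i = 0; i < NCPUS; i++) {
                struct group_sketch *g = &groups[group_of(i)];

                if (covered & (1u << i))
                        continue;
                g->mask = 0;
                for (j = 0; j < NCPUS; j++) {
                        if (group_of(j) != group_of(i))
                                continue;
                        covered |= 1u << j;
                        g->mask |= 1u << j;
                }
                if (!first)
                        first = g;
                if (last)
                        last->next = g;
                last = g;
        }
        if (last)
                last->next = first;     /* close the ring */
        return first;
}
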
@@ -5733,8 +5827,9 @@ __setup("max_cache_size=", setup_max_cache_size); | |||
5733 | */ | 5827 | */ |
5734 | static void touch_cache(void *__cache, unsigned long __size) | 5828 | static void touch_cache(void *__cache, unsigned long __size) |
5735 | { | 5829 | { |
5736 | unsigned long size = __size/sizeof(long), chunk1 = size/3, | 5830 | unsigned long size = __size / sizeof(long); |
5737 | chunk2 = 2*size/3; | 5831 | unsigned long chunk1 = size / 3; |
5832 | unsigned long chunk2 = 2 * size / 3; | ||
5738 | unsigned long *cache = __cache; | 5833 | unsigned long *cache = __cache; |
5739 | int i; | 5834 | int i; |
5740 | 5835 | ||
@@ -5843,11 +5938,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
5843 | */ | 5938 | */ |
5844 | measure_one(cache, size, cpu1, cpu2); | 5939 | measure_one(cache, size, cpu1, cpu2); |
5845 | for (i = 0; i < ITERATIONS; i++) | 5940 | for (i = 0; i < ITERATIONS; i++) |
5846 | cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); | 5941 | cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); |
5847 | 5942 | ||
5848 | measure_one(cache, size, cpu2, cpu1); | 5943 | measure_one(cache, size, cpu2, cpu1); |
5849 | for (i = 0; i < ITERATIONS; i++) | 5944 | for (i = 0; i < ITERATIONS; i++) |
5850 | cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); | 5945 | cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); |
5851 | 5946 | ||
5852 | /* | 5947 | /* |
5853 | * (We measure the non-migrating [cached] cost on both | 5948 | * (We measure the non-migrating [cached] cost on both |
@@ -5857,17 +5952,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) | |||
5857 | 5952 | ||
5858 | measure_one(cache, size, cpu1, cpu1); | 5953 | measure_one(cache, size, cpu1, cpu1); |
5859 | for (i = 0; i < ITERATIONS; i++) | 5954 | for (i = 0; i < ITERATIONS; i++) |
5860 | cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); | 5955 | cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); |
5861 | 5956 | ||
5862 | measure_one(cache, size, cpu2, cpu2); | 5957 | measure_one(cache, size, cpu2, cpu2); |
5863 | for (i = 0; i < ITERATIONS; i++) | 5958 | for (i = 0; i < ITERATIONS; i++) |
5864 | cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); | 5959 | cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); |
5865 | 5960 | ||
5866 | /* | 5961 | /* |
5867 | * Get the per-iteration migration cost: | 5962 | * Get the per-iteration migration cost: |
5868 | */ | 5963 | */ |
5869 | do_div(cost1, 2*ITERATIONS); | 5964 | do_div(cost1, 2 * ITERATIONS); |
5870 | do_div(cost2, 2*ITERATIONS); | 5965 | do_div(cost2, 2 * ITERATIONS); |
5871 | 5966 | ||
5872 | return cost1 - cost2; | 5967 | return cost1 - cost2; |
5873 | } | 5968 | } |
@@ -5905,7 +6000,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
5905 | */ | 6000 | */ |
5906 | cache = vmalloc(max_size); | 6001 | cache = vmalloc(max_size); |
5907 | if (!cache) { | 6002 | if (!cache) { |
5908 | printk("could not vmalloc %d bytes for cache!\n", 2*max_size); | 6003 | printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); |
5909 | return 1000000; /* return 1 msec on very small boxen */ | 6004 | return 1000000; /* return 1 msec on very small boxen */ |
5910 | } | 6005 | } |
5911 | 6006 | ||
@@ -5930,7 +6025,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) | |||
5930 | avg_fluct = (avg_fluct + fluct)/2; | 6025 | avg_fluct = (avg_fluct + fluct)/2; |
5931 | 6026 | ||
5932 | if (migration_debug) | 6027 | if (migration_debug) |
5933 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", | 6028 | printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " |
6029 | "(%8Ld %8Ld)\n", | ||
5934 | cpu1, cpu2, size, | 6030 | cpu1, cpu2, size, |
5935 | (long)cost / 1000000, | 6031 | (long)cost / 1000000, |
5936 | ((long)cost / 100000) % 10, | 6032 | ((long)cost / 100000) % 10, |
@@ -6025,20 +6121,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) | |||
6025 | -1 | 6121 | -1 |
6026 | #endif | 6122 | #endif |
6027 | ); | 6123 | ); |
6028 | if (system_state == SYSTEM_BOOTING) { | 6124 | if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { |
6029 | if (num_online_cpus() > 1) { | 6125 | printk("migration_cost="); |
6030 | printk("migration_cost="); | 6126 | for (distance = 0; distance <= max_distance; distance++) { |
6031 | for (distance = 0; distance <= max_distance; distance++) { | 6127 | if (distance) |
6032 | if (distance) | 6128 | printk(","); |
6033 | printk(","); | 6129 | printk("%ld", (long)migration_cost[distance] / 1000); |
6034 | printk("%ld", (long)migration_cost[distance] / 1000); | ||
6035 | } | ||
6036 | printk("\n"); | ||
6037 | } | 6130 | } |
6131 | printk("\n"); | ||
6038 | } | 6132 | } |
6039 | j1 = jiffies; | 6133 | j1 = jiffies; |
6040 | if (migration_debug) | 6134 | if (migration_debug) |
6041 | printk("migration: %ld seconds\n", (j1-j0)/HZ); | 6135 | printk("migration: %ld seconds\n", (j1-j0) / HZ); |
6042 | 6136 | ||
6043 | /* | 6137 | /* |
6044 | * Move back to the original CPU. NUMA-Q gets confused | 6138 | * Move back to the original CPU. NUMA-Q gets confused |
@@ -6135,10 +6229,13 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | |||
6135 | */ | 6229 | */ |
6136 | #ifdef CONFIG_SCHED_SMT | 6230 | #ifdef CONFIG_SCHED_SMT |
6137 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6231 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
6138 | static struct sched_group sched_group_cpus[NR_CPUS]; | 6232 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
6139 | 6233 | ||
6140 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | 6234 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, |
6235 | struct sched_group **sg) | ||
6141 | { | 6236 | { |
6237 | if (sg) | ||
6238 | *sg = &per_cpu(sched_group_cpus, cpu); | ||
6142 | return cpu; | 6239 | return cpu; |
6143 | } | 6240 | } |
6144 | #endif | 6241 | #endif |
@@ -6148,39 +6245,52 @@ static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map) | |||
6148 | */ | 6245 | */ |
6149 | #ifdef CONFIG_SCHED_MC | 6246 | #ifdef CONFIG_SCHED_MC |
6150 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 6247 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
6151 | static struct sched_group sched_group_core[NR_CPUS]; | 6248 | static DEFINE_PER_CPU(struct sched_group, sched_group_core); |
6152 | #endif | 6249 | #endif |
6153 | 6250 | ||
6154 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6251 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6155 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6252 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
6253 | struct sched_group **sg) | ||
6156 | { | 6254 | { |
6255 | int group; | ||
6157 | cpumask_t mask = cpu_sibling_map[cpu]; | 6256 | cpumask_t mask = cpu_sibling_map[cpu]; |
6158 | cpus_and(mask, mask, *cpu_map); | 6257 | cpus_and(mask, mask, *cpu_map); |
6159 | return first_cpu(mask); | 6258 | group = first_cpu(mask); |
6259 | if (sg) | ||
6260 | *sg = &per_cpu(sched_group_core, group); | ||
6261 | return group; | ||
6160 | } | 6262 | } |
6161 | #elif defined(CONFIG_SCHED_MC) | 6263 | #elif defined(CONFIG_SCHED_MC) |
6162 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map) | 6264 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, |
6265 | struct sched_group **sg) | ||
6163 | { | 6266 | { |
6267 | if (sg) | ||
6268 | *sg = &per_cpu(sched_group_core, cpu); | ||
6164 | return cpu; | 6269 | return cpu; |
6165 | } | 6270 | } |
6166 | #endif | 6271 | #endif |
6167 | 6272 | ||
6168 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6273 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
6169 | static struct sched_group sched_group_phys[NR_CPUS]; | 6274 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
6170 | 6275 | ||
6171 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map) | 6276 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, |
6277 | struct sched_group **sg) | ||
6172 | { | 6278 | { |
6279 | int group; | ||
6173 | #ifdef CONFIG_SCHED_MC | 6280 | #ifdef CONFIG_SCHED_MC |
6174 | cpumask_t mask = cpu_coregroup_map(cpu); | 6281 | cpumask_t mask = cpu_coregroup_map(cpu); |
6175 | cpus_and(mask, mask, *cpu_map); | 6282 | cpus_and(mask, mask, *cpu_map); |
6176 | return first_cpu(mask); | 6283 | group = first_cpu(mask); |
6177 | #elif defined(CONFIG_SCHED_SMT) | 6284 | #elif defined(CONFIG_SCHED_SMT) |
6178 | cpumask_t mask = cpu_sibling_map[cpu]; | 6285 | cpumask_t mask = cpu_sibling_map[cpu]; |
6179 | cpus_and(mask, mask, *cpu_map); | 6286 | cpus_and(mask, mask, *cpu_map); |
6180 | return first_cpu(mask); | 6287 | group = first_cpu(mask); |
6181 | #else | 6288 | #else |
6182 | return cpu; | 6289 | group = cpu; |
6183 | #endif | 6290 | #endif |
6291 | if (sg) | ||
6292 | *sg = &per_cpu(sched_group_phys, group); | ||
6293 | return group; | ||
6184 | } | 6294 | } |
6185 | 6295 | ||
6186 | #ifdef CONFIG_NUMA | 6296 | #ifdef CONFIG_NUMA |
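
The converted cpu_to_*_group() helpers now return the representative CPU of a group and, via an optional out-parameter, a pointer to the per-CPU sched_group backing it, instead of indexing a static NR_CPUS array. A condensed sketch of the pattern, with per-CPU storage modeled as a plain array:

#define NCPUS_SKETCH 8

struct sg_sketch {
        unsigned int cpu_power;
};

static struct sg_sketch sg_storage[NCPUS_SKETCH]; /* DEFINE_PER_CPU stand-in */

/*
 * Map a CPU to its group id (here: the first CPU of a sibling pair) and,
 * if the caller asked for it, to the structure backing that group.
 */
static int cpu_to_group_sketch(int cpu, struct sg_sketch **sg)
{
        int group = cpu & ~1;

        if (sg)
                *sg = &sg_storage[group];
        return group;
}
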
@@ -6193,12 +6303,22 @@ static DEFINE_PER_CPU(struct sched_domain, node_domains); | |||
6193 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; | 6303 | static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; |
6194 | 6304 | ||
6195 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); | 6305 | static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); |
6196 | static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; | 6306 | static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes); |
6197 | 6307 | ||
6198 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map) | 6308 | static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map, |
6309 | struct sched_group **sg) | ||
6199 | { | 6310 | { |
6200 | return cpu_to_node(cpu); | 6311 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu)); |
6312 | int group; | ||
6313 | |||
6314 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6315 | group = first_cpu(nodemask); | ||
6316 | |||
6317 | if (sg) | ||
6318 | *sg = &per_cpu(sched_group_allnodes, group); | ||
6319 | return group; | ||
6201 | } | 6320 | } |
6321 | |||
6202 | static void init_numa_sched_groups_power(struct sched_group *group_head) | 6322 | static void init_numa_sched_groups_power(struct sched_group *group_head) |
6203 | { | 6323 | { |
6204 | struct sched_group *sg = group_head; | 6324 | struct sched_group *sg = group_head; |
@@ -6234,16 +6354,9 @@ static void free_sched_groups(const cpumask_t *cpu_map) | |||
6234 | int cpu, i; | 6354 | int cpu, i; |
6235 | 6355 | ||
6236 | for_each_cpu_mask(cpu, *cpu_map) { | 6356 | for_each_cpu_mask(cpu, *cpu_map) { |
6237 | struct sched_group *sched_group_allnodes | ||
6238 | = sched_group_allnodes_bycpu[cpu]; | ||
6239 | struct sched_group **sched_group_nodes | 6357 | struct sched_group **sched_group_nodes |
6240 | = sched_group_nodes_bycpu[cpu]; | 6358 | = sched_group_nodes_bycpu[cpu]; |
6241 | 6359 | ||
6242 | if (sched_group_allnodes) { | ||
6243 | kfree(sched_group_allnodes); | ||
6244 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
6245 | } | ||
6246 | |||
6247 | if (!sched_group_nodes) | 6360 | if (!sched_group_nodes) |
6248 | continue; | 6361 | continue; |
6249 | 6362 | ||
@@ -6337,7 +6450,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6337 | struct sched_domain *sd; | 6450 | struct sched_domain *sd; |
6338 | #ifdef CONFIG_NUMA | 6451 | #ifdef CONFIG_NUMA |
6339 | struct sched_group **sched_group_nodes = NULL; | 6452 | struct sched_group **sched_group_nodes = NULL; |
6340 | struct sched_group *sched_group_allnodes = NULL; | 6453 | int sd_allnodes = 0; |
6341 | 6454 | ||
6342 | /* | 6455 | /* |
6343 | * Allocate the per-node list of sched groups | 6456 | * Allocate the per-node list of sched groups |
@@ -6355,7 +6468,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6355 | * Set up domains for cpus specified by the cpu_map. | 6468 | * Set up domains for cpus specified by the cpu_map. |
6356 | */ | 6469 | */ |
6357 | for_each_cpu_mask(i, *cpu_map) { | 6470 | for_each_cpu_mask(i, *cpu_map) { |
6358 | int group; | ||
6359 | struct sched_domain *sd = NULL, *p; | 6471 | struct sched_domain *sd = NULL, *p; |
6360 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); | 6472 | cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); |
6361 | 6473 | ||
@@ -6364,26 +6476,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6364 | #ifdef CONFIG_NUMA | 6476 | #ifdef CONFIG_NUMA |
6365 | if (cpus_weight(*cpu_map) | 6477 | if (cpus_weight(*cpu_map) |
6366 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { | 6478 | > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { |
6367 | if (!sched_group_allnodes) { | ||
6368 | sched_group_allnodes | ||
6369 | = kmalloc_node(sizeof(struct sched_group) | ||
6370 | * MAX_NUMNODES, | ||
6371 | GFP_KERNEL, | ||
6372 | cpu_to_node(i)); | ||
6373 | if (!sched_group_allnodes) { | ||
6374 | printk(KERN_WARNING | ||
6375 | "Can not alloc allnodes sched group\n"); | ||
6376 | goto error; | ||
6377 | } | ||
6378 | sched_group_allnodes_bycpu[i] | ||
6379 | = sched_group_allnodes; | ||
6380 | } | ||
6381 | sd = &per_cpu(allnodes_domains, i); | 6479 | sd = &per_cpu(allnodes_domains, i); |
6382 | *sd = SD_ALLNODES_INIT; | 6480 | *sd = SD_ALLNODES_INIT; |
6383 | sd->span = *cpu_map; | 6481 | sd->span = *cpu_map; |
6384 | group = cpu_to_allnodes_group(i, cpu_map); | 6482 | cpu_to_allnodes_group(i, cpu_map, &sd->groups); |
6385 | sd->groups = &sched_group_allnodes[group]; | ||
6386 | p = sd; | 6483 | p = sd; |
6484 | sd_allnodes = 1; | ||
6387 | } else | 6485 | } else |
6388 | p = NULL; | 6486 | p = NULL; |
6389 | 6487 | ||
@@ -6398,36 +6496,33 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6398 | 6496 | ||
6399 | p = sd; | 6497 | p = sd; |
6400 | sd = &per_cpu(phys_domains, i); | 6498 | sd = &per_cpu(phys_domains, i); |
6401 | group = cpu_to_phys_group(i, cpu_map); | ||
6402 | *sd = SD_CPU_INIT; | 6499 | *sd = SD_CPU_INIT; |
6403 | sd->span = nodemask; | 6500 | sd->span = nodemask; |
6404 | sd->parent = p; | 6501 | sd->parent = p; |
6405 | if (p) | 6502 | if (p) |
6406 | p->child = sd; | 6503 | p->child = sd; |
6407 | sd->groups = &sched_group_phys[group]; | 6504 | cpu_to_phys_group(i, cpu_map, &sd->groups); |
6408 | 6505 | ||
6409 | #ifdef CONFIG_SCHED_MC | 6506 | #ifdef CONFIG_SCHED_MC |
6410 | p = sd; | 6507 | p = sd; |
6411 | sd = &per_cpu(core_domains, i); | 6508 | sd = &per_cpu(core_domains, i); |
6412 | group = cpu_to_core_group(i, cpu_map); | ||
6413 | *sd = SD_MC_INIT; | 6509 | *sd = SD_MC_INIT; |
6414 | sd->span = cpu_coregroup_map(i); | 6510 | sd->span = cpu_coregroup_map(i); |
6415 | cpus_and(sd->span, sd->span, *cpu_map); | 6511 | cpus_and(sd->span, sd->span, *cpu_map); |
6416 | sd->parent = p; | 6512 | sd->parent = p; |
6417 | p->child = sd; | 6513 | p->child = sd; |
6418 | sd->groups = &sched_group_core[group]; | 6514 | cpu_to_core_group(i, cpu_map, &sd->groups); |
6419 | #endif | 6515 | #endif |
6420 | 6516 | ||
6421 | #ifdef CONFIG_SCHED_SMT | 6517 | #ifdef CONFIG_SCHED_SMT |
6422 | p = sd; | 6518 | p = sd; |
6423 | sd = &per_cpu(cpu_domains, i); | 6519 | sd = &per_cpu(cpu_domains, i); |
6424 | group = cpu_to_cpu_group(i, cpu_map); | ||
6425 | *sd = SD_SIBLING_INIT; | 6520 | *sd = SD_SIBLING_INIT; |
6426 | sd->span = cpu_sibling_map[i]; | 6521 | sd->span = cpu_sibling_map[i]; |
6427 | cpus_and(sd->span, sd->span, *cpu_map); | 6522 | cpus_and(sd->span, sd->span, *cpu_map); |
6428 | sd->parent = p; | 6523 | sd->parent = p; |
6429 | p->child = sd; | 6524 | p->child = sd; |
6430 | sd->groups = &sched_group_cpus[group]; | 6525 | cpu_to_cpu_group(i, cpu_map, &sd->groups); |
6431 | #endif | 6526 | #endif |
6432 | } | 6527 | } |
6433 | 6528 | ||
@@ -6439,8 +6534,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6439 | if (i != first_cpu(this_sibling_map)) | 6534 | if (i != first_cpu(this_sibling_map)) |
6440 | continue; | 6535 | continue; |
6441 | 6536 | ||
6442 | init_sched_build_groups(sched_group_cpus, this_sibling_map, | 6537 | init_sched_build_groups(this_sibling_map, cpu_map, &cpu_to_cpu_group); |
6443 | cpu_map, &cpu_to_cpu_group); | ||
6444 | } | 6538 | } |
6445 | #endif | 6539 | #endif |
6446 | 6540 | ||
@@ -6451,8 +6545,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6451 | cpus_and(this_core_map, this_core_map, *cpu_map); | 6545 | cpus_and(this_core_map, this_core_map, *cpu_map); |
6452 | if (i != first_cpu(this_core_map)) | 6546 | if (i != first_cpu(this_core_map)) |
6453 | continue; | 6547 | continue; |
6454 | init_sched_build_groups(sched_group_core, this_core_map, | 6548 | init_sched_build_groups(this_core_map, cpu_map, &cpu_to_core_group); |
6455 | cpu_map, &cpu_to_core_group); | ||
6456 | } | 6549 | } |
6457 | #endif | 6550 | #endif |
6458 | 6551 | ||
@@ -6465,15 +6558,13 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6465 | if (cpus_empty(nodemask)) | 6558 | if (cpus_empty(nodemask)) |
6466 | continue; | 6559 | continue; |
6467 | 6560 | ||
6468 | init_sched_build_groups(sched_group_phys, nodemask, | 6561 | init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group); |
6469 | cpu_map, &cpu_to_phys_group); | ||
6470 | } | 6562 | } |
6471 | 6563 | ||
6472 | #ifdef CONFIG_NUMA | 6564 | #ifdef CONFIG_NUMA |
6473 | /* Set up node groups */ | 6565 | /* Set up node groups */ |
6474 | if (sched_group_allnodes) | 6566 | if (sd_allnodes) |
6475 | init_sched_build_groups(sched_group_allnodes, *cpu_map, | 6567 | init_sched_build_groups(*cpu_map, cpu_map, &cpu_to_allnodes_group); |
6476 | cpu_map, &cpu_to_allnodes_group); | ||
6477 | 6568 | ||
6478 | for (i = 0; i < MAX_NUMNODES; i++) { | 6569 | for (i = 0; i < MAX_NUMNODES; i++) { |
6479 | /* Set up node groups */ | 6570 | /* Set up node groups */ |
@@ -6565,10 +6656,10 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6565 | for (i = 0; i < MAX_NUMNODES; i++) | 6656 | for (i = 0; i < MAX_NUMNODES; i++) |
6566 | init_numa_sched_groups_power(sched_group_nodes[i]); | 6657 | init_numa_sched_groups_power(sched_group_nodes[i]); |
6567 | 6658 | ||
6568 | if (sched_group_allnodes) { | 6659 | if (sd_allnodes) { |
6569 | int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map); | 6660 | struct sched_group *sg; |
6570 | struct sched_group *sg = &sched_group_allnodes[group]; | ||
6571 | 6661 | ||
6662 | cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg); | ||
6572 | init_numa_sched_groups_power(sg); | 6663 | init_numa_sched_groups_power(sg); |
6573 | } | 6664 | } |
6574 | #endif | 6665 | #endif |
@@ -6847,6 +6938,10 @@ void __init sched_init(void) | |||
6847 | 6938 | ||
6848 | set_load_weight(&init_task); | 6939 | set_load_weight(&init_task); |
6849 | 6940 | ||
6941 | #ifdef CONFIG_SMP | ||
6942 | open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); | ||
6943 | #endif | ||
6944 | |||
6850 | #ifdef CONFIG_RT_MUTEXES | 6945 | #ifdef CONFIG_RT_MUTEXES |
6851 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); | 6946 | plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); |
6852 | #endif | 6947 | #endif |
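End of the sched.c hunks. The sched_init() addition registers load balancing as a softirq; in this kernel generation open_softirq() still takes a third data argument (NULL here), and softirq handlers take a struct softirq_action pointer. run_rebalance_domains() itself is added elsewhere in the patch and is not reproduced in this hunk; a sketch of the shape the registration implies:

/* Sketch only — the real handler lives elsewhere in sched.c: */
static void run_rebalance_domains(struct softirq_action *h)
{
        /* rebalance runqueues across sched domains from softirq context */
}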
diff --git a/kernel/signal.c b/kernel/signal.c index ec81defde3..1921ffdc5e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -24,6 +24,9 @@ | |||
24 | #include <linux/signal.h> | 24 | #include <linux/signal.h> |
25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
26 | #include <linux/freezer.h> | 26 | #include <linux/freezer.h> |
27 | #include <linux/pid_namespace.h> | ||
28 | #include <linux/nsproxy.h> | ||
29 | |||
27 | #include <asm/param.h> | 30 | #include <asm/param.h> |
28 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
29 | #include <asm/unistd.h> | 32 | #include <asm/unistd.h> |
@@ -583,7 +586,7 @@ static int check_kill_permission(int sig, struct siginfo *info, | |||
583 | error = -EPERM; | 586 | error = -EPERM; |
584 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) | 587 | if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info))) |
585 | && ((sig != SIGCONT) || | 588 | && ((sig != SIGCONT) || |
586 | (current->signal->session != t->signal->session)) | 589 | (process_session(current) != process_session(t))) |
587 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) | 590 | && (current->euid ^ t->suid) && (current->euid ^ t->uid) |
588 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) | 591 | && (current->uid ^ t->suid) && (current->uid ^ t->uid) |
589 | && !capable(CAP_KILL)) | 592 | && !capable(CAP_KILL)) |
@@ -1877,8 +1880,12 @@ relock: | |||
1877 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ | 1880 | if (sig_kernel_ignore(signr)) /* Default is nothing. */ |
1878 | continue; | 1881 | continue; |
1879 | 1882 | ||
1880 | /* Init gets no signals it doesn't want. */ | 1883 | /* |
1881 | if (current == child_reaper) | 1884 | * Init of a pid space gets no signals it doesn't want from |
1885 | * within that pid space. It can of course get signals from | ||
1886 | * its parent pid space. | ||
1887 | */ | ||
1888 | if (current == child_reaper(current)) | ||
1882 | continue; | 1889 | continue; |
1883 | 1890 | ||
1884 | if (sig_kernel_stop(signr)) { | 1891 | if (sig_kernel_stop(signr)) { |
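The signal.c changes swap direct ->signal->session access and the bare child_reaper global for helpers that are pid-namespace aware. Roughly what those helpers reduce to in a single-namespace kernel — field and symbol names here are inferred from context, not quoted from the headers:

/* Inferred sketch, not the headers' verbatim definitions: */
static inline pid_t process_session(struct task_struct *tsk)
{
        return tsk->signal->__session;          /* session id of the thread group */
}

static inline struct task_struct *child_reaper(struct task_struct *tsk)
{
        /* init of tsk's pid namespace; with a single namespace this is
         * the same global init the old code compared against */
        return init_pid_ns.child_reaper;
}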
diff --git a/kernel/sys.c b/kernel/sys.c index a0c1a29a50..c7675c1bfd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1381,7 +1381,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1381 | 1381 | ||
1382 | if (p->real_parent == group_leader) { | 1382 | if (p->real_parent == group_leader) { |
1383 | err = -EPERM; | 1383 | err = -EPERM; |
1384 | if (p->signal->session != group_leader->signal->session) | 1384 | if (process_session(p) != process_session(group_leader)) |
1385 | goto out; | 1385 | goto out; |
1386 | err = -EACCES; | 1386 | err = -EACCES; |
1387 | if (p->did_exec) | 1387 | if (p->did_exec) |
@@ -1397,16 +1397,13 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) | |||
1397 | goto out; | 1397 | goto out; |
1398 | 1398 | ||
1399 | if (pgid != pid) { | 1399 | if (pgid != pid) { |
1400 | struct task_struct *p; | 1400 | struct task_struct *g = |
1401 | find_task_by_pid_type(PIDTYPE_PGID, pgid); | ||
1401 | 1402 | ||
1402 | do_each_task_pid(pgid, PIDTYPE_PGID, p) { | 1403 | if (!g || process_session(g) != process_session(group_leader)) |
1403 | if (p->signal->session == group_leader->signal->session) | 1404 | goto out; |
1404 | goto ok_pgid; | ||
1405 | } while_each_task_pid(pgid, PIDTYPE_PGID, p); | ||
1406 | goto out; | ||
1407 | } | 1405 | } |
1408 | 1406 | ||
1409 | ok_pgid: | ||
1410 | err = security_task_setpgid(p, pgid); | 1407 | err = security_task_setpgid(p, pgid); |
1411 | if (err) | 1408 | if (err) |
1412 | goto out; | 1409 | goto out; |
@@ -1459,7 +1456,7 @@ asmlinkage long sys_getpgrp(void) | |||
1459 | asmlinkage long sys_getsid(pid_t pid) | 1456 | asmlinkage long sys_getsid(pid_t pid) |
1460 | { | 1457 | { |
1461 | if (!pid) | 1458 | if (!pid) |
1462 | return current->signal->session; | 1459 | return process_session(current); |
1463 | else { | 1460 | else { |
1464 | int retval; | 1461 | int retval; |
1465 | struct task_struct *p; | 1462 | struct task_struct *p; |
@@ -1471,7 +1468,7 @@ asmlinkage long sys_getsid(pid_t pid) | |||
1471 | if (p) { | 1468 | if (p) { |
1472 | retval = security_task_getsid(p); | 1469 | retval = security_task_getsid(p); |
1473 | if (!retval) | 1470 | if (!retval) |
1474 | retval = p->signal->session; | 1471 | retval = process_session(p); |
1475 | } | 1472 | } |
1476 | read_unlock(&tasklist_lock); | 1473 | read_unlock(&tasklist_lock); |
1477 | return retval; | 1474 | return retval; |
@@ -1484,7 +1481,6 @@ asmlinkage long sys_setsid(void) | |||
1484 | pid_t session; | 1481 | pid_t session; |
1485 | int err = -EPERM; | 1482 | int err = -EPERM; |
1486 | 1483 | ||
1487 | mutex_lock(&tty_mutex); | ||
1488 | write_lock_irq(&tasklist_lock); | 1484 | write_lock_irq(&tasklist_lock); |
1489 | 1485 | ||
1490 | /* Fail if I am already a session leader */ | 1486 | /* Fail if I am already a session leader */ |
@@ -1504,12 +1500,15 @@ asmlinkage long sys_setsid(void) | |||
1504 | 1500 | ||
1505 | group_leader->signal->leader = 1; | 1501 | group_leader->signal->leader = 1; |
1506 | __set_special_pids(session, session); | 1502 | __set_special_pids(session, session); |
1503 | |||
1504 | spin_lock(&group_leader->sighand->siglock); | ||
1507 | group_leader->signal->tty = NULL; | 1505 | group_leader->signal->tty = NULL; |
1508 | group_leader->signal->tty_old_pgrp = 0; | 1506 | group_leader->signal->tty_old_pgrp = 0; |
1507 | spin_unlock(&group_leader->sighand->siglock); | ||
1508 | |||
1509 | err = process_group(group_leader); | 1509 | err = process_group(group_leader); |
1510 | out: | 1510 | out: |
1511 | write_unlock_irq(&tasklist_lock); | 1511 | write_unlock_irq(&tasklist_lock); |
1512 | mutex_unlock(&tty_mutex); | ||
1513 | return err; | 1512 | return err; |
1514 | } | 1513 | } |
1515 | 1514 | ||
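The sys_setpgid() rewrite leans on the invariant that every member of a process group belongs to the same session, so a single find_task_by_pid_type(PIDTYPE_PGID, ...) lookup can stand in for the old do_each_task_pid() scan: if the group exists at all, any one member suffices for the session check. Restating that hunk with the reasoning spelled out as comments:

if (pgid != pid) {
        /* any task in the target process group shares its session,
         * so one lookup replaces the old loop over all members */
        struct task_struct *g = find_task_by_pid_type(PIDTYPE_PGID, pgid);

        if (!g || process_session(g) != process_session(group_leader))
                goto out;               /* no such group, or wrong session */
}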
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8e9f00fd6d..130c5ec9ee 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -92,7 +92,9 @@ extern char modprobe_path[]; | |||
92 | extern int sg_big_buff; | 92 | extern int sg_big_buff; |
93 | #endif | 93 | #endif |
94 | #ifdef CONFIG_SYSVIPC | 94 | #ifdef CONFIG_SYSVIPC |
95 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 95 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
96 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
97 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, struct file *filp, | ||
96 | void __user *buffer, size_t *lenp, loff_t *ppos); | 98 | void __user *buffer, size_t *lenp, loff_t *ppos); |
97 | #endif | 99 | #endif |
98 | 100 | ||
@@ -131,12 +133,22 @@ extern int max_lock_depth; | |||
131 | 133 | ||
132 | #ifdef CONFIG_SYSCTL_SYSCALL | 134 | #ifdef CONFIG_SYSCTL_SYSCALL |
133 | static int parse_table(int __user *, int, void __user *, size_t __user *, | 135 | static int parse_table(int __user *, int, void __user *, size_t __user *, |
134 | void __user *, size_t, ctl_table *, void **); | 136 | void __user *, size_t, ctl_table *); |
135 | #endif | 137 | #endif |
136 | 138 | ||
137 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 139 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
138 | void __user *buffer, size_t *lenp, loff_t *ppos); | 140 | void __user *buffer, size_t *lenp, loff_t *ppos); |
139 | 141 | ||
142 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
143 | void __user *oldval, size_t __user *oldlenp, | ||
144 | void __user *newval, size_t newlen); | ||
145 | |||
146 | #ifdef CONFIG_SYSVIPC | ||
147 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
148 | void __user *oldval, size_t __user *oldlenp, | ||
149 | void __user *newval, size_t newlen); | ||
150 | #endif | ||
151 | |||
140 | #ifdef CONFIG_PROC_SYSCTL | 152 | #ifdef CONFIG_PROC_SYSCTL |
141 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 153 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
142 | void __user *buffer, size_t *lenp, loff_t *ppos); | 154 | void __user *buffer, size_t *lenp, loff_t *ppos); |
@@ -163,6 +175,40 @@ extern ctl_table inotify_table[]; | |||
163 | int sysctl_legacy_va_layout; | 175 | int sysctl_legacy_va_layout; |
164 | #endif | 176 | #endif |
165 | 177 | ||
178 | static void *get_uts(ctl_table *table, int write) | ||
179 | { | ||
180 | char *which = table->data; | ||
181 | #ifdef CONFIG_UTS_NS | ||
182 | struct uts_namespace *uts_ns = current->nsproxy->uts_ns; | ||
183 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
184 | #endif | ||
185 | if (!write) | ||
186 | down_read(&uts_sem); | ||
187 | else | ||
188 | down_write(&uts_sem); | ||
189 | return which; | ||
190 | } | ||
191 | |||
192 | static void put_uts(ctl_table *table, int write, void *which) | ||
193 | { | ||
194 | if (!write) | ||
195 | up_read(&uts_sem); | ||
196 | else | ||
197 | up_write(&uts_sem); | ||
198 | } | ||
199 | |||
200 | #ifdef CONFIG_SYSVIPC | ||
201 | static void *get_ipc(ctl_table *table, int write) | ||
202 | { | ||
203 | char *which = table->data; | ||
204 | struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; | ||
205 | which = (which - (char *)&init_ipc_ns) + (char *)ipc_ns; | ||
206 | return which; | ||
207 | } | ||
208 | #else | ||
209 | #define get_ipc(T,W) ((T)->data) | ||
210 | #endif | ||
211 | |||
166 | /* /proc declarations: */ | 212 | /* /proc declarations: */ |
167 | 213 | ||
168 | #ifdef CONFIG_PROC_SYSCTL | 214 | #ifdef CONFIG_PROC_SYSCTL |
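get_uts() and get_ipc() above rely on the sysctl tables keeping their .data pointers aimed at fields of the template objects init_uts_ns / init_ipc_ns; the helpers then relocate that pointer into the calling task's own namespace by plain offset arithmetic. A worked restatement of the relocation for the UTS case:

/* Worked example of the pointer relocation done by get_uts()/get_ipc(): */
char *template_field = table->data;                      /* e.g. &init_uts_ns.name.nodename */
size_t offset = template_field - (char *)&init_uts_ns;   /* byte offset within the struct    */
char *live_field = (char *)current->nsproxy->uts_ns + offset; /* same field, our namespace   */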
@@ -229,7 +275,6 @@ static ctl_table root_table[] = { | |||
229 | }; | 275 | }; |
230 | 276 | ||
231 | static ctl_table kern_table[] = { | 277 | static ctl_table kern_table[] = { |
232 | #ifndef CONFIG_UTS_NS | ||
233 | { | 278 | { |
234 | .ctl_name = KERN_OSTYPE, | 279 | .ctl_name = KERN_OSTYPE, |
235 | .procname = "ostype", | 280 | .procname = "ostype", |
@@ -237,7 +282,7 @@ static ctl_table kern_table[] = { | |||
237 | .maxlen = sizeof(init_uts_ns.name.sysname), | 282 | .maxlen = sizeof(init_uts_ns.name.sysname), |
238 | .mode = 0444, | 283 | .mode = 0444, |
239 | .proc_handler = &proc_do_uts_string, | 284 | .proc_handler = &proc_do_uts_string, |
240 | .strategy = &sysctl_string, | 285 | .strategy = &sysctl_uts_string, |
241 | }, | 286 | }, |
242 | { | 287 | { |
243 | .ctl_name = KERN_OSRELEASE, | 288 | .ctl_name = KERN_OSRELEASE, |
@@ -246,7 +291,7 @@ static ctl_table kern_table[] = { | |||
246 | .maxlen = sizeof(init_uts_ns.name.release), | 291 | .maxlen = sizeof(init_uts_ns.name.release), |
247 | .mode = 0444, | 292 | .mode = 0444, |
248 | .proc_handler = &proc_do_uts_string, | 293 | .proc_handler = &proc_do_uts_string, |
249 | .strategy = &sysctl_string, | 294 | .strategy = &sysctl_uts_string, |
250 | }, | 295 | }, |
251 | { | 296 | { |
252 | .ctl_name = KERN_VERSION, | 297 | .ctl_name = KERN_VERSION, |
@@ -255,7 +300,7 @@ static ctl_table kern_table[] = { | |||
255 | .maxlen = sizeof(init_uts_ns.name.version), | 300 | .maxlen = sizeof(init_uts_ns.name.version), |
256 | .mode = 0444, | 301 | .mode = 0444, |
257 | .proc_handler = &proc_do_uts_string, | 302 | .proc_handler = &proc_do_uts_string, |
258 | .strategy = &sysctl_string, | 303 | .strategy = &sysctl_uts_string, |
259 | }, | 304 | }, |
260 | { | 305 | { |
261 | .ctl_name = KERN_NODENAME, | 306 | .ctl_name = KERN_NODENAME, |
@@ -264,7 +309,7 @@ static ctl_table kern_table[] = { | |||
264 | .maxlen = sizeof(init_uts_ns.name.nodename), | 309 | .maxlen = sizeof(init_uts_ns.name.nodename), |
265 | .mode = 0644, | 310 | .mode = 0644, |
266 | .proc_handler = &proc_do_uts_string, | 311 | .proc_handler = &proc_do_uts_string, |
267 | .strategy = &sysctl_string, | 312 | .strategy = &sysctl_uts_string, |
268 | }, | 313 | }, |
269 | { | 314 | { |
270 | .ctl_name = KERN_DOMAINNAME, | 315 | .ctl_name = KERN_DOMAINNAME, |
@@ -273,56 +318,8 @@ static ctl_table kern_table[] = { | |||
273 | .maxlen = sizeof(init_uts_ns.name.domainname), | 318 | .maxlen = sizeof(init_uts_ns.name.domainname), |
274 | .mode = 0644, | 319 | .mode = 0644, |
275 | .proc_handler = &proc_do_uts_string, | 320 | .proc_handler = &proc_do_uts_string, |
276 | .strategy = &sysctl_string, | 321 | .strategy = &sysctl_uts_string, |
277 | }, | ||
278 | #else /* !CONFIG_UTS_NS */ | ||
279 | { | ||
280 | .ctl_name = KERN_OSTYPE, | ||
281 | .procname = "ostype", | ||
282 | .data = NULL, | ||
283 | /* could maybe use __NEW_UTS_LEN here? */ | ||
284 | .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), | ||
285 | .mode = 0444, | ||
286 | .proc_handler = &proc_do_uts_string, | ||
287 | .strategy = &sysctl_string, | ||
288 | }, | ||
289 | { | ||
290 | .ctl_name = KERN_OSRELEASE, | ||
291 | .procname = "osrelease", | ||
292 | .data = NULL, | ||
293 | .maxlen = FIELD_SIZEOF(struct new_utsname, release), | ||
294 | .mode = 0444, | ||
295 | .proc_handler = &proc_do_uts_string, | ||
296 | .strategy = &sysctl_string, | ||
297 | }, | ||
298 | { | ||
299 | .ctl_name = KERN_VERSION, | ||
300 | .procname = "version", | ||
301 | .data = NULL, | ||
302 | .maxlen = FIELD_SIZEOF(struct new_utsname, version), | ||
303 | .mode = 0444, | ||
304 | .proc_handler = &proc_do_uts_string, | ||
305 | .strategy = &sysctl_string, | ||
306 | }, | ||
307 | { | ||
308 | .ctl_name = KERN_NODENAME, | ||
309 | .procname = "hostname", | ||
310 | .data = NULL, | ||
311 | .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), | ||
312 | .mode = 0644, | ||
313 | .proc_handler = &proc_do_uts_string, | ||
314 | .strategy = &sysctl_string, | ||
315 | }, | ||
316 | { | ||
317 | .ctl_name = KERN_DOMAINNAME, | ||
318 | .procname = "domainname", | ||
319 | .data = NULL, | ||
320 | .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), | ||
321 | .mode = 0644, | ||
322 | .proc_handler = &proc_do_uts_string, | ||
323 | .strategy = &sysctl_string, | ||
324 | }, | 322 | }, |
325 | #endif /* !CONFIG_UTS_NS */ | ||
326 | { | 323 | { |
327 | .ctl_name = KERN_PANIC, | 324 | .ctl_name = KERN_PANIC, |
328 | .procname = "panic", | 325 | .procname = "panic", |
@@ -481,58 +478,65 @@ static ctl_table kern_table[] = { | |||
481 | { | 478 | { |
482 | .ctl_name = KERN_SHMMAX, | 479 | .ctl_name = KERN_SHMMAX, |
483 | .procname = "shmmax", | 480 | .procname = "shmmax", |
484 | .data = NULL, | 481 | .data = &init_ipc_ns.shm_ctlmax, |
485 | .maxlen = sizeof (size_t), | 482 | .maxlen = sizeof (init_ipc_ns.shm_ctlmax), |
486 | .mode = 0644, | 483 | .mode = 0644, |
487 | .proc_handler = &proc_do_ipc_string, | 484 | .proc_handler = &proc_ipc_doulongvec_minmax, |
485 | .strategy = sysctl_ipc_data, | ||
488 | }, | 486 | }, |
489 | { | 487 | { |
490 | .ctl_name = KERN_SHMALL, | 488 | .ctl_name = KERN_SHMALL, |
491 | .procname = "shmall", | 489 | .procname = "shmall", |
492 | .data = NULL, | 490 | .data = &init_ipc_ns.shm_ctlall, |
493 | .maxlen = sizeof (size_t), | 491 | .maxlen = sizeof (init_ipc_ns.shm_ctlall), |
494 | .mode = 0644, | 492 | .mode = 0644, |
495 | .proc_handler = &proc_do_ipc_string, | 493 | .proc_handler = &proc_ipc_doulongvec_minmax, |
494 | .strategy = sysctl_ipc_data, | ||
496 | }, | 495 | }, |
497 | { | 496 | { |
498 | .ctl_name = KERN_SHMMNI, | 497 | .ctl_name = KERN_SHMMNI, |
499 | .procname = "shmmni", | 498 | .procname = "shmmni", |
500 | .data = NULL, | 499 | .data = &init_ipc_ns.shm_ctlmni, |
501 | .maxlen = sizeof (int), | 500 | .maxlen = sizeof (init_ipc_ns.shm_ctlmni), |
502 | .mode = 0644, | 501 | .mode = 0644, |
503 | .proc_handler = &proc_do_ipc_string, | 502 | .proc_handler = &proc_ipc_dointvec, |
503 | .strategy = sysctl_ipc_data, | ||
504 | }, | 504 | }, |
505 | { | 505 | { |
506 | .ctl_name = KERN_MSGMAX, | 506 | .ctl_name = KERN_MSGMAX, |
507 | .procname = "msgmax", | 507 | .procname = "msgmax", |
508 | .data = NULL, | 508 | .data = &init_ipc_ns.msg_ctlmax, |
509 | .maxlen = sizeof (int), | 509 | .maxlen = sizeof (init_ipc_ns.msg_ctlmax), |
510 | .mode = 0644, | 510 | .mode = 0644, |
511 | .proc_handler = &proc_do_ipc_string, | 511 | .proc_handler = &proc_ipc_dointvec, |
512 | .strategy = sysctl_ipc_data, | ||
512 | }, | 513 | }, |
513 | { | 514 | { |
514 | .ctl_name = KERN_MSGMNI, | 515 | .ctl_name = KERN_MSGMNI, |
515 | .procname = "msgmni", | 516 | .procname = "msgmni", |
516 | .data = NULL, | 517 | .data = &init_ipc_ns.msg_ctlmni, |
517 | .maxlen = sizeof (int), | 518 | .maxlen = sizeof (init_ipc_ns.msg_ctlmni), |
518 | .mode = 0644, | 519 | .mode = 0644, |
519 | .proc_handler = &proc_do_ipc_string, | 520 | .proc_handler = &proc_ipc_dointvec, |
521 | .strategy = sysctl_ipc_data, | ||
520 | }, | 522 | }, |
521 | { | 523 | { |
522 | .ctl_name = KERN_MSGMNB, | 524 | .ctl_name = KERN_MSGMNB, |
523 | .procname = "msgmnb", | 525 | .procname = "msgmnb", |
524 | .data = NULL, | 526 | .data = &init_ipc_ns.msg_ctlmnb, |
525 | .maxlen = sizeof (int), | 527 | .maxlen = sizeof (init_ipc_ns.msg_ctlmnb), |
526 | .mode = 0644, | 528 | .mode = 0644, |
527 | .proc_handler = &proc_do_ipc_string, | 529 | .proc_handler = &proc_ipc_dointvec, |
530 | .strategy = sysctl_ipc_data, | ||
528 | }, | 531 | }, |
529 | { | 532 | { |
530 | .ctl_name = KERN_SEM, | 533 | .ctl_name = KERN_SEM, |
531 | .procname = "sem", | 534 | .procname = "sem", |
532 | .data = NULL, | 535 | .data = &init_ipc_ns.sem_ctls, |
533 | .maxlen = 4*sizeof (int), | 536 | .maxlen = 4*sizeof (int), |
534 | .mode = 0644, | 537 | .mode = 0644, |
535 | .proc_handler = &proc_do_ipc_string, | 538 | .proc_handler = &proc_ipc_dointvec, |
539 | .strategy = sysctl_ipc_data, | ||
536 | }, | 540 | }, |
537 | #endif | 541 | #endif |
538 | #ifdef CONFIG_MAGIC_SYSRQ | 542 | #ifdef CONFIG_MAGIC_SYSRQ |
@@ -1239,7 +1243,6 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1239 | do { | 1243 | do { |
1240 | struct ctl_table_header *head = | 1244 | struct ctl_table_header *head = |
1241 | list_entry(tmp, struct ctl_table_header, ctl_entry); | 1245 | list_entry(tmp, struct ctl_table_header, ctl_entry); |
1242 | void *context = NULL; | ||
1243 | 1246 | ||
1244 | if (!use_table(head)) | 1247 | if (!use_table(head)) |
1245 | continue; | 1248 | continue; |
@@ -1247,9 +1250,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol | |||
1247 | spin_unlock(&sysctl_lock); | 1250 | spin_unlock(&sysctl_lock); |
1248 | 1251 | ||
1249 | error = parse_table(name, nlen, oldval, oldlenp, | 1252 | error = parse_table(name, nlen, oldval, oldlenp, |
1250 | newval, newlen, head->ctl_table, | 1253 | newval, newlen, head->ctl_table); |
1251 | &context); | ||
1252 | kfree(context); | ||
1253 | 1254 | ||
1254 | spin_lock(&sysctl_lock); | 1255 | spin_lock(&sysctl_lock); |
1255 | unuse_table(head); | 1256 | unuse_table(head); |
@@ -1305,7 +1306,7 @@ static inline int ctl_perm(ctl_table *table, int op) | |||
1305 | static int parse_table(int __user *name, int nlen, | 1306 | static int parse_table(int __user *name, int nlen, |
1306 | void __user *oldval, size_t __user *oldlenp, | 1307 | void __user *oldval, size_t __user *oldlenp, |
1307 | void __user *newval, size_t newlen, | 1308 | void __user *newval, size_t newlen, |
1308 | ctl_table *table, void **context) | 1309 | ctl_table *table) |
1309 | { | 1310 | { |
1310 | int n; | 1311 | int n; |
1311 | repeat: | 1312 | repeat: |
@@ -1325,7 +1326,7 @@ repeat: | |||
1325 | error = table->strategy( | 1326 | error = table->strategy( |
1326 | table, name, nlen, | 1327 | table, name, nlen, |
1327 | oldval, oldlenp, | 1328 | oldval, oldlenp, |
1328 | newval, newlen, context); | 1329 | newval, newlen); |
1329 | if (error) | 1330 | if (error) |
1330 | return error; | 1331 | return error; |
1331 | } | 1332 | } |
@@ -1336,7 +1337,7 @@ repeat: | |||
1336 | } | 1337 | } |
1337 | error = do_sysctl_strategy(table, name, nlen, | 1338 | error = do_sysctl_strategy(table, name, nlen, |
1338 | oldval, oldlenp, | 1339 | oldval, oldlenp, |
1339 | newval, newlen, context); | 1340 | newval, newlen); |
1340 | return error; | 1341 | return error; |
1341 | } | 1342 | } |
1342 | } | 1343 | } |
@@ -1347,7 +1348,7 @@ repeat: | |||
1347 | int do_sysctl_strategy (ctl_table *table, | 1348 | int do_sysctl_strategy (ctl_table *table, |
1348 | int __user *name, int nlen, | 1349 | int __user *name, int nlen, |
1349 | void __user *oldval, size_t __user *oldlenp, | 1350 | void __user *oldval, size_t __user *oldlenp, |
1350 | void __user *newval, size_t newlen, void **context) | 1351 | void __user *newval, size_t newlen) |
1351 | { | 1352 | { |
1352 | int op = 0, rc; | 1353 | int op = 0, rc; |
1353 | size_t len; | 1354 | size_t len; |
@@ -1361,7 +1362,7 @@ int do_sysctl_strategy (ctl_table *table, | |||
1361 | 1362 | ||
1362 | if (table->strategy) { | 1363 | if (table->strategy) { |
1363 | rc = table->strategy(table, name, nlen, oldval, oldlenp, | 1364 | rc = table->strategy(table, name, nlen, oldval, oldlenp, |
1364 | newval, newlen, context); | 1365 | newval, newlen); |
1365 | if (rc < 0) | 1366 | if (rc < 0) |
1366 | return rc; | 1367 | return rc; |
1367 | if (rc > 0) | 1368 | if (rc > 0) |
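With the void **context argument gone from every strategy routine and from parse_table()/do_sysctl_strategy(), the strategy hook in ctl_table presumably shrinks to the prototype below (reconstructed from the new call sites, not copied from sysctl.h):

typedef int ctl_handler(ctl_table *table, int __user *name, int nlen,
                        void __user *oldval, size_t __user *oldlenp,
                        void __user *newval, size_t newlen);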
@@ -1614,7 +1615,7 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf, | |||
1614 | size_t count, loff_t *ppos) | 1615 | size_t count, loff_t *ppos) |
1615 | { | 1616 | { |
1616 | int op; | 1617 | int op; |
1617 | struct proc_dir_entry *de = PDE(file->f_dentry->d_inode); | 1618 | struct proc_dir_entry *de = PDE(file->f_path.dentry->d_inode); |
1618 | struct ctl_table *table; | 1619 | struct ctl_table *table; |
1619 | size_t res; | 1620 | size_t res; |
1620 | ssize_t error = -ENOTDIR; | 1621 | ssize_t error = -ENOTDIR; |
@@ -1753,66 +1754,17 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, | |||
1753 | * Special case of dostring for the UTS structure. This has locks | 1754 | * Special case of dostring for the UTS structure. This has locks |
1754 | * to observe. Should this be in kernel/sys.c ???? | 1755 | * to observe. Should this be in kernel/sys.c ???? |
1755 | */ | 1756 | */ |
1756 | |||
1757 | #ifndef CONFIG_UTS_NS | ||
1758 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | ||
1759 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1760 | { | ||
1761 | int r; | ||
1762 | 1757 | ||
1763 | if (!write) { | ||
1764 | down_read(&uts_sem); | ||
1765 | r=proc_dostring(table,0,filp,buffer,lenp, ppos); | ||
1766 | up_read(&uts_sem); | ||
1767 | } else { | ||
1768 | down_write(&uts_sem); | ||
1769 | r=proc_dostring(table,1,filp,buffer,lenp, ppos); | ||
1770 | up_write(&uts_sem); | ||
1771 | } | ||
1772 | return r; | ||
1773 | } | ||
1774 | #else /* !CONFIG_UTS_NS */ | ||
1775 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, | 1758 | static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, |
1776 | void __user *buffer, size_t *lenp, loff_t *ppos) | 1759 | void __user *buffer, size_t *lenp, loff_t *ppos) |
1777 | { | 1760 | { |
1778 | int r; | 1761 | int r; |
1779 | struct uts_namespace* uts_ns = current->nsproxy->uts_ns; | 1762 | void *which; |
1780 | char* which; | 1763 | which = get_uts(table, write); |
1781 | 1764 | r = _proc_do_string(which, table->maxlen,write,filp,buffer,lenp, ppos); | |
1782 | switch (table->ctl_name) { | 1765 | put_uts(table, write, which); |
1783 | case KERN_OSTYPE: | ||
1784 | which = uts_ns->name.sysname; | ||
1785 | break; | ||
1786 | case KERN_NODENAME: | ||
1787 | which = uts_ns->name.nodename; | ||
1788 | break; | ||
1789 | case KERN_OSRELEASE: | ||
1790 | which = uts_ns->name.release; | ||
1791 | break; | ||
1792 | case KERN_VERSION: | ||
1793 | which = uts_ns->name.version; | ||
1794 | break; | ||
1795 | case KERN_DOMAINNAME: | ||
1796 | which = uts_ns->name.domainname; | ||
1797 | break; | ||
1798 | default: | ||
1799 | r = -EINVAL; | ||
1800 | goto out; | ||
1801 | } | ||
1802 | |||
1803 | if (!write) { | ||
1804 | down_read(&uts_sem); | ||
1805 | r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); | ||
1806 | up_read(&uts_sem); | ||
1807 | } else { | ||
1808 | down_write(&uts_sem); | ||
1809 | r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); | ||
1810 | up_write(&uts_sem); | ||
1811 | } | ||
1812 | out: | ||
1813 | return r; | 1766 | return r; |
1814 | } | 1767 | } |
1815 | #endif /* !CONFIG_UTS_NS */ | ||
1816 | 1768 | ||
1817 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, | 1769 | static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, |
1818 | int *valp, | 1770 | int *valp, |
@@ -1976,9 +1928,6 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, | |||
1976 | 1928 | ||
1977 | #define OP_SET 0 | 1929 | #define OP_SET 0 |
1978 | #define OP_AND 1 | 1930 | #define OP_AND 1 |
1979 | #define OP_OR 2 | ||
1980 | #define OP_MAX 3 | ||
1981 | #define OP_MIN 4 | ||
1982 | 1931 | ||
1983 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | 1932 | static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, |
1984 | int *valp, | 1933 | int *valp, |
@@ -1990,13 +1939,6 @@ static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, | |||
1990 | switch(op) { | 1939 | switch(op) { |
1991 | case OP_SET: *valp = val; break; | 1940 | case OP_SET: *valp = val; break; |
1992 | case OP_AND: *valp &= val; break; | 1941 | case OP_AND: *valp &= val; break; |
1993 | case OP_OR: *valp |= val; break; | ||
1994 | case OP_MAX: if(*valp < val) | ||
1995 | *valp = val; | ||
1996 | break; | ||
1997 | case OP_MIN: if(*valp > val) | ||
1998 | *valp = val; | ||
1999 | break; | ||
2000 | } | 1942 | } |
2001 | } else { | 1943 | } else { |
2002 | int val = *valp; | 1944 | int val = *valp; |
@@ -2391,46 +2333,24 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, | |||
2391 | } | 2333 | } |
2392 | 2334 | ||
2393 | #ifdef CONFIG_SYSVIPC | 2335 | #ifdef CONFIG_SYSVIPC |
2394 | static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | 2336 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, |
2395 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2337 | void __user *buffer, size_t *lenp, loff_t *ppos) |
2396 | { | 2338 | { |
2397 | void *data; | 2339 | void *which; |
2398 | struct ipc_namespace *ns; | 2340 | which = get_ipc(table, write); |
2399 | 2341 | return __do_proc_dointvec(which, table, write, filp, buffer, | |
2400 | ns = current->nsproxy->ipc_ns; | ||
2401 | |||
2402 | switch (table->ctl_name) { | ||
2403 | case KERN_SHMMAX: | ||
2404 | data = &ns->shm_ctlmax; | ||
2405 | goto proc_minmax; | ||
2406 | case KERN_SHMALL: | ||
2407 | data = &ns->shm_ctlall; | ||
2408 | goto proc_minmax; | ||
2409 | case KERN_SHMMNI: | ||
2410 | data = &ns->shm_ctlmni; | ||
2411 | break; | ||
2412 | case KERN_MSGMAX: | ||
2413 | data = &ns->msg_ctlmax; | ||
2414 | break; | ||
2415 | case KERN_MSGMNI: | ||
2416 | data = &ns->msg_ctlmni; | ||
2417 | break; | ||
2418 | case KERN_MSGMNB: | ||
2419 | data = &ns->msg_ctlmnb; | ||
2420 | break; | ||
2421 | case KERN_SEM: | ||
2422 | data = &ns->sem_ctls; | ||
2423 | break; | ||
2424 | default: | ||
2425 | return -EINVAL; | ||
2426 | } | ||
2427 | |||
2428 | return __do_proc_dointvec(data, table, write, filp, buffer, | ||
2429 | lenp, ppos, NULL, NULL); | 2342 | lenp, ppos, NULL, NULL); |
2430 | proc_minmax: | 2343 | } |
2431 | return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, | 2344 | |
2345 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2346 | struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2347 | { | ||
2348 | void *which; | ||
2349 | which = get_ipc(table, write); | ||
2350 | return __do_proc_doulongvec_minmax(which, table, write, filp, buffer, | ||
2432 | lenp, ppos, 1l, 1l); | 2351 | lenp, ppos, 1l, 1l); |
2433 | } | 2352 | } |
2353 | |||
2434 | #endif | 2354 | #endif |
2435 | 2355 | ||
2436 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, | 2356 | static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, |
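Each IPC entry in kern_table now pairs a namespace-aware /proc handler with the sysctl_ipc_data strategy, both of which resolve the real target through get_ipc(). Pulled together, a single entry follows this shape (an illustrative restatement of the msgmax row earlier in the table, not new code from the patch):

{
        .ctl_name       = KERN_MSGMAX,
        .procname       = "msgmax",
        .data           = &init_ipc_ns.msg_ctlmax,   /* template; relocated by get_ipc() */
        .maxlen         = sizeof(init_ipc_ns.msg_ctlmax),
        .mode           = 0644,
        .proc_handler   = &proc_ipc_dointvec,        /* /proc/sys read/write path */
        .strategy       = sysctl_ipc_data,           /* binary sysctl(2) path     */
},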
@@ -2475,6 +2395,17 @@ static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, | |||
2475 | { | 2395 | { |
2476 | return -ENOSYS; | 2396 | return -ENOSYS; |
2477 | } | 2397 | } |
2398 | static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, | ||
2399 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
2400 | { | ||
2401 | return -ENOSYS; | ||
2402 | } | ||
2403 | static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, | ||
2404 | struct file *filp, void __user *buffer, | ||
2405 | size_t *lenp, loff_t *ppos) | ||
2406 | { | ||
2407 | return -ENOSYS; | ||
2408 | } | ||
2478 | #endif | 2409 | #endif |
2479 | 2410 | ||
2480 | int proc_dointvec(ctl_table *table, int write, struct file *filp, | 2411 | int proc_dointvec(ctl_table *table, int write, struct file *filp, |
@@ -2539,7 +2470,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write, | |||
2539 | /* The generic string strategy routine: */ | 2470 | /* The generic string strategy routine: */ |
2540 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2471 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
2541 | void __user *oldval, size_t __user *oldlenp, | 2472 | void __user *oldval, size_t __user *oldlenp, |
2542 | void __user *newval, size_t newlen, void **context) | 2473 | void __user *newval, size_t newlen) |
2543 | { | 2474 | { |
2544 | if (!table->data || !table->maxlen) | 2475 | if (!table->data || !table->maxlen) |
2545 | return -ENOTDIR; | 2476 | return -ENOTDIR; |
@@ -2585,7 +2516,7 @@ int sysctl_string(ctl_table *table, int __user *name, int nlen, | |||
2585 | */ | 2516 | */ |
2586 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2517 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
2587 | void __user *oldval, size_t __user *oldlenp, | 2518 | void __user *oldval, size_t __user *oldlenp, |
2588 | void __user *newval, size_t newlen, void **context) | 2519 | void __user *newval, size_t newlen) |
2589 | { | 2520 | { |
2590 | 2521 | ||
2591 | if (newval && newlen) { | 2522 | if (newval && newlen) { |
@@ -2621,7 +2552,7 @@ int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | |||
2621 | /* Strategy function to convert jiffies to seconds */ | 2552 | /* Strategy function to convert jiffies to seconds */ |
2622 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2553 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
2623 | void __user *oldval, size_t __user *oldlenp, | 2554 | void __user *oldval, size_t __user *oldlenp, |
2624 | void __user *newval, size_t newlen, void **context) | 2555 | void __user *newval, size_t newlen) |
2625 | { | 2556 | { |
2626 | if (oldval) { | 2557 | if (oldval) { |
2627 | size_t olen; | 2558 | size_t olen; |
@@ -2649,7 +2580,7 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2649 | /* Strategy function to convert jiffies to seconds */ | 2580 | /* Strategy function to convert jiffies to seconds */ |
2650 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2581 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
2651 | void __user *oldval, size_t __user *oldlenp, | 2582 | void __user *oldval, size_t __user *oldlenp, |
2652 | void __user *newval, size_t newlen, void **context) | 2583 | void __user *newval, size_t newlen) |
2653 | { | 2584 | { |
2654 | if (oldval) { | 2585 | if (oldval) { |
2655 | size_t olen; | 2586 | size_t olen; |
@@ -2674,6 +2605,64 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | |||
2674 | return 1; | 2605 | return 1; |
2675 | } | 2606 | } |
2676 | 2607 | ||
2608 | |||
2609 | /* The generic string strategy routine: */ | ||
2610 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2611 | void __user *oldval, size_t __user *oldlenp, | ||
2612 | void __user *newval, size_t newlen) | ||
2613 | { | ||
2614 | struct ctl_table uts_table; | ||
2615 | int r, write; | ||
2616 | write = newval && newlen; | ||
2617 | memcpy(&uts_table, table, sizeof(uts_table)); | ||
2618 | uts_table.data = get_uts(table, write); | ||
2619 | r = sysctl_string(&uts_table, name, nlen, | ||
2620 | oldval, oldlenp, newval, newlen); | ||
2621 | put_uts(table, write, uts_table.data); | ||
2622 | return r; | ||
2623 | } | ||
2624 | |||
2625 | #ifdef CONFIG_SYSVIPC | ||
2626 | /* The generic sysctl ipc data routine. */ | ||
2627 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2628 | void __user *oldval, size_t __user *oldlenp, | ||
2629 | void __user *newval, size_t newlen) | ||
2630 | { | ||
2631 | size_t len; | ||
2632 | void *data; | ||
2633 | |||
2634 | /* Get out of I don't have a variable */ | ||
2635 | if (!table->data || !table->maxlen) | ||
2636 | return -ENOTDIR; | ||
2637 | |||
2638 | data = get_ipc(table, 1); | ||
2639 | if (!data) | ||
2640 | return -ENOTDIR; | ||
2641 | |||
2642 | if (oldval && oldlenp) { | ||
2643 | if (get_user(len, oldlenp)) | ||
2644 | return -EFAULT; | ||
2645 | if (len) { | ||
2646 | if (len > table->maxlen) | ||
2647 | len = table->maxlen; | ||
2648 | if (copy_to_user(oldval, data, len)) | ||
2649 | return -EFAULT; | ||
2650 | if (put_user(len, oldlenp)) | ||
2651 | return -EFAULT; | ||
2652 | } | ||
2653 | } | ||
2654 | |||
2655 | if (newval && newlen) { | ||
2656 | if (newlen > table->maxlen) | ||
2657 | newlen = table->maxlen; | ||
2658 | |||
2659 | if (copy_from_user(data, newval, newlen)) | ||
2660 | return -EFAULT; | ||
2661 | } | ||
2662 | return 1; | ||
2663 | } | ||
2664 | #endif | ||
2665 | |||
2677 | #else /* CONFIG_SYSCTL_SYSCALL */ | 2666 | #else /* CONFIG_SYSCTL_SYSCALL */ |
2678 | 2667 | ||
2679 | 2668 | ||
@@ -2712,32 +2701,44 @@ out: | |||
2712 | 2701 | ||
2713 | int sysctl_string(ctl_table *table, int __user *name, int nlen, | 2702 | int sysctl_string(ctl_table *table, int __user *name, int nlen, |
2714 | void __user *oldval, size_t __user *oldlenp, | 2703 | void __user *oldval, size_t __user *oldlenp, |
2715 | void __user *newval, size_t newlen, void **context) | 2704 | void __user *newval, size_t newlen) |
2716 | { | 2705 | { |
2717 | return -ENOSYS; | 2706 | return -ENOSYS; |
2718 | } | 2707 | } |
2719 | 2708 | ||
2720 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, | 2709 | int sysctl_intvec(ctl_table *table, int __user *name, int nlen, |
2721 | void __user *oldval, size_t __user *oldlenp, | 2710 | void __user *oldval, size_t __user *oldlenp, |
2722 | void __user *newval, size_t newlen, void **context) | 2711 | void __user *newval, size_t newlen) |
2723 | { | 2712 | { |
2724 | return -ENOSYS; | 2713 | return -ENOSYS; |
2725 | } | 2714 | } |
2726 | 2715 | ||
2727 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, | 2716 | int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, |
2728 | void __user *oldval, size_t __user *oldlenp, | 2717 | void __user *oldval, size_t __user *oldlenp, |
2729 | void __user *newval, size_t newlen, void **context) | 2718 | void __user *newval, size_t newlen) |
2730 | { | 2719 | { |
2731 | return -ENOSYS; | 2720 | return -ENOSYS; |
2732 | } | 2721 | } |
2733 | 2722 | ||
2734 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, | 2723 | int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, |
2735 | void __user *oldval, size_t __user *oldlenp, | 2724 | void __user *oldval, size_t __user *oldlenp, |
2736 | void __user *newval, size_t newlen, void **context) | 2725 | void __user *newval, size_t newlen) |
2737 | { | 2726 | { |
2738 | return -ENOSYS; | 2727 | return -ENOSYS; |
2739 | } | 2728 | } |
2740 | 2729 | ||
2730 | static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, | ||
2731 | void __user *oldval, size_t __user *oldlenp, | ||
2732 | void __user *newval, size_t newlen) | ||
2733 | { | ||
2734 | return -ENOSYS; | ||
2735 | } | ||
2736 | static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, | ||
2737 | void __user *oldval, size_t __user *oldlenp, | ||
2738 | void __user *newval, size_t newlen) | ||
2739 | { | ||
2740 | return -ENOSYS; | ||
2741 | } | ||
2741 | #endif /* CONFIG_SYSCTL_SYSCALL */ | 2742 | #endif /* CONFIG_SYSCTL_SYSCALL */ |
2742 | 2743 | ||
2743 | /* | 2744 | /* |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 74eca5939b..22504afc0d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -156,7 +156,7 @@ int clocksource_register(struct clocksource *c) | |||
156 | /* check if clocksource is already registered */ | 156 | /* check if clocksource is already registered */ |
157 | if (is_registered_source(c)) { | 157 | if (is_registered_source(c)) { |
158 | printk("register_clocksource: Cannot register %s. " | 158 | printk("register_clocksource: Cannot register %s. " |
159 | "Already registered!", c->name); | 159 | "Already registered!", c->name); |
160 | ret = -EBUSY; | 160 | ret = -EBUSY; |
161 | } else { | 161 | } else { |
162 | /* register it */ | 162 | /* register it */ |
@@ -186,6 +186,7 @@ void clocksource_reselect(void) | |||
186 | } | 186 | } |
187 | EXPORT_SYMBOL(clocksource_reselect); | 187 | EXPORT_SYMBOL(clocksource_reselect); |
188 | 188 | ||
189 | #ifdef CONFIG_SYSFS | ||
189 | /** | 190 | /** |
190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 191 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
191 | * @dev: unused | 192 | * @dev: unused |
@@ -275,10 +276,10 @@ sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | |||
275 | * Sysfs setup bits: | 276 | * Sysfs setup bits: |
276 | */ | 277 | */ |
277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | 278 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, |
278 | sysfs_override_clocksource); | 279 | sysfs_override_clocksource); |
279 | 280 | ||
280 | static SYSDEV_ATTR(available_clocksource, 0600, | 281 | static SYSDEV_ATTR(available_clocksource, 0600, |
281 | sysfs_show_available_clocksources, NULL); | 282 | sysfs_show_available_clocksources, NULL); |
282 | 283 | ||
283 | static struct sysdev_class clocksource_sysclass = { | 284 | static struct sysdev_class clocksource_sysclass = { |
284 | set_kset_name("clocksource"), | 285 | set_kset_name("clocksource"), |
@@ -307,6 +308,7 @@ static int __init init_clocksource_sysfs(void) | |||
307 | } | 308 | } |
308 | 309 | ||
309 | device_initcall(init_clocksource_sysfs); | 310 | device_initcall(init_clocksource_sysfs); |
311 | #endif /* CONFIG_SYSFS */ | ||
310 | 312 | ||
311 | /** | 313 | /** |
312 | * boot_override_clocksource - boot clock override | 314 | * boot_override_clocksource - boot clock override |
diff --git a/kernel/timer.c b/kernel/timer.c index c1c7fbcffe..0256ab443d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -80,6 +80,138 @@ tvec_base_t boot_tvec_bases; | |||
80 | EXPORT_SYMBOL(boot_tvec_bases); | 80 | EXPORT_SYMBOL(boot_tvec_bases); |
81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 81 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; |
82 | 82 | ||
83 | /** | ||
84 | * __round_jiffies - function to round jiffies to a full second | ||
85 | * @j: the time in (absolute) jiffies that should be rounded | ||
86 | * @cpu: the processor number on which the timeout will happen | ||
87 | * | ||
88 | * __round_jiffies rounds an absolute time in the future (in jiffies) | ||
89 | * up or down to (approximately) full seconds. This is useful for timers | ||
90 | * for which the exact time they fire does not matter too much, as long as | ||
91 | * they fire approximately every X seconds. | ||
92 | * | ||
93 | * By rounding these timers to whole seconds, all such timers will fire | ||
94 | * at the same time, rather than at various times spread out. The goal | ||
95 | * of this is to have the CPU wake up less, which saves power. | ||
96 | * | ||
97 | * The exact rounding is skewed for each processor to avoid all | ||
98 | * processors firing at the exact same time, which could lead | ||
99 | * to lock contention or spurious cache line bouncing. | ||
100 | * | ||
101 | * The return value is the rounded version of the "j" parameter. | ||
102 | */ | ||
103 | unsigned long __round_jiffies(unsigned long j, int cpu) | ||
104 | { | ||
105 | int rem; | ||
106 | unsigned long original = j; | ||
107 | |||
108 | /* | ||
109 | * We don't want all cpus firing their timers at once hitting the | ||
110 | * same lock or cachelines, so we skew each extra cpu with an extra | ||
111 | * 3 jiffies. This 3 jiffies came originally from the mm/ code which | ||
112 | * already did this. | ||
113 | * The skew is done by adding 3*cpunr, then round, then subtract this | ||
114 | * extra offset again. | ||
115 | */ | ||
116 | j += cpu * 3; | ||
117 | |||
118 | rem = j % HZ; | ||
119 | |||
120 | /* | ||
121 | * If the target jiffie is just after a whole second (which can happen | ||
122 | * due to delays of the timer irq, long irq off times etc etc) then | ||
123 | * we should round down to the whole second, not up. Use 1/4th second | ||
124 | * as cutoff for this rounding as an extreme upper bound for this. | ||
125 | */ | ||
126 | if (rem < HZ/4) /* round down */ | ||
127 | j = j - rem; | ||
128 | else /* round up */ | ||
129 | j = j - rem + HZ; | ||
130 | |||
131 | /* now that we have rounded, subtract the extra skew again */ | ||
132 | j -= cpu * 3; | ||
133 | |||
134 | if (j <= jiffies) /* rounding ate our timeout entirely; */ | ||
135 | return original; | ||
136 | return j; | ||
137 | } | ||
138 | EXPORT_SYMBOL_GPL(__round_jiffies); | ||
139 | |||
140 | /** | ||
141 | * __round_jiffies_relative - function to round jiffies to a full second | ||
142 | * @j: the time in (relative) jiffies that should be rounded | ||
143 | * @cpu: the processor number on which the timeout will happen | ||
144 | * | ||
145 | * __round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
146 | * up or down to (approximately) full seconds. This is useful for timers | ||
147 | * for which the exact time they fire does not matter too much, as long as | ||
148 | * they fire approximately every X seconds. | ||
149 | * | ||
150 | * By rounding these timers to whole seconds, all such timers will fire | ||
151 | * at the same time, rather than at various times spread out. The goal | ||
152 | * of this is to have the CPU wake up less, which saves power. | ||
153 | * | ||
154 | * The exact rounding is skewed for each processor to avoid all | ||
155 | * processors firing at the exact same time, which could lead | ||
156 | * to lock contention or spurious cache line bouncing. | ||
157 | * | ||
158 | * The return value is the rounded version of the "j" parameter. | ||
159 | */ | ||
160 | unsigned long __round_jiffies_relative(unsigned long j, int cpu) | ||
161 | { | ||
162 | /* | ||
163 | * In theory the following code can skip a jiffy in case jiffies | ||
164 | * increments right between the addition and the later subtraction. | ||
165 | * However since the entire point of this function is to use approximate | ||
166 | * timeouts, it's entirely ok to not handle that. | ||
167 | */ | ||
168 | return __round_jiffies(j + jiffies, cpu) - jiffies; | ||
169 | } | ||
170 | EXPORT_SYMBOL_GPL(__round_jiffies_relative); | ||
171 | |||
172 | /** | ||
173 | * round_jiffies - function to round jiffies to a full second | ||
174 | * @j: the time in (absolute) jiffies that should be rounded | ||
175 | * | ||
176 | * round_jiffies rounds an absolute time in the future (in jiffies) | ||
177 | * up or down to (approximately) full seconds. This is useful for timers | ||
178 | * for which the exact time they fire does not matter too much, as long as | ||
179 | * they fire approximately every X seconds. | ||
180 | * | ||
181 | * By rounding these timers to whole seconds, all such timers will fire | ||
182 | * at the same time, rather than at various times spread out. The goal | ||
183 | * of this is to have the CPU wake up less, which saves power. | ||
184 | * | ||
185 | * The return value is the rounded version of the "j" parameter. | ||
186 | */ | ||
187 | unsigned long round_jiffies(unsigned long j) | ||
188 | { | ||
189 | return __round_jiffies(j, raw_smp_processor_id()); | ||
190 | } | ||
191 | EXPORT_SYMBOL_GPL(round_jiffies); | ||
192 | |||
193 | /** | ||
194 | * round_jiffies_relative - function to round jiffies to a full second | ||
195 | * @j: the time in (relative) jiffies that should be rounded | ||
196 | * | ||
197 | * round_jiffies_relative rounds a time delta in the future (in jiffies) | ||
198 | * up or down to (approximately) full seconds. This is useful for timers | ||
199 | * for which the exact time they fire does not matter too much, as long as | ||
200 | * they fire approximately every X seconds. | ||
201 | * | ||
202 | * By rounding these timers to whole seconds, all such timers will fire | ||
203 | * at the same time, rather than at various times spread out. The goal | ||
204 | * of this is to have the CPU wake up less, which saves power. | ||
205 | * | ||
206 | * The return value is the rounded version of the "j" parameter. | ||
207 | */ | ||
208 | unsigned long round_jiffies_relative(unsigned long j) | ||
209 | { | ||
210 | return __round_jiffies_relative(j, raw_smp_processor_id()); | ||
211 | } | ||
212 | EXPORT_SYMBOL_GPL(round_jiffies_relative); | ||
213 | |||
214 | |||
83 | static inline void set_running_timer(tvec_base_t *base, | 215 | static inline void set_running_timer(tvec_base_t *base, |
84 | struct timer_list *timer) | 216 | struct timer_list *timer) |
85 | { | 217 | { |
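A worked pass through __round_jiffies(), assuming HZ == 1000 (any HZ works; only the HZ/4 cutoff scales with it):

/* j = 5030, cpu = 1, jiffies well below the result:
 *   skew:        5030 + 3*1   = 5033
 *   rem:         5033 % 1000  = 33     -> below HZ/4, so round down
 *   round down:  5033 - 33    = 5000
 *   unskew:      5000 - 3*1   = 4997
 * The same timer requested on cpu 2 lands on 4994, so per-cpu timers
 * cluster near whole seconds without all firing on the same tick.
 */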
@@ -714,7 +846,7 @@ static int change_clocksource(void) | |||
714 | clock = new; | 846 | clock = new; |
715 | clock->cycle_last = now; | 847 | clock->cycle_last = now; |
716 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | 848 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", |
717 | clock->name); | 849 | clock->name); |
718 | return 1; | 850 | return 1; |
719 | } else if (clock->update_callback) { | 851 | } else if (clock->update_callback) { |
720 | return clock->update_callback(); | 852 | return clock->update_callback(); |
@@ -722,7 +854,10 @@ static int change_clocksource(void) | |||
722 | return 0; | 854 | return 0; |
723 | } | 855 | } |
724 | #else | 856 | #else |
725 | #define change_clocksource() (0) | 857 | static inline int change_clocksource(void) |
858 | { | ||
859 | return 0; | ||
860 | } | ||
726 | #endif | 861 | #endif |
727 | 862 | ||
728 | /** | 863 | /** |
@@ -820,7 +955,8 @@ device_initcall(timekeeping_init_device); | |||
820 | * If the error is already larger, we look ahead even further | 955 | * If the error is already larger, we look ahead even further |
821 | * to compensate for late or lost adjustments. | 956 | * to compensate for late or lost adjustments. |
822 | */ | 957 | */ |
823 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset) | 958 | static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, |
959 | s64 *offset) | ||
824 | { | 960 | { |
825 | s64 tick_error, i; | 961 | s64 tick_error, i; |
826 | u32 look_ahead, adj; | 962 | u32 look_ahead, adj; |
@@ -844,7 +980,8 @@ static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 * | |||
844 | * Now calculate the error in (1 << look_ahead) ticks, but first | 980 | * Now calculate the error in (1 << look_ahead) ticks, but first |
845 | * remove the single look ahead already included in the error. | 981 | * remove the single look ahead already included in the error. |
846 | */ | 982 | */ |
847 | tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | 983 | tick_error = current_tick_length() >> |
984 | (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
848 | tick_error -= clock->xtime_interval >> 1; | 985 | tick_error -= clock->xtime_interval >> 1; |
849 | error = ((error - tick_error) >> look_ahead) + tick_error; | 986 | error = ((error - tick_error) >> look_ahead) + tick_error; |
850 | 987 | ||
@@ -896,7 +1033,8 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset) | |||
896 | clock->mult += adj; | 1033 | clock->mult += adj; |
897 | clock->xtime_interval += interval; | 1034 | clock->xtime_interval += interval; |
898 | clock->xtime_nsec -= offset; | 1035 | clock->xtime_nsec -= offset; |
899 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | 1036 | clock->error -= (interval - offset) << |
1037 | (TICK_LENGTH_SHIFT - clock->shift); | ||
900 | } | 1038 | } |
901 | 1039 | ||
902 | /** | 1040 | /** |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 96f77013d3..baacc36914 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
@@ -96,6 +96,15 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) | |||
96 | stats->write_char = p->wchar; | 96 | stats->write_char = p->wchar; |
97 | stats->read_syscalls = p->syscr; | 97 | stats->read_syscalls = p->syscr; |
98 | stats->write_syscalls = p->syscw; | 98 | stats->write_syscalls = p->syscw; |
99 | #ifdef CONFIG_TASK_IO_ACCOUNTING | ||
100 | stats->read_bytes = p->ioac.read_bytes; | ||
101 | stats->write_bytes = p->ioac.write_bytes; | ||
102 | stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; | ||
103 | #else | ||
104 | stats->read_bytes = 0; | ||
105 | stats->write_bytes = 0; | ||
106 | stats->cancelled_write_bytes = 0; | ||
107 | #endif | ||
99 | } | 108 | } |
100 | #undef KB | 109 | #undef KB |
101 | #undef MB | 110 | #undef MB |
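The tsacct hunk copies the per-task block-I/O counters into taskstats when CONFIG_TASK_IO_ACCOUNTING is enabled. The source fields live in p->ioac; roughly the accounting structure they come from (a sketch, exact layout not verified here):

struct task_io_accounting {
        u64 read_bytes;                 /* bytes this task caused to be read from storage */
        u64 write_bytes;                /* bytes this task caused to be written           */
        u64 cancelled_write_bytes;      /* dirty bytes truncated before reaching storage  */
};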
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6b186750e9..db49886bfa 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -85,22 +85,19 @@ static inline int is_single_threaded(struct workqueue_struct *wq) | |||
85 | return list_empty(&wq->list); | 85 | return list_empty(&wq->list); |
86 | } | 86 | } |
87 | 87 | ||
88 | /* | ||
89 | * Set the workqueue on which a work item is to be run | ||
90 | * - Must *only* be called if the pending flag is set | ||
91 | */ | ||
88 | static inline void set_wq_data(struct work_struct *work, void *wq) | 92 | static inline void set_wq_data(struct work_struct *work, void *wq) |
89 | { | 93 | { |
90 | unsigned long new, old, res; | 94 | unsigned long new; |
95 | |||
96 | BUG_ON(!work_pending(work)); | ||
91 | 97 | ||
92 | /* assume the pending flag is already set and that the task has already | ||
93 | * been queued on this workqueue */ | ||
94 | new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); | 98 | new = (unsigned long) wq | (1UL << WORK_STRUCT_PENDING); |
95 | res = work->management; | 99 | new |= work->management & WORK_STRUCT_FLAG_MASK; |
96 | if (res != new) { | 100 | work->management = new; |
97 | do { | ||
98 | old = res; | ||
99 | new = (unsigned long) wq; | ||
100 | new |= (old & WORK_STRUCT_FLAG_MASK); | ||
101 | res = cmpxchg(&work->management, old, new); | ||
102 | } while (res != old); | ||
103 | } | ||
104 | } | 101 | } |
105 | 102 | ||
106 | static inline void *get_wq_data(struct work_struct *work) | 103 | static inline void *get_wq_data(struct work_struct *work) |
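Since callers of set_wq_data() are now required to have the pending bit set, the cmpxchg retry loop collapses to a plain store that rebuilds the management word: workqueue pointer in the upper bits, existing flag bits preserved in the low bits. The context line above begins the matching accessor, get_wq_data(), whose body is cut off by the end of this hunk; it presumably just masks the flag bits back off, roughly:

/* Sketch of the accessor body; WORK_STRUCT_WQ_DATA_MASK is assumed to be
 * the complement of WORK_STRUCT_FLAG_MASK. */
static inline void *get_wq_data(struct work_struct *work)
{
        return (void *)(work->management & WORK_STRUCT_WQ_DATA_MASK);
}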