author	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-17 16:15:55 -0500
committer	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-17 16:15:55 -0500
commit	8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree	a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /fs/proc
parent	406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'fs/proc')
-rw-r--r--	fs/proc/Kconfig	12
-rw-r--r--	fs/proc/Makefile	3
-rw-r--r--	fs/proc/array.c	305
-rw-r--r--	fs/proc/base.c	1571
-rw-r--r--	fs/proc/fd.c	369
-rw-r--r--	fs/proc/fd.h	14
-rw-r--r--	fs/proc/generic.c	75
-rw-r--r--	fs/proc/inode.c	47
-rw-r--r--	fs/proc/internal.h	75
-rw-r--r--	fs/proc/kcore.c	10
-rw-r--r--	fs/proc/namespaces.c	198
-rw-r--r--	fs/proc/page.c	8
-rw-r--r--	fs/proc/proc_devtree.c	11
-rw-r--r--	fs/proc/proc_net.c	4
-rw-r--r--	fs/proc/proc_sysctl.c	1341
-rw-r--r--	fs/proc/root.c	102
-rw-r--r--	fs/proc/self.c	59
-rw-r--r--	fs/proc/stat.c	178
-rw-r--r--	fs/proc/task_mmu.c	545
-rw-r--r--	fs/proc/task_nommu.c	71
-rw-r--r--	fs/proc/uptime.c	11
-rw-r--r--	fs/proc/vmcore.c	24
22 files changed, 1451 insertions, 3582 deletions
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 15af6222f8a..ddb83a0e15e 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -67,3 +67,15 @@ config PROC_PAGE_MONITOR
 	  /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
 	  /proc/kpagecount, and /proc/kpageflags. Disabling these
 	  interfaces will reduce the size of the kernel by approximately 4kb.
+
+config REPORT_PRESENT_CPUS
+	default n
+	depends on PROC_FS && SMP
+	bool "Report present cpus instead of online cpus"
+	help
+	  This is a work around to report Present CPUs instead of Online CPUs.
+	  Some power savings implements use CPU hotplug for power domains.
+	  It is a bug to enable this on a server or other architecture that
+	  uses cpu hotplug in the correct way.
+
+
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 981b0560193..c1c72933592 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,7 +8,7 @@ proc-y := nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		proc_tty.o fd.o
+		proc_tty.o
 proc-y	+= cmdline.o
 proc-y	+= consoles.o
 proc-y	+= cpuinfo.o
@@ -21,7 +21,6 @@ proc-y += uptime.o
 proc-y	+= version.o
 proc-y	+= softirqs.o
 proc-y	+= namespaces.o
-proc-y	+= self.o
 proc-$(CONFIG_PROC_SYSCTL)	+= proc_sysctl.o
 proc-$(CONFIG_NET)	+= proc_net.o
 proc-$(CONFIG_PROC_KCORE)	+= kcore.o
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 6a91e6ffbcb..3a1dafd228d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,7 +81,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
-#include <linux/user_namespace.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -162,7 +161,6 @@ static inline const char *get_task_state(struct task_struct *tsk)
 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *p)
 {
-	struct user_namespace *user_ns = seq_user_ns(m);
 	struct group_info *group_info;
 	int g;
 	struct fdtable *fdt = NULL;
@@ -191,14 +189,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 		task_tgid_nr_ns(p, ns),
 		pid_nr_ns(pid, ns),
 		ppid, tpid,
-		from_kuid_munged(user_ns, cred->uid),
-		from_kuid_munged(user_ns, cred->euid),
-		from_kuid_munged(user_ns, cred->suid),
-		from_kuid_munged(user_ns, cred->fsuid),
-		from_kgid_munged(user_ns, cred->gid),
-		from_kgid_munged(user_ns, cred->egid),
-		from_kgid_munged(user_ns, cred->sgid),
-		from_kgid_munged(user_ns, cred->fsgid));
+		cred->uid, cred->euid, cred->suid, cred->fsuid,
+		cred->gid, cred->egid, cred->sgid, cred->fsgid);
 
 	task_lock(p);
 	if (p->files)
@@ -212,15 +204,14 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	group_info = cred->group_info;
 	task_unlock(p);
 
-	for (g = 0; g < group_info->ngroups; g++)
-		seq_printf(m, "%d ",
-			from_kgid_munged(user_ns, GROUP_AT(group_info, g)));
+	for (g = 0; g < min(group_info->ngroups, NGROUPS_SMALL); g++)
+		seq_printf(m, "%d ", GROUP_AT(group_info, g));
 	put_cred(cred);
 
 	seq_putc(m, '\n');
 }
 
-void render_sigset_t(struct seq_file *m, const char *header,
+static void render_sigset_t(struct seq_file *m, const char *header,
 				sigset_t *set)
 {
 	int i;
@@ -308,10 +299,6 @@ static void render_cap_t(struct seq_file *m, const char *header,
 	seq_putc(m, '\n');
 }
 
-/* Remove non-existent capabilities */
-#define NORM_CAPS(v) (v.cap[CAP_TO_INDEX(CAP_LAST_CAP)] &= \
-				CAP_TO_MASK(CAP_LAST_CAP + 1) - 1)
-
 static inline void task_cap(struct seq_file *m, struct task_struct *p)
 {
 	const struct cred *cred;
@@ -325,24 +312,12 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
 	cap_bset	= cred->cap_bset;
 	rcu_read_unlock();
 
-	NORM_CAPS(cap_inheritable);
-	NORM_CAPS(cap_permitted);
-	NORM_CAPS(cap_effective);
-	NORM_CAPS(cap_bset);
-
 	render_cap_t(m, "CapInh:\t", &cap_inheritable);
 	render_cap_t(m, "CapPrm:\t", &cap_permitted);
 	render_cap_t(m, "CapEff:\t", &cap_effective);
 	render_cap_t(m, "CapBnd:\t", &cap_bset);
 }
 
-static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
-{
-#ifdef CONFIG_SECCOMP
-	seq_printf(m, "Seccomp:\t%d\n", p->seccomp.mode);
-#endif
-}
-
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -376,7 +351,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	}
 	task_sig(m, task);
 	task_cap(m, task);
-	task_seccomp(m, task);
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
@@ -387,7 +361,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task, int whole)
 {
 	unsigned long vsize, eip, esp, wchan = ~0UL;
-	int priority, nice;
+	long priority, nice;
 	int tty_pgrp = -1, tty_nr = 0;
 	sigset_t sigign, sigcatch;
 	char state;
@@ -406,7 +380,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
-	permitted = ptrace_may_access(task, PTRACE_MODE_READ | PTRACE_MODE_NOAUDIT);
+	permitted = ptrace_may_access(task, PTRACE_MODE_READ);
 	mm = get_task_mm(task);
 	if (mm) {
 		vsize = task_vsize(mm);
@@ -420,8 +394,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 
 	sigemptyset(&sigign);
 	sigemptyset(&sigcatch);
-	cutime = cstime = utime = stime = 0;
-	cgtime = gtime = 0;
+	cutime = cstime = utime = stime = cputime_zero;
+	cgtime = gtime = cputime_zero;
 
 	if (lock_task_sighand(task, &flags)) {
 		struct signal_struct *sig = task->signal;
@@ -449,14 +423,14 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		do {
 			min_flt += t->min_flt;
 			maj_flt += t->maj_flt;
-			gtime += t->gtime;
+			gtime = cputime_add(gtime, t->gtime);
 			t = next_thread(t);
 		} while (t != task);
 
 		min_flt += sig->min_flt;
 		maj_flt += sig->maj_flt;
-		thread_group_cputime_adjusted(task, &utime, &stime);
-		gtime += sig->gtime;
+		thread_group_times(task, &utime, &stime);
+		gtime = cputime_add(gtime, sig->gtime);
 	}
 
 	sid = task_session_nr_ns(task, ns);
@@ -471,7 +445,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	if (!whole) {
 		min_flt = task->min_flt;
 		maj_flt = task->maj_flt;
-		task_cputime_adjusted(task, &utime, &stime);
+		task_times(task, &utime, &stime);
 		gtime = task->gtime;
 	}
 
@@ -488,70 +462,56 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
-	seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
-	seq_put_decimal_ll(m, ' ', ppid);
-	seq_put_decimal_ll(m, ' ', pgid);
-	seq_put_decimal_ll(m, ' ', sid);
-	seq_put_decimal_ll(m, ' ', tty_nr);
-	seq_put_decimal_ll(m, ' ', tty_pgrp);
-	seq_put_decimal_ull(m, ' ', task->flags);
-	seq_put_decimal_ull(m, ' ', min_flt);
-	seq_put_decimal_ull(m, ' ', cmin_flt);
-	seq_put_decimal_ull(m, ' ', maj_flt);
-	seq_put_decimal_ull(m, ' ', cmaj_flt);
-	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(utime));
-	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(stime));
-	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cutime));
-	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cstime));
-	seq_put_decimal_ll(m, ' ', priority);
-	seq_put_decimal_ll(m, ' ', nice);
-	seq_put_decimal_ll(m, ' ', num_threads);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ull(m, ' ', start_time);
-	seq_put_decimal_ull(m, ' ', vsize);
-	seq_put_decimal_ull(m, ' ', mm ? get_mm_rss(mm) : 0);
-	seq_put_decimal_ull(m, ' ', rsslim);
-	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->start_code : 1) : 0);
-	seq_put_decimal_ull(m, ' ', mm ? (permitted ? mm->end_code : 1) : 0);
-	seq_put_decimal_ull(m, ' ', (permitted && mm) ? mm->start_stack : 0);
-	seq_put_decimal_ull(m, ' ', esp);
-	seq_put_decimal_ull(m, ' ', eip);
-	/* The signal information here is obsolete.
-	 * It must be decimal for Linux 2.0 compatibility.
-	 * Use /proc/#/status for real-time signals.
-	 */
-	seq_put_decimal_ull(m, ' ', task->pending.signal.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
-	seq_put_decimal_ull(m, ' ', wchan);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ll(m, ' ', task->exit_signal);
-	seq_put_decimal_ll(m, ' ', task_cpu(task));
-	seq_put_decimal_ull(m, ' ', task->rt_priority);
-	seq_put_decimal_ull(m, ' ', task->policy);
-	seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
-	seq_put_decimal_ull(m, ' ', cputime_to_clock_t(gtime));
-	seq_put_decimal_ll(m, ' ', cputime_to_clock_t(cgtime));
-
-	if (mm && permitted) {
-		seq_put_decimal_ull(m, ' ', mm->start_data);
-		seq_put_decimal_ull(m, ' ', mm->end_data);
-		seq_put_decimal_ull(m, ' ', mm->start_brk);
-		seq_put_decimal_ull(m, ' ', mm->arg_start);
-		seq_put_decimal_ull(m, ' ', mm->arg_end);
-		seq_put_decimal_ull(m, ' ', mm->env_start);
-		seq_put_decimal_ull(m, ' ', mm->env_end);
-	} else
-		seq_printf(m, " 0 0 0 0 0 0 0");
-
-	if (permitted)
-		seq_put_decimal_ll(m, ' ', task->exit_code);
-	else
-		seq_put_decimal_ll(m, ' ', 0);
-
-	seq_putc(m, '\n');
+	seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
+%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
+		pid_nr_ns(pid, ns),
+		tcomm,
+		state,
+		ppid,
+		pgid,
+		sid,
+		tty_nr,
+		tty_pgrp,
+		task->flags,
+		min_flt,
+		cmin_flt,
+		maj_flt,
+		cmaj_flt,
+		cputime_to_clock_t(utime),
+		cputime_to_clock_t(stime),
+		cputime_to_clock_t(cutime),
+		cputime_to_clock_t(cstime),
+		priority,
+		nice,
+		num_threads,
+		start_time,
+		vsize,
+		mm ? get_mm_rss(mm) : 0,
+		rsslim,
+		mm ? (permitted ? mm->start_code : 1) : 0,
+		mm ? (permitted ? mm->end_code : 1) : 0,
+		(permitted && mm) ? mm->start_stack : 0,
+		esp,
+		eip,
+		/* The signal information here is obsolete.
+		 * It must be decimal for Linux 2.0 compatibility.
+		 * Use /proc/#/status for real-time signals.
+		 */
+		task->pending.signal.sig[0] & 0x7fffffffUL,
+		task->blocked.sig[0] & 0x7fffffffUL,
+		sigign      .sig[0] & 0x7fffffffUL,
+		sigcatch    .sig[0] & 0x7fffffffUL,
+		wchan,
+		0UL,
+		0UL,
+		task->exit_signal,
+		task_cpu(task),
+		task->rt_priority,
+		task->policy,
+		(unsigned long long)delayacct_blkio_ticks(task),
+		cputime_to_clock_t(gtime),
+		cputime_to_clock_t(cgtime));
 	if (mm)
 		mmput(mm);
 	return 0;
@@ -579,143 +539,8 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
 	size = task_statm(mm, &shared, &text, &data, &resident);
 		mmput(mm);
 	}
-	/*
-	 * For quick read, open code by putting numbers directly
-	 * expected format is
-	 * seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
-	 *               size, resident, shared, text, data);
-	 */
-	seq_put_decimal_ull(m, 0, size);
-	seq_put_decimal_ull(m, ' ', resident);
-	seq_put_decimal_ull(m, ' ', shared);
-	seq_put_decimal_ull(m, ' ', text);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_put_decimal_ull(m, ' ', data);
-	seq_put_decimal_ull(m, ' ', 0);
-	seq_putc(m, '\n');
-
-	return 0;
-}
-
-#ifdef CONFIG_CHECKPOINT_RESTORE
-static struct pid *
-get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
-{
-	struct task_struct *start, *task;
-	struct pid *pid = NULL;
-
-	read_lock(&tasklist_lock);
-
-	start = pid_task(proc_pid(inode), PIDTYPE_PID);
-	if (!start)
-		goto out;
-
-	/*
-	 * Lets try to continue searching first, this gives
-	 * us significant speedup on children-rich processes.
-	 */
-	if (pid_prev) {
-		task = pid_task(pid_prev, PIDTYPE_PID);
-		if (task && task->real_parent == start &&
-		    !(list_empty(&task->sibling))) {
-			if (list_is_last(&task->sibling, &start->children))
-				goto out;
-			task = list_first_entry(&task->sibling,
-						struct task_struct, sibling);
-			pid = get_pid(task_pid(task));
-			goto out;
-		}
-	}
-
-	/*
-	 * Slow search case.
-	 *
-	 * We might miss some children here if children
-	 * are exited while we were not holding the lock,
-	 * but it was never promised to be accurate that
-	 * much.
-	 *
-	 * "Just suppose that the parent sleeps, but N children
-	 *  exit after we printed their tids. Now the slow paths
-	 *  skips N extra children, we miss N tasks." (c)
-	 *
-	 * So one need to stop or freeze the leader and all
-	 * its children to get a precise result.
-	 */
-	list_for_each_entry(task, &start->children, sibling) {
-		if (pos-- == 0) {
-			pid = get_pid(task_pid(task));
-			break;
-		}
-	}
-
-out:
-	read_unlock(&tasklist_lock);
-	return pid;
-}
-
-static int children_seq_show(struct seq_file *seq, void *v)
-{
-	struct inode *inode = seq->private;
-	pid_t pid;
+	seq_printf(m, "%lu %lu %lu %lu 0 %lu 0\n",
+			size, resident, shared, text, data);
 
-	pid = pid_nr_ns(v, inode->i_sb->s_fs_info);
-	return seq_printf(seq, "%d ", pid);
-}
-
-static void *children_seq_start(struct seq_file *seq, loff_t *pos)
-{
-	return get_children_pid(seq->private, NULL, *pos);
-}
-
-static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
-{
-	struct pid *pid;
-
-	pid = get_children_pid(seq->private, v, *pos + 1);
-	put_pid(v);
-
-	++*pos;
-	return pid;
-}
-
-static void children_seq_stop(struct seq_file *seq, void *v)
-{
-	put_pid(v);
-}
-
-static const struct seq_operations children_seq_ops = {
-	.start = children_seq_start,
-	.next  = children_seq_next,
-	.stop  = children_seq_stop,
-	.show  = children_seq_show,
-};
-
-static int children_seq_open(struct inode *inode, struct file *file)
-{
-	struct seq_file *m;
-	int ret;
-
-	ret = seq_open(file, &children_seq_ops);
-	if (ret)
-		return ret;
-
-	m = file->private_data;
-	m->private = inode;
-
-	return ret;
-}
-
-int children_seq_release(struct inode *inode, struct file *file)
-{
-	seq_release(inode, file);
 	return 0;
 }
-
-const struct file_operations proc_tid_children_operations = {
-	.open    = children_seq_open,
-	.read    = seq_read,
-	.llseek  = seq_lseek,
-	.release = children_seq_release,
-};
-#endif /* CONFIG_CHECKPOINT_RESTORE */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9b43ff77a51..fc03d161a1d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -81,16 +81,12 @@
 #include <linux/oom.h>
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
-#include <linux/user_namespace.h>
 #include <linux/fs_struct.h>
 #include <linux/slab.h>
-#include <linux/flex_array.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
-#include <trace/events/oom.h>
 #include "internal.h"
-#include "fd.h"
 
 /* NOTE:
  * Implementing inode permission operations in /proc is almost
@@ -105,7 +101,7 @@
 struct pid_entry {
 	char *name;
 	int len;
-	umode_t mode;
+	mode_t mode;
 	const struct inode_operations *iop;
 	const struct file_operations *fop;
 	union proc_op op;
@@ -137,6 +133,12 @@ struct pid_entry {
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
 
+/* ANDROID is for special files in /proc. */
+#define ANDROID(NAME, MODE, OTYPE)			\
+	NOD(NAME, (S_IFREG|(MODE)),			\
+		&proc_##OTYPE##_inode_operations,	\
+		&proc_##OTYPE##_operations, {})
+
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -169,9 +171,9 @@ static int get_task_root(struct task_struct *task, struct path *root)
 	return result;
 }
 
-static int proc_cwd_link(struct dentry *dentry, struct path *path)
+static int proc_cwd_link(struct inode *inode, struct path *path)
 {
-	struct task_struct *task = get_proc_task(dentry->d_inode);
+	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -186,9 +188,9 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
-static int proc_root_link(struct dentry *dentry, struct path *path)
+static int proc_root_link(struct inode *inode, struct path *path)
 {
-	struct task_struct *task = get_proc_task(dentry->d_inode);
+	struct task_struct *task = get_proc_task(inode);
 	int result = -ENOENT;
 
 	if (task) {
@@ -198,6 +200,85 @@ static int proc_root_link(struct dentry *dentry, struct path *path)
 	return result;
 }
 
+static struct mm_struct *__check_mem_permission(struct task_struct *task)
+{
+	struct mm_struct *mm;
+
+	mm = get_task_mm(task);
+	if (!mm)
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * A task can always look at itself, in case it chooses
+	 * to use system calls instead of load instructions.
+	 */
+	if (task == current)
+		return mm;
+
+	/*
+	 * If current is actively ptrace'ing, and would also be
+	 * permitted to freshly attach with ptrace now, permit it.
+	 */
+	if (task_is_stopped_or_traced(task)) {
+		int match;
+		rcu_read_lock();
+		match = (ptrace_parent(task) == current);
+		rcu_read_unlock();
+		if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
+			return mm;
+	}
+
+	/*
+	 * No one else is allowed.
+	 */
+	mmput(mm);
+	return ERR_PTR(-EPERM);
+}
+
+/*
+ * If current may access user memory in @task return a reference to the
+ * corresponding mm, otherwise ERR_PTR.
+ */
+static struct mm_struct *check_mem_permission(struct task_struct *task)
+{
+	struct mm_struct *mm;
+	int err;
+
+	/*
+	 * Avoid racing if task exec's as we might get a new mm but validate
+	 * against old credentials.
+	 */
+	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (err)
+		return ERR_PTR(err);
+
+	mm = __check_mem_permission(task);
+	mutex_unlock(&task->signal->cred_guard_mutex);
+
+	return mm;
+}
+
+struct mm_struct *mm_for_maps(struct task_struct *task)
+{
+	struct mm_struct *mm;
+	int err;
+
+	err = mutex_lock_killable(&task->signal->cred_guard_mutex);
+	if (err)
+		return ERR_PTR(err);
+
+	mm = get_task_mm(task);
+	if (mm && mm != current->mm &&
+			!ptrace_may_access(task, PTRACE_MODE_READ) &&
+			!capable(CAP_SYS_RESOURCE)) {
+		mmput(mm);
+		mm = ERR_PTR(-EACCES);
+	}
+	mutex_unlock(&task->signal->cred_guard_mutex);
+
+	return mm;
+}
+
 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 {
 	int res = 0;
@@ -237,7 +318,7 @@ out:
 
 static int proc_pid_auxv(struct task_struct *task, char *buffer)
 {
-	struct mm_struct *mm = mm_access(task, PTRACE_MODE_READ);
+	struct mm_struct *mm = mm_for_maps(task);
 	int res = PTR_ERR(mm);
 	if (mm && !IS_ERR(mm)) {
 		unsigned int nwords = 0;
@@ -405,13 +486,12 @@ static const struct file_operations proc_lstats_operations = {
 
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
-	unsigned long totalpages = totalram_pages + total_swap_pages;
 	unsigned long points = 0;
 
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL, totalpages) *
-			1000 / totalpages;
+		points = oom_badness(task, NULL, NULL,
+				totalram_pages + total_swap_pages);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -542,59 +622,134 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr)
542 if (error) 622 if (error)
543 return error; 623 return error;
544 624
625 if ((attr->ia_valid & ATTR_SIZE) &&
626 attr->ia_size != i_size_read(inode)) {
627 error = vmtruncate(inode, attr->ia_size);
628 if (error)
629 return error;
630 }
631
545 setattr_copy(inode, attr); 632 setattr_copy(inode, attr);
546 mark_inode_dirty(inode); 633 mark_inode_dirty(inode);
547 return 0; 634 return 0;
548} 635}
549 636
550/* 637static const struct inode_operations proc_def_inode_operations = {
551 * May current process learn task's sched/cmdline info (for hide_pid_min=1) 638 .setattr = proc_setattr,
552 * or euid/egid (for hide_pid_min=2)? 639};
553 */ 640
554static bool has_pid_permissions(struct pid_namespace *pid, 641static int mounts_open_common(struct inode *inode, struct file *file,
555 struct task_struct *task, 642 const struct seq_operations *op)
556 int hide_pid_min)
557{ 643{
558 if (pid->hide_pid < hide_pid_min) 644 struct task_struct *task = get_proc_task(inode);
559 return true; 645 struct nsproxy *nsp;
560 if (in_group_p(pid->pid_gid)) 646 struct mnt_namespace *ns = NULL;
561 return true; 647 struct path root;
562 return ptrace_may_access(task, PTRACE_MODE_READ); 648 struct proc_mounts *p;
649 int ret = -EINVAL;
650
651 if (task) {
652 rcu_read_lock();
653 nsp = task_nsproxy(task);
654 if (nsp) {
655 ns = nsp->mnt_ns;
656 if (ns)
657 get_mnt_ns(ns);
658 }
659 rcu_read_unlock();
660 if (ns && get_task_root(task, &root) == 0)
661 ret = 0;
662 put_task_struct(task);
663 }
664
665 if (!ns)
666 goto err;
667 if (ret)
668 goto err_put_ns;
669
670 ret = -ENOMEM;
671 p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
672 if (!p)
673 goto err_put_path;
674
675 file->private_data = &p->m;
676 ret = seq_open(file, op);
677 if (ret)
678 goto err_free;
679
680 p->m.private = p;
681 p->ns = ns;
682 p->root = root;
683 p->m.poll_event = ns->event;
684
685 return 0;
686
687 err_free:
688 kfree(p);
689 err_put_path:
690 path_put(&root);
691 err_put_ns:
692 put_mnt_ns(ns);
693 err:
694 return ret;
563} 695}
564 696
697static int mounts_release(struct inode *inode, struct file *file)
698{
699 struct proc_mounts *p = file->private_data;
700 path_put(&p->root);
701 put_mnt_ns(p->ns);
702 return seq_release(inode, file);
703}
565 704
566static int proc_pid_permission(struct inode *inode, int mask) 705static unsigned mounts_poll(struct file *file, poll_table *wait)
567{ 706{
568 struct pid_namespace *pid = inode->i_sb->s_fs_info; 707 struct proc_mounts *p = file->private_data;
569 struct task_struct *task; 708 unsigned res = POLLIN | POLLRDNORM;
570 bool has_perms;
571 709
572 task = get_proc_task(inode); 710 poll_wait(file, &p->ns->poll, wait);
573 if (!task) 711 if (mnt_had_events(p))
574 return -ESRCH; 712 res |= POLLERR | POLLPRI;
575 has_perms = has_pid_permissions(pid, task, 1);
576 put_task_struct(task);
577 713
578 if (!has_perms) { 714 return res;
579 if (pid->hide_pid == 2) { 715}
580 /*
581 * Let's make getdents(), stat(), and open()
582 * consistent with each other. If a process
583 * may not stat() a file, it shouldn't be seen
584 * in procfs at all.
585 */
586 return -ENOENT;
587 }
588 716
589 return -EPERM; 717static int mounts_open(struct inode *inode, struct file *file)
590 } 718{
591 return generic_permission(inode, mask); 719 return mounts_open_common(inode, file, &mounts_op);
592} 720}
593 721
722static const struct file_operations proc_mounts_operations = {
723 .open = mounts_open,
724 .read = seq_read,
725 .llseek = seq_lseek,
726 .release = mounts_release,
727 .poll = mounts_poll,
728};
594 729
730static int mountinfo_open(struct inode *inode, struct file *file)
731{
732 return mounts_open_common(inode, file, &mountinfo_op);
733}
595 734
596static const struct inode_operations proc_def_inode_operations = { 735static const struct file_operations proc_mountinfo_operations = {
597 .setattr = proc_setattr, 736 .open = mountinfo_open,
737 .read = seq_read,
738 .llseek = seq_lseek,
739 .release = mounts_release,
740 .poll = mounts_poll,
741};
742
743static int mountstats_open(struct inode *inode, struct file *file)
744{
745 return mounts_open_common(inode, file, &mountstats_op);
746}
747
748static const struct file_operations proc_mountstats_operations = {
749 .open = mountstats_open,
750 .read = seq_read,
751 .llseek = seq_lseek,
752 .release = mounts_release,
598}; 753};
599 754
600#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */ 755#define PROC_BLOCK_SIZE (3*1024) /* 4K page size but our output routines use some slack for overruns */
@@ -666,105 +821,139 @@ static const struct file_operations proc_single_file_operations = {
666 .release = single_release, 821 .release = single_release,
667}; 822};
668 823
669static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) 824static int mem_open(struct inode* inode, struct file* file)
825{
826 file->private_data = (void*)((long)current->self_exec_id);
827 /* OK to pass negative loff_t, we can catch out-of-range */
828 file->f_mode |= FMODE_UNSIGNED_OFFSET;
829 return 0;
830}
831
832static ssize_t mem_read(struct file * file, char __user * buf,
833 size_t count, loff_t *ppos)
670{ 834{
671 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 835 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
836 char *page;
837 unsigned long src = *ppos;
838 int ret = -ESRCH;
672 struct mm_struct *mm; 839 struct mm_struct *mm;
673 840
674 if (!task) 841 if (!task)
675 return -ESRCH; 842 goto out_no_task;
676 843
677 mm = mm_access(task, mode); 844 ret = -ENOMEM;
678 put_task_struct(task); 845 page = (char *)__get_free_page(GFP_TEMPORARY);
846 if (!page)
847 goto out;
679 848
849 mm = check_mem_permission(task);
850 ret = PTR_ERR(mm);
680 if (IS_ERR(mm)) 851 if (IS_ERR(mm))
681 return PTR_ERR(mm); 852 goto out_free;
682
683 if (mm) {
684 /* ensure this mm_struct can't be freed */
685 atomic_inc(&mm->mm_count);
686 /* but do not pin its memory */
687 mmput(mm);
688 }
689 853
690 file->private_data = mm; 854 ret = -EIO;
855
856 if (file->private_data != (void*)((long)current->self_exec_id))
857 goto out_put;
691 858
692 return 0; 859 ret = 0;
693} 860
861 while (count > 0) {
862 int this_len, retval;
694 863
695static int mem_open(struct inode *inode, struct file *file) 864 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
696{ 865 retval = access_remote_vm(mm, src, page, this_len, 0);
697 int ret = __mem_open(inode, file, PTRACE_MODE_ATTACH); 866 if (!retval) {
867 if (!ret)
868 ret = -EIO;
869 break;
870 }
698 871
699 /* OK to pass negative loff_t, we can catch out-of-range */ 872 if (copy_to_user(buf, page, retval)) {
700 file->f_mode |= FMODE_UNSIGNED_OFFSET; 873 ret = -EFAULT;
874 break;
875 }
876
877 ret += retval;
878 src += retval;
879 buf += retval;
880 count -= retval;
881 }
882 *ppos = src;
701 883
884out_put:
885 mmput(mm);
886out_free:
887 free_page((unsigned long) page);
888out:
889 put_task_struct(task);
890out_no_task:
702 return ret; 891 return ret;
703} 892}
704 893
705static ssize_t mem_rw(struct file *file, char __user *buf, 894#define mem_write NULL
706 size_t count, loff_t *ppos, int write) 895
896#ifndef mem_write
897/* This is a security hazard */
898static ssize_t mem_write(struct file * file, const char __user *buf,
899 size_t count, loff_t *ppos)
707{ 900{
708 struct mm_struct *mm = file->private_data; 901 int copied;
709 unsigned long addr = *ppos;
710 ssize_t copied;
711 char *page; 902 char *page;
903 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
904 unsigned long dst = *ppos;
905 struct mm_struct *mm;
712 906
713 if (!mm) 907 copied = -ESRCH;
714 return 0; 908 if (!task)
909 goto out_no_task;
715 910
911 copied = -ENOMEM;
716 page = (char *)__get_free_page(GFP_TEMPORARY); 912 page = (char *)__get_free_page(GFP_TEMPORARY);
717 if (!page) 913 if (!page)
718 return -ENOMEM; 914 goto out_task;
719 915
720 copied = 0; 916 mm = check_mem_permission(task);
721 if (!atomic_inc_not_zero(&mm->mm_users)) 917 copied = PTR_ERR(mm);
722 goto free; 918 if (IS_ERR(mm))
919 goto out_free;
723 920
921 copied = -EIO;
922 if (file->private_data != (void *)((long)current->self_exec_id))
923 goto out_mm;
924
925 copied = 0;
724 while (count > 0) { 926 while (count > 0) {
725 int this_len = min_t(int, count, PAGE_SIZE); 927 int this_len, retval;
726 928
727 if (write && copy_from_user(page, buf, this_len)) { 929 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
930 if (copy_from_user(page, buf, this_len)) {
728 copied = -EFAULT; 931 copied = -EFAULT;
729 break; 932 break;
730 } 933 }
731 934 retval = access_remote_vm(mm, dst, page, this_len, 1);
732 this_len = access_remote_vm(mm, addr, page, this_len, write); 935 if (!retval) {
733 if (!this_len) {
734 if (!copied) 936 if (!copied)
735 copied = -EIO; 937 copied = -EIO;
736 break; 938 break;
737 } 939 }
738 940 copied += retval;
739 if (!write && copy_to_user(buf, page, this_len)) { 941 buf += retval;
740 copied = -EFAULT; 942 dst += retval;
741 break; 943 count -= retval;
742 }
743
744 buf += this_len;
745 addr += this_len;
746 copied += this_len;
747 count -= this_len;
748 } 944 }
749 *ppos = addr; 945 *ppos = dst;
750 946
947out_mm:
751 mmput(mm); 948 mmput(mm);
752free: 949out_free:
753 free_page((unsigned long) page); 950 free_page((unsigned long) page);
951out_task:
952 put_task_struct(task);
953out_no_task:
754 return copied; 954 return copied;
755} 955}
756 956#endif
757static ssize_t mem_read(struct file *file, char __user *buf,
758 size_t count, loff_t *ppos)
759{
760 return mem_rw(file, buf, count, ppos, 0);
761}
762
763static ssize_t mem_write(struct file *file, const char __user *buf,
764 size_t count, loff_t *ppos)
765{
766 return mem_rw(file, (char __user*)buf, count, ppos, 1);
767}
768 957
769loff_t mem_lseek(struct file *file, loff_t offset, int orig) 958loff_t mem_lseek(struct file *file, loff_t offset, int orig)
770{ 959{
@@ -782,58 +971,49 @@ loff_t mem_lseek(struct file *file, loff_t offset, int orig)
782 return file->f_pos; 971 return file->f_pos;
783} 972}
784 973
785static int mem_release(struct inode *inode, struct file *file)
786{
787 struct mm_struct *mm = file->private_data;
788 if (mm)
789 mmdrop(mm);
790 return 0;
791}
792
793static const struct file_operations proc_mem_operations = { 974static const struct file_operations proc_mem_operations = {
794 .llseek = mem_lseek, 975 .llseek = mem_lseek,
795 .read = mem_read, 976 .read = mem_read,
796 .write = mem_write, 977 .write = mem_write,
797 .open = mem_open, 978 .open = mem_open,
798 .release = mem_release,
799}; 979};
800 980
801static int environ_open(struct inode *inode, struct file *file)
802{
803 return __mem_open(inode, file, PTRACE_MODE_READ);
804}
805
806static ssize_t environ_read(struct file *file, char __user *buf, 981static ssize_t environ_read(struct file *file, char __user *buf,
807 size_t count, loff_t *ppos) 982 size_t count, loff_t *ppos)
808{ 983{
984 struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
809 char *page; 985 char *page;
810 unsigned long src = *ppos; 986 unsigned long src = *ppos;
811 int ret = 0; 987 int ret = -ESRCH;
812 struct mm_struct *mm = file->private_data; 988 struct mm_struct *mm;
813 989
814 if (!mm) 990 if (!task)
815 return 0; 991 goto out_no_task;
816 992
993 ret = -ENOMEM;
817 page = (char *)__get_free_page(GFP_TEMPORARY); 994 page = (char *)__get_free_page(GFP_TEMPORARY);
818 if (!page) 995 if (!page)
819 return -ENOMEM; 996 goto out;
997
998
999 mm = mm_for_maps(task);
1000 ret = PTR_ERR(mm);
1001 if (!mm || IS_ERR(mm))
1002 goto out_free;
820 1003
821 ret = 0; 1004 ret = 0;
822 if (!atomic_inc_not_zero(&mm->mm_users))
823 goto free;
824 while (count > 0) { 1005 while (count > 0) {
825 size_t this_len, max_len; 1006 int this_len, retval, max_len;
826 int retval;
827
828 if (src >= (mm->env_end - mm->env_start))
829 break;
830 1007
831 this_len = mm->env_end - (mm->env_start + src); 1008 this_len = mm->env_end - (mm->env_start + src);
832 1009
833 max_len = min_t(size_t, PAGE_SIZE, count); 1010 if (this_len <= 0)
834 this_len = min(max_len, this_len); 1011 break;
1012
1013 max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
1014 this_len = (this_len > max_len) ? max_len : this_len;
835 1015
836 retval = access_remote_vm(mm, (mm->env_start + src), 1016 retval = access_process_vm(task, (mm->env_start + src),
837 page, this_len, 0); 1017 page, this_len, 0);
838 1018
839 if (retval <= 0) { 1019 if (retval <= 0) {
@@ -852,50 +1032,51 @@ static ssize_t environ_read(struct file *file, char __user *buf,
852 count -= retval; 1032 count -= retval;
853 } 1033 }
854 *ppos = src; 1034 *ppos = src;
855 mmput(mm);
856 1035
857free: 1036 mmput(mm);
1037out_free:
858 free_page((unsigned long) page); 1038 free_page((unsigned long) page);
1039out:
1040 put_task_struct(task);
1041out_no_task:
859 return ret; 1042 return ret;
860} 1043}
861 1044
862static const struct file_operations proc_environ_operations = { 1045static const struct file_operations proc_environ_operations = {
863 .open = environ_open,
864 .read = environ_read, 1046 .read = environ_read,
865 .llseek = generic_file_llseek, 1047 .llseek = generic_file_llseek,
866 .release = mem_release,
867}; 1048};
868 1049
869static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, 1050static ssize_t oom_adjust_read(struct file *file, char __user *buf,
870 loff_t *ppos) 1051 size_t count, loff_t *ppos)
871{ 1052{
872 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 1053 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
873 char buffer[PROC_NUMBUF]; 1054 char buffer[PROC_NUMBUF];
874 int oom_adj = OOM_ADJUST_MIN;
875 size_t len; 1055 size_t len;
1056 int oom_adjust = OOM_DISABLE;
876 unsigned long flags; 1057 unsigned long flags;
877 1058
878 if (!task) 1059 if (!task)
879 return -ESRCH; 1060 return -ESRCH;
1061
880 if (lock_task_sighand(task, &flags)) { 1062 if (lock_task_sighand(task, &flags)) {
881 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) 1063 oom_adjust = task->signal->oom_adj;
882 oom_adj = OOM_ADJUST_MAX;
883 else
884 oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
885 OOM_SCORE_ADJ_MAX;
886 unlock_task_sighand(task, &flags); 1064 unlock_task_sighand(task, &flags);
887 } 1065 }
1066
888 put_task_struct(task); 1067 put_task_struct(task);
889 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); 1068
1069 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
1070
890 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1071 return simple_read_from_buffer(buf, count, ppos, buffer, len);
891} 1072}
892 1073
893static ssize_t oom_adj_write(struct file *file, const char __user *buf, 1074static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
894 size_t count, loff_t *ppos) 1075 size_t count, loff_t *ppos)
895{ 1076{
896 struct task_struct *task; 1077 struct task_struct *task;
897 char buffer[PROC_NUMBUF]; 1078 char buffer[PROC_NUMBUF];
898 int oom_adj; 1079 int oom_adjust;
899 unsigned long flags; 1080 unsigned long flags;
900 int err; 1081 int err;
901 1082
@@ -907,11 +1088,11 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
907 goto out; 1088 goto out;
908 } 1089 }
909 1090
910 err = kstrtoint(strstrip(buffer), 0, &oom_adj); 1091 err = kstrtoint(strstrip(buffer), 0, &oom_adjust);
911 if (err) 1092 if (err)
912 goto out; 1093 goto out;
913 if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && 1094 if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
914 oom_adj != OOM_DISABLE) { 1095 oom_adjust != OOM_DISABLE) {
915 err = -EINVAL; 1096 err = -EINVAL;
916 goto out; 1097 goto out;
917 } 1098 }
@@ -933,31 +1114,35 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
933 goto err_task_lock; 1114 goto err_task_lock;
934 } 1115 }
935 1116
936 /* 1117 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
937 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
938 * value is always attainable.
939 */
940 if (oom_adj == OOM_ADJUST_MAX)
941 oom_adj = OOM_SCORE_ADJ_MAX;
942 else
943 oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
944
945 if (oom_adj < task->signal->oom_score_adj &&
946 !capable(CAP_SYS_RESOURCE)) {
947 err = -EACCES; 1118 err = -EACCES;
948 goto err_sighand; 1119 goto err_sighand;
949 } 1120 }
950 1121
1122 if (oom_adjust != task->signal->oom_adj) {
1123 if (oom_adjust == OOM_DISABLE)
1124 atomic_inc(&task->mm->oom_disable_count);
1125 if (task->signal->oom_adj == OOM_DISABLE)
1126 atomic_dec(&task->mm->oom_disable_count);
1127 }
1128
951 /* 1129 /*
952 * /proc/pid/oom_adj is provided for legacy purposes, ask users to use 1130 * Warn that /proc/pid/oom_adj is deprecated, see
953 * /proc/pid/oom_score_adj instead. 1131 * Documentation/feature-removal-schedule.txt.
954 */ 1132 */
955 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", 1133 printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
956 current->comm, task_pid_nr(current), task_pid_nr(task), 1134 current->comm, task_pid_nr(current), task_pid_nr(task),
957 task_pid_nr(task)); 1135 task_pid_nr(task));
958 1136 task->signal->oom_adj = oom_adjust;
959 task->signal->oom_score_adj = oom_adj; 1137 /*
960 trace_oom_score_adj_update(task); 1138 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
1139 * value is always attainable.
1140 */
1141 if (task->signal->oom_adj == OOM_ADJUST_MAX)
1142 task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
1143 else
1144 task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
1145 -OOM_DISABLE;
961err_sighand: 1146err_sighand:
962 unlock_task_sighand(task, &flags); 1147 unlock_task_sighand(task, &flags);
963err_task_lock: 1148err_task_lock:
@@ -967,9 +1152,41 @@ out:
967 return err < 0 ? err : count; 1152 return err < 0 ? err : count;
968} 1153}
969 1154
970static const struct file_operations proc_oom_adj_operations = { 1155static int oom_adjust_permission(struct inode *inode, int mask)
971 .read = oom_adj_read, 1156{
972 .write = oom_adj_write, 1157 uid_t uid;
1158 struct task_struct *p;
1159
1160 if (mask & MAY_NOT_BLOCK)
1161 return -ECHILD;
1162
1163 p = get_proc_task(inode);
1164 if(p) {
1165 uid = task_uid(p);
1166 put_task_struct(p);
1167 }
1168
1169 /*
1170 * System Server (uid == 1000) is granted access to oom_adj of all
1171 * android applications (uid > 10000) as and services (uid >= 1000)
1172 */
1173 if (p && (current_fsuid() == 1000) && (uid >= 1000)) {
1174 if (inode->i_mode >> 6 & mask) {
1175 return 0;
1176 }
1177 }
1178
1179 /* Fall back to default. */
1180 return generic_permission(inode, mask);
1181}
1182
1183static const struct inode_operations proc_oom_adjust_inode_operations = {
1184 .permission = oom_adjust_permission,
1185};
1186
1187static const struct file_operations proc_oom_adjust_operations = {
1188 .read = oom_adjust_read,
1189 .write = oom_adjust_write,
973 .llseek = generic_file_llseek, 1190 .llseek = generic_file_llseek,
974}; 1191};
975 1192
@@ -978,7 +1195,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
978{ 1195{
979 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 1196 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
980 char buffer[PROC_NUMBUF]; 1197 char buffer[PROC_NUMBUF];
981 short oom_score_adj = OOM_SCORE_ADJ_MIN; 1198 int oom_score_adj = OOM_SCORE_ADJ_MIN;
982 unsigned long flags; 1199 unsigned long flags;
983 size_t len; 1200 size_t len;
984 1201
@@ -989,7 +1206,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
989 unlock_task_sighand(task, &flags); 1206 unlock_task_sighand(task, &flags);
990 } 1207 }
991 put_task_struct(task); 1208 put_task_struct(task);
992 len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); 1209 len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
993 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1210 return simple_read_from_buffer(buf, count, ppos, buffer, len);
994} 1211}
995 1212
@@ -1036,17 +1253,30 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
1036 goto err_task_lock; 1253 goto err_task_lock;
1037 } 1254 }
1038 1255
1039 if ((short)oom_score_adj < task->signal->oom_score_adj_min && 1256 if (oom_score_adj < task->signal->oom_score_adj_min &&
1040 !capable(CAP_SYS_RESOURCE)) { 1257 !capable(CAP_SYS_RESOURCE)) {
1041 err = -EACCES; 1258 err = -EACCES;
1042 goto err_sighand; 1259 goto err_sighand;
1043 } 1260 }
1044 1261
1045 task->signal->oom_score_adj = (short)oom_score_adj; 1262 if (oom_score_adj != task->signal->oom_score_adj) {
1263 if (oom_score_adj == OOM_SCORE_ADJ_MIN)
1264 atomic_inc(&task->mm->oom_disable_count);
1265 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1266 atomic_dec(&task->mm->oom_disable_count);
1267 }
1268 task->signal->oom_score_adj = oom_score_adj;
1046 if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) 1269 if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
1047 task->signal->oom_score_adj_min = (short)oom_score_adj; 1270 task->signal->oom_score_adj_min = oom_score_adj;
1048 trace_oom_score_adj_update(task); 1271 /*
1049 1272 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
1273 * always attainable.
1274 */
1275 if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1276 task->signal->oom_adj = OOM_DISABLE;
1277 else
1278 task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
1279 OOM_SCORE_ADJ_MAX;
1050err_sighand: 1280err_sighand:
1051 unlock_task_sighand(task, &flags); 1281 unlock_task_sighand(task, &flags);
1052err_task_lock: 1282err_task_lock:
@@ -1075,8 +1305,7 @@ static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1075 if (!task) 1305 if (!task)
1076 return -ESRCH; 1306 return -ESRCH;
1077 length = scnprintf(tmpbuf, TMPBUFLEN, "%u", 1307 length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1078 from_kuid(file->f_cred->user_ns, 1308 audit_get_loginuid(task));
1079 audit_get_loginuid(task)));
1080 put_task_struct(task); 1309 put_task_struct(task);
1081 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); 1310 return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1082} 1311}
@@ -1088,7 +1317,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1088 char *page, *tmp; 1317 char *page, *tmp;
1089 ssize_t length; 1318 ssize_t length;
1090 uid_t loginuid; 1319 uid_t loginuid;
1091 kuid_t kloginuid; 1320
1321 if (!capable(CAP_AUDIT_CONTROL))
1322 return -EPERM;
1092 1323
1093 rcu_read_lock(); 1324 rcu_read_lock();
1094 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { 1325 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1118,13 +1349,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1118 goto out_free_page; 1349 goto out_free_page;
1119 1350
1120 } 1351 }
1121 kloginuid = make_kuid(file->f_cred->user_ns, loginuid); 1352 length = audit_set_loginuid(current, loginuid);
1122 if (!uid_valid(kloginuid)) {
1123 length = -EINVAL;
1124 goto out_free_page;
1125 }
1126
1127 length = audit_set_loginuid(kloginuid);
1128 if (likely(length == 0)) 1353 if (likely(length == 0))
1129 length = count; 1354 length = count;
1130 1355
@@ -1308,7 +1533,8 @@ sched_autogroup_write(struct file *file, const char __user *buf,
1308 if (!p) 1533 if (!p)
1309 return -ESRCH; 1534 return -ESRCH;
1310 1535
1311 err = proc_sched_autogroup_set_nice(p, nice); 1536 err = nice;
1537 err = proc_sched_autogroup_set_nice(p, &err);
1312 if (err) 1538 if (err)
1313 count = err; 1539 count = err;
1314 1540
@@ -1398,13 +1624,13 @@ static const struct file_operations proc_pid_set_comm_operations = {
1398 .release = single_release, 1624 .release = single_release,
1399}; 1625};
1400 1626
1401static int proc_exe_link(struct dentry *dentry, struct path *exe_path) 1627static int proc_exe_link(struct inode *inode, struct path *exe_path)
1402{ 1628{
1403 struct task_struct *task; 1629 struct task_struct *task;
1404 struct mm_struct *mm; 1630 struct mm_struct *mm;
1405 struct file *exe_file; 1631 struct file *exe_file;
1406 1632
1407 task = get_proc_task(dentry->d_inode); 1633 task = get_proc_task(inode);
1408 if (!task) 1634 if (!task)
1409 return -ENOENT; 1635 return -ENOENT;
1410 mm = get_task_mm(task); 1636 mm = get_task_mm(task);
@@ -1425,19 +1651,16 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
1425static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) 1651static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1426{ 1652{
1427 struct inode *inode = dentry->d_inode; 1653 struct inode *inode = dentry->d_inode;
1428 struct path path;
1429 int error = -EACCES; 1654 int error = -EACCES;
1430 1655
1656 /* We don't need a base pointer in the /proc filesystem */
1657 path_put(&nd->path);
1658
1431 /* Are we allowed to snoop on the tasks file descriptors? */ 1659 /* Are we allowed to snoop on the tasks file descriptors? */
1432 if (!proc_fd_access_allowed(inode)) 1660 if (!proc_fd_access_allowed(inode))
1433 goto out; 1661 goto out;
1434 1662
1435 error = PROC_I(inode)->op.proc_get_link(dentry, &path); 1663 error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
1436 if (error)
1437 goto out;
1438
1439 nd_jump_link(nd, &path);
1440 return NULL;
1441out: 1664out:
1442 return ERR_PTR(error); 1665 return ERR_PTR(error);
1443} 1666}
@@ -1476,7 +1699,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
1476 if (!proc_fd_access_allowed(inode)) 1699 if (!proc_fd_access_allowed(inode))
1477 goto out; 1700 goto out;
1478 1701
1479 error = PROC_I(inode)->op.proc_get_link(dentry, &path); 1702 error = PROC_I(inode)->op.proc_get_link(inode, &path);
1480 if (error) 1703 if (error)
1481 goto out; 1704 goto out;
1482 1705
@@ -1486,7 +1709,7 @@ out:
1486 return error; 1709 return error;
1487} 1710}
1488 1711
1489const struct inode_operations proc_pid_link_inode_operations = { 1712static const struct inode_operations proc_pid_link_inode_operations = {
1490 .readlink = proc_pid_readlink, 1713 .readlink = proc_pid_readlink,
1491 .follow_link = proc_pid_follow_link, 1714 .follow_link = proc_pid_follow_link,
1492 .setattr = proc_setattr, 1715 .setattr = proc_setattr,
@@ -1495,6 +1718,21 @@ const struct inode_operations proc_pid_link_inode_operations = {
1495 1718
1496/* building an inode */ 1719/* building an inode */
1497 1720
1721static int task_dumpable(struct task_struct *task)
1722{
1723 int dumpable = 0;
1724 struct mm_struct *mm;
1725
1726 task_lock(task);
1727 mm = task->mm;
1728 if (mm)
1729 dumpable = get_dumpable(mm);
1730 task_unlock(task);
1731 if(dumpable == 1)
1732 return 1;
1733 return 0;
1734}
1735
1498struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) 1736struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1499{ 1737{
1500 struct inode * inode; 1738 struct inode * inode;
@@ -1542,23 +1780,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1542 struct inode *inode = dentry->d_inode; 1780 struct inode *inode = dentry->d_inode;
1543 struct task_struct *task; 1781 struct task_struct *task;
1544 const struct cred *cred; 1782 const struct cred *cred;
1545 struct pid_namespace *pid = dentry->d_sb->s_fs_info;
1546 1783
1547 generic_fillattr(inode, stat); 1784 generic_fillattr(inode, stat);
1548 1785
1549 rcu_read_lock(); 1786 rcu_read_lock();
1550 stat->uid = GLOBAL_ROOT_UID; 1787 stat->uid = 0;
1551 stat->gid = GLOBAL_ROOT_GID; 1788 stat->gid = 0;
1552 task = pid_task(proc_pid(inode), PIDTYPE_PID); 1789 task = pid_task(proc_pid(inode), PIDTYPE_PID);
1553 if (task) { 1790 if (task) {
1554 if (!has_pid_permissions(pid, task, 2)) {
1555 rcu_read_unlock();
1556 /*
1557 * This doesn't prevent learning whether PID exists,
1558 * it only makes getattr() consistent with readdir().
1559 */
1560 return -ENOENT;
1561 }
1562 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || 1791 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1563 task_dumpable(task)) { 1792 task_dumpable(task)) {
1564 cred = __task_cred(task); 1793 cred = __task_cred(task);
@@ -1587,13 +1816,13 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1587 * made this apply to all per process world readable and executable 1816 * made this apply to all per process world readable and executable
1588 * directories. 1817 * directories.
1589 */ 1818 */
1590int pid_revalidate(struct dentry *dentry, unsigned int flags) 1819int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1591{ 1820{
1592 struct inode *inode; 1821 struct inode *inode;
1593 struct task_struct *task; 1822 struct task_struct *task;
1594 const struct cred *cred; 1823 const struct cred *cred;
1595 1824
1596 if (flags & LOOKUP_RCU) 1825 if (nd && nd->flags & LOOKUP_RCU)
1597 return -ECHILD; 1826 return -ECHILD;
1598 1827
1599 inode = dentry->d_inode; 1828 inode = dentry->d_inode;
@@ -1608,8 +1837,8 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1608 inode->i_gid = cred->egid; 1837 inode->i_gid = cred->egid;
1609 rcu_read_unlock(); 1838 rcu_read_unlock();
1610 } else { 1839 } else {
1611 inode->i_uid = GLOBAL_ROOT_UID; 1840 inode->i_uid = 0;
1612 inode->i_gid = GLOBAL_ROOT_GID; 1841 inode->i_gid = 0;
1613 } 1842 }
1614 inode->i_mode &= ~(S_ISUID | S_ISGID); 1843 inode->i_mode &= ~(S_ISUID | S_ISGID);
1615 security_task_to_inode(task, inode); 1844 security_task_to_inode(task, inode);
@@ -1620,6 +1849,15 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
1620 return 0; 1849 return 0;
1621} 1850}
1622 1851
1852static int pid_delete_dentry(const struct dentry * dentry)
1853{
1854 /* Is the task we represent dead?
1855 * If so, then don't put the dentry on the lru list,
1856 * kill it immediately.
1857 */
1858 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1859}
1860
1623const struct dentry_operations pid_dentry_operations = 1861const struct dentry_operations pid_dentry_operations =
1624{ 1862{
1625 .d_revalidate = pid_revalidate, 1863 .d_revalidate = pid_revalidate,
@@ -1682,337 +1920,375 @@ end_instantiate:
1682 return filldir(dirent, name, len, filp->f_pos, ino, type); 1920 return filldir(dirent, name, len, filp->f_pos, ino, type);
1683} 1921}
1684 1922
1685#ifdef CONFIG_CHECKPOINT_RESTORE 1923static unsigned name_to_int(struct dentry *dentry)
1924{
1925 const char *name = dentry->d_name.name;
1926 int len = dentry->d_name.len;
1927 unsigned n = 0;
1686 1928
1687/* 1929 if (len > 1 && *name == '0')
1688 * dname_to_vma_addr - maps a dentry name into two unsigned longs 1930 goto out;
1689 * which represent vma start and end addresses. 1931 while (len-- > 0) {
1690 */ 1932 unsigned c = *name++ - '0';
1691static int dname_to_vma_addr(struct dentry *dentry, 1933 if (c > 9)
1692 unsigned long *start, unsigned long *end) 1934 goto out;
1935 if (n >= (~0U-9)/10)
1936 goto out;
1937 n *= 10;
1938 n += c;
1939 }
1940 return n;
1941out:
1942 return ~0U;
1943}
1944
1945#define PROC_FDINFO_MAX 64
1946
1947static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1693{ 1948{
1694 if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) 1949 struct task_struct *task = get_proc_task(inode);
1695 return -EINVAL; 1950 struct files_struct *files = NULL;
1951 struct file *file;
1952 int fd = proc_fd(inode);
1696 1953
1697 return 0; 1954 if (task) {
1955 files = get_files_struct(task);
1956 put_task_struct(task);
1957 }
1958 if (files) {
1959 /*
1960 * We are not taking a ref to the file structure, so we must
1961 * hold ->file_lock.
1962 */
1963 spin_lock(&files->file_lock);
1964 file = fcheck_files(files, fd);
1965 if (file) {
1966 unsigned int f_flags;
1967 struct fdtable *fdt;
1968
1969 fdt = files_fdtable(files);
1970 f_flags = file->f_flags & ~O_CLOEXEC;
1971 if (FD_ISSET(fd, fdt->close_on_exec))
1972 f_flags |= O_CLOEXEC;
1973
1974 if (path) {
1975 *path = file->f_path;
1976 path_get(&file->f_path);
1977 }
1978 if (info)
1979 snprintf(info, PROC_FDINFO_MAX,
1980 "pos:\t%lli\n"
1981 "flags:\t0%o\n",
1982 (long long) file->f_pos,
1983 f_flags);
1984 spin_unlock(&files->file_lock);
1985 put_files_struct(files);
1986 return 0;
1987 }
1988 spin_unlock(&files->file_lock);
1989 put_files_struct(files);
1990 }
1991 return -ENOENT;
1698} 1992}
1699 1993
1700static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) 1994static int proc_fd_link(struct inode *inode, struct path *path)
1701{ 1995{
1702 unsigned long vm_start, vm_end; 1996 return proc_fd_info(inode, path, NULL);
1703 bool exact_vma_exists = false; 1997}
1704 struct mm_struct *mm = NULL; 1998
1999static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
2000{
2001 struct inode *inode;
1705 struct task_struct *task; 2002 struct task_struct *task;
2003 int fd;
2004 struct files_struct *files;
1706 const struct cred *cred; 2005 const struct cred *cred;
1707 struct inode *inode;
1708 int status = 0;
1709 2006
1710 if (flags & LOOKUP_RCU) 2007 if (nd && nd->flags & LOOKUP_RCU)
1711 return -ECHILD; 2008 return -ECHILD;
1712 2009
1713 if (!capable(CAP_SYS_ADMIN)) {
1714 status = -EACCES;
1715 goto out_notask;
1716 }
1717
1718 inode = dentry->d_inode; 2010 inode = dentry->d_inode;
1719 task = get_proc_task(inode); 2011 task = get_proc_task(inode);
1720 if (!task) 2012 fd = proc_fd(inode);
1721 goto out_notask;
1722
1723 mm = mm_access(task, PTRACE_MODE_READ);
1724 if (IS_ERR_OR_NULL(mm))
1725 goto out;
1726
1727 if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
1728 down_read(&mm->mmap_sem);
1729 exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
1730 up_read(&mm->mmap_sem);
1731 }
1732
1733 mmput(mm);
1734 2013
1735 if (exact_vma_exists) { 2014 if (task) {
1736 if (task_dumpable(task)) { 2015 files = get_files_struct(task);
2016 if (files) {
1737 rcu_read_lock(); 2017 rcu_read_lock();
1738 cred = __task_cred(task); 2018 if (fcheck_files(files, fd)) {
1739 inode->i_uid = cred->euid; 2019 rcu_read_unlock();
1740 inode->i_gid = cred->egid; 2020 put_files_struct(files);
2021 if (task_dumpable(task)) {
2022 rcu_read_lock();
2023 cred = __task_cred(task);
2024 inode->i_uid = cred->euid;
2025 inode->i_gid = cred->egid;
2026 rcu_read_unlock();
2027 } else {
2028 inode->i_uid = 0;
2029 inode->i_gid = 0;
2030 }
2031 inode->i_mode &= ~(S_ISUID | S_ISGID);
2032 security_task_to_inode(task, inode);
2033 put_task_struct(task);
2034 return 1;
2035 }
1741 rcu_read_unlock(); 2036 rcu_read_unlock();
1742 } else { 2037 put_files_struct(files);
1743 inode->i_uid = GLOBAL_ROOT_UID;
1744 inode->i_gid = GLOBAL_ROOT_GID;
1745 } 2038 }
1746 security_task_to_inode(task, inode); 2039 put_task_struct(task);
1747 status = 1;
1748 } 2040 }
1749 2041 d_drop(dentry);
1750out: 2042 return 0;
1751 put_task_struct(task);
1752
1753out_notask:
1754 if (status <= 0)
1755 d_drop(dentry);
1756
1757 return status;
1758} 2043}
1759 2044
1760static const struct dentry_operations tid_map_files_dentry_operations = { 2045static const struct dentry_operations tid_fd_dentry_operations =
1761 .d_revalidate = map_files_d_revalidate,
1762 .d_delete = pid_delete_dentry,
1763};
1764
1765static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
1766{ 2046{
1767 unsigned long vm_start, vm_end; 2047 .d_revalidate = tid_fd_revalidate,
1768 struct vm_area_struct *vma; 2048 .d_delete = pid_delete_dentry,
1769 struct task_struct *task;
1770 struct mm_struct *mm;
1771 int rc;
1772
1773 rc = -ENOENT;
1774 task = get_proc_task(dentry->d_inode);
1775 if (!task)
1776 goto out;
1777
1778 mm = get_task_mm(task);
1779 put_task_struct(task);
1780 if (!mm)
1781 goto out;
1782
1783 rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
1784 if (rc)
1785 goto out_mmput;
1786
1787 down_read(&mm->mmap_sem);
1788 vma = find_exact_vma(mm, vm_start, vm_end);
1789 if (vma && vma->vm_file) {
1790 *path = vma->vm_file->f_path;
1791 path_get(path);
1792 rc = 0;
1793 }
1794 up_read(&mm->mmap_sem);
1795
1796out_mmput:
1797 mmput(mm);
1798out:
1799 return rc;
1800}
1801
1802struct map_files_info {
1803 fmode_t mode;
1804 unsigned long len;
1805 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1806}; 2049};
1807 2050
1808static struct dentry * 2051static struct dentry *proc_fd_instantiate(struct inode *dir,
1809proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 2052 struct dentry *dentry, struct task_struct *task, const void *ptr)
1810 struct task_struct *task, const void *ptr)
1811{ 2053{
1812 fmode_t mode = (fmode_t)(unsigned long)ptr; 2054 unsigned fd = *(const unsigned *)ptr;
1813 struct proc_inode *ei; 2055 struct file *file;
1814 struct inode *inode; 2056 struct files_struct *files;
2057 struct inode *inode;
2058 struct proc_inode *ei;
2059 struct dentry *error = ERR_PTR(-ENOENT);
1815 2060
1816 inode = proc_pid_make_inode(dir->i_sb, task); 2061 inode = proc_pid_make_inode(dir->i_sb, task);
1817 if (!inode) 2062 if (!inode)
1818 return ERR_PTR(-ENOENT); 2063 goto out;
1819
1820 ei = PROC_I(inode); 2064 ei = PROC_I(inode);
1821 ei->op.proc_get_link = proc_map_files_get_link; 2065 ei->fd = fd;
1822 2066 files = get_files_struct(task);
1823 inode->i_op = &proc_pid_link_inode_operations; 2067 if (!files)
1824 inode->i_size = 64; 2068 goto out_iput;
1825 inode->i_mode = S_IFLNK; 2069 inode->i_mode = S_IFLNK;
1826 2070
1827 if (mode & FMODE_READ) 2071 /*
1828 inode->i_mode |= S_IRUSR; 2072 * We are not taking a ref to the file structure, so we must
1829 if (mode & FMODE_WRITE) 2073 * hold ->file_lock.
1830 inode->i_mode |= S_IWUSR; 2074 */
2075 spin_lock(&files->file_lock);
2076 file = fcheck_files(files, fd);
2077 if (!file)
2078 goto out_unlock;
2079 if (file->f_mode & FMODE_READ)
2080 inode->i_mode |= S_IRUSR | S_IXUSR;
2081 if (file->f_mode & FMODE_WRITE)
2082 inode->i_mode |= S_IWUSR | S_IXUSR;
2083 spin_unlock(&files->file_lock);
2084 put_files_struct(files);
1831 2085
1832 d_set_d_op(dentry, &tid_map_files_dentry_operations); 2086 inode->i_op = &proc_pid_link_inode_operations;
2087 inode->i_size = 64;
2088 ei->op.proc_get_link = proc_fd_link;
2089 d_set_d_op(dentry, &tid_fd_dentry_operations);
1833 d_add(dentry, inode); 2090 d_add(dentry, inode);
2091 /* Close the race of the process dying before we return the dentry */
2092 if (tid_fd_revalidate(dentry, NULL))
2093 error = NULL;
1834 2094
1835 return NULL; 2095 out:
2096 return error;
2097out_unlock:
2098 spin_unlock(&files->file_lock);
2099 put_files_struct(files);
2100out_iput:
2101 iput(inode);
2102 goto out;
1836} 2103}
1837 2104
1838static struct dentry *proc_map_files_lookup(struct inode *dir, 2105static struct dentry *proc_lookupfd_common(struct inode *dir,
1839 struct dentry *dentry, unsigned int flags) 2106 struct dentry *dentry,
2107 instantiate_t instantiate)
1840{ 2108{
1841 unsigned long vm_start, vm_end; 2109 struct task_struct *task = get_proc_task(dir);
1842 struct vm_area_struct *vma; 2110 unsigned fd = name_to_int(dentry);
1843 struct task_struct *task; 2111 struct dentry *result = ERR_PTR(-ENOENT);
1844 struct dentry *result;
1845 struct mm_struct *mm;
1846
1847 result = ERR_PTR(-EACCES);
1848 if (!capable(CAP_SYS_ADMIN))
1849 goto out;
1850 2112
1851 result = ERR_PTR(-ENOENT);
1852 task = get_proc_task(dir);
1853 if (!task) 2113 if (!task)
2114 goto out_no_task;
2115 if (fd == ~0U)
1854 goto out; 2116 goto out;
1855 2117
1856 result = ERR_PTR(-EACCES); 2118 result = instantiate(dir, dentry, task, &fd);
1857 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2119out:
1858 goto out_put_task; 2120 put_task_struct(task);
2121out_no_task:
2122 return result;
2123}
1859 2124
1860 result = ERR_PTR(-ENOENT); 2125static int proc_readfd_common(struct file * filp, void * dirent,
1861 if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) 2126 filldir_t filldir, instantiate_t instantiate)
1862 goto out_put_task; 2127{
2128 struct dentry *dentry = filp->f_path.dentry;
2129 struct inode *inode = dentry->d_inode;
2130 struct task_struct *p = get_proc_task(inode);
2131 unsigned int fd, ino;
2132 int retval;
2133 struct files_struct * files;
1863 2134
1864 mm = get_task_mm(task); 2135 retval = -ENOENT;
1865 if (!mm) 2136 if (!p)
1866 goto out_put_task; 2137 goto out_no_task;
2138 retval = 0;
1867 2139
1868 down_read(&mm->mmap_sem); 2140 fd = filp->f_pos;
1869 vma = find_exact_vma(mm, vm_start, vm_end); 2141 switch (fd) {
1870 if (!vma) 2142 case 0:
1871 goto out_no_vma; 2143 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
2144 goto out;
2145 filp->f_pos++;
2146 case 1:
2147 ino = parent_ino(dentry);
2148 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
2149 goto out;
2150 filp->f_pos++;
2151 default:
2152 files = get_files_struct(p);
2153 if (!files)
2154 goto out;
2155 rcu_read_lock();
2156 for (fd = filp->f_pos-2;
2157 fd < files_fdtable(files)->max_fds;
2158 fd++, filp->f_pos++) {
2159 char name[PROC_NUMBUF];
2160 int len;
1872 2161
1873 if (vma->vm_file) 2162 if (!fcheck_files(files, fd))
1874 result = proc_map_files_instantiate(dir, dentry, task, 2163 continue;
1875 (void *)(unsigned long)vma->vm_file->f_mode); 2164 rcu_read_unlock();
1876 2165
1877out_no_vma: 2166 len = snprintf(name, sizeof(name), "%d", fd);
1878 up_read(&mm->mmap_sem); 2167 if (proc_fill_cache(filp, dirent, filldir,
1879 mmput(mm); 2168 name, len, instantiate,
1880out_put_task: 2169 p, &fd) < 0) {
1881 put_task_struct(task); 2170 rcu_read_lock();
2171 break;
2172 }
2173 rcu_read_lock();
2174 }
2175 rcu_read_unlock();
2176 put_files_struct(files);
2177 }
1882out: 2178out:
1883 return result; 2179 put_task_struct(p);
2180out_no_task:
2181 return retval;
1884} 2182}
1885 2183
1886static const struct inode_operations proc_map_files_inode_operations = { 2184static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
1887 .lookup = proc_map_files_lookup, 2185 struct nameidata *nd)
1888 .permission = proc_fd_permission,
1889 .setattr = proc_setattr,
1890};
1891
1892static int
1893proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
1894{ 2186{
1895 struct dentry *dentry = filp->f_path.dentry; 2187 return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
1896 struct inode *inode = dentry->d_inode; 2188}
1897 struct vm_area_struct *vma;
1898 struct task_struct *task;
1899 struct mm_struct *mm;
1900 ino_t ino;
1901 int ret;
1902 2189
1903 ret = -EACCES; 2190static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
1904 if (!capable(CAP_SYS_ADMIN)) 2191{
1905 goto out; 2192 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
2193}
1906 2194
1907 ret = -ENOENT; 2195static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1908 task = get_proc_task(inode); 2196 size_t len, loff_t *ppos)
1909 if (!task) 2197{
1910 goto out; 2198 char tmp[PROC_FDINFO_MAX];
2199 int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
2200 if (!err)
2201 err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
2202 return err;
2203}
1911 2204
1912 ret = -EACCES; 2205static const struct file_operations proc_fdinfo_file_operations = {
1913 if (!ptrace_may_access(task, PTRACE_MODE_READ)) 2206 .open = nonseekable_open,
1914 goto out_put_task; 2207 .read = proc_fdinfo_read,
2208 .llseek = no_llseek,
2209};
1915 2210
1916 ret = 0; 2211static const struct file_operations proc_fd_operations = {
1917 switch (filp->f_pos) { 2212 .read = generic_read_dir,
1918 case 0: 2213 .readdir = proc_readfd,
1919 ino = inode->i_ino; 2214 .llseek = default_llseek,
1920 if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) 2215};
1921 goto out_put_task;
1922 filp->f_pos++;
1923 case 1:
1924 ino = parent_ino(dentry);
1925 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1926 goto out_put_task;
1927 filp->f_pos++;
1928 default:
1929 {
1930 unsigned long nr_files, pos, i;
1931 struct flex_array *fa = NULL;
1932 struct map_files_info info;
1933 struct map_files_info *p;
1934 2216
1935 mm = get_task_mm(task); 2217/*
1936 if (!mm) 2218 * /proc/pid/fd needs a special permission handler so that a process can still
1937 goto out_put_task; 2219 * access /proc/self/fd after it has executed a setuid().
1938 down_read(&mm->mmap_sem); 2220 */
2221static int proc_fd_permission(struct inode *inode, int mask)
2222{
2223 int rv = generic_permission(inode, mask);
2224 if (rv == 0)
2225 return 0;
2226 if (task_pid(current) == proc_pid(inode))
2227 rv = 0;
2228 return rv;
2229}
1939 2230
1940 nr_files = 0; 2231/*
2232 * proc directories can do almost nothing..
2233 */
2234static const struct inode_operations proc_fd_inode_operations = {
2235 .lookup = proc_lookupfd,
2236 .permission = proc_fd_permission,
2237 .setattr = proc_setattr,
2238};
1941 2239
1942 /* 2240static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
1943 * We need two passes here: 2241 struct dentry *dentry, struct task_struct *task, const void *ptr)
1944 * 2242{
1945 * 1) Collect vmas of mapped files with mmap_sem taken 2243 unsigned fd = *(unsigned *)ptr;
1946 * 2) Release mmap_sem and instantiate entries 2244 struct inode *inode;
1947 * 2245 struct proc_inode *ei;
1948 * otherwise lockdep complains, since the filldir() 2246 struct dentry *error = ERR_PTR(-ENOENT);
1949 * routine might require mmap_sem to be taken in might_fault().
1950 */
1951 2247
1952 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { 2248 inode = proc_pid_make_inode(dir->i_sb, task);
1953 if (vma->vm_file && ++pos > filp->f_pos) 2249 if (!inode)
1954 nr_files++; 2250 goto out;
1955 } 2251 ei = PROC_I(inode);
2252 ei->fd = fd;
2253 inode->i_mode = S_IFREG | S_IRUSR;
2254 inode->i_fop = &proc_fdinfo_file_operations;
2255 d_set_d_op(dentry, &tid_fd_dentry_operations);
2256 d_add(dentry, inode);
2257 /* Close the race of the process dying before we return the dentry */
2258 if (tid_fd_revalidate(dentry, NULL))
2259 error = NULL;
1956 2260
1957 if (nr_files) { 2261 out:
1958 fa = flex_array_alloc(sizeof(info), nr_files, 2262 return error;
1959 GFP_KERNEL); 2263}
1960 if (!fa || flex_array_prealloc(fa, 0, nr_files,
1961 GFP_KERNEL)) {
1962 ret = -ENOMEM;
1963 if (fa)
1964 flex_array_free(fa);
1965 up_read(&mm->mmap_sem);
1966 mmput(mm);
1967 goto out_put_task;
1968 }
1969 for (i = 0, vma = mm->mmap, pos = 2; vma;
1970 vma = vma->vm_next) {
1971 if (!vma->vm_file)
1972 continue;
1973 if (++pos <= filp->f_pos)
1974 continue;
1975 2264
1976 info.mode = vma->vm_file->f_mode; 2265static struct dentry *proc_lookupfdinfo(struct inode *dir,
1977 info.len = snprintf(info.name, 2266 struct dentry *dentry,
1978 sizeof(info.name), "%lx-%lx", 2267 struct nameidata *nd)
1979 vma->vm_start, vma->vm_end); 2268{
1980 if (flex_array_put(fa, i++, &info, GFP_KERNEL)) 2269 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
1981 BUG(); 2270}
1982 }
1983 }
1984 up_read(&mm->mmap_sem);
1985
1986 for (i = 0; i < nr_files; i++) {
1987 p = flex_array_get(fa, i);
1988 ret = proc_fill_cache(filp, dirent, filldir,
1989 p->name, p->len,
1990 proc_map_files_instantiate,
1991 task,
1992 (void *)(unsigned long)p->mode);
1993 if (ret)
1994 break;
1995 filp->f_pos++;
1996 }
1997 if (fa)
1998 flex_array_free(fa);
1999 mmput(mm);
2000 }
2001 }
2002 2271
2003out_put_task: 2272static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
2004 put_task_struct(task); 2273{
2005out: 2274 return proc_readfd_common(filp, dirent, filldir,
2006 return ret; 2275 proc_fdinfo_instantiate);
2007} 2276}
2008 2277
2009static const struct file_operations proc_map_files_operations = { 2278static const struct file_operations proc_fdinfo_operations = {
2010 .read = generic_read_dir, 2279 .read = generic_read_dir,
2011 .readdir = proc_map_files_readdir, 2280 .readdir = proc_readfdinfo,
2012 .llseek = default_llseek, 2281 .llseek = default_llseek,
2013}; 2282};
2014 2283
2015#endif /* CONFIG_CHECKPOINT_RESTORE */ 2284/*
2285 * proc directories can do almost nothing..
2286 */
2287static const struct inode_operations proc_fdinfo_inode_operations = {
2288 .lookup = proc_lookupfdinfo,
2289 .setattr = proc_setattr,
2290};
2291
2016 2292
2017static struct dentry *proc_pident_instantiate(struct inode *dir, 2293static struct dentry *proc_pident_instantiate(struct inode *dir,
2018 struct dentry *dentry, struct task_struct *task, const void *ptr) 2294 struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2029,7 +2305,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2029 ei = PROC_I(inode); 2305 ei = PROC_I(inode);
2030 inode->i_mode = p->mode; 2306 inode->i_mode = p->mode;
2031 if (S_ISDIR(inode->i_mode)) 2307 if (S_ISDIR(inode->i_mode))
2032 set_nlink(inode, 2); /* Use getattr to fix if necessary */ 2308 inode->i_nlink = 2; /* Use getattr to fix if necessary */
2033 if (p->iop) 2309 if (p->iop)
2034 inode->i_op = p->iop; 2310 inode->i_op = p->iop;
2035 if (p->fop) 2311 if (p->fop)
@@ -2038,7 +2314,7 @@ static struct dentry *proc_pident_instantiate(struct inode *dir,
2038 d_set_d_op(dentry, &pid_dentry_operations); 2314 d_set_d_op(dentry, &pid_dentry_operations);
2039 d_add(dentry, inode); 2315 d_add(dentry, inode);
2040 /* Close the race of the process dying before we return the dentry */ 2316 /* Close the race of the process dying before we return the dentry */
2041 if (pid_revalidate(dentry, 0)) 2317 if (pid_revalidate(dentry, NULL))
2042 error = NULL; 2318 error = NULL;
2043out: 2319out:
2044 return error; 2320 return error;
@@ -2238,7 +2514,7 @@ static const struct file_operations proc_attr_dir_operations = {
2238}; 2514};
2239 2515
2240static struct dentry *proc_attr_dir_lookup(struct inode *dir, 2516static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2241 struct dentry *dentry, unsigned int flags) 2517 struct dentry *dentry, struct nameidata *nd)
2242{ 2518{
2243 return proc_pident_lookup(dir, dentry, 2519 return proc_pident_lookup(dir, dentry,
2244 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); 2520 attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
@@ -2338,6 +2614,145 @@ static const struct file_operations proc_coredump_filter_operations = {
2338}; 2614};
2339#endif 2615#endif
2340 2616
2617/*
2618 * /proc/self:
2619 */
2620static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2621 int buflen)
2622{
2623 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2624 pid_t tgid = task_tgid_nr_ns(current, ns);
2625 char tmp[PROC_NUMBUF];
2626 if (!tgid)
2627 return -ENOENT;
2628 sprintf(tmp, "%d", tgid);
2629 return vfs_readlink(dentry,buffer,buflen,tmp);
2630}
2631
2632static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2633{
2634 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2635 pid_t tgid = task_tgid_nr_ns(current, ns);
2636 char *name = ERR_PTR(-ENOENT);
2637 if (tgid) {
2638 name = __getname();
2639 if (!name)
2640 name = ERR_PTR(-ENOMEM);
2641 else
2642 sprintf(name, "%d", tgid);
2643 }
2644 nd_set_link(nd, name);
2645 return NULL;
2646}
2647
2648static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
2649 void *cookie)
2650{
2651 char *s = nd_get_link(nd);
2652 if (!IS_ERR(s))
2653 __putname(s);
2654}
2655
2656static const struct inode_operations proc_self_inode_operations = {
2657 .readlink = proc_self_readlink,
2658 .follow_link = proc_self_follow_link,
2659 .put_link = proc_self_put_link,
2660};
2661
2662/*
2663 * proc base
2664 *
2665 * These are the directory entries in the root directory of /proc
2666 * that properly belong to the /proc filesystem, as they describe
2667 * something that is process related.
2668 */
2669static const struct pid_entry proc_base_stuff[] = {
2670 NOD("self", S_IFLNK|S_IRWXUGO,
2671 &proc_self_inode_operations, NULL, {}),
2672};
2673
2674static struct dentry *proc_base_instantiate(struct inode *dir,
2675 struct dentry *dentry, struct task_struct *task, const void *ptr)
2676{
2677 const struct pid_entry *p = ptr;
2678 struct inode *inode;
2679 struct proc_inode *ei;
2680 struct dentry *error;
2681
2682 /* Allocate the inode */
2683 error = ERR_PTR(-ENOMEM);
2684 inode = new_inode(dir->i_sb);
2685 if (!inode)
2686 goto out;
2687
2688 /* Initialize the inode */
2689 ei = PROC_I(inode);
2690 inode->i_ino = get_next_ino();
2691 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2692
2693 /*
2694 * grab the reference to the task.
2695 */
2696 ei->pid = get_task_pid(task, PIDTYPE_PID);
2697 if (!ei->pid)
2698 goto out_iput;
2699
2700 inode->i_mode = p->mode;
2701 if (S_ISDIR(inode->i_mode))
2702 inode->i_nlink = 2;
2703 if (S_ISLNK(inode->i_mode))
2704 inode->i_size = 64;
2705 if (p->iop)
2706 inode->i_op = p->iop;
2707 if (p->fop)
2708 inode->i_fop = p->fop;
2709 ei->op = p->op;
2710 d_add(dentry, inode);
2711 error = NULL;
2712out:
2713 return error;
2714out_iput:
2715 iput(inode);
2716 goto out;
2717}
2718
2719static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2720{
2721 struct dentry *error;
2722 struct task_struct *task = get_proc_task(dir);
2723 const struct pid_entry *p, *last;
2724
2725 error = ERR_PTR(-ENOENT);
2726
2727 if (!task)
2728 goto out_no_task;
2729
2730 /* Lookup the directory entry */
2731 last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2732 for (p = proc_base_stuff; p <= last; p++) {
2733 if (p->len != dentry->d_name.len)
2734 continue;
2735 if (!memcmp(dentry->d_name.name, p->name, p->len))
2736 break;
2737 }
2738 if (p > last)
2739 goto out;
2740
2741 error = proc_base_instantiate(dir, dentry, task, p);
2742
2743out:
2744 put_task_struct(task);
2745out_no_task:
2746 return error;
2747}
2748
2749static int proc_base_fill_cache(struct file *filp, void *dirent,
2750 filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2751{
2752 return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2753 proc_base_instantiate, task, p);
2754}
2755
2341#ifdef CONFIG_TASK_IO_ACCOUNTING 2756#ifdef CONFIG_TASK_IO_ACCOUNTING
2342static int do_io_accounting(struct task_struct *task, char *buffer, int whole) 2757static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2343{ 2758{
@@ -2394,87 +2809,6 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
2394} 2809}
2395#endif /* CONFIG_TASK_IO_ACCOUNTING */ 2810#endif /* CONFIG_TASK_IO_ACCOUNTING */
2396 2811
2397#ifdef CONFIG_USER_NS
2398static int proc_id_map_open(struct inode *inode, struct file *file,
2399 struct seq_operations *seq_ops)
2400{
2401 struct user_namespace *ns = NULL;
2402 struct task_struct *task;
2403 struct seq_file *seq;
2404 int ret = -EINVAL;
2405
2406 task = get_proc_task(inode);
2407 if (task) {
2408 rcu_read_lock();
2409 ns = get_user_ns(task_cred_xxx(task, user_ns));
2410 rcu_read_unlock();
2411 put_task_struct(task);
2412 }
2413 if (!ns)
2414 goto err;
2415
2416 ret = seq_open(file, seq_ops);
2417 if (ret)
2418 goto err_put_ns;
2419
2420 seq = file->private_data;
2421 seq->private = ns;
2422
2423 return 0;
2424err_put_ns:
2425 put_user_ns(ns);
2426err:
2427 return ret;
2428}
2429
2430static int proc_id_map_release(struct inode *inode, struct file *file)
2431{
2432 struct seq_file *seq = file->private_data;
2433 struct user_namespace *ns = seq->private;
2434 put_user_ns(ns);
2435 return seq_release(inode, file);
2436}
2437
2438static int proc_uid_map_open(struct inode *inode, struct file *file)
2439{
2440 return proc_id_map_open(inode, file, &proc_uid_seq_operations);
2441}
2442
2443static int proc_gid_map_open(struct inode *inode, struct file *file)
2444{
2445 return proc_id_map_open(inode, file, &proc_gid_seq_operations);
2446}
2447
2448static int proc_projid_map_open(struct inode *inode, struct file *file)
2449{
2450 return proc_id_map_open(inode, file, &proc_projid_seq_operations);
2451}
2452
2453static const struct file_operations proc_uid_map_operations = {
2454 .open = proc_uid_map_open,
2455 .write = proc_uid_map_write,
2456 .read = seq_read,
2457 .llseek = seq_lseek,
2458 .release = proc_id_map_release,
2459};
2460
2461static const struct file_operations proc_gid_map_operations = {
2462 .open = proc_gid_map_open,
2463 .write = proc_gid_map_write,
2464 .read = seq_read,
2465 .llseek = seq_lseek,
2466 .release = proc_id_map_release,
2467};
2468
2469static const struct file_operations proc_projid_map_operations = {
2470 .open = proc_projid_map_open,
2471 .write = proc_projid_map_write,
2472 .read = seq_read,
2473 .llseek = seq_lseek,
2474 .release = proc_id_map_release,
2475};
2476#endif /* CONFIG_USER_NS */
2477
2478static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, 2812static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2479 struct pid *pid, struct task_struct *task) 2813 struct pid *pid, struct task_struct *task)
2480{ 2814{
@@ -2495,9 +2829,6 @@ static const struct inode_operations proc_task_inode_operations;
2495static const struct pid_entry tgid_base_stuff[] = { 2829static const struct pid_entry tgid_base_stuff[] = {
2496 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 2830 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2497 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2831 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2498#ifdef CONFIG_CHECKPOINT_RESTORE
2499 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
2500#endif
2501 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2832 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2502 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 2833 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2503#ifdef CONFIG_NET 2834#ifdef CONFIG_NET
@@ -2521,9 +2852,9 @@ static const struct pid_entry tgid_base_stuff[] = {
2521 INF("cmdline", S_IRUGO, proc_pid_cmdline), 2852 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2522 ONE("stat", S_IRUGO, proc_tgid_stat), 2853 ONE("stat", S_IRUGO, proc_tgid_stat),
2523 ONE("statm", S_IRUGO, proc_pid_statm), 2854 ONE("statm", S_IRUGO, proc_pid_statm),
2524 REG("maps", S_IRUGO, proc_pid_maps_operations), 2855 REG("maps", S_IRUGO, proc_maps_operations),
2525#ifdef CONFIG_NUMA 2856#ifdef CONFIG_NUMA
2526 REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), 2857 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2527#endif 2858#endif
2528 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), 2859 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2529 LNK("cwd", proc_cwd_link), 2860 LNK("cwd", proc_cwd_link),
@@ -2534,7 +2865,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2534 REG("mountstats", S_IRUSR, proc_mountstats_operations), 2865 REG("mountstats", S_IRUSR, proc_mountstats_operations),
2535#ifdef CONFIG_PROC_PAGE_MONITOR 2866#ifdef CONFIG_PROC_PAGE_MONITOR
2536 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 2867 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2537 REG("smaps", S_IRUGO, proc_pid_smaps_operations), 2868 REG("smaps", S_IRUGO, proc_smaps_operations),
2538 REG("pagemap", S_IRUGO, proc_pagemap_operations), 2869 REG("pagemap", S_IRUGO, proc_pagemap_operations),
2539#endif 2870#endif
2540#ifdef CONFIG_SECURITY 2871#ifdef CONFIG_SECURITY
@@ -2559,7 +2890,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2559 REG("cgroup", S_IRUGO, proc_cgroup_operations), 2890 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2560#endif 2891#endif
2561 INF("oom_score", S_IRUGO, proc_oom_score), 2892 INF("oom_score", S_IRUGO, proc_oom_score),
2562 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 2893 ANDROID("oom_adj",S_IRUGO|S_IWUSR, oom_adjust),
2563 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 2894 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2564#ifdef CONFIG_AUDITSYSCALL 2895#ifdef CONFIG_AUDITSYSCALL
2565 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 2896 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2577,11 +2908,6 @@ static const struct pid_entry tgid_base_stuff[] = {
2577#ifdef CONFIG_HARDWALL 2908#ifdef CONFIG_HARDWALL
2578 INF("hardwall", S_IRUGO, proc_pid_hardwall), 2909 INF("hardwall", S_IRUGO, proc_pid_hardwall),
2579#endif 2910#endif
2580#ifdef CONFIG_USER_NS
2581 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
2582 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
2583 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
2584#endif
2585}; 2911};
2586 2912
2587static int proc_tgid_base_readdir(struct file * filp, 2913static int proc_tgid_base_readdir(struct file * filp,
@@ -2597,8 +2923,7 @@ static const struct file_operations proc_tgid_base_operations = {
2597 .llseek = default_llseek, 2923 .llseek = default_llseek,
2598}; 2924};
2599 2925
2600static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2926static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2601{
2602 return proc_pident_lookup(dir, dentry, 2927 return proc_pident_lookup(dir, dentry,
2603 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); 2928 tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2604} 2929}
@@ -2607,7 +2932,6 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
2607 .lookup = proc_tgid_base_lookup, 2932 .lookup = proc_tgid_base_lookup,
2608 .getattr = pid_getattr, 2933 .getattr = pid_getattr,
2609 .setattr = proc_setattr, 2934 .setattr = proc_setattr,
2610 .permission = proc_pid_permission,
2611}; 2935};
2612 2936
2613static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) 2937static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
@@ -2692,6 +3016,10 @@ void proc_flush_task(struct task_struct *task)
2692 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, 3016 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2693 tgid->numbers[i].nr); 3017 tgid->numbers[i].nr);
2694 } 3018 }
3019
3020 upid = &pid->numbers[pid->level];
3021 if (upid->nr == 1)
3022 pid_ns_release_proc(upid->ns);
2695} 3023}
2696 3024
2697static struct dentry *proc_pid_instantiate(struct inode *dir, 3025static struct dentry *proc_pid_instantiate(struct inode *dir,
@@ -2710,26 +3038,30 @@ static struct dentry *proc_pid_instantiate(struct inode *dir,
2710 inode->i_fop = &proc_tgid_base_operations; 3038 inode->i_fop = &proc_tgid_base_operations;
2711 inode->i_flags|=S_IMMUTABLE; 3039 inode->i_flags|=S_IMMUTABLE;
2712 3040
2713 set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, 3041 inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
2714 ARRAY_SIZE(tgid_base_stuff))); 3042 ARRAY_SIZE(tgid_base_stuff));
2715 3043
2716 d_set_d_op(dentry, &pid_dentry_operations); 3044 d_set_d_op(dentry, &pid_dentry_operations);
2717 3045
2718 d_add(dentry, inode); 3046 d_add(dentry, inode);
2719 /* Close the race of the process dying before we return the dentry */ 3047 /* Close the race of the process dying before we return the dentry */
2720 if (pid_revalidate(dentry, 0)) 3048 if (pid_revalidate(dentry, NULL))
2721 error = NULL; 3049 error = NULL;
2722out: 3050out:
2723 return error; 3051 return error;
2724} 3052}
2725 3053
2726struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3054struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2727{ 3055{
2728 struct dentry *result = NULL; 3056 struct dentry *result;
2729 struct task_struct *task; 3057 struct task_struct *task;
2730 unsigned tgid; 3058 unsigned tgid;
2731 struct pid_namespace *ns; 3059 struct pid_namespace *ns;
2732 3060
3061 result = proc_base_lookup(dir, dentry);
3062 if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
3063 goto out;
3064
2733 tgid = name_to_int(dentry); 3065 tgid = name_to_int(dentry);
2734 if (tgid == ~0U) 3066 if (tgid == ~0U)
2735 goto out; 3067 goto out;
@@ -2792,7 +3124,7 @@ retry:
2792 return iter; 3124 return iter;
2793} 3125}
2794 3126
2795#define TGID_OFFSET (FIRST_PROCESS_ENTRY) 3127#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
2796 3128
2797static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 3129static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2798 struct tgid_iter iter) 3130 struct tgid_iter iter)
@@ -2803,21 +3135,27 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi
2803 proc_pid_instantiate, iter.task, NULL); 3135 proc_pid_instantiate, iter.task, NULL);
2804} 3136}
2805 3137
2806static int fake_filldir(void *buf, const char *name, int namelen,
2807 loff_t offset, u64 ino, unsigned d_type)
2808{
2809 return 0;
2810}
2811
2812/* for the /proc/ directory itself, after non-process stuff has been done */ 3138/* for the /proc/ directory itself, after non-process stuff has been done */
2813int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) 3139int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2814{ 3140{
3141 unsigned int nr;
3142 struct task_struct *reaper;
2815 struct tgid_iter iter; 3143 struct tgid_iter iter;
2816 struct pid_namespace *ns; 3144 struct pid_namespace *ns;
2817 filldir_t __filldir;
2818 3145
2819 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) 3146 if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
2820 goto out; 3147 goto out_no_task;
3148 nr = filp->f_pos - FIRST_PROCESS_ENTRY;
3149
3150 reaper = get_proc_task(filp->f_path.dentry->d_inode);
3151 if (!reaper)
3152 goto out_no_task;
3153
3154 for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
3155 const struct pid_entry *p = &proc_base_stuff[nr];
3156 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
3157 goto out;
3158 }
2821 3159
2822 ns = filp->f_dentry->d_sb->s_fs_info; 3160 ns = filp->f_dentry->d_sb->s_fs_info;
2823 iter.task = NULL; 3161 iter.task = NULL;
@@ -2825,19 +3163,16 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2825 for (iter = next_tgid(ns, iter); 3163 for (iter = next_tgid(ns, iter);
2826 iter.task; 3164 iter.task;
2827 iter.tgid += 1, iter = next_tgid(ns, iter)) { 3165 iter.tgid += 1, iter = next_tgid(ns, iter)) {
2828 if (has_pid_permissions(ns, iter.task, 2))
2829 __filldir = filldir;
2830 else
2831 __filldir = fake_filldir;
2832
2833 filp->f_pos = iter.tgid + TGID_OFFSET; 3166 filp->f_pos = iter.tgid + TGID_OFFSET;
2834 if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { 3167 if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
2835 put_task_struct(iter.task); 3168 put_task_struct(iter.task);
2836 goto out; 3169 goto out;
2837 } 3170 }
2838 } 3171 }
2839 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; 3172 filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
2840out: 3173out:
3174 put_task_struct(reaper);
3175out_no_task:
2841 return 0; 3176 return 0;
2842} 3177}
2843 3178
@@ -2863,12 +3198,9 @@ static const struct pid_entry tid_base_stuff[] = {
2863 INF("cmdline", S_IRUGO, proc_pid_cmdline), 3198 INF("cmdline", S_IRUGO, proc_pid_cmdline),
2864 ONE("stat", S_IRUGO, proc_tid_stat), 3199 ONE("stat", S_IRUGO, proc_tid_stat),
2865 ONE("statm", S_IRUGO, proc_pid_statm), 3200 ONE("statm", S_IRUGO, proc_pid_statm),
2866 REG("maps", S_IRUGO, proc_tid_maps_operations), 3201 REG("maps", S_IRUGO, proc_maps_operations),
2867#ifdef CONFIG_CHECKPOINT_RESTORE
2868 REG("children", S_IRUGO, proc_tid_children_operations),
2869#endif
2870#ifdef CONFIG_NUMA 3202#ifdef CONFIG_NUMA
2871 REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), 3203 REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2872#endif 3204#endif
2873 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), 3205 REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations),
2874 LNK("cwd", proc_cwd_link), 3206 LNK("cwd", proc_cwd_link),
@@ -2878,7 +3210,7 @@ static const struct pid_entry tid_base_stuff[] = {
2878 REG("mountinfo", S_IRUGO, proc_mountinfo_operations), 3210 REG("mountinfo", S_IRUGO, proc_mountinfo_operations),
2879#ifdef CONFIG_PROC_PAGE_MONITOR 3211#ifdef CONFIG_PROC_PAGE_MONITOR
2880 REG("clear_refs", S_IWUSR, proc_clear_refs_operations), 3212 REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2881 REG("smaps", S_IRUGO, proc_tid_smaps_operations), 3213 REG("smaps", S_IRUGO, proc_smaps_operations),
2882 REG("pagemap", S_IRUGO, proc_pagemap_operations), 3214 REG("pagemap", S_IRUGO, proc_pagemap_operations),
2883#endif 3215#endif
2884#ifdef CONFIG_SECURITY 3216#ifdef CONFIG_SECURITY
@@ -2903,7 +3235,7 @@ static const struct pid_entry tid_base_stuff[] = {
2903 REG("cgroup", S_IRUGO, proc_cgroup_operations), 3235 REG("cgroup", S_IRUGO, proc_cgroup_operations),
2904#endif 3236#endif
2905 INF("oom_score", S_IRUGO, proc_oom_score), 3237 INF("oom_score", S_IRUGO, proc_oom_score),
2906 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), 3238 REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2907 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), 3239 REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
2908#ifdef CONFIG_AUDITSYSCALL 3240#ifdef CONFIG_AUDITSYSCALL
2909 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), 3241 REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2918,11 +3250,6 @@ static const struct pid_entry tid_base_stuff[] = {
2918#ifdef CONFIG_HARDWALL 3250#ifdef CONFIG_HARDWALL
2919 INF("hardwall", S_IRUGO, proc_pid_hardwall), 3251 INF("hardwall", S_IRUGO, proc_pid_hardwall),
2920#endif 3252#endif
2921#ifdef CONFIG_USER_NS
2922 REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations),
2923 REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations),
2924 REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations),
2925#endif
2926}; 3253};
2927 3254
2928static int proc_tid_base_readdir(struct file * filp, 3255static int proc_tid_base_readdir(struct file * filp,
@@ -2932,8 +3259,7 @@ static int proc_tid_base_readdir(struct file * filp,
2932 tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); 3259 tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
2933} 3260}
2934 3261
2935static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 3262static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2936{
2937 return proc_pident_lookup(dir, dentry, 3263 return proc_pident_lookup(dir, dentry,
2938 tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); 3264 tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2939} 3265}
@@ -2964,20 +3290,20 @@ static struct dentry *proc_task_instantiate(struct inode *dir,
2964 inode->i_fop = &proc_tid_base_operations; 3290 inode->i_fop = &proc_tid_base_operations;
2965 inode->i_flags|=S_IMMUTABLE; 3291 inode->i_flags|=S_IMMUTABLE;
2966 3292
2967 set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, 3293 inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
2968 ARRAY_SIZE(tid_base_stuff))); 3294 ARRAY_SIZE(tid_base_stuff));
2969 3295
2970 d_set_d_op(dentry, &pid_dentry_operations); 3296 d_set_d_op(dentry, &pid_dentry_operations);
2971 3297
2972 d_add(dentry, inode); 3298 d_add(dentry, inode);
2973 /* Close the race of the process dying before we return the dentry */ 3299 /* Close the race of the process dying before we return the dentry */
2974 if (pid_revalidate(dentry, 0)) 3300 if (pid_revalidate(dentry, NULL))
2975 error = NULL; 3301 error = NULL;
2976out: 3302out:
2977 return error; 3303 return error;
2978} 3304}
2979 3305
2980static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) 3306static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2981{ 3307{
2982 struct dentry *result = ERR_PTR(-ENOENT); 3308 struct dentry *result = ERR_PTR(-ENOENT);
2983 struct task_struct *task; 3309 struct task_struct *task;
@@ -3173,7 +3499,6 @@ static const struct inode_operations proc_task_inode_operations = {
3173 .lookup = proc_task_lookup, 3499 .lookup = proc_task_lookup,
3174 .getattr = proc_task_getattr, 3500 .getattr = proc_task_getattr,
3175 .setattr = proc_setattr, 3501 .setattr = proc_setattr,
3176 .permission = proc_pid_permission,
3177}; 3502};
3178 3503
3179static const struct file_operations proc_task_operations = { 3504static const struct file_operations proc_task_operations = {
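Note: the proc_fd_info() helper restored into base.c above emits each descriptor's offset and flags as a two-line "pos:/flags:" record. The following is a minimal userspace sketch, not part of the patch, that reads such a record back; the file opened ("/etc/hostname") and buffer sizes are illustrative assumptions only.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64], buf[256];
	ssize_t n;
	int fd, info;

	fd = open("/etc/hostname", O_RDONLY);	/* any open descriptor works */
	if (fd < 0)
		return 1;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd);
	info = open(path, O_RDONLY);
	if (info < 0)
		return 1;

	n = read(info, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);	/* e.g. "pos:\t0\nflags:\t0100000\n" */
	}
	close(info);
	close(fd);
	return 0;
}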
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
deleted file mode 100644
index d7a4a28ef63..00000000000
--- a/fs/proc/fd.c
+++ /dev/null
@@ -1,369 +0,0 @@
1#include <linux/sched.h>
2#include <linux/errno.h>
3#include <linux/dcache.h>
4#include <linux/path.h>
5#include <linux/fdtable.h>
6#include <linux/namei.h>
7#include <linux/pid.h>
8#include <linux/security.h>
9#include <linux/file.h>
10#include <linux/seq_file.h>
11
12#include <linux/proc_fs.h>
13
14#include "internal.h"
15#include "fd.h"
16
17static int seq_show(struct seq_file *m, void *v)
18{
19 struct files_struct *files = NULL;
20 int f_flags = 0, ret = -ENOENT;
21 struct file *file = NULL;
22 struct task_struct *task;
23
24 task = get_proc_task(m->private);
25 if (!task)
26 return -ENOENT;
27
28 files = get_files_struct(task);
29 put_task_struct(task);
30
31 if (files) {
32 int fd = proc_fd(m->private);
33
34 spin_lock(&files->file_lock);
35 file = fcheck_files(files, fd);
36 if (file) {
37 struct fdtable *fdt = files_fdtable(files);
38
39 f_flags = file->f_flags;
40 if (close_on_exec(fd, fdt))
41 f_flags |= O_CLOEXEC;
42
43 get_file(file);
44 ret = 0;
45 }
46 spin_unlock(&files->file_lock);
47 put_files_struct(files);
48 }
49
50 if (!ret) {
51 seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
52 (long long)file->f_pos, f_flags);
53 if (file->f_op->show_fdinfo)
54 ret = file->f_op->show_fdinfo(m, file);
55 fput(file);
56 }
57
58 return ret;
59}
60
61static int seq_fdinfo_open(struct inode *inode, struct file *file)
62{
63 return single_open(file, seq_show, inode);
64}
65
66static const struct file_operations proc_fdinfo_file_operations = {
67 .open = seq_fdinfo_open,
68 .read = seq_read,
69 .llseek = seq_lseek,
70 .release = single_release,
71};
72
73static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
74{
75 struct files_struct *files;
76 struct task_struct *task;
77 const struct cred *cred;
78 struct inode *inode;
79 int fd;
80
81 if (flags & LOOKUP_RCU)
82 return -ECHILD;
83
84 inode = dentry->d_inode;
85 task = get_proc_task(inode);
86 fd = proc_fd(inode);
87
88 if (task) {
89 files = get_files_struct(task);
90 if (files) {
91 struct file *file;
92
93 rcu_read_lock();
94 file = fcheck_files(files, fd);
95 if (file) {
96 unsigned f_mode = file->f_mode;
97
98 rcu_read_unlock();
99 put_files_struct(files);
100
101 if (task_dumpable(task)) {
102 rcu_read_lock();
103 cred = __task_cred(task);
104 inode->i_uid = cred->euid;
105 inode->i_gid = cred->egid;
106 rcu_read_unlock();
107 } else {
108 inode->i_uid = GLOBAL_ROOT_UID;
109 inode->i_gid = GLOBAL_ROOT_GID;
110 }
111
112 if (S_ISLNK(inode->i_mode)) {
113 unsigned i_mode = S_IFLNK;
114 if (f_mode & FMODE_READ)
115 i_mode |= S_IRUSR | S_IXUSR;
116 if (f_mode & FMODE_WRITE)
117 i_mode |= S_IWUSR | S_IXUSR;
118 inode->i_mode = i_mode;
119 }
120
121 security_task_to_inode(task, inode);
122 put_task_struct(task);
123 return 1;
124 }
125 rcu_read_unlock();
126 put_files_struct(files);
127 }
128 put_task_struct(task);
129 }
130
131 d_drop(dentry);
132 return 0;
133}
134
135static const struct dentry_operations tid_fd_dentry_operations = {
136 .d_revalidate = tid_fd_revalidate,
137 .d_delete = pid_delete_dentry,
138};
139
140static int proc_fd_link(struct dentry *dentry, struct path *path)
141{
142 struct files_struct *files = NULL;
143 struct task_struct *task;
144 int ret = -ENOENT;
145
146 task = get_proc_task(dentry->d_inode);
147 if (task) {
148 files = get_files_struct(task);
149 put_task_struct(task);
150 }
151
152 if (files) {
153 int fd = proc_fd(dentry->d_inode);
154 struct file *fd_file;
155
156 spin_lock(&files->file_lock);
157 fd_file = fcheck_files(files, fd);
158 if (fd_file) {
159 *path = fd_file->f_path;
160 path_get(&fd_file->f_path);
161 ret = 0;
162 }
163 spin_unlock(&files->file_lock);
164 put_files_struct(files);
165 }
166
167 return ret;
168}
169
170static struct dentry *
171proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
172 struct task_struct *task, const void *ptr)
173{
174 struct dentry *error = ERR_PTR(-ENOENT);
175 unsigned fd = (unsigned long)ptr;
176 struct proc_inode *ei;
177 struct inode *inode;
178
179 inode = proc_pid_make_inode(dir->i_sb, task);
180 if (!inode)
181 goto out;
182
183 ei = PROC_I(inode);
184 ei->fd = fd;
185
186 inode->i_mode = S_IFLNK;
187 inode->i_op = &proc_pid_link_inode_operations;
188 inode->i_size = 64;
189
190 ei->op.proc_get_link = proc_fd_link;
191
192 d_set_d_op(dentry, &tid_fd_dentry_operations);
193 d_add(dentry, inode);
194
195 /* Close the race of the process dying before we return the dentry */
196 if (tid_fd_revalidate(dentry, 0))
197 error = NULL;
198 out:
199 return error;
200}
201
202static struct dentry *proc_lookupfd_common(struct inode *dir,
203 struct dentry *dentry,
204 instantiate_t instantiate)
205{
206 struct task_struct *task = get_proc_task(dir);
207 struct dentry *result = ERR_PTR(-ENOENT);
208 unsigned fd = name_to_int(dentry);
209
210 if (!task)
211 goto out_no_task;
212 if (fd == ~0U)
213 goto out;
214
215 result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
216out:
217 put_task_struct(task);
218out_no_task:
219 return result;
220}
221
222static int proc_readfd_common(struct file * filp, void * dirent,
223 filldir_t filldir, instantiate_t instantiate)
224{
225 struct dentry *dentry = filp->f_path.dentry;
226 struct inode *inode = dentry->d_inode;
227 struct task_struct *p = get_proc_task(inode);
228 struct files_struct *files;
229 unsigned int fd, ino;
230 int retval;
231
232 retval = -ENOENT;
233 if (!p)
234 goto out_no_task;
235 retval = 0;
236
237 fd = filp->f_pos;
238 switch (fd) {
239 case 0:
240 if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
241 goto out;
242 filp->f_pos++;
243 case 1:
244 ino = parent_ino(dentry);
245 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
246 goto out;
247 filp->f_pos++;
248 default:
249 files = get_files_struct(p);
250 if (!files)
251 goto out;
252 rcu_read_lock();
253 for (fd = filp->f_pos - 2;
254 fd < files_fdtable(files)->max_fds;
255 fd++, filp->f_pos++) {
256 char name[PROC_NUMBUF];
257 int len;
258 int rv;
259
260 if (!fcheck_files(files, fd))
261 continue;
262 rcu_read_unlock();
263
264 len = snprintf(name, sizeof(name), "%d", fd);
265 rv = proc_fill_cache(filp, dirent, filldir,
266 name, len, instantiate, p,
267 (void *)(unsigned long)fd);
268 if (rv < 0)
269 goto out_fd_loop;
270 rcu_read_lock();
271 }
272 rcu_read_unlock();
273out_fd_loop:
274 put_files_struct(files);
275 }
276out:
277 put_task_struct(p);
278out_no_task:
279 return retval;
280}
281
282static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
283{
284 return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
285}
286
287const struct file_operations proc_fd_operations = {
288 .read = generic_read_dir,
289 .readdir = proc_readfd,
290 .llseek = default_llseek,
291};
292
293static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
294 unsigned int flags)
295{
296 return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
297}
298
299/*
300 * /proc/pid/fd needs a special permission handler so that a process can still
301 * access /proc/self/fd after it has executed a setuid().
302 */
303int proc_fd_permission(struct inode *inode, int mask)
304{
305 int rv = generic_permission(inode, mask);
306 if (rv == 0)
307 return 0;
308 if (task_pid(current) == proc_pid(inode))
309 rv = 0;
310 return rv;
311}
312
313const struct inode_operations proc_fd_inode_operations = {
314 .lookup = proc_lookupfd,
315 .permission = proc_fd_permission,
316 .setattr = proc_setattr,
317};
318
319static struct dentry *
320proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
321 struct task_struct *task, const void *ptr)
322{
323 struct dentry *error = ERR_PTR(-ENOENT);
324 unsigned fd = (unsigned long)ptr;
325 struct proc_inode *ei;
326 struct inode *inode;
327
328 inode = proc_pid_make_inode(dir->i_sb, task);
329 if (!inode)
330 goto out;
331
332 ei = PROC_I(inode);
333 ei->fd = fd;
334
335 inode->i_mode = S_IFREG | S_IRUSR;
336 inode->i_fop = &proc_fdinfo_file_operations;
337
338 d_set_d_op(dentry, &tid_fd_dentry_operations);
339 d_add(dentry, inode);
340
341 /* Close the race of the process dying before we return the dentry */
342 if (tid_fd_revalidate(dentry, 0))
343 error = NULL;
344 out:
345 return error;
346}
347
348static struct dentry *
349proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
350{
351 return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
352}
353
354static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
355{
356 return proc_readfd_common(filp, dirent, filldir,
357 proc_fdinfo_instantiate);
358}
359
360const struct inode_operations proc_fdinfo_inode_operations = {
361 .lookup = proc_lookupfdinfo,
362 .setattr = proc_setattr,
363};
364
365const struct file_operations proc_fdinfo_operations = {
366 .read = generic_read_dir,
367 .readdir = proc_readfdinfo,
368 .llseek = default_llseek,
369};
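The deleted fs/proc/fd.c kept /proc/&lt;pid&gt;/fd/&lt;n&gt; entries as symlinks whose target is the path published by proc_fd_link(). A small userspace sketch of resolving one of those links follows; the descriptor chosen and the example output are assumptions for illustration, not part of the patch.

#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[64], target[PATH_MAX];
	ssize_t n;

	snprintf(path, sizeof(path), "/proc/self/fd/%d", STDOUT_FILENO);
	n = readlink(path, target, sizeof(target) - 1);	/* fd entries are symlinks */
	if (n < 0)
		return 1;
	target[n] = '\0';
	printf("fd %d -> %s\n", STDOUT_FILENO, target);	/* e.g. /dev/pts/0 */
	return 0;
}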
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
deleted file mode 100644
index cbb1d47deda..00000000000
--- a/fs/proc/fd.h
+++ /dev/null
@@ -1,14 +0,0 @@
1#ifndef __PROCFS_FD_H__
2#define __PROCFS_FD_H__
3
4#include <linux/fs.h>
5
6extern const struct file_operations proc_fd_operations;
7extern const struct inode_operations proc_fd_inode_operations;
8
9extern const struct file_operations proc_fdinfo_operations;
10extern const struct inode_operations proc_fdinfo_inode_operations;
11
12extern int proc_fd_permission(struct inode *inode, int mask);
13
14#endif /* __PROCFS_FD_H__ */
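Both fd lookup paths declared above accept only dentry names that name_to_int() parses as a canonical decimal number (no leading zeros, digits only, no overflow); anything else maps to ~0U and fails the lookup. A standalone sketch of that rule, deliberately renamed name_to_uint() to mark it as an illustration rather than the kernel function:

#include <stdio.h>

static unsigned name_to_uint(const char *name, int len)
{
	unsigned n = 0;

	if (len > 1 && *name == '0')
		return ~0U;			/* "00", "07", ... are rejected */
	while (len-- > 0) {
		unsigned c = (unsigned)(*name++ - '0');
		if (c > 9)
			return ~0U;		/* non-digit character */
		if (n >= (~0U - 9) / 10)
			return ~0U;		/* would overflow an unsigned */
		n = n * 10 + c;
	}
	return n;
}

int main(void)
{
	printf("%u %u %u\n", name_to_uint("42", 2),
	       name_to_uint("007", 3), name_to_uint("4x", 2));
	/* with 32-bit unsigned: 42 4294967295 4294967295 */
	return 0;
}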
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 76ddae83daa..9d99131d0d6 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -261,9 +261,16 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
261 if (error) 261 if (error)
262 return error; 262 return error;
263 263
264 if ((iattr->ia_valid & ATTR_SIZE) &&
265 iattr->ia_size != i_size_read(inode)) {
266 error = vmtruncate(inode, iattr->ia_size);
267 if (error)
268 return error;
269 }
270
264 setattr_copy(inode, iattr); 271 setattr_copy(inode, iattr);
265 mark_inode_dirty(inode); 272 mark_inode_dirty(inode);
266 273
267 de->uid = inode->i_uid; 274 de->uid = inode->i_uid;
268 de->gid = inode->i_gid; 275 de->gid = inode->i_gid;
269 de->mode = inode->i_mode; 276 de->mode = inode->i_mode;
@@ -276,7 +283,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
276 struct inode *inode = dentry->d_inode; 283 struct inode *inode = dentry->d_inode;
277 struct proc_dir_entry *de = PROC_I(inode)->pde; 284 struct proc_dir_entry *de = PROC_I(inode)->pde;
278 if (de && de->nlink) 285 if (de && de->nlink)
279 set_nlink(inode, de->nlink); 286 inode->i_nlink = de->nlink;
280 287
281 generic_fillattr(inode, stat); 288 generic_fillattr(inode, stat);
282 return 0; 289 return 0;
@@ -343,39 +350,37 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
343 * Return an inode number between PROC_DYNAMIC_FIRST and 350 * Return an inode number between PROC_DYNAMIC_FIRST and
344 * 0xffffffff, or zero on failure. 351 * 0xffffffff, or zero on failure.
345 */ 352 */
346int proc_alloc_inum(unsigned int *inum) 353static unsigned int get_inode_number(void)
347{ 354{
348 unsigned int i; 355 unsigned int i;
349 int error; 356 int error;
350 357
351retry: 358retry:
352 if (!ida_pre_get(&proc_inum_ida, GFP_KERNEL)) 359 if (ida_pre_get(&proc_inum_ida, GFP_KERNEL) == 0)
353 return -ENOMEM; 360 return 0;
354 361
355 spin_lock_irq(&proc_inum_lock); 362 spin_lock(&proc_inum_lock);
356 error = ida_get_new(&proc_inum_ida, &i); 363 error = ida_get_new(&proc_inum_ida, &i);
357 spin_unlock_irq(&proc_inum_lock); 364 spin_unlock(&proc_inum_lock);
358 if (error == -EAGAIN) 365 if (error == -EAGAIN)
359 goto retry; 366 goto retry;
360 else if (error) 367 else if (error)
361 return error; 368 return 0;
362 369
363 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) { 370 if (i > UINT_MAX - PROC_DYNAMIC_FIRST) {
364 spin_lock_irq(&proc_inum_lock); 371 spin_lock(&proc_inum_lock);
365 ida_remove(&proc_inum_ida, i); 372 ida_remove(&proc_inum_ida, i);
366 spin_unlock_irq(&proc_inum_lock); 373 spin_unlock(&proc_inum_lock);
367 return -ENOSPC; 374 return 0;
368 } 375 }
369 *inum = PROC_DYNAMIC_FIRST + i; 376 return PROC_DYNAMIC_FIRST + i;
370 return 0;
371} 377}
372 378
373void proc_free_inum(unsigned int inum) 379static void release_inode_number(unsigned int inum)
374{ 380{
375 unsigned long flags; 381 spin_lock(&proc_inum_lock);
376 spin_lock_irqsave(&proc_inum_lock, flags);
377 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST); 382 ida_remove(&proc_inum_ida, inum - PROC_DYNAMIC_FIRST);
378 spin_unlock_irqrestore(&proc_inum_lock, flags); 383 spin_unlock(&proc_inum_lock);
379} 384}
380 385
381static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) 386static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
@@ -422,7 +427,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
422 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { 427 if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
423 pde_get(de); 428 pde_get(de);
424 spin_unlock(&proc_subdir_lock); 429 spin_unlock(&proc_subdir_lock);
425 error = -ENOMEM; 430 error = -EINVAL;
426 inode = proc_get_inode(dir->i_sb, de); 431 inode = proc_get_inode(dir->i_sb, de);
427 goto out_unlock; 432 goto out_unlock;
428 } 433 }
@@ -441,7 +446,7 @@ out_unlock:
441} 446}
442 447
443struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry, 448struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
444 unsigned int flags) 449 struct nameidata *nd)
445{ 450{
446 return proc_lookup_de(PDE(dir), dir, dentry); 451 return proc_lookup_de(PDE(dir), dir, dentry);
447} 452}
@@ -549,12 +554,13 @@ static const struct inode_operations proc_dir_inode_operations = {
549 554
550static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp) 555static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
551{ 556{
557 unsigned int i;
552 struct proc_dir_entry *tmp; 558 struct proc_dir_entry *tmp;
553 int ret;
554 559
555 ret = proc_alloc_inum(&dp->low_ino); 560 i = get_inode_number();
556 if (ret) 561 if (i == 0)
557 return ret; 562 return -EAGAIN;
563 dp->low_ino = i;
558 564
559 if (S_ISDIR(dp->mode)) { 565 if (S_ISDIR(dp->mode)) {
560 if (dp->proc_iops == NULL) { 566 if (dp->proc_iops == NULL) {
@@ -591,7 +597,7 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
591 597
592static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, 598static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
593 const char *name, 599 const char *name,
594 umode_t mode, 600 mode_t mode,
595 nlink_t nlink) 601 nlink_t nlink)
596{ 602{
597 struct proc_dir_entry *ent = NULL; 603 struct proc_dir_entry *ent = NULL;
@@ -599,8 +605,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
599 unsigned int len; 605 unsigned int len;
600 606
601 /* make sure name is valid */ 607 /* make sure name is valid */
602 if (!name || !strlen(name)) 608 if (!name || !strlen(name)) goto out;
603 goto out;
604 609
605 if (xlate_proc_name(name, parent, &fn) != 0) 610 if (xlate_proc_name(name, parent, &fn) != 0)
606 goto out; 611 goto out;
@@ -611,18 +616,20 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent,
611 616
612 len = strlen(fn); 617 len = strlen(fn);
613 618
614 ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); 619 ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
615 if (!ent) 620 if (!ent) goto out;
616 goto out;
617 621
622 memset(ent, 0, sizeof(struct proc_dir_entry));
618 memcpy(ent->name, fn, len + 1); 623 memcpy(ent->name, fn, len + 1);
619 ent->namelen = len; 624 ent->namelen = len;
620 ent->mode = mode; 625 ent->mode = mode;
621 ent->nlink = nlink; 626 ent->nlink = nlink;
622 atomic_set(&ent->count, 1); 627 atomic_set(&ent->count, 1);
628 ent->pde_users = 0;
623 spin_lock_init(&ent->pde_unload_lock); 629 spin_lock_init(&ent->pde_unload_lock);
630 ent->pde_unload_completion = NULL;
624 INIT_LIST_HEAD(&ent->pde_openers); 631 INIT_LIST_HEAD(&ent->pde_openers);
625out: 632 out:
626 return ent; 633 return ent;
627} 634}
628 635
@@ -652,7 +659,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
652} 659}
653EXPORT_SYMBOL(proc_symlink); 660EXPORT_SYMBOL(proc_symlink);
654 661
655struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, 662struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
656 struct proc_dir_entry *parent) 663 struct proc_dir_entry *parent)
657{ 664{
658 struct proc_dir_entry *ent; 665 struct proc_dir_entry *ent;
@@ -692,7 +699,7 @@ struct proc_dir_entry *proc_mkdir(const char *name,
692} 699}
693EXPORT_SYMBOL(proc_mkdir); 700EXPORT_SYMBOL(proc_mkdir);
694 701
695struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode, 702struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
696 struct proc_dir_entry *parent) 703 struct proc_dir_entry *parent)
697{ 704{
698 struct proc_dir_entry *ent; 705 struct proc_dir_entry *ent;
@@ -721,7 +728,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, umode_t mode,
721} 728}
722EXPORT_SYMBOL(create_proc_entry); 729EXPORT_SYMBOL(create_proc_entry);
723 730
724struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, 731struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
725 struct proc_dir_entry *parent, 732 struct proc_dir_entry *parent,
726 const struct file_operations *proc_fops, 733 const struct file_operations *proc_fops,
727 void *data) 734 void *data)
@@ -758,7 +765,7 @@ EXPORT_SYMBOL(proc_create_data);
758 765
759static void free_proc_entry(struct proc_dir_entry *de) 766static void free_proc_entry(struct proc_dir_entry *de)
760{ 767{
761 proc_free_inum(de->low_ino); 768 release_inode_number(de->low_ino);
762 769
763 if (S_ISLNK(de->mode)) 770 if (S_ISLNK(de->mode))
764 kfree(de->data); 771 kfree(de->data);
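
The hunks above revert proc_create_data(), create_proc_entry() and proc_mkdir_mode() from umode_t back to the older mode_t prototypes, keeping the file_operations-based registration path. As a hedged illustration only (the entry name, message and fops below are hypothetical and not part of this patch), a module creating a /proc file against this API typically looks like:

	#include <linux/module.h>
	#include <linux/proc_fs.h>
	#include <linux/fs.h>

	static const char example_msg[] = "hello from /proc\n";

	/* .read handler: copy a static buffer out to user space */
	static ssize_t example_read(struct file *file, char __user *buf,
				    size_t count, loff_t *ppos)
	{
		return simple_read_from_buffer(buf, count, ppos,
					       example_msg, sizeof(example_msg) - 1);
	}

	static const struct file_operations example_fops = {
		.owner  = THIS_MODULE,
		.read   = example_read,
		.llseek = default_llseek,
	};

	static int __init example_init(void)
	{
		/* mode_t-style mode argument; NULL parent puts it directly under /proc */
		if (!proc_create("example_entry", 0444, NULL, &example_fops))
			return -ENOMEM;
		return 0;
	}

	static void __exit example_exit(void)
	{
		remove_proc_entry("example_entry", NULL);
	}

	module_init(example_init);
	module_exit(example_exit);
	MODULE_LICENSE("GPL");

proc_create_data() works the same way but carries an extra void *data pointer that the handlers can retrieve from the proc_dir_entry.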
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 439ae688650..7ed72d6c1c6 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -7,7 +7,6 @@
7#include <linux/time.h> 7#include <linux/time.h>
8#include <linux/proc_fs.h> 8#include <linux/proc_fs.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/pid_namespace.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12#include <linux/string.h> 11#include <linux/string.h>
13#include <linux/stat.h> 12#include <linux/stat.h>
@@ -18,10 +17,9 @@
18#include <linux/init.h> 17#include <linux/init.h>
19#include <linux/module.h> 18#include <linux/module.h>
20#include <linux/sysctl.h> 19#include <linux/sysctl.h>
21#include <linux/seq_file.h>
22#include <linux/slab.h> 20#include <linux/slab.h>
23#include <linux/mount.h>
24 21
22#include <asm/system.h>
25#include <asm/uaccess.h> 23#include <asm/uaccess.h>
26 24
27#include "internal.h" 25#include "internal.h"
@@ -31,10 +29,9 @@ static void proc_evict_inode(struct inode *inode)
31 struct proc_dir_entry *de; 29 struct proc_dir_entry *de;
32 struct ctl_table_header *head; 30 struct ctl_table_header *head;
33 const struct proc_ns_operations *ns_ops; 31 const struct proc_ns_operations *ns_ops;
34 void *ns;
35 32
36 truncate_inode_pages(&inode->i_data, 0); 33 truncate_inode_pages(&inode->i_data, 0);
37 clear_inode(inode); 34 end_writeback(inode);
38 35
39 /* Stop tracking associated processes */ 36 /* Stop tracking associated processes */
40 put_pid(PROC_I(inode)->pid); 37 put_pid(PROC_I(inode)->pid);
@@ -50,9 +47,8 @@ static void proc_evict_inode(struct inode *inode)
50 } 47 }
51 /* Release any associated namespace */ 48 /* Release any associated namespace */
52 ns_ops = PROC_I(inode)->ns_ops; 49 ns_ops = PROC_I(inode)->ns_ops;
53 ns = PROC_I(inode)->ns; 50 if (ns_ops && ns_ops->put)
54 if (ns_ops && ns) 51 ns_ops->put(PROC_I(inode)->ns);
55 ns_ops->put(ns);
56} 52}
57 53
58static struct kmem_cache * proc_inode_cachep; 54static struct kmem_cache * proc_inode_cachep;
@@ -81,6 +77,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
81static void proc_i_callback(struct rcu_head *head) 77static void proc_i_callback(struct rcu_head *head)
82{ 78{
83 struct inode *inode = container_of(head, struct inode, i_rcu); 79 struct inode *inode = container_of(head, struct inode, i_rcu);
80 INIT_LIST_HEAD(&inode->i_dentry);
84 kmem_cache_free(proc_inode_cachep, PROC_I(inode)); 81 kmem_cache_free(proc_inode_cachep, PROC_I(inode));
85} 82}
86 83
@@ -105,27 +102,12 @@ void __init proc_init_inodecache(void)
105 init_once); 102 init_once);
106} 103}
107 104
108static int proc_show_options(struct seq_file *seq, struct dentry *root)
109{
110 struct super_block *sb = root->d_sb;
111 struct pid_namespace *pid = sb->s_fs_info;
112
113 if (!gid_eq(pid->pid_gid, GLOBAL_ROOT_GID))
114 seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
115 if (pid->hide_pid != 0)
116 seq_printf(seq, ",hidepid=%u", pid->hide_pid);
117
118 return 0;
119}
120
121static const struct super_operations proc_sops = { 105static const struct super_operations proc_sops = {
122 .alloc_inode = proc_alloc_inode, 106 .alloc_inode = proc_alloc_inode,
123 .destroy_inode = proc_destroy_inode, 107 .destroy_inode = proc_destroy_inode,
124 .drop_inode = generic_delete_inode, 108 .drop_inode = generic_delete_inode,
125 .evict_inode = proc_evict_inode, 109 .evict_inode = proc_evict_inode,
126 .statfs = simple_statfs, 110 .statfs = simple_statfs,
127 .remount_fs = proc_remount,
128 .show_options = proc_show_options,
129}; 111};
130 112
131static void __pde_users_dec(struct proc_dir_entry *pde) 113static void __pde_users_dec(struct proc_dir_entry *pde)
@@ -452,6 +434,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
452 return NULL; 434 return NULL;
453 if (inode->i_state & I_NEW) { 435 if (inode->i_state & I_NEW) {
454 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 436 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
437 PROC_I(inode)->fd = 0;
455 PROC_I(inode)->pde = de; 438 PROC_I(inode)->pde = de;
456 439
457 if (de->mode) { 440 if (de->mode) {
@@ -462,7 +445,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
462 if (de->size) 445 if (de->size)
463 inode->i_size = de->size; 446 inode->i_size = de->size;
464 if (de->nlink) 447 if (de->nlink)
465 set_nlink(inode, de->nlink); 448 inode->i_nlink = de->nlink;
466 if (de->proc_iops) 449 if (de->proc_iops)
467 inode->i_op = de->proc_iops; 450 inode->i_op = de->proc_iops;
468 if (de->proc_fops) { 451 if (de->proc_fops) {
@@ -486,6 +469,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
486 469
487int proc_fill_super(struct super_block *s) 470int proc_fill_super(struct super_block *s)
488{ 471{
472 struct inode * root_inode;
473
489 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; 474 s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
490 s->s_blocksize = 1024; 475 s->s_blocksize = 1024;
491 s->s_blocksize_bits = 10; 476 s->s_blocksize_bits = 10;
@@ -494,11 +479,19 @@ int proc_fill_super(struct super_block *s)
494 s->s_time_gran = 1; 479 s->s_time_gran = 1;
495 480
496 pde_get(&proc_root); 481 pde_get(&proc_root);
497 s->s_root = d_make_root(proc_get_inode(s, &proc_root)); 482 root_inode = proc_get_inode(s, &proc_root);
498 if (s->s_root) 483 if (!root_inode)
499 return 0; 484 goto out_no_root;
485 root_inode->i_uid = 0;
486 root_inode->i_gid = 0;
487 s->s_root = d_alloc_root(root_inode);
488 if (!s->s_root)
489 goto out_no_root;
490 return 0;
500 491
492out_no_root:
501 printk("proc_read_super: get root inode failed\n"); 493 printk("proc_read_super: get root inode failed\n");
494 iput(root_inode);
502 pde_put(&proc_root); 495 pde_put(&proc_root);
503 return -ENOMEM; 496 return -ENOMEM;
504} 497}
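
One behavioural detail in the proc_fill_super() hunk above: d_make_root() consumes its inode argument (it drops the inode itself when dentry allocation fails or the inode is NULL), whereas the older d_alloc_root() restored here does not, which is why the reverted error path needs the explicit iput(root_inode). A minimal sketch of the two patterns (error handling only; the pde_get()/pde_put() reference counting on proc_root is omitted):

	/* newer API: d_make_root() releases the inode itself on failure */
	s->s_root = d_make_root(proc_get_inode(s, &proc_root));
	if (!s->s_root)
		return -ENOMEM;		/* nothing left to iput() here */

	/* older API: the caller still owns the inode if d_alloc_root() fails */
	root_inode = proc_get_inode(s, &proc_root);
	if (!root_inode)
		return -ENOMEM;
	s->s_root = d_alloc_root(root_inode);
	if (!s->s_root) {
		iput(root_inode);	/* d_alloc_root() did not consume it */
		return -ENOMEM;
	}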
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 252544c0520..7838e5cfec1 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -9,19 +9,13 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12#include <linux/sched.h>
13#include <linux/proc_fs.h> 12#include <linux/proc_fs.h>
14struct ctl_table_header;
15struct mempolicy;
16 13
17extern struct proc_dir_entry proc_root; 14extern struct proc_dir_entry proc_root;
18extern void proc_self_init(void);
19#ifdef CONFIG_PROC_SYSCTL 15#ifdef CONFIG_PROC_SYSCTL
20extern int proc_sys_init(void); 16extern int proc_sys_init(void);
21extern void sysctl_head_put(struct ctl_table_header *head);
22#else 17#else
23static inline void proc_sys_init(void) { } 18static inline void proc_sys_init(void) { }
24static inline void sysctl_head_put(struct ctl_table_header *head) { }
25#endif 19#endif
26#ifdef CONFIG_NET 20#ifdef CONFIG_NET
27extern int proc_net_init(void); 21extern int proc_net_init(void);
@@ -34,6 +28,8 @@ struct vmalloc_info {
34 unsigned long largest_chunk; 28 unsigned long largest_chunk;
35}; 29};
36 30
31extern struct mm_struct *mm_for_maps(struct task_struct *);
32
37#ifdef CONFIG_MMU 33#ifdef CONFIG_MMU
38#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) 34#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
39extern void get_vmalloc_info(struct vmalloc_info *vmi); 35extern void get_vmalloc_info(struct vmalloc_info *vmi);
@@ -57,18 +53,13 @@ extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
57 struct pid *pid, struct task_struct *task); 53 struct pid *pid, struct task_struct *task);
58extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); 54extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
59 55
60extern const struct file_operations proc_tid_children_operations; 56extern const struct file_operations proc_maps_operations;
61extern const struct file_operations proc_pid_maps_operations; 57extern const struct file_operations proc_numa_maps_operations;
62extern const struct file_operations proc_tid_maps_operations; 58extern const struct file_operations proc_smaps_operations;
63extern const struct file_operations proc_pid_numa_maps_operations;
64extern const struct file_operations proc_tid_numa_maps_operations;
65extern const struct file_operations proc_pid_smaps_operations;
66extern const struct file_operations proc_tid_smaps_operations;
67extern const struct file_operations proc_clear_refs_operations; 59extern const struct file_operations proc_clear_refs_operations;
68extern const struct file_operations proc_pagemap_operations; 60extern const struct file_operations proc_pagemap_operations;
69extern const struct file_operations proc_net_operations; 61extern const struct file_operations proc_net_operations;
70extern const struct inode_operations proc_net_inode_operations; 62extern const struct inode_operations proc_net_inode_operations;
71extern const struct inode_operations proc_pid_link_inode_operations;
72 63
73struct proc_maps_private { 64struct proc_maps_private {
74 struct pid *pid; 65 struct pid *pid;
@@ -76,9 +67,6 @@ struct proc_maps_private {
76#ifdef CONFIG_MMU 67#ifdef CONFIG_MMU
77 struct vm_area_struct *tail_vma; 68 struct vm_area_struct *tail_vma;
78#endif 69#endif
79#ifdef CONFIG_NUMA
80 struct mempolicy *task_mempolicy;
81#endif
82}; 70};
83 71
84void proc_init_inodecache(void); 72void proc_init_inodecache(void);
@@ -98,52 +86,6 @@ static inline int proc_fd(struct inode *inode)
98 return PROC_I(inode)->fd; 86 return PROC_I(inode)->fd;
99} 87}
100 88
101static inline int task_dumpable(struct task_struct *task)
102{
103 int dumpable = 0;
104 struct mm_struct *mm;
105
106 task_lock(task);
107 mm = task->mm;
108 if (mm)
109 dumpable = get_dumpable(mm);
110 task_unlock(task);
111 if (dumpable == SUID_DUMPABLE_ENABLED)
112 return 1;
113 return 0;
114}
115
116static inline int pid_delete_dentry(const struct dentry * dentry)
117{
118 /* Is the task we represent dead?
119 * If so, then don't put the dentry on the lru list,
120 * kill it immediately.
121 */
122 return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
123}
124
125static inline unsigned name_to_int(struct dentry *dentry)
126{
127 const char *name = dentry->d_name.name;
128 int len = dentry->d_name.len;
129 unsigned n = 0;
130
131 if (len > 1 && *name == '0')
132 goto out;
133 while (len-- > 0) {
134 unsigned c = *name++ - '0';
135 if (c > 9)
136 goto out;
137 if (n >= (~0U-9)/10)
138 goto out;
139 n *= 10;
140 n += c;
141 }
142 return n;
143out:
144 return ~0U;
145}
146
147struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino, 89struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
148 struct dentry *dentry); 90 struct dentry *dentry);
149int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, 91int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
@@ -159,7 +101,7 @@ void pde_users_dec(struct proc_dir_entry *pde);
159 101
160extern spinlock_t proc_subdir_lock; 102extern spinlock_t proc_subdir_lock;
161 103
162struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int); 104struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
163int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir); 105int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
164unsigned long task_vsize(struct mm_struct *); 106unsigned long task_vsize(struct mm_struct *);
165unsigned long task_statm(struct mm_struct *, 107unsigned long task_statm(struct mm_struct *,
@@ -175,7 +117,6 @@ void pde_put(struct proc_dir_entry *pde);
175 117
176int proc_fill_super(struct super_block *); 118int proc_fill_super(struct super_block *);
177struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); 119struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
178int proc_remount(struct super_block *sb, int *flags, char *data);
179 120
180/* 121/*
181 * These are generic /proc routines that use the internal 122 * These are generic /proc routines that use the internal
@@ -185,7 +126,7 @@ int proc_remount(struct super_block *sb, int *flags, char *data);
185 * of the /proc/<pid> subdirectories. 126 * of the /proc/<pid> subdirectories.
186 */ 127 */
187int proc_readdir(struct file *, void *, filldir_t); 128int proc_readdir(struct file *, void *, filldir_t);
188struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); 129struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
189 130
190 131
191 132
@@ -195,7 +136,7 @@ typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
195int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, 136int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
196 const char *name, int len, 137 const char *name, int len,
197 instantiate_t instantiate, struct task_struct *task, const void *ptr); 138 instantiate_t instantiate, struct task_struct *task, const void *ptr);
198int pid_revalidate(struct dentry *dentry, unsigned int flags); 139int pid_revalidate(struct dentry *dentry, struct nameidata *nd);
199struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task); 140struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task);
200extern const struct dentry_operations pid_dentry_operations; 141extern const struct dentry_operations pid_dentry_operations;
201int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); 142int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
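
Several prototypes in this header move back from the flags-based lookup/revalidate interface to the older nameidata-based one. For background only (this summary is not part of the patch), the corresponding VFS callbacks changed roughly as follows around the 3.6 VFS rework:

	/* older interface, as restored by this patch */
	struct dentry *(*lookup)(struct inode *dir, struct dentry *dentry,
				 struct nameidata *nd);
	int (*d_revalidate)(struct dentry *dentry, struct nameidata *nd);

	/* newer interface, as removed by this patch */
	struct dentry *(*lookup)(struct inode *dir, struct dentry *dentry,
				 unsigned int flags);
	int (*d_revalidate)(struct dentry *dentry, unsigned int flags);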
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e96d4f18ca3..d245cb23dd7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -157,8 +157,7 @@ static int kcore_update_ram(void)
157 157
158#ifdef CONFIG_SPARSEMEM_VMEMMAP 158#ifdef CONFIG_SPARSEMEM_VMEMMAP
159/* calculate vmemmap's address from given system ram pfn and register it */ 159/* calculate vmemmap's address from given system ram pfn and register it */
160static int 160int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
161get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
162{ 161{
163 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT; 162 unsigned long pfn = __pa(ent->addr) >> PAGE_SHIFT;
164 unsigned long nr_pages = ent->size >> PAGE_SHIFT; 163 unsigned long nr_pages = ent->size >> PAGE_SHIFT;
@@ -190,8 +189,7 @@ get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
190 189
191} 190}
192#else 191#else
193static int 192int get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
194get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head)
195{ 193{
196 return 1; 194 return 1;
197} 195}
@@ -249,7 +247,7 @@ static int kcore_update_ram(void)
 249 /* Not initialized... update now */ 247 /* Not initialized... update now */
250 /* find out "max pfn" */ 248 /* find out "max pfn" */
251 end_pfn = 0; 249 end_pfn = 0;
252 for_each_node_state(nid, N_MEMORY) { 250 for_each_node_state(nid, N_HIGH_MEMORY) {
253 unsigned long node_end; 251 unsigned long node_end;
254 node_end = NODE_DATA(nid)->node_start_pfn + 252 node_end = NODE_DATA(nid)->node_start_pfn +
255 NODE_DATA(nid)->node_spanned_pages; 253 NODE_DATA(nid)->node_spanned_pages;
@@ -515,7 +513,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
515 513
516 n = copy_to_user(buffer, (char *)start, tsz); 514 n = copy_to_user(buffer, (char *)start, tsz);
517 /* 515 /*
 518 * We cannot distinguish between fault on source 516 * We cannot distinguish between fault on source
519 * and fault on destination. When this happens 517 * and fault on destination. When this happens
520 * we clear too and hope it will trigger the 518 * we clear too and hope it will trigger the
521 * EFAULT again. 519 * EFAULT again.
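
The comment kept in read_kcore() above refers to the usual fallback when copy_to_user() fails part-way: because the fault may have been on the kernel source rather than the user destination, the remaining bytes are cleared and -EFAULT is only returned if clearing the user buffer also faults. A simplified sketch of that pattern (taken out of its surrounding loop):

	n = copy_to_user(buffer, (char *)start, tsz);
	if (n) {
		/* clear what could not be copied; a fault here really is the user's */
		if (clear_user(buffer + tsz - n, n))
			return -EFAULT;
	}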
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index b7a47196c8c..be177f702ac 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -9,9 +9,9 @@
9#include <linux/file.h> 9#include <linux/file.h>
10#include <linux/utsname.h> 10#include <linux/utsname.h>
11#include <net/net_namespace.h> 11#include <net/net_namespace.h>
12#include <linux/mnt_namespace.h>
12#include <linux/ipc_namespace.h> 13#include <linux/ipc_namespace.h>
13#include <linux/pid_namespace.h> 14#include <linux/pid_namespace.h>
14#include <linux/user_namespace.h>
15#include "internal.h" 15#include "internal.h"
16 16
17 17
@@ -25,168 +25,12 @@ static const struct proc_ns_operations *ns_entries[] = {
25#ifdef CONFIG_IPC_NS 25#ifdef CONFIG_IPC_NS
26 &ipcns_operations, 26 &ipcns_operations,
27#endif 27#endif
28#ifdef CONFIG_PID_NS
29 &pidns_operations,
30#endif
31#ifdef CONFIG_USER_NS
32 &userns_operations,
33#endif
34 &mntns_operations,
35}; 28};
36 29
37static const struct file_operations ns_file_operations = { 30static const struct file_operations ns_file_operations = {
38 .llseek = no_llseek, 31 .llseek = no_llseek,
39}; 32};
40 33
41static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr,
43};
44
45static int ns_delete_dentry(const struct dentry *dentry)
46{
47 /* Don't cache namespace inodes when not in use */
48 return 1;
49}
50
51static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
52{
53 struct inode *inode = dentry->d_inode;
54 const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
55
56 return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]",
57 ns_ops->name, inode->i_ino);
58}
59
60const struct dentry_operations ns_dentry_operations =
61{
62 .d_delete = ns_delete_dentry,
63 .d_dname = ns_dname,
64};
65
66static struct dentry *proc_ns_get_dentry(struct super_block *sb,
67 struct task_struct *task, const struct proc_ns_operations *ns_ops)
68{
69 struct dentry *dentry, *result;
70 struct inode *inode;
71 struct proc_inode *ei;
72 struct qstr qname = { .name = "", };
73 void *ns;
74
75 ns = ns_ops->get(task);
76 if (!ns)
77 return ERR_PTR(-ENOENT);
78
79 dentry = d_alloc_pseudo(sb, &qname);
80 if (!dentry) {
81 ns_ops->put(ns);
82 return ERR_PTR(-ENOMEM);
83 }
84
85 inode = iget_locked(sb, ns_ops->inum(ns));
86 if (!inode) {
87 dput(dentry);
88 ns_ops->put(ns);
89 return ERR_PTR(-ENOMEM);
90 }
91
92 ei = PROC_I(inode);
93 if (inode->i_state & I_NEW) {
94 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
95 inode->i_op = &ns_inode_operations;
96 inode->i_mode = S_IFREG | S_IRUGO;
97 inode->i_fop = &ns_file_operations;
98 ei->ns_ops = ns_ops;
99 ei->ns = ns;
100 unlock_new_inode(inode);
101 } else {
102 ns_ops->put(ns);
103 }
104
105 d_set_d_op(dentry, &ns_dentry_operations);
106 result = d_instantiate_unique(dentry, inode);
107 if (result) {
108 dput(dentry);
109 dentry = result;
110 }
111
112 return dentry;
113}
114
115static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd)
116{
117 struct inode *inode = dentry->d_inode;
118 struct super_block *sb = inode->i_sb;
119 struct proc_inode *ei = PROC_I(inode);
120 struct task_struct *task;
121 struct dentry *ns_dentry;
122 void *error = ERR_PTR(-EACCES);
123
124 task = get_proc_task(inode);
125 if (!task)
126 goto out;
127
128 if (!ptrace_may_access(task, PTRACE_MODE_READ))
129 goto out_put_task;
130
131 ns_dentry = proc_ns_get_dentry(sb, task, ei->ns_ops);
132 if (IS_ERR(ns_dentry)) {
133 error = ERR_CAST(ns_dentry);
134 goto out_put_task;
135 }
136
137 dput(nd->path.dentry);
138 nd->path.dentry = ns_dentry;
139 error = NULL;
140
141out_put_task:
142 put_task_struct(task);
143out:
144 return error;
145}
146
147static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen)
148{
149 struct inode *inode = dentry->d_inode;
150 struct proc_inode *ei = PROC_I(inode);
151 const struct proc_ns_operations *ns_ops = ei->ns_ops;
152 struct task_struct *task;
153 void *ns;
154 char name[50];
155 int len = -EACCES;
156
157 task = get_proc_task(inode);
158 if (!task)
159 goto out;
160
161 if (!ptrace_may_access(task, PTRACE_MODE_READ))
162 goto out_put_task;
163
164 len = -ENOENT;
165 ns = ns_ops->get(task);
166 if (!ns)
167 goto out_put_task;
168
169 snprintf(name, sizeof(name), "%s:[%u]", ns_ops->name, ns_ops->inum(ns));
170 len = strlen(name);
171
172 if (len > buflen)
173 len = buflen;
174 if (copy_to_user(buffer, name, len))
175 len = -EFAULT;
176
177 ns_ops->put(ns);
178out_put_task:
179 put_task_struct(task);
180out:
181 return len;
182}
183
184static const struct inode_operations proc_ns_link_inode_operations = {
185 .readlink = proc_ns_readlink,
186 .follow_link = proc_ns_follow_link,
187 .setattr = proc_setattr,
188};
189
190static struct dentry *proc_ns_instantiate(struct inode *dir, 34static struct dentry *proc_ns_instantiate(struct inode *dir,
191 struct dentry *dentry, struct task_struct *task, const void *ptr) 35 struct dentry *dentry, struct task_struct *task, const void *ptr)
192{ 36{
@@ -194,23 +38,32 @@ static struct dentry *proc_ns_instantiate(struct inode *dir,
194 struct inode *inode; 38 struct inode *inode;
195 struct proc_inode *ei; 39 struct proc_inode *ei;
196 struct dentry *error = ERR_PTR(-ENOENT); 40 struct dentry *error = ERR_PTR(-ENOENT);
41 void *ns;
197 42
198 inode = proc_pid_make_inode(dir->i_sb, task); 43 inode = proc_pid_make_inode(dir->i_sb, task);
199 if (!inode) 44 if (!inode)
200 goto out; 45 goto out;
201 46
47 ns = ns_ops->get(task);
48 if (!ns)
49 goto out_iput;
50
202 ei = PROC_I(inode); 51 ei = PROC_I(inode);
203 inode->i_mode = S_IFLNK|S_IRWXUGO; 52 inode->i_mode = S_IFREG|S_IRUSR;
204 inode->i_op = &proc_ns_link_inode_operations; 53 inode->i_fop = &ns_file_operations;
205 ei->ns_ops = ns_ops; 54 ei->ns_ops = ns_ops;
55 ei->ns = ns;
206 56
207 d_set_d_op(dentry, &pid_dentry_operations); 57 dentry->d_op = &pid_dentry_operations;
208 d_add(dentry, inode); 58 d_add(dentry, inode);
209 /* Close the race of the process dying before we return the dentry */ 59 /* Close the race of the process dying before we return the dentry */
210 if (pid_revalidate(dentry, 0)) 60 if (pid_revalidate(dentry, NULL))
211 error = NULL; 61 error = NULL;
212out: 62out:
213 return error; 63 return error;
64out_iput:
65 iput(inode);
66 goto out;
214} 67}
215 68
216static int proc_ns_fill_cache(struct file *filp, void *dirent, 69static int proc_ns_fill_cache(struct file *filp, void *dirent,
@@ -237,6 +90,10 @@ static int proc_ns_dir_readdir(struct file *filp, void *dirent,
237 if (!task) 90 if (!task)
238 goto out_no_task; 91 goto out_no_task;
239 92
93 ret = -EPERM;
94 if (!ptrace_may_access(task, PTRACE_MODE_READ))
95 goto out;
96
240 ret = 0; 97 ret = 0;
241 i = filp->f_pos; 98 i = filp->f_pos;
242 switch (i) { 99 switch (i) {
@@ -284,7 +141,7 @@ const struct file_operations proc_ns_dir_operations = {
284}; 141};
285 142
286static struct dentry *proc_ns_dir_lookup(struct inode *dir, 143static struct dentry *proc_ns_dir_lookup(struct inode *dir,
287 struct dentry *dentry, unsigned int flags) 144 struct dentry *dentry, struct nameidata *nd)
288{ 145{
289 struct dentry *error; 146 struct dentry *error;
290 struct task_struct *task = get_proc_task(dir); 147 struct task_struct *task = get_proc_task(dir);
@@ -296,14 +153,19 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
296 if (!task) 153 if (!task)
297 goto out_no_task; 154 goto out_no_task;
298 155
299 last = &ns_entries[ARRAY_SIZE(ns_entries)]; 156 error = ERR_PTR(-EPERM);
300 for (entry = ns_entries; entry < last; entry++) { 157 if (!ptrace_may_access(task, PTRACE_MODE_READ))
158 goto out;
159
160 last = &ns_entries[ARRAY_SIZE(ns_entries) - 1];
161 for (entry = ns_entries; entry <= last; entry++) {
301 if (strlen((*entry)->name) != len) 162 if (strlen((*entry)->name) != len)
302 continue; 163 continue;
303 if (!memcmp(dentry->d_name.name, (*entry)->name, len)) 164 if (!memcmp(dentry->d_name.name, (*entry)->name, len))
304 break; 165 break;
305 } 166 }
306 if (entry == last) 167 error = ERR_PTR(-ENOENT);
168 if (entry > last)
307 goto out; 169 goto out;
308 170
309 error = proc_ns_instantiate(dir, dentry, task, *entry); 171 error = proc_ns_instantiate(dir, dentry, task, *entry);
@@ -337,7 +199,3 @@ out_invalid:
337 return ERR_PTR(-EINVAL); 199 return ERR_PTR(-EINVAL);
338} 200}
339 201
340bool proc_ns_inode(struct inode *inode)
341{
342 return inode->i_fop == &ns_file_operations;
343}
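
With the code removed above, the /proc/<pid>/ns/* entries go back to being plain files (S_IFREG) rather than the magic symlinks added later, so readlink() on them no longer yields the "type:[inode]" form. Either way the entries can still be opened and handed to setns(2). A hedged user-space sketch (the target pid and namespace are hypothetical, and error handling is trimmed):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		/* open another task's network namespace handle... */
		int fd = open("/proc/1234/ns/net", O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* ...and move the calling thread into it (needs privilege) */
		if (setns(fd, CLONE_NEWNET) < 0) {
			perror("setns");
			return 1;
		}
		close(fd);
		return 0;
	}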
diff --git a/fs/proc/page.c b/fs/proc/page.c
index b8730d9ebae..6d8e6a9e93a 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -115,14 +115,6 @@ u64 stable_page_flags(struct page *page)
115 u |= 1 << KPF_COMPOUND_TAIL; 115 u |= 1 << KPF_COMPOUND_TAIL;
116 if (PageHuge(page)) 116 if (PageHuge(page))
117 u |= 1 << KPF_HUGE; 117 u |= 1 << KPF_HUGE;
118 /*
119 * PageTransCompound can be true for non-huge compound pages (slab
120 * pages or pages allocated by drivers with __GFP_COMP) because it
121 * just checks PG_head/PG_tail, so we need to check PageLRU to make
122 * sure a given page is a thp, not a non-huge compound page.
123 */
124 else if (PageTransCompound(page) && PageLRU(compound_trans_head(page)))
125 u |= 1 << KPF_THP;
126 118
127 /* 119 /*
128 * Caveats on high order pages: page->_count will only be set 120 * Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de20ec480fa..927cbd115e5 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -101,11 +101,6 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde,
101{ 101{
102 struct proc_dir_entry *ent; 102 struct proc_dir_entry *ent;
103 103
104 if (!oldprop) {
105 proc_device_tree_add_prop(pde, newprop);
106 return;
107 }
108
109 for (ent = pde->subdir; ent != NULL; ent = ent->next) 104 for (ent = pde->subdir; ent != NULL; ent = ent->next)
110 if (ent->data == oldprop) 105 if (ent->data == oldprop)
111 break; 106 break;
@@ -195,7 +190,11 @@ void proc_device_tree_add_node(struct device_node *np,
195 set_node_proc_entry(np, de); 190 set_node_proc_entry(np, de);
196 for (child = NULL; (child = of_get_next_child(np, child));) { 191 for (child = NULL; (child = of_get_next_child(np, child));) {
197 /* Use everything after the last slash, or the full name */ 192 /* Use everything after the last slash, or the full name */
198 p = kbasename(child->full_name); 193 p = strrchr(child->full_name, '/');
194 if (!p)
195 p = child->full_name;
196 else
197 ++p;
199 198
200 if (duplicate_name(de, p)) 199 if (duplicate_name(de, p))
201 p = fixup_name(np, de, p); 200 p = fixup_name(np, de, p);
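
The change above simply open-codes what kbasename() provides in later kernels; its definition is essentially the following one-liner, reproduced here only for context:

	/* return the part of the path after the final '/', or the whole path */
	static inline const char *kbasename(const char *path)
	{
		const char *tail = strrchr(path, '/');
		return tail ? tail + 1 : path;
	}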
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index fe72cd073de..f738024ccc8 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -119,7 +119,7 @@ static struct net *get_proc_task_net(struct inode *dir)
119} 119}
120 120
121static struct dentry *proc_tgid_net_lookup(struct inode *dir, 121static struct dentry *proc_tgid_net_lookup(struct inode *dir,
122 struct dentry *dentry, unsigned int flags) 122 struct dentry *dentry, struct nameidata *nd)
123{ 123{
124 struct dentry *de; 124 struct dentry *de;
125 struct net *net; 125 struct net *net;
@@ -179,7 +179,7 @@ const struct file_operations proc_net_operations = {
179 179
180 180
181struct proc_dir_entry *proc_net_fops_create(struct net *net, 181struct proc_dir_entry *proc_net_fops_create(struct net *net,
182 const char *name, umode_t mode, const struct file_operations *fops) 182 const char *name, mode_t mode, const struct file_operations *fops)
183{ 183{
184 return proc_create(name, mode, net->proc_net, fops); 184 return proc_create(name, mode, net->proc_net, fops);
185} 185}
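
proc_net_fops_create() shown above is a thin convenience wrapper: it registers the entry under the per-namespace net->proc_net directory instead of the global /proc root. As a hedged illustration (the entry name and fops are hypothetical), a networking component would pair it with pernet_operations like this:

	extern const struct file_operations example_stats_fops;	/* assumed defined elsewhere */

	static int __net_init example_net_init(struct net *net)
	{
		/* creates /proc/net/example_stats in this network namespace */
		if (!proc_net_fops_create(net, "example_stats", 0444,
					  &example_stats_fops))
			return -ENOMEM;
		return 0;
	}

	static void __net_exit example_net_exit(struct net *net)
	{
		proc_net_fops_remove(net, "example_stats");
	}

	static struct pernet_operations example_net_ops = {
		.init = example_net_init,
		.exit = example_net_exit,
	};

The pernet_operations structure would then be registered with register_pernet_subsys() so the entry is created once per network namespace.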
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 1827d88ad58..1a77dbef226 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -3,13 +3,9 @@
3 */ 3 */
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/sysctl.h> 5#include <linux/sysctl.h>
6#include <linux/poll.h>
7#include <linux/proc_fs.h> 6#include <linux/proc_fs.h>
8#include <linux/security.h> 7#include <linux/security.h>
9#include <linux/sched.h>
10#include <linux/namei.h> 8#include <linux/namei.h>
11#include <linux/mm.h>
12#include <linux/module.h>
13#include "internal.h" 9#include "internal.h"
14 10
15static const struct dentry_operations proc_sys_dentry_operations; 11static const struct dentry_operations proc_sys_dentry_operations;
@@ -18,379 +14,6 @@ static const struct inode_operations proc_sys_inode_operations;
18static const struct file_operations proc_sys_dir_file_operations; 14static const struct file_operations proc_sys_dir_file_operations;
19static const struct inode_operations proc_sys_dir_operations; 15static const struct inode_operations proc_sys_dir_operations;
20 16
21void proc_sys_poll_notify(struct ctl_table_poll *poll)
22{
23 if (!poll)
24 return;
25
26 atomic_inc(&poll->event);
27 wake_up_interruptible(&poll->wait);
28}
29
30static struct ctl_table root_table[] = {
31 {
32 .procname = "",
33 .mode = S_IFDIR|S_IRUGO|S_IXUGO,
34 },
35 { }
36};
37static struct ctl_table_root sysctl_table_root = {
38 .default_set.dir.header = {
39 {{.count = 1,
40 .nreg = 1,
41 .ctl_table = root_table }},
42 .ctl_table_arg = root_table,
43 .root = &sysctl_table_root,
44 .set = &sysctl_table_root.default_set,
45 },
46};
47
48static DEFINE_SPINLOCK(sysctl_lock);
49
50static void drop_sysctl_table(struct ctl_table_header *header);
51static int sysctl_follow_link(struct ctl_table_header **phead,
52 struct ctl_table **pentry, struct nsproxy *namespaces);
53static int insert_links(struct ctl_table_header *head);
54static void put_links(struct ctl_table_header *header);
55
56static void sysctl_print_dir(struct ctl_dir *dir)
57{
58 if (dir->header.parent)
59 sysctl_print_dir(dir->header.parent);
60 printk(KERN_CONT "%s/", dir->header.ctl_table[0].procname);
61}
62
63static int namecmp(const char *name1, int len1, const char *name2, int len2)
64{
65 int minlen;
66 int cmp;
67
68 minlen = len1;
69 if (minlen > len2)
70 minlen = len2;
71
72 cmp = memcmp(name1, name2, minlen);
73 if (cmp == 0)
74 cmp = len1 - len2;
75 return cmp;
76}
77
78/* Called under sysctl_lock */
79static struct ctl_table *find_entry(struct ctl_table_header **phead,
80 struct ctl_dir *dir, const char *name, int namelen)
81{
82 struct ctl_table_header *head;
83 struct ctl_table *entry;
84 struct rb_node *node = dir->root.rb_node;
85
86 while (node)
87 {
88 struct ctl_node *ctl_node;
89 const char *procname;
90 int cmp;
91
92 ctl_node = rb_entry(node, struct ctl_node, node);
93 head = ctl_node->header;
94 entry = &head->ctl_table[ctl_node - head->node];
95 procname = entry->procname;
96
97 cmp = namecmp(name, namelen, procname, strlen(procname));
98 if (cmp < 0)
99 node = node->rb_left;
100 else if (cmp > 0)
101 node = node->rb_right;
102 else {
103 *phead = head;
104 return entry;
105 }
106 }
107 return NULL;
108}
109
110static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry)
111{
112 struct rb_node *node = &head->node[entry - head->ctl_table].node;
113 struct rb_node **p = &head->parent->root.rb_node;
114 struct rb_node *parent = NULL;
115 const char *name = entry->procname;
116 int namelen = strlen(name);
117
118 while (*p) {
119 struct ctl_table_header *parent_head;
120 struct ctl_table *parent_entry;
121 struct ctl_node *parent_node;
122 const char *parent_name;
123 int cmp;
124
125 parent = *p;
126 parent_node = rb_entry(parent, struct ctl_node, node);
127 parent_head = parent_node->header;
128 parent_entry = &parent_head->ctl_table[parent_node - parent_head->node];
129 parent_name = parent_entry->procname;
130
131 cmp = namecmp(name, namelen, parent_name, strlen(parent_name));
132 if (cmp < 0)
133 p = &(*p)->rb_left;
134 else if (cmp > 0)
135 p = &(*p)->rb_right;
136 else {
137 printk(KERN_ERR "sysctl duplicate entry: ");
138 sysctl_print_dir(head->parent);
139 printk(KERN_CONT "/%s\n", entry->procname);
140 return -EEXIST;
141 }
142 }
143
144 rb_link_node(node, parent, p);
145 rb_insert_color(node, &head->parent->root);
146 return 0;
147}
148
149static void erase_entry(struct ctl_table_header *head, struct ctl_table *entry)
150{
151 struct rb_node *node = &head->node[entry - head->ctl_table].node;
152
153 rb_erase(node, &head->parent->root);
154}
155
156static void init_header(struct ctl_table_header *head,
157 struct ctl_table_root *root, struct ctl_table_set *set,
158 struct ctl_node *node, struct ctl_table *table)
159{
160 head->ctl_table = table;
161 head->ctl_table_arg = table;
162 head->used = 0;
163 head->count = 1;
164 head->nreg = 1;
165 head->unregistering = NULL;
166 head->root = root;
167 head->set = set;
168 head->parent = NULL;
169 head->node = node;
170 if (node) {
171 struct ctl_table *entry;
172 for (entry = table; entry->procname; entry++, node++)
173 node->header = head;
174 }
175}
176
177static void erase_header(struct ctl_table_header *head)
178{
179 struct ctl_table *entry;
180 for (entry = head->ctl_table; entry->procname; entry++)
181 erase_entry(head, entry);
182}
183
184static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
185{
186 struct ctl_table *entry;
187 int err;
188
189 dir->header.nreg++;
190 header->parent = dir;
191 err = insert_links(header);
192 if (err)
193 goto fail_links;
194 for (entry = header->ctl_table; entry->procname; entry++) {
195 err = insert_entry(header, entry);
196 if (err)
197 goto fail;
198 }
199 return 0;
200fail:
201 erase_header(header);
202 put_links(header);
203fail_links:
204 header->parent = NULL;
205 drop_sysctl_table(&dir->header);
206 return err;
207}
208
209/* called under sysctl_lock */
210static int use_table(struct ctl_table_header *p)
211{
212 if (unlikely(p->unregistering))
213 return 0;
214 p->used++;
215 return 1;
216}
217
218/* called under sysctl_lock */
219static void unuse_table(struct ctl_table_header *p)
220{
221 if (!--p->used)
222 if (unlikely(p->unregistering))
223 complete(p->unregistering);
224}
225
226/* called under sysctl_lock, will reacquire if has to wait */
227static void start_unregistering(struct ctl_table_header *p)
228{
229 /*
230 * if p->used is 0, nobody will ever touch that entry again;
231 * we'll eliminate all paths to it before dropping sysctl_lock
232 */
233 if (unlikely(p->used)) {
234 struct completion wait;
235 init_completion(&wait);
236 p->unregistering = &wait;
237 spin_unlock(&sysctl_lock);
238 wait_for_completion(&wait);
239 spin_lock(&sysctl_lock);
240 } else {
241 /* anything non-NULL; we'll never dereference it */
242 p->unregistering = ERR_PTR(-EINVAL);
243 }
244 /*
245 * do not remove from the list until nobody holds it; walking the
246 * list in do_sysctl() relies on that.
247 */
248 erase_header(p);
249}
250
251static void sysctl_head_get(struct ctl_table_header *head)
252{
253 spin_lock(&sysctl_lock);
254 head->count++;
255 spin_unlock(&sysctl_lock);
256}
257
258void sysctl_head_put(struct ctl_table_header *head)
259{
260 spin_lock(&sysctl_lock);
261 if (!--head->count)
262 kfree_rcu(head, rcu);
263 spin_unlock(&sysctl_lock);
264}
265
266static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
267{
268 BUG_ON(!head);
269 spin_lock(&sysctl_lock);
270 if (!use_table(head))
271 head = ERR_PTR(-ENOENT);
272 spin_unlock(&sysctl_lock);
273 return head;
274}
275
276static void sysctl_head_finish(struct ctl_table_header *head)
277{
278 if (!head)
279 return;
280 spin_lock(&sysctl_lock);
281 unuse_table(head);
282 spin_unlock(&sysctl_lock);
283}
284
285static struct ctl_table_set *
286lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
287{
288 struct ctl_table_set *set = &root->default_set;
289 if (root->lookup)
290 set = root->lookup(root, namespaces);
291 return set;
292}
293
294static struct ctl_table *lookup_entry(struct ctl_table_header **phead,
295 struct ctl_dir *dir,
296 const char *name, int namelen)
297{
298 struct ctl_table_header *head;
299 struct ctl_table *entry;
300
301 spin_lock(&sysctl_lock);
302 entry = find_entry(&head, dir, name, namelen);
303 if (entry && use_table(head))
304 *phead = head;
305 else
306 entry = NULL;
307 spin_unlock(&sysctl_lock);
308 return entry;
309}
310
311static struct ctl_node *first_usable_entry(struct rb_node *node)
312{
313 struct ctl_node *ctl_node;
314
315 for (;node; node = rb_next(node)) {
316 ctl_node = rb_entry(node, struct ctl_node, node);
317 if (use_table(ctl_node->header))
318 return ctl_node;
319 }
320 return NULL;
321}
322
323static void first_entry(struct ctl_dir *dir,
324 struct ctl_table_header **phead, struct ctl_table **pentry)
325{
326 struct ctl_table_header *head = NULL;
327 struct ctl_table *entry = NULL;
328 struct ctl_node *ctl_node;
329
330 spin_lock(&sysctl_lock);
331 ctl_node = first_usable_entry(rb_first(&dir->root));
332 spin_unlock(&sysctl_lock);
333 if (ctl_node) {
334 head = ctl_node->header;
335 entry = &head->ctl_table[ctl_node - head->node];
336 }
337 *phead = head;
338 *pentry = entry;
339}
340
341static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentry)
342{
343 struct ctl_table_header *head = *phead;
344 struct ctl_table *entry = *pentry;
345 struct ctl_node *ctl_node = &head->node[entry - head->ctl_table];
346
347 spin_lock(&sysctl_lock);
348 unuse_table(head);
349
350 ctl_node = first_usable_entry(rb_next(&ctl_node->node));
351 spin_unlock(&sysctl_lock);
352 head = NULL;
353 if (ctl_node) {
354 head = ctl_node->header;
355 entry = &head->ctl_table[ctl_node - head->node];
356 }
357 *phead = head;
358 *pentry = entry;
359}
360
361void register_sysctl_root(struct ctl_table_root *root)
362{
363}
364
365/*
366 * sysctl_perm does NOT grant the superuser all rights automatically, because
367 * some sysctl variables are readonly even to root.
368 */
369
370static int test_perm(int mode, int op)
371{
372 if (uid_eq(current_euid(), GLOBAL_ROOT_UID))
373 mode >>= 6;
374 else if (in_egroup_p(GLOBAL_ROOT_GID))
375 mode >>= 3;
376 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
377 return 0;
378 return -EACCES;
379}
380
381static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
382{
383 struct ctl_table_root *root = head->root;
384 int mode;
385
386 if (root->permissions)
387 mode = root->permissions(head, table);
388 else
389 mode = table->mode;
390
391 return test_perm(mode, op);
392}
393
394static struct inode *proc_sys_make_inode(struct super_block *sb, 17static struct inode *proc_sys_make_inode(struct super_block *sb,
395 struct ctl_table_header *head, struct ctl_table *table) 18 struct ctl_table_header *head, struct ctl_table *table)
396{ 19{
@@ -410,12 +33,13 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
410 33
411 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 34 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
412 inode->i_mode = table->mode; 35 inode->i_mode = table->mode;
413 if (!S_ISDIR(table->mode)) { 36 if (!table->child) {
414 inode->i_mode |= S_IFREG; 37 inode->i_mode |= S_IFREG;
415 inode->i_op = &proc_sys_inode_operations; 38 inode->i_op = &proc_sys_inode_operations;
416 inode->i_fop = &proc_sys_file_operations; 39 inode->i_fop = &proc_sys_file_operations;
417 } else { 40 } else {
418 inode->i_mode |= S_IFDIR; 41 inode->i_mode |= S_IFDIR;
42 inode->i_nlink = 0;
419 inode->i_op = &proc_sys_dir_operations; 43 inode->i_op = &proc_sys_dir_operations;
420 inode->i_fop = &proc_sys_dir_file_operations; 44 inode->i_fop = &proc_sys_dir_file_operations;
421 } 45 }
@@ -423,44 +47,75 @@ out:
423 return inode; 47 return inode;
424} 48}
425 49
50static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name)
51{
52 int len;
53 for ( ; p->procname; p++) {
54
55 if (!p->procname)
56 continue;
57
58 len = strlen(p->procname);
59 if (len != name->len)
60 continue;
61
62 if (memcmp(p->procname, name->name, len) != 0)
63 continue;
64
65 /* I have a match */
66 return p;
67 }
68 return NULL;
69}
70
426static struct ctl_table_header *grab_header(struct inode *inode) 71static struct ctl_table_header *grab_header(struct inode *inode)
427{ 72{
428 struct ctl_table_header *head = PROC_I(inode)->sysctl; 73 if (PROC_I(inode)->sysctl)
429 if (!head) 74 return sysctl_head_grab(PROC_I(inode)->sysctl);
430 head = &sysctl_table_root.default_set.dir.header; 75 else
431 return sysctl_head_grab(head); 76 return sysctl_head_next(NULL);
432} 77}
433 78
434static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry, 79static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
435 unsigned int flags) 80 struct nameidata *nd)
436{ 81{
437 struct ctl_table_header *head = grab_header(dir); 82 struct ctl_table_header *head = grab_header(dir);
83 struct ctl_table *table = PROC_I(dir)->sysctl_entry;
438 struct ctl_table_header *h = NULL; 84 struct ctl_table_header *h = NULL;
439 struct qstr *name = &dentry->d_name; 85 struct qstr *name = &dentry->d_name;
440 struct ctl_table *p; 86 struct ctl_table *p;
441 struct inode *inode; 87 struct inode *inode;
442 struct dentry *err = ERR_PTR(-ENOENT); 88 struct dentry *err = ERR_PTR(-ENOENT);
443 struct ctl_dir *ctl_dir;
444 int ret;
445 89
446 if (IS_ERR(head)) 90 if (IS_ERR(head))
447 return ERR_CAST(head); 91 return ERR_CAST(head);
448 92
449 ctl_dir = container_of(head, struct ctl_dir, header); 93 if (table && !table->child) {
450 94 WARN_ON(1);
451 p = lookup_entry(&h, ctl_dir, name->name, name->len);
452 if (!p)
453 goto out; 95 goto out;
96 }
454 97
455 if (S_ISLNK(p->mode)) { 98 table = table ? table->child : head->ctl_table;
456 ret = sysctl_follow_link(&h, &p, current->nsproxy); 99
457 err = ERR_PTR(ret); 100 p = find_in_table(table, name);
458 if (ret) 101 if (!p) {
459 goto out; 102 for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
103 if (h->attached_to != table)
104 continue;
105 p = find_in_table(h->attached_by, name);
106 if (p)
107 break;
108 }
460 } 109 }
461 110
111 if (!p)
112 goto out;
113
462 err = ERR_PTR(-ENOMEM); 114 err = ERR_PTR(-ENOMEM);
463 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p); 115 inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
116 if (h)
117 sysctl_head_finish(h);
118
464 if (!inode) 119 if (!inode)
465 goto out; 120 goto out;
466 121
@@ -469,8 +124,6 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
469 d_add(dentry, inode); 124 d_add(dentry, inode);
470 125
471out: 126out:
472 if (h)
473 sysctl_head_finish(h);
474 sysctl_head_finish(head); 127 sysctl_head_finish(head);
475 return err; 128 return err;
476} 129}
@@ -492,7 +145,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
492 * and won't be until we finish. 145 * and won't be until we finish.
493 */ 146 */
494 error = -EPERM; 147 error = -EPERM;
495 if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) 148 if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
496 goto out; 149 goto out;
497 150
498 /* if that can happen at all, it should be -EINVAL, not -EISDIR */ 151 /* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -523,54 +176,6 @@ static ssize_t proc_sys_write(struct file *filp, const char __user *buf,
523 return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); 176 return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1);
524} 177}
525 178
526static int proc_sys_open(struct inode *inode, struct file *filp)
527{
528 struct ctl_table_header *head = grab_header(inode);
529 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
530
531 /* sysctl was unregistered */
532 if (IS_ERR(head))
533 return PTR_ERR(head);
534
535 if (table->poll)
536 filp->private_data = proc_sys_poll_event(table->poll);
537
538 sysctl_head_finish(head);
539
540 return 0;
541}
542
543static unsigned int proc_sys_poll(struct file *filp, poll_table *wait)
544{
545 struct inode *inode = filp->f_path.dentry->d_inode;
546 struct ctl_table_header *head = grab_header(inode);
547 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
548 unsigned int ret = DEFAULT_POLLMASK;
549 unsigned long event;
550
551 /* sysctl was unregistered */
552 if (IS_ERR(head))
553 return POLLERR | POLLHUP;
554
555 if (!table->proc_handler)
556 goto out;
557
558 if (!table->poll)
559 goto out;
560
561 event = (unsigned long)filp->private_data;
562 poll_wait(filp, &table->poll->wait, wait);
563
564 if (event != atomic_read(&table->poll->event)) {
565 filp->private_data = proc_sys_poll_event(table->poll);
566 ret = POLLIN | POLLRDNORM | POLLERR | POLLPRI;
567 }
568
569out:
570 sysctl_head_finish(head);
571
572 return ret;
573}
574 179
575static int proc_sys_fill_cache(struct file *filp, void *dirent, 180static int proc_sys_fill_cache(struct file *filp, void *dirent,
576 filldir_t filldir, 181 filldir_t filldir,
@@ -610,45 +215,28 @@ static int proc_sys_fill_cache(struct file *filp, void *dirent,
610 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type); 215 return !!filldir(dirent, qname.name, qname.len, filp->f_pos, ino, type);
611} 216}
612 217
613static int proc_sys_link_fill_cache(struct file *filp, void *dirent,
614 filldir_t filldir,
615 struct ctl_table_header *head,
616 struct ctl_table *table)
617{
618 int err, ret = 0;
619 head = sysctl_head_grab(head);
620
621 if (S_ISLNK(table->mode)) {
 622 /* It is not an error if we cannot follow the link; ignore it */
623 err = sysctl_follow_link(&head, &table, current->nsproxy);
624 if (err)
625 goto out;
626 }
627
628 ret = proc_sys_fill_cache(filp, dirent, filldir, head, table);
629out:
630 sysctl_head_finish(head);
631 return ret;
632}
633
634static int scan(struct ctl_table_header *head, ctl_table *table, 218static int scan(struct ctl_table_header *head, ctl_table *table,
635 unsigned long *pos, struct file *file, 219 unsigned long *pos, struct file *file,
636 void *dirent, filldir_t filldir) 220 void *dirent, filldir_t filldir)
637{ 221{
638 int res;
639 222
640 if ((*pos)++ < file->f_pos) 223 for (; table->procname; table++, (*pos)++) {
641 return 0; 224 int res;
642 225
643 if (unlikely(S_ISLNK(table->mode))) 226 /* Can't do anything without a proc name */
644 res = proc_sys_link_fill_cache(file, dirent, filldir, head, table); 227 if (!table->procname)
645 else 228 continue;
646 res = proc_sys_fill_cache(file, dirent, filldir, head, table);
647 229
648 if (res == 0) 230 if (*pos < file->f_pos)
649 file->f_pos = *pos; 231 continue;
232
233 res = proc_sys_fill_cache(file, dirent, filldir, head, table);
234 if (res)
235 return res;
650 236
651 return res; 237 file->f_pos = *pos + 1;
238 }
239 return 0;
652} 240}
653 241
654static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir) 242static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -656,16 +244,20 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
656 struct dentry *dentry = filp->f_path.dentry; 244 struct dentry *dentry = filp->f_path.dentry;
657 struct inode *inode = dentry->d_inode; 245 struct inode *inode = dentry->d_inode;
658 struct ctl_table_header *head = grab_header(inode); 246 struct ctl_table_header *head = grab_header(inode);
247 struct ctl_table *table = PROC_I(inode)->sysctl_entry;
659 struct ctl_table_header *h = NULL; 248 struct ctl_table_header *h = NULL;
660 struct ctl_table *entry;
661 struct ctl_dir *ctl_dir;
662 unsigned long pos; 249 unsigned long pos;
663 int ret = -EINVAL; 250 int ret = -EINVAL;
664 251
665 if (IS_ERR(head)) 252 if (IS_ERR(head))
666 return PTR_ERR(head); 253 return PTR_ERR(head);
667 254
668 ctl_dir = container_of(head, struct ctl_dir, header); 255 if (table && !table->child) {
256 WARN_ON(1);
257 goto out;
258 }
259
260 table = table ? table->child : head->ctl_table;
669 261
670 ret = 0; 262 ret = 0;
671 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */ 263 /* Avoid a switch here: arm builds fail with missing __cmpdi2 */
@@ -683,8 +275,14 @@ static int proc_sys_readdir(struct file *filp, void *dirent, filldir_t filldir)
683 } 275 }
684 pos = 2; 276 pos = 2;
685 277
686 for (first_entry(ctl_dir, &h, &entry); h; next_entry(&h, &entry)) { 278 ret = scan(head, table, &pos, filp, dirent, filldir);
687 ret = scan(h, entry, &pos, filp, dirent, filldir); 279 if (ret)
280 goto out;
281
282 for (h = sysctl_head_next(NULL); h; h = sysctl_head_next(h)) {
283 if (h->attached_to != table)
284 continue;
285 ret = scan(h, h->attached_by, &pos, filp, dirent, filldir);
688 if (ret) { 286 if (ret) {
689 sysctl_head_finish(h); 287 sysctl_head_finish(h);
690 break; 288 break;
@@ -718,7 +316,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
718 if (!table) /* global root - r-xr-xr-x */ 316 if (!table) /* global root - r-xr-xr-x */
719 error = mask & MAY_WRITE ? -EACCES : 0; 317 error = mask & MAY_WRITE ? -EACCES : 0;
720 else /* Use the permissions on the sysctl table entry */ 318 else /* Use the permissions on the sysctl table entry */
721 error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); 319 error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK);
722 320
723 sysctl_head_finish(head); 321 sysctl_head_finish(head);
724 return error; 322 return error;
@@ -736,6 +334,13 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr)
736 if (error) 334 if (error)
737 return error; 335 return error;
738 336
337 if ((attr->ia_valid & ATTR_SIZE) &&
338 attr->ia_size != i_size_read(inode)) {
339 error = vmtruncate(inode, attr->ia_size);
340 if (error)
341 return error;
342 }
343
739 setattr_copy(inode, attr); 344 setattr_copy(inode, attr);
740 mark_inode_dirty(inode); 345 mark_inode_dirty(inode);
741 return 0; 346 return 0;
@@ -759,15 +364,12 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
759} 364}
760 365
761static const struct file_operations proc_sys_file_operations = { 366static const struct file_operations proc_sys_file_operations = {
762 .open = proc_sys_open,
763 .poll = proc_sys_poll,
764 .read = proc_sys_read, 367 .read = proc_sys_read,
765 .write = proc_sys_write, 368 .write = proc_sys_write,
766 .llseek = default_llseek, 369 .llseek = default_llseek,
767}; 370};
768 371
769static const struct file_operations proc_sys_dir_file_operations = { 372static const struct file_operations proc_sys_dir_file_operations = {
770 .read = generic_read_dir,
771 .readdir = proc_sys_readdir, 373 .readdir = proc_sys_readdir,
772 .llseek = generic_file_llseek, 374 .llseek = generic_file_llseek,
773}; 375};
@@ -785,9 +387,9 @@ static const struct inode_operations proc_sys_dir_operations = {
785 .getattr = proc_sys_getattr, 387 .getattr = proc_sys_getattr,
786}; 388};
787 389
788static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) 390static int proc_sys_revalidate(struct dentry *dentry, struct nameidata *nd)
789{ 391{
790 if (flags & LOOKUP_RCU) 392 if (nd->flags & LOOKUP_RCU)
791 return -ECHILD; 393 return -ECHILD;
792 return !PROC_I(dentry->d_inode)->sysctl->unregistering; 394 return !PROC_I(dentry->d_inode)->sysctl->unregistering;
793} 395}
@@ -797,21 +399,6 @@ static int proc_sys_delete(const struct dentry *dentry)
797 return !!PROC_I(dentry->d_inode)->sysctl->unregistering; 399 return !!PROC_I(dentry->d_inode)->sysctl->unregistering;
798} 400}
799 401
800static int sysctl_is_seen(struct ctl_table_header *p)
801{
802 struct ctl_table_set *set = p->set;
803 int res;
804 spin_lock(&sysctl_lock);
805 if (p->unregistering)
806 res = 0;
807 else if (!set->is_seen)
808 res = 1;
809 else
810 res = set->is_seen(set);
811 spin_unlock(&sysctl_lock);
812 return res;
813}
814
815static int proc_sys_compare(const struct dentry *parent, 402static int proc_sys_compare(const struct dentry *parent,
816 const struct inode *pinode, 403 const struct inode *pinode,
817 const struct dentry *dentry, const struct inode *inode, 404 const struct dentry *dentry, const struct inode *inode,
@@ -837,753 +424,6 @@ static const struct dentry_operations proc_sys_dentry_operations = {
837 .d_compare = proc_sys_compare, 424 .d_compare = proc_sys_compare,
838}; 425};
839 426
840static struct ctl_dir *find_subdir(struct ctl_dir *dir,
841 const char *name, int namelen)
842{
843 struct ctl_table_header *head;
844 struct ctl_table *entry;
845
846 entry = find_entry(&head, dir, name, namelen);
847 if (!entry)
848 return ERR_PTR(-ENOENT);
849 if (!S_ISDIR(entry->mode))
850 return ERR_PTR(-ENOTDIR);
851 return container_of(head, struct ctl_dir, header);
852}
853
854static struct ctl_dir *new_dir(struct ctl_table_set *set,
855 const char *name, int namelen)
856{
857 struct ctl_table *table;
858 struct ctl_dir *new;
859 struct ctl_node *node;
860 char *new_name;
861
862 new = kzalloc(sizeof(*new) + sizeof(struct ctl_node) +
863 sizeof(struct ctl_table)*2 + namelen + 1,
864 GFP_KERNEL);
865 if (!new)
866 return NULL;
867
868 node = (struct ctl_node *)(new + 1);
869 table = (struct ctl_table *)(node + 1);
870 new_name = (char *)(table + 2);
871 memcpy(new_name, name, namelen);
872 new_name[namelen] = '\0';
873 table[0].procname = new_name;
874 table[0].mode = S_IFDIR|S_IRUGO|S_IXUGO;
875 init_header(&new->header, set->dir.header.root, set, node, table);
876
877 return new;
878}
879
880/**
881 * get_subdir - find or create a subdir with the specified name.
882 * @dir: Directory to create the subdirectory in
883 * @name: The name of the subdirectory to find or create
884 * @namelen: The length of name
885 *
886 * Takes a directory with an elevated reference count so we know that
887 * if we drop the lock the directory will not go away. Upon success
888 * the reference is moved from @dir to the returned subdirectory.
889 * Upon error an error code is returned and the reference on @dir is
890 * simply dropped.
891 */
892static struct ctl_dir *get_subdir(struct ctl_dir *dir,
893 const char *name, int namelen)
894{
895 struct ctl_table_set *set = dir->header.set;
896 struct ctl_dir *subdir, *new = NULL;
897 int err;
898
899 spin_lock(&sysctl_lock);
900 subdir = find_subdir(dir, name, namelen);
901 if (!IS_ERR(subdir))
902 goto found;
903 if (PTR_ERR(subdir) != -ENOENT)
904 goto failed;
905
906 spin_unlock(&sysctl_lock);
907 new = new_dir(set, name, namelen);
908 spin_lock(&sysctl_lock);
909 subdir = ERR_PTR(-ENOMEM);
910 if (!new)
911 goto failed;
912
913 /* Was the subdir added while we dropped the lock? */
914 subdir = find_subdir(dir, name, namelen);
915 if (!IS_ERR(subdir))
916 goto found;
917 if (PTR_ERR(subdir) != -ENOENT)
918 goto failed;
919
 920 /* Nope. Use our freshly made directory entry. */
921 err = insert_header(dir, &new->header);
922 subdir = ERR_PTR(err);
923 if (err)
924 goto failed;
925 subdir = new;
926found:
927 subdir->header.nreg++;
928failed:
929 if (unlikely(IS_ERR(subdir))) {
930 printk(KERN_ERR "sysctl could not get directory: ");
931 sysctl_print_dir(dir);
932 printk(KERN_CONT "/%*.*s %ld\n",
933 namelen, namelen, name, PTR_ERR(subdir));
934 }
935 drop_sysctl_table(&dir->header);
936 if (new)
937 drop_sysctl_table(&new->header);
938 spin_unlock(&sysctl_lock);
939 return subdir;
940}
941
942static struct ctl_dir *xlate_dir(struct ctl_table_set *set, struct ctl_dir *dir)
943{
944 struct ctl_dir *parent;
945 const char *procname;
946 if (!dir->header.parent)
947 return &set->dir;
948 parent = xlate_dir(set, dir->header.parent);
949 if (IS_ERR(parent))
950 return parent;
951 procname = dir->header.ctl_table[0].procname;
952 return find_subdir(parent, procname, strlen(procname));
953}
954
955static int sysctl_follow_link(struct ctl_table_header **phead,
956 struct ctl_table **pentry, struct nsproxy *namespaces)
957{
958 struct ctl_table_header *head;
959 struct ctl_table_root *root;
960 struct ctl_table_set *set;
961 struct ctl_table *entry;
962 struct ctl_dir *dir;
963 int ret;
964
965 ret = 0;
966 spin_lock(&sysctl_lock);
967 root = (*pentry)->data;
968 set = lookup_header_set(root, namespaces);
969 dir = xlate_dir(set, (*phead)->parent);
970 if (IS_ERR(dir))
971 ret = PTR_ERR(dir);
972 else {
973 const char *procname = (*pentry)->procname;
974 head = NULL;
975 entry = find_entry(&head, dir, procname, strlen(procname));
976 ret = -ENOENT;
977 if (entry && use_table(head)) {
978 unuse_table(*phead);
979 *phead = head;
980 *pentry = entry;
981 ret = 0;
982 }
983 }
984
985 spin_unlock(&sysctl_lock);
986 return ret;
987}
988
989static int sysctl_err(const char *path, struct ctl_table *table, char *fmt, ...)
990{
991 struct va_format vaf;
992 va_list args;
993
994 va_start(args, fmt);
995 vaf.fmt = fmt;
996 vaf.va = &args;
997
998 printk(KERN_ERR "sysctl table check failed: %s/%s %pV\n",
999 path, table->procname, &vaf);
1000
1001 va_end(args);
1002 return -EINVAL;
1003}
1004
1005static int sysctl_check_table(const char *path, struct ctl_table *table)
1006{
1007 int err = 0;
1008 for (; table->procname; table++) {
1009 if (table->child)
1010 err = sysctl_err(path, table, "Not a file");
1011
1012 if ((table->proc_handler == proc_dostring) ||
1013 (table->proc_handler == proc_dointvec) ||
1014 (table->proc_handler == proc_dointvec_minmax) ||
1015 (table->proc_handler == proc_dointvec_jiffies) ||
1016 (table->proc_handler == proc_dointvec_userhz_jiffies) ||
1017 (table->proc_handler == proc_dointvec_ms_jiffies) ||
1018 (table->proc_handler == proc_doulongvec_minmax) ||
1019 (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
1020 if (!table->data)
1021 err = sysctl_err(path, table, "No data");
1022 if (!table->maxlen)
1023 err = sysctl_err(path, table, "No maxlen");
1024 }
1025 if (!table->proc_handler)
1026 err = sysctl_err(path, table, "No proc_handler");
1027
1028 if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
1029 err = sysctl_err(path, table, "bogus .mode 0%o",
1030 table->mode);
1031 }
1032 return err;
1033}
1034
1035static struct ctl_table_header *new_links(struct ctl_dir *dir, struct ctl_table *table,
1036 struct ctl_table_root *link_root)
1037{
1038 struct ctl_table *link_table, *entry, *link;
1039 struct ctl_table_header *links;
1040 struct ctl_node *node;
1041 char *link_name;
1042 int nr_entries, name_bytes;
1043
1044 name_bytes = 0;
1045 nr_entries = 0;
1046 for (entry = table; entry->procname; entry++) {
1047 nr_entries++;
1048 name_bytes += strlen(entry->procname) + 1;
1049 }
1050
1051 links = kzalloc(sizeof(struct ctl_table_header) +
1052 sizeof(struct ctl_node)*nr_entries +
1053 sizeof(struct ctl_table)*(nr_entries + 1) +
1054 name_bytes,
1055 GFP_KERNEL);
1056
1057 if (!links)
1058 return NULL;
1059
1060 node = (struct ctl_node *)(links + 1);
1061 link_table = (struct ctl_table *)(node + nr_entries);
1062 link_name = (char *)&link_table[nr_entries + 1];
1063
1064 for (link = link_table, entry = table; entry->procname; link++, entry++) {
1065 int len = strlen(entry->procname) + 1;
1066 memcpy(link_name, entry->procname, len);
1067 link->procname = link_name;
1068 link->mode = S_IFLNK|S_IRWXUGO;
1069 link->data = link_root;
1070 link_name += len;
1071 }
1072 init_header(links, dir->header.root, dir->header.set, node, link_table);
1073 links->nreg = nr_entries;
1074
1075 return links;
1076}
1077
1078static bool get_links(struct ctl_dir *dir,
1079 struct ctl_table *table, struct ctl_table_root *link_root)
1080{
1081 struct ctl_table_header *head;
1082 struct ctl_table *entry, *link;
1083
1084 /* Are there links available for every entry in table? */
1085 for (entry = table; entry->procname; entry++) {
1086 const char *procname = entry->procname;
1087 link = find_entry(&head, dir, procname, strlen(procname));
1088 if (!link)
1089 return false;
1090 if (S_ISDIR(link->mode) && S_ISDIR(entry->mode))
1091 continue;
1092 if (S_ISLNK(link->mode) && (link->data == link_root))
1093 continue;
1094 return false;
1095 }
1096
1097 /* The checks passed. Increase the registration count on the links */
1098 for (entry = table; entry->procname; entry++) {
1099 const char *procname = entry->procname;
1100 link = find_entry(&head, dir, procname, strlen(procname));
1101 head->nreg++;
1102 }
1103 return true;
1104}
1105
1106static int insert_links(struct ctl_table_header *head)
1107{
1108 struct ctl_table_set *root_set = &sysctl_table_root.default_set;
1109 struct ctl_dir *core_parent = NULL;
1110 struct ctl_table_header *links;
1111 int err;
1112
1113 if (head->set == root_set)
1114 return 0;
1115
1116 core_parent = xlate_dir(root_set, head->parent);
1117 if (IS_ERR(core_parent))
1118 return 0;
1119
1120 if (get_links(core_parent, head->ctl_table, head->root))
1121 return 0;
1122
1123 core_parent->header.nreg++;
1124 spin_unlock(&sysctl_lock);
1125
1126 links = new_links(core_parent, head->ctl_table, head->root);
1127
1128 spin_lock(&sysctl_lock);
1129 err = -ENOMEM;
1130 if (!links)
1131 goto out;
1132
1133 err = 0;
1134 if (get_links(core_parent, head->ctl_table, head->root)) {
1135 kfree(links);
1136 goto out;
1137 }
1138
1139 err = insert_header(core_parent, links);
1140 if (err)
1141 kfree(links);
1142out:
1143 drop_sysctl_table(&core_parent->header);
1144 return err;
1145}
1146
1147/**
1148 * __register_sysctl_table - register a leaf sysctl table
1149 * @set: Sysctl tree to register on
1150 * @path: The path to the directory the sysctl table is in.
1151 * @table: the top-level table structure
1152 *
1153 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1154 * array. A completely 0 filled entry terminates the table.
1155 *
1156 * The members of the &struct ctl_table structure are used as follows:
1157 *
1158 * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
1159 * enter a sysctl file
1160 *
1161 * data - a pointer to data for use by proc_handler
1162 *
1163 * maxlen - the maximum size in bytes of the data
1164 *
1165 * mode - the file permissions for the /proc/sys file
1166 *
1167 * child - must be %NULL.
1168 *
1169 * proc_handler - the text handler routine (described below)
1170 *
1171 * extra1, extra2 - extra pointers usable by the proc handler routines
1172 *
1173 * Leaf nodes in the sysctl tree will be represented by a single file
1174 * under /proc; non-leaf nodes will be represented by directories.
1175 *
1176 * There must be a proc_handler routine for any terminal nodes.
1177 * Several default handlers are available to cover common cases -
1178 *
1179 * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
1180 * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
1181 * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
1182 *
1183 * It is the handler's job to read the input buffer from user memory
1184 * and process it. The handler should return 0 on success.
1185 *
1186 * This routine returns %NULL on a failure to register, and a pointer
1187 * to the table header on success.
1188 */
1189struct ctl_table_header *__register_sysctl_table(
1190 struct ctl_table_set *set,
1191 const char *path, struct ctl_table *table)
1192{
1193 struct ctl_table_root *root = set->dir.header.root;
1194 struct ctl_table_header *header;
1195 const char *name, *nextname;
1196 struct ctl_dir *dir;
1197 struct ctl_table *entry;
1198 struct ctl_node *node;
1199 int nr_entries = 0;
1200
1201 for (entry = table; entry->procname; entry++)
1202 nr_entries++;
1203
1204 header = kzalloc(sizeof(struct ctl_table_header) +
1205 sizeof(struct ctl_node)*nr_entries, GFP_KERNEL);
1206 if (!header)
1207 return NULL;
1208
1209 node = (struct ctl_node *)(header + 1);
1210 init_header(header, root, set, node, table);
1211 if (sysctl_check_table(path, table))
1212 goto fail;
1213
1214 spin_lock(&sysctl_lock);
1215 dir = &set->dir;
1216	/* Reference moved down the directory tree by get_subdir */
1217 dir->header.nreg++;
1218 spin_unlock(&sysctl_lock);
1219
1220 /* Find the directory for the ctl_table */
1221 for (name = path; name; name = nextname) {
1222 int namelen;
1223 nextname = strchr(name, '/');
1224 if (nextname) {
1225 namelen = nextname - name;
1226 nextname++;
1227 } else {
1228 namelen = strlen(name);
1229 }
1230 if (namelen == 0)
1231 continue;
1232
1233 dir = get_subdir(dir, name, namelen);
1234 if (IS_ERR(dir))
1235 goto fail;
1236 }
1237
1238 spin_lock(&sysctl_lock);
1239 if (insert_header(dir, header))
1240 goto fail_put_dir_locked;
1241
1242 drop_sysctl_table(&dir->header);
1243 spin_unlock(&sysctl_lock);
1244
1245 return header;
1246
1247fail_put_dir_locked:
1248 drop_sysctl_table(&dir->header);
1249 spin_unlock(&sysctl_lock);
1250fail:
1251 kfree(header);
1252 dump_stack();
1253 return NULL;
1254}
1255
1256/**
1257 * register_sysctl - register a sysctl table
1258 * @path: The path to the directory the sysctl table is in.
1259 * @table: the table structure
1260 *
1261 * Register a sysctl table. @table should be a filled in ctl_table
1262 * array. A completely 0 filled entry terminates the table.
1263 *
1264 * See __register_sysctl_table for more details.
1265 */
1266struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *table)
1267{
1268 return __register_sysctl_table(&sysctl_table_root.default_set,
1269 path, table);
1270}
1271EXPORT_SYMBOL(register_sysctl);
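/*
 * Minimal usage sketch for the register_sysctl() interface documented
 * above.  Illustrative only: the names example_value, example_table and
 * the "kernel/example" path are hypothetical, not taken from this file;
 * assumes the usual <linux/sysctl.h> declarations and module init/exit
 * context.
 */
static int example_value;

static struct ctl_table example_table[] = {
	{
		.procname	= "example",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }	/* a completely zero-filled entry terminates the table */
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl("kernel/example", example_table);
	if (!example_header)
		return -ENOMEM;
	return 0;
}

static void __exit example_sysctl_exit(void)
{
	/* Drops the registration; proc entries disappear once unused. */
	unregister_sysctl_table(example_header);
}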
1272
1273static char *append_path(const char *path, char *pos, const char *name)
1274{
1275 int namelen;
1276 namelen = strlen(name);
1277 if (((pos - path) + namelen + 2) >= PATH_MAX)
1278 return NULL;
1279 memcpy(pos, name, namelen);
1280 pos[namelen] = '/';
1281 pos[namelen + 1] = '\0';
1282 pos += namelen + 1;
1283 return pos;
1284}
1285
1286static int count_subheaders(struct ctl_table *table)
1287{
1288 int has_files = 0;
1289 int nr_subheaders = 0;
1290 struct ctl_table *entry;
1291
1292 /* special case: no directory and empty directory */
1293 if (!table || !table->procname)
1294 return 1;
1295
1296 for (entry = table; entry->procname; entry++) {
1297 if (entry->child)
1298 nr_subheaders += count_subheaders(entry->child);
1299 else
1300 has_files = 1;
1301 }
1302 return nr_subheaders + has_files;
1303}
1304
1305static int register_leaf_sysctl_tables(const char *path, char *pos,
1306 struct ctl_table_header ***subheader, struct ctl_table_set *set,
1307 struct ctl_table *table)
1308{
1309 struct ctl_table *ctl_table_arg = NULL;
1310 struct ctl_table *entry, *files;
1311 int nr_files = 0;
1312 int nr_dirs = 0;
1313 int err = -ENOMEM;
1314
1315 for (entry = table; entry->procname; entry++) {
1316 if (entry->child)
1317 nr_dirs++;
1318 else
1319 nr_files++;
1320 }
1321
1322 files = table;
1323 /* If there are mixed files and directories we need a new table */
1324 if (nr_dirs && nr_files) {
1325 struct ctl_table *new;
1326 files = kzalloc(sizeof(struct ctl_table) * (nr_files + 1),
1327 GFP_KERNEL);
1328 if (!files)
1329 goto out;
1330
1331 ctl_table_arg = files;
1332 for (new = files, entry = table; entry->procname; entry++) {
1333 if (entry->child)
1334 continue;
1335 *new = *entry;
1336 new++;
1337 }
1338 }
1339
1340 /* Register everything except a directory full of subdirectories */
1341 if (nr_files || !nr_dirs) {
1342 struct ctl_table_header *header;
1343 header = __register_sysctl_table(set, path, files);
1344 if (!header) {
1345 kfree(ctl_table_arg);
1346 goto out;
1347 }
1348
1349 /* Remember if we need to free the file table */
1350 header->ctl_table_arg = ctl_table_arg;
1351 **subheader = header;
1352 (*subheader)++;
1353 }
1354
1355 /* Recurse into the subdirectories. */
1356 for (entry = table; entry->procname; entry++) {
1357 char *child_pos;
1358
1359 if (!entry->child)
1360 continue;
1361
1362 err = -ENAMETOOLONG;
1363 child_pos = append_path(path, pos, entry->procname);
1364 if (!child_pos)
1365 goto out;
1366
1367 err = register_leaf_sysctl_tables(path, child_pos, subheader,
1368 set, entry->child);
1369 pos[0] = '\0';
1370 if (err)
1371 goto out;
1372 }
1373 err = 0;
1374out:
1375 /* On failure our caller will unregister all registered subheaders */
1376 return err;
1377}
1378
1379/**
1380 * __register_sysctl_paths - register a sysctl table hierarchy
1381 * @set: Sysctl tree to register on
1382 * @path: The path to the directory the sysctl table is in.
1383 * @table: the top-level table structure
1384 *
1385 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1386 * array. A completely 0 filled entry terminates the table.
1387 *
1388 * See __register_sysctl_table for more details.
1389 */
1390struct ctl_table_header *__register_sysctl_paths(
1391 struct ctl_table_set *set,
1392 const struct ctl_path *path, struct ctl_table *table)
1393{
1394 struct ctl_table *ctl_table_arg = table;
1395 int nr_subheaders = count_subheaders(table);
1396 struct ctl_table_header *header = NULL, **subheaders, **subheader;
1397 const struct ctl_path *component;
1398 char *new_path, *pos;
1399
1400 pos = new_path = kmalloc(PATH_MAX, GFP_KERNEL);
1401 if (!new_path)
1402 return NULL;
1403
1404 pos[0] = '\0';
1405 for (component = path; component->procname; component++) {
1406 pos = append_path(new_path, pos, component->procname);
1407 if (!pos)
1408 goto out;
1409 }
1410 while (table->procname && table->child && !table[1].procname) {
1411 pos = append_path(new_path, pos, table->procname);
1412 if (!pos)
1413 goto out;
1414 table = table->child;
1415 }
1416 if (nr_subheaders == 1) {
1417 header = __register_sysctl_table(set, new_path, table);
1418 if (header)
1419 header->ctl_table_arg = ctl_table_arg;
1420 } else {
1421 header = kzalloc(sizeof(*header) +
1422 sizeof(*subheaders)*nr_subheaders, GFP_KERNEL);
1423 if (!header)
1424 goto out;
1425
1426 subheaders = (struct ctl_table_header **) (header + 1);
1427 subheader = subheaders;
1428 header->ctl_table_arg = ctl_table_arg;
1429
1430 if (register_leaf_sysctl_tables(new_path, pos, &subheader,
1431 set, table))
1432 goto err_register_leaves;
1433 }
1434
1435out:
1436 kfree(new_path);
1437 return header;
1438
1439err_register_leaves:
1440 while (subheader > subheaders) {
1441 struct ctl_table_header *subh = *(--subheader);
1442 struct ctl_table *table = subh->ctl_table_arg;
1443 unregister_sysctl_table(subh);
1444 kfree(table);
1445 }
1446 kfree(header);
1447 header = NULL;
1448 goto out;
1449}
1450
1451/**
1452 * register_sysctl_paths - register a sysctl table hierarchy
1453 * @path: The path to the directory the sysctl table is in.
1454 * @table: the top-level table structure
1455 *
1456 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1457 * array. A completely 0 filled entry terminates the table.
1458 *
1459 * See __register_sysctl_paths for more details.
1460 */
1461struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1462 struct ctl_table *table)
1463{
1464 return __register_sysctl_paths(&sysctl_table_root.default_set,
1465 path, table);
1466}
1467EXPORT_SYMBOL(register_sysctl_paths);
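/*
 * Equivalent sketch for the ctl_path based variant: the directory path is
 * supplied as an array of components rather than a string.  Hypothetical
 * names again; reuses example_table from the sketch above.
 */
static const struct ctl_path example_path[] = {
	{ .procname = "kernel" },
	{ .procname = "example" },
	{ }
};

static struct ctl_table_header *example_paths_header;

static int __init example_paths_init(void)
{
	example_paths_header = register_sysctl_paths(example_path, example_table);
	return example_paths_header ? 0 : -ENOMEM;
}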
1468
1469/**
1470 * register_sysctl_table - register a sysctl table hierarchy
1471 * @table: the top-level table structure
1472 *
1473 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1474 * array. A completely 0 filled entry terminates the table.
1475 *
1476 * See register_sysctl_paths for more details.
1477 */
1478struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1479{
1480 static const struct ctl_path null_path[] = { {} };
1481
1482 return register_sysctl_paths(null_path, table);
1483}
1484EXPORT_SYMBOL(register_sysctl_table);
1485
1486static void put_links(struct ctl_table_header *header)
1487{
1488 struct ctl_table_set *root_set = &sysctl_table_root.default_set;
1489 struct ctl_table_root *root = header->root;
1490 struct ctl_dir *parent = header->parent;
1491 struct ctl_dir *core_parent;
1492 struct ctl_table *entry;
1493
1494 if (header->set == root_set)
1495 return;
1496
1497 core_parent = xlate_dir(root_set, parent);
1498 if (IS_ERR(core_parent))
1499 return;
1500
1501 for (entry = header->ctl_table; entry->procname; entry++) {
1502 struct ctl_table_header *link_head;
1503 struct ctl_table *link;
1504 const char *name = entry->procname;
1505
1506 link = find_entry(&link_head, core_parent, name, strlen(name));
1507 if (link &&
1508 ((S_ISDIR(link->mode) && S_ISDIR(entry->mode)) ||
1509 (S_ISLNK(link->mode) && (link->data == root)))) {
1510 drop_sysctl_table(link_head);
1511 }
1512 else {
1513 printk(KERN_ERR "sysctl link missing during unregister: ");
1514 sysctl_print_dir(parent);
1515 printk(KERN_CONT "/%s\n", name);
1516 }
1517 }
1518}
1519
1520static void drop_sysctl_table(struct ctl_table_header *header)
1521{
1522 struct ctl_dir *parent = header->parent;
1523
1524 if (--header->nreg)
1525 return;
1526
1527 put_links(header);
1528 start_unregistering(header);
1529 if (!--header->count)
1530 kfree_rcu(header, rcu);
1531
1532 if (parent)
1533 drop_sysctl_table(&parent->header);
1534}
1535
1536/**
1537 * unregister_sysctl_table - unregister a sysctl table hierarchy
1538 * @header: the header returned from register_sysctl_table
1539 *
1540 * Unregisters the sysctl table and all children. proc entries may not
1541 * actually be removed until they are no longer used by anyone.
1542 */
1543void unregister_sysctl_table(struct ctl_table_header * header)
1544{
1545 int nr_subheaders;
1546 might_sleep();
1547
1548 if (header == NULL)
1549 return;
1550
1551 nr_subheaders = count_subheaders(header->ctl_table_arg);
1552 if (unlikely(nr_subheaders > 1)) {
1553 struct ctl_table_header **subheaders;
1554 int i;
1555
1556 subheaders = (struct ctl_table_header **)(header + 1);
1557 for (i = nr_subheaders -1; i >= 0; i--) {
1558 struct ctl_table_header *subh = subheaders[i];
1559 struct ctl_table *table = subh->ctl_table_arg;
1560 unregister_sysctl_table(subh);
1561 kfree(table);
1562 }
1563 kfree(header);
1564 return;
1565 }
1566
1567 spin_lock(&sysctl_lock);
1568 drop_sysctl_table(header);
1569 spin_unlock(&sysctl_lock);
1570}
1571EXPORT_SYMBOL(unregister_sysctl_table);
1572
1573void setup_sysctl_set(struct ctl_table_set *set,
1574 struct ctl_table_root *root,
1575 int (*is_seen)(struct ctl_table_set *))
1576{
1577 memset(set, 0, sizeof(*set));
1578 set->is_seen = is_seen;
1579 init_header(&set->dir.header, root, set, NULL, root_table);
1580}
1581
1582void retire_sysctl_set(struct ctl_table_set *set)
1583{
1584 WARN_ON(!RB_EMPTY_ROOT(&set->dir.root));
1585}
1586
1587int __init proc_sys_init(void) 427int __init proc_sys_init(void)
1588{ 428{
1589 struct proc_dir_entry *proc_sys_root; 429 struct proc_dir_entry *proc_sys_root;
@@ -1592,6 +432,5 @@ int __init proc_sys_init(void)
1592 proc_sys_root->proc_iops = &proc_sys_dir_operations; 432 proc_sys_root->proc_iops = &proc_sys_dir_operations;
1593 proc_sys_root->proc_fops = &proc_sys_dir_file_operations; 433 proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
1594 proc_sys_root->nlink = 0; 434 proc_sys_root->nlink = 0;
1595 435 return 0;
1596 return sysctl_init();
1597} 436}
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c6e9fac26ba..9a8a2b77b87 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -18,7 +18,6 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pid_namespace.h> 20#include <linux/pid_namespace.h>
21#include <linux/parser.h>
22 21
23#include "internal.h" 22#include "internal.h"
24 23
@@ -37,89 +36,25 @@ static int proc_set_super(struct super_block *sb, void *data)
37 return err; 36 return err;
38} 37}
39 38
40enum {
41 Opt_gid, Opt_hidepid, Opt_err,
42};
43
44static const match_table_t tokens = {
45 {Opt_hidepid, "hidepid=%u"},
46 {Opt_gid, "gid=%u"},
47 {Opt_err, NULL},
48};
49
50static int proc_parse_options(char *options, struct pid_namespace *pid)
51{
52 char *p;
53 substring_t args[MAX_OPT_ARGS];
54 int option;
55
56 if (!options)
57 return 1;
58
59 while ((p = strsep(&options, ",")) != NULL) {
60 int token;
61 if (!*p)
62 continue;
63
64 args[0].to = args[0].from = NULL;
65 token = match_token(p, tokens, args);
66 switch (token) {
67 case Opt_gid:
68 if (match_int(&args[0], &option))
69 return 0;
70 pid->pid_gid = make_kgid(current_user_ns(), option);
71 break;
72 case Opt_hidepid:
73 if (match_int(&args[0], &option))
74 return 0;
75 if (option < 0 || option > 2) {
76 pr_err("proc: hidepid value must be between 0 and 2.\n");
77 return 0;
78 }
79 pid->hide_pid = option;
80 break;
81 default:
82 pr_err("proc: unrecognized mount option \"%s\" "
83 "or missing value\n", p);
84 return 0;
85 }
86 }
87
88 return 1;
89}
90
91int proc_remount(struct super_block *sb, int *flags, char *data)
92{
93 struct pid_namespace *pid = sb->s_fs_info;
94 return !proc_parse_options(data, pid);
95}
96
97static struct dentry *proc_mount(struct file_system_type *fs_type, 39static struct dentry *proc_mount(struct file_system_type *fs_type,
98 int flags, const char *dev_name, void *data) 40 int flags, const char *dev_name, void *data)
99{ 41{
100 int err; 42 int err;
101 struct super_block *sb; 43 struct super_block *sb;
102 struct pid_namespace *ns; 44 struct pid_namespace *ns;
103 char *options; 45 struct proc_inode *ei;
104 46
105 if (flags & MS_KERNMOUNT) { 47 if (flags & MS_KERNMOUNT)
106 ns = (struct pid_namespace *)data; 48 ns = (struct pid_namespace *)data;
107 options = NULL; 49 else
108 } else { 50 ns = current->nsproxy->pid_ns;
109 ns = task_active_pid_ns(current);
110 options = data;
111 }
112 51
113 sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); 52 sb = sget(fs_type, proc_test_super, proc_set_super, ns);
114 if (IS_ERR(sb)) 53 if (IS_ERR(sb))
115 return ERR_CAST(sb); 54 return ERR_CAST(sb);
116 55
117 if (!proc_parse_options(options, ns)) {
118 deactivate_locked_super(sb);
119 return ERR_PTR(-EINVAL);
120 }
121
122 if (!sb->s_root) { 56 if (!sb->s_root) {
57 sb->s_flags = flags;
123 err = proc_fill_super(sb); 58 err = proc_fill_super(sb);
124 if (err) { 59 if (err) {
125 deactivate_locked_super(sb); 60 deactivate_locked_super(sb);
@@ -129,6 +64,13 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
129 sb->s_flags |= MS_ACTIVE; 64 sb->s_flags |= MS_ACTIVE;
130 } 65 }
131 66
67 ei = PROC_I(sb->s_root->d_inode);
68 if (!ei->pid) {
69 rcu_read_lock();
70 ei->pid = get_pid(find_pid_ns(1, ns));
71 rcu_read_unlock();
72 }
73
132 return dget(sb->s_root); 74 return dget(sb->s_root);
133} 75}
134 76
@@ -145,19 +87,24 @@ static struct file_system_type proc_fs_type = {
145 .name = "proc", 87 .name = "proc",
146 .mount = proc_mount, 88 .mount = proc_mount,
147 .kill_sb = proc_kill_sb, 89 .kill_sb = proc_kill_sb,
148 .fs_flags = FS_USERNS_MOUNT,
149}; 90};
150 91
151void __init proc_root_init(void) 92void __init proc_root_init(void)
152{ 93{
94 struct vfsmount *mnt;
153 int err; 95 int err;
154 96
155 proc_init_inodecache(); 97 proc_init_inodecache();
156 err = register_filesystem(&proc_fs_type); 98 err = register_filesystem(&proc_fs_type);
157 if (err) 99 if (err)
158 return; 100 return;
101 mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
102 if (IS_ERR(mnt)) {
103 unregister_filesystem(&proc_fs_type);
104 return;
105 }
159 106
160 proc_self_init(); 107 init_pid_ns.proc_mnt = mnt;
161 proc_symlink("mounts", NULL, "self/mounts"); 108 proc_symlink("mounts", NULL, "self/mounts");
162 109
163 proc_net_init(); 110 proc_net_init();
@@ -188,12 +135,13 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
188 return 0; 135 return 0;
189} 136}
190 137
191static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) 138static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
192{ 139{
193 if (!proc_lookup(dir, dentry, flags)) 140 if (!proc_lookup(dir, dentry, nd)) {
194 return NULL; 141 return NULL;
142 }
195 143
196 return proc_pid_lookup(dir, dentry, flags); 144 return proc_pid_lookup(dir, dentry, nd);
197} 145}
198 146
199static int proc_root_readdir(struct file * filp, 147static int proc_root_readdir(struct file * filp,
@@ -261,5 +209,5 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
261 209
262void pid_ns_release_proc(struct pid_namespace *ns) 210void pid_ns_release_proc(struct pid_namespace *ns)
263{ 211{
264 kern_unmount(ns->proc_mnt); 212 mntput(ns->proc_mnt);
265} 213}
diff --git a/fs/proc/self.c b/fs/proc/self.c
deleted file mode 100644
index aa5cc3bff14..00000000000
--- a/fs/proc/self.c
+++ /dev/null
@@ -1,59 +0,0 @@
1#include <linux/proc_fs.h>
2#include <linux/sched.h>
3#include <linux/namei.h>
4
5/*
6 * /proc/self:
7 */
8static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
9 int buflen)
10{
11 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
12 pid_t tgid = task_tgid_nr_ns(current, ns);
13 char tmp[PROC_NUMBUF];
14 if (!tgid)
15 return -ENOENT;
16 sprintf(tmp, "%d", tgid);
17 return vfs_readlink(dentry,buffer,buflen,tmp);
18}
19
20static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
21{
22 struct pid_namespace *ns = dentry->d_sb->s_fs_info;
23 pid_t tgid = task_tgid_nr_ns(current, ns);
24 char *name = ERR_PTR(-ENOENT);
25 if (tgid) {
26 /* 11 for max length of signed int in decimal + NULL term */
27 name = kmalloc(12, GFP_KERNEL);
28 if (!name)
29 name = ERR_PTR(-ENOMEM);
30 else
31 sprintf(name, "%d", tgid);
32 }
33 nd_set_link(nd, name);
34 return NULL;
35}
36
37static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
38 void *cookie)
39{
40 char *s = nd_get_link(nd);
41 if (!IS_ERR(s))
42 kfree(s);
43}
44
45static const struct inode_operations proc_self_inode_operations = {
46 .readlink = proc_self_readlink,
47 .follow_link = proc_self_follow_link,
48 .put_link = proc_self_put_link,
49};
50
51void __init proc_self_init(void)
52{
53 struct proc_dir_entry *proc_self_symlink;
54 mode_t mode;
55
56 mode = S_IFLNK | S_IRWXUGO;
57 proc_self_symlink = proc_create("self", mode, NULL, NULL );
58 proc_self_symlink->proc_iops = &proc_self_inode_operations;
59}
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index e296572c73e..4b758ad5c83 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -10,7 +10,6 @@
10#include <linux/time.h> 10#include <linux/time.h>
11#include <linux/irqnr.h> 11#include <linux/irqnr.h>
12#include <asm/cputime.h> 12#include <asm/cputime.h>
13#include <linux/tick.h>
14 13
15#ifndef arch_irq_stat_cpu 14#ifndef arch_irq_stat_cpu
16#define arch_irq_stat_cpu(cpu) 0 15#define arch_irq_stat_cpu(cpu) 0
@@ -18,93 +17,40 @@
18#ifndef arch_irq_stat 17#ifndef arch_irq_stat
19#define arch_irq_stat() 0 18#define arch_irq_stat() 0
20#endif 19#endif
21 20#ifndef arch_idle_time
22#ifdef arch_idle_time 21#define arch_idle_time(cpu) 0
23
24static cputime64_t get_idle_time(int cpu)
25{
26 cputime64_t idle;
27
28 idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
29 if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
30 idle += arch_idle_time(cpu);
31 return idle;
32}
33
34static cputime64_t get_iowait_time(int cpu)
35{
36 cputime64_t iowait;
37
38 iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
39 if (cpu_online(cpu) && nr_iowait_cpu(cpu))
40 iowait += arch_idle_time(cpu);
41 return iowait;
42}
43
44#else
45
46static u64 get_idle_time(int cpu)
47{
48 u64 idle, idle_time = -1ULL;
49
50 if (cpu_online(cpu))
51 idle_time = get_cpu_idle_time_us(cpu, NULL);
52
53 if (idle_time == -1ULL)
54 /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
55 idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
56 else
57 idle = usecs_to_cputime64(idle_time);
58
59 return idle;
60}
61
62static u64 get_iowait_time(int cpu)
63{
64 u64 iowait, iowait_time = -1ULL;
65
66 if (cpu_online(cpu))
67 iowait_time = get_cpu_iowait_time_us(cpu, NULL);
68
69 if (iowait_time == -1ULL)
70 /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */
71 iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
72 else
73 iowait = usecs_to_cputime64(iowait_time);
74
75 return iowait;
76}
77
78#endif 22#endif
79 23
80static int show_stat(struct seq_file *p, void *v) 24static int show_stat(struct seq_file *p, void *v)
81{ 25{
82 int i, j; 26 int i, j;
83 unsigned long jif; 27 unsigned long jif;
84 u64 user, nice, system, idle, iowait, irq, softirq, steal; 28 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
85 u64 guest, guest_nice; 29 cputime64_t guest, guest_nice;
86 u64 sum = 0; 30 u64 sum = 0;
87 u64 sum_softirq = 0; 31 u64 sum_softirq = 0;
88 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; 32 unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
89 struct timespec boottime; 33 struct timespec boottime;
90 34
91 user = nice = system = idle = iowait = 35 user = nice = system = idle = iowait =
92 irq = softirq = steal = 0; 36 irq = softirq = steal = cputime64_zero;
93 guest = guest_nice = 0; 37 guest = guest_nice = cputime64_zero;
94 getboottime(&boottime); 38 getboottime(&boottime);
95 jif = boottime.tv_sec; 39 jif = boottime.tv_sec;
96 40
97 for_each_possible_cpu(i) { 41 for_each_possible_cpu(i) {
98 user += kcpustat_cpu(i).cpustat[CPUTIME_USER]; 42 user = cputime64_add(user, kstat_cpu(i).cpustat.user);
99 nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE]; 43 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
100 system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; 44 system = cputime64_add(system, kstat_cpu(i).cpustat.system);
101 idle += get_idle_time(i); 45 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
102 iowait += get_iowait_time(i); 46 idle = cputime64_add(idle, arch_idle_time(i));
103 irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; 47 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
104 softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; 48 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
105 steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; 49 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
106 guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; 50 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
107 guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; 51 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
52 guest_nice = cputime64_add(guest_nice,
53 kstat_cpu(i).cpustat.guest_nice);
108 sum += kstat_cpu_irqs_sum(i); 54 sum += kstat_cpu_irqs_sum(i);
109 sum += arch_irq_stat_cpu(i); 55 sum += arch_irq_stat_cpu(i);
110 56
@@ -117,49 +63,56 @@ static int show_stat(struct seq_file *p, void *v)
117 } 63 }
118 sum += arch_irq_stat(); 64 sum += arch_irq_stat();
119 65
120 seq_puts(p, "cpu "); 66 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu "
121 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); 67 "%llu\n",
122 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); 68 (unsigned long long)cputime64_to_clock_t(user),
123 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); 69 (unsigned long long)cputime64_to_clock_t(nice),
124 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); 70 (unsigned long long)cputime64_to_clock_t(system),
125 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); 71 (unsigned long long)cputime64_to_clock_t(idle),
126 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); 72 (unsigned long long)cputime64_to_clock_t(iowait),
127 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); 73 (unsigned long long)cputime64_to_clock_t(irq),
128 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); 74 (unsigned long long)cputime64_to_clock_t(softirq),
129 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); 75 (unsigned long long)cputime64_to_clock_t(steal),
130 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); 76 (unsigned long long)cputime64_to_clock_t(guest),
131 seq_putc(p, '\n'); 77 (unsigned long long)cputime64_to_clock_t(guest_nice));
132 78#if defined(CONFIG_REPORT_PRESENT_CPUS)
79 for_each_present_cpu(i) {
80#else
133 for_each_online_cpu(i) { 81 for_each_online_cpu(i) {
82#endif
83
134 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ 84 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
135 user = kcpustat_cpu(i).cpustat[CPUTIME_USER]; 85 user = kstat_cpu(i).cpustat.user;
136 nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE]; 86 nice = kstat_cpu(i).cpustat.nice;
137 system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]; 87 system = kstat_cpu(i).cpustat.system;
138 idle = get_idle_time(i); 88 idle = kstat_cpu(i).cpustat.idle;
139 iowait = get_iowait_time(i); 89 idle = cputime64_add(idle, arch_idle_time(i));
140 irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ]; 90 iowait = kstat_cpu(i).cpustat.iowait;
141 softirq = kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]; 91 irq = kstat_cpu(i).cpustat.irq;
142 steal = kcpustat_cpu(i).cpustat[CPUTIME_STEAL]; 92 softirq = kstat_cpu(i).cpustat.softirq;
143 guest = kcpustat_cpu(i).cpustat[CPUTIME_GUEST]; 93 steal = kstat_cpu(i).cpustat.steal;
144 guest_nice = kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE]; 94 guest = kstat_cpu(i).cpustat.guest;
145 seq_printf(p, "cpu%d", i); 95 guest_nice = kstat_cpu(i).cpustat.guest_nice;
146 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(user)); 96 seq_printf(p,
147 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(nice)); 97 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu "
148 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(system)); 98 "%llu\n",
149 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(idle)); 99 i,
150 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(iowait)); 100 (unsigned long long)cputime64_to_clock_t(user),
151 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(irq)); 101 (unsigned long long)cputime64_to_clock_t(nice),
152 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(softirq)); 102 (unsigned long long)cputime64_to_clock_t(system),
153 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(steal)); 103 (unsigned long long)cputime64_to_clock_t(idle),
154 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest)); 104 (unsigned long long)cputime64_to_clock_t(iowait),
155 seq_put_decimal_ull(p, ' ', cputime64_to_clock_t(guest_nice)); 105 (unsigned long long)cputime64_to_clock_t(irq),
156 seq_putc(p, '\n'); 106 (unsigned long long)cputime64_to_clock_t(softirq),
107 (unsigned long long)cputime64_to_clock_t(steal),
108 (unsigned long long)cputime64_to_clock_t(guest),
109 (unsigned long long)cputime64_to_clock_t(guest_nice));
157 } 110 }
158 seq_printf(p, "intr %llu", (unsigned long long)sum); 111 seq_printf(p, "intr %llu", (unsigned long long)sum);
159 112
160 /* sum again ? it could be updated? */ 113 /* sum again ? it could be updated? */
161 for_each_irq_nr(j) 114 for_each_irq_nr(j)
162 seq_put_decimal_ull(p, ' ', kstat_irqs(j)); 115 seq_printf(p, " %u", kstat_irqs(j));
163 116
164 seq_printf(p, 117 seq_printf(p,
165 "\nctxt %llu\n" 118 "\nctxt %llu\n"
@@ -176,7 +129,7 @@ static int show_stat(struct seq_file *p, void *v)
176 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq); 129 seq_printf(p, "softirq %llu", (unsigned long long)sum_softirq);
177 130
178 for (i = 0; i < NR_SOFTIRQS; i++) 131 for (i = 0; i < NR_SOFTIRQS; i++)
179 seq_put_decimal_ull(p, ' ', per_softirq_sums[i]); 132 seq_printf(p, " %u", per_softirq_sums[i]);
180 seq_putc(p, '\n'); 133 seq_putc(p, '\n');
181 134
182 return 0; 135 return 0;
@@ -184,14 +137,11 @@ static int show_stat(struct seq_file *p, void *v)
184 137
185static int stat_open(struct inode *inode, struct file *file) 138static int stat_open(struct inode *inode, struct file *file)
186{ 139{
187 unsigned size = 1024 + 128 * num_possible_cpus(); 140 unsigned size = 4096 * (1 + num_possible_cpus() / 32);
188 char *buf; 141 char *buf;
189 struct seq_file *m; 142 struct seq_file *m;
190 int res; 143 int res;
191 144
192 /* minimum size to display an interrupt count : 2 bytes */
193 size += 2 * nr_irqs;
194
195 /* don't ask for more than the kmalloc() max size */ 145 /* don't ask for more than the kmalloc() max size */
196 if (size > KMALLOC_MAX_SIZE) 146 if (size > KMALLOC_MAX_SIZE)
197 size = KMALLOC_MAX_SIZE; 147 size = KMALLOC_MAX_SIZE;
@@ -203,7 +153,7 @@ static int stat_open(struct inode *inode, struct file *file)
203 if (!res) { 153 if (!res) {
204 m = file->private_data; 154 m = file->private_data;
205 m->buf = buf; 155 m->buf = buf;
206 m->size = ksize(buf); 156 m->size = size;
207 } else 157 } else
208 kfree(buf); 158 kfree(buf);
209 return res; 159 return res;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca5ce7f9f80..c7d4ee663f1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -44,7 +44,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
44 "VmPeak:\t%8lu kB\n" 44 "VmPeak:\t%8lu kB\n"
45 "VmSize:\t%8lu kB\n" 45 "VmSize:\t%8lu kB\n"
46 "VmLck:\t%8lu kB\n" 46 "VmLck:\t%8lu kB\n"
47 "VmPin:\t%8lu kB\n"
48 "VmHWM:\t%8lu kB\n" 47 "VmHWM:\t%8lu kB\n"
49 "VmRSS:\t%8lu kB\n" 48 "VmRSS:\t%8lu kB\n"
50 "VmData:\t%8lu kB\n" 49 "VmData:\t%8lu kB\n"
@@ -54,9 +53,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmPTE:\t%8lu kB\n" 53 "VmPTE:\t%8lu kB\n"
55 "VmSwap:\t%8lu kB\n", 54 "VmSwap:\t%8lu kB\n",
56 hiwater_vm << (PAGE_SHIFT-10), 55 hiwater_vm << (PAGE_SHIFT-10),
57 total_vm << (PAGE_SHIFT-10), 56 (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
58 mm->locked_vm << (PAGE_SHIFT-10), 57 mm->locked_vm << (PAGE_SHIFT-10),
59 mm->pinned_vm << (PAGE_SHIFT-10),
60 hiwater_rss << (PAGE_SHIFT-10), 58 hiwater_rss << (PAGE_SHIFT-10),
61 total_rss << (PAGE_SHIFT-10), 59 total_rss << (PAGE_SHIFT-10),
62 data << (PAGE_SHIFT-10), 60 data << (PAGE_SHIFT-10),
@@ -90,55 +88,10 @@ static void pad_len_spaces(struct seq_file *m, int len)
90 seq_printf(m, "%*c", len, ' '); 88 seq_printf(m, "%*c", len, ' ');
91} 89}
92 90
93#ifdef CONFIG_NUMA
94/*
95 * These functions are for numa_maps but called in generic **maps seq_file
96 * ->start(), ->stop() ops.
97 *
98 * numa_maps scans all vmas under mmap_sem and checks their mempolicy.
99 * Each mempolicy object is controlled by reference counting. The problem here
100 * is how to avoid accessing dead mempolicy object.
101 *
102 * Because we're holding mmap_sem while reading seq_file, it's safe to access
103 * each vma's mempolicy; no vma will drop its reference to a mempolicy.
104 *
105 * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy
106 * is set and replaced under mmap_sem but unrefed and cleared under task_lock().
107 * So, without task_lock(), we cannot trust get_vma_policy() because we cannot
108 * guarantee the task never exits under us. But taking task_lock() around
109 * get_vma_policy() causes a lock order problem.
110 *
111 * To access task->mempolicy without lock, we hold a reference count of an
112 * object pointed to by task->mempolicy and remember it. This guarantees
113 * that task->mempolicy points to a live object or NULL in numa_maps accesses.
114 */
115static void hold_task_mempolicy(struct proc_maps_private *priv)
116{
117 struct task_struct *task = priv->task;
118
119 task_lock(task);
120 priv->task_mempolicy = task->mempolicy;
121 mpol_get(priv->task_mempolicy);
122 task_unlock(task);
123}
124static void release_task_mempolicy(struct proc_maps_private *priv)
125{
126 mpol_put(priv->task_mempolicy);
127}
128#else
129static void hold_task_mempolicy(struct proc_maps_private *priv)
130{
131}
132static void release_task_mempolicy(struct proc_maps_private *priv)
133{
134}
135#endif
136
137static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) 91static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
138{ 92{
139 if (vma && vma != priv->tail_vma) { 93 if (vma && vma != priv->tail_vma) {
140 struct mm_struct *mm = vma->vm_mm; 94 struct mm_struct *mm = vma->vm_mm;
141 release_task_mempolicy(priv);
142 up_read(&mm->mmap_sem); 95 up_read(&mm->mmap_sem);
143 mmput(mm); 96 mmput(mm);
144 } 97 }
@@ -170,14 +123,14 @@ static void *m_start(struct seq_file *m, loff_t *pos)
170 if (!priv->task) 123 if (!priv->task)
171 return ERR_PTR(-ESRCH); 124 return ERR_PTR(-ESRCH);
172 125
173 mm = mm_access(priv->task, PTRACE_MODE_READ); 126 mm = mm_for_maps(priv->task);
174 if (!mm || IS_ERR(mm)) 127 if (!mm || IS_ERR(mm))
175 return mm; 128 return mm;
176 down_read(&mm->mmap_sem); 129 down_read(&mm->mmap_sem);
177 130
178 tail_vma = get_gate_vma(priv->task->mm); 131 tail_vma = get_gate_vma(priv->task->mm);
179 priv->tail_vma = tail_vma; 132 priv->tail_vma = tail_vma;
180 hold_task_mempolicy(priv); 133
181 /* Start with last addr hint */ 134 /* Start with last addr hint */
182 vma = find_vma(mm, last_addr); 135 vma = find_vma(mm, last_addr);
183 if (last_addr && vma) { 136 if (last_addr && vma) {
@@ -204,7 +157,6 @@ out:
204 if (vma) 157 if (vma)
205 return vma; 158 return vma;
206 159
207 release_task_mempolicy(priv);
208 /* End of vmas has been reached */ 160 /* End of vmas has been reached */
209 m->version = (tail_vma != NULL)? 0: -1UL; 161 m->version = (tail_vma != NULL)? 0: -1UL;
210 up_read(&mm->mmap_sem); 162 up_read(&mm->mmap_sem);
@@ -255,20 +207,16 @@ static int do_maps_open(struct inode *inode, struct file *file,
255 return ret; 207 return ret;
256} 208}
257 209
258static void 210static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
259show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
260{ 211{
261 struct mm_struct *mm = vma->vm_mm; 212 struct mm_struct *mm = vma->vm_mm;
262 struct file *file = vma->vm_file; 213 struct file *file = vma->vm_file;
263 struct proc_maps_private *priv = m->private;
264 struct task_struct *task = priv->task;
265 vm_flags_t flags = vma->vm_flags; 214 vm_flags_t flags = vma->vm_flags;
266 unsigned long ino = 0; 215 unsigned long ino = 0;
267 unsigned long long pgoff = 0; 216 unsigned long long pgoff = 0;
268 unsigned long start, end; 217 unsigned long start, end;
269 dev_t dev = 0; 218 dev_t dev = 0;
270 int len; 219 int len;
271 const char *name = NULL;
272 220
273 if (file) { 221 if (file) {
274 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 222 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
@@ -302,57 +250,36 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
302 if (file) { 250 if (file) {
303 pad_len_spaces(m, len); 251 pad_len_spaces(m, len);
304 seq_path(m, &file->f_path, "\n"); 252 seq_path(m, &file->f_path, "\n");
305 goto done; 253 } else {
306 } 254 const char *name = arch_vma_name(vma);
307 255 if (!name) {
308 name = arch_vma_name(vma); 256 if (mm) {
309 if (!name) { 257 if (vma->vm_start <= mm->brk &&
310 pid_t tid; 258 vma->vm_end >= mm->start_brk) {
311 259 name = "[heap]";
312 if (!mm) { 260 } else if (vma->vm_start <= mm->start_stack &&
313 name = "[vdso]"; 261 vma->vm_end >= mm->start_stack) {
314 goto done; 262 name = "[stack]";
315 } 263 }
316
317 if (vma->vm_start <= mm->brk &&
318 vma->vm_end >= mm->start_brk) {
319 name = "[heap]";
320 goto done;
321 }
322
323 tid = vm_is_stack(task, vma, is_pid);
324
325 if (tid != 0) {
326 /*
327 * Thread stack in /proc/PID/task/TID/maps or
328 * the main process stack.
329 */
330 if (!is_pid || (vma->vm_start <= mm->start_stack &&
331 vma->vm_end >= mm->start_stack)) {
332 name = "[stack]";
333 } else { 264 } else {
334 /* Thread stack in /proc/PID/maps */ 265 name = "[vdso]";
335 pad_len_spaces(m, len);
336 seq_printf(m, "[stack:%d]", tid);
337 } 266 }
338 } 267 }
339 } 268 if (name) {
340 269 pad_len_spaces(m, len);
341done: 270 seq_puts(m, name);
342 if (name) { 271 }
343 pad_len_spaces(m, len);
344 seq_puts(m, name);
345 } 272 }
346 seq_putc(m, '\n'); 273 seq_putc(m, '\n');
347} 274}
348 275
349static int show_map(struct seq_file *m, void *v, int is_pid) 276static int show_map(struct seq_file *m, void *v)
350{ 277{
351 struct vm_area_struct *vma = v; 278 struct vm_area_struct *vma = v;
352 struct proc_maps_private *priv = m->private; 279 struct proc_maps_private *priv = m->private;
353 struct task_struct *task = priv->task; 280 struct task_struct *task = priv->task;
354 281
355 show_map_vma(m, vma, is_pid); 282 show_map_vma(m, vma);
356 283
357 if (m->count < m->size) /* vma is copied successfully */ 284 if (m->count < m->size) /* vma is copied successfully */
358 m->version = (vma != get_gate_vma(task->mm)) 285 m->version = (vma != get_gate_vma(task->mm))
@@ -360,49 +287,20 @@ static int show_map(struct seq_file *m, void *v, int is_pid)
360 return 0; 287 return 0;
361} 288}
362 289
363static int show_pid_map(struct seq_file *m, void *v)
364{
365 return show_map(m, v, 1);
366}
367
368static int show_tid_map(struct seq_file *m, void *v)
369{
370 return show_map(m, v, 0);
371}
372
373static const struct seq_operations proc_pid_maps_op = { 290static const struct seq_operations proc_pid_maps_op = {
374 .start = m_start, 291 .start = m_start,
375 .next = m_next, 292 .next = m_next,
376 .stop = m_stop, 293 .stop = m_stop,
377 .show = show_pid_map 294 .show = show_map
378};
379
380static const struct seq_operations proc_tid_maps_op = {
381 .start = m_start,
382 .next = m_next,
383 .stop = m_stop,
384 .show = show_tid_map
385}; 295};
386 296
387static int pid_maps_open(struct inode *inode, struct file *file) 297static int maps_open(struct inode *inode, struct file *file)
388{ 298{
389 return do_maps_open(inode, file, &proc_pid_maps_op); 299 return do_maps_open(inode, file, &proc_pid_maps_op);
390} 300}
391 301
392static int tid_maps_open(struct inode *inode, struct file *file) 302const struct file_operations proc_maps_operations = {
393{ 303 .open = maps_open,
394 return do_maps_open(inode, file, &proc_tid_maps_op);
395}
396
397const struct file_operations proc_pid_maps_operations = {
398 .open = pid_maps_open,
399 .read = seq_read,
400 .llseek = seq_lseek,
401 .release = seq_release_private,
402};
403
404const struct file_operations proc_tid_maps_operations = {
405 .open = tid_maps_open,
406 .read = seq_read, 304 .read = seq_read,
407 .llseek = seq_lseek, 305 .llseek = seq_lseek,
408 .release = seq_release_private, 306 .release = seq_release_private,
@@ -439,7 +337,6 @@ struct mem_size_stats {
439 unsigned long anonymous; 337 unsigned long anonymous;
440 unsigned long anonymous_thp; 338 unsigned long anonymous_thp;
441 unsigned long swap; 339 unsigned long swap;
442 unsigned long nonlinear;
443 u64 pss; 340 u64 pss;
444}; 341};
445 342
@@ -449,33 +346,24 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
449{ 346{
450 struct mem_size_stats *mss = walk->private; 347 struct mem_size_stats *mss = walk->private;
451 struct vm_area_struct *vma = mss->vma; 348 struct vm_area_struct *vma = mss->vma;
452 pgoff_t pgoff = linear_page_index(vma, addr); 349 struct page *page;
453 struct page *page = NULL;
454 int mapcount; 350 int mapcount;
455 351
456 if (pte_present(ptent)) { 352 if (is_swap_pte(ptent)) {
457 page = vm_normal_page(vma, addr, ptent); 353 mss->swap += ptent_size;
458 } else if (is_swap_pte(ptent)) { 354 return;
459 swp_entry_t swpent = pte_to_swp_entry(ptent);
460
461 if (!non_swap_entry(swpent))
462 mss->swap += ptent_size;
463 else if (is_migration_entry(swpent))
464 page = migration_entry_to_page(swpent);
465 } else if (pte_file(ptent)) {
466 if (pte_to_pgoff(ptent) != pgoff)
467 mss->nonlinear += ptent_size;
468 } 355 }
469 356
357 if (!pte_present(ptent))
358 return;
359
360 page = vm_normal_page(vma, addr, ptent);
470 if (!page) 361 if (!page)
471 return; 362 return;
472 363
473 if (PageAnon(page)) 364 if (PageAnon(page))
474 mss->anonymous += ptent_size; 365 mss->anonymous += ptent_size;
475 366
476 if (page->index != pgoff)
477 mss->nonlinear += ptent_size;
478
479 mss->resident += ptent_size; 367 mss->resident += ptent_size;
480 /* Accumulate the size in pages that have been accessed. */ 368 /* Accumulate the size in pages that have been accessed. */
481 if (pte_young(ptent) || PageReferenced(page)) 369 if (pte_young(ptent) || PageReferenced(page))
@@ -504,15 +392,21 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
504 pte_t *pte; 392 pte_t *pte;
505 spinlock_t *ptl; 393 spinlock_t *ptl;
506 394
507 if (pmd_trans_huge_lock(pmd, vma) == 1) { 395 spin_lock(&walk->mm->page_table_lock);
508 smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); 396 if (pmd_trans_huge(*pmd)) {
397 if (pmd_trans_splitting(*pmd)) {
398 spin_unlock(&walk->mm->page_table_lock);
399 wait_split_huge_page(vma->anon_vma, pmd);
400 } else {
401 smaps_pte_entry(*(pte_t *)pmd, addr,
402 HPAGE_PMD_SIZE, walk);
403 spin_unlock(&walk->mm->page_table_lock);
404 mss->anonymous_thp += HPAGE_PMD_SIZE;
405 return 0;
406 }
407 } else {
509 spin_unlock(&walk->mm->page_table_lock); 408 spin_unlock(&walk->mm->page_table_lock);
510 mss->anonymous_thp += HPAGE_PMD_SIZE;
511 return 0;
512 } 409 }
513
514 if (pmd_trans_unstable(pmd))
515 return 0;
516 /* 410 /*
517 * The mmap_sem held all the way back in m_start() is what 411 * The mmap_sem held all the way back in m_start() is what
518 * keeps khugepaged out of here and from collapsing things 412 * keeps khugepaged out of here and from collapsing things
@@ -526,58 +420,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
526 return 0; 420 return 0;
527} 421}
528 422
529static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) 423static int show_smap(struct seq_file *m, void *v)
530{
531 /*
532 * Don't forget to update Documentation/ on changes.
533 */
534 static const char mnemonics[BITS_PER_LONG][2] = {
535 /*
536 * In case if we meet a flag we don't know about.
537 */
538 [0 ... (BITS_PER_LONG-1)] = "??",
539
540 [ilog2(VM_READ)] = "rd",
541 [ilog2(VM_WRITE)] = "wr",
542 [ilog2(VM_EXEC)] = "ex",
543 [ilog2(VM_SHARED)] = "sh",
544 [ilog2(VM_MAYREAD)] = "mr",
545 [ilog2(VM_MAYWRITE)] = "mw",
546 [ilog2(VM_MAYEXEC)] = "me",
547 [ilog2(VM_MAYSHARE)] = "ms",
548 [ilog2(VM_GROWSDOWN)] = "gd",
549 [ilog2(VM_PFNMAP)] = "pf",
550 [ilog2(VM_DENYWRITE)] = "dw",
551 [ilog2(VM_LOCKED)] = "lo",
552 [ilog2(VM_IO)] = "io",
553 [ilog2(VM_SEQ_READ)] = "sr",
554 [ilog2(VM_RAND_READ)] = "rr",
555 [ilog2(VM_DONTCOPY)] = "dc",
556 [ilog2(VM_DONTEXPAND)] = "de",
557 [ilog2(VM_ACCOUNT)] = "ac",
558 [ilog2(VM_NORESERVE)] = "nr",
559 [ilog2(VM_HUGETLB)] = "ht",
560 [ilog2(VM_NONLINEAR)] = "nl",
561 [ilog2(VM_ARCH_1)] = "ar",
562 [ilog2(VM_DONTDUMP)] = "dd",
563 [ilog2(VM_MIXEDMAP)] = "mm",
564 [ilog2(VM_HUGEPAGE)] = "hg",
565 [ilog2(VM_NOHUGEPAGE)] = "nh",
566 [ilog2(VM_MERGEABLE)] = "mg",
567 };
568 size_t i;
569
570 seq_puts(m, "VmFlags: ");
571 for (i = 0; i < BITS_PER_LONG; i++) {
572 if (vma->vm_flags & (1UL << i)) {
573 seq_printf(m, "%c%c ",
574 mnemonics[i][0], mnemonics[i][1]);
575 }
576 }
577 seq_putc(m, '\n');
578}
579
580static int show_smap(struct seq_file *m, void *v, int is_pid)
581{ 424{
582 struct proc_maps_private *priv = m->private; 425 struct proc_maps_private *priv = m->private;
583 struct task_struct *task = priv->task; 426 struct task_struct *task = priv->task;
@@ -595,7 +438,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
595 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 438 if (vma->vm_mm && !is_vm_hugetlb_page(vma))
596 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk); 439 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
597 440
598 show_map_vma(m, vma, is_pid); 441 show_map_vma(m, vma);
599 442
600 seq_printf(m, 443 seq_printf(m,
601 "Size: %8lu kB\n" 444 "Size: %8lu kB\n"
@@ -628,61 +471,26 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
628 (vma->vm_flags & VM_LOCKED) ? 471 (vma->vm_flags & VM_LOCKED) ?
629 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 472 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
630 473
631 if (vma->vm_flags & VM_NONLINEAR)
632 seq_printf(m, "Nonlinear: %8lu kB\n",
633 mss.nonlinear >> 10);
634
635 show_smap_vma_flags(m, vma);
636
637 if (m->count < m->size) /* vma is copied successfully */ 474 if (m->count < m->size) /* vma is copied successfully */
638 m->version = (vma != get_gate_vma(task->mm)) 475 m->version = (vma != get_gate_vma(task->mm))
639 ? vma->vm_start : 0; 476 ? vma->vm_start : 0;
640 return 0; 477 return 0;
641} 478}
642 479
643static int show_pid_smap(struct seq_file *m, void *v)
644{
645 return show_smap(m, v, 1);
646}
647
648static int show_tid_smap(struct seq_file *m, void *v)
649{
650 return show_smap(m, v, 0);
651}
652
653static const struct seq_operations proc_pid_smaps_op = { 480static const struct seq_operations proc_pid_smaps_op = {
654 .start = m_start, 481 .start = m_start,
655 .next = m_next, 482 .next = m_next,
656 .stop = m_stop, 483 .stop = m_stop,
657 .show = show_pid_smap 484 .show = show_smap
658};
659
660static const struct seq_operations proc_tid_smaps_op = {
661 .start = m_start,
662 .next = m_next,
663 .stop = m_stop,
664 .show = show_tid_smap
665}; 485};
666 486
667static int pid_smaps_open(struct inode *inode, struct file *file) 487static int smaps_open(struct inode *inode, struct file *file)
668{ 488{
669 return do_maps_open(inode, file, &proc_pid_smaps_op); 489 return do_maps_open(inode, file, &proc_pid_smaps_op);
670} 490}
671 491
672static int tid_smaps_open(struct inode *inode, struct file *file) 492const struct file_operations proc_smaps_operations = {
673{ 493 .open = smaps_open,
674 return do_maps_open(inode, file, &proc_tid_smaps_op);
675}
676
677const struct file_operations proc_pid_smaps_operations = {
678 .open = pid_smaps_open,
679 .read = seq_read,
680 .llseek = seq_lseek,
681 .release = seq_release_private,
682};
683
684const struct file_operations proc_tid_smaps_operations = {
685 .open = tid_smaps_open,
686 .read = seq_read, 494 .read = seq_read,
687 .llseek = seq_lseek, 495 .llseek = seq_lseek,
688 .release = seq_release_private, 496 .release = seq_release_private,
@@ -696,9 +504,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
696 spinlock_t *ptl; 504 spinlock_t *ptl;
697 struct page *page; 505 struct page *page;
698 506
699 split_huge_page_pmd(vma, addr, pmd); 507 split_huge_page_pmd(walk->mm, pmd);
700 if (pmd_trans_unstable(pmd))
701 return 0;
702 508
703 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 509 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
704 for (; addr != end; pte++, addr += PAGE_SIZE) { 510 for (; addr != end; pte++, addr += PAGE_SIZE) {
@@ -787,18 +593,11 @@ const struct file_operations proc_clear_refs_operations = {
787 .llseek = noop_llseek, 593 .llseek = noop_llseek,
788}; 594};
789 595
790typedef struct {
791 u64 pme;
792} pagemap_entry_t;
793
794struct pagemapread { 596struct pagemapread {
795 int pos, len; 597 int pos, len;
796 pagemap_entry_t *buffer; 598 u64 *buffer;
797}; 599};
798 600
799#define PAGEMAP_WALK_SIZE (PMD_SIZE)
800#define PAGEMAP_WALK_MASK (PMD_MASK)
801
802#define PM_ENTRY_BYTES sizeof(u64) 601#define PM_ENTRY_BYTES sizeof(u64)
803#define PM_STATUS_BITS 3 602#define PM_STATUS_BITS 3
804#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 603#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
@@ -813,19 +612,13 @@ struct pagemapread {
813 612
814#define PM_PRESENT PM_STATUS(4LL) 613#define PM_PRESENT PM_STATUS(4LL)
815#define PM_SWAP PM_STATUS(2LL) 614#define PM_SWAP PM_STATUS(2LL)
816#define PM_FILE PM_STATUS(1LL)
817#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT) 615#define PM_NOT_PRESENT PM_PSHIFT(PAGE_SHIFT)
818#define PM_END_OF_BUFFER 1 616#define PM_END_OF_BUFFER 1
819 617
820static inline pagemap_entry_t make_pme(u64 val) 618static int add_to_pagemap(unsigned long addr, u64 pfn,
821{
822 return (pagemap_entry_t) { .pme = val };
823}
824
825static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
826 struct pagemapread *pm) 619 struct pagemapread *pm)
827{ 620{
828 pm->buffer[pm->pos++] = *pme; 621 pm->buffer[pm->pos++] = pfn;
829 if (pm->pos >= pm->len) 622 if (pm->pos >= pm->len)
830 return PM_END_OF_BUFFER; 623 return PM_END_OF_BUFFER;
831 return 0; 624 return 0;
@@ -837,66 +630,31 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
837 struct pagemapread *pm = walk->private; 630 struct pagemapread *pm = walk->private;
838 unsigned long addr; 631 unsigned long addr;
839 int err = 0; 632 int err = 0;
840 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT);
841
842 for (addr = start; addr < end; addr += PAGE_SIZE) { 633 for (addr = start; addr < end; addr += PAGE_SIZE) {
843 err = add_to_pagemap(addr, &pme, pm); 634 err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
844 if (err) 635 if (err)
845 break; 636 break;
846 } 637 }
847 return err; 638 return err;
848} 639}
849 640
850static void pte_to_pagemap_entry(pagemap_entry_t *pme, 641static u64 swap_pte_to_pagemap_entry(pte_t pte)
851 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
852{ 642{
853 u64 frame, flags; 643 swp_entry_t e = pte_to_swp_entry(pte);
854 struct page *page = NULL; 644 return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
855
856 if (pte_present(pte)) {
857 frame = pte_pfn(pte);
858 flags = PM_PRESENT;
859 page = vm_normal_page(vma, addr, pte);
860 } else if (is_swap_pte(pte)) {
861 swp_entry_t entry = pte_to_swp_entry(pte);
862
863 frame = swp_type(entry) |
864 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
865 flags = PM_SWAP;
866 if (is_migration_entry(entry))
867 page = migration_entry_to_page(entry);
868 } else {
869 *pme = make_pme(PM_NOT_PRESENT);
870 return;
871 }
872
873 if (page && !PageAnon(page))
874 flags |= PM_FILE;
875
876 *pme = make_pme(PM_PFRAME(frame) | PM_PSHIFT(PAGE_SHIFT) | flags);
877} 645}
878 646
879#ifdef CONFIG_TRANSPARENT_HUGEPAGE 647static u64 pte_to_pagemap_entry(pte_t pte)
880static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
881 pmd_t pmd, int offset)
882{
883 /*
884 * Currently pmd for thp is always present because thp can not be
885 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
886 * This if-check is just to prepare for future implementation.
887 */
888 if (pmd_present(pmd))
889 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
890 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT);
891 else
892 *pme = make_pme(PM_NOT_PRESENT);
893}
894#else
895static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme,
896 pmd_t pmd, int offset)
897{ 648{
649 u64 pme = 0;
650 if (is_swap_pte(pte))
651 pme = PM_PFRAME(swap_pte_to_pagemap_entry(pte))
652 | PM_PSHIFT(PAGE_SHIFT) | PM_SWAP;
653 else if (pte_present(pte))
654 pme = PM_PFRAME(pte_pfn(pte))
655 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
656 return pme;
898} 657}
899#endif
900 658
901static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 659static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
902 struct mm_walk *walk) 660 struct mm_walk *walk)
@@ -905,46 +663,29 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
905 struct pagemapread *pm = walk->private; 663 struct pagemapread *pm = walk->private;
906 pte_t *pte; 664 pte_t *pte;
907 int err = 0; 665 int err = 0;
908 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT); 666
667 split_huge_page_pmd(walk->mm, pmd);
909 668
910 /* find the first VMA at or above 'addr' */ 669 /* find the first VMA at or above 'addr' */
911 vma = find_vma(walk->mm, addr); 670 vma = find_vma(walk->mm, addr);
912 if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
913 for (; addr != end; addr += PAGE_SIZE) {
914 unsigned long offset;
915
916 offset = (addr & ~PAGEMAP_WALK_MASK) >>
917 PAGE_SHIFT;
918 thp_pmd_to_pagemap_entry(&pme, *pmd, offset);
919 err = add_to_pagemap(addr, &pme, pm);
920 if (err)
921 break;
922 }
923 spin_unlock(&walk->mm->page_table_lock);
924 return err;
925 }
926
927 if (pmd_trans_unstable(pmd))
928 return 0;
929 for (; addr != end; addr += PAGE_SIZE) { 671 for (; addr != end; addr += PAGE_SIZE) {
672 u64 pfn = PM_NOT_PRESENT;
930 673
931 /* check to see if we've left 'vma' behind 674 /* check to see if we've left 'vma' behind
932 * and need a new, higher one */ 675 * and need a new, higher one */
933 if (vma && (addr >= vma->vm_end)) { 676 if (vma && (addr >= vma->vm_end))
934 vma = find_vma(walk->mm, addr); 677 vma = find_vma(walk->mm, addr);
935 pme = make_pme(PM_NOT_PRESENT);
936 }
937 678
938 /* check that 'vma' actually covers this address, 679 /* check that 'vma' actually covers this address,
939 * and that it isn't a huge page vma */ 680 * and that it isn't a huge page vma */
940 if (vma && (vma->vm_start <= addr) && 681 if (vma && (vma->vm_start <= addr) &&
941 !is_vm_hugetlb_page(vma)) { 682 !is_vm_hugetlb_page(vma)) {
942 pte = pte_offset_map(pmd, addr); 683 pte = pte_offset_map(pmd, addr);
943 pte_to_pagemap_entry(&pme, vma, addr, *pte); 684 pfn = pte_to_pagemap_entry(*pte);
944 /* unmap before userspace copy */ 685 /* unmap before userspace copy */
945 pte_unmap(pte); 686 pte_unmap(pte);
946 } 687 }
947 err = add_to_pagemap(addr, &pme, pm); 688 err = add_to_pagemap(addr, pfn, pm);
948 if (err) 689 if (err)
949 return err; 690 return err;
950 } 691 }
@@ -955,14 +696,13 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
955} 696}
956 697
957#ifdef CONFIG_HUGETLB_PAGE 698#ifdef CONFIG_HUGETLB_PAGE
958static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, 699static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
959 pte_t pte, int offset)
960{ 700{
701 u64 pme = 0;
961 if (pte_present(pte)) 702 if (pte_present(pte))
962 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) 703 pme = PM_PFRAME(pte_pfn(pte) + offset)
963 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT); 704 | PM_PSHIFT(PAGE_SHIFT) | PM_PRESENT;
964 else 705 return pme;
965 *pme = make_pme(PM_NOT_PRESENT);
966} 706}
967 707
968/* This function walks within one hugetlb entry in the single call */ 708/* This function walks within one hugetlb entry in the single call */
@@ -972,12 +712,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
972{ 712{
973 struct pagemapread *pm = walk->private; 713 struct pagemapread *pm = walk->private;
974 int err = 0; 714 int err = 0;
975 pagemap_entry_t pme; 715 u64 pfn;
976 716
977 for (; addr != end; addr += PAGE_SIZE) { 717 for (; addr != end; addr += PAGE_SIZE) {
978 int offset = (addr & ~hmask) >> PAGE_SHIFT; 718 int offset = (addr & ~hmask) >> PAGE_SHIFT;
979 huge_pte_to_pagemap_entry(&pme, *pte, offset); 719 pfn = huge_pte_to_pagemap_entry(*pte, offset);
980 err = add_to_pagemap(addr, &pme, pm); 720 err = add_to_pagemap(addr, pfn, pm);
981 if (err) 721 if (err)
982 return err; 722 return err;
983 } 723 }
@@ -994,11 +734,11 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
994 * For each page in the address space, this file contains one 64-bit entry 734 * For each page in the address space, this file contains one 64-bit entry
995 * consisting of the following: 735 * consisting of the following:
996 * 736 *
997 * Bits 0-54 page frame number (PFN) if present 737 * Bits 0-55 page frame number (PFN) if present
998 * Bits 0-4 swap type if swapped 738 * Bits 0-4 swap type if swapped
999 * Bits 5-54 swap offset if swapped 739 * Bits 5-55 swap offset if swapped
1000 * Bits 55-60 page shift (page size = 1<<page shift) 740 * Bits 55-60 page shift (page size = 1<<page shift)
1001 * Bit 61 page is file-page or shared-anon 741 * Bit 61 reserved for future use
1002 * Bit 62 page swapped 742 * Bit 62 page swapped
1003 * Bit 63 page present 743 * Bit 63 page present
1004 * 744 *
@@ -1012,6 +752,8 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1012 * determine which areas of memory are actually mapped and llseek to 752 * determine which areas of memory are actually mapped and llseek to
1013 * skip over unmapped regions. 753 * skip over unmapped regions.
1014 */ 754 */
755#define PAGEMAP_WALK_SIZE (PMD_SIZE)
756#define PAGEMAP_WALK_MASK (PMD_MASK)
1015static ssize_t pagemap_read(struct file *file, char __user *buf, 757static ssize_t pagemap_read(struct file *file, char __user *buf,
1016 size_t count, loff_t *ppos) 758 size_t count, loff_t *ppos)
1017{ 759{
@@ -1044,7 +786,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1044 if (!pm.buffer) 786 if (!pm.buffer)
1045 goto out_task; 787 goto out_task;
1046 788
1047 mm = mm_access(task, PTRACE_MODE_READ); 789 mm = mm_for_maps(task);
1048 ret = PTR_ERR(mm); 790 ret = PTR_ERR(mm);
1049 if (!mm || IS_ERR(mm)) 791 if (!mm || IS_ERR(mm))
1050 goto out_free; 792 goto out_free;
@@ -1179,7 +921,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1179 return NULL; 921 return NULL;
1180 922
1181 nid = page_to_nid(page); 923 nid = page_to_nid(page);
1182 if (!node_isset(nid, node_states[N_MEMORY])) 924 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
1183 return NULL; 925 return NULL;
1184 926
1185 return page; 927 return page;
@@ -1194,21 +936,26 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1194 pte_t *pte; 936 pte_t *pte;
1195 937
1196 md = walk->private; 938 md = walk->private;
1197 939 spin_lock(&walk->mm->page_table_lock);
1198 if (pmd_trans_huge_lock(pmd, md->vma) == 1) { 940 if (pmd_trans_huge(*pmd)) {
1199 pte_t huge_pte = *(pte_t *)pmd; 941 if (pmd_trans_splitting(*pmd)) {
1200 struct page *page; 942 spin_unlock(&walk->mm->page_table_lock);
1201 943 wait_split_huge_page(md->vma->anon_vma, pmd);
1202 page = can_gather_numa_stats(huge_pte, md->vma, addr); 944 } else {
1203 if (page) 945 pte_t huge_pte = *(pte_t *)pmd;
1204 gather_stats(page, md, pte_dirty(huge_pte), 946 struct page *page;
1205 HPAGE_PMD_SIZE/PAGE_SIZE); 947
948 page = can_gather_numa_stats(huge_pte, md->vma, addr);
949 if (page)
950 gather_stats(page, md, pte_dirty(huge_pte),
951 HPAGE_PMD_SIZE/PAGE_SIZE);
952 spin_unlock(&walk->mm->page_table_lock);
953 return 0;
954 }
955 } else {
1206 spin_unlock(&walk->mm->page_table_lock); 956 spin_unlock(&walk->mm->page_table_lock);
1207 return 0;
1208 } 957 }
1209 958
1210 if (pmd_trans_unstable(pmd))
1211 return 0;
1212 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 959 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1213 do { 960 do {
1214 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 961 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
@@ -1250,14 +997,13 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1250/* 997/*
1251 * Display pages allocated per node and memory policy via /proc. 998 * Display pages allocated per node and memory policy via /proc.
1252 */ 999 */
1253static int show_numa_map(struct seq_file *m, void *v, int is_pid) 1000static int show_numa_map(struct seq_file *m, void *v)
1254{ 1001{
1255 struct numa_maps_private *numa_priv = m->private; 1002 struct numa_maps_private *numa_priv = m->private;
1256 struct proc_maps_private *proc_priv = &numa_priv->proc_maps; 1003 struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
1257 struct vm_area_struct *vma = v; 1004 struct vm_area_struct *vma = v;
1258 struct numa_maps *md = &numa_priv->md; 1005 struct numa_maps *md = &numa_priv->md;
1259 struct file *file = vma->vm_file; 1006 struct file *file = vma->vm_file;
1260 struct task_struct *task = proc_priv->task;
1261 struct mm_struct *mm = vma->vm_mm; 1007 struct mm_struct *mm = vma->vm_mm;
1262 struct mm_walk walk = {}; 1008 struct mm_walk walk = {};
1263 struct mempolicy *pol; 1009 struct mempolicy *pol;
@@ -1277,8 +1023,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1277 walk.private = md; 1023 walk.private = md;
1278 walk.mm = mm; 1024 walk.mm = mm;
1279 1025
1280 pol = get_vma_policy(task, vma, vma->vm_start); 1026 pol = get_vma_policy(proc_priv->task, vma, vma->vm_start);
1281 mpol_to_str(buffer, sizeof(buffer), pol); 1027 mpol_to_str(buffer, sizeof(buffer), pol, 0);
1282 mpol_cond_put(pol); 1028 mpol_cond_put(pol);
1283 1029
1284 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 1030 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
@@ -1288,19 +1034,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1288 seq_path(m, &file->f_path, "\n\t= "); 1034 seq_path(m, &file->f_path, "\n\t= ");
1289 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { 1035 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1290 seq_printf(m, " heap"); 1036 seq_printf(m, " heap");
1291 } else { 1037 } else if (vma->vm_start <= mm->start_stack &&
1292 pid_t tid = vm_is_stack(task, vma, is_pid); 1038 vma->vm_end >= mm->start_stack) {
1293 if (tid != 0) { 1039 seq_printf(m, " stack");
1294 /*
1295 * Thread stack in /proc/PID/task/TID/maps or
1296 * the main process stack.
1297 */
1298 if (!is_pid || (vma->vm_start <= mm->start_stack &&
1299 vma->vm_end >= mm->start_stack))
1300 seq_printf(m, " stack");
1301 else
1302 seq_printf(m, " stack:%d", tid);
1303 }
1304 } 1040 }
1305 1041
1306 if (is_vm_hugetlb_page(vma)) 1042 if (is_vm_hugetlb_page(vma))
@@ -1332,7 +1068,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1332 if (md->writeback) 1068 if (md->writeback)
1333 seq_printf(m, " writeback=%lu", md->writeback); 1069 seq_printf(m, " writeback=%lu", md->writeback);
1334 1070
1335 for_each_node_state(n, N_MEMORY) 1071 for_each_node_state(n, N_HIGH_MEMORY)
1336 if (md->node[n]) 1072 if (md->node[n])
1337 seq_printf(m, " N%d=%lu", n, md->node[n]); 1073 seq_printf(m, " N%d=%lu", n, md->node[n]);
1338out: 1074out:
@@ -1343,39 +1079,21 @@ out:
1343 return 0; 1079 return 0;
1344} 1080}
1345 1081
1346static int show_pid_numa_map(struct seq_file *m, void *v)
1347{
1348 return show_numa_map(m, v, 1);
1349}
1350
1351static int show_tid_numa_map(struct seq_file *m, void *v)
1352{
1353 return show_numa_map(m, v, 0);
1354}
1355
1356static const struct seq_operations proc_pid_numa_maps_op = { 1082static const struct seq_operations proc_pid_numa_maps_op = {
1357 .start = m_start, 1083 .start = m_start,
1358 .next = m_next, 1084 .next = m_next,
1359 .stop = m_stop, 1085 .stop = m_stop,
1360 .show = show_pid_numa_map, 1086 .show = show_numa_map,
1361};
1362
1363static const struct seq_operations proc_tid_numa_maps_op = {
1364 .start = m_start,
1365 .next = m_next,
1366 .stop = m_stop,
1367 .show = show_tid_numa_map,
1368}; 1087};
1369 1088
1370static int numa_maps_open(struct inode *inode, struct file *file, 1089static int numa_maps_open(struct inode *inode, struct file *file)
1371 const struct seq_operations *ops)
1372{ 1090{
1373 struct numa_maps_private *priv; 1091 struct numa_maps_private *priv;
1374 int ret = -ENOMEM; 1092 int ret = -ENOMEM;
1375 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 1093 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
1376 if (priv) { 1094 if (priv) {
1377 priv->proc_maps.pid = proc_pid(inode); 1095 priv->proc_maps.pid = proc_pid(inode);
1378 ret = seq_open(file, ops); 1096 ret = seq_open(file, &proc_pid_numa_maps_op);
1379 if (!ret) { 1097 if (!ret) {
1380 struct seq_file *m = file->private_data; 1098 struct seq_file *m = file->private_data;
1381 m->private = priv; 1099 m->private = priv;
@@ -1386,25 +1104,8 @@ static int numa_maps_open(struct inode *inode, struct file *file,
1386 return ret; 1104 return ret;
1387} 1105}
1388 1106
1389static int pid_numa_maps_open(struct inode *inode, struct file *file) 1107const struct file_operations proc_numa_maps_operations = {
1390{ 1108 .open = numa_maps_open,
1391 return numa_maps_open(inode, file, &proc_pid_numa_maps_op);
1392}
1393
1394static int tid_numa_maps_open(struct inode *inode, struct file *file)
1395{
1396 return numa_maps_open(inode, file, &proc_tid_numa_maps_op);
1397}
1398
1399const struct file_operations proc_pid_numa_maps_operations = {
1400 .open = pid_numa_maps_open,
1401 .read = seq_read,
1402 .llseek = seq_lseek,
1403 .release = seq_release_private,
1404};
1405
1406const struct file_operations proc_tid_numa_maps_operations = {
1407 .open = tid_numa_maps_open,
1408 .read = seq_read, 1109 .read = seq_read,
1409 .llseek = seq_lseek, 1110 .llseek = seq_lseek,
1410 .release = seq_release_private, 1111 .release = seq_release_private,
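The pagemap code restored above keeps the entry layout documented in the comment before pagemap_read(): one 64-bit word per virtual page, with the present bit at 63, the swap bit at 62, and the page frame number in the low bits. A minimal user-space sketch, not part of this patch, that decodes the entry for one address; the exact field positions and the 55-bit PFN mask are assumptions taken from that comment and may differ on other kernel versions:

/*
 * Minimal user-space sketch, not part of this patch: decode one
 * /proc/self/pagemap entry.  Field positions (bit 63 present, bit 62
 * swapped, PFN in the low 55 bits) are assumptions taken from the
 * comment in pagemap_read() above and may differ on other kernels.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *probe = malloc(psz);
	uint64_t entry;
	int fd;

	if (!probe)
		return 1;
	memset(probe, 0, psz);			/* fault the page in */

	fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;
	/* one 64-bit entry per virtual page, indexed by virtual page number */
	if (pread(fd, &entry, sizeof(entry),
		  ((uintptr_t)probe / psz) * sizeof(entry)) != sizeof(entry))
		return 1;

	if (entry & (1ULL << 63))
		printf("present, pfn 0x%llx\n",
		       (unsigned long long)(entry & ((1ULL << 55) - 1)));
	else if (entry & (1ULL << 62))
		printf("swapped\n");
	else
		printf("not present\n");

	close(fd);
	free(probe);
	return 0;
}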
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 1ccfa537f5f..980de547c07 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -134,11 +134,9 @@ static void pad_len_spaces(struct seq_file *m, int len)
134/* 134/*
135 * display a single VMA to a sequenced file 135 * display a single VMA to a sequenced file
136 */ 136 */
137static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, 137static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
138 int is_pid)
139{ 138{
140 struct mm_struct *mm = vma->vm_mm; 139 struct mm_struct *mm = vma->vm_mm;
141 struct proc_maps_private *priv = m->private;
142 unsigned long ino = 0; 140 unsigned long ino = 0;
143 struct file *file; 141 struct file *file;
144 dev_t dev = 0; 142 dev_t dev = 0;
@@ -170,19 +168,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
170 pad_len_spaces(m, len); 168 pad_len_spaces(m, len);
171 seq_path(m, &file->f_path, ""); 169 seq_path(m, &file->f_path, "");
172 } else if (mm) { 170 } else if (mm) {
173 pid_t tid = vm_is_stack(priv->task, vma, is_pid); 171 if (vma->vm_start <= mm->start_stack &&
174 172 vma->vm_end >= mm->start_stack) {
175 if (tid != 0) {
176 pad_len_spaces(m, len); 173 pad_len_spaces(m, len);
177 /* 174 seq_puts(m, "[stack]");
178 * Thread stack in /proc/PID/task/TID/maps or
179 * the main process stack.
180 */
181 if (!is_pid || (vma->vm_start <= mm->start_stack &&
182 vma->vm_end >= mm->start_stack))
183 seq_printf(m, "[stack]");
184 else
185 seq_printf(m, "[stack:%d]", tid);
186 } 175 }
187 } 176 }
188 177
@@ -193,22 +182,11 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
193/* 182/*
194 * display mapping lines for a particular process's /proc/pid/maps 183 * display mapping lines for a particular process's /proc/pid/maps
195 */ 184 */
196static int show_map(struct seq_file *m, void *_p, int is_pid) 185static int show_map(struct seq_file *m, void *_p)
197{ 186{
198 struct rb_node *p = _p; 187 struct rb_node *p = _p;
199 188
200 return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), 189 return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb));
201 is_pid);
202}
203
204static int show_pid_map(struct seq_file *m, void *_p)
205{
206 return show_map(m, _p, 1);
207}
208
209static int show_tid_map(struct seq_file *m, void *_p)
210{
211 return show_map(m, _p, 0);
212} 190}
213 191
214static void *m_start(struct seq_file *m, loff_t *pos) 192static void *m_start(struct seq_file *m, loff_t *pos)
@@ -223,7 +201,7 @@ static void *m_start(struct seq_file *m, loff_t *pos)
223 if (!priv->task) 201 if (!priv->task)
224 return ERR_PTR(-ESRCH); 202 return ERR_PTR(-ESRCH);
225 203
226 mm = mm_access(priv->task, PTRACE_MODE_READ); 204 mm = mm_for_maps(priv->task);
227 if (!mm || IS_ERR(mm)) { 205 if (!mm || IS_ERR(mm)) {
228 put_task_struct(priv->task); 206 put_task_struct(priv->task);
229 priv->task = NULL; 207 priv->task = NULL;
@@ -262,18 +240,10 @@ static const struct seq_operations proc_pid_maps_ops = {
262 .start = m_start, 240 .start = m_start,
263 .next = m_next, 241 .next = m_next,
264 .stop = m_stop, 242 .stop = m_stop,
265 .show = show_pid_map 243 .show = show_map
266};
267
268static const struct seq_operations proc_tid_maps_ops = {
269 .start = m_start,
270 .next = m_next,
271 .stop = m_stop,
272 .show = show_tid_map
273}; 244};
274 245
275static int maps_open(struct inode *inode, struct file *file, 246static int maps_open(struct inode *inode, struct file *file)
276 const struct seq_operations *ops)
277{ 247{
278 struct proc_maps_private *priv; 248 struct proc_maps_private *priv;
279 int ret = -ENOMEM; 249 int ret = -ENOMEM;
@@ -281,7 +251,7 @@ static int maps_open(struct inode *inode, struct file *file,
281 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 251 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
282 if (priv) { 252 if (priv) {
283 priv->pid = proc_pid(inode); 253 priv->pid = proc_pid(inode);
284 ret = seq_open(file, ops); 254 ret = seq_open(file, &proc_pid_maps_ops);
285 if (!ret) { 255 if (!ret) {
286 struct seq_file *m = file->private_data; 256 struct seq_file *m = file->private_data;
287 m->private = priv; 257 m->private = priv;
@@ -292,25 +262,8 @@ static int maps_open(struct inode *inode, struct file *file,
292 return ret; 262 return ret;
293} 263}
294 264
295static int pid_maps_open(struct inode *inode, struct file *file) 265const struct file_operations proc_maps_operations = {
296{ 266 .open = maps_open,
297 return maps_open(inode, file, &proc_pid_maps_ops);
298}
299
300static int tid_maps_open(struct inode *inode, struct file *file)
301{
302 return maps_open(inode, file, &proc_tid_maps_ops);
303}
304
305const struct file_operations proc_pid_maps_operations = {
306 .open = pid_maps_open,
307 .read = seq_read,
308 .llseek = seq_lseek,
309 .release = seq_release_private,
310};
311
312const struct file_operations proc_tid_maps_operations = {
313 .open = tid_maps_open,
314 .read = seq_read, 267 .read = seq_read,
315 .llseek = seq_lseek, 268 .llseek = seq_lseek,
316 .release = seq_release_private, 269 .release = seq_release_private,
diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c
index 9610ac772d7..766b1d45605 100644
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -11,20 +11,15 @@ static int uptime_proc_show(struct seq_file *m, void *v)
11{ 11{
12 struct timespec uptime; 12 struct timespec uptime;
13 struct timespec idle; 13 struct timespec idle;
14 u64 idletime;
15 u64 nsec;
16 u32 rem;
17 int i; 14 int i;
15 cputime_t idletime = cputime_zero;
18 16
19 idletime = 0;
20 for_each_possible_cpu(i) 17 for_each_possible_cpu(i)
21 idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; 18 idletime = cputime64_add(idletime, kstat_cpu(i).cpustat.idle);
22 19
23 do_posix_clock_monotonic_gettime(&uptime); 20 do_posix_clock_monotonic_gettime(&uptime);
24 monotonic_to_bootbased(&uptime); 21 monotonic_to_bootbased(&uptime);
25 nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC; 22 cputime_to_timespec(idletime, &idle);
26 idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
27 idle.tv_nsec = rem;
28 seq_printf(m, "%lu.%02lu %lu.%02lu\n", 23 seq_printf(m, "%lu.%02lu %lu.%02lu\n",
29 (unsigned long) uptime.tv_sec, 24 (unsigned long) uptime.tv_sec,
30 (uptime.tv_nsec / (NSEC_PER_SEC / 100)), 25 (uptime.tv_nsec / (NSEC_PER_SEC / 100)),
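The reverted uptime_proc_show() still prints two second-resolution fields: boot-based uptime and idle time summed with for_each_possible_cpu(), so on SMP the second value can exceed the first. A minimal sketch, not part of this patch, of reading them back from /proc/uptime:

/*
 * Sketch, not part of this patch: read back the two fields that
 * uptime_proc_show() prints (uptime and cumulative idle, in seconds).
 */
#include <stdio.h>

int main(void)
{
	double up, idle;
	FILE *f = fopen("/proc/uptime", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%lf %lf", &up, &idle) != 2) {
		fclose(f);
		return 1;
	}
	printf("uptime %.2f s, idle %.2f s\n", up, idle);
	fclose(f);
	return 0;
}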
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 0d5071d2998..cd99bf55765 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -12,7 +12,6 @@
12#include <linux/user.h> 12#include <linux/user.h>
13#include <linux/elf.h> 13#include <linux/elf.h>
14#include <linux/elfcore.h> 14#include <linux/elfcore.h>
15#include <linux/export.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <linux/highmem.h> 16#include <linux/highmem.h>
18#include <linux/bootmem.h> 17#include <linux/bootmem.h>
@@ -700,26 +699,3 @@ static int __init vmcore_init(void)
700 return 0; 699 return 0;
701} 700}
702module_init(vmcore_init) 701module_init(vmcore_init)
703
704/* Cleanup function for vmcore module. */
705void vmcore_cleanup(void)
706{
707 struct list_head *pos, *next;
708
709 if (proc_vmcore) {
710 remove_proc_entry(proc_vmcore->name, proc_vmcore->parent);
711 proc_vmcore = NULL;
712 }
713
714 /* clear the vmcore list. */
715 list_for_each_safe(pos, next, &vmcore_list) {
716 struct vmcore *m;
717
718 m = list_entry(pos, struct vmcore, list);
719 list_del(&m->list);
720 kfree(m);
721 }
722 kfree(elfcorebuf);
723 elfcorebuf = NULL;
724}
725EXPORT_SYMBOL_GPL(vmcore_cleanup);