/*
* linux/fs/proc/base.c
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
* proc base directory handling functions
*
* 1999, Al Viro. Rewritten. Now it covers the whole per-process part.
* Instead of using magical inumbers to determine the kind of object
* we allocate and fill in-core inodes upon lookup. They don't even
* go into icache. We cache the reference to task_struct upon lookup too.
* Eventually it should become a filesystem in its own. We don't use the
* rest of procfs anymore.
*
*
* Changelog:
* 17-Jan-2005
* Allan Bezerra
* Bruna Moreira <bruna.moreira@indt.org.br>
* Edjard Mota <edjard.mota@indt.org.br>
* Ilias Biris <ilias.biris@indt.org.br>
* Mauricio Lin <mauricio.lin@indt.org.br>
*
* Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
*
* A new process specific entry (smaps) included in /proc. It shows the
* size of rss for each memory area. The maps entry lacks information
* about physical memory size (rss) for each mapped file, i.e.,
* rss information for executables and library files.
* This additional information is useful for any tools that need to know
* about physical memory consumption for a process specific library.
*
* Changelog:
* 21-Feb-2005
* Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
* Pud inclusion in the page table walking.
*
* ChangeLog:
* 10-Mar-2005
* 10LE Instituto Nokia de Tecnologia - INdT:
* A better way to walks through the page table as suggested by Hugh Dickins.
*
* Simo Piiroinen <simo.piiroinen@nokia.com>:
* Smaps information related to shared, private, clean and dirty pages.
*
* Paul Mundt <paul.mundt@nokia.com>:
* Overall revision about smaps.
*/
#include <asm/uaccess.h>
#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/string.h>
#include <linux/seq_file.h>
#include <linux/namei.h>
#include <linux/mnt_namespace.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/rcupdate.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
#include <linux/resource.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
#include <linux/cgroup.h>
#include <linux/cpuset.h>
#include <linux/audit.h>
#include <linux/poll.h>
#include <linux/nsproxy.h>
#include <linux/oom.h>
#include <linux/elf.h>
#include <linux/pid_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
#include "internal.h"
/* NOTE:
* Implementing inode permission operations in /proc is almost
* certainly an error. Permission checks need to happen during
* each system call not at open time. The reason is that most of
* what we wish to check for permissions in /proc varies at runtime.
*
* The classic example of a problem is opening file descriptors
* in /proc for a task before it execs a suid executable.
*/
struct pid_entry {
char *name;
int len;
mode_t mode;
const struct inode_operations *iop;
const struct file_operations *fop;
union proc_op op;
};
#define NOD(NAME, MODE, IOP, FOP, OP) { \
.name = (NAME), \
.len = sizeof(NAME) - 1, \
.mode = MODE, \
.iop = IOP, \
.fop = FOP, \
.op = OP, \
}
#define DIR(NAME, MODE, iops, fops) \
NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
#define LNK(NAME, get_link) \
NOD(NAME, (S_IFLNK|S_IRWXUGO), \
&proc_pid_link_inode_operations, NULL, \
{ .proc_get_link = get_link } )
#define REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
#define INF(NAME, MODE, read) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_info_file_operations, \
{ .proc_read = read } )
#define ONE(NAME, MODE, show) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
*/
static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
unsigned int n)
{
unsigned int i;
unsigned int count;
count = 0;
for (i = 0; i < n; ++i) {
if (S_ISDIR(entries[i].mode))
++count;
}
return count;
}
static int get_task_root(struct task_struct *task, struct path *root)
{
int result = -ENOENT;
task_lock(task);
if (task->fs) {
get_fs_root(task->fs, root);
result = 0;
}
task_unlock(task);
return result;
}
static int proc_cwd_link(struct inode *inode, struct path *path)
{
struct task_struct *task = get_proc_task(inode);
int result = -ENOENT;
if (task) {
task_lock(task);
if (task->fs) {
get_fs_pwd(task->fs, path);
result = 0;
}
task_unlock(task);
put_task_struct(task);
}
return result;
}
static int proc_root_link(struct inode *inode, struct path *path)
{
struct task_struct *task = get_proc_task(inode);
int result = -ENOENT;
if (task) {
result = get_task_root(task, path);
put_task_struct(task);
}
return result;
}
/*
* Return zero if current may access user memory in @task, -error if not.
*/
static int check_mem_permission(struct task_struct *task)
{
/*
* A task can always look at itself, in case it chooses
* to use system calls instead of load instructions.
*/
if (task == current)
return 0;
/*
* If current is actively ptrace'ing, and would also be
* permitted to freshly attach with ptrace now, permit it.
*/
if (task_is_stopped_or_traced(task)) {
int match;
rcu_read_lock();
match = (tracehook_tracer_task(task) == current);
rcu_read_unlock();
if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
return 0;
}
/*
* Noone else is allowed.
*/
return -EPERM;
}
struct mm_struct *mm_for_maps(struct task_struct *task)
{
struct mm_struct *mm;
if (mutex_lock_killable(&task->signal->cred_guard_mutex))
return NULL;
mm = get_task_mm(task);
if (mm && mm != current->mm &&
!ptrace_may_access(task, PTRACE_MODE_READ)) {
mmput(mm);
mm = NULL;
}
mutex_unlock(&task->signal->cred_guard_mutex);
return mm;
}
static int proc_pid_cmdline(struct task_struct *task, char * buffer)
{
int res = 0;
unsigned int len;
struct mm_struct *mm = get_task_mm(task);
if (!mm)
goto out;
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
len = mm->arg_end - mm->arg_start;
if (len > PAGE_SIZE)
len = PAGE_SIZE;
res = access_process_vm(task, mm->arg_start, buffer, len, 0);
// If the nul at the end of args has been overwritten, then
// assume application is using setproctitle(3).
if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
len = strnlen(buffer, res);
if (len < res) {
res = len;
} else {
len = mm->env_end - mm->env_start;
if (len > PAGE_SIZE - res)
len = PAGE_SIZE - res;
res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
res = strnlen(buffer, res);
}
}
out_mm:
mmput(mm);
out:
return res;
}
static int proc_pid_auxv(struct task_struct *task, char *buffer)
{
int res = 0;
struct mm_struct *mm = get_task_mm(task);
if (mm) {
unsigned int nwords = 0;
do {
nwords += 2;
} while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
res = nwords * sizeof(mm->saved_auxv[0]);
if (res > PAGE_SIZE)
res = PAGE_SIZE;
memcpy(buffer, mm->saved_auxv, res);
mmput(mm);
}
return res;
}
#ifdef CONFIG_KALLSYMS
/*
* Provides a wchan file via kallsyms in a proper one-value-per-file format.
* Returns the resolved symbol. If that fails, simply return the address.
*/
static int proc_pid_wchan(struct task_struct *task, char *buffer)
{
unsigned long wchan;
char symname[KSYM_NAME_LEN];
wchan = get_wchan(task);
if (lookup_symbol_name(wchan, symname) < 0)
if (!ptrace_may_access(task, PTRACE_MODE_READ))
return 0;
else
return sprintf(buffer, "%lu", wchan);
else
return sprintf(buffer, "%s", symname);
}
#endif /* CONFIG_KALLSYMS */
#ifdef CONFIG_STACKTRACE
#define MAX_STACK_TRACE_DEPTH 64
static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
struct stack_trace trace;
unsigned long *entries;
int i;
entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
if (!entries)
return -ENOMEM;
trace.nr_entries = 0;
trace.max_entries = MAX_STACK_TRACE_DEPTH;
trace.entries = entries;
trace.skip = 0;
save_stack_trace_tsk(task, &trace);
for (i = 0; i < trace.nr_entries; i++) {
seq_printf(m, "[<%p>] %pS\n",
(void *)entries[i], (void *)entries[i]);
}
kfree(entries);
return 0;
}
#endif
#ifdef CONFIG_SCHEDSTATS
/*
* Provides /proc/PID/schedstat
*/
static int proc_pid_schedstat(struct task_struct *task, char *buffer)
{
return sprintf(buffer, "%llu %llu %lu\n",
(unsigned long long)task->se.sum_exec_runtime,
(unsigned long long)task->sched_info.run_delay,
task->sched_info.pcount);
}
#endif
#ifdef CONFIG_LATENCYTOP
static int lstats_show_proc(struct seq_file *m, void *v)
{
int i;
struct inode *inode = m->private;
struct task_struct *task = get_proc_task(inode);
if (!task)
return -ESRCH;
seq_puts(m, "Latency Top version : v0.1\n");
for (i = 0; i < 32; i++) {
if (task->latency_record[i].backtrace[0]) {
int q;
seq_printf(m, "%i %li %li ",
task->latency_record[i].count,
task->latency_record[i].time,
task->latency_record[i].max);
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
char sym[KSYM_SYMBOL_LEN];
char *c;
if (!task->latency_record[i].backtrace[q])
break;
if (task->latency_record[i].backtrace[q] == ULONG_MAX)
break;
sprint_symbol(sym, task->latency_record[i].backtrace[q]);
c = strchr(sym, '+');
if (c)
*c = 0;
seq_printf(m, "%s ", sym);
}
seq_printf(m, "\n");
}
}
put_task_struct(task);
return 0;
}
static int lstats_open(struct inode *inode, struct file *file)
{
return single_open(file, lstats_show_proc, inode);
}
static ssize_t lstats_write(struct file *file, const char __user *buf,
size_t count, loff_t *offs)
{
struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
if (!task)
return -ESRCH;
clear_all_latency_tracing(task);
put_task_struct(task);
return count;
}
static const struct file_operations proc_lstats_operations = {
.open = lstats_open,
.read = seq_read,
.write = lstats_write,
.llseek = seq_lseek,
.release = single_release,
};
#endif
static int proc_oom_score(struct task_struct *task, char *buffer)
{
unsigned long points = 0;
read_lock(&tasklist_lock);
if (pid_alive(task))
points = oom_badness(task, NULL, NULL,
totalram_pages + total_swap_pages);
read_unlock(&tasklist_lock);
return sprintf(buffer, "%lu\n", points);
}
struct limit_names {
char *name;
char *unit;
};
|