aboutsummaryrefslogtreecommitdiffstats
path: root/fs/proc/base.c
diff options
context:
space:
mode:
authorPavel Emelyanov <xemul@parallels.com>2012-01-10 18:11:23 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-01-10 19:30:54 -0500
commit640708a2cff7f81e246243b0073c66e6ece7e53e (patch)
tree8cc00ae2b374bf6750ad9ca20da3566e28cfc9ff /fs/proc/base.c
parent7773fbc54182a90cd248656619c7d33859e5f91d (diff)
procfs: introduce the /proc/<pid>/map_files/ directory
This one behaves similarly to the /proc/<pid>/fd/ one - it contains one symlink for each file-backed mapping; the name of a symlink is "vma->vm_start-vma->vm_end" and the target is the file. Opening a symlink results in a file that points to exactly the same inode as the vma's one.

For example, the ls -l of some arbitrary /proc/<pid>/map_files/:

 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so

This *helps* the checkpointing process in three ways:

1. When dumping a task's mappings we know the exact file that is mapped by a particular region. We do this by opening the /proc/$pid/map_files/$address symlink the way we do with file descriptors.

2. This also helps in determining which anonymous shared mappings are shared with each other, by comparing their inodes.

3. When restoring a set of processes, in case two of them have a shared mapping, we map the memory by the 1st one and then open its /proc/$pid/map_files/$address file and map it by the 2nd task.

Using /proc/$pid/maps for this is quite inconvenient since it requires repeatedly re-reading and re-parsing this text file, which slows down the restore procedure significantly. Also, as pointed out in (3), it is much easier to use a top-level shared mapping in children as /proc/$pid/map_files/$address when needed.

[akpm@linux-foundation.org: coding-style fixes]
[gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE]
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Reviewed-by: Vasiliy Kulikov <segoon@openwall.com>
Reviewed-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Tejun Heo <tj@kernel.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/proc/base.c')
-rw-r--r--fs/proc/base.c355
1 files changed, 355 insertions, 0 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e31d95055c67..4d755fed3ecb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -83,6 +83,7 @@
83#include <linux/pid_namespace.h> 83#include <linux/pid_namespace.h>
84#include <linux/fs_struct.h> 84#include <linux/fs_struct.h>
85#include <linux/slab.h> 85#include <linux/slab.h>
86#include <linux/flex_array.h>
86#ifdef CONFIG_HARDWALL 87#ifdef CONFIG_HARDWALL
87#include <asm/hardwall.h> 88#include <asm/hardwall.h>
88#endif 89#endif
@@ -134,6 +135,8 @@ struct pid_entry {
134 NULL, &proc_single_file_operations, \ 135 NULL, &proc_single_file_operations, \
135 { .proc_show = show } ) 136 { .proc_show = show } )
136 137
138static int proc_fd_permission(struct inode *inode, int mask);
139
137/* 140/*
138 * Count the number of hardlinks for the pid_entry table, excluding the . 141 * Count the number of hardlinks for the pid_entry table, excluding the .
139 * and .. links. 142 * and .. links.
@@ -2046,6 +2049,355 @@ static const struct file_operations proc_fd_operations = {
2046 .llseek = default_llseek, 2049 .llseek = default_llseek,
2047}; 2050};
2048 2051
2052#ifdef CONFIG_CHECKPOINT_RESTORE
2053
2054/*
2055 * dname_to_vma_addr - maps a dentry name into two unsigned longs
2056 * which represent vma start and end addresses.
2057 */
2058static int dname_to_vma_addr(struct dentry *dentry,
2059 unsigned long *start, unsigned long *end)
2060{
2061 if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2)
2062 return -EINVAL;
2063
2064 return 0;
2065}
2066
2067static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
2068{
2069 unsigned long vm_start, vm_end;
2070 bool exact_vma_exists = false;
2071 struct mm_struct *mm = NULL;
2072 struct task_struct *task;
2073 const struct cred *cred;
2074 struct inode *inode;
2075 int status = 0;
2076
2077 if (nd && nd->flags & LOOKUP_RCU)
2078 return -ECHILD;
2079
2080 if (!capable(CAP_SYS_ADMIN)) {
2081 status = -EACCES;
2082 goto out_notask;
2083 }
2084
2085 inode = dentry->d_inode;
2086 task = get_proc_task(inode);
2087 if (!task)
2088 goto out_notask;
2089
2090 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2091 goto out;
2092
2093 mm = get_task_mm(task);
2094 if (!mm)
2095 goto out;
2096
2097 if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
2098 down_read(&mm->mmap_sem);
2099 exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
2100 up_read(&mm->mmap_sem);
2101 }
2102
2103 mmput(mm);
2104
2105 if (exact_vma_exists) {
2106 if (task_dumpable(task)) {
2107 rcu_read_lock();
2108 cred = __task_cred(task);
2109 inode->i_uid = cred->euid;
2110 inode->i_gid = cred->egid;
2111 rcu_read_unlock();
2112 } else {
2113 inode->i_uid = 0;
2114 inode->i_gid = 0;
2115 }
2116 security_task_to_inode(task, inode);
2117 status = 1;
2118 }
2119
2120out:
2121 put_task_struct(task);
2122
2123out_notask:
2124 if (status <= 0)
2125 d_drop(dentry);
2126
2127 return status;
2128}
2129
2130static const struct dentry_operations tid_map_files_dentry_operations = {
2131 .d_revalidate = map_files_d_revalidate,
2132 .d_delete = pid_delete_dentry,
2133};
2134
2135static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
2136{
2137 unsigned long vm_start, vm_end;
2138 struct vm_area_struct *vma;
2139 struct task_struct *task;
2140 struct mm_struct *mm;
2141 int rc;
2142
2143 rc = -ENOENT;
2144 task = get_proc_task(dentry->d_inode);
2145 if (!task)
2146 goto out;
2147
2148 mm = get_task_mm(task);
2149 put_task_struct(task);
2150 if (!mm)
2151 goto out;
2152
2153 rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
2154 if (rc)
2155 goto out_mmput;
2156
2157 down_read(&mm->mmap_sem);
2158 vma = find_exact_vma(mm, vm_start, vm_end);
2159 if (vma && vma->vm_file) {
2160 *path = vma->vm_file->f_path;
2161 path_get(path);
2162 rc = 0;
2163 }
2164 up_read(&mm->mmap_sem);
2165
2166out_mmput:
2167 mmput(mm);
2168out:
2169 return rc;
2170}
2171
/*
 * One directory entry collected by proc_map_files_readdir() while
 * mmap_sem is held, to be emitted after the lock has been released.
 *
 * @file: file backing the vma; readdir holds a reference on it from
 *        get_file() until the matching fput().
 * @len:  length of @name as returned by snprintf(), excluding the NUL.
 * @name: entry name in "%lx-%lx" form.  Declared as plain char because
 *        it is only ever used as a text buffer (snprintf() destination
 *        and name argument of proc_fill_cache()).
 */
struct map_files_info {
	struct file	*file;
	unsigned long	len;
	char		name[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
};
2177
2178static struct dentry *
2179proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
2180 struct task_struct *task, const void *ptr)
2181{
2182 const struct file *file = ptr;
2183 struct proc_inode *ei;
2184 struct inode *inode;
2185
2186 if (!file)
2187 return ERR_PTR(-ENOENT);
2188
2189 inode = proc_pid_make_inode(dir->i_sb, task);
2190 if (!inode)
2191 return ERR_PTR(-ENOENT);
2192
2193 ei = PROC_I(inode);
2194 ei->op.proc_get_link = proc_map_files_get_link;
2195
2196 inode->i_op = &proc_pid_link_inode_operations;
2197 inode->i_size = 64;
2198 inode->i_mode = S_IFLNK;
2199
2200 if (file->f_mode & FMODE_READ)
2201 inode->i_mode |= S_IRUSR;
2202 if (file->f_mode & FMODE_WRITE)
2203 inode->i_mode |= S_IWUSR;
2204
2205 d_set_d_op(dentry, &tid_map_files_dentry_operations);
2206 d_add(dentry, inode);
2207
2208 return NULL;
2209}
2210
2211static struct dentry *proc_map_files_lookup(struct inode *dir,
2212 struct dentry *dentry, struct nameidata *nd)
2213{
2214 unsigned long vm_start, vm_end;
2215 struct vm_area_struct *vma;
2216 struct task_struct *task;
2217 struct dentry *result;
2218 struct mm_struct *mm;
2219
2220 result = ERR_PTR(-EACCES);
2221 if (!capable(CAP_SYS_ADMIN))
2222 goto out;
2223
2224 result = ERR_PTR(-ENOENT);
2225 task = get_proc_task(dir);
2226 if (!task)
2227 goto out;
2228
2229 result = ERR_PTR(-EACCES);
2230 if (lock_trace(task))
2231 goto out_put_task;
2232
2233 result = ERR_PTR(-ENOENT);
2234 if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
2235 goto out_unlock;
2236
2237 mm = get_task_mm(task);
2238 if (!mm)
2239 goto out_unlock;
2240
2241 down_read(&mm->mmap_sem);
2242 vma = find_exact_vma(mm, vm_start, vm_end);
2243 if (!vma)
2244 goto out_no_vma;
2245
2246 result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
2247
2248out_no_vma:
2249 up_read(&mm->mmap_sem);
2250 mmput(mm);
2251out_unlock:
2252 unlock_trace(task);
2253out_put_task:
2254 put_task_struct(task);
2255out:
2256 return result;
2257}
2258
2259static const struct inode_operations proc_map_files_inode_operations = {
2260 .lookup = proc_map_files_lookup,
2261 .permission = proc_fd_permission,
2262 .setattr = proc_setattr,
2263};
2264
2265static int
2266proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
2267{
2268 struct dentry *dentry = filp->f_path.dentry;
2269 struct inode *inode = dentry->d_inode;
2270 struct vm_area_struct *vma;
2271 struct task_struct *task;
2272 struct mm_struct *mm;
2273 ino_t ino;
2274 int ret;
2275
2276 ret = -EACCES;
2277 if (!capable(CAP_SYS_ADMIN))
2278 goto out;
2279
2280 ret = -ENOENT;
2281 task = get_proc_task(inode);
2282 if (!task)
2283 goto out;
2284
2285 ret = -EACCES;
2286 if (lock_trace(task))
2287 goto out_put_task;
2288
2289 ret = 0;
2290 switch (filp->f_pos) {
2291 case 0:
2292 ino = inode->i_ino;
2293 if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
2294 goto out_unlock;
2295 filp->f_pos++;
2296 case 1:
2297 ino = parent_ino(dentry);
2298 if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
2299 goto out_unlock;
2300 filp->f_pos++;
2301 default:
2302 {
2303 unsigned long nr_files, pos, i;
2304 struct flex_array *fa = NULL;
2305 struct map_files_info info;
2306 struct map_files_info *p;
2307
2308 mm = get_task_mm(task);
2309 if (!mm)
2310 goto out_unlock;
2311 down_read(&mm->mmap_sem);
2312
2313 nr_files = 0;
2314
2315 /*
2316 * We need two passes here:
2317 *
2318 * 1) Collect vmas of mapped files with mmap_sem taken
2319 * 2) Release mmap_sem and instantiate entries
2320 *
2321 * otherwise we get lockdep complained, since filldir()
2322 * routine might require mmap_sem taken in might_fault().
2323 */
2324
2325 for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
2326 if (vma->vm_file && ++pos > filp->f_pos)
2327 nr_files++;
2328 }
2329
2330 if (nr_files) {
2331 fa = flex_array_alloc(sizeof(info), nr_files,
2332 GFP_KERNEL);
2333 if (!fa || flex_array_prealloc(fa, 0, nr_files,
2334 GFP_KERNEL)) {
2335 ret = -ENOMEM;
2336 if (fa)
2337 flex_array_free(fa);
2338 up_read(&mm->mmap_sem);
2339 mmput(mm);
2340 goto out_unlock;
2341 }
2342 for (i = 0, vma = mm->mmap, pos = 2; vma;
2343 vma = vma->vm_next) {
2344 if (!vma->vm_file)
2345 continue;
2346 if (++pos <= filp->f_pos)
2347 continue;
2348
2349 get_file(vma->vm_file);
2350 info.file = vma->vm_file;
2351 info.len = snprintf(info.name,
2352 sizeof(info.name), "%lx-%lx",
2353 vma->vm_start, vma->vm_end);
2354 if (flex_array_put(fa, i++, &info, GFP_KERNEL))
2355 BUG();
2356 }
2357 }
2358 up_read(&mm->mmap_sem);
2359
2360 for (i = 0; i < nr_files; i++) {
2361 p = flex_array_get(fa, i);
2362 ret = proc_fill_cache(filp, dirent, filldir,
2363 p->name, p->len,
2364 proc_map_files_instantiate,
2365 task, p->file);
2366 if (ret)
2367 break;
2368 filp->f_pos++;
2369 fput(p->file);
2370 }
2371 for (; i < nr_files; i++) {
2372 /*
2373 * In case of error don't forget
2374 * to put rest of file refs.
2375 */
2376 p = flex_array_get(fa, i);
2377 fput(p->file);
2378 }
2379 if (fa)
2380 flex_array_free(fa);
2381 mmput(mm);
2382 }
2383 }
2384
2385out_unlock:
2386 unlock_trace(task);
2387out_put_task:
2388 put_task_struct(task);
2389out:
2390 return ret;
2391}
2392
2393static const struct file_operations proc_map_files_operations = {
2394 .read = generic_read_dir,
2395 .readdir = proc_map_files_readdir,
2396 .llseek = default_llseek,
2397};
2398
2399#endif /* CONFIG_CHECKPOINT_RESTORE */
2400
2049/* 2401/*
2050 * /proc/pid/fd needs a special permission handler so that a process can still 2402 * /proc/pid/fd needs a special permission handler so that a process can still
2051 * access /proc/self/fd after it has executed a setuid(). 2403 * access /proc/self/fd after it has executed a setuid().
@@ -2661,6 +3013,9 @@ static const struct inode_operations proc_task_inode_operations;
2661static const struct pid_entry tgid_base_stuff[] = { 3013static const struct pid_entry tgid_base_stuff[] = {
2662 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 3014 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2663 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 3015 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
3016#ifdef CONFIG_CHECKPOINT_RESTORE
3017 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
3018#endif
2664 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 3019 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2665 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 3020 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2666#ifdef CONFIG_NET 3021#ifdef CONFIG_NET