aboutsummaryrefslogtreecommitdiffstats
path: root/fs/proc
diff options
context:
space:
mode:
Diffstat (limited to 'fs/proc')
-rw-r--r--fs/proc/array.c21
-rw-r--r--fs/proc/base.c132
-rw-r--r--fs/proc/generic.c44
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/page.c65
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c356
7 files changed, 358 insertions, 269 deletions
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..eed2050db9be 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header,
308static inline void task_cap(struct seq_file *m, struct task_struct *p) 308static inline void task_cap(struct seq_file *m, struct task_struct *p)
309{ 309{
310 const struct cred *cred; 310 const struct cred *cred;
311 kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; 311 kernel_cap_t cap_inheritable, cap_permitted, cap_effective,
312 cap_bset, cap_ambient;
312 313
313 rcu_read_lock(); 314 rcu_read_lock();
314 cred = __task_cred(p); 315 cred = __task_cred(p);
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p)
316 cap_permitted = cred->cap_permitted; 317 cap_permitted = cred->cap_permitted;
317 cap_effective = cred->cap_effective; 318 cap_effective = cred->cap_effective;
318 cap_bset = cred->cap_bset; 319 cap_bset = cred->cap_bset;
320 cap_ambient = cred->cap_ambient;
319 rcu_read_unlock(); 321 rcu_read_unlock();
320 322
321 render_cap_t(m, "CapInh:\t", &cap_inheritable); 323 render_cap_t(m, "CapInh:\t", &cap_inheritable);
322 render_cap_t(m, "CapPrm:\t", &cap_permitted); 324 render_cap_t(m, "CapPrm:\t", &cap_permitted);
323 render_cap_t(m, "CapEff:\t", &cap_effective); 325 render_cap_t(m, "CapEff:\t", &cap_effective);
324 render_cap_t(m, "CapBnd:\t", &cap_bset); 326 render_cap_t(m, "CapBnd:\t", &cap_bset);
327 render_cap_t(m, "CapAmb:\t", &cap_ambient);
325} 328}
326 329
327static inline void task_seccomp(struct seq_file *m, struct task_struct *p) 330static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
@@ -372,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
372static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, 375static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
373 struct pid *pid, struct task_struct *task, int whole) 376 struct pid *pid, struct task_struct *task, int whole)
374{ 377{
375 unsigned long vsize, eip, esp, wchan = ~0UL; 378 unsigned long vsize, eip, esp, wchan = 0;
376 int priority, nice; 379 int priority, nice;
377 int tty_pgrp = -1, tty_nr = 0; 380 int tty_pgrp = -1, tty_nr = 0;
378 sigset_t sigign, sigcatch; 381 sigset_t sigign, sigcatch;
@@ -504,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
504 seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); 507 seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
505 seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); 508 seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
506 seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); 509 seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
507 seq_put_decimal_ull(m, ' ', wchan); 510
511 /*
512 * We used to output the absolute kernel address, but that's an
513 * information leak - so instead we show a 0/1 flag here, to signal
514 * to user-space whether there's a wchan field in /proc/PID/wchan.
515 *
516 * This works with older implementations of procps as well.
517 */
518 if (wchan)
519 seq_puts(m, " 1");
520 else
521 seq_puts(m, " 0");
522
508 seq_put_decimal_ull(m, ' ', 0); 523 seq_put_decimal_ull(m, ' ', 0);
509 seq_put_decimal_ull(m, ' ', 0); 524 seq_put_decimal_ull(m, ' ', 0);
510 seq_put_decimal_ll(m, ' ', task->exit_signal); 525 seq_put_decimal_ll(m, ' ', task->exit_signal);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1ac28fc..bd3e9e68125b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
430 430
431 wchan = get_wchan(task); 431 wchan = get_wchan(task);
432 432
433 if (lookup_symbol_name(wchan, symname) < 0) { 433 if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
434 if (!ptrace_may_access(task, PTRACE_MODE_READ))
435 return 0;
436 seq_printf(m, "%lu", wchan);
437 } else {
438 seq_printf(m, "%s", symname); 434 seq_printf(m, "%s", symname);
439 } 435 else
436 seq_putc(m, '0');
440 437
441 return 0; 438 return 0;
442} 439}
@@ -1035,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
1035 return simple_read_from_buffer(buf, count, ppos, buffer, len); 1032 return simple_read_from_buffer(buf, count, ppos, buffer, len);
1036} 1033}
1037 1034
1035/*
1036 * /proc/pid/oom_adj exists solely for backwards compatibility with previous
1037 * kernels. The effective policy is defined by oom_score_adj, which has a
1038 * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
1039 * Values written to oom_adj are simply mapped linearly to oom_score_adj.
1040 * Processes that become oom disabled via oom_adj will still be oom disabled
1041 * with this implementation.
1042 *
1043 * oom_adj cannot be removed since existing userspace binaries use it.
1044 */
1038static ssize_t oom_adj_write(struct file *file, const char __user *buf, 1045static ssize_t oom_adj_write(struct file *file, const char __user *buf,
1039 size_t count, loff_t *ppos) 1046 size_t count, loff_t *ppos)
1040{ 1047{
@@ -1230,10 +1237,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1230 size_t count, loff_t *ppos) 1237 size_t count, loff_t *ppos)
1231{ 1238{
1232 struct inode * inode = file_inode(file); 1239 struct inode * inode = file_inode(file);
1233 char *page, *tmp;
1234 ssize_t length;
1235 uid_t loginuid; 1240 uid_t loginuid;
1236 kuid_t kloginuid; 1241 kuid_t kloginuid;
1242 int rv;
1237 1243
1238 rcu_read_lock(); 1244 rcu_read_lock();
1239 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { 1245 if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) {
@@ -1242,46 +1248,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1242 } 1248 }
1243 rcu_read_unlock(); 1249 rcu_read_unlock();
1244 1250
1245 if (count >= PAGE_SIZE)
1246 count = PAGE_SIZE - 1;
1247
1248 if (*ppos != 0) { 1251 if (*ppos != 0) {
1249 /* No partial writes. */ 1252 /* No partial writes. */
1250 return -EINVAL; 1253 return -EINVAL;
1251 } 1254 }
1252 page = (char*)__get_free_page(GFP_TEMPORARY);
1253 if (!page)
1254 return -ENOMEM;
1255 length = -EFAULT;
1256 if (copy_from_user(page, buf, count))
1257 goto out_free_page;
1258 1255
1259 page[count] = '\0'; 1256 rv = kstrtou32_from_user(buf, count, 10, &loginuid);
1260 loginuid = simple_strtoul(page, &tmp, 10); 1257 if (rv < 0)
1261 if (tmp == page) { 1258 return rv;
1262 length = -EINVAL;
1263 goto out_free_page;
1264
1265 }
1266 1259
1267 /* is userspace trying to explicitly UNSET the loginuid? */ 1260 /* is userspace trying to explicitly UNSET the loginuid? */
1268 if (loginuid == AUDIT_UID_UNSET) { 1261 if (loginuid == AUDIT_UID_UNSET) {
1269 kloginuid = INVALID_UID; 1262 kloginuid = INVALID_UID;
1270 } else { 1263 } else {
1271 kloginuid = make_kuid(file->f_cred->user_ns, loginuid); 1264 kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1272 if (!uid_valid(kloginuid)) { 1265 if (!uid_valid(kloginuid))
1273 length = -EINVAL; 1266 return -EINVAL;
1274 goto out_free_page;
1275 }
1276 } 1267 }
1277 1268
1278 length = audit_set_loginuid(kloginuid); 1269 rv = audit_set_loginuid(kloginuid);
1279 if (likely(length == 0)) 1270 if (rv < 0)
1280 length = count; 1271 return rv;
1281 1272 return count;
1282out_free_page:
1283 free_page((unsigned long) page);
1284 return length;
1285} 1273}
1286 1274
1287static const struct file_operations proc_loginuid_operations = { 1275static const struct file_operations proc_loginuid_operations = {
@@ -1335,8 +1323,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
1335 const char __user * buf, size_t count, loff_t *ppos) 1323 const char __user * buf, size_t count, loff_t *ppos)
1336{ 1324{
1337 struct task_struct *task; 1325 struct task_struct *task;
1338 char buffer[PROC_NUMBUF], *end; 1326 char buffer[PROC_NUMBUF];
1339 int make_it_fail; 1327 int make_it_fail;
1328 int rv;
1340 1329
1341 if (!capable(CAP_SYS_RESOURCE)) 1330 if (!capable(CAP_SYS_RESOURCE))
1342 return -EPERM; 1331 return -EPERM;
@@ -1345,9 +1334,9 @@ static ssize_t proc_fault_inject_write(struct file * file,
1345 count = sizeof(buffer) - 1; 1334 count = sizeof(buffer) - 1;
1346 if (copy_from_user(buffer, buf, count)) 1335 if (copy_from_user(buffer, buf, count))
1347 return -EFAULT; 1336 return -EFAULT;
1348 make_it_fail = simple_strtol(strstrip(buffer), &end, 0); 1337 rv = kstrtoint(strstrip(buffer), 0, &make_it_fail);
1349 if (*end) 1338 if (rv < 0)
1350 return -EINVAL; 1339 return rv;
1351 if (make_it_fail < 0 || make_it_fail > 1) 1340 if (make_it_fail < 0 || make_it_fail > 1)
1352 return -EINVAL; 1341 return -EINVAL;
1353 1342
@@ -1836,8 +1825,6 @@ end_instantiate:
1836 return dir_emit(ctx, name, len, 1, DT_UNKNOWN); 1825 return dir_emit(ctx, name, len, 1, DT_UNKNOWN);
1837} 1826}
1838 1827
1839#ifdef CONFIG_CHECKPOINT_RESTORE
1840
1841/* 1828/*
1842 * dname_to_vma_addr - maps a dentry name into two unsigned longs 1829 * dname_to_vma_addr - maps a dentry name into two unsigned longs
1843 * which represent vma start and end addresses. 1830 * which represent vma start and end addresses.
@@ -1864,11 +1851,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags)
1864 if (flags & LOOKUP_RCU) 1851 if (flags & LOOKUP_RCU)
1865 return -ECHILD; 1852 return -ECHILD;
1866 1853
1867 if (!capable(CAP_SYS_ADMIN)) {
1868 status = -EPERM;
1869 goto out_notask;
1870 }
1871
1872 inode = d_inode(dentry); 1854 inode = d_inode(dentry);
1873 task = get_proc_task(inode); 1855 task = get_proc_task(inode);
1874 if (!task) 1856 if (!task)
@@ -1957,6 +1939,29 @@ struct map_files_info {
1957 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ 1939 unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
1958}; 1940};
1959 1941
1942/*
1943 * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the
1944 * symlinks may be used to bypass permissions on ancestor directories in the
1945 * path to the file in question.
1946 */
1947static const char *
1948proc_map_files_follow_link(struct dentry *dentry, void **cookie)
1949{
1950 if (!capable(CAP_SYS_ADMIN))
1951 return ERR_PTR(-EPERM);
1952
1953 return proc_pid_follow_link(dentry, NULL);
1954}
1955
1956/*
1957 * Identical to proc_pid_link_inode_operations except for follow_link()
1958 */
1959static const struct inode_operations proc_map_files_link_inode_operations = {
1960 .readlink = proc_pid_readlink,
1961 .follow_link = proc_map_files_follow_link,
1962 .setattr = proc_setattr,
1963};
1964
1960static int 1965static int
1961proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, 1966proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1962 struct task_struct *task, const void *ptr) 1967 struct task_struct *task, const void *ptr)
@@ -1972,7 +1977,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
1972 ei = PROC_I(inode); 1977 ei = PROC_I(inode);
1973 ei->op.proc_get_link = proc_map_files_get_link; 1978 ei->op.proc_get_link = proc_map_files_get_link;
1974 1979
1975 inode->i_op = &proc_pid_link_inode_operations; 1980 inode->i_op = &proc_map_files_link_inode_operations;
1976 inode->i_size = 64; 1981 inode->i_size = 64;
1977 inode->i_mode = S_IFLNK; 1982 inode->i_mode = S_IFLNK;
1978 1983
@@ -1996,10 +2001,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
1996 int result; 2001 int result;
1997 struct mm_struct *mm; 2002 struct mm_struct *mm;
1998 2003
1999 result = -EPERM;
2000 if (!capable(CAP_SYS_ADMIN))
2001 goto out;
2002
2003 result = -ENOENT; 2004 result = -ENOENT;
2004 task = get_proc_task(dir); 2005 task = get_proc_task(dir);
2005 if (!task) 2006 if (!task)
@@ -2053,10 +2054,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
2053 struct map_files_info *p; 2054 struct map_files_info *p;
2054 int ret; 2055 int ret;
2055 2056
2056 ret = -EPERM;
2057 if (!capable(CAP_SYS_ADMIN))
2058 goto out;
2059
2060 ret = -ENOENT; 2057 ret = -ENOENT;
2061 task = get_proc_task(file_inode(file)); 2058 task = get_proc_task(file_inode(file));
2062 if (!task) 2059 if (!task)
@@ -2245,7 +2242,6 @@ static const struct file_operations proc_timers_operations = {
2245 .llseek = seq_lseek, 2242 .llseek = seq_lseek,
2246 .release = seq_release_private, 2243 .release = seq_release_private,
2247}; 2244};
2248#endif /* CONFIG_CHECKPOINT_RESTORE */
2249 2245
2250static int proc_pident_instantiate(struct inode *dir, 2246static int proc_pident_instantiate(struct inode *dir,
2251 struct dentry *dentry, struct task_struct *task, const void *ptr) 2247 struct dentry *dentry, struct task_struct *task, const void *ptr)
@@ -2481,32 +2477,20 @@ static ssize_t proc_coredump_filter_write(struct file *file,
2481{ 2477{
2482 struct task_struct *task; 2478 struct task_struct *task;
2483 struct mm_struct *mm; 2479 struct mm_struct *mm;
2484 char buffer[PROC_NUMBUF], *end;
2485 unsigned int val; 2480 unsigned int val;
2486 int ret; 2481 int ret;
2487 int i; 2482 int i;
2488 unsigned long mask; 2483 unsigned long mask;
2489 2484
2490 ret = -EFAULT; 2485 ret = kstrtouint_from_user(buf, count, 0, &val);
2491 memset(buffer, 0, sizeof(buffer)); 2486 if (ret < 0)
2492 if (count > sizeof(buffer) - 1) 2487 return ret;
2493 count = sizeof(buffer) - 1;
2494 if (copy_from_user(buffer, buf, count))
2495 goto out_no_task;
2496
2497 ret = -EINVAL;
2498 val = (unsigned int)simple_strtoul(buffer, &end, 0);
2499 if (*end == '\n')
2500 end++;
2501 if (end - buffer == 0)
2502 goto out_no_task;
2503 2488
2504 ret = -ESRCH; 2489 ret = -ESRCH;
2505 task = get_proc_task(file_inode(file)); 2490 task = get_proc_task(file_inode(file));
2506 if (!task) 2491 if (!task)
2507 goto out_no_task; 2492 goto out_no_task;
2508 2493
2509 ret = end - buffer;
2510 mm = get_task_mm(task); 2494 mm = get_task_mm(task);
2511 if (!mm) 2495 if (!mm)
2512 goto out_no_mm; 2496 goto out_no_mm;
@@ -2522,7 +2506,9 @@ static ssize_t proc_coredump_filter_write(struct file *file,
2522 out_no_mm: 2506 out_no_mm:
2523 put_task_struct(task); 2507 put_task_struct(task);
2524 out_no_task: 2508 out_no_task:
2525 return ret; 2509 if (ret < 0)
2510 return ret;
2511 return count;
2526} 2512}
2527 2513
2528static const struct file_operations proc_coredump_filter_operations = { 2514static const struct file_operations proc_coredump_filter_operations = {
@@ -2744,9 +2730,7 @@ static const struct inode_operations proc_task_inode_operations;
2744static const struct pid_entry tgid_base_stuff[] = { 2730static const struct pid_entry tgid_base_stuff[] = {
2745 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), 2731 DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2746 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), 2732 DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2747#ifdef CONFIG_CHECKPOINT_RESTORE
2748 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), 2733 DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
2749#endif
2750 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), 2734 DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2751 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), 2735 DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
2752#ifdef CONFIG_NET 2736#ifdef CONFIG_NET
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e5dee5c3188e..ff3ffc76a937 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -26,7 +26,7 @@
26 26
27#include "internal.h" 27#include "internal.h"
28 28
29static DEFINE_SPINLOCK(proc_subdir_lock); 29static DEFINE_RWLOCK(proc_subdir_lock);
30 30
31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) 31static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de)
32{ 32{
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret,
172{ 172{
173 int rv; 173 int rv;
174 174
175 spin_lock(&proc_subdir_lock); 175 read_lock(&proc_subdir_lock);
176 rv = __xlate_proc_name(name, ret, residual); 176 rv = __xlate_proc_name(name, ret, residual);
177 spin_unlock(&proc_subdir_lock); 177 read_unlock(&proc_subdir_lock);
178 return rv; 178 return rv;
179} 179}
180 180
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
231{ 231{
232 struct inode *inode; 232 struct inode *inode;
233 233
234 spin_lock(&proc_subdir_lock); 234 read_lock(&proc_subdir_lock);
235 de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); 235 de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
236 if (de) { 236 if (de) {
237 pde_get(de); 237 pde_get(de);
238 spin_unlock(&proc_subdir_lock); 238 read_unlock(&proc_subdir_lock);
239 inode = proc_get_inode(dir->i_sb, de); 239 inode = proc_get_inode(dir->i_sb, de);
240 if (!inode) 240 if (!inode)
241 return ERR_PTR(-ENOMEM); 241 return ERR_PTR(-ENOMEM);
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
243 d_add(dentry, inode); 243 d_add(dentry, inode);
244 return NULL; 244 return NULL;
245 } 245 }
246 spin_unlock(&proc_subdir_lock); 246 read_unlock(&proc_subdir_lock);
247 return ERR_PTR(-ENOENT); 247 return ERR_PTR(-ENOENT);
248} 248}
249 249
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
270 if (!dir_emit_dots(file, ctx)) 270 if (!dir_emit_dots(file, ctx))
271 return 0; 271 return 0;
272 272
273 spin_lock(&proc_subdir_lock); 273 read_lock(&proc_subdir_lock);
274 de = pde_subdir_first(de); 274 de = pde_subdir_first(de);
275 i = ctx->pos - 2; 275 i = ctx->pos - 2;
276 for (;;) { 276 for (;;) {
277 if (!de) { 277 if (!de) {
278 spin_unlock(&proc_subdir_lock); 278 read_unlock(&proc_subdir_lock);
279 return 0; 279 return 0;
280 } 280 }
281 if (!i) 281 if (!i)
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file,
287 do { 287 do {
288 struct proc_dir_entry *next; 288 struct proc_dir_entry *next;
289 pde_get(de); 289 pde_get(de);
290 spin_unlock(&proc_subdir_lock); 290 read_unlock(&proc_subdir_lock);
291 if (!dir_emit(ctx, de->name, de->namelen, 291 if (!dir_emit(ctx, de->name, de->namelen,
292 de->low_ino, de->mode >> 12)) { 292 de->low_ino, de->mode >> 12)) {
293 pde_put(de); 293 pde_put(de);
294 return 0; 294 return 0;
295 } 295 }
296 spin_lock(&proc_subdir_lock); 296 read_lock(&proc_subdir_lock);
297 ctx->pos++; 297 ctx->pos++;
298 next = pde_subdir_next(de); 298 next = pde_subdir_next(de);
299 pde_put(de); 299 pde_put(de);
300 de = next; 300 de = next;
301 } while (de); 301 } while (de);
302 spin_unlock(&proc_subdir_lock); 302 read_unlock(&proc_subdir_lock);
303 return 1; 303 return 1;
304} 304}
305 305
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
338 if (ret) 338 if (ret)
339 return ret; 339 return ret;
340 340
341 spin_lock(&proc_subdir_lock); 341 write_lock(&proc_subdir_lock);
342 dp->parent = dir; 342 dp->parent = dir;
343 if (pde_subdir_insert(dir, dp) == false) { 343 if (pde_subdir_insert(dir, dp) == false) {
344 WARN(1, "proc_dir_entry '%s/%s' already registered\n", 344 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
345 dir->name, dp->name); 345 dir->name, dp->name);
346 spin_unlock(&proc_subdir_lock); 346 write_unlock(&proc_subdir_lock);
347 proc_free_inum(dp->low_ino); 347 proc_free_inum(dp->low_ino);
348 return -EEXIST; 348 return -EEXIST;
349 } 349 }
350 spin_unlock(&proc_subdir_lock); 350 write_unlock(&proc_subdir_lock);
351 351
352 return 0; 352 return 0;
353} 353}
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
549 const char *fn = name; 549 const char *fn = name;
550 unsigned int len; 550 unsigned int len;
551 551
552 spin_lock(&proc_subdir_lock); 552 write_lock(&proc_subdir_lock);
553 if (__xlate_proc_name(name, &parent, &fn) != 0) { 553 if (__xlate_proc_name(name, &parent, &fn) != 0) {
554 spin_unlock(&proc_subdir_lock); 554 write_unlock(&proc_subdir_lock);
555 return; 555 return;
556 } 556 }
557 len = strlen(fn); 557 len = strlen(fn);
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
559 de = pde_subdir_find(parent, fn, len); 559 de = pde_subdir_find(parent, fn, len);
560 if (de) 560 if (de)
561 rb_erase(&de->subdir_node, &parent->subdir); 561 rb_erase(&de->subdir_node, &parent->subdir);
562 spin_unlock(&proc_subdir_lock); 562 write_unlock(&proc_subdir_lock);
563 if (!de) { 563 if (!de) {
564 WARN(1, "name '%s'\n", name); 564 WARN(1, "name '%s'\n", name);
565 return; 565 return;
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
583 const char *fn = name; 583 const char *fn = name;
584 unsigned int len; 584 unsigned int len;
585 585
586 spin_lock(&proc_subdir_lock); 586 write_lock(&proc_subdir_lock);
587 if (__xlate_proc_name(name, &parent, &fn) != 0) { 587 if (__xlate_proc_name(name, &parent, &fn) != 0) {
588 spin_unlock(&proc_subdir_lock); 588 write_unlock(&proc_subdir_lock);
589 return -ENOENT; 589 return -ENOENT;
590 } 590 }
591 len = strlen(fn); 591 len = strlen(fn);
592 592
593 root = pde_subdir_find(parent, fn, len); 593 root = pde_subdir_find(parent, fn, len);
594 if (!root) { 594 if (!root) {
595 spin_unlock(&proc_subdir_lock); 595 write_unlock(&proc_subdir_lock);
596 return -ENOENT; 596 return -ENOENT;
597 } 597 }
598 rb_erase(&root->subdir_node, &parent->subdir); 598 rb_erase(&root->subdir_node, &parent->subdir);
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
605 de = next; 605 de = next;
606 continue; 606 continue;
607 } 607 }
608 spin_unlock(&proc_subdir_lock); 608 write_unlock(&proc_subdir_lock);
609 609
610 proc_entry_rundown(de); 610 proc_entry_rundown(de);
611 next = de->parent; 611 next = de->parent;
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent)
616 break; 616 break;
617 pde_put(de); 617 pde_put(de);
618 618
619 spin_lock(&proc_subdir_lock); 619 write_lock(&proc_subdir_lock);
620 de = next; 620 de = next;
621 } 621 }
622 pde_put(root); 622 pde_put(root);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d3ebf2e61853..9155a5a0d3b9 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
27{ 27{
28 struct sysinfo i; 28 struct sysinfo i;
29 unsigned long committed; 29 unsigned long committed;
30 struct vmalloc_info vmi;
31 long cached; 30 long cached;
32 long available; 31 long available;
33 unsigned long pagecache; 32 unsigned long pagecache;
@@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
49 if (cached < 0) 48 if (cached < 0)
50 cached = 0; 49 cached = 0;
51 50
52 get_vmalloc_info(&vmi);
53
54 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) 51 for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
55 pages[lru] = global_page_state(NR_LRU_BASE + lru); 52 pages[lru] = global_page_state(NR_LRU_BASE + lru);
56 53
@@ -191,8 +188,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
191 K(vm_commit_limit()), 188 K(vm_commit_limit()),
192 K(committed), 189 K(committed),
193 (unsigned long)VMALLOC_TOTAL >> 10, 190 (unsigned long)VMALLOC_TOTAL >> 10,
194 vmi.used >> 10, 191 0ul, // used to be vmalloc 'used'
195 vmi.largest_chunk >> 10 192 0ul // used to be vmalloc 'largest_chunk'
196#ifdef CONFIG_MEMORY_FAILURE 193#ifdef CONFIG_MEMORY_FAILURE
197 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) 194 , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
198#endif 195#endif
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8b97d9..93484034a03d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -9,12 +9,16 @@
9#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
10#include <linux/seq_file.h> 10#include <linux/seq_file.h>
11#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
12#include <linux/memcontrol.h>
13#include <linux/mmu_notifier.h>
14#include <linux/page_idle.h>
12#include <linux/kernel-page-flags.h> 15#include <linux/kernel-page-flags.h>
13#include <asm/uaccess.h> 16#include <asm/uaccess.h>
14#include "internal.h" 17#include "internal.h"
15 18
16#define KPMSIZE sizeof(u64) 19#define KPMSIZE sizeof(u64)
17#define KPMMASK (KPMSIZE - 1) 20#define KPMMASK (KPMSIZE - 1)
21#define KPMBITS (KPMSIZE * BITS_PER_BYTE)
18 22
19/* /proc/kpagecount - an array exposing page counts 23/* /proc/kpagecount - an array exposing page counts
20 * 24 *
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf,
54 pfn++; 58 pfn++;
55 out++; 59 out++;
56 count -= KPMSIZE; 60 count -= KPMSIZE;
61
62 cond_resched();
57 } 63 }
58 64
59 *ppos += (char __user *)out - buf; 65 *ppos += (char __user *)out - buf;
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page)
146 if (PageBalloon(page)) 152 if (PageBalloon(page))
147 u |= 1 << KPF_BALLOON; 153 u |= 1 << KPF_BALLOON;
148 154
155 if (page_is_idle(page))
156 u |= 1 << KPF_IDLE;
157
149 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); 158 u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked);
150 159
151 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); 160 u |= kpf_copy_bit(k, KPF_SLAB, PG_slab);
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf,
212 pfn++; 221 pfn++;
213 out++; 222 out++;
214 count -= KPMSIZE; 223 count -= KPMSIZE;
224
225 cond_resched();
215 } 226 }
216 227
217 *ppos += (char __user *)out - buf; 228 *ppos += (char __user *)out - buf;
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = {
225 .read = kpageflags_read, 236 .read = kpageflags_read,
226}; 237};
227 238
239#ifdef CONFIG_MEMCG
240static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
241 size_t count, loff_t *ppos)
242{
243 u64 __user *out = (u64 __user *)buf;
244 struct page *ppage;
245 unsigned long src = *ppos;
246 unsigned long pfn;
247 ssize_t ret = 0;
248 u64 ino;
249
250 pfn = src / KPMSIZE;
251 count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
252 if (src & KPMMASK || count & KPMMASK)
253 return -EINVAL;
254
255 while (count > 0) {
256 if (pfn_valid(pfn))
257 ppage = pfn_to_page(pfn);
258 else
259 ppage = NULL;
260
261 if (ppage)
262 ino = page_cgroup_ino(ppage);
263 else
264 ino = 0;
265
266 if (put_user(ino, out)) {
267 ret = -EFAULT;
268 break;
269 }
270
271 pfn++;
272 out++;
273 count -= KPMSIZE;
274
275 cond_resched();
276 }
277
278 *ppos += (char __user *)out - buf;
279 if (!ret)
280 ret = (char __user *)out - buf;
281 return ret;
282}
283
284static const struct file_operations proc_kpagecgroup_operations = {
285 .llseek = mem_lseek,
286 .read = kpagecgroup_read,
287};
288#endif /* CONFIG_MEMCG */
289
228static int __init proc_page_init(void) 290static int __init proc_page_init(void)
229{ 291{
230 proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); 292 proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
231 proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); 293 proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
294#ifdef CONFIG_MEMCG
295 proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
296#endif
232 return 0; 297 return 0;
233} 298}
234fs_initcall(proc_page_init); 299fs_initcall(proc_page_init);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 68feb0f70e63..361ab4ee42fc 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
134 } 134 }
135 135
136 sb->s_flags |= MS_ACTIVE; 136 sb->s_flags |= MS_ACTIVE;
137 /* User space would break if executables appear on proc */
138 sb->s_iflags |= SB_I_NOEXEC;
137 } 139 }
138 140
139 return dget(sb->s_root); 141 return dget(sb->s_root);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..187b3b5f242e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
13#include <linux/swap.h> 13#include <linux/swap.h>
14#include <linux/swapops.h> 14#include <linux/swapops.h>
15#include <linux/mmu_notifier.h> 15#include <linux/mmu_notifier.h>
16#include <linux/page_idle.h>
16 17
17#include <asm/elf.h> 18#include <asm/elf.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
@@ -69,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
69 ptes >> 10, 70 ptes >> 10,
70 pmds >> 10, 71 pmds >> 10,
71 swap << (PAGE_SHIFT-10)); 72 swap << (PAGE_SHIFT-10));
73 hugetlb_report_usage(m, mm);
72} 74}
73 75
74unsigned long task_vsize(struct mm_struct *mm) 76unsigned long task_vsize(struct mm_struct *mm)
@@ -445,7 +447,10 @@ struct mem_size_stats {
445 unsigned long anonymous; 447 unsigned long anonymous;
446 unsigned long anonymous_thp; 448 unsigned long anonymous_thp;
447 unsigned long swap; 449 unsigned long swap;
450 unsigned long shared_hugetlb;
451 unsigned long private_hugetlb;
448 u64 pss; 452 u64 pss;
453 u64 swap_pss;
449}; 454};
450 455
451static void smaps_account(struct mem_size_stats *mss, struct page *page, 456static void smaps_account(struct mem_size_stats *mss, struct page *page,
@@ -458,7 +463,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
458 463
459 mss->resident += size; 464 mss->resident += size;
460 /* Accumulate the size in pages that have been accessed. */ 465 /* Accumulate the size in pages that have been accessed. */
461 if (young || PageReferenced(page)) 466 if (young || page_is_young(page) || PageReferenced(page))
462 mss->referenced += size; 467 mss->referenced += size;
463 mapcount = page_mapcount(page); 468 mapcount = page_mapcount(page);
464 if (mapcount >= 2) { 469 if (mapcount >= 2) {
@@ -492,9 +497,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
492 } else if (is_swap_pte(*pte)) { 497 } else if (is_swap_pte(*pte)) {
493 swp_entry_t swpent = pte_to_swp_entry(*pte); 498 swp_entry_t swpent = pte_to_swp_entry(*pte);
494 499
495 if (!non_swap_entry(swpent)) 500 if (!non_swap_entry(swpent)) {
501 int mapcount;
502
496 mss->swap += PAGE_SIZE; 503 mss->swap += PAGE_SIZE;
497 else if (is_migration_entry(swpent)) 504 mapcount = swp_swapcount(swpent);
505 if (mapcount >= 2) {
506 u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT;
507
508 do_div(pss_delta, mapcount);
509 mss->swap_pss += pss_delta;
510 } else {
511 mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT;
512 }
513 } else if (is_migration_entry(swpent))
498 page = migration_entry_to_page(swpent); 514 page = migration_entry_to_page(swpent);
499 } 515 }
500 516
@@ -597,6 +613,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
597 [ilog2(VM_HUGEPAGE)] = "hg", 613 [ilog2(VM_HUGEPAGE)] = "hg",
598 [ilog2(VM_NOHUGEPAGE)] = "nh", 614 [ilog2(VM_NOHUGEPAGE)] = "nh",
599 [ilog2(VM_MERGEABLE)] = "mg", 615 [ilog2(VM_MERGEABLE)] = "mg",
616 [ilog2(VM_UFFD_MISSING)]= "um",
617 [ilog2(VM_UFFD_WP)] = "uw",
600 }; 618 };
601 size_t i; 619 size_t i;
602 620
@@ -610,12 +628,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
610 seq_putc(m, '\n'); 628 seq_putc(m, '\n');
611} 629}
612 630
631#ifdef CONFIG_HUGETLB_PAGE
632static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
633 unsigned long addr, unsigned long end,
634 struct mm_walk *walk)
635{
636 struct mem_size_stats *mss = walk->private;
637 struct vm_area_struct *vma = walk->vma;
638 struct page *page = NULL;
639
640 if (pte_present(*pte)) {
641 page = vm_normal_page(vma, addr, *pte);
642 } else if (is_swap_pte(*pte)) {
643 swp_entry_t swpent = pte_to_swp_entry(*pte);
644
645 if (is_migration_entry(swpent))
646 page = migration_entry_to_page(swpent);
647 }
648 if (page) {
649 int mapcount = page_mapcount(page);
650
651 if (mapcount >= 2)
652 mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
653 else
654 mss->private_hugetlb += huge_page_size(hstate_vma(vma));
655 }
656 return 0;
657}
658#endif /* HUGETLB_PAGE */
659
613static int show_smap(struct seq_file *m, void *v, int is_pid) 660static int show_smap(struct seq_file *m, void *v, int is_pid)
614{ 661{
615 struct vm_area_struct *vma = v; 662 struct vm_area_struct *vma = v;
616 struct mem_size_stats mss; 663 struct mem_size_stats mss;
617 struct mm_walk smaps_walk = { 664 struct mm_walk smaps_walk = {
618 .pmd_entry = smaps_pte_range, 665 .pmd_entry = smaps_pte_range,
666#ifdef CONFIG_HUGETLB_PAGE
667 .hugetlb_entry = smaps_hugetlb_range,
668#endif
619 .mm = vma->vm_mm, 669 .mm = vma->vm_mm,
620 .private = &mss, 670 .private = &mss,
621 }; 671 };
@@ -637,7 +687,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
637 "Referenced: %8lu kB\n" 687 "Referenced: %8lu kB\n"
638 "Anonymous: %8lu kB\n" 688 "Anonymous: %8lu kB\n"
639 "AnonHugePages: %8lu kB\n" 689 "AnonHugePages: %8lu kB\n"
690 "Shared_Hugetlb: %8lu kB\n"
691 "Private_Hugetlb: %7lu kB\n"
640 "Swap: %8lu kB\n" 692 "Swap: %8lu kB\n"
693 "SwapPss: %8lu kB\n"
641 "KernelPageSize: %8lu kB\n" 694 "KernelPageSize: %8lu kB\n"
642 "MMUPageSize: %8lu kB\n" 695 "MMUPageSize: %8lu kB\n"
643 "Locked: %8lu kB\n", 696 "Locked: %8lu kB\n",
@@ -651,7 +704,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
651 mss.referenced >> 10, 704 mss.referenced >> 10,
652 mss.anonymous >> 10, 705 mss.anonymous >> 10,
653 mss.anonymous_thp >> 10, 706 mss.anonymous_thp >> 10,
707 mss.shared_hugetlb >> 10,
708 mss.private_hugetlb >> 10,
654 mss.swap >> 10, 709 mss.swap >> 10,
710 (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
655 vma_kernel_pagesize(vma) >> 10, 711 vma_kernel_pagesize(vma) >> 10,
656 vma_mmu_pagesize(vma) >> 10, 712 vma_mmu_pagesize(vma) >> 10,
657 (vma->vm_flags & VM_LOCKED) ? 713 (vma->vm_flags & VM_LOCKED) ?
@@ -710,23 +766,6 @@ const struct file_operations proc_tid_smaps_operations = {
710 .release = proc_map_release, 766 .release = proc_map_release,
711}; 767};
712 768
713/*
714 * We do not want to have constant page-shift bits sitting in
715 * pagemap entries and are about to reuse them some time soon.
716 *
717 * Here's the "migration strategy":
718 * 1. when the system boots these bits remain what they are,
719 * but a warning about future change is printed in log;
720 * 2. once anyone clears soft-dirty bits via clear_refs file,
721 * these flag is set to denote, that user is aware of the
722 * new API and those page-shift bits change their meaning.
723 * The respective warning is printed in dmesg;
724 * 3. In a couple of releases we will remove all the mentions
725 * of page-shift in pagemap entries.
726 */
727
728static bool soft_dirty_cleared __read_mostly;
729
730enum clear_refs_types { 769enum clear_refs_types {
731 CLEAR_REFS_ALL = 1, 770 CLEAR_REFS_ALL = 1,
732 CLEAR_REFS_ANON, 771 CLEAR_REFS_ANON,
@@ -753,36 +792,37 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
753 pte_t ptent = *pte; 792 pte_t ptent = *pte;
754 793
755 if (pte_present(ptent)) { 794 if (pte_present(ptent)) {
795 ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
756 ptent = pte_wrprotect(ptent); 796 ptent = pte_wrprotect(ptent);
757 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 797 ptent = pte_clear_soft_dirty(ptent);
798 ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
758 } else if (is_swap_pte(ptent)) { 799 } else if (is_swap_pte(ptent)) {
759 ptent = pte_swp_clear_soft_dirty(ptent); 800 ptent = pte_swp_clear_soft_dirty(ptent);
801 set_pte_at(vma->vm_mm, addr, pte, ptent);
760 } 802 }
761
762 set_pte_at(vma->vm_mm, addr, pte, ptent);
763} 803}
804#else
805static inline void clear_soft_dirty(struct vm_area_struct *vma,
806 unsigned long addr, pte_t *pte)
807{
808}
809#endif
764 810
811#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
765static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 812static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
766 unsigned long addr, pmd_t *pmdp) 813 unsigned long addr, pmd_t *pmdp)
767{ 814{
768 pmd_t pmd = *pmdp; 815 pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
769 816
770 pmd = pmd_wrprotect(pmd); 817 pmd = pmd_wrprotect(pmd);
771 pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); 818 pmd = pmd_clear_soft_dirty(pmd);
772 819
773 if (vma->vm_flags & VM_SOFTDIRTY) 820 if (vma->vm_flags & VM_SOFTDIRTY)
774 vma->vm_flags &= ~VM_SOFTDIRTY; 821 vma->vm_flags &= ~VM_SOFTDIRTY;
775 822
776 set_pmd_at(vma->vm_mm, addr, pmdp, pmd); 823 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
777} 824}
778
779#else 825#else
780
781static inline void clear_soft_dirty(struct vm_area_struct *vma,
782 unsigned long addr, pte_t *pte)
783{
784}
785
786static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, 826static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
787 unsigned long addr, pmd_t *pmdp) 827 unsigned long addr, pmd_t *pmdp)
788{ 828{
@@ -808,6 +848,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
808 848
809 /* Clear accessed and referenced bits. */ 849 /* Clear accessed and referenced bits. */
810 pmdp_test_and_clear_young(vma, addr, pmd); 850 pmdp_test_and_clear_young(vma, addr, pmd);
851 test_and_clear_page_young(page);
811 ClearPageReferenced(page); 852 ClearPageReferenced(page);
812out: 853out:
813 spin_unlock(ptl); 854 spin_unlock(ptl);
@@ -835,6 +876,7 @@ out:
835 876
836 /* Clear accessed and referenced bits. */ 877 /* Clear accessed and referenced bits. */
837 ptep_test_and_clear_young(vma, addr, pte); 878 ptep_test_and_clear_young(vma, addr, pte);
879 test_and_clear_page_young(page);
838 ClearPageReferenced(page); 880 ClearPageReferenced(page);
839 } 881 }
840 pte_unmap_unlock(pte - 1, ptl); 882 pte_unmap_unlock(pte - 1, ptl);
@@ -887,13 +929,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
887 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) 929 if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
888 return -EINVAL; 930 return -EINVAL;
889 931
890 if (type == CLEAR_REFS_SOFT_DIRTY) {
891 soft_dirty_cleared = true;
892 pr_warn_once("The pagemap bits 55-60 has changed their meaning!"
893 " See the linux/Documentation/vm/pagemap.txt for "
894 "details.\n");
895 }
896
897 task = get_proc_task(file_inode(file)); 932 task = get_proc_task(file_inode(file));
898 if (!task) 933 if (!task)
899 return -ESRCH; 934 return -ESRCH;
@@ -961,36 +996,26 @@ typedef struct {
961struct pagemapread { 996struct pagemapread {
962 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ 997 int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
963 pagemap_entry_t *buffer; 998 pagemap_entry_t *buffer;
964 bool v2; 999 bool show_pfn;
965}; 1000};
966 1001
967#define PAGEMAP_WALK_SIZE (PMD_SIZE) 1002#define PAGEMAP_WALK_SIZE (PMD_SIZE)
968#define PAGEMAP_WALK_MASK (PMD_MASK) 1003#define PAGEMAP_WALK_MASK (PMD_MASK)
969 1004
970#define PM_ENTRY_BYTES sizeof(pagemap_entry_t) 1005#define PM_ENTRY_BYTES sizeof(pagemap_entry_t)
971#define PM_STATUS_BITS 3 1006#define PM_PFRAME_BITS 55
972#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) 1007#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
973#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) 1008#define PM_SOFT_DIRTY BIT_ULL(55)
974#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) 1009#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
975#define PM_PSHIFT_BITS 6 1010#define PM_FILE BIT_ULL(61)
976#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) 1011#define PM_SWAP BIT_ULL(62)
977#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) 1012#define PM_PRESENT BIT_ULL(63)
978#define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) 1013
979#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
980#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
981/* in "new" pagemap pshift bits are occupied with more status bits */
982#define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT))
983
984#define __PM_SOFT_DIRTY (1LL)
985#define PM_PRESENT PM_STATUS(4LL)
986#define PM_SWAP PM_STATUS(2LL)
987#define PM_FILE PM_STATUS(1LL)
988#define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0)
989#define PM_END_OF_BUFFER 1 1014#define PM_END_OF_BUFFER 1
990 1015
991static inline pagemap_entry_t make_pme(u64 val) 1016static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
992{ 1017{
993 return (pagemap_entry_t) { .pme = val }; 1018 return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
994} 1019}
995 1020
996static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, 1021static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
@@ -1011,7 +1036,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
1011 1036
1012 while (addr < end) { 1037 while (addr < end) {
1013 struct vm_area_struct *vma = find_vma(walk->mm, addr); 1038 struct vm_area_struct *vma = find_vma(walk->mm, addr);
1014 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); 1039 pagemap_entry_t pme = make_pme(0, 0);
1015 /* End of address space hole, which we mark as non-present. */ 1040 /* End of address space hole, which we mark as non-present. */
1016 unsigned long hole_end; 1041 unsigned long hole_end;
1017 1042
@@ -1031,7 +1056,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
1031 1056
1032 /* Addresses in the VMA. */ 1057 /* Addresses in the VMA. */
1033 if (vma->vm_flags & VM_SOFTDIRTY) 1058 if (vma->vm_flags & VM_SOFTDIRTY)
1034 pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); 1059 pme = make_pme(0, PM_SOFT_DIRTY);
1035 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { 1060 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1036 err = add_to_pagemap(addr, &pme, pm); 1061 err = add_to_pagemap(addr, &pme, pm);
1037 if (err) 1062 if (err)
@@ -1042,67 +1067,42 @@ out:
1042 return err; 1067 return err;
1043} 1068}
1044 1069
1045static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, 1070static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
1046 struct vm_area_struct *vma, unsigned long addr, pte_t pte) 1071 struct vm_area_struct *vma, unsigned long addr, pte_t pte)
1047{ 1072{
1048 u64 frame, flags; 1073 u64 frame = 0, flags = 0;
1049 struct page *page = NULL; 1074 struct page *page = NULL;
1050 int flags2 = 0;
1051 1075
1052 if (pte_present(pte)) { 1076 if (pte_present(pte)) {
1053 frame = pte_pfn(pte); 1077 if (pm->show_pfn)
1054 flags = PM_PRESENT; 1078 frame = pte_pfn(pte);
1079 flags |= PM_PRESENT;
1055 page = vm_normal_page(vma, addr, pte); 1080 page = vm_normal_page(vma, addr, pte);
1056 if (pte_soft_dirty(pte)) 1081 if (pte_soft_dirty(pte))
1057 flags2 |= __PM_SOFT_DIRTY; 1082 flags |= PM_SOFT_DIRTY;
1058 } else if (is_swap_pte(pte)) { 1083 } else if (is_swap_pte(pte)) {
1059 swp_entry_t entry; 1084 swp_entry_t entry;
1060 if (pte_swp_soft_dirty(pte)) 1085 if (pte_swp_soft_dirty(pte))
1061 flags2 |= __PM_SOFT_DIRTY; 1086 flags |= PM_SOFT_DIRTY;
1062 entry = pte_to_swp_entry(pte); 1087 entry = pte_to_swp_entry(pte);
1063 frame = swp_type(entry) | 1088 frame = swp_type(entry) |
1064 (swp_offset(entry) << MAX_SWAPFILES_SHIFT); 1089 (swp_offset(entry) << MAX_SWAPFILES_SHIFT);
1065 flags = PM_SWAP; 1090 flags |= PM_SWAP;
1066 if (is_migration_entry(entry)) 1091 if (is_migration_entry(entry))
1067 page = migration_entry_to_page(entry); 1092 page = migration_entry_to_page(entry);
1068 } else {
1069 if (vma->vm_flags & VM_SOFTDIRTY)
1070 flags2 |= __PM_SOFT_DIRTY;
1071 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
1072 return;
1073 } 1093 }
1074 1094
1075 if (page && !PageAnon(page)) 1095 if (page && !PageAnon(page))
1076 flags |= PM_FILE; 1096 flags |= PM_FILE;
1077 if ((vma->vm_flags & VM_SOFTDIRTY)) 1097 if (page && page_mapcount(page) == 1)
1078 flags2 |= __PM_SOFT_DIRTY; 1098 flags |= PM_MMAP_EXCLUSIVE;
1099 if (vma->vm_flags & VM_SOFTDIRTY)
1100 flags |= PM_SOFT_DIRTY;
1079 1101
1080 *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); 1102 return make_pme(frame, flags);
1081} 1103}
1082 1104
1083#ifdef CONFIG_TRANSPARENT_HUGEPAGE 1105static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
1084static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1085 pmd_t pmd, int offset, int pmd_flags2)
1086{
1087 /*
1088 * Currently pmd for thp is always present because thp can not be
1089 * swapped-out, migrated, or HWPOISONed (split in such cases instead.)
1090 * This if-check is just to prepare for future implementation.
1091 */
1092 if (pmd_present(pmd))
1093 *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
1094 | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
1095 else
1096 *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
1097}
1098#else
1099static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1100 pmd_t pmd, int offset, int pmd_flags2)
1101{
1102}
1103#endif
1104
1105static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1106 struct mm_walk *walk) 1106 struct mm_walk *walk)
1107{ 1107{
1108 struct vm_area_struct *vma = walk->vma; 1108 struct vm_area_struct *vma = walk->vma;
@@ -1111,41 +1111,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1111 pte_t *pte, *orig_pte; 1111 pte_t *pte, *orig_pte;
1112 int err = 0; 1112 int err = 0;
1113 1113
1114 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1114#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1115 int pmd_flags2; 1115 if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
1116 u64 flags = 0, frame = 0;
1117 pmd_t pmd = *pmdp;
1116 1118
1117 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1119 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd))
1118 pmd_flags2 = __PM_SOFT_DIRTY; 1120 flags |= PM_SOFT_DIRTY;
1119 else 1121
1120 pmd_flags2 = 0; 1122 /*
1123 * Currently pmd for thp is always present because thp
1124 * can not be swapped-out, migrated, or HWPOISONed
1125 * (split in such cases instead.)
1126 * This if-check is just to prepare for future implementation.
1127 */
1128 if (pmd_present(pmd)) {
1129 struct page *page = pmd_page(pmd);
1130
1131 if (page_mapcount(page) == 1)
1132 flags |= PM_MMAP_EXCLUSIVE;
1133
1134 flags |= PM_PRESENT;
1135 if (pm->show_pfn)
1136 frame = pmd_pfn(pmd) +
1137 ((addr & ~PMD_MASK) >> PAGE_SHIFT);
1138 }
1121 1139
1122 for (; addr != end; addr += PAGE_SIZE) { 1140 for (; addr != end; addr += PAGE_SIZE) {
1123 unsigned long offset; 1141 pagemap_entry_t pme = make_pme(frame, flags);
1124 pagemap_entry_t pme;
1125 1142
1126 offset = (addr & ~PAGEMAP_WALK_MASK) >>
1127 PAGE_SHIFT;
1128 thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2);
1129 err = add_to_pagemap(addr, &pme, pm); 1143 err = add_to_pagemap(addr, &pme, pm);
1130 if (err) 1144 if (err)
1131 break; 1145 break;
1146 if (pm->show_pfn && (flags & PM_PRESENT))
1147 frame++;
1132 } 1148 }
1133 spin_unlock(ptl); 1149 spin_unlock(ptl);
1134 return err; 1150 return err;
1135 } 1151 }
1136 1152
1137 if (pmd_trans_unstable(pmd)) 1153 if (pmd_trans_unstable(pmdp))
1138 return 0; 1154 return 0;
1155#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1139 1156
1140 /* 1157 /*
1141 * We can assume that @vma always points to a valid one and @end never 1158 * We can assume that @vma always points to a valid one and @end never
1142 * goes beyond vma->vm_end. 1159 * goes beyond vma->vm_end.
1143 */ 1160 */
1144 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1161 orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
1145 for (; addr < end; pte++, addr += PAGE_SIZE) { 1162 for (; addr < end; pte++, addr += PAGE_SIZE) {
1146 pagemap_entry_t pme; 1163 pagemap_entry_t pme;
1147 1164
1148 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); 1165 pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
1149 err = add_to_pagemap(addr, &pme, pm); 1166 err = add_to_pagemap(addr, &pme, pm);
1150 if (err) 1167 if (err)
1151 break; 1168 break;
@@ -1158,40 +1175,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1158} 1175}
1159 1176
1160#ifdef CONFIG_HUGETLB_PAGE 1177#ifdef CONFIG_HUGETLB_PAGE
1161static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
1162 pte_t pte, int offset, int flags2)
1163{
1164 if (pte_present(pte))
1165 *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
1166 PM_STATUS2(pm->v2, flags2) |
1167 PM_PRESENT);
1168 else
1169 *pme = make_pme(PM_NOT_PRESENT(pm->v2) |
1170 PM_STATUS2(pm->v2, flags2));
1171}
1172
1173/* This function walks within one hugetlb entry in the single call */ 1178/* This function walks within one hugetlb entry in the single call */
1174static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, 1179static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
1175 unsigned long addr, unsigned long end, 1180 unsigned long addr, unsigned long end,
1176 struct mm_walk *walk) 1181 struct mm_walk *walk)
1177{ 1182{
1178 struct pagemapread *pm = walk->private; 1183 struct pagemapread *pm = walk->private;
1179 struct vm_area_struct *vma = walk->vma; 1184 struct vm_area_struct *vma = walk->vma;
1185 u64 flags = 0, frame = 0;
1180 int err = 0; 1186 int err = 0;
1181 int flags2; 1187 pte_t pte;
1182 pagemap_entry_t pme;
1183 1188
1184 if (vma->vm_flags & VM_SOFTDIRTY) 1189 if (vma->vm_flags & VM_SOFTDIRTY)
1185 flags2 = __PM_SOFT_DIRTY; 1190 flags |= PM_SOFT_DIRTY;
1186 else 1191
1187 flags2 = 0; 1192 pte = huge_ptep_get(ptep);
1193 if (pte_present(pte)) {
1194 struct page *page = pte_page(pte);
1195
1196 if (!PageAnon(page))
1197 flags |= PM_FILE;
1198
1199 if (page_mapcount(page) == 1)
1200 flags |= PM_MMAP_EXCLUSIVE;
1201
1202 flags |= PM_PRESENT;
1203 if (pm->show_pfn)
1204 frame = pte_pfn(pte) +
1205 ((addr & ~hmask) >> PAGE_SHIFT);
1206 }
1188 1207
1189 for (; addr != end; addr += PAGE_SIZE) { 1208 for (; addr != end; addr += PAGE_SIZE) {
1190 int offset = (addr & ~hmask) >> PAGE_SHIFT; 1209 pagemap_entry_t pme = make_pme(frame, flags);
1191 huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); 1210
1192 err = add_to_pagemap(addr, &pme, pm); 1211 err = add_to_pagemap(addr, &pme, pm);
1193 if (err) 1212 if (err)
1194 return err; 1213 return err;
1214 if (pm->show_pfn && (flags & PM_PRESENT))
1215 frame++;
1195 } 1216 }
1196 1217
1197 cond_resched(); 1218 cond_resched();
@@ -1209,7 +1230,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1209 * Bits 0-54 page frame number (PFN) if present 1230 * Bits 0-54 page frame number (PFN) if present
1210 * Bits 0-4 swap type if swapped 1231 * Bits 0-4 swap type if swapped
1211 * Bits 5-54 swap offset if swapped 1232 * Bits 5-54 swap offset if swapped
1212 * Bits 55-60 page shift (page size = 1<<page shift) 1233 * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt)
1234 * Bit 56 page exclusively mapped
1235 * Bits 57-60 zero
1213 * Bit 61 page is file-page or shared-anon 1236 * Bit 61 page is file-page or shared-anon
1214 * Bit 62 page swapped 1237 * Bit 62 page swapped
1215 * Bit 63 page present 1238 * Bit 63 page present
@@ -1227,42 +1250,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1227static ssize_t pagemap_read(struct file *file, char __user *buf, 1250static ssize_t pagemap_read(struct file *file, char __user *buf,
1228 size_t count, loff_t *ppos) 1251 size_t count, loff_t *ppos)
1229{ 1252{
1230 struct task_struct *task = get_proc_task(file_inode(file)); 1253 struct mm_struct *mm = file->private_data;
1231 struct mm_struct *mm;
1232 struct pagemapread pm; 1254 struct pagemapread pm;
1233 int ret = -ESRCH;
1234 struct mm_walk pagemap_walk = {}; 1255 struct mm_walk pagemap_walk = {};
1235 unsigned long src; 1256 unsigned long src;
1236 unsigned long svpfn; 1257 unsigned long svpfn;
1237 unsigned long start_vaddr; 1258 unsigned long start_vaddr;
1238 unsigned long end_vaddr; 1259 unsigned long end_vaddr;
1239 int copied = 0; 1260 int ret = 0, copied = 0;
1240 1261
1241 if (!task) 1262 if (!mm || !atomic_inc_not_zero(&mm->mm_users))
1242 goto out; 1263 goto out;
1243 1264
1244 ret = -EINVAL; 1265 ret = -EINVAL;
1245 /* file position must be aligned */ 1266 /* file position must be aligned */
1246 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) 1267 if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES))
1247 goto out_task; 1268 goto out_mm;
1248 1269
1249 ret = 0; 1270 ret = 0;
1250 if (!count) 1271 if (!count)
1251 goto out_task; 1272 goto out_mm;
1273
1274 /* do not disclose physical addresses: attack vector */
1275 pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
1252 1276
1253 pm.v2 = soft_dirty_cleared;
1254 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); 1277 pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
1255 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); 1278 pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY);
1256 ret = -ENOMEM; 1279 ret = -ENOMEM;
1257 if (!pm.buffer) 1280 if (!pm.buffer)
1258 goto out_task; 1281 goto out_mm;
1259
1260 mm = mm_access(task, PTRACE_MODE_READ);
1261 ret = PTR_ERR(mm);
1262 if (!mm || IS_ERR(mm))
1263 goto out_free;
1264 1282
1265 pagemap_walk.pmd_entry = pagemap_pte_range; 1283 pagemap_walk.pmd_entry = pagemap_pmd_range;
1266 pagemap_walk.pte_hole = pagemap_pte_hole; 1284 pagemap_walk.pte_hole = pagemap_pte_hole;
1267#ifdef CONFIG_HUGETLB_PAGE 1285#ifdef CONFIG_HUGETLB_PAGE
1268 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; 1286 pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
@@ -1273,10 +1291,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1273 src = *ppos; 1291 src = *ppos;
1274 svpfn = src / PM_ENTRY_BYTES; 1292 svpfn = src / PM_ENTRY_BYTES;
1275 start_vaddr = svpfn << PAGE_SHIFT; 1293 start_vaddr = svpfn << PAGE_SHIFT;
1276 end_vaddr = TASK_SIZE_OF(task); 1294 end_vaddr = mm->task_size;
1277 1295
1278 /* watch out for wraparound */ 1296 /* watch out for wraparound */
1279 if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) 1297 if (svpfn > mm->task_size >> PAGE_SHIFT)
1280 start_vaddr = end_vaddr; 1298 start_vaddr = end_vaddr;
1281 1299
1282 /* 1300 /*
@@ -1303,7 +1321,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1303 len = min(count, PM_ENTRY_BYTES * pm.pos); 1321 len = min(count, PM_ENTRY_BYTES * pm.pos);
1304 if (copy_to_user(buf, pm.buffer, len)) { 1322 if (copy_to_user(buf, pm.buffer, len)) {
1305 ret = -EFAULT; 1323 ret = -EFAULT;
1306 goto out_mm; 1324 goto out_free;
1307 } 1325 }
1308 copied += len; 1326 copied += len;
1309 buf += len; 1327 buf += len;
@@ -1313,24 +1331,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
1313 if (!ret || ret == PM_END_OF_BUFFER) 1331 if (!ret || ret == PM_END_OF_BUFFER)
1314 ret = copied; 1332 ret = copied;
1315 1333
1316out_mm:
1317 mmput(mm);
1318out_free: 1334out_free:
1319 kfree(pm.buffer); 1335 kfree(pm.buffer);
1320out_task: 1336out_mm:
1321 put_task_struct(task); 1337 mmput(mm);
1322out: 1338out:
1323 return ret; 1339 return ret;
1324} 1340}
1325 1341
1326static int pagemap_open(struct inode *inode, struct file *file) 1342static int pagemap_open(struct inode *inode, struct file *file)
1327{ 1343{
1328 /* do not disclose physical addresses: attack vector */ 1344 struct mm_struct *mm;
1329 if (!capable(CAP_SYS_ADMIN)) 1345
1330 return -EPERM; 1346 mm = proc_mem_open(inode, PTRACE_MODE_READ);
1331 pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " 1347 if (IS_ERR(mm))
1332 "to stop being page-shift some time soon. See the " 1348 return PTR_ERR(mm);
1333 "linux/Documentation/vm/pagemap.txt for details.\n"); 1349 file->private_data = mm;
1350 return 0;
1351}
1352
1353static int pagemap_release(struct inode *inode, struct file *file)
1354{
1355 struct mm_struct *mm = file->private_data;
1356
1357 if (mm)
1358 mmdrop(mm);
1334 return 0; 1359 return 0;
1335} 1360}
1336 1361
@@ -1338,6 +1363,7 @@ const struct file_operations proc_pagemap_operations = {
1338 .llseek = mem_lseek, /* borrow this */ 1363 .llseek = mem_lseek, /* borrow this */
1339 .read = pagemap_read, 1364 .read = pagemap_read,
1340 .open = pagemap_open, 1365 .open = pagemap_open,
1366 .release = pagemap_release,
1341}; 1367};
1342#endif /* CONFIG_PROC_PAGE_MONITOR */ 1368#endif /* CONFIG_PROC_PAGE_MONITOR */
1343 1369