Diffstat (limited to 'fs/proc')

-rw-r--r--   fs/proc/array.c    |  21
-rw-r--r--   fs/proc/base.c     | 132
-rw-r--r--   fs/proc/generic.c  |  44
-rw-r--r--   fs/proc/meminfo.c  |   7
-rw-r--r--   fs/proc/page.c     |  65
-rw-r--r--   fs/proc/root.c     |   2
-rw-r--r--   fs/proc/task_mmu.c | 356

7 files changed, 358 insertions, 269 deletions
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ce065cf3104f..eed2050db9be 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -308,7 +308,8 @@ static void render_cap_t(struct seq_file *m, const char *header, | |||
308 | static inline void task_cap(struct seq_file *m, struct task_struct *p) | 308 | static inline void task_cap(struct seq_file *m, struct task_struct *p) |
309 | { | 309 | { |
310 | const struct cred *cred; | 310 | const struct cred *cred; |
311 | kernel_cap_t cap_inheritable, cap_permitted, cap_effective, cap_bset; | 311 | kernel_cap_t cap_inheritable, cap_permitted, cap_effective, |
312 | cap_bset, cap_ambient; | ||
312 | 313 | ||
313 | rcu_read_lock(); | 314 | rcu_read_lock(); |
314 | cred = __task_cred(p); | 315 | cred = __task_cred(p); |
@@ -316,12 +317,14 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) | |||
316 | cap_permitted = cred->cap_permitted; | 317 | cap_permitted = cred->cap_permitted; |
317 | cap_effective = cred->cap_effective; | 318 | cap_effective = cred->cap_effective; |
318 | cap_bset = cred->cap_bset; | 319 | cap_bset = cred->cap_bset; |
320 | cap_ambient = cred->cap_ambient; | ||
319 | rcu_read_unlock(); | 321 | rcu_read_unlock(); |
320 | 322 | ||
321 | render_cap_t(m, "CapInh:\t", &cap_inheritable); | 323 | render_cap_t(m, "CapInh:\t", &cap_inheritable); |
322 | render_cap_t(m, "CapPrm:\t", &cap_permitted); | 324 | render_cap_t(m, "CapPrm:\t", &cap_permitted); |
323 | render_cap_t(m, "CapEff:\t", &cap_effective); | 325 | render_cap_t(m, "CapEff:\t", &cap_effective); |
324 | render_cap_t(m, "CapBnd:\t", &cap_bset); | 326 | render_cap_t(m, "CapBnd:\t", &cap_bset); |
327 | render_cap_t(m, "CapAmb:\t", &cap_ambient); | ||
325 | } | 328 | } |
326 | 329 | ||
327 | static inline void task_seccomp(struct seq_file *m, struct task_struct *p) | 330 | static inline void task_seccomp(struct seq_file *m, struct task_struct *p) |
@@ -372,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, | |||
372 | static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | 375 | static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, |
373 | struct pid *pid, struct task_struct *task, int whole) | 376 | struct pid *pid, struct task_struct *task, int whole) |
374 | { | 377 | { |
375 | unsigned long vsize, eip, esp, wchan = ~0UL; | 378 | unsigned long vsize, eip, esp, wchan = 0; |
376 | int priority, nice; | 379 | int priority, nice; |
377 | int tty_pgrp = -1, tty_nr = 0; | 380 | int tty_pgrp = -1, tty_nr = 0; |
378 | sigset_t sigign, sigcatch; | 381 | sigset_t sigign, sigcatch; |
@@ -504,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
504 | seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); | 507 | seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL); |
505 | seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); | 508 | seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL); |
506 | seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); | 509 | seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL); |
507 | seq_put_decimal_ull(m, ' ', wchan); | 510 | |
511 | /* | ||
512 | * We used to output the absolute kernel address, but that's an | ||
513 | * information leak - so instead we show a 0/1 flag here, to signal | ||
514 | * to user-space whether there's a wchan field in /proc/PID/wchan. | ||
515 | * | ||
516 | * This works with older implementations of procps as well. | ||
517 | */ | ||
518 | if (wchan) | ||
519 | seq_puts(m, " 1"); | ||
520 | else | ||
521 | seq_puts(m, " 0"); | ||
522 | |||
508 | seq_put_decimal_ull(m, ' ', 0); | 523 | seq_put_decimal_ull(m, ' ', 0); |
509 | seq_put_decimal_ull(m, ' ', 0); | 524 | seq_put_decimal_ull(m, ' ', 0); |
510 | seq_put_decimal_ll(m, ' ', task->exit_signal); | 525 | seq_put_decimal_ll(m, ' ', task->exit_signal); |
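With the change above, the wchan column of /proc/PID/stat no longer exposes a kernel text address; it is reduced to a 0/1 flag, and the symbolic wait channel (or "0") comes from /proc/PID/wchan, gated by ptrace_may_access(). The standalone userspace sketch below shows how a procps-style reader could combine the two files; the assumption that wchan is field 35 of the stat line follows proc(5) and is not part of this patch.

#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void show_wchan(int pid)
{
	char path[64], buf[4096], sym[128];
	FILE *f;
	char *p;
	int i;

	snprintf(path, sizeof(path), "/proc/%d/stat", pid);
	f = fopen(path, "r");
	if (!f)
		return;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return;
	}
	fclose(f);

	/* comm may contain spaces, so skip past the closing ')' of field 2 */
	p = strrchr(buf, ')');
	if (!p)
		return;
	p += 2;					/* field 3 (state) */
	for (i = 3; i < 35 && p; i++) {		/* advance to field 35 (wchan) */
		p = strchr(p, ' ');
		if (p)
			p++;
	}
	if (!p || *p != '1') {			/* new kernels emit only 0 or 1 here */
		printf("pid %d: not blocked in the kernel\n", pid);
		return;
	}

	snprintf(path, sizeof(path), "/proc/%d/wchan", pid);
	f = fopen(path, "r");
	if (f) {
		if (fgets(sym, sizeof(sym), f))
			printf("pid %d: waiting in %s\n", pid, sym);
		fclose(f);
	}
}

int main(void)
{
	show_wchan((int)getpid());
	return 0;
}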
diff --git a/fs/proc/base.c b/fs/proc/base.c
index aa50d1ac28fc..bd3e9e68125b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, | |||
430 | 430 | ||
431 | wchan = get_wchan(task); | 431 | wchan = get_wchan(task); |
432 | 432 | ||
433 | if (lookup_symbol_name(wchan, symname) < 0) { | 433 | if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname)) |
434 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) | ||
435 | return 0; | ||
436 | seq_printf(m, "%lu", wchan); | ||
437 | } else { | ||
438 | seq_printf(m, "%s", symname); | 434 | seq_printf(m, "%s", symname); |
439 | } | 435 | else |
436 | seq_putc(m, '0'); | ||
440 | 437 | ||
441 | return 0; | 438 | return 0; |
442 | } | 439 | } |
@@ -1035,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, | |||
1035 | return simple_read_from_buffer(buf, count, ppos, buffer, len); | 1032 | return simple_read_from_buffer(buf, count, ppos, buffer, len); |
1036 | } | 1033 | } |
1037 | 1034 | ||
1035 | /* | ||
1036 | * /proc/pid/oom_adj exists solely for backwards compatibility with previous | ||
1037 | * kernels. The effective policy is defined by oom_score_adj, which has a | ||
1038 | * different scale: oom_adj grew exponentially and oom_score_adj grows linearly. | ||
1039 | * Values written to oom_adj are simply mapped linearly to oom_score_adj. | ||
1040 | * Processes that become oom disabled via oom_adj will still be oom disabled | ||
1041 | * with this implementation. | ||
1042 | * | ||
1043 | * oom_adj cannot be removed since existing userspace binaries use it. | ||
1044 | */ | ||
1038 | static ssize_t oom_adj_write(struct file *file, const char __user *buf, | 1045 | static ssize_t oom_adj_write(struct file *file, const char __user *buf, |
1039 | size_t count, loff_t *ppos) | 1046 | size_t count, loff_t *ppos) |
1040 | { | 1047 | { |
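The linear mapping the comment describes scales the legacy [-17, 15] oom_adj range onto the [-1000, 1000] oom_score_adj range, so OOM_DISABLE (-17) still maps to a fully OOM-disabled task. A sketch of that arithmetic, using the constants from include/uapi/linux/oom.h (the helper itself is an illustration, not code added by this patch):

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MAX	1000

static int oom_adj_to_score_adj(int oom_adj)
{
	/*
	 * Pin the top of the old scale to the top of the new one; everything
	 * else scales linearly, so -17 still disables the OOM killer (-1000)
	 * and e.g. 8 becomes roughly 470.
	 */
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;
	return (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
}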
@@ -1230,10 +1237,9 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, | |||
1230 | size_t count, loff_t *ppos) | 1237 | size_t count, loff_t *ppos) |
1231 | { | 1238 | { |
1232 | struct inode * inode = file_inode(file); | 1239 | struct inode * inode = file_inode(file); |
1233 | char *page, *tmp; | ||
1234 | ssize_t length; | ||
1235 | uid_t loginuid; | 1240 | uid_t loginuid; |
1236 | kuid_t kloginuid; | 1241 | kuid_t kloginuid; |
1242 | int rv; | ||
1237 | 1243 | ||
1238 | rcu_read_lock(); | 1244 | rcu_read_lock(); |
1239 | if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { | 1245 | if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { |
@@ -1242,46 +1248,28 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, | |||
1242 | } | 1248 | } |
1243 | rcu_read_unlock(); | 1249 | rcu_read_unlock(); |
1244 | 1250 | ||
1245 | if (count >= PAGE_SIZE) | ||
1246 | count = PAGE_SIZE - 1; | ||
1247 | |||
1248 | if (*ppos != 0) { | 1251 | if (*ppos != 0) { |
1249 | /* No partial writes. */ | 1252 | /* No partial writes. */ |
1250 | return -EINVAL; | 1253 | return -EINVAL; |
1251 | } | 1254 | } |
1252 | page = (char*)__get_free_page(GFP_TEMPORARY); | ||
1253 | if (!page) | ||
1254 | return -ENOMEM; | ||
1255 | length = -EFAULT; | ||
1256 | if (copy_from_user(page, buf, count)) | ||
1257 | goto out_free_page; | ||
1258 | 1255 | ||
1259 | page[count] = '\0'; | 1256 | rv = kstrtou32_from_user(buf, count, 10, &loginuid); |
1260 | loginuid = simple_strtoul(page, &tmp, 10); | 1257 | if (rv < 0) |
1261 | if (tmp == page) { | 1258 | return rv; |
1262 | length = -EINVAL; | ||
1263 | goto out_free_page; | ||
1264 | |||
1265 | } | ||
1266 | 1259 | ||
1267 | /* is userspace tring to explicitly UNSET the loginuid? */ | 1260 | /* is userspace tring to explicitly UNSET the loginuid? */ |
1268 | if (loginuid == AUDIT_UID_UNSET) { | 1261 | if (loginuid == AUDIT_UID_UNSET) { |
1269 | kloginuid = INVALID_UID; | 1262 | kloginuid = INVALID_UID; |
1270 | } else { | 1263 | } else { |
1271 | kloginuid = make_kuid(file->f_cred->user_ns, loginuid); | 1264 | kloginuid = make_kuid(file->f_cred->user_ns, loginuid); |
1272 | if (!uid_valid(kloginuid)) { | 1265 | if (!uid_valid(kloginuid)) |
1273 | length = -EINVAL; | 1266 | return -EINVAL; |
1274 | goto out_free_page; | ||
1275 | } | ||
1276 | } | 1267 | } |
1277 | 1268 | ||
1278 | length = audit_set_loginuid(kloginuid); | 1269 | rv = audit_set_loginuid(kloginuid); |
1279 | if (likely(length == 0)) | 1270 | if (rv < 0) |
1280 | length = count; | 1271 | return rv; |
1281 | 1272 | return count; | |
1282 | out_free_page: | ||
1283 | free_page((unsigned long) page); | ||
1284 | return length; | ||
1285 | } | 1273 | } |
1286 | 1274 | ||
1287 | static const struct file_operations proc_loginuid_operations = { | 1275 | static const struct file_operations proc_loginuid_operations = { |
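The conversions in this file all follow the same shape: the open-coded page allocation, copy_from_user() and simple_strtoul() parsing is collapsed into a single kstrto*_from_user() call, which bounds the copy and rejects trailing garbage. A minimal sketch of the idiom (the handler and the variable it sets are hypothetical):

#include <linux/fs.h>
#include <linux/kernel.h>

static unsigned int example_setting;

static ssize_t example_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	unsigned int val;
	int rv;

	/* copy a bounded amount from userspace and parse it as base-10 */
	rv = kstrtouint_from_user(buf, count, 10, &val);
	if (rv < 0)
		return rv;

	example_setting = val;
	return count;
}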
@@ -1335,8 +1323,9 @@ static ssize_t proc_fault_inject_write(struct file * file, | |||
1335 | const char __user * buf, size_t count, loff_t *ppos) | 1323 | const char __user * buf, size_t count, loff_t *ppos) |
1336 | { | 1324 | { |
1337 | struct task_struct *task; | 1325 | struct task_struct *task; |
1338 | char buffer[PROC_NUMBUF], *end; | 1326 | char buffer[PROC_NUMBUF]; |
1339 | int make_it_fail; | 1327 | int make_it_fail; |
1328 | int rv; | ||
1340 | 1329 | ||
1341 | if (!capable(CAP_SYS_RESOURCE)) | 1330 | if (!capable(CAP_SYS_RESOURCE)) |
1342 | return -EPERM; | 1331 | return -EPERM; |
@@ -1345,9 +1334,9 @@ static ssize_t proc_fault_inject_write(struct file * file, | |||
1345 | count = sizeof(buffer) - 1; | 1334 | count = sizeof(buffer) - 1; |
1346 | if (copy_from_user(buffer, buf, count)) | 1335 | if (copy_from_user(buffer, buf, count)) |
1347 | return -EFAULT; | 1336 | return -EFAULT; |
1348 | make_it_fail = simple_strtol(strstrip(buffer), &end, 0); | 1337 | rv = kstrtoint(strstrip(buffer), 0, &make_it_fail); |
1349 | if (*end) | 1338 | if (rv < 0) |
1350 | return -EINVAL; | 1339 | return rv; |
1351 | if (make_it_fail < 0 || make_it_fail > 1) | 1340 | if (make_it_fail < 0 || make_it_fail > 1) |
1352 | return -EINVAL; | 1341 | return -EINVAL; |
1353 | 1342 | ||
@@ -1836,8 +1825,6 @@ end_instantiate: | |||
1836 | return dir_emit(ctx, name, len, 1, DT_UNKNOWN); | 1825 | return dir_emit(ctx, name, len, 1, DT_UNKNOWN); |
1837 | } | 1826 | } |
1838 | 1827 | ||
1839 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1840 | |||
1841 | /* | 1828 | /* |
1842 | * dname_to_vma_addr - maps a dentry name into two unsigned longs | 1829 | * dname_to_vma_addr - maps a dentry name into two unsigned longs |
1843 | * which represent vma start and end addresses. | 1830 | * which represent vma start and end addresses. |
@@ -1864,11 +1851,6 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) | |||
1864 | if (flags & LOOKUP_RCU) | 1851 | if (flags & LOOKUP_RCU) |
1865 | return -ECHILD; | 1852 | return -ECHILD; |
1866 | 1853 | ||
1867 | if (!capable(CAP_SYS_ADMIN)) { | ||
1868 | status = -EPERM; | ||
1869 | goto out_notask; | ||
1870 | } | ||
1871 | |||
1872 | inode = d_inode(dentry); | 1854 | inode = d_inode(dentry); |
1873 | task = get_proc_task(inode); | 1855 | task = get_proc_task(inode); |
1874 | if (!task) | 1856 | if (!task) |
@@ -1957,6 +1939,29 @@ struct map_files_info { | |||
1957 | unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ | 1939 | unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ |
1958 | }; | 1940 | }; |
1959 | 1941 | ||
1942 | /* | ||
1943 | * Only allow CAP_SYS_ADMIN to follow the links, due to concerns about how the | ||
1944 | * symlinks may be used to bypass permissions on ancestor directories in the | ||
1945 | * path to the file in question. | ||
1946 | */ | ||
1947 | static const char * | ||
1948 | proc_map_files_follow_link(struct dentry *dentry, void **cookie) | ||
1949 | { | ||
1950 | if (!capable(CAP_SYS_ADMIN)) | ||
1951 | return ERR_PTR(-EPERM); | ||
1952 | |||
1953 | return proc_pid_follow_link(dentry, NULL); | ||
1954 | } | ||
1955 | |||
1956 | /* | ||
1957 | * Identical to proc_pid_link_inode_operations except for follow_link() | ||
1958 | */ | ||
1959 | static const struct inode_operations proc_map_files_link_inode_operations = { | ||
1960 | .readlink = proc_pid_readlink, | ||
1961 | .follow_link = proc_map_files_follow_link, | ||
1962 | .setattr = proc_setattr, | ||
1963 | }; | ||
1964 | |||
1960 | static int | 1965 | static int |
1961 | proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, | 1966 | proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, |
1962 | struct task_struct *task, const void *ptr) | 1967 | struct task_struct *task, const void *ptr) |
@@ -1972,7 +1977,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, | |||
1972 | ei = PROC_I(inode); | 1977 | ei = PROC_I(inode); |
1973 | ei->op.proc_get_link = proc_map_files_get_link; | 1978 | ei->op.proc_get_link = proc_map_files_get_link; |
1974 | 1979 | ||
1975 | inode->i_op = &proc_pid_link_inode_operations; | 1980 | inode->i_op = &proc_map_files_link_inode_operations; |
1976 | inode->i_size = 64; | 1981 | inode->i_size = 64; |
1977 | inode->i_mode = S_IFLNK; | 1982 | inode->i_mode = S_IFLNK; |
1978 | 1983 | ||
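With the CAP_SYS_ADMIN check moved out of lookup and readdir and into the new follow_link handler, an unprivileged process (subject to the usual ptrace access checks) can enumerate and readlink() its own /proc/PID/map_files entries; dereferencing the links still requires CAP_SYS_ADMIN. A small userspace sketch of that enumeration:

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	DIR *d = opendir("/proc/self/map_files");
	struct dirent *de;
	char path[PATH_MAX], target[PATH_MAX];
	ssize_t n;

	if (!d)
		return 1;
	while ((de = readdir(d)) != NULL) {
		if (de->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path), "/proc/self/map_files/%s",
			 de->d_name);
		n = readlink(path, target, sizeof(target) - 1);
		if (n < 0)
			continue;
		target[n] = '\0';
		printf("%s -> %s\n", de->d_name, target);	/* "start-end -> backing file" */
	}
	closedir(d);
	return 0;
}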
@@ -1996,10 +2001,6 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, | |||
1996 | int result; | 2001 | int result; |
1997 | struct mm_struct *mm; | 2002 | struct mm_struct *mm; |
1998 | 2003 | ||
1999 | result = -EPERM; | ||
2000 | if (!capable(CAP_SYS_ADMIN)) | ||
2001 | goto out; | ||
2002 | |||
2003 | result = -ENOENT; | 2004 | result = -ENOENT; |
2004 | task = get_proc_task(dir); | 2005 | task = get_proc_task(dir); |
2005 | if (!task) | 2006 | if (!task) |
@@ -2053,10 +2054,6 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) | |||
2053 | struct map_files_info *p; | 2054 | struct map_files_info *p; |
2054 | int ret; | 2055 | int ret; |
2055 | 2056 | ||
2056 | ret = -EPERM; | ||
2057 | if (!capable(CAP_SYS_ADMIN)) | ||
2058 | goto out; | ||
2059 | |||
2060 | ret = -ENOENT; | 2057 | ret = -ENOENT; |
2061 | task = get_proc_task(file_inode(file)); | 2058 | task = get_proc_task(file_inode(file)); |
2062 | if (!task) | 2059 | if (!task) |
@@ -2245,7 +2242,6 @@ static const struct file_operations proc_timers_operations = { | |||
2245 | .llseek = seq_lseek, | 2242 | .llseek = seq_lseek, |
2246 | .release = seq_release_private, | 2243 | .release = seq_release_private, |
2247 | }; | 2244 | }; |
2248 | #endif /* CONFIG_CHECKPOINT_RESTORE */ | ||
2249 | 2245 | ||
2250 | static int proc_pident_instantiate(struct inode *dir, | 2246 | static int proc_pident_instantiate(struct inode *dir, |
2251 | struct dentry *dentry, struct task_struct *task, const void *ptr) | 2247 | struct dentry *dentry, struct task_struct *task, const void *ptr) |
@@ -2481,32 +2477,20 @@ static ssize_t proc_coredump_filter_write(struct file *file, | |||
2481 | { | 2477 | { |
2482 | struct task_struct *task; | 2478 | struct task_struct *task; |
2483 | struct mm_struct *mm; | 2479 | struct mm_struct *mm; |
2484 | char buffer[PROC_NUMBUF], *end; | ||
2485 | unsigned int val; | 2480 | unsigned int val; |
2486 | int ret; | 2481 | int ret; |
2487 | int i; | 2482 | int i; |
2488 | unsigned long mask; | 2483 | unsigned long mask; |
2489 | 2484 | ||
2490 | ret = -EFAULT; | 2485 | ret = kstrtouint_from_user(buf, count, 0, &val); |
2491 | memset(buffer, 0, sizeof(buffer)); | 2486 | if (ret < 0) |
2492 | if (count > sizeof(buffer) - 1) | 2487 | return ret; |
2493 | count = sizeof(buffer) - 1; | ||
2494 | if (copy_from_user(buffer, buf, count)) | ||
2495 | goto out_no_task; | ||
2496 | |||
2497 | ret = -EINVAL; | ||
2498 | val = (unsigned int)simple_strtoul(buffer, &end, 0); | ||
2499 | if (*end == '\n') | ||
2500 | end++; | ||
2501 | if (end - buffer == 0) | ||
2502 | goto out_no_task; | ||
2503 | 2488 | ||
2504 | ret = -ESRCH; | 2489 | ret = -ESRCH; |
2505 | task = get_proc_task(file_inode(file)); | 2490 | task = get_proc_task(file_inode(file)); |
2506 | if (!task) | 2491 | if (!task) |
2507 | goto out_no_task; | 2492 | goto out_no_task; |
2508 | 2493 | ||
2509 | ret = end - buffer; | ||
2510 | mm = get_task_mm(task); | 2494 | mm = get_task_mm(task); |
2511 | if (!mm) | 2495 | if (!mm) |
2512 | goto out_no_mm; | 2496 | goto out_no_mm; |
@@ -2522,7 +2506,9 @@ static ssize_t proc_coredump_filter_write(struct file *file, | |||
2522 | out_no_mm: | 2506 | out_no_mm: |
2523 | put_task_struct(task); | 2507 | put_task_struct(task); |
2524 | out_no_task: | 2508 | out_no_task: |
2525 | return ret; | 2509 | if (ret < 0) |
2510 | return ret; | ||
2511 | return count; | ||
2526 | } | 2512 | } |
2527 | 2513 | ||
2528 | static const struct file_operations proc_coredump_filter_operations = { | 2514 | static const struct file_operations proc_coredump_filter_operations = { |
@@ -2744,9 +2730,7 @@ static const struct inode_operations proc_task_inode_operations; | |||
2744 | static const struct pid_entry tgid_base_stuff[] = { | 2730 | static const struct pid_entry tgid_base_stuff[] = { |
2745 | DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), | 2731 | DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), |
2746 | DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), | 2732 | DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), |
2747 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
2748 | DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), | 2733 | DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), |
2749 | #endif | ||
2750 | DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), | 2734 | DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), |
2751 | DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), | 2735 | DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), |
2752 | #ifdef CONFIG_NET | 2736 | #ifdef CONFIG_NET |
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index e5dee5c3188e..ff3ffc76a937 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -26,7 +26,7 @@ | |||
26 | 26 | ||
27 | #include "internal.h" | 27 | #include "internal.h" |
28 | 28 | ||
29 | static DEFINE_SPINLOCK(proc_subdir_lock); | 29 | static DEFINE_RWLOCK(proc_subdir_lock); |
30 | 30 | ||
31 | static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) | 31 | static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) |
32 | { | 32 | { |
@@ -172,9 +172,9 @@ static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, | |||
172 | { | 172 | { |
173 | int rv; | 173 | int rv; |
174 | 174 | ||
175 | spin_lock(&proc_subdir_lock); | 175 | read_lock(&proc_subdir_lock); |
176 | rv = __xlate_proc_name(name, ret, residual); | 176 | rv = __xlate_proc_name(name, ret, residual); |
177 | spin_unlock(&proc_subdir_lock); | 177 | read_unlock(&proc_subdir_lock); |
178 | return rv; | 178 | return rv; |
179 | } | 179 | } |
180 | 180 | ||
@@ -231,11 +231,11 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | |||
231 | { | 231 | { |
232 | struct inode *inode; | 232 | struct inode *inode; |
233 | 233 | ||
234 | spin_lock(&proc_subdir_lock); | 234 | read_lock(&proc_subdir_lock); |
235 | de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); | 235 | de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len); |
236 | if (de) { | 236 | if (de) { |
237 | pde_get(de); | 237 | pde_get(de); |
238 | spin_unlock(&proc_subdir_lock); | 238 | read_unlock(&proc_subdir_lock); |
239 | inode = proc_get_inode(dir->i_sb, de); | 239 | inode = proc_get_inode(dir->i_sb, de); |
240 | if (!inode) | 240 | if (!inode) |
241 | return ERR_PTR(-ENOMEM); | 241 | return ERR_PTR(-ENOMEM); |
@@ -243,7 +243,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, | |||
243 | d_add(dentry, inode); | 243 | d_add(dentry, inode); |
244 | return NULL; | 244 | return NULL; |
245 | } | 245 | } |
246 | spin_unlock(&proc_subdir_lock); | 246 | read_unlock(&proc_subdir_lock); |
247 | return ERR_PTR(-ENOENT); | 247 | return ERR_PTR(-ENOENT); |
248 | } | 248 | } |
249 | 249 | ||
@@ -270,12 +270,12 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, | |||
270 | if (!dir_emit_dots(file, ctx)) | 270 | if (!dir_emit_dots(file, ctx)) |
271 | return 0; | 271 | return 0; |
272 | 272 | ||
273 | spin_lock(&proc_subdir_lock); | 273 | read_lock(&proc_subdir_lock); |
274 | de = pde_subdir_first(de); | 274 | de = pde_subdir_first(de); |
275 | i = ctx->pos - 2; | 275 | i = ctx->pos - 2; |
276 | for (;;) { | 276 | for (;;) { |
277 | if (!de) { | 277 | if (!de) { |
278 | spin_unlock(&proc_subdir_lock); | 278 | read_unlock(&proc_subdir_lock); |
279 | return 0; | 279 | return 0; |
280 | } | 280 | } |
281 | if (!i) | 281 | if (!i) |
@@ -287,19 +287,19 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *file, | |||
287 | do { | 287 | do { |
288 | struct proc_dir_entry *next; | 288 | struct proc_dir_entry *next; |
289 | pde_get(de); | 289 | pde_get(de); |
290 | spin_unlock(&proc_subdir_lock); | 290 | read_unlock(&proc_subdir_lock); |
291 | if (!dir_emit(ctx, de->name, de->namelen, | 291 | if (!dir_emit(ctx, de->name, de->namelen, |
292 | de->low_ino, de->mode >> 12)) { | 292 | de->low_ino, de->mode >> 12)) { |
293 | pde_put(de); | 293 | pde_put(de); |
294 | return 0; | 294 | return 0; |
295 | } | 295 | } |
296 | spin_lock(&proc_subdir_lock); | 296 | read_lock(&proc_subdir_lock); |
297 | ctx->pos++; | 297 | ctx->pos++; |
298 | next = pde_subdir_next(de); | 298 | next = pde_subdir_next(de); |
299 | pde_put(de); | 299 | pde_put(de); |
300 | de = next; | 300 | de = next; |
301 | } while (de); | 301 | } while (de); |
302 | spin_unlock(&proc_subdir_lock); | 302 | read_unlock(&proc_subdir_lock); |
303 | return 1; | 303 | return 1; |
304 | } | 304 | } |
305 | 305 | ||
@@ -338,16 +338,16 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp | |||
338 | if (ret) | 338 | if (ret) |
339 | return ret; | 339 | return ret; |
340 | 340 | ||
341 | spin_lock(&proc_subdir_lock); | 341 | write_lock(&proc_subdir_lock); |
342 | dp->parent = dir; | 342 | dp->parent = dir; |
343 | if (pde_subdir_insert(dir, dp) == false) { | 343 | if (pde_subdir_insert(dir, dp) == false) { |
344 | WARN(1, "proc_dir_entry '%s/%s' already registered\n", | 344 | WARN(1, "proc_dir_entry '%s/%s' already registered\n", |
345 | dir->name, dp->name); | 345 | dir->name, dp->name); |
346 | spin_unlock(&proc_subdir_lock); | 346 | write_unlock(&proc_subdir_lock); |
347 | proc_free_inum(dp->low_ino); | 347 | proc_free_inum(dp->low_ino); |
348 | return -EEXIST; | 348 | return -EEXIST; |
349 | } | 349 | } |
350 | spin_unlock(&proc_subdir_lock); | 350 | write_unlock(&proc_subdir_lock); |
351 | 351 | ||
352 | return 0; | 352 | return 0; |
353 | } | 353 | } |
@@ -549,9 +549,9 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) | |||
549 | const char *fn = name; | 549 | const char *fn = name; |
550 | unsigned int len; | 550 | unsigned int len; |
551 | 551 | ||
552 | spin_lock(&proc_subdir_lock); | 552 | write_lock(&proc_subdir_lock); |
553 | if (__xlate_proc_name(name, &parent, &fn) != 0) { | 553 | if (__xlate_proc_name(name, &parent, &fn) != 0) { |
554 | spin_unlock(&proc_subdir_lock); | 554 | write_unlock(&proc_subdir_lock); |
555 | return; | 555 | return; |
556 | } | 556 | } |
557 | len = strlen(fn); | 557 | len = strlen(fn); |
@@ -559,7 +559,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) | |||
559 | de = pde_subdir_find(parent, fn, len); | 559 | de = pde_subdir_find(parent, fn, len); |
560 | if (de) | 560 | if (de) |
561 | rb_erase(&de->subdir_node, &parent->subdir); | 561 | rb_erase(&de->subdir_node, &parent->subdir); |
562 | spin_unlock(&proc_subdir_lock); | 562 | write_unlock(&proc_subdir_lock); |
563 | if (!de) { | 563 | if (!de) { |
564 | WARN(1, "name '%s'\n", name); | 564 | WARN(1, "name '%s'\n", name); |
565 | return; | 565 | return; |
@@ -583,16 +583,16 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) | |||
583 | const char *fn = name; | 583 | const char *fn = name; |
584 | unsigned int len; | 584 | unsigned int len; |
585 | 585 | ||
586 | spin_lock(&proc_subdir_lock); | 586 | write_lock(&proc_subdir_lock); |
587 | if (__xlate_proc_name(name, &parent, &fn) != 0) { | 587 | if (__xlate_proc_name(name, &parent, &fn) != 0) { |
588 | spin_unlock(&proc_subdir_lock); | 588 | write_unlock(&proc_subdir_lock); |
589 | return -ENOENT; | 589 | return -ENOENT; |
590 | } | 590 | } |
591 | len = strlen(fn); | 591 | len = strlen(fn); |
592 | 592 | ||
593 | root = pde_subdir_find(parent, fn, len); | 593 | root = pde_subdir_find(parent, fn, len); |
594 | if (!root) { | 594 | if (!root) { |
595 | spin_unlock(&proc_subdir_lock); | 595 | write_unlock(&proc_subdir_lock); |
596 | return -ENOENT; | 596 | return -ENOENT; |
597 | } | 597 | } |
598 | rb_erase(&root->subdir_node, &parent->subdir); | 598 | rb_erase(&root->subdir_node, &parent->subdir); |
@@ -605,7 +605,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) | |||
605 | de = next; | 605 | de = next; |
606 | continue; | 606 | continue; |
607 | } | 607 | } |
608 | spin_unlock(&proc_subdir_lock); | 608 | write_unlock(&proc_subdir_lock); |
609 | 609 | ||
610 | proc_entry_rundown(de); | 610 | proc_entry_rundown(de); |
611 | next = de->parent; | 611 | next = de->parent; |
@@ -616,7 +616,7 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) | |||
616 | break; | 616 | break; |
617 | pde_put(de); | 617 | pde_put(de); |
618 | 618 | ||
619 | spin_lock(&proc_subdir_lock); | 619 | write_lock(&proc_subdir_lock); |
620 | de = next; | 620 | de = next; |
621 | } | 621 | } |
622 | pde_put(root); | 622 | pde_put(root); |
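Lookups and readdir only traverse the proc_dir_entry rbtree, while register and remove modify it, and traversal is far more frequent, so turning proc_subdir_lock into an rwlock lets concurrent /proc path lookups proceed in parallel and only serializes entry creation and removal. Condensed from proc_lookup_de() and proc_register() above; it reuses this file's own pde_* helpers, so it is a sketch of the locking pattern rather than a standalone module:

static DEFINE_RWLOCK(example_subdir_lock);

static struct proc_dir_entry *example_lookup(struct proc_dir_entry *dir,
					     const char *name, unsigned int len)
{
	struct proc_dir_entry *de;

	read_lock(&example_subdir_lock);	/* many readers may hold this */
	de = pde_subdir_find(dir, name, len);
	if (de)
		pde_get(de);
	read_unlock(&example_subdir_lock);
	return de;
}

static int example_register(struct proc_dir_entry *dir,
			    struct proc_dir_entry *dp)
{
	int ret = 0;

	write_lock(&example_subdir_lock);	/* exclusive: the rbtree changes */
	dp->parent = dir;
	if (!pde_subdir_insert(dir, dp))
		ret = -EEXIST;
	write_unlock(&example_subdir_lock);
	return ret;
}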
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d3ebf2e61853..9155a5a0d3b9 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
27 | { | 27 | { |
28 | struct sysinfo i; | 28 | struct sysinfo i; |
29 | unsigned long committed; | 29 | unsigned long committed; |
30 | struct vmalloc_info vmi; | ||
31 | long cached; | 30 | long cached; |
32 | long available; | 31 | long available; |
33 | unsigned long pagecache; | 32 | unsigned long pagecache; |
@@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
49 | if (cached < 0) | 48 | if (cached < 0) |
50 | cached = 0; | 49 | cached = 0; |
51 | 50 | ||
52 | get_vmalloc_info(&vmi); | ||
53 | |||
54 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) | 51 | for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) |
55 | pages[lru] = global_page_state(NR_LRU_BASE + lru); | 52 | pages[lru] = global_page_state(NR_LRU_BASE + lru); |
56 | 53 | ||
@@ -191,8 +188,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) | |||
191 | K(vm_commit_limit()), | 188 | K(vm_commit_limit()), |
192 | K(committed), | 189 | K(committed), |
193 | (unsigned long)VMALLOC_TOTAL >> 10, | 190 | (unsigned long)VMALLOC_TOTAL >> 10, |
194 | vmi.used >> 10, | 191 | 0ul, // used to be vmalloc 'used' |
195 | vmi.largest_chunk >> 10 | 192 | 0ul // used to be vmalloc 'largest_chunk' |
196 | #ifdef CONFIG_MEMORY_FAILURE | 193 | #ifdef CONFIG_MEMORY_FAILURE |
197 | , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) | 194 | , atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10) |
198 | #endif | 195 | #endif |
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 7eee2d8b97d9..93484034a03d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -9,12 +9,16 @@ | |||
9 | #include <linux/proc_fs.h> | 9 | #include <linux/proc_fs.h> |
10 | #include <linux/seq_file.h> | 10 | #include <linux/seq_file.h> |
11 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
12 | #include <linux/memcontrol.h> | ||
13 | #include <linux/mmu_notifier.h> | ||
14 | #include <linux/page_idle.h> | ||
12 | #include <linux/kernel-page-flags.h> | 15 | #include <linux/kernel-page-flags.h> |
13 | #include <asm/uaccess.h> | 16 | #include <asm/uaccess.h> |
14 | #include "internal.h" | 17 | #include "internal.h" |
15 | 18 | ||
16 | #define KPMSIZE sizeof(u64) | 19 | #define KPMSIZE sizeof(u64) |
17 | #define KPMMASK (KPMSIZE - 1) | 20 | #define KPMMASK (KPMSIZE - 1) |
21 | #define KPMBITS (KPMSIZE * BITS_PER_BYTE) | ||
18 | 22 | ||
19 | /* /proc/kpagecount - an array exposing page counts | 23 | /* /proc/kpagecount - an array exposing page counts |
20 | * | 24 | * |
@@ -54,6 +58,8 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, | |||
54 | pfn++; | 58 | pfn++; |
55 | out++; | 59 | out++; |
56 | count -= KPMSIZE; | 60 | count -= KPMSIZE; |
61 | |||
62 | cond_resched(); | ||
57 | } | 63 | } |
58 | 64 | ||
59 | *ppos += (char __user *)out - buf; | 65 | *ppos += (char __user *)out - buf; |
@@ -146,6 +152,9 @@ u64 stable_page_flags(struct page *page) | |||
146 | if (PageBalloon(page)) | 152 | if (PageBalloon(page)) |
147 | u |= 1 << KPF_BALLOON; | 153 | u |= 1 << KPF_BALLOON; |
148 | 154 | ||
155 | if (page_is_idle(page)) | ||
156 | u |= 1 << KPF_IDLE; | ||
157 | |||
149 | u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); | 158 | u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); |
150 | 159 | ||
151 | u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); | 160 | u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); |
@@ -212,6 +221,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, | |||
212 | pfn++; | 221 | pfn++; |
213 | out++; | 222 | out++; |
214 | count -= KPMSIZE; | 223 | count -= KPMSIZE; |
224 | |||
225 | cond_resched(); | ||
215 | } | 226 | } |
216 | 227 | ||
217 | *ppos += (char __user *)out - buf; | 228 | *ppos += (char __user *)out - buf; |
@@ -225,10 +236,64 @@ static const struct file_operations proc_kpageflags_operations = { | |||
225 | .read = kpageflags_read, | 236 | .read = kpageflags_read, |
226 | }; | 237 | }; |
227 | 238 | ||
239 | #ifdef CONFIG_MEMCG | ||
240 | static ssize_t kpagecgroup_read(struct file *file, char __user *buf, | ||
241 | size_t count, loff_t *ppos) | ||
242 | { | ||
243 | u64 __user *out = (u64 __user *)buf; | ||
244 | struct page *ppage; | ||
245 | unsigned long src = *ppos; | ||
246 | unsigned long pfn; | ||
247 | ssize_t ret = 0; | ||
248 | u64 ino; | ||
249 | |||
250 | pfn = src / KPMSIZE; | ||
251 | count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); | ||
252 | if (src & KPMMASK || count & KPMMASK) | ||
253 | return -EINVAL; | ||
254 | |||
255 | while (count > 0) { | ||
256 | if (pfn_valid(pfn)) | ||
257 | ppage = pfn_to_page(pfn); | ||
258 | else | ||
259 | ppage = NULL; | ||
260 | |||
261 | if (ppage) | ||
262 | ino = page_cgroup_ino(ppage); | ||
263 | else | ||
264 | ino = 0; | ||
265 | |||
266 | if (put_user(ino, out)) { | ||
267 | ret = -EFAULT; | ||
268 | break; | ||
269 | } | ||
270 | |||
271 | pfn++; | ||
272 | out++; | ||
273 | count -= KPMSIZE; | ||
274 | |||
275 | cond_resched(); | ||
276 | } | ||
277 | |||
278 | *ppos += (char __user *)out - buf; | ||
279 | if (!ret) | ||
280 | ret = (char __user *)out - buf; | ||
281 | return ret; | ||
282 | } | ||
283 | |||
284 | static const struct file_operations proc_kpagecgroup_operations = { | ||
285 | .llseek = mem_lseek, | ||
286 | .read = kpagecgroup_read, | ||
287 | }; | ||
288 | #endif /* CONFIG_MEMCG */ | ||
289 | |||
228 | static int __init proc_page_init(void) | 290 | static int __init proc_page_init(void) |
229 | { | 291 | { |
230 | proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); | 292 | proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); |
231 | proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); | 293 | proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); |
294 | #ifdef CONFIG_MEMCG | ||
295 | proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations); | ||
296 | #endif | ||
232 | return 0; | 297 | return 0; |
233 | } | 298 | } |
234 | fs_initcall(proc_page_init); | 299 | fs_initcall(proc_page_init); |
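Like /proc/kpagecount and /proc/kpageflags, the new /proc/kpagecgroup file is an array of u64 entries indexed by page frame number, so userspace fetches the owning memory cgroup's inode for a PFN by seeking to pfn * 8. An illustrative reader (requires root and CONFIG_MEMCG; the PFN used here is arbitrary):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static uint64_t page_cgroup_ino(unsigned long pfn)
{
	uint64_t ino = 0;
	int fd = open("/proc/kpagecgroup", O_RDONLY);

	if (fd < 0)
		return 0;
	/* one 8-byte entry per PFN, so the file offset is pfn * sizeof(u64) */
	if (pread(fd, &ino, sizeof(ino), (off_t)pfn * sizeof(ino)) != sizeof(ino))
		ino = 0;
	close(fd);
	return ino;		/* inode of the owning memcg, 0 if none */
}

int main(void)
{
	unsigned long pfn = 0x1000;	/* arbitrary example frame */

	printf("pfn 0x%lx belongs to memcg inode %llu\n",
	       pfn, (unsigned long long)page_cgroup_ino(pfn));
	return 0;
}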
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 68feb0f70e63..361ab4ee42fc 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -134,6 +134,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, | |||
134 | } | 134 | } |
135 | 135 | ||
136 | sb->s_flags |= MS_ACTIVE; | 136 | sb->s_flags |= MS_ACTIVE; |
137 | /* User space would break if executables appear on proc */ | ||
138 | sb->s_iflags |= SB_I_NOEXEC; | ||
137 | } | 139 | } |
138 | 140 | ||
139 | return dget(sb->s_root); | 141 | return dget(sb->s_root); |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091881d4..187b3b5f242e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
14 | #include <linux/swapops.h> | 14 | #include <linux/swapops.h> |
15 | #include <linux/mmu_notifier.h> | 15 | #include <linux/mmu_notifier.h> |
16 | #include <linux/page_idle.h> | ||
16 | 17 | ||
17 | #include <asm/elf.h> | 18 | #include <asm/elf.h> |
18 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
@@ -69,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) | |||
69 | ptes >> 10, | 70 | ptes >> 10, |
70 | pmds >> 10, | 71 | pmds >> 10, |
71 | swap << (PAGE_SHIFT-10)); | 72 | swap << (PAGE_SHIFT-10)); |
73 | hugetlb_report_usage(m, mm); | ||
72 | } | 74 | } |
73 | 75 | ||
74 | unsigned long task_vsize(struct mm_struct *mm) | 76 | unsigned long task_vsize(struct mm_struct *mm) |
@@ -445,7 +447,10 @@ struct mem_size_stats { | |||
445 | unsigned long anonymous; | 447 | unsigned long anonymous; |
446 | unsigned long anonymous_thp; | 448 | unsigned long anonymous_thp; |
447 | unsigned long swap; | 449 | unsigned long swap; |
450 | unsigned long shared_hugetlb; | ||
451 | unsigned long private_hugetlb; | ||
448 | u64 pss; | 452 | u64 pss; |
453 | u64 swap_pss; | ||
449 | }; | 454 | }; |
450 | 455 | ||
451 | static void smaps_account(struct mem_size_stats *mss, struct page *page, | 456 | static void smaps_account(struct mem_size_stats *mss, struct page *page, |
@@ -458,7 +463,7 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, | |||
458 | 463 | ||
459 | mss->resident += size; | 464 | mss->resident += size; |
460 | /* Accumulate the size in pages that have been accessed. */ | 465 | /* Accumulate the size in pages that have been accessed. */ |
461 | if (young || PageReferenced(page)) | 466 | if (young || page_is_young(page) || PageReferenced(page)) |
462 | mss->referenced += size; | 467 | mss->referenced += size; |
463 | mapcount = page_mapcount(page); | 468 | mapcount = page_mapcount(page); |
464 | if (mapcount >= 2) { | 469 | if (mapcount >= 2) { |
@@ -492,9 +497,20 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, | |||
492 | } else if (is_swap_pte(*pte)) { | 497 | } else if (is_swap_pte(*pte)) { |
493 | swp_entry_t swpent = pte_to_swp_entry(*pte); | 498 | swp_entry_t swpent = pte_to_swp_entry(*pte); |
494 | 499 | ||
495 | if (!non_swap_entry(swpent)) | 500 | if (!non_swap_entry(swpent)) { |
501 | int mapcount; | ||
502 | |||
496 | mss->swap += PAGE_SIZE; | 503 | mss->swap += PAGE_SIZE; |
497 | else if (is_migration_entry(swpent)) | 504 | mapcount = swp_swapcount(swpent); |
505 | if (mapcount >= 2) { | ||
506 | u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; | ||
507 | |||
508 | do_div(pss_delta, mapcount); | ||
509 | mss->swap_pss += pss_delta; | ||
510 | } else { | ||
511 | mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; | ||
512 | } | ||
513 | } else if (is_migration_entry(swpent)) | ||
498 | page = migration_entry_to_page(swpent); | 514 | page = migration_entry_to_page(swpent); |
499 | } | 515 | } |
500 | 516 | ||
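SwapPss mirrors the existing Pss accounting: each swapped-out page contributes PAGE_SIZE divided by its swap-map reference count (swp_swapcount()), accumulated in fixed point with PSS_SHIFT (12 in this file) so the division does not lose precision until the final report. A worked example of the arithmetic, with illustrative values:

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_PAGE_SIZE	4096ULL
#define EXAMPLE_PSS_SHIFT	12

int main(void)
{
	uint64_t swap_pss = 0;
	int mapcounts[] = { 1, 2, 4 };	/* three swapped pages, referenced 1x/2x/4x */
	int i;

	for (i = 0; i < 3; i++)
		swap_pss += (EXAMPLE_PAGE_SIZE << EXAMPLE_PSS_SHIFT) / mapcounts[i];

	/* show_smap() reports swap_pss >> (10 + PSS_SHIFT), i.e. kilobytes */
	printf("SwapPss: %llu kB\n",
	       (unsigned long long)(swap_pss >> (10 + EXAMPLE_PSS_SHIFT)));
	return 0;
}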
@@ -597,6 +613,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | |||
597 | [ilog2(VM_HUGEPAGE)] = "hg", | 613 | [ilog2(VM_HUGEPAGE)] = "hg", |
598 | [ilog2(VM_NOHUGEPAGE)] = "nh", | 614 | [ilog2(VM_NOHUGEPAGE)] = "nh", |
599 | [ilog2(VM_MERGEABLE)] = "mg", | 615 | [ilog2(VM_MERGEABLE)] = "mg", |
616 | [ilog2(VM_UFFD_MISSING)]= "um", | ||
617 | [ilog2(VM_UFFD_WP)] = "uw", | ||
600 | }; | 618 | }; |
601 | size_t i; | 619 | size_t i; |
602 | 620 | ||
@@ -610,12 +628,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | |||
610 | seq_putc(m, '\n'); | 628 | seq_putc(m, '\n'); |
611 | } | 629 | } |
612 | 630 | ||
631 | #ifdef CONFIG_HUGETLB_PAGE | ||
632 | static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, | ||
633 | unsigned long addr, unsigned long end, | ||
634 | struct mm_walk *walk) | ||
635 | { | ||
636 | struct mem_size_stats *mss = walk->private; | ||
637 | struct vm_area_struct *vma = walk->vma; | ||
638 | struct page *page = NULL; | ||
639 | |||
640 | if (pte_present(*pte)) { | ||
641 | page = vm_normal_page(vma, addr, *pte); | ||
642 | } else if (is_swap_pte(*pte)) { | ||
643 | swp_entry_t swpent = pte_to_swp_entry(*pte); | ||
644 | |||
645 | if (is_migration_entry(swpent)) | ||
646 | page = migration_entry_to_page(swpent); | ||
647 | } | ||
648 | if (page) { | ||
649 | int mapcount = page_mapcount(page); | ||
650 | |||
651 | if (mapcount >= 2) | ||
652 | mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); | ||
653 | else | ||
654 | mss->private_hugetlb += huge_page_size(hstate_vma(vma)); | ||
655 | } | ||
656 | return 0; | ||
657 | } | ||
658 | #endif /* HUGETLB_PAGE */ | ||
659 | |||
613 | static int show_smap(struct seq_file *m, void *v, int is_pid) | 660 | static int show_smap(struct seq_file *m, void *v, int is_pid) |
614 | { | 661 | { |
615 | struct vm_area_struct *vma = v; | 662 | struct vm_area_struct *vma = v; |
616 | struct mem_size_stats mss; | 663 | struct mem_size_stats mss; |
617 | struct mm_walk smaps_walk = { | 664 | struct mm_walk smaps_walk = { |
618 | .pmd_entry = smaps_pte_range, | 665 | .pmd_entry = smaps_pte_range, |
666 | #ifdef CONFIG_HUGETLB_PAGE | ||
667 | .hugetlb_entry = smaps_hugetlb_range, | ||
668 | #endif | ||
619 | .mm = vma->vm_mm, | 669 | .mm = vma->vm_mm, |
620 | .private = &mss, | 670 | .private = &mss, |
621 | }; | 671 | }; |
@@ -637,7 +687,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
637 | "Referenced: %8lu kB\n" | 687 | "Referenced: %8lu kB\n" |
638 | "Anonymous: %8lu kB\n" | 688 | "Anonymous: %8lu kB\n" |
639 | "AnonHugePages: %8lu kB\n" | 689 | "AnonHugePages: %8lu kB\n" |
690 | "Shared_Hugetlb: %8lu kB\n" | ||
691 | "Private_Hugetlb: %7lu kB\n" | ||
640 | "Swap: %8lu kB\n" | 692 | "Swap: %8lu kB\n" |
693 | "SwapPss: %8lu kB\n" | ||
641 | "KernelPageSize: %8lu kB\n" | 694 | "KernelPageSize: %8lu kB\n" |
642 | "MMUPageSize: %8lu kB\n" | 695 | "MMUPageSize: %8lu kB\n" |
643 | "Locked: %8lu kB\n", | 696 | "Locked: %8lu kB\n", |
@@ -651,7 +704,10 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
651 | mss.referenced >> 10, | 704 | mss.referenced >> 10, |
652 | mss.anonymous >> 10, | 705 | mss.anonymous >> 10, |
653 | mss.anonymous_thp >> 10, | 706 | mss.anonymous_thp >> 10, |
707 | mss.shared_hugetlb >> 10, | ||
708 | mss.private_hugetlb >> 10, | ||
654 | mss.swap >> 10, | 709 | mss.swap >> 10, |
710 | (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), | ||
655 | vma_kernel_pagesize(vma) >> 10, | 711 | vma_kernel_pagesize(vma) >> 10, |
656 | vma_mmu_pagesize(vma) >> 10, | 712 | vma_mmu_pagesize(vma) >> 10, |
657 | (vma->vm_flags & VM_LOCKED) ? | 713 | (vma->vm_flags & VM_LOCKED) ? |
@@ -710,23 +766,6 @@ const struct file_operations proc_tid_smaps_operations = { | |||
710 | .release = proc_map_release, | 766 | .release = proc_map_release, |
711 | }; | 767 | }; |
712 | 768 | ||
713 | /* | ||
714 | * We do not want to have constant page-shift bits sitting in | ||
715 | * pagemap entries and are about to reuse them some time soon. | ||
716 | * | ||
717 | * Here's the "migration strategy": | ||
718 | * 1. when the system boots these bits remain what they are, | ||
719 | * but a warning about future change is printed in log; | ||
720 | * 2. once anyone clears soft-dirty bits via clear_refs file, | ||
721 | * these flag is set to denote, that user is aware of the | ||
722 | * new API and those page-shift bits change their meaning. | ||
723 | * The respective warning is printed in dmesg; | ||
724 | * 3. In a couple of releases we will remove all the mentions | ||
725 | * of page-shift in pagemap entries. | ||
726 | */ | ||
727 | |||
728 | static bool soft_dirty_cleared __read_mostly; | ||
729 | |||
730 | enum clear_refs_types { | 769 | enum clear_refs_types { |
731 | CLEAR_REFS_ALL = 1, | 770 | CLEAR_REFS_ALL = 1, |
732 | CLEAR_REFS_ANON, | 771 | CLEAR_REFS_ANON, |
@@ -753,36 +792,37 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, | |||
753 | pte_t ptent = *pte; | 792 | pte_t ptent = *pte; |
754 | 793 | ||
755 | if (pte_present(ptent)) { | 794 | if (pte_present(ptent)) { |
795 | ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte); | ||
756 | ptent = pte_wrprotect(ptent); | 796 | ptent = pte_wrprotect(ptent); |
757 | ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); | 797 | ptent = pte_clear_soft_dirty(ptent); |
798 | ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent); | ||
758 | } else if (is_swap_pte(ptent)) { | 799 | } else if (is_swap_pte(ptent)) { |
759 | ptent = pte_swp_clear_soft_dirty(ptent); | 800 | ptent = pte_swp_clear_soft_dirty(ptent); |
801 | set_pte_at(vma->vm_mm, addr, pte, ptent); | ||
760 | } | 802 | } |
761 | |||
762 | set_pte_at(vma->vm_mm, addr, pte, ptent); | ||
763 | } | 803 | } |
804 | #else | ||
805 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | ||
806 | unsigned long addr, pte_t *pte) | ||
807 | { | ||
808 | } | ||
809 | #endif | ||
764 | 810 | ||
811 | #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
765 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, | 812 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, |
766 | unsigned long addr, pmd_t *pmdp) | 813 | unsigned long addr, pmd_t *pmdp) |
767 | { | 814 | { |
768 | pmd_t pmd = *pmdp; | 815 | pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp); |
769 | 816 | ||
770 | pmd = pmd_wrprotect(pmd); | 817 | pmd = pmd_wrprotect(pmd); |
771 | pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); | 818 | pmd = pmd_clear_soft_dirty(pmd); |
772 | 819 | ||
773 | if (vma->vm_flags & VM_SOFTDIRTY) | 820 | if (vma->vm_flags & VM_SOFTDIRTY) |
774 | vma->vm_flags &= ~VM_SOFTDIRTY; | 821 | vma->vm_flags &= ~VM_SOFTDIRTY; |
775 | 822 | ||
776 | set_pmd_at(vma->vm_mm, addr, pmdp, pmd); | 823 | set_pmd_at(vma->vm_mm, addr, pmdp, pmd); |
777 | } | 824 | } |
778 | |||
779 | #else | 825 | #else |
780 | |||
781 | static inline void clear_soft_dirty(struct vm_area_struct *vma, | ||
782 | unsigned long addr, pte_t *pte) | ||
783 | { | ||
784 | } | ||
785 | |||
786 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, | 826 | static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, |
787 | unsigned long addr, pmd_t *pmdp) | 827 | unsigned long addr, pmd_t *pmdp) |
788 | { | 828 | { |
@@ -808,6 +848,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | |||
808 | 848 | ||
809 | /* Clear accessed and referenced bits. */ | 849 | /* Clear accessed and referenced bits. */ |
810 | pmdp_test_and_clear_young(vma, addr, pmd); | 850 | pmdp_test_and_clear_young(vma, addr, pmd); |
851 | test_and_clear_page_young(page); | ||
811 | ClearPageReferenced(page); | 852 | ClearPageReferenced(page); |
812 | out: | 853 | out: |
813 | spin_unlock(ptl); | 854 | spin_unlock(ptl); |
@@ -835,6 +876,7 @@ out: | |||
835 | 876 | ||
836 | /* Clear accessed and referenced bits. */ | 877 | /* Clear accessed and referenced bits. */ |
837 | ptep_test_and_clear_young(vma, addr, pte); | 878 | ptep_test_and_clear_young(vma, addr, pte); |
879 | test_and_clear_page_young(page); | ||
838 | ClearPageReferenced(page); | 880 | ClearPageReferenced(page); |
839 | } | 881 | } |
840 | pte_unmap_unlock(pte - 1, ptl); | 882 | pte_unmap_unlock(pte - 1, ptl); |
@@ -887,13 +929,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, | |||
887 | if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) | 929 | if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) |
888 | return -EINVAL; | 930 | return -EINVAL; |
889 | 931 | ||
890 | if (type == CLEAR_REFS_SOFT_DIRTY) { | ||
891 | soft_dirty_cleared = true; | ||
892 | pr_warn_once("The pagemap bits 55-60 has changed their meaning!" | ||
893 | " See the linux/Documentation/vm/pagemap.txt for " | ||
894 | "details.\n"); | ||
895 | } | ||
896 | |||
897 | task = get_proc_task(file_inode(file)); | 932 | task = get_proc_task(file_inode(file)); |
898 | if (!task) | 933 | if (!task) |
899 | return -ESRCH; | 934 | return -ESRCH; |
@@ -961,36 +996,26 @@ typedef struct { | |||
961 | struct pagemapread { | 996 | struct pagemapread { |
962 | int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ | 997 | int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ |
963 | pagemap_entry_t *buffer; | 998 | pagemap_entry_t *buffer; |
964 | bool v2; | 999 | bool show_pfn; |
965 | }; | 1000 | }; |
966 | 1001 | ||
967 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) | 1002 | #define PAGEMAP_WALK_SIZE (PMD_SIZE) |
968 | #define PAGEMAP_WALK_MASK (PMD_MASK) | 1003 | #define PAGEMAP_WALK_MASK (PMD_MASK) |
969 | 1004 | ||
970 | #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) | 1005 | #define PM_ENTRY_BYTES sizeof(pagemap_entry_t) |
971 | #define PM_STATUS_BITS 3 | 1006 | #define PM_PFRAME_BITS 55 |
972 | #define PM_STATUS_OFFSET (64 - PM_STATUS_BITS) | 1007 | #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) |
973 | #define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET) | 1008 | #define PM_SOFT_DIRTY BIT_ULL(55) |
974 | #define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK) | 1009 | #define PM_MMAP_EXCLUSIVE BIT_ULL(56) |
975 | #define PM_PSHIFT_BITS 6 | 1010 | #define PM_FILE BIT_ULL(61) |
976 | #define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS) | 1011 | #define PM_SWAP BIT_ULL(62) |
977 | #define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET) | 1012 | #define PM_PRESENT BIT_ULL(63) |
978 | #define __PM_PSHIFT(x) (((u64) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK) | 1013 | |
979 | #define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1) | ||
980 | #define PM_PFRAME(x) ((x) & PM_PFRAME_MASK) | ||
981 | /* in "new" pagemap pshift bits are occupied with more status bits */ | ||
982 | #define PM_STATUS2(v2, x) (__PM_PSHIFT(v2 ? x : PAGE_SHIFT)) | ||
983 | |||
984 | #define __PM_SOFT_DIRTY (1LL) | ||
985 | #define PM_PRESENT PM_STATUS(4LL) | ||
986 | #define PM_SWAP PM_STATUS(2LL) | ||
987 | #define PM_FILE PM_STATUS(1LL) | ||
988 | #define PM_NOT_PRESENT(v2) PM_STATUS2(v2, 0) | ||
989 | #define PM_END_OF_BUFFER 1 | 1014 | #define PM_END_OF_BUFFER 1 |
990 | 1015 | ||
991 | static inline pagemap_entry_t make_pme(u64 val) | 1016 | static inline pagemap_entry_t make_pme(u64 frame, u64 flags) |
992 | { | 1017 | { |
993 | return (pagemap_entry_t) { .pme = val }; | 1018 | return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags }; |
994 | } | 1019 | } |
995 | 1020 | ||
996 | static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, | 1021 | static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, |
@@ -1011,7 +1036,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, | |||
1011 | 1036 | ||
1012 | while (addr < end) { | 1037 | while (addr < end) { |
1013 | struct vm_area_struct *vma = find_vma(walk->mm, addr); | 1038 | struct vm_area_struct *vma = find_vma(walk->mm, addr); |
1014 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); | 1039 | pagemap_entry_t pme = make_pme(0, 0); |
1015 | /* End of address space hole, which we mark as non-present. */ | 1040 | /* End of address space hole, which we mark as non-present. */ |
1016 | unsigned long hole_end; | 1041 | unsigned long hole_end; |
1017 | 1042 | ||
@@ -1031,7 +1056,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, | |||
1031 | 1056 | ||
1032 | /* Addresses in the VMA. */ | 1057 | /* Addresses in the VMA. */ |
1033 | if (vma->vm_flags & VM_SOFTDIRTY) | 1058 | if (vma->vm_flags & VM_SOFTDIRTY) |
1034 | pme.pme |= PM_STATUS2(pm->v2, __PM_SOFT_DIRTY); | 1059 | pme = make_pme(0, PM_SOFT_DIRTY); |
1035 | for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { | 1060 | for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { |
1036 | err = add_to_pagemap(addr, &pme, pm); | 1061 | err = add_to_pagemap(addr, &pme, pm); |
1037 | if (err) | 1062 | if (err) |
@@ -1042,67 +1067,42 @@ out: | |||
1042 | return err; | 1067 | return err; |
1043 | } | 1068 | } |
1044 | 1069 | ||
1045 | static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | 1070 | static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, |
1046 | struct vm_area_struct *vma, unsigned long addr, pte_t pte) | 1071 | struct vm_area_struct *vma, unsigned long addr, pte_t pte) |
1047 | { | 1072 | { |
1048 | u64 frame, flags; | 1073 | u64 frame = 0, flags = 0; |
1049 | struct page *page = NULL; | 1074 | struct page *page = NULL; |
1050 | int flags2 = 0; | ||
1051 | 1075 | ||
1052 | if (pte_present(pte)) { | 1076 | if (pte_present(pte)) { |
1053 | frame = pte_pfn(pte); | 1077 | if (pm->show_pfn) |
1054 | flags = PM_PRESENT; | 1078 | frame = pte_pfn(pte); |
1079 | flags |= PM_PRESENT; | ||
1055 | page = vm_normal_page(vma, addr, pte); | 1080 | page = vm_normal_page(vma, addr, pte); |
1056 | if (pte_soft_dirty(pte)) | 1081 | if (pte_soft_dirty(pte)) |
1057 | flags2 |= __PM_SOFT_DIRTY; | 1082 | flags |= PM_SOFT_DIRTY; |
1058 | } else if (is_swap_pte(pte)) { | 1083 | } else if (is_swap_pte(pte)) { |
1059 | swp_entry_t entry; | 1084 | swp_entry_t entry; |
1060 | if (pte_swp_soft_dirty(pte)) | 1085 | if (pte_swp_soft_dirty(pte)) |
1061 | flags2 |= __PM_SOFT_DIRTY; | 1086 | flags |= PM_SOFT_DIRTY; |
1062 | entry = pte_to_swp_entry(pte); | 1087 | entry = pte_to_swp_entry(pte); |
1063 | frame = swp_type(entry) | | 1088 | frame = swp_type(entry) | |
1064 | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); | 1089 | (swp_offset(entry) << MAX_SWAPFILES_SHIFT); |
1065 | flags = PM_SWAP; | 1090 | flags |= PM_SWAP; |
1066 | if (is_migration_entry(entry)) | 1091 | if (is_migration_entry(entry)) |
1067 | page = migration_entry_to_page(entry); | 1092 | page = migration_entry_to_page(entry); |
1068 | } else { | ||
1069 | if (vma->vm_flags & VM_SOFTDIRTY) | ||
1070 | flags2 |= __PM_SOFT_DIRTY; | ||
1071 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | ||
1072 | return; | ||
1073 | } | 1093 | } |
1074 | 1094 | ||
1075 | if (page && !PageAnon(page)) | 1095 | if (page && !PageAnon(page)) |
1076 | flags |= PM_FILE; | 1096 | flags |= PM_FILE; |
1077 | if ((vma->vm_flags & VM_SOFTDIRTY)) | 1097 | if (page && page_mapcount(page) == 1) |
1078 | flags2 |= __PM_SOFT_DIRTY; | 1098 | flags |= PM_MMAP_EXCLUSIVE; |
1099 | if (vma->vm_flags & VM_SOFTDIRTY) | ||
1100 | flags |= PM_SOFT_DIRTY; | ||
1079 | 1101 | ||
1080 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); | 1102 | return make_pme(frame, flags); |
1081 | } | 1103 | } |
1082 | 1104 | ||
1083 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 1105 | static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, |
1084 | static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | ||
1085 | pmd_t pmd, int offset, int pmd_flags2) | ||
1086 | { | ||
1087 | /* | ||
1088 | * Currently pmd for thp is always present because thp can not be | ||
1089 | * swapped-out, migrated, or HWPOISONed (split in such cases instead.) | ||
1090 | * This if-check is just to prepare for future implementation. | ||
1091 | */ | ||
1092 | if (pmd_present(pmd)) | ||
1093 | *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | ||
1094 | | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); | ||
1095 | else | ||
1096 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); | ||
1097 | } | ||
1098 | #else | ||
1099 | static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | ||
1100 | pmd_t pmd, int offset, int pmd_flags2) | ||
1101 | { | ||
1102 | } | ||
1103 | #endif | ||
1104 | |||
1105 | static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | ||
1106 | struct mm_walk *walk) | 1106 | struct mm_walk *walk) |
1107 | { | 1107 | { |
1108 | struct vm_area_struct *vma = walk->vma; | 1108 | struct vm_area_struct *vma = walk->vma; |
@@ -1111,41 +1111,58 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1111 | pte_t *pte, *orig_pte; | 1111 | pte_t *pte, *orig_pte; |
1112 | int err = 0; | 1112 | int err = 0; |
1113 | 1113 | ||
1114 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1114 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1115 | int pmd_flags2; | 1115 | if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { |
1116 | u64 flags = 0, frame = 0; | ||
1117 | pmd_t pmd = *pmdp; | ||
1116 | 1118 | ||
1117 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) | 1119 | if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) |
1118 | pmd_flags2 = __PM_SOFT_DIRTY; | 1120 | flags |= PM_SOFT_DIRTY; |
1119 | else | 1121 | |
1120 | pmd_flags2 = 0; | 1122 | /* |
1123 | * Currently pmd for thp is always present because thp | ||
1124 | * can not be swapped-out, migrated, or HWPOISONed | ||
1125 | * (split in such cases instead.) | ||
1126 | * This if-check is just to prepare for future implementation. | ||
1127 | */ | ||
1128 | if (pmd_present(pmd)) { | ||
1129 | struct page *page = pmd_page(pmd); | ||
1130 | |||
1131 | if (page_mapcount(page) == 1) | ||
1132 | flags |= PM_MMAP_EXCLUSIVE; | ||
1133 | |||
1134 | flags |= PM_PRESENT; | ||
1135 | if (pm->show_pfn) | ||
1136 | frame = pmd_pfn(pmd) + | ||
1137 | ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
1138 | } | ||
1121 | 1139 | ||
1122 | for (; addr != end; addr += PAGE_SIZE) { | 1140 | for (; addr != end; addr += PAGE_SIZE) { |
1123 | unsigned long offset; | 1141 | pagemap_entry_t pme = make_pme(frame, flags); |
1124 | pagemap_entry_t pme; | ||
1125 | 1142 | ||
1126 | offset = (addr & ~PAGEMAP_WALK_MASK) >> | ||
1127 | PAGE_SHIFT; | ||
1128 | thp_pmd_to_pagemap_entry(&pme, pm, *pmd, offset, pmd_flags2); | ||
1129 | err = add_to_pagemap(addr, &pme, pm); | 1143 | err = add_to_pagemap(addr, &pme, pm); |
1130 | if (err) | 1144 | if (err) |
1131 | break; | 1145 | break; |
1146 | if (pm->show_pfn && (flags & PM_PRESENT)) | ||
1147 | frame++; | ||
1132 | } | 1148 | } |
1133 | spin_unlock(ptl); | 1149 | spin_unlock(ptl); |
1134 | return err; | 1150 | return err; |
1135 | } | 1151 | } |
1136 | 1152 | ||
1137 | if (pmd_trans_unstable(pmd)) | 1153 | if (pmd_trans_unstable(pmdp)) |
1138 | return 0; | 1154 | return 0; |
1155 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
1139 | 1156 | ||
1140 | /* | 1157 | /* |
1141 | * We can assume that @vma always points to a valid one and @end never | 1158 | * We can assume that @vma always points to a valid one and @end never |
1142 | * goes beyond vma->vm_end. | 1159 | * goes beyond vma->vm_end. |
1143 | */ | 1160 | */ |
1144 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 1161 | orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); |
1145 | for (; addr < end; pte++, addr += PAGE_SIZE) { | 1162 | for (; addr < end; pte++, addr += PAGE_SIZE) { |
1146 | pagemap_entry_t pme; | 1163 | pagemap_entry_t pme; |
1147 | 1164 | ||
1148 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); | 1165 | pme = pte_to_pagemap_entry(pm, vma, addr, *pte); |
1149 | err = add_to_pagemap(addr, &pme, pm); | 1166 | err = add_to_pagemap(addr, &pme, pm); |
1150 | if (err) | 1167 | if (err) |
1151 | break; | 1168 | break; |
@@ -1158,40 +1175,44 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1158 | } | 1175 | } |
1159 | 1176 | ||
1160 | #ifdef CONFIG_HUGETLB_PAGE | 1177 | #ifdef CONFIG_HUGETLB_PAGE |
1161 | static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | ||
1162 | pte_t pte, int offset, int flags2) | ||
1163 | { | ||
1164 | if (pte_present(pte)) | ||
1165 | *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | | ||
1166 | PM_STATUS2(pm->v2, flags2) | | ||
1167 | PM_PRESENT); | ||
1168 | else | ||
1169 | *pme = make_pme(PM_NOT_PRESENT(pm->v2) | | ||
1170 | PM_STATUS2(pm->v2, flags2)); | ||
1171 | } | ||
1172 | |||
1173 | /* This function walks within one hugetlb entry in the single call */ | 1178 | /* This function walks within one hugetlb entry in the single call */ |
1174 | static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | 1179 | static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, |
1175 | unsigned long addr, unsigned long end, | 1180 | unsigned long addr, unsigned long end, |
1176 | struct mm_walk *walk) | 1181 | struct mm_walk *walk) |
1177 | { | 1182 | { |
1178 | struct pagemapread *pm = walk->private; | 1183 | struct pagemapread *pm = walk->private; |
1179 | struct vm_area_struct *vma = walk->vma; | 1184 | struct vm_area_struct *vma = walk->vma; |
1185 | u64 flags = 0, frame = 0; | ||
1180 | int err = 0; | 1186 | int err = 0; |
1181 | int flags2; | 1187 | pte_t pte; |
1182 | pagemap_entry_t pme; | ||
1183 | 1188 | ||
1184 | if (vma->vm_flags & VM_SOFTDIRTY) | 1189 | if (vma->vm_flags & VM_SOFTDIRTY) |
1185 | flags2 = __PM_SOFT_DIRTY; | 1190 | flags |= PM_SOFT_DIRTY; |
1186 | else | 1191 | |
1187 | flags2 = 0; | 1192 | pte = huge_ptep_get(ptep); |
1193 | if (pte_present(pte)) { | ||
1194 | struct page *page = pte_page(pte); | ||
1195 | |||
1196 | if (!PageAnon(page)) | ||
1197 | flags |= PM_FILE; | ||
1198 | |||
1199 | if (page_mapcount(page) == 1) | ||
1200 | flags |= PM_MMAP_EXCLUSIVE; | ||
1201 | |||
1202 | flags |= PM_PRESENT; | ||
1203 | if (pm->show_pfn) | ||
1204 | frame = pte_pfn(pte) + | ||
1205 | ((addr & ~hmask) >> PAGE_SHIFT); | ||
1206 | } | ||
1188 | 1207 | ||
1189 | for (; addr != end; addr += PAGE_SIZE) { | 1208 | for (; addr != end; addr += PAGE_SIZE) { |
1190 | int offset = (addr & ~hmask) >> PAGE_SHIFT; | 1209 | pagemap_entry_t pme = make_pme(frame, flags); |
1191 | huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); | 1210 | |
1192 | err = add_to_pagemap(addr, &pme, pm); | 1211 | err = add_to_pagemap(addr, &pme, pm); |
1193 | if (err) | 1212 | if (err) |
1194 | return err; | 1213 | return err; |
1214 | if (pm->show_pfn && (flags & PM_PRESENT)) | ||
1215 | frame++; | ||
1195 | } | 1216 | } |
1196 | 1217 | ||
1197 | cond_resched(); | 1218 | cond_resched(); |
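
[Editor's note] Both the THP and hugetlb walkers now compute the flags and the head frame once per huge mapping, then emit one entry per PAGE_SIZE step, bumping frame only while the mapping is present and PFNs are being shown. A short sketch of the resulting pattern, reusing make_pme_model() and the PM_* constants from the earlier illustration (head_pfn and npages are hypothetical inputs): a present 2 MB mapping yields 512 consecutive entries whose PFNs increase by one.

    /* Emit one model entry per base page of a present huge mapping. */
    static void fill_huge_entries(uint64_t *out, uint64_t head_pfn,
                                  unsigned long npages, uint64_t flags)
    {
            unsigned long i;

            for (i = 0; i < npages; i++)
                    out[i] = make_pme_model(head_pfn + i, flags | PM_PRESENT);
    }
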
@@ -1209,7 +1230,9 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | |||
1209 | * Bits 0-54 page frame number (PFN) if present | 1230 | * Bits 0-54 page frame number (PFN) if present |
1210 | * Bits 0-4 swap type if swapped | 1231 | * Bits 0-4 swap type if swapped |
1211 | * Bits 5-54 swap offset if swapped | 1232 | * Bits 5-54 swap offset if swapped |
1212 | * Bits 55-60 page shift (page size = 1<<page shift) | 1233 | * Bit 55 pte is soft-dirty (see Documentation/vm/soft-dirty.txt) |
1234 | * Bit 56 page exclusively mapped | ||
1235 | * Bits 57-60 zero | ||
1213 | * Bit 61 page is file-page or shared-anon | 1236 | * Bit 61 page is file-page or shared-anon |
1214 | * Bit 62 page swapped | 1237 | * Bit 62 page swapped |
1215 | * Bit 63 page present | 1238 | * Bit 63 page present |
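
[Editor's note] The bit layout documented above is what user space sees after this patch. A small decode helper, offered as an illustration only (it reuses the PM_* model constants from the first sketch; pagemap_decode() is not a kernel or libc function):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Print the fields of one 64-bit pagemap entry per the layout above. */
    static void pagemap_decode(uint64_t pme)
    {
            bool present = pme & PM_PRESENT;
            bool swapped = pme & PM_SWAP;

            if (present)
                    printf("pfn=%llu", (unsigned long long)(pme & PM_PFRAME_MASK));
            else if (swapped)
                    printf("swap type=%llu offset=%llu",
                           (unsigned long long)(pme & 0x1f),
                           (unsigned long long)((pme & PM_PFRAME_MASK) >> 5));
            else
                    printf("not present");

            printf("%s%s%s\n",
                   (pme & PM_SOFT_DIRTY)     ? " soft-dirty" : "",
                   (pme & PM_MMAP_EXCLUSIVE) ? " exclusive"  : "",
                   (pme & PM_FILE)           ? " file/shared-anon" : "");
    }
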
@@ -1227,42 +1250,37 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, | |||
1227 | static ssize_t pagemap_read(struct file *file, char __user *buf, | 1250 | static ssize_t pagemap_read(struct file *file, char __user *buf, |
1228 | size_t count, loff_t *ppos) | 1251 | size_t count, loff_t *ppos) |
1229 | { | 1252 | { |
1230 | struct task_struct *task = get_proc_task(file_inode(file)); | 1253 | struct mm_struct *mm = file->private_data; |
1231 | struct mm_struct *mm; | ||
1232 | struct pagemapread pm; | 1254 | struct pagemapread pm; |
1233 | int ret = -ESRCH; | ||
1234 | struct mm_walk pagemap_walk = {}; | 1255 | struct mm_walk pagemap_walk = {}; |
1235 | unsigned long src; | 1256 | unsigned long src; |
1236 | unsigned long svpfn; | 1257 | unsigned long svpfn; |
1237 | unsigned long start_vaddr; | 1258 | unsigned long start_vaddr; |
1238 | unsigned long end_vaddr; | 1259 | unsigned long end_vaddr; |
1239 | int copied = 0; | 1260 | int ret = 0, copied = 0; |
1240 | 1261 | ||
1241 | if (!task) | 1262 | if (!mm || !atomic_inc_not_zero(&mm->mm_users)) |
1242 | goto out; | 1263 | goto out; |
1243 | 1264 | ||
1244 | ret = -EINVAL; | 1265 | ret = -EINVAL; |
1245 | /* file position must be aligned */ | 1266 | /* file position must be aligned */ |
1246 | if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) | 1267 | if ((*ppos % PM_ENTRY_BYTES) || (count % PM_ENTRY_BYTES)) |
1247 | goto out_task; | 1268 | goto out_mm; |
1248 | 1269 | ||
1249 | ret = 0; | 1270 | ret = 0; |
1250 | if (!count) | 1271 | if (!count) |
1251 | goto out_task; | 1272 | goto out_mm; |
1273 | |||
1274 | /* do not disclose physical addresses: attack vector */ | ||
1275 | pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); | ||
1252 | 1276 | ||
1253 | pm.v2 = soft_dirty_cleared; | ||
1254 | pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); | 1277 | pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); |
1255 | pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); | 1278 | pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); |
1256 | ret = -ENOMEM; | 1279 | ret = -ENOMEM; |
1257 | if (!pm.buffer) | 1280 | if (!pm.buffer) |
1258 | goto out_task; | 1281 | goto out_mm; |
1259 | |||
1260 | mm = mm_access(task, PTRACE_MODE_READ); | ||
1261 | ret = PTR_ERR(mm); | ||
1262 | if (!mm || IS_ERR(mm)) | ||
1263 | goto out_free; | ||
1264 | 1282 | ||
1265 | pagemap_walk.pmd_entry = pagemap_pte_range; | 1283 | pagemap_walk.pmd_entry = pagemap_pmd_range; |
1266 | pagemap_walk.pte_hole = pagemap_pte_hole; | 1284 | pagemap_walk.pte_hole = pagemap_pte_hole; |
1267 | #ifdef CONFIG_HUGETLB_PAGE | 1285 | #ifdef CONFIG_HUGETLB_PAGE |
1268 | pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; | 1286 | pagemap_walk.hugetlb_entry = pagemap_hugetlb_range; |
@@ -1273,10 +1291,10 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
1273 | src = *ppos; | 1291 | src = *ppos; |
1274 | svpfn = src / PM_ENTRY_BYTES; | 1292 | svpfn = src / PM_ENTRY_BYTES; |
1275 | start_vaddr = svpfn << PAGE_SHIFT; | 1293 | start_vaddr = svpfn << PAGE_SHIFT; |
1276 | end_vaddr = TASK_SIZE_OF(task); | 1294 | end_vaddr = mm->task_size; |
1277 | 1295 | ||
1278 | /* watch out for wraparound */ | 1296 | /* watch out for wraparound */ |
1279 | if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) | 1297 | if (svpfn > mm->task_size >> PAGE_SHIFT) |
1280 | start_vaddr = end_vaddr; | 1298 | start_vaddr = end_vaddr; |
1281 | 1299 | ||
1282 | /* | 1300 | /* |
@@ -1303,7 +1321,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
1303 | len = min(count, PM_ENTRY_BYTES * pm.pos); | 1321 | len = min(count, PM_ENTRY_BYTES * pm.pos); |
1304 | if (copy_to_user(buf, pm.buffer, len)) { | 1322 | if (copy_to_user(buf, pm.buffer, len)) { |
1305 | ret = -EFAULT; | 1323 | ret = -EFAULT; |
1306 | goto out_mm; | 1324 | goto out_free; |
1307 | } | 1325 | } |
1308 | copied += len; | 1326 | copied += len; |
1309 | buf += len; | 1327 | buf += len; |
@@ -1313,24 +1331,31 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, | |||
1313 | if (!ret || ret == PM_END_OF_BUFFER) | 1331 | if (!ret || ret == PM_END_OF_BUFFER) |
1314 | ret = copied; | 1332 | ret = copied; |
1315 | 1333 | ||
1316 | out_mm: | ||
1317 | mmput(mm); | ||
1318 | out_free: | 1334 | out_free: |
1319 | kfree(pm.buffer); | 1335 | kfree(pm.buffer); |
1320 | out_task: | 1336 | out_mm: |
1321 | put_task_struct(task); | 1337 | mmput(mm); |
1322 | out: | 1338 | out: |
1323 | return ret; | 1339 | return ret; |
1324 | } | 1340 | } |
1325 | 1341 | ||
1326 | static int pagemap_open(struct inode *inode, struct file *file) | 1342 | static int pagemap_open(struct inode *inode, struct file *file) |
1327 | { | 1343 | { |
1328 | /* do not disclose physical addresses: attack vector */ | 1344 | struct mm_struct *mm; |
1329 | if (!capable(CAP_SYS_ADMIN)) | 1345 | |
1330 | return -EPERM; | 1346 | mm = proc_mem_open(inode, PTRACE_MODE_READ); |
1331 | pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about " | 1347 | if (IS_ERR(mm)) |
1332 | "to stop being page-shift some time soon. See the " | 1348 | return PTR_ERR(mm); |
1333 | "linux/Documentation/vm/pagemap.txt for details.\n"); | 1349 | file->private_data = mm; |
1350 | return 0; | ||
1351 | } | ||
1352 | |||
1353 | static int pagemap_release(struct inode *inode, struct file *file) | ||
1354 | { | ||
1355 | struct mm_struct *mm = file->private_data; | ||
1356 | |||
1357 | if (mm) | ||
1358 | mmdrop(mm); | ||
1334 | return 0; | 1359 | return 0; |
1335 | } | 1360 | } |
1336 | 1361 | ||
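
[Editor's note] The open/read/release split above follows the usual two-counter mm lifetime pattern. The schematic below is an interpretation of these hunks; that proc_mem_open() returns the mm with an mm_count reference held is an assumption, since its body is not part of this diff.

    /*
     *   open()    -> mm = proc_mem_open(inode, PTRACE_MODE_READ)
     *                pins struct mm_struct itself (assumed mm_count reference)
     *   read()    -> atomic_inc_not_zero(&mm->mm_users) ... mmput(mm)
     *                pins the address space only for the duration of the walk,
     *                and simply returns 0-progress if the task has already exited
     *   release() -> mmdrop(mm)
     *                drops the reference taken at open()
     */
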
@@ -1338,6 +1363,7 @@ const struct file_operations proc_pagemap_operations = { | |||
1338 | .llseek = mem_lseek, /* borrow this */ | 1363 | .llseek = mem_lseek, /* borrow this */ |
1339 | .read = pagemap_read, | 1364 | .read = pagemap_read, |
1340 | .open = pagemap_open, | 1365 | .open = pagemap_open, |
1366 | .release = pagemap_release, | ||
1341 | }; | 1367 | }; |
1342 | #endif /* CONFIG_PROC_PAGE_MONITOR */ | 1368 | #endif /* CONFIG_PROC_PAGE_MONITOR */ |
1343 | 1369 | ||
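
[Editor's note] Putting the pieces together, a hedged usage sketch of the reworked interface: pagemap can now be opened without CAP_SYS_ADMIN, but the PFN field reads back as zero unless the opener is capable (the file_ns_capable() show_pfn check added in pagemap_read()), and reads must stay 8-byte aligned. The query_vaddr() helper is purely illustrative and reuses pagemap_decode() from the earlier sketch.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Read and decode the pagemap entry covering one virtual address. */
    static int query_vaddr(pid_t pid, uintptr_t vaddr)
    {
            char path[64];
            uint64_t pme;
            long psize = sysconf(_SC_PAGESIZE);
            off_t off = (off_t)(vaddr / psize) * sizeof(pme);   /* 8-byte aligned */
            int fd;

            snprintf(path, sizeof(path), "/proc/%d/pagemap", (int)pid);
            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;
            if (pread(fd, &pme, sizeof(pme), off) != sizeof(pme)) {
                    close(fd);
                    return -1;
            }
            close(fd);
            pagemap_decode(pme);    /* helper from the earlier sketch */
            return 0;
    }
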