author     Matt Mackall <mpm@selenic.com>  2008-02-05 01:29:04 -0500
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>  2008-02-05 12:44:16 -0500
commit     85863e475e59afb027b0113290e3796ee6020b7d
tree       047cc687b98c0261bd3c083f17c090fbf082355f
parent     a6198797cc3fd659b2d81cdf6bb6b9bba9cd93e9
maps4: add /proc/pid/pagemap interface
This interface provides a mapping for each page in an address space to
its physical page frame number, allowing precise determination of what
pages are mapped and what pages are shared between processes.

New in this version:

- headers gone again (as recommended by Dave Hansen and Alan Cox)
- 64-bit entries (as per discussion with Andi Kleen)
- swap pte information exported (from Dave Hansen)
- page walker callback for holes (from Dave Hansen)
- direct put_user I/O (as suggested by Rusty Russell)

This patch folds in cleanups and swap PTE support from Dave Hansen
<haveblue@us.ibm.com>.

Signed-off-by: Matt Mackall <mpm@selenic.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  fs/proc/base.c     |   4
-rw-r--r--  fs/proc/internal.h |   2
-rw-r--r--  fs/proc/task_mmu.c | 200
3 files changed, 204 insertions(+), 2 deletions(-)
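For a quick sense of what the new file exposes, here is a minimal userspace sketch (not part of the patch) that looks up the entry for a single address in /proc/self/pagemap. It assumes the encoding used in this version of the patch: a present page's entry is the raw PFN, while any entry with the top PM_RESERVED_BITS (three bits) set marks a page that is not present or is in swap. The chosen address (a local variable on the stack) is just an example.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define PM_ENTRY_BYTES   8			/* one u64 entry per virtual page */
#define PM_RESERVED_MASK (7ULL << 61)		/* top bits flag special entries */

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	unsigned long vaddr = (unsigned long)&pagesize;	/* any mapped address */
	uint64_t entry;
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0)
		return 1;

	/* Seek to the 8-byte entry for this page and read it. */
	if (pread(fd, &entry, sizeof(entry),
		  (off_t)(vaddr / pagesize) * PM_ENTRY_BYTES) != sizeof(entry)) {
		close(fd);
		return 1;
	}

	if (entry & PM_RESERVED_MASK)
		printf("%#lx: not present (or in swap)\n", vaddr);
	else
		printf("%#lx: pfn %llu\n", vaddr, (unsigned long long)entry);

	close(fd);
	return 0;
}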
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1bd646d3fe9a..9004db04efa0 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -783,7 +783,7 @@ out_no_task:
 }
 #endif
 
-static loff_t mem_lseek(struct file * file, loff_t offset, int orig)
+loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 {
 	switch (orig) {
 	case 0:
@@ -2252,6 +2252,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_MMU
 	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps", S_IRUGO, smaps),
+	REG("pagemap", S_IRUSR, pagemap),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
@@ -2580,6 +2581,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_MMU
 	REG("clear_refs", S_IWUSR, clear_refs),
 	REG("smaps", S_IRUGO, smaps),
+	REG("pagemap", S_IRUSR, pagemap),
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr", S_IRUGO|S_IXUGO, attr_dir),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index ddfaeec37492..7d57e8069924 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -52,11 +52,13 @@ extern int proc_tid_stat(struct task_struct *, char *);
 extern int proc_tgid_stat(struct task_struct *, char *);
 extern int proc_pid_status(struct task_struct *, char *);
 extern int proc_pid_statm(struct task_struct *, char *);
+extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
 
 extern const struct file_operations proc_maps_operations;
 extern const struct file_operations proc_numa_maps_operations;
 extern const struct file_operations proc_smaps_operations;
 extern const struct file_operations proc_clear_refs_operations;
+extern const struct file_operations proc_pagemap_operations;
 
 void free_proc_entry(struct proc_dir_entry *de);
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 308fc5451e43..bbd9b145051d 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -5,7 +5,10 @@
 #include <linux/highmem.h>
 #include <linux/ptrace.h>
 #include <linux/pagemap.h>
+#include <linux/ptrace.h>
 #include <linux/mempolicy.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -519,6 +522,202 @@ const struct file_operations proc_clear_refs_operations = {
 	.write = clear_refs_write,
 };
 
+struct pagemapread {
+	char __user *out, *end;
+};
+
+#define PM_ENTRY_BYTES sizeof(u64)
+#define PM_RESERVED_BITS 3
+#define PM_RESERVED_OFFSET (64 - PM_RESERVED_BITS)
+#define PM_RESERVED_MASK (((1LL<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET)
+#define PM_SPECIAL(nr) (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK)
+#define PM_NOT_PRESENT PM_SPECIAL(1LL)
+#define PM_SWAP PM_SPECIAL(2LL)
+#define PM_END_OF_BUFFER 1
+
+static int add_to_pagemap(unsigned long addr, u64 pfn,
+			  struct pagemapread *pm)
+{
+	/*
+	 * Make sure there's room in the buffer for an
+	 * entire entry. Otherwise, only copy part of
+	 * the pfn.
+	 */
+	if (pm->out + PM_ENTRY_BYTES >= pm->end) {
+		if (copy_to_user(pm->out, &pfn, pm->end - pm->out))
+			return -EFAULT;
+		pm->out = pm->end;
+		return PM_END_OF_BUFFER;
+	}
+
+	if (put_user(pfn, pm->out))
+		return -EFAULT;
+	pm->out += PM_ENTRY_BYTES;
+	return 0;
+}
+
+static int pagemap_pte_hole(unsigned long start, unsigned long end,
+			    void *private)
+{
+	struct pagemapread *pm = private;
+	unsigned long addr;
+	int err = 0;
+	for (addr = start; addr < end; addr += PAGE_SIZE) {
+		err = add_to_pagemap(addr, PM_NOT_PRESENT, pm);
+		if (err)
+			break;
+	}
+	return err;
+}
+
+u64 swap_pte_to_pagemap_entry(pte_t pte)
+{
+	swp_entry_t e = pte_to_swp_entry(pte);
+	return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT);
+}
+
+static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+			     void *private)
+{
+	struct pagemapread *pm = private;
+	pte_t *pte;
+	int err = 0;
+
+	for (; addr != end; addr += PAGE_SIZE) {
+		u64 pfn = PM_NOT_PRESENT;
+		pte = pte_offset_map(pmd, addr);
+		if (is_swap_pte(*pte))
+			pfn = swap_pte_to_pagemap_entry(*pte);
+		else if (pte_present(*pte))
+			pfn = pte_pfn(*pte);
+		/* unmap so we're not in atomic when we copy to userspace */
+		pte_unmap(pte);
+		err = add_to_pagemap(addr, pfn, pm);
+		if (err)
+			return err;
+	}
+
+	cond_resched();
+
+	return err;
+}
+
+static struct mm_walk pagemap_walk = {
+	.pmd_entry = pagemap_pte_range,
+	.pte_hole = pagemap_pte_hole
+};
+
+/*
+ * /proc/pid/pagemap - an array mapping virtual pages to pfns
+ *
+ * For each page in the address space, this file contains one 64-bit
+ * entry representing the corresponding physical page frame number
+ * (PFN) if the page is present. If there is a swap entry for the
+ * physical page, then an encoding of the swap file number and the
+ * page's offset into the swap file are returned. If no page is
+ * present at all, PM_NOT_PRESENT is returned. This allows determining
+ * precisely which pages are mapped (or in swap) and comparing mapped
+ * pages between processes.
+ *
+ * Efficient users of this interface will use /proc/pid/maps to
+ * determine which areas of memory are actually mapped and llseek to
+ * skip over unmapped regions.
+ */
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	struct page **pages, *page;
+	unsigned long uaddr, uend;
+	struct mm_struct *mm;
+	struct pagemapread pm;
+	int pagecount;
+	int ret = -ESRCH;
+
+	if (!task)
+		goto out;
+
+	ret = -EACCES;
+	if (!ptrace_may_attach(task))
+		goto out;
+
+	ret = -EINVAL;
+	/* file position must be aligned */
+	if (*ppos % PM_ENTRY_BYTES)
+		goto out;
+
+	ret = 0;
+	mm = get_task_mm(task);
+	if (!mm)
+		goto out;
+
+	ret = -ENOMEM;
+	uaddr = (unsigned long)buf & PAGE_MASK;
+	uend = (unsigned long)(buf + count);
+	pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE;
+	pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL);
+	if (!pages)
+		goto out_task;
+
+	down_read(&current->mm->mmap_sem);
+	ret = get_user_pages(current, current->mm, uaddr, pagecount,
+			     1, 0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (ret < 0)
+		goto out_free;
+
+	pm.out = buf;
+	pm.end = buf + count;
+
+	if (!ptrace_may_attach(task)) {
+		ret = -EIO;
+	} else {
+		unsigned long src = *ppos;
+		unsigned long svpfn = src / PM_ENTRY_BYTES;
+		unsigned long start_vaddr = svpfn << PAGE_SHIFT;
+		unsigned long end_vaddr = TASK_SIZE_OF(task);
+
+		/* watch out for wraparound */
+		if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT)
+			start_vaddr = end_vaddr;
+
+		/*
+		 * The odds are that this will stop walking way
+		 * before end_vaddr, because the length of the
+		 * user buffer is tracked in "pm", and the walk
+		 * will stop when we hit the end of the buffer.
+		 */
+		ret = walk_page_range(mm, start_vaddr, end_vaddr,
+				      &pagemap_walk, &pm);
+		if (ret == PM_END_OF_BUFFER)
+			ret = 0;
+		/* don't need mmap_sem for these, but this looks cleaner */
+		*ppos += pm.out - buf;
+		if (!ret)
+			ret = pm.out - buf;
+	}
+
+	for (; pagecount; pagecount--) {
+		page = pages[pagecount-1];
+		if (!PageReserved(page))
+			SetPageDirty(page);
+		page_cache_release(page);
+	}
+	mmput(mm);
+out_free:
+	kfree(pages);
+out_task:
+	put_task_struct(task);
+out:
+	return ret;
+}
+
+const struct file_operations proc_pagemap_operations = {
+	.llseek = mem_lseek, /* borrow this */
+	.read = pagemap_read,
+};
+
 #ifdef CONFIG_NUMA
 extern int show_numa_map(struct seq_file *m, void *v);
 
@@ -552,4 +751,3 @@ const struct file_operations proc_numa_maps_operations = {
 	.release = seq_release_private,
 };
 #endif
-
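The comment block added to task_mmu.c above suggests the intended usage pattern: consult /proc/pid/maps for the mapped ranges and seek over the holes rather than reading a PM_NOT_PRESENT entry for every unmapped page. Below is a minimal userspace sketch of that pattern (not part of the patch). The 8-byte entry size and the top-three-reserved-bits test match this version of the encoding; error handling is kept to a minimum, and reading another process's pagemap requires the same ptrace permission the kernel checks here.

#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *pid = argc > 1 ? argv[1] : "self";
	long pagesize = sysconf(_SC_PAGESIZE);
	unsigned long start, end;
	char path[64];
	FILE *maps;
	int pm;

	snprintf(path, sizeof(path), "/proc/%s/maps", pid);
	maps = fopen(path, "r");
	snprintf(path, sizeof(path), "/proc/%s/pagemap", pid);
	pm = open(path, O_RDONLY);
	if (!maps || pm < 0)
		return 1;

	/* One /proc/pid/maps line per VMA: "start-end perms offset dev inode path" */
	while (fscanf(maps, "%lx-%lx %*[^\n]", &start, &end) == 2) {
		size_t n = (end - start) / pagesize;
		uint64_t *entries = malloc(n * sizeof(*entries));
		ssize_t got;
		size_t i;

		if (!entries)
			break;
		/* Seek over the hole before this VMA; one 8-byte entry per page. */
		got = pread(pm, entries, n * sizeof(*entries),
			    (off_t)(start / pagesize) * sizeof(*entries));
		for (i = 0; got > 0 && i < (size_t)got / sizeof(*entries); i++)
			if (!(entries[i] & (7ULL << 61)))	/* reserved bits clear: present */
				printf("%#lx -> pfn %" PRIu64 "\n",
				       start + i * pagesize, entries[i]);
		free(entries);
	}

	close(pm);
	fclose(maps);
	return 0;
}

One bulk pread() per VMA keeps each kernel-side walk bounded by the supplied buffer, which is the case the PM_END_OF_BUFFER handling in add_to_pagemap() is there to terminate cleanly.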