Diffstat (limited to 'mm/mempolicy.c')

 -rw-r--r--  mm/mempolicy.c | 182
 1 file changed, 144 insertions(+), 38 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b75..2a8206009422 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* First put in the highest zones from all nodes, then all the next
+	   lower zones etc. Avoid empty zones because the memory allocator
+	   doesn't like them. If you implement node hot removal you
+	   have to fix that. */
+	for (k = policy_zone; k >= 0; k--) {
+		for_each_node_mask(nd, *nodes) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
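
The nesting above builds the BIND zonelist zone type first, node second: every allowed node's highest usable zone precedes any node's next-lower zone, and empty zones are skipped entirely. A minimal userspace sketch of the resulting order, with made-up zone names and node set (illustration only, not kernel code):

#include <stdio.h>

/* Hypothetical stand-ins: three zone types, highest last. */
static const char *zone_name[] = { "DMA", "DMA32", "NORMAL" };

int main(void)
{
	int policy_zone = 2;		/* index of the highest zone type */
	int nodes[] = { 0, 2 };		/* stand-in for the allowed nodemask */
	int num = 0;

	/* Same nesting as bind_zonelist(): zone type outer, node inner. */
	for (int k = policy_zone; k >= 0; k--)
		for (int i = 0; i < 2; i++)
			printf("zl->zones[%d] = node %d, zone %s\n",
			       num++, nodes[i], zone_name[k]);
	return 0;
}

So an allocation falls back to DMA32 only once NORMAL is exhausted on both node 0 and node 2, rather than being confined to each node's single policy_zone entry as before.
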
@@ -187,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
@@ -229,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			continue;
 
 		if (flags & MPOL_MF_STATS)
-			gather_stats(page, private);
+			gather_stats(page, private, pte_dirty(*pte));
 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(page, private, flags);
 		else
@@ -542,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (isolate_lru_page(page))
-			list_add(&page->lru, pagelist);
+			list_add_tail(&page->lru, pagelist);
 	}
 }
 
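
list_add() inserts at the head and therefore reverses scan order; list_add_tail() preserves it. Keeping the isolated pages in scan order matters because the destination list built in migrate_pages_to() below is assembled the same way, so the two lists stay in step. A toy sketch of the difference, using arrays in place of the kernel's struct list_head (illustration only):

#include <stdio.h>

int main(void)
{
	int scanned[] = { 1, 2, 3, 4 };	/* pages in scan order */
	int head[4], tail[4];

	for (int i = 0; i < 4; i++) {
		head[3 - i] = scanned[i];	/* list_add: newest first */
		tail[i] = scanned[i];		/* list_add_tail: order kept */
	}

	printf("list_add:      ");
	for (int i = 0; i < 4; i++)
		printf("%d ", head[i]);
	printf("\nlist_add_tail: ");
	for (int i = 0; i < 4; i++)
		printf("%d ", tail[i]);
	printf("\n");
	return 0;
}
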
@@ -559,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
 	int err = 0;
+	unsigned long offset = 0;
 	int nr_pages;
 	struct page *page;
 	struct list_head *p;
@@ -566,8 +577,21 @@ static int migrate_pages_to(struct list_head *pagelist,
 redo:
 	nr_pages = 0;
 	list_for_each(p, pagelist) {
-		if (vma)
-			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
 		else
 			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
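
Under an interleave policy, alloc_page_vma() picks the node from the page offset of the address it is handed. A migrated page has no destination address yet, so the hunk above synthesizes one from vm_start plus a growing offset, walking the allocations round-robin across the interleave set. A userspace sketch of the effect; interleave_nid() here is a simplified stand-in for the kernel's real lookup:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

/* Node chosen from the page index of the allocation address. */
static int interleave_nid(unsigned long addr, int nr_nodes)
{
	return (addr >> PAGE_SHIFT) % nr_nodes;
}

int main(void)
{
	unsigned long vm_start = 0x400000, offset = 0;

	for (int i = 0; i < 8; i++, offset += PAGE_SIZE)
		printf("new page %d -> node %d\n", i,
		       interleave_nid(vm_start + offset, 4));
	return 0;
}

Passing vma->vm_start alone, as the old code did, would land every replacement page on the same interleave node.
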
@@ -575,9 +599,9 @@ redo:
 			err = -ENOMEM;
 			goto out;
 		}
-		list_add(&page->lru, &newlist);
+		list_add_tail(&page->lru, &newlist);
 		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE);
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
 	}
 	err = migrate_pages(pagelist, &newlist, &moved, &failed);
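
Besides the list_add_tail() change, this hunk drops a stray semicolon: with it, the if body was the empty statement, the break below ran unconditionally, and each pass migrated one page rather than up to MIGRATE_CHUNK_SIZE. The bug pattern in isolation:

#include <stdio.h>

#define MIGRATE_CHUNK_SIZE 16	/* assumed value, for illustration */

int main(void)
{
	int nr_pages = 0;

	while (1) {
		nr_pages++;
		if (nr_pages > MIGRATE_CHUNK_SIZE);	/* empty if body */
			break;				/* always taken */
	}
	printf("chunked %d page(s), wanted up to %d\n",
	       nr_pages, MIGRATE_CHUNK_SIZE + 1);
	return 0;
}
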
@@ -724,7 +748,7 @@ long do_mbind(unsigned long start, unsigned long len,
 		      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 	    || mode > MPOL_MAX)
 		return -EINVAL;
-	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	if (start & ~PAGE_MASK)
@@ -798,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
+	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+		return -EINVAL;
 
 	nlongs = BITS_TO_LONGS(maxnode);
 	if ((maxnode % BITS_PER_LONG) == 0)
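
get_nodes() goes on to copy BITS_TO_LONGS(maxnode) longs from user space, so an unchecked maxnode would let a caller request an arbitrarily large copy. The new test bounds the mask at one page worth of bits. A sketch of the arithmetic, assuming a 4 KiB page:

#include <stdio.h>

#define PAGE_SIZE	4096UL		/* assumed page size */
#define BITS_PER_BYTE	8
#define BITS_PER_LONG	(8 * (unsigned long)sizeof(long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long maxnode = PAGE_SIZE * BITS_PER_BYTE;	/* new cap */

	/* 32768 bits -> 512 longs on a 64-bit box: a bounded copy. */
	printf("cap: %lu bits, %lu longs to copy\n",
	       maxnode, BITS_TO_LONGS(maxnode));
	return 0;
}
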
@@ -916,19 +942,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
 	    (current->uid != task->suid) && (current->uid != task->uid) &&
-	    !capable(CAP_SYS_ADMIN)) {
+	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	task_nodes = cpuset_mems_allowed(task);
 	/* Is the user allowed to access the target nodes? */
-	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
-	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+	err = do_migrate_pages(mm, &old, &new,
+		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
 	mmput(mm);
 	return err;
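
Two policy changes here: CAP_SYS_NICE replaces CAP_SYS_ADMIN as the override, matching the do_mbind() hunk earlier in this diff, and a CAP_SYS_NICE holder is granted MPOL_MF_MOVE_ALL, allowing pages shared with other processes to be moved as well. The rule, condensed into a userspace sketch (cred fields simplified; the flag values follow the mempolicy ABI):

#include <stdbool.h>
#include <stdio.h>

#define MPOL_MF_MOVE	 (1 << 1)	/* move pages mapped only by us */
#define MPOL_MF_MOVE_ALL (1 << 2)	/* also move shared pages */

struct cred { unsigned uid, euid, suid; };

/* Caller must share a uid with the target task, or hold CAP_SYS_NICE. */
static bool may_migrate(const struct cred *cur, const struct cred *task,
			bool cap_sys_nice)
{
	return cur->euid == task->suid || cur->euid == task->uid ||
	       cur->uid  == task->suid || cur->uid  == task->uid ||
	       cap_sys_nice;
}

int main(void)
{
	struct cred cur  = { 1000, 1000, 1000 };
	struct cred task = { 1000, 1000, 1000 };	/* same user */
	bool cap_sys_nice = false;

	if (!may_migrate(&cur, &task, cap_sys_nice)) {
		puts("EPERM");
		return 1;
	}
	printf("flags: %#x\n", cap_sys_nice ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
	return 0;
}
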
@@ -1726,66 +1753,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 struct numa_maps {
 	unsigned long pages;
 	unsigned long anon;
-	unsigned long mapped;
+	unsigned long active;
+	unsigned long writeback;
 	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
 	unsigned long node[MAX_NUMNODES];
 };
 
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
 {
 	struct numa_maps *md = private;
 	int count = page_mapcount(page);
 
-	if (count)
-		md->mapped++;
+	md->pages++;
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
 
-	if (count > md->mapcount_max)
-		md->mapcount_max = count;
+	if (PageSwapCache(page))
+		md->swapcache++;
 
-	md->pages++;
+	if (PageActive(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
 
 	if (PageAnon(page))
 		md->anon++;
 
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
 	md->node[page_to_nid(page)]++;
 	cond_resched();
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+	unsigned long addr;
+	struct page *page;
+
+	for (addr = start; addr < end; addr += HPAGE_SIZE) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+		pte_t pte;
+
+		if (!ptep)
+			continue;
+
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		page = pte_page(pte);
+		if (!page)
+			continue;
+
+		gather_stats(page, md, pte_dirty(*ptep));
+	}
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+}
+#endif
+
 int show_numa_map(struct seq_file *m, void *v)
 {
 	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
 	int n;
 	char buffer[50];
 
-	if (!vma->vm_mm)
+	if (!mm)
 		return 0;
 
 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
 	if (!md)
 		return 0;
 
-	check_pgd_range(vma, vma->vm_start, vma->vm_end,
-		    &node_online_map, MPOL_MF_STATS, md);
+	mpol_to_str(buffer, sizeof(buffer),
+			get_vma_policy(task, vma, vma->vm_start));
 
-	if (md->pages) {
-		mpol_to_str(buffer, sizeof(buffer),
-			    get_vma_policy(task, vma, vma->vm_start));
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
-		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-			vma->vm_start, buffer, md->pages,
-			md->mapped, md->mapcount_max);
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
 
-		if (md->anon)
-			seq_printf(m," anon=%lu",md->anon);
+	if (is_vm_hugetlb_page(vma)) {
+		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+		seq_printf(m, " huge");
+	} else {
+		check_pgd_range(vma, vma->vm_start, vma->vm_end,
+			&node_online_map, MPOL_MF_STATS, md);
+	}
 
-		for_each_online_node(n)
-			if (md->node[n])
-				seq_printf(m, " N%d=%lu", n, md->node[n]);
+	if (!md->pages)
+		goto out;
 
-		seq_putc(m, '\n');
-	}
+	if (md->anon)
+		seq_printf(m," anon=%lu",md->anon);
+
+	if (md->dirty)
+		seq_printf(m," dirty=%lu",md->dirty);
+
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
+
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+	if (md->swapcache)
+		seq_printf(m," swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m," active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m," writeback=%lu", md->writeback);
+
+	for_each_online_node(n)
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+	seq_putc(m, '\n');
 	kfree(md);
 
 	if (m->count < m->size)