Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c  182
1 file changed, 144 insertions(+), 38 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b75..2a8206009422 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* First put in the highest zones from all nodes, then all the next
+	   lower zones etc. Avoid empty zones because the memory allocator
+	   doesn't like them. If you implement node hot removal you
+	   have to fix that. */
+	for (k = policy_zone; k >= 0; k--) {
+		for_each_node_mask(nd, *nodes) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
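
The rewritten bind_zonelist() above orders the custom zonelist highest zone first across every bound node and skips zones with no present pages. A stand-alone sketch of that ordering follows; the fake_zone/fake_node types are invented for illustration and are not the kernel's struct zone / pg_data_t.

/*
 * Sketch: for each zone index from policy_zone down to 0, append that zone
 * from every bound node, skipping zones with no present pages.
 */
#include <stdio.h>

#define NR_FAKE_ZONES 3		/* e.g. DMA, NORMAL, HIGHMEM */
#define NR_FAKE_NODES 2

struct fake_zone { long present_pages; };
struct fake_node { struct fake_zone zones[NR_FAKE_ZONES]; };

static int build_zonelist(struct fake_node *nodes, int nr_nodes,
			  int policy_zone, struct fake_zone **out)
{
	int num = 0;

	for (int k = policy_zone; k >= 0; k--)		/* highest zones first */
		for (int nd = 0; nd < nr_nodes; nd++) {	/* then across nodes   */
			struct fake_zone *z = &nodes[nd].zones[k];
			if (z->present_pages > 0)	/* skip empty zones    */
				out[num++] = z;
		}
	out[num] = NULL;
	return num;
}

int main(void)
{
	/* node 0 has all three zones populated, node 1 has no highest zone */
	struct fake_node nodes[NR_FAKE_NODES] = {
		{ .zones = { {100}, {100}, {100} } },
		{ .zones = { {100}, {100}, {0} } },
	};
	struct fake_zone *zl[NR_FAKE_ZONES * NR_FAKE_NODES + 1];
	int num = build_zonelist(nodes, NR_FAKE_NODES, NR_FAKE_ZONES - 1, zl);

	printf("zonelist has %d entries\n", num);	/* prints 5, not 6 */
	return 0;
}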
@@ -187,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
@@ -229,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			continue;
 
 		if (flags & MPOL_MF_STATS)
-			gather_stats(page, private);
+			gather_stats(page, private, pte_dirty(*pte));
 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(page, private, flags);
 		else
@@ -542,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (isolate_lru_page(page))
-			list_add(&page->lru, pagelist);
+			list_add_tail(&page->lru, pagelist);
 	}
 }
 
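
This hunk, together with the matching list_add_tail() change in migrate_pages_to() below, keeps pages in the order in which they were isolated rather than reversing them. A toy illustration of the two insertion styles; this is a minimal re-implementation on a circular doubly linked list, not the kernel's <linux/list.h>.

#include <stdio.h>

struct node { int id; struct node *prev, *next; };

static void init_head(struct node *h) { h->prev = h->next = h; }

static void add_head(struct node *n, struct node *h)	/* like list_add() */
{
	n->next = h->next; n->prev = h;
	h->next->prev = n; h->next = n;
}

static void add_tail(struct node *n, struct node *h)	/* like list_add_tail() */
{
	n->prev = h->prev; n->next = h;
	h->prev->next = n; h->prev = n;
}

static void print_list(const char *what, struct node *h)
{
	printf("%s:", what);
	for (struct node *p = h->next; p != h; p = p->next)
		printf(" %d", p->id);
	printf("\n");
}

int main(void)
{
	struct node head1, head2;
	struct node a[3] = { {0}, {1}, {2} }, b[3] = { {0}, {1}, {2} };

	init_head(&head1);
	init_head(&head2);
	for (int i = 0; i < 3; i++) {
		add_head(&a[i], &head1);	/* traversal sees 2 1 0 (reversed)        */
		add_tail(&b[i], &head2);	/* traversal sees 0 1 2 (insertion order) */
	}
	print_list("list_add", &head1);
	print_list("list_add_tail", &head2);
	return 0;
}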
@@ -559,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
 	int err = 0;
+	unsigned long offset = 0;
 	int nr_pages;
 	struct page *page;
 	struct list_head *p;
@@ -566,8 +577,21 @@ static int migrate_pages_to(struct list_head *pagelist,
 redo:
 	nr_pages = 0;
 	list_for_each(p, pagelist) {
-		if (vma)
-			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
 		else
 			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
@@ -575,9 +599,9 @@ redo:
 			err = -ENOMEM;
 			goto out;
 		}
-		list_add(&page->lru, &newlist);
+		list_add_tail(&page->lru, &newlist);
 		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE);
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
 	}
 	err = migrate_pages(pagelist, &newlist, &moved, &failed);
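
The comment added above explains that the address handed to alloc_page_vma() only steers interleave placement, so a monotonically increasing fake offset is enough to spread the new pages across the interleave nodes. A simplified model of that effect; node_of() below is a made-up stand-in for the kernel's interleave computation, not a real kernel function.

#include <stdio.h>

#define FAKE_PAGE_SIZE 4096UL

/* simplified model: successive page-sized offsets map to successive nodes */
static int node_of(unsigned long addr, int nr_nodes)
{
	return (int)((addr / FAKE_PAGE_SIZE) % (unsigned long)nr_nodes);
}

int main(void)
{
	unsigned long offset = 0;

	for (int i = 0; i < 8; i++) {
		/* mirrors: alloc_page_vma(..., offset + vma->vm_start) */
		printf("page %d -> node %d\n", i, node_of(offset, 4));
		offset += FAKE_PAGE_SIZE;
	}
	return 0;
}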
@@ -724,7 +748,7 @@ long do_mbind(unsigned long start, unsigned long len,
 		      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 	    || mode > MPOL_MAX)
 		return -EINVAL;
-	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	if (start & ~PAGE_MASK)
@@ -798,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
+	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+		return -EINVAL;
 
 	nlongs = BITS_TO_LONGS(maxnode);
 	if ((maxnode % BITS_PER_LONG) == 0)
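
The new bound rejects nodemasks larger than one page worth of bits before any copy_from_user work is done. The arithmetic, with illustrative values (4 KiB pages and 64-bit longs; the real numbers depend on the architecture):

#include <stdio.h>

#define PG_SIZE       4096UL
#define BITS_PER_BYTE 8UL
#define BITS_PER_LONG 64UL
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
	unsigned long maxnode = PG_SIZE * BITS_PER_BYTE;	/* 32768 node bits */
	unsigned long nlongs  = BITS_TO_LONGS(maxnode);		/* 512 longs       */

	printf("cap: %lu node bits = %lu longs = %lu bytes copied at most\n",
	       maxnode, nlongs, nlongs * (BITS_PER_LONG / BITS_PER_BYTE));
	return 0;
}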
@@ -916,19 +942,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
 	    (current->uid != task->suid) && (current->uid != task->uid) &&
-	    !capable(CAP_SYS_ADMIN)) {
+	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	task_nodes = cpuset_mems_allowed(task);
 	/* Is the user allowed to access the target nodes? */
-	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
-	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+	err = do_migrate_pages(mm, &old, &new,
+		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
 	mmput(mm);
 	return err;
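
With this hunk, sys_migrate_pages() lets a caller migrate another process's pages if any of its uid/euid matches the target's uid/suid, or if it holds CAP_SYS_NICE; CAP_SYS_NICE additionally upgrades the request from MPOL_MF_MOVE to MPOL_MF_MOVE_ALL. A stand-alone model of the permission rule; the struct fields and the has_cap_sys_nice flag are stand-ins, not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

struct creds  { unsigned uid, euid; };
struct target { unsigned uid, suid; };

static bool may_migrate(struct creds c, struct target t, bool has_cap_sys_nice)
{
	/* same rule as the patched check: any uid/euid vs. uid/suid match,
	 * or CAP_SYS_NICE, is enough */
	return c.euid == t.suid || c.euid == t.uid ||
	       c.uid  == t.suid || c.uid  == t.uid ||
	       has_cap_sys_nice;
}

int main(void)
{
	struct creds  self  = { .uid = 1000, .euid = 1000 };
	struct target other = { .uid = 1001, .suid = 1001 };

	printf("same user: %d\n", may_migrate(self, (struct target){1000, 1000}, false)); /* 1 */
	printf("other user, no cap: %d\n", may_migrate(self, other, false));              /* 0 */
	printf("other user, CAP_SYS_NICE: %d\n", may_migrate(self, other, true));         /* 1 */
	return 0;
}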
@@ -1726,66 +1753,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 struct numa_maps {
 	unsigned long pages;
 	unsigned long anon;
-	unsigned long mapped;
+	unsigned long active;
+	unsigned long writeback;
 	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
 	unsigned long node[MAX_NUMNODES];
 };
 
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
 {
 	struct numa_maps *md = private;
 	int count = page_mapcount(page);
 
-	if (count)
-		md->mapped++;
+	md->pages++;
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
 
-	if (count > md->mapcount_max)
-		md->mapcount_max = count;
+	if (PageSwapCache(page))
+		md->swapcache++;
 
-	md->pages++;
+	if (PageActive(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
 
 	if (PageAnon(page))
 		md->anon++;
 
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
 	md->node[page_to_nid(page)]++;
 	cond_resched();
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+	unsigned long addr;
+	struct page *page;
+
+	for (addr = start; addr < end; addr += HPAGE_SIZE) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+		pte_t pte;
+
+		if (!ptep)
+			continue;
+
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		page = pte_page(pte);
+		if (!page)
+			continue;
+
+		gather_stats(page, md, pte_dirty(*ptep));
+	}
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+}
+#endif
+
 int show_numa_map(struct seq_file *m, void *v)
 {
 	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
 	int n;
 	char buffer[50];
 
-	if (!vma->vm_mm)
+	if (!mm)
 		return 0;
 
 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
 	if (!md)
 		return 0;
 
-	check_pgd_range(vma, vma->vm_start, vma->vm_end,
-		    &node_online_map, MPOL_MF_STATS, md);
+	mpol_to_str(buffer, sizeof(buffer),
+			get_vma_policy(task, vma, vma->vm_start));
 
-	if (md->pages) {
-		mpol_to_str(buffer, sizeof(buffer),
-			    get_vma_policy(task, vma, vma->vm_start));
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
-		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-			vma->vm_start, buffer, md->pages,
-			md->mapped, md->mapcount_max);
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
 
-		if (md->anon)
-			seq_printf(m," anon=%lu",md->anon);
+	if (is_vm_hugetlb_page(vma)) {
+		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+		seq_printf(m, " huge");
+	} else {
+		check_pgd_range(vma, vma->vm_start, vma->vm_end,
+			&node_online_map, MPOL_MF_STATS, md);
+	}
 
-		for_each_online_node(n)
-			if (md->node[n])
-				seq_printf(m, " N%d=%lu", n, md->node[n]);
+	if (!md->pages)
+		goto out;
 
-		seq_putc(m, '\n');
-	}
+	if (md->anon)
+		seq_printf(m," anon=%lu",md->anon);
+
+	if (md->dirty)
+		seq_printf(m," dirty=%lu",md->dirty);
+
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
+
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+	if (md->swapcache)
+		seq_printf(m," swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m," active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m," writeback=%lu", md->writeback);
+
+	for_each_online_node(n)
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+	seq_putc(m, '\n');
 	kfree(md);
 
 	if (m->count < m->size)
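
For orientation, the seq_printf() calls above emit one record per VMA in /proc/<pid>/numa_maps. A made-up example line, purely to show the field layout produced by the new code (address, path and counts are invented, and fields that are zero are omitted):

2aaaaade3000 default file=/lib/libc-2.3.6.so mapped=442 mapmax=9 active=398 N0=221 N1=221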