commit 1a02e59a2970f9ed28ab51d3b08624b79e54d848
tree 470cce472be3b08c160e0c569648e7228651b12a
parent ebcff3c773b42bce6182ec16485abca4e53fba97
parent 2c276603c3e5ebf38155a9d1fbbda656d52d138e
author Kumar Gala <galak@kernel.crashing.org> 2006-03-20 12:58:02 -0500
committer Kumar Gala <galak@kernel.crashing.org> 2006-03-20 12:58:02 -0500

    Merge branch 'master'
Diffstat (limited to 'mm')

 mm/hugetlb.c        |   4
 mm/madvise.c        |  21
 mm/memory.c         |  10
 mm/memory_hotplug.c |   1
 mm/mempolicy.c      | 182
 mm/nommu.c          |  10
 mm/oom_kill.c       | 124
 mm/page_alloc.c     |  63
 mm/rmap.c           |  21
 mm/shmem.c          |  81
 mm/slab.c           | 132
 mm/swap.c           |  27
 mm/vmscan.c         | 137

 13 files changed, 599 insertions(+), 214 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67f29516662a..508707704d2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page) | |||
85 | BUG_ON(page_count(page)); | 85 | BUG_ON(page_count(page)); |
86 | 86 | ||
87 | INIT_LIST_HEAD(&page->lru); | 87 | INIT_LIST_HEAD(&page->lru); |
88 | page[1].mapping = NULL; | 88 | page[1].lru.next = NULL; /* reset dtor */ |
89 | 89 | ||
90 | spin_lock(&hugetlb_lock); | 90 | spin_lock(&hugetlb_lock); |
91 | enqueue_huge_page(page); | 91 | enqueue_huge_page(page); |
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) | |||
105 | } | 105 | } |
106 | spin_unlock(&hugetlb_lock); | 106 | spin_unlock(&hugetlb_lock); |
107 | set_page_count(page, 1); | 107 | set_page_count(page, 1); |
108 | page[1].mapping = (void *)free_huge_page; | 108 | page[1].lru.next = (void *)free_huge_page; /* set dtor */ |
109 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) | 109 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) |
110 | clear_user_highpage(&page[i], addr); | 110 | clear_user_highpage(&page[i], addr); |
111 | return page; | 111 | return page; |
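
The two hunks above move the huge-page destructor out of page[1].mapping and into page[1].lru.next. A small user-space sketch of that convention, using a mock struct page rather than the real kernel layout (fake_free_huge_page and the helper names are invented for illustration):

```c
#include <stdio.h>

/* Mock types: just enough of "struct page" to show where the dtor lives. */
struct list_head { void *next; void *prev; };
struct page { struct list_head lru; };

typedef void (*page_dtor_t)(struct page *);

static void fake_free_huge_page(struct page *page)
{
	printf("destructor called for head page %p\n", (void *)page);
}

/* what alloc_huge_page() now does: stash the dtor in the first tail page */
static void set_compound_dtor(struct page *head, page_dtor_t dtor)
{
	head[1].lru.next = (void *)dtor;
}

/* what put_compound_page() in mm/swap.c does when the refcount drops */
static void call_compound_dtor(struct page *head)
{
	page_dtor_t dtor = (page_dtor_t)head[1].lru.next;

	dtor(head);
}

int main(void)
{
	struct page pages[2] = { { { 0, 0 } } };

	set_compound_dtor(pages, fake_free_huge_page);
	call_compound_dtor(pages);
	return 0;
}
```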
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299a..af3d573b0141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
22 | struct mm_struct * mm = vma->vm_mm; | 22 | struct mm_struct * mm = vma->vm_mm; |
23 | int error = 0; | 23 | int error = 0; |
24 | pgoff_t pgoff; | 24 | pgoff_t pgoff; |
25 | int new_flags = vma->vm_flags & ~VM_READHINTMASK; | 25 | int new_flags = vma->vm_flags; |
26 | 26 | ||
27 | switch (behavior) { | 27 | switch (behavior) { |
28 | case MADV_NORMAL: | ||
29 | new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; | ||
30 | break; | ||
28 | case MADV_SEQUENTIAL: | 31 | case MADV_SEQUENTIAL: |
29 | new_flags |= VM_SEQ_READ; | 32 | new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; |
30 | break; | 33 | break; |
31 | case MADV_RANDOM: | 34 | case MADV_RANDOM: |
32 | new_flags |= VM_RAND_READ; | 35 | new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; |
33 | break; | 36 | break; |
34 | default: | 37 | case MADV_DONTFORK: |
38 | new_flags |= VM_DONTCOPY; | ||
39 | break; | ||
40 | case MADV_DOFORK: | ||
41 | new_flags &= ~VM_DONTCOPY; | ||
35 | break; | 42 | break; |
36 | } | 43 | } |
37 | 44 | ||
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
177 | long error; | 184 | long error; |
178 | 185 | ||
179 | switch (behavior) { | 186 | switch (behavior) { |
187 | case MADV_DOFORK: | ||
188 | if (vma->vm_flags & VM_IO) { | ||
189 | error = -EINVAL; | ||
190 | break; | ||
191 | } | ||
192 | case MADV_DONTFORK: | ||
180 | case MADV_NORMAL: | 193 | case MADV_NORMAL: |
181 | case MADV_SEQUENTIAL: | 194 | case MADV_SEQUENTIAL: |
182 | case MADV_RANDOM: | 195 | case MADV_RANDOM: |
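
For reference, a user-space sketch of how the new hints are meant to be used. The MADV_DONTFORK/MADV_DOFORK fallback values below follow the asm-generic numbering of this era and may already be provided by newer headers:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_DONTFORK
#define MADV_DONTFORK 10
#endif
#ifndef MADV_DOFORK
#define MADV_DOFORK 11
#endif

int main(void)
{
	size_t len = 1 << 20;
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* children forked after this will not inherit the mapping */
	if (madvise(buf, len, MADV_DONTFORK))
		perror("madvise(MADV_DONTFORK)");

	/* ... fork() here would skip copying buf into the child ... */

	/* restore the default copy-on-fork behaviour */
	if (madvise(buf, len, MADV_DOFORK))
		perror("madvise(MADV_DOFORK)");

	munmap(buf, len);
	return 0;
}
```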
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8a..9abc6008544b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages); | |||
82 | EXPORT_SYMBOL(high_memory); | 82 | EXPORT_SYMBOL(high_memory); |
83 | EXPORT_SYMBOL(vmalloc_earlyreserve); | 83 | EXPORT_SYMBOL(vmalloc_earlyreserve); |
84 | 84 | ||
85 | int randomize_va_space __read_mostly = 1; | ||
86 | |||
87 | static int __init disable_randmaps(char *s) | ||
88 | { | ||
89 | randomize_va_space = 0; | ||
90 | return 0; | ||
91 | } | ||
92 | __setup("norandmaps", disable_randmaps); | ||
93 | |||
94 | |||
85 | /* | 95 | /* |
86 | * If a p?d_bad entry is found while walking page tables, report | 96 | * If a p?d_bad entry is found while walking page tables, report |
87 | * the error, before resetting entry to p?d_none. Usually (but | 97 | * the error, before resetting entry to p?d_none. Usually (but |
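
The new "norandmaps" parameter only clears randomize_va_space at boot; the same knob is also visible at run time. A small sketch, assuming the usual /proc/sys/kernel/randomize_va_space sysctl path:

```c
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/randomize_va_space", "r");
	int val;

	if (!f) {
		perror("randomize_va_space sysctl not available");
		return 1;
	}
	if (fscanf(f, "%d", &val) == 1)
		printf("address-space randomization is %s\n",
		       val ? "on" : "off");
	fclose(f);
	return 0;
}
```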
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f3..1fe76d963ac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
130 | onlined_pages++; | 130 | onlined_pages++; |
131 | } | 131 | } |
132 | zone->present_pages += onlined_pages; | 132 | zone->present_pages += onlined_pages; |
133 | zone->zone_pgdat->node_present_pages += onlined_pages; | ||
133 | 134 | ||
134 | setup_per_zone_pages_min(); | 135 | setup_per_zone_pages_min(); |
135 | 136 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b75..2a8206009422 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) | |||
132 | } | 132 | } |
133 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; | 133 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; |
134 | } | 134 | } |
135 | |||
135 | /* Generate a custom zonelist for the BIND policy. */ | 136 | /* Generate a custom zonelist for the BIND policy. */ |
136 | static struct zonelist *bind_zonelist(nodemask_t *nodes) | 137 | static struct zonelist *bind_zonelist(nodemask_t *nodes) |
137 | { | 138 | { |
138 | struct zonelist *zl; | 139 | struct zonelist *zl; |
139 | int num, max, nd; | 140 | int num, max, nd, k; |
140 | 141 | ||
141 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); | 142 | max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); |
142 | zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); | 143 | zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); |
143 | if (!zl) | 144 | if (!zl) |
144 | return NULL; | 145 | return NULL; |
145 | num = 0; | 146 | num = 0; |
146 | for_each_node_mask(nd, *nodes) | 147 | /* First put in the highest zones from all nodes, then all the next |
147 | zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; | 148 | lower zones etc. Avoid empty zones because the memory allocator |
149 | doesn't like them. If you implement node hot removal you | ||
150 | have to fix that. */ | ||
151 | for (k = policy_zone; k >= 0; k--) { | ||
152 | for_each_node_mask(nd, *nodes) { | ||
153 | struct zone *z = &NODE_DATA(nd)->node_zones[k]; | ||
154 | if (z->present_pages > 0) | ||
155 | zl->zones[num++] = z; | ||
156 | } | ||
157 | } | ||
148 | zl->zones[num] = NULL; | 158 | zl->zones[num] = NULL; |
149 | return zl; | 159 | return zl; |
150 | } | 160 | } |
@@ -187,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
187 | return policy; | 197 | return policy; |
188 | } | 198 | } |
189 | 199 | ||
190 | static void gather_stats(struct page *, void *); | 200 | static void gather_stats(struct page *, void *, int pte_dirty); |
191 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 201 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
192 | unsigned long flags); | 202 | unsigned long flags); |
193 | 203 | ||
@@ -229,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
229 | continue; | 239 | continue; |
230 | 240 | ||
231 | if (flags & MPOL_MF_STATS) | 241 | if (flags & MPOL_MF_STATS) |
232 | gather_stats(page, private); | 242 | gather_stats(page, private, pte_dirty(*pte)); |
233 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 243 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
234 | migrate_page_add(page, private, flags); | 244 | migrate_page_add(page, private, flags); |
235 | else | 245 | else |
@@ -542,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
542 | */ | 552 | */ |
543 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 553 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
544 | if (isolate_lru_page(page)) | 554 | if (isolate_lru_page(page)) |
545 | list_add(&page->lru, pagelist); | 555 | list_add_tail(&page->lru, pagelist); |
546 | } | 556 | } |
547 | } | 557 | } |
548 | 558 | ||
@@ -559,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist, | |||
559 | LIST_HEAD(moved); | 569 | LIST_HEAD(moved); |
560 | LIST_HEAD(failed); | 570 | LIST_HEAD(failed); |
561 | int err = 0; | 571 | int err = 0; |
572 | unsigned long offset = 0; | ||
562 | int nr_pages; | 573 | int nr_pages; |
563 | struct page *page; | 574 | struct page *page; |
564 | struct list_head *p; | 575 | struct list_head *p; |
@@ -566,8 +577,21 @@ static int migrate_pages_to(struct list_head *pagelist, | |||
566 | redo: | 577 | redo: |
567 | nr_pages = 0; | 578 | nr_pages = 0; |
568 | list_for_each(p, pagelist) { | 579 | list_for_each(p, pagelist) { |
569 | if (vma) | 580 | if (vma) { |
570 | page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start); | 581 | /* |
582 | * The address passed to alloc_page_vma is used to | ||
583 | * generate the proper interleave behavior. We fake | ||
584 | * the address here by an increasing offset in order | ||
585 | * to get the proper distribution of pages. | ||
586 | * | ||
587 | * No decision has been made as to which page | ||
588 | * a certain old page is moved to so we cannot | ||
589 | * specify the correct address. | ||
590 | */ | ||
591 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
592 | offset + vma->vm_start); | ||
593 | offset += PAGE_SIZE; | ||
594 | } | ||
571 | else | 595 | else |
572 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | 596 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); |
573 | 597 | ||
@@ -575,9 +599,9 @@ redo: | |||
575 | err = -ENOMEM; | 599 | err = -ENOMEM; |
576 | goto out; | 600 | goto out; |
577 | } | 601 | } |
578 | list_add(&page->lru, &newlist); | 602 | list_add_tail(&page->lru, &newlist); |
579 | nr_pages++; | 603 | nr_pages++; |
580 | if (nr_pages > MIGRATE_CHUNK_SIZE); | 604 | if (nr_pages > MIGRATE_CHUNK_SIZE) |
581 | break; | 605 | break; |
582 | } | 606 | } |
583 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | 607 | err = migrate_pages(pagelist, &newlist, &moved, &failed); |
@@ -724,7 +748,7 @@ long do_mbind(unsigned long start, unsigned long len, | |||
724 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 748 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
725 | || mode > MPOL_MAX) | 749 | || mode > MPOL_MAX) |
726 | return -EINVAL; | 750 | return -EINVAL; |
727 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) | 751 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
728 | return -EPERM; | 752 | return -EPERM; |
729 | 753 | ||
730 | if (start & ~PAGE_MASK) | 754 | if (start & ~PAGE_MASK) |
@@ -798,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, | |||
798 | nodes_clear(*nodes); | 822 | nodes_clear(*nodes); |
799 | if (maxnode == 0 || !nmask) | 823 | if (maxnode == 0 || !nmask) |
800 | return 0; | 824 | return 0; |
825 | if (maxnode > PAGE_SIZE*BITS_PER_BYTE) | ||
826 | return -EINVAL; | ||
801 | 827 | ||
802 | nlongs = BITS_TO_LONGS(maxnode); | 828 | nlongs = BITS_TO_LONGS(maxnode); |
803 | if ((maxnode % BITS_PER_LONG) == 0) | 829 | if ((maxnode % BITS_PER_LONG) == 0) |
@@ -916,19 +942,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
916 | */ | 942 | */ |
917 | if ((current->euid != task->suid) && (current->euid != task->uid) && | 943 | if ((current->euid != task->suid) && (current->euid != task->uid) && |
918 | (current->uid != task->suid) && (current->uid != task->uid) && | 944 | (current->uid != task->suid) && (current->uid != task->uid) && |
919 | !capable(CAP_SYS_ADMIN)) { | 945 | !capable(CAP_SYS_NICE)) { |
920 | err = -EPERM; | 946 | err = -EPERM; |
921 | goto out; | 947 | goto out; |
922 | } | 948 | } |
923 | 949 | ||
924 | task_nodes = cpuset_mems_allowed(task); | 950 | task_nodes = cpuset_mems_allowed(task); |
925 | /* Is the user allowed to access the target nodes? */ | 951 | /* Is the user allowed to access the target nodes? */ |
926 | if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { | 952 | if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { |
927 | err = -EPERM; | 953 | err = -EPERM; |
928 | goto out; | 954 | goto out; |
929 | } | 955 | } |
930 | 956 | ||
931 | err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); | 957 | err = do_migrate_pages(mm, &old, &new, |
958 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | ||
932 | out: | 959 | out: |
933 | mmput(mm); | 960 | mmput(mm); |
934 | return err; | 961 | return err; |
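
The hunk above relaxes the permission checks from CAP_SYS_ADMIN to CAP_SYS_NICE and lets capable callers move even shared pages. A rough user-space sketch of the call being gated here, using the raw syscall; whether __NR_migrate_pages is defined, and the exact return convention, depend on the architecture and C library, so treat the details as an assumption:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>

int main(int argc, char **argv)
{
	pid_t pid = argc > 1 ? (pid_t)atoi(argv[1]) : getpid();
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* target: node 1 */
	long ret;

	/* maxnode: how many bits of the node masks the kernel should read */
	ret = syscall(__NR_migrate_pages, pid,
		      8 * sizeof(unsigned long), &old_nodes, &new_nodes);
	if (ret < 0)
		perror("migrate_pages");
	else
		printf("pages that could not be moved: %ld\n", ret);
	return 0;
}
```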
@@ -1726,66 +1753,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | |||
1726 | struct numa_maps { | 1753 | struct numa_maps { |
1727 | unsigned long pages; | 1754 | unsigned long pages; |
1728 | unsigned long anon; | 1755 | unsigned long anon; |
1729 | unsigned long mapped; | 1756 | unsigned long active; |
1757 | unsigned long writeback; | ||
1730 | unsigned long mapcount_max; | 1758 | unsigned long mapcount_max; |
1759 | unsigned long dirty; | ||
1760 | unsigned long swapcache; | ||
1731 | unsigned long node[MAX_NUMNODES]; | 1761 | unsigned long node[MAX_NUMNODES]; |
1732 | }; | 1762 | }; |
1733 | 1763 | ||
1734 | static void gather_stats(struct page *page, void *private) | 1764 | static void gather_stats(struct page *page, void *private, int pte_dirty) |
1735 | { | 1765 | { |
1736 | struct numa_maps *md = private; | 1766 | struct numa_maps *md = private; |
1737 | int count = page_mapcount(page); | 1767 | int count = page_mapcount(page); |
1738 | 1768 | ||
1739 | if (count) | 1769 | md->pages++; |
1740 | md->mapped++; | 1770 | if (pte_dirty || PageDirty(page)) |
1771 | md->dirty++; | ||
1741 | 1772 | ||
1742 | if (count > md->mapcount_max) | 1773 | if (PageSwapCache(page)) |
1743 | md->mapcount_max = count; | 1774 | md->swapcache++; |
1744 | 1775 | ||
1745 | md->pages++; | 1776 | if (PageActive(page)) |
1777 | md->active++; | ||
1778 | |||
1779 | if (PageWriteback(page)) | ||
1780 | md->writeback++; | ||
1746 | 1781 | ||
1747 | if (PageAnon(page)) | 1782 | if (PageAnon(page)) |
1748 | md->anon++; | 1783 | md->anon++; |
1749 | 1784 | ||
1785 | if (count > md->mapcount_max) | ||
1786 | md->mapcount_max = count; | ||
1787 | |||
1750 | md->node[page_to_nid(page)]++; | 1788 | md->node[page_to_nid(page)]++; |
1751 | cond_resched(); | 1789 | cond_resched(); |
1752 | } | 1790 | } |
1753 | 1791 | ||
1792 | #ifdef CONFIG_HUGETLB_PAGE | ||
1793 | static void check_huge_range(struct vm_area_struct *vma, | ||
1794 | unsigned long start, unsigned long end, | ||
1795 | struct numa_maps *md) | ||
1796 | { | ||
1797 | unsigned long addr; | ||
1798 | struct page *page; | ||
1799 | |||
1800 | for (addr = start; addr < end; addr += HPAGE_SIZE) { | ||
1801 | pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); | ||
1802 | pte_t pte; | ||
1803 | |||
1804 | if (!ptep) | ||
1805 | continue; | ||
1806 | |||
1807 | pte = *ptep; | ||
1808 | if (pte_none(pte)) | ||
1809 | continue; | ||
1810 | |||
1811 | page = pte_page(pte); | ||
1812 | if (!page) | ||
1813 | continue; | ||
1814 | |||
1815 | gather_stats(page, md, pte_dirty(*ptep)); | ||
1816 | } | ||
1817 | } | ||
1818 | #else | ||
1819 | static inline void check_huge_range(struct vm_area_struct *vma, | ||
1820 | unsigned long start, unsigned long end, | ||
1821 | struct numa_maps *md) | ||
1822 | { | ||
1823 | } | ||
1824 | #endif | ||
1825 | |||
1754 | int show_numa_map(struct seq_file *m, void *v) | 1826 | int show_numa_map(struct seq_file *m, void *v) |
1755 | { | 1827 | { |
1756 | struct task_struct *task = m->private; | 1828 | struct task_struct *task = m->private; |
1757 | struct vm_area_struct *vma = v; | 1829 | struct vm_area_struct *vma = v; |
1758 | struct numa_maps *md; | 1830 | struct numa_maps *md; |
1831 | struct file *file = vma->vm_file; | ||
1832 | struct mm_struct *mm = vma->vm_mm; | ||
1759 | int n; | 1833 | int n; |
1760 | char buffer[50]; | 1834 | char buffer[50]; |
1761 | 1835 | ||
1762 | if (!vma->vm_mm) | 1836 | if (!mm) |
1763 | return 0; | 1837 | return 0; |
1764 | 1838 | ||
1765 | md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); | 1839 | md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); |
1766 | if (!md) | 1840 | if (!md) |
1767 | return 0; | 1841 | return 0; |
1768 | 1842 | ||
1769 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | 1843 | mpol_to_str(buffer, sizeof(buffer), |
1770 | &node_online_map, MPOL_MF_STATS, md); | 1844 | get_vma_policy(task, vma, vma->vm_start)); |
1771 | 1845 | ||
1772 | if (md->pages) { | 1846 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
1773 | mpol_to_str(buffer, sizeof(buffer), | ||
1774 | get_vma_policy(task, vma, vma->vm_start)); | ||
1775 | 1847 | ||
1776 | seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", | 1848 | if (file) { |
1777 | vma->vm_start, buffer, md->pages, | 1849 | seq_printf(m, " file="); |
1778 | md->mapped, md->mapcount_max); | 1850 | seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); |
1851 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | ||
1852 | seq_printf(m, " heap"); | ||
1853 | } else if (vma->vm_start <= mm->start_stack && | ||
1854 | vma->vm_end >= mm->start_stack) { | ||
1855 | seq_printf(m, " stack"); | ||
1856 | } | ||
1779 | 1857 | ||
1780 | if (md->anon) | 1858 | if (is_vm_hugetlb_page(vma)) { |
1781 | seq_printf(m," anon=%lu",md->anon); | 1859 | check_huge_range(vma, vma->vm_start, vma->vm_end, md); |
1860 | seq_printf(m, " huge"); | ||
1861 | } else { | ||
1862 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | ||
1863 | &node_online_map, MPOL_MF_STATS, md); | ||
1864 | } | ||
1782 | 1865 | ||
1783 | for_each_online_node(n) | 1866 | if (!md->pages) |
1784 | if (md->node[n]) | 1867 | goto out; |
1785 | seq_printf(m, " N%d=%lu", n, md->node[n]); | ||
1786 | 1868 | ||
1787 | seq_putc(m, '\n'); | 1869 | if (md->anon) |
1788 | } | 1870 | seq_printf(m," anon=%lu",md->anon); |
1871 | |||
1872 | if (md->dirty) | ||
1873 | seq_printf(m," dirty=%lu",md->dirty); | ||
1874 | |||
1875 | if (md->pages != md->anon && md->pages != md->dirty) | ||
1876 | seq_printf(m, " mapped=%lu", md->pages); | ||
1877 | |||
1878 | if (md->mapcount_max > 1) | ||
1879 | seq_printf(m, " mapmax=%lu", md->mapcount_max); | ||
1880 | |||
1881 | if (md->swapcache) | ||
1882 | seq_printf(m," swapcache=%lu", md->swapcache); | ||
1883 | |||
1884 | if (md->active < md->pages && !is_vm_hugetlb_page(vma)) | ||
1885 | seq_printf(m," active=%lu", md->active); | ||
1886 | |||
1887 | if (md->writeback) | ||
1888 | seq_printf(m," writeback=%lu", md->writeback); | ||
1889 | |||
1890 | for_each_online_node(n) | ||
1891 | if (md->node[n]) | ||
1892 | seq_printf(m, " N%d=%lu", n, md->node[n]); | ||
1893 | out: | ||
1894 | seq_putc(m, '\n'); | ||
1789 | kfree(md); | 1895 | kfree(md); |
1790 | 1896 | ||
1791 | if (m->count < m->size) | 1897 | if (m->count < m->size) |
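
The reworked show_numa_map() above is what backs /proc/<pid>/numa_maps on a NUMA kernel; the new per-line counters (dirty=, swapcache=, active=, writeback=) come straight from gather_stats(). A trivial reader, assuming the usual proc path:

```c
#include <stdio.h>

int main(void)
{
	char line[1024];
	FILE *f = fopen("/proc/self/numa_maps", "r");

	if (!f) {
		perror("open /proc/self/numa_maps");
		return 1;
	}
	/* each line: start address, policy string, then optional counters */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
```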
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d68232..4951f4786f28 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,10 +53,11 @@ DECLARE_RWSEM(nommu_vma_sem); | |||
53 | struct vm_operations_struct generic_file_vm_ops = { | 53 | struct vm_operations_struct generic_file_vm_ops = { |
54 | }; | 54 | }; |
55 | 55 | ||
56 | EXPORT_SYMBOL(vmalloc); | ||
57 | EXPORT_SYMBOL(vfree); | 56 | EXPORT_SYMBOL(vfree); |
58 | EXPORT_SYMBOL(vmalloc_to_page); | 57 | EXPORT_SYMBOL(vmalloc_to_page); |
59 | EXPORT_SYMBOL(vmalloc_32); | 58 | EXPORT_SYMBOL(vmalloc_32); |
59 | EXPORT_SYMBOL(vmap); | ||
60 | EXPORT_SYMBOL(vunmap); | ||
60 | 61 | ||
61 | /* | 62 | /* |
62 | * Handle all mappings that got truncated by a "truncate()" | 63 | * Handle all mappings that got truncated by a "truncate()" |
@@ -203,6 +204,13 @@ void *vmalloc(unsigned long size) | |||
203 | { | 204 | { |
204 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 205 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); |
205 | } | 206 | } |
207 | EXPORT_SYMBOL(vmalloc); | ||
208 | |||
209 | void *vmalloc_node(unsigned long size, int node) | ||
210 | { | ||
211 | return vmalloc(size); | ||
212 | } | ||
213 | EXPORT_SYMBOL(vmalloc_node); | ||
206 | 214 | ||
207 | /* | 215 | /* |
208 | * vmalloc_32 - allocate virtually continguos memory (32bit addressable) | 216 | * vmalloc_32 - allocate virtually continguos memory (32bit addressable) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f2a562..78747afad6b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Processes which fork a lot of child processes are likely | 60 | * Processes which fork a lot of child processes are likely |
61 | * a good choice. We add the vmsize of the children if they | 61 | * a good choice. We add half the vmsize of the children if they |
62 | * have an own mm. This prevents forking servers to flood the | 62 | * have an own mm. This prevents forking servers to flood the |
63 | * machine with an endless amount of children | 63 | * machine with an endless amount of children. In case a single |
64 | * child is eating the vast majority of memory, adding only half | ||
65 | * to the parents will make the child our kill candidate of choice. | ||
64 | */ | 66 | */ |
65 | list_for_each(tsk, &p->children) { | 67 | list_for_each(tsk, &p->children) { |
66 | struct task_struct *chld; | 68 | struct task_struct *chld; |
67 | chld = list_entry(tsk, struct task_struct, sibling); | 69 | chld = list_entry(tsk, struct task_struct, sibling); |
68 | if (chld->mm != p->mm && chld->mm) | 70 | if (chld->mm != p->mm && chld->mm) |
69 | points += chld->mm->total_vm; | 71 | points += chld->mm->total_vm/2 + 1; |
70 | } | 72 | } |
71 | 73 | ||
72 | /* | 74 | /* |
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
131 | } | 133 | } |
132 | 134 | ||
133 | /* | 135 | /* |
136 | * Types of limitations to the nodes from which allocations may occur | ||
137 | */ | ||
138 | #define CONSTRAINT_NONE 1 | ||
139 | #define CONSTRAINT_MEMORY_POLICY 2 | ||
140 | #define CONSTRAINT_CPUSET 3 | ||
141 | |||
142 | /* | ||
143 | * Determine the type of allocation constraint. | ||
144 | */ | ||
145 | static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | ||
146 | { | ||
147 | #ifdef CONFIG_NUMA | ||
148 | struct zone **z; | ||
149 | nodemask_t nodes = node_online_map; | ||
150 | |||
151 | for (z = zonelist->zones; *z; z++) | ||
152 | if (cpuset_zone_allowed(*z, gfp_mask)) | ||
153 | node_clear((*z)->zone_pgdat->node_id, | ||
154 | nodes); | ||
155 | else | ||
156 | return CONSTRAINT_CPUSET; | ||
157 | |||
158 | if (!nodes_empty(nodes)) | ||
159 | return CONSTRAINT_MEMORY_POLICY; | ||
160 | #endif | ||
161 | |||
162 | return CONSTRAINT_NONE; | ||
163 | } | ||
164 | |||
165 | /* | ||
134 | * Simple selection loop. We chose the process with the highest | 166 | * Simple selection loop. We chose the process with the highest |
135 | * number of 'points'. We expect the caller will lock the tasklist. | 167 | * number of 'points'. We expect the caller will lock the tasklist. |
136 | * | 168 | * |
137 | * (not docbooked, we don't want this one cluttering up the manual) | 169 | * (not docbooked, we don't want this one cluttering up the manual) |
138 | */ | 170 | */ |
139 | static struct task_struct * select_bad_process(void) | 171 | static struct task_struct *select_bad_process(unsigned long *ppoints) |
140 | { | 172 | { |
141 | unsigned long maxpoints = 0; | ||
142 | struct task_struct *g, *p; | 173 | struct task_struct *g, *p; |
143 | struct task_struct *chosen = NULL; | 174 | struct task_struct *chosen = NULL; |
144 | struct timespec uptime; | 175 | struct timespec uptime; |
176 | *ppoints = 0; | ||
145 | 177 | ||
146 | do_posix_clock_monotonic_gettime(&uptime); | 178 | do_posix_clock_monotonic_gettime(&uptime); |
147 | do_each_thread(g, p) { | 179 | do_each_thread(g, p) { |
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void) | |||
169 | return p; | 201 | return p; |
170 | 202 | ||
171 | points = badness(p, uptime.tv_sec); | 203 | points = badness(p, uptime.tv_sec); |
172 | if (points > maxpoints || !chosen) { | 204 | if (points > *ppoints || !chosen) { |
173 | chosen = p; | 205 | chosen = p; |
174 | maxpoints = points; | 206 | *ppoints = points; |
175 | } | 207 | } |
176 | } while_each_thread(g, p); | 208 | } while_each_thread(g, p); |
177 | return chosen; | 209 | return chosen; |
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void) | |||
182 | * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that | 214 | * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that |
183 | * we select a process with CAP_SYS_RAW_IO set). | 215 | * we select a process with CAP_SYS_RAW_IO set). |
184 | */ | 216 | */ |
185 | static void __oom_kill_task(task_t *p) | 217 | static void __oom_kill_task(task_t *p, const char *message) |
186 | { | 218 | { |
187 | if (p->pid == 1) { | 219 | if (p->pid == 1) { |
188 | WARN_ON(1); | 220 | WARN_ON(1); |
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p) | |||
198 | return; | 230 | return; |
199 | } | 231 | } |
200 | task_unlock(p); | 232 | task_unlock(p); |
201 | printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", | 233 | printk(KERN_ERR "%s: Killed process %d (%s).\n", |
202 | p->pid, p->comm); | 234 | message, p->pid, p->comm); |
203 | 235 | ||
204 | /* | 236 | /* |
205 | * We give our sacrificial lamb high priority and access to | 237 | * We give our sacrificial lamb high priority and access to |
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p) | |||
212 | force_sig(SIGKILL, p); | 244 | force_sig(SIGKILL, p); |
213 | } | 245 | } |
214 | 246 | ||
215 | static struct mm_struct *oom_kill_task(task_t *p) | 247 | static struct mm_struct *oom_kill_task(task_t *p, const char *message) |
216 | { | 248 | { |
217 | struct mm_struct *mm = get_task_mm(p); | 249 | struct mm_struct *mm = get_task_mm(p); |
218 | task_t * g, * q; | 250 | task_t * g, * q; |
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p) | |||
224 | return NULL; | 256 | return NULL; |
225 | } | 257 | } |
226 | 258 | ||
227 | __oom_kill_task(p); | 259 | __oom_kill_task(p, message); |
228 | /* | 260 | /* |
229 | * kill all processes that share the ->mm (i.e. all threads), | 261 | * kill all processes that share the ->mm (i.e. all threads), |
230 | * but are in a different thread group | 262 | * but are in a different thread group |
231 | */ | 263 | */ |
232 | do_each_thread(g, q) | 264 | do_each_thread(g, q) |
233 | if (q->mm == mm && q->tgid != p->tgid) | 265 | if (q->mm == mm && q->tgid != p->tgid) |
234 | __oom_kill_task(q); | 266 | __oom_kill_task(q, message); |
235 | while_each_thread(g, q); | 267 | while_each_thread(g, q); |
236 | 268 | ||
237 | return mm; | 269 | return mm; |
238 | } | 270 | } |
239 | 271 | ||
240 | static struct mm_struct *oom_kill_process(struct task_struct *p) | 272 | static struct mm_struct *oom_kill_process(struct task_struct *p, |
273 | unsigned long points, const char *message) | ||
241 | { | 274 | { |
242 | struct mm_struct *mm; | 275 | struct mm_struct *mm; |
243 | struct task_struct *c; | 276 | struct task_struct *c; |
244 | struct list_head *tsk; | 277 | struct list_head *tsk; |
245 | 278 | ||
279 | printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and " | ||
280 | "children.\n", p->pid, p->comm, points); | ||
246 | /* Try to kill a child first */ | 281 | /* Try to kill a child first */ |
247 | list_for_each(tsk, &p->children) { | 282 | list_for_each(tsk, &p->children) { |
248 | c = list_entry(tsk, struct task_struct, sibling); | 283 | c = list_entry(tsk, struct task_struct, sibling); |
249 | if (c->mm == p->mm) | 284 | if (c->mm == p->mm) |
250 | continue; | 285 | continue; |
251 | mm = oom_kill_task(c); | 286 | mm = oom_kill_task(c, message); |
252 | if (mm) | 287 | if (mm) |
253 | return mm; | 288 | return mm; |
254 | } | 289 | } |
255 | return oom_kill_task(p); | 290 | return oom_kill_task(p, message); |
256 | } | 291 | } |
257 | 292 | ||
258 | /** | 293 | /** |
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p) | |||
263 | * OR try to be smart about which process to kill. Note that we | 298 | * OR try to be smart about which process to kill. Note that we |
264 | * don't have to be perfect here, we just have to be good. | 299 | * don't have to be perfect here, we just have to be good. |
265 | */ | 300 | */ |
266 | void out_of_memory(gfp_t gfp_mask, int order) | 301 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) |
267 | { | 302 | { |
268 | struct mm_struct *mm = NULL; | 303 | struct mm_struct *mm = NULL; |
269 | task_t * p; | 304 | task_t *p; |
305 | unsigned long points = 0; | ||
270 | 306 | ||
271 | if (printk_ratelimit()) { | 307 | if (printk_ratelimit()) { |
272 | printk("oom-killer: gfp_mask=0x%x, order=%d\n", | 308 | printk("oom-killer: gfp_mask=0x%x, order=%d\n", |
@@ -277,24 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order) | |||
277 | 313 | ||
278 | cpuset_lock(); | 314 | cpuset_lock(); |
279 | read_lock(&tasklist_lock); | 315 | read_lock(&tasklist_lock); |
316 | |||
317 | /* | ||
318 | * Check if there were limitations on the allocation (only relevant for | ||
319 | * NUMA) that may require different handling. | ||
320 | */ | ||
321 | switch (constrained_alloc(zonelist, gfp_mask)) { | ||
322 | case CONSTRAINT_MEMORY_POLICY: | ||
323 | mm = oom_kill_process(current, points, | ||
324 | "No available memory (MPOL_BIND)"); | ||
325 | break; | ||
326 | |||
327 | case CONSTRAINT_CPUSET: | ||
328 | mm = oom_kill_process(current, points, | ||
329 | "No available memory in cpuset"); | ||
330 | break; | ||
331 | |||
332 | case CONSTRAINT_NONE: | ||
280 | retry: | 333 | retry: |
281 | p = select_bad_process(); | 334 | /* |
335 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
336 | * issues we may have. | ||
337 | */ | ||
338 | p = select_bad_process(&points); | ||
282 | 339 | ||
283 | if (PTR_ERR(p) == -1UL) | 340 | if (PTR_ERR(p) == -1UL) |
284 | goto out; | 341 | goto out; |
285 | 342 | ||
286 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 343 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
287 | if (!p) { | 344 | if (!p) { |
288 | read_unlock(&tasklist_lock); | 345 | read_unlock(&tasklist_lock); |
289 | cpuset_unlock(); | 346 | cpuset_unlock(); |
290 | panic("Out of memory and no killable processes...\n"); | 347 | panic("Out of memory and no killable processes...\n"); |
291 | } | 348 | } |
292 | 349 | ||
293 | mm = oom_kill_process(p); | 350 | mm = oom_kill_process(p, points, "Out of memory"); |
294 | if (!mm) | 351 | if (!mm) |
295 | goto retry; | 352 | goto retry; |
353 | |||
354 | break; | ||
355 | } | ||
296 | 356 | ||
297 | out: | 357 | out: |
298 | read_unlock(&tasklist_lock); | 358 | read_unlock(&tasklist_lock); |
299 | cpuset_unlock(); | 359 | cpuset_unlock(); |
300 | if (mm) | 360 | if (mm) |
@@ -305,5 +365,5 @@ retry: | |||
305 | * retry to allocate memory unless "p" is current | 365 | * retry to allocate memory unless "p" is current |
306 | */ | 366 | */ |
307 | if (!test_thread_flag(TIF_MEMDIE)) | 367 | if (!test_thread_flag(TIF_MEMDIE)) |
308 | schedule_timeout_interruptible(1); | 368 | schedule_timeout_uninterruptible(1); |
309 | } | 369 | } |
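
A toy calculation of the badness() change above (half of each child's VM instead of all of it); the numbers are invented and the real function also weights runtime, nice level and capabilities:

```c
#include <stdio.h>

int main(void)
{
	unsigned long parent_vm = 10000;		/* pages */
	unsigned long child_vm[] = { 500, 800000 };	/* small child, huge child */
	unsigned long old_score = parent_vm;
	unsigned long new_score = parent_vm;
	unsigned int i;

	for (i = 0; i < sizeof(child_vm) / sizeof(child_vm[0]); i++) {
		old_score += child_vm[i];		/* old rule: full child VM */
		new_score += child_vm[i] / 2 + 1;	/* new rule: half of it */
	}

	/* old: parent (810500) outscores the 800000-page child;
	 * new: parent (410252) loses to it, so the real memory hog
	 * becomes the kill candidate of choice. */
	printf("old parent score %lu, new parent score %lu, huge child %lu\n",
	       old_score, new_score, child_vm[1]);
	return 0;
}
```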
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde04ff4be31..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages; | |||
56 | int percpu_pagelist_fraction; | 56 | int percpu_pagelist_fraction; |
57 | 57 | ||
58 | static void fastcall free_hot_cold_page(struct page *page, int cold); | 58 | static void fastcall free_hot_cold_page(struct page *page, int cold); |
59 | static void __free_pages_ok(struct page *page, unsigned int order); | ||
59 | 60 | ||
60 | /* | 61 | /* |
61 | * results with 256, 32 in the lowmem_reserve sysctl: | 62 | * results with 256, 32 in the lowmem_reserve sysctl: |
@@ -169,20 +170,23 @@ static void bad_page(struct page *page) | |||
169 | * All pages have PG_compound set. All pages have their ->private pointing at | 170 | * All pages have PG_compound set. All pages have their ->private pointing at |
170 | * the head page (even the head page has this). | 171 | * the head page (even the head page has this). |
171 | * | 172 | * |
172 | * The first tail page's ->mapping, if non-zero, holds the address of the | 173 | * The first tail page's ->lru.next holds the address of the compound page's |
173 | * compound page's put_page() function. | 174 | * put_page() function. Its ->lru.prev holds the order of allocation. |
174 | * | 175 | * This usage means that zero-order pages may not be compound. |
175 | * The order of the allocation is stored in the first tail page's ->index | ||
176 | * This is only for debug at present. This usage means that zero-order pages | ||
177 | * may not be compound. | ||
178 | */ | 176 | */ |
177 | |||
178 | static void free_compound_page(struct page *page) | ||
179 | { | ||
180 | __free_pages_ok(page, (unsigned long)page[1].lru.prev); | ||
181 | } | ||
182 | |||
179 | static void prep_compound_page(struct page *page, unsigned long order) | 183 | static void prep_compound_page(struct page *page, unsigned long order) |
180 | { | 184 | { |
181 | int i; | 185 | int i; |
182 | int nr_pages = 1 << order; | 186 | int nr_pages = 1 << order; |
183 | 187 | ||
184 | page[1].mapping = NULL; | 188 | page[1].lru.next = (void *)free_compound_page; /* set dtor */ |
185 | page[1].index = order; | 189 | page[1].lru.prev = (void *)order; |
186 | for (i = 0; i < nr_pages; i++) { | 190 | for (i = 0; i < nr_pages; i++) { |
187 | struct page *p = page + i; | 191 | struct page *p = page + i; |
188 | 192 | ||
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
196 | int i; | 200 | int i; |
197 | int nr_pages = 1 << order; | 201 | int nr_pages = 1 << order; |
198 | 202 | ||
199 | if (unlikely(page[1].index != order)) | 203 | if (unlikely((unsigned long)page[1].lru.prev != order)) |
200 | bad_page(page); | 204 | bad_page(page); |
201 | 205 | ||
202 | for (i = 0; i < nr_pages; i++) { | 206 | for (i = 0; i < nr_pages; i++) { |
@@ -586,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
586 | } | 590 | } |
587 | 591 | ||
588 | #ifdef CONFIG_NUMA | 592 | #ifdef CONFIG_NUMA |
589 | /* Called from the slab reaper to drain remote pagesets */ | 593 | /* |
590 | void drain_remote_pages(void) | 594 | * Called from the slab reaper to drain pagesets on a particular node that |
595 | * belong to the currently executing processor. | ||
596 | */ | ||
597 | void drain_node_pages(int nodeid) | ||
591 | { | 598 | { |
592 | struct zone *zone; | 599 | int i, z; |
593 | int i; | ||
594 | unsigned long flags; | 600 | unsigned long flags; |
595 | 601 | ||
596 | local_irq_save(flags); | 602 | local_irq_save(flags); |
597 | for_each_zone(zone) { | 603 | for (z = 0; z < MAX_NR_ZONES; z++) { |
604 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | ||
598 | struct per_cpu_pageset *pset; | 605 | struct per_cpu_pageset *pset; |
599 | 606 | ||
600 | /* Do not drain local pagesets */ | ||
601 | if (zone->zone_pgdat->node_id == numa_node_id()) | ||
602 | continue; | ||
603 | |||
604 | pset = zone_pcp(zone, smp_processor_id()); | 607 | pset = zone_pcp(zone, smp_processor_id()); |
605 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 608 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
606 | struct per_cpu_pages *pcp; | 609 | struct per_cpu_pages *pcp; |
@@ -1011,7 +1014,7 @@ rebalance: | |||
1011 | if (page) | 1014 | if (page) |
1012 | goto got_pg; | 1015 | goto got_pg; |
1013 | 1016 | ||
1014 | out_of_memory(gfp_mask, order); | 1017 | out_of_memory(zonelist, gfp_mask, order); |
1015 | goto restart; | 1018 | goto restart; |
1016 | } | 1019 | } |
1017 | 1020 | ||
@@ -1537,29 +1540,29 @@ static int __initdata node_load[MAX_NUMNODES]; | |||
1537 | */ | 1540 | */ |
1538 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | 1541 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) |
1539 | { | 1542 | { |
1540 | int i, n, val; | 1543 | int n, val; |
1541 | int min_val = INT_MAX; | 1544 | int min_val = INT_MAX; |
1542 | int best_node = -1; | 1545 | int best_node = -1; |
1543 | 1546 | ||
1544 | for_each_online_node(i) { | 1547 | /* Use the local node if we haven't already */ |
1545 | cpumask_t tmp; | 1548 | if (!node_isset(node, *used_node_mask)) { |
1549 | node_set(node, *used_node_mask); | ||
1550 | return node; | ||
1551 | } | ||
1546 | 1552 | ||
1547 | /* Start from local node */ | 1553 | for_each_online_node(n) { |
1548 | n = (node+i) % num_online_nodes(); | 1554 | cpumask_t tmp; |
1549 | 1555 | ||
1550 | /* Don't want a node to appear more than once */ | 1556 | /* Don't want a node to appear more than once */ |
1551 | if (node_isset(n, *used_node_mask)) | 1557 | if (node_isset(n, *used_node_mask)) |
1552 | continue; | 1558 | continue; |
1553 | 1559 | ||
1554 | /* Use the local node if we haven't already */ | ||
1555 | if (!node_isset(node, *used_node_mask)) { | ||
1556 | best_node = node; | ||
1557 | break; | ||
1558 | } | ||
1559 | |||
1560 | /* Use the distance array to find the distance */ | 1560 | /* Use the distance array to find the distance */ |
1561 | val = node_distance(node, n); | 1561 | val = node_distance(node, n); |
1562 | 1562 | ||
1563 | /* Penalize nodes under us ("prefer the next node") */ | ||
1564 | val += (n < node); | ||
1565 | |||
1563 | /* Give preference to headless and unused nodes */ | 1566 | /* Give preference to headless and unused nodes */ |
1564 | tmp = node_to_cpumask(n); | 1567 | tmp = node_to_cpumask(n); |
1565 | if (!cpus_empty(tmp)) | 1568 | if (!cpus_empty(tmp)) |
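
A stripped-down model of the find_next_best_node() rewrite above: the local node is taken first, and among equally distant candidates the nodes numbered below the local node get a one-point penalty, so fallback lists wrap forward instead of all converging on node 0. The flat distance table is invented, and the real code also prefers headless and lightly loaded nodes:

```c
#include <stdio.h>
#include <limits.h>

#define NNODES 4

static int node_distance(int a, int b)
{
	return a == b ? 10 : 20;	/* flat remote distance: all ties */
}

static int find_next_best_node(int node, int used[NNODES])
{
	int n, best = -1, min_val = INT_MAX;

	if (!used[node]) {		/* local node first, as in the patch */
		used[node] = 1;
		return node;
	}
	for (n = 0; n < NNODES; n++) {
		int val;

		if (used[n])
			continue;
		val = node_distance(node, n);
		val += (n < node);	/* "prefer the next node" penalty */
		if (val < min_val) {
			min_val = val;
			best = n;
		}
	}
	if (best >= 0)
		used[best] = 1;
	return best;
}

int main(void)
{
	int used[NNODES] = { 0 };
	int n;

	printf("fallback order for node 2:");
	for (n = 0; n < NNODES; n++)
		printf(" %d", find_next_best_node(2, used));
	printf("\n");	/* prints 2 3 0 1 (forward wrap), not 2 0 1 3 */
	return 0;
}
```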
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -212,25 +212,33 @@ out:
212 | * through real pte's pointing to valid pages and then releasing | 212 | * through real pte's pointing to valid pages and then releasing |
213 | * the page from the swap cache. | 213 | * the page from the swap cache. |
214 | * | 214 | * |
215 | * Must hold page lock on page. | 215 | * Must hold page lock on page and mmap_sem of one vma that contains |
216 | * the page. | ||
216 | */ | 217 | */ |
217 | void remove_from_swap(struct page *page) | 218 | void remove_from_swap(struct page *page) |
218 | { | 219 | { |
219 | struct anon_vma *anon_vma; | 220 | struct anon_vma *anon_vma; |
220 | struct vm_area_struct *vma; | 221 | struct vm_area_struct *vma; |
222 | unsigned long mapping; | ||
221 | 223 | ||
222 | if (!PageAnon(page) || !PageSwapCache(page)) | 224 | if (!PageSwapCache(page)) |
223 | return; | 225 | return; |
224 | 226 | ||
225 | anon_vma = page_lock_anon_vma(page); | 227 | mapping = (unsigned long)page->mapping; |
226 | if (!anon_vma) | 228 | |
229 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
227 | return; | 230 | return; |
228 | 231 | ||
232 | /* | ||
233 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
234 | */ | ||
235 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
236 | spin_lock(&anon_vma->lock); | ||
237 | |||
229 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | 238 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) |
230 | remove_vma_swap(vma, page); | 239 | remove_vma_swap(vma, page); |
231 | 240 | ||
232 | spin_unlock(&anon_vma->lock); | 241 | spin_unlock(&anon_vma->lock); |
233 | |||
234 | delete_from_swap_cache(page); | 242 | delete_from_swap_cache(page); |
235 | } | 243 | } |
236 | EXPORT_SYMBOL(remove_from_swap); | 244 | EXPORT_SYMBOL(remove_from_swap); |
@@ -529,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page, | |||
529 | */ | 537 | */ |
530 | void page_add_file_rmap(struct page *page) | 538 | void page_add_file_rmap(struct page *page) |
531 | { | 539 | { |
532 | BUG_ON(PageAnon(page)); | ||
533 | BUG_ON(!pfn_valid(page_to_pfn(page))); | ||
534 | |||
535 | if (atomic_inc_and_test(&page->_mapcount)) | 540 | if (atomic_inc_and_test(&page->_mapcount)) |
536 | __inc_page_state(nr_mapped); | 541 | __inc_page_state(nr_mapped); |
537 | } | 542 | } |
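
The new remove_from_swap() above decodes page->mapping by hand instead of calling page_lock_anon_vma(). A standalone sketch (mock types, not the kernel structures) of that PAGE_MAPPING_ANON encoding:

```c
#include <stdio.h>

#define PAGE_MAPPING_ANON 1UL

struct anon_vma { int dummy; };
struct page { void *mapping; };

/* anonymous pages store the anon_vma pointer with the low bit set */
static void set_anon(struct page *page, struct anon_vma *av)
{
	page->mapping = (void *)((unsigned long)av | PAGE_MAPPING_ANON);
}

static struct anon_vma *page_anon_vma(struct page *page)
{
	unsigned long mapping = (unsigned long)page->mapping;

	if (!mapping || !(mapping & PAGE_MAPPING_ANON))
		return NULL;		/* unmapped or file-backed page */
	return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}

int main(void)
{
	struct anon_vma av;
	struct page page = { 0 };

	set_anon(&page, &av);
	printf("round trip ok: %d\n", page_anon_vma(&page) == &av);
	return 0;
}
```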
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b812f92..7c455fbaff7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@ | |||
45 | #include <linux/swapops.h> | 45 | #include <linux/swapops.h> |
46 | #include <linux/mempolicy.h> | 46 | #include <linux/mempolicy.h> |
47 | #include <linux/namei.h> | 47 | #include <linux/namei.h> |
48 | #include <linux/ctype.h> | ||
48 | #include <asm/uaccess.h> | 49 | #include <asm/uaccess.h> |
49 | #include <asm/div64.h> | 50 | #include <asm/div64.h> |
50 | #include <asm/pgtable.h> | 51 | #include <asm/pgtable.h> |
@@ -874,6 +875,51 @@ redirty: | |||
874 | } | 875 | } |
875 | 876 | ||
876 | #ifdef CONFIG_NUMA | 877 | #ifdef CONFIG_NUMA |
878 | static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | ||
879 | { | ||
880 | char *nodelist = strchr(value, ':'); | ||
881 | int err = 1; | ||
882 | |||
883 | if (nodelist) { | ||
884 | /* NUL-terminate policy string */ | ||
885 | *nodelist++ = '\0'; | ||
886 | if (nodelist_parse(nodelist, *policy_nodes)) | ||
887 | goto out; | ||
888 | } | ||
889 | if (!strcmp(value, "default")) { | ||
890 | *policy = MPOL_DEFAULT; | ||
891 | /* Don't allow a nodelist */ | ||
892 | if (!nodelist) | ||
893 | err = 0; | ||
894 | } else if (!strcmp(value, "prefer")) { | ||
895 | *policy = MPOL_PREFERRED; | ||
896 | /* Insist on a nodelist of one node only */ | ||
897 | if (nodelist) { | ||
898 | char *rest = nodelist; | ||
899 | while (isdigit(*rest)) | ||
900 | rest++; | ||
901 | if (!*rest) | ||
902 | err = 0; | ||
903 | } | ||
904 | } else if (!strcmp(value, "bind")) { | ||
905 | *policy = MPOL_BIND; | ||
906 | /* Insist on a nodelist */ | ||
907 | if (nodelist) | ||
908 | err = 0; | ||
909 | } else if (!strcmp(value, "interleave")) { | ||
910 | *policy = MPOL_INTERLEAVE; | ||
911 | /* Default to nodes online if no nodelist */ | ||
912 | if (!nodelist) | ||
913 | *policy_nodes = node_online_map; | ||
914 | err = 0; | ||
915 | } | ||
916 | out: | ||
917 | /* Restore string for error message */ | ||
918 | if (nodelist) | ||
919 | *--nodelist = ':'; | ||
920 | return err; | ||
921 | } | ||
922 | |||
877 | static struct page *shmem_swapin_async(struct shared_policy *p, | 923 | static struct page *shmem_swapin_async(struct shared_policy *p, |
878 | swp_entry_t entry, unsigned long idx) | 924 | swp_entry_t entry, unsigned long idx) |
879 | { | 925 | { |
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, | |||
926 | return page; | 972 | return page; |
927 | } | 973 | } |
928 | #else | 974 | #else |
975 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | ||
976 | { | ||
977 | return 1; | ||
978 | } | ||
979 | |||
929 | static inline struct page * | 980 | static inline struct page * |
930 | shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) | 981 | shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx) |
931 | { | 982 | { |
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, | |||
1859 | { | 1910 | { |
1860 | char *this_char, *value, *rest; | 1911 | char *this_char, *value, *rest; |
1861 | 1912 | ||
1862 | while ((this_char = strsep(&options, ",")) != NULL) { | 1913 | while (options != NULL) { |
1914 | this_char = options; | ||
1915 | for (;;) { | ||
1916 | /* | ||
1917 | * NUL-terminate this option: unfortunately, | ||
1918 | * mount options form a comma-separated list, | ||
1919 | * but mpol's nodelist may also contain commas. | ||
1920 | */ | ||
1921 | options = strchr(options, ','); | ||
1922 | if (options == NULL) | ||
1923 | break; | ||
1924 | options++; | ||
1925 | if (!isdigit(*options)) { | ||
1926 | options[-1] = '\0'; | ||
1927 | break; | ||
1928 | } | ||
1929 | } | ||
1863 | if (!*this_char) | 1930 | if (!*this_char) |
1864 | continue; | 1931 | continue; |
1865 | if ((value = strchr(this_char,'=')) != NULL) { | 1932 | if ((value = strchr(this_char,'=')) != NULL) { |
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, | |||
1910 | if (*rest) | 1977 | if (*rest) |
1911 | goto bad_val; | 1978 | goto bad_val; |
1912 | } else if (!strcmp(this_char,"mpol")) { | 1979 | } else if (!strcmp(this_char,"mpol")) { |
1913 | if (!strcmp(value,"default")) | 1980 | if (shmem_parse_mpol(value,policy,policy_nodes)) |
1914 | *policy = MPOL_DEFAULT; | ||
1915 | else if (!strcmp(value,"preferred")) | ||
1916 | *policy = MPOL_PREFERRED; | ||
1917 | else if (!strcmp(value,"bind")) | ||
1918 | *policy = MPOL_BIND; | ||
1919 | else if (!strcmp(value,"interleave")) | ||
1920 | *policy = MPOL_INTERLEAVE; | ||
1921 | else | ||
1922 | goto bad_val; | 1981 | goto bad_val; |
1923 | } else if (!strcmp(this_char,"mpol_nodelist")) { | ||
1924 | nodelist_parse(value, *policy_nodes); | ||
1925 | } else { | 1982 | } else { |
1926 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", | 1983 | printk(KERN_ERR "tmpfs: Bad mount option %s\n", |
1927 | this_char); | 1984 | this_char); |
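
The rewritten option loop above exists because mount options are comma separated while an mpol= nodelist may itself contain commas; a comma followed by a digit is therefore treated as part of the preceding option. The same rule, extracted into a standalone sketch:

```c
#include <stdio.h>
#include <string.h>
#include <ctype.h>

static void split_options(char *options)
{
	char *this_char;

	while (options != NULL) {
		this_char = options;
		for (;;) {
			options = strchr(options, ',');
			if (options == NULL)
				break;
			options++;
			/* comma followed by a digit: still inside a nodelist */
			if (!isdigit((unsigned char)*options)) {
				options[-1] = '\0';
				break;
			}
		}
		if (*this_char)
			printf("option: %s\n", this_char);
	}
}

int main(void)
{
	char opts[] = "size=10M,mpol=interleave:0,1,3,uid=0";

	/* prints three options: size=10M, mpol=interleave:0,1,3, uid=0 */
	split_options(opts);
	return 0;
}
```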
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
789 | dump_stack(); | 789 | dump_stack(); |
790 | } | 790 | } |
791 | 791 | ||
792 | #ifdef CONFIG_NUMA | ||
793 | /* | ||
794 | * Special reaping functions for NUMA systems called from cache_reap(). | ||
795 | * These take care of doing round robin flushing of alien caches (containing | ||
796 | * objects freed on different nodes from which they were allocated) and the | ||
797 | * flushing of remote pcps by calling drain_node_pages. | ||
798 | */ | ||
799 | static DEFINE_PER_CPU(unsigned long, reap_node); | ||
800 | |||
801 | static void init_reap_node(int cpu) | ||
802 | { | ||
803 | int node; | ||
804 | |||
805 | node = next_node(cpu_to_node(cpu), node_online_map); | ||
806 | if (node == MAX_NUMNODES) | ||
807 | node = 0; | ||
808 | |||
809 | __get_cpu_var(reap_node) = node; | ||
810 | } | ||
811 | |||
812 | static void next_reap_node(void) | ||
813 | { | ||
814 | int node = __get_cpu_var(reap_node); | ||
815 | |||
816 | /* | ||
817 | * Also drain per cpu pages on remote zones | ||
818 | */ | ||
819 | if (node != numa_node_id()) | ||
820 | drain_node_pages(node); | ||
821 | |||
822 | node = next_node(node, node_online_map); | ||
823 | if (unlikely(node >= MAX_NUMNODES)) | ||
824 | node = first_node(node_online_map); | ||
825 | __get_cpu_var(reap_node) = node; | ||
826 | } | ||
827 | |||
828 | #else | ||
829 | #define init_reap_node(cpu) do { } while (0) | ||
830 | #define next_reap_node(void) do { } while (0) | ||
831 | #endif | ||
832 | |||
792 | /* | 833 | /* |
793 | * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz | 834 | * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz |
794 | * via the workqueue/eventd. | 835 | * via the workqueue/eventd. |
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu) | |||
806 | * at that time. | 847 | * at that time. |
807 | */ | 848 | */ |
808 | if (keventd_up() && reap_work->func == NULL) { | 849 | if (keventd_up() && reap_work->func == NULL) { |
850 | init_reap_node(cpu); | ||
809 | INIT_WORK(reap_work, cache_reap, NULL); | 851 | INIT_WORK(reap_work, cache_reap, NULL); |
810 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | 852 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); |
811 | } | 853 | } |
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
884 | } | 926 | } |
885 | } | 927 | } |
886 | 928 | ||
929 | /* | ||
930 | * Called from cache_reap() to regularly drain alien caches round robin. | ||
931 | */ | ||
932 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | ||
933 | { | ||
934 | int node = __get_cpu_var(reap_node); | ||
935 | |||
936 | if (l3->alien) { | ||
937 | struct array_cache *ac = l3->alien[node]; | ||
938 | if (ac && ac->avail) { | ||
939 | spin_lock_irq(&ac->lock); | ||
940 | __drain_alien_cache(cachep, ac, node); | ||
941 | spin_unlock_irq(&ac->lock); | ||
942 | } | ||
943 | } | ||
944 | } | ||
945 | |||
887 | static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) | 946 | static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) |
888 | { | 947 | { |
889 | int i = 0; | 948 | int i = 0; |
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al | |||
902 | #else | 961 | #else |
903 | 962 | ||
904 | #define drain_alien_cache(cachep, alien) do { } while (0) | 963 | #define drain_alien_cache(cachep, alien) do { } while (0) |
964 | #define reap_alien(cachep, l3) do { } while (0) | ||
905 | 965 | ||
906 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | 966 | static inline struct array_cache **alloc_alien_cache(int node, int limit) |
907 | { | 967 | { |
@@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void) | |||
1124 | struct cache_sizes *sizes; | 1184 | struct cache_sizes *sizes; |
1125 | struct cache_names *names; | 1185 | struct cache_names *names; |
1126 | int i; | 1186 | int i; |
1187 | int order; | ||
1127 | 1188 | ||
1128 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1189 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
1129 | kmem_list3_init(&initkmem_list3[i]); | 1190 | kmem_list3_init(&initkmem_list3[i]); |
@@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void) | |||
1167 | 1228 | ||
1168 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); | 1229 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); |
1169 | 1230 | ||
1170 | cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0, | 1231 | for (order = 0; order < MAX_ORDER; order++) { |
1171 | &left_over, &cache_cache.num); | 1232 | cache_estimate(order, cache_cache.buffer_size, |
1233 | cache_line_size(), 0, &left_over, &cache_cache.num); | ||
1234 | if (cache_cache.num) | ||
1235 | break; | ||
1236 | } | ||
1172 | if (!cache_cache.num) | 1237 | if (!cache_cache.num) |
1173 | BUG(); | 1238 | BUG(); |
1174 | 1239 | cache_cache.gfporder = order; | |
1175 | cache_cache.colour = left_over / cache_cache.colour_off; | 1240 | cache_cache.colour = left_over / cache_cache.colour_off; |
1176 | cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + | 1241 | cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + |
1177 | sizeof(struct slab), cache_line_size()); | 1242 | sizeof(struct slab), cache_line_size()); |
@@ -1628,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1628 | size_t size, size_t align, unsigned long flags) | 1693 | size_t size, size_t align, unsigned long flags) |
1629 | { | 1694 | { |
1630 | size_t left_over = 0; | 1695 | size_t left_over = 0; |
1696 | int gfporder; | ||
1631 | 1697 | ||
1632 | for (;; cachep->gfporder++) { | 1698 | for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { |
1633 | unsigned int num; | 1699 | unsigned int num; |
1634 | size_t remainder; | 1700 | size_t remainder; |
1635 | 1701 | ||
1636 | if (cachep->gfporder > MAX_GFP_ORDER) { | 1702 | cache_estimate(gfporder, size, align, flags, &remainder, &num); |
1637 | cachep->num = 0; | ||
1638 | break; | ||
1639 | } | ||
1640 | |||
1641 | cache_estimate(cachep->gfporder, size, align, flags, | ||
1642 | &remainder, &num); | ||
1643 | if (!num) | 1703 | if (!num) |
1644 | continue; | 1704 | continue; |
1705 | |||
1645 | /* More than offslab_limit objects will cause problems */ | 1706 | /* More than offslab_limit objects will cause problems */ |
1646 | if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) | 1707 | if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit) |
1647 | break; | 1708 | break; |
1648 | 1709 | ||
1710 | /* Found something acceptable - save it away */ | ||
1649 | cachep->num = num; | 1711 | cachep->num = num; |
1712 | cachep->gfporder = gfporder; | ||
1650 | left_over = remainder; | 1713 | left_over = remainder; |
1651 | 1714 | ||
1652 | /* | 1715 | /* |
1716 | * A VFS-reclaimable slab tends to have most allocations | ||
1717 | * as GFP_NOFS and we really don't want to have to be allocating | ||
1718 | * higher-order pages when we are unable to shrink dcache. | ||
1719 | */ | ||
1720 | if (flags & SLAB_RECLAIM_ACCOUNT) | ||
1721 | break; | ||
1722 | |||
1723 | /* | ||
1653 | * Large number of objects is good, but very large slabs are | 1724 | * Large number of objects is good, but very large slabs are |
1654 | * currently bad for the gfp()s. | 1725 | * currently bad for the gfp()s. |
1655 | */ | 1726 | */ |
1656 | if (cachep->gfporder >= slab_break_gfp_order) | 1727 | if (gfporder >= slab_break_gfp_order) |
1657 | break; | 1728 | break; |
1658 | 1729 | ||
1659 | if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder)) | 1730 | /* |
1660 | /* Acceptable internal fragmentation */ | 1731 | * Acceptable internal fragmentation? |
1732 | */ | ||
1733 | if ((left_over * 8) <= (PAGE_SIZE << gfporder)) | ||
1661 | break; | 1734 | break; |
1662 | } | 1735 | } |
1663 | return left_over; | 1736 | return left_over; |
@@ -1717,6 +1790,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1717 | BUG(); | 1790 | BUG(); |
1718 | } | 1791 | } |
1719 | 1792 | ||
1793 | /* | ||
1794 | * Prevent CPUs from coming and going. | ||
1795 | * lock_cpu_hotplug() nests outside cache_chain_mutex | ||
1796 | */ | ||
1797 | lock_cpu_hotplug(); | ||
1798 | |||
1720 | mutex_lock(&cache_chain_mutex); | 1799 | mutex_lock(&cache_chain_mutex); |
1721 | 1800 | ||
1722 | list_for_each(p, &cache_chain) { | 1801 | list_for_each(p, &cache_chain) { |
@@ -1863,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1863 | 1942 | ||
1864 | size = ALIGN(size, align); | 1943 | size = ALIGN(size, align); |
1865 | 1944 | ||
1866 | if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) { | 1945 | left_over = calculate_slab_order(cachep, size, align, flags); |
1867 | /* | ||
1868 | * A VFS-reclaimable slab tends to have most allocations | ||
1869 | * as GFP_NOFS and we really don't want to have to be allocating | ||
1870 | * higher-order pages when we are unable to shrink dcache. | ||
1871 | */ | ||
1872 | cachep->gfporder = 0; | ||
1873 | cache_estimate(cachep->gfporder, size, align, flags, | ||
1874 | &left_over, &cachep->num); | ||
1875 | } else | ||
1876 | left_over = calculate_slab_order(cachep, size, align, flags); | ||
1877 | 1946 | ||
1878 | if (!cachep->num) { | 1947 | if (!cachep->num) { |
1879 | printk("kmem_cache_create: couldn't create cache %s.\n", name); | 1948 | printk("kmem_cache_create: couldn't create cache %s.\n", name); |
@@ -1918,8 +1987,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1918 | cachep->dtor = dtor; | 1987 | cachep->dtor = dtor; |
1919 | cachep->name = name; | 1988 | cachep->name = name; |
1920 | 1989 | ||
1921 | /* Don't let CPUs to come and go */ | ||
1922 | lock_cpu_hotplug(); | ||
1923 | 1990 | ||
1924 | if (g_cpucache_up == FULL) { | 1991 | if (g_cpucache_up == FULL) { |
1925 | enable_cpucache(cachep); | 1992 | enable_cpucache(cachep); |
@@ -1978,12 +2045,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1978 | 2045 | ||
1979 | /* cache setup completed, link it into the list */ | 2046 | /* cache setup completed, link it into the list */ |
1980 | list_add(&cachep->next, &cache_chain); | 2047 | list_add(&cachep->next, &cache_chain); |
1981 | unlock_cpu_hotplug(); | ||
1982 | oops: | 2048 | oops: |
1983 | if (!cachep && (flags & SLAB_PANIC)) | 2049 | if (!cachep && (flags & SLAB_PANIC)) |
1984 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2050 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
1985 | name); | 2051 | name); |
1986 | mutex_unlock(&cache_chain_mutex); | 2052 | mutex_unlock(&cache_chain_mutex); |
2053 | unlock_cpu_hotplug(); | ||
1987 | return cachep; | 2054 | return cachep; |
1988 | } | 2055 | } |
1989 | EXPORT_SYMBOL(kmem_cache_create); | 2056 | EXPORT_SYMBOL(kmem_cache_create); |
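
The three hunks above move lock_cpu_hotplug() so it is always taken before cache_chain_mutex and released after it, giving kmem_cache_create() a single consistent lock order instead of taking the hotplug lock partway through. A rough userspace analogy of that outer/inner nesting, using pthread mutexes as stand-ins (a sketch of the pattern only, not the kernel primitives):

    #include <pthread.h>

    /* Stand-ins for lock_cpu_hotplug() and cache_chain_mutex. */
    static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER; /* outer */
    static pthread_mutex_t chain_lock   = PTHREAD_MUTEX_INITIALIZER; /* inner */

    /*
     * Every path that needs both locks takes them in the same order and
     * drops them in reverse, which is what the patch enforces for
     * kmem_cache_create().
     */
    static void create_something(void)
    {
            pthread_mutex_lock(&hotplug_lock);
            pthread_mutex_lock(&chain_lock);
            /* ... build the new object and publish it on the list ... */
            pthread_mutex_unlock(&chain_lock);
            pthread_mutex_unlock(&hotplug_lock);
    }
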
@@ -2550,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) | |||
2550 | "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 2617 | "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
2551 | cachep->name, cachep->num, slabp, slabp->inuse); | 2618 | cachep->name, cachep->num, slabp, slabp->inuse); |
2552 | for (i = 0; | 2619 | for (i = 0; |
2553 | i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); | 2620 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); |
2554 | i++) { | 2621 | i++) { |
2555 | if ((i % 16) == 0) | 2622 | if ((i % 16) == 0) |
2556 | printk("\n%03x:", i); | 2623 | printk("\n%03x:", i); |
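
The one-character fix above (sizeof(slabp) becoming sizeof(*slabp)) addresses a classic C pitfall: sizeof applied to a pointer yields the size of the pointer itself, not of the object it points to, so the hexdump loop previously covered far too few bytes. A self-contained illustration with a toy structure:

    #include <stdio.h>

    struct slab_like {
            unsigned long counters[8];
            void *freelist;
    };

    int main(void)
    {
            struct slab_like s, *sp = &s;

            /*
             * On a 64-bit machine this prints something like "8 vs 72":
             * sizeof(sp) is the size of the pointer, sizeof(*sp) the size
             * of the structure it points to.
             */
            printf("%zu vs %zu\n", sizeof(sp), sizeof(*sp));
            return 0;
    }
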
@@ -3490,8 +3557,7 @@ static void cache_reap(void *unused) | |||
3490 | check_irq_on(); | 3557 | check_irq_on(); |
3491 | 3558 | ||
3492 | l3 = searchp->nodelists[numa_node_id()]; | 3559 | l3 = searchp->nodelists[numa_node_id()]; |
3493 | if (l3->alien) | 3560 | reap_alien(searchp, l3); |
3494 | drain_alien_cache(searchp, l3->alien); | ||
3495 | spin_lock_irq(&l3->list_lock); | 3561 | spin_lock_irq(&l3->list_lock); |
3496 | 3562 | ||
3497 | drain_array_locked(searchp, cpu_cache_get(searchp), 0, | 3563 | drain_array_locked(searchp, cpu_cache_get(searchp), 0, |
@@ -3541,7 +3607,7 @@ static void cache_reap(void *unused) | |||
3541 | } | 3607 | } |
3542 | check_irq_on(); | 3608 | check_irq_on(); |
3543 | mutex_unlock(&cache_chain_mutex); | 3609 | mutex_unlock(&cache_chain_mutex); |
3544 | drain_remote_pages(); | 3610 | next_reap_node(); |
3545 | /* Setup the next iteration */ | 3611 | /* Setup the next iteration */ |
3546 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3612 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); |
3547 | } | 3613 | } |
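
With the two hunks above, cache_reap() hands remote-node work to reap_alien() and then advances a per-CPU reap cursor via next_reap_node(), rather than draining every remote alien cache on every pass. A rough, hypothetical model of such a round-robin cursor (the real helper is introduced elsewhere in this series; the names and bounds below are illustrative only):

    #define MAX_NODES 4     /* illustrative; the kernel uses MAX_NUMNODES */

    static int reap_node;   /* per-CPU in the kernel; a single int here */

    /*
     * Advance to the next node, wrapping around, so successive reap
     * passes spread their remote work across nodes instead of doing
     * all of it every time.
     */
    static void next_reap_node_model(void)
    {
            reap_node = (reap_node + 1) % MAX_NODES;
    }
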
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | ||
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page) | |||
40 | if (put_page_testzero(page)) { | 40 | if (put_page_testzero(page)) { |
41 | void (*dtor)(struct page *page); | 41 | void (*dtor)(struct page *page); |
42 | 42 | ||
43 | dtor = (void (*)(struct page *))page[1].mapping; | 43 | dtor = (void (*)(struct page *))page[1].lru.next; |
44 | (*dtor)(page); | 44 | (*dtor)(page); |
45 | } | 45 | } |
46 | } | 46 | } |
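
put_compound_page() now recovers the compound destructor from page[1].lru.next: an otherwise unused pointer-sized field of the second page in the compound group doubles as storage for a function pointer. A stripped-down model of the store/retrieve trick (the structure and field are simplified stand-ins, not struct page):

    #include <stdio.h>

    /* Toy stand-in for struct page: one spare pointer-sized slot. */
    struct toy_page {
            void *spare;    /* plays the role of page[1].lru.next */
    };

    typedef void (*page_dtor_t)(struct toy_page *);

    static void my_dtor(struct toy_page *p)
    {
            printf("destructor called for %p\n", (void *)p);
    }

    int main(void)
    {
            struct toy_page pages[2];

            /* "set dtor": stash the function pointer in the second page,
             * mirroring the cast trick used by the kernel code above. */
            pages[1].spare = (void *)my_dtor;

            /* "get dtor": pull it back out and call it, as
             * put_compound_page() does when the refcount hits zero. */
            page_dtor_t dtor = (page_dtor_t)pages[1].spare;
            dtor(&pages[0]);
            return 0;
    }
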
@@ -489,13 +489,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount) | |||
489 | if (count >= FBC_BATCH || count <= -FBC_BATCH) { | 489 | if (count >= FBC_BATCH || count <= -FBC_BATCH) { |
490 | spin_lock(&fbc->lock); | 490 | spin_lock(&fbc->lock); |
491 | fbc->count += count; | 491 | fbc->count += count; |
492 | *pcount = 0; | ||
492 | spin_unlock(&fbc->lock); | 493 | spin_unlock(&fbc->lock); |
493 | count = 0; | 494 | } else { |
495 | *pcount = count; | ||
494 | } | 496 | } |
495 | *pcount = count; | ||
496 | put_cpu(); | 497 | put_cpu(); |
497 | } | 498 | } |
498 | EXPORT_SYMBOL(percpu_counter_mod); | 499 | EXPORT_SYMBOL(percpu_counter_mod); |
500 | |||
501 | /* | ||
502 | * Add up all the per-cpu counts, return the result. This is a more accurate | ||
503 | * but much slower version of percpu_counter_read_positive() | ||
504 | */ | ||
505 | long percpu_counter_sum(struct percpu_counter *fbc) | ||
506 | { | ||
507 | long ret; | ||
508 | int cpu; | ||
509 | |||
510 | spin_lock(&fbc->lock); | ||
511 | ret = fbc->count; | ||
512 | for_each_cpu(cpu) { | ||
513 | long *pcount = per_cpu_ptr(fbc->counters, cpu); | ||
514 | ret += *pcount; | ||
515 | } | ||
516 | spin_unlock(&fbc->lock); | ||
517 | return ret < 0 ? 0 : ret; | ||
518 | } | ||
519 | EXPORT_SYMBOL(percpu_counter_sum); | ||
499 | #endif | 520 | #endif |
500 | 521 | ||
501 | /* | 522 | /* |
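
The percpu_counter_mod() fix above zeroes the per-CPU delta while still holding fbc->lock as it folds it into the global count, so a concurrent reader cannot account the same delta twice; the new percpu_counter_sum() then takes the lock and adds every CPU's residue for an exact, slower total. A single-threaded model of the fold-and-sum logic (locking and real per-CPU storage simplified away; names are illustrative):

    #define FBC_BATCH 32
    #define NCPUS 4

    struct counter_model {
            long count;         /* global; lock-protected in the kernel */
            long pcpu[NCPUS];   /* per-CPU residues */
    };

    /* Fold a local delta into the global count once it passes the batch
     * size; otherwise leave it in the per-CPU slot. */
    static void counter_mod(struct counter_model *c, int cpu, long amount)
    {
            long count = c->pcpu[cpu] + amount;

            if (count >= FBC_BATCH || count <= -FBC_BATCH) {
                    c->count += count;
                    c->pcpu[cpu] = 0;   /* zeroed together with the fold */
            } else {
                    c->pcpu[cpu] = count;
            }
    }

    /* Exact total: the global count plus every per-CPU residue. */
    static long counter_sum(const struct counter_model *c)
    {
            long ret = c->count;

            for (int cpu = 0; cpu < NCPUS; cpu++)
                    ret += c->pcpu[cpu];
            return ret < 0 ? 0 : ret;
    }
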
diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a610804cd06..4fe7e3aa02e2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
443 | BUG_ON(PageActive(page)); | 443 | BUG_ON(PageActive(page)); |
444 | 444 | ||
445 | sc->nr_scanned++; | 445 | sc->nr_scanned++; |
446 | |||
447 | if (!sc->may_swap && page_mapped(page)) | ||
448 | goto keep_locked; | ||
449 | |||
446 | /* Double the slab pressure for mapped and swapcache pages */ | 450 | /* Double the slab pressure for mapped and swapcache pages */ |
447 | if (page_mapped(page) || PageSwapCache(page)) | 451 | if (page_mapped(page) || PageSwapCache(page)) |
448 | sc->nr_scanned++; | 452 | sc->nr_scanned++; |
@@ -632,7 +636,7 @@ static int swap_page(struct page *page) | |||
632 | struct address_space *mapping = page_mapping(page); | 636 | struct address_space *mapping = page_mapping(page); |
633 | 637 | ||
634 | if (page_mapped(page) && mapping) | 638 | if (page_mapped(page) && mapping) |
635 | if (try_to_unmap(page, 0) != SWAP_SUCCESS) | 639 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) |
636 | goto unlock_retry; | 640 | goto unlock_retry; |
637 | 641 | ||
638 | if (PageDirty(page)) { | 642 | if (PageDirty(page)) { |
@@ -696,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage, | |||
696 | * the page. | 700 | * the page. |
697 | */ | 701 | */ |
698 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | 702 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) |
699 | return 1; | 703 | return -EAGAIN; |
700 | 704 | ||
701 | /* | 705 | /* |
702 | * Establish swap ptes for anonymous pages or destroy pte | 706 | * Establish swap ptes for anonymous pages or destroy pte |
@@ -717,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage, | |||
717 | * If the page was not migrated then the PageSwapCache bit | 721 | * If the page was not migrated then the PageSwapCache bit |
718 | * is still set and the operation may continue. | 722 | * is still set and the operation may continue. |
719 | */ | 723 | */ |
720 | try_to_unmap(page, 1); | 724 | if (try_to_unmap(page, 1) == SWAP_FAIL) |
725 | /* A vma has VM_LOCKED set -> Permanent failure */ | ||
726 | return -EPERM; | ||
721 | 727 | ||
722 | /* | 728 | /* |
723 | * Give up if we were unable to remove all mappings. | 729 | * Give up if we were unable to remove all mappings. |
724 | */ | 730 | */ |
725 | if (page_mapcount(page)) | 731 | if (page_mapcount(page)) |
726 | return 1; | 732 | return -EAGAIN; |
727 | 733 | ||
728 | write_lock_irq(&mapping->tree_lock); | 734 | write_lock_irq(&mapping->tree_lock); |
729 | 735 | ||
@@ -734,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage, | |||
734 | if (!page_mapping(page) || page_count(page) != nr_refs || | 740 | if (!page_mapping(page) || page_count(page) != nr_refs || |
735 | *radix_pointer != page) { | 741 | *radix_pointer != page) { |
736 | write_unlock_irq(&mapping->tree_lock); | 742 | write_unlock_irq(&mapping->tree_lock); |
737 | return 1; | 743 | return -EAGAIN; |
738 | } | 744 | } |
739 | 745 | ||
740 | /* | 746 | /* |
@@ -809,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy); | |||
809 | */ | 815 | */ |
810 | int migrate_page(struct page *newpage, struct page *page) | 816 | int migrate_page(struct page *newpage, struct page *page) |
811 | { | 817 | { |
818 | int rc; | ||
819 | |||
812 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 820 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
813 | 821 | ||
814 | if (migrate_page_remove_references(newpage, page, 2)) | 822 | rc = migrate_page_remove_references(newpage, page, 2); |
815 | return -EAGAIN; | 823 | |
824 | if (rc) | ||
825 | return rc; | ||
816 | 826 | ||
817 | migrate_page_copy(newpage, page); | 827 | migrate_page_copy(newpage, page); |
818 | 828 | ||
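
With the hunks above, migrate_page_remove_references() reports why it failed instead of returning a bare 1: -EAGAIN for transient conditions (extra references, mappings that would not go away, a losing race under tree_lock) and -EPERM when a VM_LOCKED vma makes the failure permanent, and migrate_page() simply forwards that code. A caller-side sketch of how such codes are typically treated (illustrative helper, not lifted from this file):

    #include <errno.h>

    enum migrate_disposition { MIGRATED, RETRY_LATER, GIVE_UP };

    /*
     * Map the error codes introduced above onto caller behaviour:
     *   0       -> page migrated,
     *   -EAGAIN -> transient failure, keep the page on the retry list,
     *   -EPERM  -> permanent failure (VM_LOCKED vma), drop it for good.
     */
    static enum migrate_disposition classify(int rc)
    {
            if (rc == 0)
                    return MIGRATED;
            if (rc == -EAGAIN)
                    return RETRY_LATER;
            return GIVE_UP;
    }
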
@@ -839,7 +849,7 @@ EXPORT_SYMBOL(migrate_page); | |||
839 | * pages are swapped out. | 849 | * pages are swapped out. |
840 | * | 850 | * |
841 | * The function returns after 10 attempts or if no pages | 851 | * The function returns after 10 attempts or if no pages |
842 | * are movable anymore because t has become empty | 852 | * are movable anymore because to has become empty |
843 | * or no retryable pages exist anymore. | 853 | * or no retryable pages exist anymore. |
844 | * | 854 | * |
845 | * Return: Number of pages not migrated when "to" ran empty. | 855 | * Return: Number of pages not migrated when "to" ran empty. |
@@ -928,12 +938,21 @@ redo: | |||
928 | goto unlock_both; | 938 | goto unlock_both; |
929 | 939 | ||
930 | if (mapping->a_ops->migratepage) { | 940 | if (mapping->a_ops->migratepage) { |
941 | /* | ||
942 | * Most pages have a mapping and most filesystems | ||
943 | * should provide a migration function. Anonymous | ||
944 | * pages are part of swap space which also has its | ||
945 | * own migration function. This is the most common | ||
946 | * path for page migration. | ||
947 | */ | ||
931 | rc = mapping->a_ops->migratepage(newpage, page); | 948 | rc = mapping->a_ops->migratepage(newpage, page); |
932 | goto unlock_both; | 949 | goto unlock_both; |
933 | } | 950 | } |
934 | 951 | ||
935 | /* | 952 | /* |
936 | * Trigger writeout if page is dirty | 953 | * Default handling if a filesystem does not provide |
954 | * a migration function. We can only migrate clean | ||
955 | * pages so try to write out any dirty pages first. | ||
937 | */ | 956 | */ |
938 | if (PageDirty(page)) { | 957 | if (PageDirty(page)) { |
939 | switch (pageout(page, mapping)) { | 958 | switch (pageout(page, mapping)) { |
@@ -949,9 +968,10 @@ redo: | |||
949 | ; /* try to migrate the page below */ | 968 | ; /* try to migrate the page below */ |
950 | } | 969 | } |
951 | } | 970 | } |
971 | |||
952 | /* | 972 | /* |
953 | * If we have no buffer or can release the buffer | 973 | * Buffers are managed in a filesystem specific way. |
954 | * then do a simple migration. | 974 | * We must have no buffers or drop them. |
955 | */ | 975 | */ |
956 | if (!page_has_buffers(page) || | 976 | if (!page_has_buffers(page) || |
957 | try_to_release_page(page, GFP_KERNEL)) { | 977 | try_to_release_page(page, GFP_KERNEL)) { |
@@ -966,6 +986,11 @@ redo: | |||
966 | * swap them out. | 986 | * swap them out. |
967 | */ | 987 | */ |
968 | if (pass > 4) { | 988 | if (pass > 4) { |
989 | /* | ||
990 | * Persistently unable to drop buffers..... As a | ||
991 | * measure of last resort we fall back to | ||
992 | * swap_page(). | ||
993 | */ | ||
969 | unlock_page(newpage); | 994 | unlock_page(newpage); |
970 | newpage = NULL; | 995 | newpage = NULL; |
971 | rc = swap_page(page); | 996 | rc = swap_page(page); |
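
Taken together, the comments added above describe a fixed fallback order for moving a page: use the mapping's migratepage() method when the filesystem (or swap) provides one, otherwise write the page out if it is dirty, otherwise migrate it directly if its buffers can be dropped, and only after repeated failures (pass > 4) fall back to swap_page(). A condensed, self-contained model of that decision ladder (the structure and enum are hypothetical; they only mirror the control flow above):

    /* Inputs that drive the decision, modelled as plain flags. */
    struct page_state {
            int has_migratepage;    /* mapping provides ->migratepage   */
            int dirty;              /* PageDirty()                      */
            int buffers_droppable;  /* no buffers, or they can be freed */
    };

    enum migrate_step {
            USE_MIGRATEPAGE,        /* common case: fs/swap method      */
            WRITE_OUT_FIRST,        /* dirty: trigger writeout, retry   */
            MIGRATE_DIRECTLY,       /* clean and bufferless             */
            SWAP_AS_LAST_RESORT,    /* pass > 4, buffers will not go    */
            RETRY_LATER
    };

    /* Same fallback order as the code above, expressed as a pure decision. */
    static enum migrate_step choose_step(const struct page_state *ps, int pass)
    {
            if (ps->has_migratepage)
                    return USE_MIGRATEPAGE;
            if (ps->dirty)
                    return WRITE_OUT_FIRST;
            if (ps->buffers_droppable)
                    return MIGRATE_DIRECTLY;
            if (pass > 4)
                    return SWAP_AS_LAST_RESORT;
            return RETRY_LATER;
    }
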
@@ -1176,9 +1201,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1176 | struct page *page; | 1201 | struct page *page; |
1177 | struct pagevec pvec; | 1202 | struct pagevec pvec; |
1178 | int reclaim_mapped = 0; | 1203 | int reclaim_mapped = 0; |
1179 | long mapped_ratio; | 1204 | |
1180 | long distress; | 1205 | if (unlikely(sc->may_swap)) { |
1181 | long swap_tendency; | 1206 | long mapped_ratio; |
1207 | long distress; | ||
1208 | long swap_tendency; | ||
1209 | |||
1210 | /* | ||
1211 | * `distress' is a measure of how much trouble we're having | ||
1212 | * reclaiming pages. 0 -> no problems. 100 -> great trouble. | ||
1213 | */ | ||
1214 | distress = 100 >> zone->prev_priority; | ||
1215 | |||
1216 | /* | ||
1217 | * The point of this algorithm is to decide when to start | ||
1218 | * reclaiming mapped memory instead of just pagecache. Work out | ||
1219 | * how much memory | ||
1220 | * is mapped. | ||
1221 | */ | ||
1222 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; | ||
1223 | |||
1224 | /* | ||
1225 | * Now decide how much we really want to unmap some pages. The | ||
1226 | * mapped ratio is downgraded - just because there's a lot of | ||
1227 | * mapped memory doesn't necessarily mean that page reclaim | ||
1228 | * isn't succeeding. | ||
1229 | * | ||
1230 | * The distress ratio is important - we don't want to start | ||
1231 | * going oom. | ||
1232 | * | ||
1233 | * A 100% value of vm_swappiness overrides this algorithm | ||
1234 | * altogether. | ||
1235 | */ | ||
1236 | swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; | ||
1237 | |||
1238 | /* | ||
1239 | * Now use this metric to decide whether to start moving mapped | ||
1240 | * memory onto the inactive list. | ||
1241 | */ | ||
1242 | if (swap_tendency >= 100) | ||
1243 | reclaim_mapped = 1; | ||
1244 | } | ||
1182 | 1245 | ||
1183 | lru_add_drain(); | 1246 | lru_add_drain(); |
1184 | spin_lock_irq(&zone->lru_lock); | 1247 | spin_lock_irq(&zone->lru_lock); |
@@ -1188,37 +1251,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
1188 | zone->nr_active -= pgmoved; | 1251 | zone->nr_active -= pgmoved; |
1189 | spin_unlock_irq(&zone->lru_lock); | 1252 | spin_unlock_irq(&zone->lru_lock); |
1190 | 1253 | ||
1191 | /* | ||
1192 | * `distress' is a measure of how much trouble we're having reclaiming | ||
1193 | * pages. 0 -> no problems. 100 -> great trouble. | ||
1194 | */ | ||
1195 | distress = 100 >> zone->prev_priority; | ||
1196 | |||
1197 | /* | ||
1198 | * The point of this algorithm is to decide when to start reclaiming | ||
1199 | * mapped memory instead of just pagecache. Work out how much memory | ||
1200 | * is mapped. | ||
1201 | */ | ||
1202 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; | ||
1203 | |||
1204 | /* | ||
1205 | * Now decide how much we really want to unmap some pages. The mapped | ||
1206 | * ratio is downgraded - just because there's a lot of mapped memory | ||
1207 | * doesn't necessarily mean that page reclaim isn't succeeding. | ||
1208 | * | ||
1209 | * The distress ratio is important - we don't want to start going oom. | ||
1210 | * | ||
1211 | * A 100% value of vm_swappiness overrides this algorithm altogether. | ||
1212 | */ | ||
1213 | swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; | ||
1214 | |||
1215 | /* | ||
1216 | * Now use this metric to decide whether to start moving mapped memory | ||
1217 | * onto the inactive list. | ||
1218 | */ | ||
1219 | if (swap_tendency >= 100) | ||
1220 | reclaim_mapped = 1; | ||
1221 | |||
1222 | while (!list_empty(&l_hold)) { | 1254 | while (!list_empty(&l_hold)) { |
1223 | cond_resched(); | 1255 | cond_resched(); |
1224 | page = lru_to_page(&l_hold); | 1256 | page = lru_to_page(&l_hold); |
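
The block relocated above computes swap_tendency = mapped_ratio / 2 + distress + vm_swappiness and starts reclaiming mapped pages once the sum reaches 100; after this change the whole computation is skipped when the caller left sc->may_swap clear. A worked, self-contained version of that arithmetic (the example inputs are just that, examples):

    #include <stdio.h>

    /*
     * Same formula as the code above: half the mapped ratio, plus the
     * reclaim "distress" (100 >> prev_priority), plus vm_swappiness.
     * Mapped pages become reclaimable once the sum reaches 100.
     */
    static int reclaim_mapped_p(long mapped_ratio, int prev_priority,
                                int vm_swappiness)
    {
            long distress = 100 >> prev_priority;
            long swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;

            return swap_tendency >= 100;
    }

    int main(void)
    {
            /* 40% of memory mapped, no reclaim trouble (prev_priority 12
             * -> distress 0), swappiness 60: 20 + 0 + 60 = 80, so mapped
             * pages are left alone. */
            printf("%d\n", reclaim_mapped_p(40, 12, 60));

            /* Under pressure (prev_priority 2 -> distress 25):
             * 20 + 25 + 60 = 105, so mapped pages become fair game. */
            printf("%d\n", reclaim_mapped_p(40, 2, 60));
            return 0;
    }
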
@@ -1595,9 +1627,7 @@ scan: | |||
1595 | sc.nr_reclaimed = 0; | 1627 | sc.nr_reclaimed = 0; |
1596 | sc.priority = priority; | 1628 | sc.priority = priority; |
1597 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; | 1629 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; |
1598 | atomic_inc(&zone->reclaim_in_progress); | ||
1599 | shrink_zone(zone, &sc); | 1630 | shrink_zone(zone, &sc); |
1600 | atomic_dec(&zone->reclaim_in_progress); | ||
1601 | reclaim_state->reclaimed_slab = 0; | 1631 | reclaim_state->reclaimed_slab = 0; |
1602 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1632 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
1603 | lru_pages); | 1633 | lru_pages); |
@@ -1859,7 +1889,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1859 | 1889 | ||
1860 | if (!(gfp_mask & __GFP_WAIT) || | 1890 | if (!(gfp_mask & __GFP_WAIT) || |
1861 | zone->all_unreclaimable || | 1891 | zone->all_unreclaimable || |
1862 | atomic_read(&zone->reclaim_in_progress) > 0) | 1892 | atomic_read(&zone->reclaim_in_progress) > 0 || |
1893 | (p->flags & PF_MEMALLOC)) | ||
1863 | return 0; | 1894 | return 0; |
1864 | 1895 | ||
1865 | node_id = zone->zone_pgdat->node_id; | 1896 | node_id = zone->zone_pgdat->node_id; |
@@ -1884,7 +1915,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1884 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | 1915 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; |
1885 | 1916 | ||
1886 | cond_resched(); | 1917 | cond_resched(); |
1887 | p->flags |= PF_MEMALLOC; | 1918 | /* |
1919 | * We need to be able to allocate from the reserves for RECLAIM_SWAP | ||
1920 | * and we also need to be able to write out pages for RECLAIM_WRITE | ||
1921 | * and RECLAIM_SWAP. | ||
1922 | */ | ||
1923 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | ||
1888 | reclaim_state.reclaimed_slab = 0; | 1924 | reclaim_state.reclaimed_slab = 0; |
1889 | p->reclaim_state = &reclaim_state; | 1925 | p->reclaim_state = &reclaim_state; |
1890 | 1926 | ||
@@ -1908,11 +1944,10 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1908 | * a long time. | 1944 | * a long time. |
1909 | */ | 1945 | */ |
1910 | shrink_slab(sc.nr_scanned, gfp_mask, order); | 1946 | shrink_slab(sc.nr_scanned, gfp_mask, order); |
1911 | sc.nr_reclaimed = 1; /* Avoid getting the off node timeout */ | ||
1912 | } | 1947 | } |
1913 | 1948 | ||
1914 | p->reclaim_state = NULL; | 1949 | p->reclaim_state = NULL; |
1915 | current->flags &= ~PF_MEMALLOC; | 1950 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
1916 | 1951 | ||
1917 | if (sc.nr_reclaimed == 0) | 1952 | if (sc.nr_reclaimed == 0) |
1918 | zone->last_unsuccessful_zone_reclaim = jiffies; | 1953 | zone->last_unsuccessful_zone_reclaim = jiffies; |
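
zone_reclaim() now refuses to recurse when the caller is itself already in reclaim (PF_MEMALLOC set), and while it runs it sets both PF_MEMALLOC and PF_SWAPWRITE so it may dip into the reserves and write pages to swap, clearing both flags on the way out. A minimal sketch of that set/clear-around-the-work flag pattern (the flag values and task structure are simplified stand-ins):

    #define PF_MEMALLOC_MODEL  0x1
    #define PF_SWAPWRITE_MODEL 0x2

    struct task_model {
            unsigned int flags;
    };

    /*
     * Guarded reclaim: bail out if we are already inside reclaim,
     * otherwise mark ourselves for the duration and restore afterwards.
     */
    static int reclaim_model(struct task_model *p)
    {
            int reclaimed = 0;

            if (p->flags & PF_MEMALLOC_MODEL)
                    return 0;   /* avoid reclaim recursion */

            p->flags |= PF_MEMALLOC_MODEL | PF_SWAPWRITE_MODEL;
            /* ... shrink caches, possibly writing pages to swap ... */
            p->flags &= ~(PF_MEMALLOC_MODEL | PF_SWAPWRITE_MODEL);

            return reclaimed;
    }
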