author	Kumar Gala <galak@kernel.crashing.org>	2006-03-20 12:58:02 -0500
committer	Kumar Gala <galak@kernel.crashing.org>	2006-03-20 12:58:02 -0500
commit	1a02e59a2970f9ed28ab51d3b08624b79e54d848 (patch)
tree	470cce472be3b08c160e0c569648e7228651b12a /mm
parent	ebcff3c773b42bce6182ec16485abca4e53fba97 (diff)
parent	2c276603c3e5ebf38155a9d1fbbda656d52d138e (diff)
Merge branch 'master'
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c          4
-rw-r--r--  mm/madvise.c         21
-rw-r--r--  mm/memory.c          10
-rw-r--r--  mm/memory_hotplug.c   1
-rw-r--r--  mm/mempolicy.c      182
-rw-r--r--  mm/nommu.c           10
-rw-r--r--  mm/oom_kill.c       124
-rw-r--r--  mm/page_alloc.c      63
-rw-r--r--  mm/rmap.c            21
-rw-r--r--  mm/shmem.c           81
-rw-r--r--  mm/slab.c           132
-rw-r--r--  mm/swap.c            27
-rw-r--r--  mm/vmscan.c         137
13 files changed, 599 insertions(+), 214 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67f29516662a..508707704d2c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -85,7 +85,7 @@ void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 
 	INIT_LIST_HEAD(&page->lru);
-	page[1].mapping = NULL;
+	page[1].lru.next = NULL;			/* reset dtor */
 
 	spin_lock(&hugetlb_lock);
 	enqueue_huge_page(page);
@@ -105,7 +105,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_count(page, 1);
-	page[1].mapping = (void *)free_huge_page;
+	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
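
Note: the hunks above rely on a trick worth spelling out. A compound (multi-page) allocation has no spare fields of its own, so its destructor is stashed in an otherwise unused field of the first tail page -- now page[1].lru.next instead of page[1].mapping, so that ->mapping keeps its normal meaning. A minimal userspace sketch of the pattern, with toy stand-ins for struct page (all names here are illustrative, not kernel API):

    #include <stdio.h>

    /* Toy stand-ins for struct page and its LRU linkage. */
    struct list_head { void *next, *prev; };
    struct page { struct list_head lru; };

    typedef void (*page_dtor_t)(struct page *);

    static void free_huge_page_stub(struct page *head)
    {
        printf("destructor ran for head page %p\n", (void *)head);
    }

    /* "set dtor": stash the destructor in the first tail page... */
    static void set_compound_dtor(struct page pages[], page_dtor_t dtor)
    {
        /* Function-to-object pointer casts are implementation-defined in
         * strict C; the kernel relies on them working, as we do here. */
        pages[1].lru.next = (void *)dtor;
    }

    /* ...and on the final put, fetch it back and invoke it. */
    static void put_compound(struct page pages[])
    {
        page_dtor_t dtor = (page_dtor_t)pages[1].lru.next;
        dtor(&pages[0]);
    }

    int main(void)
    {
        struct page huge[4];    /* a pretend order-2 compound page */

        set_compound_dtor(huge, free_huge_page_stub);
        put_compound(huge);
        return 0;
    }
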
diff --git a/mm/madvise.c b/mm/madvise.c
index ae0ae3ea299a..af3d573b0141 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -22,16 +22,23 @@ static long madvise_behavior(struct vm_area_struct * vma,
 	struct mm_struct * mm = vma->vm_mm;
 	int error = 0;
 	pgoff_t pgoff;
-	int new_flags = vma->vm_flags & ~VM_READHINTMASK;
+	int new_flags = vma->vm_flags;
 
 	switch (behavior) {
+	case MADV_NORMAL:
+		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
+		break;
 	case MADV_SEQUENTIAL:
-		new_flags |= VM_SEQ_READ;
+		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
 		break;
 	case MADV_RANDOM:
-		new_flags |= VM_RAND_READ;
+		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
 		break;
-	default:
+	case MADV_DONTFORK:
+		new_flags |= VM_DONTCOPY;
+		break;
+	case MADV_DOFORK:
+		new_flags &= ~VM_DONTCOPY;
 		break;
 	}
 
@@ -177,6 +184,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	long error;
 
 	switch (behavior) {
+	case MADV_DOFORK:
+		if (vma->vm_flags & VM_IO) {
+			error = -EINVAL;
+			break;
+		}
+	case MADV_DONTFORK:
 	case MADV_NORMAL:
 	case MADV_SEQUENTIAL:
 	case MADV_RANDOM:
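
Note: MADV_DONTFORK and MADV_DOFORK set and clear VM_DONTCOPY on the VMA, so the range is left out of child processes at fork() -- useful for buffers under DMA that a forked child must never touch. A hedged userspace sketch; the fallback #define is the value this patch series used on most architectures, but treat it as an assumption if your headers lack it:

    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/wait.h>
    #include <unistd.h>

    #ifndef MADV_DONTFORK
    #define MADV_DONTFORK 10    /* assumed value; check your headers */
    #endif

    int main(void)
    {
        size_t len = 4096;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Children forked after this call do not inherit the mapping. */
        if (madvise(buf, len, MADV_DONTFORK) != 0)
            perror("madvise(MADV_DONTFORK)");

        if (fork() == 0) {
            /* buf[0] = 1;  -- in the child this would fault: not copied */
            _exit(0);
        }
        wait(NULL);
        munmap(buf, len);
        return 0;
    }
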
diff --git a/mm/memory.c b/mm/memory.c
index 2bee1f21aa8a..9abc6008544b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -82,6 +82,16 @@ EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
 EXPORT_SYMBOL(vmalloc_earlyreserve);
 
+int randomize_va_space __read_mostly = 1;
+
+static int __init disable_randmaps(char *s)
+{
+	randomize_va_space = 0;
+	return 0;
+}
+__setup("norandmaps", disable_randmaps);
+
+
 /*
  * If a p?d_bad entry is found while walking page tables, report
  * the error, before resetting entry to p?d_none. Usually (but
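
Note: randomize_va_space defaults to on, and the new norandmaps boot parameter turns it off without a recompile. The effect is easy to observe from userspace by printing a stack address across runs -- it varies with randomization enabled and should stay constant when booted with "norandmaps". A trivial probe:

    #include <stdio.h>

    int main(void)
    {
        int probe;

        /* Randomized layout: this address changes between runs.
         * Booted with "norandmaps": it stays the same. */
        printf("stack is near %p\n", (void *)&probe);
        return 0;
    }
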
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f3..1fe76d963ac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 		onlined_pages++;
 	}
 	zone->present_pages += onlined_pages;
+	zone->zone_pgdat->node_present_pages += onlined_pages;
 
 	setup_per_zone_pages_min();
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3bd7fb7e4b75..2a8206009422 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* First put in the highest zones from all nodes, then all the next
+	   lower zones etc. Avoid empty zones because the memory allocator
+	   doesn't like them. If you implement node hot removal you
+	   have to fix that. */
+	for (k = policy_zone; k >= 0; k--) {
+		for_each_node_mask(nd, *nodes) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
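
Note: the ordering the new bind_zonelist() produces is: every node's highest non-empty zone first, then every node's next lower zone, and so on, terminated by NULL. A standalone toy re-creation over made-up per-node data (NR_NODES and present_pages are inventions for illustration):

    #include <stdio.h>

    #define NR_NODES    3
    #define NR_ZONES    2               /* toy stand-in for MAX_NR_ZONES */
    #define POLICY_ZONE (NR_ZONES - 1)

    /* present_pages[node][zone]; 0 marks an empty zone that must be skipped */
    static long present_pages[NR_NODES][NR_ZONES] = {
        { 1000, 50 },
        { 2000,  0 },                   /* node 1 has no pages in the high zone */
        {  500, 80 },
    };

    int main(void)
    {
        int order[NR_NODES * NR_ZONES][2];  /* resulting list as (node, zone) */
        int num = 0, k, nd;

        /* Highest zones from all nodes first, then the next lower zones. */
        for (k = POLICY_ZONE; k >= 0; k--)
            for (nd = 0; nd < NR_NODES; nd++)
                if (present_pages[nd][k] > 0) {
                    order[num][0] = nd;
                    order[num][1] = k;
                    num++;
                }

        for (k = 0; k < num; k++)
            printf("zone %d of node %d\n", order[k][1], order[k][0]);
        return 0;
    }
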
@@ -187,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
@@ -229,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			continue;
 
 		if (flags & MPOL_MF_STATS)
-			gather_stats(page, private);
+			gather_stats(page, private, pte_dirty(*pte));
 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(page, private, flags);
 		else
@@ -542,7 +552,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	 */
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (isolate_lru_page(page))
-			list_add(&page->lru, pagelist);
+			list_add_tail(&page->lru, pagelist);
 	}
 }
 
@@ -559,6 +569,7 @@ static int migrate_pages_to(struct list_head *pagelist,
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
 	int err = 0;
+	unsigned long offset = 0;
 	int nr_pages;
 	struct page *page;
 	struct list_head *p;
@@ -566,8 +577,21 @@ redo:
 	nr_pages = 0;
 	list_for_each(p, pagelist) {
-		if (vma)
-			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
 		else
 			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
@@ -575,9 +599,9 @@ redo:
 			err = -ENOMEM;
 			goto out;
 		}
-		list_add(&page->lru, &newlist);
+		list_add_tail(&page->lru, &newlist);
 		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE);
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
 			break;
 	}
 	err = migrate_pages(pagelist, &newlist, &moved, &failed);
@@ -724,7 +748,7 @@ long do_mbind(unsigned long start, unsigned long len,
 			MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 	    || mode > MPOL_MAX)
 		return -EINVAL;
-	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	if (start & ~PAGE_MASK)
@@ -798,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
+	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+		return -EINVAL;
 
 	nlongs = BITS_TO_LONGS(maxnode);
 	if ((maxnode % BITS_PER_LONG) == 0)
@@ -916,19 +942,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
 	    (current->uid != task->suid) && (current->uid != task->uid) &&
-	    !capable(CAP_SYS_ADMIN)) {
+	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	task_nodes = cpuset_mems_allowed(task);
 	/* Is the user allowed to access the target nodes? */
-	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
-	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+	err = do_migrate_pages(mm, &old, &new,
+		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
 	mmput(mm);
 	return err;
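
Note: with the permission changes above, moving another task's pages needs matching uids or CAP_SYS_NICE (no longer CAP_SYS_ADMIN), and CAP_SYS_NICE additionally upgrades the move to MPOL_MF_MOVE_ALL. A sketch of invoking the syscall directly, assuming your headers define __NR_migrate_pages (libc of this era shipped no wrapper):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        /* Move a task's pages from node 0 to node 1; the masks are plain
         * unsigned long bitmaps and maxnode is the number of bits. */
        unsigned long old_nodes = 1UL << 0;
        unsigned long new_nodes = 1UL << 1;
        pid_t pid = (argc > 1) ? (pid_t)atoi(argv[1]) : getpid();
        long rc;

        rc = syscall(__NR_migrate_pages, pid, 8 * sizeof(unsigned long),
                     &old_nodes, &new_nodes);
        if (rc < 0)
            perror("migrate_pages");
        else
            printf("%ld page(s) could not be moved\n", rc);
        return 0;
    }
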
@@ -1726,66 +1753,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 struct numa_maps {
 	unsigned long pages;
 	unsigned long anon;
-	unsigned long mapped;
+	unsigned long active;
+	unsigned long writeback;
 	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
 	unsigned long node[MAX_NUMNODES];
 };
 
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
 {
 	struct numa_maps *md = private;
 	int count = page_mapcount(page);
 
-	if (count)
-		md->mapped++;
+	md->pages++;
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
 
-	if (count > md->mapcount_max)
-		md->mapcount_max = count;
+	if (PageSwapCache(page))
+		md->swapcache++;
 
-	md->pages++;
+	if (PageActive(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
 
 	if (PageAnon(page))
 		md->anon++;
 
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
 	md->node[page_to_nid(page)]++;
 	cond_resched();
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+	unsigned long addr;
+	struct page *page;
+
+	for (addr = start; addr < end; addr += HPAGE_SIZE) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+		pte_t pte;
+
+		if (!ptep)
+			continue;
+
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		page = pte_page(pte);
+		if (!page)
+			continue;
+
+		gather_stats(page, md, pte_dirty(*ptep));
+	}
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+}
+#endif
+
 int show_numa_map(struct seq_file *m, void *v)
 {
 	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
 	int n;
 	char buffer[50];
 
-	if (!vma->vm_mm)
+	if (!mm)
 		return 0;
 
 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
 	if (!md)
 		return 0;
 
-	check_pgd_range(vma, vma->vm_start, vma->vm_end,
-		    &node_online_map, MPOL_MF_STATS, md);
+	mpol_to_str(buffer, sizeof(buffer),
+			get_vma_policy(task, vma, vma->vm_start));
 
-	if (md->pages) {
-		mpol_to_str(buffer, sizeof(buffer),
-			    get_vma_policy(task, vma, vma->vm_start));
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
-		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-			vma->vm_start, buffer, md->pages,
-			md->mapped, md->mapcount_max);
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
 
-		if (md->anon)
-			seq_printf(m," anon=%lu",md->anon);
+	if (is_vm_hugetlb_page(vma)) {
+		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+		seq_printf(m, " huge");
+	} else {
+		check_pgd_range(vma, vma->vm_start, vma->vm_end,
+			&node_online_map, MPOL_MF_STATS, md);
+	}
 
-		for_each_online_node(n)
-			if (md->node[n])
-				seq_printf(m, " N%d=%lu", n, md->node[n]);
+	if (!md->pages)
+		goto out;
 
-		seq_putc(m, '\n');
-	}
+	if (md->anon)
+		seq_printf(m," anon=%lu",md->anon);
+
+	if (md->dirty)
+		seq_printf(m," dirty=%lu",md->dirty);
+
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
+
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+	if (md->swapcache)
+		seq_printf(m," swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m," active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m," writeback=%lu", md->writeback);
+
+	for_each_online_node(n)
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+	seq_putc(m, '\n');
 	kfree(md);
 
 	if (m->count < m->size)
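
Note: with this rework each /proc/<pid>/numa_maps line carries the policy, the backing object and the per-node counters together, and zero-valued counters are simply omitted. Purely illustrative output (addresses, paths and counts are made up):

    2aaaaac00000 interleave=0-3 file=/lib/libfoo.so mapped=130 mapmax=34 N0=32 N1=34 N2=32 N3=32
    7fffffe00000 default stack anon=12 dirty=12 active=10 N0=12
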
diff --git a/mm/nommu.c b/mm/nommu.c
index c10262d68232..4951f4786f28 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,10 +53,11 @@ DECLARE_RWSEM(nommu_vma_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
-EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
+EXPORT_SYMBOL(vmap);
+EXPORT_SYMBOL(vunmap);
 
 /*
  * Handle all mappings that got truncated by a "truncate()"
@@ -203,6 +204,13 @@ void *vmalloc(unsigned long size)
 {
 	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
+EXPORT_SYMBOL(vmalloc);
+
+void *vmalloc_node(unsigned long size, int node)
+{
+	return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
 
 /*
  * vmalloc_32 - allocate virtually continguos memory (32bit addressable)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b05ab8f2a562..78747afad6b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,15 +58,17 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 
 	/*
 	 * Processes which fork a lot of child processes are likely
-	 * a good choice. We add the vmsize of the children if they
+	 * a good choice. We add half the vmsize of the children if they
 	 * have an own mm. This prevents forking servers to flood the
-	 * machine with an endless amount of children
+	 * machine with an endless amount of children. In case a single
+	 * child is eating the vast majority of memory, adding only half
+	 * to the parents will make the child our kill candidate of choice.
 	 */
 	list_for_each(tsk, &p->children) {
 		struct task_struct *chld;
 		chld = list_entry(tsk, struct task_struct, sibling);
 		if (chld->mm != p->mm && chld->mm)
-			points += chld->mm->total_vm;
+			points += chld->mm->total_vm/2 + 1;
 	}
 
 	/*
@@ -131,17 +133,47 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 }
 
 /*
+ * Types of limitations to the nodes from which allocations may occur
+ */
+#define CONSTRAINT_NONE 1
+#define CONSTRAINT_MEMORY_POLICY 2
+#define CONSTRAINT_CPUSET 3
+
+/*
+ * Determine the type of allocation constraint.
+ */
+static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
+{
+#ifdef CONFIG_NUMA
+	struct zone **z;
+	nodemask_t nodes = node_online_map;
+
+	for (z = zonelist->zones; *z; z++)
+		if (cpuset_zone_allowed(*z, gfp_mask))
+			node_clear((*z)->zone_pgdat->node_id,
+					nodes);
+		else
+			return CONSTRAINT_CPUSET;
+
+	if (!nodes_empty(nodes))
+		return CONSTRAINT_MEMORY_POLICY;
+#endif
+
+	return CONSTRAINT_NONE;
+}
+
+/*
  * Simple selection loop. We chose the process with the highest
 * number of 'points'. We expect the caller will lock the tasklist.
 *
 * (not docbooked, we don't want this one cluttering up the manual)
 */
-static struct task_struct * select_bad_process(void)
+static struct task_struct *select_bad_process(unsigned long *ppoints)
 {
-	unsigned long maxpoints = 0;
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	struct timespec uptime;
+	*ppoints = 0;
 
 	do_posix_clock_monotonic_gettime(&uptime);
 	do_each_thread(g, p) {
@@ -169,9 +201,9 @@ static struct task_struct * select_bad_process(void)
 			return p;
 
 		points = badness(p, uptime.tv_sec);
-		if (points > maxpoints || !chosen) {
+		if (points > *ppoints || !chosen) {
 			chosen = p;
-			maxpoints = points;
+			*ppoints = points;
 		}
 	} while_each_thread(g, p);
 	return chosen;
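
Note: constrained_alloc() is easiest to see with an example. Suppose a task is bound with MPOL_BIND to node 1 and node 1 runs dry while node 0 still has free memory: the machine as a whole is not out of memory, so killing the globally worst-scoring task (possibly one running happily on node 0) would be wrong. Because some online node is absent from the failing zonelist, the routine returns CONSTRAINT_MEMORY_POLICY and the caller kills current instead; a zonelist zone disallowed by the current cpuset likewise yields CONSTRAINT_CPUSET.
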
@@ -182,7 +214,7 @@ static struct task_struct * select_bad_process(void)
  * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
 * we select a process with CAP_SYS_RAW_IO set).
 */
-static void __oom_kill_task(task_t *p)
+static void __oom_kill_task(task_t *p, const char *message)
 {
 	if (p->pid == 1) {
 		WARN_ON(1);
@@ -198,8 +230,8 @@ static void __oom_kill_task(task_t *p)
 		return;
 	}
 	task_unlock(p);
-	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n",
-							p->pid, p->comm);
+	printk(KERN_ERR "%s: Killed process %d (%s).\n",
+						message, p->pid, p->comm);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -212,7 +244,7 @@ static void __oom_kill_task(task_t *p)
 	force_sig(SIGKILL, p);
 }
 
-static struct mm_struct *oom_kill_task(task_t *p)
+static struct mm_struct *oom_kill_task(task_t *p, const char *message)
 {
 	struct mm_struct *mm = get_task_mm(p);
 	task_t * g, * q;
@@ -224,35 +256,38 @@ static struct mm_struct *oom_kill_task(task_t *p)
 		return NULL;
 	}
 
-	__oom_kill_task(p);
+	__oom_kill_task(p, message);
 	/*
 	 * kill all processes that share the ->mm (i.e. all threads),
 	 * but are in a different thread group
 	 */
 	do_each_thread(g, q)
 		if (q->mm == mm && q->tgid != p->tgid)
-			__oom_kill_task(q);
+			__oom_kill_task(q, message);
 	while_each_thread(g, q);
 
 	return mm;
 }
 
-static struct mm_struct *oom_kill_process(struct task_struct *p)
+static struct mm_struct *oom_kill_process(struct task_struct *p,
+				unsigned long points, const char *message)
 {
 	struct mm_struct *mm;
 	struct task_struct *c;
 	struct list_head *tsk;
 
+	printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li and "
+		"children.\n", p->pid, p->comm, points);
 	/* Try to kill a child first */
 	list_for_each(tsk, &p->children) {
 		c = list_entry(tsk, struct task_struct, sibling);
 		if (c->mm == p->mm)
 			continue;
-		mm = oom_kill_task(c);
+		mm = oom_kill_task(c, message);
 		if (mm)
 			return mm;
 	}
-	return oom_kill_task(p);
+	return oom_kill_task(p, message);
 }
 
 /**
@@ -263,10 +298,11 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
  * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
-void out_of_memory(gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
-	task_t * p;
+	task_t *p;
+	unsigned long points = 0;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -277,24 +313,48 @@ void out_of_memory(gfp_t gfp_mask, int order)
 
 	cpuset_lock();
 	read_lock(&tasklist_lock);
+
+	/*
+	 * Check if there were limitations on the allocation (only relevant for
+	 * NUMA) that may require different handling.
+	 */
+	switch (constrained_alloc(zonelist, gfp_mask)) {
+	case CONSTRAINT_MEMORY_POLICY:
+		mm = oom_kill_process(current, points,
+				"No available memory (MPOL_BIND)");
+		break;
+
+	case CONSTRAINT_CPUSET:
+		mm = oom_kill_process(current, points,
+				"No available memory in cpuset");
+		break;
+
+	case CONSTRAINT_NONE:
 retry:
-	p = select_bad_process();
+	/*
+	 * Rambo mode: Shoot down a process and hope it solves whatever
+	 * issues we may have.
+	 */
+	p = select_bad_process(&points);
 
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
		cpuset_unlock();
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	mm = oom_kill_process(p);
+	mm = oom_kill_process(p, points, "Out of memory");
 	if (!mm)
 		goto retry;
+
+	break;
+	}
 
- out:
+out:
 	read_unlock(&tasklist_lock);
 	cpuset_unlock();
 	if (mm)
@@ -305,5 +365,5 @@ retry:
 	 * retry to allocate memory unless "p" is current
 	 */
 	if (!test_thread_flag(TIF_MEMDIE))
-		schedule_timeout_interruptible(1);
+		schedule_timeout_uninterruptible(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dde04ff4be31..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -56,6 +56,7 @@ long nr_swap_pages;
 int percpu_pagelist_fraction;
 
 static void fastcall free_hot_cold_page(struct page *page, int cold);
+static void __free_pages_ok(struct page *page, unsigned int order);
 
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
@@ -169,20 +170,23 @@ static void bad_page(struct page *page)
  * All pages have PG_compound set. All pages have their ->private pointing at
 * the head page (even the head page has this).
 *
- * The first tail page's ->mapping, if non-zero, holds the address of the
- * compound page's put_page() function.
- *
- * The order of the allocation is stored in the first tail page's ->index
- * This is only for debug at present. This usage means that zero-order pages
- * may not be compound.
+ * The first tail page's ->lru.next holds the address of the compound page's
+ * put_page() function. Its ->lru.prev holds the order of allocation.
+ * This usage means that zero-order pages may not be compound.
 */
+
+static void free_compound_page(struct page *page)
+{
+	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
+}
+
 static void prep_compound_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
 
-	page[1].mapping = NULL;
-	page[1].index = order;
+	page[1].lru.next = (void *)free_compound_page;	/* set dtor */
+	page[1].lru.prev = (void *)order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
@@ -196,7 +200,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (unlikely(page[1].index != order))
+	if (unlikely((unsigned long)page[1].lru.prev != order))
 		bad_page(page);
 
 	for (i = 0; i < nr_pages; i++) {
@@ -586,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
-void drain_remote_pages(void)
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
 {
-	struct zone *zone;
-	int i;
+	int i, z;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	for_each_zone(zone) {
+	for (z = 0; z < MAX_NR_ZONES; z++) {
+		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
 
-		/* Do not drain local pagesets */
-		if (zone->zone_pgdat->node_id == numa_node_id())
-			continue;
-
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
@@ -1011,7 +1014,7 @@ rebalance:
 	if (page)
 		goto got_pg;
 
-	out_of_memory(gfp_mask, order);
+	out_of_memory(zonelist, gfp_mask, order);
 	goto restart;
 }
 
@@ -1537,29 +1540,29 @@ static int __initdata node_load[MAX_NUMNODES];
 */
 static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
 {
-	int i, n, val;
+	int n, val;
 	int min_val = INT_MAX;
 	int best_node = -1;
 
-	for_each_online_node(i) {
-		cpumask_t tmp;
+	/* Use the local node if we haven't already */
+	if (!node_isset(node, *used_node_mask)) {
+		node_set(node, *used_node_mask);
+		return node;
+	}
 
-		/* Start from local node */
-		n = (node+i) % num_online_nodes();
+	for_each_online_node(n) {
+		cpumask_t tmp;
 
 		/* Don't want a node to appear more than once */
 		if (node_isset(n, *used_node_mask))
 			continue;
 
-		/* Use the local node if we haven't already */
-		if (!node_isset(node, *used_node_mask)) {
-			best_node = node;
-			break;
-		}
-
 		/* Use the distance array to find the distance */
 		val = node_distance(node, n);
 
+		/* Penalize nodes under us ("prefer the next node") */
+		val += (n < node);
+
 		/* Give preference to headless and unused nodes */
 		tmp = node_to_cpumask(n);
 		if (!cpus_empty(tmp))
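
Note: the fallback order this builds can be checked with a toy model. The sketch below re-creates just the distance-plus-penalty selection over a made-up 4-node distance table; the headless-node preference from the real code is dropped for brevity, and every name here is a stand-in:

    #include <limits.h>
    #include <stdio.h>

    #define NR_NODES 4

    /* Toy SLIT-style distance table (node_distance() stand-in). */
    static int distance[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
    };

    static int find_next_best_node(int node, unsigned *used)
    {
        int n, best = -1, min_val = INT_MAX;

        if (!(*used & (1u << node))) {      /* local node first */
            *used |= 1u << node;
            return node;
        }
        for (n = 0; n < NR_NODES; n++) {
            int val;

            if (*used & (1u << n))          /* each node appears once */
                continue;
            val = distance[node][n];
            val += (n < node);              /* penalize nodes "behind" us */
            if (val < min_val) {
                min_val = val;
                best = n;
            }
        }
        if (best >= 0)
            *used |= 1u << best;
        return best;
    }

    int main(void)
    {
        unsigned used = 0;
        int n, node = 2;

        /* Fallback order for node 2 with the tie-break above. */
        while ((n = find_next_best_node(node, &used)) >= 0)
            printf("%d ", n);
        printf("\n");   /* prints: 2 3 0 1 (ties broken toward higher nodes) */
        return 0;
    }
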
diff --git a/mm/rmap.c b/mm/rmap.c
index df2c41c2a9a2..67f0e20b101f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -212,25 +212,33 @@ out:
  * through real pte's pointing to valid pages and then releasing
 * the page from the swap cache.
 *
- * Must hold page lock on page.
+ * Must hold page lock on page and mmap_sem of one vma that contains
+ * the page.
 */
 void remove_from_swap(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
+	unsigned long mapping;
 
-	if (!PageAnon(page) || !PageSwapCache(page))
+	if (!PageSwapCache(page))
 		return;
 
-	anon_vma = page_lock_anon_vma(page);
-	if (!anon_vma)
+	mapping = (unsigned long)page->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 		return;
 
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 		remove_vma_swap(vma, page);
 
 	spin_unlock(&anon_vma->lock);
-
 	delete_from_swap_cache(page);
 }
 EXPORT_SYMBOL(remove_from_swap);
@@ -529,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-	BUG_ON(PageAnon(page));
-	BUG_ON(!pfn_valid(page_to_pfn(page)));
-
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_page_state(nr_mapped);
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index f7ac7b812f92..7c455fbaff7b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -45,6 +45,7 @@
 #include <linux/swapops.h>
 #include <linux/mempolicy.h>
 #include <linux/namei.h>
+#include <linux/ctype.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <asm/pgtable.h>
@@ -874,6 +875,51 @@ redirty:
 }
 
 #ifdef CONFIG_NUMA
+static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+	char *nodelist = strchr(value, ':');
+	int err = 1;
+
+	if (nodelist) {
+		/* NUL-terminate policy string */
+		*nodelist++ = '\0';
+		if (nodelist_parse(nodelist, *policy_nodes))
+			goto out;
+	}
+	if (!strcmp(value, "default")) {
+		*policy = MPOL_DEFAULT;
+		/* Don't allow a nodelist */
+		if (!nodelist)
+			err = 0;
+	} else if (!strcmp(value, "prefer")) {
+		*policy = MPOL_PREFERRED;
+		/* Insist on a nodelist of one node only */
+		if (nodelist) {
+			char *rest = nodelist;
+			while (isdigit(*rest))
+				rest++;
+			if (!*rest)
+				err = 0;
+		}
+	} else if (!strcmp(value, "bind")) {
+		*policy = MPOL_BIND;
+		/* Insist on a nodelist */
+		if (nodelist)
+			err = 0;
+	} else if (!strcmp(value, "interleave")) {
+		*policy = MPOL_INTERLEAVE;
+		/* Default to nodes online if no nodelist */
+		if (!nodelist)
+			*policy_nodes = node_online_map;
+		err = 0;
+	}
+out:
+	/* Restore string for error message */
+	if (nodelist)
+		*--nodelist = ':';
+	return err;
+}
+
 static struct page *shmem_swapin_async(struct shared_policy *p,
 			swp_entry_t entry, unsigned long idx)
 {
@@ -926,6 +972,11 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
 	return page;
 }
 #else
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+{
+	return 1;
+}
+
 static inline struct page *
 shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
 {
@@ -1859,7 +1910,23 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 {
 	char *this_char, *value, *rest;
 
-	while ((this_char = strsep(&options, ",")) != NULL) {
+	while (options != NULL) {
+		this_char = options;
+		for (;;) {
+			/*
+			 * NUL-terminate this option: unfortunately,
+			 * mount options form a comma-separated list,
+			 * but mpol's nodelist may also contain commas.
+			 */
+			options = strchr(options, ',');
+			if (options == NULL)
+				break;
+			options++;
+			if (!isdigit(*options)) {
+				options[-1] = '\0';
+				break;
+			}
+		}
 		if (!*this_char)
 			continue;
 		if ((value = strchr(this_char,'=')) != NULL) {
@@ -1910,18 +1977,8 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
 			if (*rest)
 				goto bad_val;
 		} else if (!strcmp(this_char,"mpol")) {
-			if (!strcmp(value,"default"))
-				*policy = MPOL_DEFAULT;
-			else if (!strcmp(value,"preferred"))
-				*policy = MPOL_PREFERRED;
-			else if (!strcmp(value,"bind"))
-				*policy = MPOL_BIND;
-			else if (!strcmp(value,"interleave"))
-				*policy = MPOL_INTERLEAVE;
-			else
+			if (shmem_parse_mpol(value,policy,policy_nodes))
 				goto bad_val;
-		} else if (!strcmp(this_char,"mpol_nodelist")) {
-			nodelist_parse(value, *policy_nodes);
 		} else {
 			printk(KERN_ERR "tmpfs: Bad mount option %s\n",
 			this_char);
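
Note: for users the net effect is that a tmpfs policy and its nodelist now travel in a single mount option, e.g. "mount -t tmpfs -o mpol=interleave:0-3 tmpfs /mnt" (replacing the old mpol= plus mpol_nodelist= pair). The rewritten splitter keeps an option like mpol=bind:0,2 intact because a comma followed by a digit is treated as part of the nodelist rather than as an option separator.
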
diff --git a/mm/slab.c b/mm/slab.c
index d66c2b0d9715..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
 	dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node == MAX_NUMNODES)
+		node = 0;
+
+	__get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Also drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= MAX_NUMNODES))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
 * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->func == NULL) {
+		init_reap_node(cpu);
 		INIT_WORK(reap_work, cache_reap, NULL);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
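
Note: each CPU now remembers a single remote node (reap_node) and advances it one step per cache_reap() tick, wrapping around the online-node mask, so alien caches and remote pagesets are trickled out instead of being drained all at once. The cycling logic, re-created standalone over a toy node bitmap (next_node/first_node below are simplified stand-ins for the kernel's nodemask helpers):

    #include <stdio.h>

    #define MAX_NODES 8

    /* Toy next_node(): the next set bit after 'node', or MAX_NODES. */
    static int next_node(int node, unsigned online)
    {
        int n;

        for (n = node + 1; n < MAX_NODES; n++)
            if (online & (1u << n))
                return n;
        return MAX_NODES;
    }

    static int first_node(unsigned online)
    {
        return next_node(-1, online);
    }

    int main(void)
    {
        unsigned online = 0x2d;     /* nodes 0, 2, 3, 5 online */
        int reap_node = first_node(online);
        int i;

        /* Each cache_reap() tick services one node, round robin. */
        for (i = 0; i < 8; i++) {
            printf("tick %d -> drain node %d\n", i, reap_node);
            reap_node = next_node(reap_node, online);
            if (reap_node >= MAX_NODES)
                reap_node = first_node(online);
        }
        return 0;
    }
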
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+	int node = __get_cpu_var(reap_node);
+
+	if (l3->alien) {
+		struct array_cache *ac = l3->alien[node];
+		if (ac && ac->avail) {
+			spin_lock_irq(&ac->lock);
+			__drain_alien_cache(cachep, ac, node);
+			spin_unlock_irq(&ac->lock);
+		}
+	}
+}
+
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 #else
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void)
 	struct cache_sizes *sizes;
 	struct cache_names *names;
 	int i;
+	int order;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void)
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-				&left_over, &cache_cache.num);
+	for (order = 0; order < MAX_ORDER; order++) {
+		cache_estimate(order, cache_cache.buffer_size,
+			cache_line_size(), 0, &left_over, &cache_cache.num);
+		if (cache_cache.num)
+			break;
+	}
 	if (!cache_cache.num)
 		BUG();
-
+	cache_cache.gfporder = order;
 	cache_cache.colour = left_over / cache_cache.colour_off;
 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
 				sizeof(struct slab), cache_line_size());
@@ -1628,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
+	int gfporder;
 
-	for (;; cachep->gfporder++) {
+	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
-		if (cachep->gfporder > MAX_GFP_ORDER) {
-			cachep->num = 0;
-			break;
-		}
-
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &remainder, &num);
+		cache_estimate(gfporder, size, align, flags, &remainder, &num);
 		if (!num)
 			continue;
+
 		/* More than offslab_limit objects will cause problems */
-		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
 			break;
 
+		/* Found something acceptable - save it away */
 		cachep->num = num;
+		cachep->gfporder = gfporder;
 		left_over = remainder;
 
 		/*
+		 * A VFS-reclaimable slab tends to have most allocations
+		 * as GFP_NOFS and we really don't want to have to be allocating
+		 * higher-order pages when we are unable to shrink dcache.
+		 */
+		if (flags & SLAB_RECLAIM_ACCOUNT)
+			break;
+
+		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (cachep->gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_break_gfp_order)
 			break;
 
-		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
-			/* Acceptable internal fragmentation */
+		/*
+		 * Acceptable internal fragmentation?
+		 */
+		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
@@ -1717,6 +1790,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG();
 	}
 
+	/*
+	 * Prevent CPUs from coming and going.
+	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 */
+	lock_cpu_hotplug();
+
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each(p, &cache_chain) {
@@ -1863,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	size = ALIGN(size, align);
 
-	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-		/*
-		 * A VFS-reclaimable slab tends to have most allocations
-		 * as GFP_NOFS and we really don't want to have to be allocating
-		 * higher-order pages when we are unable to shrink dcache.
-		 */
-		cachep->gfporder = 0;
-		cache_estimate(cachep->gfporder, size, align, flags,
-			       &left_over, &cachep->num);
-	} else
-		left_over = calculate_slab_order(cachep, size, align, flags);
+	left_over = calculate_slab_order(cachep, size, align, flags);
 
 	if (!cachep->num) {
 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1918,8 +1987,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->dtor = dtor;
 	cachep->name = name;
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
 
 	if (g_cpucache_up == FULL) {
 		enable_cpucache(cachep);
@@ -1978,12 +2045,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-	unlock_cpu_hotplug();
 oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 				name);
 	mutex_unlock(&cache_chain_mutex);
+	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2550,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 		"slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
 		cachep->name, cachep->num, slabp, slabp->inuse);
 	for (i = 0;
-	     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+	     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 	     i++) {
 		if ((i % 16) == 0)
 			printk("\n%03x:", i);
@@ -3490,8 +3557,7 @@ static void cache_reap(void *unused)
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
-		if (l3->alien)
-			drain_alien_cache(searchp, l3->alien);
+		reap_alien(searchp, l3);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3541,7 +3607,7 @@ static void cache_reap(void *unused)
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
-	drain_remote_pages();
+	next_reap_node();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
diff --git a/mm/swap.c b/mm/swap.c
index 76247424dea1..e9ec06d845e8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -40,7 +40,7 @@ static void put_compound_page(struct page *page)
 	if (put_page_testzero(page)) {
 		void (*dtor)(struct page *page);
 
-		dtor = (void (*)(struct page *))page[1].mapping;
+		dtor = (void (*)(struct page *))page[1].lru.next;
 		(*dtor)(page);
 	}
 }
@@ -489,13 +489,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
 	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
+		*pcount = 0;
 		spin_unlock(&fbc->lock);
-		count = 0;
+	} else {
+		*pcount = count;
 	}
-	*pcount = count;
 	put_cpu();
 }
 EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result.  This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+	long ret;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	ret = fbc->count;
+	for_each_cpu(cpu) {
+		long *pcount = per_cpu_ptr(fbc->counters, cpu);
+		ret += *pcount;
+	}
+	spin_unlock(&fbc->lock);
+	return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
 #endif
 
 /*
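
Note: two things change above. percpu_counter_mod now zeroes the per-cpu remainder under the lock when it folds a batch into the shared count (instead of afterwards, racily), and percpu_counter_sum is introduced to compute an exact, clamped-non-negative total by walking every CPU's remainder. A single-threaded model of the scheme (NR_CPUS and the plain array stand in for real per-cpu data; the kernel version also takes the per-cpu slot via get_cpu/put_cpu):

    #include <stdio.h>

    #define NR_CPUS   4
    #define FBC_BATCH 32

    struct percpu_counter {
        long count;                 /* shared, lock-protected in the kernel */
        long counters[NR_CPUS];     /* per-cpu deltas */
    };

    static void counter_mod(struct percpu_counter *fbc, int cpu, long amount)
    {
        long count = fbc->counters[cpu] + amount;

        if (count >= FBC_BATCH || count <= -FBC_BATCH) {
            fbc->count += count;        /* fold into the shared count */
            fbc->counters[cpu] = 0;     /* the fix: zero while "locked" */
        } else {
            fbc->counters[cpu] = count;
        }
    }

    /* Accurate but slow: fold in every per-cpu remainder, as the new
     * percpu_counter_sum() does. */
    static long counter_sum(struct percpu_counter *fbc)
    {
        long ret = fbc->count;
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++)
            ret += fbc->counters[cpu];
        return ret < 0 ? 0 : ret;
    }

    int main(void)
    {
        struct percpu_counter fbc = { 0 };
        int i;

        for (i = 0; i < 100; i++)
            counter_mod(&fbc, i % NR_CPUS, 1);
        /* 25 per cpu never reaches the batch, so the cheap count reads 0
         * while the exact sum is 100. */
        printf("approximate: %ld  exact: %ld\n", fbc.count, counter_sum(&fbc));
        return 0;
    }
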
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5a610804cd06..4fe7e3aa02e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -443,6 +443,10 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 		BUG_ON(PageActive(page));
 
 		sc->nr_scanned++;
+
+		if (!sc->may_swap && page_mapped(page))
+			goto keep_locked;
+
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -632,7 +636,7 @@ static int swap_page(struct page *page)
 	struct address_space *mapping = page_mapping(page);
 
 	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 0) != SWAP_SUCCESS)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 			goto unlock_retry;
 
 	if (PageDirty(page)) {
@@ -696,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage,
 	 * the page.
 	 */
 	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-		return 1;
+		return -EAGAIN;
 
 	/*
 	 * Establish swap ptes for anonymous pages or destroy pte
@@ -717,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage,
 	 * If the page was not migrated then the PageSwapCache bit
 	 * is still set and the operation may continue.
 	 */
-	try_to_unmap(page, 1);
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> Permanent failure */
+		return -EPERM;
 
 	/*
 	 * Give up if we were unable to remove all mappings.
 	 */
 	if (page_mapcount(page))
-		return 1;
+		return -EAGAIN;
 
 	write_lock_irq(&mapping->tree_lock);
 
@@ -734,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage,
 	if (!page_mapping(page) || page_count(page) != nr_refs ||
 			*radix_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
-		return 1;
+		return -EAGAIN;
 	}
 
 	/*
@@ -809,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy);
  */
 int migrate_page(struct page *newpage, struct page *page)
 {
+	int rc;
+
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	if (migrate_page_remove_references(newpage, page, 2))
-		return -EAGAIN;
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
 
 	migrate_page_copy(newpage, page);
 
@@ -839,7 +849,7 @@ EXPORT_SYMBOL(migrate_page);
  * pages are swapped out.
 *
 * The function returns after 10 attempts or if no pages
- * are movable anymore because t has become empty
+ * are movable anymore because to has become empty
 * or no retryable pages exist anymore.
 *
 * Return: Number of pages not migrated when "to" ran empty.
@@ -928,12 +938,21 @@ redo:
 		goto unlock_both;
 
 	if (mapping->a_ops->migratepage) {
+		/*
+		 * Most pages have a mapping and most filesystems
+		 * should provide a migration function. Anonymous
+		 * pages are part of swap space which also has its
+		 * own migration function. This is the most common
+		 * path for page migration.
+		 */
 		rc = mapping->a_ops->migratepage(newpage, page);
 		goto unlock_both;
 	}
 
 	/*
-	 * Trigger writeout if page is dirty
+	 * Default handling if a filesystem does not provide
+	 * a migration function. We can only migrate clean
+	 * pages so try to write out any dirty pages first.
 	 */
 	if (PageDirty(page)) {
 		switch (pageout(page, mapping)) {
@@ -949,9 +968,10 @@ redo:
 			; /* try to migrate the page below */
 		}
 	}
+
 	/*
-	 * If we have no buffer or can release the buffer
-	 * then do a simple migration.
+	 * Buffers are managed in a filesystem specific way.
+	 * We must have no buffers or drop them.
 	 */
 	if (!page_has_buffers(page) ||
 			try_to_release_page(page, GFP_KERNEL)) {
@@ -966,6 +986,11 @@ redo:
 	 * swap them out.
 	 */
 	if (pass > 4) {
+		/*
+		 * Persistently unable to drop buffers..... As a
+		 * measure of last resort we fall back to
+		 * swap_page().
+		 */
 		unlock_page(newpage);
 		newpage = NULL;
 		rc = swap_page(page);
@@ -1176,9 +1201,47 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	struct page *page;
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;
+
+	if (unlikely(sc->may_swap)) {
+		long mapped_ratio;
+		long distress;
+		long swap_tendency;
+
+		/*
+		 * `distress' is a measure of how much trouble we're having
+		 * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+		 */
+		distress = 100 >> zone->prev_priority;
+
+		/*
+		 * The point of this algorithm is to decide when to start
+		 * reclaiming mapped memory instead of just pagecache. Work out
+		 * how much memory
+		 * is mapped.
+		 */
+		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+
+		/*
+		 * Now decide how much we really want to unmap some pages. The
+		 * mapped ratio is downgraded - just because there's a lot of
+		 * mapped memory doesn't necessarily mean that page reclaim
+		 * isn't succeeding.
+		 *
+		 * The distress ratio is important - we don't want to start
+		 * going oom.
+		 *
+		 * A 100% value of vm_swappiness overrides this algorithm
+		 * altogether.
+		 */
+		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+
+		/*
+		 * Now use this metric to decide whether to start moving mapped
+		 * memory onto the inactive list.
+		 */
+		if (swap_tendency >= 100)
+			reclaim_mapped = 1;
+	}
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
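
Note: a quick worked example of the heuristic above: at prev_priority 2 the distress term is 100 >> 2 = 25; if 40% of memory is mapped, mapped_ratio / 2 contributes 20; with the default vm_swappiness of 60 that gives 20 + 25 + 60 = 105, which crosses the 100 threshold and sets reclaim_mapped. The same load with vm_swappiness 0 scores only 45, so only pagecache is deactivated. Guarding the whole calculation with sc->may_swap means callers that forbid swapping no longer compute a result that could never be acted on.
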
@@ -1188,37 +1251,6 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);
 
-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache. Work out how much memory
-	 * is mapped.
-	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
-	/*
-	 * Now decide how much we really want to unmap some pages. The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1595,9 +1627,7 @@ scan:
 		sc.nr_reclaimed = 0;
 		sc.priority = priority;
 		sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-		atomic_inc(&zone->reclaim_in_progress);
 		shrink_zone(zone, &sc);
-		atomic_dec(&zone->reclaim_in_progress);
 		reclaim_state->reclaimed_slab = 0;
 		nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 					lru_pages);
@@ -1859,7 +1889,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	if (!(gfp_mask & __GFP_WAIT) ||
 		zone->all_unreclaimable ||
-		atomic_read(&zone->reclaim_in_progress) > 0)
+		atomic_read(&zone->reclaim_in_progress) > 0 ||
+		(p->flags & PF_MEMALLOC))
 			return 0;
 
 	node_id = zone->zone_pgdat->node_id;
@@ -1884,7 +1915,12 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 
 	cond_resched();
-	p->flags |= PF_MEMALLOC;
+	/*
+	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
+	 * and we also need to be able to write out pages for RECLAIM_WRITE
+	 * and RECLAIM_SWAP.
+	 */
+	p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
@@ -1908,11 +1944,10 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * a long time.
 	 */
 	shrink_slab(sc.nr_scanned, gfp_mask, order);
-	sc.nr_reclaimed = 1;	/* Avoid getting the off node timeout */
 	}
 
 	p->reclaim_state = NULL;
-	current->flags &= ~PF_MEMALLOC;
+	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
 	if (sc.nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;