author	Steven Whitehouse <swhiteho@redhat.com>	2006-03-20 12:47:40 -0500
committer	Steven Whitehouse <swhiteho@redhat.com>	2006-03-20 12:47:40 -0500
commit	9a21247181d93fdf99255911845ecdb041d21583 (patch)
tree	6d69be36f8a2bd8c76bf02ead2f0121a511c0a92 /mm
parent	c752666c17f870fa8ae9f16804dd457e9e6daaec (diff)
parent	7705a8792b0fc82fd7d4dd923724606bbfd9fb20 (diff)
Merge branch 'master'
Diffstat (limited to 'mm')
-rw-r--r--	mm/memory.c	5
-rw-r--r--	mm/memory_hotplug.c	1
-rw-r--r--	mm/mempolicy.c	152
-rw-r--r--	mm/nommu.c	8
-rw-r--r--	mm/oom_kill.c	5
-rw-r--r--	mm/page_alloc.c	17
-rw-r--r--	mm/rmap.c	21
-rw-r--r--	mm/slab.c	122
-rw-r--r--	mm/swap.c	28
-rw-r--r--	mm/vmscan.c	21
10 files changed, 287 insertions, 93 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 9abc6008544b..85e80a57db29 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -623,11 +623,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			(*zap_work)--;
 			continue;
 		}
+
+		(*zap_work) -= PAGE_SIZE;
+
 		if (pte_present(ptent)) {
 			struct page *page;
 
-			(*zap_work) -= PAGE_SIZE;
-
 			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a918f77f02f3..1fe76d963ac2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -130,6 +130,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 		onlined_pages++;
 	}
 	zone->present_pages += onlined_pages;
+	zone->zone_pgdat->node_present_pages += onlined_pages;
 
 	setup_per_zone_pages_min();
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 67af4cea1e23..b21869a39f0b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -197,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 	return policy;
 }
 
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags);
 
@@ -239,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			continue;
 
 		if (flags & MPOL_MF_STATS)
-			gather_stats(page, private);
+			gather_stats(page, private, pte_dirty(*pte));
 		else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 			migrate_page_add(page, private, flags);
 		else
@@ -330,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
 
-	/* Clear the LRU lists so pages can be isolated */
-	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+		/* Must have swap device for migration */
+		if (nr_swap_pages <= 0)
+			return ERR_PTR(-ENODEV);
+
+		/*
+		 * Clear the LRU lists so pages can be isolated.
+		 * Note that pages may be moved off the LRU after we have
+		 * drained them. Those pages will fail to migrate like other
+		 * pages that may be busy.
+		 */
 		lru_add_drain_all();
+	}
 
 	first = find_vma(mm, start);
 	if (!first)
@@ -748,7 +758,7 @@ long do_mbind(unsigned long start, unsigned long len,
 				MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 	    || mode > MPOL_MAX)
 		return -EINVAL;
-	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+	if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 		return -EPERM;
 
 	if (start & ~PAGE_MASK)
@@ -942,19 +952,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
 	    (current->uid != task->suid) && (current->uid != task->uid) &&
-	    !capable(CAP_SYS_ADMIN)) {
+	    !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
 	task_nodes = cpuset_mems_allowed(task);
 	/* Is the user allowed to access the target nodes? */
-	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 		err = -EPERM;
 		goto out;
 	}
 
-	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+	err = do_migrate_pages(mm, &old, &new,
+		capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 out:
 	mmput(mm);
 	return err;
@@ -1752,66 +1763,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 struct numa_maps {
 	unsigned long pages;
 	unsigned long anon;
-	unsigned long mapped;
+	unsigned long active;
+	unsigned long writeback;
 	unsigned long mapcount_max;
+	unsigned long dirty;
+	unsigned long swapcache;
 	unsigned long node[MAX_NUMNODES];
 };
 
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
 {
 	struct numa_maps *md = private;
 	int count = page_mapcount(page);
 
-	if (count)
-		md->mapped++;
+	md->pages++;
+	if (pte_dirty || PageDirty(page))
+		md->dirty++;
 
-	if (count > md->mapcount_max)
-		md->mapcount_max = count;
+	if (PageSwapCache(page))
+		md->swapcache++;
 
-	md->pages++;
+	if (PageActive(page))
+		md->active++;
+
+	if (PageWriteback(page))
+		md->writeback++;
 
 	if (PageAnon(page))
 		md->anon++;
 
+	if (count > md->mapcount_max)
+		md->mapcount_max = count;
+
 	md->node[page_to_nid(page)]++;
 	cond_resched();
 }
 
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+	unsigned long addr;
+	struct page *page;
+
+	for (addr = start; addr < end; addr += HPAGE_SIZE) {
+		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+		pte_t pte;
+
+		if (!ptep)
+			continue;
+
+		pte = *ptep;
+		if (pte_none(pte))
+			continue;
+
+		page = pte_page(pte);
+		if (!page)
+			continue;
+
+		gather_stats(page, md, pte_dirty(*ptep));
+	}
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+		unsigned long start, unsigned long end,
+		struct numa_maps *md)
+{
+}
+#endif
+
 int show_numa_map(struct seq_file *m, void *v)
 {
 	struct task_struct *task = m->private;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
+	struct file *file = vma->vm_file;
+	struct mm_struct *mm = vma->vm_mm;
 	int n;
 	char buffer[50];
 
-	if (!vma->vm_mm)
+	if (!mm)
 		return 0;
 
 	md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
 	if (!md)
 		return 0;
 
-	check_pgd_range(vma, vma->vm_start, vma->vm_end,
-			&node_online_map, MPOL_MF_STATS, md);
+	mpol_to_str(buffer, sizeof(buffer),
+			get_vma_policy(task, vma, vma->vm_start));
 
-	if (md->pages) {
-		mpol_to_str(buffer, sizeof(buffer),
-			get_vma_policy(task, vma, vma->vm_start));
+	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
 
-		seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
-			vma->vm_start, buffer, md->pages,
-			md->mapped, md->mapcount_max);
+	if (file) {
+		seq_printf(m, " file=");
+		seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+		seq_printf(m, " heap");
+	} else if (vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack) {
+		seq_printf(m, " stack");
+	}
 
-		if (md->anon)
-			seq_printf(m," anon=%lu",md->anon);
+	if (is_vm_hugetlb_page(vma)) {
+		check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+		seq_printf(m, " huge");
+	} else {
+		check_pgd_range(vma, vma->vm_start, vma->vm_end,
+			&node_online_map, MPOL_MF_STATS, md);
+	}
 
-		for_each_online_node(n)
-			if (md->node[n])
-				seq_printf(m, " N%d=%lu", n, md->node[n]);
+	if (!md->pages)
+		goto out;
 
-		seq_putc(m, '\n');
-	}
+	if (md->anon)
+		seq_printf(m," anon=%lu",md->anon);
+
+	if (md->dirty)
+		seq_printf(m," dirty=%lu",md->dirty);
+
+	if (md->pages != md->anon && md->pages != md->dirty)
+		seq_printf(m, " mapped=%lu", md->pages);
+
+	if (md->mapcount_max > 1)
+		seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+	if (md->swapcache)
+		seq_printf(m," swapcache=%lu", md->swapcache);
+
+	if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+		seq_printf(m," active=%lu", md->active);
+
+	if (md->writeback)
+		seq_printf(m," writeback=%lu", md->writeback);
+
+	for_each_online_node(n)
+		if (md->node[n])
+			seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+	seq_putc(m, '\n');
 	kfree(md);
 
 	if (m->count < m->size)
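
The mempolicy.c hunks above rework the /proc/<pid>/numa_maps output: gather_stats() now counts dirty, swapcache, active and writeback pages alongside the per-node totals, and show_numa_map() prints one line per VMA, emitting only the fields that are non-zero. Below is a minimal user-space sketch of that tally-and-print pattern; the struct, the four-node setup and the sample numbers are invented for illustration, and this is not kernel code.

    /*
     * Illustrative user-space model of the per-VMA accounting done by
     * gather_stats()/show_numa_map() above.  Names and sizes are made up.
     */
    #include <stdio.h>

    #define EXAMPLE_NR_NODES 4

    struct numa_stats {
    	unsigned long pages, anon, dirty, mapcount_max;
    	unsigned long node[EXAMPLE_NR_NODES];
    };

    /* One call per examined page, like gather_stats(). */
    static void tally_page(struct numa_stats *md, int nid, int anon, int dirty,
    		       unsigned long mapcount)
    {
    	md->pages++;
    	if (anon)
    		md->anon++;
    	if (dirty)
    		md->dirty++;
    	if (mapcount > md->mapcount_max)
    		md->mapcount_max = mapcount;
    	md->node[nid]++;
    }

    /* Emit one line, printing only non-zero fields, like show_numa_map(). */
    static void print_stats(unsigned long vm_start, const struct numa_stats *md)
    {
    	int n;

    	printf("%08lx default", vm_start);
    	if (md->anon)
    		printf(" anon=%lu", md->anon);
    	if (md->dirty)
    		printf(" dirty=%lu", md->dirty);
    	if (md->mapcount_max > 1)
    		printf(" mapmax=%lu", md->mapcount_max);
    	for (n = 0; n < EXAMPLE_NR_NODES; n++)
    		if (md->node[n])
    			printf(" N%d=%lu", n, md->node[n]);
    	printf("\n");
    }

    int main(void)
    {
    	struct numa_stats md = { 0 };

    	tally_page(&md, 0, 1, 1, 1);
    	tally_page(&md, 1, 1, 0, 3);
    	/* prints: 02000000 default anon=2 dirty=1 mapmax=3 N0=1 N1=1 */
    	print_stats(0x2000000, &md);
    	return 0;
    }
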
diff --git a/mm/nommu.c b/mm/nommu.c
index 99d21020ec9d..4951f4786f28 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -53,7 +53,6 @@ DECLARE_RWSEM(nommu_vma_sem);
 struct vm_operations_struct generic_file_vm_ops = {
 };
 
-EXPORT_SYMBOL(vmalloc);
 EXPORT_SYMBOL(vfree);
 EXPORT_SYMBOL(vmalloc_to_page);
 EXPORT_SYMBOL(vmalloc_32);
@@ -205,6 +204,13 @@ void *vmalloc(unsigned long size)
 {
 	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
 }
+EXPORT_SYMBOL(vmalloc);
+
+void *vmalloc_node(unsigned long size, int node)
+{
+	return vmalloc(size);
+}
+EXPORT_SYMBOL(vmalloc_node);
 
 /*
  * vmalloc_32 - allocate virtually continguos memory (32bit addressable)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8123fad5a485..78747afad6b0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -302,7 +302,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t *p;
-	unsigned long points;
+	unsigned long points = 0;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
@@ -355,6 +355,7 @@ retry:
 	}
 
 out:
+	read_unlock(&tasklist_lock);
 	cpuset_unlock();
 	if (mm)
 		mmput(mm);
@@ -364,5 +365,5 @@ out:
 	 * retry to allocate memory unless "p" is current
 	 */
 	if (!test_thread_flag(TIF_MEMDIE))
-		schedule_timeout_interruptible(1);
+		schedule_timeout_uninterruptible(1);
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 791690d7d3fa..234bd4895d14 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -590,21 +590,20 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 }
 
 #ifdef CONFIG_NUMA
-/* Called from the slab reaper to drain remote pagesets */
-void drain_remote_pages(void)
+/*
+ * Called from the slab reaper to drain pagesets on a particular node that
+ * belong to the currently executing processor.
+ */
+void drain_node_pages(int nodeid)
 {
-	struct zone *zone;
-	int i;
+	int i, z;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	for_each_zone(zone) {
+	for (z = 0; z < MAX_NR_ZONES; z++) {
+		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
 
-		/* Do not drain local pagesets */
-		if (zone->zone_pgdat->node_id == numa_node_id())
-			continue;
-
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
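
The page_alloc.c change above turns drain_remote_pages() into drain_node_pages(nodeid): instead of walking every zone in the system and skipping the local node, the caller names one node and only that node's zones have the current CPU's cached pages flushed. A small user-space model of that shape, with invented node/zone/CPU counts:

    /*
     * User-space sketch (not kernel code) of the drain_node_pages() shape:
     * walk only the zones of one node and flush the calling CPU's cached
     * pages for each of them.  All names and sizes are invented.
     */
    #include <stdio.h>

    #define NR_NODES	2
    #define ZONES_PER_NODE	3
    #define NR_CPUS	4

    /* cached_pages[node][zone][cpu]: pages sitting in a per-cpu list */
    static int cached_pages[NR_NODES][ZONES_PER_NODE][NR_CPUS];

    static int drain_node(int nodeid, int cpu)
    {
    	int z, freed = 0;

    	for (z = 0; z < ZONES_PER_NODE; z++) {
    		freed += cached_pages[nodeid][z][cpu];
    		cached_pages[nodeid][z][cpu] = 0;	/* give pages back */
    	}
    	return freed;
    }

    int main(void)
    {
    	cached_pages[1][0][2] = 7;
    	cached_pages[1][2][2] = 3;

    	printf("drained %d pages from node 1 for cpu 2\n", drain_node(1, 2));
    	return 0;
    }
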
diff --git a/mm/rmap.c b/mm/rmap.c
index df2c41c2a9a2..67f0e20b101f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -212,25 +212,33 @@ out:
  * through real pte's pointing to valid pages and then releasing
  * the page from the swap cache.
  *
- * Must hold page lock on page.
+ * Must hold page lock on page and mmap_sem of one vma that contains
+ * the page.
  */
 void remove_from_swap(struct page *page)
 {
 	struct anon_vma *anon_vma;
 	struct vm_area_struct *vma;
+	unsigned long mapping;
 
-	if (!PageAnon(page) || !PageSwapCache(page))
+	if (!PageSwapCache(page))
 		return;
 
-	anon_vma = page_lock_anon_vma(page);
-	if (!anon_vma)
+	mapping = (unsigned long)page->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 		return;
 
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 		remove_vma_swap(vma, page);
 
 	spin_unlock(&anon_vma->lock);
-
 	delete_from_swap_cache(page);
 }
 EXPORT_SYMBOL(remove_from_swap);
@@ -529,9 +537,6 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-	BUG_ON(PageAnon(page));
-	BUG_ON(!pfn_valid(page_to_pfn(page)));
-
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_page_state(nr_mapped);
 }
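
The rmap.c hunk above stops calling page_lock_anon_vma() and decodes page->mapping directly: for anonymous pages the anon_vma pointer is stored there with the low PAGE_MAPPING_ANON bit set as a tag, so masking the bit off recovers the pointer. The following user-space sketch models that low-bit pointer tagging; the struct and field names are invented for illustration.

    /*
     * User-space sketch of the low-bit pointer tagging that
     * remove_from_swap() relies on: an anon_vma pointer is stored in
     * page->mapping with bit 0 set.  Names here are invented.
     */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAPPING_ANON	0x1UL	/* plays the role of PAGE_MAPPING_ANON */

    struct fake_anon_vma { int id; };

    struct fake_page {
    	/* either a file mapping pointer, or (anon_vma | MAPPING_ANON) */
    	unsigned long mapping;
    };

    static void set_anon_mapping(struct fake_page *page, struct fake_anon_vma *av)
    {
    	/* pointers are at least 2-byte aligned, so bit 0 is free as a tag */
    	assert(((uintptr_t)av & MAPPING_ANON) == 0);
    	page->mapping = (unsigned long)av | MAPPING_ANON;
    }

    static struct fake_anon_vma *get_anon_vma(const struct fake_page *page)
    {
    	if (!page->mapping || !(page->mapping & MAPPING_ANON))
    		return NULL;		/* not an anonymous page */
    	return (struct fake_anon_vma *)(page->mapping - MAPPING_ANON);
    }

    int main(void)
    {
    	struct fake_anon_vma av = { .id = 42 };
    	struct fake_page page;

    	set_anon_mapping(&page, &av);
    	printf("decoded anon_vma id = %d\n", get_anon_vma(&page)->id);
    	return 0;
    }
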
diff --git a/mm/slab.c b/mm/slab.c
index add05d808a4a..d0bd7f07ab04 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -789,6 +789,47 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, char *
 	dump_stack();
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * Special reaping functions for NUMA systems called from cache_reap().
+ * These take care of doing round robin flushing of alien caches (containing
+ * objects freed on different nodes from which they were allocated) and the
+ * flushing of remote pcps by calling drain_node_pages.
+ */
+static DEFINE_PER_CPU(unsigned long, reap_node);
+
+static void init_reap_node(int cpu)
+{
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node == MAX_NUMNODES)
+		node = 0;
+
+	__get_cpu_var(reap_node) = node;
+}
+
+static void next_reap_node(void)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Also drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= MAX_NUMNODES))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+}
+
+#else
+#define init_reap_node(cpu) do { } while (0)
+#define next_reap_node(void) do { } while (0)
+#endif
+
 /*
  * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
  * via the workqueue/eventd.
@@ -806,6 +847,7 @@ static void __devinit start_cpu_timer(int cpu)
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->func == NULL) {
+		init_reap_node(cpu);
 		INIT_WORK(reap_work, cache_reap, NULL);
 		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 	}
@@ -884,6 +926,23 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 	}
 }
 
+/*
+ * Called from cache_reap() to regularly drain alien caches round robin.
+ */
+static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
+{
+	int node = __get_cpu_var(reap_node);
+
+	if (l3->alien) {
+		struct array_cache *ac = l3->alien[node];
+		if (ac && ac->avail) {
+			spin_lock_irq(&ac->lock);
+			__drain_alien_cache(cachep, ac, node);
+			spin_unlock_irq(&ac->lock);
+		}
+	}
+}
+
 static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
 {
 	int i = 0;
@@ -902,6 +961,7 @@ static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **al
 #else
 
 #define drain_alien_cache(cachep, alien) do { } while (0)
+#define reap_alien(cachep, l3) do { } while (0)
 
 static inline struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -1124,6 +1184,7 @@ void __init kmem_cache_init(void)
 	struct cache_sizes *sizes;
 	struct cache_names *names;
 	int i;
+	int order;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++) {
 		kmem_list3_init(&initkmem_list3[i]);
@@ -1167,11 +1228,15 @@ void __init kmem_cache_init(void)
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
 
-	cache_estimate(0, cache_cache.buffer_size, cache_line_size(), 0,
-				&left_over, &cache_cache.num);
+	for (order = 0; order < MAX_ORDER; order++) {
+		cache_estimate(order, cache_cache.buffer_size,
+			cache_line_size(), 0, &left_over, &cache_cache.num);
+		if (cache_cache.num)
+			break;
+	}
 	if (!cache_cache.num)
 		BUG();
-
+	cache_cache.gfporder = order;
 	cache_cache.colour = left_over / cache_cache.colour_off;
 	cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
 				      sizeof(struct slab), cache_line_size());
@@ -1628,36 +1693,44 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
+	int gfporder;
 
-	for (;; cachep->gfporder++) {
+	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
-		if (cachep->gfporder > MAX_GFP_ORDER) {
-			cachep->num = 0;
-			break;
-		}
-
-		cache_estimate(cachep->gfporder, size, align, flags,
-				&remainder, &num);
+		cache_estimate(gfporder, size, align, flags, &remainder, &num);
 		if (!num)
 			continue;
+
 		/* More than offslab_limit objects will cause problems */
-		if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
+		if ((flags & CFLGS_OFF_SLAB) && num > offslab_limit)
 			break;
 
+		/* Found something acceptable - save it away */
 		cachep->num = num;
+		cachep->gfporder = gfporder;
 		left_over = remainder;
 
 		/*
+		 * A VFS-reclaimable slab tends to have most allocations
+		 * as GFP_NOFS and we really don't want to have to be allocating
+		 * higher-order pages when we are unable to shrink dcache.
+		 */
+		if (flags & SLAB_RECLAIM_ACCOUNT)
+			break;
+
+		/*
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (cachep->gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_break_gfp_order)
 			break;
 
-		if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
-			/* Acceptable internal fragmentation */
+		/*
+		 * Acceptable internal fragmentation?
+		 */
+		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
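
The rewritten calculate_slab_order() above walks page orders from 0 upward, keeps the first order where at least one object fits, and then stops early either for SLAB_RECLAIM_ACCOUNT caches or once internal fragmentation is acceptable (leftover space no more than 1/8 of the slab). A standalone model of the fragmentation part of that search, with made-up sizes:

    /*
     * Standalone model (not the kernel function) of the order search in
     * calculate_slab_order(): pick the smallest page order whose leftover
     * space is at most 1/8 of the slab, once at least one object fits.
     */
    #include <stdio.h>

    #define PAGE_SIZE_BYTES	4096UL
    #define MAX_ORDER_MODEL	10

    static int pick_order(unsigned long obj_size, unsigned long *left_over_out)
    {
    	int order;

    	for (order = 0; order <= MAX_ORDER_MODEL; order++) {
    		unsigned long slab = PAGE_SIZE_BYTES << order;
    		unsigned long num = slab / obj_size;
    		unsigned long left_over = slab - num * obj_size;

    		if (!num)
    			continue;	/* object does not fit yet */

    		if (left_over * 8 <= slab) {	/* acceptable fragmentation */
    			*left_over_out = left_over;
    			return order;
    		}
    	}
    	return -1;
    }

    int main(void)
    {
    	unsigned long left_over;
    	int order = pick_order(3000, &left_over);

    	printf("order=%d left_over=%lu\n", order, left_over);	/* order=2 */
    	return 0;
    }
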
@@ -1869,17 +1942,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	size = ALIGN(size, align);
 
-	if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
-		/*
-		 * A VFS-reclaimable slab tends to have most allocations
-		 * as GFP_NOFS and we really don't want to have to be allocating
-		 * higher-order pages when we are unable to shrink dcache.
-		 */
-		cachep->gfporder = 0;
-		cache_estimate(cachep->gfporder, size, align, flags,
-				&left_over, &cachep->num);
-	} else
-		left_over = calculate_slab_order(cachep, size, align, flags);
+	left_over = calculate_slab_order(cachep, size, align, flags);
 
 	if (!cachep->num) {
 		printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -2554,7 +2617,7 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
 		       cachep->name, cachep->num, slabp, slabp->inuse);
 	for (i = 0;
-	     i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
+	     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 	     i++) {
 		if ((i % 16) == 0)
 			printk("\n%03x:", i);
@@ -3494,8 +3557,7 @@ static void cache_reap(void *unused)
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
-		if (l3->alien)
-			drain_alien_cache(searchp, l3->alien);
+		reap_alien(searchp, l3);
 		spin_lock_irq(&l3->list_lock);
 
 		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
@@ -3545,7 +3607,7 @@ static void cache_reap(void *unused)
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
-	drain_remote_pages();
+	next_reap_node();
 	/* Setup the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
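
The slab.c additions give each CPU a round-robin cursor over the online nodes: every cache_reap() pass drains one remote node's alien caches (reap_alien()) and its per-cpu pages (drain_node_pages()), then next_reap_node() advances the cursor, wrapping back to the first online node. A user-space sketch of that cursor advance, with an invented online-node bitmap:

    /*
     * User-space sketch of the round-robin node cursor used by
     * init_reap_node()/next_reap_node() above.  The "online" bitmap and
     * node count are invented; this is a model, not kernel code.
     */
    #include <stdio.h>

    #define MODEL_MAX_NODES 8

    static const int online[MODEL_MAX_NODES] = { 1, 0, 1, 1, 0, 0, 1, 0 };

    /* next online node strictly after 'node', or MODEL_MAX_NODES if none */
    static int next_online(int node)
    {
    	for (node++; node < MODEL_MAX_NODES; node++)
    		if (online[node])
    			return node;
    	return MODEL_MAX_NODES;
    }

    static int first_online(void)
    {
    	return next_online(-1);
    }

    /* advance the per-cpu cursor, wrapping past the last online node */
    static int advance_reap_node(int cur)
    {
    	int node = next_online(cur);

    	if (node >= MODEL_MAX_NODES)
    		node = first_online();
    	return node;
    }

    int main(void)
    {
    	int node = first_online();	/* like init_reap_node() */
    	int i;

    	for (i = 0; i < 6; i++) {
    		printf("reap pass %d drains node %d\n", i, node);
    		node = advance_reap_node(node);
    	}
    	return 0;
    }
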
diff --git a/mm/swap.c b/mm/swap.c
index cce3dda59c59..b524ea90bddb 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -393,7 +393,8 @@ void pagevec_strip(struct pagevec *pvec)
 		struct page *page = pvec->pages[i];
 
 		if (PagePrivate(page) && !TestSetPageLocked(page)) {
-			try_to_release_page(page, 0);
+			if (PagePrivate(page))
+				try_to_release_page(page, 0);
 			unlock_page(page);
 		}
 	}
@@ -489,13 +490,34 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount)
 	if (count >= FBC_BATCH || count <= -FBC_BATCH) {
 		spin_lock(&fbc->lock);
 		fbc->count += count;
+		*pcount = 0;
 		spin_unlock(&fbc->lock);
-		count = 0;
+	} else {
+		*pcount = count;
 	}
-	*pcount = count;
 	put_cpu();
 }
 EXPORT_SYMBOL(percpu_counter_mod);
+
+/*
+ * Add up all the per-cpu counts, return the result. This is a more accurate
+ * but much slower version of percpu_counter_read_positive()
+ */
+long percpu_counter_sum(struct percpu_counter *fbc)
+{
+	long ret;
+	int cpu;
+
+	spin_lock(&fbc->lock);
+	ret = fbc->count;
+	for_each_cpu(cpu) {
+		long *pcount = per_cpu_ptr(fbc->counters, cpu);
+		ret += *pcount;
+	}
+	spin_unlock(&fbc->lock);
+	return ret < 0 ? 0 : ret;
+}
+EXPORT_SYMBOL(percpu_counter_sum);
 #endif
 
 /*
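
The swap.c hunk fixes percpu_counter_mod() so the per-cpu slot is zeroed whenever its batch is folded into the shared count, and adds percpu_counter_sum(), which walks every per-cpu slot under the lock for an exact total. A user-space model of that batching pattern, with an invented CPU count and batch size:

    /*
     * User-space model of the percpu_counter pattern touched above: per-CPU
     * deltas are folded into a shared count only when they exceed a batch
     * size, and an exact sum walks every per-CPU slot under the lock.
     */
    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS_MODEL	4
    #define BATCH		32

    struct model_counter {
    	pthread_mutex_t lock;
    	long count;			/* global, approximate total */
    	long percpu[NR_CPUS_MODEL];	/* unflushed per-CPU deltas */
    };

    static void counter_mod(struct model_counter *c, int cpu, long amount)
    {
    	long count = c->percpu[cpu] + amount;

    	if (count >= BATCH || count <= -BATCH) {
    		pthread_mutex_lock(&c->lock);
    		c->count += count;
    		c->percpu[cpu] = 0;	/* slot was folded in, reset it */
    		pthread_mutex_unlock(&c->lock);
    	} else {
    		c->percpu[cpu] = count;
    	}
    }

    /* accurate but slower: fold in every per-CPU slot */
    static long counter_sum(struct model_counter *c)
    {
    	long ret;
    	int cpu;

    	pthread_mutex_lock(&c->lock);
    	ret = c->count;
    	for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
    		ret += c->percpu[cpu];
    	pthread_mutex_unlock(&c->lock);
    	return ret < 0 ? 0 : ret;
    }

    int main(void)
    {
    	struct model_counter c = { .lock = PTHREAD_MUTEX_INITIALIZER };

    	counter_mod(&c, 0, 10);		/* stays in the per-CPU slot */
    	counter_mod(&c, 1, 40);		/* exceeds BATCH, folded into count */
    	printf("approx=%ld exact=%ld\n", c.count, counter_sum(&c));
    	return 0;
    }
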
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0af7593d01e..4fe7e3aa02e2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -700,7 +700,7 @@ int migrate_page_remove_references(struct page *newpage,
 	 * the page.
 	 */
 	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-		return 1;
+		return -EAGAIN;
 
 	/*
 	 * Establish swap ptes for anonymous pages or destroy pte
@@ -721,13 +721,15 @@ int migrate_page_remove_references(struct page *newpage,
 	 * If the page was not migrated then the PageSwapCache bit
 	 * is still set and the operation may continue.
 	 */
-	try_to_unmap(page, 1);
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> Permanent failure */
+		return -EPERM;
 
 	/*
 	 * Give up if we were unable to remove all mappings.
 	 */
 	if (page_mapcount(page))
-		return 1;
+		return -EAGAIN;
 
 	write_lock_irq(&mapping->tree_lock);
 
@@ -738,7 +740,7 @@ int migrate_page_remove_references(struct page *newpage,
 	if (!page_mapping(page) || page_count(page) != nr_refs ||
 			*radix_pointer != page) {
 		write_unlock_irq(&mapping->tree_lock);
-		return 1;
+		return -EAGAIN;
 	}
 
 	/*
@@ -813,10 +815,14 @@ EXPORT_SYMBOL(migrate_page_copy);
  */
 int migrate_page(struct page *newpage, struct page *page)
 {
+	int rc;
+
 	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
 
-	if (migrate_page_remove_references(newpage, page, 2))
-		return -EAGAIN;
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
 
 	migrate_page_copy(newpage, page);
 
@@ -1883,7 +1889,8 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	if (!(gfp_mask & __GFP_WAIT) ||
 		zone->all_unreclaimable ||
-		atomic_read(&zone->reclaim_in_progress) > 0)
+		atomic_read(&zone->reclaim_in_progress) > 0 ||
+		(p->flags & PF_MEMALLOC))
 		return 0;
 
 	node_id = zone->zone_pgdat->node_id;
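
The vmscan.c changes make migrate_page_remove_references() report distinct errors: -EAGAIN for transient conditions (extra references, mappings that could not all be removed, a changed radix-tree slot) and -EPERM when a VM_LOCKED vma makes unmapping permanently impossible, with migrate_page() now passing the code through. The sketch below shows how a caller might treat the two cases differently; try_migrate_once() is a stand-in, and the retry policy is illustrative only.

    /*
     * Illustrative caller-side sketch of acting on the distinct error codes
     * introduced above: retry on -EAGAIN, give up at once on -EPERM.
     */
    #include <errno.h>
    #include <stdio.h>

    /* Stand-in for one migration attempt; returns 0, -EAGAIN or -EPERM. */
    static int try_migrate_once(int attempt)
    {
    	return attempt < 2 ? -EAGAIN : 0;	/* succeed on the third try */
    }

    static int migrate_with_retries(int max_retries)
    {
    	int attempt, rc = -EAGAIN;

    	for (attempt = 0; attempt <= max_retries; attempt++) {
    		rc = try_migrate_once(attempt);
    		if (rc != -EAGAIN)
    			break;		/* success, or a permanent failure */
    	}
    	return rc;
    }

    int main(void)
    {
    	int rc = migrate_with_retries(5);

    	if (rc == -EPERM)
    		printf("permanent failure (e.g. VM_LOCKED mapping)\n");
    	else if (rc)
    		printf("still busy after retries: %d\n", rc);
    	else
    		printf("migrated\n");
    	return 0;
    }
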