Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             4
-rw-r--r--  mm/backing-dev.c      16
-rw-r--r--  mm/filemap.c          26
-rw-r--r--  mm/filemap_xip.c      22
-rw-r--r--  mm/hugetlb.c          24
-rw-r--r--  mm/madvise.c           6
-rw-r--r--  mm/memory.c           25
-rw-r--r--  mm/mempolicy.c        51
-rw-r--r--  mm/mempool.c           3
-rw-r--r--  mm/mlock.c             5
-rw-r--r--  mm/mmap.c             38
-rw-r--r--  mm/mremap.c           13
-rw-r--r--  mm/nommu.c             7
-rw-r--r--  mm/page-writeback.c   10
-rw-r--r--  mm/page_alloc.c      332
-rw-r--r--  mm/rmap.c             24
-rw-r--r--  mm/shmem.c            44
-rw-r--r--  mm/slab.c             67
-rw-r--r--  mm/slob.c            538
-rw-r--r--  mm/slub.c            142
-rw-r--r--  mm/sparse.c           42
-rw-r--r--  mm/swap_state.c        2
-rw-r--r--  mm/swapfile.c          2
-rw-r--r--  mm/truncate.c         42
-rw-r--r--  mm/vmstat.c            2
25 files changed, 1009 insertions, 478 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 62e5d0d0bd5a..086af703da43 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -168,3 +168,7 @@ config NR_QUICK
 	depends on QUICKLIST
 	default "2" if (SUPERH && !SUPERH64)
 	default "1"
+
+config VIRT_TO_BUS
+	def_bool y
+	depends on !ARCH_NO_VIRT_TO_BUS
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e5de3781d3fe..f50a2811f9dc 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
-long congestion_wait_interruptible(int rw, long timeout)
-{
-	long ret;
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
-	if (signal_pending(current))
-		ret = -ERESTARTSYS;
-	else
-		ret = io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
-	return ret;
-}
-EXPORT_SYMBOL(congestion_wait_interruptible);
-
 /**
  * congestion_end - wake up sleepers on a congested backing_dev_info
  * @rw: READ or WRITE
diff --git a/mm/filemap.c b/mm/filemap.c
index edb1b0b5cc8d..100b99c2d504 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page)
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
+	BUG_ON(page_mapped(page));
 }
 
 void remove_from_page_cache(struct page *page)
@@ -1218,6 +1219,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 				retval = retval ?: desc.error;
 				break;
 			}
+			if (desc.count > 0)
+				break;
 		}
 	}
 out:
@@ -1245,26 +1248,6 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o
 	return written;
 }
 
-ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
-			 size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_generic_file_read(in_file, ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL(generic_file_sendfile);
-
 static ssize_t
 do_readahead(struct address_space *mapping, struct file *filp,
 	     unsigned long index, unsigned long nr)
@@ -1786,7 +1769,6 @@ retry:
 	page = __read_cache_page(mapping, index, filler, data);
 	if (IS_ERR(page))
 		return page;
-	mark_page_accessed(page);
 	if (PageUptodate(page))
 		goto out;
 
@@ -1985,7 +1967,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 	if (unlikely(*pos + *count > MAX_NON_LFS &&
 				!(file->f_flags & O_LARGEFILE))) {
 		if (*pos >= MAX_NON_LFS) {
-			send_sig(SIGXFSZ, current, 0);
 			return -EFBIG;
 		}
 		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
@@ -2003,7 +1984,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 	if (likely(!isblk)) {
 		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
 			if (*count || *pos > inode->i_sb->s_maxbytes) {
-				send_sig(SIGXFSZ, current, 0);
 				return -EFBIG;
 			}
 			/* zero-length writes at ->s_maxbytes are OK */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index fa360e566d88..65ffc321f0c0 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -159,28 +159,6 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 }
 EXPORT_SYMBOL_GPL(xip_file_read);
 
-ssize_t
-xip_file_sendfile(struct file *in_file, loff_t *ppos,
-	     size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
-			    ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL_GPL(xip_file_sendfile);
-
 /*
  * __xip_unmap is invoked from xip_unmap and
  * xip_write
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eb7180db3033..acc0fb3cf067 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,7 +66,7 @@ static void enqueue_huge_page(struct page *page)
 static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 				unsigned long address)
 {
-	int nid = numa_node_id();
+	int nid;
 	struct page *page = NULL;
 	struct zonelist *zonelist = huge_zonelist(vma, address);
 	struct zone **z;
@@ -101,13 +101,20 @@ static void free_huge_page(struct page *page)
 
 static int alloc_fresh_huge_page(void)
 {
-	static int nid = 0;
+	static int prev_nid;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
-			HUGETLB_PAGE_ORDER);
-	nid = next_node(nid, node_online_map);
+	static DEFINE_SPINLOCK(nid_lock);
+	int nid;
+
+	spin_lock(&nid_lock);
+	nid = next_node(prev_nid, node_online_map);
 	if (nid == MAX_NUMNODES)
 		nid = first_node(node_online_map);
+	prev_nid = nid;
+	spin_unlock(&nid_lock);
+
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+			HUGETLB_PAGE_ORDER);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
@@ -326,9 +333,10 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 	pte_t entry;
 
 	entry = pte_mkwrite(pte_mkdirty(*ptep));
-	ptep_set_access_flags(vma, address, ptep, entry, 1);
-	update_mmu_cache(vma, address, entry);
-	lazy_mmu_prot_update(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+	}
 }
 
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 60542d006ec1..93ee375b38e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -287,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 	struct vm_area_struct * vma, *prev;
 	int unmapped_error = 0;
 	int error = -EINVAL;
+	int write;
 	size_t len;
 
-	if (madvise_need_mmap_write(behavior))
+	write = madvise_need_mmap_write(behavior);
+	if (write)
 		down_write(&current->mm->mmap_sem);
 	else
 		down_read(&current->mm->mmap_sem);
@@ -354,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 		vma = find_vma(current->mm, start);
 	}
 out:
-	if (madvise_need_mmap_write(behavior))
+	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
 		up_read(&current->mm->mmap_sem);
diff --git a/mm/memory.c b/mm/memory.c
index cb94488ab96d..b3d73bb1f680 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
  * and ZONE_HIGHMEM.
  */
 void * high_memory;
-unsigned long vmalloc_earlyreserve;
 
 EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 int randomize_va_space __read_mostly = 1;
 
@@ -1055,6 +1053,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 	do {
 		struct page *page;
 
+		/*
+		 * If tsk is ooming, cut off its access to large memory
+		 * allocations. It has a pending SIGKILL, but it can't
+		 * be processed until returning to user space.
+		 */
+		if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+			return -ENOMEM;
+
 		if (write)
 			foll_flags |= FOLL_WRITE;
 
@@ -1691,9 +1697,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		ptep_set_access_flags(vma, address, page_table, entry, 1);
-		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
+		if (ptep_set_access_flags(vma, address, page_table, entry,1)) {
+			update_mmu_cache(vma, address, entry);
+			lazy_mmu_prot_update(entry);
+		}
 		ret |= VM_FAULT_WRITE;
 		goto unlock;
 	}
@@ -2525,10 +2532,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		pte_t *pte, pmd_t *pmd, int write_access)
 {
 	pte_t entry;
-	pte_t old_entry;
 	spinlock_t *ptl;
 
-	old_entry = entry = *pte;
+	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
@@ -2561,8 +2567,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	if (!pte_same(old_entry, entry)) {
-		ptep_set_access_flags(vma, address, pte, entry, write_access);
+	if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
 	} else {
@@ -2674,7 +2679,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	write = (vma->vm_flags & VM_WRITE) != 0;
 	BUG_ON(addr >= end);
 	BUG_ON(end > vma->vm_end);
-	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
 	ret = get_user_pages(current, current->mm, addr,
 			len, write, 0, NULL, NULL);
 	if (ret < 0)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d76e8eb342d0..188f8d9c4aed 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -101,8 +101,6 @@
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
 
-#define PDprintk(fmt...)
-
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
 enum zone_type policy_zone = 0;
@@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+	pr_debug("setting mode %d nodes[0] %lx\n",
+		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
+
 	if (mode == MPOL_DEFAULT)
 		return NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 	int err = 0;
 	struct mempolicy *old = vma->vm_policy;
 
-	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 		 vma->vm_ops, vma->vm_file,
 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
@@ -776,8 +776,8 @@ long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-		mode,nodes_addr(nodes)[0]);
+	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
 
 	down_write(&mm->mmap_sem);
 	vma = check_range(mm, start, end, nmask,
@@ -1434,7 +1434,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
 	}
 	rb_link_node(&new->nd, parent, p);
 	rb_insert_color(&new->nd, &sp->root);
-	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
 		 new->policy ? new->policy->policy : 0);
 }
 
@@ -1459,7 +1459,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
-	PDprintk("deleting %lx-l%x\n", n->start, n->end);
+	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
 	rb_erase(&n->nd, &sp->root);
 	mpol_free(n->policy);
 	kmem_cache_free(sn_cache, n);
@@ -1558,10 +1558,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	struct sp_node *new = NULL;
 	unsigned long sz = vma_pages(vma);
 
-	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1597,6 +1597,10 @@ void mpol_free_shared_policy(struct shared_policy *p)
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
+	nodemask_t interleave_nodes;
+	unsigned long largest = 0;
+	int nid, prefer = 0;
+
 	policy_cache = kmem_cache_create("numa_policy",
 					 sizeof(struct mempolicy),
 					 0, SLAB_PANIC, NULL, NULL);
@@ -1605,10 +1609,31 @@ void __init numa_policy_init(void)
 				     sizeof(struct sp_node),
 				     0, SLAB_PANIC, NULL, NULL);
 
-	/* Set interleaving policy for system init. This way not all
-	   the data structures allocated at system boot end up in node zero. */
+	/*
+	 * Set interleaving policy for system init. Interleaving is only
+	 * enabled across suitably sized nodes (default is >= 16MB), or
+	 * fall back to the largest node if they're all smaller.
+	 */
+	nodes_clear(interleave_nodes);
+	for_each_online_node(nid) {
+		unsigned long total_pages = node_present_pages(nid);
+
+		/* Preserve the largest node */
+		if (largest < total_pages) {
+			largest = total_pages;
+			prefer = nid;
+		}
+
+		/* Interleave this node? */
+		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+			node_set(nid, interleave_nodes);
+	}
+
+	/* All too small, use the largest */
+	if (unlikely(nodes_empty(interleave_nodes)))
+		node_set(prefer, interleave_nodes);
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
diff --git a/mm/mempool.c b/mm/mempool.c
index cc1ca86dfc24..3e8f1fed0e1f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -263,6 +263,9 @@ void mempool_free(void *element, mempool_t *pool)
 {
 	unsigned long flags;
 
+	if (unlikely(element == NULL))
+		return;
+
 	smp_mb();
 	if (pool->curr_nr < pool->min_nr) {
 		spin_lock_irqsave(&pool->lock, flags);
diff --git a/mm/mlock.c b/mm/mlock.c
index 4d3fea267e0d..7b2656055d6a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -244,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user)
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	if (lock_limit == RLIM_INFINITY)
+		allowed = 1;
 	lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+	if (!allowed &&
+	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
 		goto out;
 	get_uid(user);
 	user->locked_shm += locked;
diff --git a/mm/mmap.c b/mm/mmap.c
index 68b9ad2ef1d6..144b4a290f2c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			unsigned long flags, unsigned long pgoff)
 {
 	struct mm_struct * mm = current->mm;
-	struct vm_area_struct * vma, * prev;
 	struct inode *inode;
 	unsigned int vm_flags;
-	int correct_wcount = 0;
 	int error;
-	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
-	unsigned long charged = 0, reqprot = prot;
+	unsigned long reqprot = prot;
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1023,10 +1020,28 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		}
 	}
 
-	error = security_file_mmap(file, reqprot, prot, flags);
+	error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (error)
 		return error;
 
+	return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+			   accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+			  unsigned long len, unsigned long flags,
+			  unsigned int vm_flags, unsigned long pgoff,
+			  int accountable)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	int correct_wcount = 0;
+	int error;
+	struct rb_node **rb_link, *rb_parent;
+	unsigned long charged = 0;
+	struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
@@ -1175,8 +1190,6 @@ unacct_error:
 	return error;
 }
 
-EXPORT_SYMBOL(do_mmap_pgoff);
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
@@ -1536,9 +1549,14 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode. We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 * Also guard against wrapping around to address 0.
 	 */
-	address += 4 + PAGE_SIZE - 1;
-	address &= PAGE_MASK;
+	if (address < PAGE_ALIGN(address+4))
+		address = PAGE_ALIGN(address+4);
+	else {
+		anon_vma_unlock(vma);
+		return -ENOMEM;
+	}
 	error = 0;
 
 	/* Somebody else might have raced and expanded it already */
diff --git a/mm/mremap.c b/mm/mremap.c
index 5d4bd4f95b8e..bc7c52efc71b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;
 
+		ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+		if (ret)
+			goto out;
+
 		ret = do_munmap(mm, new_addr, new_len);
 		if (ret)
 			goto out;
@@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr,
 
 			new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
 						vma->vm_pgoff, map_flags);
-			ret = new_addr;
-			if (new_addr & ~PAGE_MASK)
+			if (new_addr & ~PAGE_MASK) {
+				ret = new_addr;
+				goto out;
+			}
+
+			ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+			if (ret)
 				goto out;
 		}
 		ret = move_vma(vma, addr, old_len, new_len, new_addr);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b16b00a5b11..8bbbf147a794 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -367,6 +367,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
 	return find_vma(mm, addr);
 }
 
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	return -ENOMEM;
+}
+
 /*
  * look up the first VMA exactly that exactly matches addr
  * - should be called with mm->mmap_sem at least held readlocked
@@ -639,7 +644,7 @@ static int validate_mmap_request(struct file *file,
 	}
 
 	/* allow the security API to have its say */
-	ret = security_file_mmap(file, reqprot, prot, flags);
+	ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (ret < 0)
 		return ret;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eec1481ba44f..ea9da3bed3e9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg)
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_interval) {
-		mod_timer(&wb_timer,
-			jiffies + dirty_writeback_interval);
-	} else {
+	if (dirty_writeback_interval)
+		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	else
 		del_timer(&wb_timer);
-	}
 	return 0;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd8e33582d25..f9e4e647d7e8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,13 @@ static unsigned long __meminitdata dma_reserve;
 #endif
 #endif
 
-  struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
-  int __meminitdata nr_nodemap_entries;
-  unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-  unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+  static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
+  static int __meminitdata nr_nodemap_entries;
+  static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+  static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-  unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
-  unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
+  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
@@ -900,11 +900,13 @@ static struct fail_page_alloc_attr {
 
 	u32 ignore_gfp_highmem;
 	u32 ignore_gfp_wait;
+	u32 min_order;
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 
 	struct dentry *ignore_gfp_highmem_file;
 	struct dentry *ignore_gfp_wait_file;
+	struct dentry *min_order_file;
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 
@@ -912,6 +914,7 @@ static struct fail_page_alloc_attr {
 	.attr = FAULT_ATTR_INITIALIZER,
 	.ignore_gfp_wait = 1,
 	.ignore_gfp_highmem = 1,
+	.min_order = 1,
 };
 
 static int __init setup_fail_page_alloc(char *str)
@@ -922,6 +925,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc);
 
 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
+	if (order < fail_page_alloc.min_order)
+		return 0;
 	if (gfp_mask & __GFP_NOFAIL)
 		return 0;
 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
@@ -953,12 +958,17 @@ static int __init fail_page_alloc_debugfs(void)
 	fail_page_alloc.ignore_gfp_highmem_file =
 		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
 				      &fail_page_alloc.ignore_gfp_highmem);
+	fail_page_alloc.min_order_file =
+		debugfs_create_u32("min-order", mode, dir,
+				   &fail_page_alloc.min_order);
 
 	if (!fail_page_alloc.ignore_gfp_wait_file ||
-	    !fail_page_alloc.ignore_gfp_highmem_file) {
+	    !fail_page_alloc.ignore_gfp_highmem_file ||
+	    !fail_page_alloc.min_order_file) {
 		err = -ENOMEM;
 		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
 		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+		debugfs_remove(fail_page_alloc.min_order_file);
 		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
 	}
 
@@ -1621,8 +1631,8 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
-	struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+				int nr_zones, enum zone_type zone_type)
 {
 	struct zone *zone;
 
@@ -1641,9 +1651,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
 	return nr_zones;
 }
 
+
+/*
+ * zonelist_order:
+ * 0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT 0
+#define ZONELIST_ORDER_NODE 1
+#define ZONELIST_ORDER_ZONE 2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
 #ifdef CONFIG_NUMA
+/* The value user specified ....changed by config */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN 16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * interface for configure zonelist ordering.
+ * command line option "numa_zonelist_order"
+ *	= "[dD]efault	- default, automatic configuration.
+ *	= "[nN]ode	- order by node locality, then by zone within node
+ *	= "[zZ]one	- order by zone, then by locality within zone
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+	if (*s == 'd' || *s == 'D') {
+		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+	} else if (*s == 'n' || *s == 'N') {
+		user_zonelist_order = ZONELIST_ORDER_NODE;
+	} else if (*s == 'z' || *s == 'Z') {
+		user_zonelist_order = ZONELIST_ORDER_ZONE;
+	} else {
+		printk(KERN_WARNING
+			"Ignoring invalid numa_zonelist_order value: "
+			"%s\n", s);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+	if (s)
+		return __parse_numa_zonelist_order(s);
+	return 0;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	int ret;
+
+	if (write)
+		strncpy(saved_string, (char*)table->data,
+			NUMA_ZONELIST_ORDER_LEN);
+	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		int oldval = user_zonelist_order;
+		if (__parse_numa_zonelist_order((char*)table->data)) {
+			/*
+			 * bogus value. restore saved string
+			 */
+			strncpy((char*)table->data, saved_string,
+				NUMA_ZONELIST_ORDER_LEN);
+			user_zonelist_order = oldval;
+		} else if (oldval != user_zonelist_order)
+			build_all_zonelists();
+	}
+	return 0;
+}
+
+
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
+
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1658,7 +1761,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -1704,13 +1807,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 	return best_node;
 }
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 {
-	int j, node, local_node;
 	enum zone_type i;
-	int prev_node, load;
+	int j;
 	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		for (j = 0; zonelist->zones[j] != NULL; j++)
+			;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
+		zonelist->zones[j] = NULL;
+	}
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+	enum zone_type i;
+	int pos, j, node;
+	int zone_type;		/* needs to be signed */
+	struct zone *z;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < nr_nodes; j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z)) {
+					zonelist->zones[pos++] = z;
+					check_highest_zone(zone_type);
+				}
+			}
+		}
+		zonelist->zones[pos] = NULL;
+	}
+}
+
+static int default_zonelist_order(void)
+{
+	int nid, zone_type;
+	unsigned long low_kmem_size,total_size;
+	struct zone *z;
+	int average_size;
+	/*
+	 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+	 * If they are really small and used heavily, the system can fall
+	 * into OOM very easily.
+	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 */
+	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+	low_kmem_size = 0;
+	total_size = 0;
+	for_each_online_node(nid) {
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+	}
+	if (!low_kmem_size ||  /* there are no DMA area. */
+	    low_kmem_size > total_size/2)  /* DMA/DMA32 is big. */
+		return ZONELIST_ORDER_NODE;
+	/*
+	 * look into each node's config.
+	 * If there is a node whose DMA/DMA32 memory is very big area on
+	 * local memory, NODE_ORDER may be suitable.
+	 */
+	average_size = total_size / (num_online_nodes() + 1);
+	for_each_online_node(nid) {
+		low_kmem_size = 0;
+		total_size = 0;
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+		if (low_kmem_size &&
+		    total_size > average_size && /* ignore small node */
+		    low_kmem_size > total_size * 70/100)
+			return ZONELIST_ORDER_NODE;
+	}
+	return ZONELIST_ORDER_ZONE;
+}
+
+static void set_zonelist_order(void)
+{
+	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+		current_zonelist_order = default_zonelist_order();
+	else
+		current_zonelist_order = user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+	int j, node, load;
+	enum zone_type i;
 	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zonelist *zonelist;
+	int order = current_zonelist_order;
 
 	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1723,6 +1942,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	load = num_online_nodes();
 	prev_node = local_node;
 	nodes_clear(used_mask);
+
+	memset(node_load, 0, sizeof(node_load));
+	memset(node_order, 0, sizeof(node_order));
+	j = 0;
+
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		int distance = node_distance(local_node, node);
 
@@ -1738,23 +1962,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-
 		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
+			node_load[node] = load;
+
 		prev_node = node;
 		load--;
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
+		if (order == ZONELIST_ORDER_NODE)
+			build_zonelists_in_node_order(pgdat, node);
+		else
+			node_order[j++] = node;	/* remember order */
+	}
 
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-			zonelist->zones[j] = NULL;
-		}
+	if (order == ZONELIST_ORDER_ZONE) {
+		/* calculate node order -- i.e., DMA last! */
+		build_zonelists_in_zone_order(pgdat, j);
 	}
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1771,9 +1997,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 	}
 }
 
+
 #else /* CONFIG_NUMA */
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+static void set_zonelist_order(void)
+{
+	current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
 	enum zone_type i,j;
@@ -1809,7 +2041,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1820,7 +2052,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
@@ -1831,8 +2063,10 @@ static int __meminit __build_all_zonelists(void *dummy)
 	return 0;
 }
 
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
 {
+	set_zonelist_order();
+
 	if (system_state == SYSTEM_BOOTING) {
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
@@ -1843,8 +2077,13 @@ void __meminit build_all_zonelists(void)
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
-	printk("Built %i zonelists. Total pages: %ld\n",
-			num_online_nodes(), vm_total_pages);
+	printk("Built %i zonelists in %s order. Total pages: %ld\n",
+			num_online_nodes(),
+			zonelist_order_name[current_zonelist_order],
+			vm_total_pages);
+#ifdef CONFIG_NUMA
+	printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
 }
 
 /*
@@ -1953,8 +2192,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	}
 }
 
-void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
-		unsigned long size)
+static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
+				struct zone *zone, unsigned long size)
 {
 	int order;
 	for (order = 0; order < MAX_ORDER ; order++) {
@@ -1968,7 +2207,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone,
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int __cpuinit zone_batchsize(struct zone *zone)
+static int __devinit zone_batchsize(struct zone *zone)
 {
 	int batch;
 
@@ -2370,7 +2609,7 @@ void __init push_node_boundaries(unsigned int nid,
 }
 
 /* If necessary, push the node boundary out for reserve hotadd */
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
 		unsigned long *start_pfn, unsigned long *end_pfn)
 {
 	printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
@@ -2390,7 +2629,7 @@ static void __init account_node_boundary(unsigned int nid,
 void __init push_node_boundaries(unsigned int nid,
 			unsigned long start_pfn, unsigned long end_pfn) {}
 
-static void __init account_node_boundary(unsigned int nid,
+static void __meminit account_node_boundary(unsigned int nid,
 			unsigned long *start_pfn, unsigned long *end_pfn) {}
 #endif
 
@@ -2431,7 +2670,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
  * Return the number of pages a zone spans in a node, including holes
  * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
  */
-unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *ignored)
 {
@@ -2519,7 +2758,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
 }
 
 /* Return the number of page frames in holes in a zone on a node */
-unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __meminit zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *ignored)
 {
@@ -2536,14 +2775,14 @@ unsigned long __meminit zone_absent_pages_in_node(int nid,
 }
 
 #else
-static inline unsigned long zone_spanned_pages_in_node(int nid,
+static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *zones_size)
 {
 	return zones_size[zone_type];
 }
 
-static inline unsigned long zone_absent_pages_in_node(int nid,
+static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
 					unsigned long zone_type,
 					unsigned long *zholes_size)
 {
@@ -3355,13 +3594,28 @@ void *__init alloc_large_system_hash(const char *tablename,
 		for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
 			;
 		table = (void*) __get_free_pages(GFP_ATOMIC, order);
+		/*
+		 * If bucketsize is not a power-of-two, we may free
+		 * some pages at the end of hash table.
+		 */
+		if (table) {
+			unsigned long alloc_end = (unsigned long)table +
+					(PAGE_SIZE << order);
+			unsigned long used = (unsigned long)table +
+					PAGE_ALIGN(size);
+			split_page(virt_to_page(table), order);
+			while (used < alloc_end) {
+				free_page(used);
+				used += PAGE_SIZE;
+			}
+		}
 		}
 	} while (!table && size > PAGE_SIZE && --log2qty);
 
 	if (!table)
 		panic("Failed to allocate %s hash table\n", tablename);
 
-	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
+	printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
 	       tablename,
 	       (1U << log2qty),
 	       ilog2(size) - PAGE_SHIFT,
diff --git a/mm/rmap.c b/mm/rmap.c
index 850165d32b7a..61e492597a0b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,24 +53,6 @@
 
 struct kmem_cache *anon_vma_cachep;
 
-static inline void validate_anon_vma(struct vm_area_struct *find_vma)
-{
-#ifdef CONFIG_DEBUG_VM
-	struct anon_vma *anon_vma = find_vma->anon_vma;
-	struct vm_area_struct *vma;
-	unsigned int mapcount = 0;
-	int found = 0;
-
-	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-		mapcount++;
-		BUG_ON(mapcount > 100000);
-		if (vma == find_vma)
-			found = 1;
-	}
-	BUG_ON(!found);
-#endif
-}
-
 /* This must be called under the mmap_sem. */
 int anon_vma_prepare(struct vm_area_struct *vma)
 {
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma)
 {
 	struct anon_vma *anon_vma = vma->anon_vma;
 
-	if (anon_vma) {
+	if (anon_vma)
 		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-		validate_anon_vma(vma);
-	}
 }
 
 void anon_vma_link(struct vm_area_struct *vma)
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma)
 	if (anon_vma) {
 		spin_lock(&anon_vma->lock);
 		list_add_tail(&vma->anon_vma_node, &anon_vma->head);
-		validate_anon_vma(vma);
 		spin_unlock(&anon_vma->lock);
 	}
 }
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		return;
 
 	spin_lock(&anon_vma->lock);
-	validate_anon_vma(vma);
 	list_del(&vma->anon_vma_node);
 
 	/* We must garbage collect the anon_vma if it's empty */
diff --git a/mm/shmem.c b/mm/shmem.c
index e537317bec4d..0493e4d0bcaa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -967,6 +967,8 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
967 *nodelist++ = '\0'; 967 *nodelist++ = '\0';
968 if (nodelist_parse(nodelist, *policy_nodes)) 968 if (nodelist_parse(nodelist, *policy_nodes))
969 goto out; 969 goto out;
970 if (!nodes_subset(*policy_nodes, node_online_map))
971 goto out;
970 } 972 }
971 if (!strcmp(value, "default")) { 973 if (!strcmp(value, "default")) {
972 *policy = MPOL_DEFAULT; 974 *policy = MPOL_DEFAULT;
@@ -1098,9 +1100,9 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1098 * Normally, filepage is NULL on entry, and either found 1100 * Normally, filepage is NULL on entry, and either found
1099 * uptodate immediately, or allocated and zeroed, or read 1101 * uptodate immediately, or allocated and zeroed, or read
1100 * in under swappage, which is then assigned to filepage. 1102 * in under swappage, which is then assigned to filepage.
1101 * But shmem_prepare_write passes in a locked filepage, 1103 * But shmem_readpage and shmem_prepare_write pass in a locked
1102 * which may be found not uptodate by other callers too, 1104 * filepage, which may be found not uptodate by other callers
1103 * and may need to be copied from the swappage read in. 1105 * too, and may need to be copied from the swappage read in.
1104 */ 1106 */
1105repeat: 1107repeat:
1106 if (!filepage) 1108 if (!filepage)
@@ -1483,9 +1485,18 @@ static const struct inode_operations shmem_symlink_inode_operations;
1483static const struct inode_operations shmem_symlink_inline_operations; 1485static const struct inode_operations shmem_symlink_inline_operations;
1484 1486
1485/* 1487/*
1486 * Normally tmpfs makes no use of shmem_prepare_write, but it 1488 * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write;
1487 * lets a tmpfs file be used read-write below the loop driver. 1489 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1490 * below the loop driver, in the generic fashion that many filesystems support.
1488 */ 1491 */
1492static int shmem_readpage(struct file *file, struct page *page)
1493{
1494 struct inode *inode = page->mapping->host;
1495 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
1496 unlock_page(page);
1497 return error;
1498}
1499
1489static int 1500static int
1490shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) 1501shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
1491{ 1502{
@@ -1709,25 +1720,6 @@ static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count
1709 return desc.error; 1720 return desc.error;
1710} 1721}
1711 1722
1712static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1713 size_t count, read_actor_t actor, void *target)
1714{
1715 read_descriptor_t desc;
1716
1717 if (!count)
1718 return 0;
1719
1720 desc.written = 0;
1721 desc.count = count;
1722 desc.arg.data = target;
1723 desc.error = 0;
1724
1725 do_shmem_file_read(in_file, ppos, &desc, actor);
1726 if (desc.written)
1727 return desc.written;
1728 return desc.error;
1729}
1730
1731static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1723static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1732{ 1724{
1733 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1725 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -2384,6 +2376,7 @@ static const struct address_space_operations shmem_aops = {
2384 .writepage = shmem_writepage, 2376 .writepage = shmem_writepage,
2385 .set_page_dirty = __set_page_dirty_no_writeback, 2377 .set_page_dirty = __set_page_dirty_no_writeback,
2386#ifdef CONFIG_TMPFS 2378#ifdef CONFIG_TMPFS
2379 .readpage = shmem_readpage,
2387 .prepare_write = shmem_prepare_write, 2380 .prepare_write = shmem_prepare_write,
2388 .commit_write = simple_commit_write, 2381 .commit_write = simple_commit_write,
2389#endif 2382#endif
@@ -2397,7 +2390,8 @@ static const struct file_operations shmem_file_operations = {
2397 .read = shmem_file_read, 2390 .read = shmem_file_read,
2398 .write = shmem_file_write, 2391 .write = shmem_file_write,
2399 .fsync = simple_sync_file, 2392 .fsync = simple_sync_file,
2400 .sendfile = shmem_file_sendfile, 2393 .splice_read = generic_file_splice_read,
2394 .splice_write = generic_file_splice_write,
2401#endif 2395#endif
2402}; 2396};
2403 2397
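
With ->sendfile gone from shmem_file_operations, user space reaches tmpfs data through the splice path wired up above. Below is a hedged user-space sketch (not part of this patch; the /dev/shm path, buffer size and error handling are arbitrary and minimal) of driving that path with splice(2) through a pipe, as the syscall requires:

/* Hypothetical sketch: copy a tmpfs file to stdout via splice(2).
 * Short writes and error reporting are elided for brevity. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int pipefd[2];
	int fd = open("/dev/shm/example", O_RDONLY);
	ssize_t n;

	if (fd < 0 || pipe(pipefd) < 0)
		return 1;
	/* file -> pipe goes through shmem's new ->splice_read */
	while ((n = splice(fd, NULL, pipefd[1], NULL, 65536, 0)) > 0) {
		/* pipe -> stdout */
		if (splice(pipefd[0], NULL, STDOUT_FILENO, NULL, (size_t)n, 0) < 0)
			return 1;
	}
	return n < 0;
}

The file-to-pipe half exercises generic_file_splice_read(), which reads missing pages through the address_space ->readpage hook; that is why shmem now provides shmem_readpage().
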
diff --git a/mm/slab.c b/mm/slab.c
index 2e71a328aa09..a453383333fc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -137,6 +137,7 @@
137 137
138/* Shouldn't this be in a header file somewhere? */ 138/* Shouldn't this be in a header file somewhere? */
139#define BYTES_PER_WORD sizeof(void *) 139#define BYTES_PER_WORD sizeof(void *)
140#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
140 141
141#ifndef cache_line_size 142#ifndef cache_line_size
142#define cache_line_size() L1_CACHE_BYTES 143#define cache_line_size() L1_CACHE_BYTES
@@ -547,7 +548,7 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
547 if (cachep->flags & SLAB_STORE_USER) 548 if (cachep->flags & SLAB_STORE_USER)
548 return (unsigned long long *)(objp + cachep->buffer_size - 549 return (unsigned long long *)(objp + cachep->buffer_size -
549 sizeof(unsigned long long) - 550 sizeof(unsigned long long) -
550 BYTES_PER_WORD); 551 REDZONE_ALIGN);
551 return (unsigned long long *) (objp + cachep->buffer_size - 552 return (unsigned long long *) (objp + cachep->buffer_size -
552 sizeof(unsigned long long)); 553 sizeof(unsigned long long));
553} 554}
@@ -774,7 +775,6 @@ static inline struct kmem_cache *__find_general_cachep(size_t size,
774 */ 775 */
775 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 776 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
776#endif 777#endif
777 WARN_ON_ONCE(size == 0);
778 while (size > csizep->cs_size) 778 while (size > csizep->cs_size)
779 csizep++; 779 csizep++;
780 780
@@ -929,7 +929,7 @@ static void next_reap_node(void)
929 * the CPUs getting into lockstep and contending for the global cache chain 929 * the CPUs getting into lockstep and contending for the global cache chain
930 * lock. 930 * lock.
931 */ 931 */
932static void __devinit start_cpu_timer(int cpu) 932static void __cpuinit start_cpu_timer(int cpu)
933{ 933{
934 struct delayed_work *reap_work = &per_cpu(reap_work, cpu); 934 struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
935 935
@@ -2179,7 +2179,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2179 * above the next power of two: caches with object sizes just above a 2179 * above the next power of two: caches with object sizes just above a
2180 * power of two have a significant amount of internal fragmentation. 2180 * power of two have a significant amount of internal fragmentation.
2181 */ 2181 */
2182 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) 2182 if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN +
2183 2 * sizeof(unsigned long long)))
2183 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 2184 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
2184 if (!(flags & SLAB_DESTROY_BY_RCU)) 2185 if (!(flags & SLAB_DESTROY_BY_RCU))
2185 flags |= SLAB_POISON; 2186 flags |= SLAB_POISON;
@@ -2220,12 +2221,20 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2220 } 2221 }
2221 2222
2222 /* 2223 /*
2223 * Redzoning and user store require word alignment. Note this will be 2224 * Redzoning and user store require word alignment or possibly larger.
2224 * overridden by architecture or caller mandated alignment if either 2225 * Note this will be overridden by architecture or caller mandated
2225 * is greater than BYTES_PER_WORD. 2226 * alignment if either is greater than BYTES_PER_WORD.
2226 */ 2227 */
2227 if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) 2228 if (flags & SLAB_STORE_USER)
2228 ralign = __alignof__(unsigned long long); 2229 ralign = BYTES_PER_WORD;
2230
2231 if (flags & SLAB_RED_ZONE) {
2232 ralign = REDZONE_ALIGN;
2233 /* If redzoning, ensure that the second redzone is suitably
2234 * aligned, by adjusting the object size accordingly. */
2235 size += REDZONE_ALIGN - 1;
2236 size &= ~(REDZONE_ALIGN - 1);
2237 }
2229 2238
2230 /* 2) arch mandated alignment */ 2239 /* 2) arch mandated alignment */
2231 if (ralign < ARCH_SLAB_MINALIGN) { 2240 if (ralign < ARCH_SLAB_MINALIGN) {
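
The two-line adjustment in the SLAB_RED_ZONE branch is the standard round-up-to-an-alignment-boundary idiom. A standalone sketch follows (the helper name is hypothetical; REDZONE_ALIGN is assumed to be a power of two, as max(BYTES_PER_WORD, __alignof__(unsigned long long)) is on common architectures):

/* Sketch of the size adjustment done when SLAB_RED_ZONE is set:
 * round size up to a multiple of a power-of-two alignment. */
#include <assert.h>
#include <stddef.h>

static size_t redzone_round_up(size_t size, size_t redzone_align)
{
	size += redzone_align - 1;
	size &= ~(redzone_align - 1);
	return size;
}

int main(void)
{
	/* e.g. with an 8-byte REDZONE_ALIGN, 13 -> 16 and 16 stays 16 */
	assert(redzone_round_up(13, 8) == 16);
	assert(redzone_round_up(16, 8) == 16);
	return 0;
}
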
@@ -2262,9 +2271,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2262 } 2271 }
2263 if (flags & SLAB_STORE_USER) { 2272 if (flags & SLAB_STORE_USER) {
2264 /* user store requires one word storage behind the end of 2273 /* user store requires one word storage behind the end of
2265 * the real object. 2274 * the real object. But if the second red zone needs to be
2275 * aligned to 64 bits, we must allow that much space.
2266 */ 2276 */
2267 size += BYTES_PER_WORD; 2277 if (flags & SLAB_RED_ZONE)
2278 size += REDZONE_ALIGN;
2279 else
2280 size += BYTES_PER_WORD;
2268 } 2281 }
2269#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2282#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2270 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2283 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
@@ -3539,7 +3552,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3539 check_irq_off(); 3552 check_irq_off();
3540 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3553 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3541 3554
3542 if (use_alien_caches && cache_free_alien(cachep, objp)) 3555 if (cache_free_alien(cachep, objp))
3543 return; 3556 return;
3544 3557
3545 if (likely(ac->avail < ac->limit)) { 3558 if (likely(ac->avail < ac->limit)) {
@@ -4144,26 +4157,17 @@ static void print_slabinfo_header(struct seq_file *m)
4144static void *s_start(struct seq_file *m, loff_t *pos) 4157static void *s_start(struct seq_file *m, loff_t *pos)
4145{ 4158{
4146 loff_t n = *pos; 4159 loff_t n = *pos;
4147 struct list_head *p;
4148 4160
4149 mutex_lock(&cache_chain_mutex); 4161 mutex_lock(&cache_chain_mutex);
4150 if (!n) 4162 if (!n)
4151 print_slabinfo_header(m); 4163 print_slabinfo_header(m);
4152 p = cache_chain.next; 4164
4153 while (n--) { 4165 return seq_list_start(&cache_chain, *pos);
4154 p = p->next;
4155 if (p == &cache_chain)
4156 return NULL;
4157 }
4158 return list_entry(p, struct kmem_cache, next);
4159} 4166}
4160 4167
4161static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4168static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4162{ 4169{
4163 struct kmem_cache *cachep = p; 4170 return seq_list_next(p, &cache_chain, pos);
4164 ++*pos;
4165 return cachep->next.next == &cache_chain ?
4166 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
4167} 4171}
4168 4172
4169static void s_stop(struct seq_file *m, void *p) 4173static void s_stop(struct seq_file *m, void *p)
@@ -4173,7 +4177,7 @@ static void s_stop(struct seq_file *m, void *p)
4173 4177
4174static int s_show(struct seq_file *m, void *p) 4178static int s_show(struct seq_file *m, void *p)
4175{ 4179{
4176 struct kmem_cache *cachep = p; 4180 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4177 struct slab *slabp; 4181 struct slab *slabp;
4178 unsigned long active_objs; 4182 unsigned long active_objs;
4179 unsigned long num_objs; 4183 unsigned long num_objs;
@@ -4342,17 +4346,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4342 4346
4343static void *leaks_start(struct seq_file *m, loff_t *pos) 4347static void *leaks_start(struct seq_file *m, loff_t *pos)
4344{ 4348{
4345 loff_t n = *pos;
4346 struct list_head *p;
4347
4348 mutex_lock(&cache_chain_mutex); 4349 mutex_lock(&cache_chain_mutex);
4349 p = cache_chain.next; 4350 return seq_list_start(&cache_chain, *pos);
4350 while (n--) {
4351 p = p->next;
4352 if (p == &cache_chain)
4353 return NULL;
4354 }
4355 return list_entry(p, struct kmem_cache, next);
4356} 4351}
4357 4352
4358static inline int add_caller(unsigned long *n, unsigned long v) 4353static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4417,7 +4412,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4417 4412
4418static int leaks_show(struct seq_file *m, void *p) 4413static int leaks_show(struct seq_file *m, void *p)
4419{ 4414{
4420 struct kmem_cache *cachep = p; 4415 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next);
4421 struct slab *slabp; 4416 struct slab *slabp;
4422 struct kmem_list3 *l3; 4417 struct kmem_list3 *l3;
4423 const char *name; 4418 const char *name;
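
The s_start/s_next and leaks_start conversions above delegate cursor handling to the seq_list_* helpers from <linux/seq_file.h>. A hedged sketch of the same pattern for an arbitrary list (the example_* names are illustrative only, not part of this patch):

/* Hypothetical seq_file iterator over a list, using the same
 * seq_list_start()/seq_list_next() helpers adopted for /proc/slabinfo. */
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

struct example_entry {
	struct list_head next;
	const char *name;
};

static LIST_HEAD(example_chain);		/* hypothetical list */
static DEFINE_MUTEX(example_mutex);

static void *example_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&example_mutex);
	return seq_list_start(&example_chain, *pos);
}

static void *example_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &example_chain, pos);
}

static void example_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&example_mutex);
}

static int example_show(struct seq_file *m, void *p)
{
	struct example_entry *e = list_entry(p, struct example_entry, next);

	seq_printf(m, "%s\n", e->name);
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start	= example_start,
	.next	= example_next,
	.stop	= example_stop,
	.show	= example_show,
};
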
diff --git a/mm/slob.c b/mm/slob.c
index 71976c5d40d3..b4899079d8b0 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,57 +3,159 @@
3 * 3 *
4 * Matt Mackall <mpm@selenic.com> 12/30/03 4 * Matt Mackall <mpm@selenic.com> 12/30/03
5 * 5 *
6 * NUMA support by Paul Mundt, 2007.
7 *
6 * How SLOB works: 8 * How SLOB works:
7 * 9 *
8 * The core of SLOB is a traditional K&R style heap allocator, with 10 * The core of SLOB is a traditional K&R style heap allocator, with
9 * support for returning aligned objects. The granularity of this 11 * support for returning aligned objects. The granularity of this
10 * allocator is 8 bytes on x86, though it's perhaps possible to reduce 12 * allocator is as little as 2 bytes, however typically most architectures
11 * this to 4 if it's deemed worth the effort. The slob heap is a 13 * will require 4 bytes on 32-bit and 8 bytes on 64-bit.
12 * singly-linked list of pages from __get_free_page, grown on demand 14 *
13 * and allocation from the heap is currently first-fit. 15 * The slob heap is a linked list of pages from alloc_pages(), and
16 * within each page, there is a singly-linked list of free blocks (slob_t).
17 * The heap is grown on demand and allocation from the heap is currently
18 * first-fit.
14 * 19 *
15 * Above this is an implementation of kmalloc/kfree. Blocks returned 20 * Above this is an implementation of kmalloc/kfree. Blocks returned
16 * from kmalloc are 8-byte aligned and prepended with a 8-byte header. 21 * from kmalloc are prepended with a 4-byte header with the kmalloc size.
17 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls 22 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
18 * __get_free_pages directly so that it can return page-aligned blocks 23 * alloc_pages() directly, allocating compound pages so the page order
19 * and keeps a linked list of such pages and their orders. These 24 * does not have to be separately tracked, and also stores the exact
20 * objects are detected in kfree() by their page alignment. 25 * allocation size in page->private so that it can be used to accurately
26 * provide ksize(). These objects are detected in kfree() because slob_page()
27 * is false for them.
21 * 28 *
22 * SLAB is emulated on top of SLOB by simply calling constructors and 29 * SLAB is emulated on top of SLOB by simply calling constructors and
23 * destructors for every SLAB allocation. Objects are returned with 30 * destructors for every SLAB allocation. Objects are returned with the
24 * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is 31 * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which
25 * set, in which case the low-level allocator will fragment blocks to 32 * case the low-level allocator will fragment blocks to create the proper
26 * create the proper alignment. Again, objects of page-size or greater 33 * alignment. Again, objects of page-size or greater are allocated by
27 * are allocated by calling __get_free_pages. As SLAB objects know 34 * calling alloc_pages(). As SLAB objects know their size, no separate
28 * their size, no separate size bookkeeping is necessary and there is 35 * size bookkeeping is necessary and there is essentially no allocation
29 * essentially no allocation space overhead. 36 * space overhead, and compound pages aren't needed for multi-page
37 * allocations.
38 *
39 * NUMA support in SLOB is fairly simplistic, pushing most of the real
40 * logic down to the page allocator, and simply doing the node accounting
41 * on the upper levels. In the event that a node id is explicitly
42 * provided, alloc_pages_node() with the specified node id is used
43 * instead. The common case (or when the node id isn't explicitly provided)
44 * will default to the current node, as per numa_node_id().
45 *
 46 * Node-aware pages are still inserted into the global freelist, and

47 * these are scanned for by matching against the node id encoded in the
48 * page flags. As a result, block allocations that can be satisfied from
49 * the freelist will only be done so on pages residing on the same node,
50 * in order to prevent random node placement.
30 */ 51 */
31 52
53#include <linux/kernel.h>
32#include <linux/slab.h> 54#include <linux/slab.h>
33#include <linux/mm.h> 55#include <linux/mm.h>
34#include <linux/cache.h> 56#include <linux/cache.h>
35#include <linux/init.h> 57#include <linux/init.h>
36#include <linux/module.h> 58#include <linux/module.h>
37#include <linux/timer.h>
38#include <linux/rcupdate.h> 59#include <linux/rcupdate.h>
60#include <linux/list.h>
61#include <asm/atomic.h>
62
63/*
64 * slob_block has a field 'units', which indicates size of block if +ve,
65 * or offset of next block if -ve (in SLOB_UNITs).
66 *
67 * Free blocks of size 1 unit simply contain the offset of the next block.
68 * Those with larger size contain their size in the first SLOB_UNIT of
69 * memory, and the offset of the next free block in the second SLOB_UNIT.
70 */
71#if PAGE_SIZE <= (32767 * 2)
72typedef s16 slobidx_t;
73#else
74typedef s32 slobidx_t;
75#endif
39 76
40struct slob_block { 77struct slob_block {
41 int units; 78 slobidx_t units;
42 struct slob_block *next;
43}; 79};
44typedef struct slob_block slob_t; 80typedef struct slob_block slob_t;
45 81
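
The units encoding described in the comment above can be checked with a small user-space model. The sketch below mirrors set_slob()/slob_units()/slob_next() against a toy 16-unit page instead of deriving the block base from PAGE_MASK; it is illustrative only, not kernel code:

/* Toy model of the free-block encoding: a block of size > 1 stores
 * (size, next-offset) in its first two units; a one-unit block stores
 * the negated next-offset in its single unit. */
#include <assert.h>
#include <stdint.h>

typedef int16_t slobidx_t;
typedef struct { slobidx_t units; } slob_t;

#define UNITS_PER_PAGE 16		/* toy page of 16 units */

static slob_t page[UNITS_PER_PAGE];	/* stands in for one slob page */

static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
{
	slobidx_t offset = next - page;

	if (size > 1) {
		s[0].units = size;
		s[1].units = offset;
	} else
		s[0].units = -offset;
}

static slobidx_t slob_units(slob_t *s)
{
	return s->units > 0 ? s->units : 1;
}

static slob_t *slob_next(slob_t *s)
{
	return page + (s[0].units < 0 ? -s[0].units : s[1].units);
}

int main(void)
{
	set_slob(&page[0], 3, &page[5]);	/* 3-unit block, next at unit 5 */
	set_slob(&page[5], 1, &page[9]);	/* 1-unit block, next at unit 9 */

	assert(slob_units(&page[0]) == 3 && slob_next(&page[0]) == &page[5]);
	assert(slob_units(&page[5]) == 1 && slob_next(&page[5]) == &page[9]);
	return 0;
}
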
82/*
83 * We use struct page fields to manage some slob allocation aspects,
84 * however to avoid the horrible mess in include/linux/mm_types.h, we'll
85 * just define our own struct page type variant here.
86 */
87struct slob_page {
88 union {
89 struct {
90 unsigned long flags; /* mandatory */
91 atomic_t _count; /* mandatory */
92 slobidx_t units; /* free units left in page */
93 unsigned long pad[2];
94 slob_t *free; /* first free slob_t in page */
95 struct list_head list; /* linked list of free pages */
96 };
97 struct page page;
98 };
99};
100static inline void struct_slob_page_wrong_size(void)
101{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
102
103/*
104 * free_slob_page: call before a slob_page is returned to the page allocator.
105 */
106static inline void free_slob_page(struct slob_page *sp)
107{
108 reset_page_mapcount(&sp->page);
109 sp->page.mapping = NULL;
110}
111
112/*
113 * All (partially) free slob pages go on this list.
114 */
115static LIST_HEAD(free_slob_pages);
116
117/*
118 * slob_page: True for all slob pages (false for bigblock pages)
119 */
120static inline int slob_page(struct slob_page *sp)
121{
122 return test_bit(PG_active, &sp->flags);
123}
124
125static inline void set_slob_page(struct slob_page *sp)
126{
127 __set_bit(PG_active, &sp->flags);
128}
129
130static inline void clear_slob_page(struct slob_page *sp)
131{
132 __clear_bit(PG_active, &sp->flags);
133}
134
135/*
136 * slob_page_free: true for pages on free_slob_pages list.
137 */
138static inline int slob_page_free(struct slob_page *sp)
139{
140 return test_bit(PG_private, &sp->flags);
141}
142
143static inline void set_slob_page_free(struct slob_page *sp)
144{
145 list_add(&sp->list, &free_slob_pages);
146 __set_bit(PG_private, &sp->flags);
147}
148
149static inline void clear_slob_page_free(struct slob_page *sp)
150{
151 list_del(&sp->list);
152 __clear_bit(PG_private, &sp->flags);
153}
154
46#define SLOB_UNIT sizeof(slob_t) 155#define SLOB_UNIT sizeof(slob_t)
47#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) 156#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
48#define SLOB_ALIGN L1_CACHE_BYTES 157#define SLOB_ALIGN L1_CACHE_BYTES
49 158
50struct bigblock {
51 int order;
52 void *pages;
53 struct bigblock *next;
54};
55typedef struct bigblock bigblock_t;
56
57/* 159/*
58 * struct slob_rcu is inserted at the tail of allocated slob blocks, which 160 * struct slob_rcu is inserted at the tail of allocated slob blocks, which
59 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free 161 * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free
@@ -64,133 +166,285 @@ struct slob_rcu {
64 int size; 166 int size;
65}; 167};
66 168
67static slob_t arena = { .next = &arena, .units = 1 }; 169/*
68static slob_t *slobfree = &arena; 170 * slob_lock protects all slob allocator structures.
69static bigblock_t *bigblocks; 171 */
70static DEFINE_SPINLOCK(slob_lock); 172static DEFINE_SPINLOCK(slob_lock);
71static DEFINE_SPINLOCK(block_lock);
72 173
73static void slob_free(void *b, int size); 174/*
74static void slob_timer_cbk(void); 175 * Encode the given size and next info into a free slob block s.
176 */
177static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
178{
179 slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
180 slobidx_t offset = next - base;
75 181
182 if (size > 1) {
183 s[0].units = size;
184 s[1].units = offset;
185 } else
186 s[0].units = -offset;
187}
76 188
77static void *slob_alloc(size_t size, gfp_t gfp, int align) 189/*
190 * Return the size of a slob block.
191 */
192static slobidx_t slob_units(slob_t *s)
193{
194 if (s->units > 0)
195 return s->units;
196 return 1;
197}
198
199/*
200 * Return the next free slob block pointer after this one.
201 */
202static slob_t *slob_next(slob_t *s)
203{
204 slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK);
205 slobidx_t next;
206
207 if (s[0].units < 0)
208 next = -s[0].units;
209 else
210 next = s[1].units;
211 return base+next;
212}
213
214/*
215 * Returns true if s is the last free block in its page.
216 */
217static int slob_last(slob_t *s)
218{
219 return !((unsigned long)slob_next(s) & ~PAGE_MASK);
220}
221
222static void *slob_new_page(gfp_t gfp, int order, int node)
223{
224 void *page;
225
226#ifdef CONFIG_NUMA
227 if (node != -1)
228 page = alloc_pages_node(node, gfp, order);
229 else
230#endif
231 page = alloc_pages(gfp, order);
232
233 if (!page)
234 return NULL;
235
236 return page_address(page);
237}
238
239/*
240 * Allocate a slob block within a given slob_page sp.
241 */
242static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
78{ 243{
79 slob_t *prev, *cur, *aligned = 0; 244 slob_t *prev, *cur, *aligned = 0;
80 int delta = 0, units = SLOB_UNITS(size); 245 int delta = 0, units = SLOB_UNITS(size);
81 unsigned long flags;
82 246
83 spin_lock_irqsave(&slob_lock, flags); 247 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) {
84 prev = slobfree; 248 slobidx_t avail = slob_units(cur);
85 for (cur = prev->next; ; prev = cur, cur = cur->next) { 249
86 if (align) { 250 if (align) {
87 aligned = (slob_t *)ALIGN((unsigned long)cur, align); 251 aligned = (slob_t *)ALIGN((unsigned long)cur, align);
88 delta = aligned - cur; 252 delta = aligned - cur;
89 } 253 }
90 if (cur->units >= units + delta) { /* room enough? */ 254 if (avail >= units + delta) { /* room enough? */
255 slob_t *next;
256
91 if (delta) { /* need to fragment head to align? */ 257 if (delta) { /* need to fragment head to align? */
92 aligned->units = cur->units - delta; 258 next = slob_next(cur);
93 aligned->next = cur->next; 259 set_slob(aligned, avail - delta, next);
94 cur->next = aligned; 260 set_slob(cur, delta, aligned);
95 cur->units = delta;
96 prev = cur; 261 prev = cur;
97 cur = aligned; 262 cur = aligned;
263 avail = slob_units(cur);
98 } 264 }
99 265
100 if (cur->units == units) /* exact fit? */ 266 next = slob_next(cur);
101 prev->next = cur->next; /* unlink */ 267 if (avail == units) { /* exact fit? unlink. */
102 else { /* fragment */ 268 if (prev)
103 prev->next = cur + units; 269 set_slob(prev, slob_units(prev), next);
104 prev->next->units = cur->units - units; 270 else
105 prev->next->next = cur->next; 271 sp->free = next;
106 cur->units = units; 272 } else { /* fragment */
273 if (prev)
274 set_slob(prev, slob_units(prev), cur + units);
275 else
276 sp->free = cur + units;
277 set_slob(cur + units, avail - units, next);
107 } 278 }
108 279
109 slobfree = prev; 280 sp->units -= units;
110 spin_unlock_irqrestore(&slob_lock, flags); 281 if (!sp->units)
282 clear_slob_page_free(sp);
111 return cur; 283 return cur;
112 } 284 }
113 if (cur == slobfree) { 285 if (slob_last(cur))
114 spin_unlock_irqrestore(&slob_lock, flags); 286 return NULL;
115 287 }
116 if (size == PAGE_SIZE) /* trying to shrink arena? */ 288}
117 return 0;
118 289
119 cur = (slob_t *)__get_free_page(gfp); 290/*
120 if (!cur) 291 * slob_alloc: entry point into the slob allocator.
121 return 0; 292 */
293static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
294{
295 struct slob_page *sp;
296 slob_t *b = NULL;
297 unsigned long flags;
122 298
123 slob_free(cur, PAGE_SIZE); 299 spin_lock_irqsave(&slob_lock, flags);
124 spin_lock_irqsave(&slob_lock, flags); 300 /* Iterate through each partially free page, try to find room */
125 cur = slobfree; 301 list_for_each_entry(sp, &free_slob_pages, list) {
302#ifdef CONFIG_NUMA
303 /*
304 * If there's a node specification, search for a partial
305 * page with a matching node id in the freelist.
306 */
307 if (node != -1 && page_to_nid(&sp->page) != node)
308 continue;
309#endif
310
311 if (sp->units >= SLOB_UNITS(size)) {
312 b = slob_page_alloc(sp, size, align);
313 if (b)
314 break;
126 } 315 }
127 } 316 }
317 spin_unlock_irqrestore(&slob_lock, flags);
318
319 /* Not enough space: must allocate a new page */
320 if (!b) {
321 b = slob_new_page(gfp, 0, node);
322 if (!b)
323 return 0;
324 sp = (struct slob_page *)virt_to_page(b);
325 set_slob_page(sp);
326
327 spin_lock_irqsave(&slob_lock, flags);
328 sp->units = SLOB_UNITS(PAGE_SIZE);
329 sp->free = b;
330 INIT_LIST_HEAD(&sp->list);
331 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
332 set_slob_page_free(sp);
333 b = slob_page_alloc(sp, size, align);
334 BUG_ON(!b);
335 spin_unlock_irqrestore(&slob_lock, flags);
336 }
337 return b;
128} 338}
129 339
340/*
341 * slob_free: entry point into the slob allocator.
342 */
130static void slob_free(void *block, int size) 343static void slob_free(void *block, int size)
131{ 344{
132 slob_t *cur, *b = (slob_t *)block; 345 struct slob_page *sp;
346 slob_t *prev, *next, *b = (slob_t *)block;
347 slobidx_t units;
133 unsigned long flags; 348 unsigned long flags;
134 349
135 if (!block) 350 if (!block)
136 return; 351 return;
352 BUG_ON(!size);
137 353
138 if (size) 354 sp = (struct slob_page *)virt_to_page(block);
139 b->units = SLOB_UNITS(size); 355 units = SLOB_UNITS(size);
140 356
141 /* Find reinsertion point */
142 spin_lock_irqsave(&slob_lock, flags); 357 spin_lock_irqsave(&slob_lock, flags);
143 for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
144 if (cur >= cur->next && (b > cur || b < cur->next))
145 break;
146 358
147 if (b + b->units == cur->next) { 359 if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) {
148 b->units += cur->next->units; 360 /* Go directly to page allocator. Do not pass slob allocator */
149 b->next = cur->next->next; 361 if (slob_page_free(sp))
150 } else 362 clear_slob_page_free(sp);
151 b->next = cur->next; 363 clear_slob_page(sp);
364 free_slob_page(sp);
365 free_page((unsigned long)b);
366 goto out;
367 }
152 368
153 if (cur + cur->units == b) { 369 if (!slob_page_free(sp)) {
154 cur->units += b->units; 370 /* This slob page is about to become partially free. Easy! */
155 cur->next = b->next; 371 sp->units = units;
156 } else 372 sp->free = b;
157 cur->next = b; 373 set_slob(b, units,
374 (void *)((unsigned long)(b +
375 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
376 set_slob_page_free(sp);
377 goto out;
378 }
158 379
159 slobfree = cur; 380 /*
381 * Otherwise the page is already partially free, so find reinsertion
382 * point.
383 */
384 sp->units += units;
160 385
386 if (b < sp->free) {
387 set_slob(b, units, sp->free);
388 sp->free = b;
389 } else {
390 prev = sp->free;
391 next = slob_next(prev);
392 while (b > next) {
393 prev = next;
394 next = slob_next(prev);
395 }
396
397 if (!slob_last(prev) && b + units == next) {
398 units += slob_units(next);
399 set_slob(b, units, slob_next(next));
400 } else
401 set_slob(b, units, next);
402
403 if (prev + slob_units(prev) == b) {
404 units = slob_units(b) + slob_units(prev);
405 set_slob(prev, units, slob_next(b));
406 } else
407 set_slob(prev, slob_units(prev), b);
408 }
409out:
161 spin_unlock_irqrestore(&slob_lock, flags); 410 spin_unlock_irqrestore(&slob_lock, flags);
162} 411}
163 412
164void *__kmalloc(size_t size, gfp_t gfp) 413/*
165{ 414 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
166 slob_t *m; 415 */
167 bigblock_t *bb;
168 unsigned long flags;
169 416
170 if (size < PAGE_SIZE - SLOB_UNIT) { 417#ifndef ARCH_KMALLOC_MINALIGN
171 m = slob_alloc(size + SLOB_UNIT, gfp, 0); 418#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
172 return m ? (void *)(m + 1) : 0; 419#endif
173 }
174 420
175 bb = slob_alloc(sizeof(bigblock_t), gfp, 0); 421#ifndef ARCH_SLAB_MINALIGN
176 if (!bb) 422#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
177 return 0; 423#endif
178 424
179 bb->order = get_order(size); 425void *__kmalloc_node(size_t size, gfp_t gfp, int node)
180 bb->pages = (void *)__get_free_pages(gfp, bb->order); 426{
427 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
428
429 if (size < PAGE_SIZE - align) {
430 unsigned int *m;
431 m = slob_alloc(size + align, gfp, align, node);
432 if (m)
433 *m = size;
434 return (void *)m + align;
435 } else {
436 void *ret;
181 437
182 if (bb->pages) { 438 ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node);
183 spin_lock_irqsave(&block_lock, flags); 439 if (ret) {
184 bb->next = bigblocks; 440 struct page *page;
185 bigblocks = bb; 441 page = virt_to_page(ret);
186 spin_unlock_irqrestore(&block_lock, flags); 442 page->private = size;
187 return bb->pages; 443 }
444 return ret;
188 } 445 }
189
190 slob_free(bb, sizeof(bigblock_t));
191 return 0;
192} 446}
193EXPORT_SYMBOL(__kmalloc); 447EXPORT_SYMBOL(__kmalloc_node);
194 448
195/** 449/**
196 * krealloc - reallocate memory. The contents will remain unchanged. 450 * krealloc - reallocate memory. The contents will remain unchanged.
@@ -227,52 +481,34 @@ EXPORT_SYMBOL(krealloc);
227 481
228void kfree(const void *block) 482void kfree(const void *block)
229{ 483{
230 bigblock_t *bb, **last = &bigblocks; 484 struct slob_page *sp;
231 unsigned long flags;
232 485
233 if (!block) 486 if (!block)
234 return; 487 return;
235 488
236 if (!((unsigned long)block & (PAGE_SIZE-1))) { 489 sp = (struct slob_page *)virt_to_page(block);
237 /* might be on the big block list */ 490 if (slob_page(sp)) {
238 spin_lock_irqsave(&block_lock, flags); 491 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
239 for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { 492 unsigned int *m = (unsigned int *)(block - align);
240 if (bb->pages == block) { 493 slob_free(m, *m + align);
241 *last = bb->next; 494 } else
242 spin_unlock_irqrestore(&block_lock, flags); 495 put_page(&sp->page);
243 free_pages((unsigned long)block, bb->order);
244 slob_free(bb, sizeof(bigblock_t));
245 return;
246 }
247 }
248 spin_unlock_irqrestore(&block_lock, flags);
249 }
250
251 slob_free((slob_t *)block - 1, 0);
252 return;
253} 496}
254
255EXPORT_SYMBOL(kfree); 497EXPORT_SYMBOL(kfree);
256 498
499/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
257size_t ksize(const void *block) 500size_t ksize(const void *block)
258{ 501{
259 bigblock_t *bb; 502 struct slob_page *sp;
260 unsigned long flags;
261 503
262 if (!block) 504 if (!block)
263 return 0; 505 return 0;
264 506
265 if (!((unsigned long)block & (PAGE_SIZE-1))) { 507 sp = (struct slob_page *)virt_to_page(block);
266 spin_lock_irqsave(&block_lock, flags); 508 if (slob_page(sp))
267 for (bb = bigblocks; bb; bb = bb->next) 509 return ((slob_t *)block - 1)->units + SLOB_UNIT;
268 if (bb->pages == block) { 510 else
269 spin_unlock_irqrestore(&slob_lock, flags); 511 return sp->page.private;
270 return PAGE_SIZE << bb->order;
271 }
272 spin_unlock_irqrestore(&block_lock, flags);
273 }
274
275 return ((slob_t *)block - 1)->units * SLOB_UNIT;
276} 512}
277 513
278struct kmem_cache { 514struct kmem_cache {
@@ -289,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
289{ 525{
290 struct kmem_cache *c; 526 struct kmem_cache *c;
291 527
292 c = slob_alloc(sizeof(struct kmem_cache), flags, 0); 528 c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1);
293 529
294 if (c) { 530 if (c) {
295 c->name = name; 531 c->name = name;
@@ -302,6 +538,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
302 c->ctor = ctor; 538 c->ctor = ctor;
303 /* ignore alignment unless it's forced */ 539 /* ignore alignment unless it's forced */
304 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; 540 c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
541 if (c->align < ARCH_SLAB_MINALIGN)
542 c->align = ARCH_SLAB_MINALIGN;
305 if (c->align < align) 543 if (c->align < align)
306 c->align = align; 544 c->align = align;
307 } else if (flags & SLAB_PANIC) 545 } else if (flags & SLAB_PANIC)
@@ -317,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c)
317} 555}
318EXPORT_SYMBOL(kmem_cache_destroy); 556EXPORT_SYMBOL(kmem_cache_destroy);
319 557
320void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) 558void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
321{ 559{
322 void *b; 560 void *b;
323 561
324 if (c->size < PAGE_SIZE) 562 if (c->size < PAGE_SIZE)
325 b = slob_alloc(c->size, flags, c->align); 563 b = slob_alloc(c->size, flags, c->align, node);
326 else 564 else
327 b = (void *)__get_free_pages(flags, get_order(c->size)); 565 b = slob_new_page(flags, get_order(c->size), node);
328 566
329 if (c->ctor) 567 if (c->ctor)
330 c->ctor(b, c, 0); 568 c->ctor(b, c, 0);
331 569
332 return b; 570 return b;
333} 571}
334EXPORT_SYMBOL(kmem_cache_alloc); 572EXPORT_SYMBOL(kmem_cache_alloc_node);
335 573
336void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) 574void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
337{ 575{
@@ -385,9 +623,6 @@ const char *kmem_cache_name(struct kmem_cache *c)
385} 623}
386EXPORT_SYMBOL(kmem_cache_name); 624EXPORT_SYMBOL(kmem_cache_name);
387 625
388static struct timer_list slob_timer = TIMER_INITIALIZER(
389 (void (*)(unsigned long))slob_timer_cbk, 0, 0);
390
391int kmem_cache_shrink(struct kmem_cache *d) 626int kmem_cache_shrink(struct kmem_cache *d)
392{ 627{
393 return 0; 628 return 0;
@@ -399,17 +634,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b)
399 return 0; 634 return 0;
400} 635}
401 636
402void __init kmem_cache_init(void) 637static unsigned int slob_ready __read_mostly;
638
639int slab_is_available(void)
403{ 640{
404 slob_timer_cbk(); 641 return slob_ready;
405} 642}
406 643
407static void slob_timer_cbk(void) 644void __init kmem_cache_init(void)
408{ 645{
409 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); 646 slob_ready = 1;
410
411 if (p)
412 free_page((unsigned long)p);
413
414 mod_timer(&slob_timer, jiffies + HZ);
415} 647}
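
For the small-object case, the header comment's "4-byte header with the kmalloc size" works out to a prefix of 'align' bytes holding the requested size, as __kmalloc_node() and kfree() above show. A simplified user-space model of that layout follows (the toy_* names are hypothetical, and the real ksize() arithmetic for small objects differs slightly):

/* Toy model of the small-object kmalloc layout: the requested size sits in
 * an align-sized header immediately before the pointer handed to the caller,
 * so it can be recovered on free. */
#include <assert.h>
#include <stdlib.h>

#define KMALLOC_ALIGN	sizeof(unsigned long)	/* stands in for the min align */

static void *toy_kmalloc(size_t size)
{
	unsigned int *m = malloc(size + KMALLOC_ALIGN);

	if (!m)
		return NULL;
	*m = size;				/* header: exact request size */
	return (char *)m + KMALLOC_ALIGN;
}

static size_t toy_ksize(const void *block)
{
	const unsigned int *m =
		(const unsigned int *)((const char *)block - KMALLOC_ALIGN);

	return *m;
}

static void toy_kfree(void *block)
{
	free((char *)block - KMALLOC_ALIGN);
}

int main(void)
{
	char *p = toy_kmalloc(100);

	assert(p && toy_ksize(p) == 100);
	toy_kfree(p);
	return 0;
}

Allocations of PAGE_SIZE or more skip this header entirely: they come straight from alloc_pages() with __GFP_COMP and record the exact size in page->private, which is what the large-object branch of ksize() returns.
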
diff --git a/mm/slub.c b/mm/slub.c
index 51663a3c3c24..6aea48942c29 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -323,7 +323,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
323/* 323/*
324 * Debug settings: 324 * Debug settings:
325 */ 325 */
326#ifdef CONFIG_SLUB_DEBUG_ON
327static int slub_debug = DEBUG_DEFAULT_FLAGS;
328#else
326static int slub_debug; 329static int slub_debug;
330#endif
327 331
328static char *slub_debug_slabs; 332static char *slub_debug_slabs;
329 333
@@ -888,38 +892,57 @@ fail:
888 892
889static int __init setup_slub_debug(char *str) 893static int __init setup_slub_debug(char *str)
890{ 894{
891 if (!str || *str != '=') 895 slub_debug = DEBUG_DEFAULT_FLAGS;
892 slub_debug = DEBUG_DEFAULT_FLAGS; 896 if (*str++ != '=' || !*str)
893 else { 897 /*
894 str++; 898 * No options specified. Switch on full debugging.
895 if (*str == 0 || *str == ',') 899 */
896 slub_debug = DEBUG_DEFAULT_FLAGS; 900 goto out;
897 else 901
898 for( ;*str && *str != ','; str++) 902 if (*str == ',')
899 switch (*str) { 903 /*
900 case 'f' : case 'F' : 904 * No options but restriction on slabs. This means full
901 slub_debug |= SLAB_DEBUG_FREE; 905 * debugging for slabs matching a pattern.
902 break; 906 */
903 case 'z' : case 'Z' : 907 goto check_slabs;
904 slub_debug |= SLAB_RED_ZONE; 908
905 break; 909 slub_debug = 0;
906 case 'p' : case 'P' : 910 if (*str == '-')
907 slub_debug |= SLAB_POISON; 911 /*
908 break; 912 * Switch off all debugging measures.
909 case 'u' : case 'U' : 913 */
910 slub_debug |= SLAB_STORE_USER; 914 goto out;
911 break; 915
912 case 't' : case 'T' : 916 /*
913 slub_debug |= SLAB_TRACE; 917 * Determine which debug features should be switched on
914 break; 918 */
915 default: 919 for ( ;*str && *str != ','; str++) {
916 printk(KERN_ERR "slub_debug option '%c' " 920 switch (tolower(*str)) {
917 "unknown. skipped\n",*str); 921 case 'f':
918 } 922 slub_debug |= SLAB_DEBUG_FREE;
923 break;
924 case 'z':
925 slub_debug |= SLAB_RED_ZONE;
926 break;
927 case 'p':
928 slub_debug |= SLAB_POISON;
929 break;
930 case 'u':
931 slub_debug |= SLAB_STORE_USER;
932 break;
933 case 't':
934 slub_debug |= SLAB_TRACE;
935 break;
936 default:
937 printk(KERN_ERR "slub_debug option '%c' "
938 "unknown. skipped\n",*str);
939 }
919 } 940 }
920 941
942check_slabs:
921 if (*str == ',') 943 if (*str == ',')
922 slub_debug_slabs = str + 1; 944 slub_debug_slabs = str + 1;
945out:
923 return 1; 946 return 1;
924} 947}
925 948
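
The rewritten parser keeps the existing slub_debug= grammar: an optional set of single-letter options (f, z, p, u, t), optionally followed by a comma and a slab-name restriction, with '-' switching everything off. A few example boot command lines it accepts (the dentry cache is an arbitrary choice here):

	slub_debug                 enable all debug options for every slab
	slub_debug=,dentry         enable all debug options, only for the dentry cache
	slub_debug=FZ              sanity checks (F) and red zoning (Z) everywhere
	slub_debug=ZPU,dentry      red zoning, poisoning and user tracking for dentry only
	slub_debug=-               switch all debugging off
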
@@ -1798,8 +1821,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
1798 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 1821 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
1799 1822
1800 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); 1823 page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
1801 /* new_slab() disables interupts */
1802 local_irq_enable();
1803 1824
1804 BUG_ON(!page); 1825 BUG_ON(!page);
1805 n = page->freelist; 1826 n = page->freelist;
@@ -1811,6 +1832,12 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
1811 init_kmem_cache_node(n); 1832 init_kmem_cache_node(n);
1812 atomic_long_inc(&n->nr_slabs); 1833 atomic_long_inc(&n->nr_slabs);
1813 add_partial(n, page); 1834 add_partial(n, page);
1835
1836 /*
1837 * new_slab() disables interupts. If we do not reenable interrupts here
1838 * then bootup would continue with interrupts disabled.
1839 */
1840 local_irq_enable();
1814 return n; 1841 return n;
1815} 1842}
1816 1843
@@ -2016,7 +2043,6 @@ error:
2016 s->offset, flags); 2043 s->offset, flags);
2017 return 0; 2044 return 0;
2018} 2045}
2019EXPORT_SYMBOL(kmem_cache_open);
2020 2046
2021/* 2047/*
2022 * Check if a given pointer is valid 2048 * Check if a given pointer is valid
@@ -2241,7 +2267,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2241 2267
2242 if (s) 2268 if (s)
2243 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2269 return slab_alloc(s, flags, -1, __builtin_return_address(0));
2244 return NULL; 2270 return ZERO_SIZE_PTR;
2245} 2271}
2246EXPORT_SYMBOL(__kmalloc); 2272EXPORT_SYMBOL(__kmalloc);
2247 2273
@@ -2252,16 +2278,20 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
2252 2278
2253 if (s) 2279 if (s)
2254 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2280 return slab_alloc(s, flags, node, __builtin_return_address(0));
2255 return NULL; 2281 return ZERO_SIZE_PTR;
2256} 2282}
2257EXPORT_SYMBOL(__kmalloc_node); 2283EXPORT_SYMBOL(__kmalloc_node);
2258#endif 2284#endif
2259 2285
2260size_t ksize(const void *object) 2286size_t ksize(const void *object)
2261{ 2287{
2262 struct page *page = get_object_page(object); 2288 struct page *page;
2263 struct kmem_cache *s; 2289 struct kmem_cache *s;
2264 2290
2291 if (object == ZERO_SIZE_PTR)
2292 return 0;
2293
2294 page = get_object_page(object);
2265 BUG_ON(!page); 2295 BUG_ON(!page);
2266 s = page->slab; 2296 s = page->slab;
2267 BUG_ON(!s); 2297 BUG_ON(!s);
@@ -2293,7 +2323,13 @@ void kfree(const void *x)
2293 struct kmem_cache *s; 2323 struct kmem_cache *s;
2294 struct page *page; 2324 struct page *page;
2295 2325
2296 if (!x) 2326 /*
2327 * This has to be an unsigned comparison. According to Linus
2328	 * some gcc versions treat a pointer as a signed entity. Then
2329 * this comparison would be true for all "negative" pointers
2330 * (which would cover the whole upper half of the address space).
2331 */
2332 if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR)
2297 return; 2333 return;
2298 2334
2299 page = virt_to_head_page(x); 2335 page = virt_to_head_page(x);
@@ -2398,12 +2434,12 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
2398 void *ret; 2434 void *ret;
2399 size_t ks; 2435 size_t ks;
2400 2436
2401 if (unlikely(!p)) 2437 if (unlikely(!p || p == ZERO_SIZE_PTR))
2402 return kmalloc(new_size, flags); 2438 return kmalloc(new_size, flags);
2403 2439
2404 if (unlikely(!new_size)) { 2440 if (unlikely(!new_size)) {
2405 kfree(p); 2441 kfree(p);
2406 return NULL; 2442 return ZERO_SIZE_PTR;
2407 } 2443 }
2408 2444
2409 ks = ksize(p); 2445 ks = ksize(p);
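
Taken together, the __kmalloc(), ksize(), kfree() and krealloc() hunks give zero-byte requests a distinguished non-NULL return value. A hedged kernel-context sketch of the caller-visible contract (assuming, as the __kmalloc() hunk implies, that get_slab() finds no cache for a zero-byte request):

/* Sketch of the contract established above for zero-length allocations. */
#include <linux/kernel.h>
#include <linux/slab.h>

static void zero_size_contract(void)
{
	void *p = kmalloc(0, GFP_KERNEL);

	/* kmalloc(0) now returns ZERO_SIZE_PTR rather than NULL, so the
	 * usual "allocation failed" check does not fire for it ... */
	BUG_ON(p == NULL);

	/* ... it must never be dereferenced ... */

	/* ... but ksize() reports it as empty and kfree() accepts it. */
	BUG_ON(ksize(p) != 0);
	kfree(p);
}
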
@@ -2426,6 +2462,7 @@ EXPORT_SYMBOL(krealloc);
2426void __init kmem_cache_init(void) 2462void __init kmem_cache_init(void)
2427{ 2463{
2428 int i; 2464 int i;
2465 int caches = 0;
2429 2466
2430#ifdef CONFIG_NUMA 2467#ifdef CONFIG_NUMA
2431 /* 2468 /*
@@ -2436,20 +2473,29 @@ void __init kmem_cache_init(void)
2436 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 2473 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
2437 sizeof(struct kmem_cache_node), GFP_KERNEL); 2474 sizeof(struct kmem_cache_node), GFP_KERNEL);
2438 kmalloc_caches[0].refcount = -1; 2475 kmalloc_caches[0].refcount = -1;
2476 caches++;
2439#endif 2477#endif
2440 2478
2441 /* Able to allocate the per node structures */ 2479 /* Able to allocate the per node structures */
2442 slab_state = PARTIAL; 2480 slab_state = PARTIAL;
2443 2481
2444 /* Caches that are not of the two-to-the-power-of size */ 2482 /* Caches that are not of the two-to-the-power-of size */
2445 create_kmalloc_cache(&kmalloc_caches[1], 2483 if (KMALLOC_MIN_SIZE <= 64) {
2484 create_kmalloc_cache(&kmalloc_caches[1],
2446 "kmalloc-96", 96, GFP_KERNEL); 2485 "kmalloc-96", 96, GFP_KERNEL);
2447 create_kmalloc_cache(&kmalloc_caches[2], 2486 caches++;
2487 }
2488 if (KMALLOC_MIN_SIZE <= 128) {
2489 create_kmalloc_cache(&kmalloc_caches[2],
2448 "kmalloc-192", 192, GFP_KERNEL); 2490 "kmalloc-192", 192, GFP_KERNEL);
2491 caches++;
2492 }
2449 2493
2450 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2494 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
2451 create_kmalloc_cache(&kmalloc_caches[i], 2495 create_kmalloc_cache(&kmalloc_caches[i],
2452 "kmalloc", 1 << i, GFP_KERNEL); 2496 "kmalloc", 1 << i, GFP_KERNEL);
2497 caches++;
2498 }
2453 2499
2454 slab_state = UP; 2500 slab_state = UP;
2455 2501
@@ -2466,8 +2512,8 @@ void __init kmem_cache_init(void)
2466 nr_cpu_ids * sizeof(struct page *); 2512 nr_cpu_ids * sizeof(struct page *);
2467 2513
2468 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2514 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2469 " Processors=%d, Nodes=%d\n", 2515 " CPUs=%d, Nodes=%d\n",
2470 KMALLOC_SHIFT_HIGH, cache_line_size(), 2516 caches, cache_line_size(),
2471 slub_min_order, slub_max_order, slub_min_objects, 2517 slub_min_order, slub_max_order, slub_min_objects,
2472 nr_cpu_ids, nr_node_ids); 2518 nr_cpu_ids, nr_node_ids);
2473} 2519}
@@ -2652,7 +2698,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2652 struct kmem_cache *s = get_slab(size, gfpflags); 2698 struct kmem_cache *s = get_slab(size, gfpflags);
2653 2699
2654 if (!s) 2700 if (!s)
2655 return NULL; 2701 return ZERO_SIZE_PTR;
2656 2702
2657 return slab_alloc(s, gfpflags, -1, caller); 2703 return slab_alloc(s, gfpflags, -1, caller);
2658} 2704}
@@ -2663,7 +2709,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2663 struct kmem_cache *s = get_slab(size, gfpflags); 2709 struct kmem_cache *s = get_slab(size, gfpflags);
2664 2710
2665 if (!s) 2711 if (!s)
2666 return NULL; 2712 return ZERO_SIZE_PTR;
2667 2713
2668 return slab_alloc(s, gfpflags, node, caller); 2714 return slab_alloc(s, gfpflags, node, caller);
2669} 2715}
@@ -2857,7 +2903,7 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max)
2857 2903
2858 order = get_order(sizeof(struct location) * max); 2904 order = get_order(sizeof(struct location) * max);
2859 2905
2860 l = (void *)__get_free_pages(GFP_KERNEL, order); 2906 l = (void *)__get_free_pages(GFP_ATOMIC, order);
2861 2907
2862 if (!l) 2908 if (!l)
2863 return 0; 2909 return 0;
@@ -3022,13 +3068,15 @@ static int list_locations(struct kmem_cache *s, char *buf,
3022 n += sprintf(buf + n, " pid=%ld", 3068 n += sprintf(buf + n, " pid=%ld",
3023 l->min_pid); 3069 l->min_pid);
3024 3070
3025 if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { 3071 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) &&
3072 n < PAGE_SIZE - 60) {
3026 n += sprintf(buf + n, " cpus="); 3073 n += sprintf(buf + n, " cpus=");
3027 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3074 n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50,
3028 l->cpus); 3075 l->cpus);
3029 } 3076 }
3030 3077
3031 if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { 3078 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
3079 n < PAGE_SIZE - 60) {
3032 n += sprintf(buf + n, " nodes="); 3080 n += sprintf(buf + n, " nodes=");
3033 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, 3081 n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50,
3034 l->nodes); 3082 l->nodes);
diff --git a/mm/sparse.c b/mm/sparse.c
index 545e4d3afcdf..e03b39f3540f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -240,6 +240,27 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
240 return NULL; 240 return NULL;
241} 241}
242 242
243/*
244 * Allocate the accumulated non-linear sections, allocate a mem_map
245 * for each and record the physical to section mapping.
246 */
247void __init sparse_init(void)
248{
249 unsigned long pnum;
250 struct page *map;
251
252 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
253 if (!valid_section_nr(pnum))
254 continue;
255
256 map = sparse_early_mem_map_alloc(pnum);
257 if (!map)
258 continue;
259 sparse_init_one_section(__nr_to_section(pnum), pnum, map);
260 }
261}
262
263#ifdef CONFIG_MEMORY_HOTPLUG
243static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 264static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
244{ 265{
245 struct page *page, *ret; 266 struct page *page, *ret;
@@ -280,27 +301,6 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
280} 301}
281 302
282/* 303/*
283 * Allocate the accumulated non-linear sections, allocate a mem_map
284 * for each and record the physical to section mapping.
285 */
286void __init sparse_init(void)
287{
288 unsigned long pnum;
289 struct page *map;
290
291 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
292 if (!valid_section_nr(pnum))
293 continue;
294
295 map = sparse_early_mem_map_alloc(pnum);
296 if (!map)
297 continue;
298 sparse_init_one_section(__nr_to_section(pnum), pnum, map);
299 }
300}
301
302#ifdef CONFIG_MEMORY_HOTPLUG
303/*
304 * returns the number of sections whose mem_maps were properly 304 * returns the number of sections whose mem_maps were properly
305 * set. If this is <=0, then that means that the passed-in 305 * set. If this is <=0, then that means that the passed-in
306 * map was not consumed and must be freed. 306 * map was not consumed and must be freed.
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5f7cf2a4cb55..925d5c50f18d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,7 +21,7 @@
21 21
22/* 22/*
23 * swapper_space is a fiction, retained to simplify the path through 23 * swapper_space is a fiction, retained to simplify the path through
24 * vmscan's shrink_list, to make sync_page look nicer, and to allow 24 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow
25 * future use of radix_tree tags in the swap cache. 25 * future use of radix_tree tags in the swap cache.
26 */ 26 */
27static const struct address_space_operations swap_aops = { 27static const struct address_space_operations swap_aops = {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index acc172cbe3aa..7ff0a81c7b01 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type)
885 /* 885 /*
886 * So we could skip searching mms once swap count went 886 * So we could skip searching mms once swap count went
887 * to 1, we did not mark any present ptes as dirty: must 887 * to 1, we did not mark any present ptes as dirty: must
888 * mark page dirty so shrink_list will preserve it. 888 * mark page dirty so shrink_page_list will preserve it.
889 */ 889 */
890 SetPageDirty(page); 890 SetPageDirty(page);
891 unlock_page(page); 891 unlock_page(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index 4fbe1a2da5fb..7c994f2d6145 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
253} 253}
254EXPORT_SYMBOL(truncate_inode_pages); 254EXPORT_SYMBOL(truncate_inode_pages);
255 255
256/** 256unsigned long __invalidate_mapping_pages(struct address_space *mapping,
257 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode 257 pgoff_t start, pgoff_t end, bool be_atomic)
258 * @mapping: the address_space which holds the pages to invalidate
259 * @start: the offset 'from' which to invalidate
260 * @end: the offset 'to' which to invalidate (inclusive)
261 *
262 * This function only removes the unlocked pages, if you want to
263 * remove all the pages of one inode, you must call truncate_inode_pages.
264 *
265 * invalidate_mapping_pages() will not block on IO activity. It will not
266 * invalidate pages which are dirty, locked, under writeback or mapped into
267 * pagetables.
268 */
269unsigned long invalidate_mapping_pages(struct address_space *mapping,
270 pgoff_t start, pgoff_t end)
271{ 258{
272 struct pagevec pvec; 259 struct pagevec pvec;
273 pgoff_t next = start; 260 pgoff_t next = start;
@@ -308,17 +295,38 @@ unlock:
308 break; 295 break;
309 } 296 }
310 pagevec_release(&pvec); 297 pagevec_release(&pvec);
298 if (likely(!be_atomic))
299 cond_resched();
311 } 300 }
312 return ret; 301 return ret;
313} 302}
303
304/**
305 * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
306 * @mapping: the address_space which holds the pages to invalidate
307 * @start: the offset 'from' which to invalidate
308 * @end: the offset 'to' which to invalidate (inclusive)
309 *
310 * This function only removes the unlocked pages, if you want to
311 * remove all the pages of one inode, you must call truncate_inode_pages.
312 *
313 * invalidate_mapping_pages() will not block on IO activity. It will not
314 * invalidate pages which are dirty, locked, under writeback or mapped into
315 * pagetables.
316 */
317unsigned long invalidate_mapping_pages(struct address_space *mapping,
318 pgoff_t start, pgoff_t end)
319{
320 return __invalidate_mapping_pages(mapping, start, end, false);
321}
314EXPORT_SYMBOL(invalidate_mapping_pages); 322EXPORT_SYMBOL(invalidate_mapping_pages);
315 323
316/* 324/*
317 * This is like invalidate_complete_page(), except it ignores the page's 325 * This is like invalidate_complete_page(), except it ignores the page's
318 * refcount. We do this because invalidate_inode_pages2() needs stronger 326 * refcount. We do this because invalidate_inode_pages2() needs stronger
319 * invalidation guarantees, and cannot afford to leave pages behind because 327 * invalidation guarantees, and cannot afford to leave pages behind because
320 * shrink_list() has a temp ref on them, or because they're transiently sitting 328 * shrink_page_list() has a temp ref on them, or because they're transiently
321 * in the lru_cache_add() pagevecs. 329 * sitting in the lru_cache_add() pagevecs.
322 */ 330 */
323static int 331static int
324invalidate_complete_page2(struct address_space *mapping, struct page *page) 332invalidate_complete_page2(struct address_space *mapping, struct page *page)
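
The split keeps the exported invalidate_mapping_pages() interface unchanged while letting in-kernel callers that cannot sleep pass be_atomic to suppress the cond_resched() between pagevec batches. A hedged sketch of both call styles (the surrounding functions are hypothetical, and both entry points are assumed to be declared in <linux/fs.h>):

/* Sketch of how the two entry points above are meant to be used. */
#include <linux/fs.h>

static void drop_clean_cache(struct address_space *mapping)
{
	/* Normal, sleepable context: the wrapper may reschedule between
	 * pagevec batches. */
	invalidate_mapping_pages(mapping, 0, -1);
}

static void drop_clean_cache_atomic(struct address_space *mapping)
{
	/* A caller that must not sleep passes be_atomic=true, which skips
	 * the cond_resched() calls. */
	__invalidate_mapping_pages(mapping, 0, -1, true);
}
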
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 38254297a494..eceaf496210f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -477,8 +477,8 @@ const struct seq_operations fragmentation_op = {
477static const char * const vmstat_text[] = { 477static const char * const vmstat_text[] = {
478 /* Zoned VM counters */ 478 /* Zoned VM counters */
479 "nr_free_pages", 479 "nr_free_pages",
480 "nr_active",
481 "nr_inactive", 480 "nr_inactive",
481 "nr_active",
482 "nr_anon_pages", 482 "nr_anon_pages",
483 "nr_mapped", 483 "nr_mapped",
484 "nr_file_pages", 484 "nr_file_pages",