Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   6
-rw-r--r--  mm/backing-dev.c     |  16
-rw-r--r--  mm/filemap.c         |   5
-rw-r--r--  mm/hugetlb.c         |  17
-rw-r--r--  mm/madvise.c         |   6
-rw-r--r--  mm/memory.c          |  12
-rw-r--r--  mm/mempolicy.c       |  51
-rw-r--r--  mm/mempool.c         |   3
-rw-r--r--  mm/mlock.c           |   5
-rw-r--r--  mm/mmap.c            |  25
-rw-r--r--  mm/nommu.c           |   5
-rw-r--r--  mm/page-writeback.c  |  10
-rw-r--r--  mm/page_alloc.c      | 330
-rw-r--r--  mm/slab.c            |  32
-rw-r--r--  mm/slob.c            | 538
-rw-r--r--  mm/slub.c            |  79
-rw-r--r--  mm/swap_state.c      |   2
-rw-r--r--  mm/swapfile.c        |   2
-rw-r--r--  mm/truncate.c        |  42
19 files changed, 868 insertions, 318 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 8ac412b45f18..086af703da43 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -117,7 +117,7 @@ config MEMORY_HOTPLUG | |||
117 | bool "Allow for memory hot-add" | 117 | bool "Allow for memory hot-add" |
118 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 118 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
119 | depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG | 119 | depends on HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG |
120 | depends on (IA64 || X86 || PPC64) | 120 | depends on (IA64 || X86 || PPC64 || SUPERH) |
121 | 121 | ||
122 | comment "Memory hotplug is currently incompatible with Software Suspend" | 122 | comment "Memory hotplug is currently incompatible with Software Suspend" |
123 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | 123 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND |
@@ -168,3 +168,7 @@ config NR_QUICK | |||
168 | depends on QUICKLIST | 168 | depends on QUICKLIST |
169 | default "2" if (SUPERH && !SUPERH64) | 169 | default "2" if (SUPERH && !SUPERH64) |
170 | default "1" | 170 | default "1" |
171 | |||
172 | config VIRT_TO_BUS | ||
173 | def_bool y | ||
174 | depends on !ARCH_NO_VIRT_TO_BUS | ||
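The new CONFIG_VIRT_TO_BUS symbol is on by default and disappears only on architectures that set ARCH_NO_VIRT_TO_BUS, so code that still relies on the legacy bus-address helpers has a single symbol to test. A hedged sketch of the consumer side (legacy_bus_addr() is an invented helper, not part of this patch):

#include <linux/types.h>
#include <asm/io.h>	/* virt_to_bus() on architectures that still provide it */

/*
 * Return the legacy bus address of a kernel-virtual buffer, or 0 when the
 * architecture opted out via ARCH_NO_VIRT_TO_BUS and the caller must use
 * the DMA mapping API instead.
 */
static unsigned long legacy_bus_addr(void *buf)
{
#ifdef CONFIG_VIRT_TO_BUS
	return virt_to_bus(buf);
#else
	return 0;
#endif
}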
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e5de3781d3fe..f50a2811f9dc 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout) | |||
55 | } | 55 | } |
56 | EXPORT_SYMBOL(congestion_wait); | 56 | EXPORT_SYMBOL(congestion_wait); |
57 | 57 | ||
58 | long congestion_wait_interruptible(int rw, long timeout) | ||
59 | { | ||
60 | long ret; | ||
61 | DEFINE_WAIT(wait); | ||
62 | wait_queue_head_t *wqh = &congestion_wqh[rw]; | ||
63 | |||
64 | prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE); | ||
65 | if (signal_pending(current)) | ||
66 | ret = -ERESTARTSYS; | ||
67 | else | ||
68 | ret = io_schedule_timeout(timeout); | ||
69 | finish_wait(wqh, &wait); | ||
70 | return ret; | ||
71 | } | ||
72 | EXPORT_SYMBOL(congestion_wait_interruptible); | ||
73 | |||
74 | /** | 58 | /** |
75 | * congestion_end - wake up sleepers on a congested backing_dev_info | 59 | * congestion_end - wake up sleepers on a congested backing_dev_info |
76 | * @rw: READ or WRITE | 60 | * @rw: READ or WRITE |
diff --git a/mm/filemap.c b/mm/filemap.c
index c6ebd9f912ab..100b99c2d504 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page) | |||
120 | page->mapping = NULL; | 120 | page->mapping = NULL; |
121 | mapping->nrpages--; | 121 | mapping->nrpages--; |
122 | __dec_zone_page_state(page, NR_FILE_PAGES); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); |
123 | BUG_ON(page_mapped(page)); | ||
123 | } | 124 | } |
124 | 125 | ||
125 | void remove_from_page_cache(struct page *page) | 126 | void remove_from_page_cache(struct page *page) |
@@ -1218,6 +1219,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1218 | retval = retval ?: desc.error; | 1219 | retval = retval ?: desc.error; |
1219 | break; | 1220 | break; |
1220 | } | 1221 | } |
1222 | if (desc.count > 0) | ||
1223 | break; | ||
1221 | } | 1224 | } |
1222 | } | 1225 | } |
1223 | out: | 1226 | out: |
@@ -1964,7 +1967,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
1964 | if (unlikely(*pos + *count > MAX_NON_LFS && | 1967 | if (unlikely(*pos + *count > MAX_NON_LFS && |
1965 | !(file->f_flags & O_LARGEFILE))) { | 1968 | !(file->f_flags & O_LARGEFILE))) { |
1966 | if (*pos >= MAX_NON_LFS) { | 1969 | if (*pos >= MAX_NON_LFS) { |
1967 | send_sig(SIGXFSZ, current, 0); | ||
1968 | return -EFBIG; | 1970 | return -EFBIG; |
1969 | } | 1971 | } |
1970 | if (*count > MAX_NON_LFS - (unsigned long)*pos) { | 1972 | if (*count > MAX_NON_LFS - (unsigned long)*pos) { |
@@ -1982,7 +1984,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
1982 | if (likely(!isblk)) { | 1984 | if (likely(!isblk)) { |
1983 | if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { | 1985 | if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { |
1984 | if (*count || *pos > inode->i_sb->s_maxbytes) { | 1986 | if (*count || *pos > inode->i_sb->s_maxbytes) { |
1985 | send_sig(SIGXFSZ, current, 0); | ||
1986 | return -EFBIG; | 1987 | return -EFBIG; |
1987 | } | 1988 | } |
1988 | /* zero-length writes at ->s_maxbytes are OK */ | 1989 | /* zero-length writes at ->s_maxbytes are OK */ |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a45d1f0691ce..acc0fb3cf067 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,7 +66,7 @@ static void enqueue_huge_page(struct page *page) | |||
66 | static struct page *dequeue_huge_page(struct vm_area_struct *vma, | 66 | static struct page *dequeue_huge_page(struct vm_area_struct *vma, |
67 | unsigned long address) | 67 | unsigned long address) |
68 | { | 68 | { |
69 | int nid = numa_node_id(); | 69 | int nid; |
70 | struct page *page = NULL; | 70 | struct page *page = NULL; |
71 | struct zonelist *zonelist = huge_zonelist(vma, address); | 71 | struct zonelist *zonelist = huge_zonelist(vma, address); |
72 | struct zone **z; | 72 | struct zone **z; |
@@ -101,13 +101,20 @@ static void free_huge_page(struct page *page) | |||
101 | 101 | ||
102 | static int alloc_fresh_huge_page(void) | 102 | static int alloc_fresh_huge_page(void) |
103 | { | 103 | { |
104 | static int nid = 0; | 104 | static int prev_nid; |
105 | struct page *page; | 105 | struct page *page; |
106 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, | 106 | static DEFINE_SPINLOCK(nid_lock); |
107 | HUGETLB_PAGE_ORDER); | 107 | int nid; |
108 | nid = next_node(nid, node_online_map); | 108 | |
109 | spin_lock(&nid_lock); | ||
110 | nid = next_node(prev_nid, node_online_map); | ||
109 | if (nid == MAX_NUMNODES) | 111 | if (nid == MAX_NUMNODES) |
110 | nid = first_node(node_online_map); | 112 | nid = first_node(node_online_map); |
113 | prev_nid = nid; | ||
114 | spin_unlock(&nid_lock); | ||
115 | |||
116 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, | ||
117 | HUGETLB_PAGE_ORDER); | ||
111 | if (page) { | 118 | if (page) { |
112 | set_compound_page_dtor(page, free_huge_page); | 119 | set_compound_page_dtor(page, free_huge_page); |
113 | spin_lock(&hugetlb_lock); | 120 | spin_lock(&hugetlb_lock); |
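Previously alloc_fresh_huge_page() advanced its static node cursor with no locking, so two CPUs growing the huge page pool at the same time could both allocate from the same node and skew the spread. The version above picks and advances the cursor under a private spinlock before calling alloc_pages_node(). The same round-robin pattern in isolation (rr_lock, rr_prev_nid and next_rr_nid() are illustrative names, not hugetlb code):

#include <linux/nodemask.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(rr_lock);
static int rr_prev_nid;

/* Hand out online node ids round-robin, safely against concurrent callers. */
static int next_rr_nid(void)
{
	int nid;

	spin_lock(&rr_lock);
	nid = next_node(rr_prev_nid, node_online_map);
	if (nid == MAX_NUMNODES)
		nid = first_node(node_online_map);
	rr_prev_nid = nid;
	spin_unlock(&rr_lock);

	return nid;
}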
diff --git a/mm/madvise.c b/mm/madvise.c
index 60542d006ec1..93ee375b38e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -287,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | |||
287 | struct vm_area_struct * vma, *prev; | 287 | struct vm_area_struct * vma, *prev; |
288 | int unmapped_error = 0; | 288 | int unmapped_error = 0; |
289 | int error = -EINVAL; | 289 | int error = -EINVAL; |
290 | int write; | ||
290 | size_t len; | 291 | size_t len; |
291 | 292 | ||
292 | if (madvise_need_mmap_write(behavior)) | 293 | write = madvise_need_mmap_write(behavior); |
294 | if (write) | ||
293 | down_write(&current->mm->mmap_sem); | 295 | down_write(&current->mm->mmap_sem); |
294 | else | 296 | else |
295 | down_read(&current->mm->mmap_sem); | 297 | down_read(&current->mm->mmap_sem); |
@@ -354,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) | |||
354 | vma = find_vma(current->mm, start); | 356 | vma = find_vma(current->mm, start); |
355 | } | 357 | } |
356 | out: | 358 | out: |
357 | if (madvise_need_mmap_write(behavior)) | 359 | if (write) |
358 | up_write(&current->mm->mmap_sem); | 360 | up_write(&current->mm->mmap_sem); |
359 | else | 361 | else |
360 | up_read(&current->mm->mmap_sem); | 362 | up_read(&current->mm->mmap_sem); |
diff --git a/mm/memory.c b/mm/memory.c
index f64cbf9baa36..b3d73bb1f680 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages; | |||
78 | * and ZONE_HIGHMEM. | 78 | * and ZONE_HIGHMEM. |
79 | */ | 79 | */ |
80 | void * high_memory; | 80 | void * high_memory; |
81 | unsigned long vmalloc_earlyreserve; | ||
82 | 81 | ||
83 | EXPORT_SYMBOL(num_physpages); | 82 | EXPORT_SYMBOL(num_physpages); |
84 | EXPORT_SYMBOL(high_memory); | 83 | EXPORT_SYMBOL(high_memory); |
85 | EXPORT_SYMBOL(vmalloc_earlyreserve); | ||
86 | 84 | ||
87 | int randomize_va_space __read_mostly = 1; | 85 | int randomize_va_space __read_mostly = 1; |
88 | 86 | ||
@@ -1055,6 +1053,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
1055 | do { | 1053 | do { |
1056 | struct page *page; | 1054 | struct page *page; |
1057 | 1055 | ||
1056 | /* | ||
1057 | * If tsk is ooming, cut off its access to large memory | ||
1058 | * allocations. It has a pending SIGKILL, but it can't | ||
1059 | * be processed until returning to user space. | ||
1060 | */ | ||
1061 | if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) | ||
1062 | return -ENOMEM; | ||
1063 | |||
1058 | if (write) | 1064 | if (write) |
1059 | foll_flags |= FOLL_WRITE; | 1065 | foll_flags |= FOLL_WRITE; |
1060 | 1066 | ||
@@ -2673,7 +2679,7 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
2673 | write = (vma->vm_flags & VM_WRITE) != 0; | 2679 | write = (vma->vm_flags & VM_WRITE) != 0; |
2674 | BUG_ON(addr >= end); | 2680 | BUG_ON(addr >= end); |
2675 | BUG_ON(end > vma->vm_end); | 2681 | BUG_ON(end > vma->vm_end); |
2676 | len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE; | 2682 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
2677 | ret = get_user_pages(current, current->mm, addr, | 2683 | ret = get_user_pages(current, current->mm, addr, |
2678 | len, write, 0, NULL, NULL); | 2684 | len, write, 0, NULL, NULL); |
2679 | if (ret < 0) | 2685 | if (ret < 0) |
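Two small behavioural points above: get_user_pages() now returns -ENOMEM as soon as the target task carries TIF_MEMDIE (it is already being OOM-killed, so it should not be allowed to pin further pages), and make_pages_present() spells its page count with the kernel.h DIV_ROUND_UP() helper, which is pure notation:

/* include/linux/kernel.h: #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d)) */
/*
 * Example with PAGE_SIZE = 0x1000, addr = 0x1000, end = 0x2001:
 *   old: (0x2001 + 0x1000 - 1) / 0x1000 - 0x1000 / 0x1000 = 3 - 1 = 2 pages
 *   new: DIV_ROUND_UP(0x2001, 0x1000) - 0x1000 / 0x1000   = 3 - 1 = 2 pages
 */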
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d76e8eb342d0..188f8d9c4aed 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -101,8 +101,6 @@ | |||
101 | static struct kmem_cache *policy_cache; | 101 | static struct kmem_cache *policy_cache; |
102 | static struct kmem_cache *sn_cache; | 102 | static struct kmem_cache *sn_cache; |
103 | 103 | ||
104 | #define PDprintk(fmt...) | ||
105 | |||
106 | /* Highest zone. An specific allocation for a zone below that is not | 104 | /* Highest zone. An specific allocation for a zone below that is not |
107 | policied. */ | 105 | policied. */ |
108 | enum zone_type policy_zone = 0; | 106 | enum zone_type policy_zone = 0; |
@@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
175 | { | 173 | { |
176 | struct mempolicy *policy; | 174 | struct mempolicy *policy; |
177 | 175 | ||
178 | PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); | 176 | pr_debug("setting mode %d nodes[0] %lx\n", |
177 | mode, nodes ? nodes_addr(*nodes)[0] : -1); | ||
178 | |||
179 | if (mode == MPOL_DEFAULT) | 179 | if (mode == MPOL_DEFAULT) |
180 | return NULL; | 180 | return NULL; |
181 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); | 181 | policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); |
@@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | |||
379 | int err = 0; | 379 | int err = 0; |
380 | struct mempolicy *old = vma->vm_policy; | 380 | struct mempolicy *old = vma->vm_policy; |
381 | 381 | ||
382 | PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", | 382 | pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", |
383 | vma->vm_start, vma->vm_end, vma->vm_pgoff, | 383 | vma->vm_start, vma->vm_end, vma->vm_pgoff, |
384 | vma->vm_ops, vma->vm_file, | 384 | vma->vm_ops, vma->vm_file, |
385 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); | 385 | vma->vm_ops ? vma->vm_ops->set_policy : NULL); |
@@ -776,8 +776,8 @@ long do_mbind(unsigned long start, unsigned long len, | |||
776 | if (!new) | 776 | if (!new) |
777 | flags |= MPOL_MF_DISCONTIG_OK; | 777 | flags |= MPOL_MF_DISCONTIG_OK; |
778 | 778 | ||
779 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | 779 | pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, |
780 | mode,nodes_addr(nodes)[0]); | 780 | mode, nmask ? nodes_addr(*nmask)[0] : -1); |
781 | 781 | ||
782 | down_write(&mm->mmap_sem); | 782 | down_write(&mm->mmap_sem); |
783 | vma = check_range(mm, start, end, nmask, | 783 | vma = check_range(mm, start, end, nmask, |
@@ -1434,7 +1434,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) | |||
1434 | } | 1434 | } |
1435 | rb_link_node(&new->nd, parent, p); | 1435 | rb_link_node(&new->nd, parent, p); |
1436 | rb_insert_color(&new->nd, &sp->root); | 1436 | rb_insert_color(&new->nd, &sp->root); |
1437 | PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, | 1437 | pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, |
1438 | new->policy ? new->policy->policy : 0); | 1438 | new->policy ? new->policy->policy : 0); |
1439 | } | 1439 | } |
1440 | 1440 | ||
@@ -1459,7 +1459,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) | |||
1459 | 1459 | ||
1460 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) | 1460 | static void sp_delete(struct shared_policy *sp, struct sp_node *n) |
1461 | { | 1461 | { |
1462 | PDprintk("deleting %lx-l%x\n", n->start, n->end); | 1462 | pr_debug("deleting %lx-l%lx\n", n->start, n->end); |
1463 | rb_erase(&n->nd, &sp->root); | 1463 | rb_erase(&n->nd, &sp->root); |
1464 | mpol_free(n->policy); | 1464 | mpol_free(n->policy); |
1465 | kmem_cache_free(sn_cache, n); | 1465 | kmem_cache_free(sn_cache, n); |
@@ -1558,10 +1558,10 @@ int mpol_set_shared_policy(struct shared_policy *info, | |||
1558 | struct sp_node *new = NULL; | 1558 | struct sp_node *new = NULL; |
1559 | unsigned long sz = vma_pages(vma); | 1559 | unsigned long sz = vma_pages(vma); |
1560 | 1560 | ||
1561 | PDprintk("set_shared_policy %lx sz %lu %d %lx\n", | 1561 | pr_debug("set_shared_policy %lx sz %lu %d %lx\n", |
1562 | vma->vm_pgoff, | 1562 | vma->vm_pgoff, |
1563 | sz, npol? npol->policy : -1, | 1563 | sz, npol? npol->policy : -1, |
1564 | npol ? nodes_addr(npol->v.nodes)[0] : -1); | 1564 | npol ? nodes_addr(npol->v.nodes)[0] : -1); |
1565 | 1565 | ||
1566 | if (npol) { | 1566 | if (npol) { |
1567 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); | 1567 | new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); |
@@ -1597,6 +1597,10 @@ void mpol_free_shared_policy(struct shared_policy *p) | |||
1597 | /* assumes fs == KERNEL_DS */ | 1597 | /* assumes fs == KERNEL_DS */ |
1598 | void __init numa_policy_init(void) | 1598 | void __init numa_policy_init(void) |
1599 | { | 1599 | { |
1600 | nodemask_t interleave_nodes; | ||
1601 | unsigned long largest = 0; | ||
1602 | int nid, prefer = 0; | ||
1603 | |||
1600 | policy_cache = kmem_cache_create("numa_policy", | 1604 | policy_cache = kmem_cache_create("numa_policy", |
1601 | sizeof(struct mempolicy), | 1605 | sizeof(struct mempolicy), |
1602 | 0, SLAB_PANIC, NULL, NULL); | 1606 | 0, SLAB_PANIC, NULL, NULL); |
@@ -1605,10 +1609,31 @@ void __init numa_policy_init(void) | |||
1605 | sizeof(struct sp_node), | 1609 | sizeof(struct sp_node), |
1606 | 0, SLAB_PANIC, NULL, NULL); | 1610 | 0, SLAB_PANIC, NULL, NULL); |
1607 | 1611 | ||
1608 | /* Set interleaving policy for system init. This way not all | 1612 | /* |
1609 | the data structures allocated at system boot end up in node zero. */ | 1613 | * Set interleaving policy for system init. Interleaving is only |
1614 | * enabled across suitably sized nodes (default is >= 16MB), or | ||
1615 | * fall back to the largest node if they're all smaller. | ||
1616 | */ | ||
1617 | nodes_clear(interleave_nodes); | ||
1618 | for_each_online_node(nid) { | ||
1619 | unsigned long total_pages = node_present_pages(nid); | ||
1620 | |||
1621 | /* Preserve the largest node */ | ||
1622 | if (largest < total_pages) { | ||
1623 | largest = total_pages; | ||
1624 | prefer = nid; | ||
1625 | } | ||
1626 | |||
1627 | /* Interleave this node? */ | ||
1628 | if ((total_pages << PAGE_SHIFT) >= (16 << 20)) | ||
1629 | node_set(nid, interleave_nodes); | ||
1630 | } | ||
1631 | |||
1632 | /* All too small, use the largest */ | ||
1633 | if (unlikely(nodes_empty(interleave_nodes))) | ||
1634 | node_set(prefer, interleave_nodes); | ||
1610 | 1635 | ||
1611 | if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) | 1636 | if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) |
1612 | printk("numa_policy_init: interleaving failed\n"); | 1637 | printk("numa_policy_init: interleaving failed\n"); |
1613 | } | 1638 | } |
1614 | 1639 | ||
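Worked example for the new boot-time interleave mask: on a box whose online nodes have 8 MB, 12 MB and 2 GB present, only the 2 GB node clears the 16 MB threshold, so init-time allocations are steered away from the tiny nodes instead of being interleaved across all of node_online_map as before; and if every node were under 16 MB, the fallback path interleaves over just the single largest node.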
diff --git a/mm/mempool.c b/mm/mempool.c
index cc1ca86dfc24..3e8f1fed0e1f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -263,6 +263,9 @@ void mempool_free(void *element, mempool_t *pool) | |||
263 | { | 263 | { |
264 | unsigned long flags; | 264 | unsigned long flags; |
265 | 265 | ||
266 | if (unlikely(element == NULL)) | ||
267 | return; | ||
268 | |||
266 | smp_mb(); | 269 | smp_mb(); |
267 | if (pool->curr_nr < pool->min_nr) { | 270 | if (pool->curr_nr < pool->min_nr) { |
268 | spin_lock_irqsave(&pool->lock, flags); | 271 | spin_lock_irqsave(&pool->lock, flags); |
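Letting mempool_free() accept NULL mirrors kfree(NULL) and lets error paths release elements unconditionally. A hypothetical caller, assuming an already-created mempool_t *pool:

	void *a = mempool_alloc(pool, GFP_ATOMIC);	/* may return NULL */
	void *b = mempool_alloc(pool, GFP_ATOMIC);	/* may return NULL */

	if (!a || !b)
		goto out;
	/* ... use a and b ... */
out:
	mempool_free(b, pool);	/* now safe even when b is NULL */
	mempool_free(a, pool);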
diff --git a/mm/mlock.c b/mm/mlock.c
index 4d3fea267e0d..7b2656055d6a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -244,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user) | |||
244 | 244 | ||
245 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 245 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
246 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 246 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; |
247 | if (lock_limit == RLIM_INFINITY) | ||
248 | allowed = 1; | ||
247 | lock_limit >>= PAGE_SHIFT; | 249 | lock_limit >>= PAGE_SHIFT; |
248 | spin_lock(&shmlock_user_lock); | 250 | spin_lock(&shmlock_user_lock); |
249 | if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) | 251 | if (!allowed && |
252 | locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) | ||
250 | goto out; | 253 | goto out; |
251 | get_uid(user); | 254 | get_uid(user); |
252 | user->locked_shm += locked; | 255 | user->locked_shm += locked; |
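The explicit RLIM_INFINITY test matters because the "infinite" rlimit is an all-ones value rather than a flag: shifting it right by PAGE_SHIFT still produces a finite page count, so a sufficiently large SHM_LOCK request could be refused (or require CAP_IPC_LOCK) even though the administrator had set the memlock limit to unlimited. Treating RLIM_INFINITY as "always allowed" up front avoids that.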
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
894 | unsigned long flags, unsigned long pgoff) | 894 | unsigned long flags, unsigned long pgoff) |
895 | { | 895 | { |
896 | struct mm_struct * mm = current->mm; | 896 | struct mm_struct * mm = current->mm; |
897 | struct vm_area_struct * vma, * prev; | ||
898 | struct inode *inode; | 897 | struct inode *inode; |
899 | unsigned int vm_flags; | 898 | unsigned int vm_flags; |
900 | int correct_wcount = 0; | ||
901 | int error; | 899 | int error; |
902 | struct rb_node ** rb_link, * rb_parent; | ||
903 | int accountable = 1; | 900 | int accountable = 1; |
904 | unsigned long charged = 0, reqprot = prot; | 901 | unsigned long reqprot = prot; |
905 | 902 | ||
906 | /* | 903 | /* |
907 | * Does the application expect PROT_READ to imply PROT_EXEC? | 904 | * Does the application expect PROT_READ to imply PROT_EXEC? |
@@ -1027,6 +1024,24 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, | |||
1027 | if (error) | 1024 | if (error) |
1028 | return error; | 1025 | return error; |
1029 | 1026 | ||
1027 | return mmap_region(file, addr, len, flags, vm_flags, pgoff, | ||
1028 | accountable); | ||
1029 | } | ||
1030 | EXPORT_SYMBOL(do_mmap_pgoff); | ||
1031 | |||
1032 | unsigned long mmap_region(struct file *file, unsigned long addr, | ||
1033 | unsigned long len, unsigned long flags, | ||
1034 | unsigned int vm_flags, unsigned long pgoff, | ||
1035 | int accountable) | ||
1036 | { | ||
1037 | struct mm_struct *mm = current->mm; | ||
1038 | struct vm_area_struct *vma, *prev; | ||
1039 | int correct_wcount = 0; | ||
1040 | int error; | ||
1041 | struct rb_node **rb_link, *rb_parent; | ||
1042 | unsigned long charged = 0; | ||
1043 | struct inode *inode = file ? file->f_path.dentry->d_inode : NULL; | ||
1044 | |||
1030 | /* Clear old maps */ | 1045 | /* Clear old maps */ |
1031 | error = -ENOMEM; | 1046 | error = -ENOMEM; |
1032 | munmap_back: | 1047 | munmap_back: |
@@ -1175,8 +1190,6 @@ unacct_error: | |||
1175 | return error; | 1190 | return error; |
1176 | } | 1191 | } |
1177 | 1192 | ||
1178 | EXPORT_SYMBOL(do_mmap_pgoff); | ||
1179 | |||
1180 | /* Get an address range which is currently unmapped. | 1193 | /* Get an address range which is currently unmapped. |
1181 | * For shmat() with addr=0. | 1194 | * For shmat() with addr=0. |
1182 | * | 1195 | * |
diff --git a/mm/nommu.c b/mm/nommu.c
index 989e2e9af5c3..8bbbf147a794 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -367,6 +367,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
367 | return find_vma(mm, addr); | 367 | return find_vma(mm, addr); |
368 | } | 368 | } |
369 | 369 | ||
370 | int expand_stack(struct vm_area_struct *vma, unsigned long address) | ||
371 | { | ||
372 | return -ENOMEM; | ||
373 | } | ||
374 | |||
370 | /* | 375 | /* |
371 | * look up the first VMA exactly that exactly matches addr | 376 | * look up the first VMA exactly that exactly matches addr |
372 | * - should be called with mm->mmap_sem at least held readlocked | 377 | * - should be called with mm->mmap_sem at least held readlocked |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eec1481ba44f..ea9da3bed3e9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg) | |||
476 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 476 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
477 | */ | 477 | */ |
478 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 478 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
479 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 479 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) |
480 | { | 480 | { |
481 | proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); | 481 | proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); |
482 | if (dirty_writeback_interval) { | 482 | if (dirty_writeback_interval) |
483 | mod_timer(&wb_timer, | 483 | mod_timer(&wb_timer, jiffies + dirty_writeback_interval); |
484 | jiffies + dirty_writeback_interval); | 484 | else |
485 | } else { | ||
486 | del_timer(&wb_timer); | 485 | del_timer(&wb_timer); |
487 | } | ||
488 | return 0; | 486 | return 0; |
489 | } | 487 | } |
490 | 488 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05ace44852eb..f9e4e647d7e8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,13 @@ static unsigned long __meminitdata dma_reserve; | |||
126 | #endif | 126 | #endif |
127 | #endif | 127 | #endif |
128 | 128 | ||
129 | struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; | 129 | static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; |
130 | int __meminitdata nr_nodemap_entries; | 130 | static int __meminitdata nr_nodemap_entries; |
131 | unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 131 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
132 | unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 132 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
134 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | 134 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; |
135 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | 135 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
138 | 138 | ||
@@ -900,11 +900,13 @@ static struct fail_page_alloc_attr { | |||
900 | 900 | ||
901 | u32 ignore_gfp_highmem; | 901 | u32 ignore_gfp_highmem; |
902 | u32 ignore_gfp_wait; | 902 | u32 ignore_gfp_wait; |
903 | u32 min_order; | ||
903 | 904 | ||
904 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 905 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
905 | 906 | ||
906 | struct dentry *ignore_gfp_highmem_file; | 907 | struct dentry *ignore_gfp_highmem_file; |
907 | struct dentry *ignore_gfp_wait_file; | 908 | struct dentry *ignore_gfp_wait_file; |
909 | struct dentry *min_order_file; | ||
908 | 910 | ||
909 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ | 911 | #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ |
910 | 912 | ||
@@ -912,6 +914,7 @@ static struct fail_page_alloc_attr { | |||
912 | .attr = FAULT_ATTR_INITIALIZER, | 914 | .attr = FAULT_ATTR_INITIALIZER, |
913 | .ignore_gfp_wait = 1, | 915 | .ignore_gfp_wait = 1, |
914 | .ignore_gfp_highmem = 1, | 916 | .ignore_gfp_highmem = 1, |
917 | .min_order = 1, | ||
915 | }; | 918 | }; |
916 | 919 | ||
917 | static int __init setup_fail_page_alloc(char *str) | 920 | static int __init setup_fail_page_alloc(char *str) |
@@ -922,6 +925,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc); | |||
922 | 925 | ||
923 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 926 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
924 | { | 927 | { |
928 | if (order < fail_page_alloc.min_order) | ||
929 | return 0; | ||
925 | if (gfp_mask & __GFP_NOFAIL) | 930 | if (gfp_mask & __GFP_NOFAIL) |
926 | return 0; | 931 | return 0; |
927 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 932 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
@@ -953,12 +958,17 @@ static int __init fail_page_alloc_debugfs(void) | |||
953 | fail_page_alloc.ignore_gfp_highmem_file = | 958 | fail_page_alloc.ignore_gfp_highmem_file = |
954 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, | 959 | debugfs_create_bool("ignore-gfp-highmem", mode, dir, |
955 | &fail_page_alloc.ignore_gfp_highmem); | 960 | &fail_page_alloc.ignore_gfp_highmem); |
961 | fail_page_alloc.min_order_file = | ||
962 | debugfs_create_u32("min-order", mode, dir, | ||
963 | &fail_page_alloc.min_order); | ||
956 | 964 | ||
957 | if (!fail_page_alloc.ignore_gfp_wait_file || | 965 | if (!fail_page_alloc.ignore_gfp_wait_file || |
958 | !fail_page_alloc.ignore_gfp_highmem_file) { | 966 | !fail_page_alloc.ignore_gfp_highmem_file || |
967 | !fail_page_alloc.min_order_file) { | ||
959 | err = -ENOMEM; | 968 | err = -ENOMEM; |
960 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); | 969 | debugfs_remove(fail_page_alloc.ignore_gfp_wait_file); |
961 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); | 970 | debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file); |
971 | debugfs_remove(fail_page_alloc.min_order_file); | ||
962 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); | 972 | cleanup_fault_attr_dentries(&fail_page_alloc.attr); |
963 | } | 973 | } |
964 | 974 | ||
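The knob appears as min-order in the fail_page_alloc debugfs directory alongside the existing ignore-gfp-* files; only allocations of at least that order become candidates for injected failures, and the default of 1 means plain order-0 allocations are never failed, which keeps fault injection usable on a running machine while still exercising higher-order failure paths.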
@@ -1621,8 +1631,8 @@ void show_free_areas(void) | |||
1621 | * | 1631 | * |
1622 | * Add all populated zones of a node to the zonelist. | 1632 | * Add all populated zones of a node to the zonelist. |
1623 | */ | 1633 | */ |
1624 | static int __meminit build_zonelists_node(pg_data_t *pgdat, | 1634 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, |
1625 | struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) | 1635 | int nr_zones, enum zone_type zone_type) |
1626 | { | 1636 | { |
1627 | struct zone *zone; | 1637 | struct zone *zone; |
1628 | 1638 | ||
@@ -1641,9 +1651,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat, | |||
1641 | return nr_zones; | 1651 | return nr_zones; |
1642 | } | 1652 | } |
1643 | 1653 | ||
1654 | |||
1655 | /* | ||
1656 | * zonelist_order: | ||
1657 | * 0 = automatic detection of better ordering. | ||
1658 | * 1 = order by ([node] distance, -zonetype) | ||
1659 | * 2 = order by (-zonetype, [node] distance) | ||
1660 | * | ||
1661 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create | ||
1662 | * the same zonelist. So only NUMA can configure this param. | ||
1663 | */ | ||
1664 | #define ZONELIST_ORDER_DEFAULT 0 | ||
1665 | #define ZONELIST_ORDER_NODE 1 | ||
1666 | #define ZONELIST_ORDER_ZONE 2 | ||
1667 | |||
1668 | /* zonelist order in the kernel. | ||
1669 | * set_zonelist_order() will set this to NODE or ZONE. | ||
1670 | */ | ||
1671 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
1672 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; | ||
1673 | |||
1674 | |||
1644 | #ifdef CONFIG_NUMA | 1675 | #ifdef CONFIG_NUMA |
1676 | /* The value user specified ....changed by config */ | ||
1677 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
1678 | /* string for sysctl */ | ||
1679 | #define NUMA_ZONELIST_ORDER_LEN 16 | ||
1680 | char numa_zonelist_order[16] = "default"; | ||
1681 | |||
1682 | /* | ||
1683 | * interface for configure zonelist ordering. | ||
1684 | * command line option "numa_zonelist_order" | ||
1685 | * = "[dD]efault - default, automatic configuration. | ||
1686 | * = "[nN]ode - order by node locality, then by zone within node | ||
1687 | * = "[zZ]one - order by zone, then by locality within zone | ||
1688 | */ | ||
1689 | |||
1690 | static int __parse_numa_zonelist_order(char *s) | ||
1691 | { | ||
1692 | if (*s == 'd' || *s == 'D') { | ||
1693 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
1694 | } else if (*s == 'n' || *s == 'N') { | ||
1695 | user_zonelist_order = ZONELIST_ORDER_NODE; | ||
1696 | } else if (*s == 'z' || *s == 'Z') { | ||
1697 | user_zonelist_order = ZONELIST_ORDER_ZONE; | ||
1698 | } else { | ||
1699 | printk(KERN_WARNING | ||
1700 | "Ignoring invalid numa_zonelist_order value: " | ||
1701 | "%s\n", s); | ||
1702 | return -EINVAL; | ||
1703 | } | ||
1704 | return 0; | ||
1705 | } | ||
1706 | |||
1707 | static __init int setup_numa_zonelist_order(char *s) | ||
1708 | { | ||
1709 | if (s) | ||
1710 | return __parse_numa_zonelist_order(s); | ||
1711 | return 0; | ||
1712 | } | ||
1713 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | ||
1714 | |||
1715 | /* | ||
1716 | * sysctl handler for numa_zonelist_order | ||
1717 | */ | ||
1718 | int numa_zonelist_order_handler(ctl_table *table, int write, | ||
1719 | struct file *file, void __user *buffer, size_t *length, | ||
1720 | loff_t *ppos) | ||
1721 | { | ||
1722 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | ||
1723 | int ret; | ||
1724 | |||
1725 | if (write) | ||
1726 | strncpy(saved_string, (char*)table->data, | ||
1727 | NUMA_ZONELIST_ORDER_LEN); | ||
1728 | ret = proc_dostring(table, write, file, buffer, length, ppos); | ||
1729 | if (ret) | ||
1730 | return ret; | ||
1731 | if (write) { | ||
1732 | int oldval = user_zonelist_order; | ||
1733 | if (__parse_numa_zonelist_order((char*)table->data)) { | ||
1734 | /* | ||
1735 | * bogus value. restore saved string | ||
1736 | */ | ||
1737 | strncpy((char*)table->data, saved_string, | ||
1738 | NUMA_ZONELIST_ORDER_LEN); | ||
1739 | user_zonelist_order = oldval; | ||
1740 | } else if (oldval != user_zonelist_order) | ||
1741 | build_all_zonelists(); | ||
1742 | } | ||
1743 | return 0; | ||
1744 | } | ||
1745 | |||
1746 | |||
1645 | #define MAX_NODE_LOAD (num_online_nodes()) | 1747 | #define MAX_NODE_LOAD (num_online_nodes()) |
1646 | static int __meminitdata node_load[MAX_NUMNODES]; | 1748 | static int node_load[MAX_NUMNODES]; |
1749 | |||
1647 | /** | 1750 | /** |
1648 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 1751 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1649 | * @node: node whose fallback list we're appending | 1752 | * @node: node whose fallback list we're appending |
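In practice the ordering is chosen at boot with the numa_zonelist_order= parameter registered by the early_param() above, or later through the matching /proc/sys/vm/numa_zonelist_order sysctl; the handler only rebuilds the zonelists when the parsed value actually changes, and restores the previous string (and mode) when the written value is not one of default, node or zone.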
@@ -1658,7 +1761,7 @@ static int __meminitdata node_load[MAX_NUMNODES]; | |||
1658 | * on them otherwise. | 1761 | * on them otherwise. |
1659 | * It returns -1 if no node is found. | 1762 | * It returns -1 if no node is found. |
1660 | */ | 1763 | */ |
1661 | static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) | 1764 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
1662 | { | 1765 | { |
1663 | int n, val; | 1766 | int n, val; |
1664 | int min_val = INT_MAX; | 1767 | int min_val = INT_MAX; |
@@ -1704,13 +1807,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1704 | return best_node; | 1807 | return best_node; |
1705 | } | 1808 | } |
1706 | 1809 | ||
1707 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1810 | |
1811 | /* | ||
1812 | * Build zonelists ordered by node and zones within node. | ||
1813 | * This results in maximum locality--normal zone overflows into local | ||
1814 | * DMA zone, if any--but risks exhausting DMA zone. | ||
1815 | */ | ||
1816 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | ||
1708 | { | 1817 | { |
1709 | int j, node, local_node; | ||
1710 | enum zone_type i; | 1818 | enum zone_type i; |
1711 | int prev_node, load; | 1819 | int j; |
1712 | struct zonelist *zonelist; | 1820 | struct zonelist *zonelist; |
1821 | |||
1822 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1823 | zonelist = pgdat->node_zonelists + i; | ||
1824 | for (j = 0; zonelist->zones[j] != NULL; j++) | ||
1825 | ; | ||
1826 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | ||
1827 | zonelist->zones[j] = NULL; | ||
1828 | } | ||
1829 | } | ||
1830 | |||
1831 | /* | ||
1832 | * Build zonelists ordered by zone and nodes within zones. | ||
1833 | * This results in conserving DMA zone[s] until all Normal memory is | ||
1834 | * exhausted, but results in overflowing to remote node while memory | ||
1835 | * may still exist in local DMA zone. | ||
1836 | */ | ||
1837 | static int node_order[MAX_NUMNODES]; | ||
1838 | |||
1839 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | ||
1840 | { | ||
1841 | enum zone_type i; | ||
1842 | int pos, j, node; | ||
1843 | int zone_type; /* needs to be signed */ | ||
1844 | struct zone *z; | ||
1845 | struct zonelist *zonelist; | ||
1846 | |||
1847 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1848 | zonelist = pgdat->node_zonelists + i; | ||
1849 | pos = 0; | ||
1850 | for (zone_type = i; zone_type >= 0; zone_type--) { | ||
1851 | for (j = 0; j < nr_nodes; j++) { | ||
1852 | node = node_order[j]; | ||
1853 | z = &NODE_DATA(node)->node_zones[zone_type]; | ||
1854 | if (populated_zone(z)) { | ||
1855 | zonelist->zones[pos++] = z; | ||
1856 | check_highest_zone(zone_type); | ||
1857 | } | ||
1858 | } | ||
1859 | } | ||
1860 | zonelist->zones[pos] = NULL; | ||
1861 | } | ||
1862 | } | ||
1863 | |||
1864 | static int default_zonelist_order(void) | ||
1865 | { | ||
1866 | int nid, zone_type; | ||
1867 | unsigned long low_kmem_size,total_size; | ||
1868 | struct zone *z; | ||
1869 | int average_size; | ||
1870 | /* | ||
1871 | * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. | ||
1872 | * If they are really small and used heavily, the system can fall | ||
1873 | * into OOM very easily. | ||
1874 | * This function detect ZONE_DMA/DMA32 size and confgigures zone order. | ||
1875 | */ | ||
1876 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | ||
1877 | low_kmem_size = 0; | ||
1878 | total_size = 0; | ||
1879 | for_each_online_node(nid) { | ||
1880 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
1881 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
1882 | if (populated_zone(z)) { | ||
1883 | if (zone_type < ZONE_NORMAL) | ||
1884 | low_kmem_size += z->present_pages; | ||
1885 | total_size += z->present_pages; | ||
1886 | } | ||
1887 | } | ||
1888 | } | ||
1889 | if (!low_kmem_size || /* there are no DMA area. */ | ||
1890 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ | ||
1891 | return ZONELIST_ORDER_NODE; | ||
1892 | /* | ||
1893 | * look into each node's config. | ||
1894 | * If there is a node whose DMA/DMA32 memory is very big area on | ||
1895 | * local memory, NODE_ORDER may be suitable. | ||
1896 | */ | ||
1897 | average_size = total_size / (num_online_nodes() + 1); | ||
1898 | for_each_online_node(nid) { | ||
1899 | low_kmem_size = 0; | ||
1900 | total_size = 0; | ||
1901 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
1902 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
1903 | if (populated_zone(z)) { | ||
1904 | if (zone_type < ZONE_NORMAL) | ||
1905 | low_kmem_size += z->present_pages; | ||
1906 | total_size += z->present_pages; | ||
1907 | } | ||
1908 | } | ||
1909 | if (low_kmem_size && | ||
1910 | total_size > average_size && /* ignore small node */ | ||
1911 | low_kmem_size > total_size * 70/100) | ||
1912 | return ZONELIST_ORDER_NODE; | ||
1913 | } | ||
1914 | return ZONELIST_ORDER_ZONE; | ||
1915 | } | ||
1916 | |||
1917 | static void set_zonelist_order(void) | ||
1918 | { | ||
1919 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) | ||
1920 | current_zonelist_order = default_zonelist_order(); | ||
1921 | else | ||
1922 | current_zonelist_order = user_zonelist_order; | ||
1923 | } | ||
1924 | |||
1925 | static void build_zonelists(pg_data_t *pgdat) | ||
1926 | { | ||
1927 | int j, node, load; | ||
1928 | enum zone_type i; | ||
1713 | nodemask_t used_mask; | 1929 | nodemask_t used_mask; |
1930 | int local_node, prev_node; | ||
1931 | struct zonelist *zonelist; | ||
1932 | int order = current_zonelist_order; | ||
1714 | 1933 | ||
1715 | /* initialize zonelists */ | 1934 | /* initialize zonelists */ |
1716 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1935 | for (i = 0; i < MAX_NR_ZONES; i++) { |
@@ -1723,6 +1942,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1723 | load = num_online_nodes(); | 1942 | load = num_online_nodes(); |
1724 | prev_node = local_node; | 1943 | prev_node = local_node; |
1725 | nodes_clear(used_mask); | 1944 | nodes_clear(used_mask); |
1945 | |||
1946 | memset(node_load, 0, sizeof(node_load)); | ||
1947 | memset(node_order, 0, sizeof(node_order)); | ||
1948 | j = 0; | ||
1949 | |||
1726 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 1950 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
1727 | int distance = node_distance(local_node, node); | 1951 | int distance = node_distance(local_node, node); |
1728 | 1952 | ||
@@ -1738,23 +1962,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1738 | * So adding penalty to the first node in same | 1962 | * So adding penalty to the first node in same |
1739 | * distance group to make it round-robin. | 1963 | * distance group to make it round-robin. |
1740 | */ | 1964 | */ |
1741 | |||
1742 | if (distance != node_distance(local_node, prev_node)) | 1965 | if (distance != node_distance(local_node, prev_node)) |
1743 | node_load[node] += load; | 1966 | node_load[node] = load; |
1967 | |||
1744 | prev_node = node; | 1968 | prev_node = node; |
1745 | load--; | 1969 | load--; |
1746 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1970 | if (order == ZONELIST_ORDER_NODE) |
1747 | zonelist = pgdat->node_zonelists + i; | 1971 | build_zonelists_in_node_order(pgdat, node); |
1748 | for (j = 0; zonelist->zones[j] != NULL; j++); | 1972 | else |
1973 | node_order[j++] = node; /* remember order */ | ||
1974 | } | ||
1749 | 1975 | ||
1750 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 1976 | if (order == ZONELIST_ORDER_ZONE) { |
1751 | zonelist->zones[j] = NULL; | 1977 | /* calculate node order -- i.e., DMA last! */ |
1752 | } | 1978 | build_zonelists_in_zone_order(pgdat, j); |
1753 | } | 1979 | } |
1754 | } | 1980 | } |
1755 | 1981 | ||
1756 | /* Construct the zonelist performance cache - see further mmzone.h */ | 1982 | /* Construct the zonelist performance cache - see further mmzone.h */ |
1757 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | 1983 | static void build_zonelist_cache(pg_data_t *pgdat) |
1758 | { | 1984 | { |
1759 | int i; | 1985 | int i; |
1760 | 1986 | ||
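To make the two builders above concrete, here is the fallback order a node-0 GFP_KERNEL zonelist might get on a hypothetical two-node box with DMA and Normal zones on each node (illustrative only):

/*
 * Node order (build_zonelists_in_node_order):
 *	node0/Normal, node0/DMA, node1/Normal, node1/DMA
 *	- best locality, but node0's DMA zone is consumed before node1 is tried
 *
 * Zone order (build_zonelists_in_zone_order):
 *	node0/Normal, node1/Normal, node0/DMA, node1/DMA
 *	- DMA memory is preserved until Normal memory everywhere is exhausted
 */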
@@ -1771,9 +1997,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) | |||
1771 | } | 1997 | } |
1772 | } | 1998 | } |
1773 | 1999 | ||
2000 | |||
1774 | #else /* CONFIG_NUMA */ | 2001 | #else /* CONFIG_NUMA */ |
1775 | 2002 | ||
1776 | static void __meminit build_zonelists(pg_data_t *pgdat) | 2003 | static void set_zonelist_order(void) |
2004 | { | ||
2005 | current_zonelist_order = ZONELIST_ORDER_ZONE; | ||
2006 | } | ||
2007 | |||
2008 | static void build_zonelists(pg_data_t *pgdat) | ||
1777 | { | 2009 | { |
1778 | int node, local_node; | 2010 | int node, local_node; |
1779 | enum zone_type i,j; | 2011 | enum zone_type i,j; |
@@ -1809,7 +2041,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1809 | } | 2041 | } |
1810 | 2042 | ||
1811 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 2043 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
1812 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | 2044 | static void build_zonelist_cache(pg_data_t *pgdat) |
1813 | { | 2045 | { |
1814 | int i; | 2046 | int i; |
1815 | 2047 | ||
@@ -1820,7 +2052,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) | |||
1820 | #endif /* CONFIG_NUMA */ | 2052 | #endif /* CONFIG_NUMA */ |
1821 | 2053 | ||
1822 | /* return values int ....just for stop_machine_run() */ | 2054 | /* return values int ....just for stop_machine_run() */ |
1823 | static int __meminit __build_all_zonelists(void *dummy) | 2055 | static int __build_all_zonelists(void *dummy) |
1824 | { | 2056 | { |
1825 | int nid; | 2057 | int nid; |
1826 | 2058 | ||
@@ -1831,8 +2063,10 @@ static int __meminit __build_all_zonelists(void *dummy) | |||
1831 | return 0; | 2063 | return 0; |
1832 | } | 2064 | } |
1833 | 2065 | ||
1834 | void __meminit build_all_zonelists(void) | 2066 | void build_all_zonelists(void) |
1835 | { | 2067 | { |
2068 | set_zonelist_order(); | ||
2069 | |||
1836 | if (system_state == SYSTEM_BOOTING) { | 2070 | if (system_state == SYSTEM_BOOTING) { |
1837 | __build_all_zonelists(NULL); | 2071 | __build_all_zonelists(NULL); |
1838 | cpuset_init_current_mems_allowed(); | 2072 | cpuset_init_current_mems_allowed(); |
@@ -1843,8 +2077,13 @@ void __meminit build_all_zonelists(void) | |||
1843 | /* cpuset refresh routine should be here */ | 2077 | /* cpuset refresh routine should be here */ |
1844 | } | 2078 | } |
1845 | vm_total_pages = nr_free_pagecache_pages(); | 2079 | vm_total_pages = nr_free_pagecache_pages(); |
1846 | printk("Built %i zonelists. Total pages: %ld\n", | 2080 | printk("Built %i zonelists in %s order. Total pages: %ld\n", |
1847 | num_online_nodes(), vm_total_pages); | 2081 | num_online_nodes(), |
2082 | zonelist_order_name[current_zonelist_order], | ||
2083 | vm_total_pages); | ||
2084 | #ifdef CONFIG_NUMA | ||
2085 | printk("Policy zone: %s\n", zone_names[policy_zone]); | ||
2086 | #endif | ||
1848 | } | 2087 | } |
1849 | 2088 | ||
1850 | /* | 2089 | /* |
@@ -1953,8 +2192,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1953 | } | 2192 | } |
1954 | } | 2193 | } |
1955 | 2194 | ||
1956 | void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | 2195 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, |
1957 | unsigned long size) | 2196 | struct zone *zone, unsigned long size) |
1958 | { | 2197 | { |
1959 | int order; | 2198 | int order; |
1960 | for (order = 0; order < MAX_ORDER ; order++) { | 2199 | for (order = 0; order < MAX_ORDER ; order++) { |
@@ -2370,7 +2609,7 @@ void __init push_node_boundaries(unsigned int nid, | |||
2370 | } | 2609 | } |
2371 | 2610 | ||
2372 | /* If necessary, push the node boundary out for reserve hotadd */ | 2611 | /* If necessary, push the node boundary out for reserve hotadd */ |
2373 | static void __init account_node_boundary(unsigned int nid, | 2612 | static void __meminit account_node_boundary(unsigned int nid, |
2374 | unsigned long *start_pfn, unsigned long *end_pfn) | 2613 | unsigned long *start_pfn, unsigned long *end_pfn) |
2375 | { | 2614 | { |
2376 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | 2615 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", |
@@ -2390,7 +2629,7 @@ static void __init account_node_boundary(unsigned int nid, | |||
2390 | void __init push_node_boundaries(unsigned int nid, | 2629 | void __init push_node_boundaries(unsigned int nid, |
2391 | unsigned long start_pfn, unsigned long end_pfn) {} | 2630 | unsigned long start_pfn, unsigned long end_pfn) {} |
2392 | 2631 | ||
2393 | static void __init account_node_boundary(unsigned int nid, | 2632 | static void __meminit account_node_boundary(unsigned int nid, |
2394 | unsigned long *start_pfn, unsigned long *end_pfn) {} | 2633 | unsigned long *start_pfn, unsigned long *end_pfn) {} |
2395 | #endif | 2634 | #endif |
2396 | 2635 | ||
@@ -2431,7 +2670,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
2431 | * Return the number of pages a zone spans in a node, including holes | 2670 | * Return the number of pages a zone spans in a node, including holes |
2432 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 2671 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
2433 | */ | 2672 | */ |
2434 | unsigned long __meminit zone_spanned_pages_in_node(int nid, | 2673 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
2435 | unsigned long zone_type, | 2674 | unsigned long zone_type, |
2436 | unsigned long *ignored) | 2675 | unsigned long *ignored) |
2437 | { | 2676 | { |
@@ -2519,7 +2758,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, | |||
2519 | } | 2758 | } |
2520 | 2759 | ||
2521 | /* Return the number of page frames in holes in a zone on a node */ | 2760 | /* Return the number of page frames in holes in a zone on a node */ |
2522 | unsigned long __meminit zone_absent_pages_in_node(int nid, | 2761 | static unsigned long __meminit zone_absent_pages_in_node(int nid, |
2523 | unsigned long zone_type, | 2762 | unsigned long zone_type, |
2524 | unsigned long *ignored) | 2763 | unsigned long *ignored) |
2525 | { | 2764 | { |
@@ -2536,14 +2775,14 @@ unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
2536 | } | 2775 | } |
2537 | 2776 | ||
2538 | #else | 2777 | #else |
2539 | static inline unsigned long zone_spanned_pages_in_node(int nid, | 2778 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
2540 | unsigned long zone_type, | 2779 | unsigned long zone_type, |
2541 | unsigned long *zones_size) | 2780 | unsigned long *zones_size) |
2542 | { | 2781 | { |
2543 | return zones_size[zone_type]; | 2782 | return zones_size[zone_type]; |
2544 | } | 2783 | } |
2545 | 2784 | ||
2546 | static inline unsigned long zone_absent_pages_in_node(int nid, | 2785 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
2547 | unsigned long zone_type, | 2786 | unsigned long zone_type, |
2548 | unsigned long *zholes_size) | 2787 | unsigned long *zholes_size) |
2549 | { | 2788 | { |
@@ -3355,13 +3594,28 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3355 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) | 3594 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) |
3356 | ; | 3595 | ; |
3357 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | 3596 | table = (void*) __get_free_pages(GFP_ATOMIC, order); |
3597 | /* | ||
3598 | * If bucketsize is not a power-of-two, we may free | ||
3599 | * some pages at the end of hash table. | ||
3600 | */ | ||
3601 | if (table) { | ||
3602 | unsigned long alloc_end = (unsigned long)table + | ||
3603 | (PAGE_SIZE << order); | ||
3604 | unsigned long used = (unsigned long)table + | ||
3605 | PAGE_ALIGN(size); | ||
3606 | split_page(virt_to_page(table), order); | ||
3607 | while (used < alloc_end) { | ||
3608 | free_page(used); | ||
3609 | used += PAGE_SIZE; | ||
3610 | } | ||
3611 | } | ||
3358 | } | 3612 | } |
3359 | } while (!table && size > PAGE_SIZE && --log2qty); | 3613 | } while (!table && size > PAGE_SIZE && --log2qty); |
3360 | 3614 | ||
3361 | if (!table) | 3615 | if (!table) |
3362 | panic("Failed to allocate %s hash table\n", tablename); | 3616 | panic("Failed to allocate %s hash table\n", tablename); |
3363 | 3617 | ||
3364 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3618 | printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", |
3365 | tablename, | 3619 | tablename, |
3366 | (1U << log2qty), | 3620 | (1U << log2qty), |
3367 | ilog2(size) - PAGE_SHIFT, | 3621 | ilog2(size) - PAGE_SHIFT, |
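The new branch in alloc_large_system_hash() exists because __get_free_pages() always hands back 2^order pages, so a table whose byte size is not a power of two would strand the tail of the allocation; split_page() turns the order-N block into independently freeable order-0 pages so the unused tail can be returned. The same trick as a stand-alone sketch (alloc_exact_sketch() is an invented name; later kernels grew alloc_pages_exact() for this):

#include <linux/gfp.h>
#include <linux/mm.h>

/* Allocate 'size' bytes (size > 0) of page-backed memory, freeing the unused tail. */
static void *alloc_exact_sketch(unsigned long size)
{
	unsigned int order = get_order(size);
	unsigned long addr = __get_free_pages(GFP_KERNEL, order);

	if (addr) {
		unsigned long used = addr + PAGE_ALIGN(size);
		unsigned long end  = addr + (PAGE_SIZE << order);

		/* make each page of the block individually freeable */
		split_page(virt_to_page((void *)addr), order);
		while (used < end) {
			free_page(used);
			used += PAGE_SIZE;
		}
	}
	return (void *)addr;
}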
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -929,7 +929,7 @@ static void next_reap_node(void)
929 | * the CPUs getting into lockstep and contending for the global cache chain | 929 | * the CPUs getting into lockstep and contending for the global cache chain |
930 | * lock. | 930 | * lock. |
931 | */ | 931 | */ |
932 | static void __devinit start_cpu_timer(int cpu) | 932 | static void __cpuinit start_cpu_timer(int cpu) |
933 | { | 933 | { |
934 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | 934 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); |
935 | 935 | ||
@@ -4157,26 +4157,17 @@ static void print_slabinfo_header(struct seq_file *m) | |||
4157 | static void *s_start(struct seq_file *m, loff_t *pos) | 4157 | static void *s_start(struct seq_file *m, loff_t *pos) |
4158 | { | 4158 | { |
4159 | loff_t n = *pos; | 4159 | loff_t n = *pos; |
4160 | struct list_head *p; | ||
4161 | 4160 | ||
4162 | mutex_lock(&cache_chain_mutex); | 4161 | mutex_lock(&cache_chain_mutex); |
4163 | if (!n) | 4162 | if (!n) |
4164 | print_slabinfo_header(m); | 4163 | print_slabinfo_header(m); |
4165 | p = cache_chain.next; | 4164 | |
4166 | while (n--) { | 4165 | return seq_list_start(&cache_chain, *pos); |
4167 | p = p->next; | ||
4168 | if (p == &cache_chain) | ||
4169 | return NULL; | ||
4170 | } | ||
4171 | return list_entry(p, struct kmem_cache, next); | ||
4172 | } | 4166 | } |
4173 | 4167 | ||
4174 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | 4168 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) |
4175 | { | 4169 | { |
4176 | struct kmem_cache *cachep = p; | 4170 | return seq_list_next(p, &cache_chain, pos); |
4177 | ++*pos; | ||
4178 | return cachep->next.next == &cache_chain ? | ||
4179 | NULL : list_entry(cachep->next.next, struct kmem_cache, next); | ||
4180 | } | 4171 | } |
4181 | 4172 | ||
4182 | static void s_stop(struct seq_file *m, void *p) | 4173 | static void s_stop(struct seq_file *m, void *p) |
@@ -4186,7 +4177,7 @@ static void s_stop(struct seq_file *m, void *p) | |||
4186 | 4177 | ||
4187 | static int s_show(struct seq_file *m, void *p) | 4178 | static int s_show(struct seq_file *m, void *p) |
4188 | { | 4179 | { |
4189 | struct kmem_cache *cachep = p; | 4180 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); |
4190 | struct slab *slabp; | 4181 | struct slab *slabp; |
4191 | unsigned long active_objs; | 4182 | unsigned long active_objs; |
4192 | unsigned long num_objs; | 4183 | unsigned long num_objs; |
@@ -4355,17 +4346,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
4355 | 4346 | ||
4356 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4347 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
4357 | { | 4348 | { |
4358 | loff_t n = *pos; | ||
4359 | struct list_head *p; | ||
4360 | |||
4361 | mutex_lock(&cache_chain_mutex); | 4349 | mutex_lock(&cache_chain_mutex); |
4362 | p = cache_chain.next; | 4350 | return seq_list_start(&cache_chain, *pos); |
4363 | while (n--) { | ||
4364 | p = p->next; | ||
4365 | if (p == &cache_chain) | ||
4366 | return NULL; | ||
4367 | } | ||
4368 | return list_entry(p, struct kmem_cache, next); | ||
4369 | } | 4351 | } |
4370 | 4352 | ||
4371 | static inline int add_caller(unsigned long *n, unsigned long v) | 4353 | static inline int add_caller(unsigned long *n, unsigned long v) |
@@ -4430,7 +4412,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4430 | 4412 | ||
4431 | static int leaks_show(struct seq_file *m, void *p) | 4413 | static int leaks_show(struct seq_file *m, void *p) |
4432 | { | 4414 | { |
4433 | struct kmem_cache *cachep = p; | 4415 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); |
4434 | struct slab *slabp; | 4416 | struct slab *slabp; |
4435 | struct kmem_list3 *l3; | 4417 | struct kmem_list3 *l3; |
4436 | const char *name; | 4418 | const char *name; |
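The slabinfo iterators are switched to the generic seq_file list helpers: seq_list_start() positions the cursor inside a list_head-linked list for a given *pos and seq_list_next() advances it, so the ->show() callbacks now receive a struct list_head pointer and recover the containing cache with list_entry(). A minimal, hypothetical iterator built on the same helpers (my_item and the my_seq_* names are made up):

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>

struct my_item {
	struct list_head node;
	const char *name;
};

static LIST_HEAD(my_items);
static DEFINE_MUTEX(my_items_lock);

static void *my_seq_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&my_items_lock);
	return seq_list_start(&my_items, *pos);	/* NULL once *pos is past the end */
}

static void *my_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &my_items, pos);
}

static void my_seq_stop(struct seq_file *m, void *v)
{
	mutex_unlock(&my_items_lock);
}

static int my_seq_show(struct seq_file *m, void *v)
{
	struct my_item *item = list_entry(v, struct my_item, node);

	seq_printf(m, "%s\n", item->name);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start	= my_seq_start,
	.next	= my_seq_next,
	.stop	= my_seq_stop,
	.show	= my_seq_show,
};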
diff --git a/mm/slob.c b/mm/slob.c
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -3,57 +3,159 @@
3 | * | 3 | * |
4 | * Matt Mackall <mpm@selenic.com> 12/30/03 | 4 | * Matt Mackall <mpm@selenic.com> 12/30/03 |
5 | * | 5 | * |
6 | * NUMA support by Paul Mundt, 2007. | ||
7 | * | ||
6 | * How SLOB works: | 8 | * How SLOB works: |
7 | * | 9 | * |
8 | * The core of SLOB is a traditional K&R style heap allocator, with | 10 | * The core of SLOB is a traditional K&R style heap allocator, with |
9 | * support for returning aligned objects. The granularity of this | 11 | * support for returning aligned objects. The granularity of this |
10 | * allocator is 8 bytes on x86, though it's perhaps possible to reduce | 12 | * allocator is as little as 2 bytes, however typically most architectures |
11 | * this to 4 if it's deemed worth the effort. The slob heap is a | 13 | * will require 4 bytes on 32-bit and 8 bytes on 64-bit. |
12 | * singly-linked list of pages from __get_free_page, grown on demand | 14 | * |
13 | * and allocation from the heap is currently first-fit. | 15 | * The slob heap is a linked list of pages from alloc_pages(), and |
16 | * within each page, there is a singly-linked list of free blocks (slob_t). | ||
17 | * The heap is grown on demand and allocation from the heap is currently | ||
18 | * first-fit. | ||
14 | * | 19 | * |
15 | * Above this is an implementation of kmalloc/kfree. Blocks returned | 20 | * Above this is an implementation of kmalloc/kfree. Blocks returned |
16 | * from kmalloc are 8-byte aligned and prepended with a 8-byte header. | 21 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. |
17 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls | 22 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls |
18 | * __get_free_pages directly so that it can return page-aligned blocks | 23 | * alloc_pages() directly, allocating compound pages so the page order |
19 | * and keeps a linked list of such pages and their orders. These | 24 | * does not have to be separately tracked, and also stores the exact |
20 | * objects are detected in kfree() by their page alignment. | 25 | * allocation size in page->private so that it can be used to accurately |
26 | * provide ksize(). These objects are detected in kfree() because slob_page() | ||
27 | * is false for them. | ||
21 | * | 28 | * |
22 | * SLAB is emulated on top of SLOB by simply calling constructors and | 29 | * SLAB is emulated on top of SLOB by simply calling constructors and |
23 | * destructors for every SLAB allocation. Objects are returned with | 30 | * destructors for every SLAB allocation. Objects are returned with the |
24 | * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is | 31 | * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which |
25 | * set, in which case the low-level allocator will fragment blocks to | 32 | * case the low-level allocator will fragment blocks to create the proper |
26 | * create the proper alignment. Again, objects of page-size or greater | 33 | * alignment. Again, objects of page-size or greater are allocated by |
27 | * are allocated by calling __get_free_pages. As SLAB objects know | 34 | * calling alloc_pages(). As SLAB objects know their size, no separate |
28 | * their size, no separate size bookkeeping is necessary and there is | 35 | * size bookkeeping is necessary and there is essentially no allocation |
29 | * essentially no allocation space overhead. | 36 | * space overhead, and compound pages aren't needed for multi-page |
37 | * allocations. | ||
38 | * | ||
39 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | ||
40 | * logic down to the page allocator, and simply doing the node accounting | ||
41 | * on the upper levels. In the event that a node id is explicitly | ||
42 | * provided, alloc_pages_node() with the specified node id is used | ||
43 | * instead. The common case (or when the node id isn't explicitly provided) | ||
44 | * will default to the current node, as per numa_node_id(). | ||
45 | * | ||
46 | * Node aware pages are still inserted in to the global freelist, and | ||
47 | * these are scanned for by matching against the node id encoded in the | ||
48 | * page flags. As a result, block allocations that can be satisfied from | ||
49 | * the freelist will only be done so on pages residing on the same node, | ||
50 | * in order to prevent random node placement. | ||
30 | */ | 51 | */ |
31 | 52 | ||
53 | #include <linux/kernel.h> | ||
32 | #include <linux/slab.h> | 54 | #include <linux/slab.h> |
33 | #include <linux/mm.h> | 55 | #include <linux/mm.h> |
34 | #include <linux/cache.h> | 56 | #include <linux/cache.h> |
35 | #include <linux/init.h> | 57 | #include <linux/init.h> |
36 | #include <linux/module.h> | 58 | #include <linux/module.h> |
37 | #include <linux/timer.h> | ||
38 | #include <linux/rcupdate.h> | 59 | #include <linux/rcupdate.h> |
60 | #include <linux/list.h> | ||
61 | #include <asm/atomic.h> | ||
62 | |||
63 | /* | ||
64 | * slob_block has a field 'units', which indicates size of block if +ve, | ||
65 | * or offset of next block if -ve (in SLOB_UNITs). | ||
66 | * | ||
67 | * Free blocks of size 1 unit simply contain the offset of the next block. | ||
68 | * Those with larger size contain their size in the first SLOB_UNIT of | ||
69 | * memory, and the offset of the next free block in the second SLOB_UNIT. | ||
70 | */ | ||
71 | #if PAGE_SIZE <= (32767 * 2) | ||
72 | typedef s16 slobidx_t; | ||
73 | #else | ||
74 | typedef s32 slobidx_t; | ||
75 | #endif | ||
39 | 76 | ||
40 | struct slob_block { | 77 | struct slob_block { |
41 | int units; | 78 | slobidx_t units; |
42 | struct slob_block *next; | ||
43 | }; | 79 | }; |
44 | typedef struct slob_block slob_t; | 80 | typedef struct slob_block slob_t; |
45 | 81 | ||
82 | /* | ||
83 | * We use struct page fields to manage some slob allocation aspects, | ||
84 | * however to avoid the horrible mess in include/linux/mm_types.h, we'll | ||
85 | * just define our own struct page type variant here. | ||
86 | */ | ||
87 | struct slob_page { | ||
88 | union { | ||
89 | struct { | ||
90 | unsigned long flags; /* mandatory */ | ||
91 | atomic_t _count; /* mandatory */ | ||
92 | slobidx_t units; /* free units left in page */ | ||
93 | unsigned long pad[2]; | ||
94 | slob_t *free; /* first free slob_t in page */ | ||
95 | struct list_head list; /* linked list of free pages */ | ||
96 | }; | ||
97 | struct page page; | ||
98 | }; | ||
99 | }; | ||
100 | static inline void struct_slob_page_wrong_size(void) | ||
101 | { BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } | ||
102 | |||
103 | /* | ||
104 | * free_slob_page: call before a slob_page is returned to the page allocator. | ||
105 | */ | ||
106 | static inline void free_slob_page(struct slob_page *sp) | ||
107 | { | ||
108 | reset_page_mapcount(&sp->page); | ||
109 | sp->page.mapping = NULL; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * All (partially) free slob pages go on this list. | ||
114 | */ | ||
115 | static LIST_HEAD(free_slob_pages); | ||
116 | |||
117 | /* | ||
118 | * slob_page: True for all slob pages (false for bigblock pages) | ||
119 | */ | ||
120 | static inline int slob_page(struct slob_page *sp) | ||
121 | { | ||
122 | return test_bit(PG_active, &sp->flags); | ||
123 | } | ||
124 | |||
125 | static inline void set_slob_page(struct slob_page *sp) | ||
126 | { | ||
127 | __set_bit(PG_active, &sp->flags); | ||
128 | } | ||
129 | |||
130 | static inline void clear_slob_page(struct slob_page *sp) | ||
131 | { | ||
132 | __clear_bit(PG_active, &sp->flags); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * slob_page_free: true for pages on free_slob_pages list. | ||
137 | */ | ||
138 | static inline int slob_page_free(struct slob_page *sp) | ||
139 | { | ||
140 | return test_bit(PG_private, &sp->flags); | ||
141 | } | ||
142 | |||
143 | static inline void set_slob_page_free(struct slob_page *sp) | ||
144 | { | ||
145 | list_add(&sp->list, &free_slob_pages); | ||
146 | __set_bit(PG_private, &sp->flags); | ||
147 | } | ||
148 | |||
149 | static inline void clear_slob_page_free(struct slob_page *sp) | ||
150 | { | ||
151 | list_del(&sp->list); | ||
152 | __clear_bit(PG_private, &sp->flags); | ||
153 | } | ||
154 | |||
46 | #define SLOB_UNIT sizeof(slob_t) | 155 | #define SLOB_UNIT sizeof(slob_t) |
47 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) | 156 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) |
48 | #define SLOB_ALIGN L1_CACHE_BYTES | 157 | #define SLOB_ALIGN L1_CACHE_BYTES |
49 | 158 | ||
50 | struct bigblock { | ||
51 | int order; | ||
52 | void *pages; | ||
53 | struct bigblock *next; | ||
54 | }; | ||
55 | typedef struct bigblock bigblock_t; | ||
56 | |||
57 | /* | 159 | /* |
58 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | 160 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which |
59 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free | 161 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free |
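The free-block encoding added in this hunk (slob_block.units holds the block size when positive, or the negated offset of the next free block when the block is a single unit) is the crux of the new slob_t. The stand-alone user-space model below is only a sketch of that encoding: the slob_t/slobidx_t names mirror the patch, but the fixed page[] array and the set_block()/block_units()/block_next() helpers are invented for illustration and are not kernel code.

#include <stdio.h>
#include <stdint.h>

#define UNITS_PER_PAGE 64               /* toy "page" of 64 slob units */

typedef int16_t slobidx_t;
typedef struct { slobidx_t units; } slob_t;

static slob_t page[UNITS_PER_PAGE];     /* simulated page base */

/* Encode size and next-free pointer into a free block (mirrors set_slob()):
 * blocks of 2+ units store size then next-offset; 1-unit blocks store only
 * the negated offset of the next free block. */
static void set_block(slob_t *s, slobidx_t size, slob_t *next)
{
    slobidx_t offset = next - page;     /* offset from page base, in units */

    if (size > 1) {
        s[0].units = size;
        s[1].units = offset;
    } else
        s[0].units = -offset;
}

static slobidx_t block_units(slob_t *s)
{
    return s->units > 0 ? s->units : 1;
}

static slob_t *block_next(slob_t *s)
{
    return page + (s[0].units < 0 ? -s[0].units : s[1].units);
}

int main(void)
{
    /* A 3-unit free block at unit 10 chaining to a 1-unit block at unit 50,
     * which in turn chains to unit 60. */
    set_block(&page[10], 3, &page[50]);
    set_block(&page[50], 1, &page[60]);

    printf("block@10: %d units, next at unit %ld\n",
           (int)block_units(&page[10]), (long)(block_next(&page[10]) - page));
    printf("block@50: %d units, next at unit %ld\n",
           (int)block_units(&page[50]), (long)(block_next(&page[50]) - page));
    return 0;
}

Running the model prints the decoded size and next-offset for both blocks, which is exactly the information slob_units() and slob_next() recover from a real slob page.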
@@ -64,133 +166,285 @@ struct slob_rcu { | |||
64 | int size; | 166 | int size; |
65 | }; | 167 | }; |
66 | 168 | ||
67 | static slob_t arena = { .next = &arena, .units = 1 }; | 169 | /* |
68 | static slob_t *slobfree = &arena; | 170 | * slob_lock protects all slob allocator structures. |
69 | static bigblock_t *bigblocks; | 171 | */ |
70 | static DEFINE_SPINLOCK(slob_lock); | 172 | static DEFINE_SPINLOCK(slob_lock); |
71 | static DEFINE_SPINLOCK(block_lock); | ||
72 | 173 | ||
73 | static void slob_free(void *b, int size); | 174 | /* |
74 | static void slob_timer_cbk(void); | 175 | * Encode the given size and next info into a free slob block s. |
176 | */ | ||
177 | static void set_slob(slob_t *s, slobidx_t size, slob_t *next) | ||
178 | { | ||
179 | slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); | ||
180 | slobidx_t offset = next - base; | ||
75 | 181 | ||
182 | if (size > 1) { | ||
183 | s[0].units = size; | ||
184 | s[1].units = offset; | ||
185 | } else | ||
186 | s[0].units = -offset; | ||
187 | } | ||
76 | 188 | ||
77 | static void *slob_alloc(size_t size, gfp_t gfp, int align) | 189 | /* |
190 | * Return the size of a slob block. | ||
191 | */ | ||
192 | static slobidx_t slob_units(slob_t *s) | ||
193 | { | ||
194 | if (s->units > 0) | ||
195 | return s->units; | ||
196 | return 1; | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * Return the next free slob block pointer after this one. | ||
201 | */ | ||
202 | static slob_t *slob_next(slob_t *s) | ||
203 | { | ||
204 | slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); | ||
205 | slobidx_t next; | ||
206 | |||
207 | if (s[0].units < 0) | ||
208 | next = -s[0].units; | ||
209 | else | ||
210 | next = s[1].units; | ||
211 | return base+next; | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Returns true if s is the last free block in its page. | ||
216 | */ | ||
217 | static int slob_last(slob_t *s) | ||
218 | { | ||
219 | return !((unsigned long)slob_next(s) & ~PAGE_MASK); | ||
220 | } | ||
221 | |||
222 | static void *slob_new_page(gfp_t gfp, int order, int node) | ||
223 | { | ||
224 | void *page; | ||
225 | |||
226 | #ifdef CONFIG_NUMA | ||
227 | if (node != -1) | ||
228 | page = alloc_pages_node(node, gfp, order); | ||
229 | else | ||
230 | #endif | ||
231 | page = alloc_pages(gfp, order); | ||
232 | |||
233 | if (!page) | ||
234 | return NULL; | ||
235 | |||
236 | return page_address(page); | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Allocate a slob block within a given slob_page sp. | ||
241 | */ | ||
242 | static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | ||
78 | { | 243 | { |
79 | slob_t *prev, *cur, *aligned = 0; | 244 | slob_t *prev, *cur, *aligned = 0; |
80 | int delta = 0, units = SLOB_UNITS(size); | 245 | int delta = 0, units = SLOB_UNITS(size); |
81 | unsigned long flags; | ||
82 | 246 | ||
83 | spin_lock_irqsave(&slob_lock, flags); | 247 | for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { |
84 | prev = slobfree; | 248 | slobidx_t avail = slob_units(cur); |
85 | for (cur = prev->next; ; prev = cur, cur = cur->next) { | 249 | |
86 | if (align) { | 250 | if (align) { |
87 | aligned = (slob_t *)ALIGN((unsigned long)cur, align); | 251 | aligned = (slob_t *)ALIGN((unsigned long)cur, align); |
88 | delta = aligned - cur; | 252 | delta = aligned - cur; |
89 | } | 253 | } |
90 | if (cur->units >= units + delta) { /* room enough? */ | 254 | if (avail >= units + delta) { /* room enough? */ |
255 | slob_t *next; | ||
256 | |||
91 | if (delta) { /* need to fragment head to align? */ | 257 | if (delta) { /* need to fragment head to align? */ |
92 | aligned->units = cur->units - delta; | 258 | next = slob_next(cur); |
93 | aligned->next = cur->next; | 259 | set_slob(aligned, avail - delta, next); |
94 | cur->next = aligned; | 260 | set_slob(cur, delta, aligned); |
95 | cur->units = delta; | ||
96 | prev = cur; | 261 | prev = cur; |
97 | cur = aligned; | 262 | cur = aligned; |
263 | avail = slob_units(cur); | ||
98 | } | 264 | } |
99 | 265 | ||
100 | if (cur->units == units) /* exact fit? */ | 266 | next = slob_next(cur); |
101 | prev->next = cur->next; /* unlink */ | 267 | if (avail == units) { /* exact fit? unlink. */ |
102 | else { /* fragment */ | 268 | if (prev) |
103 | prev->next = cur + units; | 269 | set_slob(prev, slob_units(prev), next); |
104 | prev->next->units = cur->units - units; | 270 | else |
105 | prev->next->next = cur->next; | 271 | sp->free = next; |
106 | cur->units = units; | 272 | } else { /* fragment */ |
273 | if (prev) | ||
274 | set_slob(prev, slob_units(prev), cur + units); | ||
275 | else | ||
276 | sp->free = cur + units; | ||
277 | set_slob(cur + units, avail - units, next); | ||
107 | } | 278 | } |
108 | 279 | ||
109 | slobfree = prev; | 280 | sp->units -= units; |
110 | spin_unlock_irqrestore(&slob_lock, flags); | 281 | if (!sp->units) |
282 | clear_slob_page_free(sp); | ||
111 | return cur; | 283 | return cur; |
112 | } | 284 | } |
113 | if (cur == slobfree) { | 285 | if (slob_last(cur)) |
114 | spin_unlock_irqrestore(&slob_lock, flags); | 286 | return NULL; |
115 | 287 | } | |
116 | if (size == PAGE_SIZE) /* trying to shrink arena? */ | 288 | } |
117 | return 0; | ||
118 | 289 | ||
119 | cur = (slob_t *)__get_free_page(gfp); | 290 | /* |
120 | if (!cur) | 291 | * slob_alloc: entry point into the slob allocator. |
121 | return 0; | 292 | */ |
293 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | ||
294 | { | ||
295 | struct slob_page *sp; | ||
296 | slob_t *b = NULL; | ||
297 | unsigned long flags; | ||
122 | 298 | ||
123 | slob_free(cur, PAGE_SIZE); | 299 | spin_lock_irqsave(&slob_lock, flags); |
124 | spin_lock_irqsave(&slob_lock, flags); | 300 | /* Iterate through each partially free page, try to find room */ |
125 | cur = slobfree; | 301 | list_for_each_entry(sp, &free_slob_pages, list) { |
302 | #ifdef CONFIG_NUMA | ||
303 | /* | ||
304 | * If there's a node specification, search for a partial | ||
305 | * page with a matching node id in the freelist. | ||
306 | */ | ||
307 | if (node != -1 && page_to_nid(&sp->page) != node) | ||
308 | continue; | ||
309 | #endif | ||
310 | |||
311 | if (sp->units >= SLOB_UNITS(size)) { | ||
312 | b = slob_page_alloc(sp, size, align); | ||
313 | if (b) | ||
314 | break; | ||
126 | } | 315 | } |
127 | } | 316 | } |
317 | spin_unlock_irqrestore(&slob_lock, flags); | ||
318 | |||
319 | /* Not enough space: must allocate a new page */ | ||
320 | if (!b) { | ||
321 | b = slob_new_page(gfp, 0, node); | ||
322 | if (!b) | ||
323 | return 0; | ||
324 | sp = (struct slob_page *)virt_to_page(b); | ||
325 | set_slob_page(sp); | ||
326 | |||
327 | spin_lock_irqsave(&slob_lock, flags); | ||
328 | sp->units = SLOB_UNITS(PAGE_SIZE); | ||
329 | sp->free = b; | ||
330 | INIT_LIST_HEAD(&sp->list); | ||
331 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); | ||
332 | set_slob_page_free(sp); | ||
333 | b = slob_page_alloc(sp, size, align); | ||
334 | BUG_ON(!b); | ||
335 | spin_unlock_irqrestore(&slob_lock, flags); | ||
336 | } | ||
337 | return b; | ||
128 | } | 338 | } |
129 | 339 | ||
340 | /* | ||
341 | * slob_free: entry point into the slob allocator. | ||
342 | */ | ||
130 | static void slob_free(void *block, int size) | 343 | static void slob_free(void *block, int size) |
131 | { | 344 | { |
132 | slob_t *cur, *b = (slob_t *)block; | 345 | struct slob_page *sp; |
346 | slob_t *prev, *next, *b = (slob_t *)block; | ||
347 | slobidx_t units; | ||
133 | unsigned long flags; | 348 | unsigned long flags; |
134 | 349 | ||
135 | if (!block) | 350 | if (!block) |
136 | return; | 351 | return; |
352 | BUG_ON(!size); | ||
137 | 353 | ||
138 | if (size) | 354 | sp = (struct slob_page *)virt_to_page(block); |
139 | b->units = SLOB_UNITS(size); | 355 | units = SLOB_UNITS(size); |
140 | 356 | ||
141 | /* Find reinsertion point */ | ||
142 | spin_lock_irqsave(&slob_lock, flags); | 357 | spin_lock_irqsave(&slob_lock, flags); |
143 | for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) | ||
144 | if (cur >= cur->next && (b > cur || b < cur->next)) | ||
145 | break; | ||
146 | 358 | ||
147 | if (b + b->units == cur->next) { | 359 | if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) { |
148 | b->units += cur->next->units; | 360 | /* Go directly to page allocator. Do not pass slob allocator */ |
149 | b->next = cur->next->next; | 361 | if (slob_page_free(sp)) |
150 | } else | 362 | clear_slob_page_free(sp); |
151 | b->next = cur->next; | 363 | clear_slob_page(sp); |
364 | free_slob_page(sp); | ||
365 | free_page((unsigned long)b); | ||
366 | goto out; | ||
367 | } | ||
152 | 368 | ||
153 | if (cur + cur->units == b) { | 369 | if (!slob_page_free(sp)) { |
154 | cur->units += b->units; | 370 | /* This slob page is about to become partially free. Easy! */ |
155 | cur->next = b->next; | 371 | sp->units = units; |
156 | } else | 372 | sp->free = b; |
157 | cur->next = b; | 373 | set_slob(b, units, |
374 | (void *)((unsigned long)(b + | ||
375 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); | ||
376 | set_slob_page_free(sp); | ||
377 | goto out; | ||
378 | } | ||
158 | 379 | ||
159 | slobfree = cur; | 380 | /* |
381 | * Otherwise the page is already partially free, so find reinsertion | ||
382 | * point. | ||
383 | */ | ||
384 | sp->units += units; | ||
160 | 385 | ||
386 | if (b < sp->free) { | ||
387 | set_slob(b, units, sp->free); | ||
388 | sp->free = b; | ||
389 | } else { | ||
390 | prev = sp->free; | ||
391 | next = slob_next(prev); | ||
392 | while (b > next) { | ||
393 | prev = next; | ||
394 | next = slob_next(prev); | ||
395 | } | ||
396 | |||
397 | if (!slob_last(prev) && b + units == next) { | ||
398 | units += slob_units(next); | ||
399 | set_slob(b, units, slob_next(next)); | ||
400 | } else | ||
401 | set_slob(b, units, next); | ||
402 | |||
403 | if (prev + slob_units(prev) == b) { | ||
404 | units = slob_units(b) + slob_units(prev); | ||
405 | set_slob(prev, units, slob_next(b)); | ||
406 | } else | ||
407 | set_slob(prev, slob_units(prev), b); | ||
408 | } | ||
409 | out: | ||
161 | spin_unlock_irqrestore(&slob_lock, flags); | 410 | spin_unlock_irqrestore(&slob_lock, flags); |
162 | } | 411 | } |
163 | 412 | ||
164 | void *__kmalloc(size_t size, gfp_t gfp) | 413 | /* |
165 | { | 414 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. |
166 | slob_t *m; | 415 | */ |
167 | bigblock_t *bb; | ||
168 | unsigned long flags; | ||
169 | 416 | ||
170 | if (size < PAGE_SIZE - SLOB_UNIT) { | 417 | #ifndef ARCH_KMALLOC_MINALIGN |
171 | m = slob_alloc(size + SLOB_UNIT, gfp, 0); | 418 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) |
172 | return m ? (void *)(m + 1) : 0; | 419 | #endif |
173 | } | ||
174 | 420 | ||
175 | bb = slob_alloc(sizeof(bigblock_t), gfp, 0); | 421 | #ifndef ARCH_SLAB_MINALIGN |
176 | if (!bb) | 422 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long) |
177 | return 0; | 423 | #endif |
178 | 424 | ||
179 | bb->order = get_order(size); | 425 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) |
180 | bb->pages = (void *)__get_free_pages(gfp, bb->order); | 426 | { |
427 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | ||
428 | |||
429 | if (size < PAGE_SIZE - align) { | ||
430 | unsigned int *m; | ||
431 | m = slob_alloc(size + align, gfp, align, node); | ||
432 | if (m) | ||
433 | *m = size; | ||
434 | return (void *)m + align; | ||
435 | } else { | ||
436 | void *ret; | ||
181 | 437 | ||
182 | if (bb->pages) { | 438 | ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); |
183 | spin_lock_irqsave(&block_lock, flags); | 439 | if (ret) { |
184 | bb->next = bigblocks; | 440 | struct page *page; |
185 | bigblocks = bb; | 441 | page = virt_to_page(ret); |
186 | spin_unlock_irqrestore(&block_lock, flags); | 442 | page->private = size; |
187 | return bb->pages; | 443 | } |
444 | return ret; | ||
188 | } | 445 | } |
189 | |||
190 | slob_free(bb, sizeof(bigblock_t)); | ||
191 | return 0; | ||
192 | } | 446 | } |
193 | EXPORT_SYMBOL(__kmalloc); | 447 | EXPORT_SYMBOL(__kmalloc_node); |
194 | 448 | ||
195 | /** | 449 | /** |
196 | * krealloc - reallocate memory. The contents will remain unchanged. | 450 | * krealloc - reallocate memory. The contents will remain unchanged. |
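The new __kmalloc_node() above handles sub-page requests by asking slob_alloc() for size + align bytes and recording the requested size in the align-sized header it prepends, so kfree() and ksize() can recover the size by stepping back from the returned pointer. Below is a minimal user-space sketch of that prepended-header bookkeeping only; malloc() stands in for the slob heap, and toy_kmalloc()/toy_ksize()/toy_kfree() and MINALIGN are invented names, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN). */
#define MINALIGN (sizeof(unsigned long))

/* Allocate size bytes with a MINALIGN-sized header recording the size,
 * the same trick the patched __kmalloc_node() uses for sub-page sizes. */
static void *toy_kmalloc(size_t size)
{
    unsigned int *m = malloc(size + MINALIGN);  /* header + payload */

    if (!m)
        return NULL;
    *m = (unsigned int)size;                    /* stash requested size */
    return (char *)m + MINALIGN;                /* hand out the payload */
}

static size_t toy_ksize(const void *p)
{
    return *(const unsigned int *)((const char *)p - MINALIGN);
}

static void toy_kfree(void *p)
{
    if (p)
        free((char *)p - MINALIGN);             /* step back to the header */
}

int main(void)
{
    char *buf = toy_kmalloc(13);

    strcpy(buf, "hello, slob");
    printf("%s (ksize=%zu)\n", buf, toy_ksize(buf));
    toy_kfree(buf);
    return 0;
}

Page-sized and larger requests take the other branch in the patch, going straight to the page allocator with __GFP_COMP and stashing the size in page->private instead of in a header.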
@@ -227,52 +481,34 @@ EXPORT_SYMBOL(krealloc); | |||
227 | 481 | ||
228 | void kfree(const void *block) | 482 | void kfree(const void *block) |
229 | { | 483 | { |
230 | bigblock_t *bb, **last = &bigblocks; | 484 | struct slob_page *sp; |
231 | unsigned long flags; | ||
232 | 485 | ||
233 | if (!block) | 486 | if (!block) |
234 | return; | 487 | return; |
235 | 488 | ||
236 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | 489 | sp = (struct slob_page *)virt_to_page(block); |
237 | /* might be on the big block list */ | 490 | if (slob_page(sp)) { |
238 | spin_lock_irqsave(&block_lock, flags); | 491 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
239 | for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { | 492 | unsigned int *m = (unsigned int *)(block - align); |
240 | if (bb->pages == block) { | 493 | slob_free(m, *m + align); |
241 | *last = bb->next; | 494 | } else |
242 | spin_unlock_irqrestore(&block_lock, flags); | 495 | put_page(&sp->page); |
243 | free_pages((unsigned long)block, bb->order); | ||
244 | slob_free(bb, sizeof(bigblock_t)); | ||
245 | return; | ||
246 | } | ||
247 | } | ||
248 | spin_unlock_irqrestore(&block_lock, flags); | ||
249 | } | ||
250 | |||
251 | slob_free((slob_t *)block - 1, 0); | ||
252 | return; | ||
253 | } | 496 | } |
254 | |||
255 | EXPORT_SYMBOL(kfree); | 497 | EXPORT_SYMBOL(kfree); |
256 | 498 | ||
499 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ | ||
257 | size_t ksize(const void *block) | 500 | size_t ksize(const void *block) |
258 | { | 501 | { |
259 | bigblock_t *bb; | 502 | struct slob_page *sp; |
260 | unsigned long flags; | ||
261 | 503 | ||
262 | if (!block) | 504 | if (!block) |
263 | return 0; | 505 | return 0; |
264 | 506 | ||
265 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | 507 | sp = (struct slob_page *)virt_to_page(block); |
266 | spin_lock_irqsave(&block_lock, flags); | 508 | if (slob_page(sp)) |
267 | for (bb = bigblocks; bb; bb = bb->next) | 509 | return ((slob_t *)block - 1)->units + SLOB_UNIT; |
268 | if (bb->pages == block) { | 510 | else |
269 | spin_unlock_irqrestore(&slob_lock, flags); | 511 | return sp->page.private; |
270 | return PAGE_SIZE << bb->order; | ||
271 | } | ||
272 | spin_unlock_irqrestore(&block_lock, flags); | ||
273 | } | ||
274 | |||
275 | return ((slob_t *)block - 1)->units * SLOB_UNIT; | ||
276 | } | 512 | } |
277 | 513 | ||
278 | struct kmem_cache { | 514 | struct kmem_cache { |
@@ -289,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
289 | { | 525 | { |
290 | struct kmem_cache *c; | 526 | struct kmem_cache *c; |
291 | 527 | ||
292 | c = slob_alloc(sizeof(struct kmem_cache), flags, 0); | 528 | c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1); |
293 | 529 | ||
294 | if (c) { | 530 | if (c) { |
295 | c->name = name; | 531 | c->name = name; |
@@ -302,6 +538,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
302 | c->ctor = ctor; | 538 | c->ctor = ctor; |
303 | /* ignore alignment unless it's forced */ | 539 | /* ignore alignment unless it's forced */ |
304 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | 540 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; |
541 | if (c->align < ARCH_SLAB_MINALIGN) | ||
542 | c->align = ARCH_SLAB_MINALIGN; | ||
305 | if (c->align < align) | 543 | if (c->align < align) |
306 | c->align = align; | 544 | c->align = align; |
307 | } else if (flags & SLAB_PANIC) | 545 | } else if (flags & SLAB_PANIC) |
@@ -317,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c) | |||
317 | } | 555 | } |
318 | EXPORT_SYMBOL(kmem_cache_destroy); | 556 | EXPORT_SYMBOL(kmem_cache_destroy); |
319 | 557 | ||
320 | void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | 558 | void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) |
321 | { | 559 | { |
322 | void *b; | 560 | void *b; |
323 | 561 | ||
324 | if (c->size < PAGE_SIZE) | 562 | if (c->size < PAGE_SIZE) |
325 | b = slob_alloc(c->size, flags, c->align); | 563 | b = slob_alloc(c->size, flags, c->align, node); |
326 | else | 564 | else |
327 | b = (void *)__get_free_pages(flags, get_order(c->size)); | 565 | b = slob_new_page(flags, get_order(c->size), node); |
328 | 566 | ||
329 | if (c->ctor) | 567 | if (c->ctor) |
330 | c->ctor(b, c, 0); | 568 | c->ctor(b, c, 0); |
331 | 569 | ||
332 | return b; | 570 | return b; |
333 | } | 571 | } |
334 | EXPORT_SYMBOL(kmem_cache_alloc); | 572 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
335 | 573 | ||
336 | void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) | 574 | void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) |
337 | { | 575 | { |
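The kmem_cache_alloc_node() hunk above shows how thin SLOB's SLAB emulation is: a cache records little more than an object size and a constructor, and the constructor is simply run on every allocation. The following user-space model is a hedged illustration of that idea; struct toy_cache, toy_cache_alloc() and the widget example are invented, and malloc() stands in for slob_alloc()/slob_new_page().

#include <stdio.h>
#include <stdlib.h>

/* Minimal model of SLOB's cache emulation: no per-cache slabs or freelists,
 * just a remembered size and constructor. */
struct toy_cache {
    size_t size;
    void (*ctor)(void *obj);
};

static void *toy_cache_alloc(struct toy_cache *c)
{
    void *obj = malloc(c->size);   /* stands in for slob_alloc()/alloc_pages() */

    if (obj && c->ctor)
        c->ctor(obj);              /* constructor runs on every allocation */
    return obj;
}

struct widget { int refcount; };

static void widget_ctor(void *obj)
{
    ((struct widget *)obj)->refcount = 1;
}

int main(void)
{
    struct toy_cache cache = { sizeof(struct widget), widget_ctor };
    struct widget *w = toy_cache_alloc(&cache);

    printf("new widget refcount = %d\n", w->refcount);
    free(w);
    return 0;
}

Because each object already knows its size via the cache, no kmalloc-style size header is needed, which is why the patch's comment notes there is essentially no allocation space overhead for caches.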
@@ -385,9 +623,6 @@ const char *kmem_cache_name(struct kmem_cache *c) | |||
385 | } | 623 | } |
386 | EXPORT_SYMBOL(kmem_cache_name); | 624 | EXPORT_SYMBOL(kmem_cache_name); |
387 | 625 | ||
388 | static struct timer_list slob_timer = TIMER_INITIALIZER( | ||
389 | (void (*)(unsigned long))slob_timer_cbk, 0, 0); | ||
390 | |||
391 | int kmem_cache_shrink(struct kmem_cache *d) | 626 | int kmem_cache_shrink(struct kmem_cache *d) |
392 | { | 627 | { |
393 | return 0; | 628 | return 0; |
@@ -399,17 +634,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b) | |||
399 | return 0; | 634 | return 0; |
400 | } | 635 | } |
401 | 636 | ||
402 | void __init kmem_cache_init(void) | 637 | static unsigned int slob_ready __read_mostly; |
638 | |||
639 | int slab_is_available(void) | ||
403 | { | 640 | { |
404 | slob_timer_cbk(); | 641 | return slob_ready; |
405 | } | 642 | } |
406 | 643 | ||
407 | static void slob_timer_cbk(void) | 644 | void __init kmem_cache_init(void) |
408 | { | 645 | { |
409 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); | 646 | slob_ready = 1; |
410 | |||
411 | if (p) | ||
412 | free_page((unsigned long)p); | ||
413 | |||
414 | mod_timer(&slob_timer, jiffies + HZ); | ||
415 | } | 647 | } |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -323,7 +323,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | |||
323 | /* | 323 | /* |
324 | * Debug settings: | 324 | * Debug settings: |
325 | */ | 325 | */ |
326 | #ifdef CONFIG_SLUB_DEBUG_ON | ||
327 | static int slub_debug = DEBUG_DEFAULT_FLAGS; | ||
328 | #else | ||
326 | static int slub_debug; | 329 | static int slub_debug; |
330 | #endif | ||
327 | 331 | ||
328 | static char *slub_debug_slabs; | 332 | static char *slub_debug_slabs; |
329 | 333 | ||
@@ -888,38 +892,57 @@ fail: | |||
888 | 892 | ||
889 | static int __init setup_slub_debug(char *str) | 893 | static int __init setup_slub_debug(char *str) |
890 | { | 894 | { |
891 | if (!str || *str != '=') | 895 | slub_debug = DEBUG_DEFAULT_FLAGS; |
892 | slub_debug = DEBUG_DEFAULT_FLAGS; | 896 | if (*str++ != '=' || !*str) |
893 | else { | 897 | /* |
894 | str++; | 898 | * No options specified. Switch on full debugging. |
895 | if (*str == 0 || *str == ',') | 899 | */ |
896 | slub_debug = DEBUG_DEFAULT_FLAGS; | 900 | goto out; |
897 | else | 901 | |
898 | for( ;*str && *str != ','; str++) | 902 | if (*str == ',') |
899 | switch (*str) { | 903 | /* |
900 | case 'f' : case 'F' : | 904 | * No options but restriction on slabs. This means full |
901 | slub_debug |= SLAB_DEBUG_FREE; | 905 | * debugging for slabs matching a pattern. |
902 | break; | 906 | */ |
903 | case 'z' : case 'Z' : | 907 | goto check_slabs; |
904 | slub_debug |= SLAB_RED_ZONE; | 908 | |
905 | break; | 909 | slub_debug = 0; |
906 | case 'p' : case 'P' : | 910 | if (*str == '-') |
907 | slub_debug |= SLAB_POISON; | 911 | /* |
908 | break; | 912 | * Switch off all debugging measures. |
909 | case 'u' : case 'U' : | 913 | */ |
910 | slub_debug |= SLAB_STORE_USER; | 914 | goto out; |
911 | break; | 915 | |
912 | case 't' : case 'T' : | 916 | /* |
913 | slub_debug |= SLAB_TRACE; | 917 | * Determine which debug features should be switched on |
914 | break; | 918 | */ |
915 | default: | 919 | for ( ;*str && *str != ','; str++) { |
916 | printk(KERN_ERR "slub_debug option '%c' " | 920 | switch (tolower(*str)) { |
917 | "unknown. skipped\n",*str); | 921 | case 'f': |
918 | } | 922 | slub_debug |= SLAB_DEBUG_FREE; |
923 | break; | ||
924 | case 'z': | ||
925 | slub_debug |= SLAB_RED_ZONE; | ||
926 | break; | ||
927 | case 'p': | ||
928 | slub_debug |= SLAB_POISON; | ||
929 | break; | ||
930 | case 'u': | ||
931 | slub_debug |= SLAB_STORE_USER; | ||
932 | break; | ||
933 | case 't': | ||
934 | slub_debug |= SLAB_TRACE; | ||
935 | break; | ||
936 | default: | ||
937 | printk(KERN_ERR "slub_debug option '%c' " | ||
938 | "unknown. skipped\n",*str); | ||
939 | } | ||
919 | } | 940 | } |
920 | 941 | ||
942 | check_slabs: | ||
921 | if (*str == ',') | 943 | if (*str == ',') |
922 | slub_debug_slabs = str + 1; | 944 | slub_debug_slabs = str + 1; |
945 | out: | ||
923 | return 1; | 946 | return 1; |
924 | } | 947 | } |
925 | 948 | ||
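The rewritten setup_slub_debug() above accepts forms such as slub_debug (full debugging), slub_debug=- (everything off), slub_debug=zp (selected flags) and slub_debug=,dentry (full debugging restricted to slabs matching a name). The sketch below reproduces that parsing flow as stand-alone C; the F_* flag values and the parse() helper are invented stand-ins for the SLAB_* debug flags and the kernel's __setup handler.

#include <stdio.h>
#include <ctype.h>

/* Invented flag bits standing in for SLAB_DEBUG_FREE and friends. */
#define F_FREE   0x01
#define F_ZONE   0x02
#define F_POISON 0x04
#define F_USER   0x08
#define F_TRACE  0x10
#define F_ALL    (F_FREE | F_ZONE | F_POISON | F_USER | F_TRACE)

/* Parse the value part of "slub_debug...": empty means full debugging,
 * a leading ',' means full debugging restricted to named slabs, '-' turns
 * everything off, otherwise letters select individual flags. */
static void parse(const char *str, int *flags, const char **slabs)
{
    *flags = F_ALL;
    *slabs = NULL;

    if (*str++ != '=' || !*str)
        return;                 /* no options: full debugging */
    if (*str == ',')
        goto check_slabs;       /* full debugging, restricted slabs */

    *flags = 0;
    if (*str == '-')
        return;                 /* all debugging switched off */

    for (; *str && *str != ','; str++) {
        switch (tolower((unsigned char)*str)) {
        case 'f': *flags |= F_FREE;   break;
        case 'z': *flags |= F_ZONE;   break;
        case 'p': *flags |= F_POISON; break;
        case 'u': *flags |= F_USER;   break;
        case 't': *flags |= F_TRACE;  break;
        default:  printf("unknown option '%c', skipped\n", *str);
        }
    }
check_slabs:
    if (*str == ',')
        *slabs = str + 1;       /* slab name pattern follows the comma */
}

int main(void)
{
    int flags;
    const char *slabs;

    parse("=zp,dentry", &flags, &slabs);
    printf("flags=0x%02x slabs=%s\n", (unsigned)flags, slabs ? slabs : "(all)");
    parse("=-", &flags, &slabs);
    printf("flags=0x%02x slabs=%s\n", (unsigned)flags, slabs ? slabs : "(all)");
    return 0;
}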
diff --git a/mm/swap_state.c b/mm/swap_state.c index 5f7cf2a4cb55..925d5c50f18d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -21,7 +21,7 @@ | |||
21 | 21 | ||
22 | /* | 22 | /* |
23 | * swapper_space is a fiction, retained to simplify the path through | 23 | * swapper_space is a fiction, retained to simplify the path through |
24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow | 24 | * vmscan's shrink_page_list, to make sync_page look nicer, and to allow |
25 | * future use of radix_tree tags in the swap cache. | 25 | * future use of radix_tree tags in the swap cache. |
26 | */ | 26 | */ |
27 | static const struct address_space_operations swap_aops = { | 27 | static const struct address_space_operations swap_aops = { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index acc172cbe3aa..7ff0a81c7b01 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type) | |||
885 | /* | 885 | /* |
886 | * So we could skip searching mms once swap count went | 886 | * So we could skip searching mms once swap count went |
887 | * to 1, we did not mark any present ptes as dirty: must | 887 | * to 1, we did not mark any present ptes as dirty: must |
888 | * mark page dirty so shrink_list will preserve it. | 888 | * mark page dirty so shrink_page_list will preserve it. |
889 | */ | 889 | */ |
890 | SetPageDirty(page); | 890 | SetPageDirty(page); |
891 | unlock_page(page); | 891 | unlock_page(page); |
diff --git a/mm/truncate.c b/mm/truncate.c index 4fbe1a2da5fb..7c994f2d6145 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
253 | } | 253 | } |
254 | EXPORT_SYMBOL(truncate_inode_pages); | 254 | EXPORT_SYMBOL(truncate_inode_pages); |
255 | 255 | ||
256 | /** | 256 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, |
257 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | 257 | pgoff_t start, pgoff_t end, bool be_atomic) |
258 | * @mapping: the address_space which holds the pages to invalidate | ||
259 | * @start: the offset 'from' which to invalidate | ||
260 | * @end: the offset 'to' which to invalidate (inclusive) | ||
261 | * | ||
262 | * This function only removes the unlocked pages, if you want to | ||
263 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
264 | * | ||
265 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
266 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
267 | * pagetables. | ||
268 | */ | ||
269 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
270 | pgoff_t start, pgoff_t end) | ||
271 | { | 258 | { |
272 | struct pagevec pvec; | 259 | struct pagevec pvec; |
273 | pgoff_t next = start; | 260 | pgoff_t next = start; |
@@ -308,17 +295,38 @@ unlock: | |||
308 | break; | 295 | break; |
309 | } | 296 | } |
310 | pagevec_release(&pvec); | 297 | pagevec_release(&pvec); |
298 | if (likely(!be_atomic)) | ||
299 | cond_resched(); | ||
311 | } | 300 | } |
312 | return ret; | 301 | return ret; |
313 | } | 302 | } |
303 | |||
304 | /** | ||
305 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
306 | * @mapping: the address_space which holds the pages to invalidate | ||
307 | * @start: the offset 'from' which to invalidate | ||
308 | * @end: the offset 'to' which to invalidate (inclusive) | ||
309 | * | ||
310 | * This function only removes the unlocked pages, if you want to | ||
311 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
312 | * | ||
313 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
314 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
315 | * pagetables. | ||
316 | */ | ||
317 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
318 | pgoff_t start, pgoff_t end) | ||
319 | { | ||
320 | return __invalidate_mapping_pages(mapping, start, end, false); | ||
321 | } | ||
314 | EXPORT_SYMBOL(invalidate_mapping_pages); | 322 | EXPORT_SYMBOL(invalidate_mapping_pages); |
315 | 323 | ||
316 | /* | 324 | /* |
317 | * This is like invalidate_complete_page(), except it ignores the page's | 325 | * This is like invalidate_complete_page(), except it ignores the page's |
318 | * refcount. We do this because invalidate_inode_pages2() needs stronger | 326 | * refcount. We do this because invalidate_inode_pages2() needs stronger |
319 | * invalidation guarantees, and cannot afford to leave pages behind because | 327 | * invalidation guarantees, and cannot afford to leave pages behind because |
320 | * shrink_list() has a temp ref on them, or because they're transiently sitting | 328 | * shrink_page_list() has a temp ref on them, or because they're transiently |
321 | * in the lru_cache_add() pagevecs. | 329 | * sitting in the lru_cache_add() pagevecs. |
322 | */ | 330 | */ |
323 | static int | 331 | static int |
324 | invalidate_complete_page2(struct address_space *mapping, struct page *page) | 332 | invalidate_complete_page2(struct address_space *mapping, struct page *page) |
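The truncate.c hunks above move the body of invalidate_mapping_pages() into __invalidate_mapping_pages(), which gains a be_atomic flag: only the non-atomic path may cond_resched() between pagevec batches, and the exported function remains a thin wrapper passing false so existing callers are unchanged. The following generic sketch shows that "flag-extended helper plus compatibility wrapper" pattern under invented names (__process_range(), process_range()), with sched_yield() standing in for cond_resched(); it is an illustration of the refactoring shape, not the mm code itself.

#include <stdio.h>
#include <stdbool.h>
#include <sched.h>

/* The real work lives in the "__" helper; the extra flag decides whether
 * the loop may yield the CPU between batches. */
static unsigned long __process_range(unsigned long start, unsigned long end,
                                     bool be_atomic)
{
    unsigned long done = 0, next;

    for (next = start; next <= end; next += 16) {   /* one "pagevec" batch */
        done += 16;                                 /* pretend to invalidate */
        if (!be_atomic)
            sched_yield();      /* stands in for cond_resched() */
    }
    return done;
}

/* Old API, preserved for existing callers: never atomic, may yield. */
static unsigned long process_range(unsigned long start, unsigned long end)
{
    return __process_range(start, end, false);
}

int main(void)
{
    printf("processed %lu units\n", process_range(0, 127));
    printf("processed %lu units (atomic caller)\n",
           __process_range(0, 127, true));
    return 0;
}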