Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    4
-rw-r--r--  mm/backing-dev.c     |   16
-rw-r--r--  mm/filemap.c         |   26
-rw-r--r--  mm/filemap_xip.c     |   22
-rw-r--r--  mm/hugetlb.c         |   24
-rw-r--r--  mm/madvise.c         |    6
-rw-r--r--  mm/memory.c          |   25
-rw-r--r--  mm/mempolicy.c       |   51
-rw-r--r--  mm/mempool.c         |    3
-rw-r--r--  mm/mlock.c           |    5
-rw-r--r--  mm/mmap.c            |   38
-rw-r--r--  mm/mremap.c          |   13
-rw-r--r--  mm/nommu.c           |    7
-rw-r--r--  mm/page-writeback.c  |   10
-rw-r--r--  mm/page_alloc.c      |  332
-rw-r--r--  mm/rmap.c            |   24
-rw-r--r--  mm/shmem.c           |   44
-rw-r--r--  mm/slab.c            |   67
-rw-r--r--  mm/slob.c            |  538
-rw-r--r--  mm/slub.c            |  142
-rw-r--r--  mm/sparse.c          |   42
-rw-r--r--  mm/swap_state.c      |    2
-rw-r--r--  mm/swapfile.c        |    2
-rw-r--r--  mm/truncate.c        |   42
-rw-r--r--  mm/vmstat.c          |    2
25 files changed, 1009 insertions, 478 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 62e5d0d0bd5a..086af703da43 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -168,3 +168,7 @@ config NR_QUICK
 	depends on QUICKLIST
 	default "2" if (SUPERH && !SUPERH64)
 	default "1"
+
+config VIRT_TO_BUS
+	def_bool y
+	depends on !ARCH_NO_VIRT_TO_BUS
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e5de3781d3fe..f50a2811f9dc 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -55,22 +55,6 @@ long congestion_wait(int rw, long timeout)
 }
 EXPORT_SYMBOL(congestion_wait);
 
-long congestion_wait_interruptible(int rw, long timeout)
-{
-	long ret;
-	DEFINE_WAIT(wait);
-	wait_queue_head_t *wqh = &congestion_wqh[rw];
-
-	prepare_to_wait(wqh, &wait, TASK_INTERRUPTIBLE);
-	if (signal_pending(current))
-		ret = -ERESTARTSYS;
-	else
-		ret = io_schedule_timeout(timeout);
-	finish_wait(wqh, &wait);
-	return ret;
-}
-EXPORT_SYMBOL(congestion_wait_interruptible);
-
 /**
  * congestion_end - wake up sleepers on a congested backing_dev_info
  * @rw: READ or WRITE
diff --git a/mm/filemap.c b/mm/filemap.c
index edb1b0b5cc8d..100b99c2d504 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -120,6 +120,7 @@ void __remove_from_page_cache(struct page *page)
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
+	BUG_ON(page_mapped(page));
 }
 
 void remove_from_page_cache(struct page *page)
@@ -1218,6 +1219,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 				retval = retval ?: desc.error;
 				break;
 			}
+			if (desc.count > 0)
+				break;
 		}
 	}
 out:
@@ -1245,26 +1248,6 @@ int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long o
 	return written;
 }
 
-ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
-			 size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_generic_file_read(in_file, ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL(generic_file_sendfile);
-
 static ssize_t
 do_readahead(struct address_space *mapping, struct file *filp,
 	     unsigned long index, unsigned long nr)
@@ -1786,7 +1769,6 @@ retry:
 	page = __read_cache_page(mapping, index, filler, data);
 	if (IS_ERR(page))
 		return page;
-	mark_page_accessed(page);
 	if (PageUptodate(page))
 		goto out;
 
@@ -1985,7 +1967,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 	if (unlikely(*pos + *count > MAX_NON_LFS &&
 				!(file->f_flags & O_LARGEFILE))) {
 		if (*pos >= MAX_NON_LFS) {
-			send_sig(SIGXFSZ, current, 0);
 			return -EFBIG;
 		}
 		if (*count > MAX_NON_LFS - (unsigned long)*pos) {
@@ -2003,7 +1984,6 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 	if (likely(!isblk)) {
 		if (unlikely(*pos >= inode->i_sb->s_maxbytes)) {
 			if (*count || *pos > inode->i_sb->s_maxbytes) {
-				send_sig(SIGXFSZ, current, 0);
 				return -EFBIG;
 			}
 			/* zero-length writes at ->s_maxbytes are OK */
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index fa360e566d88..65ffc321f0c0 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -159,28 +159,6 @@ xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
 }
 EXPORT_SYMBOL_GPL(xip_file_read);
 
-ssize_t
-xip_file_sendfile(struct file *in_file, loff_t *ppos,
-	     size_t count, read_actor_t actor, void *target)
-{
-	read_descriptor_t desc;
-
-	if (!count)
-		return 0;
-
-	desc.written = 0;
-	desc.count = count;
-	desc.arg.data = target;
-	desc.error = 0;
-
-	do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
-			    ppos, &desc, actor);
-	if (desc.written)
-		return desc.written;
-	return desc.error;
-}
-EXPORT_SYMBOL_GPL(xip_file_sendfile);
-
 /*
  * __xip_unmap is invoked from xip_unmap and
  * xip_write
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eb7180db3033..acc0fb3cf067 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,7 +66,7 @@ static void enqueue_huge_page(struct page *page)
 static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 				unsigned long address)
 {
-	int nid = numa_node_id();
+	int nid;
 	struct page *page = NULL;
 	struct zonelist *zonelist = huge_zonelist(vma, address);
 	struct zone **z;
@@ -101,13 +101,20 @@ static void free_huge_page(struct page *page)
 
 static int alloc_fresh_huge_page(void)
 {
-	static int nid = 0;
+	static int prev_nid;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
-					HUGETLB_PAGE_ORDER);
-	nid = next_node(nid, node_online_map);
+	static DEFINE_SPINLOCK(nid_lock);
+	int nid;
+
+	spin_lock(&nid_lock);
+	nid = next_node(prev_nid, node_online_map);
 	if (nid == MAX_NUMNODES)
 		nid = first_node(node_online_map);
+	prev_nid = nid;
+	spin_unlock(&nid_lock);
+
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
+					HUGETLB_PAGE_ORDER);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
@@ -326,9 +333,10 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 	pte_t entry;
 
 	entry = pte_mkwrite(pte_mkdirty(*ptep));
-	ptep_set_access_flags(vma, address, ptep, entry, 1);
-	update_mmu_cache(vma, address, entry);
-	lazy_mmu_prot_update(entry);
+	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+		update_mmu_cache(vma, address, entry);
+		lazy_mmu_prot_update(entry);
+	}
 }
 
 
diff --git a/mm/madvise.c b/mm/madvise.c
index 60542d006ec1..93ee375b38e7 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -287,9 +287,11 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 	struct vm_area_struct * vma, *prev;
 	int unmapped_error = 0;
 	int error = -EINVAL;
+	int write;
 	size_t len;
 
-	if (madvise_need_mmap_write(behavior))
+	write = madvise_need_mmap_write(behavior);
+	if (write)
 		down_write(&current->mm->mmap_sem);
 	else
 		down_read(&current->mm->mmap_sem);
@@ -354,7 +356,7 @@ asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
 		vma = find_vma(current->mm, start);
 	}
 out:
-	if (madvise_need_mmap_write(behavior))
+	if (write)
 		up_write(&current->mm->mmap_sem);
 	else
 		up_read(&current->mm->mmap_sem);
diff --git a/mm/memory.c b/mm/memory.c
index cb94488ab96d..b3d73bb1f680 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
  * and ZONE_HIGHMEM.
  */
 void * high_memory;
-unsigned long vmalloc_earlyreserve;
 
 EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 int randomize_va_space __read_mostly = 1;
 
@@ -1055,6 +1053,14 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		do {
 			struct page *page;
 
+			/*
+			 * If tsk is ooming, cut off its access to large memory
+			 * allocations. It has a pending SIGKILL, but it can't
+			 * be processed until returning to user space.
+			 */
+			if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+				return -ENOMEM;
+
 			if (write)
 				foll_flags |= FOLL_WRITE;
 
@@ -1691,9 +1697,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = pte_mkyoung(orig_pte);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-		ptep_set_access_flags(vma, address, page_table, entry, 1);
-		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
+		if (ptep_set_access_flags(vma, address, page_table, entry,1)) {
+			update_mmu_cache(vma, address, entry);
+			lazy_mmu_prot_update(entry);
+		}
 		ret |= VM_FAULT_WRITE;
 		goto unlock;
 	}
@@ -2525,10 +2532,9 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		pte_t *pte, pmd_t *pmd, int write_access)
 {
 	pte_t entry;
-	pte_t old_entry;
 	spinlock_t *ptl;
 
-	old_entry = entry = *pte;
+	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops) {
@@ -2561,8 +2567,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
-	if (!pte_same(old_entry, entry)) {
-		ptep_set_access_flags(vma, address, pte, entry, write_access);
+	if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
 		update_mmu_cache(vma, address, entry);
 		lazy_mmu_prot_update(entry);
 	} else {
@@ -2674,7 +2679,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	write = (vma->vm_flags & VM_WRITE) != 0;
 	BUG_ON(addr >= end);
 	BUG_ON(end > vma->vm_end);
-	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+	len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
 	ret = get_user_pages(current, current->mm, addr,
 			len, write, 0, NULL, NULL);
 	if (ret < 0)
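
Aside on the make_pages_present() hunk above: it swaps an open-coded round-up for the kernel's DIV_ROUND_UP() macro. The stand-alone C sketch below (userspace, with the macro defined the same way as in the kernel headers; the sample values are arbitrary) only illustrates that the two forms compute the same page count:

#include <assert.h>
#include <stdio.h>

/* Same definition as the kernel's DIV_ROUND_UP() helper. */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long end = 2 * page_size + 1;	/* one byte into the third page */

	/* Open-coded form being replaced ... */
	unsigned long old_len = (end + page_size - 1) / page_size;
	/* ... and the DIV_ROUND_UP() form that replaces it. */
	unsigned long new_len = DIV_ROUND_UP(end, page_size);

	assert(old_len == new_len);
	printf("%lu pages\n", new_len);	/* prints "3 pages" */
	return 0;
}
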
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d76e8eb342d0..188f8d9c4aed 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -101,8 +101,6 @@
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
 
-#define PDprintk(fmt...)
-
 /* Highest zone. An specific allocation for a zone below that is not
    policied. */
 enum zone_type policy_zone = 0;
@@ -175,7 +173,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+	pr_debug("setting mode %d nodes[0] %lx\n",
+		 mode, nodes ? nodes_addr(*nodes)[0] : -1);
+
 	if (mode == MPOL_DEFAULT)
 		return NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -379,7 +379,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 	int err = 0;
 	struct mempolicy *old = vma->vm_policy;
 
-	PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+	pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 		 vma->vm_start, vma->vm_end, vma->vm_pgoff,
 		 vma->vm_ops, vma->vm_file,
 		 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
@@ -776,8 +776,8 @@ long do_mbind(unsigned long start, unsigned long len,
 	if (!new)
 		flags |= MPOL_MF_DISCONTIG_OK;
 
-	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-			mode,nodes_addr(nodes)[0]);
+	pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+		 mode, nmask ? nodes_addr(*nmask)[0] : -1);
 
 	down_write(&mm->mmap_sem);
 	vma = check_range(mm, start, end, nmask,
@@ -1434,7 +1434,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
 	}
 	rb_link_node(&new->nd, parent, p);
 	rb_insert_color(&new->nd, &sp->root);
-	PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+	pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
 		 new->policy ? new->policy->policy : 0);
 }
 
@@ -1459,7 +1459,7 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 
 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
-	PDprintk("deleting %lx-l%x\n", n->start, n->end);
+	pr_debug("deleting %lx-l%lx\n", n->start, n->end);
 	rb_erase(&n->nd, &sp->root);
 	mpol_free(n->policy);
 	kmem_cache_free(sn_cache, n);
@@ -1558,10 +1558,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	struct sp_node *new = NULL;
 	unsigned long sz = vma_pages(vma);
 
-	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+	pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
 		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1597,6 +1597,10 @@ void mpol_free_shared_policy(struct shared_policy *p)
 /* assumes fs == KERNEL_DS */
 void __init numa_policy_init(void)
 {
+	nodemask_t interleave_nodes;
+	unsigned long largest = 0;
+	int nid, prefer = 0;
+
 	policy_cache = kmem_cache_create("numa_policy",
 					 sizeof(struct mempolicy),
 					 0, SLAB_PANIC, NULL, NULL);
@@ -1605,10 +1609,31 @@ void __init numa_policy_init(void)
 			     sizeof(struct sp_node),
 			     0, SLAB_PANIC, NULL, NULL);
 
-	/* Set interleaving policy for system init. This way not all
-	   the data structures allocated at system boot end up in node zero. */
+	/*
+	 * Set interleaving policy for system init. Interleaving is only
+	 * enabled across suitably sized nodes (default is >= 16MB), or
+	 * fall back to the largest node if they're all smaller.
+	 */
+	nodes_clear(interleave_nodes);
+	for_each_online_node(nid) {
+		unsigned long total_pages = node_present_pages(nid);
+
+		/* Preserve the largest node */
+		if (largest < total_pages) {
+			largest = total_pages;
+			prefer = nid;
+		}
+
+		/* Interleave this node? */
+		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+			node_set(nid, interleave_nodes);
+	}
+
+	/* All too small, use the largest */
+	if (unlikely(nodes_empty(interleave_nodes)))
+		node_set(prefer, interleave_nodes);
 
-	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+	if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
diff --git a/mm/mempool.c b/mm/mempool.c
index cc1ca86dfc24..3e8f1fed0e1f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -263,6 +263,9 @@ void mempool_free(void *element, mempool_t *pool)
 {
 	unsigned long flags;
 
+	if (unlikely(element == NULL))
+		return;
+
 	smp_mb();
 	if (pool->curr_nr < pool->min_nr) {
 		spin_lock_irqsave(&pool->lock, flags);
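
Note on the mempool_free() hunk above: with the new NULL check, freeing a never-allocated element becomes a no-op, mirroring kfree(NULL). A hypothetical kernel-style caller (the function and parameter names below are illustrative, not part of this patch) can therefore drop its own guard:

/* Hypothetical error-unwind path; names are illustrative only. */
static void example_teardown(void *element, mempool_t *pool)
{
	/*
	 * No "if (element)" check is needed any more: mempool_free()
	 * returns early when element == NULL, just like kfree(NULL).
	 */
	mempool_free(element, pool);
}
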
diff --git a/mm/mlock.c b/mm/mlock.c
index 4d3fea267e0d..7b2656055d6a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -244,9 +244,12 @@ int user_shm_lock(size_t size, struct user_struct *user)
 
 	locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	if (lock_limit == RLIM_INFINITY)
+		allowed = 1;
 	lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
-	if (locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+	if (!allowed &&
+	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
 		goto out;
 	get_uid(user);
 	user->locked_shm += locked;
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -894,14 +894,11 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 			unsigned long flags, unsigned long pgoff)
 {
 	struct mm_struct * mm = current->mm;
-	struct vm_area_struct * vma, * prev;
 	struct inode *inode;
 	unsigned int vm_flags;
-	int correct_wcount = 0;
 	int error;
-	struct rb_node ** rb_link, * rb_parent;
 	int accountable = 1;
-	unsigned long charged = 0, reqprot = prot;
+	unsigned long reqprot = prot;
 
 	/*
 	 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1023,10 +1020,28 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
 		}
 	}
 
-	error = security_file_mmap(file, reqprot, prot, flags);
+	error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (error)
 		return error;
 
+	return mmap_region(file, addr, len, flags, vm_flags, pgoff,
+			   accountable);
+}
+EXPORT_SYMBOL(do_mmap_pgoff);
+
+unsigned long mmap_region(struct file *file, unsigned long addr,
+			  unsigned long len, unsigned long flags,
+			  unsigned int vm_flags, unsigned long pgoff,
+			  int accountable)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma, *prev;
+	int correct_wcount = 0;
+	int error;
+	struct rb_node **rb_link, *rb_parent;
+	unsigned long charged = 0;
+	struct inode *inode = file ? file->f_path.dentry->d_inode : NULL;
+
 	/* Clear old maps */
 	error = -ENOMEM;
 munmap_back:
@@ -1175,8 +1190,6 @@ unacct_error:
 	return error;
 }
 
-EXPORT_SYMBOL(do_mmap_pgoff);
-
 /* Get an address range which is currently unmapped.
  * For shmat() with addr=0.
  *
@@ -1536,9 +1549,14 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode. We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
+	 * Also guard against wrapping around to address 0.
 	 */
-	address += 4 + PAGE_SIZE - 1;
-	address &= PAGE_MASK;
+	if (address < PAGE_ALIGN(address+4))
+		address = PAGE_ALIGN(address+4);
+	else {
+		anon_vma_unlock(vma);
+		return -ENOMEM;
+	}
 	error = 0;
 
 	/* Somebody else might have raced and expanded it already */
diff --git a/mm/mremap.c b/mm/mremap.c
index 5d4bd4f95b8e..bc7c52efc71b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -291,6 +291,10 @@ unsigned long do_mremap(unsigned long addr,
 		if ((addr <= new_addr) && (addr+old_len) > new_addr)
 			goto out;
 
+		ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+		if (ret)
+			goto out;
+
 		ret = do_munmap(mm, new_addr, new_len);
 		if (ret)
 			goto out;
@@ -390,8 +394,13 @@ unsigned long do_mremap(unsigned long addr,
 
 		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
 					vma->vm_pgoff, map_flags);
-		ret = new_addr;
-		if (new_addr & ~PAGE_MASK)
+		if (new_addr & ~PAGE_MASK) {
+			ret = new_addr;
+			goto out;
+		}
+
+		ret = security_file_mmap(0, 0, 0, 0, new_addr, 1);
+		if (ret)
 			goto out;
 	}
 	ret = move_vma(vma, addr, old_len, new_len, new_addr);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2b16b00a5b11..8bbbf147a794 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -367,6 +367,11 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
 	return find_vma(mm, addr);
 }
 
+int expand_stack(struct vm_area_struct *vma, unsigned long address)
+{
+	return -ENOMEM;
+}
+
 /*
  * look up the first VMA exactly that exactly matches addr
  * - should be called with mm->mmap_sem at least held readlocked
@@ -639,7 +644,7 @@ static int validate_mmap_request(struct file *file,
 	}
 
 	/* allow the security API to have its say */
-	ret = security_file_mmap(file, reqprot, prot, flags);
+	ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
 	if (ret < 0)
 		return ret;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index eec1481ba44f..ea9da3bed3e9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -476,15 +476,13 @@ static void wb_kupdate(unsigned long arg)
  * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
  */
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
 	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_interval) {
-		mod_timer(&wb_timer,
-			jiffies + dirty_writeback_interval);
-	} else {
+	if (dirty_writeback_interval)
+		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	else
 		del_timer(&wb_timer);
-	}
 	return 0;
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bd8e33582d25..f9e4e647d7e8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,13 @@ static unsigned long __meminitdata dma_reserve;
 #endif
 #endif
 
-struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
-int __meminitdata nr_nodemap_entries;
-unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
-unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];
+static int __meminitdata nr_nodemap_entries;
+static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
+static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
-unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
+static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
 #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
 
@@ -900,11 +900,13 @@ static struct fail_page_alloc_attr {
 
 	u32 ignore_gfp_highmem;
 	u32 ignore_gfp_wait;
+	u32 min_order;
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 
 	struct dentry *ignore_gfp_highmem_file;
 	struct dentry *ignore_gfp_wait_file;
+	struct dentry *min_order_file;
 
 #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
 
@@ -912,6 +914,7 @@ static struct fail_page_alloc_attr {
 	.attr = FAULT_ATTR_INITIALIZER,
 	.ignore_gfp_wait = 1,
 	.ignore_gfp_highmem = 1,
+	.min_order = 1,
 };
 
 static int __init setup_fail_page_alloc(char *str)
@@ -922,6 +925,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc);
 
 static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
+	if (order < fail_page_alloc.min_order)
+		return 0;
 	if (gfp_mask & __GFP_NOFAIL)
 		return 0;
 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
@@ -953,12 +958,17 @@ static int __init fail_page_alloc_debugfs(void)
 	fail_page_alloc.ignore_gfp_highmem_file =
 		debugfs_create_bool("ignore-gfp-highmem", mode, dir,
 				      &fail_page_alloc.ignore_gfp_highmem);
+	fail_page_alloc.min_order_file =
+		debugfs_create_u32("min-order", mode, dir,
+				   &fail_page_alloc.min_order);
 
 	if (!fail_page_alloc.ignore_gfp_wait_file ||
-	    !fail_page_alloc.ignore_gfp_highmem_file) {
+	    !fail_page_alloc.ignore_gfp_highmem_file ||
+	    !fail_page_alloc.min_order_file) {
 		err = -ENOMEM;
 		debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
 		debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+		debugfs_remove(fail_page_alloc.min_order_file);
 		cleanup_fault_attr_dentries(&fail_page_alloc.attr);
 	}
 
@@ -1621,8 +1631,8 @@ void show_free_areas(void)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int __meminit build_zonelists_node(pg_data_t *pgdat,
-		struct zonelist *zonelist, int nr_zones, enum zone_type zone_type)
+static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
+				int nr_zones, enum zone_type zone_type)
 {
 	struct zone *zone;
 
@@ -1641,9 +1651,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
 	return nr_zones;
 }
 
+
+/*
+ * zonelist_order:
+ * 0 = automatic detection of better ordering.
+ * 1 = order by ([node] distance, -zonetype)
+ * 2 = order by (-zonetype, [node] distance)
+ *
+ * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
+ * the same zonelist. So only NUMA can configure this param.
+ */
+#define ZONELIST_ORDER_DEFAULT	0
+#define ZONELIST_ORDER_NODE	1
+#define ZONELIST_ORDER_ZONE	2
+
+/* zonelist order in the kernel.
+ * set_zonelist_order() will set this to NODE or ZONE.
+ */
+static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
+static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
+
+
 #ifdef CONFIG_NUMA
+/* The value user specified ....changed by config */
+static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+/* string for sysctl */
+#define NUMA_ZONELIST_ORDER_LEN	16
+char numa_zonelist_order[16] = "default";
+
+/*
+ * interface for configure zonelist ordering.
+ * command line option "numa_zonelist_order"
+ *	= "[dD]efault	- default, automatic configuration.
+ *	= "[nN]ode	- order by node locality, then by zone within node
+ *	= "[zZ]one	- order by zone, then by locality within zone
+ */
+
+static int __parse_numa_zonelist_order(char *s)
+{
+	if (*s == 'd' || *s == 'D') {
+		user_zonelist_order = ZONELIST_ORDER_DEFAULT;
+	} else if (*s == 'n' || *s == 'N') {
+		user_zonelist_order = ZONELIST_ORDER_NODE;
+	} else if (*s == 'z' || *s == 'Z') {
+		user_zonelist_order = ZONELIST_ORDER_ZONE;
+	} else {
+		printk(KERN_WARNING
+			"Ignoring invalid numa_zonelist_order value: "
+			"%s\n", s);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static __init int setup_numa_zonelist_order(char *s)
+{
+	if (s)
+		return __parse_numa_zonelist_order(s);
+	return 0;
+}
+early_param("numa_zonelist_order", setup_numa_zonelist_order);
+
+/*
+ * sysctl handler for numa_zonelist_order
+ */
+int numa_zonelist_order_handler(ctl_table *table, int write,
+		struct file *file, void __user *buffer, size_t *length,
+		loff_t *ppos)
+{
+	char saved_string[NUMA_ZONELIST_ORDER_LEN];
+	int ret;
+
+	if (write)
+		strncpy(saved_string, (char*)table->data,
+			NUMA_ZONELIST_ORDER_LEN);
+	ret = proc_dostring(table, write, file, buffer, length, ppos);
+	if (ret)
+		return ret;
+	if (write) {
+		int oldval = user_zonelist_order;
+		if (__parse_numa_zonelist_order((char*)table->data)) {
+			/*
+			 * bogus value.  restore saved string
+			 */
+			strncpy((char*)table->data, saved_string,
+				NUMA_ZONELIST_ORDER_LEN);
+			user_zonelist_order = oldval;
+		} else if (oldval != user_zonelist_order)
+			build_all_zonelists();
+	}
+	return 0;
+}
+
+
 #define MAX_NODE_LOAD (num_online_nodes())
-static int __meminitdata node_load[MAX_NUMNODES];
+static int node_load[MAX_NUMNODES];
+
 /**
  * find_next_best_node - find the next node that should appear in a given node's fallback list
  * @node: node whose fallback list we're appending
@@ -1658,7 +1761,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
  * on them otherwise.
  * It returns -1 if no node is found.
  */
-static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
+static int find_next_best_node(int node, nodemask_t *used_node_mask)
 {
 	int n, val;
 	int min_val = INT_MAX;
@@ -1704,13 +1807,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
 	return best_node;
 }
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+
+/*
+ * Build zonelists ordered by node and zones within node.
+ * This results in maximum locality--normal zone overflows into local
+ * DMA zone, if any--but risks exhausting DMA zone.
+ */
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 {
-	int j, node, local_node;
 	enum zone_type i;
-	int prev_node, load;
+	int j;
 	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		for (j = 0; zonelist->zones[j] != NULL; j++)
+			;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
+		zonelist->zones[j] = NULL;
+	}
+}
+
+/*
+ * Build zonelists ordered by zone and nodes within zones.
+ * This results in conserving DMA zone[s] until all Normal memory is
+ * exhausted, but results in overflowing to remote node while memory
+ * may still exist in local DMA zone.
+ */
+static int node_order[MAX_NUMNODES];
+
+static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
+{
+	enum zone_type i;
+	int pos, j, node;
+	int zone_type;		/* needs to be signed */
+	struct zone *z;
+	struct zonelist *zonelist;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		zonelist = pgdat->node_zonelists + i;
+		pos = 0;
+		for (zone_type = i; zone_type >= 0; zone_type--) {
+			for (j = 0; j < nr_nodes; j++) {
+				node = node_order[j];
+				z = &NODE_DATA(node)->node_zones[zone_type];
+				if (populated_zone(z)) {
+					zonelist->zones[pos++] = z;
+					check_highest_zone(zone_type);
+				}
+			}
+		}
+		zonelist->zones[pos] = NULL;
+	}
+}
+
+static int default_zonelist_order(void)
+{
+	int nid, zone_type;
+	unsigned long low_kmem_size,total_size;
+	struct zone *z;
+	int average_size;
+	/*
+	 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+	 * If they are really small and used heavily, the system can fall
+	 * into OOM very easily.
+	 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
+	 */
+	/* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
+	low_kmem_size = 0;
+	total_size = 0;
+	for_each_online_node(nid) {
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+	}
+	if (!low_kmem_size ||  /* there are no DMA area. */
+	    low_kmem_size > total_size/2)  /* DMA/DMA32 is big. */
+		return ZONELIST_ORDER_NODE;
+	/*
+	 * look into each node's config.
+	 * If there is a node whose DMA/DMA32 memory is very big area on
+	 * local memory, NODE_ORDER may be suitable.
+	 */
+	average_size = total_size / (num_online_nodes() + 1);
+	for_each_online_node(nid) {
+		low_kmem_size = 0;
+		total_size = 0;
+		for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+			z = &NODE_DATA(nid)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				if (zone_type < ZONE_NORMAL)
+					low_kmem_size += z->present_pages;
+				total_size += z->present_pages;
+			}
+		}
+		if (low_kmem_size &&
+		    total_size > average_size && /* ignore small node */
+		    low_kmem_size > total_size * 70/100)
+			return ZONELIST_ORDER_NODE;
+	}
+	return ZONELIST_ORDER_ZONE;
+}
+
+static void set_zonelist_order(void)
+{
+	if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
+		current_zonelist_order = default_zonelist_order();
+	else
+		current_zonelist_order = user_zonelist_order;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
+{
+	int j, node, load;
+	enum zone_type i;
 	nodemask_t used_mask;
+	int local_node, prev_node;
+	struct zonelist *zonelist;
+	int order = current_zonelist_order;
 
 	/* initialize zonelists */
 	for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1723,6 +1942,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 	load = num_online_nodes();
 	prev_node = local_node;
 	nodes_clear(used_mask);
+
+	memset(node_load, 0, sizeof(node_load));
+	memset(node_order, 0, sizeof(node_order));
+	j = 0;
+
 	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
 		int distance = node_distance(local_node, node);
 
@@ -1738,23 +1962,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 		 * So adding penalty to the first node in same
 		 * distance group to make it round-robin.
 		 */
-
 		if (distance != node_distance(local_node, prev_node))
-			node_load[node] += load;
+			node_load[node] = load;
+
 		prev_node = node;
 		load--;
-		for (i = 0; i < MAX_NR_ZONES; i++) {
-			zonelist = pgdat->node_zonelists + i;
-			for (j = 0; zonelist->zones[j] != NULL; j++);
+		if (order == ZONELIST_ORDER_NODE)
+			build_zonelists_in_node_order(pgdat, node);
+		else
+			node_order[j++] = node;	/* remember order */
+	}
 
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-			zonelist->zones[j] = NULL;
-		}
+	if (order == ZONELIST_ORDER_ZONE) {
+		/* calculate node order -- i.e., DMA last! */
+		build_zonelists_in_zone_order(pgdat, j);
 	}
 }
 
 /* Construct the zonelist performance cache - see further mmzone.h */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1771,9 +1997,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 	}
 }
 
+
 #else /* CONFIG_NUMA */
 
-static void __meminit build_zonelists(pg_data_t *pgdat)
+static void set_zonelist_order(void)
+{
+	current_zonelist_order = ZONELIST_ORDER_ZONE;
+}
+
+static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
 	enum zone_type i,j;
@@ -1809,7 +2041,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+static void build_zonelist_cache(pg_data_t *pgdat)
 {
 	int i;
 
@@ -1820,7 +2052,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
 #endif /* CONFIG_NUMA */
 
 /* return values int ....just for stop_machine_run() */
-static int __meminit __build_all_zonelists(void *dummy)
+static int __build_all_zonelists(void *dummy)
 {
 	int nid;
 
@@ -1831,8 +2063,10 @@ static int __meminit __build_all_zonelists(void *dummy)
 	return 0;
 }
 
-void __meminit build_all_zonelists(void)
+void build_all_zonelists(void)
 {
+	set_zonelist_order();
+
 	if (system_state == SYSTEM_BOOTING) {
 		__build_all_zonelists(NULL);
 		cpuset_init_current_mems_allowed();
@@ -1843,8 +2077,13 @@ void __meminit build_all_zonelists(void)
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
-	printk("Built %i zonelists. Total pages: %ld\n",
-			num_online_nodes(), vm_total_pages);
+	printk("Built %i zonelists in %s order. Total pages: %ld\n",
+			num_online_nodes(),
+			zonelist_order_name[current_zonelist_order],
+			vm_total_pages);
+#ifdef CONFIG_NUMA
+	printk("Policy zone: %s\n", zone_names[policy_zone]);
+#endif
 }
 
 /*
@@ -1953,8 +2192,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
1953 | } | 2192 | } |
1954 | } | 2193 | } |
1955 | 2194 | ||
1956 | void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | 2195 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, |
1957 | unsigned long size) | 2196 | struct zone *zone, unsigned long size) |
1958 | { | 2197 | { |
1959 | int order; | 2198 | int order; |
1960 | for (order = 0; order < MAX_ORDER ; order++) { | 2199 | for (order = 0; order < MAX_ORDER ; order++) { |
@@ -1968,7 +2207,7 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, | |||
1968 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 2207 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
1969 | #endif | 2208 | #endif |
1970 | 2209 | ||
1971 | static int __cpuinit zone_batchsize(struct zone *zone) | 2210 | static int __devinit zone_batchsize(struct zone *zone) |
1972 | { | 2211 | { |
1973 | int batch; | 2212 | int batch; |
1974 | 2213 | ||
@@ -2370,7 +2609,7 @@ void __init push_node_boundaries(unsigned int nid, | |||
2370 | } | 2609 | } |
2371 | 2610 | ||
2372 | /* If necessary, push the node boundary out for reserve hotadd */ | 2611 | /* If necessary, push the node boundary out for reserve hotadd */ |
2373 | static void __init account_node_boundary(unsigned int nid, | 2612 | static void __meminit account_node_boundary(unsigned int nid, |
2374 | unsigned long *start_pfn, unsigned long *end_pfn) | 2613 | unsigned long *start_pfn, unsigned long *end_pfn) |
2375 | { | 2614 | { |
2376 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | 2615 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", |
@@ -2390,7 +2629,7 @@ static void __init account_node_boundary(unsigned int nid, | |||
2390 | void __init push_node_boundaries(unsigned int nid, | 2629 | void __init push_node_boundaries(unsigned int nid, |
2391 | unsigned long start_pfn, unsigned long end_pfn) {} | 2630 | unsigned long start_pfn, unsigned long end_pfn) {} |
2392 | 2631 | ||
2393 | static void __init account_node_boundary(unsigned int nid, | 2632 | static void __meminit account_node_boundary(unsigned int nid, |
2394 | unsigned long *start_pfn, unsigned long *end_pfn) {} | 2633 | unsigned long *start_pfn, unsigned long *end_pfn) {} |
2395 | #endif | 2634 | #endif |
2396 | 2635 | ||
@@ -2431,7 +2670,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
2431 | * Return the number of pages a zone spans in a node, including holes | 2670 | * Return the number of pages a zone spans in a node, including holes |
2432 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 2671 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
2433 | */ | 2672 | */ |
2434 | unsigned long __meminit zone_spanned_pages_in_node(int nid, | 2673 | static unsigned long __meminit zone_spanned_pages_in_node(int nid, |
2435 | unsigned long zone_type, | 2674 | unsigned long zone_type, |
2436 | unsigned long *ignored) | 2675 | unsigned long *ignored) |
2437 | { | 2676 | { |
@@ -2519,7 +2758,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, | |||
2519 | } | 2758 | } |
2520 | 2759 | ||
2521 | /* Return the number of page frames in holes in a zone on a node */ | 2760 | /* Return the number of page frames in holes in a zone on a node */ |
2522 | unsigned long __meminit zone_absent_pages_in_node(int nid, | 2761 | static unsigned long __meminit zone_absent_pages_in_node(int nid, |
2523 | unsigned long zone_type, | 2762 | unsigned long zone_type, |
2524 | unsigned long *ignored) | 2763 | unsigned long *ignored) |
2525 | { | 2764 | { |
@@ -2536,14 +2775,14 @@ unsigned long __meminit zone_absent_pages_in_node(int nid, | |||
2536 | } | 2775 | } |
2537 | 2776 | ||
2538 | #else | 2777 | #else |
2539 | static inline unsigned long zone_spanned_pages_in_node(int nid, | 2778 | static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, |
2540 | unsigned long zone_type, | 2779 | unsigned long zone_type, |
2541 | unsigned long *zones_size) | 2780 | unsigned long *zones_size) |
2542 | { | 2781 | { |
2543 | return zones_size[zone_type]; | 2782 | return zones_size[zone_type]; |
2544 | } | 2783 | } |
2545 | 2784 | ||
2546 | static inline unsigned long zone_absent_pages_in_node(int nid, | 2785 | static inline unsigned long __meminit zone_absent_pages_in_node(int nid, |
2547 | unsigned long zone_type, | 2786 | unsigned long zone_type, |
2548 | unsigned long *zholes_size) | 2787 | unsigned long *zholes_size) |
2549 | { | 2788 | { |
@@ -3355,13 +3594,28 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
3355 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) | 3594 | for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++) |
3356 | ; | 3595 | ; |
3357 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | 3596 | table = (void*) __get_free_pages(GFP_ATOMIC, order); |
3597 | /* | ||
3598 | * If bucketsize is not a power-of-two, we may free | ||
3599 | * some pages at the end of the hash table. | ||
3600 | */ | ||
3601 | if (table) { | ||
3602 | unsigned long alloc_end = (unsigned long)table + | ||
3603 | (PAGE_SIZE << order); | ||
3604 | unsigned long used = (unsigned long)table + | ||
3605 | PAGE_ALIGN(size); | ||
3606 | split_page(virt_to_page(table), order); | ||
3607 | while (used < alloc_end) { | ||
3608 | free_page(used); | ||
3609 | used += PAGE_SIZE; | ||
3610 | } | ||
3611 | } | ||
3358 | } | 3612 | } |
3359 | } while (!table && size > PAGE_SIZE && --log2qty); | 3613 | } while (!table && size > PAGE_SIZE && --log2qty); |
3360 | 3614 | ||
3361 | if (!table) | 3615 | if (!table) |
3362 | panic("Failed to allocate %s hash table\n", tablename); | 3616 | panic("Failed to allocate %s hash table\n", tablename); |
3363 | 3617 | ||
3364 | printk("%s hash table entries: %d (order: %d, %lu bytes)\n", | 3618 | printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", |
3365 | tablename, | 3619 | tablename, |
3366 | (1U << log2qty), | 3620 | (1U << log2qty), |
3367 | ilog2(size) - PAGE_SHIFT, | 3621 | ilog2(size) - PAGE_SHIFT, |
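The new block above splits the high-order allocation with split_page() and returns the tail pages that a non-power-of-two table size would otherwise waste. A rough user-space sketch of that sizing arithmetic, using stand-in PAGE_* macros and an arbitrary table size (not kernel code):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long size = 5 * PAGE_SIZE;	/* table size, arbitrary for the demo */
	int order;

	/* same order search as alloc_large_system_hash() */
	for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
		;

	unsigned long alloc_end = PAGE_SIZE << order;	/* bytes the order actually allocates */
	unsigned long used = PAGE_ALIGN(size);		/* bytes the table really needs */

	printf("order %d: %lu bytes allocated, %lu used, %lu trailing pages freed\n",
	       order, alloc_end, used, (alloc_end - used) / PAGE_SIZE);
	return 0;
}

For 5 pages of table the order search lands on order 3 (8 pages), so 3 whole pages can be handed back once the allocation is split into individual pages.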
@@ -53,24 +53,6 @@ | |||
53 | 53 | ||
54 | struct kmem_cache *anon_vma_cachep; | 54 | struct kmem_cache *anon_vma_cachep; |
55 | 55 | ||
56 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) | ||
57 | { | ||
58 | #ifdef CONFIG_DEBUG_VM | ||
59 | struct anon_vma *anon_vma = find_vma->anon_vma; | ||
60 | struct vm_area_struct *vma; | ||
61 | unsigned int mapcount = 0; | ||
62 | int found = 0; | ||
63 | |||
64 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | ||
65 | mapcount++; | ||
66 | BUG_ON(mapcount > 100000); | ||
67 | if (vma == find_vma) | ||
68 | found = 1; | ||
69 | } | ||
70 | BUG_ON(!found); | ||
71 | #endif | ||
72 | } | ||
73 | |||
74 | /* This must be called under the mmap_sem. */ | 56 | /* This must be called under the mmap_sem. */ |
75 | int anon_vma_prepare(struct vm_area_struct *vma) | 57 | int anon_vma_prepare(struct vm_area_struct *vma) |
76 | { | 58 | { |
@@ -121,10 +103,8 @@ void __anon_vma_link(struct vm_area_struct *vma) | |||
121 | { | 103 | { |
122 | struct anon_vma *anon_vma = vma->anon_vma; | 104 | struct anon_vma *anon_vma = vma->anon_vma; |
123 | 105 | ||
124 | if (anon_vma) { | 106 | if (anon_vma) |
125 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 107 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
126 | validate_anon_vma(vma); | ||
127 | } | ||
128 | } | 108 | } |
129 | 109 | ||
130 | void anon_vma_link(struct vm_area_struct *vma) | 110 | void anon_vma_link(struct vm_area_struct *vma) |
@@ -134,7 +114,6 @@ void anon_vma_link(struct vm_area_struct *vma) | |||
134 | if (anon_vma) { | 114 | if (anon_vma) { |
135 | spin_lock(&anon_vma->lock); | 115 | spin_lock(&anon_vma->lock); |
136 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 116 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
137 | validate_anon_vma(vma); | ||
138 | spin_unlock(&anon_vma->lock); | 117 | spin_unlock(&anon_vma->lock); |
139 | } | 118 | } |
140 | } | 119 | } |
@@ -148,7 +127,6 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
148 | return; | 127 | return; |
149 | 128 | ||
150 | spin_lock(&anon_vma->lock); | 129 | spin_lock(&anon_vma->lock); |
151 | validate_anon_vma(vma); | ||
152 | list_del(&vma->anon_vma_node); | 130 | list_del(&vma->anon_vma_node); |
153 | 131 | ||
154 | /* We must garbage collect the anon_vma if it's empty */ | 132 | /* We must garbage collect the anon_vma if it's empty */ |
diff --git a/mm/shmem.c b/mm/shmem.c index e537317bec4d..0493e4d0bcaa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -967,6 +967,8 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
967 | *nodelist++ = '\0'; | 967 | *nodelist++ = '\0'; |
968 | if (nodelist_parse(nodelist, *policy_nodes)) | 968 | if (nodelist_parse(nodelist, *policy_nodes)) |
969 | goto out; | 969 | goto out; |
970 | if (!nodes_subset(*policy_nodes, node_online_map)) | ||
971 | goto out; | ||
970 | } | 972 | } |
971 | if (!strcmp(value, "default")) { | 973 | if (!strcmp(value, "default")) { |
972 | *policy = MPOL_DEFAULT; | 974 | *policy = MPOL_DEFAULT; |
@@ -1098,9 +1100,9 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, | |||
1098 | * Normally, filepage is NULL on entry, and either found | 1100 | * Normally, filepage is NULL on entry, and either found |
1099 | * uptodate immediately, or allocated and zeroed, or read | 1101 | * uptodate immediately, or allocated and zeroed, or read |
1100 | * in under swappage, which is then assigned to filepage. | 1102 | * in under swappage, which is then assigned to filepage. |
1101 | * But shmem_prepare_write passes in a locked filepage, | 1103 | * But shmem_readpage and shmem_prepare_write pass in a locked |
1102 | * which may be found not uptodate by other callers too, | 1104 | * filepage, which may be found not uptodate by other callers |
1103 | * and may need to be copied from the swappage read in. | 1105 | * too, and may need to be copied from the swappage read in. |
1104 | */ | 1106 | */ |
1105 | repeat: | 1107 | repeat: |
1106 | if (!filepage) | 1108 | if (!filepage) |
@@ -1483,9 +1485,18 @@ static const struct inode_operations shmem_symlink_inode_operations; | |||
1483 | static const struct inode_operations shmem_symlink_inline_operations; | 1485 | static const struct inode_operations shmem_symlink_inline_operations; |
1484 | 1486 | ||
1485 | /* | 1487 | /* |
1486 | * Normally tmpfs makes no use of shmem_prepare_write, but it | 1488 | * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; |
1487 | * lets a tmpfs file be used read-write below the loop driver. | 1489 | * but providing them allows a tmpfs file to be used for splice, sendfile, and |
1490 | * below the loop driver, in the generic fashion that many filesystems support. | ||
1488 | */ | 1491 | */ |
1492 | static int shmem_readpage(struct file *file, struct page *page) | ||
1493 | { | ||
1494 | struct inode *inode = page->mapping->host; | ||
1495 | int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); | ||
1496 | unlock_page(page); | ||
1497 | return error; | ||
1498 | } | ||
1499 | |||
1489 | static int | 1500 | static int |
1490 | shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) | 1501 | shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) |
1491 | { | 1502 | { |
@@ -1709,25 +1720,6 @@ static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count | |||
1709 | return desc.error; | 1720 | return desc.error; |
1710 | } | 1721 | } |
1711 | 1722 | ||
1712 | static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, | ||
1713 | size_t count, read_actor_t actor, void *target) | ||
1714 | { | ||
1715 | read_descriptor_t desc; | ||
1716 | |||
1717 | if (!count) | ||
1718 | return 0; | ||
1719 | |||
1720 | desc.written = 0; | ||
1721 | desc.count = count; | ||
1722 | desc.arg.data = target; | ||
1723 | desc.error = 0; | ||
1724 | |||
1725 | do_shmem_file_read(in_file, ppos, &desc, actor); | ||
1726 | if (desc.written) | ||
1727 | return desc.written; | ||
1728 | return desc.error; | ||
1729 | } | ||
1730 | |||
1731 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1723 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1732 | { | 1724 | { |
1733 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); | 1725 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
@@ -2384,6 +2376,7 @@ static const struct address_space_operations shmem_aops = { | |||
2384 | .writepage = shmem_writepage, | 2376 | .writepage = shmem_writepage, |
2385 | .set_page_dirty = __set_page_dirty_no_writeback, | 2377 | .set_page_dirty = __set_page_dirty_no_writeback, |
2386 | #ifdef CONFIG_TMPFS | 2378 | #ifdef CONFIG_TMPFS |
2379 | .readpage = shmem_readpage, | ||
2387 | .prepare_write = shmem_prepare_write, | 2380 | .prepare_write = shmem_prepare_write, |
2388 | .commit_write = simple_commit_write, | 2381 | .commit_write = simple_commit_write, |
2389 | #endif | 2382 | #endif |
@@ -2397,7 +2390,8 @@ static const struct file_operations shmem_file_operations = { | |||
2397 | .read = shmem_file_read, | 2390 | .read = shmem_file_read, |
2398 | .write = shmem_file_write, | 2391 | .write = shmem_file_write, |
2399 | .fsync = simple_sync_file, | 2392 | .fsync = simple_sync_file, |
2400 | .sendfile = shmem_file_sendfile, | 2393 | .splice_read = generic_file_splice_read, |
2394 | .splice_write = generic_file_splice_write, | ||
2401 | #endif | 2395 | #endif |
2402 | }; | 2396 | }; |
2403 | 2397 | ||
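With .readpage wired up and .sendfile replaced by the generic splice hooks above, a tmpfs file can feed splice(2) directly. A hedged user-space illustration; the path and length are arbitrary and error handling is trimmed:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/example", O_RDONLY);	/* any tmpfs-backed file */
	int pipefd[2];
	ssize_t n;

	if (fd < 0 || pipe(pipefd) < 0) {
		perror("setup");
		return 1;
	}
	/* Move up to 64KiB from the tmpfs page cache into the pipe
	 * without copying through a user buffer. */
	n = splice(fd, NULL, pipefd[1], NULL, 65536, SPLICE_F_MOVE);
	printf("spliced %zd bytes\n", n);
	return n < 0;
}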
@@ -137,6 +137,7 @@ | |||
137 | 137 | ||
138 | /* Shouldn't this be in a header file somewhere? */ | 138 | /* Shouldn't this be in a header file somewhere? */ |
139 | #define BYTES_PER_WORD sizeof(void *) | 139 | #define BYTES_PER_WORD sizeof(void *) |
140 | #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) | ||
140 | 141 | ||
141 | #ifndef cache_line_size | 142 | #ifndef cache_line_size |
142 | #define cache_line_size() L1_CACHE_BYTES | 143 | #define cache_line_size() L1_CACHE_BYTES |
@@ -547,7 +548,7 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) | |||
547 | if (cachep->flags & SLAB_STORE_USER) | 548 | if (cachep->flags & SLAB_STORE_USER) |
548 | return (unsigned long long *)(objp + cachep->buffer_size - | 549 | return (unsigned long long *)(objp + cachep->buffer_size - |
549 | sizeof(unsigned long long) - | 550 | sizeof(unsigned long long) - |
550 | BYTES_PER_WORD); | 551 | REDZONE_ALIGN); |
551 | return (unsigned long long *) (objp + cachep->buffer_size - | 552 | return (unsigned long long *) (objp + cachep->buffer_size - |
552 | sizeof(unsigned long long)); | 553 | sizeof(unsigned long long)); |
553 | } | 554 | } |
@@ -774,7 +775,6 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, | |||
774 | */ | 775 | */ |
775 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); | 776 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); |
776 | #endif | 777 | #endif |
777 | WARN_ON_ONCE(size == 0); | ||
778 | while (size > csizep->cs_size) | 778 | while (size > csizep->cs_size) |
779 | csizep++; | 779 | csizep++; |
780 | 780 | ||
@@ -929,7 +929,7 @@ static void next_reap_node(void) | |||
929 | * the CPUs getting into lockstep and contending for the global cache chain | 929 | * the CPUs getting into lockstep and contending for the global cache chain |
930 | * lock. | 930 | * lock. |
931 | */ | 931 | */ |
932 | static void __devinit start_cpu_timer(int cpu) | 932 | static void __cpuinit start_cpu_timer(int cpu) |
933 | { | 933 | { |
934 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | 934 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); |
935 | 935 | ||
@@ -2179,7 +2179,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2179 | * above the next power of two: caches with object sizes just above a | 2179 | * above the next power of two: caches with object sizes just above a |
2180 | * power of two have a significant amount of internal fragmentation. | 2180 | * power of two have a significant amount of internal fragmentation. |
2181 | */ | 2181 | */ |
2182 | if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) | 2182 | if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + |
2183 | 2 * sizeof(unsigned long long))) | ||
2183 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | 2184 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; |
2184 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 2185 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
2185 | flags |= SLAB_POISON; | 2186 | flags |= SLAB_POISON; |
@@ -2220,12 +2221,20 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2220 | } | 2221 | } |
2221 | 2222 | ||
2222 | /* | 2223 | /* |
2223 | * Redzoning and user store require word alignment. Note this will be | 2224 | * Redzoning and user store require word alignment or possibly larger. |
2224 | * overridden by architecture or caller mandated alignment if either | 2225 | * Note this will be overridden by architecture or caller mandated |
2225 | * is greater than BYTES_PER_WORD. | 2226 | * alignment if either is greater than BYTES_PER_WORD. |
2226 | */ | 2227 | */ |
2227 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | 2228 | if (flags & SLAB_STORE_USER) |
2228 | ralign = __alignof__(unsigned long long); | 2229 | ralign = BYTES_PER_WORD; |
2230 | |||
2231 | if (flags & SLAB_RED_ZONE) { | ||
2232 | ralign = REDZONE_ALIGN; | ||
2233 | /* If redzoning, ensure that the second redzone is suitably | ||
2234 | * aligned, by adjusting the object size accordingly. */ | ||
2235 | size += REDZONE_ALIGN - 1; | ||
2236 | size &= ~(REDZONE_ALIGN - 1); | ||
2237 | } | ||
2229 | 2238 | ||
2230 | /* 2) arch mandated alignment */ | 2239 | /* 2) arch mandated alignment */ |
2231 | if (ralign < ARCH_SLAB_MINALIGN) { | 2240 | if (ralign < ARCH_SLAB_MINALIGN) { |
@@ -2262,9 +2271,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2262 | } | 2271 | } |
2263 | if (flags & SLAB_STORE_USER) { | 2272 | if (flags & SLAB_STORE_USER) { |
2264 | /* user store requires one word storage behind the end of | 2273 | /* user store requires one word storage behind the end of |
2265 | * the real object. | 2274 | * the real object. But if the second red zone needs to be |
2275 | * aligned to 64 bits, we must allow that much space. | ||
2266 | */ | 2276 | */ |
2267 | size += BYTES_PER_WORD; | 2277 | if (flags & SLAB_RED_ZONE) |
2278 | size += REDZONE_ALIGN; | ||
2279 | else | ||
2280 | size += BYTES_PER_WORD; | ||
2268 | } | 2281 | } |
2269 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2282 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
2270 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size | 2283 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
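The sizing logic above rounds the object up to REDZONE_ALIGN so that the second red zone stays 64-bit aligned, and reserves a REDZONE_ALIGN-sized slot for the user-store word when both debug options are on. A stand-alone user-space sketch of that arithmetic; the constants mirror, but are not, the kernel macros, and the layout is only an approximation of the real debug object:

#include <stdio.h>

#define BYTES_PER_WORD	sizeof(void *)
#define REDZONE_ALIGN	(BYTES_PER_WORD > __alignof__(unsigned long long) ? \
			 BYTES_PER_WORD : __alignof__(unsigned long long))

int main(void)
{
	size_t size = 20;			/* caller-requested object size, arbitrary */

	/* second red zone must be suitably aligned, so pad the object */
	size += REDZONE_ALIGN - 1;
	size &= ~(REDZONE_ALIGN - 1);

	size_t with_debug = sizeof(unsigned long long)	/* red zone before the object */
			  + size
			  + sizeof(unsigned long long)	/* red zone after the object */
			  + REDZONE_ALIGN;		/* user-store slot */

	printf("object padded to %zu bytes, %zu bytes per object with debugging\n",
	       size, with_debug);
	return 0;
}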
@@ -3539,7 +3552,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3539 | check_irq_off(); | 3552 | check_irq_off(); |
3540 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3553 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
3541 | 3554 | ||
3542 | if (use_alien_caches && cache_free_alien(cachep, objp)) | 3555 | if (cache_free_alien(cachep, objp)) |
3543 | return; | 3556 | return; |
3544 | 3557 | ||
3545 | if (likely(ac->avail < ac->limit)) { | 3558 | if (likely(ac->avail < ac->limit)) { |
@@ -4144,26 +4157,17 @@ static void print_slabinfo_header(struct seq_file *m) | |||
4144 | static void *s_start(struct seq_file *m, loff_t *pos) | 4157 | static void *s_start(struct seq_file *m, loff_t *pos) |
4145 | { | 4158 | { |
4146 | loff_t n = *pos; | 4159 | loff_t n = *pos; |
4147 | struct list_head *p; | ||
4148 | 4160 | ||
4149 | mutex_lock(&cache_chain_mutex); | 4161 | mutex_lock(&cache_chain_mutex); |
4150 | if (!n) | 4162 | if (!n) |
4151 | print_slabinfo_header(m); | 4163 | print_slabinfo_header(m); |
4152 | p = cache_chain.next; | 4164 | |
4153 | while (n--) { | 4165 | return seq_list_start(&cache_chain, *pos); |
4154 | p = p->next; | ||
4155 | if (p == &cache_chain) | ||
4156 | return NULL; | ||
4157 | } | ||
4158 | return list_entry(p, struct kmem_cache, next); | ||
4159 | } | 4166 | } |
4160 | 4167 | ||
4161 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | 4168 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) |
4162 | { | 4169 | { |
4163 | struct kmem_cache *cachep = p; | 4170 | return seq_list_next(p, &cache_chain, pos); |
4164 | ++*pos; | ||
4165 | return cachep->next.next == &cache_chain ? | ||
4166 | NULL : list_entry(cachep->next.next, struct kmem_cache, next); | ||
4167 | } | 4171 | } |
4168 | 4172 | ||
4169 | static void s_stop(struct seq_file *m, void *p) | 4173 | static void s_stop(struct seq_file *m, void *p) |
@@ -4173,7 +4177,7 @@ static void s_stop(struct seq_file *m, void *p) | |||
4173 | 4177 | ||
4174 | static int s_show(struct seq_file *m, void *p) | 4178 | static int s_show(struct seq_file *m, void *p) |
4175 | { | 4179 | { |
4176 | struct kmem_cache *cachep = p; | 4180 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); |
4177 | struct slab *slabp; | 4181 | struct slab *slabp; |
4178 | unsigned long active_objs; | 4182 | unsigned long active_objs; |
4179 | unsigned long num_objs; | 4183 | unsigned long num_objs; |
@@ -4342,17 +4346,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
4342 | 4346 | ||
4343 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4347 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
4344 | { | 4348 | { |
4345 | loff_t n = *pos; | ||
4346 | struct list_head *p; | ||
4347 | |||
4348 | mutex_lock(&cache_chain_mutex); | 4349 | mutex_lock(&cache_chain_mutex); |
4349 | p = cache_chain.next; | 4350 | return seq_list_start(&cache_chain, *pos); |
4350 | while (n--) { | ||
4351 | p = p->next; | ||
4352 | if (p == &cache_chain) | ||
4353 | return NULL; | ||
4354 | } | ||
4355 | return list_entry(p, struct kmem_cache, next); | ||
4356 | } | 4351 | } |
4357 | 4352 | ||
4358 | static inline int add_caller(unsigned long *n, unsigned long v) | 4353 | static inline int add_caller(unsigned long *n, unsigned long v) |
@@ -4417,7 +4412,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4417 | 4412 | ||
4418 | static int leaks_show(struct seq_file *m, void *p) | 4413 | static int leaks_show(struct seq_file *m, void *p) |
4419 | { | 4414 | { |
4420 | struct kmem_cache *cachep = p; | 4415 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); |
4421 | struct slab *slabp; | 4416 | struct slab *slabp; |
4422 | struct kmem_list3 *l3; | 4417 | struct kmem_list3 *l3; |
4423 | const char *name; | 4418 | const char *name; |
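Both s_start/s_next and leaks_start above now hand the cursor bookkeeping to the seq_file list helpers instead of walking the cache chain by hand. A minimal sketch of that pattern; struct my_item and my_list are hypothetical, while seq_list_start(), seq_list_next() and list_entry() are the real <linux/seq_file.h>/<linux/list.h> interfaces (seq_operations and module boilerplate omitted):

#include <linux/list.h>
#include <linux/seq_file.h>

struct my_item {
	struct list_head link;
	const char *name;
};

static LIST_HEAD(my_list);

static void *my_start(struct seq_file *m, loff_t *pos)
{
	return seq_list_start(&my_list, *pos);	/* walk to the *pos-th entry */
}

static void *my_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &my_list, pos);	/* advance and bump *pos */
}

static int my_show(struct seq_file *m, void *p)
{
	struct my_item *it = list_entry(p, struct my_item, link);

	seq_printf(m, "%s\n", it->name);
	return 0;
}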
@@ -3,57 +3,159 @@ | |||
3 | * | 3 | * |
4 | * Matt Mackall <mpm@selenic.com> 12/30/03 | 4 | * Matt Mackall <mpm@selenic.com> 12/30/03 |
5 | * | 5 | * |
6 | * NUMA support by Paul Mundt, 2007. | ||
7 | * | ||
6 | * How SLOB works: | 8 | * How SLOB works: |
7 | * | 9 | * |
8 | * The core of SLOB is a traditional K&R style heap allocator, with | 10 | * The core of SLOB is a traditional K&R style heap allocator, with |
9 | * support for returning aligned objects. The granularity of this | 11 | * support for returning aligned objects. The granularity of this |
10 | * allocator is 8 bytes on x86, though it's perhaps possible to reduce | 12 | * allocator is as little as 2 bytes, however typically most architectures |
11 | * this to 4 if it's deemed worth the effort. The slob heap is a | 13 | * will require 4 bytes on 32-bit and 8 bytes on 64-bit. |
12 | * singly-linked list of pages from __get_free_page, grown on demand | 14 | * |
13 | * and allocation from the heap is currently first-fit. | 15 | * The slob heap is a linked list of pages from alloc_pages(), and |
16 | * within each page, there is a singly-linked list of free blocks (slob_t). | ||
17 | * The heap is grown on demand and allocation from the heap is currently | ||
18 | * first-fit. | ||
14 | * | 19 | * |
15 | * Above this is an implementation of kmalloc/kfree. Blocks returned | 20 | * Above this is an implementation of kmalloc/kfree. Blocks returned |
16 | * from kmalloc are 8-byte aligned and prepended with a 8-byte header. | 21 | * from kmalloc are prepended with a 4-byte header with the kmalloc size. |
17 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls | 22 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls |
18 | * __get_free_pages directly so that it can return page-aligned blocks | 23 | * alloc_pages() directly, allocating compound pages so the page order |
19 | * and keeps a linked list of such pages and their orders. These | 24 | * does not have to be separately tracked, and also stores the exact |
20 | * objects are detected in kfree() by their page alignment. | 25 | * allocation size in page->private so that it can be used to accurately |
26 | * provide ksize(). These objects are detected in kfree() because slob_page() | ||
27 | * is false for them. | ||
21 | * | 28 | * |
22 | * SLAB is emulated on top of SLOB by simply calling constructors and | 29 | * SLAB is emulated on top of SLOB by simply calling constructors and |
23 | * destructors for every SLAB allocation. Objects are returned with | 30 | * destructors for every SLAB allocation. Objects are returned with the |
24 | * the 8-byte alignment unless the SLAB_HWCACHE_ALIGN flag is | 31 | * 4-byte alignment unless the SLAB_HWCACHE_ALIGN flag is set, in which |
25 | * set, in which case the low-level allocator will fragment blocks to | 32 | * case the low-level allocator will fragment blocks to create the proper |
26 | * create the proper alignment. Again, objects of page-size or greater | 33 | * alignment. Again, objects of page-size or greater are allocated by |
27 | * are allocated by calling __get_free_pages. As SLAB objects know | 34 | * calling alloc_pages(). As SLAB objects know their size, no separate |
28 | * their size, no separate size bookkeeping is necessary and there is | 35 | * size bookkeeping is necessary and there is essentially no allocation |
29 | * essentially no allocation space overhead. | 36 | * space overhead, and compound pages aren't needed for multi-page |
37 | * allocations. | ||
38 | * | ||
39 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | ||
40 | * logic down to the page allocator, and simply doing the node accounting | ||
41 | * on the upper levels. In the event that a node id is explicitly | ||
42 | * provided, alloc_pages_node() with the specified node id is used | ||
43 | * instead. The common case (or when the node id isn't explicitly provided) | ||
44 | * will default to the current node, as per numa_node_id(). | ||
45 | * | ||
46 | * Node aware pages are still inserted in to the global freelist, and | ||
47 | * these are scanned for by matching against the node id encoded in the | ||
48 | * page flags. As a result, block allocations that can be satisfied from | ||
49 | * the freelist will only be done so on pages residing on the same node, | ||
50 | * in order to prevent random node placement. | ||
30 | */ | 51 | */ |
31 | 52 | ||
53 | #include <linux/kernel.h> | ||
32 | #include <linux/slab.h> | 54 | #include <linux/slab.h> |
33 | #include <linux/mm.h> | 55 | #include <linux/mm.h> |
34 | #include <linux/cache.h> | 56 | #include <linux/cache.h> |
35 | #include <linux/init.h> | 57 | #include <linux/init.h> |
36 | #include <linux/module.h> | 58 | #include <linux/module.h> |
37 | #include <linux/timer.h> | ||
38 | #include <linux/rcupdate.h> | 59 | #include <linux/rcupdate.h> |
60 | #include <linux/list.h> | ||
61 | #include <asm/atomic.h> | ||
62 | |||
63 | /* | ||
64 | * slob_block has a field 'units', which indicates size of block if +ve, | ||
65 | * or offset of next block if -ve (in SLOB_UNITs). | ||
66 | * | ||
67 | * Free blocks of size 1 unit simply contain the offset of the next block. | ||
68 | * Those with larger size contain their size in the first SLOB_UNIT of | ||
69 | * memory, and the offset of the next free block in the second SLOB_UNIT. | ||
70 | */ | ||
71 | #if PAGE_SIZE <= (32767 * 2) | ||
72 | typedef s16 slobidx_t; | ||
73 | #else | ||
74 | typedef s32 slobidx_t; | ||
75 | #endif | ||
39 | 76 | ||
40 | struct slob_block { | 77 | struct slob_block { |
41 | int units; | 78 | slobidx_t units; |
42 | struct slob_block *next; | ||
43 | }; | 79 | }; |
44 | typedef struct slob_block slob_t; | 80 | typedef struct slob_block slob_t; |
45 | 81 | ||
82 | /* | ||
83 | * We use struct page fields to manage some slob allocation aspects, | ||
84 | * however to avoid the horrible mess in include/linux/mm_types.h, we'll | ||
85 | * just define our own struct page type variant here. | ||
86 | */ | ||
87 | struct slob_page { | ||
88 | union { | ||
89 | struct { | ||
90 | unsigned long flags; /* mandatory */ | ||
91 | atomic_t _count; /* mandatory */ | ||
92 | slobidx_t units; /* free units left in page */ | ||
93 | unsigned long pad[2]; | ||
94 | slob_t *free; /* first free slob_t in page */ | ||
95 | struct list_head list; /* linked list of free pages */ | ||
96 | }; | ||
97 | struct page page; | ||
98 | }; | ||
99 | }; | ||
100 | static inline void struct_slob_page_wrong_size(void) | ||
101 | { BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } | ||
102 | |||
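The struct slob_page overlay above reuses struct page storage through a union and pins the layout with a BUILD_BUG_ON in a never-called helper. A user-space sketch of the same trick; struct base stands in for struct page and the field names are illustrative only:

#include <stdio.h>

#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

struct base {			/* pretend this is struct page */
	unsigned long flags;
	int count;
	void *priv[3];
};

struct overlay {
	union {
		struct {
			unsigned long flags;	/* must stay first, like page->flags */
			int count;
			void *freelist;		/* reinterprets priv[0] */
			void *pad[2];
		};
		struct base base;
	};
};

int main(void)
{
	/* compile-time guarantee that the two views never drift apart */
	BUILD_BUG_ON(sizeof(struct overlay) != sizeof(struct base));
	printf("overlay and base are both %zu bytes\n", sizeof(struct overlay));
	return 0;
}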
103 | /* | ||
104 | * free_slob_page: call before a slob_page is returned to the page allocator. | ||
105 | */ | ||
106 | static inline void free_slob_page(struct slob_page *sp) | ||
107 | { | ||
108 | reset_page_mapcount(&sp->page); | ||
109 | sp->page.mapping = NULL; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * All (partially) free slob pages go on this list. | ||
114 | */ | ||
115 | static LIST_HEAD(free_slob_pages); | ||
116 | |||
117 | /* | ||
118 | * slob_page: True for all slob pages (false for bigblock pages) | ||
119 | */ | ||
120 | static inline int slob_page(struct slob_page *sp) | ||
121 | { | ||
122 | return test_bit(PG_active, &sp->flags); | ||
123 | } | ||
124 | |||
125 | static inline void set_slob_page(struct slob_page *sp) | ||
126 | { | ||
127 | __set_bit(PG_active, &sp->flags); | ||
128 | } | ||
129 | |||
130 | static inline void clear_slob_page(struct slob_page *sp) | ||
131 | { | ||
132 | __clear_bit(PG_active, &sp->flags); | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * slob_page_free: true for pages on free_slob_pages list. | ||
137 | */ | ||
138 | static inline int slob_page_free(struct slob_page *sp) | ||
139 | { | ||
140 | return test_bit(PG_private, &sp->flags); | ||
141 | } | ||
142 | |||
143 | static inline void set_slob_page_free(struct slob_page *sp) | ||
144 | { | ||
145 | list_add(&sp->list, &free_slob_pages); | ||
146 | __set_bit(PG_private, &sp->flags); | ||
147 | } | ||
148 | |||
149 | static inline void clear_slob_page_free(struct slob_page *sp) | ||
150 | { | ||
151 | list_del(&sp->list); | ||
152 | __clear_bit(PG_private, &sp->flags); | ||
153 | } | ||
154 | |||
46 | #define SLOB_UNIT sizeof(slob_t) | 155 | #define SLOB_UNIT sizeof(slob_t) |
47 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) | 156 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) |
48 | #define SLOB_ALIGN L1_CACHE_BYTES | 157 | #define SLOB_ALIGN L1_CACHE_BYTES |
49 | 158 | ||
50 | struct bigblock { | ||
51 | int order; | ||
52 | void *pages; | ||
53 | struct bigblock *next; | ||
54 | }; | ||
55 | typedef struct bigblock bigblock_t; | ||
56 | |||
57 | /* | 159 | /* |
58 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | 160 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which |
59 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free | 161 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free |
@@ -64,133 +166,285 @@ struct slob_rcu { | |||
64 | int size; | 166 | int size; |
65 | }; | 167 | }; |
66 | 168 | ||
67 | static slob_t arena = { .next = &arena, .units = 1 }; | 169 | /* |
68 | static slob_t *slobfree = &arena; | 170 | * slob_lock protects all slob allocator structures. |
69 | static bigblock_t *bigblocks; | 171 | */ |
70 | static DEFINE_SPINLOCK(slob_lock); | 172 | static DEFINE_SPINLOCK(slob_lock); |
71 | static DEFINE_SPINLOCK(block_lock); | ||
72 | 173 | ||
73 | static void slob_free(void *b, int size); | 174 | /* |
74 | static void slob_timer_cbk(void); | 175 | * Encode the given size and next info into a free slob block s. |
176 | */ | ||
177 | static void set_slob(slob_t *s, slobidx_t size, slob_t *next) | ||
178 | { | ||
179 | slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); | ||
180 | slobidx_t offset = next - base; | ||
75 | 181 | ||
182 | if (size > 1) { | ||
183 | s[0].units = size; | ||
184 | s[1].units = offset; | ||
185 | } else | ||
186 | s[0].units = -offset; | ||
187 | } | ||
76 | 188 | ||
77 | static void *slob_alloc(size_t size, gfp_t gfp, int align) | 189 | /* |
190 | * Return the size of a slob block. | ||
191 | */ | ||
192 | static slobidx_t slob_units(slob_t *s) | ||
193 | { | ||
194 | if (s->units > 0) | ||
195 | return s->units; | ||
196 | return 1; | ||
197 | } | ||
198 | |||
199 | /* | ||
200 | * Return the next free slob block pointer after this one. | ||
201 | */ | ||
202 | static slob_t *slob_next(slob_t *s) | ||
203 | { | ||
204 | slob_t *base = (slob_t *)((unsigned long)s & PAGE_MASK); | ||
205 | slobidx_t next; | ||
206 | |||
207 | if (s[0].units < 0) | ||
208 | next = -s[0].units; | ||
209 | else | ||
210 | next = s[1].units; | ||
211 | return base+next; | ||
212 | } | ||
213 | |||
214 | /* | ||
215 | * Returns true if s is the last free block in its page. | ||
216 | */ | ||
217 | static int slob_last(slob_t *s) | ||
218 | { | ||
219 | return !((unsigned long)slob_next(s) & ~PAGE_MASK); | ||
220 | } | ||
221 | |||
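The set_slob()/slob_next() pair above encodes a free block entirely inside the block itself: a multi-unit block stores its size in the first unit and the offset of the next free block in the second, while a single-unit block stores only the negated offset. A self-contained user-space sketch of that encoding, with PAGE_SIZE/PAGE_MASK as stand-ins for the kernel macros:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

typedef int16_t slobidx_t;
typedef struct { slobidx_t units; } slob_t;

static void set_slob(slob_t *s, slobidx_t size, slob_t *next)
{
	slob_t *base = (slob_t *)((uintptr_t)s & PAGE_MASK);
	slobidx_t offset = next - base;

	if (size > 1) {
		s[0].units = size;	/* size in the first unit */
		s[1].units = offset;	/* next-block offset in the second */
	} else
		s[0].units = -offset;	/* single unit: negated offset only */
}

static slob_t *slob_next(slob_t *s)
{
	slob_t *base = (slob_t *)((uintptr_t)s & PAGE_MASK);
	slobidx_t next = s[0].units < 0 ? -s[0].units : s[1].units;

	return base + next;
}

int main(void)
{
	/* pretend this page-aligned buffer is one slob page */
	static slob_t page[PAGE_SIZE / sizeof(slob_t)]
		__attribute__((aligned(PAGE_SIZE)));

	set_slob(&page[0], 10, &page[10]);	/* 10-unit free block */
	set_slob(&page[10], 1, &page[20]);	/* 1-unit free block */

	printf("next after block 0 is unit %td\n", slob_next(&page[0]) - page);
	printf("next after block 1 is unit %td\n", slob_next(&page[10]) - page);
	return 0;
}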
222 | static void *slob_new_page(gfp_t gfp, int order, int node) | ||
223 | { | ||
224 | void *page; | ||
225 | |||
226 | #ifdef CONFIG_NUMA | ||
227 | if (node != -1) | ||
228 | page = alloc_pages_node(node, gfp, order); | ||
229 | else | ||
230 | #endif | ||
231 | page = alloc_pages(gfp, order); | ||
232 | |||
233 | if (!page) | ||
234 | return NULL; | ||
235 | |||
236 | return page_address(page); | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Allocate a slob block within a given slob_page sp. | ||
241 | */ | ||
242 | static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | ||
78 | { | 243 | { |
79 | slob_t *prev, *cur, *aligned = 0; | 244 | slob_t *prev, *cur, *aligned = 0; |
80 | int delta = 0, units = SLOB_UNITS(size); | 245 | int delta = 0, units = SLOB_UNITS(size); |
81 | unsigned long flags; | ||
82 | 246 | ||
83 | spin_lock_irqsave(&slob_lock, flags); | 247 | for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { |
84 | prev = slobfree; | 248 | slobidx_t avail = slob_units(cur); |
85 | for (cur = prev->next; ; prev = cur, cur = cur->next) { | 249 | |
86 | if (align) { | 250 | if (align) { |
87 | aligned = (slob_t *)ALIGN((unsigned long)cur, align); | 251 | aligned = (slob_t *)ALIGN((unsigned long)cur, align); |
88 | delta = aligned - cur; | 252 | delta = aligned - cur; |
89 | } | 253 | } |
90 | if (cur->units >= units + delta) { /* room enough? */ | 254 | if (avail >= units + delta) { /* room enough? */ |
255 | slob_t *next; | ||
256 | |||
91 | if (delta) { /* need to fragment head to align? */ | 257 | if (delta) { /* need to fragment head to align? */ |
92 | aligned->units = cur->units - delta; | 258 | next = slob_next(cur); |
93 | aligned->next = cur->next; | 259 | set_slob(aligned, avail - delta, next); |
94 | cur->next = aligned; | 260 | set_slob(cur, delta, aligned); |
95 | cur->units = delta; | ||
96 | prev = cur; | 261 | prev = cur; |
97 | cur = aligned; | 262 | cur = aligned; |
263 | avail = slob_units(cur); | ||
98 | } | 264 | } |
99 | 265 | ||
100 | if (cur->units == units) /* exact fit? */ | 266 | next = slob_next(cur); |
101 | prev->next = cur->next; /* unlink */ | 267 | if (avail == units) { /* exact fit? unlink. */ |
102 | else { /* fragment */ | 268 | if (prev) |
103 | prev->next = cur + units; | 269 | set_slob(prev, slob_units(prev), next); |
104 | prev->next->units = cur->units - units; | 270 | else |
105 | prev->next->next = cur->next; | 271 | sp->free = next; |
106 | cur->units = units; | 272 | } else { /* fragment */ |
273 | if (prev) | ||
274 | set_slob(prev, slob_units(prev), cur + units); | ||
275 | else | ||
276 | sp->free = cur + units; | ||
277 | set_slob(cur + units, avail - units, next); | ||
107 | } | 278 | } |
108 | 279 | ||
109 | slobfree = prev; | 280 | sp->units -= units; |
110 | spin_unlock_irqrestore(&slob_lock, flags); | 281 | if (!sp->units) |
282 | clear_slob_page_free(sp); | ||
111 | return cur; | 283 | return cur; |
112 | } | 284 | } |
113 | if (cur == slobfree) { | 285 | if (slob_last(cur)) |
114 | spin_unlock_irqrestore(&slob_lock, flags); | 286 | return NULL; |
115 | 287 | } | |
116 | if (size == PAGE_SIZE) /* trying to shrink arena? */ | 288 | } |
117 | return 0; | ||
118 | 289 | ||
119 | cur = (slob_t *)__get_free_page(gfp); | 290 | /* |
120 | if (!cur) | 291 | * slob_alloc: entry point into the slob allocator. |
121 | return 0; | 292 | */ |
293 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | ||
294 | { | ||
295 | struct slob_page *sp; | ||
296 | slob_t *b = NULL; | ||
297 | unsigned long flags; | ||
122 | 298 | ||
123 | slob_free(cur, PAGE_SIZE); | 299 | spin_lock_irqsave(&slob_lock, flags); |
124 | spin_lock_irqsave(&slob_lock, flags); | 300 | /* Iterate through each partially free page, try to find room */ |
125 | cur = slobfree; | 301 | list_for_each_entry(sp, &free_slob_pages, list) { |
302 | #ifdef CONFIG_NUMA | ||
303 | /* | ||
304 | * If there's a node specification, search for a partial | ||
305 | * page with a matching node id in the freelist. | ||
306 | */ | ||
307 | if (node != -1 && page_to_nid(&sp->page) != node) | ||
308 | continue; | ||
309 | #endif | ||
310 | |||
311 | if (sp->units >= SLOB_UNITS(size)) { | ||
312 | b = slob_page_alloc(sp, size, align); | ||
313 | if (b) | ||
314 | break; | ||
126 | } | 315 | } |
127 | } | 316 | } |
317 | spin_unlock_irqrestore(&slob_lock, flags); | ||
318 | |||
319 | /* Not enough space: must allocate a new page */ | ||
320 | if (!b) { | ||
321 | b = slob_new_page(gfp, 0, node); | ||
322 | if (!b) | ||
323 | return 0; | ||
324 | sp = (struct slob_page *)virt_to_page(b); | ||
325 | set_slob_page(sp); | ||
326 | |||
327 | spin_lock_irqsave(&slob_lock, flags); | ||
328 | sp->units = SLOB_UNITS(PAGE_SIZE); | ||
329 | sp->free = b; | ||
330 | INIT_LIST_HEAD(&sp->list); | ||
331 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); | ||
332 | set_slob_page_free(sp); | ||
333 | b = slob_page_alloc(sp, size, align); | ||
334 | BUG_ON(!b); | ||
335 | spin_unlock_irqrestore(&slob_lock, flags); | ||
336 | } | ||
337 | return b; | ||
128 | } | 338 | } |
129 | 339 | ||
340 | /* | ||
341 | * slob_free: entry point into the slob allocator. | ||
342 | */ | ||
130 | static void slob_free(void *block, int size) | 343 | static void slob_free(void *block, int size) |
131 | { | 344 | { |
132 | slob_t *cur, *b = (slob_t *)block; | 345 | struct slob_page *sp; |
346 | slob_t *prev, *next, *b = (slob_t *)block; | ||
347 | slobidx_t units; | ||
133 | unsigned long flags; | 348 | unsigned long flags; |
134 | 349 | ||
135 | if (!block) | 350 | if (!block) |
136 | return; | 351 | return; |
352 | BUG_ON(!size); | ||
137 | 353 | ||
138 | if (size) | 354 | sp = (struct slob_page *)virt_to_page(block); |
139 | b->units = SLOB_UNITS(size); | 355 | units = SLOB_UNITS(size); |
140 | 356 | ||
141 | /* Find reinsertion point */ | ||
142 | spin_lock_irqsave(&slob_lock, flags); | 357 | spin_lock_irqsave(&slob_lock, flags); |
143 | for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) | ||
144 | if (cur >= cur->next && (b > cur || b < cur->next)) | ||
145 | break; | ||
146 | 358 | ||
147 | if (b + b->units == cur->next) { | 359 | if (sp->units + units == SLOB_UNITS(PAGE_SIZE)) { |
148 | b->units += cur->next->units; | 360 | /* Go directly to page allocator. Do not pass slob allocator */ |
149 | b->next = cur->next->next; | 361 | if (slob_page_free(sp)) |
150 | } else | 362 | clear_slob_page_free(sp); |
151 | b->next = cur->next; | 363 | clear_slob_page(sp); |
364 | free_slob_page(sp); | ||
365 | free_page((unsigned long)b); | ||
366 | goto out; | ||
367 | } | ||
152 | 368 | ||
153 | if (cur + cur->units == b) { | 369 | if (!slob_page_free(sp)) { |
154 | cur->units += b->units; | 370 | /* This slob page is about to become partially free. Easy! */ |
155 | cur->next = b->next; | 371 | sp->units = units; |
156 | } else | 372 | sp->free = b; |
157 | cur->next = b; | 373 | set_slob(b, units, |
374 | (void *)((unsigned long)(b + | ||
375 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); | ||
376 | set_slob_page_free(sp); | ||
377 | goto out; | ||
378 | } | ||
158 | 379 | ||
159 | slobfree = cur; | 380 | /* |
381 | * Otherwise the page is already partially free, so find reinsertion | ||
382 | * point. | ||
383 | */ | ||
384 | sp->units += units; | ||
160 | 385 | ||
386 | if (b < sp->free) { | ||
387 | set_slob(b, units, sp->free); | ||
388 | sp->free = b; | ||
389 | } else { | ||
390 | prev = sp->free; | ||
391 | next = slob_next(prev); | ||
392 | while (b > next) { | ||
393 | prev = next; | ||
394 | next = slob_next(prev); | ||
395 | } | ||
396 | |||
397 | if (!slob_last(prev) && b + units == next) { | ||
398 | units += slob_units(next); | ||
399 | set_slob(b, units, slob_next(next)); | ||
400 | } else | ||
401 | set_slob(b, units, next); | ||
402 | |||
403 | if (prev + slob_units(prev) == b) { | ||
404 | units = slob_units(b) + slob_units(prev); | ||
405 | set_slob(prev, units, slob_next(b)); | ||
406 | } else | ||
407 | set_slob(prev, slob_units(prev), b); | ||
408 | } | ||
409 | out: | ||
161 | spin_unlock_irqrestore(&slob_lock, flags); | 410 | spin_unlock_irqrestore(&slob_lock, flags); |
162 | } | 411 | } |
163 | 412 | ||
164 | void *__kmalloc(size_t size, gfp_t gfp) | 413 | /* |
165 | { | 414 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. |
166 | slob_t *m; | 415 | */ |
167 | bigblock_t *bb; | ||
168 | unsigned long flags; | ||
169 | 416 | ||
170 | if (size < PAGE_SIZE - SLOB_UNIT) { | 417 | #ifndef ARCH_KMALLOC_MINALIGN |
171 | m = slob_alloc(size + SLOB_UNIT, gfp, 0); | 418 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) |
172 | return m ? (void *)(m + 1) : 0; | 419 | #endif |
173 | } | ||
174 | 420 | ||
175 | bb = slob_alloc(sizeof(bigblock_t), gfp, 0); | 421 | #ifndef ARCH_SLAB_MINALIGN |
176 | if (!bb) | 422 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long) |
177 | return 0; | 423 | #endif |
178 | 424 | ||
179 | bb->order = get_order(size); | 425 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) |
180 | bb->pages = (void *)__get_free_pages(gfp, bb->order); | 426 | { |
427 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | ||
428 | |||
429 | if (size < PAGE_SIZE - align) { | ||
430 | unsigned int *m; | ||
431 | m = slob_alloc(size + align, gfp, align, node); | ||
432 | if (m) | ||
433 | *m = size; | ||
434 | return (void *)m + align; | ||
435 | } else { | ||
436 | void *ret; | ||
181 | 437 | ||
182 | if (bb->pages) { | 438 | ret = slob_new_page(gfp | __GFP_COMP, get_order(size), node); |
183 | spin_lock_irqsave(&block_lock, flags); | 439 | if (ret) { |
184 | bb->next = bigblocks; | 440 | struct page *page; |
185 | bigblocks = bb; | 441 | page = virt_to_page(ret); |
186 | spin_unlock_irqrestore(&block_lock, flags); | 442 | page->private = size; |
187 | return bb->pages; | 443 | } |
444 | return ret; | ||
188 | } | 445 | } |
189 | |||
190 | slob_free(bb, sizeof(bigblock_t)); | ||
191 | return 0; | ||
192 | } | 446 | } |
193 | EXPORT_SYMBOL(__kmalloc); | 447 | EXPORT_SYMBOL(__kmalloc_node); |
194 | 448 | ||
195 | /** | 449 | /** |
196 | * krealloc - reallocate memory. The contents will remain unchanged. | 450 | * krealloc - reallocate memory. The contents will remain unchanged. |
@@ -227,52 +481,34 @@ EXPORT_SYMBOL(krealloc); | |||
227 | 481 | ||
228 | void kfree(const void *block) | 482 | void kfree(const void *block) |
229 | { | 483 | { |
230 | bigblock_t *bb, **last = &bigblocks; | 484 | struct slob_page *sp; |
231 | unsigned long flags; | ||
232 | 485 | ||
233 | if (!block) | 486 | if (!block) |
234 | return; | 487 | return; |
235 | 488 | ||
236 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | 489 | sp = (struct slob_page *)virt_to_page(block); |
237 | /* might be on the big block list */ | 490 | if (slob_page(sp)) { |
238 | spin_lock_irqsave(&block_lock, flags); | 491 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
239 | for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { | 492 | unsigned int *m = (unsigned int *)(block - align); |
240 | if (bb->pages == block) { | 493 | slob_free(m, *m + align); |
241 | *last = bb->next; | 494 | } else |
242 | spin_unlock_irqrestore(&block_lock, flags); | 495 | put_page(&sp->page); |
243 | free_pages((unsigned long)block, bb->order); | ||
244 | slob_free(bb, sizeof(bigblock_t)); | ||
245 | return; | ||
246 | } | ||
247 | } | ||
248 | spin_unlock_irqrestore(&block_lock, flags); | ||
249 | } | ||
250 | |||
251 | slob_free((slob_t *)block - 1, 0); | ||
252 | return; | ||
253 | } | 496 | } |
254 | |||
255 | EXPORT_SYMBOL(kfree); | 497 | EXPORT_SYMBOL(kfree); |
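The __kmalloc_node()/kfree() pair above implements the size-header scheme described in the file comment: small requests get an align-sized prefix holding the requested size, and the caller receives the address just past it. A user-space sketch of that bookkeeping, with malloc() standing in for slob_alloc() and the toy_* names purely illustrative:

#include <stdio.h>
#include <stdlib.h>

#define MINALIGN __alignof__(unsigned long)

static void *toy_kmalloc(size_t size)
{
	unsigned int *m = malloc(size + MINALIGN);	/* prefix + payload */

	if (!m)
		return NULL;
	*m = size;				/* record the request in the prefix */
	return (char *)m + MINALIGN;		/* hand out the payload */
}

static size_t toy_ksize(const void *p)
{
	return *(const unsigned int *)((const char *)p - MINALIGN);
}

static void toy_kfree(void *p)
{
	free((char *)p - MINALIGN);
}

int main(void)
{
	void *p = toy_kmalloc(100);

	printf("ksize() reports %zu\n", toy_ksize(p));	/* prints 100 */
	toy_kfree(p);
	return 0;
}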
256 | 498 | ||
499 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ | ||
257 | size_t ksize(const void *block) | 500 | size_t ksize(const void *block) |
258 | { | 501 | { |
259 | bigblock_t *bb; | 502 | struct slob_page *sp; |
260 | unsigned long flags; | ||
261 | 503 | ||
262 | if (!block) | 504 | if (!block) |
263 | return 0; | 505 | return 0; |
264 | 506 | ||
265 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | 507 | sp = (struct slob_page *)virt_to_page(block); |
266 | spin_lock_irqsave(&block_lock, flags); | 508 | if (slob_page(sp)) |
267 | for (bb = bigblocks; bb; bb = bb->next) | 509 | return ((slob_t *)block - 1)->units + SLOB_UNIT; |
268 | if (bb->pages == block) { | 510 | else |
269 | spin_unlock_irqrestore(&slob_lock, flags); | 511 | return sp->page.private; |
270 | return PAGE_SIZE << bb->order; | ||
271 | } | ||
272 | spin_unlock_irqrestore(&block_lock, flags); | ||
273 | } | ||
274 | |||
275 | return ((slob_t *)block - 1)->units * SLOB_UNIT; | ||
276 | } | 512 | } |
277 | 513 | ||
278 | struct kmem_cache { | 514 | struct kmem_cache { |
@@ -289,7 +525,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
289 | { | 525 | { |
290 | struct kmem_cache *c; | 526 | struct kmem_cache *c; |
291 | 527 | ||
292 | c = slob_alloc(sizeof(struct kmem_cache), flags, 0); | 528 | c = slob_alloc(sizeof(struct kmem_cache), flags, 0, -1); |
293 | 529 | ||
294 | if (c) { | 530 | if (c) { |
295 | c->name = name; | 531 | c->name = name; |
@@ -302,6 +538,8 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
302 | c->ctor = ctor; | 538 | c->ctor = ctor; |
303 | /* ignore alignment unless it's forced */ | 539 | /* ignore alignment unless it's forced */ |
304 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | 540 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; |
541 | if (c->align < ARCH_SLAB_MINALIGN) | ||
542 | c->align = ARCH_SLAB_MINALIGN; | ||
305 | if (c->align < align) | 543 | if (c->align < align) |
306 | c->align = align; | 544 | c->align = align; |
307 | } else if (flags & SLAB_PANIC) | 545 | } else if (flags & SLAB_PANIC) |
@@ -317,21 +555,21 @@ void kmem_cache_destroy(struct kmem_cache *c) | |||
317 | } | 555 | } |
318 | EXPORT_SYMBOL(kmem_cache_destroy); | 556 | EXPORT_SYMBOL(kmem_cache_destroy); |
319 | 557 | ||
320 | void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | 558 | void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) |
321 | { | 559 | { |
322 | void *b; | 560 | void *b; |
323 | 561 | ||
324 | if (c->size < PAGE_SIZE) | 562 | if (c->size < PAGE_SIZE) |
325 | b = slob_alloc(c->size, flags, c->align); | 563 | b = slob_alloc(c->size, flags, c->align, node); |
326 | else | 564 | else |
327 | b = (void *)__get_free_pages(flags, get_order(c->size)); | 565 | b = slob_new_page(flags, get_order(c->size), node); |
328 | 566 | ||
329 | if (c->ctor) | 567 | if (c->ctor) |
330 | c->ctor(b, c, 0); | 568 | c->ctor(b, c, 0); |
331 | 569 | ||
332 | return b; | 570 | return b; |
333 | } | 571 | } |
334 | EXPORT_SYMBOL(kmem_cache_alloc); | 572 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
335 | 573 | ||
336 | void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) | 574 | void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) |
337 | { | 575 | { |
@@ -385,9 +623,6 @@ const char *kmem_cache_name(struct kmem_cache *c) | |||
385 | } | 623 | } |
386 | EXPORT_SYMBOL(kmem_cache_name); | 624 | EXPORT_SYMBOL(kmem_cache_name); |
387 | 625 | ||
388 | static struct timer_list slob_timer = TIMER_INITIALIZER( | ||
389 | (void (*)(unsigned long))slob_timer_cbk, 0, 0); | ||
390 | |||
391 | int kmem_cache_shrink(struct kmem_cache *d) | 626 | int kmem_cache_shrink(struct kmem_cache *d) |
392 | { | 627 | { |
393 | return 0; | 628 | return 0; |
@@ -399,17 +634,14 @@ int kmem_ptr_validate(struct kmem_cache *a, const void *b) | |||
399 | return 0; | 634 | return 0; |
400 | } | 635 | } |
401 | 636 | ||
402 | void __init kmem_cache_init(void) | 637 | static unsigned int slob_ready __read_mostly; |
638 | |||
639 | int slab_is_available(void) | ||
403 | { | 640 | { |
404 | slob_timer_cbk(); | 641 | return slob_ready; |
405 | } | 642 | } |
406 | 643 | ||
407 | static void slob_timer_cbk(void) | 644 | void __init kmem_cache_init(void) |
408 | { | 645 | { |
409 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); | 646 | slob_ready = 1; |
410 | |||
411 | if (p) | ||
412 | free_page((unsigned long)p); | ||
413 | |||
414 | mod_timer(&slob_timer, jiffies + HZ); | ||
415 | } | 647 | } |
@@ -323,7 +323,11 @@ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | |||
323 | /* | 323 | /* |
324 | * Debug settings: | 324 | * Debug settings: |
325 | */ | 325 | */ |
326 | #ifdef CONFIG_SLUB_DEBUG_ON | ||
327 | static int slub_debug = DEBUG_DEFAULT_FLAGS; | ||
328 | #else | ||
326 | static int slub_debug; | 329 | static int slub_debug; |
330 | #endif | ||
327 | 331 | ||
328 | static char *slub_debug_slabs; | 332 | static char *slub_debug_slabs; |
329 | 333 | ||
@@ -888,38 +892,57 @@ fail: | |||
888 | 892 | ||
889 | static int __init setup_slub_debug(char *str) | 893 | static int __init setup_slub_debug(char *str) |
890 | { | 894 | { |
891 | if (!str || *str != '=') | 895 | slub_debug = DEBUG_DEFAULT_FLAGS; |
892 | slub_debug = DEBUG_DEFAULT_FLAGS; | 896 | if (*str++ != '=' || !*str) |
893 | else { | 897 | /* |
894 | str++; | 898 | * No options specified. Switch on full debugging. |
895 | if (*str == 0 || *str == ',') | 899 | */ |
896 | slub_debug = DEBUG_DEFAULT_FLAGS; | 900 | goto out; |
897 | else | 901 | |
898 | for( ;*str && *str != ','; str++) | 902 | if (*str == ',') |
899 | switch (*str) { | 903 | /* |
900 | case 'f' : case 'F' : | 904 | * No options but restriction on slabs. This means full |
901 | slub_debug |= SLAB_DEBUG_FREE; | 905 | * debugging for slabs matching a pattern. |
902 | break; | 906 | */ |
903 | case 'z' : case 'Z' : | 907 | goto check_slabs; |
904 | slub_debug |= SLAB_RED_ZONE; | 908 | |
905 | break; | 909 | slub_debug = 0; |
906 | case 'p' : case 'P' : | 910 | if (*str == '-') |
907 | slub_debug |= SLAB_POISON; | 911 | /* |
908 | break; | 912 | * Switch off all debugging measures. |
909 | case 'u' : case 'U' : | 913 | */ |
910 | slub_debug |= SLAB_STORE_USER; | 914 | goto out; |
911 | break; | 915 | |
912 | case 't' : case 'T' : | 916 | /* |
913 | slub_debug |= SLAB_TRACE; | 917 | * Determine which debug features should be switched on |
914 | break; | 918 | */ |
915 | default: | 919 | for ( ;*str && *str != ','; str++) { |
916 | printk(KERN_ERR "slub_debug option '%c' " | 920 | switch (tolower(*str)) { |
917 | "unknown. skipped\n",*str); | 921 | case 'f': |
918 | } | 922 | slub_debug |= SLAB_DEBUG_FREE; |
923 | break; | ||
924 | case 'z': | ||
925 | slub_debug |= SLAB_RED_ZONE; | ||
926 | break; | ||
927 | case 'p': | ||
928 | slub_debug |= SLAB_POISON; | ||
929 | break; | ||
930 | case 'u': | ||
931 | slub_debug |= SLAB_STORE_USER; | ||
932 | break; | ||
933 | case 't': | ||
934 | slub_debug |= SLAB_TRACE; | ||
935 | break; | ||
936 | default: | ||
937 | printk(KERN_ERR "slub_debug option '%c' " | ||
938 | "unknown. skipped\n",*str); | ||
939 | } | ||
919 | } | 940 | } |
920 | 941 | ||
942 | check_slabs: | ||
921 | if (*str == ',') | 943 | if (*str == ',') |
922 | slub_debug_slabs = str + 1; | 944 | slub_debug_slabs = str + 1; |
945 | out: | ||
923 | return 1; | 946 | return 1; |
924 | } | 947 | } |
925 | 948 | ||
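As far as the rewritten parser above shows, the boot option now comes in four shapes: slub_debug alone enables the full default set of checks, slub_debug=- switches everything off, slub_debug=FZPUT (any subset of those letters) selects individual checks, and a trailing comma such as slub_debug=,dentry or slub_debug=FZ,dentry restricts the chosen checks to caches whose name matches; the cache name here is only an example.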
@@ -1798,8 +1821,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag | |||
1798 | BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); | 1821 | BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); |
1799 | 1822 | ||
1800 | page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); | 1823 | page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); |
1801 | /* new_slab() disables interupts */ | ||
1802 | local_irq_enable(); | ||
1803 | 1824 | ||
1804 | BUG_ON(!page); | 1825 | BUG_ON(!page); |
1805 | n = page->freelist; | 1826 | n = page->freelist; |
@@ -1811,6 +1832,12 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag | |||
1811 | init_kmem_cache_node(n); | 1832 | init_kmem_cache_node(n); |
1812 | atomic_long_inc(&n->nr_slabs); | 1833 | atomic_long_inc(&n->nr_slabs); |
1813 | add_partial(n, page); | 1834 | add_partial(n, page); |
1835 | |||
1836 | /* | ||
1836 | * new_slab() disables interrupts. If we do not reenable interrupts here | ||
1838 | * then bootup would continue with interrupts disabled. | ||
1839 | */ | ||
1840 | local_irq_enable(); | ||
1814 | return n; | 1841 | return n; |
1815 | } | 1842 | } |
1816 | 1843 | ||
@@ -2016,7 +2043,6 @@ error: | |||
2016 | s->offset, flags); | 2043 | s->offset, flags); |
2017 | return 0; | 2044 | return 0; |
2018 | } | 2045 | } |
2019 | EXPORT_SYMBOL(kmem_cache_open); | ||
2020 | 2046 | ||
2021 | /* | 2047 | /* |
2022 | * Check if a given pointer is valid | 2048 | * Check if a given pointer is valid |
@@ -2241,7 +2267,7 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
2241 | 2267 | ||
2242 | if (s) | 2268 | if (s) |
2243 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); | 2269 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); |
2244 | return NULL; | 2270 | return ZERO_SIZE_PTR; |
2245 | } | 2271 | } |
2246 | EXPORT_SYMBOL(__kmalloc); | 2272 | EXPORT_SYMBOL(__kmalloc); |
2247 | 2273 | ||
@@ -2252,16 +2278,20 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
2252 | 2278 | ||
2253 | if (s) | 2279 | if (s) |
2254 | return slab_alloc(s, flags, node, __builtin_return_address(0)); | 2280 | return slab_alloc(s, flags, node, __builtin_return_address(0)); |
2255 | return NULL; | 2281 | return ZERO_SIZE_PTR; |
2256 | } | 2282 | } |
2257 | EXPORT_SYMBOL(__kmalloc_node); | 2283 | EXPORT_SYMBOL(__kmalloc_node); |
2258 | #endif | 2284 | #endif |
2259 | 2285 | ||
2260 | size_t ksize(const void *object) | 2286 | size_t ksize(const void *object) |
2261 | { | 2287 | { |
2262 | struct page *page = get_object_page(object); | 2288 | struct page *page; |
2263 | struct kmem_cache *s; | 2289 | struct kmem_cache *s; |
2264 | 2290 | ||
2291 | if (object == ZERO_SIZE_PTR) | ||
2292 | return 0; | ||
2293 | |||
2294 | page = get_object_page(object); | ||
2265 | BUG_ON(!page); | 2295 | BUG_ON(!page); |
2266 | s = page->slab; | 2296 | s = page->slab; |
2267 | BUG_ON(!s); | 2297 | BUG_ON(!s); |
@@ -2293,7 +2323,13 @@ void kfree(const void *x) | |||
2293 | struct kmem_cache *s; | 2323 | struct kmem_cache *s; |
2294 | struct page *page; | 2324 | struct page *page; |
2295 | 2325 | ||
2296 | if (!x) | 2326 | /* |
2327 | * This has to be an unsigned comparison. According to Linus | ||
2328 | * some gcc versions treat a pointer as a signed entity. Then | ||
2329 | * this comparison would be true for all "negative" pointers | ||
2330 | * (which would cover the whole upper half of the address space). | ||
2331 | */ | ||
2332 | if ((unsigned long)x <= (unsigned long)ZERO_SIZE_PTR) | ||
2297 | return; | 2333 | return; |
2298 | 2334 | ||
2299 | page = virt_to_head_page(x); | 2335 | page = virt_to_head_page(x); |
@@ -2398,12 +2434,12 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
2398 | void *ret; | 2434 | void *ret; |
2399 | size_t ks; | 2435 | size_t ks; |
2400 | 2436 | ||
2401 | if (unlikely(!p)) | 2437 | if (unlikely(!p || p == ZERO_SIZE_PTR)) |
2402 | return kmalloc(new_size, flags); | 2438 | return kmalloc(new_size, flags); |
2403 | 2439 | ||
2404 | if (unlikely(!new_size)) { | 2440 | if (unlikely(!new_size)) { |
2405 | kfree(p); | 2441 | kfree(p); |
2406 | return NULL; | 2442 | return ZERO_SIZE_PTR; |
2407 | } | 2443 | } |
2408 | 2444 | ||
2409 | ks = ksize(p); | 2445 | ks = ksize(p); |
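The hunks above make zero-size kmalloc return ZERO_SIZE_PTR instead of NULL, and teach kfree()/krealloc() to recognize it with a single unsigned comparison. A user-space sketch of the convention; the value 16 mirrors the kernel's definition but is otherwise arbitrary for this demo, and the toy_* helpers are illustrative only:

#include <stdio.h>
#include <stdlib.h>

#define ZERO_SIZE_PTR ((void *)16)

static void *toy_kmalloc(size_t size)
{
	if (!size)
		return ZERO_SIZE_PTR;	/* distinct, unmapped, non-NULL token */
	return malloc(size);
}

static void toy_kfree(void *p)
{
	/* one unsigned compare catches both NULL and ZERO_SIZE_PTR */
	if ((unsigned long)p <= (unsigned long)ZERO_SIZE_PTR)
		return;
	free(p);
}

int main(void)
{
	void *p = toy_kmalloc(0);

	printf("zero-size alloc gave %p\n", p);	/* non-NULL, but never dereferenced */
	toy_kfree(p);				/* no-op, no crash */
	return 0;
}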
@@ -2426,6 +2462,7 @@ EXPORT_SYMBOL(krealloc); | |||
2426 | void __init kmem_cache_init(void) | 2462 | void __init kmem_cache_init(void) |
2427 | { | 2463 | { |
2428 | int i; | 2464 | int i; |
2465 | int caches = 0; | ||
2429 | 2466 | ||
2430 | #ifdef CONFIG_NUMA | 2467 | #ifdef CONFIG_NUMA |
2431 | /* | 2468 | /* |
@@ -2436,20 +2473,29 @@ void __init kmem_cache_init(void) | |||
2436 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 2473 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
2437 | sizeof(struct kmem_cache_node), GFP_KERNEL); | 2474 | sizeof(struct kmem_cache_node), GFP_KERNEL); |
2438 | kmalloc_caches[0].refcount = -1; | 2475 | kmalloc_caches[0].refcount = -1; |
2476 | caches++; | ||
2439 | #endif | 2477 | #endif |
2440 | 2478 | ||
2441 | /* Able to allocate the per node structures */ | 2479 | /* Able to allocate the per node structures */ |
2442 | slab_state = PARTIAL; | 2480 | slab_state = PARTIAL; |
2443 | 2481 | ||
2444 | /* Caches that are not of the two-to-the-power-of size */ | 2482 | /* Caches that are not of the two-to-the-power-of size */ |
2445 | create_kmalloc_cache(&kmalloc_caches[1], | 2483 | if (KMALLOC_MIN_SIZE <= 64) { |
2484 | create_kmalloc_cache(&kmalloc_caches[1], | ||
2446 | "kmalloc-96", 96, GFP_KERNEL); | 2485 | "kmalloc-96", 96, GFP_KERNEL); |
2447 | create_kmalloc_cache(&kmalloc_caches[2], | 2486 | caches++; |
2487 | } | ||
2488 | if (KMALLOC_MIN_SIZE <= 128) { | ||
2489 | create_kmalloc_cache(&kmalloc_caches[2], | ||
2448 | "kmalloc-192", 192, GFP_KERNEL); | 2490 | "kmalloc-192", 192, GFP_KERNEL); |
2491 | caches++; | ||
2492 | } | ||
2449 | 2493 | ||
2450 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) | 2494 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { |
2451 | create_kmalloc_cache(&kmalloc_caches[i], | 2495 | create_kmalloc_cache(&kmalloc_caches[i], |
2452 | "kmalloc", 1 << i, GFP_KERNEL); | 2496 | "kmalloc", 1 << i, GFP_KERNEL); |
2497 | caches++; | ||
2498 | } | ||
2453 | 2499 | ||
2454 | slab_state = UP; | 2500 | slab_state = UP; |
2455 | 2501 | ||
@@ -2466,8 +2512,8 @@ void __init kmem_cache_init(void) | |||
2466 | nr_cpu_ids * sizeof(struct page *); | 2512 | nr_cpu_ids * sizeof(struct page *); |
2467 | 2513 | ||
2468 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2514 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
2469 | " Processors=%d, Nodes=%d\n", | 2515 | " CPUs=%d, Nodes=%d\n", |
2470 | KMALLOC_SHIFT_HIGH, cache_line_size(), | 2516 | caches, cache_line_size(), |
2471 | slub_min_order, slub_max_order, slub_min_objects, | 2517 | slub_min_order, slub_max_order, slub_min_objects, |
2472 | nr_cpu_ids, nr_node_ids); | 2518 | nr_cpu_ids, nr_node_ids); |
2473 | } | 2519 | } |
@@ -2652,7 +2698,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | |||
2652 | struct kmem_cache *s = get_slab(size, gfpflags); | 2698 | struct kmem_cache *s = get_slab(size, gfpflags); |
2653 | 2699 | ||
2654 | if (!s) | 2700 | if (!s) |
2655 | return NULL; | 2701 | return ZERO_SIZE_PTR; |
2656 | 2702 | ||
2657 | return slab_alloc(s, gfpflags, -1, caller); | 2703 | return slab_alloc(s, gfpflags, -1, caller); |
2658 | } | 2704 | } |
@@ -2663,7 +2709,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
2663 | struct kmem_cache *s = get_slab(size, gfpflags); | 2709 | struct kmem_cache *s = get_slab(size, gfpflags); |
2664 | 2710 | ||
2665 | if (!s) | 2711 | if (!s) |
2666 | return NULL; | 2712 | return ZERO_SIZE_PTR; |
2667 | 2713 | ||
2668 | return slab_alloc(s, gfpflags, node, caller); | 2714 | return slab_alloc(s, gfpflags, node, caller); |
2669 | } | 2715 | } |
@@ -2857,7 +2903,7 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max) | |||
2857 | 2903 | ||
2858 | order = get_order(sizeof(struct location) * max); | 2904 | order = get_order(sizeof(struct location) * max); |
2859 | 2905 | ||
2860 | l = (void *)__get_free_pages(GFP_KERNEL, order); | 2906 | l = (void *)__get_free_pages(GFP_ATOMIC, order); |
2861 | 2907 | ||
2862 | if (!l) | 2908 | if (!l) |
2863 | return 0; | 2909 | return 0; |
@@ -3022,13 +3068,15 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3022 | n += sprintf(buf + n, " pid=%ld", | 3068 | n += sprintf(buf + n, " pid=%ld", |
3023 | l->min_pid); | 3069 | l->min_pid); |
3024 | 3070 | ||
3025 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { | 3071 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && |
3072 | n < PAGE_SIZE - 60) { | ||
3026 | n += sprintf(buf + n, " cpus="); | 3073 | n += sprintf(buf + n, " cpus="); |
3027 | n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, | 3074 | n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, |
3028 | l->cpus); | 3075 | l->cpus); |
3029 | } | 3076 | } |
3030 | 3077 | ||
3031 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { | 3078 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && |
3079 | n < PAGE_SIZE - 60) { | ||
3032 | n += sprintf(buf + n, " nodes="); | 3080 | n += sprintf(buf + n, " nodes="); |
3033 | n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, | 3081 | n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, |
3034 | l->nodes); | 3082 | l->nodes); |
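The list_locations() hunks above add an n < PAGE_SIZE - 60 bound before the cpu and node lists are appended, so the formatted report can never overrun the single page it is written into. A stand-alone sketch of that bounded-append pattern is below; BUF_SIZE stands in for PAGE_SIZE, the 60-byte headroom check and 50-byte reserve mirror the constants in the hunk, and the item strings are made up for illustration (the kernel uses sprintf/scnprintf rather than snprintf).

/*
 * Model of the bounded-append pattern: check the room that is left
 * before each append so the report cannot outgrow its fixed buffer.
 */
#include <stdio.h>

#define BUF_SIZE 4096

static int append_list(char *buf, int n, const char *name, const char *list)
{
	/* Only append when there is clearly room left in the page. */
	if (n < BUF_SIZE - 60) {
		n += snprintf(buf + n, BUF_SIZE - n, " %s=", name);
		n += snprintf(buf + n, BUF_SIZE - n - 50, "%s", list);
	}
	return n;
}

int main(void)
{
	char buf[BUF_SIZE];
	int n = snprintf(buf, BUF_SIZE, "4096 age=10/500/1000 pid=42");

	n = append_list(buf, n, "cpus", "0-3");
	n = append_list(buf, n, "nodes", "0");
	printf("%s (%d bytes used)\n", buf, n);
	return 0;
}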
diff --git a/mm/sparse.c b/mm/sparse.c index 545e4d3afcdf..e03b39f3540f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -240,6 +240,27 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
240 | return NULL; | 240 | return NULL; |
241 | } | 241 | } |
242 | 242 | ||
243 | /* | ||
244 | * Allocate the accumulated non-linear sections, allocate a mem_map | ||
245 | * for each and record the physical to section mapping. | ||
246 | */ | ||
247 | void __init sparse_init(void) | ||
248 | { | ||
249 | unsigned long pnum; | ||
250 | struct page *map; | ||
251 | |||
252 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
253 | if (!valid_section_nr(pnum)) | ||
254 | continue; | ||
255 | |||
256 | map = sparse_early_mem_map_alloc(pnum); | ||
257 | if (!map) | ||
258 | continue; | ||
259 | sparse_init_one_section(__nr_to_section(pnum), pnum, map); | ||
260 | } | ||
261 | } | ||
262 | |||
263 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
243 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 264 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
244 | { | 265 | { |
245 | struct page *page, *ret; | 266 | struct page *page, *ret; |
@@ -280,27 +301,6 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
280 | } | 301 | } |
281 | 302 | ||
282 | /* | 303 | /* |
283 | * Allocate the accumulated non-linear sections, allocate a mem_map | ||
284 | * for each and record the physical to section mapping. | ||
285 | */ | ||
286 | void __init sparse_init(void) | ||
287 | { | ||
288 | unsigned long pnum; | ||
289 | struct page *map; | ||
290 | |||
291 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
292 | if (!valid_section_nr(pnum)) | ||
293 | continue; | ||
294 | |||
295 | map = sparse_early_mem_map_alloc(pnum); | ||
296 | if (!map) | ||
297 | continue; | ||
298 | sparse_init_one_section(__nr_to_section(pnum), pnum, map); | ||
299 | } | ||
300 | } | ||
301 | |||
302 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
303 | /* | ||
304 | * returns the number of sections whose mem_maps were properly | 304 | * returns the number of sections whose mem_maps were properly |
305 | * set. If this is <=0, then that means that the passed-in | 305 | * set. If this is <=0, then that means that the passed-in |
306 | * map was not consumed and must be freed. | 306 | * map was not consumed and must be freed. |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 5f7cf2a4cb55..925d5c50f18d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -21,7 +21,7 @@ | |||
21 | 21 | ||
22 | /* | 22 | /* |
23 | * swapper_space is a fiction, retained to simplify the path through | 23 | * swapper_space is a fiction, retained to simplify the path through |
24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow | 24 | * vmscan's shrink_page_list, to make sync_page look nicer, and to allow |
25 | * future use of radix_tree tags in the swap cache. | 25 | * future use of radix_tree tags in the swap cache. |
26 | */ | 26 | */ |
27 | static const struct address_space_operations swap_aops = { | 27 | static const struct address_space_operations swap_aops = { |
diff --git a/mm/swapfile.c b/mm/swapfile.c index acc172cbe3aa..7ff0a81c7b01 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -885,7 +885,7 @@ static int try_to_unuse(unsigned int type) | |||
885 | /* | 885 | /* |
886 | * So we could skip searching mms once swap count went | 886 | * So we could skip searching mms once swap count went |
887 | * to 1, we did not mark any present ptes as dirty: must | 887 | * to 1, we did not mark any present ptes as dirty: must |
888 | * mark page dirty so shrink_list will preserve it. | 888 | * mark page dirty so shrink_page_list will preserve it. |
889 | */ | 889 | */ |
890 | SetPageDirty(page); | 890 | SetPageDirty(page); |
891 | unlock_page(page); | 891 | unlock_page(page); |
diff --git a/mm/truncate.c b/mm/truncate.c index 4fbe1a2da5fb..7c994f2d6145 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -253,21 +253,8 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
253 | } | 253 | } |
254 | EXPORT_SYMBOL(truncate_inode_pages); | 254 | EXPORT_SYMBOL(truncate_inode_pages); |
255 | 255 | ||
256 | /** | 256 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, |
257 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | 257 | pgoff_t start, pgoff_t end, bool be_atomic) |
258 | * @mapping: the address_space which holds the pages to invalidate | ||
259 | * @start: the offset 'from' which to invalidate | ||
260 | * @end: the offset 'to' which to invalidate (inclusive) | ||
261 | * | ||
262 | * This function only removes the unlocked pages; if you want to | ||
263 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
264 | * | ||
265 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
266 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
267 | * pagetables. | ||
268 | */ | ||
269 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
270 | pgoff_t start, pgoff_t end) | ||
271 | { | 258 | { |
272 | struct pagevec pvec; | 259 | struct pagevec pvec; |
273 | pgoff_t next = start; | 260 | pgoff_t next = start; |
@@ -308,17 +295,38 @@ unlock: | |||
308 | break; | 295 | break; |
309 | } | 296 | } |
310 | pagevec_release(&pvec); | 297 | pagevec_release(&pvec); |
298 | if (likely(!be_atomic)) | ||
299 | cond_resched(); | ||
311 | } | 300 | } |
312 | return ret; | 301 | return ret; |
313 | } | 302 | } |
303 | |||
304 | /** | ||
305 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
306 | * @mapping: the address_space which holds the pages to invalidate | ||
307 | * @start: the offset 'from' which to invalidate | ||
308 | * @end: the offset 'to' which to invalidate (inclusive) | ||
309 | * | ||
310 | * This function only removes the unlocked pages; if you want to | ||
311 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
312 | * | ||
313 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
314 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
315 | * pagetables. | ||
316 | */ | ||
317 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
318 | pgoff_t start, pgoff_t end) | ||
319 | { | ||
320 | return __invalidate_mapping_pages(mapping, start, end, false); | ||
321 | } | ||
314 | EXPORT_SYMBOL(invalidate_mapping_pages); | 322 | EXPORT_SYMBOL(invalidate_mapping_pages); |
315 | 323 | ||
316 | /* | 324 | /* |
317 | * This is like invalidate_complete_page(), except it ignores the page's | 325 | * This is like invalidate_complete_page(), except it ignores the page's |
318 | * refcount. We do this because invalidate_inode_pages2() needs stronger | 326 | * refcount. We do this because invalidate_inode_pages2() needs stronger |
319 | * invalidation guarantees, and cannot afford to leave pages behind because | 327 | * invalidation guarantees, and cannot afford to leave pages behind because |
320 | * shrink_list() has a temp ref on them, or because they're transiently sitting | 328 | * shrink_page_list() has a temp ref on them, or because they're transiently |
321 | * in the lru_cache_add() pagevecs. | 329 | * sitting in the lru_cache_add() pagevecs. |
322 | */ | 330 | */ |
323 | static int | 331 | static int |
324 | invalidate_complete_page2(struct address_space *mapping, struct page *page) | 332 | invalidate_complete_page2(struct address_space *mapping, struct page *page) |
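The truncate.c change above keeps invalidate_mapping_pages() as a thin wrapper around __invalidate_mapping_pages() with be_atomic set to false, so existing callers retain the old behaviour, including the cond_resched() between pagevecs, while atomic contexts can use the new helper directly. As a usage illustration of the documented API, a hypothetical caller that sheds the clean, unlocked, unmapped page cache of one inode could look like the sketch below; the helper name, the process-context assumption, and the assumption that the declaration is visible via linux/fs.h are mine, only the invalidate_mapping_pages() signature comes from the hunk above.

/*
 * Hypothetical caller of invalidate_mapping_pages(); illustrative only,
 * not part of the patch.
 */
#include <linux/fs.h>
#include <linux/pagemap.h>

/* Drop the clean, unlocked, unmapped page cache of one inode. */
static unsigned long shed_clean_cache(struct inode *inode)
{
	/*
	 * Dirty, locked, writeback and mapped pages are skipped, so this
	 * cannot lose data; the return value is the number of pages that
	 * were actually invalidated.
	 */
	return invalidate_mapping_pages(inode->i_mapping, 0, (pgoff_t)-1);
}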
diff --git a/mm/vmstat.c b/mm/vmstat.c index 38254297a494..eceaf496210f 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -477,8 +477,8 @@ const struct seq_operations fragmentation_op = { | |||
477 | static const char * const vmstat_text[] = { | 477 | static const char * const vmstat_text[] = { |
478 | /* Zoned VM counters */ | 478 | /* Zoned VM counters */ |
479 | "nr_free_pages", | 479 | "nr_free_pages", |
480 | "nr_active", | ||
481 | "nr_inactive", | 480 | "nr_inactive", |
481 | "nr_active", | ||
482 | "nr_anon_pages", | 482 | "nr_anon_pages", |
483 | "nr_mapped", | 483 | "nr_mapped", |
484 | "nr_file_pages", | 484 | "nr_file_pages", |