Diffstat (limited to 'mm')

 -rw-r--r--  mm/Kconfig       |   6
 -rw-r--r--  mm/Makefile      |   2
 -rw-r--r--  mm/filemap.c     |   2
 -rw-r--r--  mm/highmem.c     |   3
 -rw-r--r--  mm/hugetlb.c     | 286
 -rw-r--r--  mm/internal.h    |  34
 -rw-r--r--  mm/memory.c      |  21
 -rw-r--r--  mm/mempolicy.c   | 117
 -rw-r--r--  mm/mempool.c     |   4
 -rw-r--r--  mm/migrate.c     | 655
 -rw-r--r--  mm/mmap.c        |  10
 -rw-r--r--  mm/mprotect.c    |  12
 -rw-r--r--  mm/nommu.c       |   4
 -rw-r--r--  mm/page_alloc.c  | 113
 -rw-r--r--  mm/readahead.c   |  33
 -rw-r--r--  mm/rmap.c        |  14
 -rw-r--r--  mm/shmem.c       |   7
 -rw-r--r--  mm/slab.c        | 890
 -rw-r--r--  mm/swap.c        |  64
 -rw-r--r--  mm/swap_state.c  |   1
 -rw-r--r--  mm/swapfile.c    |  59
 -rw-r--r--  mm/vmscan.c      | 882

 22 files changed, 1833 insertions(+), 1386 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae6409..bd80460360db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS | |||
| 137 | # support for page migration | 137 | # support for page migration |
| 138 | # | 138 | # |
| 139 | config MIGRATION | 139 | config MIGRATION |
| 140 | bool "Page migration" | ||
| 140 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM | 141 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM |
| 141 | depends on SWAP | 142 | depends on SWAP |
| 143 | help | ||
| 144 | Allows the migration of the physical location of pages of processes | ||
| 145 | while the virtual addresses are not changed. This is useful for | ||
| 146 | example on NUMA systems to put pages nearer to the processors accessing | ||
| 147 | the page. | ||
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc3..f10c753dce6d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o | |||
| 22 | obj-$(CONFIG_SLAB) += slab.o | 22 | obj-$(CONFIG_SLAB) += slab.o |
| 23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
| 25 | obj-$(CONFIG_MIGRATION) += migrate.o | ||
| 26 | |||
diff --git a/mm/filemap.c b/mm/filemap.c
index 44da3d476994..e8f58f7dd7a5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -30,6 +30,8 @@ | |||
| 30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
| 31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
| 32 | #include "filemap.h" | 32 | #include "filemap.h" |
| 33 | #include "internal.h" | ||
| 34 | |||
| 33 | /* | 35 | /* |
| 34 | * FIXME: remove all knowledge of the buffer layer from the core VM | 36 | * FIXME: remove all knowledge of the buffer layer from the core VM |
| 35 | */ | 37 | */ |
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa7..d0ea1eec6a9a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
| @@ -26,6 +26,7 @@ | |||
| 26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| 27 | #include <linux/hash.h> | 27 | #include <linux/hash.h> |
| 28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
| 29 | #include <linux/blktrace_api.h> | ||
| 29 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
| 30 | 31 | ||
| 31 | static mempool_t *page_pool, *isa_page_pool; | 32 | static mempool_t *page_pool, *isa_page_pool; |
| @@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) | |||
| 483 | pool = isa_page_pool; | 484 | pool = isa_page_pool; |
| 484 | } | 485 | } |
| 485 | 486 | ||
| 487 | blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); | ||
| 488 | |||
| 486 | /* | 489 | /* |
| 487 | * slow path | 490 | * slow path |
| 488 | */ | 491 | */ |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
| @@ -13,24 +13,48 @@ | |||
| 13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
| 14 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
| 15 | #include <linux/cpuset.h> | 15 | #include <linux/cpuset.h> |
| 16 | #include <linux/mutex.h> | ||
| 16 | 17 | ||
| 17 | #include <asm/page.h> | 18 | #include <asm/page.h> |
| 18 | #include <asm/pgtable.h> | 19 | #include <asm/pgtable.h> |
| 19 | 20 | ||
| 20 | #include <linux/hugetlb.h> | 21 | #include <linux/hugetlb.h> |
| 22 | #include "internal.h" | ||
| 21 | 23 | ||
| 22 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
| 23 | static unsigned long nr_huge_pages, free_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; |
| 24 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
| 25 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
| 26 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
| 27 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
| 28 | |||
| 29 | /* | 30 | /* |
| 30 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 31 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
| 31 | */ | 32 | */ |
| 32 | static DEFINE_SPINLOCK(hugetlb_lock); | 33 | static DEFINE_SPINLOCK(hugetlb_lock); |
| 33 | 34 | ||
| 35 | static void clear_huge_page(struct page *page, unsigned long addr) | ||
| 36 | { | ||
| 37 | int i; | ||
| 38 | |||
| 39 | might_sleep(); | ||
| 40 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { | ||
| 41 | cond_resched(); | ||
| 42 | clear_user_highpage(page + i, addr); | ||
| 43 | } | ||
| 44 | } | ||
| 45 | |||
| 46 | static void copy_huge_page(struct page *dst, struct page *src, | ||
| 47 | unsigned long addr) | ||
| 48 | { | ||
| 49 | int i; | ||
| 50 | |||
| 51 | might_sleep(); | ||
| 52 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | ||
| 53 | cond_resched(); | ||
| 54 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE); | ||
| 55 | } | ||
| 56 | } | ||
| 57 | |||
| 34 | static void enqueue_huge_page(struct page *page) | 58 | static void enqueue_huge_page(struct page *page) |
| 35 | { | 59 | { |
| 36 | int nid = page_to_nid(page); | 60 | int nid = page_to_nid(page); |
| @@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
| 64 | return page; | 88 | return page; |
| 65 | } | 89 | } |
| 66 | 90 | ||
| 67 | static struct page *alloc_fresh_huge_page(void) | 91 | static void free_huge_page(struct page *page) |
| 92 | { | ||
| 93 | BUG_ON(page_count(page)); | ||
| 94 | |||
| 95 | INIT_LIST_HEAD(&page->lru); | ||
| 96 | |||
| 97 | spin_lock(&hugetlb_lock); | ||
| 98 | enqueue_huge_page(page); | ||
| 99 | spin_unlock(&hugetlb_lock); | ||
| 100 | } | ||
| 101 | |||
| 102 | static int alloc_fresh_huge_page(void) | ||
| 68 | { | 103 | { |
| 69 | static int nid = 0; | 104 | static int nid = 0; |
| 70 | struct page *page; | 105 | struct page *page; |
| 71 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, | 106 | page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, |
| 72 | HUGETLB_PAGE_ORDER); | 107 | HUGETLB_PAGE_ORDER); |
| 73 | nid = (nid + 1) % num_online_nodes(); | 108 | nid = next_node(nid, node_online_map); |
| 109 | if (nid == MAX_NUMNODES) | ||
| 110 | nid = first_node(node_online_map); | ||
| 74 | if (page) { | 111 | if (page) { |
| 112 | page[1].lru.next = (void *)free_huge_page; /* dtor */ | ||
| 75 | spin_lock(&hugetlb_lock); | 113 | spin_lock(&hugetlb_lock); |
| 76 | nr_huge_pages++; | 114 | nr_huge_pages++; |
| 77 | nr_huge_pages_node[page_to_nid(page)]++; | 115 | nr_huge_pages_node[page_to_nid(page)]++; |
| 78 | spin_unlock(&hugetlb_lock); | 116 | spin_unlock(&hugetlb_lock); |
| 117 | put_page(page); /* free it into the hugepage allocator */ | ||
| 118 | return 1; | ||
| 79 | } | 119 | } |
| 80 | return page; | 120 | return 0; |
| 81 | } | 121 | } |
| 82 | 122 | ||
| 83 | void free_huge_page(struct page *page) | 123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
| 124 | unsigned long addr) | ||
| 84 | { | 125 | { |
| 85 | BUG_ON(page_count(page)); | 126 | struct inode *inode = vma->vm_file->f_dentry->d_inode; |
| 127 | struct page *page; | ||
| 128 | int use_reserve = 0; | ||
| 129 | unsigned long idx; | ||
| 86 | 130 | ||
| 87 | INIT_LIST_HEAD(&page->lru); | 131 | spin_lock(&hugetlb_lock); |
| 88 | page[1].lru.next = NULL; /* reset dtor */ | 132 | |
| 133 | if (vma->vm_flags & VM_MAYSHARE) { | ||
| 134 | |||
| 135 | /* idx = radix tree index, i.e. offset into file in | ||
| 136 | * HPAGE_SIZE units */ | ||
| 137 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
| 138 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
| 139 | |||
| 140 | /* The hugetlbfs specific inode info stores the number | ||
| 141 | * of "guaranteed available" (huge) pages. That is, | ||
| 142 | * the first 'prereserved_hpages' pages of the inode | ||
| 143 | * are either already instantiated, or have been | ||
| 144 | * pre-reserved (by hugetlb_reserve_for_inode()). Here | ||
| 145 | * we're in the process of instantiating the page, so | ||
| 146 | * we use this to determine whether to draw from the | ||
| 147 | * pre-reserved pool or the truly free pool. */ | ||
| 148 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
| 149 | use_reserve = 1; | ||
| 150 | } | ||
| 151 | |||
| 152 | if (!use_reserve) { | ||
| 153 | if (free_huge_pages <= reserved_huge_pages) | ||
| 154 | goto fail; | ||
| 155 | } else { | ||
| 156 | BUG_ON(reserved_huge_pages == 0); | ||
| 157 | reserved_huge_pages--; | ||
| 158 | } | ||
| 159 | |||
| 160 | page = dequeue_huge_page(vma, addr); | ||
| 161 | if (!page) | ||
| 162 | goto fail; | ||
| 163 | |||
| 164 | spin_unlock(&hugetlb_lock); | ||
| 165 | set_page_refcounted(page); | ||
| 166 | return page; | ||
| 167 | |||
| 168 | fail: | ||
| 169 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
| 170 | spin_unlock(&hugetlb_lock); | ||
| 171 | return NULL; | ||
| 172 | } | ||
| 173 | |||
| 174 | /* hugetlb_extend_reservation() | ||
| 175 | * | ||
| 176 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
| 177 | * available to instantiate the first 'atleast' pages of the given | ||
| 178 | * inode. If the inode doesn't already have this many pages reserved | ||
| 179 | * or instantiated, set aside some hugepages in the reserved pool to | ||
| 180 | * satisfy later faults (or fail now if there aren't enough, rather | ||
| 181 | * than getting the SIGBUS later). | ||
| 182 | */ | ||
| 183 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
| 184 | unsigned long atleast) | ||
| 185 | { | ||
| 186 | struct inode *inode = &info->vfs_inode; | ||
| 187 | unsigned long change_in_reserve = 0; | ||
| 188 | int ret = 0; | ||
| 89 | 189 | ||
| 90 | spin_lock(&hugetlb_lock); | 190 | spin_lock(&hugetlb_lock); |
| 91 | enqueue_huge_page(page); | 191 | read_lock_irq(&inode->i_mapping->tree_lock); |
| 192 | |||
| 193 | if (info->prereserved_hpages >= atleast) | ||
| 194 | goto out; | ||
| 195 | |||
| 196 | /* Because we always call this on shared mappings, none of the | ||
| 197 | * pages beyond info->prereserved_hpages can have been | ||
| 198 | * instantiated, so we need to reserve all of them now. */ | ||
| 199 | change_in_reserve = atleast - info->prereserved_hpages; | ||
| 200 | |||
| 201 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
| 202 | ret = -ENOMEM; | ||
| 203 | goto out; | ||
| 204 | } | ||
| 205 | |||
| 206 | reserved_huge_pages += change_in_reserve; | ||
| 207 | info->prereserved_hpages = atleast; | ||
| 208 | |||
| 209 | out: | ||
| 210 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
| 92 | spin_unlock(&hugetlb_lock); | 211 | spin_unlock(&hugetlb_lock); |
| 212 | |||
| 213 | return ret; | ||
| 93 | } | 214 | } |
| 94 | 215 | ||
| 95 | struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) | 216 | /* hugetlb_truncate_reservation() |
| 217 | * | ||
| 218 | * This returns pages reserved for the given inode to the general free | ||
| 219 | * hugepage pool. If the inode has any pages prereserved, but not | ||
| 220 | * instantiated, beyond offset (atmost << HPAGE_SIZE), then release | ||
| 221 | * them. | ||
| 222 | */ | ||
| 223 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
| 224 | unsigned long atmost) | ||
| 96 | { | 225 | { |
| 226 | struct inode *inode = &info->vfs_inode; | ||
| 227 | struct address_space *mapping = inode->i_mapping; | ||
| 228 | unsigned long idx; | ||
| 229 | unsigned long change_in_reserve = 0; | ||
| 97 | struct page *page; | 230 | struct page *page; |
| 98 | int i; | ||
| 99 | 231 | ||
| 100 | spin_lock(&hugetlb_lock); | 232 | spin_lock(&hugetlb_lock); |
| 101 | page = dequeue_huge_page(vma, addr); | 233 | read_lock_irq(&inode->i_mapping->tree_lock); |
| 102 | if (!page) { | 234 | |
| 103 | spin_unlock(&hugetlb_lock); | 235 | if (info->prereserved_hpages <= atmost) |
| 104 | return NULL; | 236 | goto out; |
| 237 | |||
| 238 | /* Count pages which were reserved, but not instantiated, and | ||
| 239 | * which we can now release. */ | ||
| 240 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
| 241 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
| 242 | if (!page) | ||
| 243 | /* Pages which are already instantiated can't | ||
| 244 | * be unreserved (and in fact have already | ||
| 245 | * been removed from the reserved pool) */ | ||
| 246 | change_in_reserve++; | ||
| 105 | } | 247 | } |
| 248 | |||
| 249 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
| 250 | reserved_huge_pages -= change_in_reserve; | ||
| 251 | info->prereserved_hpages = atmost; | ||
| 252 | |||
| 253 | out: | ||
| 254 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
| 106 | spin_unlock(&hugetlb_lock); | 255 | spin_unlock(&hugetlb_lock); |
| 107 | set_page_count(page, 1); | ||
| 108 | page[1].lru.next = (void *)free_huge_page; /* set dtor */ | ||
| 109 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i) | ||
| 110 | clear_user_highpage(&page[i], addr); | ||
| 111 | return page; | ||
| 112 | } | 256 | } |
| 113 | 257 | ||
| 114 | static int __init hugetlb_init(void) | 258 | static int __init hugetlb_init(void) |
| 115 | { | 259 | { |
| 116 | unsigned long i; | 260 | unsigned long i; |
| 117 | struct page *page; | ||
| 118 | 261 | ||
| 119 | if (HPAGE_SHIFT == 0) | 262 | if (HPAGE_SHIFT == 0) |
| 120 | return 0; | 263 | return 0; |
| @@ -123,12 +266,8 @@ static int __init hugetlb_init(void) | |||
| 123 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 266 | INIT_LIST_HEAD(&hugepage_freelists[i]); |
| 124 | 267 | ||
| 125 | for (i = 0; i < max_huge_pages; ++i) { | 268 | for (i = 0; i < max_huge_pages; ++i) { |
| 126 | page = alloc_fresh_huge_page(); | 269 | if (!alloc_fresh_huge_page()) |
| 127 | if (!page) | ||
| 128 | break; | 270 | break; |
| 129 | spin_lock(&hugetlb_lock); | ||
| 130 | enqueue_huge_page(page); | ||
| 131 | spin_unlock(&hugetlb_lock); | ||
| 132 | } | 271 | } |
| 133 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | 272 | max_huge_pages = free_huge_pages = nr_huge_pages = i; |
| 134 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | 273 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); |
| @@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page) | |||
| 154 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 293 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
| 155 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 294 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
| 156 | 1 << PG_private | 1<< PG_writeback); | 295 | 1 << PG_private | 1<< PG_writeback); |
| 157 | set_page_count(&page[i], 0); | ||
| 158 | } | 296 | } |
| 159 | set_page_count(page, 1); | 297 | page[1].lru.next = NULL; |
| 298 | set_page_refcounted(page); | ||
| 160 | __free_pages(page, HUGETLB_PAGE_ORDER); | 299 | __free_pages(page, HUGETLB_PAGE_ORDER); |
| 161 | } | 300 | } |
| 162 | 301 | ||
| @@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count) | |||
| 188 | static unsigned long set_max_huge_pages(unsigned long count) | 327 | static unsigned long set_max_huge_pages(unsigned long count) |
| 189 | { | 328 | { |
| 190 | while (count > nr_huge_pages) { | 329 | while (count > nr_huge_pages) { |
| 191 | struct page *page = alloc_fresh_huge_page(); | 330 | if (!alloc_fresh_huge_page()) |
| 192 | if (!page) | ||
| 193 | return nr_huge_pages; | 331 | return nr_huge_pages; |
| 194 | spin_lock(&hugetlb_lock); | ||
| 195 | enqueue_huge_page(page); | ||
| 196 | spin_unlock(&hugetlb_lock); | ||
| 197 | } | 332 | } |
| 198 | if (count >= nr_huge_pages) | 333 | if (count >= nr_huge_pages) |
| 199 | return nr_huge_pages; | 334 | return nr_huge_pages; |
| @@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf) | |||
| 225 | return sprintf(buf, | 360 | return sprintf(buf, |
| 226 | "HugePages_Total: %5lu\n" | 361 | "HugePages_Total: %5lu\n" |
| 227 | "HugePages_Free: %5lu\n" | 362 | "HugePages_Free: %5lu\n" |
| 363 | "HugePages_Rsvd: %5lu\n" | ||
| 228 | "Hugepagesize: %5lu kB\n", | 364 | "Hugepagesize: %5lu kB\n", |
| 229 | nr_huge_pages, | 365 | nr_huge_pages, |
| 230 | free_huge_pages, | 366 | free_huge_pages, |
| 367 | reserved_huge_pages, | ||
| 231 | HPAGE_SIZE/1024); | 368 | HPAGE_SIZE/1024); |
| 232 | } | 369 | } |
| 233 | 370 | ||
| @@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) | |||
| 240 | nid, free_huge_pages_node[nid]); | 377 | nid, free_huge_pages_node[nid]); |
| 241 | } | 378 | } |
| 242 | 379 | ||
| 243 | int is_hugepage_mem_enough(size_t size) | ||
| 244 | { | ||
| 245 | return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; | ||
| 246 | } | ||
| 247 | |||
| 248 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ | 380 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ |
| 249 | unsigned long hugetlb_total_pages(void) | 381 | unsigned long hugetlb_total_pages(void) |
| 250 | { | 382 | { |
| @@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 374 | unsigned long address, pte_t *ptep, pte_t pte) | 506 | unsigned long address, pte_t *ptep, pte_t pte) |
| 375 | { | 507 | { |
| 376 | struct page *old_page, *new_page; | 508 | struct page *old_page, *new_page; |
| 377 | int i, avoidcopy; | 509 | int avoidcopy; |
| 378 | 510 | ||
| 379 | old_page = pte_page(pte); | 511 | old_page = pte_page(pte); |
| 380 | 512 | ||
| @@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 395 | } | 527 | } |
| 396 | 528 | ||
| 397 | spin_unlock(&mm->page_table_lock); | 529 | spin_unlock(&mm->page_table_lock); |
| 398 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) | 530 | copy_huge_page(new_page, old_page, address); |
| 399 | copy_user_highpage(new_page + i, old_page + i, | ||
| 400 | address + i*PAGE_SIZE); | ||
| 401 | spin_lock(&mm->page_table_lock); | 531 | spin_lock(&mm->page_table_lock); |
| 402 | 532 | ||
| 403 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 533 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); |
| @@ -442,6 +572,7 @@ retry: | |||
| 442 | ret = VM_FAULT_OOM; | 572 | ret = VM_FAULT_OOM; |
| 443 | goto out; | 573 | goto out; |
| 444 | } | 574 | } |
| 575 | clear_huge_page(page, address); | ||
| 445 | 576 | ||
| 446 | if (vma->vm_flags & VM_SHARED) { | 577 | if (vma->vm_flags & VM_SHARED) { |
| 447 | int err; | 578 | int err; |
| @@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 496 | pte_t *ptep; | 627 | pte_t *ptep; |
| 497 | pte_t entry; | 628 | pte_t entry; |
| 498 | int ret; | 629 | int ret; |
| 630 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | ||
| 499 | 631 | ||
| 500 | ptep = huge_pte_alloc(mm, address); | 632 | ptep = huge_pte_alloc(mm, address); |
| 501 | if (!ptep) | 633 | if (!ptep) |
| 502 | return VM_FAULT_OOM; | 634 | return VM_FAULT_OOM; |
| 503 | 635 | ||
| 636 | /* | ||
| 637 | * Serialize hugepage allocation and instantiation, so that we don't | ||
| 638 | * get spurious allocation failures if two CPUs race to instantiate | ||
| 639 | * the same page in the page cache. | ||
| 640 | */ | ||
| 641 | mutex_lock(&hugetlb_instantiation_mutex); | ||
| 504 | entry = *ptep; | 642 | entry = *ptep; |
| 505 | if (pte_none(entry)) | 643 | if (pte_none(entry)) { |
| 506 | return hugetlb_no_page(mm, vma, address, ptep, write_access); | 644 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
| 645 | mutex_unlock(&hugetlb_instantiation_mutex); | ||
| 646 | return ret; | ||
| 647 | } | ||
| 507 | 648 | ||
| 508 | ret = VM_FAULT_MINOR; | 649 | ret = VM_FAULT_MINOR; |
| 509 | 650 | ||
| @@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 513 | if (write_access && !pte_write(entry)) | 654 | if (write_access && !pte_write(entry)) |
| 514 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | 655 | ret = hugetlb_cow(mm, vma, address, ptep, entry); |
| 515 | spin_unlock(&mm->page_table_lock); | 656 | spin_unlock(&mm->page_table_lock); |
| 657 | mutex_unlock(&hugetlb_instantiation_mutex); | ||
| 516 | 658 | ||
| 517 | return ret; | 659 | return ret; |
| 518 | } | 660 | } |
| @@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 521 | struct page **pages, struct vm_area_struct **vmas, | 663 | struct page **pages, struct vm_area_struct **vmas, |
| 522 | unsigned long *position, int *length, int i) | 664 | unsigned long *position, int *length, int i) |
| 523 | { | 665 | { |
| 524 | unsigned long vpfn, vaddr = *position; | 666 | unsigned long pfn_offset; |
| 667 | unsigned long vaddr = *position; | ||
| 525 | int remainder = *length; | 668 | int remainder = *length; |
| 526 | 669 | ||
| 527 | vpfn = vaddr/PAGE_SIZE; | ||
| 528 | spin_lock(&mm->page_table_lock); | 670 | spin_lock(&mm->page_table_lock); |
| 529 | while (vaddr < vma->vm_end && remainder) { | 671 | while (vaddr < vma->vm_end && remainder) { |
| 530 | pte_t *pte; | 672 | pte_t *pte; |
| @@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 552 | break; | 694 | break; |
| 553 | } | 695 | } |
| 554 | 696 | ||
| 555 | if (pages) { | 697 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; |
| 556 | page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; | 698 | page = pte_page(*pte); |
| 557 | get_page(page); | 699 | same_page: |
| 558 | pages[i] = page; | 700 | get_page(page); |
| 559 | } | 701 | if (pages) |
| 702 | pages[i] = page + pfn_offset; | ||
| 560 | 703 | ||
| 561 | if (vmas) | 704 | if (vmas) |
| 562 | vmas[i] = vma; | 705 | vmas[i] = vma; |
| 563 | 706 | ||
| 564 | vaddr += PAGE_SIZE; | 707 | vaddr += PAGE_SIZE; |
| 565 | ++vpfn; | 708 | ++pfn_offset; |
| 566 | --remainder; | 709 | --remainder; |
| 567 | ++i; | 710 | ++i; |
| 711 | if (vaddr < vma->vm_end && remainder && | ||
| 712 | pfn_offset < HPAGE_SIZE/PAGE_SIZE) { | ||
| 713 | /* | ||
| 714 | * We use pfn_offset to avoid touching the pageframes | ||
| 715 | * of this compound page. | ||
| 716 | */ | ||
| 717 | goto same_page; | ||
| 718 | } | ||
| 568 | } | 719 | } |
| 569 | spin_unlock(&mm->page_table_lock); | 720 | spin_unlock(&mm->page_table_lock); |
| 570 | *length = remainder; | 721 | *length = remainder; |
| @@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 572 | 723 | ||
| 573 | return i; | 724 | return i; |
| 574 | } | 725 | } |
| 726 | |||
| 727 | void hugetlb_change_protection(struct vm_area_struct *vma, | ||
| 728 | unsigned long address, unsigned long end, pgprot_t newprot) | ||
| 729 | { | ||
| 730 | struct mm_struct *mm = vma->vm_mm; | ||
| 731 | unsigned long start = address; | ||
| 732 | pte_t *ptep; | ||
| 733 | pte_t pte; | ||
| 734 | |||
| 735 | BUG_ON(address >= end); | ||
| 736 | flush_cache_range(vma, address, end); | ||
| 737 | |||
| 738 | spin_lock(&mm->page_table_lock); | ||
| 739 | for (; address < end; address += HPAGE_SIZE) { | ||
| 740 | ptep = huge_pte_offset(mm, address); | ||
| 741 | if (!ptep) | ||
| 742 | continue; | ||
| 743 | if (!pte_none(*ptep)) { | ||
| 744 | pte = huge_ptep_get_and_clear(mm, address, ptep); | ||
| 745 | pte = pte_mkhuge(pte_modify(pte, newprot)); | ||
| 746 | set_huge_pte_at(mm, address, ptep, pte); | ||
| 747 | lazy_mmu_prot_update(pte); | ||
| 748 | } | ||
| 749 | } | ||
| 750 | spin_unlock(&mm->page_table_lock); | ||
| 751 | |||
| 752 | flush_tlb_range(vma, start, end); | ||
| 753 | } | ||
| 754 | |||
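The hugetlb.c changes above introduce reservation accounting: reserved_huge_pages counts free hugepages that are already spoken for, and each hugetlbfs inode records in prereserved_hpages how many of its leading pages are guaranteed. The stand-alone C sketch below models only that bookkeeping; plain counters replace the real locks, radix-tree walk and per-inode state, which is an assumption of the model, not kernel code. It shows the invariant the patch enforces: an allocation not covered by a reservation may not push free_huge_pages at or below reserved_huge_pages.

/* Standalone model of the reservation accounting added to mm/hugetlb.c. */
#include <assert.h>
#include <stdio.h>

static unsigned long free_huge_pages = 8;	/* pages on the free lists */
static unsigned long reserved_huge_pages;	/* subset of the above set aside */
static unsigned long prereserved_hpages;	/* per-"inode" guarantee */

/* Mirrors hugetlb_extend_reservation(): grow the guarantee or fail. */
static int extend_reservation(unsigned long atleast)
{
	unsigned long change;

	if (prereserved_hpages >= atleast)
		return 0;
	change = atleast - prereserved_hpages;
	if (reserved_huge_pages + change > free_huge_pages)
		return -1;			/* -ENOMEM in the kernel */
	reserved_huge_pages += change;
	prereserved_hpages = atleast;
	return 0;
}

/* Mirrors the core test in alloc_huge_page(): a fault covered by the
 * reservation consumes one reserved page; anything else may only take
 * pages that are not spoken for. */
static int alloc_huge_page_model(int covered_by_reserve)
{
	if (!covered_by_reserve) {
		if (free_huge_pages <= reserved_huge_pages)
			return -1;		/* would eat someone's guarantee */
	} else {
		assert(reserved_huge_pages > 0);
		reserved_huge_pages--;
	}
	free_huge_pages--;			/* dequeue_huge_page() */
	return 0;
}

int main(void)
{
	assert(extend_reservation(6) == 0);	/* 6 of 8 pages now guaranteed */
	assert(alloc_huge_page_model(1) == 0);	/* reserved fault always succeeds */
	assert(alloc_huge_page_model(0) == 0);	/* 7 free, 5 reserved: still room */
	assert(alloc_huge_page_model(0) == 0);	/* 6 free, 5 reserved */
	assert(alloc_huge_page_model(0) == -1);	/* 5 free, 5 reserved: refused */
	printf("free=%lu reserved=%lu\n", free_huge_pages, reserved_huge_pages);
	return 0;
}

Built with any C compiler, it ends with free=5 reserved=5: the remaining free pages are exactly the ones still promised to the reserving inode.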
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4ef..d20e3cc4aef0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
| @@ -8,23 +8,33 @@ | |||
| 8 | * as published by the Free Software Foundation; either version | 8 | * as published by the Free Software Foundation; either version |
| 9 | * 2 of the License, or (at your option) any later version. | 9 | * 2 of the License, or (at your option) any later version. |
| 10 | */ | 10 | */ |
| 11 | #ifndef __MM_INTERNAL_H | ||
| 12 | #define __MM_INTERNAL_H | ||
| 11 | 13 | ||
| 12 | static inline void set_page_refs(struct page *page, int order) | 14 | #include <linux/mm.h> |
| 15 | |||
| 16 | static inline void set_page_count(struct page *page, int v) | ||
| 17 | { | ||
| 18 | atomic_set(&page->_count, v); | ||
| 19 | } | ||
| 20 | |||
| 21 | /* | ||
| 22 | * Turn a non-refcounted page (->_count == 0) into refcounted with | ||
| 23 | * a count of one. | ||
| 24 | */ | ||
| 25 | static inline void set_page_refcounted(struct page *page) | ||
| 13 | { | 26 | { |
| 14 | #ifdef CONFIG_MMU | 27 | BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); |
| 28 | BUG_ON(atomic_read(&page->_count)); | ||
| 15 | set_page_count(page, 1); | 29 | set_page_count(page, 1); |
| 16 | #else | 30 | } |
| 17 | int i; | ||
| 18 | 31 | ||
| 19 | /* | 32 | static inline void __put_page(struct page *page) |
| 20 | * We need to reference all the pages for this order, otherwise if | 33 | { |
| 21 | * anyone accesses one of the pages with (get/put) it will be freed. | 34 | atomic_dec(&page->_count); |
| 22 | * - eg: access_process_vm() | ||
| 23 | */ | ||
| 24 | for (i = 0; i < (1 << order); i++) | ||
| 25 | set_page_count(page + i, 1); | ||
| 26 | #endif /* CONFIG_MMU */ | ||
| 27 | } | 35 | } |
| 28 | 36 | ||
| 29 | extern void fastcall __init __free_pages_bootmem(struct page *page, | 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, |
| 30 | unsigned int order); | 38 | unsigned int order); |
| 39 | |||
| 40 | #endif | ||
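The new mm/internal.h helpers formalize how a page moves from non-refcounted (_count == 0) to refcounted with a count of one. Below is a user-space approximation using C11 atomics; the one-field struct page is a stand-in of the sketch, not the kernel layout, and the assert() plays the role of the BUG_ON() checks.

#include <assert.h>
#include <stdatomic.h>

struct page { atomic_int _count; };

static void set_page_count(struct page *page, int v)
{
	atomic_store(&page->_count, v);
}

/* Only legal on a page whose count is currently zero. */
static void set_page_refcounted(struct page *page)
{
	assert(atomic_load(&page->_count) == 0);	/* BUG_ON() in the kernel */
	set_page_count(page, 1);
}

static void __put_page(struct page *page)
{
	atomic_fetch_sub(&page->_count, 1);
}

int main(void)
{
	struct page p = { 0 };

	set_page_refcounted(&p);	/* 0 -> 1, as the hugepage allocator now does */
	__put_page(&p);			/* back to 0 without the normal free path */
	return 0;
}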
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db29..80c3fb370f91 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
| 277 | anon_vma_unlink(vma); | 277 | anon_vma_unlink(vma); |
| 278 | unlink_file_vma(vma); | 278 | unlink_file_vma(vma); |
| 279 | 279 | ||
| 280 | if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { | 280 | if (is_vm_hugetlb_page(vma)) { |
| 281 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, | 281 | hugetlb_free_pgd_range(tlb, addr, vma->vm_end, |
| 282 | floor, next? next->vm_start: ceiling); | 282 | floor, next? next->vm_start: ceiling); |
| 283 | } else { | 283 | } else { |
| @@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | |||
| 285 | * Optimization: gather nearby vmas into one call down | 285 | * Optimization: gather nearby vmas into one call down |
| 286 | */ | 286 | */ |
| 287 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE | 287 | while (next && next->vm_start <= vma->vm_end + PMD_SIZE |
| 288 | && !is_hugepage_only_range(vma->vm_mm, next->vm_start, | 288 | && !is_vm_hugetlb_page(next)) { |
| 289 | HPAGE_SIZE)) { | ||
| 290 | vma = next; | 289 | vma = next; |
| 291 | next = vma->vm_next; | 290 | next = vma->vm_next; |
| 292 | anon_vma_unlink(vma); | 291 | anon_vma_unlink(vma); |
| @@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ | |||
| 388 | { | 387 | { |
| 389 | unsigned long pfn = pte_pfn(pte); | 388 | unsigned long pfn = pte_pfn(pte); |
| 390 | 389 | ||
| 391 | if (vma->vm_flags & VM_PFNMAP) { | 390 | if (unlikely(vma->vm_flags & VM_PFNMAP)) { |
| 392 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; | 391 | unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; |
| 393 | if (pfn == vma->vm_pgoff + off) | 392 | if (pfn == vma->vm_pgoff + off) |
| 394 | return NULL; | 393 | return NULL; |
| @@ -396,18 +395,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_ | |||
| 396 | return NULL; | 395 | return NULL; |
| 397 | } | 396 | } |
| 398 | 397 | ||
| 399 | /* | 398 | #ifdef CONFIG_DEBUG_VM |
| 400 | * Add some anal sanity checks for now. Eventually, | ||
| 401 | * we should just do "return pfn_to_page(pfn)", but | ||
| 402 | * in the meantime we check that we get a valid pfn, | ||
| 403 | * and that the resulting page looks ok. | ||
| 404 | * | ||
| 405 | * Remove this test eventually! | ||
| 406 | */ | ||
| 407 | if (unlikely(!pfn_valid(pfn))) { | 399 | if (unlikely(!pfn_valid(pfn))) { |
| 408 | print_bad_pte(vma, pte, addr); | 400 | print_bad_pte(vma, pte, addr); |
| 409 | return NULL; | 401 | return NULL; |
| 410 | } | 402 | } |
| 403 | #endif | ||
| 411 | 404 | ||
| 412 | /* | 405 | /* |
| 413 | * NOTE! We still have PageReserved() pages in the page | 406 | * NOTE! We still have PageReserved() pages in the page |
| @@ -1221,9 +1214,7 @@ out: | |||
| 1221 | * The page has to be a nice clean _individual_ kernel allocation. | 1214 | * The page has to be a nice clean _individual_ kernel allocation. |
| 1222 | * If you allocate a compound page, you need to have marked it as | 1215 | * If you allocate a compound page, you need to have marked it as |
| 1223 | * such (__GFP_COMP), or manually just split the page up yourself | 1216 | * such (__GFP_COMP), or manually just split the page up yourself |
| 1224 | * (which is mainly an issue of doing "set_page_count(page, 1)" for | 1217 | * (see split_page()). |
| 1225 | * each sub-page, and then freeing them one by one when you free | ||
| 1226 | * them rather than freeing it as a compound page). | ||
| 1227 | * | 1218 | * |
| 1228 | * NOTE! Traditionally this was done with "remap_pfn_range()" which | 1219 | * NOTE! Traditionally this was done with "remap_pfn_range()" which |
| 1229 | * took an arbitrary page protection parameter. This doesn't allow | 1220 | * took an arbitrary page protection parameter. This doesn't allow |
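vm_normal_page() now marks the VM_PFNMAP test with unlikely() and compiles the pfn_valid() sanity check only under CONFIG_DEBUG_VM. The unlikely() hint is conventionally a thin wrapper over GCC's __builtin_expect, as sketched below; the VM_PFNMAP value and the classify() helper are illustrative assumptions, and the builtin requires GCC or Clang.

#include <stdio.h>

#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

#define VM_PFNMAP	0x00000400UL	/* value used here for illustration only */

static const char *classify(unsigned long vm_flags)
{
	if (unlikely(vm_flags & VM_PFNMAP))
		return "pfnmap vma (cold path)";
	return "normal vma (hot path)";
}

int main(void)
{
	printf("%s\n", classify(0));
	printf("%s\n", classify(VM_PFNMAP));
	return 0;
}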
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f0b..e93cc740c22b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
| @@ -86,6 +86,7 @@ | |||
| 86 | #include <linux/swap.h> | 86 | #include <linux/swap.h> |
| 87 | #include <linux/seq_file.h> | 87 | #include <linux/seq_file.h> |
| 88 | #include <linux/proc_fs.h> | 88 | #include <linux/proc_fs.h> |
| 89 | #include <linux/migrate.h> | ||
| 89 | 90 | ||
| 90 | #include <asm/tlbflush.h> | 91 | #include <asm/tlbflush.h> |
| 91 | #include <asm/uaccess.h> | 92 | #include <asm/uaccess.h> |
| @@ -95,11 +96,8 @@ | |||
| 95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 96 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
| 96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | 97 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ |
| 97 | 98 | ||
| 98 | /* The number of pages to migrate per call to migrate_pages() */ | 99 | static struct kmem_cache *policy_cache; |
| 99 | #define MIGRATE_CHUNK_SIZE 256 | 100 | static struct kmem_cache *sn_cache; |
| 100 | |||
| 101 | static kmem_cache_t *policy_cache; | ||
| 102 | static kmem_cache_t *sn_cache; | ||
| 103 | 101 | ||
| 104 | #define PDprintk(fmt...) | 102 | #define PDprintk(fmt...) |
| 105 | 103 | ||
| @@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
| 331 | struct vm_area_struct *first, *vma, *prev; | 329 | struct vm_area_struct *first, *vma, *prev; |
| 332 | 330 | ||
| 333 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | 331 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { |
| 334 | /* Must have swap device for migration */ | ||
| 335 | if (nr_swap_pages <= 0) | ||
| 336 | return ERR_PTR(-ENODEV); | ||
| 337 | 332 | ||
| 338 | /* | 333 | err = migrate_prep(); |
| 339 | * Clear the LRU lists so pages can be isolated. | 334 | if (err) |
| 340 | * Note that pages may be moved off the LRU after we have | 335 | return ERR_PTR(err); |
| 341 | * drained them. Those pages will fail to migrate like other | ||
| 342 | * pages that may be busy. | ||
| 343 | */ | ||
| 344 | lru_add_drain_all(); | ||
| 345 | } | 336 | } |
| 346 | 337 | ||
| 347 | first = find_vma(mm, start); | 338 | first = find_vma(mm, start); |
| @@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 550 | return err; | 541 | return err; |
| 551 | } | 542 | } |
| 552 | 543 | ||
| 544 | #ifdef CONFIG_MIGRATION | ||
| 553 | /* | 545 | /* |
| 554 | * page migration | 546 | * page migration |
| 555 | */ | 547 | */ |
| 556 | |||
| 557 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 548 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
| 558 | unsigned long flags) | 549 | unsigned long flags) |
| 559 | { | 550 | { |
| 560 | /* | 551 | /* |
| 561 | * Avoid migrating a page that is shared with others. | 552 | * Avoid migrating a page that is shared with others. |
| 562 | */ | 553 | */ |
| 563 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 554 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) |
| 564 | if (isolate_lru_page(page)) | 555 | isolate_lru_page(page, pagelist); |
| 565 | list_add_tail(&page->lru, pagelist); | ||
| 566 | } | ||
| 567 | } | ||
| 568 | |||
| 569 | /* | ||
| 570 | * Migrate the list 'pagelist' of pages to a certain destination. | ||
| 571 | * | ||
| 572 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
| 573 | * Return the number of pages not migrated or error code | ||
| 574 | */ | ||
| 575 | static int migrate_pages_to(struct list_head *pagelist, | ||
| 576 | struct vm_area_struct *vma, int dest) | ||
| 577 | { | ||
| 578 | LIST_HEAD(newlist); | ||
| 579 | LIST_HEAD(moved); | ||
| 580 | LIST_HEAD(failed); | ||
| 581 | int err = 0; | ||
| 582 | unsigned long offset = 0; | ||
| 583 | int nr_pages; | ||
| 584 | struct page *page; | ||
| 585 | struct list_head *p; | ||
| 586 | |||
| 587 | redo: | ||
| 588 | nr_pages = 0; | ||
| 589 | list_for_each(p, pagelist) { | ||
| 590 | if (vma) { | ||
| 591 | /* | ||
| 592 | * The address passed to alloc_page_vma is used to | ||
| 593 | * generate the proper interleave behavior. We fake | ||
| 594 | * the address here by an increasing offset in order | ||
| 595 | * to get the proper distribution of pages. | ||
| 596 | * | ||
| 597 | * No decision has been made as to which page | ||
| 598 | * a certain old page is moved to so we cannot | ||
| 599 | * specify the correct address. | ||
| 600 | */ | ||
| 601 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
| 602 | offset + vma->vm_start); | ||
| 603 | offset += PAGE_SIZE; | ||
| 604 | } | ||
| 605 | else | ||
| 606 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
| 607 | |||
| 608 | if (!page) { | ||
| 609 | err = -ENOMEM; | ||
| 610 | goto out; | ||
| 611 | } | ||
| 612 | list_add_tail(&page->lru, &newlist); | ||
| 613 | nr_pages++; | ||
| 614 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
| 615 | break; | ||
| 616 | } | ||
| 617 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | ||
| 618 | |||
| 619 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | ||
| 620 | |||
| 621 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
| 622 | goto redo; | ||
| 623 | out: | ||
| 624 | /* Return leftover allocated pages */ | ||
| 625 | while (!list_empty(&newlist)) { | ||
| 626 | page = list_entry(newlist.next, struct page, lru); | ||
| 627 | list_del(&page->lru); | ||
| 628 | __free_page(page); | ||
| 629 | } | ||
| 630 | list_splice(&failed, pagelist); | ||
| 631 | if (err < 0) | ||
| 632 | return err; | ||
| 633 | |||
| 634 | /* Calculate number of leftover pages */ | ||
| 635 | nr_pages = 0; | ||
| 636 | list_for_each(p, pagelist) | ||
| 637 | nr_pages++; | ||
| 638 | return nr_pages; | ||
| 639 | } | 556 | } |
| 640 | 557 | ||
| 641 | /* | 558 | /* |
| @@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm, | |||
| 742 | if (err < 0) | 659 | if (err < 0) |
| 743 | return err; | 660 | return err; |
| 744 | return busy; | 661 | return busy; |
| 662 | |||
| 745 | } | 663 | } |
| 746 | 664 | ||
| 665 | #else | ||
| 666 | |||
| 667 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | ||
| 668 | unsigned long flags) | ||
| 669 | { | ||
| 670 | } | ||
| 671 | |||
| 672 | int do_migrate_pages(struct mm_struct *mm, | ||
| 673 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | ||
| 674 | { | ||
| 675 | return -ENOSYS; | ||
| 676 | } | ||
| 677 | #endif | ||
| 678 | |||
| 747 | long do_mbind(unsigned long start, unsigned long len, | 679 | long do_mbind(unsigned long start, unsigned long len, |
| 748 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | 680 | unsigned long mode, nodemask_t *nmask, unsigned long flags) |
| 749 | { | 681 | { |
| @@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len, | |||
| 808 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 740 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
| 809 | err = -EIO; | 741 | err = -EIO; |
| 810 | } | 742 | } |
| 743 | |||
| 811 | if (!list_empty(&pagelist)) | 744 | if (!list_empty(&pagelist)) |
| 812 | putback_lru_pages(&pagelist); | 745 | putback_lru_pages(&pagelist); |
| 813 | 746 | ||
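mempolicy.c now compiles its migration helpers only under CONFIG_MIGRATION and otherwise falls back to stubs, with do_migrate_pages() reporting -ENOSYS. A minimal sketch of that pattern follows; do_migrate_pages_demo() is a hypothetical stand-in for the real entry point. Build it with or without -DCONFIG_MIGRATION to see both sides.

#include <errno.h>
#include <stdio.h>

#ifdef CONFIG_MIGRATION
static int do_migrate_pages_demo(void)
{
	/* the real implementation isolates pages and calls migrate_pages() */
	return 0;
}
#else
static int do_migrate_pages_demo(void)
{
	return -ENOSYS;		/* feature compiled out */
}
#endif

int main(void)
{
	int err = do_migrate_pages_demo();

	if (err == -ENOSYS)
		printf("page migration not compiled in\n");
	else
		printf("migration request accepted: %d\n", err);
	return 0;
}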
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480d3..f71893ed3543 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
| @@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free); | |||
| 278 | */ | 278 | */ |
| 279 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) | 279 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) |
| 280 | { | 280 | { |
| 281 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | 281 | struct kmem_cache *mem = pool_data; |
| 282 | return kmem_cache_alloc(mem, gfp_mask); | 282 | return kmem_cache_alloc(mem, gfp_mask); |
| 283 | } | 283 | } |
| 284 | EXPORT_SYMBOL(mempool_alloc_slab); | 284 | EXPORT_SYMBOL(mempool_alloc_slab); |
| 285 | 285 | ||
| 286 | void mempool_free_slab(void *element, void *pool_data) | 286 | void mempool_free_slab(void *element, void *pool_data) |
| 287 | { | 287 | { |
| 288 | kmem_cache_t *mem = (kmem_cache_t *) pool_data; | 288 | struct kmem_cache *mem = pool_data; |
| 289 | kmem_cache_free(mem, element); | 289 | kmem_cache_free(mem, element); |
| 290 | } | 290 | } |
| 291 | EXPORT_SYMBOL(mempool_free_slab); | 291 | EXPORT_SYMBOL(mempool_free_slab); |
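The mempool change is only a type cleanup (kmem_cache_t becomes struct kmem_cache, dropping the cast), but it highlights the callback convention: the allocation and free hooks receive an opaque pool_data pointer naming the backing cache. A user-space model of that callback pair, with kmem_cache_model standing in for the real slab cache (an assumption of the sketch):

#include <stdio.h>
#include <stdlib.h>

struct kmem_cache_model { size_t object_size; };	/* stand-in for struct kmem_cache */

static void *mempool_alloc_slab_demo(unsigned gfp_mask, void *pool_data)
{
	struct kmem_cache_model *mem = pool_data;	/* cast-free, as in the patch */

	(void)gfp_mask;
	return malloc(mem->object_size);
}

static void mempool_free_slab_demo(void *element, void *pool_data)
{
	(void)pool_data;
	free(element);
}

int main(void)
{
	struct kmem_cache_model cache = { .object_size = 128 };
	void *obj = mempool_alloc_slab_demo(0, &cache);

	if (!obj)
		return 1;
	mempool_free_slab_demo(obj, &cache);
	return 0;
}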
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000000..09f6e4aa87fc
--- /dev/null
+++ b/mm/migrate.c
| @@ -0,0 +1,655 @@ | |||
| 1 | /* | ||
| 2 | * Memory Migration functionality - linux/mm/migration.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter | ||
| 5 | * | ||
| 6 | * Page migration was first developed in the context of the memory hotplug | ||
| 7 | * project. The main authors of the migration code are: | ||
| 8 | * | ||
| 9 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> | ||
| 10 | * Hirokazu Takahashi <taka@valinux.co.jp> | ||
| 11 | * Dave Hansen <haveblue@us.ibm.com> | ||
| 12 | * Christoph Lameter <clameter@sgi.com> | ||
| 13 | */ | ||
| 14 | |||
| 15 | #include <linux/migrate.h> | ||
| 16 | #include <linux/module.h> | ||
| 17 | #include <linux/swap.h> | ||
| 18 | #include <linux/pagemap.h> | ||
| 19 | #include <linux/buffer_head.h> /* for try_to_release_page(), | ||
| 20 | buffer_heads_over_limit */ | ||
| 21 | #include <linux/mm_inline.h> | ||
| 22 | #include <linux/pagevec.h> | ||
| 23 | #include <linux/rmap.h> | ||
| 24 | #include <linux/topology.h> | ||
| 25 | #include <linux/cpu.h> | ||
| 26 | #include <linux/cpuset.h> | ||
| 27 | #include <linux/swapops.h> | ||
| 28 | |||
| 29 | #include "internal.h" | ||
| 30 | |||
| 31 | #include "internal.h" | ||
| 32 | |||
| 33 | /* The maximum number of pages to take off the LRU for migration */ | ||
| 34 | #define MIGRATE_CHUNK_SIZE 256 | ||
| 35 | |||
| 36 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | ||
| 37 | |||
| 38 | /* | ||
| 39 | * Isolate one page from the LRU lists. If successful put it onto | ||
| 40 | * the indicated list with elevated page count. | ||
| 41 | * | ||
| 42 | * Result: | ||
| 43 | * -EBUSY: page not on LRU list | ||
| 44 | * 0: page removed from LRU list and added to the specified list. | ||
| 45 | */ | ||
| 46 | int isolate_lru_page(struct page *page, struct list_head *pagelist) | ||
| 47 | { | ||
| 48 | int ret = -EBUSY; | ||
| 49 | |||
| 50 | if (PageLRU(page)) { | ||
| 51 | struct zone *zone = page_zone(page); | ||
| 52 | |||
| 53 | spin_lock_irq(&zone->lru_lock); | ||
| 54 | if (PageLRU(page)) { | ||
| 55 | ret = 0; | ||
| 56 | get_page(page); | ||
| 57 | ClearPageLRU(page); | ||
| 58 | if (PageActive(page)) | ||
| 59 | del_page_from_active_list(zone, page); | ||
| 60 | else | ||
| 61 | del_page_from_inactive_list(zone, page); | ||
| 62 | list_add_tail(&page->lru, pagelist); | ||
| 63 | } | ||
| 64 | spin_unlock_irq(&zone->lru_lock); | ||
| 65 | } | ||
| 66 | return ret; | ||
| 67 | } | ||
| 68 | |||
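isolate_lru_page() above uses a check / lock / re-check pattern: PageLRU is tested once without the lock and again under zone->lru_lock before the page is pulled off the list, because the page may leave the LRU between the two tests. A pthread model of the same idiom (the boolean page_on_lru stands in for the page flag; this is not kernel code):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static bool page_on_lru = true;

static int isolate(void)
{
	int ret = -1;				/* -EBUSY in the kernel */

	if (page_on_lru) {			/* cheap unlocked test */
		pthread_mutex_lock(&lru_lock);
		if (page_on_lru) {		/* re-check: it may have raced away */
			page_on_lru = false;	/* "ClearPageLRU + del_page_from_*_list" */
			ret = 0;
		}
		pthread_mutex_unlock(&lru_lock);
	}
	return ret;
}

int main(void)
{
	printf("first isolate:  %d\n", isolate());	/* 0  */
	printf("second isolate: %d\n", isolate());	/* -1 */
	return 0;
}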
| 69 | /* | ||
| 70 | * migrate_prep() needs to be called after we have compiled the list of pages | ||
| 71 | * to be migrated using isolate_lru_page() but before we begin a series of calls | ||
| 72 | * to migrate_pages(). | ||
| 73 | */ | ||
| 74 | int migrate_prep(void) | ||
| 75 | { | ||
| 76 | /* Must have swap device for migration */ | ||
| 77 | if (nr_swap_pages <= 0) | ||
| 78 | return -ENODEV; | ||
| 79 | |||
| 80 | /* | ||
| 81 | * Clear the LRU lists so pages can be isolated. | ||
| 82 | * Note that pages may be moved off the LRU after we have | ||
| 83 | * drained them. Those pages will fail to migrate like other | ||
| 84 | * pages that may be busy. | ||
| 85 | */ | ||
| 86 | lru_add_drain_all(); | ||
| 87 | |||
| 88 | return 0; | ||
| 89 | } | ||
| 90 | |||
| 91 | static inline void move_to_lru(struct page *page) | ||
| 92 | { | ||
| 93 | list_del(&page->lru); | ||
| 94 | if (PageActive(page)) { | ||
| 95 | /* | ||
| 96 | * lru_cache_add_active checks that | ||
| 97 | * the PG_active bit is off. | ||
| 98 | */ | ||
| 99 | ClearPageActive(page); | ||
| 100 | lru_cache_add_active(page); | ||
| 101 | } else { | ||
| 102 | lru_cache_add(page); | ||
| 103 | } | ||
| 104 | put_page(page); | ||
| 105 | } | ||
| 106 | |||
| 107 | /* | ||
| 108 | * Add isolated pages on the list back to the LRU. | ||
| 109 | * | ||
| 110 | * returns the number of pages put back. | ||
| 111 | */ | ||
| 112 | int putback_lru_pages(struct list_head *l) | ||
| 113 | { | ||
| 114 | struct page *page; | ||
| 115 | struct page *page2; | ||
| 116 | int count = 0; | ||
| 117 | |||
| 118 | list_for_each_entry_safe(page, page2, l, lru) { | ||
| 119 | move_to_lru(page); | ||
| 120 | count++; | ||
| 121 | } | ||
| 122 | return count; | ||
| 123 | } | ||
| 124 | |||
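putback_lru_pages() drains its list with list_for_each_entry_safe(), which caches the next pointer before move_to_lru() takes the current entry off the list. The same removal-safe iteration over a minimal singly linked list (the node type is invented for the sketch):

#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

int main(void)
{
	struct node *head = NULL, *n, *next;

	for (int i = 0; i < 3; i++) {		/* build a small "pagelist" */
		n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->id = i;
		n->next = head;
		head = n;
	}

	for (n = head; n; n = next) {
		next = n->next;			/* grab next before "moving" n */
		printf("putting back page %d\n", n->id);
		free(n);			/* stands in for move_to_lru() */
	}
	return 0;
}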
| 125 | /* | ||
| 126 | * Non migratable page | ||
| 127 | */ | ||
| 128 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
| 129 | { | ||
| 130 | return -EIO; | ||
| 131 | } | ||
| 132 | EXPORT_SYMBOL(fail_migrate_page); | ||
| 133 | |||
| 134 | /* | ||
| 135 | * swapout a single page | ||
| 136 | * page is locked upon entry, unlocked on exit | ||
| 137 | */ | ||
| 138 | static int swap_page(struct page *page) | ||
| 139 | { | ||
| 140 | struct address_space *mapping = page_mapping(page); | ||
| 141 | |||
| 142 | if (page_mapped(page) && mapping) | ||
| 143 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | ||
| 144 | goto unlock_retry; | ||
| 145 | |||
| 146 | if (PageDirty(page)) { | ||
| 147 | /* Page is dirty, try to write it out here */ | ||
| 148 | switch(pageout(page, mapping)) { | ||
| 149 | case PAGE_KEEP: | ||
| 150 | case PAGE_ACTIVATE: | ||
| 151 | goto unlock_retry; | ||
| 152 | |||
| 153 | case PAGE_SUCCESS: | ||
| 154 | goto retry; | ||
| 155 | |||
| 156 | case PAGE_CLEAN: | ||
| 157 | ; /* try to free the page below */ | ||
| 158 | } | ||
| 159 | } | ||
| 160 | |||
| 161 | if (PagePrivate(page)) { | ||
| 162 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
| 163 | (!mapping && page_count(page) == 1)) | ||
| 164 | goto unlock_retry; | ||
| 165 | } | ||
| 166 | |||
| 167 | if (remove_mapping(mapping, page)) { | ||
| 168 | /* Success */ | ||
| 169 | unlock_page(page); | ||
| 170 | return 0; | ||
| 171 | } | ||
| 172 | |||
| 173 | unlock_retry: | ||
| 174 | unlock_page(page); | ||
| 175 | |||
| 176 | retry: | ||
| 177 | return -EAGAIN; | ||
| 178 | } | ||
| 179 | EXPORT_SYMBOL(swap_page); | ||
| 180 | |||
| 181 | /* | ||
| 182 | * Remove references for a page and establish the new page with the correct | ||
| 183 | * basic settings to be able to stop accesses to the page. | ||
| 184 | */ | ||
| 185 | int migrate_page_remove_references(struct page *newpage, | ||
| 186 | struct page *page, int nr_refs) | ||
| 187 | { | ||
| 188 | struct address_space *mapping = page_mapping(page); | ||
| 189 | struct page **radix_pointer; | ||
| 190 | |||
| 191 | /* | ||
| 192 | * Avoid doing any of the following work if the page count | ||
| 193 | * indicates that the page is in use or truncate has removed | ||
| 194 | * the page. | ||
| 195 | */ | ||
| 196 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
| 197 | return -EAGAIN; | ||
| 198 | |||
| 199 | /* | ||
| 200 | * Establish swap ptes for anonymous pages or destroy pte | ||
| 201 | * maps for files. | ||
| 202 | * | ||
| 203 | * In order to reestablish file backed mappings the fault handlers | ||
| 204 | * will take the radix tree_lock which may then be used to stop | ||
| 205 | * processses from accessing this page until the new page is ready. | ||
| 206 | * | ||
| 207 | * A process accessing via a swap pte (an anonymous page) will take a | ||
| 208 | * page_lock on the old page which will block the process until the | ||
| 209 | * migration attempt is complete. At that time the PageSwapCache bit | ||
| 210 | * will be examined. If the page was migrated then the PageSwapCache | ||
| 211 | * bit will be clear and the operation to retrieve the page will be | ||
| 212 | * retried which will find the new page in the radix tree. Then a new | ||
| 213 | * direct mapping may be generated based on the radix tree contents. | ||
| 214 | * | ||
| 215 | * If the page was not migrated then the PageSwapCache bit | ||
| 216 | * is still set and the operation may continue. | ||
| 217 | */ | ||
| 218 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
| 219 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
| 220 | return -EPERM; | ||
| 221 | |||
| 222 | /* | ||
| 223 | * Give up if we were unable to remove all mappings. | ||
| 224 | */ | ||
| 225 | if (page_mapcount(page)) | ||
| 226 | return -EAGAIN; | ||
| 227 | |||
| 228 | write_lock_irq(&mapping->tree_lock); | ||
| 229 | |||
| 230 | radix_pointer = (struct page **)radix_tree_lookup_slot( | ||
| 231 | &mapping->page_tree, | ||
| 232 | page_index(page)); | ||
| 233 | |||
| 234 | if (!page_mapping(page) || page_count(page) != nr_refs || | ||
| 235 | *radix_pointer != page) { | ||
| 236 | write_unlock_irq(&mapping->tree_lock); | ||
| 237 | return 1; | ||
| 238 | } | ||
| 239 | |||
| 240 | /* | ||
| 241 | * Now we know that no one else is looking at the page. | ||
| 242 | * | ||
| 243 | * Certain minimal information about a page must be available | ||
| 244 | * in order for other subsystems to properly handle the page if they | ||
| 245 | * find it through the radix tree update before we are finished | ||
| 246 | * copying the page. | ||
| 247 | */ | ||
| 248 | get_page(newpage); | ||
| 249 | newpage->index = page->index; | ||
| 250 | newpage->mapping = page->mapping; | ||
| 251 | if (PageSwapCache(page)) { | ||
| 252 | SetPageSwapCache(newpage); | ||
| 253 | set_page_private(newpage, page_private(page)); | ||
| 254 | } | ||
| 255 | |||
| 256 | *radix_pointer = newpage; | ||
| 257 | __put_page(page); | ||
| 258 | write_unlock_irq(&mapping->tree_lock); | ||
| 259 | |||
| 260 | return 0; | ||
| 261 | } | ||
| 262 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
| 263 | |||
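The comment in migrate_page_remove_references() describes the hand-over point: with the mapping's tree lock held and the reference count verified, the radix-tree slot is switched from the old page to the new one. A single-threaded model of just that slot swap, using a one-entry slot and a plain counter; page_model and replace_slot() are names made up for the sketch.

#include <stdio.h>

struct page_model { int refcount; };

static struct page_model *tree_slot;	/* one-entry stand-in for the radix tree */

static int replace_slot(struct page_model *oldpg, struct page_model *newpg,
			int expected_refs)
{
	/* the kernel holds write_lock_irq(&mapping->tree_lock) around this */
	if (tree_slot != oldpg || oldpg->refcount != expected_refs)
		return 1;		/* someone still uses the old page: back off */
	newpg->refcount++;		/* get_page(newpage) */
	tree_slot = newpg;
	oldpg->refcount--;		/* __put_page(page): drop the tree's reference */
	return 0;
}

int main(void)
{
	struct page_model oldpg = { .refcount = 2 };	/* isolation ref + mapping ref */
	struct page_model newpg = { .refcount = 1 };
	int rc;

	tree_slot = &oldpg;
	rc = replace_slot(&oldpg, &newpg, 2);
	printf("replaced=%d old->_count=%d new->_count=%d\n",
	       rc, oldpg.refcount, newpg.refcount);
	return 0;
}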
| 264 | /* | ||
| 265 | * Copy the page to its new location | ||
| 266 | */ | ||
| 267 | void migrate_page_copy(struct page *newpage, struct page *page) | ||
| 268 | { | ||
| 269 | copy_highpage(newpage, page); | ||
| 270 | |||
| 271 | if (PageError(page)) | ||
| 272 | SetPageError(newpage); | ||
| 273 | if (PageReferenced(page)) | ||
| 274 | SetPageReferenced(newpage); | ||
| 275 | if (PageUptodate(page)) | ||
| 276 | SetPageUptodate(newpage); | ||
| 277 | if (PageActive(page)) | ||
| 278 | SetPageActive(newpage); | ||
| 279 | if (PageChecked(page)) | ||
| 280 | SetPageChecked(newpage); | ||
| 281 | if (PageMappedToDisk(page)) | ||
| 282 | SetPageMappedToDisk(newpage); | ||
| 283 | |||
| 284 | if (PageDirty(page)) { | ||
| 285 | clear_page_dirty_for_io(page); | ||
| 286 | set_page_dirty(newpage); | ||
| 287 | } | ||
| 288 | |||
| 289 | ClearPageSwapCache(page); | ||
| 290 | ClearPageActive(page); | ||
| 291 | ClearPagePrivate(page); | ||
| 292 | set_page_private(page, 0); | ||
| 293 | page->mapping = NULL; | ||
| 294 | |||
| 295 | /* | ||
| 296 | * If any waiters have accumulated on the new page then | ||
| 297 | * wake them up. | ||
| 298 | */ | ||
| 299 | if (PageWriteback(newpage)) | ||
| 300 | end_page_writeback(newpage); | ||
| 301 | } | ||
| 302 | EXPORT_SYMBOL(migrate_page_copy); | ||
| 303 | |||
| 304 | /* | ||
| 305 | * Common logic to directly migrate a single page suitable for | ||
| 306 | * pages that do not use PagePrivate. | ||
| 307 | * | ||
| 308 | * Pages are locked upon entry and exit. | ||
| 309 | */ | ||
| 310 | int migrate_page(struct page *newpage, struct page *page) | ||
| 311 | { | ||
| 312 | int rc; | ||
| 313 | |||
| 314 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | ||
| 315 | |||
| 316 | rc = migrate_page_remove_references(newpage, page, 2); | ||
| 317 | |||
| 318 | if (rc) | ||
| 319 | return rc; | ||
| 320 | |||
| 321 | migrate_page_copy(newpage, page); | ||
| 322 | |||
| 323 | /* | ||
| 324 | * Remove auxiliary swap entries and replace | ||
| 325 | * them with real ptes. | ||
| 326 | * | ||
| 327 | * Note that a real pte entry will allow processes that are not | ||
| 328 | * waiting on the page lock to use the new page via the page tables | ||
| 329 | * before the new page is unlocked. | ||
| 330 | */ | ||
| 331 | remove_from_swap(newpage); | ||
| 332 | return 0; | ||
| 333 | } | ||
| 334 | EXPORT_SYMBOL(migrate_page); | ||
| 335 | |||
| 336 | /* | ||
| 337 | * migrate_pages | ||
| 338 | * | ||
| 339 | * Two lists are passed to this function. The first list | ||
| 340 | * contains the pages isolated from the LRU to be migrated. | ||
| 341 | * The second list contains new pages that the pages isolated | ||
| 342 | * can be moved to. If the second list is NULL then all | ||
| 343 | * pages are swapped out. | ||
| 344 | * | ||
| 345 | * The function returns after 10 attempts or if no pages | ||
| 346 | * are movable anymore because to has become empty | ||
| 347 | * or no retryable pages exist anymore. | ||
| 348 | * | ||
| 349 | * Return: Number of pages not migrated when "to" ran empty. | ||
| 350 | */ | ||
| 351 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
| 352 | struct list_head *moved, struct list_head *failed) | ||
| 353 | { | ||
| 354 | int retry; | ||
| 355 | int nr_failed = 0; | ||
| 356 | int pass = 0; | ||
| 357 | struct page *page; | ||
| 358 | struct page *page2; | ||
| 359 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
| 360 | int rc; | ||
| 361 | |||
| 362 | if (!swapwrite) | ||
| 363 | current->flags |= PF_SWAPWRITE; | ||
| 364 | |||
| 365 | redo: | ||
| 366 | retry = 0; | ||
| 367 | |||
| 368 | list_for_each_entry_safe(page, page2, from, lru) { | ||
| 369 | struct page *newpage = NULL; | ||
| 370 | struct address_space *mapping; | ||
| 371 | |||
| 372 | cond_resched(); | ||
| 373 | |||
| 374 | rc = 0; | ||
| 375 | if (page_count(page) == 1) | ||
| 376 | /* page was freed from under us. So we are done. */ | ||
| 377 | goto next; | ||
| 378 | |||
| 379 | if (to && list_empty(to)) | ||
| 380 | break; | ||
| 381 | |||
| 382 | /* | ||
| 383 | * Skip locked pages during the first two passes to give the | ||
| 384 | * functions holding the lock time to release the page. Later we | ||
| 385 | * use lock_page() to have a higher chance of acquiring the | ||
| 386 | * lock. | ||
| 387 | */ | ||
| 388 | rc = -EAGAIN; | ||
| 389 | if (pass > 2) | ||
| 390 | lock_page(page); | ||
| 391 | else | ||
| 392 | if (TestSetPageLocked(page)) | ||
| 393 | goto next; | ||
| 394 | |||
| 395 | /* | ||
| 396 | * Only wait on writeback if we have already done a pass where | ||
| 397 | * we we may have triggered writeouts for lots of pages. | ||
| 398 | */ | ||
| 399 | if (pass > 0) { | ||
| 400 | wait_on_page_writeback(page); | ||
| 401 | } else { | ||
| 402 | if (PageWriteback(page)) | ||
| 403 | goto unlock_page; | ||
| 404 | } | ||
| 405 | |||
| 406 | /* | ||
| 407 | * Anonymous pages must have swap cache references otherwise | ||
| 408 | * the information contained in the page maps cannot be | ||
| 409 | * preserved. | ||
| 410 | */ | ||
| 411 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
| 412 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
| 413 | rc = -ENOMEM; | ||
| 414 | goto unlock_page; | ||
| 415 | } | ||
| 416 | } | ||
| 417 | |||
| 418 | if (!to) { | ||
| 419 | rc = swap_page(page); | ||
| 420 | goto next; | ||
| 421 | } | ||
| 422 | |||
| 423 | newpage = lru_to_page(to); | ||
| 424 | lock_page(newpage); | ||
| 425 | |||
| 426 | /* | ||
| 427 | * Pages are properly locked and writeback is complete. | ||
| 428 | * Try to migrate the page. | ||
| 429 | */ | ||
| 430 | mapping = page_mapping(page); | ||
| 431 | if (!mapping) | ||
| 432 | goto unlock_both; | ||
| 433 | |||
| 434 | if (mapping->a_ops->migratepage) { | ||
| 435 | /* | ||
| 436 | * Most pages have a mapping and most filesystems | ||
| 437 | * should provide a migration function. Anonymous | ||
| 438 | * pages are part of swap space which also has its | ||
| 439 | * own migration function. This is the most common | ||
| 440 | * path for page migration. | ||
| 441 | */ | ||
| 442 | rc = mapping->a_ops->migratepage(newpage, page); | ||
| 443 | goto unlock_both; | ||
| 444 | } | ||
| 445 | |||
| 446 | /* | ||
| 447 | * Default handling if a filesystem does not provide | ||
| 448 | * a migration function. We can only migrate clean | ||
| 449 | * pages so try to write out any dirty pages first. | ||
| 450 | */ | ||
| 451 | if (PageDirty(page)) { | ||
| 452 | switch (pageout(page, mapping)) { | ||
| 453 | case PAGE_KEEP: | ||
| 454 | case PAGE_ACTIVATE: | ||
| 455 | goto unlock_both; | ||
| 456 | |||
| 457 | case PAGE_SUCCESS: | ||
| 458 | unlock_page(newpage); | ||
| 459 | goto next; | ||
| 460 | |||
| 461 | case PAGE_CLEAN: | ||
| 462 | ; /* try to migrate the page below */ | ||
| 463 | } | ||
| 464 | } | ||
| 465 | |||
| 466 | /* | ||
| 467 | * Buffers are managed in a filesystem specific way. | ||
| 468 | * We must have no buffers or drop them. | ||
| 469 | */ | ||
| 470 | if (!page_has_buffers(page) || | ||
| 471 | try_to_release_page(page, GFP_KERNEL)) { | ||
| 472 | rc = migrate_page(newpage, page); | ||
| 473 | goto unlock_both; | ||
| 474 | } | ||
| 475 | |||
| 476 | /* | ||
| 477 | * On early passes with mapped pages simply | ||
| 478 | * retry. There may be a lock held for some | ||
| 479 | * buffers that may go away. Later | ||
| 480 | * swap them out. | ||
| 481 | */ | ||
| 482 | if (pass > 4) { | ||
| 483 | /* | ||
| 484 | * Persistently unable to drop buffers..... As a | ||
| 485 | * measure of last resort we fall back to | ||
| 486 | * swap_page(). | ||
| 487 | */ | ||
| 488 | unlock_page(newpage); | ||
| 489 | newpage = NULL; | ||
| 490 | rc = swap_page(page); | ||
| 491 | goto next; | ||
| 492 | } | ||
| 493 | |||
| 494 | unlock_both: | ||
| 495 | unlock_page(newpage); | ||
| 496 | |||
| 497 | unlock_page: | ||
| 498 | unlock_page(page); | ||
| 499 | |||
| 500 | next: | ||
| 501 | if (rc == -EAGAIN) { | ||
| 502 | retry++; | ||
| 503 | } else if (rc) { | ||
| 504 | /* Permanent failure */ | ||
| 505 | list_move(&page->lru, failed); | ||
| 506 | nr_failed++; | ||
| 507 | } else { | ||
| 508 | if (newpage) { | ||
| 509 | /* Successful migration. Return page to LRU */ | ||
| 510 | move_to_lru(newpage); | ||
| 511 | } | ||
| 512 | list_move(&page->lru, moved); | ||
| 513 | } | ||
| 514 | } | ||
| 515 | if (retry && pass++ < 10) | ||
| 516 | goto redo; | ||
| 517 | |||
| 518 | if (!swapwrite) | ||
| 519 | current->flags &= ~PF_SWAPWRITE; | ||
| 520 | |||
| 521 | return nr_failed + retry; | ||
| 522 | } | ||
| 523 | |||
| 524 | /* | ||
| 525 | * Migration function for pages with buffers. This function can only be used | ||
| 526 | * if the underlying filesystem guarantees that no other references to "page" | ||
| 527 | * exist. | ||
| 528 | */ | ||
| 529 | int buffer_migrate_page(struct page *newpage, struct page *page) | ||
| 530 | { | ||
| 531 | struct address_space *mapping = page->mapping; | ||
| 532 | struct buffer_head *bh, *head; | ||
| 533 | int rc; | ||
| 534 | |||
| 535 | if (!mapping) | ||
| 536 | return -EAGAIN; | ||
| 537 | |||
| 538 | if (!page_has_buffers(page)) | ||
| 539 | return migrate_page(newpage, page); | ||
| 540 | |||
| 541 | head = page_buffers(page); | ||
| 542 | |||
| 543 | rc = migrate_page_remove_references(newpage, page, 3); | ||
| 544 | |||
| 545 | if (rc) | ||
| 546 | return rc; | ||
| 547 | |||
| 548 | bh = head; | ||
| 549 | do { | ||
| 550 | get_bh(bh); | ||
| 551 | lock_buffer(bh); | ||
| 552 | bh = bh->b_this_page; | ||
| 553 | |||
| 554 | } while (bh != head); | ||
| 555 | |||
| 556 | ClearPagePrivate(page); | ||
| 557 | set_page_private(newpage, page_private(page)); | ||
| 558 | set_page_private(page, 0); | ||
| 559 | put_page(page); | ||
| 560 | get_page(newpage); | ||
| 561 | |||
| 562 | bh = head; | ||
| 563 | do { | ||
| 564 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
| 565 | bh = bh->b_this_page; | ||
| 566 | |||
| 567 | } while (bh != head); | ||
| 568 | |||
| 569 | SetPagePrivate(newpage); | ||
| 570 | |||
| 571 | migrate_page_copy(newpage, page); | ||
| 572 | |||
| 573 | bh = head; | ||
| 574 | do { | ||
| 575 | unlock_buffer(bh); | ||
| 576 | put_bh(bh); | ||
| 577 | bh = bh->b_this_page; | ||
| 578 | |||
| 579 | } while (bh != head); | ||
| 580 | |||
| 581 | return 0; | ||
| 582 | } | ||
| 583 | EXPORT_SYMBOL(buffer_migrate_page); | ||
| 584 | |||
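As a hedged illustration of the migratepage hook used above (not part of this patch; the aops name is made up): a block-based filesystem whose pages carry buffer_heads can opt into migration by pointing its address_space_operations at the helper just exported.

/* Hypothetical filesystem glue; only the migration hook is shown. */
static struct address_space_operations example_fs_aops = {
        .migratepage    = buffer_migrate_page, /* generic buffer-aware migration */
};

With this in place, the mapping->a_ops->migratepage path in migrate_pages() above handles such pages without falling back to writeout or swap_page().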
| 585 | /* | ||
| 586 | * Migrate the list 'pagelist' of pages to a certain destination. | ||
| 587 | * | ||
| 588 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
| 589 | * Return the number of pages not migrated or error code | ||
| 590 | */ | ||
| 591 | int migrate_pages_to(struct list_head *pagelist, | ||
| 592 | struct vm_area_struct *vma, int dest) | ||
| 593 | { | ||
| 594 | LIST_HEAD(newlist); | ||
| 595 | LIST_HEAD(moved); | ||
| 596 | LIST_HEAD(failed); | ||
| 597 | int err = 0; | ||
| 598 | unsigned long offset = 0; | ||
| 599 | int nr_pages; | ||
| 600 | struct page *page; | ||
| 601 | struct list_head *p; | ||
| 602 | |||
| 603 | redo: | ||
| 604 | nr_pages = 0; | ||
| 605 | list_for_each(p, pagelist) { | ||
| 606 | if (vma) { | ||
| 607 | /* | ||
| 608 | * The address passed to alloc_page_vma is used to | ||
| 609 | * generate the proper interleave behavior. We fake | ||
| 610 | * the address here by an increasing offset in order | ||
| 611 | * to get the proper distribution of pages. | ||
| 612 | * | ||
| 613 | * No decision has been made as to which page | ||
| 614 | * a certain old page is moved to so we cannot | ||
| 615 | * specify the correct address. | ||
| 616 | */ | ||
| 617 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
| 618 | offset + vma->vm_start); | ||
| 619 | offset += PAGE_SIZE; | ||
| 620 | } | ||
| 621 | else | ||
| 622 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
| 623 | |||
| 624 | if (!page) { | ||
| 625 | err = -ENOMEM; | ||
| 626 | goto out; | ||
| 627 | } | ||
| 628 | list_add_tail(&page->lru, &newlist); | ||
| 629 | nr_pages++; | ||
| 630 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
| 631 | break; | ||
| 632 | } | ||
| 633 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | ||
| 634 | |||
| 635 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | ||
| 636 | |||
| 637 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
| 638 | goto redo; | ||
| 639 | out: | ||
| 640 | /* Return leftover allocated pages */ | ||
| 641 | while (!list_empty(&newlist)) { | ||
| 642 | page = list_entry(newlist.next, struct page, lru); | ||
| 643 | list_del(&page->lru); | ||
| 644 | __free_page(page); | ||
| 645 | } | ||
| 646 | list_splice(&failed, pagelist); | ||
| 647 | if (err < 0) | ||
| 648 | return err; | ||
| 649 | |||
| 650 | /* Calculate number of leftover pages */ | ||
| 651 | nr_pages = 0; | ||
| 652 | list_for_each(p, pagelist) | ||
| 653 | nr_pages++; | ||
| 654 | return nr_pages; | ||
| 655 | } | ||
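A minimal caller sketch for migrate_pages_to(), with illustrative names only (example_move_list and target_node are not part of this diff); it assumes the caller has already isolated the pages on 'pagelist' from the LRU:

static int example_move_list(struct list_head *pagelist, int target_node)
{
        /* NULL vma, so the destination is given as a node number */
        int left = migrate_pages_to(pagelist, NULL, target_node);

        if (left < 0)
                return left;            /* hard error, e.g. -ENOMEM */
        if (left > 0)                   /* failed pages were spliced back */
                putback_lru_pages(pagelist);
        return 0;
}

A return value greater than zero is the count of pages left on the list, matching the "number of pages not migrated or error code" contract documented above.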
| @@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 612 | * If the vma has a ->close operation then the driver probably needs to release | 612 | * If the vma has a ->close operation then the driver probably needs to release |
| 613 | * per-vma resources, so we don't attempt to merge those. | 613 | * per-vma resources, so we don't attempt to merge those. |
| 614 | */ | 614 | */ |
| 615 | #define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) | 615 | #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) |
| 616 | 616 | ||
| 617 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 617 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
| 618 | struct file *file, unsigned long vm_flags) | 618 | struct file *file, unsigned long vm_flags) |
| @@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
| 845 | const unsigned long stack_flags | 845 | const unsigned long stack_flags |
| 846 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 846 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
| 847 | 847 | ||
| 848 | #ifdef CONFIG_HUGETLB | ||
| 849 | if (flags & VM_HUGETLB) { | ||
| 850 | if (!(flags & VM_DONTCOPY)) | ||
| 851 | mm->shared_vm += pages; | ||
| 852 | return; | ||
| 853 | } | ||
| 854 | #endif /* CONFIG_HUGETLB */ | ||
| 855 | |||
| 856 | if (file) { | 848 | if (file) { |
| 857 | mm->shared_vm += pages; | 849 | mm->shared_vm += pages; |
| 858 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 850 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 653b8571c1ed..4c14d4289b61 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
| 124 | * a MAP_NORESERVE private mapping to writable will now reserve. | 124 | * a MAP_NORESERVE private mapping to writable will now reserve. |
| 125 | */ | 125 | */ |
| 126 | if (newflags & VM_WRITE) { | 126 | if (newflags & VM_WRITE) { |
| 127 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { | 127 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { |
| 128 | charged = nrpages; | 128 | charged = nrpages; |
| 129 | if (security_vm_enough_memory(charged)) | 129 | if (security_vm_enough_memory(charged)) |
| 130 | return -ENOMEM; | 130 | return -ENOMEM; |
| @@ -166,7 +166,10 @@ success: | |||
| 166 | */ | 166 | */ |
| 167 | vma->vm_flags = newflags; | 167 | vma->vm_flags = newflags; |
| 168 | vma->vm_page_prot = newprot; | 168 | vma->vm_page_prot = newprot; |
| 169 | change_protection(vma, start, end, newprot); | 169 | if (is_vm_hugetlb_page(vma)) |
| 170 | hugetlb_change_protection(vma, start, end, newprot); | ||
| 171 | else | ||
| 172 | change_protection(vma, start, end, newprot); | ||
| 170 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 173 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
| 171 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 174 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
| 172 | return 0; | 175 | return 0; |
| @@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) | |||
| 240 | 243 | ||
| 241 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 244 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
| 242 | 245 | ||
| 243 | if (is_vm_hugetlb_page(vma)) { | ||
| 244 | error = -EACCES; | ||
| 245 | goto out; | ||
| 246 | } | ||
| 247 | |||
| 248 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | 246 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); |
| 249 | 247 | ||
| 250 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ | 248 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ |
diff --git a/mm/nommu.c b/mm/nommu.c index 4951f4786f28..db45efac17cc 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | |||
| 159 | /* | 159 | /* |
| 160 | * kmalloc doesn't like __GFP_HIGHMEM for some reason | 160 | * kmalloc doesn't like __GFP_HIGHMEM for some reason |
| 161 | */ | 161 | */ |
| 162 | return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); | 162 | return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM); |
| 163 | } | 163 | } |
| 164 | 164 | ||
| 165 | struct page * vmalloc_to_page(void *addr) | 165 | struct page * vmalloc_to_page(void *addr) |
| @@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) | |||
| 623 | * - note that this may not return a page-aligned address if the object | 623 | * - note that this may not return a page-aligned address if the object |
| 624 | * we're allocating is smaller than a page | 624 | * we're allocating is smaller than a page |
| 625 | */ | 625 | */ |
| 626 | base = kmalloc(len, GFP_KERNEL); | 626 | base = kmalloc(len, GFP_KERNEL|__GFP_COMP); |
| 627 | if (!base) | 627 | if (!base) |
| 628 | goto enomem; | 628 | goto enomem; |
| 629 | 629 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 234bd4895d14..b7f14a4799a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly; | |||
| 55 | long nr_swap_pages; | 55 | long nr_swap_pages; |
| 56 | int percpu_pagelist_fraction; | 56 | int percpu_pagelist_fraction; |
| 57 | 57 | ||
| 58 | static void fastcall free_hot_cold_page(struct page *page, int cold); | ||
| 59 | static void __free_pages_ok(struct page *page, unsigned int order); | 58 | static void __free_pages_ok(struct page *page, unsigned int order); |
| 60 | 59 | ||
| 61 | /* | 60 | /* |
| @@ -190,7 +189,7 @@ static void prep_compound_page(struct page *page, unsigned long order) | |||
| 190 | for (i = 0; i < nr_pages; i++) { | 189 | for (i = 0; i < nr_pages; i++) { |
| 191 | struct page *p = page + i; | 190 | struct page *p = page + i; |
| 192 | 191 | ||
| 193 | SetPageCompound(p); | 192 | __SetPageCompound(p); |
| 194 | set_page_private(p, (unsigned long)page); | 193 | set_page_private(p, (unsigned long)page); |
| 195 | } | 194 | } |
| 196 | } | 195 | } |
| @@ -209,10 +208,24 @@ static void destroy_compound_page(struct page *page, unsigned long order) | |||
| 209 | if (unlikely(!PageCompound(p) | | 208 | if (unlikely(!PageCompound(p) | |
| 210 | (page_private(p) != (unsigned long)page))) | 209 | (page_private(p) != (unsigned long)page))) |
| 211 | bad_page(page); | 210 | bad_page(page); |
| 212 | ClearPageCompound(p); | 211 | __ClearPageCompound(p); |
| 213 | } | 212 | } |
| 214 | } | 213 | } |
| 215 | 214 | ||
| 215 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | ||
| 216 | { | ||
| 217 | int i; | ||
| 218 | |||
| 219 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | ||
| 220 | /* | ||
| 221 | * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO | ||
| 222 | * and __GFP_HIGHMEM from hard or soft interrupt context. | ||
| 223 | */ | ||
| 224 | BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); | ||
| 225 | for (i = 0; i < (1 << order); i++) | ||
| 226 | clear_highpage(page + i); | ||
| 227 | } | ||
| 228 | |||
| 216 | /* | 229 | /* |
| 217 | * function for dealing with page's order in buddy system. | 230 | * function for dealing with page's order in buddy system. |
| 218 | * zone->lock is already acquired when we use these. | 231 | * zone->lock is already acquired when we use these. |
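prep_zero_page() itself is only being moved here; a later hunk in this file wires it into prep_new_page() so that __GFP_ZERO is handled in one place. From a caller's point of view nothing changes; a minimal sketch (illustrative function name) of requesting pre-zeroed pages:

/* Illustration only: the allocator clears the pages before returning them. */
static struct page *example_get_zeroed_pages(unsigned int order)
{
        return alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
}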
| @@ -423,11 +436,6 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 423 | mutex_debug_check_no_locks_freed(page_address(page), | 436 | mutex_debug_check_no_locks_freed(page_address(page), |
| 424 | PAGE_SIZE<<order); | 437 | PAGE_SIZE<<order); |
| 425 | 438 | ||
| 426 | #ifndef CONFIG_MMU | ||
| 427 | for (i = 1 ; i < (1 << order) ; ++i) | ||
| 428 | __put_page(page + i); | ||
| 429 | #endif | ||
| 430 | |||
| 431 | for (i = 0 ; i < (1 << order) ; ++i) | 439 | for (i = 0 ; i < (1 << order) ; ++i) |
| 432 | reserved += free_pages_check(page + i); | 440 | reserved += free_pages_check(page + i); |
| 433 | if (reserved) | 441 | if (reserved) |
| @@ -448,28 +456,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
| 448 | if (order == 0) { | 456 | if (order == 0) { |
| 449 | __ClearPageReserved(page); | 457 | __ClearPageReserved(page); |
| 450 | set_page_count(page, 0); | 458 | set_page_count(page, 0); |
| 451 | 459 | set_page_refcounted(page); | |
| 452 | free_hot_cold_page(page, 0); | 460 | __free_page(page); |
| 453 | } else { | 461 | } else { |
| 454 | LIST_HEAD(list); | ||
| 455 | int loop; | 462 | int loop; |
| 456 | 463 | ||
| 464 | prefetchw(page); | ||
| 457 | for (loop = 0; loop < BITS_PER_LONG; loop++) { | 465 | for (loop = 0; loop < BITS_PER_LONG; loop++) { |
| 458 | struct page *p = &page[loop]; | 466 | struct page *p = &page[loop]; |
| 459 | 467 | ||
| 460 | if (loop + 16 < BITS_PER_LONG) | 468 | if (loop + 1 < BITS_PER_LONG) |
| 461 | prefetchw(p + 16); | 469 | prefetchw(p + 1); |
| 462 | __ClearPageReserved(p); | 470 | __ClearPageReserved(p); |
| 463 | set_page_count(p, 0); | 471 | set_page_count(p, 0); |
| 464 | } | 472 | } |
| 465 | 473 | ||
| 466 | arch_free_page(page, order); | 474 | set_page_refcounted(page); |
| 467 | 475 | __free_pages(page, order); | |
| 468 | mod_page_state(pgfree, 1 << order); | ||
| 469 | |||
| 470 | list_add(&page->lru, &list); | ||
| 471 | kernel_map_pages(page, 1 << order, 0); | ||
| 472 | free_pages_bulk(page_zone(page), 1, &list, order); | ||
| 473 | } | 476 | } |
| 474 | } | 477 | } |
| 475 | 478 | ||
| @@ -507,7 +510,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 507 | /* | 510 | /* |
| 508 | * This page is about to be returned from the page allocator | 511 | * This page is about to be returned from the page allocator |
| 509 | */ | 512 | */ |
| 510 | static int prep_new_page(struct page *page, int order) | 513 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) |
| 511 | { | 514 | { |
| 512 | if (unlikely(page_mapcount(page) | | 515 | if (unlikely(page_mapcount(page) | |
| 513 | (page->mapping != NULL) | | 516 | (page->mapping != NULL) | |
| @@ -536,8 +539,15 @@ static int prep_new_page(struct page *page, int order) | |||
| 536 | 1 << PG_referenced | 1 << PG_arch_1 | | 539 | 1 << PG_referenced | 1 << PG_arch_1 | |
| 537 | 1 << PG_checked | 1 << PG_mappedtodisk); | 540 | 1 << PG_checked | 1 << PG_mappedtodisk); |
| 538 | set_page_private(page, 0); | 541 | set_page_private(page, 0); |
| 539 | set_page_refs(page, order); | 542 | set_page_refcounted(page); |
| 540 | kernel_map_pages(page, 1 << order, 1); | 543 | kernel_map_pages(page, 1 << order, 1); |
| 544 | |||
| 545 | if (gfp_flags & __GFP_ZERO) | ||
| 546 | prep_zero_page(page, order, gfp_flags); | ||
| 547 | |||
| 548 | if (order && (gfp_flags & __GFP_COMP)) | ||
| 549 | prep_compound_page(page, order); | ||
| 550 | |||
| 541 | return 0; | 551 | return 0; |
| 542 | } | 552 | } |
| 543 | 553 | ||
| @@ -593,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
| 593 | /* | 603 | /* |
| 594 | * Called from the slab reaper to drain pagesets on a particular node that | 604 | * Called from the slab reaper to drain pagesets on a particular node that |
| 595 | * belong to the currently executing processor. | 605 | * belong to the currently executing processor. |
| 606 | * Note that this function must be called with the thread pinned to | ||
| 607 | * a single processor. | ||
| 596 | */ | 608 | */ |
| 597 | void drain_node_pages(int nodeid) | 609 | void drain_node_pages(int nodeid) |
| 598 | { | 610 | { |
| 599 | int i, z; | 611 | int i, z; |
| 600 | unsigned long flags; | 612 | unsigned long flags; |
| 601 | 613 | ||
| 602 | local_irq_save(flags); | ||
| 603 | for (z = 0; z < MAX_NR_ZONES; z++) { | 614 | for (z = 0; z < MAX_NR_ZONES; z++) { |
| 604 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 615 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; |
| 605 | struct per_cpu_pageset *pset; | 616 | struct per_cpu_pageset *pset; |
| @@ -609,11 +620,14 @@ void drain_node_pages(int nodeid) | |||
| 609 | struct per_cpu_pages *pcp; | 620 | struct per_cpu_pages *pcp; |
| 610 | 621 | ||
| 611 | pcp = &pset->pcp[i]; | 622 | pcp = &pset->pcp[i]; |
| 612 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 623 | if (pcp->count) { |
| 613 | pcp->count = 0; | 624 | local_irq_save(flags); |
| 625 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | ||
| 626 | pcp->count = 0; | ||
| 627 | local_irq_restore(flags); | ||
| 628 | } | ||
| 614 | } | 629 | } |
| 615 | } | 630 | } |
| 616 | local_irq_restore(flags); | ||
| 617 | } | 631 | } |
| 618 | #endif | 632 | #endif |
| 619 | 633 | ||
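The comment added to drain_node_pages() above stresses that the caller must be pinned to a single processor; a hedged sketch of a valid call site (the wrapper name is made up) would be a per-CPU context such as the slab reaper's work function:

/* Hypothetical: only safe from a context already bound to one CPU. */
static void example_drain_local_node(void)
{
        drain_node_pages(numa_node_id());
}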
| @@ -743,13 +757,22 @@ void fastcall free_cold_page(struct page *page) | |||
| 743 | free_hot_cold_page(page, 1); | 757 | free_hot_cold_page(page, 1); |
| 744 | } | 758 | } |
| 745 | 759 | ||
| 746 | static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | 760 | /* |
| 761 | * split_page takes a non-compound higher-order page, and splits it into | ||
| 762 | * n (1<<order) sub-pages: page[0..n] | ||
| 763 | * Each sub-page must be freed individually. | ||
| 764 | * | ||
| 765 | * Note: this is probably too low level an operation for use in drivers. | ||
| 766 | * Please consult with lkml before using this in your driver. | ||
| 767 | */ | ||
| 768 | void split_page(struct page *page, unsigned int order) | ||
| 747 | { | 769 | { |
| 748 | int i; | 770 | int i; |
| 749 | 771 | ||
| 750 | BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); | 772 | BUG_ON(PageCompound(page)); |
| 751 | for(i = 0; i < (1 << order); i++) | 773 | BUG_ON(!page_count(page)); |
| 752 | clear_highpage(page + i); | 774 | for (i = 1; i < (1 << order); i++) |
| 775 | set_page_refcounted(page + i); | ||
| 753 | } | 776 | } |
| 754 | 777 | ||
| 755 | /* | 778 | /* |
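Following the warning in the split_page() comment above, a hedged sketch of the intended pattern (illustrative name only): allocate a non-compound higher-order block, split it, and from then on treat every sub-page as an independent order-0 page.

/* Illustration only: carve four order-0 pages out of one order-2 allocation. */
static void example_split(void)
{
        struct page *page = alloc_pages(GFP_KERNEL, 2); /* no __GFP_COMP */
        int i;

        if (!page)
                return;
        split_page(page, 2);            /* sub-pages 1..3 get their own refcount */
        for (i = 0; i < 4; i++)
                __free_page(page + i);  /* each sub-page is freed on its own */
}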
| @@ -795,14 +818,8 @@ again: | |||
| 795 | put_cpu(); | 818 | put_cpu(); |
| 796 | 819 | ||
| 797 | BUG_ON(bad_range(zone, page)); | 820 | BUG_ON(bad_range(zone, page)); |
| 798 | if (prep_new_page(page, order)) | 821 | if (prep_new_page(page, order, gfp_flags)) |
| 799 | goto again; | 822 | goto again; |
| 800 | |||
| 801 | if (gfp_flags & __GFP_ZERO) | ||
| 802 | prep_zero_page(page, order, gfp_flags); | ||
| 803 | |||
| 804 | if (order && (gfp_flags & __GFP_COMP)) | ||
| 805 | prep_compound_page(page, order); | ||
| 806 | return page; | 823 | return page; |
| 807 | 824 | ||
| 808 | failed: | 825 | failed: |
| @@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | |||
| 1214 | 1231 | ||
| 1215 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | 1232 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) |
| 1216 | { | 1233 | { |
| 1217 | int cpu = 0; | 1234 | unsigned cpu; |
| 1218 | 1235 | ||
| 1219 | memset(ret, 0, nr * sizeof(unsigned long)); | 1236 | memset(ret, 0, nr * sizeof(unsigned long)); |
| 1220 | cpus_and(*cpumask, *cpumask, cpu_online_map); | 1237 | cpus_and(*cpumask, *cpumask, cpu_online_map); |
| 1221 | 1238 | ||
| 1222 | cpu = first_cpu(*cpumask); | 1239 | for_each_cpu_mask(cpu, *cpumask) { |
| 1223 | while (cpu < NR_CPUS) { | 1240 | unsigned long *in; |
| 1224 | unsigned long *in, *out, off; | 1241 | unsigned long *out; |
| 1225 | 1242 | unsigned off; | |
| 1226 | if (!cpu_isset(cpu, *cpumask)) | 1243 | unsigned next_cpu; |
| 1227 | continue; | ||
| 1228 | 1244 | ||
| 1229 | in = (unsigned long *)&per_cpu(page_states, cpu); | 1245 | in = (unsigned long *)&per_cpu(page_states, cpu); |
| 1230 | 1246 | ||
| 1231 | cpu = next_cpu(cpu, *cpumask); | 1247 | next_cpu = next_cpu(cpu, *cpumask); |
| 1232 | 1248 | if (likely(next_cpu < NR_CPUS)) | |
| 1233 | if (likely(cpu < NR_CPUS)) | 1249 | prefetch(&per_cpu(page_states, next_cpu)); |
| 1234 | prefetch(&per_cpu(page_states, cpu)); | ||
| 1235 | 1250 | ||
| 1236 | out = (unsigned long *)ret; | 1251 | out = (unsigned long *)ret; |
| 1237 | for (off = 0; off < nr; off++) | 1252 | for (off = 0; off < nr; off++) |
| @@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 1764 | continue; | 1779 | continue; |
| 1765 | page = pfn_to_page(pfn); | 1780 | page = pfn_to_page(pfn); |
| 1766 | set_page_links(page, zone, nid, pfn); | 1781 | set_page_links(page, zone, nid, pfn); |
| 1767 | set_page_count(page, 1); | 1782 | init_page_count(page); |
| 1768 | reset_page_mapcount(page); | 1783 | reset_page_mapcount(page); |
| 1769 | SetPageReserved(page); | 1784 | SetPageReserved(page); |
| 1770 | INIT_LIST_HEAD(&page->lru); | 1785 | INIT_LIST_HEAD(&page->lru); |
diff --git a/mm/readahead.c b/mm/readahead.c index 8d6eeaaa6296..0f142a40984b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra) | |||
| 52 | return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 52 | return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
| 53 | } | 53 | } |
| 54 | 54 | ||
| 55 | static inline void reset_ahead_window(struct file_ra_state *ra) | ||
| 56 | { | ||
| 57 | /* | ||
| 58 | * ... but preserve ahead_start + ahead_size value, | ||
| 59 | * see 'recheck:' label in page_cache_readahead(). | ||
| 60 | * Note: We never use ->ahead_size as rvalue without | ||
| 61 | * checking ->ahead_start != 0 first. | ||
| 62 | */ | ||
| 63 | ra->ahead_size += ra->ahead_start; | ||
| 64 | ra->ahead_start = 0; | ||
| 65 | } | ||
| 66 | |||
| 55 | static inline void ra_off(struct file_ra_state *ra) | 67 | static inline void ra_off(struct file_ra_state *ra) |
| 56 | { | 68 | { |
| 57 | ra->start = 0; | 69 | ra->start = 0; |
| 58 | ra->flags = 0; | 70 | ra->flags = 0; |
| 59 | ra->size = 0; | 71 | ra->size = 0; |
| 60 | ra->ahead_start = 0; | 72 | reset_ahead_window(ra); |
| 61 | ra->ahead_size = 0; | ||
| 62 | return; | 73 | return; |
| 63 | } | 74 | } |
| 64 | 75 | ||
| @@ -72,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max) | |||
| 72 | { | 83 | { |
| 73 | unsigned long newsize = roundup_pow_of_two(size); | 84 | unsigned long newsize = roundup_pow_of_two(size); |
| 74 | 85 | ||
| 75 | if (newsize <= max / 64) | 86 | if (newsize <= max / 32) |
| 76 | newsize = newsize * newsize; | 87 | newsize = newsize * 4; |
| 77 | else if (newsize <= max / 4) | 88 | else if (newsize <= max / 4) |
| 78 | newsize = max / 4; | 89 | newsize = newsize * 2; |
| 79 | else | 90 | else |
| 80 | newsize = max; | 91 | newsize = max; |
| 81 | return newsize; | 92 | return newsize; |
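To make the new initial-window ramp-up concrete (numbers are illustrative, assuming a maximum window of 256 pages): a first read of 8 pages stays at 8 after rounding, falls within max/32, and is quadrupled to a 32-page window, where the old code would have jumped straight to max/4 = 64 pages. A 40-page first read rounds up to 64, falls within max/4, and is doubled to 128 pages. Only requests that already exceed max/4 are clamped to the maximum, as before.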
| @@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp, | |||
| 426 | * congestion. The ahead window will any way be closed | 437 | * congestion. The ahead window will any way be closed |
| 427 | * in case we failed due to excessive page cache hits. | 438 | * in case we failed due to excessive page cache hits. |
| 428 | */ | 439 | */ |
| 429 | ra->ahead_start = 0; | 440 | reset_ahead_window(ra); |
| 430 | ra->ahead_size = 0; | ||
| 431 | } | 441 | } |
| 432 | 442 | ||
| 433 | return ret; | 443 | return ret; |
| @@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |||
| 520 | * If we get here we are doing sequential IO and this was not the first | 530 | * If we get here we are doing sequential IO and this was not the first |
| 521 | * occurrence (ie we have an existing window) | 531 | * occurrence (ie we have an existing window) |
| 522 | */ | 532 | */ |
| 523 | |||
| 524 | if (ra->ahead_start == 0) { /* no ahead window yet */ | 533 | if (ra->ahead_start == 0) { /* no ahead window yet */ |
| 525 | if (!make_ahead_window(mapping, filp, ra, 0)) | 534 | if (!make_ahead_window(mapping, filp, ra, 0)) |
| 526 | goto out; | 535 | goto recheck; |
| 527 | } | 536 | } |
| 537 | |||
| 528 | /* | 538 | /* |
| 529 | * Already have an ahead window, check if we crossed into it. | 539 | * Already have an ahead window, check if we crossed into it. |
| 530 | * If so, shift windows and issue a new ahead window. | 540 | * If so, shift windows and issue a new ahead window. |
| @@ -536,11 +546,16 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra, | |||
| 536 | ra->start = ra->ahead_start; | 546 | ra->start = ra->ahead_start; |
| 537 | ra->size = ra->ahead_size; | 547 | ra->size = ra->ahead_size; |
| 538 | make_ahead_window(mapping, filp, ra, 0); | 548 | make_ahead_window(mapping, filp, ra, 0); |
| 549 | recheck: | ||
| 550 | /* prev_page shouldn't overrun the ahead window */ | ||
| 551 | ra->prev_page = min(ra->prev_page, | ||
| 552 | ra->ahead_start + ra->ahead_size - 1); | ||
| 539 | } | 553 | } |
| 540 | 554 | ||
| 541 | out: | 555 | out: |
| 542 | return ra->prev_page + 1; | 556 | return ra->prev_page + 1; |
| 543 | } | 557 | } |
| 558 | EXPORT_SYMBOL_GPL(page_cache_readahead); | ||
| 544 | 559 | ||
| 545 | /* | 560 | /* |
| 546 | * handle_ra_miss() is called when it is known that a page which should have | 561 | * handle_ra_miss() is called when it is known that a page which should have |
| @@ -56,13 +56,11 @@ | |||
| 56 | 56 | ||
| 57 | #include <asm/tlbflush.h> | 57 | #include <asm/tlbflush.h> |
| 58 | 58 | ||
| 59 | //#define RMAP_DEBUG /* can be enabled only for debugging */ | 59 | struct kmem_cache *anon_vma_cachep; |
| 60 | |||
| 61 | kmem_cache_t *anon_vma_cachep; | ||
| 62 | 60 | ||
| 63 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) | 61 | static inline void validate_anon_vma(struct vm_area_struct *find_vma) |
| 64 | { | 62 | { |
| 65 | #ifdef RMAP_DEBUG | 63 | #ifdef CONFIG_DEBUG_VM |
| 66 | struct anon_vma *anon_vma = find_vma->anon_vma; | 64 | struct anon_vma *anon_vma = find_vma->anon_vma; |
| 67 | struct vm_area_struct *vma; | 65 | struct vm_area_struct *vma; |
| 68 | unsigned int mapcount = 0; | 66 | unsigned int mapcount = 0; |
| @@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
| 166 | anon_vma_free(anon_vma); | 164 | anon_vma_free(anon_vma); |
| 167 | } | 165 | } |
| 168 | 166 | ||
| 169 | static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) | 167 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, |
| 168 | unsigned long flags) | ||
| 170 | { | 169 | { |
| 171 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == | 170 | if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == |
| 172 | SLAB_CTOR_CONSTRUCTOR) { | 171 | SLAB_CTOR_CONSTRUCTOR) { |
| @@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page) | |||
| 550 | void page_remove_rmap(struct page *page) | 549 | void page_remove_rmap(struct page *page) |
| 551 | { | 550 | { |
| 552 | if (atomic_add_negative(-1, &page->_mapcount)) { | 551 | if (atomic_add_negative(-1, &page->_mapcount)) { |
| 553 | if (page_mapcount(page) < 0) { | 552 | #ifdef CONFIG_DEBUG_VM |
| 553 | if (unlikely(page_mapcount(page) < 0)) { | ||
| 554 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | 554 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); |
| 555 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | 555 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); |
| 556 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | 556 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); |
| 557 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | 557 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); |
| 558 | } | 558 | } |
| 559 | 559 | #endif | |
| 560 | BUG_ON(page_mapcount(page) < 0); | 560 | BUG_ON(page_mapcount(page) < 0); |
| 561 | /* | 561 | /* |
| 562 | * It would be tidy to reset the PageAnon mapping here, | 562 | * It would be tidy to reset the PageAnon mapping here, |
diff --git a/mm/shmem.c b/mm/shmem.c index 7c455fbaff7b..37eaf42ed2c6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -875,7 +875,7 @@ redirty: | |||
| 875 | } | 875 | } |
| 876 | 876 | ||
| 877 | #ifdef CONFIG_NUMA | 877 | #ifdef CONFIG_NUMA |
| 878 | static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 878 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) |
| 879 | { | 879 | { |
| 880 | char *nodelist = strchr(value, ':'); | 880 | char *nodelist = strchr(value, ':'); |
| 881 | int err = 1; | 881 | int err = 1; |
| @@ -2119,7 +2119,7 @@ failed: | |||
| 2119 | return err; | 2119 | return err; |
| 2120 | } | 2120 | } |
| 2121 | 2121 | ||
| 2122 | static kmem_cache_t *shmem_inode_cachep; | 2122 | static struct kmem_cache *shmem_inode_cachep; |
| 2123 | 2123 | ||
| 2124 | static struct inode *shmem_alloc_inode(struct super_block *sb) | 2124 | static struct inode *shmem_alloc_inode(struct super_block *sb) |
| 2125 | { | 2125 | { |
| @@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode) | |||
| 2139 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2139 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
| 2140 | } | 2140 | } |
| 2141 | 2141 | ||
| 2142 | static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) | 2142 | static void init_once(void *foo, struct kmem_cache *cachep, |
| 2143 | unsigned long flags) | ||
| 2143 | { | 2144 | { |
| 2144 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2145 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
| 2145 | 2146 | ||
| @@ -50,7 +50,7 @@ | |||
| 50 | * The head array is strictly LIFO and should improve the cache hit rates. | 50 | * The head array is strictly LIFO and should improve the cache hit rates. |
| 51 | * On SMP, it additionally reduces the spinlock operations. | 51 | * On SMP, it additionally reduces the spinlock operations. |
| 52 | * | 52 | * |
| 53 | * The c_cpuarray may not be read with enabled local interrupts - | 53 | * The c_cpuarray may not be read with enabled local interrupts - |
| 54 | * it's changed with a smp_call_function(). | 54 | * it's changed with a smp_call_function(). |
| 55 | * | 55 | * |
| 56 | * SMP synchronization: | 56 | * SMP synchronization: |
| @@ -170,12 +170,12 @@ | |||
| 170 | #if DEBUG | 170 | #if DEBUG |
| 171 | # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ | 171 | # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ |
| 172 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ | 172 | SLAB_POISON | SLAB_HWCACHE_ALIGN | \ |
| 173 | SLAB_NO_REAP | SLAB_CACHE_DMA | \ | 173 | SLAB_CACHE_DMA | \ |
| 174 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ | 174 | SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ |
| 175 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 175 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
| 176 | SLAB_DESTROY_BY_RCU) | 176 | SLAB_DESTROY_BY_RCU) |
| 177 | #else | 177 | #else |
| 178 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ | 178 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
| 179 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ | 179 | SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ |
| 180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 180 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
| 181 | SLAB_DESTROY_BY_RCU) | 181 | SLAB_DESTROY_BY_RCU) |
| @@ -266,16 +266,17 @@ struct array_cache { | |||
| 266 | unsigned int batchcount; | 266 | unsigned int batchcount; |
| 267 | unsigned int touched; | 267 | unsigned int touched; |
| 268 | spinlock_t lock; | 268 | spinlock_t lock; |
| 269 | void *entry[0]; /* | 269 | void *entry[0]; /* |
| 270 | * Must have this definition in here for the proper | 270 | * Must have this definition in here for the proper |
| 271 | * alignment of array_cache. Also simplifies accessing | 271 | * alignment of array_cache. Also simplifies accessing |
| 272 | * the entries. | 272 | * the entries. |
| 273 | * [0] is for gcc 2.95. It should really be []. | 273 | * [0] is for gcc 2.95. It should really be []. |
| 274 | */ | 274 | */ |
| 275 | }; | 275 | }; |
| 276 | 276 | ||
| 277 | /* bootstrap: The caches do not work without cpuarrays anymore, | 277 | /* |
| 278 | * but the cpuarrays are allocated from the generic caches... | 278 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
| 279 | * cpuarrays are allocated from the generic caches... | ||
| 279 | */ | 280 | */ |
| 280 | #define BOOT_CPUCACHE_ENTRIES 1 | 281 | #define BOOT_CPUCACHE_ENTRIES 1 |
| 281 | struct arraycache_init { | 282 | struct arraycache_init { |
| @@ -291,13 +292,13 @@ struct kmem_list3 { | |||
| 291 | struct list_head slabs_full; | 292 | struct list_head slabs_full; |
| 292 | struct list_head slabs_free; | 293 | struct list_head slabs_free; |
| 293 | unsigned long free_objects; | 294 | unsigned long free_objects; |
| 294 | unsigned long next_reap; | ||
| 295 | int free_touched; | ||
| 296 | unsigned int free_limit; | 295 | unsigned int free_limit; |
| 297 | unsigned int colour_next; /* Per-node cache coloring */ | 296 | unsigned int colour_next; /* Per-node cache coloring */ |
| 298 | spinlock_t list_lock; | 297 | spinlock_t list_lock; |
| 299 | struct array_cache *shared; /* shared per node */ | 298 | struct array_cache *shared; /* shared per node */ |
| 300 | struct array_cache **alien; /* on other nodes */ | 299 | struct array_cache **alien; /* on other nodes */ |
| 300 | unsigned long next_reap; /* updated without locking */ | ||
| 301 | int free_touched; /* updated without locking */ | ||
| 301 | }; | 302 | }; |
| 302 | 303 | ||
| 303 | /* | 304 | /* |
| @@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; | |||
| 310 | #define SIZE_L3 (1 + MAX_NUMNODES) | 311 | #define SIZE_L3 (1 + MAX_NUMNODES) |
| 311 | 312 | ||
| 312 | /* | 313 | /* |
| 313 | * This function must be completely optimized away if | 314 | * This function must be completely optimized away if a constant is passed to |
| 314 | * a constant is passed to it. Mostly the same as | 315 | * it. Mostly the same as what is in linux/slab.h except it returns an index. |
| 315 | * what is in linux/slab.h except it returns an | ||
| 316 | * index. | ||
| 317 | */ | 316 | */ |
| 318 | static __always_inline int index_of(const size_t size) | 317 | static __always_inline int index_of(const size_t size) |
| 319 | { | 318 | { |
| @@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
| 351 | parent->free_touched = 0; | 350 | parent->free_touched = 0; |
| 352 | } | 351 | } |
| 353 | 352 | ||
| 354 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ | 353 | #define MAKE_LIST(cachep, listp, slab, nodeid) \ |
| 355 | do { \ | 354 | do { \ |
| 356 | INIT_LIST_HEAD(listp); \ | 355 | INIT_LIST_HEAD(listp); \ |
| 357 | list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ | 356 | list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ |
| 358 | } while (0) | 357 | } while (0) |
| 359 | 358 | ||
| 360 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ | 359 | #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ |
| 361 | do { \ | 360 | do { \ |
| 362 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ | 361 | MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ |
| 363 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ | 362 | MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ |
| 364 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 363 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
| @@ -373,28 +372,30 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
| 373 | struct kmem_cache { | 372 | struct kmem_cache { |
| 374 | /* 1) per-cpu data, touched during every alloc/free */ | 373 | /* 1) per-cpu data, touched during every alloc/free */ |
| 375 | struct array_cache *array[NR_CPUS]; | 374 | struct array_cache *array[NR_CPUS]; |
| 375 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
| 376 | unsigned int batchcount; | 376 | unsigned int batchcount; |
| 377 | unsigned int limit; | 377 | unsigned int limit; |
| 378 | unsigned int shared; | 378 | unsigned int shared; |
| 379 | |||
| 379 | unsigned int buffer_size; | 380 | unsigned int buffer_size; |
| 380 | /* 2) touched by every alloc & free from the backend */ | 381 | /* 3) touched by every alloc & free from the backend */ |
| 381 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | 382 | struct kmem_list3 *nodelists[MAX_NUMNODES]; |
| 382 | unsigned int flags; /* constant flags */ | ||
| 383 | unsigned int num; /* # of objs per slab */ | ||
| 384 | spinlock_t spinlock; | ||
| 385 | 383 | ||
| 386 | /* 3) cache_grow/shrink */ | 384 | unsigned int flags; /* constant flags */ |
| 385 | unsigned int num; /* # of objs per slab */ | ||
| 386 | |||
| 387 | /* 4) cache_grow/shrink */ | ||
| 387 | /* order of pgs per slab (2^n) */ | 388 | /* order of pgs per slab (2^n) */ |
| 388 | unsigned int gfporder; | 389 | unsigned int gfporder; |
| 389 | 390 | ||
| 390 | /* force GFP flags, e.g. GFP_DMA */ | 391 | /* force GFP flags, e.g. GFP_DMA */ |
| 391 | gfp_t gfpflags; | 392 | gfp_t gfpflags; |
| 392 | 393 | ||
| 393 | size_t colour; /* cache colouring range */ | 394 | size_t colour; /* cache colouring range */ |
| 394 | unsigned int colour_off; /* colour offset */ | 395 | unsigned int colour_off; /* colour offset */ |
| 395 | struct kmem_cache *slabp_cache; | 396 | struct kmem_cache *slabp_cache; |
| 396 | unsigned int slab_size; | 397 | unsigned int slab_size; |
| 397 | unsigned int dflags; /* dynamic flags */ | 398 | unsigned int dflags; /* dynamic flags */ |
| 398 | 399 | ||
| 399 | /* constructor func */ | 400 | /* constructor func */ |
| 400 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 401 | void (*ctor) (void *, struct kmem_cache *, unsigned long); |
| @@ -402,11 +403,11 @@ struct kmem_cache { | |||
| 402 | /* de-constructor func */ | 403 | /* de-constructor func */ |
| 403 | void (*dtor) (void *, struct kmem_cache *, unsigned long); | 404 | void (*dtor) (void *, struct kmem_cache *, unsigned long); |
| 404 | 405 | ||
| 405 | /* 4) cache creation/removal */ | 406 | /* 5) cache creation/removal */ |
| 406 | const char *name; | 407 | const char *name; |
| 407 | struct list_head next; | 408 | struct list_head next; |
| 408 | 409 | ||
| 409 | /* 5) statistics */ | 410 | /* 6) statistics */ |
| 410 | #if STATS | 411 | #if STATS |
| 411 | unsigned long num_active; | 412 | unsigned long num_active; |
| 412 | unsigned long num_allocations; | 413 | unsigned long num_allocations; |
| @@ -438,8 +439,9 @@ struct kmem_cache { | |||
| 438 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 439 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
| 439 | 440 | ||
| 440 | #define BATCHREFILL_LIMIT 16 | 441 | #define BATCHREFILL_LIMIT 16 |
| 441 | /* Optimization question: fewer reaps means less | 442 | /* |
| 442 | * probability for unnessary cpucache drain/refill cycles. | 443 | * Optimization question: fewer reaps means less probability for unnessary |
| 444 | * cpucache drain/refill cycles. | ||
| 443 | * | 445 | * |
| 444 | * OTOH the cpuarrays can contain lots of objects, | 446 | * OTOH the cpuarrays can contain lots of objects, |
| 445 | * which could lock up otherwise freeable slabs. | 447 | * which could lock up otherwise freeable slabs. |
| @@ -453,17 +455,19 @@ struct kmem_cache { | |||
| 453 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 455 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) |
| 454 | #define STATS_INC_GROWN(x) ((x)->grown++) | 456 | #define STATS_INC_GROWN(x) ((x)->grown++) |
| 455 | #define STATS_INC_REAPED(x) ((x)->reaped++) | 457 | #define STATS_INC_REAPED(x) ((x)->reaped++) |
| 456 | #define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ | 458 | #define STATS_SET_HIGH(x) \ |
| 457 | (x)->high_mark = (x)->num_active; \ | 459 | do { \ |
| 458 | } while (0) | 460 | if ((x)->num_active > (x)->high_mark) \ |
| 461 | (x)->high_mark = (x)->num_active; \ | ||
| 462 | } while (0) | ||
| 459 | #define STATS_INC_ERR(x) ((x)->errors++) | 463 | #define STATS_INC_ERR(x) ((x)->errors++) |
| 460 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) | 464 | #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) |
| 461 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) | 465 | #define STATS_INC_NODEFREES(x) ((x)->node_frees++) |
| 462 | #define STATS_SET_FREEABLE(x, i) \ | 466 | #define STATS_SET_FREEABLE(x, i) \ |
| 463 | do { if ((x)->max_freeable < i) \ | 467 | do { \ |
| 464 | (x)->max_freeable = i; \ | 468 | if ((x)->max_freeable < i) \ |
| 465 | } while (0) | 469 | (x)->max_freeable = i; \ |
| 466 | 470 | } while (0) | |
| 467 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) | 471 | #define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) |
| 468 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) | 472 | #define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) |
| 469 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) | 473 | #define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) |
| @@ -478,9 +482,7 @@ struct kmem_cache { | |||
| 478 | #define STATS_INC_ERR(x) do { } while (0) | 482 | #define STATS_INC_ERR(x) do { } while (0) |
| 479 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 483 | #define STATS_INC_NODEALLOCS(x) do { } while (0) |
| 480 | #define STATS_INC_NODEFREES(x) do { } while (0) | 484 | #define STATS_INC_NODEFREES(x) do { } while (0) |
| 481 | #define STATS_SET_FREEABLE(x, i) \ | 485 | #define STATS_SET_FREEABLE(x, i) do { } while (0) |
| 482 | do { } while (0) | ||
| 483 | |||
| 484 | #define STATS_INC_ALLOCHIT(x) do { } while (0) | 486 | #define STATS_INC_ALLOCHIT(x) do { } while (0) |
| 485 | #define STATS_INC_ALLOCMISS(x) do { } while (0) | 487 | #define STATS_INC_ALLOCMISS(x) do { } while (0) |
| 486 | #define STATS_INC_FREEHIT(x) do { } while (0) | 488 | #define STATS_INC_FREEHIT(x) do { } while (0) |
| @@ -488,7 +490,8 @@ struct kmem_cache { | |||
| 488 | #endif | 490 | #endif |
| 489 | 491 | ||
| 490 | #if DEBUG | 492 | #if DEBUG |
| 491 | /* Magic nums for obj red zoning. | 493 | /* |
| 494 | * Magic nums for obj red zoning. | ||
| 492 | * Placed in the first word before and the first word after an obj. | 495 | * Placed in the first word before and the first word after an obj. |
| 493 | */ | 496 | */ |
| 494 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ | 497 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ |
| @@ -499,7 +502,8 @@ struct kmem_cache { | |||
| 499 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ | 502 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ |
| 500 | #define POISON_END 0xa5 /* end-byte of poisoning */ | 503 | #define POISON_END 0xa5 /* end-byte of poisoning */ |
| 501 | 504 | ||
| 502 | /* memory layout of objects: | 505 | /* |
| 506 | * memory layout of objects: | ||
| 503 | * 0 : objp | 507 | * 0 : objp |
| 504 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that | 508 | * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that |
| 505 | * the end of an object is aligned with the end of the real | 509 | * the end of an object is aligned with the end of the real |
| @@ -508,7 +512,8 @@ struct kmem_cache { | |||
| 508 | * redzone word. | 512 | * redzone word. |
| 509 | * cachep->obj_offset: The real object. | 513 | * cachep->obj_offset: The real object. |
| 510 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] | 514 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] |
| 511 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] | 515 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address |
| 516 | * [BYTES_PER_WORD long] | ||
| 512 | */ | 517 | */ |
| 513 | static int obj_offset(struct kmem_cache *cachep) | 518 | static int obj_offset(struct kmem_cache *cachep) |
| 514 | { | 519 | { |
| @@ -552,8 +557,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
| 552 | #endif | 557 | #endif |
| 553 | 558 | ||
| 554 | /* | 559 | /* |
| 555 | * Maximum size of an obj (in 2^order pages) | 560 | * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp |
| 556 | * and absolute limit for the gfp order. | 561 | * order. |
| 557 | */ | 562 | */ |
| 558 | #if defined(CONFIG_LARGE_ALLOCS) | 563 | #if defined(CONFIG_LARGE_ALLOCS) |
| 559 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ | 564 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ |
| @@ -573,9 +578,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
| 573 | #define BREAK_GFP_ORDER_LO 0 | 578 | #define BREAK_GFP_ORDER_LO 0 |
| 574 | static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; | 579 | static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; |
| 575 | 580 | ||
| 576 | /* Functions for storing/retrieving the cachep and or slab from the | 581 | /* |
| 577 | * global 'mem_map'. These are used to find the slab an obj belongs to. | 582 | * Functions for storing/retrieving the cachep and or slab from the page |
| 578 | * With kfree(), these are used to find the cache which an obj belongs to. | 583 | * allocator. These are used to find the slab an obj belongs to. With kfree(), |
| 584 | * these are used to find the cache which an obj belongs to. | ||
| 579 | */ | 585 | */ |
| 580 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | 586 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) |
| 581 | { | 587 | { |
| @@ -584,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | |||
| 584 | 590 | ||
| 585 | static inline struct kmem_cache *page_get_cache(struct page *page) | 591 | static inline struct kmem_cache *page_get_cache(struct page *page) |
| 586 | { | 592 | { |
| 593 | if (unlikely(PageCompound(page))) | ||
| 594 | page = (struct page *)page_private(page); | ||
| 587 | return (struct kmem_cache *)page->lru.next; | 595 | return (struct kmem_cache *)page->lru.next; |
| 588 | } | 596 | } |
| 589 | 597 | ||
| @@ -594,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab) | |||
| 594 | 602 | ||
| 595 | static inline struct slab *page_get_slab(struct page *page) | 603 | static inline struct slab *page_get_slab(struct page *page) |
| 596 | { | 604 | { |
| 605 | if (unlikely(PageCompound(page))) | ||
| 606 | page = (struct page *)page_private(page); | ||
| 597 | return (struct slab *)page->lru.prev; | 607 | return (struct slab *)page->lru.prev; |
| 598 | } | 608 | } |
| 599 | 609 | ||
| @@ -609,7 +619,21 @@ static inline struct slab *virt_to_slab(const void *obj) | |||
| 609 | return page_get_slab(page); | 619 | return page_get_slab(page); |
| 610 | } | 620 | } |
| 611 | 621 | ||
| 612 | /* These are the default caches for kmalloc. Custom caches can have other sizes. */ | 622 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, |
| 623 | unsigned int idx) | ||
| 624 | { | ||
| 625 | return slab->s_mem + cache->buffer_size * idx; | ||
| 626 | } | ||
| 627 | |||
| 628 | static inline unsigned int obj_to_index(struct kmem_cache *cache, | ||
| 629 | struct slab *slab, void *obj) | ||
| 630 | { | ||
| 631 | return (unsigned)(obj - slab->s_mem) / cache->buffer_size; | ||
| 632 | } | ||
| 633 | |||
| 634 | /* | ||
| 635 | * These are the default caches for kmalloc. Custom caches can have other sizes. | ||
| 636 | */ | ||
| 613 | struct cache_sizes malloc_sizes[] = { | 637 | struct cache_sizes malloc_sizes[] = { |
| 614 | #define CACHE(x) { .cs_size = (x) }, | 638 | #define CACHE(x) { .cs_size = (x) }, |
| 615 | #include <linux/kmalloc_sizes.h> | 639 | #include <linux/kmalloc_sizes.h> |
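A hedged aside on index_to_obj() and obj_to_index() introduced above: they centralise the object-array arithmetic that was previously open-coded, and for any in-range index they round-trip exactly. A sketch of that property (only meaningful inside mm/slab.c where the helpers are defined; the function name is made up):

static void example_check_index(struct kmem_cache *cachep, struct slab *slabp,
                                unsigned int idx)
{
        void *obj = index_to_obj(cachep, slabp, idx);   /* s_mem + idx * buffer_size */

        BUG_ON(obj_to_index(cachep, slabp, obj) != idx);
}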
| @@ -642,8 +666,6 @@ static struct kmem_cache cache_cache = { | |||
| 642 | .limit = BOOT_CPUCACHE_ENTRIES, | 666 | .limit = BOOT_CPUCACHE_ENTRIES, |
| 643 | .shared = 1, | 667 | .shared = 1, |
| 644 | .buffer_size = sizeof(struct kmem_cache), | 668 | .buffer_size = sizeof(struct kmem_cache), |
| 645 | .flags = SLAB_NO_REAP, | ||
| 646 | .spinlock = SPIN_LOCK_UNLOCKED, | ||
| 647 | .name = "kmem_cache", | 669 | .name = "kmem_cache", |
| 648 | #if DEBUG | 670 | #if DEBUG |
| 649 | .obj_size = sizeof(struct kmem_cache), | 671 | .obj_size = sizeof(struct kmem_cache), |
| @@ -655,8 +677,8 @@ static DEFINE_MUTEX(cache_chain_mutex); | |||
| 655 | static struct list_head cache_chain; | 677 | static struct list_head cache_chain; |
| 656 | 678 | ||
| 657 | /* | 679 | /* |
| 658 | * vm_enough_memory() looks at this to determine how many | 680 | * vm_enough_memory() looks at this to determine how many slab-allocated pages |
| 659 | * slab-allocated pages are possibly freeable under pressure | 681 | * are possibly freeable under pressure |
| 660 | * | 682 | * |
| 661 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab | 683 | * SLAB_RECLAIM_ACCOUNT turns this on per-slab |
| 662 | */ | 684 | */ |
| @@ -675,7 +697,8 @@ static enum { | |||
| 675 | 697 | ||
| 676 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 698 | static DEFINE_PER_CPU(struct work_struct, reap_work); |
| 677 | 699 | ||
| 678 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); | 700 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
| 701 | int node); | ||
| 679 | static void enable_cpucache(struct kmem_cache *cachep); | 702 | static void enable_cpucache(struct kmem_cache *cachep); |
| 680 | static void cache_reap(void *unused); | 703 | static void cache_reap(void *unused); |
| 681 | static int __node_shrink(struct kmem_cache *cachep, int node); | 704 | static int __node_shrink(struct kmem_cache *cachep, int node); |
| @@ -685,7 +708,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | |||
| 685 | return cachep->array[smp_processor_id()]; | 708 | return cachep->array[smp_processor_id()]; |
| 686 | } | 709 | } |
| 687 | 710 | ||
| 688 | static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) | 711 | static inline struct kmem_cache *__find_general_cachep(size_t size, |
| 712 | gfp_t gfpflags) | ||
| 689 | { | 713 | { |
| 690 | struct cache_sizes *csizep = malloc_sizes; | 714 | struct cache_sizes *csizep = malloc_sizes; |
| 691 | 715 | ||
| @@ -720,8 +744,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align) | |||
| 720 | return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); | 744 | return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); |
| 721 | } | 745 | } |
| 722 | 746 | ||
| 723 | /* Calculate the number of objects and left-over bytes for a given | 747 | /* |
| 724 | buffer size. */ | 748 | * Calculate the number of objects and left-over bytes for a given buffer size. |
| 749 | */ | ||
| 725 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, | 750 | static void cache_estimate(unsigned long gfporder, size_t buffer_size, |
| 726 | size_t align, int flags, size_t *left_over, | 751 | size_t align, int flags, size_t *left_over, |
| 727 | unsigned int *num) | 752 | unsigned int *num) |
| @@ -782,7 +807,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, | |||
| 782 | 807 | ||
| 783 | #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) | 808 | #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) |
| 784 | 809 | ||
| 785 | static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) | 810 | static void __slab_error(const char *function, struct kmem_cache *cachep, |
| 811 | char *msg) | ||
| 786 | { | 812 | { |
| 787 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 813 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
| 788 | function, cachep->name, msg); | 814 | function, cachep->name, msg); |
| @@ -804,7 +830,7 @@ static void init_reap_node(int cpu) | |||
| 804 | 830 | ||
| 805 | node = next_node(cpu_to_node(cpu), node_online_map); | 831 | node = next_node(cpu_to_node(cpu), node_online_map); |
| 806 | if (node == MAX_NUMNODES) | 832 | if (node == MAX_NUMNODES) |
| 807 | node = 0; | 833 | node = first_node(node_online_map); |
| 808 | 834 | ||
| 809 | __get_cpu_var(reap_node) = node; | 835 | __get_cpu_var(reap_node) = node; |
| 810 | } | 836 | } |
| @@ -906,10 +932,8 @@ static void free_alien_cache(struct array_cache **ac_ptr) | |||
| 906 | 932 | ||
| 907 | if (!ac_ptr) | 933 | if (!ac_ptr) |
| 908 | return; | 934 | return; |
| 909 | |||
| 910 | for_each_node(i) | 935 | for_each_node(i) |
| 911 | kfree(ac_ptr[i]); | 936 | kfree(ac_ptr[i]); |
| 912 | |||
| 913 | kfree(ac_ptr); | 937 | kfree(ac_ptr); |
| 914 | } | 938 | } |
| 915 | 939 | ||
| @@ -943,7 +967,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | |||
| 943 | } | 967 | } |
| 944 | } | 968 | } |
| 945 | 969 | ||
| 946 | static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) | 970 | static void drain_alien_cache(struct kmem_cache *cachep, |
| 971 | struct array_cache **alien) | ||
| 947 | { | 972 | { |
| 948 | int i = 0; | 973 | int i = 0; |
| 949 | struct array_cache *ac; | 974 | struct array_cache *ac; |
| @@ -986,20 +1011,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 986 | switch (action) { | 1011 | switch (action) { |
| 987 | case CPU_UP_PREPARE: | 1012 | case CPU_UP_PREPARE: |
| 988 | mutex_lock(&cache_chain_mutex); | 1013 | mutex_lock(&cache_chain_mutex); |
| 989 | /* we need to do this right in the beginning since | 1014 | /* |
| 1015 | * We need to do this right in the beginning since | ||
| 990 | * alloc_arraycache's are going to use this list. | 1016 | * alloc_arraycache's are going to use this list. |
| 991 | * kmalloc_node allows us to add the slab to the right | 1017 | * kmalloc_node allows us to add the slab to the right |
| 992 | * kmem_list3 and not this cpu's kmem_list3 | 1018 | * kmem_list3 and not this cpu's kmem_list3 |
| 993 | */ | 1019 | */ |
| 994 | 1020 | ||
| 995 | list_for_each_entry(cachep, &cache_chain, next) { | 1021 | list_for_each_entry(cachep, &cache_chain, next) { |
| 996 | /* setup the size64 kmemlist for cpu before we can | 1022 | /* |
| 1023 | * Set up the size64 kmemlist for cpu before we can | ||
| 997 | * begin anything. Make sure some other cpu on this | 1024 | * begin anything. Make sure some other cpu on this |
| 998 | * node has not already allocated this | 1025 | * node has not already allocated this |
| 999 | */ | 1026 | */ |
| 1000 | if (!cachep->nodelists[node]) { | 1027 | if (!cachep->nodelists[node]) { |
| 1001 | if (!(l3 = kmalloc_node(memsize, | 1028 | l3 = kmalloc_node(memsize, GFP_KERNEL, node); |
| 1002 | GFP_KERNEL, node))) | 1029 | if (!l3) |
| 1003 | goto bad; | 1030 | goto bad; |
| 1004 | kmem_list3_init(l3); | 1031 | kmem_list3_init(l3); |
| 1005 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 1032 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
| @@ -1015,13 +1042,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 1015 | 1042 | ||
| 1016 | spin_lock_irq(&cachep->nodelists[node]->list_lock); | 1043 | spin_lock_irq(&cachep->nodelists[node]->list_lock); |
| 1017 | cachep->nodelists[node]->free_limit = | 1044 | cachep->nodelists[node]->free_limit = |
| 1018 | (1 + nr_cpus_node(node)) * | 1045 | (1 + nr_cpus_node(node)) * |
| 1019 | cachep->batchcount + cachep->num; | 1046 | cachep->batchcount + cachep->num; |
| 1020 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); | 1047 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); |
| 1021 | } | 1048 | } |
| 1022 | 1049 | ||
| 1023 | /* Now we can go ahead with allocating the shared array's | 1050 | /* |
| 1024 | & array cache's */ | 1051 | * Now we can go ahead with allocating the shared arrays and |
| 1052 | * array caches | ||
| 1053 | */ | ||
| 1025 | list_for_each_entry(cachep, &cache_chain, next) { | 1054 | list_for_each_entry(cachep, &cache_chain, next) { |
| 1026 | struct array_cache *nc; | 1055 | struct array_cache *nc; |
| 1027 | struct array_cache *shared; | 1056 | struct array_cache *shared; |
| @@ -1041,7 +1070,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 1041 | if (!alien) | 1070 | if (!alien) |
| 1042 | goto bad; | 1071 | goto bad; |
| 1043 | cachep->array[cpu] = nc; | 1072 | cachep->array[cpu] = nc; |
| 1044 | |||
| 1045 | l3 = cachep->nodelists[node]; | 1073 | l3 = cachep->nodelists[node]; |
| 1046 | BUG_ON(!l3); | 1074 | BUG_ON(!l3); |
| 1047 | 1075 | ||
| @@ -1061,7 +1089,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 1061 | } | 1089 | } |
| 1062 | #endif | 1090 | #endif |
| 1063 | spin_unlock_irq(&l3->list_lock); | 1091 | spin_unlock_irq(&l3->list_lock); |
| 1064 | |||
| 1065 | kfree(shared); | 1092 | kfree(shared); |
| 1066 | free_alien_cache(alien); | 1093 | free_alien_cache(alien); |
| 1067 | } | 1094 | } |
| @@ -1083,7 +1110,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
| 1083 | /* fall thru */ | 1110 | /* fall thru */ |
| 1084 | case CPU_UP_CANCELED: | 1111 | case CPU_UP_CANCELED: |
| 1085 | mutex_lock(&cache_chain_mutex); | 1112 | mutex_lock(&cache_chain_mutex); |
| 1086 | |||
| 1087 | list_for_each_entry(cachep, &cache_chain, next) { | 1113 | list_for_each_entry(cachep, &cache_chain, next) { |
| 1088 | struct array_cache *nc; | 1114 | struct array_cache *nc; |
| 1089 | struct array_cache *shared; | 1115 | struct array_cache *shared; |
| @@ -1150,7 +1176,7 @@ free_array_cache: | |||
| 1150 | #endif | 1176 | #endif |
| 1151 | } | 1177 | } |
| 1152 | return NOTIFY_OK; | 1178 | return NOTIFY_OK; |
| 1153 | bad: | 1179 | bad: |
| 1154 | mutex_unlock(&cache_chain_mutex); | 1180 | mutex_unlock(&cache_chain_mutex); |
| 1155 | return NOTIFY_BAD; | 1181 | return NOTIFY_BAD; |
| 1156 | } | 1182 | } |
| @@ -1160,7 +1186,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | |||
| 1160 | /* | 1186 | /* |
| 1161 | * swap the static kmem_list3 with kmalloced memory | 1187 | * swap the static kmem_list3 with kmalloced memory |
| 1162 | */ | 1188 | */ |
| 1163 | static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) | 1189 | static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, |
| 1190 | int nodeid) | ||
| 1164 | { | 1191 | { |
| 1165 | struct kmem_list3 *ptr; | 1192 | struct kmem_list3 *ptr; |
| 1166 | 1193 | ||
| @@ -1175,8 +1202,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no | |||
| 1175 | local_irq_enable(); | 1202 | local_irq_enable(); |
| 1176 | } | 1203 | } |
| 1177 | 1204 | ||
| 1178 | /* Initialisation. | 1205 | /* |
| 1179 | * Called after the gfp() functions have been enabled, and before smp_init(). | 1206 | * Initialisation. Called after the page allocator has been initialised and |
| 1207 | * before smp_init(). | ||
| 1180 | */ | 1208 | */ |
| 1181 | void __init kmem_cache_init(void) | 1209 | void __init kmem_cache_init(void) |
| 1182 | { | 1210 | { |
| @@ -1201,9 +1229,9 @@ void __init kmem_cache_init(void) | |||
| 1201 | 1229 | ||
| 1202 | /* Bootstrap is tricky, because several objects are allocated | 1230 | /* Bootstrap is tricky, because several objects are allocated |
| 1203 | * from caches that do not exist yet: | 1231 | * from caches that do not exist yet: |
| 1204 | * 1) initialize the cache_cache cache: it contains the struct kmem_cache | 1232 | * 1) initialize the cache_cache cache: it contains the struct |
| 1205 | * structures of all caches, except cache_cache itself: cache_cache | 1233 | * kmem_cache structures of all caches, except cache_cache itself: |
| 1206 | * is statically allocated. | 1234 | * cache_cache is statically allocated. |
| 1207 | * Initially an __init data area is used for the head array and the | 1235 | * Initially an __init data area is used for the head array and the |
| 1208 | * kmem_list3 structures, it's replaced with a kmalloc allocated | 1236 | * kmem_list3 structures, it's replaced with a kmalloc allocated |
| 1209 | * array at the end of the bootstrap. | 1237 | * array at the end of the bootstrap. |
| @@ -1226,7 +1254,8 @@ void __init kmem_cache_init(void) | |||
| 1226 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1254 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; |
| 1227 | cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; | 1255 | cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; |
| 1228 | 1256 | ||
| 1229 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); | 1257 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, |
| 1258 | cache_line_size()); | ||
| 1230 | 1259 | ||
| 1231 | for (order = 0; order < MAX_ORDER; order++) { | 1260 | for (order = 0; order < MAX_ORDER; order++) { |
| 1232 | cache_estimate(order, cache_cache.buffer_size, | 1261 | cache_estimate(order, cache_cache.buffer_size, |
| @@ -1245,24 +1274,26 @@ void __init kmem_cache_init(void) | |||
| 1245 | sizes = malloc_sizes; | 1274 | sizes = malloc_sizes; |
| 1246 | names = cache_names; | 1275 | names = cache_names; |
| 1247 | 1276 | ||
| 1248 | /* Initialize the caches that provide memory for the array cache | 1277 | /* |
| 1249 | * and the kmem_list3 structures first. | 1278 | * Initialize the caches that provide memory for the array cache and the |
| 1250 | * Without this, further allocations will bug | 1279 | * kmem_list3 structures first. Without this, further allocations will |
| 1280 | * bug. | ||
| 1251 | */ | 1281 | */ |
| 1252 | 1282 | ||
| 1253 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1283 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, |
| 1254 | sizes[INDEX_AC].cs_size, | 1284 | sizes[INDEX_AC].cs_size, |
| 1255 | ARCH_KMALLOC_MINALIGN, | 1285 | ARCH_KMALLOC_MINALIGN, |
| 1256 | (ARCH_KMALLOC_FLAGS | | 1286 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
| 1257 | SLAB_PANIC), NULL, NULL); | 1287 | NULL, NULL); |
| 1258 | 1288 | ||
| 1259 | if (INDEX_AC != INDEX_L3) | 1289 | if (INDEX_AC != INDEX_L3) { |
| 1260 | sizes[INDEX_L3].cs_cachep = | 1290 | sizes[INDEX_L3].cs_cachep = |
| 1261 | kmem_cache_create(names[INDEX_L3].name, | 1291 | kmem_cache_create(names[INDEX_L3].name, |
| 1262 | sizes[INDEX_L3].cs_size, | 1292 | sizes[INDEX_L3].cs_size, |
| 1263 | ARCH_KMALLOC_MINALIGN, | 1293 | ARCH_KMALLOC_MINALIGN, |
| 1264 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, | 1294 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
| 1265 | NULL); | 1295 | NULL, NULL); |
| 1296 | } | ||
| 1266 | 1297 | ||
| 1267 | while (sizes->cs_size != ULONG_MAX) { | 1298 | while (sizes->cs_size != ULONG_MAX) { |
| 1268 | /* | 1299 | /* |
| @@ -1272,13 +1303,13 @@ void __init kmem_cache_init(void) | |||
| 1272 | * Note for systems short on memory removing the alignment will | 1303 | * Note for systems short on memory removing the alignment will |
| 1273 | * allow tighter packing of the smaller caches. | 1304 | * allow tighter packing of the smaller caches. |
| 1274 | */ | 1305 | */ |
| 1275 | if (!sizes->cs_cachep) | 1306 | if (!sizes->cs_cachep) { |
| 1276 | sizes->cs_cachep = kmem_cache_create(names->name, | 1307 | sizes->cs_cachep = kmem_cache_create(names->name, |
| 1277 | sizes->cs_size, | 1308 | sizes->cs_size, |
| 1278 | ARCH_KMALLOC_MINALIGN, | 1309 | ARCH_KMALLOC_MINALIGN, |
| 1279 | (ARCH_KMALLOC_FLAGS | 1310 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
| 1280 | | SLAB_PANIC), | 1311 | NULL, NULL); |
| 1281 | NULL, NULL); | 1312 | } |
| 1282 | 1313 | ||
| 1283 | /* Inc off-slab bufctl limit until the ceiling is hit. */ | 1314 | /* Inc off-slab bufctl limit until the ceiling is hit. */ |
| 1284 | if (!(OFF_SLAB(sizes->cs_cachep))) { | 1315 | if (!(OFF_SLAB(sizes->cs_cachep))) { |
| @@ -1287,13 +1318,11 @@ void __init kmem_cache_init(void) | |||
| 1287 | } | 1318 | } |
| 1288 | 1319 | ||
| 1289 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, | 1320 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, |
| 1290 | sizes->cs_size, | 1321 | sizes->cs_size, |
| 1291 | ARCH_KMALLOC_MINALIGN, | 1322 | ARCH_KMALLOC_MINALIGN, |
| 1292 | (ARCH_KMALLOC_FLAGS | | 1323 | ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA| |
| 1293 | SLAB_CACHE_DMA | | 1324 | SLAB_PANIC, |
| 1294 | SLAB_PANIC), NULL, | 1325 | NULL, NULL); |
| 1295 | NULL); | ||
| 1296 | |||
| 1297 | sizes++; | 1326 | sizes++; |
| 1298 | names++; | 1327 | names++; |
| 1299 | } | 1328 | } |
| @@ -1345,20 +1374,22 @@ void __init kmem_cache_init(void) | |||
| 1345 | struct kmem_cache *cachep; | 1374 | struct kmem_cache *cachep; |
| 1346 | mutex_lock(&cache_chain_mutex); | 1375 | mutex_lock(&cache_chain_mutex); |
| 1347 | list_for_each_entry(cachep, &cache_chain, next) | 1376 | list_for_each_entry(cachep, &cache_chain, next) |
| 1348 | enable_cpucache(cachep); | 1377 | enable_cpucache(cachep); |
| 1349 | mutex_unlock(&cache_chain_mutex); | 1378 | mutex_unlock(&cache_chain_mutex); |
| 1350 | } | 1379 | } |
| 1351 | 1380 | ||
| 1352 | /* Done! */ | 1381 | /* Done! */ |
| 1353 | g_cpucache_up = FULL; | 1382 | g_cpucache_up = FULL; |
| 1354 | 1383 | ||
| 1355 | /* Register a cpu startup notifier callback | 1384 | /* |
| 1356 | * that initializes cpu_cache_get for all new cpus | 1385 | * Register a cpu startup notifier callback that initializes |
| 1386 | * cpu_cache_get for all new cpus | ||
| 1357 | */ | 1387 | */ |
| 1358 | register_cpu_notifier(&cpucache_notifier); | 1388 | register_cpu_notifier(&cpucache_notifier); |
| 1359 | 1389 | ||
| 1360 | /* The reap timers are started later, with a module init call: | 1390 | /* |
| 1361 | * That part of the kernel is not yet operational. | 1391 | * The reap timers are started later, with a module init call: That part |
| 1392 | * of the kernel is not yet operational. | ||
| 1362 | */ | 1393 | */ |
| 1363 | } | 1394 | } |
| 1364 | 1395 | ||
| @@ -1366,16 +1397,13 @@ static int __init cpucache_init(void) | |||
| 1366 | { | 1397 | { |
| 1367 | int cpu; | 1398 | int cpu; |
| 1368 | 1399 | ||
| 1369 | /* | 1400 | /* |
| 1370 | * Register the timers that return unneeded | 1401 | * Register the timers that return unneeded pages to the page allocator |
| 1371 | * pages to gfp. | ||
| 1372 | */ | 1402 | */ |
| 1373 | for_each_online_cpu(cpu) | 1403 | for_each_online_cpu(cpu) |
| 1374 | start_cpu_timer(cpu); | 1404 | start_cpu_timer(cpu); |
| 1375 | |||
| 1376 | return 0; | 1405 | return 0; |
| 1377 | } | 1406 | } |
| 1378 | |||
| 1379 | __initcall(cpucache_init); | 1407 | __initcall(cpucache_init); |
| 1380 | 1408 | ||
| 1381 | /* | 1409 | /* |
| @@ -1402,7 +1430,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1402 | atomic_add(i, &slab_reclaim_pages); | 1430 | atomic_add(i, &slab_reclaim_pages); |
| 1403 | add_page_state(nr_slab, i); | 1431 | add_page_state(nr_slab, i); |
| 1404 | while (i--) { | 1432 | while (i--) { |
| 1405 | SetPageSlab(page); | 1433 | __SetPageSlab(page); |
| 1406 | page++; | 1434 | page++; |
| 1407 | } | 1435 | } |
| 1408 | return addr; | 1436 | return addr; |
| @@ -1418,8 +1446,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
| 1418 | const unsigned long nr_freed = i; | 1446 | const unsigned long nr_freed = i; |
| 1419 | 1447 | ||
| 1420 | while (i--) { | 1448 | while (i--) { |
| 1421 | if (!TestClearPageSlab(page)) | 1449 | BUG_ON(!PageSlab(page)); |
| 1422 | BUG(); | 1450 | __ClearPageSlab(page); |
| 1423 | page++; | 1451 | page++; |
| 1424 | } | 1452 | } |
| 1425 | sub_page_state(nr_slab, nr_freed); | 1453 | sub_page_state(nr_slab, nr_freed); |
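The kmem_getpages()/kmem_freepages() hunks above switch from the atomic SetPageSlab/TestClearPageSlab bit operations to the non-atomic __SetPageSlab/__ClearPageSlab variants plus an explicit BUG_ON(), presumably acceptable because the slab allocator owns these pages exclusively at this point. The sketch below only illustrates the control-flow difference, not the atomicity; a plain bitmask stands in for page->flags:

    #include <assert.h>
    #include <stdio.h>

    #define PG_slab (1u << 0)

    struct page_model { unsigned int flags; };

    /* old idiom: combined test-and-clear, complain if the flag was not set */
    static void old_free_check(struct page_model *page)
    {
        unsigned int was_set = page->flags & PG_slab;

        page->flags &= ~PG_slab;
        assert(was_set);                /* if (!TestClearPageSlab(page)) BUG(); */
    }

    /* new idiom: assert first, then a plain (non-atomic) clear */
    static void new_free_check(struct page_model *page)
    {
        assert(page->flags & PG_slab);  /* BUG_ON(!PageSlab(page)); */
        page->flags &= ~PG_slab;        /* __ClearPageSlab(page); */
    }

    int main(void)
    {
        struct page_model a = { .flags = PG_slab };
        struct page_model b = { .flags = PG_slab };

        old_free_check(&a);
        new_free_check(&b);
        printf("flags after free: %u %u\n", a.flags, b.flags);
        return 0;
    }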
| @@ -1489,9 +1517,8 @@ static void dump_line(char *data, int offset, int limit) | |||
| 1489 | { | 1517 | { |
| 1490 | int i; | 1518 | int i; |
| 1491 | printk(KERN_ERR "%03x:", offset); | 1519 | printk(KERN_ERR "%03x:", offset); |
| 1492 | for (i = 0; i < limit; i++) { | 1520 | for (i = 0; i < limit; i++) |
| 1493 | printk(" %02x", (unsigned char)data[offset + i]); | 1521 | printk(" %02x", (unsigned char)data[offset + i]); |
| 1494 | } | ||
| 1495 | printk("\n"); | 1522 | printk("\n"); |
| 1496 | } | 1523 | } |
| 1497 | #endif | 1524 | #endif |
| @@ -1505,15 +1532,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
| 1505 | 1532 | ||
| 1506 | if (cachep->flags & SLAB_RED_ZONE) { | 1533 | if (cachep->flags & SLAB_RED_ZONE) { |
| 1507 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | 1534 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", |
| 1508 | *dbg_redzone1(cachep, objp), | 1535 | *dbg_redzone1(cachep, objp), |
| 1509 | *dbg_redzone2(cachep, objp)); | 1536 | *dbg_redzone2(cachep, objp)); |
| 1510 | } | 1537 | } |
| 1511 | 1538 | ||
| 1512 | if (cachep->flags & SLAB_STORE_USER) { | 1539 | if (cachep->flags & SLAB_STORE_USER) { |
| 1513 | printk(KERN_ERR "Last user: [<%p>]", | 1540 | printk(KERN_ERR "Last user: [<%p>]", |
| 1514 | *dbg_userword(cachep, objp)); | 1541 | *dbg_userword(cachep, objp)); |
| 1515 | print_symbol("(%s)", | 1542 | print_symbol("(%s)", |
| 1516 | (unsigned long)*dbg_userword(cachep, objp)); | 1543 | (unsigned long)*dbg_userword(cachep, objp)); |
| 1517 | printk("\n"); | 1544 | printk("\n"); |
| 1518 | } | 1545 | } |
| 1519 | realobj = (char *)objp + obj_offset(cachep); | 1546 | realobj = (char *)objp + obj_offset(cachep); |
| @@ -1546,8 +1573,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
| 1546 | /* Print header */ | 1573 | /* Print header */ |
| 1547 | if (lines == 0) { | 1574 | if (lines == 0) { |
| 1548 | printk(KERN_ERR | 1575 | printk(KERN_ERR |
| 1549 | "Slab corruption: start=%p, len=%d\n", | 1576 | "Slab corruption: start=%p, len=%d\n", |
| 1550 | realobj, size); | 1577 | realobj, size); |
| 1551 | print_objinfo(cachep, objp, 0); | 1578 | print_objinfo(cachep, objp, 0); |
| 1552 | } | 1579 | } |
| 1553 | /* Hexdump the affected line */ | 1580 | /* Hexdump the affected line */ |
| @@ -1568,18 +1595,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
| 1568 | * exist: | 1595 | * exist: |
| 1569 | */ | 1596 | */ |
| 1570 | struct slab *slabp = virt_to_slab(objp); | 1597 | struct slab *slabp = virt_to_slab(objp); |
| 1571 | int objnr; | 1598 | unsigned int objnr; |
| 1572 | 1599 | ||
| 1573 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 1600 | objnr = obj_to_index(cachep, slabp, objp); |
| 1574 | if (objnr) { | 1601 | if (objnr) { |
| 1575 | objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; | 1602 | objp = index_to_obj(cachep, slabp, objnr - 1); |
| 1576 | realobj = (char *)objp + obj_offset(cachep); | 1603 | realobj = (char *)objp + obj_offset(cachep); |
| 1577 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", | 1604 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", |
| 1578 | realobj, size); | 1605 | realobj, size); |
| 1579 | print_objinfo(cachep, objp, 2); | 1606 | print_objinfo(cachep, objp, 2); |
| 1580 | } | 1607 | } |
| 1581 | if (objnr + 1 < cachep->num) { | 1608 | if (objnr + 1 < cachep->num) { |
| 1582 | objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; | 1609 | objp = index_to_obj(cachep, slabp, objnr + 1); |
| 1583 | realobj = (char *)objp + obj_offset(cachep); | 1610 | realobj = (char *)objp + obj_offset(cachep); |
| 1584 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", | 1611 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", |
| 1585 | realobj, size); | 1612 | realobj, size); |
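The open-coded index arithmetic removed above, (objp - slabp->s_mem) / cachep->buffer_size and s_mem + objnr * buffer_size, is now wrapped by obj_to_index()/index_to_obj(). A standalone sketch of exactly that arithmetic, using simplified stand-in structures:

    #include <stdio.h>
    #include <stddef.h>

    /* minimal stand-ins for the slab structures the helpers operate on */
    struct cache_model { size_t buffer_size; };
    struct slab_model  { char *s_mem; };

    /* object index -> object address: s_mem + index * buffer_size */
    static void *index_to_obj(struct cache_model *c, struct slab_model *s,
                              unsigned int idx)
    {
        return s->s_mem + idx * c->buffer_size;
    }

    /* object address -> object index: (objp - s_mem) / buffer_size */
    static unsigned int obj_to_index(struct cache_model *c, struct slab_model *s,
                                     void *objp)
    {
        return (unsigned int)(((char *)objp - s->s_mem) / c->buffer_size);
    }

    int main(void)
    {
        static char storage[1024];
        struct cache_model cache = { .buffer_size = 128 };
        struct slab_model  slab  = { .s_mem = storage };

        void *obj3 = index_to_obj(&cache, &slab, 3);
        printf("index of obj3: %u\n", obj_to_index(&cache, &slab, obj3)); /* 3 */
        return 0;
    }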
| @@ -1591,22 +1618,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
| 1591 | 1618 | ||
| 1592 | #if DEBUG | 1619 | #if DEBUG |
| 1593 | /** | 1620 | /** |
| 1594 | * slab_destroy_objs - call the registered destructor for each object in | 1621 | * slab_destroy_objs - destroy a slab and its objects |
| 1595 | * a slab that is to be destroyed. | 1622 | * @cachep: cache pointer being destroyed |
| 1623 | * @slabp: slab pointer being destroyed | ||
| 1624 | * | ||
| 1625 | * Call the registered destructor for each object in a slab that is being | ||
| 1626 | * destroyed. | ||
| 1596 | */ | 1627 | */ |
| 1597 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | 1628 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) |
| 1598 | { | 1629 | { |
| 1599 | int i; | 1630 | int i; |
| 1600 | for (i = 0; i < cachep->num; i++) { | 1631 | for (i = 0; i < cachep->num; i++) { |
| 1601 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 1632 | void *objp = index_to_obj(cachep, slabp, i); |
| 1602 | 1633 | ||
| 1603 | if (cachep->flags & SLAB_POISON) { | 1634 | if (cachep->flags & SLAB_POISON) { |
| 1604 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1635 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 1605 | if ((cachep->buffer_size % PAGE_SIZE) == 0 | 1636 | if (cachep->buffer_size % PAGE_SIZE == 0 && |
| 1606 | && OFF_SLAB(cachep)) | 1637 | OFF_SLAB(cachep)) |
| 1607 | kernel_map_pages(virt_to_page(objp), | 1638 | kernel_map_pages(virt_to_page(objp), |
| 1608 | cachep->buffer_size / PAGE_SIZE, | 1639 | cachep->buffer_size / PAGE_SIZE, 1); |
| 1609 | 1); | ||
| 1610 | else | 1640 | else |
| 1611 | check_poison_obj(cachep, objp); | 1641 | check_poison_obj(cachep, objp); |
| 1612 | #else | 1642 | #else |
| @@ -1631,7 +1661,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
| 1631 | if (cachep->dtor) { | 1661 | if (cachep->dtor) { |
| 1632 | int i; | 1662 | int i; |
| 1633 | for (i = 0; i < cachep->num; i++) { | 1663 | for (i = 0; i < cachep->num; i++) { |
| 1634 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 1664 | void *objp = index_to_obj(cachep, slabp, i); |
| 1635 | (cachep->dtor) (objp, cachep, 0); | 1665 | (cachep->dtor) (objp, cachep, 0); |
| 1636 | } | 1666 | } |
| 1637 | } | 1667 | } |
| @@ -1639,9 +1669,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
| 1639 | #endif | 1669 | #endif |
| 1640 | 1670 | ||
| 1641 | /** | 1671 | /** |
| 1672 | * slab_destroy - destroy and release all objects in a slab | ||
| 1673 | * @cachep: cache pointer being destroyed | ||
| 1674 | * @slabp: slab pointer being destroyed | ||
| 1675 | * | ||
| 1642 | * Destroy all the objs in a slab, and release the mem back to the system. | 1676 | * Destroy all the objs in a slab, and release the mem back to the system. |
| 1643 | * Before calling the slab must have been unlinked from the cache. | 1677 | * Before calling the slab must have been unlinked from the cache. The |
| 1644 | * The cache-lock is not held/needed. | 1678 | * cache-lock is not held/needed. |
| 1645 | */ | 1679 | */ |
| 1646 | static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | 1680 | static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) |
| 1647 | { | 1681 | { |
| @@ -1662,8 +1696,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) | |||
| 1662 | } | 1696 | } |
| 1663 | } | 1697 | } |
| 1664 | 1698 | ||
| 1665 | /* For setting up all the kmem_list3s for cache whose buffer_size is same | 1699 | /* |
| 1666 | as size of kmem_list3. */ | 1700 | * For setting up all the kmem_list3s for cache whose buffer_size is same as |
| 1701 | * size of kmem_list3. | ||
| 1702 | */ | ||
| 1667 | static void set_up_list3s(struct kmem_cache *cachep, int index) | 1703 | static void set_up_list3s(struct kmem_cache *cachep, int index) |
| 1668 | { | 1704 | { |
| 1669 | int node; | 1705 | int node; |
| @@ -1689,13 +1725,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index) | |||
| 1689 | * high order pages for slabs. When the gfp() functions are more friendly | 1725 | * high order pages for slabs. When the gfp() functions are more friendly |
| 1690 | * towards high-order requests, this should be changed. | 1726 | * towards high-order requests, this should be changed. |
| 1691 | */ | 1727 | */ |
| 1692 | static inline size_t calculate_slab_order(struct kmem_cache *cachep, | 1728 | static size_t calculate_slab_order(struct kmem_cache *cachep, |
| 1693 | size_t size, size_t align, unsigned long flags) | 1729 | size_t size, size_t align, unsigned long flags) |
| 1694 | { | 1730 | { |
| 1695 | size_t left_over = 0; | 1731 | size_t left_over = 0; |
| 1696 | int gfporder; | 1732 | int gfporder; |
| 1697 | 1733 | ||
| 1698 | for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { | 1734 | for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { |
| 1699 | unsigned int num; | 1735 | unsigned int num; |
| 1700 | size_t remainder; | 1736 | size_t remainder; |
| 1701 | 1737 | ||
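calculate_slab_order(), no longer forced inline above, walks increasing page orders for a slab and, a little further down, accepts the first order whose leftover space is at most one eighth of the slab (left_over * 8 <= PAGE_SIZE << gfporder). A toy walk of that rule, ignoring the per-slab management overhead that cache_estimate() accounts for and using a made-up object size:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
        unsigned long size = 1100;      /* made-up object size */

        for (int gfporder = 0; gfporder <= 3; gfporder++) {
            unsigned long slab_bytes = PAGE_SIZE << gfporder;
            unsigned long num = slab_bytes / size;          /* objects per slab */
            unsigned long left_over = slab_bytes - num * size;

            /* accept this order once waste is at most 1/8 of the slab */
            int ok = left_over * 8 <= slab_bytes;

            printf("order %d: %lu objs, %lu bytes left over -> %s\n",
                   gfporder, num, left_over,
                   ok ? "acceptable" : "too much waste");
            if (ok)
                break;
        }
        return 0;
    }

For this object size an order-0 slab wastes 796 of 4096 bytes and is rejected, while an order-1 slab wastes 492 of 8192 bytes and passes the one-eighth test.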
| @@ -1730,12 +1766,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
| 1730 | /* | 1766 | /* |
| 1731 | * Acceptable internal fragmentation? | 1767 | * Acceptable internal fragmentation? |
| 1732 | */ | 1768 | */ |
| 1733 | if ((left_over * 8) <= (PAGE_SIZE << gfporder)) | 1769 | if (left_over * 8 <= (PAGE_SIZE << gfporder)) |
| 1734 | break; | 1770 | break; |
| 1735 | } | 1771 | } |
| 1736 | return left_over; | 1772 | return left_over; |
| 1737 | } | 1773 | } |
| 1738 | 1774 | ||
| 1775 | static void setup_cpu_cache(struct kmem_cache *cachep) | ||
| 1776 | { | ||
| 1777 | if (g_cpucache_up == FULL) { | ||
| 1778 | enable_cpucache(cachep); | ||
| 1779 | return; | ||
| 1780 | } | ||
| 1781 | if (g_cpucache_up == NONE) { | ||
| 1782 | /* | ||
| 1783 | * Note: the first kmem_cache_create must create the cache | ||
| 1784 | * that's used by kmalloc(24), otherwise the creation of | ||
| 1785 | * further caches will BUG(). | ||
| 1786 | */ | ||
| 1787 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
| 1788 | |||
| 1789 | /* | ||
| 1790 | * If the cache that's used by kmalloc(sizeof(kmem_list3)) is | ||
| 1791 | * the first cache, then we need to set up all its list3s, | ||
| 1792 | * otherwise the creation of further caches will BUG(). | ||
| 1793 | */ | ||
| 1794 | set_up_list3s(cachep, SIZE_AC); | ||
| 1795 | if (INDEX_AC == INDEX_L3) | ||
| 1796 | g_cpucache_up = PARTIAL_L3; | ||
| 1797 | else | ||
| 1798 | g_cpucache_up = PARTIAL_AC; | ||
| 1799 | } else { | ||
| 1800 | cachep->array[smp_processor_id()] = | ||
| 1801 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
| 1802 | |||
| 1803 | if (g_cpucache_up == PARTIAL_AC) { | ||
| 1804 | set_up_list3s(cachep, SIZE_L3); | ||
| 1805 | g_cpucache_up = PARTIAL_L3; | ||
| 1806 | } else { | ||
| 1807 | int node; | ||
| 1808 | for_each_online_node(node) { | ||
| 1809 | cachep->nodelists[node] = | ||
| 1810 | kmalloc_node(sizeof(struct kmem_list3), | ||
| 1811 | GFP_KERNEL, node); | ||
| 1812 | BUG_ON(!cachep->nodelists[node]); | ||
| 1813 | kmem_list3_init(cachep->nodelists[node]); | ||
| 1814 | } | ||
| 1815 | } | ||
| 1816 | } | ||
| 1817 | cachep->nodelists[numa_node_id()]->next_reap = | ||
| 1818 | jiffies + REAPTIMEOUT_LIST3 + | ||
| 1819 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
| 1820 | |||
| 1821 | cpu_cache_get(cachep)->avail = 0; | ||
| 1822 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | ||
| 1823 | cpu_cache_get(cachep)->batchcount = 1; | ||
| 1824 | cpu_cache_get(cachep)->touched = 0; | ||
| 1825 | cachep->batchcount = 1; | ||
| 1826 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | ||
| 1827 | } | ||
| 1828 | |||
| 1739 | /** | 1829 | /** |
| 1740 | * kmem_cache_create - Create a cache. | 1830 | * kmem_cache_create - Create a cache. |
| 1741 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 1831 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
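The new setup_cpu_cache() helper introduced in the hunk above encodes the g_cpucache_up bootstrap ladder that used to live inline in kmem_cache_create(). A compressed model of that ladder, glossing over the INDEX_AC == INDEX_L3 special case and reducing the actions to strings:

    #include <stdio.h>

    /* bootstrap ladder mirrored from the patch: the array-cache cache comes up
     * first, then the kmem_list3 cache, then every later cache is "normal" */
    enum cpucache_up { NONE, PARTIAL_AC, PARTIAL_L3, FULL };

    static enum cpucache_up g_up = NONE;

    static const char *setup_cpu_cache_model(void)
    {
        switch (g_up) {
        case NONE:                  /* very first cache */
            g_up = PARTIAL_AC;
            return "use static initarray, set up array-cache list3s";
        case PARTIAL_AC:            /* array-cache cache now usable */
            g_up = PARTIAL_L3;
            return "kmalloc array cache, set up kmem_list3 list3s";
        case PARTIAL_L3:            /* both bootstrap caches usable */
            return "kmalloc array cache and per-node kmem_list3s";
        case FULL:
            return "enable_cpucache()";
        }
        return "?";
    }

    int main(void)
    {
        for (int i = 0; i < 3; i++)
            printf("boot cache %d: %s\n", i, setup_cpu_cache_model());
        g_up = FULL;    /* what kmem_cache_init() sets once bootstrap is done */
        printf("later caches: %s\n", setup_cpu_cache_model());
        return 0;
    }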
| @@ -1751,9 +1841,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
| 1751 | * and the @dtor is run before the pages are handed back. | 1841 | * and the @dtor is run before the pages are handed back. |
| 1752 | * | 1842 | * |
| 1753 | * @name must be valid until the cache is destroyed. This implies that | 1843 | * @name must be valid until the cache is destroyed. This implies that |
| 1754 | * the module calling this has to destroy the cache before getting | 1844 | * the module calling this has to destroy the cache before getting unloaded. |
| 1755 | * unloaded. | 1845 | * |
| 1756 | * | ||
| 1757 | * The flags are | 1846 | * The flags are |
| 1758 | * | 1847 | * |
| 1759 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | 1848 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) |
| @@ -1762,16 +1851,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep, | |||
| 1762 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | 1851 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check |
| 1763 | * for buffer overruns. | 1852 | * for buffer overruns. |
| 1764 | * | 1853 | * |
| 1765 | * %SLAB_NO_REAP - Don't automatically reap this cache when we're under | ||
| 1766 | * memory pressure. | ||
| 1767 | * | ||
| 1768 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | 1854 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware |
| 1769 | * cacheline. This can be beneficial if you're counting cycles as closely | 1855 | * cacheline. This can be beneficial if you're counting cycles as closely |
| 1770 | * as davem. | 1856 | * as davem. |
| 1771 | */ | 1857 | */ |
| 1772 | struct kmem_cache * | 1858 | struct kmem_cache * |
| 1773 | kmem_cache_create (const char *name, size_t size, size_t align, | 1859 | kmem_cache_create (const char *name, size_t size, size_t align, |
| 1774 | unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), | 1860 | unsigned long flags, |
| 1861 | void (*ctor)(void*, struct kmem_cache *, unsigned long), | ||
| 1775 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | 1862 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) |
| 1776 | { | 1863 | { |
| 1777 | size_t left_over, slab_size, ralign; | 1864 | size_t left_over, slab_size, ralign; |
| @@ -1781,12 +1868,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1781 | /* | 1868 | /* |
| 1782 | * Sanity checks... these are all serious usage bugs. | 1869 | * Sanity checks... these are all serious usage bugs. |
| 1783 | */ | 1870 | */ |
| 1784 | if ((!name) || | 1871 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || |
| 1785 | in_interrupt() || | ||
| 1786 | (size < BYTES_PER_WORD) || | ||
| 1787 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { | 1872 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { |
| 1788 | printk(KERN_ERR "%s: Early error in slab %s\n", | 1873 | printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, |
| 1789 | __FUNCTION__, name); | 1874 | name); |
| 1790 | BUG(); | 1875 | BUG(); |
| 1791 | } | 1876 | } |
| 1792 | 1877 | ||
| @@ -1840,8 +1925,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1840 | * above the next power of two: caches with object sizes just above a | 1925 | * above the next power of two: caches with object sizes just above a |
| 1841 | * power of two have a significant amount of internal fragmentation. | 1926 | * power of two have a significant amount of internal fragmentation. |
| 1842 | */ | 1927 | */ |
| 1843 | if ((size < 4096 | 1928 | if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD)) |
| 1844 | || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) | ||
| 1845 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | 1929 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; |
| 1846 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 1930 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
| 1847 | flags |= SLAB_POISON; | 1931 | flags |= SLAB_POISON; |
| @@ -1853,13 +1937,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1853 | BUG_ON(dtor); | 1937 | BUG_ON(dtor); |
| 1854 | 1938 | ||
| 1855 | /* | 1939 | /* |
| 1856 | * Always checks flags, a caller might be expecting debug | 1940 | * Always checks flags, a caller might be expecting debug support which |
| 1857 | * support which isn't available. | 1941 | * isn't available. |
| 1858 | */ | 1942 | */ |
| 1859 | if (flags & ~CREATE_MASK) | 1943 | if (flags & ~CREATE_MASK) |
| 1860 | BUG(); | 1944 | BUG(); |
| 1861 | 1945 | ||
| 1862 | /* Check that size is in terms of words. This is needed to avoid | 1946 | /* |
| 1947 | * Check that size is in terms of words. This is needed to avoid | ||
| 1863 | * unaligned accesses for some archs when redzoning is used, and makes | 1948 | * unaligned accesses for some archs when redzoning is used, and makes |
| 1864 | * sure any on-slab bufctl's are also correctly aligned. | 1949 | * sure any on-slab bufctl's are also correctly aligned. |
| 1865 | */ | 1950 | */ |
| @@ -1868,12 +1953,14 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1868 | size &= ~(BYTES_PER_WORD - 1); | 1953 | size &= ~(BYTES_PER_WORD - 1); |
| 1869 | } | 1954 | } |
| 1870 | 1955 | ||
| 1871 | /* calculate out the final buffer alignment: */ | 1956 | /* calculate the final buffer alignment: */ |
| 1957 | |||
| 1872 | /* 1) arch recommendation: can be overridden for debug */ | 1958 | /* 1) arch recommendation: can be overridden for debug */ |
| 1873 | if (flags & SLAB_HWCACHE_ALIGN) { | 1959 | if (flags & SLAB_HWCACHE_ALIGN) { |
| 1874 | /* Default alignment: as specified by the arch code. | 1960 | /* |
| 1875 | * Except if an object is really small, then squeeze multiple | 1961 | * Default alignment: as specified by the arch code. Except if |
| 1876 | * objects into one cacheline. | 1962 | * an object is really small, then squeeze multiple objects into |
| 1963 | * one cacheline. | ||
| 1877 | */ | 1964 | */ |
| 1878 | ralign = cache_line_size(); | 1965 | ralign = cache_line_size(); |
| 1879 | while (size <= ralign / 2) | 1966 | while (size <= ralign / 2) |
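The reflowed comment and loop above implement the SLAB_HWCACHE_ALIGN policy: start from the cacheline size and, presumably by halving ralign while an object still fits in half of it, let several small objects share one line. A quick model of that policy assuming a 64-byte cacheline:

    #include <stdio.h>

    /* squeeze several small objects into one cacheline: halve the alignment
     * while an object still fits in half of it */
    static unsigned long hwcache_align(unsigned long size, unsigned long line)
    {
        unsigned long ralign = line;

        while (size <= ralign / 2)
            ralign /= 2;
        return ralign;
    }

    int main(void)
    {
        unsigned long sizes[] = { 8, 16, 24, 40, 64, 96 };

        for (int i = 0; i < 6; i++)     /* 64-byte cacheline assumed */
            printf("size %3lu -> align %lu\n",
                   sizes[i], hwcache_align(sizes[i], 64));
        return 0;
    }

With these numbers an 8-byte object ends up 8-byte aligned, so eight of them share one 64-byte line, while a 40-byte object still gets full cacheline alignment.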
| @@ -1893,7 +1980,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1893 | if (ralign > BYTES_PER_WORD) | 1980 | if (ralign > BYTES_PER_WORD) |
| 1894 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 1981 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
| 1895 | } | 1982 | } |
| 1896 | /* 4) Store it. Note that the debug code below can reduce | 1983 | /* |
| 1984 | * 4) Store it. Note that the debug code below can reduce | ||
| 1897 | * the alignment to BYTES_PER_WORD. | 1985 | * the alignment to BYTES_PER_WORD. |
| 1898 | */ | 1986 | */ |
| 1899 | align = ralign; | 1987 | align = ralign; |
| @@ -1978,7 +2066,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1978 | cachep->gfpflags = 0; | 2066 | cachep->gfpflags = 0; |
| 1979 | if (flags & SLAB_CACHE_DMA) | 2067 | if (flags & SLAB_CACHE_DMA) |
| 1980 | cachep->gfpflags |= GFP_DMA; | 2068 | cachep->gfpflags |= GFP_DMA; |
| 1981 | spin_lock_init(&cachep->spinlock); | ||
| 1982 | cachep->buffer_size = size; | 2069 | cachep->buffer_size = size; |
| 1983 | 2070 | ||
| 1984 | if (flags & CFLGS_OFF_SLAB) | 2071 | if (flags & CFLGS_OFF_SLAB) |
| @@ -1988,64 +2075,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 1988 | cachep->name = name; | 2075 | cachep->name = name; |
| 1989 | 2076 | ||
| 1990 | 2077 | ||
| 1991 | if (g_cpucache_up == FULL) { | 2078 | setup_cpu_cache(cachep); |
| 1992 | enable_cpucache(cachep); | ||
| 1993 | } else { | ||
| 1994 | if (g_cpucache_up == NONE) { | ||
| 1995 | /* Note: the first kmem_cache_create must create | ||
| 1996 | * the cache that's used by kmalloc(24), otherwise | ||
| 1997 | * the creation of further caches will BUG(). | ||
| 1998 | */ | ||
| 1999 | cachep->array[smp_processor_id()] = | ||
| 2000 | &initarray_generic.cache; | ||
| 2001 | |||
| 2002 | /* If the cache that's used by | ||
| 2003 | * kmalloc(sizeof(kmem_list3)) is the first cache, | ||
| 2004 | * then we need to set up all its list3s, otherwise | ||
| 2005 | * the creation of further caches will BUG(). | ||
| 2006 | */ | ||
| 2007 | set_up_list3s(cachep, SIZE_AC); | ||
| 2008 | if (INDEX_AC == INDEX_L3) | ||
| 2009 | g_cpucache_up = PARTIAL_L3; | ||
| 2010 | else | ||
| 2011 | g_cpucache_up = PARTIAL_AC; | ||
| 2012 | } else { | ||
| 2013 | cachep->array[smp_processor_id()] = | ||
| 2014 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | ||
| 2015 | |||
| 2016 | if (g_cpucache_up == PARTIAL_AC) { | ||
| 2017 | set_up_list3s(cachep, SIZE_L3); | ||
| 2018 | g_cpucache_up = PARTIAL_L3; | ||
| 2019 | } else { | ||
| 2020 | int node; | ||
| 2021 | for_each_online_node(node) { | ||
| 2022 | |||
| 2023 | cachep->nodelists[node] = | ||
| 2024 | kmalloc_node(sizeof | ||
| 2025 | (struct kmem_list3), | ||
| 2026 | GFP_KERNEL, node); | ||
| 2027 | BUG_ON(!cachep->nodelists[node]); | ||
| 2028 | kmem_list3_init(cachep-> | ||
| 2029 | nodelists[node]); | ||
| 2030 | } | ||
| 2031 | } | ||
| 2032 | } | ||
| 2033 | cachep->nodelists[numa_node_id()]->next_reap = | ||
| 2034 | jiffies + REAPTIMEOUT_LIST3 + | ||
| 2035 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
| 2036 | |||
| 2037 | BUG_ON(!cpu_cache_get(cachep)); | ||
| 2038 | cpu_cache_get(cachep)->avail = 0; | ||
| 2039 | cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES; | ||
| 2040 | cpu_cache_get(cachep)->batchcount = 1; | ||
| 2041 | cpu_cache_get(cachep)->touched = 0; | ||
| 2042 | cachep->batchcount = 1; | ||
| 2043 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | ||
| 2044 | } | ||
| 2045 | 2079 | ||
| 2046 | /* cache setup completed, link it into the list */ | 2080 | /* cache setup completed, link it into the list */ |
| 2047 | list_add(&cachep->next, &cache_chain); | 2081 | list_add(&cachep->next, &cache_chain); |
| 2048 | oops: | 2082 | oops: |
| 2049 | if (!cachep && (flags & SLAB_PANIC)) | 2083 | if (!cachep && (flags & SLAB_PANIC)) |
| 2050 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2084 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
| 2051 | name); | 2085 | name); |
| @@ -2089,30 +2123,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) | |||
| 2089 | #define check_spinlock_acquired_node(x, y) do { } while(0) | 2123 | #define check_spinlock_acquired_node(x, y) do { } while(0) |
| 2090 | #endif | 2124 | #endif |
| 2091 | 2125 | ||
| 2092 | /* | 2126 | static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, |
| 2093 | * Waits for all CPUs to execute func(). | 2127 | struct array_cache *ac, |
| 2094 | */ | 2128 | int force, int node); |
| 2095 | static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) | ||
| 2096 | { | ||
| 2097 | check_irq_on(); | ||
| 2098 | preempt_disable(); | ||
| 2099 | |||
| 2100 | local_irq_disable(); | ||
| 2101 | func(arg); | ||
| 2102 | local_irq_enable(); | ||
| 2103 | |||
| 2104 | if (smp_call_function(func, arg, 1, 1)) | ||
| 2105 | BUG(); | ||
| 2106 | |||
| 2107 | preempt_enable(); | ||
| 2108 | } | ||
| 2109 | |||
| 2110 | static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, | ||
| 2111 | int force, int node); | ||
| 2112 | 2129 | ||
| 2113 | static void do_drain(void *arg) | 2130 | static void do_drain(void *arg) |
| 2114 | { | 2131 | { |
| 2115 | struct kmem_cache *cachep = (struct kmem_cache *) arg; | 2132 | struct kmem_cache *cachep = arg; |
| 2116 | struct array_cache *ac; | 2133 | struct array_cache *ac; |
| 2117 | int node = numa_node_id(); | 2134 | int node = numa_node_id(); |
| 2118 | 2135 | ||
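The open-coded smp_call_function_all_cpus() helper removed above ran func(arg) on the local CPU with interrupts disabled and then on every other CPU via smp_call_function(), waiting for completion; drain_cpu_caches() below now calls the generic on_each_cpu() instead. A crude userspace analogy of the "run it everywhere and wait" contract, with threads standing in for CPUs and no attempt to model interrupt disabling (build with -lpthread):

    #include <pthread.h>
    #include <stdio.h>

    #define NR_CPUS 4

    /* crude stand-in for on_each_cpu(): run func(arg) on the caller and once
     * per "other cpu", and wait until everyone has finished */
    static void run_everywhere(void *(*func)(void *), void *arg)
    {
        pthread_t others[NR_CPUS - 1];

        for (int i = 0; i < NR_CPUS - 1; i++)
            pthread_create(&others[i], NULL, func, arg);
        func(arg);                          /* the "local cpu" call */
        for (int i = 0; i < NR_CPUS - 1; i++)
            pthread_join(others[i], NULL);  /* wait, like the trailing 1, 1 */
    }

    static void *do_drain_model(void *arg)
    {
        printf("draining %s on one cpu\n", (const char *)arg);
        return NULL;
    }

    int main(void)
    {
        run_everywhere(do_drain_model, (void *)"cache_cache");
        return 0;
    }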
| @@ -2129,14 +2146,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) | |||
| 2129 | struct kmem_list3 *l3; | 2146 | struct kmem_list3 *l3; |
| 2130 | int node; | 2147 | int node; |
| 2131 | 2148 | ||
| 2132 | smp_call_function_all_cpus(do_drain, cachep); | 2149 | on_each_cpu(do_drain, cachep, 1, 1); |
| 2133 | check_irq_on(); | 2150 | check_irq_on(); |
| 2134 | for_each_online_node(node) { | 2151 | for_each_online_node(node) { |
| 2135 | l3 = cachep->nodelists[node]; | 2152 | l3 = cachep->nodelists[node]; |
| 2136 | if (l3) { | 2153 | if (l3) { |
| 2137 | spin_lock_irq(&l3->list_lock); | 2154 | drain_array(cachep, l3, l3->shared, 1, node); |
| 2138 | drain_array_locked(cachep, l3->shared, 1, node); | ||
| 2139 | spin_unlock_irq(&l3->list_lock); | ||
| 2140 | if (l3->alien) | 2155 | if (l3->alien) |
| 2141 | drain_alien_cache(cachep, l3->alien); | 2156 | drain_alien_cache(cachep, l3->alien); |
| 2142 | } | 2157 | } |
| @@ -2260,16 +2275,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2260 | 2275 | ||
| 2261 | /* NUMA: free the list3 structures */ | 2276 | /* NUMA: free the list3 structures */ |
| 2262 | for_each_online_node(i) { | 2277 | for_each_online_node(i) { |
| 2263 | if ((l3 = cachep->nodelists[i])) { | 2278 | l3 = cachep->nodelists[i]; |
| 2279 | if (l3) { | ||
| 2264 | kfree(l3->shared); | 2280 | kfree(l3->shared); |
| 2265 | free_alien_cache(l3->alien); | 2281 | free_alien_cache(l3->alien); |
| 2266 | kfree(l3); | 2282 | kfree(l3); |
| 2267 | } | 2283 | } |
| 2268 | } | 2284 | } |
| 2269 | kmem_cache_free(&cache_cache, cachep); | 2285 | kmem_cache_free(&cache_cache, cachep); |
| 2270 | |||
| 2271 | unlock_cpu_hotplug(); | 2286 | unlock_cpu_hotplug(); |
| 2272 | |||
| 2273 | return 0; | 2287 | return 0; |
| 2274 | } | 2288 | } |
| 2275 | EXPORT_SYMBOL(kmem_cache_destroy); | 2289 | EXPORT_SYMBOL(kmem_cache_destroy); |
| @@ -2292,7 +2306,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
| 2292 | slabp->inuse = 0; | 2306 | slabp->inuse = 0; |
| 2293 | slabp->colouroff = colour_off; | 2307 | slabp->colouroff = colour_off; |
| 2294 | slabp->s_mem = objp + colour_off; | 2308 | slabp->s_mem = objp + colour_off; |
| 2295 | |||
| 2296 | return slabp; | 2309 | return slabp; |
| 2297 | } | 2310 | } |
| 2298 | 2311 | ||
| @@ -2307,7 +2320,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2307 | int i; | 2320 | int i; |
| 2308 | 2321 | ||
| 2309 | for (i = 0; i < cachep->num; i++) { | 2322 | for (i = 0; i < cachep->num; i++) { |
| 2310 | void *objp = slabp->s_mem + cachep->buffer_size * i; | 2323 | void *objp = index_to_obj(cachep, slabp, i); |
| 2311 | #if DEBUG | 2324 | #if DEBUG |
| 2312 | /* need to poison the objs? */ | 2325 | /* need to poison the objs? */ |
| 2313 | if (cachep->flags & SLAB_POISON) | 2326 | if (cachep->flags & SLAB_POISON) |
| @@ -2320,9 +2333,9 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2320 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2333 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
| 2321 | } | 2334 | } |
| 2322 | /* | 2335 | /* |
| 2323 | * Constructors are not allowed to allocate memory from | 2336 | * Constructors are not allowed to allocate memory from the same |
| 2324 | * the same cache which they are a constructor for. | 2337 | * cache which they are a constructor for. Otherwise, deadlock. |
| 2325 | * Otherwise, deadlock. They must also be threaded. | 2338 | * They must also be threaded. |
| 2326 | */ | 2339 | */ |
| 2327 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2340 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
| 2328 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2341 | cachep->ctor(objp + obj_offset(cachep), cachep, |
| @@ -2336,8 +2349,8 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2336 | slab_error(cachep, "constructor overwrote the" | 2349 | slab_error(cachep, "constructor overwrote the" |
| 2337 | " start of an object"); | 2350 | " start of an object"); |
| 2338 | } | 2351 | } |
| 2339 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) | 2352 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && |
| 2340 | && cachep->flags & SLAB_POISON) | 2353 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) |
| 2341 | kernel_map_pages(virt_to_page(objp), | 2354 | kernel_map_pages(virt_to_page(objp), |
| 2342 | cachep->buffer_size / PAGE_SIZE, 0); | 2355 | cachep->buffer_size / PAGE_SIZE, 0); |
| 2343 | #else | 2356 | #else |
| @@ -2352,18 +2365,16 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2352 | 2365 | ||
| 2353 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | 2366 | static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) |
| 2354 | { | 2367 | { |
| 2355 | if (flags & SLAB_DMA) { | 2368 | if (flags & SLAB_DMA) |
| 2356 | if (!(cachep->gfpflags & GFP_DMA)) | 2369 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); |
| 2357 | BUG(); | 2370 | else |
| 2358 | } else { | 2371 | BUG_ON(cachep->gfpflags & GFP_DMA); |
| 2359 | if (cachep->gfpflags & GFP_DMA) | ||
| 2360 | BUG(); | ||
| 2361 | } | ||
| 2362 | } | 2372 | } |
| 2363 | 2373 | ||
| 2364 | static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) | 2374 | static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, |
| 2375 | int nodeid) | ||
| 2365 | { | 2376 | { |
| 2366 | void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); | 2377 | void *objp = index_to_obj(cachep, slabp, slabp->free); |
| 2367 | kmem_bufctl_t next; | 2378 | kmem_bufctl_t next; |
| 2368 | 2379 | ||
| 2369 | slabp->inuse++; | 2380 | slabp->inuse++; |
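slab_get_obj(), whose prototype is rewrapped above, hands out the object at index slabp->free; the free index is then advanced through per-slab kmem_bufctl_t bookkeeping that this hunk does not show. The standalone model below fakes that bookkeeping with a plain next-index array (the BUFCTL_END value and structure layout are illustrative only):

    #include <stdio.h>

    #define NUM_OBJS   4
    #define BUFCTL_END 0xffffu      /* model sentinel, not the kernel's value */

    /* simplified slab: an array of objects plus an index-linked free list */
    struct slab_model {
        unsigned int free;              /* index of first free object */
        unsigned int bufctl[NUM_OBJS];  /* bufctl[i] = next free index */
        unsigned int inuse;
        int objs[NUM_OBJS];
    };

    static int *get_obj(struct slab_model *s)
    {
        unsigned int idx = s->free;     /* index_to_obj(cachep, slabp, slabp->free) */

        if (idx == BUFCTL_END)
            return NULL;                /* slab is full */
        s->inuse++;
        s->free = s->bufctl[idx];       /* advance to the next free slot */
        return &s->objs[idx];
    }

    int main(void)
    {
        struct slab_model s = {
            .free   = 0,
            .bufctl = { 1, 2, 3, BUFCTL_END },
        };

        for (int *p; (p = get_obj(&s)) != NULL; )
            printf("got object %ld, inuse=%u\n", (long)(p - s.objs), s.inuse);
        return 0;
    }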
| @@ -2377,10 +2388,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod | |||
| 2377 | return objp; | 2388 | return objp; |
| 2378 | } | 2389 | } |
| 2379 | 2390 | ||
| 2380 | static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, | 2391 | static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, |
| 2381 | int nodeid) | 2392 | void *objp, int nodeid) |
| 2382 | { | 2393 | { |
| 2383 | unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; | 2394 | unsigned int objnr = obj_to_index(cachep, slabp, objp); |
| 2384 | 2395 | ||
| 2385 | #if DEBUG | 2396 | #if DEBUG |
| 2386 | /* Verify that the slab belongs to the intended node */ | 2397 | /* Verify that the slab belongs to the intended node */ |
| @@ -2388,7 +2399,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob | |||
| 2388 | 2399 | ||
| 2389 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2400 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { |
| 2390 | printk(KERN_ERR "slab: double free detected in cache " | 2401 | printk(KERN_ERR "slab: double free detected in cache " |
| 2391 | "'%s', objp %p\n", cachep->name, objp); | 2402 | "'%s', objp %p\n", cachep->name, objp); |
| 2392 | BUG(); | 2403 | BUG(); |
| 2393 | } | 2404 | } |
| 2394 | #endif | 2405 | #endif |
| @@ -2397,14 +2408,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob | |||
| 2397 | slabp->inuse--; | 2408 | slabp->inuse--; |
| 2398 | } | 2409 | } |
| 2399 | 2410 | ||
| 2400 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) | 2411 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, |
| 2412 | void *objp) | ||
| 2401 | { | 2413 | { |
| 2402 | int i; | 2414 | int i; |
| 2403 | struct page *page; | 2415 | struct page *page; |
| 2404 | 2416 | ||
| 2405 | /* Nasty!!!!!! I hope this is OK. */ | 2417 | /* Nasty!!!!!! I hope this is OK. */ |
| 2406 | i = 1 << cachep->gfporder; | ||
| 2407 | page = virt_to_page(objp); | 2418 | page = virt_to_page(objp); |
| 2419 | |||
| 2420 | i = 1; | ||
| 2421 | if (likely(!PageCompound(page))) | ||
| 2422 | i <<= cachep->gfporder; | ||
| 2408 | do { | 2423 | do { |
| 2409 | page_set_cache(page, cachep); | 2424 | page_set_cache(page, cachep); |
| 2410 | page_set_slab(page, slabp); | 2425 | page_set_slab(page, slabp); |
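The hunk above makes set_slab_attr() tag only one struct page when the allocation is a compound page, and all 1 << gfporder pages otherwise; presumably the head page alone is enough to recover the cache and slab for a compound allocation. A one-function sketch of the page count:

    #include <stdio.h>

    /* how many struct pages set_slab_attr() now tags: just the head page for a
     * compound (higher-order) allocation, 1 << order separate pages otherwise */
    static unsigned int pages_to_tag(int page_is_compound, unsigned int gfporder)
    {
        unsigned int i = 1;

        if (!page_is_compound)
            i <<= gfporder;
        return i;
    }

    int main(void)
    {
        printf("order 2, compound page:  tag %u page\n",  pages_to_tag(1, 2));
        printf("order 2, separate pages: tag %u pages\n", pages_to_tag(0, 2));
        return 0;
    }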
| @@ -2425,8 +2440,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 2425 | unsigned long ctor_flags; | 2440 | unsigned long ctor_flags; |
| 2426 | struct kmem_list3 *l3; | 2441 | struct kmem_list3 *l3; |
| 2427 | 2442 | ||
| 2428 | /* Be lazy and only check for valid flags here, | 2443 | /* |
| 2429 | * keeping it out of the critical path in kmem_cache_alloc(). | 2444 | * Be lazy and only check for valid flags here, keeping it out of the |
| 2445 | * critical path in kmem_cache_alloc(). | ||
| 2430 | */ | 2446 | */ |
| 2431 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) | 2447 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) |
| 2432 | BUG(); | 2448 | BUG(); |
| @@ -2467,14 +2483,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 2467 | */ | 2483 | */ |
| 2468 | kmem_flagcheck(cachep, flags); | 2484 | kmem_flagcheck(cachep, flags); |
| 2469 | 2485 | ||
| 2470 | /* Get mem for the objs. | 2486 | /* |
| 2471 | * Attempt to allocate a physical page from 'nodeid', | 2487 | * Get mem for the objs. Attempt to allocate a physical page from |
| 2488 | * 'nodeid'. | ||
| 2472 | */ | 2489 | */ |
| 2473 | if (!(objp = kmem_getpages(cachep, flags, nodeid))) | 2490 | objp = kmem_getpages(cachep, flags, nodeid); |
| 2491 | if (!objp) | ||
| 2474 | goto failed; | 2492 | goto failed; |
| 2475 | 2493 | ||
| 2476 | /* Get slab management. */ | 2494 | /* Get slab management. */ |
| 2477 | if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) | 2495 | slabp = alloc_slabmgmt(cachep, objp, offset, local_flags); |
| 2496 | if (!slabp) | ||
| 2478 | goto opps1; | 2497 | goto opps1; |
| 2479 | 2498 | ||
| 2480 | slabp->nodeid = nodeid; | 2499 | slabp->nodeid = nodeid; |
| @@ -2493,9 +2512,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 2493 | l3->free_objects += cachep->num; | 2512 | l3->free_objects += cachep->num; |
| 2494 | spin_unlock(&l3->list_lock); | 2513 | spin_unlock(&l3->list_lock); |
| 2495 | return 1; | 2514 | return 1; |
| 2496 | opps1: | 2515 | opps1: |
| 2497 | kmem_freepages(cachep, objp); | 2516 | kmem_freepages(cachep, objp); |
| 2498 | failed: | 2517 | failed: |
| 2499 | if (local_flags & __GFP_WAIT) | 2518 | if (local_flags & __GFP_WAIT) |
| 2500 | local_irq_disable(); | 2519 | local_irq_disable(); |
| 2501 | return 0; | 2520 | return 0; |
| @@ -2538,8 +2557,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
| 2538 | page = virt_to_page(objp); | 2557 | page = virt_to_page(objp); |
| 2539 | 2558 | ||
| 2540 | if (page_get_cache(page) != cachep) { | 2559 | if (page_get_cache(page) != cachep) { |
| 2541 | printk(KERN_ERR | 2560 | printk(KERN_ERR "mismatch in kmem_cache_free: expected " |
| 2542 | "mismatch in kmem_cache_free: expected cache %p, got %p\n", | 2561 | "cache %p, got %p\n", |
| 2543 | page_get_cache(page), cachep); | 2562 | page_get_cache(page), cachep); |
| 2544 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | 2563 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); |
| 2545 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), | 2564 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), |
| @@ -2549,13 +2568,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
| 2549 | slabp = page_get_slab(page); | 2568 | slabp = page_get_slab(page); |
| 2550 | 2569 | ||
| 2551 | if (cachep->flags & SLAB_RED_ZONE) { | 2570 | if (cachep->flags & SLAB_RED_ZONE) { |
| 2552 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE | 2571 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || |
| 2553 | || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | 2572 | *dbg_redzone2(cachep, objp) != RED_ACTIVE) { |
| 2554 | slab_error(cachep, | 2573 | slab_error(cachep, "double free, or memory outside" |
| 2555 | "double free, or memory outside" | 2574 | " object was overwritten"); |
| 2556 | " object was overwritten"); | 2575 | printk(KERN_ERR "%p: redzone 1:0x%lx, " |
| 2557 | printk(KERN_ERR | 2576 | "redzone 2:0x%lx.\n", |
| 2558 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
| 2559 | objp, *dbg_redzone1(cachep, objp), | 2577 | objp, *dbg_redzone1(cachep, objp), |
| 2560 | *dbg_redzone2(cachep, objp)); | 2578 | *dbg_redzone2(cachep, objp)); |
| 2561 | } | 2579 | } |
| @@ -2565,15 +2583,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
| 2565 | if (cachep->flags & SLAB_STORE_USER) | 2583 | if (cachep->flags & SLAB_STORE_USER) |
| 2566 | *dbg_userword(cachep, objp) = caller; | 2584 | *dbg_userword(cachep, objp) = caller; |
| 2567 | 2585 | ||
| 2568 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 2586 | objnr = obj_to_index(cachep, slabp, objp); |
| 2569 | 2587 | ||
| 2570 | BUG_ON(objnr >= cachep->num); | 2588 | BUG_ON(objnr >= cachep->num); |
| 2571 | BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); | 2589 | BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); |
| 2572 | 2590 | ||
| 2573 | if (cachep->flags & SLAB_DEBUG_INITIAL) { | 2591 | if (cachep->flags & SLAB_DEBUG_INITIAL) { |
| 2574 | /* Need to call the slab's constructor so the | 2592 | /* |
| 2575 | * caller can perform a verify of its state (debugging). | 2593 | * Need to call the slab's constructor so the caller can |
| 2576 | * Called without the cache-lock held. | 2594 | * perform a verify of its state (debugging). Called without |
| 2595 | * the cache-lock held. | ||
| 2577 | */ | 2596 | */ |
| 2578 | cachep->ctor(objp + obj_offset(cachep), | 2597 | cachep->ctor(objp + obj_offset(cachep), |
| 2579 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); | 2598 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); |
| @@ -2586,7 +2605,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
| 2586 | } | 2605 | } |
| 2587 | if (cachep->flags & SLAB_POISON) { | 2606 | if (cachep->flags & SLAB_POISON) { |
| 2588 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2607 | #ifdef CONFIG_DEBUG_PAGEALLOC |
| 2589 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { | 2608 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
| 2590 | store_stackinfo(cachep, objp, (unsigned long)caller); | 2609 | store_stackinfo(cachep, objp, (unsigned long)caller); |
| 2591 | kernel_map_pages(virt_to_page(objp), | 2610 | kernel_map_pages(virt_to_page(objp), |
| 2592 | cachep->buffer_size / PAGE_SIZE, 0); | 2611 | cachep->buffer_size / PAGE_SIZE, 0); |
| @@ -2612,14 +2631,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) | |||
| 2612 | goto bad; | 2631 | goto bad; |
| 2613 | } | 2632 | } |
| 2614 | if (entries != cachep->num - slabp->inuse) { | 2633 | if (entries != cachep->num - slabp->inuse) { |
| 2615 | bad: | 2634 | bad: |
| 2616 | printk(KERN_ERR | 2635 | printk(KERN_ERR "slab: Internal list corruption detected in " |
| 2617 | "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 2636 | "cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
| 2618 | cachep->name, cachep->num, slabp, slabp->inuse); | 2637 | cachep->name, cachep->num, slabp, slabp->inuse); |
| 2619 | for (i = 0; | 2638 | for (i = 0; |
| 2620 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); | 2639 | i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); |
| 2621 | i++) { | 2640 | i++) { |
| 2622 | if ((i % 16) == 0) | 2641 | if (i % 16 == 0) |
| 2623 | printk("\n%03x:", i); | 2642 | printk("\n%03x:", i); |
| 2624 | printk(" %02x", ((unsigned char *)slabp)[i]); | 2643 | printk(" %02x", ((unsigned char *)slabp)[i]); |
| 2625 | } | 2644 | } |
| @@ -2641,12 +2660,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
| 2641 | 2660 | ||
| 2642 | check_irq_off(); | 2661 | check_irq_off(); |
| 2643 | ac = cpu_cache_get(cachep); | 2662 | ac = cpu_cache_get(cachep); |
| 2644 | retry: | 2663 | retry: |
| 2645 | batchcount = ac->batchcount; | 2664 | batchcount = ac->batchcount; |
| 2646 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 2665 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
| 2647 | /* if there was little recent activity on this | 2666 | /* |
| 2648 | * cache, then perform only a partial refill. | 2667 | * If there was little recent activity on this cache, then |
| 2649 | * Otherwise we could generate refill bouncing. | 2668 | * perform only a partial refill. Otherwise we could generate |
| 2669 | * refill bouncing. | ||
| 2650 | */ | 2670 | */ |
| 2651 | batchcount = BATCHREFILL_LIMIT; | 2671 | batchcount = BATCHREFILL_LIMIT; |
| 2652 | } | 2672 | } |
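The reworded comment above describes the partial-refill heuristic in cache_alloc_refill(): an array cache that has not been touched recently is refilled with at most BATCHREFILL_LIMIT objects to avoid refill bouncing. A toy version of that decision (the limit value 16 is assumed for the example):

    #include <stdio.h>

    #define BATCHREFILL_LIMIT 16    /* value assumed for the example */

    /* refill fewer objects when the per-cpu array cache saw little recent
     * activity, so idle caches do not bounce whole batches back and forth */
    static int refill_count(int batchcount, int touched)
    {
        if (!touched && batchcount > BATCHREFILL_LIMIT)
            return BATCHREFILL_LIMIT;
        return batchcount;
    }

    int main(void)
    {
        printf("busy cache, batch 120: refill %d\n", refill_count(120, 1));
        printf("idle cache, batch 120: refill %d\n", refill_count(120, 0));
        printf("idle cache, batch   8: refill %d\n", refill_count(8, 0));
        return 0;
    }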
| @@ -2702,29 +2722,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
| 2702 | list_add(&slabp->list, &l3->slabs_partial); | 2722 | list_add(&slabp->list, &l3->slabs_partial); |
| 2703 | } | 2723 | } |
| 2704 | 2724 | ||
| 2705 | must_grow: | 2725 | must_grow: |
| 2706 | l3->free_objects -= ac->avail; | 2726 | l3->free_objects -= ac->avail; |
| 2707 | alloc_done: | 2727 | alloc_done: |
| 2708 | spin_unlock(&l3->list_lock); | 2728 | spin_unlock(&l3->list_lock); |
| 2709 | 2729 | ||
| 2710 | if (unlikely(!ac->avail)) { | 2730 | if (unlikely(!ac->avail)) { |
| 2711 | int x; | 2731 | int x; |
| 2712 | x = cache_grow(cachep, flags, numa_node_id()); | 2732 | x = cache_grow(cachep, flags, numa_node_id()); |
| 2713 | 2733 | ||
| 2714 | // cache_grow can reenable interrupts, then ac could change. | 2734 | /* cache_grow can reenable interrupts, then ac could change. */ |
| 2715 | ac = cpu_cache_get(cachep); | 2735 | ac = cpu_cache_get(cachep); |
| 2716 | if (!x && ac->avail == 0) // no objects in sight? abort | 2736 | if (!x && ac->avail == 0) /* no objects in sight? abort */ |
| 2717 | return NULL; | 2737 | return NULL; |
| 2718 | 2738 | ||
| 2719 | if (!ac->avail) // objects refilled by interrupt? | 2739 | if (!ac->avail) /* objects refilled by interrupt? */ |
| 2720 | goto retry; | 2740 | goto retry; |
| 2721 | } | 2741 | } |
| 2722 | ac->touched = 1; | 2742 | ac->touched = 1; |
| 2723 | return ac->entry[--ac->avail]; | 2743 | return ac->entry[--ac->avail]; |
| 2724 | } | 2744 | } |
| 2725 | 2745 | ||
| 2726 | static inline void | 2746 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
| 2727 | cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) | 2747 | gfp_t flags) |
| 2728 | { | 2748 | { |
| 2729 | might_sleep_if(flags & __GFP_WAIT); | 2749 | might_sleep_if(flags & __GFP_WAIT); |
| 2730 | #if DEBUG | 2750 | #if DEBUG |
| @@ -2733,8 +2753,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) | |||
| 2733 | } | 2753 | } |
| 2734 | 2754 | ||
| 2735 | #if DEBUG | 2755 | #if DEBUG |
| 2736 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, | 2756 | static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, |
| 2737 | void *objp, void *caller) | 2757 | gfp_t flags, void *objp, void *caller) |
| 2738 | { | 2758 | { |
| 2739 | if (!objp) | 2759 | if (!objp) |
| 2740 | return objp; | 2760 | return objp; |
| @@ -2754,15 +2774,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags | |||
| 2754 | *dbg_userword(cachep, objp) = caller; | 2774 | *dbg_userword(cachep, objp) = caller; |
| 2755 | 2775 | ||
| 2756 | if (cachep->flags & SLAB_RED_ZONE) { | 2776 | if (cachep->flags & SLAB_RED_ZONE) { |
| 2757 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE | 2777 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || |
| 2758 | || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { | 2778 | *dbg_redzone2(cachep, objp) != RED_INACTIVE) { |
| 2759 | slab_error(cachep, | 2779 | slab_error(cachep, "double free, or memory outside" |
| 2760 | "double free, or memory outside" | 2780 | " object was overwritten"); |
| 2761 | " object was overwritten"); | ||
| 2762 | printk(KERN_ERR | 2781 | printk(KERN_ERR |
| 2763 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | 2782 | "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", |
| 2764 | objp, *dbg_redzone1(cachep, objp), | 2783 | objp, *dbg_redzone1(cachep, objp), |
| 2765 | *dbg_redzone2(cachep, objp)); | 2784 | *dbg_redzone2(cachep, objp)); |
| 2766 | } | 2785 | } |
| 2767 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 2786 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
| 2768 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2787 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
| @@ -2809,8 +2828,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 2809 | return objp; | 2828 | return objp; |
| 2810 | } | 2829 | } |
| 2811 | 2830 | ||
| 2812 | static __always_inline void * | 2831 | static __always_inline void *__cache_alloc(struct kmem_cache *cachep, |
| 2813 | __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | 2832 | gfp_t flags, void *caller) |
| 2814 | { | 2833 | { |
| 2815 | unsigned long save_flags; | 2834 | unsigned long save_flags; |
| 2816 | void *objp; | 2835 | void *objp; |
| @@ -2830,7 +2849,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
| 2830 | /* | 2849 | /* |
| 2831 | * A interface to enable slab creation on nodeid | 2850 | * A interface to enable slab creation on nodeid |
| 2832 | */ | 2851 | */ |
| 2833 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 2852 | static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, |
| 2853 | int nodeid) | ||
| 2834 | { | 2854 | { |
| 2835 | struct list_head *entry; | 2855 | struct list_head *entry; |
| 2836 | struct slab *slabp; | 2856 | struct slab *slabp; |
| @@ -2841,7 +2861,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
| 2841 | l3 = cachep->nodelists[nodeid]; | 2861 | l3 = cachep->nodelists[nodeid]; |
| 2842 | BUG_ON(!l3); | 2862 | BUG_ON(!l3); |
| 2843 | 2863 | ||
| 2844 | retry: | 2864 | retry: |
| 2845 | check_irq_off(); | 2865 | check_irq_off(); |
| 2846 | spin_lock(&l3->list_lock); | 2866 | spin_lock(&l3->list_lock); |
| 2847 | entry = l3->slabs_partial.next; | 2867 | entry = l3->slabs_partial.next; |
| @@ -2868,16 +2888,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
| 2868 | /* move slabp to correct slabp list: */ | 2888 | /* move slabp to correct slabp list: */ |
| 2869 | list_del(&slabp->list); | 2889 | list_del(&slabp->list); |
| 2870 | 2890 | ||
| 2871 | if (slabp->free == BUFCTL_END) { | 2891 | if (slabp->free == BUFCTL_END) |
| 2872 | list_add(&slabp->list, &l3->slabs_full); | 2892 | list_add(&slabp->list, &l3->slabs_full); |
| 2873 | } else { | 2893 | else |
| 2874 | list_add(&slabp->list, &l3->slabs_partial); | 2894 | list_add(&slabp->list, &l3->slabs_partial); |
| 2875 | } | ||
| 2876 | 2895 | ||
| 2877 | spin_unlock(&l3->list_lock); | 2896 | spin_unlock(&l3->list_lock); |
| 2878 | goto done; | 2897 | goto done; |
| 2879 | 2898 | ||
| 2880 | must_grow: | 2899 | must_grow: |
| 2881 | spin_unlock(&l3->list_lock); | 2900 | spin_unlock(&l3->list_lock); |
| 2882 | x = cache_grow(cachep, flags, nodeid); | 2901 | x = cache_grow(cachep, flags, nodeid); |
| 2883 | 2902 | ||
| @@ -2885,7 +2904,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node | |||
| 2885 | return NULL; | 2904 | return NULL; |
| 2886 | 2905 | ||
| 2887 | goto retry; | 2906 | goto retry; |
| 2888 | done: | 2907 | done: |
| 2889 | return obj; | 2908 | return obj; |
| 2890 | } | 2909 | } |
| 2891 | #endif | 2910 | #endif |
| @@ -2958,7 +2977,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
| 2958 | } | 2977 | } |
| 2959 | 2978 | ||
| 2960 | free_block(cachep, ac->entry, batchcount, node); | 2979 | free_block(cachep, ac->entry, batchcount, node); |
| 2961 | free_done: | 2980 | free_done: |
| 2962 | #if STATS | 2981 | #if STATS |
| 2963 | { | 2982 | { |
| 2964 | int i = 0; | 2983 | int i = 0; |
| @@ -2979,16 +2998,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
| 2979 | #endif | 2998 | #endif |
| 2980 | spin_unlock(&l3->list_lock); | 2999 | spin_unlock(&l3->list_lock); |
| 2981 | ac->avail -= batchcount; | 3000 | ac->avail -= batchcount; |
| 2982 | memmove(ac->entry, &(ac->entry[batchcount]), | 3001 | memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); |
| 2983 | sizeof(void *) * ac->avail); | ||
| 2984 | } | 3002 | } |
| 2985 | 3003 | ||
| 2986 | /* | 3004 | /* |
| 2987 | * __cache_free | 3005 | * Release an obj back to its cache. If the obj has a constructed state, it must |
| 2988 | * Release an obj back to its cache. If the obj has a constructed | 3006 | * be in this state _before_ it is released. Called with disabled ints. |
| 2989 | * state, it must be in this state _before_ it is released. | ||
| 2990 | * | ||
| 2991 | * Called with disabled ints. | ||
| 2992 | */ | 3007 | */ |
| 2993 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) | 3008 | static inline void __cache_free(struct kmem_cache *cachep, void *objp) |
| 2994 | { | 3009 | { |
| @@ -3007,9 +3022,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
| 3007 | if (unlikely(slabp->nodeid != numa_node_id())) { | 3022 | if (unlikely(slabp->nodeid != numa_node_id())) { |
| 3008 | struct array_cache *alien = NULL; | 3023 | struct array_cache *alien = NULL; |
| 3009 | int nodeid = slabp->nodeid; | 3024 | int nodeid = slabp->nodeid; |
| 3010 | struct kmem_list3 *l3 = | 3025 | struct kmem_list3 *l3; |
| 3011 | cachep->nodelists[numa_node_id()]; | ||
| 3012 | 3026 | ||
| 3027 | l3 = cachep->nodelists[numa_node_id()]; | ||
| 3013 | STATS_INC_NODEFREES(cachep); | 3028 | STATS_INC_NODEFREES(cachep); |
| 3014 | if (l3->alien && l3->alien[nodeid]) { | 3029 | if (l3->alien && l3->alien[nodeid]) { |
| 3015 | alien = l3->alien[nodeid]; | 3030 | alien = l3->alien[nodeid]; |
| @@ -3093,7 +3108,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr) | |||
| 3093 | if (unlikely(page_get_cache(page) != cachep)) | 3108 | if (unlikely(page_get_cache(page) != cachep)) |
| 3094 | goto out; | 3109 | goto out; |
| 3095 | return 1; | 3110 | return 1; |
| 3096 | out: | 3111 | out: |
| 3097 | return 0; | 3112 | return 0; |
| 3098 | } | 3113 | } |
| 3099 | 3114 | ||
| @@ -3119,7 +3134,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 3119 | local_irq_save(save_flags); | 3134 | local_irq_save(save_flags); |
| 3120 | 3135 | ||
| 3121 | if (nodeid == -1 || nodeid == numa_node_id() || | 3136 | if (nodeid == -1 || nodeid == numa_node_id() || |
| 3122 | !cachep->nodelists[nodeid]) | 3137 | !cachep->nodelists[nodeid]) |
| 3123 | ptr = ____cache_alloc(cachep, flags); | 3138 | ptr = ____cache_alloc(cachep, flags); |
| 3124 | else | 3139 | else |
| 3125 | ptr = __cache_alloc_node(cachep, flags, nodeid); | 3140 | ptr = __cache_alloc_node(cachep, flags, nodeid); |
| @@ -3148,6 +3163,7 @@ EXPORT_SYMBOL(kmalloc_node); | |||
| 3148 | * kmalloc - allocate memory | 3163 | * kmalloc - allocate memory |
| 3149 | * @size: how many bytes of memory are required. | 3164 | * @size: how many bytes of memory are required. |
| 3150 | * @flags: the type of memory to allocate. | 3165 | * @flags: the type of memory to allocate. |
| 3166 | * @caller: function caller for debug tracking of the caller | ||
| 3151 | * | 3167 | * |
| 3152 | * kmalloc is the normal method of allocating memory | 3168 | * kmalloc is the normal method of allocating memory |
| 3153 | * in the kernel. | 3169 | * in the kernel. |
| @@ -3236,7 +3252,7 @@ void *__alloc_percpu(size_t size) | |||
| 3236 | /* Catch derefs w/o wrappers */ | 3252 | /* Catch derefs w/o wrappers */ |
| 3237 | return (void *)(~(unsigned long)pdata); | 3253 | return (void *)(~(unsigned long)pdata); |
| 3238 | 3254 | ||
| 3239 | unwind_oom: | 3255 | unwind_oom: |
| 3240 | while (--i >= 0) { | 3256 | while (--i >= 0) { |
| 3241 | if (!cpu_possible(i)) | 3257 | if (!cpu_possible(i)) |
| 3242 | continue; | 3258 | continue; |
| @@ -3339,18 +3355,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
| 3339 | struct array_cache *nc = NULL, *new; | 3355 | struct array_cache *nc = NULL, *new; |
| 3340 | struct array_cache **new_alien = NULL; | 3356 | struct array_cache **new_alien = NULL; |
| 3341 | #ifdef CONFIG_NUMA | 3357 | #ifdef CONFIG_NUMA |
| 3342 | if (!(new_alien = alloc_alien_cache(node, cachep->limit))) | 3358 | new_alien = alloc_alien_cache(node, cachep->limit); |
| 3359 | if (!new_alien) | ||
| 3343 | goto fail; | 3360 | goto fail; |
| 3344 | #endif | 3361 | #endif |
| 3345 | if (!(new = alloc_arraycache(node, (cachep->shared * | 3362 | new = alloc_arraycache(node, cachep->shared*cachep->batchcount, |
| 3346 | cachep->batchcount), | 3363 | 0xbaadf00d); |
| 3347 | 0xbaadf00d))) | 3364 | if (!new) |
| 3348 | goto fail; | 3365 | goto fail; |
| 3349 | if ((l3 = cachep->nodelists[node])) { | 3366 | l3 = cachep->nodelists[node]; |
| 3350 | 3367 | if (l3) { | |
| 3351 | spin_lock_irq(&l3->list_lock); | 3368 | spin_lock_irq(&l3->list_lock); |
| 3352 | 3369 | ||
| 3353 | if ((nc = cachep->nodelists[node]->shared)) | 3370 | nc = cachep->nodelists[node]->shared; |
| 3371 | if (nc) | ||
| 3354 | free_block(cachep, nc->entry, nc->avail, node); | 3372 | free_block(cachep, nc->entry, nc->avail, node); |
| 3355 | 3373 | ||
| 3356 | l3->shared = new; | 3374 | l3->shared = new; |
| @@ -3359,27 +3377,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
| 3359 | new_alien = NULL; | 3377 | new_alien = NULL; |
| 3360 | } | 3378 | } |
| 3361 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3379 | l3->free_limit = (1 + nr_cpus_node(node)) * |
| 3362 | cachep->batchcount + cachep->num; | 3380 | cachep->batchcount + cachep->num; |
| 3363 | spin_unlock_irq(&l3->list_lock); | 3381 | spin_unlock_irq(&l3->list_lock); |
| 3364 | kfree(nc); | 3382 | kfree(nc); |
| 3365 | free_alien_cache(new_alien); | 3383 | free_alien_cache(new_alien); |
| 3366 | continue; | 3384 | continue; |
| 3367 | } | 3385 | } |
| 3368 | if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), | 3386 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); |
| 3369 | GFP_KERNEL, node))) | 3387 | if (!l3) |
| 3370 | goto fail; | 3388 | goto fail; |
| 3371 | 3389 | ||
| 3372 | kmem_list3_init(l3); | 3390 | kmem_list3_init(l3); |
| 3373 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 3391 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
| 3374 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | 3392 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
| 3375 | l3->shared = new; | 3393 | l3->shared = new; |
| 3376 | l3->alien = new_alien; | 3394 | l3->alien = new_alien; |
| 3377 | l3->free_limit = (1 + nr_cpus_node(node)) * | 3395 | l3->free_limit = (1 + nr_cpus_node(node)) * |
| 3378 | cachep->batchcount + cachep->num; | 3396 | cachep->batchcount + cachep->num; |
| 3379 | cachep->nodelists[node] = l3; | 3397 | cachep->nodelists[node] = l3; |
| 3380 | } | 3398 | } |
| 3381 | return err; | 3399 | return err; |
| 3382 | fail: | 3400 | fail: |
| 3383 | err = -ENOMEM; | 3401 | err = -ENOMEM; |
| 3384 | return err; | 3402 | return err; |
| 3385 | } | 3403 | } |
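For scale, a minimal worked example of the free_limit formula that the hunk above sets in two places; the numbers are illustrative and not taken from this patch:

/*
 * Illustrative only: a node with 4 online CPUs, a cache tuned to
 * batchcount 60 and holding 30 objects per slab (cachep->num) keeps at
 * most (1 + 4) * 60 + 30 = 330 free objects on its per-node lists
 * before cache_reap() starts returning slabs to the page allocator.
 */
static unsigned int example_free_limit(void)
{
        int nr_cpus_on_node = 4;        /* assumed */
        int batchcount = 60;            /* assumed */
        int objects_per_slab = 30;      /* cachep->num, assumed */

        return (1 + nr_cpus_on_node) * batchcount + objects_per_slab;
}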
| @@ -3391,7 +3409,7 @@ struct ccupdate_struct { | |||
| 3391 | 3409 | ||
| 3392 | static void do_ccupdate_local(void *info) | 3410 | static void do_ccupdate_local(void *info) |
| 3393 | { | 3411 | { |
| 3394 | struct ccupdate_struct *new = (struct ccupdate_struct *)info; | 3412 | struct ccupdate_struct *new = info; |
| 3395 | struct array_cache *old; | 3413 | struct array_cache *old; |
| 3396 | 3414 | ||
| 3397 | check_irq_off(); | 3415 | check_irq_off(); |
| @@ -3401,16 +3419,17 @@ static void do_ccupdate_local(void *info) | |||
| 3401 | new->new[smp_processor_id()] = old; | 3419 | new->new[smp_processor_id()] = old; |
| 3402 | } | 3420 | } |
| 3403 | 3421 | ||
| 3404 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, | 3422 | /* Always called with the cache_chain_mutex held */ |
| 3405 | int shared) | 3423 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
| 3424 | int batchcount, int shared) | ||
| 3406 | { | 3425 | { |
| 3407 | struct ccupdate_struct new; | 3426 | struct ccupdate_struct new; |
| 3408 | int i, err; | 3427 | int i, err; |
| 3409 | 3428 | ||
| 3410 | memset(&new.new, 0, sizeof(new.new)); | 3429 | memset(&new.new, 0, sizeof(new.new)); |
| 3411 | for_each_online_cpu(i) { | 3430 | for_each_online_cpu(i) { |
| 3412 | new.new[i] = | 3431 | new.new[i] = alloc_arraycache(cpu_to_node(i), limit, |
| 3413 | alloc_arraycache(cpu_to_node(i), limit, batchcount); | 3432 | batchcount); |
| 3414 | if (!new.new[i]) { | 3433 | if (!new.new[i]) { |
| 3415 | for (i--; i >= 0; i--) | 3434 | for (i--; i >= 0; i--) |
| 3416 | kfree(new.new[i]); | 3435 | kfree(new.new[i]); |
| @@ -3419,14 +3438,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount | |||
| 3419 | } | 3438 | } |
| 3420 | new.cachep = cachep; | 3439 | new.cachep = cachep; |
| 3421 | 3440 | ||
| 3422 | smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); | 3441 | on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1); |
| 3423 | 3442 | ||
| 3424 | check_irq_on(); | 3443 | check_irq_on(); |
| 3425 | spin_lock(&cachep->spinlock); | ||
| 3426 | cachep->batchcount = batchcount; | 3444 | cachep->batchcount = batchcount; |
| 3427 | cachep->limit = limit; | 3445 | cachep->limit = limit; |
| 3428 | cachep->shared = shared; | 3446 | cachep->shared = shared; |
| 3429 | spin_unlock(&cachep->spinlock); | ||
| 3430 | 3447 | ||
| 3431 | for_each_online_cpu(i) { | 3448 | for_each_online_cpu(i) { |
| 3432 | struct array_cache *ccold = new.new[i]; | 3449 | struct array_cache *ccold = new.new[i]; |
| @@ -3447,15 +3464,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount | |||
| 3447 | return 0; | 3464 | return 0; |
| 3448 | } | 3465 | } |
| 3449 | 3466 | ||
| 3467 | /* Called with cache_chain_mutex held always */ | ||
| 3450 | static void enable_cpucache(struct kmem_cache *cachep) | 3468 | static void enable_cpucache(struct kmem_cache *cachep) |
| 3451 | { | 3469 | { |
| 3452 | int err; | 3470 | int err; |
| 3453 | int limit, shared; | 3471 | int limit, shared; |
| 3454 | 3472 | ||
| 3455 | /* The head array serves three purposes: | 3473 | /* |
| 3474 | * The head array serves three purposes: | ||
| 3456 | * - create a LIFO ordering, i.e. return objects that are cache-warm | 3475 | * - create a LIFO ordering, i.e. return objects that are cache-warm |
| 3457 | * - reduce the number of spinlock operations. | 3476 | * - reduce the number of spinlock operations. |
| 3458 | * - reduce the number of linked list operations on the slab and | 3477 | * - reduce the number of linked list operations on the slab and |
| 3459 | * bufctl chains: array operations are cheaper. | 3478 | * bufctl chains: array operations are cheaper. |
| 3460 | * The numbers are guessed, we should auto-tune as described by | 3479 | * The numbers are guessed, we should auto-tune as described by |
| 3461 | * Bonwick. | 3480 | * Bonwick. |
| @@ -3471,7 +3490,8 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
| 3471 | else | 3490 | else |
| 3472 | limit = 120; | 3491 | limit = 120; |
| 3473 | 3492 | ||
| 3474 | /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound | 3493 | /* |
| 3494 | * CPU bound tasks (e.g. network routing) can exhibit cpu bound | ||
| 3475 | * allocation behaviour: Most allocs on one cpu, most free operations | 3495 | * allocation behaviour: Most allocs on one cpu, most free operations |
| 3476 | * on another cpu. For these cases, an efficient object passing between | 3496 | * on another cpu. For these cases, an efficient object passing between |
| 3477 | * cpus is necessary. This is provided by a shared array. The array | 3497 | * cpus is necessary. This is provided by a shared array. The array |
| @@ -3486,9 +3506,9 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
| 3486 | #endif | 3506 | #endif |
| 3487 | 3507 | ||
| 3488 | #if DEBUG | 3508 | #if DEBUG |
| 3489 | /* With debugging enabled, large batchcount lead to excessively | 3509 | /* |
| 3490 | * long periods with disabled local interrupts. Limit the | 3510 | * With debugging enabled, large batchcount lead to excessively long |
| 3491 | * batchcount | 3511 | * periods with disabled local interrupts. Limit the batchcount |
| 3492 | */ | 3512 | */ |
| 3493 | if (limit > 32) | 3513 | if (limit > 32) |
| 3494 | limit = 32; | 3514 | limit = 32; |
| @@ -3499,23 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep) | |||
| 3499 | cachep->name, -err); | 3519 | cachep->name, -err); |
| 3500 | } | 3520 | } |
| 3501 | 3521 | ||
| 3502 | static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, | 3522 | /* |
| 3503 | int force, int node) | 3523 | * Drain an array if it contains any elements taking the l3 lock only if |
| 3524 | * necessary. Note that the l3 listlock also protects the array_cache | ||
| 3525 | * if drain_array() is used on the shared array. | ||
| 3526 | */ | ||
| 3527 | void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | ||
| 3528 | struct array_cache *ac, int force, int node) | ||
| 3504 | { | 3529 | { |
| 3505 | int tofree; | 3530 | int tofree; |
| 3506 | 3531 | ||
| 3507 | check_spinlock_acquired_node(cachep, node); | 3532 | if (!ac || !ac->avail) |
| 3533 | return; | ||
| 3508 | if (ac->touched && !force) { | 3534 | if (ac->touched && !force) { |
| 3509 | ac->touched = 0; | 3535 | ac->touched = 0; |
| 3510 | } else if (ac->avail) { | 3536 | } else { |
| 3511 | tofree = force ? ac->avail : (ac->limit + 4) / 5; | 3537 | spin_lock_irq(&l3->list_lock); |
| 3512 | if (tofree > ac->avail) { | 3538 | if (ac->avail) { |
| 3513 | tofree = (ac->avail + 1) / 2; | 3539 | tofree = force ? ac->avail : (ac->limit + 4) / 5; |
| 3540 | if (tofree > ac->avail) | ||
| 3541 | tofree = (ac->avail + 1) / 2; | ||
| 3542 | free_block(cachep, ac->entry, tofree, node); | ||
| 3543 | ac->avail -= tofree; | ||
| 3544 | memmove(ac->entry, &(ac->entry[tofree]), | ||
| 3545 | sizeof(void *) * ac->avail); | ||
| 3514 | } | 3546 | } |
| 3515 | free_block(cachep, ac->entry, tofree, node); | 3547 | spin_unlock_irq(&l3->list_lock); |
| 3516 | ac->avail -= tofree; | ||
| 3517 | memmove(ac->entry, &(ac->entry[tofree]), | ||
| 3518 | sizeof(void *) * ac->avail); | ||
| 3519 | } | 3548 | } |
| 3520 | } | 3549 | } |
| 3521 | 3550 | ||
| @@ -3528,13 +3557,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac | |||
| 3528 | * - clear the per-cpu caches for this CPU. | 3557 | * - clear the per-cpu caches for this CPU. |
| 3529 | * - return freeable pages to the main free memory pool. | 3558 | * - return freeable pages to the main free memory pool. |
| 3530 | * | 3559 | * |
| 3531 | * If we cannot acquire the cache chain mutex then just give up - we'll | 3560 | * If we cannot acquire the cache chain mutex then just give up - we'll try |
| 3532 | * try again on the next iteration. | 3561 | * again on the next iteration. |
| 3533 | */ | 3562 | */ |
| 3534 | static void cache_reap(void *unused) | 3563 | static void cache_reap(void *unused) |
| 3535 | { | 3564 | { |
| 3536 | struct list_head *walk; | 3565 | struct list_head *walk; |
| 3537 | struct kmem_list3 *l3; | 3566 | struct kmem_list3 *l3; |
| 3567 | int node = numa_node_id(); | ||
| 3538 | 3568 | ||
| 3539 | if (!mutex_trylock(&cache_chain_mutex)) { | 3569 | if (!mutex_trylock(&cache_chain_mutex)) { |
| 3540 | /* Give up. Setup the next iteration. */ | 3570 | /* Give up. Setup the next iteration. */ |
| @@ -3550,65 +3580,72 @@ static void cache_reap(void *unused) | |||
| 3550 | struct slab *slabp; | 3580 | struct slab *slabp; |
| 3551 | 3581 | ||
| 3552 | searchp = list_entry(walk, struct kmem_cache, next); | 3582 | searchp = list_entry(walk, struct kmem_cache, next); |
| 3553 | |||
| 3554 | if (searchp->flags & SLAB_NO_REAP) | ||
| 3555 | goto next; | ||
| 3556 | |||
| 3557 | check_irq_on(); | 3583 | check_irq_on(); |
| 3558 | 3584 | ||
| 3559 | l3 = searchp->nodelists[numa_node_id()]; | 3585 | /* |
| 3586 | * We only take the l3 lock if absolutely necessary and we | ||
| 3587 | * have established with reasonable certainty that | ||
| 3588 | * we can do some work if the lock was obtained. | ||
| 3589 | */ | ||
| 3590 | l3 = searchp->nodelists[node]; | ||
| 3591 | |||
| 3560 | reap_alien(searchp, l3); | 3592 | reap_alien(searchp, l3); |
| 3561 | spin_lock_irq(&l3->list_lock); | ||
| 3562 | 3593 | ||
| 3563 | drain_array_locked(searchp, cpu_cache_get(searchp), 0, | 3594 | drain_array(searchp, l3, cpu_cache_get(searchp), 0, node); |
| 3564 | numa_node_id()); | ||
| 3565 | 3595 | ||
| 3596 | /* | ||
| 3597 | * These are racy checks but it does not matter | ||
| 3598 | * if we skip one check or scan twice. | ||
| 3599 | */ | ||
| 3566 | if (time_after(l3->next_reap, jiffies)) | 3600 | if (time_after(l3->next_reap, jiffies)) |
| 3567 | goto next_unlock; | 3601 | goto next; |
| 3568 | 3602 | ||
| 3569 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3; | 3603 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3; |
| 3570 | 3604 | ||
| 3571 | if (l3->shared) | 3605 | drain_array(searchp, l3, l3->shared, 0, node); |
| 3572 | drain_array_locked(searchp, l3->shared, 0, | ||
| 3573 | numa_node_id()); | ||
| 3574 | 3606 | ||
| 3575 | if (l3->free_touched) { | 3607 | if (l3->free_touched) { |
| 3576 | l3->free_touched = 0; | 3608 | l3->free_touched = 0; |
| 3577 | goto next_unlock; | 3609 | goto next; |
| 3578 | } | 3610 | } |
| 3579 | 3611 | ||
| 3580 | tofree = | 3612 | tofree = (l3->free_limit + 5 * searchp->num - 1) / |
| 3581 | (l3->free_limit + 5 * searchp->num - | 3613 | (5 * searchp->num); |
| 3582 | 1) / (5 * searchp->num); | ||
| 3583 | do { | 3614 | do { |
| 3615 | /* | ||
| 3616 | * Do not lock if there are no free blocks. | ||
| 3617 | */ | ||
| 3618 | if (list_empty(&l3->slabs_free)) | ||
| 3619 | break; | ||
| 3620 | |||
| 3621 | spin_lock_irq(&l3->list_lock); | ||
| 3584 | p = l3->slabs_free.next; | 3622 | p = l3->slabs_free.next; |
| 3585 | if (p == &(l3->slabs_free)) | 3623 | if (p == &(l3->slabs_free)) { |
| 3624 | spin_unlock_irq(&l3->list_lock); | ||
| 3586 | break; | 3625 | break; |
| 3626 | } | ||
| 3587 | 3627 | ||
| 3588 | slabp = list_entry(p, struct slab, list); | 3628 | slabp = list_entry(p, struct slab, list); |
| 3589 | BUG_ON(slabp->inuse); | 3629 | BUG_ON(slabp->inuse); |
| 3590 | list_del(&slabp->list); | 3630 | list_del(&slabp->list); |
| 3591 | STATS_INC_REAPED(searchp); | 3631 | STATS_INC_REAPED(searchp); |
| 3592 | 3632 | ||
| 3593 | /* Safe to drop the lock. The slab is no longer | 3633 | /* |
| 3594 | * linked to the cache. | 3634 | * Safe to drop the lock. The slab is no longer linked |
| 3595 | * searchp cannot disappear, we hold | 3635 | * to the cache. searchp cannot disappear, we hold |
| 3596 | * cache_chain_lock | 3636 | * cache_chain_lock |
| 3597 | */ | 3637 | */ |
| 3598 | l3->free_objects -= searchp->num; | 3638 | l3->free_objects -= searchp->num; |
| 3599 | spin_unlock_irq(&l3->list_lock); | 3639 | spin_unlock_irq(&l3->list_lock); |
| 3600 | slab_destroy(searchp, slabp); | 3640 | slab_destroy(searchp, slabp); |
| 3601 | spin_lock_irq(&l3->list_lock); | ||
| 3602 | } while (--tofree > 0); | 3641 | } while (--tofree > 0); |
| 3603 | next_unlock: | 3642 | next: |
| 3604 | spin_unlock_irq(&l3->list_lock); | ||
| 3605 | next: | ||
| 3606 | cond_resched(); | 3643 | cond_resched(); |
| 3607 | } | 3644 | } |
| 3608 | check_irq_on(); | 3645 | check_irq_on(); |
| 3609 | mutex_unlock(&cache_chain_mutex); | 3646 | mutex_unlock(&cache_chain_mutex); |
| 3610 | next_reap_node(); | 3647 | next_reap_node(); |
| 3611 | /* Setup the next iteration */ | 3648 | /* Set up the next iteration */ |
| 3612 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3649 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); |
| 3613 | } | 3650 | } |
| 3614 | 3651 | ||
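A rough sketch of the reaping budget computed in the hunk above, continuing the illustrative numbers used earlier (none of them come from this patch):

static unsigned int example_reap_budget(void)
{
        unsigned int free_limit = 330;  /* e.g. (1 + 4 CPUs) * 60 + 30 */
        unsigned int num = 30;          /* objects per slab */

        /*
         * Same rounding-up division as cache_reap(): (330 + 149) / 150 == 3,
         * so at most three completely free slabs are destroyed per pass,
         * roughly a fifth of free_limit expressed in whole slabs.
         */
        return (free_limit + 5 * num - 1) / (5 * num);
}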
| @@ -3658,8 +3695,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
| 3658 | { | 3695 | { |
| 3659 | struct kmem_cache *cachep = p; | 3696 | struct kmem_cache *cachep = p; |
| 3660 | ++*pos; | 3697 | ++*pos; |
| 3661 | return cachep->next.next == &cache_chain ? NULL | 3698 | return cachep->next.next == &cache_chain ? |
| 3662 | : list_entry(cachep->next.next, struct kmem_cache, next); | 3699 | NULL : list_entry(cachep->next.next, struct kmem_cache, next); |
| 3663 | } | 3700 | } |
| 3664 | 3701 | ||
| 3665 | static void s_stop(struct seq_file *m, void *p) | 3702 | static void s_stop(struct seq_file *m, void *p) |
| @@ -3681,7 +3718,6 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3681 | int node; | 3718 | int node; |
| 3682 | struct kmem_list3 *l3; | 3719 | struct kmem_list3 *l3; |
| 3683 | 3720 | ||
| 3684 | spin_lock(&cachep->spinlock); | ||
| 3685 | active_objs = 0; | 3721 | active_objs = 0; |
| 3686 | num_slabs = 0; | 3722 | num_slabs = 0; |
| 3687 | for_each_online_node(node) { | 3723 | for_each_online_node(node) { |
| @@ -3748,7 +3784,9 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3748 | unsigned long node_frees = cachep->node_frees; | 3784 | unsigned long node_frees = cachep->node_frees; |
| 3749 | 3785 | ||
| 3750 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ | 3786 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ |
| 3751 | %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); | 3787 | %4lu %4lu %4lu %4lu", allocs, high, grown, |
| 3788 | reaped, errors, max_freeable, node_allocs, | ||
| 3789 | node_frees); | ||
| 3752 | } | 3790 | } |
| 3753 | /* cpu stats */ | 3791 | /* cpu stats */ |
| 3754 | { | 3792 | { |
| @@ -3762,7 +3800,6 @@ static int s_show(struct seq_file *m, void *p) | |||
| 3762 | } | 3800 | } |
| 3763 | #endif | 3801 | #endif |
| 3764 | seq_putc(m, '\n'); | 3802 | seq_putc(m, '\n'); |
| 3765 | spin_unlock(&cachep->spinlock); | ||
| 3766 | return 0; | 3803 | return 0; |
| 3767 | } | 3804 | } |
| 3768 | 3805 | ||
| @@ -3820,13 +3857,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
| 3820 | mutex_lock(&cache_chain_mutex); | 3857 | mutex_lock(&cache_chain_mutex); |
| 3821 | res = -EINVAL; | 3858 | res = -EINVAL; |
| 3822 | list_for_each(p, &cache_chain) { | 3859 | list_for_each(p, &cache_chain) { |
| 3823 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, | 3860 | struct kmem_cache *cachep; |
| 3824 | next); | ||
| 3825 | 3861 | ||
| 3862 | cachep = list_entry(p, struct kmem_cache, next); | ||
| 3826 | if (!strcmp(cachep->name, kbuf)) { | 3863 | if (!strcmp(cachep->name, kbuf)) { |
| 3827 | if (limit < 1 || | 3864 | if (limit < 1 || batchcount < 1 || |
| 3828 | batchcount < 1 || | 3865 | batchcount > limit || shared < 0) { |
| 3829 | batchcount > limit || shared < 0) { | ||
| 3830 | res = 0; | 3866 | res = 0; |
| 3831 | } else { | 3867 | } else { |
| 3832 | res = do_tune_cpucache(cachep, limit, | 3868 | res = do_tune_cpucache(cachep, limit, |
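The write path above parses a line of the form "<cache name> <limit> <batchcount> <shared>" and hands it to do_tune_cpucache(). A hypothetical userspace sketch of driving that interface; the cache name and values are made up, only the validation rules are taken from slabinfo_write():

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/slabinfo", "w");

        if (!f)
                return 1;
        /*
         * slabinfo_write() rejects limit < 1, batchcount < 1,
         * batchcount > limit and shared < 0.
         */
        fprintf(f, "dentry_cache 120 60 8\n");
        return fclose(f) ? 1 : 0;
}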
| @@ -209,19 +209,18 @@ int lru_add_drain_all(void) | |||
| 209 | */ | 209 | */ |
| 210 | void fastcall __page_cache_release(struct page *page) | 210 | void fastcall __page_cache_release(struct page *page) |
| 211 | { | 211 | { |
| 212 | unsigned long flags; | 212 | if (PageLRU(page)) { |
| 213 | struct zone *zone = page_zone(page); | 213 | unsigned long flags; |
| 214 | struct zone *zone = page_zone(page); | ||
| 214 | 215 | ||
| 215 | spin_lock_irqsave(&zone->lru_lock, flags); | 216 | spin_lock_irqsave(&zone->lru_lock, flags); |
| 216 | if (TestClearPageLRU(page)) | 217 | BUG_ON(!PageLRU(page)); |
| 218 | __ClearPageLRU(page); | ||
| 217 | del_page_from_lru(zone, page); | 219 | del_page_from_lru(zone, page); |
| 218 | if (page_count(page) != 0) | 220 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 219 | page = NULL; | 221 | } |
| 220 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 222 | free_hot_page(page); |
| 221 | if (page) | ||
| 222 | free_hot_page(page); | ||
| 223 | } | 223 | } |
| 224 | |||
| 225 | EXPORT_SYMBOL(__page_cache_release); | 224 | EXPORT_SYMBOL(__page_cache_release); |
| 226 | 225 | ||
| 227 | /* | 226 | /* |
| @@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 245 | pagevec_init(&pages_to_free, cold); | 244 | pagevec_init(&pages_to_free, cold); |
| 246 | for (i = 0; i < nr; i++) { | 245 | for (i = 0; i < nr; i++) { |
| 247 | struct page *page = pages[i]; | 246 | struct page *page = pages[i]; |
| 248 | struct zone *pagezone; | ||
| 249 | 247 | ||
| 250 | if (unlikely(PageCompound(page))) { | 248 | if (unlikely(PageCompound(page))) { |
| 251 | if (zone) { | 249 | if (zone) { |
| @@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 259 | if (!put_page_testzero(page)) | 257 | if (!put_page_testzero(page)) |
| 260 | continue; | 258 | continue; |
| 261 | 259 | ||
| 262 | pagezone = page_zone(page); | 260 | if (PageLRU(page)) { |
| 263 | if (pagezone != zone) { | 261 | struct zone *pagezone = page_zone(page); |
| 264 | if (zone) | 262 | if (pagezone != zone) { |
| 265 | spin_unlock_irq(&zone->lru_lock); | 263 | if (zone) |
| 266 | zone = pagezone; | 264 | spin_unlock_irq(&zone->lru_lock); |
| 267 | spin_lock_irq(&zone->lru_lock); | 265 | zone = pagezone; |
| 268 | } | 266 | spin_lock_irq(&zone->lru_lock); |
| 269 | if (TestClearPageLRU(page)) | 267 | } |
| 268 | BUG_ON(!PageLRU(page)); | ||
| 269 | __ClearPageLRU(page); | ||
| 270 | del_page_from_lru(zone, page); | 270 | del_page_from_lru(zone, page); |
| 271 | if (page_count(page) == 0) { | 271 | } |
| 272 | if (!pagevec_add(&pages_to_free, page)) { | 272 | |
| 273 | if (!pagevec_add(&pages_to_free, page)) { | ||
| 274 | if (zone) { | ||
| 273 | spin_unlock_irq(&zone->lru_lock); | 275 | spin_unlock_irq(&zone->lru_lock); |
| 274 | __pagevec_free(&pages_to_free); | 276 | zone = NULL; |
| 275 | pagevec_reinit(&pages_to_free); | ||
| 276 | zone = NULL; /* No lock is held */ | ||
| 277 | } | 277 | } |
| 278 | } | 278 | __pagevec_free(&pages_to_free); |
| 279 | pagevec_reinit(&pages_to_free); | ||
| 280 | } | ||
| 279 | } | 281 | } |
| 280 | if (zone) | 282 | if (zone) |
| 281 | spin_unlock_irq(&zone->lru_lock); | 283 | spin_unlock_irq(&zone->lru_lock); |
| @@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec) | |||
| 343 | zone = pagezone; | 345 | zone = pagezone; |
| 344 | spin_lock_irq(&zone->lru_lock); | 346 | spin_lock_irq(&zone->lru_lock); |
| 345 | } | 347 | } |
| 346 | if (TestSetPageLRU(page)) | 348 | BUG_ON(PageLRU(page)); |
| 347 | BUG(); | 349 | SetPageLRU(page); |
| 348 | add_page_to_inactive_list(zone, page); | 350 | add_page_to_inactive_list(zone, page); |
| 349 | } | 351 | } |
| 350 | if (zone) | 352 | if (zone) |
| @@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec) | |||
| 370 | zone = pagezone; | 372 | zone = pagezone; |
| 371 | spin_lock_irq(&zone->lru_lock); | 373 | spin_lock_irq(&zone->lru_lock); |
| 372 | } | 374 | } |
| 373 | if (TestSetPageLRU(page)) | 375 | BUG_ON(PageLRU(page)); |
| 374 | BUG(); | 376 | SetPageLRU(page); |
| 375 | if (TestSetPageActive(page)) | 377 | BUG_ON(PageActive(page)); |
| 376 | BUG(); | 378 | SetPageActive(page); |
| 377 | add_page_to_active_list(zone, page); | 379 | add_page_to_active_list(zone, page); |
| 378 | } | 380 | } |
| 379 | if (zone) | 381 | if (zone) |
diff --git a/mm/swap_state.c b/mm/swap_state.c index db8a3d3e1636..d7af296833fc 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/buffer_head.h> | 15 | #include <linux/buffer_head.h> |
| 16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
| 17 | #include <linux/pagevec.h> | 17 | #include <linux/pagevec.h> |
| 18 | #include <linux/migrate.h> | ||
| 18 | 19 | ||
| 19 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
| 20 | 21 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f9cf0d073b8..39aa9d129612 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
| 45 | 45 | ||
| 46 | struct swap_list_t swap_list = {-1, -1}; | 46 | struct swap_list_t swap_list = {-1, -1}; |
| 47 | 47 | ||
| 48 | struct swap_info_struct swap_info[MAX_SWAPFILES]; | 48 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; |
| 49 | 49 | ||
| 50 | static DEFINE_MUTEX(swapon_mutex); | 50 | static DEFINE_MUTEX(swapon_mutex); |
| 51 | 51 | ||
| @@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si) | |||
| 116 | last_in_cluster = offset + SWAPFILE_CLUSTER; | 116 | last_in_cluster = offset + SWAPFILE_CLUSTER; |
| 117 | else if (offset == last_in_cluster) { | 117 | else if (offset == last_in_cluster) { |
| 118 | spin_lock(&swap_lock); | 118 | spin_lock(&swap_lock); |
| 119 | si->cluster_next = offset-SWAPFILE_CLUSTER-1; | 119 | si->cluster_next = offset-SWAPFILE_CLUSTER+1; |
| 120 | goto cluster; | 120 | goto cluster; |
| 121 | } | 121 | } |
| 122 | if (unlikely(--latency_ration < 0)) { | 122 | if (unlikely(--latency_ration < 0)) { |
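A worked example of the off-by-one corrected just above, assuming SWAPFILE_CLUSTER is 256 (the offsets are illustrative):

/*
 * If slot 1000 is in use and slots 1001..1256 are free, the scan sets
 * last_in_cluster = 1000 + 256 = 1256 and later reaches
 * offset == last_in_cluster with a full run of free slots behind it.
 * The first free slot of that run is 1256 - 256 + 1 == 1001; the old
 * expression 1256 - 256 - 1 == 999 pointed below the run, at an
 * in-use slot.
 */
static unsigned long example_cluster_start(unsigned long offset)
{
        const unsigned long swapfile_cluster = 256;     /* illustrative */

        return offset - swapfile_cluster + 1;
}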
| @@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry) | |||
| 417 | } | 417 | } |
| 418 | } | 418 | } |
| 419 | 419 | ||
| 420 | #ifdef CONFIG_SOFTWARE_SUSPEND | ||
| 421 | /* | ||
| 422 | * Find the swap type that corresponds to given device (if any) | ||
| 423 | * | ||
| 424 | * This is needed for software suspend and is done in such a way that inode | ||
| 425 | * aliasing is allowed. | ||
| 426 | */ | ||
| 427 | int swap_type_of(dev_t device) | ||
| 428 | { | ||
| 429 | int i; | ||
| 430 | |||
| 431 | spin_lock(&swap_lock); | ||
| 432 | for (i = 0; i < nr_swapfiles; i++) { | ||
| 433 | struct inode *inode; | ||
| 434 | |||
| 435 | if (!(swap_info[i].flags & SWP_WRITEOK)) | ||
| 436 | continue; | ||
| 437 | if (!device) { | ||
| 438 | spin_unlock(&swap_lock); | ||
| 439 | return i; | ||
| 440 | } | ||
| 441 | inode = swap_info->swap_file->f_dentry->d_inode; | ||
| 442 | if (S_ISBLK(inode->i_mode) && | ||
| 443 | device == MKDEV(imajor(inode), iminor(inode))) { | ||
| 444 | spin_unlock(&swap_lock); | ||
| 445 | return i; | ||
| 446 | } | ||
| 447 | } | ||
| 448 | spin_unlock(&swap_lock); | ||
| 449 | return -ENODEV; | ||
| 450 | } | ||
| 451 | |||
| 452 | /* | ||
| 453 | * Return either the total number of swap pages of given type, or the number | ||
| 454 | * of free pages of that type (depending on @free) | ||
| 455 | * | ||
| 456 | * This is needed for software suspend | ||
| 457 | */ | ||
| 458 | unsigned int count_swap_pages(int type, int free) | ||
| 459 | { | ||
| 460 | unsigned int n = 0; | ||
| 461 | |||
| 462 | if (type < nr_swapfiles) { | ||
| 463 | spin_lock(&swap_lock); | ||
| 464 | if (swap_info[type].flags & SWP_WRITEOK) { | ||
| 465 | n = swap_info[type].pages; | ||
| 466 | if (free) | ||
| 467 | n -= swap_info[type].inuse_pages; | ||
| 468 | } | ||
| 469 | spin_unlock(&swap_lock); | ||
| 470 | } | ||
| 471 | return n; | ||
| 472 | } | ||
| 473 | #endif | ||
| 474 | |||
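A hypothetical caller of the two helpers added above; the real software-suspend call sites are not part of this hunk, and the function and parameter names below are made up:

/* Pick the swap area backed by @resume_device and check its free space. */
static int example_pick_resume_swap(dev_t resume_device,
                                    unsigned int pages_needed)
{
        int type = swap_type_of(resume_device); /* -ENODEV if no match */

        if (type < 0)
                return type;
        if (count_swap_pages(type, 1) < pages_needed)   /* 1 => free pages */
                return -ENOSPC;
        return type;
}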
| 420 | /* | 475 | /* |
| 421 | * No need to decide whether this PTE shares the swap entry with others, | 476 | * No need to decide whether this PTE shares the swap entry with others, |
| 422 | * just let do_wp_page work it out if a write is requested later - to | 477 | * just let do_wp_page work it out if a write is requested later - to |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 4fe7e3aa02e2..fd572bbdc9f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -33,39 +33,21 @@ | |||
| 33 | #include <linux/cpuset.h> | 33 | #include <linux/cpuset.h> |
| 34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
| 35 | #include <linux/rwsem.h> | 35 | #include <linux/rwsem.h> |
| 36 | #include <linux/delay.h> | ||
| 36 | 37 | ||
| 37 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
| 38 | #include <asm/div64.h> | 39 | #include <asm/div64.h> |
| 39 | 40 | ||
| 40 | #include <linux/swapops.h> | 41 | #include <linux/swapops.h> |
| 41 | 42 | ||
| 42 | /* possible outcome of pageout() */ | 43 | #include "internal.h" |
| 43 | typedef enum { | ||
| 44 | /* failed to write page out, page is locked */ | ||
| 45 | PAGE_KEEP, | ||
| 46 | /* move page to the active list, page is locked */ | ||
| 47 | PAGE_ACTIVATE, | ||
| 48 | /* page has been sent to the disk successfully, page is unlocked */ | ||
| 49 | PAGE_SUCCESS, | ||
| 50 | /* page is clean and locked */ | ||
| 51 | PAGE_CLEAN, | ||
| 52 | } pageout_t; | ||
| 53 | 44 | ||
| 54 | struct scan_control { | 45 | struct scan_control { |
| 55 | /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ | ||
| 56 | unsigned long nr_to_scan; | ||
| 57 | |||
| 58 | /* Incremented by the number of inactive pages that were scanned */ | 46 | /* Incremented by the number of inactive pages that were scanned */ |
| 59 | unsigned long nr_scanned; | 47 | unsigned long nr_scanned; |
| 60 | 48 | ||
| 61 | /* Incremented by the number of pages reclaimed */ | ||
| 62 | unsigned long nr_reclaimed; | ||
| 63 | |||
| 64 | unsigned long nr_mapped; /* From page_state */ | 49 | unsigned long nr_mapped; /* From page_state */ |
| 65 | 50 | ||
| 66 | /* Ask shrink_caches, or shrink_zone to scan at this priority */ | ||
| 67 | unsigned int priority; | ||
| 68 | |||
| 69 | /* This context's GFP mask */ | 51 | /* This context's GFP mask */ |
| 70 | gfp_t gfp_mask; | 52 | gfp_t gfp_mask; |
| 71 | 53 | ||
| @@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker); | |||
| 183 | * | 165 | * |
| 184 | * Returns the number of slab objects which we shrunk. | 166 | * Returns the number of slab objects which we shrunk. |
| 185 | */ | 167 | */ |
| 186 | int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) | 168 | unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, |
| 169 | unsigned long lru_pages) | ||
| 187 | { | 170 | { |
| 188 | struct shrinker *shrinker; | 171 | struct shrinker *shrinker; |
| 189 | int ret = 0; | 172 | unsigned long ret = 0; |
| 190 | 173 | ||
| 191 | if (scanned == 0) | 174 | if (scanned == 0) |
| 192 | scanned = SWAP_CLUSTER_MAX; | 175 | scanned = SWAP_CLUSTER_MAX; |
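For context, a hedged sketch of how a cache hooks into the scan path above, assuming the set_shrinker()/shrinker_t interface of this kernel era; the callback and its bookkeeping are hypothetical:

static unsigned long example_objects;           /* hypothetical cache size */
static struct shrinker *example_shrinker;

/*
 * shrinker_t convention: report the object count when nr_to_scan is 0,
 * otherwise try to drop up to nr_to_scan objects and return how many
 * remain so shrink_slab() can keep its ratios.
 */
static int example_shrink(int nr_to_scan, gfp_t gfp_mask)
{
        if (nr_to_scan && example_objects)
                example_objects -= min_t(unsigned long, nr_to_scan,
                                         example_objects);
        return example_objects;
}

static void example_register(void)
{
        example_shrinker = set_shrinker(DEFAULT_SEEKS, example_shrink);
}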
| @@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping, | |||
| 306 | } | 289 | } |
| 307 | 290 | ||
| 308 | /* | 291 | /* |
| 309 | * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). | 292 | * pageout is called by shrink_page_list() for each dirty page. |
| 293 | * Calls ->writepage(). | ||
| 310 | */ | 294 | */ |
| 311 | static pageout_t pageout(struct page *page, struct address_space *mapping) | 295 | pageout_t pageout(struct page *page, struct address_space *mapping) |
| 312 | { | 296 | { |
| 313 | /* | 297 | /* |
| 314 | * If the page is dirty, only perform writeback if that write | 298 | * If the page is dirty, only perform writeback if that write |
| @@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
| 376 | return PAGE_CLEAN; | 360 | return PAGE_CLEAN; |
| 377 | } | 361 | } |
| 378 | 362 | ||
| 379 | static int remove_mapping(struct address_space *mapping, struct page *page) | 363 | int remove_mapping(struct address_space *mapping, struct page *page) |
| 380 | { | 364 | { |
| 381 | if (!mapping) | 365 | if (!mapping) |
| 382 | return 0; /* truncate got there first */ | 366 | return 0; /* truncate got there first */ |
| @@ -414,14 +398,15 @@ cannot_free: | |||
| 414 | } | 398 | } |
| 415 | 399 | ||
| 416 | /* | 400 | /* |
| 417 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed | 401 | * shrink_page_list() returns the number of reclaimed pages |
| 418 | */ | 402 | */ |
| 419 | static int shrink_list(struct list_head *page_list, struct scan_control *sc) | 403 | static unsigned long shrink_page_list(struct list_head *page_list, |
| 404 | struct scan_control *sc) | ||
| 420 | { | 405 | { |
| 421 | LIST_HEAD(ret_pages); | 406 | LIST_HEAD(ret_pages); |
| 422 | struct pagevec freed_pvec; | 407 | struct pagevec freed_pvec; |
| 423 | int pgactivate = 0; | 408 | int pgactivate = 0; |
| 424 | int reclaimed = 0; | 409 | unsigned long nr_reclaimed = 0; |
| 425 | 410 | ||
| 426 | cond_resched(); | 411 | cond_resched(); |
| 427 | 412 | ||
| @@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 464 | * Anonymous process memory has backing store? | 449 | * Anonymous process memory has backing store? |
| 465 | * Try to allocate it some swap space here. | 450 | * Try to allocate it some swap space here. |
| 466 | */ | 451 | */ |
| 467 | if (PageAnon(page) && !PageSwapCache(page)) { | 452 | if (PageAnon(page) && !PageSwapCache(page)) |
| 468 | if (!sc->may_swap) | ||
| 469 | goto keep_locked; | ||
| 470 | if (!add_to_swap(page, GFP_ATOMIC)) | 453 | if (!add_to_swap(page, GFP_ATOMIC)) |
| 471 | goto activate_locked; | 454 | goto activate_locked; |
| 472 | } | ||
| 473 | #endif /* CONFIG_SWAP */ | 455 | #endif /* CONFIG_SWAP */ |
| 474 | 456 | ||
| 475 | mapping = page_mapping(page); | 457 | mapping = page_mapping(page); |
| @@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 481 | * processes. Try to unmap it here. | 463 | * processes. Try to unmap it here. |
| 482 | */ | 464 | */ |
| 483 | if (page_mapped(page) && mapping) { | 465 | if (page_mapped(page) && mapping) { |
| 484 | /* | ||
| 485 | * No unmapping if we do not swap | ||
| 486 | */ | ||
| 487 | if (!sc->may_swap) | ||
| 488 | goto keep_locked; | ||
| 489 | |||
| 490 | switch (try_to_unmap(page, 0)) { | 466 | switch (try_to_unmap(page, 0)) { |
| 491 | case SWAP_FAIL: | 467 | case SWAP_FAIL: |
| 492 | goto activate_locked; | 468 | goto activate_locked; |
| @@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
| 561 | 537 | ||
| 562 | free_it: | 538 | free_it: |
| 563 | unlock_page(page); | 539 | unlock_page(page); |
| 564 | reclaimed++; | 540 | nr_reclaimed++; |
| 565 | if (!pagevec_add(&freed_pvec, page)) | 541 | if (!pagevec_add(&freed_pvec, page)) |
| 566 | __pagevec_release_nonlru(&freed_pvec); | 542 | __pagevec_release_nonlru(&freed_pvec); |
| 567 | continue; | 543 | continue; |
| @@ -579,483 +555,8 @@ keep: | |||
| 579 | if (pagevec_count(&freed_pvec)) | 555 | if (pagevec_count(&freed_pvec)) |
| 580 | __pagevec_release_nonlru(&freed_pvec); | 556 | __pagevec_release_nonlru(&freed_pvec); |
| 581 | mod_page_state(pgactivate, pgactivate); | 557 | mod_page_state(pgactivate, pgactivate); |
| 582 | sc->nr_reclaimed += reclaimed; | 558 | return nr_reclaimed; |
| 583 | return reclaimed; | ||
| 584 | } | ||
| 585 | |||
| 586 | #ifdef CONFIG_MIGRATION | ||
| 587 | static inline void move_to_lru(struct page *page) | ||
| 588 | { | ||
| 589 | list_del(&page->lru); | ||
| 590 | if (PageActive(page)) { | ||
| 591 | /* | ||
| 592 | * lru_cache_add_active checks that | ||
| 593 | * the PG_active bit is off. | ||
| 594 | */ | ||
| 595 | ClearPageActive(page); | ||
| 596 | lru_cache_add_active(page); | ||
| 597 | } else { | ||
| 598 | lru_cache_add(page); | ||
| 599 | } | ||
| 600 | put_page(page); | ||
| 601 | } | ||
| 602 | |||
| 603 | /* | ||
| 604 | * Add isolated pages on the list back to the LRU. | ||
| 605 | * | ||
| 606 | * returns the number of pages put back. | ||
| 607 | */ | ||
| 608 | int putback_lru_pages(struct list_head *l) | ||
| 609 | { | ||
| 610 | struct page *page; | ||
| 611 | struct page *page2; | ||
| 612 | int count = 0; | ||
| 613 | |||
| 614 | list_for_each_entry_safe(page, page2, l, lru) { | ||
| 615 | move_to_lru(page); | ||
| 616 | count++; | ||
| 617 | } | ||
| 618 | return count; | ||
| 619 | } | ||
| 620 | |||
| 621 | /* | ||
| 622 | * Non migratable page | ||
| 623 | */ | ||
| 624 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
| 625 | { | ||
| 626 | return -EIO; | ||
| 627 | } | ||
| 628 | EXPORT_SYMBOL(fail_migrate_page); | ||
| 629 | |||
| 630 | /* | ||
| 631 | * swapout a single page | ||
| 632 | * page is locked upon entry, unlocked on exit | ||
| 633 | */ | ||
| 634 | static int swap_page(struct page *page) | ||
| 635 | { | ||
| 636 | struct address_space *mapping = page_mapping(page); | ||
| 637 | |||
| 638 | if (page_mapped(page) && mapping) | ||
| 639 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | ||
| 640 | goto unlock_retry; | ||
| 641 | |||
| 642 | if (PageDirty(page)) { | ||
| 643 | /* Page is dirty, try to write it out here */ | ||
| 644 | switch(pageout(page, mapping)) { | ||
| 645 | case PAGE_KEEP: | ||
| 646 | case PAGE_ACTIVATE: | ||
| 647 | goto unlock_retry; | ||
| 648 | |||
| 649 | case PAGE_SUCCESS: | ||
| 650 | goto retry; | ||
| 651 | |||
| 652 | case PAGE_CLEAN: | ||
| 653 | ; /* try to free the page below */ | ||
| 654 | } | ||
| 655 | } | ||
| 656 | |||
| 657 | if (PagePrivate(page)) { | ||
| 658 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
| 659 | (!mapping && page_count(page) == 1)) | ||
| 660 | goto unlock_retry; | ||
| 661 | } | ||
| 662 | |||
| 663 | if (remove_mapping(mapping, page)) { | ||
| 664 | /* Success */ | ||
| 665 | unlock_page(page); | ||
| 666 | return 0; | ||
| 667 | } | ||
| 668 | |||
| 669 | unlock_retry: | ||
| 670 | unlock_page(page); | ||
| 671 | |||
| 672 | retry: | ||
| 673 | return -EAGAIN; | ||
| 674 | } | ||
| 675 | EXPORT_SYMBOL(swap_page); | ||
| 676 | |||
| 677 | /* | ||
| 678 | * Page migration was first developed in the context of the memory hotplug | ||
| 679 | * project. The main authors of the migration code are: | ||
| 680 | * | ||
| 681 | * IWAMOTO Toshihiro <iwamoto@valinux.co.jp> | ||
| 682 | * Hirokazu Takahashi <taka@valinux.co.jp> | ||
| 683 | * Dave Hansen <haveblue@us.ibm.com> | ||
| 684 | * Christoph Lameter <clameter@sgi.com> | ||
| 685 | */ | ||
| 686 | |||
| 687 | /* | ||
| 688 | * Remove references for a page and establish the new page with the correct | ||
| 689 | * basic settings to be able to stop accesses to the page. | ||
| 690 | */ | ||
| 691 | int migrate_page_remove_references(struct page *newpage, | ||
| 692 | struct page *page, int nr_refs) | ||
| 693 | { | ||
| 694 | struct address_space *mapping = page_mapping(page); | ||
| 695 | struct page **radix_pointer; | ||
| 696 | |||
| 697 | /* | ||
| 698 | * Avoid doing any of the following work if the page count | ||
| 699 | * indicates that the page is in use or truncate has removed | ||
| 700 | * the page. | ||
| 701 | */ | ||
| 702 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
| 703 | return -EAGAIN; | ||
| 704 | |||
| 705 | /* | ||
| 706 | * Establish swap ptes for anonymous pages or destroy pte | ||
| 707 | * maps for files. | ||
| 708 | * | ||
| 709 | * In order to reestablish file backed mappings the fault handlers | ||
| 710 | * will take the radix tree_lock which may then be used to stop | ||
| 711 | * processses from accessing this page until the new page is ready. | ||
| 712 | * | ||
| 713 | * A process accessing via a swap pte (an anonymous page) will take a | ||
| 714 | * page_lock on the old page which will block the process until the | ||
| 715 | * migration attempt is complete. At that time the PageSwapCache bit | ||
| 716 | * will be examined. If the page was migrated then the PageSwapCache | ||
| 717 | * bit will be clear and the operation to retrieve the page will be | ||
| 718 | * retried which will find the new page in the radix tree. Then a new | ||
| 719 | * direct mapping may be generated based on the radix tree contents. | ||
| 720 | * | ||
| 721 | * If the page was not migrated then the PageSwapCache bit | ||
| 722 | * is still set and the operation may continue. | ||
| 723 | */ | ||
| 724 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
| 725 | /* A vma has VM_LOCKED set -> Permanent failure */ | ||
| 726 | return -EPERM; | ||
| 727 | |||
| 728 | /* | ||
| 729 | * Give up if we were unable to remove all mappings. | ||
| 730 | */ | ||
| 731 | if (page_mapcount(page)) | ||
| 732 | return -EAGAIN; | ||
| 733 | |||
| 734 | write_lock_irq(&mapping->tree_lock); | ||
| 735 | |||
| 736 | radix_pointer = (struct page **)radix_tree_lookup_slot( | ||
| 737 | &mapping->page_tree, | ||
| 738 | page_index(page)); | ||
| 739 | |||
| 740 | if (!page_mapping(page) || page_count(page) != nr_refs || | ||
| 741 | *radix_pointer != page) { | ||
| 742 | write_unlock_irq(&mapping->tree_lock); | ||
| 743 | return -EAGAIN; | ||
| 744 | } | ||
| 745 | |||
| 746 | /* | ||
| 747 | * Now we know that no one else is looking at the page. | ||
| 748 | * | ||
| 749 | * Certain minimal information about a page must be available | ||
| 750 | * in order for other subsystems to properly handle the page if they | ||
| 751 | * find it through the radix tree update before we are finished | ||
| 752 | * copying the page. | ||
| 753 | */ | ||
| 754 | get_page(newpage); | ||
| 755 | newpage->index = page->index; | ||
| 756 | newpage->mapping = page->mapping; | ||
| 757 | if (PageSwapCache(page)) { | ||
| 758 | SetPageSwapCache(newpage); | ||
| 759 | set_page_private(newpage, page_private(page)); | ||
| 760 | } | ||
| 761 | |||
| 762 | *radix_pointer = newpage; | ||
| 763 | __put_page(page); | ||
| 764 | write_unlock_irq(&mapping->tree_lock); | ||
| 765 | |||
| 766 | return 0; | ||
| 767 | } | ||
| 768 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
| 769 | |||
| 770 | /* | ||
| 771 | * Copy the page to its new location | ||
| 772 | */ | ||
| 773 | void migrate_page_copy(struct page *newpage, struct page *page) | ||
| 774 | { | ||
| 775 | copy_highpage(newpage, page); | ||
| 776 | |||
| 777 | if (PageError(page)) | ||
| 778 | SetPageError(newpage); | ||
| 779 | if (PageReferenced(page)) | ||
| 780 | SetPageReferenced(newpage); | ||
| 781 | if (PageUptodate(page)) | ||
| 782 | SetPageUptodate(newpage); | ||
| 783 | if (PageActive(page)) | ||
| 784 | SetPageActive(newpage); | ||
| 785 | if (PageChecked(page)) | ||
| 786 | SetPageChecked(newpage); | ||
| 787 | if (PageMappedToDisk(page)) | ||
| 788 | SetPageMappedToDisk(newpage); | ||
| 789 | |||
| 790 | if (PageDirty(page)) { | ||
| 791 | clear_page_dirty_for_io(page); | ||
| 792 | set_page_dirty(newpage); | ||
| 793 | } | ||
| 794 | |||
| 795 | ClearPageSwapCache(page); | ||
| 796 | ClearPageActive(page); | ||
| 797 | ClearPagePrivate(page); | ||
| 798 | set_page_private(page, 0); | ||
| 799 | page->mapping = NULL; | ||
| 800 | |||
| 801 | /* | ||
| 802 | * If any waiters have accumulated on the new page then | ||
| 803 | * wake them up. | ||
| 804 | */ | ||
| 805 | if (PageWriteback(newpage)) | ||
| 806 | end_page_writeback(newpage); | ||
| 807 | } | ||
| 808 | EXPORT_SYMBOL(migrate_page_copy); | ||
| 809 | |||
| 810 | /* | ||
| 811 | * Common logic to directly migrate a single page suitable for | ||
| 812 | * pages that do not use PagePrivate. | ||
| 813 | * | ||
| 814 | * Pages are locked upon entry and exit. | ||
| 815 | */ | ||
| 816 | int migrate_page(struct page *newpage, struct page *page) | ||
| 817 | { | ||
| 818 | int rc; | ||
| 819 | |||
| 820 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | ||
| 821 | |||
| 822 | rc = migrate_page_remove_references(newpage, page, 2); | ||
| 823 | |||
| 824 | if (rc) | ||
| 825 | return rc; | ||
| 826 | |||
| 827 | migrate_page_copy(newpage, page); | ||
| 828 | |||
| 829 | /* | ||
| 830 | * Remove auxiliary swap entries and replace | ||
| 831 | * them with real ptes. | ||
| 832 | * | ||
| 833 | * Note that a real pte entry will allow processes that are not | ||
| 834 | * waiting on the page lock to use the new page via the page tables | ||
| 835 | * before the new page is unlocked. | ||
| 836 | */ | ||
| 837 | remove_from_swap(newpage); | ||
| 838 | return 0; | ||
| 839 | } | 559 | } |
| 840 | EXPORT_SYMBOL(migrate_page); | ||
| 841 | |||
| 842 | /* | ||
| 843 | * migrate_pages | ||
| 844 | * | ||
| 845 | * Two lists are passed to this function. The first list | ||
| 846 | * contains the pages isolated from the LRU to be migrated. | ||
| 847 | * The second list contains new pages that the pages isolated | ||
| 848 | * can be moved to. If the second list is NULL then all | ||
| 849 | * pages are swapped out. | ||
| 850 | * | ||
| 851 | * The function returns after 10 attempts or if no pages | ||
| 852 | * are movable anymore because to has become empty | ||
| 853 | * or no retryable pages exist anymore. | ||
| 854 | * | ||
| 855 | * Return: Number of pages not migrated when "to" ran empty. | ||
| 856 | */ | ||
| 857 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
| 858 | struct list_head *moved, struct list_head *failed) | ||
| 859 | { | ||
| 860 | int retry; | ||
| 861 | int nr_failed = 0; | ||
| 862 | int pass = 0; | ||
| 863 | struct page *page; | ||
| 864 | struct page *page2; | ||
| 865 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
| 866 | int rc; | ||
| 867 | |||
| 868 | if (!swapwrite) | ||
| 869 | current->flags |= PF_SWAPWRITE; | ||
| 870 | |||
| 871 | redo: | ||
| 872 | retry = 0; | ||
| 873 | |||
| 874 | list_for_each_entry_safe(page, page2, from, lru) { | ||
| 875 | struct page *newpage = NULL; | ||
| 876 | struct address_space *mapping; | ||
| 877 | |||
| 878 | cond_resched(); | ||
| 879 | |||
| 880 | rc = 0; | ||
| 881 | if (page_count(page) == 1) | ||
| 882 | /* page was freed from under us. So we are done. */ | ||
| 883 | goto next; | ||
| 884 | |||
| 885 | if (to && list_empty(to)) | ||
| 886 | break; | ||
| 887 | |||
| 888 | /* | ||
| 889 | * Skip locked pages during the first two passes to give the | ||
| 890 | * functions holding the lock time to release the page. Later we | ||
| 891 | * use lock_page() to have a higher chance of acquiring the | ||
| 892 | * lock. | ||
| 893 | */ | ||
| 894 | rc = -EAGAIN; | ||
| 895 | if (pass > 2) | ||
| 896 | lock_page(page); | ||
| 897 | else | ||
| 898 | if (TestSetPageLocked(page)) | ||
| 899 | goto next; | ||
| 900 | |||
| 901 | /* | ||
| 902 | * Only wait on writeback if we have already done a pass where | ||
| 903 | * we we may have triggered writeouts for lots of pages. | ||
| 904 | */ | ||
| 905 | if (pass > 0) { | ||
| 906 | wait_on_page_writeback(page); | ||
| 907 | } else { | ||
| 908 | if (PageWriteback(page)) | ||
| 909 | goto unlock_page; | ||
| 910 | } | ||
| 911 | |||
| 912 | /* | ||
| 913 | * Anonymous pages must have swap cache references otherwise | ||
| 914 | * the information contained in the page maps cannot be | ||
| 915 | * preserved. | ||
| 916 | */ | ||
| 917 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
| 918 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
| 919 | rc = -ENOMEM; | ||
| 920 | goto unlock_page; | ||
| 921 | } | ||
| 922 | } | ||
| 923 | |||
| 924 | if (!to) { | ||
| 925 | rc = swap_page(page); | ||
| 926 | goto next; | ||
| 927 | } | ||
| 928 | |||
| 929 | newpage = lru_to_page(to); | ||
| 930 | lock_page(newpage); | ||
| 931 | |||
| 932 | /* | ||
| 933 | * Pages are properly locked and writeback is complete. | ||
| 934 | * Try to migrate the page. | ||
| 935 | */ | ||
| 936 | mapping = page_mapping(page); | ||
| 937 | if (!mapping) | ||
| 938 | goto unlock_both; | ||
| 939 | |||
| 940 | if (mapping->a_ops->migratepage) { | ||
| 941 | /* | ||
| 942 | * Most pages have a mapping and most filesystems | ||
| 943 | * should provide a migration function. Anonymous | ||
| 944 | * pages are part of swap space which also has its | ||
| 945 | * own migration function. This is the most common | ||
| 946 | * path for page migration. | ||
| 947 | */ | ||
| 948 | rc = mapping->a_ops->migratepage(newpage, page); | ||
| 949 | goto unlock_both; | ||
| 950 | } | ||
| 951 | |||
| 952 | /* | ||
| 953 | * Default handling if a filesystem does not provide | ||
| 954 | * a migration function. We can only migrate clean | ||
| 955 | * pages so try to write out any dirty pages first. | ||
| 956 | */ | ||
| 957 | if (PageDirty(page)) { | ||
| 958 | switch (pageout(page, mapping)) { | ||
| 959 | case PAGE_KEEP: | ||
| 960 | case PAGE_ACTIVATE: | ||
| 961 | goto unlock_both; | ||
| 962 | |||
| 963 | case PAGE_SUCCESS: | ||
| 964 | unlock_page(newpage); | ||
| 965 | goto next; | ||
| 966 | |||
| 967 | case PAGE_CLEAN: | ||
| 968 | ; /* try to migrate the page below */ | ||
| 969 | } | ||
| 970 | } | ||
| 971 | |||
| 972 | /* | ||
| 973 | * Buffers are managed in a filesystem specific way. | ||
| 974 | * We must have no buffers or drop them. | ||
| 975 | */ | ||
| 976 | if (!page_has_buffers(page) || | ||
| 977 | try_to_release_page(page, GFP_KERNEL)) { | ||
| 978 | rc = migrate_page(newpage, page); | ||
| 979 | goto unlock_both; | ||
| 980 | } | ||
| 981 | |||
| 982 | /* | ||
| 983 | * On early passes with mapped pages simply | ||
| 984 | * retry. There may be a lock held for some | ||
| 985 | * buffers that may go away. Later | ||
| 986 | * swap them out. | ||
| 987 | */ | ||
| 988 | if (pass > 4) { | ||
| 989 | /* | ||
| 990 | * Persistently unable to drop buffers..... As a | ||
| 991 | * measure of last resort we fall back to | ||
| 992 | * swap_page(). | ||
| 993 | */ | ||
| 994 | unlock_page(newpage); | ||
| 995 | newpage = NULL; | ||
| 996 | rc = swap_page(page); | ||
| 997 | goto next; | ||
| 998 | } | ||
| 999 | |||
| 1000 | unlock_both: | ||
| 1001 | unlock_page(newpage); | ||
| 1002 | |||
| 1003 | unlock_page: | ||
| 1004 | unlock_page(page); | ||
| 1005 | |||
| 1006 | next: | ||
| 1007 | if (rc == -EAGAIN) { | ||
| 1008 | retry++; | ||
| 1009 | } else if (rc) { | ||
| 1010 | /* Permanent failure */ | ||
| 1011 | list_move(&page->lru, failed); | ||
| 1012 | nr_failed++; | ||
| 1013 | } else { | ||
| 1014 | if (newpage) { | ||
| 1015 | /* Successful migration. Return page to LRU */ | ||
| 1016 | move_to_lru(newpage); | ||
| 1017 | } | ||
| 1018 | list_move(&page->lru, moved); | ||
| 1019 | } | ||
| 1020 | } | ||
| 1021 | if (retry && pass++ < 10) | ||
| 1022 | goto redo; | ||
| 1023 | |||
| 1024 | if (!swapwrite) | ||
| 1025 | current->flags &= ~PF_SWAPWRITE; | ||
| 1026 | |||
| 1027 | return nr_failed + retry; | ||
| 1028 | } | ||
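The error handling above follows a multi-pass pattern: a result of -EAGAIN only bumps `retry` so the page is attempted again on a later pass (up to ten passes), any other non-zero result moves the page to the `failed` list, and success moves it to `moved`. The following is a minimal userspace sketch of that retry structure; the `try_migrate()` stub and its pass counts are hypothetical stand-ins, not kernel code.

#include <errno.h>
#include <stdio.h>

#define NITEMS   5
#define MAXPASS 10

/*
 * Hypothetical per-item operation: needed_pass < 0 means the item can
 * never be migrated; otherwise it succeeds once 'pass' reaches
 * needed_pass, mimicking a transient -EAGAIN condition.
 */
static int try_migrate(int needed_pass, int pass)
{
    if (needed_pass < 0)
        return -1;          /* permanent failure */
    if (pass < needed_pass)
        return -EAGAIN;     /* transient: retry on a later pass */
    return 0;
}

int main(void)
{
    int needed[NITEMS] = { 0, 2, -1, 12, 1 };
    int done[NITEMS] = { 0 };
    int pass = 0, retry, moved = 0, failed = 0;

    do {
        retry = 0;
        for (int i = 0; i < NITEMS; i++) {
            int rc;

            if (done[i])
                continue;
            rc = try_migrate(needed[i], pass);
            if (rc == -EAGAIN) {
                retry++;            /* leave it for the next pass */
            } else if (rc) {
                done[i] = 1;        /* permanent failure -> "failed" list */
                failed++;
            } else {
                done[i] = 1;        /* success -> "moved" list */
                moved++;
            }
        }
    } while (retry && pass++ < MAXPASS);

    printf("moved=%d failed=%d still pending=%d\n", moved, failed, retry);
    return 0;
}

The final count mirrors the function's return value of nr_failed + retry: pages that failed permanently plus pages still pending when the pass budget ran out.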
| 1029 | |||
| 1030 | /* | ||
| 1031 | * Isolate one page from the LRU lists and put it on the | ||
| 1032 | * indicated list with elevated refcount. | ||
| 1033 | * | ||
| 1034 | * Result: | ||
| 1035 | * 0 = page not on LRU list | ||
| 1036 | * 1 = page removed from LRU list and added to the specified list. | ||
| 1037 | */ | ||
| 1038 | int isolate_lru_page(struct page *page) | ||
| 1039 | { | ||
| 1040 | int ret = 0; | ||
| 1041 | |||
| 1042 | if (PageLRU(page)) { | ||
| 1043 | struct zone *zone = page_zone(page); | ||
| 1044 | spin_lock_irq(&zone->lru_lock); | ||
| 1045 | if (TestClearPageLRU(page)) { | ||
| 1046 | ret = 1; | ||
| 1047 | get_page(page); | ||
| 1048 | if (PageActive(page)) | ||
| 1049 | del_page_from_active_list(zone, page); | ||
| 1050 | else | ||
| 1051 | del_page_from_inactive_list(zone, page); | ||
| 1052 | } | ||
| 1053 | spin_unlock_irq(&zone->lru_lock); | ||
| 1054 | } | ||
| 1055 | |||
| 1056 | return ret; | ||
| 1057 | } | ||
| 1058 | #endif | ||
| 1059 | 560 | ||
| 1060 | /* | 561 | /* |
| 1061 | * zone->lru_lock is heavily contended. Some of the functions that | 562 | * zone->lru_lock is heavily contended. Some of the functions that |
| @@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page) | |||
| 1074 | * | 575 | * |
| 1075 | * returns how many pages were moved onto *@dst. | 576 | * returns how many pages were moved onto *@dst. |
| 1076 | */ | 577 | */ |
| 1077 | static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | 578 | static unsigned long isolate_lru_pages(unsigned long nr_to_scan, |
| 1078 | struct list_head *dst, int *scanned) | 579 | struct list_head *src, struct list_head *dst, |
| 580 | unsigned long *scanned) | ||
| 1079 | { | 581 | { |
| 1080 | int nr_taken = 0; | 582 | unsigned long nr_taken = 0; |
| 1081 | struct page *page; | 583 | struct page *page; |
| 1082 | int scan = 0; | 584 | unsigned long scan; |
| 1083 | 585 | ||
| 1084 | while (scan++ < nr_to_scan && !list_empty(src)) { | 586 | for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { |
| 587 | struct list_head *target; | ||
| 1085 | page = lru_to_page(src); | 588 | page = lru_to_page(src); |
| 1086 | prefetchw_prev_lru_page(page, src, flags); | 589 | prefetchw_prev_lru_page(page, src, flags); |
| 1087 | 590 | ||
| 1088 | if (!TestClearPageLRU(page)) | 591 | BUG_ON(!PageLRU(page)); |
| 1089 | BUG(); | 592 | |
| 1090 | list_del(&page->lru); | 593 | list_del(&page->lru); |
| 1091 | if (get_page_testone(page)) { | 594 | target = src; |
| 595 | if (likely(get_page_unless_zero(page))) { | ||
| 1092 | /* | 596 | /* |
| 1093 | * It is being freed elsewhere | 597 | * Be careful not to clear PageLRU until after we're |
| 598 | * sure the page is not being freed elsewhere -- the | ||
| 599 | * page release code relies on it. | ||
| 1094 | */ | 600 | */ |
| 1095 | __put_page(page); | 601 | ClearPageLRU(page); |
| 1096 | SetPageLRU(page); | 602 | target = dst; |
| 1097 | list_add(&page->lru, src); | ||
| 1098 | continue; | ||
| 1099 | } else { | ||
| 1100 | list_add(&page->lru, dst); | ||
| 1101 | nr_taken++; | 603 | nr_taken++; |
| 1102 | } | 604 | } /* else it is being freed elsewhere */ |
| 605 | |||
| 606 | list_add(&page->lru, target); | ||
| 1103 | } | 607 | } |
| 1104 | 608 | ||
| 1105 | *scanned = scan; | 609 | *scanned = scan; |
| @@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | |||
| 1107 | } | 611 | } |
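The reworked isolate_lru_pages() above only clears PageLRU after get_page_unless_zero() has confirmed the page is not already on its way to being freed; a page whose reference count has hit zero must not be resurrected. Below is a small userspace model of that "take a reference unless the count is already zero" pattern using C11 atomics; it illustrates the idea behind get_page_unless_zero(), it is not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
    atomic_int refcount;
};

static bool get_unless_zero(struct obj *o)
{
    int c = atomic_load(&o->refcount);

    while (c != 0) {
        /* Only bump the count if it is still the non-zero value we saw. */
        if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
            return true;
        /* c was reloaded by the failed CAS; loop and retry. */
    }
    return false;   /* object is being freed elsewhere: do not touch it */
}

int main(void)
{
    struct obj live = { .refcount = 1 };
    struct obj dying = { .refcount = 0 };

    printf("live:  %s\n", get_unless_zero(&live) ? "isolated" : "skipped");
    printf("dying: %s\n", get_unless_zero(&dying) ? "isolated" : "skipped");
    return 0;
}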
| 1108 | 612 | ||
| 1109 | /* | 613 | /* |
| 1110 | * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed | 614 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
| 615 | * of reclaimed pages | ||
| 1111 | */ | 616 | */ |
| 1112 | static void shrink_cache(struct zone *zone, struct scan_control *sc) | 617 | static unsigned long shrink_inactive_list(unsigned long max_scan, |
| 618 | struct zone *zone, struct scan_control *sc) | ||
| 1113 | { | 619 | { |
| 1114 | LIST_HEAD(page_list); | 620 | LIST_HEAD(page_list); |
| 1115 | struct pagevec pvec; | 621 | struct pagevec pvec; |
| 1116 | int max_scan = sc->nr_to_scan; | 622 | unsigned long nr_scanned = 0; |
| 623 | unsigned long nr_reclaimed = 0; | ||
| 1117 | 624 | ||
| 1118 | pagevec_init(&pvec, 1); | 625 | pagevec_init(&pvec, 1); |
| 1119 | 626 | ||
| 1120 | lru_add_drain(); | 627 | lru_add_drain(); |
| 1121 | spin_lock_irq(&zone->lru_lock); | 628 | spin_lock_irq(&zone->lru_lock); |
| 1122 | while (max_scan > 0) { | 629 | do { |
| 1123 | struct page *page; | 630 | struct page *page; |
| 1124 | int nr_taken; | 631 | unsigned long nr_taken; |
| 1125 | int nr_scan; | 632 | unsigned long nr_scan; |
| 1126 | int nr_freed; | 633 | unsigned long nr_freed; |
| 1127 | 634 | ||
| 1128 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, | 635 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, |
| 1129 | &zone->inactive_list, | 636 | &zone->inactive_list, |
| @@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
| 1132 | zone->pages_scanned += nr_scan; | 639 | zone->pages_scanned += nr_scan; |
| 1133 | spin_unlock_irq(&zone->lru_lock); | 640 | spin_unlock_irq(&zone->lru_lock); |
| 1134 | 641 | ||
| 1135 | if (nr_taken == 0) | 642 | nr_scanned += nr_scan; |
| 1136 | goto done; | 643 | nr_freed = shrink_page_list(&page_list, sc); |
| 1137 | 644 | nr_reclaimed += nr_freed; | |
| 1138 | max_scan -= nr_scan; | ||
| 1139 | nr_freed = shrink_list(&page_list, sc); | ||
| 1140 | |||
| 1141 | local_irq_disable(); | 645 | local_irq_disable(); |
| 1142 | if (current_is_kswapd()) { | 646 | if (current_is_kswapd()) { |
| 1143 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | 647 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); |
| @@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
| 1146 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | 650 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); |
| 1147 | __mod_page_state_zone(zone, pgsteal, nr_freed); | 651 | __mod_page_state_zone(zone, pgsteal, nr_freed); |
| 1148 | 652 | ||
| 653 | if (nr_taken == 0) | ||
| 654 | goto done; | ||
| 655 | |||
| 1149 | spin_lock(&zone->lru_lock); | 656 | spin_lock(&zone->lru_lock); |
| 1150 | /* | 657 | /* |
| 1151 | * Put back any unfreeable pages. | 658 | * Put back any unfreeable pages. |
| 1152 | */ | 659 | */ |
| 1153 | while (!list_empty(&page_list)) { | 660 | while (!list_empty(&page_list)) { |
| 1154 | page = lru_to_page(&page_list); | 661 | page = lru_to_page(&page_list); |
| 1155 | if (TestSetPageLRU(page)) | 662 | BUG_ON(PageLRU(page)); |
| 1156 | BUG(); | 663 | SetPageLRU(page); |
| 1157 | list_del(&page->lru); | 664 | list_del(&page->lru); |
| 1158 | if (PageActive(page)) | 665 | if (PageActive(page)) |
| 1159 | add_page_to_active_list(zone, page); | 666 | add_page_to_active_list(zone, page); |
| @@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) | |||
| 1165 | spin_lock_irq(&zone->lru_lock); | 672 | spin_lock_irq(&zone->lru_lock); |
| 1166 | } | 673 | } |
| 1167 | } | 674 | } |
| 1168 | } | 675 | } while (nr_scanned < max_scan); |
| 1169 | spin_unlock_irq(&zone->lru_lock); | 676 | spin_unlock(&zone->lru_lock); |
| 1170 | done: | 677 | done: |
| 678 | local_irq_enable(); | ||
| 1171 | pagevec_release(&pvec); | 679 | pagevec_release(&pvec); |
| 680 | return nr_reclaimed; | ||
| 1172 | } | 681 | } |
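shrink_inactive_list() above now runs as a do/while loop: isolate a batch of at most swap_cluster_max pages, try to free them, put the survivors back, and keep going until max_scan pages have been looked at, returning the number actually reclaimed. A toy userspace model of that control flow (the int "pages" and the fixed batch size are illustrative only):

#include <stdio.h>

#define NPAGES 10
#define BATCH   4

/* 1 = freeable, 0 = must go back on the list */
static int list[NPAGES] = { 1, 0, 1, 1, 0, 0, 1, 0, 1, 1 };

static unsigned long shrink_list_model(unsigned long max_scan)
{
    unsigned long nr_scanned = 0, nr_reclaimed = 0, pos = 0;

    do {
        unsigned long nr_scan = 0;

        /* take the next batch of up to BATCH pages and try each one */
        while (nr_scan < BATCH && pos < NPAGES) {
            if (list[pos])
                nr_reclaimed++;     /* the freeing step succeeded */
            /* otherwise the page would be put back on the LRU */
            pos++;
            nr_scan++;
        }
        nr_scanned += nr_scan;
        if (nr_scan == 0)
            break;                  /* list is empty */
    } while (nr_scanned < max_scan);

    return nr_reclaimed;
}

int main(void)
{
    unsigned long budget = 8;

    printf("reclaimed %lu pages from a scan budget of %lu\n",
           shrink_list_model(budget), budget);
    return 0;
}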
| 1173 | 682 | ||
| 1174 | /* | 683 | /* |
| @@ -1188,13 +697,12 @@ done: | |||
| 1188 | * The downside is that we have to touch page->_count against each page. | 697 | * The downside is that we have to touch page->_count against each page. |
| 1189 | * But we had to alter page->flags anyway. | 698 | * But we had to alter page->flags anyway. |
| 1190 | */ | 699 | */ |
| 1191 | static void | 700 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
| 1192 | refill_inactive_zone(struct zone *zone, struct scan_control *sc) | 701 | struct scan_control *sc) |
| 1193 | { | 702 | { |
| 1194 | int pgmoved; | 703 | unsigned long pgmoved; |
| 1195 | int pgdeactivate = 0; | 704 | int pgdeactivate = 0; |
| 1196 | int pgscanned; | 705 | unsigned long pgscanned; |
| 1197 | int nr_pages = sc->nr_to_scan; | ||
| 1198 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 706 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
| 1199 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ | 707 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ |
| 1200 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ | 708 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ |
| @@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
| 1202 | struct pagevec pvec; | 710 | struct pagevec pvec; |
| 1203 | int reclaim_mapped = 0; | 711 | int reclaim_mapped = 0; |
| 1204 | 712 | ||
| 1205 | if (unlikely(sc->may_swap)) { | 713 | if (sc->may_swap) { |
| 1206 | long mapped_ratio; | 714 | long mapped_ratio; |
| 1207 | long distress; | 715 | long distress; |
| 1208 | long swap_tendency; | 716 | long swap_tendency; |
| @@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
| 1272 | while (!list_empty(&l_inactive)) { | 780 | while (!list_empty(&l_inactive)) { |
| 1273 | page = lru_to_page(&l_inactive); | 781 | page = lru_to_page(&l_inactive); |
| 1274 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 782 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
| 1275 | if (TestSetPageLRU(page)) | 783 | BUG_ON(PageLRU(page)); |
| 1276 | BUG(); | 784 | SetPageLRU(page); |
| 1277 | if (!TestClearPageActive(page)) | 785 | BUG_ON(!PageActive(page)); |
| 1278 | BUG(); | 786 | ClearPageActive(page); |
| 787 | |||
| 1279 | list_move(&page->lru, &zone->inactive_list); | 788 | list_move(&page->lru, &zone->inactive_list); |
| 1280 | pgmoved++; | 789 | pgmoved++; |
| 1281 | if (!pagevec_add(&pvec, page)) { | 790 | if (!pagevec_add(&pvec, page)) { |
| @@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
| 1301 | while (!list_empty(&l_active)) { | 810 | while (!list_empty(&l_active)) { |
| 1302 | page = lru_to_page(&l_active); | 811 | page = lru_to_page(&l_active); |
| 1303 | prefetchw_prev_lru_page(page, &l_active, flags); | 812 | prefetchw_prev_lru_page(page, &l_active, flags); |
| 1304 | if (TestSetPageLRU(page)) | 813 | BUG_ON(PageLRU(page)); |
| 1305 | BUG(); | 814 | SetPageLRU(page); |
| 1306 | BUG_ON(!PageActive(page)); | 815 | BUG_ON(!PageActive(page)); |
| 1307 | list_move(&page->lru, &zone->active_list); | 816 | list_move(&page->lru, &zone->active_list); |
| 1308 | pgmoved++; | 817 | pgmoved++; |
| @@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) | |||
| 1327 | /* | 836 | /* |
| 1328 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 837 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
| 1329 | */ | 838 | */ |
| 1330 | static void | 839 | static unsigned long shrink_zone(int priority, struct zone *zone, |
| 1331 | shrink_zone(struct zone *zone, struct scan_control *sc) | 840 | struct scan_control *sc) |
| 1332 | { | 841 | { |
| 1333 | unsigned long nr_active; | 842 | unsigned long nr_active; |
| 1334 | unsigned long nr_inactive; | 843 | unsigned long nr_inactive; |
| 844 | unsigned long nr_to_scan; | ||
| 845 | unsigned long nr_reclaimed = 0; | ||
| 1335 | 846 | ||
| 1336 | atomic_inc(&zone->reclaim_in_progress); | 847 | atomic_inc(&zone->reclaim_in_progress); |
| 1337 | 848 | ||
| @@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 1339 | * Add one to `nr_to_scan' just to make sure that the kernel will | 850 | * Add one to `nr_to_scan' just to make sure that the kernel will |
| 1340 | * slowly sift through the active list. | 851 | * slowly sift through the active list. |
| 1341 | */ | 852 | */ |
| 1342 | zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; | 853 | zone->nr_scan_active += (zone->nr_active >> priority) + 1; |
| 1343 | nr_active = zone->nr_scan_active; | 854 | nr_active = zone->nr_scan_active; |
| 1344 | if (nr_active >= sc->swap_cluster_max) | 855 | if (nr_active >= sc->swap_cluster_max) |
| 1345 | zone->nr_scan_active = 0; | 856 | zone->nr_scan_active = 0; |
| 1346 | else | 857 | else |
| 1347 | nr_active = 0; | 858 | nr_active = 0; |
| 1348 | 859 | ||
| 1349 | zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; | 860 | zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; |
| 1350 | nr_inactive = zone->nr_scan_inactive; | 861 | nr_inactive = zone->nr_scan_inactive; |
| 1351 | if (nr_inactive >= sc->swap_cluster_max) | 862 | if (nr_inactive >= sc->swap_cluster_max) |
| 1352 | zone->nr_scan_inactive = 0; | 863 | zone->nr_scan_inactive = 0; |
| @@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 1355 | 866 | ||
| 1356 | while (nr_active || nr_inactive) { | 867 | while (nr_active || nr_inactive) { |
| 1357 | if (nr_active) { | 868 | if (nr_active) { |
| 1358 | sc->nr_to_scan = min(nr_active, | 869 | nr_to_scan = min(nr_active, |
| 1359 | (unsigned long)sc->swap_cluster_max); | 870 | (unsigned long)sc->swap_cluster_max); |
| 1360 | nr_active -= sc->nr_to_scan; | 871 | nr_active -= nr_to_scan; |
| 1361 | refill_inactive_zone(zone, sc); | 872 | shrink_active_list(nr_to_scan, zone, sc); |
| 1362 | } | 873 | } |
| 1363 | 874 | ||
| 1364 | if (nr_inactive) { | 875 | if (nr_inactive) { |
| 1365 | sc->nr_to_scan = min(nr_inactive, | 876 | nr_to_scan = min(nr_inactive, |
| 1366 | (unsigned long)sc->swap_cluster_max); | 877 | (unsigned long)sc->swap_cluster_max); |
| 1367 | nr_inactive -= sc->nr_to_scan; | 878 | nr_inactive -= nr_to_scan; |
| 1368 | shrink_cache(zone, sc); | 879 | nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, |
| 880 | sc); | ||
| 1369 | } | 881 | } |
| 1370 | } | 882 | } |
| 1371 | 883 | ||
| 1372 | throttle_vm_writeout(); | 884 | throttle_vm_writeout(); |
| 1373 | 885 | ||
| 1374 | atomic_dec(&zone->reclaim_in_progress); | 886 | atomic_dec(&zone->reclaim_in_progress); |
| 887 | return nr_reclaimed; | ||
| 1375 | } | 888 | } |
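The nr_scan_active/nr_scan_inactive bookkeeping in shrink_zone() accumulates (list length >> priority) + 1 scan credit on every call and only issues a scan once a whole batch (swap_cluster_max) has built up; smaller amounts are carried over to the next call. A minimal userspace model of that carry-over batching, with an illustrative batch size:

#include <stdio.h>

#define BATCH 32UL  /* stands in for sc->swap_cluster_max */

static unsigned long nr_scan;   /* carried-over scan credit */

static unsigned long scan_credit(unsigned long list_len, int priority)
{
    unsigned long nr;

    nr_scan += (list_len >> priority) + 1;
    if (nr_scan < BATCH)
        return 0;   /* not enough credit yet: defer to a later call */
    nr = nr_scan;
    nr_scan = 0;
    return nr;      /* scan this many pages now */
}

int main(void)
{
    /*
     * With 1000 pages at priority 6, each call adds 16 credits, so
     * every second call releases a batch of 32.
     */
    for (int call = 1; call <= 4; call++)
        printf("call %d: scan %lu pages\n", call, scan_credit(1000, 6));
    return 0;
}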
| 1376 | 889 | ||
| 1377 | /* | 890 | /* |
| @@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc) | |||
| 1390 | * If a zone is deemed to be full of pinned pages then just give it a light | 903 | * If a zone is deemed to be full of pinned pages then just give it a light |
| 1391 | * scan then give up on it. | 904 | * scan then give up on it. |
| 1392 | */ | 905 | */ |
| 1393 | static void | 906 | static unsigned long shrink_zones(int priority, struct zone **zones, |
| 1394 | shrink_caches(struct zone **zones, struct scan_control *sc) | 907 | struct scan_control *sc) |
| 1395 | { | 908 | { |
| 909 | unsigned long nr_reclaimed = 0; | ||
| 1396 | int i; | 910 | int i; |
| 1397 | 911 | ||
| 1398 | for (i = 0; zones[i] != NULL; i++) { | 912 | for (i = 0; zones[i] != NULL; i++) { |
| @@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
| 1404 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) | 918 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) |
| 1405 | continue; | 919 | continue; |
| 1406 | 920 | ||
| 1407 | zone->temp_priority = sc->priority; | 921 | zone->temp_priority = priority; |
| 1408 | if (zone->prev_priority > sc->priority) | 922 | if (zone->prev_priority > priority) |
| 1409 | zone->prev_priority = sc->priority; | 923 | zone->prev_priority = priority; |
| 1410 | 924 | ||
| 1411 | if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) | 925 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 1412 | continue; /* Let kswapd poll it */ | 926 | continue; /* Let kswapd poll it */ |
| 1413 | 927 | ||
| 1414 | shrink_zone(zone, sc); | 928 | nr_reclaimed += shrink_zone(priority, zone, sc); |
| 1415 | } | 929 | } |
| 930 | return nr_reclaimed; | ||
| 1416 | } | 931 | } |
| 1417 | 932 | ||
| 1418 | /* | 933 | /* |
| @@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc) | |||
| 1428 | * holds filesystem locks which prevent writeout this might not work, and the | 943 | * holds filesystem locks which prevent writeout this might not work, and the |
| 1429 | * allocation attempt will fail. | 944 | * allocation attempt will fail. |
| 1430 | */ | 945 | */ |
| 1431 | int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | 946 | unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) |
| 1432 | { | 947 | { |
| 1433 | int priority; | 948 | int priority; |
| 1434 | int ret = 0; | 949 | int ret = 0; |
| 1435 | int total_scanned = 0, total_reclaimed = 0; | 950 | unsigned long total_scanned = 0; |
| 951 | unsigned long nr_reclaimed = 0; | ||
| 1436 | struct reclaim_state *reclaim_state = current->reclaim_state; | 952 | struct reclaim_state *reclaim_state = current->reclaim_state; |
| 1437 | struct scan_control sc; | ||
| 1438 | unsigned long lru_pages = 0; | 953 | unsigned long lru_pages = 0; |
| 1439 | int i; | 954 | int i; |
| 1440 | 955 | struct scan_control sc = { | |
| 1441 | sc.gfp_mask = gfp_mask; | 956 | .gfp_mask = gfp_mask, |
| 1442 | sc.may_writepage = !laptop_mode; | 957 | .may_writepage = !laptop_mode, |
| 1443 | sc.may_swap = 1; | 958 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
| 959 | .may_swap = 1, | ||
| 960 | }; | ||
| 1444 | 961 | ||
| 1445 | inc_page_state(allocstall); | 962 | inc_page_state(allocstall); |
| 1446 | 963 | ||
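The scan_control setup above was converted to a C99 designated initializer. One property the new form relies on is that any member not named in the initializer (nr_scanned, nr_mapped, and so on) is implicitly zero-initialized, so the old per-field assignments could be dropped. A small stand-alone illustration with a made-up struct:

#include <stdio.h>

struct scan_control_like {
    unsigned long nr_scanned;
    unsigned long nr_mapped;
    int may_writepage;
    int may_swap;
    unsigned int swap_cluster_max;
    unsigned int gfp_mask;
};

int main(void)
{
    struct scan_control_like sc = {
        .gfp_mask = 0xd0,           /* placeholder value, not GFP_KERNEL */
        .may_writepage = 1,
        .swap_cluster_max = 32,
        .may_swap = 1,
    };

    /* Everything not named above starts out as zero. */
    printf("nr_scanned=%lu nr_mapped=%lu\n", sc.nr_scanned, sc.nr_mapped);
    return 0;
}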
| @@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
| 1457 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 974 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
| 1458 | sc.nr_mapped = read_page_state(nr_mapped); | 975 | sc.nr_mapped = read_page_state(nr_mapped); |
| 1459 | sc.nr_scanned = 0; | 976 | sc.nr_scanned = 0; |
| 1460 | sc.nr_reclaimed = 0; | ||
| 1461 | sc.priority = priority; | ||
| 1462 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
| 1463 | if (!priority) | 977 | if (!priority) |
| 1464 | disable_swap_token(); | 978 | disable_swap_token(); |
| 1465 | shrink_caches(zones, &sc); | 979 | nr_reclaimed += shrink_zones(priority, zones, &sc); |
| 1466 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); | 980 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); |
| 1467 | if (reclaim_state) { | 981 | if (reclaim_state) { |
| 1468 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 982 | nr_reclaimed += reclaim_state->reclaimed_slab; |
| 1469 | reclaim_state->reclaimed_slab = 0; | 983 | reclaim_state->reclaimed_slab = 0; |
| 1470 | } | 984 | } |
| 1471 | total_scanned += sc.nr_scanned; | 985 | total_scanned += sc.nr_scanned; |
| 1472 | total_reclaimed += sc.nr_reclaimed; | 986 | if (nr_reclaimed >= sc.swap_cluster_max) { |
| 1473 | if (total_reclaimed >= sc.swap_cluster_max) { | ||
| 1474 | ret = 1; | 987 | ret = 1; |
| 1475 | goto out; | 988 | goto out; |
| 1476 | } | 989 | } |
| @@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
| 1482 | * that's undesirable in laptop mode, where we *want* lumpy | 995 | * that's undesirable in laptop mode, where we *want* lumpy |
| 1483 | * writeout. So in laptop mode, write out the whole world. | 996 | * writeout. So in laptop mode, write out the whole world. |
| 1484 | */ | 997 | */ |
| 1485 | if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { | 998 | if (total_scanned > sc.swap_cluster_max + |
| 999 | sc.swap_cluster_max / 2) { | ||
| 1486 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); | 1000 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); |
| 1487 | sc.may_writepage = 1; | 1001 | sc.may_writepage = 1; |
| 1488 | } | 1002 | } |
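Taken together with shrink_zone(), the priority loop above walks from DEF_PRIORITY down to 0, covering roughly twice as much of the LRU on each pass, and bails out as soon as swap_cluster_max pages have been reclaimed. The sketch below models that escalation in plain userspace C; the pool size and the 5% reclaim rate are invented purely for illustration.

#include <stdio.h>

#define DEF_PRIORITY     12
#define SWAP_CLUSTER_MAX 32UL
#define POOL         100000UL   /* pages on the LRU, illustrative */

int main(void)
{
    unsigned long nr_reclaimed = 0, total_scanned = 0;

    for (int priority = DEF_PRIORITY; priority >= 0; priority--) {
        unsigned long nr_scan = (POOL >> priority) + 1;

        total_scanned += nr_scan;
        nr_reclaimed += nr_scan / 20;   /* pretend 5% of scanned pages free up */

        printf("priority %2d: scanned %6lu, reclaimed so far %lu\n",
               priority, nr_scan, nr_reclaimed);

        if (nr_reclaimed >= SWAP_CLUSTER_MAX) {
            printf("enough reclaimed after %lu scanned\n", total_scanned);
            break;
        }
    }
    return 0;
}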
| @@ -1528,22 +1042,26 @@ out: | |||
| 1528 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1042 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
| 1529 | * across the zones. | 1043 | * across the zones. |
| 1530 | */ | 1044 | */ |
| 1531 | static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) | 1045 | static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, |
| 1046 | int order) | ||
| 1532 | { | 1047 | { |
| 1533 | int to_free = nr_pages; | 1048 | unsigned long to_free = nr_pages; |
| 1534 | int all_zones_ok; | 1049 | int all_zones_ok; |
| 1535 | int priority; | 1050 | int priority; |
| 1536 | int i; | 1051 | int i; |
| 1537 | int total_scanned, total_reclaimed; | 1052 | unsigned long total_scanned; |
| 1053 | unsigned long nr_reclaimed; | ||
| 1538 | struct reclaim_state *reclaim_state = current->reclaim_state; | 1054 | struct reclaim_state *reclaim_state = current->reclaim_state; |
| 1539 | struct scan_control sc; | 1055 | struct scan_control sc = { |
| 1056 | .gfp_mask = GFP_KERNEL, | ||
| 1057 | .may_swap = 1, | ||
| 1058 | .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, | ||
| 1059 | }; | ||
| 1540 | 1060 | ||
| 1541 | loop_again: | 1061 | loop_again: |
| 1542 | total_scanned = 0; | 1062 | total_scanned = 0; |
| 1543 | total_reclaimed = 0; | 1063 | nr_reclaimed = 0; |
| 1544 | sc.gfp_mask = GFP_KERNEL; | 1064 | sc.may_writepage = !laptop_mode, |
| 1545 | sc.may_writepage = !laptop_mode; | ||
| 1546 | sc.may_swap = 1; | ||
| 1547 | sc.nr_mapped = read_page_state(nr_mapped); | 1065 | sc.nr_mapped = read_page_state(nr_mapped); |
| 1548 | 1066 | ||
| 1549 | inc_page_state(pageoutrun); | 1067 | inc_page_state(pageoutrun); |
| @@ -1624,15 +1142,11 @@ scan: | |||
| 1624 | if (zone->prev_priority > priority) | 1142 | if (zone->prev_priority > priority) |
| 1625 | zone->prev_priority = priority; | 1143 | zone->prev_priority = priority; |
| 1626 | sc.nr_scanned = 0; | 1144 | sc.nr_scanned = 0; |
| 1627 | sc.nr_reclaimed = 0; | 1145 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
| 1628 | sc.priority = priority; | ||
| 1629 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; | ||
| 1630 | shrink_zone(zone, &sc); | ||
| 1631 | reclaim_state->reclaimed_slab = 0; | 1146 | reclaim_state->reclaimed_slab = 0; |
| 1632 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1147 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
| 1633 | lru_pages); | 1148 | lru_pages); |
| 1634 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 1149 | nr_reclaimed += reclaim_state->reclaimed_slab; |
| 1635 | total_reclaimed += sc.nr_reclaimed; | ||
| 1636 | total_scanned += sc.nr_scanned; | 1150 | total_scanned += sc.nr_scanned; |
| 1637 | if (zone->all_unreclaimable) | 1151 | if (zone->all_unreclaimable) |
| 1638 | continue; | 1152 | continue; |
| @@ -1645,10 +1159,10 @@ scan: | |||
| 1645 | * even in laptop mode | 1159 | * even in laptop mode |
| 1646 | */ | 1160 | */ |
| 1647 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 1161 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
| 1648 | total_scanned > total_reclaimed+total_reclaimed/2) | 1162 | total_scanned > nr_reclaimed + nr_reclaimed / 2) |
| 1649 | sc.may_writepage = 1; | 1163 | sc.may_writepage = 1; |
| 1650 | } | 1164 | } |
| 1651 | if (nr_pages && to_free > total_reclaimed) | 1165 | if (nr_pages && to_free > nr_reclaimed) |
| 1652 | continue; /* swsusp: need to do more work */ | 1166 | continue; /* swsusp: need to do more work */ |
| 1653 | if (all_zones_ok) | 1167 | if (all_zones_ok) |
| 1654 | break; /* kswapd: all done */ | 1168 | break; /* kswapd: all done */ |
| @@ -1665,7 +1179,7 @@ scan: | |||
| 1665 | * matches the direct reclaim path behaviour in terms of impact | 1179 | * matches the direct reclaim path behaviour in terms of impact |
| 1666 | * on zone->*_priority. | 1180 | * on zone->*_priority. |
| 1667 | */ | 1181 | */ |
| 1668 | if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) | 1182 | if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) |
| 1669 | break; | 1183 | break; |
| 1670 | } | 1184 | } |
| 1671 | out: | 1185 | out: |
| @@ -1679,7 +1193,7 @@ out: | |||
| 1679 | goto loop_again; | 1193 | goto loop_again; |
| 1680 | } | 1194 | } |
| 1681 | 1195 | ||
| 1682 | return total_reclaimed; | 1196 | return nr_reclaimed; |
| 1683 | } | 1197 | } |
| 1684 | 1198 | ||
| 1685 | /* | 1199 | /* |
| @@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
| 1779 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed | 1293 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed |
| 1780 | * pages. | 1294 | * pages. |
| 1781 | */ | 1295 | */ |
| 1782 | int shrink_all_memory(int nr_pages) | 1296 | unsigned long shrink_all_memory(unsigned long nr_pages) |
| 1783 | { | 1297 | { |
| 1784 | pg_data_t *pgdat; | 1298 | pg_data_t *pgdat; |
| 1785 | int nr_to_free = nr_pages; | 1299 | unsigned long nr_to_free = nr_pages; |
| 1786 | int ret = 0; | 1300 | unsigned long ret = 0; |
| 1301 | unsigned retry = 2; | ||
| 1787 | struct reclaim_state reclaim_state = { | 1302 | struct reclaim_state reclaim_state = { |
| 1788 | .reclaimed_slab = 0, | 1303 | .reclaimed_slab = 0, |
| 1789 | }; | 1304 | }; |
| 1790 | 1305 | ||
| 1791 | current->reclaim_state = &reclaim_state; | 1306 | current->reclaim_state = &reclaim_state; |
| 1307 | repeat: | ||
| 1792 | for_each_pgdat(pgdat) { | 1308 | for_each_pgdat(pgdat) { |
| 1793 | int freed; | 1309 | unsigned long freed; |
| 1310 | |||
| 1794 | freed = balance_pgdat(pgdat, nr_to_free, 0); | 1311 | freed = balance_pgdat(pgdat, nr_to_free, 0); |
| 1795 | ret += freed; | 1312 | ret += freed; |
| 1796 | nr_to_free -= freed; | 1313 | nr_to_free -= freed; |
| 1797 | if (nr_to_free <= 0) | 1314 | if ((long)nr_to_free <= 0) |
| 1798 | break; | 1315 | break; |
| 1799 | } | 1316 | } |
| 1317 | if (retry-- && ret < nr_pages) { | ||
| 1318 | blk_congestion_wait(WRITE, HZ/5); | ||
| 1319 | goto repeat; | ||
| 1320 | } | ||
| 1800 | current->reclaim_state = NULL; | 1321 | current->reclaim_state = NULL; |
| 1801 | return ret; | 1322 | return ret; |
| 1802 | } | 1323 | } |
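A detail worth noting in shrink_all_memory(): with nr_to_free now an unsigned long, nr_to_free -= freed wraps around to a huge value whenever a pass frees more than was asked for, which is why the exit test casts to (long) before comparing with zero. The snippet below demonstrates the difference (on the usual two's-complement targets):

#include <stdio.h>

int main(void)
{
    unsigned long nr_to_free = 3;
    unsigned long freed = 5;        /* a pass may free more than requested */

    nr_to_free -= freed;            /* wraps to a very large unsigned value */

    printf("unsigned test: %s\n",
           nr_to_free <= 0 ? "stop" : "keep going (wrong)");
    printf("signed test  : %s\n",
           (long)nr_to_free <= 0 ? "stop" : "keep going (wrong)");
    return 0;
}

Only the cast form terminates the loop once the target has been met or exceeded.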
| @@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages) | |||
| 1808 | away, we get changed to run anywhere: as the first one comes back, | 1329 | away, we get changed to run anywhere: as the first one comes back, |
| 1809 | restore their cpu bindings. */ | 1330 | restore their cpu bindings. */ |
| 1810 | static int __devinit cpu_callback(struct notifier_block *nfb, | 1331 | static int __devinit cpu_callback(struct notifier_block *nfb, |
| 1811 | unsigned long action, | 1332 | unsigned long action, void *hcpu) |
| 1812 | void *hcpu) | ||
| 1813 | { | 1333 | { |
| 1814 | pg_data_t *pgdat; | 1334 | pg_data_t *pgdat; |
| 1815 | cpumask_t mask; | 1335 | cpumask_t mask; |
| @@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
| 1829 | static int __init kswapd_init(void) | 1349 | static int __init kswapd_init(void) |
| 1830 | { | 1350 | { |
| 1831 | pg_data_t *pgdat; | 1351 | pg_data_t *pgdat; |
| 1352 | |||
| 1832 | swap_setup(); | 1353 | swap_setup(); |
| 1833 | for_each_pgdat(pgdat) | 1354 | for_each_pgdat(pgdat) { |
| 1834 | pgdat->kswapd | 1355 | pid_t pid; |
| 1835 | = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); | 1356 | |
| 1357 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | ||
| 1358 | BUG_ON(pid < 0); | ||
| 1359 | pgdat->kswapd = find_task_by_pid(pid); | ||
| 1360 | } | ||
| 1836 | total_memory = nr_free_pagecache_pages(); | 1361 | total_memory = nr_free_pagecache_pages(); |
| 1837 | hotcpu_notifier(cpu_callback, 0); | 1362 | hotcpu_notifier(cpu_callback, 0); |
| 1838 | return 0; | 1363 | return 0; |
| @@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ; | |||
| 1874 | /* | 1399 | /* |
| 1875 | * Try to free up some pages from this zone through reclaim. | 1400 | * Try to free up some pages from this zone through reclaim. |
| 1876 | */ | 1401 | */ |
| 1877 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1402 | static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
| 1878 | { | 1403 | { |
| 1879 | int nr_pages; | 1404 | /* Minimum pages needed in order to stay on node */ |
| 1405 | const unsigned long nr_pages = 1 << order; | ||
| 1880 | struct task_struct *p = current; | 1406 | struct task_struct *p = current; |
| 1881 | struct reclaim_state reclaim_state; | 1407 | struct reclaim_state reclaim_state; |
| 1882 | struct scan_control sc; | 1408 | int priority; |
| 1883 | cpumask_t mask; | 1409 | unsigned long nr_reclaimed = 0; |
| 1884 | int node_id; | 1410 | struct scan_control sc = { |
| 1885 | 1411 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | |
| 1886 | if (time_before(jiffies, | 1412 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
| 1887 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | 1413 | .nr_mapped = read_page_state(nr_mapped), |
| 1888 | return 0; | 1414 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
| 1889 | 1415 | SWAP_CLUSTER_MAX), | |
| 1890 | if (!(gfp_mask & __GFP_WAIT) || | 1416 | .gfp_mask = gfp_mask, |
| 1891 | zone->all_unreclaimable || | 1417 | }; |
| 1892 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
| 1893 | (p->flags & PF_MEMALLOC)) | ||
| 1894 | return 0; | ||
| 1895 | |||
| 1896 | node_id = zone->zone_pgdat->node_id; | ||
| 1897 | mask = node_to_cpumask(node_id); | ||
| 1898 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
| 1899 | return 0; | ||
| 1900 | |||
| 1901 | sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); | ||
| 1902 | sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); | ||
| 1903 | sc.nr_scanned = 0; | ||
| 1904 | sc.nr_reclaimed = 0; | ||
| 1905 | sc.priority = ZONE_RECLAIM_PRIORITY + 1; | ||
| 1906 | sc.nr_mapped = read_page_state(nr_mapped); | ||
| 1907 | sc.gfp_mask = gfp_mask; | ||
| 1908 | 1418 | ||
| 1909 | disable_swap_token(); | 1419 | disable_swap_token(); |
| 1910 | |||
| 1911 | nr_pages = 1 << order; | ||
| 1912 | if (nr_pages > SWAP_CLUSTER_MAX) | ||
| 1913 | sc.swap_cluster_max = nr_pages; | ||
| 1914 | else | ||
| 1915 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; | ||
| 1916 | |||
| 1917 | cond_resched(); | 1420 | cond_resched(); |
| 1918 | /* | 1421 | /* |
| 1919 | * We need to be able to allocate from the reserves for RECLAIM_SWAP | 1422 | * We need to be able to allocate from the reserves for RECLAIM_SWAP |
| @@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1928 | * Free memory by calling shrink zone with increasing priorities | 1431 | * Free memory by calling shrink zone with increasing priorities |
| 1929 | * until we have enough memory freed. | 1432 | * until we have enough memory freed. |
| 1930 | */ | 1433 | */ |
| 1434 | priority = ZONE_RECLAIM_PRIORITY; | ||
| 1931 | do { | 1435 | do { |
| 1932 | sc.priority--; | 1436 | nr_reclaimed += shrink_zone(priority, zone, &sc); |
| 1933 | shrink_zone(zone, &sc); | 1437 | priority--; |
| 1438 | } while (priority >= 0 && nr_reclaimed < nr_pages); | ||
| 1934 | 1439 | ||
| 1935 | } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); | 1440 | if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { |
| 1936 | |||
| 1937 | if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { | ||
| 1938 | /* | 1441 | /* |
| 1939 | * shrink_slab does not currently allow us to determine | 1442 | * shrink_slab() does not currently allow us to determine how |
| 1940 | * how many pages were freed in the zone. So we just | 1443 | * many pages were freed in this zone. So we just shake the slab |
| 1941 | * shake the slab and then go offnode for a single allocation. | 1444 | * a bit and then go off node for this particular allocation |
| 1445 | * despite possibly having freed enough memory to allocate in | ||
| 1446 | * this zone. If we freed local memory then the next | ||
| 1447 | * allocations will be local again. | ||
| 1942 | * | 1448 | * |
| 1943 | * shrink_slab will free memory on all zones and may take | 1449 | * shrink_slab will free memory on all zones and may take |
| 1944 | * a long time. | 1450 | * a long time. |
| @@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1949 | p->reclaim_state = NULL; | 1455 | p->reclaim_state = NULL; |
| 1950 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 1456 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
| 1951 | 1457 | ||
| 1952 | if (sc.nr_reclaimed == 0) | 1458 | if (nr_reclaimed == 0) { |
| 1459 | /* | ||
| 1460 | * We were unable to reclaim enough pages to stay on node. We | ||
| 1461 | * now allow off node accesses for a certain time period before | ||
| 1462 | * trying again to reclaim pages from the local zone. | ||
| 1463 | */ | ||
| 1953 | zone->last_unsuccessful_zone_reclaim = jiffies; | 1464 | zone->last_unsuccessful_zone_reclaim = jiffies; |
| 1465 | } | ||
| 1954 | 1466 | ||
| 1955 | return sc.nr_reclaimed >= nr_pages; | 1467 | return nr_reclaimed >= nr_pages; |
| 1956 | } | 1468 | } |
| 1957 | #endif | ||
| 1958 | 1469 | ||
| 1470 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | ||
| 1471 | { | ||
| 1472 | cpumask_t mask; | ||
| 1473 | int node_id; | ||
| 1474 | |||
| 1475 | /* | ||
| 1476 | * Do not reclaim if there was a recent unsuccessful attempt at zone | ||
| 1477 | * reclaim. In that case we let allocations go off node for the | ||
| 1478 | * zone_reclaim_interval. Otherwise we would scan for each off-node | ||
| 1479 | * page allocation. | ||
| 1480 | */ | ||
| 1481 | if (time_before(jiffies, | ||
| 1482 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | ||
| 1483 | return 0; | ||
| 1484 | |||
| 1485 | /* | ||
| 1486 | * Avoid concurrent zone reclaims, do not reclaim in a zone that does | ||
| 1487 | * not have reclaimable pages and if we should not delay the allocation | ||
| 1488 | * then do not scan. | ||
| 1489 | */ | ||
| 1490 | if (!(gfp_mask & __GFP_WAIT) || | ||
| 1491 | zone->all_unreclaimable || | ||
| 1492 | atomic_read(&zone->reclaim_in_progress) > 0 || | ||
| 1493 | (current->flags & PF_MEMALLOC)) | ||
| 1494 | return 0; | ||
| 1495 | |||
| 1496 | /* | ||
| 1497 | * Only run zone reclaim on the local zone or on zones that do not | ||
| 1498 | * have associated processors. This will favor the local processor | ||
| 1499 | * over remote processors and spread off node memory allocations | ||
| 1500 | * as wide as possible. | ||
| 1501 | */ | ||
| 1502 | node_id = zone->zone_pgdat->node_id; | ||
| 1503 | mask = node_to_cpumask(node_id); | ||
| 1504 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
| 1505 | return 0; | ||
| 1506 | return __zone_reclaim(zone, gfp_mask, order); | ||
| 1507 | } | ||
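The gate at the top of the new zone_reclaim() wrapper skips reclaim entirely while the zone is still inside its backoff window after an unsuccessful attempt. Below is a userspace model of that rate limiting; "ticks" stands in for jiffies and ticks_before() uses the usual wrap-safe signed-difference comparison, so this is an illustration of the idea rather than the kernel's time_before().

#include <stdbool.h>
#include <stdio.h>

#define RECLAIM_INTERVAL 30UL   /* illustrative stand-in for zone_reclaim_interval */

static unsigned long ticks;             /* fake jiffies counter */
static unsigned long last_unsuccessful; /* when reclaim last came up empty */

static bool ticks_before(unsigned long a, unsigned long b)
{
    return (long)(a - b) < 0;           /* wrap-safe "a < b" */
}

static int try_reclaim(bool will_succeed)
{
    if (ticks_before(ticks, last_unsuccessful + RECLAIM_INTERVAL))
        return 0;                       /* still backing off: go off node */
    if (!will_succeed) {
        last_unsuccessful = ticks;      /* arm a new backoff window */
        return 0;
    }
    return 1;
}

int main(void)
{
    ticks = 100; printf("t=100: %d\n", try_reclaim(false)); /* fails, arms backoff */
    ticks = 110; printf("t=110: %d\n", try_reclaim(true));  /* inside window: skipped */
    ticks = 131; printf("t=131: %d\n", try_reclaim(true));  /* window over: reclaims */
    return 0;
}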
| 1508 | #endif | ||