Diffstat (limited to 'mm')
 mm/Kconfig           |    4
 mm/Makefile          |    6
 mm/backing-dev.c     |   37
 mm/bootmem.c         |  198
 mm/bounce.c          |    1
 mm/fadvise.c         |   10
 mm/failslab.c        |   19
 mm/filemap.c         |    6
 mm/filemap_xip.c     |    3
 mm/fremap.c          |    2
 mm/highmem.c         |    2
 mm/hugetlb.c         |   11
 mm/kmemleak.c        |    1
 mm/ksm.c             |   26
 mm/memcontrol.c      | 1427
 mm/memory-failure.c  |    6
 mm/memory.c          |  181
 mm/memory_hotplug.c  |   10
 mm/mempolicy.c       |  165
 mm/migrate.c         |    7
 mm/mincore.c         |    2
 mm/mlock.c           |   12
 mm/mmap.c            |  262
 mm/mmu_context.c     |    4
 mm/mmu_notifier.c    |    1
 mm/mprotect.c        |    1
 mm/mremap.c          |   10
 mm/nommu.c           |   35
 mm/oom_kill.c        |   15
 mm/page_alloc.c      |  401
 mm/page_cgroup.c     |   42
 mm/page_io.c         |    1
 mm/pagewalk.c        |   47
 mm/percpu.c          |   62
 mm/percpu_up.c       |   30
 mm/quicklist.c       |    1
 mm/readahead.c       |    7
 mm/rmap.c            |  211
 mm/slab.c            |   26
 mm/slub.c            |  346
 mm/sparse-vmemmap.c  |   77
 mm/sparse.c          |  197
 mm/swap.c            |    3
 mm/swap_state.c      |    1
 mm/swapfile.c        |   71
 mm/truncate.c        |    1
 mm/util.c            |   21
 mm/vmscan.c          |  156
 mm/vmstat.c          |   18
 49 files changed, 3110 insertions(+), 1073 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index d34c2b971032..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME | |||
| 115 | config SPARSEMEM_VMEMMAP_ENABLE | 115 | config SPARSEMEM_VMEMMAP_ENABLE |
| 116 | bool | 116 | bool |
| 117 | 117 | ||
| 118 | config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 119 | def_bool y | ||
| 120 | depends on SPARSEMEM && X86_64 | ||
| 121 | |||
| 118 | config SPARSEMEM_VMEMMAP | 122 | config SPARSEMEM_VMEMMAP |
| 119 | bool "Sparse Memory virtual memmap" | 123 | bool "Sparse Memory virtual memmap" |
| 120 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE | 124 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE |
diff --git a/mm/Makefile b/mm/Makefile
index 7a68d2ab5560..6c2a73a54a43 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -33,7 +33,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o | |||
| 33 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 33 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 34 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 34 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
| 35 | obj-$(CONFIG_MIGRATION) += migrate.o | 35 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 36 | obj-$(CONFIG_SMP) += percpu.o | 36 | ifdef CONFIG_SMP |
| 37 | obj-y += percpu.o | ||
| 38 | else | ||
| 39 | obj-y += percpu_up.o | ||
| 40 | endif | ||
| 37 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 41 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 38 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 42 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
| 39 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 43 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca0347707..707d0dc6da0f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
| @@ -11,6 +11,8 @@ | |||
| 11 | #include <linux/writeback.h> | 11 | #include <linux/writeback.h> |
| 12 | #include <linux/device.h> | 12 | #include <linux/device.h> |
| 13 | 13 | ||
| 14 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | ||
| 15 | |||
| 14 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 16 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) |
| 15 | { | 17 | { |
| 16 | } | 18 | } |
| @@ -25,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = { | |||
| 25 | }; | 27 | }; |
| 26 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | 28 | EXPORT_SYMBOL_GPL(default_backing_dev_info); |
| 27 | 29 | ||
| 30 | struct backing_dev_info noop_backing_dev_info = { | ||
| 31 | .name = "noop", | ||
| 32 | }; | ||
| 33 | EXPORT_SYMBOL_GPL(noop_backing_dev_info); | ||
| 34 | |||
| 28 | static struct class *bdi_class; | 35 | static struct class *bdi_class; |
| 29 | 36 | ||
| 30 | /* | 37 | /* |
| @@ -227,6 +234,9 @@ static struct device_attribute bdi_dev_attrs[] = { | |||
| 227 | static __init int bdi_class_init(void) | 234 | static __init int bdi_class_init(void) |
| 228 | { | 235 | { |
| 229 | bdi_class = class_create(THIS_MODULE, "bdi"); | 236 | bdi_class = class_create(THIS_MODULE, "bdi"); |
| 237 | if (IS_ERR(bdi_class)) | ||
| 238 | return PTR_ERR(bdi_class); | ||
| 239 | |||
| 230 | bdi_class->dev_attrs = bdi_dev_attrs; | 240 | bdi_class->dev_attrs = bdi_dev_attrs; |
| 231 | bdi_debug_init(); | 241 | bdi_debug_init(); |
| 232 | return 0; | 242 | return 0; |
| @@ -712,6 +722,33 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
| 712 | } | 722 | } |
| 713 | EXPORT_SYMBOL(bdi_destroy); | 723 | EXPORT_SYMBOL(bdi_destroy); |
| 714 | 724 | ||
| 725 | /* | ||
| 726 | * For use from filesystems to quickly init and register a bdi associated | ||
| 727 | * with dirty writeback | ||
| 728 | */ | ||
| 729 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | ||
| 730 | unsigned int cap) | ||
| 731 | { | ||
| 732 | char tmp[32]; | ||
| 733 | int err; | ||
| 734 | |||
| 735 | bdi->name = name; | ||
| 736 | bdi->capabilities = cap; | ||
| 737 | err = bdi_init(bdi); | ||
| 738 | if (err) | ||
| 739 | return err; | ||
| 740 | |||
| 741 | sprintf(tmp, "%.28s%s", name, "-%d"); | ||
| 742 | err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); | ||
| 743 | if (err) { | ||
| 744 | bdi_destroy(bdi); | ||
| 745 | return err; | ||
| 746 | } | ||
| 747 | |||
| 748 | return 0; | ||
| 749 | } | ||
| 750 | EXPORT_SYMBOL(bdi_setup_and_register); | ||
| 751 | |||
| 715 | static wait_queue_head_t congestion_wqh[2] = { | 752 | static wait_queue_head_t congestion_wqh[2] = { |
| 716 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), | 753 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), |
| 717 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) | 754 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) |
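
The bdi_setup_and_register() helper added above folds the usual bdi_init() + bdi_register() pair into one call for filesystems that only need a private BDI for dirty writeback; the helper appends "-%d" plus a global sequence number to the supplied name so registrations stay unique. A minimal sketch of how a filesystem might use it at mount time follows; "examplefs", its sb_info layout and the BDI_CAP_MAP_COPY capability choice are illustrative assumptions, not part of this patch:

	/* Sketch only: per-superblock info for a hypothetical "examplefs". */
	struct examplefs_sb_info {
		struct backing_dev_info bdi;
	};

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct examplefs_sb_info *sbi;
		int err;

		sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
		if (!sbi)
			return -ENOMEM;

		/* one call replaces open-coded bdi_init() + bdi_register() */
		err = bdi_setup_and_register(&sbi->bdi, "examplefs", BDI_CAP_MAP_COPY);
		if (err) {
			kfree(sbi);
			return err;
		}

		sb->s_fs_info = sbi;
		sb->s_bdi = &sbi->bdi;
		return 0;
	}

The matching teardown would call bdi_destroy(&sbi->bdi) before freeing sbi at unmount; on its own error path bdi_setup_and_register() already destroys the bdi, so the caller only has to free its private structure.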
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..58c66cc5056a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -10,9 +10,11 @@ | |||
| 10 | */ | 10 | */ |
| 11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
| 12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
| 13 | #include <linux/slab.h> | ||
| 13 | #include <linux/bootmem.h> | 14 | #include <linux/bootmem.h> |
| 14 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 15 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
| 17 | #include <linux/range.h> | ||
| 16 | 18 | ||
| 17 | #include <asm/bug.h> | 19 | #include <asm/bug.h> |
| 18 | #include <asm/io.h> | 20 | #include <asm/io.h> |
| @@ -32,6 +34,7 @@ unsigned long max_pfn; | |||
| 32 | unsigned long saved_max_pfn; | 34 | unsigned long saved_max_pfn; |
| 33 | #endif | 35 | #endif |
| 34 | 36 | ||
| 37 | #ifndef CONFIG_NO_BOOTMEM | ||
| 35 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; | 38 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
| 36 | 39 | ||
| 37 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | 40 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); |
| @@ -142,7 +145,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
| 142 | min_low_pfn = start; | 145 | min_low_pfn = start; |
| 143 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | 146 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); |
| 144 | } | 147 | } |
| 145 | 148 | #endif | |
| 146 | /* | 149 | /* |
| 147 | * free_bootmem_late - free bootmem pages directly to page allocator | 150 | * free_bootmem_late - free bootmem pages directly to page allocator |
| 148 | * @addr: starting address of the range | 151 | * @addr: starting address of the range |
| @@ -167,6 +170,53 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
| 167 | } | 170 | } |
| 168 | } | 171 | } |
| 169 | 172 | ||
| 173 | #ifdef CONFIG_NO_BOOTMEM | ||
| 174 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | ||
| 175 | { | ||
| 176 | int i; | ||
| 177 | unsigned long start_aligned, end_aligned; | ||
| 178 | int order = ilog2(BITS_PER_LONG); | ||
| 179 | |||
| 180 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | ||
| 181 | end_aligned = end & ~(BITS_PER_LONG - 1); | ||
| 182 | |||
| 183 | if (end_aligned <= start_aligned) { | ||
| 184 | for (i = start; i < end; i++) | ||
| 185 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
| 186 | |||
| 187 | return; | ||
| 188 | } | ||
| 189 | |||
| 190 | for (i = start; i < start_aligned; i++) | ||
| 191 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
| 192 | |||
| 193 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | ||
| 194 | __free_pages_bootmem(pfn_to_page(i), order); | ||
| 195 | |||
| 196 | for (i = end_aligned; i < end; i++) | ||
| 197 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
| 198 | } | ||
| 199 | |||
| 200 | unsigned long __init free_all_memory_core_early(int nodeid) | ||
| 201 | { | ||
| 202 | int i; | ||
| 203 | u64 start, end; | ||
| 204 | unsigned long count = 0; | ||
| 205 | struct range *range = NULL; | ||
| 206 | int nr_range; | ||
| 207 | |||
| 208 | nr_range = get_free_all_memory_range(&range, nodeid); | ||
| 209 | |||
| 210 | for (i = 0; i < nr_range; i++) { | ||
| 211 | start = range[i].start; | ||
| 212 | end = range[i].end; | ||
| 213 | count += end - start; | ||
| 214 | __free_pages_memory(start, end); | ||
| 215 | } | ||
| 216 | |||
| 217 | return count; | ||
| 218 | } | ||
| 219 | #else | ||
| 170 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 220 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
| 171 | { | 221 | { |
| 172 | int aligned; | 222 | int aligned; |
| @@ -227,6 +277,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
| 227 | 277 | ||
| 228 | return count; | 278 | return count; |
| 229 | } | 279 | } |
| 280 | #endif | ||
| 230 | 281 | ||
| 231 | /** | 282 | /** |
| 232 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 283 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
| @@ -237,7 +288,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
| 237 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 288 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
| 238 | { | 289 | { |
| 239 | register_page_bootmem_info_node(pgdat); | 290 | register_page_bootmem_info_node(pgdat); |
| 291 | #ifdef CONFIG_NO_BOOTMEM | ||
| 292 | /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ | ||
| 293 | return 0; | ||
| 294 | #else | ||
| 240 | return free_all_bootmem_core(pgdat->bdata); | 295 | return free_all_bootmem_core(pgdat->bdata); |
| 296 | #endif | ||
| 241 | } | 297 | } |
| 242 | 298 | ||
| 243 | /** | 299 | /** |
| @@ -247,9 +303,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
| 247 | */ | 303 | */ |
| 248 | unsigned long __init free_all_bootmem(void) | 304 | unsigned long __init free_all_bootmem(void) |
| 249 | { | 305 | { |
| 250 | return free_all_bootmem_core(NODE_DATA(0)->bdata); | 306 | #ifdef CONFIG_NO_BOOTMEM |
| 307 | /* | ||
| 308 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | ||
| 309 | * because in some case like Node0 doesnt have RAM installed | ||
| 310 | * low ram will be on Node1 | ||
| 311 | * Use MAX_NUMNODES will make sure all ranges in early_node_map[] | ||
| 312 | * will be used instead of only Node0 related | ||
| 313 | */ | ||
| 314 | return free_all_memory_core_early(MAX_NUMNODES); | ||
| 315 | #else | ||
| 316 | unsigned long total_pages = 0; | ||
| 317 | bootmem_data_t *bdata; | ||
| 318 | |||
| 319 | list_for_each_entry(bdata, &bdata_list, list) | ||
| 320 | total_pages += free_all_bootmem_core(bdata); | ||
| 321 | |||
| 322 | return total_pages; | ||
| 323 | #endif | ||
| 251 | } | 324 | } |
| 252 | 325 | ||
| 326 | #ifndef CONFIG_NO_BOOTMEM | ||
| 253 | static void __init __free(bootmem_data_t *bdata, | 327 | static void __init __free(bootmem_data_t *bdata, |
| 254 | unsigned long sidx, unsigned long eidx) | 328 | unsigned long sidx, unsigned long eidx) |
| 255 | { | 329 | { |
| @@ -344,6 +418,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
| 344 | } | 418 | } |
| 345 | BUG(); | 419 | BUG(); |
| 346 | } | 420 | } |
| 421 | #endif | ||
| 347 | 422 | ||
| 348 | /** | 423 | /** |
| 349 | * free_bootmem_node - mark a page range as usable | 424 | * free_bootmem_node - mark a page range as usable |
| @@ -358,6 +433,9 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
| 358 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 433 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
| 359 | unsigned long size) | 434 | unsigned long size) |
| 360 | { | 435 | { |
| 436 | #ifdef CONFIG_NO_BOOTMEM | ||
| 437 | free_early(physaddr, physaddr + size); | ||
| 438 | #else | ||
| 361 | unsigned long start, end; | 439 | unsigned long start, end; |
| 362 | 440 | ||
| 363 | kmemleak_free_part(__va(physaddr), size); | 441 | kmemleak_free_part(__va(physaddr), size); |
| @@ -366,6 +444,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
| 366 | end = PFN_DOWN(physaddr + size); | 444 | end = PFN_DOWN(physaddr + size); |
| 367 | 445 | ||
| 368 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); | 446 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); |
| 447 | #endif | ||
| 369 | } | 448 | } |
| 370 | 449 | ||
| 371 | /** | 450 | /** |
| @@ -379,6 +458,9 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
| 379 | */ | 458 | */ |
| 380 | void __init free_bootmem(unsigned long addr, unsigned long size) | 459 | void __init free_bootmem(unsigned long addr, unsigned long size) |
| 381 | { | 460 | { |
| 461 | #ifdef CONFIG_NO_BOOTMEM | ||
| 462 | free_early(addr, addr + size); | ||
| 463 | #else | ||
| 382 | unsigned long start, end; | 464 | unsigned long start, end; |
| 383 | 465 | ||
| 384 | kmemleak_free_part(__va(addr), size); | 466 | kmemleak_free_part(__va(addr), size); |
| @@ -387,6 +469,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
| 387 | end = PFN_DOWN(addr + size); | 469 | end = PFN_DOWN(addr + size); |
| 388 | 470 | ||
| 389 | mark_bootmem(start, end, 0, 0); | 471 | mark_bootmem(start, end, 0, 0); |
| 472 | #endif | ||
| 390 | } | 473 | } |
| 391 | 474 | ||
| 392 | /** | 475 | /** |
| @@ -403,12 +486,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
| 403 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 486 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
| 404 | unsigned long size, int flags) | 487 | unsigned long size, int flags) |
| 405 | { | 488 | { |
| 489 | #ifdef CONFIG_NO_BOOTMEM | ||
| 490 | panic("no bootmem"); | ||
| 491 | return 0; | ||
| 492 | #else | ||
| 406 | unsigned long start, end; | 493 | unsigned long start, end; |
| 407 | 494 | ||
| 408 | start = PFN_DOWN(physaddr); | 495 | start = PFN_DOWN(physaddr); |
| 409 | end = PFN_UP(physaddr + size); | 496 | end = PFN_UP(physaddr + size); |
| 410 | 497 | ||
| 411 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); | 498 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); |
| 499 | #endif | ||
| 412 | } | 500 | } |
| 413 | 501 | ||
| 414 | /** | 502 | /** |
| @@ -424,14 +512,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
| 424 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 512 | int __init reserve_bootmem(unsigned long addr, unsigned long size, |
| 425 | int flags) | 513 | int flags) |
| 426 | { | 514 | { |
| 515 | #ifdef CONFIG_NO_BOOTMEM | ||
| 516 | panic("no bootmem"); | ||
| 517 | return 0; | ||
| 518 | #else | ||
| 427 | unsigned long start, end; | 519 | unsigned long start, end; |
| 428 | 520 | ||
| 429 | start = PFN_DOWN(addr); | 521 | start = PFN_DOWN(addr); |
| 430 | end = PFN_UP(addr + size); | 522 | end = PFN_UP(addr + size); |
| 431 | 523 | ||
| 432 | return mark_bootmem(start, end, 1, flags); | 524 | return mark_bootmem(start, end, 1, flags); |
| 525 | #endif | ||
| 433 | } | 526 | } |
| 434 | 527 | ||
| 528 | #ifndef CONFIG_NO_BOOTMEM | ||
| 435 | static unsigned long __init align_idx(struct bootmem_data *bdata, | 529 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
| 436 | unsigned long idx, unsigned long step) | 530 | unsigned long idx, unsigned long step) |
| 437 | { | 531 | { |
| @@ -582,12 +676,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
| 582 | #endif | 676 | #endif |
| 583 | return NULL; | 677 | return NULL; |
| 584 | } | 678 | } |
| 679 | #endif | ||
| 585 | 680 | ||
| 586 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 681 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, |
| 587 | unsigned long align, | 682 | unsigned long align, |
| 588 | unsigned long goal, | 683 | unsigned long goal, |
| 589 | unsigned long limit) | 684 | unsigned long limit) |
| 590 | { | 685 | { |
| 686 | #ifdef CONFIG_NO_BOOTMEM | ||
| 687 | void *ptr; | ||
| 688 | |||
| 689 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 690 | return kzalloc(size, GFP_NOWAIT); | ||
| 691 | |||
| 692 | restart: | ||
| 693 | |||
| 694 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | ||
| 695 | |||
| 696 | if (ptr) | ||
| 697 | return ptr; | ||
| 698 | |||
| 699 | if (goal != 0) { | ||
| 700 | goal = 0; | ||
| 701 | goto restart; | ||
| 702 | } | ||
| 703 | |||
| 704 | return NULL; | ||
| 705 | #else | ||
| 591 | bootmem_data_t *bdata; | 706 | bootmem_data_t *bdata; |
| 592 | void *region; | 707 | void *region; |
| 593 | 708 | ||
| @@ -613,6 +728,7 @@ restart: | |||
| 613 | } | 728 | } |
| 614 | 729 | ||
| 615 | return NULL; | 730 | return NULL; |
| 731 | #endif | ||
| 616 | } | 732 | } |
| 617 | 733 | ||
| 618 | /** | 734 | /** |
| @@ -631,7 +747,13 @@ restart: | |||
| 631 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | 747 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, |
| 632 | unsigned long goal) | 748 | unsigned long goal) |
| 633 | { | 749 | { |
| 634 | return ___alloc_bootmem_nopanic(size, align, goal, 0); | 750 | unsigned long limit = 0; |
| 751 | |||
| 752 | #ifdef CONFIG_NO_BOOTMEM | ||
| 753 | limit = -1UL; | ||
| 754 | #endif | ||
| 755 | |||
| 756 | return ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
| 635 | } | 757 | } |
| 636 | 758 | ||
| 637 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | 759 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, |
| @@ -665,9 +787,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | |||
| 665 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | 787 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, |
| 666 | unsigned long goal) | 788 | unsigned long goal) |
| 667 | { | 789 | { |
| 668 | return ___alloc_bootmem(size, align, goal, 0); | 790 | unsigned long limit = 0; |
| 791 | |||
| 792 | #ifdef CONFIG_NO_BOOTMEM | ||
| 793 | limit = -1UL; | ||
| 794 | #endif | ||
| 795 | |||
| 796 | return ___alloc_bootmem(size, align, goal, limit); | ||
| 669 | } | 797 | } |
| 670 | 798 | ||
| 799 | #ifndef CONFIG_NO_BOOTMEM | ||
| 671 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 800 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, |
| 672 | unsigned long size, unsigned long align, | 801 | unsigned long size, unsigned long align, |
| 673 | unsigned long goal, unsigned long limit) | 802 | unsigned long goal, unsigned long limit) |
| @@ -684,6 +813,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
| 684 | 813 | ||
| 685 | return ___alloc_bootmem(size, align, goal, limit); | 814 | return ___alloc_bootmem(size, align, goal, limit); |
| 686 | } | 815 | } |
| 816 | #endif | ||
| 687 | 817 | ||
| 688 | /** | 818 | /** |
| 689 | * __alloc_bootmem_node - allocate boot memory from a specific node | 819 | * __alloc_bootmem_node - allocate boot memory from a specific node |
| @@ -706,7 +836,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
| 706 | if (WARN_ON_ONCE(slab_is_available())) | 836 | if (WARN_ON_ONCE(slab_is_available())) |
| 707 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 837 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
| 708 | 838 | ||
| 839 | #ifdef CONFIG_NO_BOOTMEM | ||
| 840 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 841 | goal, -1ULL); | ||
| 842 | #else | ||
| 709 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 843 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
| 844 | #endif | ||
| 845 | } | ||
| 846 | |||
| 847 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | ||
| 848 | unsigned long align, unsigned long goal) | ||
| 849 | { | ||
| 850 | #ifdef MAX_DMA32_PFN | ||
| 851 | unsigned long end_pfn; | ||
| 852 | |||
| 853 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 854 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
| 855 | |||
| 856 | /* update goal according ...MAX_DMA32_PFN */ | ||
| 857 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
| 858 | |||
| 859 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | ||
| 860 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | ||
| 861 | void *ptr; | ||
| 862 | unsigned long new_goal; | ||
| 863 | |||
| 864 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | ||
| 865 | #ifdef CONFIG_NO_BOOTMEM | ||
| 866 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 867 | new_goal, -1ULL); | ||
| 868 | #else | ||
| 869 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | ||
| 870 | new_goal, 0); | ||
| 871 | #endif | ||
| 872 | if (ptr) | ||
| 873 | return ptr; | ||
| 874 | } | ||
| 875 | #endif | ||
| 876 | |||
| 877 | return __alloc_bootmem_node(pgdat, size, align, goal); | ||
| 878 | |||
| 710 | } | 879 | } |
| 711 | 880 | ||
| 712 | #ifdef CONFIG_SPARSEMEM | 881 | #ifdef CONFIG_SPARSEMEM |
| @@ -720,6 +889,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
| 720 | void * __init alloc_bootmem_section(unsigned long size, | 889 | void * __init alloc_bootmem_section(unsigned long size, |
| 721 | unsigned long section_nr) | 890 | unsigned long section_nr) |
| 722 | { | 891 | { |
| 892 | #ifdef CONFIG_NO_BOOTMEM | ||
| 893 | unsigned long pfn, goal, limit; | ||
| 894 | |||
| 895 | pfn = section_nr_to_pfn(section_nr); | ||
| 896 | goal = pfn << PAGE_SHIFT; | ||
| 897 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
| 898 | |||
| 899 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | ||
| 900 | SMP_CACHE_BYTES, goal, limit); | ||
| 901 | #else | ||
| 723 | bootmem_data_t *bdata; | 902 | bootmem_data_t *bdata; |
| 724 | unsigned long pfn, goal, limit; | 903 | unsigned long pfn, goal, limit; |
| 725 | 904 | ||
| @@ -729,6 +908,7 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
| 729 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 908 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
| 730 | 909 | ||
| 731 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 910 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); |
| 911 | #endif | ||
| 732 | } | 912 | } |
| 733 | #endif | 913 | #endif |
| 734 | 914 | ||
| @@ -740,11 +920,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | |||
| 740 | if (WARN_ON_ONCE(slab_is_available())) | 920 | if (WARN_ON_ONCE(slab_is_available())) |
| 741 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 921 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
| 742 | 922 | ||
| 923 | #ifdef CONFIG_NO_BOOTMEM | ||
| 924 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 925 | goal, -1ULL); | ||
| 926 | #else | ||
| 743 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 927 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
| 744 | if (ptr) | 928 | if (ptr) |
| 745 | return ptr; | 929 | return ptr; |
| 746 | 930 | ||
| 747 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 931 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
| 932 | #endif | ||
| 748 | if (ptr) | 933 | if (ptr) |
| 749 | return ptr; | 934 | return ptr; |
| 750 | 935 | ||
| @@ -795,6 +980,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | |||
| 795 | if (WARN_ON_ONCE(slab_is_available())) | 980 | if (WARN_ON_ONCE(slab_is_available())) |
| 796 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 981 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
| 797 | 982 | ||
| 983 | #ifdef CONFIG_NO_BOOTMEM | ||
| 984 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 985 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
| 986 | #else | ||
| 798 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 987 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
| 799 | goal, ARCH_LOW_ADDRESS_LIMIT); | 988 | goal, ARCH_LOW_ADDRESS_LIMIT); |
| 989 | #endif | ||
| 800 | } | 990 | } |
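
In the CONFIG_NO_BOOTMEM path added above, __free_pages_memory() hands the BITS_PER_LONG-aligned middle of each range to the buddy allocator as order-ilog2(BITS_PER_LONG) blocks and frees the unaligned head and tail one page at a time. A small stand-alone sketch of the same rounding, assuming BITS_PER_LONG is 64 (so order 6, 64-page blocks); the pfn values are arbitrary examples:

	#include <stdio.h>

	int main(void)
	{
		unsigned long bits = 64;		/* stand-in for BITS_PER_LONG */
		unsigned long start = 5, end = 200;	/* example pfn range */
		unsigned long start_aligned = (start + bits - 1) & ~(bits - 1);	/* 64 */
		unsigned long end_aligned = end & ~(bits - 1);				/* 192 */

		/* pfns 5..63 and 192..199 would be freed as single pages,
		 * pfns 64..191 as two order-6 blocks of 64 pages each */
		printf("head %lu, aligned %lu, tail %lu pages\n",
		       start_aligned - start, end_aligned - start_aligned,
		       end - end_aligned);
		return 0;
	}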
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e34..13b6dad1eed2 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
| 7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| 8 | #include <linux/swap.h> | 8 | #include <linux/swap.h> |
| 9 | #include <linux/gfp.h> | ||
| 9 | #include <linux/bio.h> | 10 | #include <linux/bio.h> |
| 10 | #include <linux/pagemap.h> | 11 | #include <linux/pagemap.h> |
| 11 | #include <linux/mempool.h> | 12 | #include <linux/mempool.h> |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
| @@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
| 77 | switch (advice) { | 77 | switch (advice) { |
| 78 | case POSIX_FADV_NORMAL: | 78 | case POSIX_FADV_NORMAL: |
| 79 | file->f_ra.ra_pages = bdi->ra_pages; | 79 | file->f_ra.ra_pages = bdi->ra_pages; |
| 80 | spin_lock(&file->f_lock); | ||
| 81 | file->f_mode &= ~FMODE_RANDOM; | ||
| 82 | spin_unlock(&file->f_lock); | ||
| 80 | break; | 83 | break; |
| 81 | case POSIX_FADV_RANDOM: | 84 | case POSIX_FADV_RANDOM: |
| 82 | file->f_ra.ra_pages = 0; | 85 | spin_lock(&file->f_lock); |
| 86 | file->f_mode |= FMODE_RANDOM; | ||
| 87 | spin_unlock(&file->f_lock); | ||
| 83 | break; | 88 | break; |
| 84 | case POSIX_FADV_SEQUENTIAL: | 89 | case POSIX_FADV_SEQUENTIAL: |
| 85 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 90 | file->f_ra.ra_pages = bdi->ra_pages * 2; |
| 91 | spin_lock(&file->f_lock); | ||
| 92 | file->f_mode &= ~FMODE_RANDOM; | ||
| 93 | spin_unlock(&file->f_lock); | ||
| 86 | break; | 94 | break; |
| 87 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
| 88 | if (!mapping->a_ops->readpage) { | 96 | if (!mapping->a_ops->readpage) { |
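
The fadvise hunk above stops clobbering file->f_ra.ra_pages for POSIX_FADV_RANDOM and instead toggles FMODE_RANDOM under f_lock, so POSIX_FADV_NORMAL and POSIX_FADV_SEQUENTIAL can restore readahead without racing other f_mode users. The flag is then consumed in the readahead path: roughly, the sync readahead entry point short-circuits to a small forced read when it is set. The following is only a sketch of that check; the exact shape of the code in mm/readahead.c may differ, and ondemand_readahead() stands in for the existing internal helper:

	void page_cache_sync_readahead(struct address_space *mapping,
				       struct file_ra_state *ra, struct file *filp,
				       pgoff_t offset, unsigned long req_size)
	{
		/* no readahead configured for this bdi */
		if (!ra->ra_pages)
			return;

		/* POSIX_FADV_RANDOM: be dumb, read only what was asked for */
		if (filp && (filp->f_mode & FMODE_RANDOM)) {
			force_page_cache_readahead(mapping, filp, offset, req_size);
			return;
		}

		/* otherwise fall through to on-demand readahead */
		ondemand_readahead(mapping, ra, filp, false, offset, req_size);
	}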
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..c5f88f240ddc 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
| @@ -1,18 +1,21 @@ | |||
| 1 | #include <linux/fault-inject.h> | 1 | #include <linux/fault-inject.h> |
| 2 | #include <linux/gfp.h> | 2 | #include <linux/slab.h> |
| 3 | 3 | ||
| 4 | static struct { | 4 | static struct { |
| 5 | struct fault_attr attr; | 5 | struct fault_attr attr; |
| 6 | u32 ignore_gfp_wait; | 6 | u32 ignore_gfp_wait; |
| 7 | int cache_filter; | ||
| 7 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 8 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
| 8 | struct dentry *ignore_gfp_wait_file; | 9 | struct dentry *ignore_gfp_wait_file; |
| 10 | struct dentry *cache_filter_file; | ||
| 9 | #endif | 11 | #endif |
| 10 | } failslab = { | 12 | } failslab = { |
| 11 | .attr = FAULT_ATTR_INITIALIZER, | 13 | .attr = FAULT_ATTR_INITIALIZER, |
| 12 | .ignore_gfp_wait = 1, | 14 | .ignore_gfp_wait = 1, |
| 15 | .cache_filter = 0, | ||
| 13 | }; | 16 | }; |
| 14 | 17 | ||
| 15 | bool should_failslab(size_t size, gfp_t gfpflags) | 18 | bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) |
| 16 | { | 19 | { |
| 17 | if (gfpflags & __GFP_NOFAIL) | 20 | if (gfpflags & __GFP_NOFAIL) |
| 18 | return false; | 21 | return false; |
| @@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) | |||
| 20 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) | 23 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) |
| 21 | return false; | 24 | return false; |
| 22 | 25 | ||
| 26 | if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) | ||
| 27 | return false; | ||
| 28 | |||
| 23 | return should_fail(&failslab.attr, size); | 29 | return should_fail(&failslab.attr, size); |
| 24 | } | 30 | } |
| 25 | 31 | ||
| @@ -30,7 +36,6 @@ static int __init setup_failslab(char *str) | |||
| 30 | __setup("failslab=", setup_failslab); | 36 | __setup("failslab=", setup_failslab); |
| 31 | 37 | ||
| 32 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 38 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
| 33 | |||
| 34 | static int __init failslab_debugfs_init(void) | 39 | static int __init failslab_debugfs_init(void) |
| 35 | { | 40 | { |
| 36 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 41 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
| @@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void) | |||
| 46 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 51 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
| 47 | &failslab.ignore_gfp_wait); | 52 | &failslab.ignore_gfp_wait); |
| 48 | 53 | ||
| 49 | if (!failslab.ignore_gfp_wait_file) { | 54 | failslab.cache_filter_file = |
| 55 | debugfs_create_bool("cache-filter", mode, dir, | ||
| 56 | &failslab.cache_filter); | ||
| 57 | |||
| 58 | if (!failslab.ignore_gfp_wait_file || | ||
| 59 | !failslab.cache_filter_file) { | ||
| 50 | err = -ENOMEM; | 60 | err = -ENOMEM; |
| 61 | debugfs_remove(failslab.cache_filter_file); | ||
| 51 | debugfs_remove(failslab.ignore_gfp_wait_file); | 62 | debugfs_remove(failslab.ignore_gfp_wait_file); |
| 52 | cleanup_fault_attr_dentries(&failslab.attr); | 63 | cleanup_fault_attr_dentries(&failslab.attr); |
| 53 | } | 64 | } |
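
With the new cache_filter knob, should_failslab() only injects failures into caches whose creation flags include SLAB_FAILSLAB, and only while the cache-filter file under the failslab debugfs directory (typically /sys/kernel/debug/failslab/) is set to 1; with it left at 0 the old behaviour is unchanged. A hedged sketch of opting a cache in; the "widget" cache is an invented example, not part of this patch:

	/* Illustrative only: mark one slab cache as a fault-injection target. */
	struct widget {
		int id;
		struct list_head node;
	};

	static struct kmem_cache *widget_cache;

	static int __init widget_cache_init(void)
	{
		widget_cache = kmem_cache_create("widget_cache", sizeof(struct widget),
						 0, SLAB_FAILSLAB, NULL);
		return widget_cache ? 0 : -ENOMEM;
	}

Allocations from widget_cache then remain subject to the usual failslab probability and interval attributes, while every other cache is skipped when cache-filter is enabled.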
diff --git a/mm/filemap.c b/mm/filemap.c
index 698ea80f2102..140ebda9640f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -10,13 +10,13 @@ | |||
| 10 | * the NFS filesystem used to do this differently, for example) | 10 | * the NFS filesystem used to do this differently, for example) |
| 11 | */ | 11 | */ |
| 12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
| 15 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
| 16 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
| 17 | #include <linux/aio.h> | 16 | #include <linux/aio.h> |
| 18 | #include <linux/capability.h> | 17 | #include <linux/capability.h> |
| 19 | #include <linux/kernel_stat.h> | 18 | #include <linux/kernel_stat.h> |
| 19 | #include <linux/gfp.h> | ||
| 20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
| 21 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
| 22 | #include <linux/mman.h> | 22 | #include <linux/mman.h> |
| @@ -1117,7 +1117,7 @@ readpage: | |||
| 1117 | if (!PageUptodate(page)) { | 1117 | if (!PageUptodate(page)) { |
| 1118 | if (page->mapping == NULL) { | 1118 | if (page->mapping == NULL) { |
| 1119 | /* | 1119 | /* |
| 1120 | * invalidate_inode_pages got it | 1120 | * invalidate_mapping_pages got it |
| 1121 | */ | 1121 | */ |
| 1122 | unlock_page(page); | 1122 | unlock_page(page); |
| 1123 | page_cache_release(page); | 1123 | page_cache_release(page); |
| @@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); | |||
| 1986 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) | 1986 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) |
| 1987 | { | 1987 | { |
| 1988 | struct inode *inode = file->f_mapping->host; | 1988 | struct inode *inode = file->f_mapping->host; |
| 1989 | unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 1989 | unsigned long limit = rlimit(RLIMIT_FSIZE); |
| 1990 | 1990 | ||
| 1991 | if (unlikely(*pos < 0)) | 1991 | if (unlikely(*pos < 0)) |
| 1992 | return -EINVAL; | 1992 | return -EINVAL; |
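
The RLIMIT_FSIZE read above now goes through the rlimit() accessor instead of open-coding the signal-struct dereference. Conceptually the helper is just a wrapper around the old expression, roughly:

	/* Rough equivalent of the accessor used above; the real helper lives in
	 * the scheduler headers and may read the value more carefully. */
	static inline unsigned long rlimit(unsigned int limit)
	{
		return current->signal->rlim[limit].rlim_cur;
	}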
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..83364df74a33 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
| 18 | #include <linux/seqlock.h> | 18 | #include <linux/seqlock.h> |
| 19 | #include <linux/mutex.h> | 19 | #include <linux/mutex.h> |
| 20 | #include <linux/gfp.h> | ||
| 20 | #include <asm/tlbflush.h> | 21 | #include <asm/tlbflush.h> |
| 21 | #include <asm/io.h> | 22 | #include <asm/io.h> |
| 22 | 23 | ||
| @@ -194,7 +195,7 @@ retry: | |||
| 194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 195 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| 195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 196 | pteval = ptep_clear_flush_notify(vma, address, pte); |
| 196 | page_remove_rmap(page); | 197 | page_remove_rmap(page); |
| 197 | dec_mm_counter(mm, file_rss); | 198 | dec_mm_counter(mm, MM_FILEPAGES); |
| 198 | BUG_ON(pte_dirty(pteval)); | 199 | BUG_ON(pte_dirty(pteval)); |
| 199 | pte_unmap_unlock(pte, ptl); | 200 | pte_unmap_unlock(pte, ptl); |
| 200 | page_cache_release(page); | 201 | page_cache_release(page); |
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..46f5dacf90a2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
| @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 40 | page_remove_rmap(page); | 40 | page_remove_rmap(page); |
| 41 | page_cache_release(page); | 41 | page_cache_release(page); |
| 42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
| 43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, MM_FILEPAGES); |
| 44 | } | 44 | } |
| 45 | } else { | 45 | } else { |
| 46 | if (!pte_file(pte)) | 46 | if (!pte_file(pte)) |
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..bed8a8bfd01f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
| @@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high); | |||
| 220 | * @page: &struct page to pin | 220 | * @page: &struct page to pin |
| 221 | * | 221 | * |
| 222 | * Returns the page's current virtual memory address, or NULL if no mapping | 222 | * Returns the page's current virtual memory address, or NULL if no mapping |
| 223 | * exists. When and only when a non null address is returned then a | 223 | * exists. If and only if a non null address is returned then a |
| 224 | * matching call to kunmap_high() is necessary. | 224 | * matching call to kunmap_high() is necessary. |
| 225 | * | 225 | * |
| 226 | * This can be called from any context. | 226 | * This can be called from any context. |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d16fa6b8c2d..ffbdfc86aedf 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
| @@ -2,7 +2,6 @@ | |||
| 2 | * Generic hugetlb support. | 2 | * Generic hugetlb support. |
| 3 | * (C) William Irwin, April 2004 | 3 | * (C) William Irwin, April 2004 |
| 4 | */ | 4 | */ |
| 5 | #include <linux/gfp.h> | ||
| 6 | #include <linux/list.h> | 5 | #include <linux/list.h> |
| 7 | #include <linux/init.h> | 6 | #include <linux/init.h> |
| 8 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| @@ -18,6 +17,7 @@ | |||
| 18 | #include <linux/mutex.h> | 17 | #include <linux/mutex.h> |
| 19 | #include <linux/bootmem.h> | 18 | #include <linux/bootmem.h> |
| 20 | #include <linux/sysfs.h> | 19 | #include <linux/sysfs.h> |
| 20 | #include <linux/slab.h> | ||
| 21 | 21 | ||
| 22 | #include <asm/page.h> | 22 | #include <asm/page.h> |
| 23 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
| @@ -546,6 +546,7 @@ static void free_huge_page(struct page *page) | |||
| 546 | 546 | ||
| 547 | mapping = (struct address_space *) page_private(page); | 547 | mapping = (struct address_space *) page_private(page); |
| 548 | set_page_private(page, 0); | 548 | set_page_private(page, 0); |
| 549 | page->mapping = NULL; | ||
| 549 | BUG_ON(page_count(page)); | 550 | BUG_ON(page_count(page)); |
| 550 | INIT_LIST_HEAD(&page->lru); | 551 | INIT_LIST_HEAD(&page->lru); |
| 551 | 552 | ||
| @@ -2087,7 +2088,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
| 2087 | 2088 | ||
| 2088 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2089 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
| 2089 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2090 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
| 2090 | update_mmu_cache(vma, address, entry); | 2091 | update_mmu_cache(vma, address, ptep); |
| 2091 | } | 2092 | } |
| 2092 | } | 2093 | } |
| 2093 | 2094 | ||
| @@ -2447,8 +2448,10 @@ retry: | |||
| 2447 | spin_lock(&inode->i_lock); | 2448 | spin_lock(&inode->i_lock); |
| 2448 | inode->i_blocks += blocks_per_huge_page(h); | 2449 | inode->i_blocks += blocks_per_huge_page(h); |
| 2449 | spin_unlock(&inode->i_lock); | 2450 | spin_unlock(&inode->i_lock); |
| 2450 | } else | 2451 | } else { |
| 2451 | lock_page(page); | 2452 | lock_page(page); |
| 2453 | page->mapping = HUGETLB_POISON; | ||
| 2454 | } | ||
| 2452 | } | 2455 | } |
| 2453 | 2456 | ||
| 2454 | /* | 2457 | /* |
| @@ -2558,7 +2561,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2558 | entry = pte_mkyoung(entry); | 2561 | entry = pte_mkyoung(entry); |
| 2559 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 2562 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
| 2560 | flags & FAULT_FLAG_WRITE)) | 2563 | flags & FAULT_FLAG_WRITE)) |
| 2561 | update_mmu_cache(vma, address, entry); | 2564 | update_mmu_cache(vma, address, ptep); |
| 2562 | 2565 | ||
| 2563 | out_page_table_lock: | 2566 | out_page_table_lock: |
| 2564 | spin_unlock(&mm->page_table_lock); | 2567 | spin_unlock(&mm->page_table_lock); |
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5b069e4f5e48..2c0d032ac898 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
| @@ -72,7 +72,6 @@ | |||
| 72 | #include <linux/module.h> | 72 | #include <linux/module.h> |
| 73 | #include <linux/kthread.h> | 73 | #include <linux/kthread.h> |
| 74 | #include <linux/prio_tree.h> | 74 | #include <linux/prio_tree.h> |
| 75 | #include <linux/gfp.h> | ||
| 76 | #include <linux/fs.h> | 75 | #include <linux/fs.h> |
| 77 | #include <linux/debugfs.h> | 76 | #include <linux/debugfs.h> |
| 78 | #include <linux/seq_file.h> | 77 | #include <linux/seq_file.h> |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
| @@ -365,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
| 365 | do { | 365 | do { |
| 366 | cond_resched(); | 366 | cond_resched(); |
| 367 | page = follow_page(vma, addr, FOLL_GET); | 367 | page = follow_page(vma, addr, FOLL_GET); |
| 368 | if (!page) | 368 | if (IS_ERR_OR_NULL(page)) |
| 369 | break; | 369 | break; |
| 370 | if (PageKsm(page)) | 370 | if (PageKsm(page)) |
| 371 | ret = handle_mm_fault(vma->vm_mm, vma, addr, | 371 | ret = handle_mm_fault(vma->vm_mm, vma, addr, |
| @@ -447,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
| 447 | goto out; | 447 | goto out; |
| 448 | 448 | ||
| 449 | page = follow_page(vma, addr, FOLL_GET); | 449 | page = follow_page(vma, addr, FOLL_GET); |
| 450 | if (!page) | 450 | if (IS_ERR_OR_NULL(page)) |
| 451 | goto out; | 451 | goto out; |
| 452 | if (PageAnon(page)) { | 452 | if (PageAnon(page)) { |
| 453 | flush_anon_page(vma, page, addr); | 453 | flush_anon_page(vma, page, addr); |
| @@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
| 751 | * page | 751 | * page |
| 752 | */ | 752 | */ |
| 753 | if (page_mapcount(page) + 1 + swapped != page_count(page)) { | 753 | if (page_mapcount(page) + 1 + swapped != page_count(page)) { |
| 754 | set_pte_at_notify(mm, addr, ptep, entry); | 754 | set_pte_at(mm, addr, ptep, entry); |
| 755 | goto out_unlock; | 755 | goto out_unlock; |
| 756 | } | 756 | } |
| 757 | entry = pte_wrprotect(entry); | 757 | entry = pte_wrprotect(entry); |
| @@ -1086,7 +1086,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
| 1086 | cond_resched(); | 1086 | cond_resched(); |
| 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
| 1088 | tree_page = get_mergeable_page(tree_rmap_item); | 1088 | tree_page = get_mergeable_page(tree_rmap_item); |
| 1089 | if (!tree_page) | 1089 | if (IS_ERR_OR_NULL(tree_page)) |
| 1090 | return NULL; | 1090 | return NULL; |
| 1091 | 1091 | ||
| 1092 | /* | 1092 | /* |
| @@ -1294,7 +1294,7 @@ next_mm: | |||
| 1294 | if (ksm_test_exit(mm)) | 1294 | if (ksm_test_exit(mm)) |
| 1295 | break; | 1295 | break; |
| 1296 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); | 1296 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); |
| 1297 | if (*page && PageAnon(*page)) { | 1297 | if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { |
| 1298 | flush_anon_page(vma, *page, ksm_scan.address); | 1298 | flush_anon_page(vma, *page, ksm_scan.address); |
| 1299 | flush_dcache_page(*page); | 1299 | flush_dcache_page(*page); |
| 1300 | rmap_item = get_next_rmap_item(slot, | 1300 | rmap_item = get_next_rmap_item(slot, |
| @@ -1308,7 +1308,7 @@ next_mm: | |||
| 1308 | up_read(&mm->mmap_sem); | 1308 | up_read(&mm->mmap_sem); |
| 1309 | return rmap_item; | 1309 | return rmap_item; |
| 1310 | } | 1310 | } |
| 1311 | if (*page) | 1311 | if (!IS_ERR_OR_NULL(*page)) |
| 1312 | put_page(*page); | 1312 | put_page(*page); |
| 1313 | ksm_scan.address += PAGE_SIZE; | 1313 | ksm_scan.address += PAGE_SIZE; |
| 1314 | cond_resched(); | 1314 | cond_resched(); |
| @@ -1367,7 +1367,7 @@ next_mm: | |||
| 1367 | static void ksm_do_scan(unsigned int scan_npages) | 1367 | static void ksm_do_scan(unsigned int scan_npages) |
| 1368 | { | 1368 | { |
| 1369 | struct rmap_item *rmap_item; | 1369 | struct rmap_item *rmap_item; |
| 1370 | struct page *page; | 1370 | struct page *uninitialized_var(page); |
| 1371 | 1371 | ||
| 1372 | while (scan_npages--) { | 1372 | while (scan_npages--) { |
| 1373 | cond_resched(); | 1373 | cond_resched(); |
| @@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | |||
| 1563 | again: | 1563 | again: |
| 1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
| 1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
| 1566 | struct anon_vma_chain *vmac; | ||
| 1566 | struct vm_area_struct *vma; | 1567 | struct vm_area_struct *vma; |
| 1567 | 1568 | ||
| 1568 | spin_lock(&anon_vma->lock); | 1569 | spin_lock(&anon_vma->lock); |
| 1569 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1570 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
| 1571 | vma = vmac->vma; | ||
| 1570 | if (rmap_item->address < vma->vm_start || | 1572 | if (rmap_item->address < vma->vm_start || |
| 1571 | rmap_item->address >= vma->vm_end) | 1573 | rmap_item->address >= vma->vm_end) |
| 1572 | continue; | 1574 | continue; |
| @@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | |||
| 1614 | again: | 1616 | again: |
| 1615 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1617 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
| 1616 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1618 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
| 1619 | struct anon_vma_chain *vmac; | ||
| 1617 | struct vm_area_struct *vma; | 1620 | struct vm_area_struct *vma; |
| 1618 | 1621 | ||
| 1619 | spin_lock(&anon_vma->lock); | 1622 | spin_lock(&anon_vma->lock); |
| 1620 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1623 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
| 1624 | vma = vmac->vma; | ||
| 1621 | if (rmap_item->address < vma->vm_start || | 1625 | if (rmap_item->address < vma->vm_start || |
| 1622 | rmap_item->address >= vma->vm_end) | 1626 | rmap_item->address >= vma->vm_end) |
| 1623 | continue; | 1627 | continue; |
| @@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | |||
| 1664 | again: | 1668 | again: |
| 1665 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1669 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
| 1666 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1670 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
| 1671 | struct anon_vma_chain *vmac; | ||
| 1667 | struct vm_area_struct *vma; | 1672 | struct vm_area_struct *vma; |
| 1668 | 1673 | ||
| 1669 | spin_lock(&anon_vma->lock); | 1674 | spin_lock(&anon_vma->lock); |
| 1670 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1675 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
| 1676 | vma = vmac->vma; | ||
| 1671 | if (rmap_item->address < vma->vm_start || | 1677 | if (rmap_item->address < vma->vm_start || |
| 1672 | rmap_item->address >= vma->vm_end) | 1678 | rmap_item->address >= vma->vm_end) |
| 1673 | continue; | 1679 | continue; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80bed..0f711c213d2e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
| @@ -6,6 +6,10 @@ | |||
| 6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
| 7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
| 8 | * | 8 | * |
| 9 | * Memory thresholds | ||
| 10 | * Copyright (C) 2009 Nokia Corporation | ||
| 11 | * Author: Kirill A. Shutemov | ||
| 12 | * | ||
| 9 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
| 10 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
| 11 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
| @@ -21,6 +25,7 @@ | |||
| 21 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
| 22 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
| 23 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
| 28 | #include <linux/hugetlb.h> | ||
| 24 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
| 25 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
| 26 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
| @@ -32,7 +37,10 @@ | |||
| 32 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
| 33 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
| 34 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
| 40 | #include <linux/swapops.h> | ||
| 35 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
| 42 | #include <linux/eventfd.h> | ||
| 43 | #include <linux/sort.h> | ||
| 36 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
| 37 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
| 38 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
| @@ -55,7 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
| 55 | #define do_swap_account (0) | 63 | #define do_swap_account (0) |
| 56 | #endif | 64 | #endif |
| 57 | 65 | ||
| 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 66 | /* |
| 67 | * Per memcg event counter is incremented at every pagein/pageout. This counter | ||
| 68 | * is used for trigger some periodic events. This is straightforward and better | ||
| 69 | * than using jiffies etc. to handle periodic memcg event. | ||
| 70 | * | ||
| 71 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
| 72 | */ | ||
| 73 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
| 74 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
| 59 | 75 | ||
| 60 | /* | 76 | /* |
| 61 | * Statistics for memory cgroup. | 77 | * Statistics for memory cgroup. |
| @@ -69,62 +85,16 @@ enum mem_cgroup_stat_index { | |||
| 69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 85 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
| 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 86 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
| 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 87 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
| 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
| 73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 88 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
| 89 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | ||
| 74 | 90 | ||
| 75 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
| 76 | }; | 92 | }; |
| 77 | 93 | ||
| 78 | struct mem_cgroup_stat_cpu { | 94 | struct mem_cgroup_stat_cpu { |
| 79 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 95 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
| 80 | } ____cacheline_aligned_in_smp; | ||
| 81 | |||
| 82 | struct mem_cgroup_stat { | ||
| 83 | struct mem_cgroup_stat_cpu cpustat[0]; | ||
| 84 | }; | 96 | }; |
| 85 | 97 | ||
| 86 | static inline void | ||
| 87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
| 88 | enum mem_cgroup_stat_index idx) | ||
| 89 | { | ||
| 90 | stat->count[idx] = 0; | ||
| 91 | } | ||
| 92 | |||
| 93 | static inline s64 | ||
| 94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
| 95 | enum mem_cgroup_stat_index idx) | ||
| 96 | { | ||
| 97 | return stat->count[idx]; | ||
| 98 | } | ||
| 99 | |||
| 100 | /* | ||
| 101 | * For accounting under irq disable, no need for increment preempt count. | ||
| 102 | */ | ||
| 103 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | ||
| 104 | enum mem_cgroup_stat_index idx, int val) | ||
| 105 | { | ||
| 106 | stat->count[idx] += val; | ||
| 107 | } | ||
| 108 | |||
| 109 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
| 110 | enum mem_cgroup_stat_index idx) | ||
| 111 | { | ||
| 112 | int cpu; | ||
| 113 | s64 ret = 0; | ||
| 114 | for_each_possible_cpu(cpu) | ||
| 115 | ret += stat->cpustat[cpu].count[idx]; | ||
| 116 | return ret; | ||
| 117 | } | ||
| 118 | |||
| 119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
| 120 | { | ||
| 121 | s64 ret; | ||
| 122 | |||
| 123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
| 124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
| 125 | return ret; | ||
| 126 | } | ||
| 127 | |||
| 128 | /* | 98 | /* |
| 129 | * per-zone information in memory controller. | 99 | * per-zone information in memory controller. |
| 130 | */ | 100 | */ |
| @@ -174,6 +144,22 @@ struct mem_cgroup_tree { | |||
| 174 | 144 | ||
| 175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 145 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
| 176 | 146 | ||
| 147 | struct mem_cgroup_threshold { | ||
| 148 | struct eventfd_ctx *eventfd; | ||
| 149 | u64 threshold; | ||
| 150 | }; | ||
| 151 | |||
| 152 | struct mem_cgroup_threshold_ary { | ||
| 153 | /* An array index points to threshold just below usage. */ | ||
| 154 | atomic_t current_threshold; | ||
| 155 | /* Size of entries[] */ | ||
| 156 | unsigned int size; | ||
| 157 | /* Array of thresholds */ | ||
| 158 | struct mem_cgroup_threshold entries[0]; | ||
| 159 | }; | ||
| 160 | |||
| 161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | ||
| 162 | |||
| 177 | /* | 163 | /* |
| 178 | * The memory controller data structure. The memory controller controls both | 164 | * The memory controller data structure. The memory controller controls both |
| 179 | * page cache and RSS per cgroup. We would eventually like to provide | 165 | * page cache and RSS per cgroup. We would eventually like to provide |
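
The mem_cgroup_threshold structures added above back the new memory-threshold notifications: user space arms an eventfd for a usage value and the kernel signals it when memory.usage_in_bytes (or the memsw counter) crosses that threshold. The registration path goes through the eventfd-based cgroup notification interface from the same patch series rather than through this file alone, so the following user-space sketch is an assumption about that interface; the mount point /cgroup/mem, the 64 MiB value and the omission of error handling are all illustrative:

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		int usage = open("/cgroup/mem/memory.usage_in_bytes", O_RDONLY);
		int ctrl = open("/cgroup/mem/cgroup.event_control", O_WRONLY);
		char cmd[64];
		uint64_t ticks;

		/* "<event_fd> <control_file_fd> <threshold in bytes>" */
		snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, usage, 64ULL << 20);
		write(ctrl, cmd, strlen(cmd));		/* register the threshold */

		read(efd, &ticks, sizeof(ticks));	/* blocks until crossed */
		printf("memory usage crossed 64 MiB\n");
		return 0;
	}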
| @@ -217,7 +203,7 @@ struct mem_cgroup { | |||
| 217 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
| 218 | */ | 204 | */ |
| 219 | bool use_hierarchy; | 205 | bool use_hierarchy; |
| 220 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
| 221 | atomic_t refcnt; | 207 | atomic_t refcnt; |
| 222 | 208 | ||
| 223 | unsigned int swappiness; | 209 | unsigned int swappiness; |
| @@ -225,10 +211,48 @@ struct mem_cgroup { | |||
| 225 | /* set when res.limit == memsw.limit */ | 211 | /* set when res.limit == memsw.limit */ |
| 226 | bool memsw_is_minimum; | 212 | bool memsw_is_minimum; |
| 227 | 213 | ||
| 214 | /* protect arrays of thresholds */ | ||
| 215 | struct mutex thresholds_lock; | ||
| 216 | |||
| 217 | /* thresholds for memory usage. RCU-protected */ | ||
| 218 | struct mem_cgroup_threshold_ary *thresholds; | ||
| 219 | |||
| 220 | /* thresholds for mem+swap usage. RCU-protected */ | ||
| 221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | ||
| 222 | |||
| 223 | /* | ||
| 224 | * Should we move charges of a task when a task is moved into this | ||
| 225 | * mem_cgroup ? And what type of charges should we move ? | ||
| 226 | */ | ||
| 227 | unsigned long move_charge_at_immigrate; | ||
| 228 | |||
| 228 | /* | 229 | /* |
| 229 | * statistics. This must be placed at the end of memcg. | 230 | * percpu counter. |
| 230 | */ | 231 | */ |
| 231 | struct mem_cgroup_stat stat; | 232 | struct mem_cgroup_stat_cpu *stat; |
| 233 | }; | ||
| 234 | |||
| 235 | /* Stuffs for move charges at task migration. */ | ||
| 236 | /* | ||
| 237 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a | ||
| 238 | * left-shifted bitmap of these types. | ||
| 239 | */ | ||
| 240 | enum move_type { | ||
| 241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | ||
| 242 | NR_MOVE_TYPE, | ||
| 243 | }; | ||
| 244 | |||
| 245 | /* "mc" and its members are protected by cgroup_mutex */ | ||
| 246 | static struct move_charge_struct { | ||
| 247 | struct mem_cgroup *from; | ||
| 248 | struct mem_cgroup *to; | ||
| 249 | unsigned long precharge; | ||
| 250 | unsigned long moved_charge; | ||
| 251 | unsigned long moved_swap; | ||
| 252 | struct task_struct *moving_task; /* a task moving charges */ | ||
| 253 | wait_queue_head_t waitq; /* a waitq for other context */ | ||
| 254 | } mc = { | ||
| 255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | ||
| 232 | }; | 256 | }; |
| 233 | 257 | ||
| 234 | /* | 258 | /* |
| @@ -371,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
| 371 | spin_unlock(&mctz->lock); | 395 | spin_unlock(&mctz->lock); |
| 372 | } | 396 | } |
| 373 | 397 | ||
| 374 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
| 375 | { | ||
| 376 | bool ret = false; | ||
| 377 | int cpu; | ||
| 378 | s64 val; | ||
| 379 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 380 | |||
| 381 | cpu = get_cpu(); | ||
| 382 | cpustat = &mem->stat.cpustat[cpu]; | ||
| 383 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
| 384 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
| 385 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
| 386 | ret = true; | ||
| 387 | } | ||
| 388 | put_cpu(); | ||
| 389 | return ret; | ||
| 390 | } | ||
| 391 | 398 | ||
| 392 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 399 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
| 393 | { | 400 | { |
| @@ -481,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
| 481 | return mz; | 488 | return mz; |
| 482 | } | 489 | } |
| 483 | 490 | ||
| 491 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | ||
| 492 | enum mem_cgroup_stat_index idx) | ||
| 493 | { | ||
| 494 | int cpu; | ||
| 495 | s64 val = 0; | ||
| 496 | |||
| 497 | for_each_possible_cpu(cpu) | ||
| 498 | val += per_cpu(mem->stat->count[idx], cpu); | ||
| 499 | return val; | ||
| 500 | } | ||
| 501 | |||
| 502 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
| 503 | { | ||
| 504 | s64 ret; | ||
| 505 | |||
| 506 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
| 507 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
| 508 | return ret; | ||
| 509 | } | ||
| 510 | |||
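The per-cpu statistics rework above replaces the embedded stat array with a percpu allocation; mem_cgroup_read_stat() then just sums one counter slot across all possible CPUs, and mem_cgroup_local_usage() adds the RSS and CACHE totals. A minimal userspace sketch of that summation, with a plain array standing in for the per-cpu slots (names and values here are illustrative, not kernel API):

#include <stdio.h>

#define NR_CPUS 4

/* stand-in for one per-cpu counter slot per CPU (made-up values) */
static long long rss_count[NR_CPUS]   = { 10, 3, 0, 7 };
static long long cache_count[NR_CPUS] = { 2, 0, 5, 1 };

/* sum a counter over all possible CPUs, as mem_cgroup_read_stat() does */
static long long read_stat(const long long *count)
{
	long long val = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		val += count[cpu];
	return val;
}

int main(void)
{
	/* local usage = RSS + CACHE, as in mem_cgroup_local_usage() */
	printf("local usage = %lld pages\n",
	       read_stat(rss_count) + read_stat(cache_count));	/* prints 28 */
	return 0;
}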
| 484 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 511 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
| 485 | bool charge) | 512 | bool charge) |
| 486 | { | 513 | { |
| 487 | int val = (charge) ? 1 : -1; | 514 | int val = (charge) ? 1 : -1; |
| 488 | struct mem_cgroup_stat *stat = &mem->stat; | 515 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
| 489 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 490 | int cpu = get_cpu(); | ||
| 491 | |||
| 492 | cpustat = &stat->cpustat[cpu]; | ||
| 493 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
| 494 | put_cpu(); | ||
| 495 | } | 516 | } |
| 496 | 517 | ||
| 497 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 518 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
| @@ -499,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
| 499 | bool charge) | 520 | bool charge) |
| 500 | { | 521 | { |
| 501 | int val = (charge) ? 1 : -1; | 522 | int val = (charge) ? 1 : -1; |
| 502 | struct mem_cgroup_stat *stat = &mem->stat; | ||
| 503 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 504 | int cpu = get_cpu(); | ||
| 505 | 523 | ||
| 506 | cpustat = &stat->cpustat[cpu]; | 524 | preempt_disable(); |
| 525 | |||
| 507 | if (PageCgroupCache(pc)) | 526 | if (PageCgroupCache(pc)) |
| 508 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 527 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); |
| 509 | else | 528 | else |
| 510 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 529 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); |
| 511 | 530 | ||
| 512 | if (charge) | 531 | if (charge) |
| 513 | __mem_cgroup_stat_add_safe(cpustat, | 532 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
| 514 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | ||
| 515 | else | 533 | else |
| 516 | __mem_cgroup_stat_add_safe(cpustat, | 534 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
| 517 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 535 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); |
| 518 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | 536 | |
| 519 | put_cpu(); | 537 | preempt_enable(); |
| 520 | } | 538 | } |
| 521 | 539 | ||
| 522 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 540 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
| @@ -534,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
| 534 | return total; | 552 | return total; |
| 535 | } | 553 | } |
| 536 | 554 | ||
| 555 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | ||
| 556 | { | ||
| 557 | s64 val; | ||
| 558 | |||
| 559 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | ||
| 560 | |||
| 561 | return !(val & ((1 << event_mask_shift) - 1)); | ||
| 562 | } | ||
| 563 | |||
| 564 | /* | ||
| 565 | * Check events in order. | ||
| 566 | * | ||
| 567 | */ | ||
| 568 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | ||
| 569 | { | ||
| 570 | /* threshold event is triggered in finer grain than soft limit */ | ||
| 571 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | ||
| 572 | mem_cgroup_threshold(mem); | ||
| 573 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | ||
| 574 | mem_cgroup_update_tree(mem, page); | ||
| 575 | } | ||
| 576 | } | ||
| 577 | |||
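__memcg_event_check() above fires once every 2^shift charge/uncharge events: the event counter is bumped on every statistics update, and the check succeeds only when its low bits are all zero. A hedged userspace sketch of that mask test (the shift value 7 is a placeholder, not the kernel's THRESHOLDS_EVENTS_THRESH or SOFTLIMIT_EVENTS_THRESH):

#include <stdbool.h>
#include <stdio.h>

/* true once every 2^mask_shift events, i.e. when the low bits are all zero */
static bool event_check(long long events, int mask_shift)
{
	return !(events & ((1LL << mask_shift) - 1));
}

int main(void)
{
	long long e;

	for (e = 125; e <= 130; e++)	/* with shift 7, only 128 fires */
		printf("events=%lld fire=%d\n", e, event_check(e, 7));
	return 0;
}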
| 537 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 578 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
| 538 | { | 579 | { |
| 539 | return container_of(cgroup_subsys_state(cont, | 580 | return container_of(cgroup_subsys_state(cont, |
| @@ -770,10 +811,12 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
| 770 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | 811 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* |
| 771 | * hierarchy(even if use_hierarchy is disabled in "mem"). | 812 | * hierarchy(even if use_hierarchy is disabled in "mem"). |
| 772 | */ | 813 | */ |
| 814 | rcu_read_lock(); | ||
| 773 | if (mem->use_hierarchy) | 815 | if (mem->use_hierarchy) |
| 774 | ret = css_is_ancestor(&curr->css, &mem->css); | 816 | ret = css_is_ancestor(&curr->css, &mem->css); |
| 775 | else | 817 | else |
| 776 | ret = (curr == mem); | 818 | ret = (curr == mem); |
| 819 | rcu_read_unlock(); | ||
| 777 | css_put(&curr->css); | 820 | css_put(&curr->css); |
| 778 | return ret; | 821 | return ret; |
| 779 | } | 822 | } |
| @@ -1000,7 +1043,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |||
| 1000 | } | 1043 | } |
| 1001 | 1044 | ||
| 1002 | /** | 1045 | /** |
| 1003 | * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. | 1046 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
| 1004 | * @memcg: The memory cgroup that went over limit | 1047 | * @memcg: The memory cgroup that went over limit |
| 1005 | * @p: Task that is going to be killed | 1048 | * @p: Task that is going to be killed |
| 1006 | * | 1049 | * |
| @@ -1174,7 +1217,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
| 1174 | } | 1217 | } |
| 1175 | } | 1218 | } |
| 1176 | } | 1219 | } |
| 1177 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1220 | if (!mem_cgroup_local_usage(victim)) { |
| 1178 | /* this cgroup's local usage == 0 */ | 1221 | /* this cgroup's local usage == 0 */ |
| 1179 | css_put(&victim->css); | 1222 | css_put(&victim->css); |
| 1180 | continue; | 1223 | continue; |
| @@ -1205,32 +1248,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
| 1205 | return total; | 1248 | return total; |
| 1206 | } | 1249 | } |
| 1207 | 1250 | ||
| 1208 | bool mem_cgroup_oom_called(struct task_struct *task) | 1251 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
| 1209 | { | 1252 | { |
| 1210 | bool ret = false; | 1253 | int *val = (int *)data; |
| 1211 | struct mem_cgroup *mem; | 1254 | int x; |
| 1212 | struct mm_struct *mm; | 1255 | /* |
| 1256 | * Logically, we can stop scanning immediately when we find | ||
| 1257 | * a memcg is already locked. But considering unlock ops and | ||
| 1258 | * creation/removal of memcgs, scanning them all is the simpler operation. | ||
| 1259 | */ | ||
| 1260 | x = atomic_inc_return(&mem->oom_lock); | ||
| 1261 | *val = max(x, *val); | ||
| 1262 | return 0; | ||
| 1263 | } | ||
| 1264 | /* | ||
| 1265 | * Check whether the OOM killer is already running under our hierarchy. | ||
| 1266 | * If someone is running, return false. | ||
| 1267 | */ | ||
| 1268 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
| 1269 | { | ||
| 1270 | int lock_count = 0; | ||
| 1213 | 1271 | ||
| 1214 | rcu_read_lock(); | 1272 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
| 1215 | mm = task->mm; | 1273 | |
| 1216 | if (!mm) | 1274 | if (lock_count == 1) |
| 1217 | mm = &init_mm; | 1275 | return true; |
| 1218 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1276 | return false; |
| 1219 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
| 1220 | ret = true; | ||
| 1221 | rcu_read_unlock(); | ||
| 1222 | return ret; | ||
| 1223 | } | 1277 | } |
| 1224 | 1278 | ||
| 1225 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1279 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
| 1226 | { | 1280 | { |
| 1227 | mem->last_oom_jiffies = jiffies; | 1281 | /* |
| 1282 | * When a new child is created while the hierarchy is under oom, | ||
| 1283 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
| 1284 | * atomic_add_unless() here. | ||
| 1285 | */ | ||
| 1286 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
| 1228 | return 0; | 1287 | return 0; |
| 1229 | } | 1288 | } |
| 1230 | 1289 | ||
| 1231 | static void record_last_oom(struct mem_cgroup *mem) | 1290 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
| 1291 | { | ||
| 1292 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); | ||
| 1293 | } | ||
| 1294 | |||
| 1295 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
| 1296 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
| 1297 | |||
| 1298 | /* | ||
| 1299 | * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop. | ||
| 1300 | */ | ||
| 1301 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
| 1232 | { | 1302 | { |
| 1233 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1303 | DEFINE_WAIT(wait); |
| 1304 | bool locked; | ||
| 1305 | |||
| 1306 | /* At first, try to OOM-lock the hierarchy under mem. */ | ||
| 1307 | mutex_lock(&memcg_oom_mutex); | ||
| 1308 | locked = mem_cgroup_oom_lock(mem); | ||
| 1309 | /* | ||
| 1310 | * Even if signal_pending(), we can't quit charge() loop without | ||
| 1311 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
| 1312 | * under OOM is always welcomed, use TASK_KILLABLE here. | ||
| 1313 | */ | ||
| 1314 | if (!locked) | ||
| 1315 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
| 1316 | mutex_unlock(&memcg_oom_mutex); | ||
| 1317 | |||
| 1318 | if (locked) | ||
| 1319 | mem_cgroup_out_of_memory(mem, mask); | ||
| 1320 | else { | ||
| 1321 | schedule(); | ||
| 1322 | finish_wait(&memcg_oom_waitq, &wait); | ||
| 1323 | } | ||
| 1324 | mutex_lock(&memcg_oom_mutex); | ||
| 1325 | mem_cgroup_oom_unlock(mem); | ||
| 1326 | /* | ||
| 1327 | * Here, we use a global waitq. Should we use a more fine-grained waitq? | ||
| 1328 | * Assume following hierarchy. | ||
| 1329 | * A/ | ||
| 1330 | * 01 | ||
| 1331 | * 02 | ||
| 1332 | * assume OOM happens both in A and 01 at the same time. They are | ||
| 1333 | * mutually exclusive by the lock. (A kill in 01 helps A.) | ||
| 1334 | * With a per-memcg waitq, we would have to wake up waiters on A and 02 | ||
| 1335 | * in addition to waiters on 01. We use a global waitq to avoid that mess. | ||
| 1336 | * It will not be a big problem. | ||
| 1337 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
| 1338 | */ | ||
| 1339 | wake_up_all(&memcg_oom_waitq); | ||
| 1340 | mutex_unlock(&memcg_oom_mutex); | ||
| 1341 | |||
| 1342 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
| 1343 | return false; | ||
| 1344 | /* Give chance to dying process */ | ||
| 1345 | schedule_timeout(1); | ||
| 1346 | return true; | ||
| 1234 | } | 1347 | } |
| 1235 | 1348 | ||
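The oom_lock scheme above "locks" a whole hierarchy by atomically incrementing a counter on every memcg in the subtree and remembering the largest value returned; the caller owns the OOM path only if that maximum is 1, i.e. nobody had already raised any of the counters. The unlock side decrements each counter but never below zero, which copes with children created while the hierarchy was under OOM. A small sketch of the same idea over a flat array of counters (purely illustrative: no cgroup tree, no atomics):

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 3

/* one counter per memcg in the (flattened) hierarchy */
static int oom_lock[NR_NODES];

static bool hierarchy_oom_lock(void)
{
	int max = 0, i;

	for (i = 0; i < NR_NODES; i++) {
		int x = ++oom_lock[i];	/* atomic_inc_return() in the kernel */
		if (x > max)
			max = x;
	}
	return max == 1;	/* nobody else held any node */
}

static void hierarchy_oom_unlock(void)
{
	int i;

	for (i = 0; i < NR_NODES; i++)
		if (oom_lock[i] > 0)	/* like atomic_add_unless(.., -1, 0) */
			oom_lock[i]--;
}

int main(void)
{
	printf("first locker:  %d\n", hierarchy_oom_lock());	/* 1 */
	printf("second locker: %d\n", hierarchy_oom_lock());	/* 0 */
	hierarchy_oom_unlock();
	hierarchy_oom_unlock();
	return 0;
}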
| 1236 | /* | 1349 | /* |
| @@ -1240,9 +1353,6 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
| 1240 | void mem_cgroup_update_file_mapped(struct page *page, int val) | 1353 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
| 1241 | { | 1354 | { |
| 1242 | struct mem_cgroup *mem; | 1355 | struct mem_cgroup *mem; |
| 1243 | struct mem_cgroup_stat *stat; | ||
| 1244 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 1245 | int cpu; | ||
| 1246 | struct page_cgroup *pc; | 1356 | struct page_cgroup *pc; |
| 1247 | 1357 | ||
| 1248 | pc = lookup_page_cgroup(page); | 1358 | pc = lookup_page_cgroup(page); |
| @@ -1251,20 +1361,20 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) | |||
| 1251 | 1361 | ||
| 1252 | lock_page_cgroup(pc); | 1362 | lock_page_cgroup(pc); |
| 1253 | mem = pc->mem_cgroup; | 1363 | mem = pc->mem_cgroup; |
| 1254 | if (!mem) | 1364 | if (!mem || !PageCgroupUsed(pc)) |
| 1255 | goto done; | ||
| 1256 | |||
| 1257 | if (!PageCgroupUsed(pc)) | ||
| 1258 | goto done; | 1365 | goto done; |
| 1259 | 1366 | ||
| 1260 | /* | 1367 | /* |
| 1261 | * Preemption is already disabled, we don't need get_cpu() | 1368 | * Preemption is already disabled. We can use __this_cpu_xxx |
| 1262 | */ | 1369 | */ |
| 1263 | cpu = smp_processor_id(); | 1370 | if (val > 0) { |
| 1264 | stat = &mem->stat; | 1371 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
| 1265 | cpustat = &stat->cpustat[cpu]; | 1372 | SetPageCgroupFileMapped(pc); |
| 1373 | } else { | ||
| 1374 | __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | ||
| 1375 | ClearPageCgroupFileMapped(pc); | ||
| 1376 | } | ||
| 1266 | 1377 | ||
| 1267 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
| 1268 | done: | 1378 | done: |
| 1269 | unlock_page_cgroup(pc); | 1379 | unlock_page_cgroup(pc); |
| 1270 | } | 1380 | } |
| @@ -1401,19 +1511,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | |||
| 1401 | * oom-killer can be invoked. | 1511 | * oom-killer can be invoked. |
| 1402 | */ | 1512 | */ |
| 1403 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1513 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
| 1404 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1514 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
| 1405 | bool oom, struct page *page) | ||
| 1406 | { | 1515 | { |
| 1407 | struct mem_cgroup *mem, *mem_over_limit; | 1516 | struct mem_cgroup *mem, *mem_over_limit; |
| 1408 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1517 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
| 1409 | struct res_counter *fail_res; | 1518 | struct res_counter *fail_res; |
| 1410 | int csize = CHARGE_SIZE; | 1519 | int csize = CHARGE_SIZE; |
| 1411 | 1520 | ||
| 1412 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1521 | /* |
| 1413 | /* Don't account this! */ | 1522 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage |
| 1414 | *memcg = NULL; | 1523 | * in system level. So, allow to go ahead dying process in addition to |
| 1415 | return 0; | 1524 | * MEMDIE process. |
| 1416 | } | 1525 | */ |
| 1526 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
| 1527 | || fatal_signal_pending(current))) | ||
| 1528 | goto bypass; | ||
| 1417 | 1529 | ||
| 1418 | /* | 1530 | /* |
| 1419 | * We always charge the cgroup the mm_struct belongs to. | 1531 | * We always charge the cgroup the mm_struct belongs to. |
| @@ -1440,7 +1552,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1440 | unsigned long flags = 0; | 1552 | unsigned long flags = 0; |
| 1441 | 1553 | ||
| 1442 | if (consume_stock(mem)) | 1554 | if (consume_stock(mem)) |
| 1443 | goto charged; | 1555 | goto done; |
| 1444 | 1556 | ||
| 1445 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 1557 | ret = res_counter_charge(&mem->res, csize, &fail_res); |
| 1446 | if (likely(!ret)) { | 1558 | if (likely(!ret)) { |
| @@ -1483,28 +1595,70 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1483 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1595 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
| 1484 | continue; | 1596 | continue; |
| 1485 | 1597 | ||
| 1598 | /* try to avoid oom while someone is moving charge */ | ||
| 1599 | if (mc.moving_task && current != mc.moving_task) { | ||
| 1600 | struct mem_cgroup *from, *to; | ||
| 1601 | bool do_continue = false; | ||
| 1602 | /* | ||
| 1603 | * There is a small race that "from" or "to" can be | ||
| 1604 | * freed by rmdir, so we use css_tryget(). | ||
| 1605 | */ | ||
| 1606 | rcu_read_lock(); | ||
| 1607 | from = mc.from; | ||
| 1608 | to = mc.to; | ||
| 1609 | if (from && css_tryget(&from->css)) { | ||
| 1610 | if (mem_over_limit->use_hierarchy) | ||
| 1611 | do_continue = css_is_ancestor( | ||
| 1612 | &from->css, | ||
| 1613 | &mem_over_limit->css); | ||
| 1614 | else | ||
| 1615 | do_continue = (from == mem_over_limit); | ||
| 1616 | css_put(&from->css); | ||
| 1617 | } | ||
| 1618 | if (!do_continue && to && css_tryget(&to->css)) { | ||
| 1619 | if (mem_over_limit->use_hierarchy) | ||
| 1620 | do_continue = css_is_ancestor( | ||
| 1621 | &to->css, | ||
| 1622 | &mem_over_limit->css); | ||
| 1623 | else | ||
| 1624 | do_continue = (to == mem_over_limit); | ||
| 1625 | css_put(&to->css); | ||
| 1626 | } | ||
| 1627 | rcu_read_unlock(); | ||
| 1628 | if (do_continue) { | ||
| 1629 | DEFINE_WAIT(wait); | ||
| 1630 | prepare_to_wait(&mc.waitq, &wait, | ||
| 1631 | TASK_INTERRUPTIBLE); | ||
| 1632 | /* moving charge context might have finished. */ | ||
| 1633 | if (mc.moving_task) | ||
| 1634 | schedule(); | ||
| 1635 | finish_wait(&mc.waitq, &wait); | ||
| 1636 | continue; | ||
| 1637 | } | ||
| 1638 | } | ||
| 1639 | |||
| 1486 | if (!nr_retries--) { | 1640 | if (!nr_retries--) { |
| 1487 | if (oom) { | 1641 | if (!oom) |
| 1488 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1642 | goto nomem; |
| 1489 | record_last_oom(mem_over_limit); | 1643 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
| 1644 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
| 1645 | continue; | ||
| 1490 | } | 1646 | } |
| 1491 | goto nomem; | 1647 | /* When we reach here, the current task is dying. */ |
| 1648 | css_put(&mem->css); | ||
| 1649 | goto bypass; | ||
| 1492 | } | 1650 | } |
| 1493 | } | 1651 | } |
| 1494 | if (csize > PAGE_SIZE) | 1652 | if (csize > PAGE_SIZE) |
| 1495 | refill_stock(mem, csize - PAGE_SIZE); | 1653 | refill_stock(mem, csize - PAGE_SIZE); |
| 1496 | charged: | ||
| 1497 | /* | ||
| 1498 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
| 1499 | * if they exceeds softlimit. | ||
| 1500 | */ | ||
| 1501 | if (mem_cgroup_soft_limit_check(mem)) | ||
| 1502 | mem_cgroup_update_tree(mem, page); | ||
| 1503 | done: | 1654 | done: |
| 1504 | return 0; | 1655 | return 0; |
| 1505 | nomem: | 1656 | nomem: |
| 1506 | css_put(&mem->css); | 1657 | css_put(&mem->css); |
| 1507 | return -ENOMEM; | 1658 | return -ENOMEM; |
| 1659 | bypass: | ||
| 1660 | *memcg = NULL; | ||
| 1661 | return 0; | ||
| 1508 | } | 1662 | } |
| 1509 | 1663 | ||
| 1510 | /* | 1664 | /* |
| @@ -1512,14 +1666,23 @@ nomem: | |||
| 1512 | * This function is for that and do uncharge, put css's refcnt. | 1666 | * This function is for that and do uncharge, put css's refcnt. |
| 1513 | * gotten by try_charge(). | 1667 | * gotten by try_charge(). |
| 1514 | */ | 1668 | */ |
| 1515 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 1669 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
| 1670 | unsigned long count) | ||
| 1516 | { | 1671 | { |
| 1517 | if (!mem_cgroup_is_root(mem)) { | 1672 | if (!mem_cgroup_is_root(mem)) { |
| 1518 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1673 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); |
| 1519 | if (do_swap_account) | 1674 | if (do_swap_account) |
| 1520 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1675 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); |
| 1676 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
| 1677 | WARN_ON_ONCE(count > INT_MAX); | ||
| 1678 | __css_put(&mem->css, (int)count); | ||
| 1521 | } | 1679 | } |
| 1522 | css_put(&mem->css); | 1680 | /* we don't need css_put for root */ |
| 1681 | } | ||
| 1682 | |||
| 1683 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
| 1684 | { | ||
| 1685 | __mem_cgroup_cancel_charge(mem, 1); | ||
| 1523 | } | 1686 | } |
| 1524 | 1687 | ||
| 1525 | /* | 1688 | /* |
| @@ -1615,6 +1778,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
| 1615 | mem_cgroup_charge_statistics(mem, pc, true); | 1778 | mem_cgroup_charge_statistics(mem, pc, true); |
| 1616 | 1779 | ||
| 1617 | unlock_page_cgroup(pc); | 1780 | unlock_page_cgroup(pc); |
| 1781 | /* | ||
| 1782 | * "charge_statistics" updated event counter. Then, check it. | ||
| 1783 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
| 1784 | * if they exceeds softlimit. | ||
| 1785 | */ | ||
| 1786 | memcg_check_events(mem, pc->page); | ||
| 1618 | } | 1787 | } |
| 1619 | 1788 | ||
| 1620 | /** | 1789 | /** |
| @@ -1622,61 +1791,48 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
| 1622 | * @pc: page_cgroup of the page. | 1791 | * @pc: page_cgroup of the page. |
| 1623 | * @from: mem_cgroup which the page is moved from. | 1792 | * @from: mem_cgroup which the page is moved from. |
| 1624 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1793 | * @to: mem_cgroup which the page is moved to. @from != @to. |
| 1794 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
| 1625 | * | 1795 | * |
| 1626 | * The caller must confirm following. | 1796 | * The caller must confirm following. |
| 1627 | * - page is not on LRU (isolate_page() is useful.) | 1797 | * - page is not on LRU (isolate_page() is useful.) |
| 1628 | * - the pc is locked, used, and ->mem_cgroup points to @from. | 1798 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
| 1629 | * | 1799 | * |
| 1630 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1800 | * This function does neither "charge" nor css_get on the new cgroup. That |
| 1631 | * new cgroup. It should be done by a caller. | 1801 | * should be done by the caller (__mem_cgroup_try_charge would be useful). If |
| 1802 | * @uncharge is true, this function does "uncharge" from the old cgroup; if it | ||
| 1803 | * is false, the caller should do the "uncharge". | ||
| 1632 | */ | 1804 | */ |
| 1633 | 1805 | ||
| 1634 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 1806 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
| 1635 | struct mem_cgroup *from, struct mem_cgroup *to) | 1807 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
| 1636 | { | 1808 | { |
| 1637 | struct page *page; | ||
| 1638 | int cpu; | ||
| 1639 | struct mem_cgroup_stat *stat; | ||
| 1640 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 1641 | |||
| 1642 | VM_BUG_ON(from == to); | 1809 | VM_BUG_ON(from == to); |
| 1643 | VM_BUG_ON(PageLRU(pc->page)); | 1810 | VM_BUG_ON(PageLRU(pc->page)); |
| 1644 | VM_BUG_ON(!PageCgroupLocked(pc)); | 1811 | VM_BUG_ON(!PageCgroupLocked(pc)); |
| 1645 | VM_BUG_ON(!PageCgroupUsed(pc)); | 1812 | VM_BUG_ON(!PageCgroupUsed(pc)); |
| 1646 | VM_BUG_ON(pc->mem_cgroup != from); | 1813 | VM_BUG_ON(pc->mem_cgroup != from); |
| 1647 | 1814 | ||
| 1648 | if (!mem_cgroup_is_root(from)) | 1815 | if (PageCgroupFileMapped(pc)) { |
| 1649 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1816 | /* Update mapped_file data for mem_cgroup */ |
| 1650 | mem_cgroup_charge_statistics(from, pc, false); | 1817 | preempt_disable(); |
| 1651 | 1818 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | |
| 1652 | page = pc->page; | 1819 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
| 1653 | if (page_mapped(page) && !PageAnon(page)) { | 1820 | preempt_enable(); |
| 1654 | cpu = smp_processor_id(); | ||
| 1655 | /* Update mapped_file data for mem_cgroup "from" */ | ||
| 1656 | stat = &from->stat; | ||
| 1657 | cpustat = &stat->cpustat[cpu]; | ||
| 1658 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | ||
| 1659 | -1); | ||
| 1660 | |||
| 1661 | /* Update mapped_file data for mem_cgroup "to" */ | ||
| 1662 | stat = &to->stat; | ||
| 1663 | cpustat = &stat->cpustat[cpu]; | ||
| 1664 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | ||
| 1665 | 1); | ||
| 1666 | } | 1821 | } |
| 1822 | mem_cgroup_charge_statistics(from, pc, false); | ||
| 1823 | if (uncharge) | ||
| 1824 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
| 1825 | mem_cgroup_cancel_charge(from); | ||
| 1667 | 1826 | ||
| 1668 | if (do_swap_account && !mem_cgroup_is_root(from)) | 1827 | /* caller should have done css_get */ |
| 1669 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
| 1670 | css_put(&from->css); | ||
| 1671 | |||
| 1672 | css_get(&to->css); | ||
| 1673 | pc->mem_cgroup = to; | 1828 | pc->mem_cgroup = to; |
| 1674 | mem_cgroup_charge_statistics(to, pc, true); | 1829 | mem_cgroup_charge_statistics(to, pc, true); |
| 1675 | /* | 1830 | /* |
| 1676 | * We charges against "to" which may not have any tasks. Then, "to" | 1831 | * We charge against "to", which may not have any tasks. Then, "to" |
| 1677 | * can be under rmdir(). But in current implementation, caller of | 1832 | * can be under rmdir(). But in the current implementation, the callers of |
| 1678 | * this function is just force_empty() and it's garanteed that | 1833 | * this function are just force_empty() and move charge, so it's |
| 1679 | * "to" is never removed. So, we don't check rmdir status here. | 1834 | * guaranteed that "to" is never removed. So, we don't check the rmdir |
| 1835 | * status here. | ||
| 1680 | */ | 1836 | */ |
| 1681 | } | 1837 | } |
| 1682 | 1838 | ||
| @@ -1685,15 +1841,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 1685 | * __mem_cgroup_move_account() | 1841 | * __mem_cgroup_move_account() |
| 1686 | */ | 1842 | */ |
| 1687 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1843 | static int mem_cgroup_move_account(struct page_cgroup *pc, |
| 1688 | struct mem_cgroup *from, struct mem_cgroup *to) | 1844 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
| 1689 | { | 1845 | { |
| 1690 | int ret = -EINVAL; | 1846 | int ret = -EINVAL; |
| 1691 | lock_page_cgroup(pc); | 1847 | lock_page_cgroup(pc); |
| 1692 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | 1848 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { |
| 1693 | __mem_cgroup_move_account(pc, from, to); | 1849 | __mem_cgroup_move_account(pc, from, to, uncharge); |
| 1694 | ret = 0; | 1850 | ret = 0; |
| 1695 | } | 1851 | } |
| 1696 | unlock_page_cgroup(pc); | 1852 | unlock_page_cgroup(pc); |
| 1853 | /* | ||
| 1854 | * Check events on both the source and destination cgroups. | ||
| 1855 | */ | ||
| 1856 | memcg_check_events(to, pc->page); | ||
| 1857 | memcg_check_events(from, pc->page); | ||
| 1697 | return ret; | 1858 | return ret; |
| 1698 | } | 1859 | } |
| 1699 | 1860 | ||
| @@ -1722,15 +1883,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
| 1722 | goto put; | 1883 | goto put; |
| 1723 | 1884 | ||
| 1724 | parent = mem_cgroup_from_cont(pcg); | 1885 | parent = mem_cgroup_from_cont(pcg); |
| 1725 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1886 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
| 1726 | if (ret || !parent) | 1887 | if (ret || !parent) |
| 1727 | goto put_back; | 1888 | goto put_back; |
| 1728 | 1889 | ||
| 1729 | ret = mem_cgroup_move_account(pc, child, parent); | 1890 | ret = mem_cgroup_move_account(pc, child, parent, true); |
| 1730 | if (!ret) | 1891 | if (ret) |
| 1731 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | 1892 | mem_cgroup_cancel_charge(parent); |
| 1732 | else | ||
| 1733 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
| 1734 | put_back: | 1893 | put_back: |
| 1735 | putback_lru_page(page); | 1894 | putback_lru_page(page); |
| 1736 | put: | 1895 | put: |
| @@ -1760,7 +1919,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 1760 | prefetchw(pc); | 1919 | prefetchw(pc); |
| 1761 | 1920 | ||
| 1762 | mem = memcg; | 1921 | mem = memcg; |
| 1763 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); | 1922 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
| 1764 | if (ret || !mem) | 1923 | if (ret || !mem) |
| 1765 | return ret; | 1924 | return ret; |
| 1766 | 1925 | ||
| @@ -1880,14 +2039,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
| 1880 | if (!mem) | 2039 | if (!mem) |
| 1881 | goto charge_cur_mm; | 2040 | goto charge_cur_mm; |
| 1882 | *ptr = mem; | 2041 | *ptr = mem; |
| 1883 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); | 2042 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
| 1884 | /* drop extra refcnt from tryget */ | 2043 | /* drop extra refcnt from tryget */ |
| 1885 | css_put(&mem->css); | 2044 | css_put(&mem->css); |
| 1886 | return ret; | 2045 | return ret; |
| 1887 | charge_cur_mm: | 2046 | charge_cur_mm: |
| 1888 | if (unlikely(!mm)) | 2047 | if (unlikely(!mm)) |
| 1889 | mm = &init_mm; | 2048 | mm = &init_mm; |
| 1890 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); | 2049 | return __mem_cgroup_try_charge(mm, mask, ptr, true); |
| 1891 | } | 2050 | } |
| 1892 | 2051 | ||
| 1893 | static void | 2052 | static void |
| @@ -2064,8 +2223,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 2064 | mz = page_cgroup_zoneinfo(pc); | 2223 | mz = page_cgroup_zoneinfo(pc); |
| 2065 | unlock_page_cgroup(pc); | 2224 | unlock_page_cgroup(pc); |
| 2066 | 2225 | ||
| 2067 | if (mem_cgroup_soft_limit_check(mem)) | 2226 | memcg_check_events(mem, page); |
| 2068 | mem_cgroup_update_tree(mem, page); | ||
| 2069 | /* at swapout, this memcg will be accessed to record to swap */ | 2227 | /* at swapout, this memcg will be accessed to record to swap */ |
| 2070 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2228 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
| 2071 | css_put(&mem->css); | 2229 | css_put(&mem->css); |
| @@ -2156,7 +2314,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
| 2156 | 2314 | ||
| 2157 | /* record memcg information */ | 2315 | /* record memcg information */ |
| 2158 | if (do_swap_account && swapout && memcg) { | 2316 | if (do_swap_account && swapout && memcg) { |
| 2317 | rcu_read_lock(); | ||
| 2159 | swap_cgroup_record(ent, css_id(&memcg->css)); | 2318 | swap_cgroup_record(ent, css_id(&memcg->css)); |
| 2319 | rcu_read_unlock(); | ||
| 2160 | mem_cgroup_get(memcg); | 2320 | mem_cgroup_get(memcg); |
| 2161 | } | 2321 | } |
| 2162 | if (swapout && memcg) | 2322 | if (swapout && memcg) |
| @@ -2192,6 +2352,66 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
| 2192 | } | 2352 | } |
| 2193 | rcu_read_unlock(); | 2353 | rcu_read_unlock(); |
| 2194 | } | 2354 | } |
| 2355 | |||
| 2356 | /** | ||
| 2357 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
| 2358 | * @entry: swap entry to be moved | ||
| 2359 | * @from: mem_cgroup which the entry is moved from | ||
| 2360 | * @to: mem_cgroup which the entry is moved to | ||
| 2361 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
| 2362 | * | ||
| 2363 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
| 2364 | * as the mem_cgroup's id of @from. | ||
| 2365 | * | ||
| 2366 | * Returns 0 on success, -EINVAL on failure. | ||
| 2367 | * | ||
| 2368 | * The caller must have charged to @to, IOW, called res_counter_charge() on | ||
| 2369 | * both res and memsw, and called css_get(). | ||
| 2370 | */ | ||
| 2371 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
| 2372 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
| 2373 | { | ||
| 2374 | unsigned short old_id, new_id; | ||
| 2375 | |||
| 2376 | rcu_read_lock(); | ||
| 2377 | old_id = css_id(&from->css); | ||
| 2378 | new_id = css_id(&to->css); | ||
| 2379 | rcu_read_unlock(); | ||
| 2380 | |||
| 2381 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
| 2382 | mem_cgroup_swap_statistics(from, false); | ||
| 2383 | mem_cgroup_swap_statistics(to, true); | ||
| 2384 | /* | ||
| 2385 | * This function is only called from task migration context now. | ||
| 2386 | * It postpones res_counter and refcount handling till the end | ||
| 2387 | * of task migration(mem_cgroup_clear_mc()) for performance | ||
| 2388 | * improvement. But we cannot postpone mem_cgroup_get(to) | ||
| 2389 | * because if the process that has been moved to @to does | ||
| 2390 | * swap-in, the refcount of @to might be decreased to 0. | ||
| 2391 | */ | ||
| 2392 | mem_cgroup_get(to); | ||
| 2393 | if (need_fixup) { | ||
| 2394 | if (!mem_cgroup_is_root(from)) | ||
| 2395 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
| 2396 | mem_cgroup_put(from); | ||
| 2397 | /* | ||
| 2398 | * we charged both to->res and to->memsw, so we should | ||
| 2399 | * uncharge to->res. | ||
| 2400 | */ | ||
| 2401 | if (!mem_cgroup_is_root(to)) | ||
| 2402 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
| 2403 | css_put(&to->css); | ||
| 2404 | } | ||
| 2405 | return 0; | ||
| 2406 | } | ||
| 2407 | return -EINVAL; | ||
| 2408 | } | ||
| 2409 | #else | ||
| 2410 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
| 2411 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
| 2412 | { | ||
| 2413 | return -EINVAL; | ||
| 2414 | } | ||
| 2195 | #endif | 2415 | #endif |
| 2196 | 2416 | ||
| 2197 | /* | 2417 | /* |
| @@ -2215,12 +2435,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
| 2215 | } | 2435 | } |
| 2216 | unlock_page_cgroup(pc); | 2436 | unlock_page_cgroup(pc); |
| 2217 | 2437 | ||
| 2438 | *ptr = mem; | ||
| 2218 | if (mem) { | 2439 | if (mem) { |
| 2219 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 2440 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
| 2220 | page); | ||
| 2221 | css_put(&mem->css); | 2441 | css_put(&mem->css); |
| 2222 | } | 2442 | } |
| 2223 | *ptr = mem; | ||
| 2224 | return ret; | 2443 | return ret; |
| 2225 | } | 2444 | } |
| 2226 | 2445 | ||
| @@ -2545,7 +2764,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
| 2545 | pc = list_entry(list->prev, struct page_cgroup, lru); | 2764 | pc = list_entry(list->prev, struct page_cgroup, lru); |
| 2546 | if (busy == pc) { | 2765 | if (busy == pc) { |
| 2547 | list_move(&pc->lru, list); | 2766 | list_move(&pc->lru, list); |
| 2548 | busy = 0; | 2767 | busy = NULL; |
| 2549 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2768 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 2550 | continue; | 2769 | continue; |
| 2551 | } | 2770 | } |
| @@ -2704,7 +2923,7 @@ static int | |||
| 2704 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 2923 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) |
| 2705 | { | 2924 | { |
| 2706 | struct mem_cgroup_idx_data *d = data; | 2925 | struct mem_cgroup_idx_data *d = data; |
| 2707 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | 2926 | d->val += mem_cgroup_read_stat(mem, d->idx); |
| 2708 | return 0; | 2927 | return 0; |
| 2709 | } | 2928 | } |
| 2710 | 2929 | ||
| @@ -2719,40 +2938,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | |||
| 2719 | *val = d.val; | 2938 | *val = d.val; |
| 2720 | } | 2939 | } |
| 2721 | 2940 | ||
| 2941 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | ||
| 2942 | { | ||
| 2943 | u64 idx_val, val; | ||
| 2944 | |||
| 2945 | if (!mem_cgroup_is_root(mem)) { | ||
| 2946 | if (!swap) | ||
| 2947 | return res_counter_read_u64(&mem->res, RES_USAGE); | ||
| 2948 | else | ||
| 2949 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | ||
| 2950 | } | ||
| 2951 | |||
| 2952 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | ||
| 2953 | val = idx_val; | ||
| 2954 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 2955 | val += idx_val; | ||
| 2956 | |||
| 2957 | if (swap) { | ||
| 2958 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2959 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
| 2960 | val += idx_val; | ||
| 2961 | } | ||
| 2962 | |||
| 2963 | return val << PAGE_SHIFT; | ||
| 2964 | } | ||
| 2965 | |||
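mem_cgroup_usage() above reads the res_counter for ordinary cgroups, but for the root cgroup it rebuilds usage from the recursive CACHE and RSS statistics, adds SWAPOUT for the mem+swap variant, and shifts the page count into bytes. A toy calculation of the root case, with made-up page counts:

#include <stdio.h>

#define PAGE_SHIFT 12

/* recursive stat totals for the root cgroup, in pages (invented numbers) */
static unsigned long long cache_pages = 100, rss_pages = 50, swap_pages = 8;

static unsigned long long root_usage(int swap)
{
	unsigned long long val = cache_pages + rss_pages;

	if (swap)
		val += swap_pages;
	return val << PAGE_SHIFT;	/* pages -> bytes */
}

int main(void)
{
	printf("mem usage:   %llu bytes\n", root_usage(0));
	printf("memsw usage: %llu bytes\n", root_usage(1));
	return 0;
}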
| 2722 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2966 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
| 2723 | { | 2967 | { |
| 2724 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2968 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
| 2725 | u64 idx_val, val; | 2969 | u64 val; |
| 2726 | int type, name; | 2970 | int type, name; |
| 2727 | 2971 | ||
| 2728 | type = MEMFILE_TYPE(cft->private); | 2972 | type = MEMFILE_TYPE(cft->private); |
| 2729 | name = MEMFILE_ATTR(cft->private); | 2973 | name = MEMFILE_ATTR(cft->private); |
| 2730 | switch (type) { | 2974 | switch (type) { |
| 2731 | case _MEM: | 2975 | case _MEM: |
| 2732 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2976 | if (name == RES_USAGE) |
| 2733 | mem_cgroup_get_recursive_idx_stat(mem, | 2977 | val = mem_cgroup_usage(mem, false); |
| 2734 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2978 | else |
| 2735 | val = idx_val; | ||
| 2736 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2737 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 2738 | val += idx_val; | ||
| 2739 | val <<= PAGE_SHIFT; | ||
| 2740 | } else | ||
| 2741 | val = res_counter_read_u64(&mem->res, name); | 2979 | val = res_counter_read_u64(&mem->res, name); |
| 2742 | break; | 2980 | break; |
| 2743 | case _MEMSWAP: | 2981 | case _MEMSWAP: |
| 2744 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2982 | if (name == RES_USAGE) |
| 2745 | mem_cgroup_get_recursive_idx_stat(mem, | 2983 | val = mem_cgroup_usage(mem, true); |
| 2746 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2984 | else |
| 2747 | val = idx_val; | ||
| 2748 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2749 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 2750 | val += idx_val; | ||
| 2751 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2752 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
| 2753 | val += idx_val; | ||
| 2754 | val <<= PAGE_SHIFT; | ||
| 2755 | } else | ||
| 2756 | val = res_counter_read_u64(&mem->memsw, name); | 2985 | val = res_counter_read_u64(&mem->memsw, name); |
| 2757 | break; | 2986 | break; |
| 2758 | default: | 2987 | default: |
| @@ -2865,6 +3094,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
| 2865 | return 0; | 3094 | return 0; |
| 2866 | } | 3095 | } |
| 2867 | 3096 | ||
| 3097 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
| 3098 | struct cftype *cft) | ||
| 3099 | { | ||
| 3100 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
| 3101 | } | ||
| 3102 | |||
| 3103 | #ifdef CONFIG_MMU | ||
| 3104 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
| 3105 | struct cftype *cft, u64 val) | ||
| 3106 | { | ||
| 3107 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
| 3108 | |||
| 3109 | if (val >= (1 << NR_MOVE_TYPE)) | ||
| 3110 | return -EINVAL; | ||
| 3111 | /* | ||
| 3112 | * We check this value several times in both in can_attach() and | ||
| 3113 | * attach(), so we need cgroup lock to prevent this value from being | ||
| 3114 | * inconsistent. | ||
| 3115 | */ | ||
| 3116 | cgroup_lock(); | ||
| 3117 | mem->move_charge_at_immigrate = val; | ||
| 3118 | cgroup_unlock(); | ||
| 3119 | |||
| 3120 | return 0; | ||
| 3121 | } | ||
| 3122 | #else | ||
| 3123 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
| 3124 | struct cftype *cft, u64 val) | ||
| 3125 | { | ||
| 3126 | return -ENOSYS; | ||
| 3127 | } | ||
| 3128 | #endif | ||
| 3129 | |||
| 2868 | 3130 | ||
| 2869 | /* For read statistics */ | 3131 | /* For read statistics */ |
| 2870 | enum { | 3132 | enum { |
| @@ -2910,18 +3172,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
| 2910 | s64 val; | 3172 | s64 val; |
| 2911 | 3173 | ||
| 2912 | /* per cpu stat */ | 3174 | /* per cpu stat */ |
| 2913 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | 3175 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
| 2914 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3176 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
| 2915 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 3177 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
| 2916 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3178 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
| 2917 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); | 3179 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
| 2918 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3180 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
| 2919 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3181 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); |
| 2920 | s->stat[MCS_PGPGIN] += val; | 3182 | s->stat[MCS_PGPGIN] += val; |
| 2921 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3183 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
| 2922 | s->stat[MCS_PGPGOUT] += val; | 3184 | s->stat[MCS_PGPGOUT] += val; |
| 2923 | if (do_swap_account) { | 3185 | if (do_swap_account) { |
| 2924 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | 3186 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
| 2925 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3187 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
| 2926 | } | 3188 | } |
| 2927 | 3189 | ||
| @@ -3049,12 +3311,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
| 3049 | return 0; | 3311 | return 0; |
| 3050 | } | 3312 | } |
| 3051 | 3313 | ||
| 3314 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | ||
| 3315 | { | ||
| 3316 | struct mem_cgroup_threshold_ary *t; | ||
| 3317 | u64 usage; | ||
| 3318 | int i; | ||
| 3319 | |||
| 3320 | rcu_read_lock(); | ||
| 3321 | if (!swap) | ||
| 3322 | t = rcu_dereference(memcg->thresholds); | ||
| 3323 | else | ||
| 3324 | t = rcu_dereference(memcg->memsw_thresholds); | ||
| 3325 | |||
| 3326 | if (!t) | ||
| 3327 | goto unlock; | ||
| 3328 | |||
| 3329 | usage = mem_cgroup_usage(memcg, swap); | ||
| 3330 | |||
| 3331 | /* | ||
| 3332 | * current_threshold points to the threshold just below usage. | ||
| 3333 | * If that is not the case, a threshold was crossed after the last | ||
| 3334 | * call of __mem_cgroup_threshold(). | ||
| 3335 | */ | ||
| 3336 | i = atomic_read(&t->current_threshold); | ||
| 3337 | |||
| 3338 | /* | ||
| 3339 | * Iterate backward over array of thresholds starting from | ||
| 3340 | * current_threshold and check if a threshold is crossed. | ||
| 3341 | * If none of the thresholds below usage is crossed, we read | ||
| 3342 | * only one element of the array here. | ||
| 3343 | */ | ||
| 3344 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | ||
| 3345 | eventfd_signal(t->entries[i].eventfd, 1); | ||
| 3346 | |||
| 3347 | /* i = current_threshold + 1 */ | ||
| 3348 | i++; | ||
| 3349 | |||
| 3350 | /* | ||
| 3351 | * Iterate forward over array of thresholds starting from | ||
| 3352 | * current_threshold+1 and check if a threshold is crossed. | ||
| 3353 | * If none of the thresholds above usage is crossed, we read | ||
| 3354 | * only one element of the array here. | ||
| 3355 | */ | ||
| 3356 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | ||
| 3357 | eventfd_signal(t->entries[i].eventfd, 1); | ||
| 3358 | |||
| 3359 | /* Update current_threshold */ | ||
| 3360 | atomic_set(&t->current_threshold, i - 1); | ||
| 3361 | unlock: | ||
| 3362 | rcu_read_unlock(); | ||
| 3363 | } | ||
| 3364 | |||
| 3365 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | ||
| 3366 | { | ||
| 3367 | __mem_cgroup_threshold(memcg, false); | ||
| 3368 | if (do_swap_account) | ||
| 3369 | __mem_cgroup_threshold(memcg, true); | ||
| 3370 | } | ||
| 3371 | |||
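__mem_cgroup_threshold() above scans the sorted threshold array in both directions from current_threshold and signals the eventfd of every threshold the usage has crossed since the previous check, then records the new position. A compact userspace sketch of that scan, where printf() stands in for eventfd_signal() and the threshold values are invented:

#include <stdio.h>

static unsigned long long thresholds[] = { 10, 20, 30, 40 };
#define NTHRESH 4

static int current_threshold = 1;	/* index of largest threshold <= old usage */

static void check_thresholds(unsigned long long usage)
{
	int i = current_threshold;

	/* walk down: thresholds that usage dropped below since last time */
	for (; i >= 0 && thresholds[i] > usage; i--)
		printf("signal eventfd for threshold %llu\n", thresholds[i]);

	/* walk up: thresholds that usage climbed over since last time */
	for (i++; i < NTHRESH && thresholds[i] <= usage; i++)
		printf("signal eventfd for threshold %llu\n", thresholds[i]);

	current_threshold = i - 1;
}

int main(void)
{
	check_thresholds(35);	/* crosses 30: one signal */
	check_thresholds(5);	/* drops below 30, 20 and 10: three signals */
	return 0;
}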
| 3372 | static int compare_thresholds(const void *a, const void *b) | ||
| 3373 | { | ||
| 3374 | const struct mem_cgroup_threshold *_a = a; | ||
| 3375 | const struct mem_cgroup_threshold *_b = b; | ||
| 3376 | |||
| 3377 | return _a->threshold - _b->threshold; | ||
| 3378 | } | ||
| 3379 | |||
| 3380 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | ||
| 3381 | struct eventfd_ctx *eventfd, const char *args) | ||
| 3382 | { | ||
| 3383 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
| 3384 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
| 3385 | int type = MEMFILE_TYPE(cft->private); | ||
| 3386 | u64 threshold, usage; | ||
| 3387 | int size; | ||
| 3388 | int i, ret; | ||
| 3389 | |||
| 3390 | ret = res_counter_memparse_write_strategy(args, &threshold); | ||
| 3391 | if (ret) | ||
| 3392 | return ret; | ||
| 3393 | |||
| 3394 | mutex_lock(&memcg->thresholds_lock); | ||
| 3395 | if (type == _MEM) | ||
| 3396 | thresholds = memcg->thresholds; | ||
| 3397 | else if (type == _MEMSWAP) | ||
| 3398 | thresholds = memcg->memsw_thresholds; | ||
| 3399 | else | ||
| 3400 | BUG(); | ||
| 3401 | |||
| 3402 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
| 3403 | |||
| 3404 | /* Check if a threshold crossed before adding a new one */ | ||
| 3405 | if (thresholds) | ||
| 3406 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
| 3407 | |||
| 3408 | if (thresholds) | ||
| 3409 | size = thresholds->size + 1; | ||
| 3410 | else | ||
| 3411 | size = 1; | ||
| 3412 | |||
| 3413 | /* Allocate memory for new array of thresholds */ | ||
| 3414 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
| 3415 | size * sizeof(struct mem_cgroup_threshold), | ||
| 3416 | GFP_KERNEL); | ||
| 3417 | if (!thresholds_new) { | ||
| 3418 | ret = -ENOMEM; | ||
| 3419 | goto unlock; | ||
| 3420 | } | ||
| 3421 | thresholds_new->size = size; | ||
| 3422 | |||
| 3423 | /* Copy thresholds (if any) to new array */ | ||
| 3424 | if (thresholds) | ||
| 3425 | memcpy(thresholds_new->entries, thresholds->entries, | ||
| 3426 | thresholds->size * | ||
| 3427 | sizeof(struct mem_cgroup_threshold)); | ||
| 3428 | /* Add new threshold */ | ||
| 3429 | thresholds_new->entries[size - 1].eventfd = eventfd; | ||
| 3430 | thresholds_new->entries[size - 1].threshold = threshold; | ||
| 3431 | |||
| 3432 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | ||
| 3433 | sort(thresholds_new->entries, size, | ||
| 3434 | sizeof(struct mem_cgroup_threshold), | ||
| 3435 | compare_thresholds, NULL); | ||
| 3436 | |||
| 3437 | /* Find current threshold */ | ||
| 3438 | atomic_set(&thresholds_new->current_threshold, -1); | ||
| 3439 | for (i = 0; i < size; i++) { | ||
| 3440 | if (thresholds_new->entries[i].threshold < usage) { | ||
| 3441 | /* | ||
| 3442 | * thresholds_new->current_threshold will not be used | ||
| 3443 | * until rcu_assign_pointer(), so it's safe to increment | ||
| 3444 | * it here. | ||
| 3445 | */ | ||
| 3446 | atomic_inc(&thresholds_new->current_threshold); | ||
| 3447 | } | ||
| 3448 | } | ||
| 3449 | |||
| 3450 | if (type == _MEM) | ||
| 3451 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
| 3452 | else | ||
| 3453 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
| 3454 | |||
| 3455 | /* To be sure that nobody uses thresholds before freeing it */ | ||
| 3456 | synchronize_rcu(); | ||
| 3457 | |||
| 3458 | kfree(thresholds); | ||
| 3459 | unlock: | ||
| 3460 | mutex_unlock(&memcg->thresholds_lock); | ||
| 3461 | |||
| 3462 | return ret; | ||
| 3463 | } | ||
| 3464 | |||
| 3465 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | ||
| 3466 | struct eventfd_ctx *eventfd) | ||
| 3467 | { | ||
| 3468 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
| 3469 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
| 3470 | int type = MEMFILE_TYPE(cft->private); | ||
| 3471 | u64 usage; | ||
| 3472 | int size = 0; | ||
| 3473 | int i, j, ret; | ||
| 3474 | |||
| 3475 | mutex_lock(&memcg->thresholds_lock); | ||
| 3476 | if (type == _MEM) | ||
| 3477 | thresholds = memcg->thresholds; | ||
| 3478 | else if (type == _MEMSWAP) | ||
| 3479 | thresholds = memcg->memsw_thresholds; | ||
| 3480 | else | ||
| 3481 | BUG(); | ||
| 3482 | |||
| 3483 | /* | ||
| 3484 | * Something went wrong if we're trying to unregister a threshold | ||
| 3485 | * when we don't have any thresholds. | ||
| 3486 | */ | ||
| 3487 | BUG_ON(!thresholds); | ||
| 3488 | |||
| 3489 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
| 3490 | |||
| 3491 | /* Check if a threshold crossed before removing */ | ||
| 3492 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
| 3493 | |||
| 3494 | /* Calculate the new number of thresholds */ | ||
| 3495 | for (i = 0; i < thresholds->size; i++) { | ||
| 3496 | if (thresholds->entries[i].eventfd != eventfd) | ||
| 3497 | size++; | ||
| 3498 | } | ||
| 3499 | |||
| 3500 | /* Set thresholds array to NULL if we don't have thresholds */ | ||
| 3501 | if (!size) { | ||
| 3502 | thresholds_new = NULL; | ||
| 3503 | goto assign; | ||
| 3504 | } | ||
| 3505 | |||
| 3506 | /* Allocate memory for new array of thresholds */ | ||
| 3507 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
| 3508 | size * sizeof(struct mem_cgroup_threshold), | ||
| 3509 | GFP_KERNEL); | ||
| 3510 | if (!thresholds_new) { | ||
| 3511 | ret = -ENOMEM; | ||
| 3512 | goto unlock; | ||
| 3513 | } | ||
| 3514 | thresholds_new->size = size; | ||
| 3515 | |||
| 3516 | /* Copy thresholds and find current threshold */ | ||
| 3517 | atomic_set(&thresholds_new->current_threshold, -1); | ||
| 3518 | for (i = 0, j = 0; i < thresholds->size; i++) { | ||
| 3519 | if (thresholds->entries[i].eventfd == eventfd) | ||
| 3520 | continue; | ||
| 3521 | |||
| 3522 | thresholds_new->entries[j] = thresholds->entries[i]; | ||
| 3523 | if (thresholds_new->entries[j].threshold < usage) { | ||
| 3524 | /* | ||
| 3525 | * thresholds_new->current_threshold will not be used | ||
| 3526 | * until rcu_assign_pointer(), so it's safe to increment | ||
| 3527 | * it here. | ||
| 3528 | */ | ||
| 3529 | atomic_inc(&thresholds_new->current_threshold); | ||
| 3530 | } | ||
| 3531 | j++; | ||
| 3532 | } | ||
| 3533 | |||
| 3534 | assign: | ||
| 3535 | if (type == _MEM) | ||
| 3536 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
| 3537 | else | ||
| 3538 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
| 3539 | |||
| 3540 | /* To be sure that nobody uses thresholds before freeing it */ | ||
| 3541 | synchronize_rcu(); | ||
| 3542 | |||
| 3543 | kfree(thresholds); | ||
| 3544 | unlock: | ||
| 3545 | mutex_unlock(&memcg->thresholds_lock); | ||
| 3546 | |||
| 3547 | return ret; | ||
| 3548 | } | ||
| 3052 | 3549 | ||
| 3053 | static struct cftype mem_cgroup_files[] = { | 3550 | static struct cftype mem_cgroup_files[] = { |
| 3054 | { | 3551 | { |
| 3055 | .name = "usage_in_bytes", | 3552 | .name = "usage_in_bytes", |
| 3056 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3553 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
| 3057 | .read_u64 = mem_cgroup_read, | 3554 | .read_u64 = mem_cgroup_read, |
| 3555 | .register_event = mem_cgroup_register_event, | ||
| 3556 | .unregister_event = mem_cgroup_unregister_event, | ||
| 3058 | }, | 3557 | }, |
| 3059 | { | 3558 | { |
| 3060 | .name = "max_usage_in_bytes", | 3559 | .name = "max_usage_in_bytes", |
| @@ -3098,6 +3597,11 @@ static struct cftype mem_cgroup_files[] = { | |||
| 3098 | .read_u64 = mem_cgroup_swappiness_read, | 3597 | .read_u64 = mem_cgroup_swappiness_read, |
| 3099 | .write_u64 = mem_cgroup_swappiness_write, | 3598 | .write_u64 = mem_cgroup_swappiness_write, |
| 3100 | }, | 3599 | }, |
| 3600 | { | ||
| 3601 | .name = "move_charge_at_immigrate", | ||
| 3602 | .read_u64 = mem_cgroup_move_charge_read, | ||
| 3603 | .write_u64 = mem_cgroup_move_charge_write, | ||
| 3604 | }, | ||
| 3101 | }; | 3605 | }; |
| 3102 | 3606 | ||
| 3103 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3607 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
| @@ -3106,6 +3610,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
| 3106 | .name = "memsw.usage_in_bytes", | 3610 | .name = "memsw.usage_in_bytes", |
| 3107 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3611 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
| 3108 | .read_u64 = mem_cgroup_read, | 3612 | .read_u64 = mem_cgroup_read, |
| 3613 | .register_event = mem_cgroup_register_event, | ||
| 3614 | .unregister_event = mem_cgroup_unregister_event, | ||
| 3109 | }, | 3615 | }, |
| 3110 | { | 3616 | { |
| 3111 | .name = "memsw.max_usage_in_bytes", | 3617 | .name = "memsw.max_usage_in_bytes", |
| @@ -3180,24 +3686,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
| 3180 | kfree(mem->info.nodeinfo[node]); | 3686 | kfree(mem->info.nodeinfo[node]); |
| 3181 | } | 3687 | } |
| 3182 | 3688 | ||
| 3183 | static int mem_cgroup_size(void) | ||
| 3184 | { | ||
| 3185 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
| 3186 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
| 3187 | } | ||
| 3188 | |||
| 3189 | static struct mem_cgroup *mem_cgroup_alloc(void) | 3689 | static struct mem_cgroup *mem_cgroup_alloc(void) |
| 3190 | { | 3690 | { |
| 3191 | struct mem_cgroup *mem; | 3691 | struct mem_cgroup *mem; |
| 3192 | int size = mem_cgroup_size(); | 3692 | int size = sizeof(struct mem_cgroup); |
| 3193 | 3693 | ||
| 3694 | /* Can be very big if MAX_NUMNODES is very big */ | ||
| 3194 | if (size < PAGE_SIZE) | 3695 | if (size < PAGE_SIZE) |
| 3195 | mem = kmalloc(size, GFP_KERNEL); | 3696 | mem = kmalloc(size, GFP_KERNEL); |
| 3196 | else | 3697 | else |
| 3197 | mem = vmalloc(size); | 3698 | mem = vmalloc(size); |
| 3198 | 3699 | ||
| 3199 | if (mem) | 3700 | if (!mem) |
| 3200 | memset(mem, 0, size); | 3701 | return NULL; |
| 3702 | |||
| 3703 | memset(mem, 0, size); | ||
| 3704 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | ||
| 3705 | if (!mem->stat) { | ||
| 3706 | if (size < PAGE_SIZE) | ||
| 3707 | kfree(mem); | ||
| 3708 | else | ||
| 3709 | vfree(mem); | ||
| 3710 | mem = NULL; | ||
| 3711 | } | ||
| 3201 | return mem; | 3712 | return mem; |
| 3202 | } | 3713 | } |
| 3203 | 3714 | ||
| @@ -3222,7 +3733,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
| 3222 | for_each_node_state(node, N_POSSIBLE) | 3733 | for_each_node_state(node, N_POSSIBLE) |
| 3223 | free_mem_cgroup_per_zone_info(mem, node); | 3734 | free_mem_cgroup_per_zone_info(mem, node); |
| 3224 | 3735 | ||
| 3225 | if (mem_cgroup_size() < PAGE_SIZE) | 3736 | free_percpu(mem->stat); |
| 3737 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | ||
| 3226 | kfree(mem); | 3738 | kfree(mem); |
| 3227 | else | 3739 | else |
| 3228 | vfree(mem); | 3740 | vfree(mem); |
| @@ -3233,9 +3745,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) | |||
| 3233 | atomic_inc(&mem->refcnt); | 3745 | atomic_inc(&mem->refcnt); |
| 3234 | } | 3746 | } |
| 3235 | 3747 | ||
| 3236 | static void mem_cgroup_put(struct mem_cgroup *mem) | 3748 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
| 3237 | { | 3749 | { |
| 3238 | if (atomic_dec_and_test(&mem->refcnt)) { | 3750 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
| 3239 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 3751 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
| 3240 | __mem_cgroup_free(mem); | 3752 | __mem_cgroup_free(mem); |
| 3241 | if (parent) | 3753 | if (parent) |
| @@ -3243,6 +3755,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) | |||
| 3243 | } | 3755 | } |
| 3244 | } | 3756 | } |
| 3245 | 3757 | ||
| 3758 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
| 3759 | { | ||
| 3760 | __mem_cgroup_put(mem, 1); | ||
| 3761 | } | ||
| 3762 | |||
| 3246 | /* | 3763 | /* |
| 3247 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 3764 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
| 3248 | */ | 3765 | */ |
| @@ -3319,7 +3836,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 3319 | INIT_WORK(&stock->work, drain_local_stock); | 3836 | INIT_WORK(&stock->work, drain_local_stock); |
| 3320 | } | 3837 | } |
| 3321 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | 3838 | hotcpu_notifier(memcg_stock_cpu_callback, 0); |
| 3322 | |||
| 3323 | } else { | 3839 | } else { |
| 3324 | parent = mem_cgroup_from_cont(cont->parent); | 3840 | parent = mem_cgroup_from_cont(cont->parent); |
| 3325 | mem->use_hierarchy = parent->use_hierarchy; | 3841 | mem->use_hierarchy = parent->use_hierarchy; |
| @@ -3345,6 +3861,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 3345 | if (parent) | 3861 | if (parent) |
| 3346 | mem->swappiness = get_swappiness(parent); | 3862 | mem->swappiness = get_swappiness(parent); |
| 3347 | atomic_set(&mem->refcnt, 1); | 3863 | atomic_set(&mem->refcnt, 1); |
| 3864 | mem->move_charge_at_immigrate = 0; | ||
| 3865 | mutex_init(&mem->thresholds_lock); | ||
| 3348 | return &mem->css; | 3866 | return &mem->css; |
| 3349 | free_out: | 3867 | free_out: |
| 3350 | __mem_cgroup_free(mem); | 3868 | __mem_cgroup_free(mem); |
| @@ -3381,17 +3899,450 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
| 3381 | return ret; | 3899 | return ret; |
| 3382 | } | 3900 | } |
| 3383 | 3901 | ||
| 3902 | #ifdef CONFIG_MMU | ||
| 3903 | /* Handlers for move charge at task migration. */ | ||
| 3904 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
| 3905 | static int mem_cgroup_do_precharge(unsigned long count) | ||
| 3906 | { | ||
| 3907 | int ret = 0; | ||
| 3908 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
| 3909 | struct mem_cgroup *mem = mc.to; | ||
| 3910 | |||
| 3911 | if (mem_cgroup_is_root(mem)) { | ||
| 3912 | mc.precharge += count; | ||
| 3913 | /* we don't need css_get for root */ | ||
| 3914 | return ret; | ||
| 3915 | } | ||
| 3916 | /* try to charge at once */ | ||
| 3917 | if (count > 1) { | ||
| 3918 | struct res_counter *dummy; | ||
| 3919 | /* | ||
| 3920 | * "mem" cannot be under rmdir() because we've already checked | ||
| 3921 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
| 3922 | * are still under the same cgroup_mutex. So we can postpone | ||
| 3923 | * css_get(). | ||
| 3924 | */ | ||
| 3925 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | ||
| 3926 | goto one_by_one; | ||
| 3927 | if (do_swap_account && res_counter_charge(&mem->memsw, | ||
| 3928 | PAGE_SIZE * count, &dummy)) { | ||
| 3929 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
| 3930 | goto one_by_one; | ||
| 3931 | } | ||
| 3932 | mc.precharge += count; | ||
| 3933 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
| 3934 | WARN_ON_ONCE(count > INT_MAX); | ||
| 3935 | __css_get(&mem->css, (int)count); | ||
| 3936 | return ret; | ||
| 3937 | } | ||
| 3938 | one_by_one: | ||
| 3939 | /* fall back to one by one charge */ | ||
| 3940 | while (count--) { | ||
| 3941 | if (signal_pending(current)) { | ||
| 3942 | ret = -EINTR; | ||
| 3943 | break; | ||
| 3944 | } | ||
| 3945 | if (!batch_count--) { | ||
| 3946 | batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
| 3947 | cond_resched(); | ||
| 3948 | } | ||
| 3949 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | ||
| 3950 | if (ret || !mem) | ||
| 3951 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
| 3952 | return -ENOMEM; | ||
| 3953 | mc.precharge++; | ||
| 3954 | } | ||
| 3955 | return ret; | ||
| 3956 | } | ||
| 3957 | |||
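The precharge helper above tries one bulk res_counter charge for the whole batch and falls back to page-by-page charging only when the bulk attempt fails, so it stops exactly where the limit bites. A stripped-down sketch of that batch-then-fallback shape, with a plain integer budget standing in for the res_counter (names illustrative):

    static long reserved;
    static const long limit = 1024;

    /* Try to reserve 'units' against the budget; 0 on success. */
    static int reserve(long units)
    {
        if (reserved + units > limit)
            return -1;
        reserved += units;
        return 0;
    }

    /* Reserve up to 'count' units: one cheap batch attempt first,
     * then one-by-one so we stop exactly at the point of failure. */
    static long precharge(long count)
    {
        long got;

        if (count > 1 && reserve(count) == 0)
            return count;
        for (got = 0; got < count; got++)
            if (reserve(1) != 0)
                break;
        return got;
    }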
| 3958 | /** | ||
| 3959 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | ||
| 3960 | * @vma: the vma the pte to be checked belongs | ||
| 3961 | * @addr: the address corresponding to the pte to be checked | ||
| 3962 | * @ptent: the pte to be checked | ||
| 3963 | * @target: the pointer where the target page or swap entry will be stored (can be NULL) | ||
| 3964 | * | ||
| 3965 | * Returns | ||
| 3966 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | ||
| 3967 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | ||
| 3968 | * move charge. If @target is not NULL, the page is stored in target->page | ||
| 3969 | * with an extra refcount taken (callers must handle it). | ||
| 3970 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
| 3971 | * target for charge migration. If @target is not NULL, the entry is stored | ||
| 3972 | * in target->ent. | ||
| 3973 | * | ||
| 3974 | * Called with pte lock held. | ||
| 3975 | */ | ||
| 3976 | union mc_target { | ||
| 3977 | struct page *page; | ||
| 3978 | swp_entry_t ent; | ||
| 3979 | }; | ||
| 3980 | |||
| 3981 | enum mc_target_type { | ||
| 3982 | MC_TARGET_NONE, /* not used */ | ||
| 3983 | MC_TARGET_PAGE, | ||
| 3984 | MC_TARGET_SWAP, | ||
| 3985 | }; | ||
| 3986 | |||
| 3987 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | ||
| 3988 | unsigned long addr, pte_t ptent, union mc_target *target) | ||
| 3989 | { | ||
| 3990 | struct page *page = NULL; | ||
| 3991 | struct page_cgroup *pc; | ||
| 3992 | int ret = 0; | ||
| 3993 | swp_entry_t ent = { .val = 0 }; | ||
| 3994 | int usage_count = 0; | ||
| 3995 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
| 3996 | &mc.to->move_charge_at_immigrate); | ||
| 3997 | |||
| 3998 | if (!pte_present(ptent)) { | ||
| 3999 | /* TODO: handle swap of shmem/tmpfs */ | ||
| 4000 | if (pte_none(ptent) || pte_file(ptent)) | ||
| 4001 | return 0; | ||
| 4002 | else if (is_swap_pte(ptent)) { | ||
| 4003 | ent = pte_to_swp_entry(ptent); | ||
| 4004 | if (!move_anon || non_swap_entry(ent)) | ||
| 4005 | return 0; | ||
| 4006 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
| 4007 | } | ||
| 4008 | } else { | ||
| 4009 | page = vm_normal_page(vma, addr, ptent); | ||
| 4010 | if (!page || !page_mapped(page)) | ||
| 4011 | return 0; | ||
| 4012 | /* | ||
| 4013 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
| 4014 | * pages for now. | ||
| 4015 | */ | ||
| 4016 | if (!move_anon || !PageAnon(page)) | ||
| 4017 | return 0; | ||
| 4018 | if (!get_page_unless_zero(page)) | ||
| 4019 | return 0; | ||
| 4020 | usage_count = page_mapcount(page); | ||
| 4021 | } | ||
| 4022 | if (usage_count > 1) { | ||
| 4023 | /* | ||
| 4024 | * TODO: We don't move charges of shared(used by multiple | ||
| 4025 | * processes) pages for now. | ||
| 4026 | */ | ||
| 4027 | if (page) | ||
| 4028 | put_page(page); | ||
| 4029 | return 0; | ||
| 4030 | } | ||
| 4031 | if (page) { | ||
| 4032 | pc = lookup_page_cgroup(page); | ||
| 4033 | /* | ||
| 4034 | * Do only loose check w/o page_cgroup lock. | ||
| 4035 | * mem_cgroup_move_account() checks the pc is valid or not under | ||
| 4036 | * the lock. | ||
| 4037 | */ | ||
| 4038 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
| 4039 | ret = MC_TARGET_PAGE; | ||
| 4040 | if (target) | ||
| 4041 | target->page = page; | ||
| 4042 | } | ||
| 4043 | if (!ret || !target) | ||
| 4044 | put_page(page); | ||
| 4045 | } | ||
| 4046 | /* fall through */ | ||
| 4047 | if (ent.val && do_swap_account && !ret) { | ||
| 4048 | unsigned short id; | ||
| 4049 | rcu_read_lock(); | ||
| 4050 | id = css_id(&mc.from->css); | ||
| 4051 | rcu_read_unlock(); | ||
| 4052 | if (id == lookup_swap_cgroup(ent)) { | ||
| 4053 | ret = MC_TARGET_SWAP; | ||
| 4054 | if (target) | ||
| 4055 | target->ent = ent; | ||
| 4056 | } | ||
| 4057 | } | ||
| 4058 | return ret; | ||
| 4059 | } | ||
| 4060 | |||
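Note the calling convention: the classification comes back as the return value, and the object itself is handed out through the union only when the caller passes a non-NULL target, which is how the counting pass below can stay cheap. A compact illustration of that count-then-act convention (purely illustrative, not kernel code):

    #include <stddef.h>

    enum target_type { TARGET_NONE, TARGET_PAGE, TARGET_SWAP };

    union target {
        void *page;
        unsigned long swap_ent;
    };

    /* Classify one entry; fill *t only when the caller asked for it. */
    static enum target_type classify(unsigned long entry, union target *t)
    {
        if (entry == 0)
            return TARGET_NONE;
        if (entry & 1) {                  /* pretend odd entries are swap */
            if (t)
                t->swap_ent = entry;
            return TARGET_SWAP;
        }
        if (t)
            t->page = (void *)entry;
        return TARGET_PAGE;
    }

    /* First pass: count targets without taking any references. */
    static int count_targets(const unsigned long *e, int n)
    {
        int count = 0;

        for (int i = 0; i < n; i++)
            if (classify(e[i], NULL) != TARGET_NONE)
                count++;
        return count;
    }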
| 4061 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | ||
| 4062 | unsigned long addr, unsigned long end, | ||
| 4063 | struct mm_walk *walk) | ||
| 4064 | { | ||
| 4065 | struct vm_area_struct *vma = walk->private; | ||
| 4066 | pte_t *pte; | ||
| 4067 | spinlock_t *ptl; | ||
| 4068 | |||
| 4069 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
| 4070 | for (; addr != end; pte++, addr += PAGE_SIZE) | ||
| 4071 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | ||
| 4072 | mc.precharge++; /* increment precharge temporarily */ | ||
| 4073 | pte_unmap_unlock(pte - 1, ptl); | ||
| 4074 | cond_resched(); | ||
| 4075 | |||
| 4076 | return 0; | ||
| 4077 | } | ||
| 4078 | |||
| 4079 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | ||
| 4080 | { | ||
| 4081 | unsigned long precharge; | ||
| 4082 | struct vm_area_struct *vma; | ||
| 4083 | |||
| 4084 | down_read(&mm->mmap_sem); | ||
| 4085 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 4086 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
| 4087 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
| 4088 | .mm = mm, | ||
| 4089 | .private = vma, | ||
| 4090 | }; | ||
| 4091 | if (is_vm_hugetlb_page(vma)) | ||
| 4092 | continue; | ||
| 4093 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
| 4094 | if (vma->vm_flags & VM_SHARED) | ||
| 4095 | continue; | ||
| 4096 | walk_page_range(vma->vm_start, vma->vm_end, | ||
| 4097 | &mem_cgroup_count_precharge_walk); | ||
| 4098 | } | ||
| 4099 | up_read(&mm->mmap_sem); | ||
| 4100 | |||
| 4101 | precharge = mc.precharge; | ||
| 4102 | mc.precharge = 0; | ||
| 4103 | |||
| 4104 | return precharge; | ||
| 4105 | } | ||
| 4106 | |||
| 4107 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | ||
| 4108 | { | ||
| 4109 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | ||
| 4110 | } | ||
| 4111 | |||
| 4112 | static void mem_cgroup_clear_mc(void) | ||
| 4113 | { | ||
| 4114 | /* we must uncharge all the leftover precharges from mc.to */ | ||
| 4115 | if (mc.precharge) { | ||
| 4116 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | ||
| 4117 | mc.precharge = 0; | ||
| 4118 | } | ||
| 4119 | /* | ||
| 4120 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | ||
| 4121 | * we must uncharge here. | ||
| 4122 | */ | ||
| 4123 | if (mc.moved_charge) { | ||
| 4124 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | ||
| 4125 | mc.moved_charge = 0; | ||
| 4126 | } | ||
| 4127 | /* we must fixup refcnts and charges */ | ||
| 4128 | if (mc.moved_swap) { | ||
| 4129 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
| 4130 | /* uncharge swap account from the old cgroup */ | ||
| 4131 | if (!mem_cgroup_is_root(mc.from)) | ||
| 4132 | res_counter_uncharge(&mc.from->memsw, | ||
| 4133 | PAGE_SIZE * mc.moved_swap); | ||
| 4134 | __mem_cgroup_put(mc.from, mc.moved_swap); | ||
| 4135 | |||
| 4136 | if (!mem_cgroup_is_root(mc.to)) { | ||
| 4137 | /* | ||
| 4138 | * we charged both to->res and to->memsw, so we should | ||
| 4139 | * uncharge to->res. | ||
| 4140 | */ | ||
| 4141 | res_counter_uncharge(&mc.to->res, | ||
| 4142 | PAGE_SIZE * mc.moved_swap); | ||
| 4143 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
| 4144 | __css_put(&mc.to->css, mc.moved_swap); | ||
| 4145 | } | ||
| 4146 | /* we've already done mem_cgroup_get(mc.to) */ | ||
| 4147 | |||
| 4148 | mc.moved_swap = 0; | ||
| 4149 | } | ||
| 4150 | mc.from = NULL; | ||
| 4151 | mc.to = NULL; | ||
| 4152 | mc.moving_task = NULL; | ||
| 4153 | wake_up_all(&mc.waitq); | ||
| 4154 | } | ||
| 4155 | |||
| 4156 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
| 4157 | struct cgroup *cgroup, | ||
| 4158 | struct task_struct *p, | ||
| 4159 | bool threadgroup) | ||
| 4160 | { | ||
| 4161 | int ret = 0; | ||
| 4162 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
| 4163 | |||
| 4164 | if (mem->move_charge_at_immigrate) { | ||
| 4165 | struct mm_struct *mm; | ||
| 4166 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
| 4167 | |||
| 4168 | VM_BUG_ON(from == mem); | ||
| 4169 | |||
| 4170 | mm = get_task_mm(p); | ||
| 4171 | if (!mm) | ||
| 4172 | return 0; | ||
| 4173 | /* We move charges only when we move the owner of the mm */ | ||
| 4174 | if (mm->owner == p) { | ||
| 4175 | VM_BUG_ON(mc.from); | ||
| 4176 | VM_BUG_ON(mc.to); | ||
| 4177 | VM_BUG_ON(mc.precharge); | ||
| 4178 | VM_BUG_ON(mc.moved_charge); | ||
| 4179 | VM_BUG_ON(mc.moved_swap); | ||
| 4180 | VM_BUG_ON(mc.moving_task); | ||
| 4181 | mc.from = from; | ||
| 4182 | mc.to = mem; | ||
| 4183 | mc.precharge = 0; | ||
| 4184 | mc.moved_charge = 0; | ||
| 4185 | mc.moved_swap = 0; | ||
| 4186 | mc.moving_task = current; | ||
| 4187 | |||
| 4188 | ret = mem_cgroup_precharge_mc(mm); | ||
| 4189 | if (ret) | ||
| 4190 | mem_cgroup_clear_mc(); | ||
| 4191 | } | ||
| 4192 | mmput(mm); | ||
| 4193 | } | ||
| 4194 | return ret; | ||
| 4195 | } | ||
| 4196 | |||
| 4197 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
| 4198 | struct cgroup *cgroup, | ||
| 4199 | struct task_struct *p, | ||
| 4200 | bool threadgroup) | ||
| 4201 | { | ||
| 4202 | mem_cgroup_clear_mc(); | ||
| 4203 | } | ||
| 4204 | |||
| 4205 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | ||
| 4206 | unsigned long addr, unsigned long end, | ||
| 4207 | struct mm_walk *walk) | ||
| 4208 | { | ||
| 4209 | int ret = 0; | ||
| 4210 | struct vm_area_struct *vma = walk->private; | ||
| 4211 | pte_t *pte; | ||
| 4212 | spinlock_t *ptl; | ||
| 4213 | |||
| 4214 | retry: | ||
| 4215 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
| 4216 | for (; addr != end; addr += PAGE_SIZE) { | ||
| 4217 | pte_t ptent = *(pte++); | ||
| 4218 | union mc_target target; | ||
| 4219 | int type; | ||
| 4220 | struct page *page; | ||
| 4221 | struct page_cgroup *pc; | ||
| 4222 | swp_entry_t ent; | ||
| 4223 | |||
| 4224 | if (!mc.precharge) | ||
| 4225 | break; | ||
| 4226 | |||
| 4227 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | ||
| 4228 | switch (type) { | ||
| 4229 | case MC_TARGET_PAGE: | ||
| 4230 | page = target.page; | ||
| 4231 | if (isolate_lru_page(page)) | ||
| 4232 | goto put; | ||
| 4233 | pc = lookup_page_cgroup(page); | ||
| 4234 | if (!mem_cgroup_move_account(pc, | ||
| 4235 | mc.from, mc.to, false)) { | ||
| 4236 | mc.precharge--; | ||
| 4237 | /* we uncharge from mc.from later. */ | ||
| 4238 | mc.moved_charge++; | ||
| 4239 | } | ||
| 4240 | putback_lru_page(page); | ||
| 4241 | put: /* is_target_pte_for_mc() gets the page */ | ||
| 4242 | put_page(page); | ||
| 4243 | break; | ||
| 4244 | case MC_TARGET_SWAP: | ||
| 4245 | ent = target.ent; | ||
| 4246 | if (!mem_cgroup_move_swap_account(ent, | ||
| 4247 | mc.from, mc.to, false)) { | ||
| 4248 | mc.precharge--; | ||
| 4249 | /* we fixup refcnts and charges later. */ | ||
| 4250 | mc.moved_swap++; | ||
| 4251 | } | ||
| 4252 | break; | ||
| 4253 | default: | ||
| 4254 | break; | ||
| 4255 | } | ||
| 4256 | } | ||
| 4257 | pte_unmap_unlock(pte - 1, ptl); | ||
| 4258 | cond_resched(); | ||
| 4259 | |||
| 4260 | if (addr != end) { | ||
| 4261 | /* | ||
| 4262 | * We have consumed all precharges we got in can_attach(). | ||
| 4263 | * We try charging one by one, but we don't do any additional | ||
| 4264 | * charges to mc.to if we have already failed to charge once in | ||
| 4265 | * the attach() phase. | ||
| 4266 | */ | ||
| 4267 | ret = mem_cgroup_do_precharge(1); | ||
| 4268 | if (!ret) | ||
| 4269 | goto retry; | ||
| 4270 | } | ||
| 4271 | |||
| 4272 | return ret; | ||
| 4273 | } | ||
| 4274 | |||
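The retry label implements a consume-then-refill loop: each moved entry spends one precharge, and when the prepaid pool runs dry mid-range the walker buys a single extra charge and resumes at the same address, abandoning only if that purchase fails. The same control flow as a self-contained sketch (names illustrative):

    static long credit;     /* mc.precharge analogue */
    static long budget = 8; /* stand-in for the memcg limit */

    static int buy_one_credit(void)
    {
        if (budget == 0)
            return -1;
        budget--;
        credit++;
        return 0;
    }

    static int process_range(int *item, int n)
    {
        int i = 0;

    retry:
        for (; i < n; i++) {
            if (!credit)
                break;          /* out of prepaid credit */
            credit--;
            item[i] = 0;        /* "move" this entry */
        }
        if (i < n) {
            if (buy_one_credit() == 0)
                goto retry;     /* resume where we stopped */
            return -1;          /* cannot charge further: abandon */
        }
        return 0;
    }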
| 4275 | static void mem_cgroup_move_charge(struct mm_struct *mm) | ||
| 4276 | { | ||
| 4277 | struct vm_area_struct *vma; | ||
| 4278 | |||
| 4279 | lru_add_drain_all(); | ||
| 4280 | down_read(&mm->mmap_sem); | ||
| 4281 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 4282 | int ret; | ||
| 4283 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
| 4284 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
| 4285 | .mm = mm, | ||
| 4286 | .private = vma, | ||
| 4287 | }; | ||
| 4288 | if (is_vm_hugetlb_page(vma)) | ||
| 4289 | continue; | ||
| 4290 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
| 4291 | if (vma->vm_flags & VM_SHARED) | ||
| 4292 | continue; | ||
| 4293 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
| 4294 | &mem_cgroup_move_charge_walk); | ||
| 4295 | if (ret) | ||
| 4296 | /* | ||
| 4297 | * This means we have consumed all precharges and failed | ||
| 4298 | * to do an additional charge. Just abandon here. | ||
| 4299 | */ | ||
| 4300 | break; | ||
| 4301 | } | ||
| 4302 | up_read(&mm->mmap_sem); | ||
| 4303 | } | ||
| 4304 | |||
| 3384 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4305 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
| 3385 | struct cgroup *cont, | 4306 | struct cgroup *cont, |
| 3386 | struct cgroup *old_cont, | 4307 | struct cgroup *old_cont, |
| 3387 | struct task_struct *p, | 4308 | struct task_struct *p, |
| 3388 | bool threadgroup) | 4309 | bool threadgroup) |
| 3389 | { | 4310 | { |
| 3390 | /* | 4311 | struct mm_struct *mm; |
| 3391 | * FIXME: It's better to move charges of this process from old | 4312 | |
| 3392 | * memcg to new memcg. But it's just on TODO-List now. | 4313 | if (!mc.to) |
| 3393 | */ | 4314 | /* no need to move charge */ |
| 4315 | return; | ||
| 4316 | |||
| 4317 | mm = get_task_mm(p); | ||
| 4318 | if (mm) { | ||
| 4319 | mem_cgroup_move_charge(mm); | ||
| 4320 | mmput(mm); | ||
| 4321 | } | ||
| 4322 | mem_cgroup_clear_mc(); | ||
| 4323 | } | ||
| 4324 | #else /* !CONFIG_MMU */ | ||
| 4325 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
| 4326 | struct cgroup *cgroup, | ||
| 4327 | struct task_struct *p, | ||
| 4328 | bool threadgroup) | ||
| 4329 | { | ||
| 4330 | return 0; | ||
| 4331 | } | ||
| 4332 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
| 4333 | struct cgroup *cgroup, | ||
| 4334 | struct task_struct *p, | ||
| 4335 | bool threadgroup) | ||
| 4336 | { | ||
| 3394 | } | 4337 | } |
| 4338 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
| 4339 | struct cgroup *cont, | ||
| 4340 | struct cgroup *old_cont, | ||
| 4341 | struct task_struct *p, | ||
| 4342 | bool threadgroup) | ||
| 4343 | { | ||
| 4344 | } | ||
| 4345 | #endif | ||
| 3395 | 4346 | ||
| 3396 | struct cgroup_subsys mem_cgroup_subsys = { | 4347 | struct cgroup_subsys mem_cgroup_subsys = { |
| 3397 | .name = "memory", | 4348 | .name = "memory", |
| @@ -3400,6 +4351,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
| 3400 | .pre_destroy = mem_cgroup_pre_destroy, | 4351 | .pre_destroy = mem_cgroup_pre_destroy, |
| 3401 | .destroy = mem_cgroup_destroy, | 4352 | .destroy = mem_cgroup_destroy, |
| 3402 | .populate = mem_cgroup_populate, | 4353 | .populate = mem_cgroup_populate, |
| 4354 | .can_attach = mem_cgroup_can_attach, | ||
| 4355 | .cancel_attach = mem_cgroup_cancel_attach, | ||
| 3403 | .attach = mem_cgroup_move_task, | 4356 | .attach = mem_cgroup_move_task, |
| 3404 | .early_init = 0, | 4357 | .early_init = 0, |
| 3405 | .use_id = 1, | 4358 | .use_id = 1, |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 17299fd4577c..620b0b461593 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -44,6 +44,7 @@ | |||
| 44 | #include <linux/migrate.h> | 44 | #include <linux/migrate.h> |
| 45 | #include <linux/page-isolation.h> | 45 | #include <linux/page-isolation.h> |
| 46 | #include <linux/suspend.h> | 46 | #include <linux/suspend.h> |
| 47 | #include <linux/slab.h> | ||
| 47 | #include "internal.h" | 48 | #include "internal.h" |
| 48 | 49 | ||
| 49 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 50 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
| @@ -383,9 +384,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
| 383 | if (av == NULL) /* Not actually mapped anymore */ | 384 | if (av == NULL) /* Not actually mapped anymore */ |
| 384 | goto out; | 385 | goto out; |
| 385 | for_each_process (tsk) { | 386 | for_each_process (tsk) { |
| 387 | struct anon_vma_chain *vmac; | ||
| 388 | |||
| 386 | if (!task_early_kill(tsk)) | 389 | if (!task_early_kill(tsk)) |
| 387 | continue; | 390 | continue; |
| 388 | list_for_each_entry (vma, &av->head, anon_vma_node) { | 391 | list_for_each_entry(vmac, &av->head, same_anon_vma) { |
| 392 | vma = vmac->vma; | ||
| 389 | if (!page_mapped_in_vma(page, vma)) | 393 | if (!page_mapped_in_vma(page, vma)) |
| 390 | continue; | 394 | continue; |
| 391 | if (vma->vm_mm == tsk->mm) | 395 | if (vma->vm_mm == tsk->mm) |
diff --git a/mm/memory.c b/mm/memory.c index 09e4b1be7b67..833952d8b74d 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -56,6 +56,7 @@ | |||
| 56 | #include <linux/kallsyms.h> | 56 | #include <linux/kallsyms.h> |
| 57 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
| 58 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
| 59 | #include <linux/gfp.h> | ||
| 59 | 60 | ||
| 60 | #include <asm/io.h> | 61 | #include <asm/io.h> |
| 61 | #include <asm/pgalloc.h> | 62 | #include <asm/pgalloc.h> |
| @@ -121,6 +122,77 @@ static int __init init_zero_pfn(void) | |||
| 121 | } | 122 | } |
| 122 | core_initcall(init_zero_pfn); | 123 | core_initcall(init_zero_pfn); |
| 123 | 124 | ||
| 125 | |||
| 126 | #if defined(SPLIT_RSS_COUNTING) | ||
| 127 | |||
| 128 | static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | ||
| 129 | { | ||
| 130 | int i; | ||
| 131 | |||
| 132 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
| 133 | if (task->rss_stat.count[i]) { | ||
| 134 | add_mm_counter(mm, i, task->rss_stat.count[i]); | ||
| 135 | task->rss_stat.count[i] = 0; | ||
| 136 | } | ||
| 137 | } | ||
| 138 | task->rss_stat.events = 0; | ||
| 139 | } | ||
| 140 | |||
| 141 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | ||
| 142 | { | ||
| 143 | struct task_struct *task = current; | ||
| 144 | |||
| 145 | if (likely(task->mm == mm)) | ||
| 146 | task->rss_stat.count[member] += val; | ||
| 147 | else | ||
| 148 | add_mm_counter(mm, member, val); | ||
| 149 | } | ||
| 150 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) | ||
| 151 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) | ||
| 152 | |||
| 153 | /* sync counter once per 64 page faults */ | ||
| 154 | #define TASK_RSS_EVENTS_THRESH (64) | ||
| 155 | static void check_sync_rss_stat(struct task_struct *task) | ||
| 156 | { | ||
| 157 | if (unlikely(task != current)) | ||
| 158 | return; | ||
| 159 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | ||
| 160 | __sync_task_rss_stat(task, task->mm); | ||
| 161 | } | ||
| 162 | |||
| 163 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
| 164 | { | ||
| 165 | long val = 0; | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Don't use task->mm here, to avoid having to use get_task_mm(). | ||
| 169 | * The caller must guarantee that task->mm is valid. | ||
| 170 | */ | ||
| 171 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
| 172 | /* | ||
| 173 | * The counter is updated asynchronously and may temporarily go | ||
| 174 | * negative, which is never a value users expect to see. | ||
| 175 | */ | ||
| 176 | if (val < 0) | ||
| 177 | return 0; | ||
| 178 | return (unsigned long)val; | ||
| 179 | } | ||
| 180 | |||
| 181 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
| 182 | { | ||
| 183 | __sync_task_rss_stat(task, mm); | ||
| 184 | } | ||
| 185 | #else | ||
| 186 | |||
| 187 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | ||
| 188 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | ||
| 189 | |||
| 190 | static void check_sync_rss_stat(struct task_struct *task) | ||
| 191 | { | ||
| 192 | } | ||
| 193 | |||
| 194 | #endif | ||
| 195 | |||
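SPLIT_RSS_COUNTING trades exactness for scalability: each task accumulates RSS deltas privately and folds them into the shared mm counter only once every TASK_RSS_EVENTS_THRESH events, which is why get_mm_counter() above must tolerate (and clamp) transiently negative totals. A compact C11 sketch of the cache-and-flush pattern, with thread-local state standing in for task->rss_stat (names illustrative):

    #include <stdatomic.h>

    #define EVENTS_THRESH 64

    static atomic_long shared_rss;           /* mm->rss_stat analogue */
    static _Thread_local long cached_rss;    /* task->rss_stat analogue */
    static _Thread_local int events;

    static void rss_add(long pages)
    {
        cached_rss += pages;                 /* hot path: thread-private only */
        if (++events > EVENTS_THRESH) {
            atomic_fetch_add(&shared_rss, cached_rss);
            cached_rss = 0;
            events = 0;
        }
    }

    static long rss_read(void)
    {
        long val = atomic_load(&shared_rss);

        return val < 0 ? 0 : val;            /* clamp, as get_mm_counter() does */
    }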
| 124 | /* | 196 | /* |
| 125 | * If a p?d_bad entry is found while walking page tables, report | 197 | * If a p?d_bad entry is found while walking page tables, report |
| 126 | * the error, before resetting entry to p?d_none. Usually (but | 198 | * the error, before resetting entry to p?d_none. Usually (but |
| @@ -300,7 +372,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 300 | * Hide vma from rmap and truncate_pagecache before freeing | 372 | * Hide vma from rmap and truncate_pagecache before freeing |
| 301 | * pgtables | 373 | * pgtables |
| 302 | */ | 374 | */ |
| 303 | anon_vma_unlink(vma); | 375 | unlink_anon_vmas(vma); |
| 304 | unlink_file_vma(vma); | 376 | unlink_file_vma(vma); |
| 305 | 377 | ||
| 306 | if (is_vm_hugetlb_page(vma)) { | 378 | if (is_vm_hugetlb_page(vma)) { |
| @@ -314,7 +386,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 314 | && !is_vm_hugetlb_page(next)) { | 386 | && !is_vm_hugetlb_page(next)) { |
| 315 | vma = next; | 387 | vma = next; |
| 316 | next = vma->vm_next; | 388 | next = vma->vm_next; |
| 317 | anon_vma_unlink(vma); | 389 | unlink_anon_vmas(vma); |
| 318 | unlink_file_vma(vma); | 390 | unlink_file_vma(vma); |
| 319 | } | 391 | } |
| 320 | free_pgd_range(tlb, addr, vma->vm_end, | 392 | free_pgd_range(tlb, addr, vma->vm_end, |
| @@ -376,12 +448,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
| 376 | return 0; | 448 | return 0; |
| 377 | } | 449 | } |
| 378 | 450 | ||
| 379 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | 451 | static inline void init_rss_vec(int *rss) |
| 380 | { | 452 | { |
| 381 | if (file_rss) | 453 | memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); |
| 382 | add_mm_counter(mm, file_rss, file_rss); | 454 | } |
| 383 | if (anon_rss) | 455 | |
| 384 | add_mm_counter(mm, anon_rss, anon_rss); | 456 | static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) |
| 457 | { | ||
| 458 | int i; | ||
| 459 | |||
| 460 | if (current->mm == mm) | ||
| 461 | sync_mm_rss(current, mm); | ||
| 462 | for (i = 0; i < NR_MM_COUNTERS; i++) | ||
| 463 | if (rss[i]) | ||
| 464 | add_mm_counter(mm, i, rss[i]); | ||
| 385 | } | 465 | } |
| 386 | 466 | ||
| 387 | /* | 467 | /* |
| @@ -430,12 +510,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
| 430 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | 510 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", |
| 431 | current->comm, | 511 | current->comm, |
| 432 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 512 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
| 433 | if (page) { | 513 | if (page) |
| 434 | printk(KERN_ALERT | 514 | dump_page(page); |
| 435 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
| 436 | page, (void *)page->flags, page_count(page), | ||
| 437 | page_mapcount(page), page->mapping, page->index); | ||
| 438 | } | ||
| 439 | printk(KERN_ALERT | 515 | printk(KERN_ALERT |
| 440 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 516 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", |
| 441 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 517 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
| @@ -597,7 +673,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 597 | &src_mm->mmlist); | 673 | &src_mm->mmlist); |
| 598 | spin_unlock(&mmlist_lock); | 674 | spin_unlock(&mmlist_lock); |
| 599 | } | 675 | } |
| 600 | if (is_write_migration_entry(entry) && | 676 | if (likely(!non_swap_entry(entry))) |
| 677 | rss[MM_SWAPENTS]++; | ||
| 678 | else if (is_write_migration_entry(entry) && | ||
| 601 | is_cow_mapping(vm_flags)) { | 679 | is_cow_mapping(vm_flags)) { |
| 602 | /* | 680 | /* |
| 603 | * COW mappings require pages in both parent | 681 | * COW mappings require pages in both parent |
| @@ -632,7 +710,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 632 | if (page) { | 710 | if (page) { |
| 633 | get_page(page); | 711 | get_page(page); |
| 634 | page_dup_rmap(page); | 712 | page_dup_rmap(page); |
| 635 | rss[PageAnon(page)]++; | 713 | if (PageAnon(page)) |
| 714 | rss[MM_ANONPAGES]++; | ||
| 715 | else | ||
| 716 | rss[MM_FILEPAGES]++; | ||
| 636 | } | 717 | } |
| 637 | 718 | ||
| 638 | out_set_pte: | 719 | out_set_pte: |
| @@ -648,11 +729,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 648 | pte_t *src_pte, *dst_pte; | 729 | pte_t *src_pte, *dst_pte; |
| 649 | spinlock_t *src_ptl, *dst_ptl; | 730 | spinlock_t *src_ptl, *dst_ptl; |
| 650 | int progress = 0; | 731 | int progress = 0; |
| 651 | int rss[2]; | 732 | int rss[NR_MM_COUNTERS]; |
| 652 | swp_entry_t entry = (swp_entry_t){0}; | 733 | swp_entry_t entry = (swp_entry_t){0}; |
| 653 | 734 | ||
| 654 | again: | 735 | again: |
| 655 | rss[1] = rss[0] = 0; | 736 | init_rss_vec(rss); |
| 737 | |||
| 656 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 738 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
| 657 | if (!dst_pte) | 739 | if (!dst_pte) |
| 658 | return -ENOMEM; | 740 | return -ENOMEM; |
| @@ -688,7 +770,7 @@ again: | |||
| 688 | arch_leave_lazy_mmu_mode(); | 770 | arch_leave_lazy_mmu_mode(); |
| 689 | spin_unlock(src_ptl); | 771 | spin_unlock(src_ptl); |
| 690 | pte_unmap_nested(orig_src_pte); | 772 | pte_unmap_nested(orig_src_pte); |
| 691 | add_mm_rss(dst_mm, rss[0], rss[1]); | 773 | add_mm_rss_vec(dst_mm, rss); |
| 692 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 774 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
| 693 | cond_resched(); | 775 | cond_resched(); |
| 694 | 776 | ||
| @@ -816,8 +898,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 816 | struct mm_struct *mm = tlb->mm; | 898 | struct mm_struct *mm = tlb->mm; |
| 817 | pte_t *pte; | 899 | pte_t *pte; |
| 818 | spinlock_t *ptl; | 900 | spinlock_t *ptl; |
| 819 | int file_rss = 0; | 901 | int rss[NR_MM_COUNTERS]; |
| 820 | int anon_rss = 0; | 902 | |
| 903 | init_rss_vec(rss); | ||
| 821 | 904 | ||
| 822 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 905 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
| 823 | arch_enter_lazy_mmu_mode(); | 906 | arch_enter_lazy_mmu_mode(); |
| @@ -863,14 +946,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 863 | set_pte_at(mm, addr, pte, | 946 | set_pte_at(mm, addr, pte, |
| 864 | pgoff_to_pte(page->index)); | 947 | pgoff_to_pte(page->index)); |
| 865 | if (PageAnon(page)) | 948 | if (PageAnon(page)) |
| 866 | anon_rss--; | 949 | rss[MM_ANONPAGES]--; |
| 867 | else { | 950 | else { |
| 868 | if (pte_dirty(ptent)) | 951 | if (pte_dirty(ptent)) |
| 869 | set_page_dirty(page); | 952 | set_page_dirty(page); |
| 870 | if (pte_young(ptent) && | 953 | if (pte_young(ptent) && |
| 871 | likely(!VM_SequentialReadHint(vma))) | 954 | likely(!VM_SequentialReadHint(vma))) |
| 872 | mark_page_accessed(page); | 955 | mark_page_accessed(page); |
| 873 | file_rss--; | 956 | rss[MM_FILEPAGES]--; |
| 874 | } | 957 | } |
| 875 | page_remove_rmap(page); | 958 | page_remove_rmap(page); |
| 876 | if (unlikely(page_mapcount(page) < 0)) | 959 | if (unlikely(page_mapcount(page) < 0)) |
| @@ -887,13 +970,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 887 | if (pte_file(ptent)) { | 970 | if (pte_file(ptent)) { |
| 888 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | 971 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
| 889 | print_bad_pte(vma, addr, ptent, NULL); | 972 | print_bad_pte(vma, addr, ptent, NULL); |
| 890 | } else if | 973 | } else { |
| 891 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | 974 | swp_entry_t entry = pte_to_swp_entry(ptent); |
| 892 | print_bad_pte(vma, addr, ptent, NULL); | 975 | |
| 976 | if (!non_swap_entry(entry)) | ||
| 977 | rss[MM_SWAPENTS]--; | ||
| 978 | if (unlikely(!free_swap_and_cache(entry))) | ||
| 979 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 980 | } | ||
| 893 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 981 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
| 894 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 982 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
| 895 | 983 | ||
| 896 | add_mm_rss(mm, file_rss, anon_rss); | 984 | add_mm_rss_vec(mm, rss); |
| 897 | arch_leave_lazy_mmu_mode(); | 985 | arch_leave_lazy_mmu_mode(); |
| 898 | pte_unmap_unlock(pte - 1, ptl); | 986 | pte_unmap_unlock(pte - 1, ptl); |
| 899 | 987 | ||
| @@ -1527,7 +1615,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 1527 | 1615 | ||
| 1528 | /* Ok, finally just insert the thing.. */ | 1616 | /* Ok, finally just insert the thing.. */ |
| 1529 | get_page(page); | 1617 | get_page(page); |
| 1530 | inc_mm_counter(mm, file_rss); | 1618 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
| 1531 | page_add_file_rmap(page); | 1619 | page_add_file_rmap(page); |
| 1532 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1620 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
| 1533 | 1621 | ||
| @@ -1593,7 +1681,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
| 1593 | /* Ok, finally just insert the thing.. */ | 1681 | /* Ok, finally just insert the thing.. */ |
| 1594 | entry = pte_mkspecial(pfn_pte(pfn, prot)); | 1682 | entry = pte_mkspecial(pfn_pte(pfn, prot)); |
| 1595 | set_pte_at(mm, addr, pte, entry); | 1683 | set_pte_at(mm, addr, pte, entry); |
| 1596 | update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ | 1684 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
| 1597 | 1685 | ||
| 1598 | retval = 0; | 1686 | retval = 0; |
| 1599 | out_unlock: | 1687 | out_unlock: |
| @@ -2044,6 +2132,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2044 | page_cache_release(old_page); | 2132 | page_cache_release(old_page); |
| 2045 | } | 2133 | } |
| 2046 | reuse = reuse_swap_page(old_page); | 2134 | reuse = reuse_swap_page(old_page); |
| 2135 | if (reuse) | ||
| 2136 | /* | ||
| 2137 | * The page is all ours. Move it to our anon_vma so | ||
| 2138 | * the rmap code will not search our parent or siblings. | ||
| 2139 | * Protected against the rmap code by the page lock. | ||
| 2140 | */ | ||
| 2141 | page_move_anon_rmap(old_page, vma, address); | ||
| 2047 | unlock_page(old_page); | 2142 | unlock_page(old_page); |
| 2048 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2143 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2049 | (VM_WRITE|VM_SHARED))) { | 2144 | (VM_WRITE|VM_SHARED))) { |
| @@ -2116,7 +2211,7 @@ reuse: | |||
| 2116 | entry = pte_mkyoung(orig_pte); | 2211 | entry = pte_mkyoung(orig_pte); |
| 2117 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2212 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2118 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2213 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
| 2119 | update_mmu_cache(vma, address, entry); | 2214 | update_mmu_cache(vma, address, page_table); |
| 2120 | ret |= VM_FAULT_WRITE; | 2215 | ret |= VM_FAULT_WRITE; |
| 2121 | goto unlock; | 2216 | goto unlock; |
| 2122 | } | 2217 | } |
| @@ -2163,11 +2258,11 @@ gotten: | |||
| 2163 | if (likely(pte_same(*page_table, orig_pte))) { | 2258 | if (likely(pte_same(*page_table, orig_pte))) { |
| 2164 | if (old_page) { | 2259 | if (old_page) { |
| 2165 | if (!PageAnon(old_page)) { | 2260 | if (!PageAnon(old_page)) { |
| 2166 | dec_mm_counter(mm, file_rss); | 2261 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
| 2167 | inc_mm_counter(mm, anon_rss); | 2262 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2168 | } | 2263 | } |
| 2169 | } else | 2264 | } else |
| 2170 | inc_mm_counter(mm, anon_rss); | 2265 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2171 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2266 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 2172 | entry = mk_pte(new_page, vma->vm_page_prot); | 2267 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 2173 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2268 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| @@ -2185,7 +2280,7 @@ gotten: | |||
| 2185 | * new page to be mapped directly into the secondary page table. | 2280 | * new page to be mapped directly into the secondary page table. |
| 2186 | */ | 2281 | */ |
| 2187 | set_pte_at_notify(mm, address, page_table, entry); | 2282 | set_pte_at_notify(mm, address, page_table, entry); |
| 2188 | update_mmu_cache(vma, address, entry); | 2283 | update_mmu_cache(vma, address, page_table); |
| 2189 | if (old_page) { | 2284 | if (old_page) { |
| 2190 | /* | 2285 | /* |
| 2191 | * Only after switching the pte to the new page may | 2286 | * Only after switching the pte to the new page may |
| @@ -2604,7 +2699,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2604 | * discarded at swap_free(). | 2699 | * discarded at swap_free(). |
| 2605 | */ | 2700 | */ |
| 2606 | 2701 | ||
| 2607 | inc_mm_counter(mm, anon_rss); | 2702 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2703 | dec_mm_counter_fast(mm, MM_SWAPENTS); | ||
| 2608 | pte = mk_pte(page, vma->vm_page_prot); | 2704 | pte = mk_pte(page, vma->vm_page_prot); |
| 2609 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { | 2705 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
| 2610 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2706 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
| @@ -2629,7 +2725,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2629 | } | 2725 | } |
| 2630 | 2726 | ||
| 2631 | /* No need to invalidate - it was non-present before */ | 2727 | /* No need to invalidate - it was non-present before */ |
| 2632 | update_mmu_cache(vma, address, pte); | 2728 | update_mmu_cache(vma, address, page_table); |
| 2633 | unlock: | 2729 | unlock: |
| 2634 | pte_unmap_unlock(page_table, ptl); | 2730 | pte_unmap_unlock(page_table, ptl); |
| 2635 | out: | 2731 | out: |
| @@ -2688,13 +2784,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2688 | if (!pte_none(*page_table)) | 2784 | if (!pte_none(*page_table)) |
| 2689 | goto release; | 2785 | goto release; |
| 2690 | 2786 | ||
| 2691 | inc_mm_counter(mm, anon_rss); | 2787 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2692 | page_add_new_anon_rmap(page, vma, address); | 2788 | page_add_new_anon_rmap(page, vma, address); |
| 2693 | setpte: | 2789 | setpte: |
| 2694 | set_pte_at(mm, address, page_table, entry); | 2790 | set_pte_at(mm, address, page_table, entry); |
| 2695 | 2791 | ||
| 2696 | /* No need to invalidate - it was non-present before */ | 2792 | /* No need to invalidate - it was non-present before */ |
| 2697 | update_mmu_cache(vma, address, entry); | 2793 | update_mmu_cache(vma, address, page_table); |
| 2698 | unlock: | 2794 | unlock: |
| 2699 | pte_unmap_unlock(page_table, ptl); | 2795 | pte_unmap_unlock(page_table, ptl); |
| 2700 | return 0; | 2796 | return 0; |
| @@ -2842,10 +2938,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2842 | if (flags & FAULT_FLAG_WRITE) | 2938 | if (flags & FAULT_FLAG_WRITE) |
| 2843 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2939 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2844 | if (anon) { | 2940 | if (anon) { |
| 2845 | inc_mm_counter(mm, anon_rss); | 2941 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2846 | page_add_new_anon_rmap(page, vma, address); | 2942 | page_add_new_anon_rmap(page, vma, address); |
| 2847 | } else { | 2943 | } else { |
| 2848 | inc_mm_counter(mm, file_rss); | 2944 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
| 2849 | page_add_file_rmap(page); | 2945 | page_add_file_rmap(page); |
| 2850 | if (flags & FAULT_FLAG_WRITE) { | 2946 | if (flags & FAULT_FLAG_WRITE) { |
| 2851 | dirty_page = page; | 2947 | dirty_page = page; |
| @@ -2855,7 +2951,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2855 | set_pte_at(mm, address, page_table, entry); | 2951 | set_pte_at(mm, address, page_table, entry); |
| 2856 | 2952 | ||
| 2857 | /* no need to invalidate: a not-present page won't be cached */ | 2953 | /* no need to invalidate: a not-present page won't be cached */ |
| 2858 | update_mmu_cache(vma, address, entry); | 2954 | update_mmu_cache(vma, address, page_table); |
| 2859 | } else { | 2955 | } else { |
| 2860 | if (charged) | 2956 | if (charged) |
| 2861 | mem_cgroup_uncharge_page(page); | 2957 | mem_cgroup_uncharge_page(page); |
| @@ -2992,7 +3088,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2992 | } | 3088 | } |
| 2993 | entry = pte_mkyoung(entry); | 3089 | entry = pte_mkyoung(entry); |
| 2994 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3090 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
| 2995 | update_mmu_cache(vma, address, entry); | 3091 | update_mmu_cache(vma, address, pte); |
| 2996 | } else { | 3092 | } else { |
| 2997 | /* | 3093 | /* |
| 2998 | * This is needed only for protection faults but the arch code | 3094 | * This is needed only for protection faults but the arch code |
| @@ -3023,6 +3119,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3023 | 3119 | ||
| 3024 | count_vm_event(PGFAULT); | 3120 | count_vm_event(PGFAULT); |
| 3025 | 3121 | ||
| 3122 | /* do counter updates before entering really critical section. */ | ||
| 3123 | check_sync_rss_stat(current); | ||
| 3124 | |||
| 3026 | if (unlikely(is_vm_hugetlb_page(vma))) | 3125 | if (unlikely(is_vm_hugetlb_page(vma))) |
| 3027 | return hugetlb_fault(mm, vma, address, flags); | 3126 | return hugetlb_fault(mm, vma, address, flags); |
| 3028 | 3127 | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 030ce8a5bb0e..be211a582930 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
| 29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
| 30 | #include <linux/mm_inline.h> | 30 | #include <linux/mm_inline.h> |
| 31 | #include <linux/firmware-map.h> | ||
| 31 | 32 | ||
| 32 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
| 33 | 34 | ||
| @@ -523,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 523 | BUG_ON(ret); | 524 | BUG_ON(ret); |
| 524 | } | 525 | } |
| 525 | 526 | ||
| 527 | /* create new memmap entry */ | ||
| 528 | firmware_map_add_hotplug(start, start + size, "System RAM"); | ||
| 529 | |||
| 526 | goto out; | 530 | goto out; |
| 527 | 531 | ||
| 528 | error: | 532 | error: |
| @@ -684,9 +688,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
| 684 | if (page_count(page)) | 688 | if (page_count(page)) |
| 685 | not_managed++; | 689 | not_managed++; |
| 686 | #ifdef CONFIG_DEBUG_VM | 690 | #ifdef CONFIG_DEBUG_VM |
| 687 | printk(KERN_INFO "removing from LRU failed" | 691 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", |
| 688 | " %lx/%d/%lx\n", | 692 | pfn); |
| 689 | pfn, page_count(page), page->flags); | 693 | dump_page(page); |
| 690 | #endif | 694 | #endif |
| 691 | } | 695 | } |
| 692 | } | 696 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 290fb5bf0440..08f40a2f3fe0 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -73,7 +73,6 @@ | |||
| 73 | #include <linux/sched.h> | 73 | #include <linux/sched.h> |
| 74 | #include <linux/nodemask.h> | 74 | #include <linux/nodemask.h> |
| 75 | #include <linux/cpuset.h> | 75 | #include <linux/cpuset.h> |
| 76 | #include <linux/gfp.h> | ||
| 77 | #include <linux/slab.h> | 76 | #include <linux/slab.h> |
| 78 | #include <linux/string.h> | 77 | #include <linux/string.h> |
| 79 | #include <linux/module.h> | 78 | #include <linux/module.h> |
| @@ -563,24 +562,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | |||
| 563 | } | 562 | } |
| 564 | 563 | ||
| 565 | /* Step 2: apply policy to a range and do splits. */ | 564 | /* Step 2: apply policy to a range and do splits. */ |
| 566 | static int mbind_range(struct vm_area_struct *vma, unsigned long start, | 565 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
| 567 | unsigned long end, struct mempolicy *new) | 566 | unsigned long end, struct mempolicy *new_pol) |
| 568 | { | 567 | { |
| 569 | struct vm_area_struct *next; | 568 | struct vm_area_struct *next; |
| 570 | int err; | 569 | struct vm_area_struct *prev; |
| 570 | struct vm_area_struct *vma; | ||
| 571 | int err = 0; | ||
| 572 | pgoff_t pgoff; | ||
| 573 | unsigned long vmstart; | ||
| 574 | unsigned long vmend; | ||
| 571 | 575 | ||
| 572 | err = 0; | 576 | vma = find_vma_prev(mm, start, &prev); |
| 573 | for (; vma && vma->vm_start < end; vma = next) { | 577 | if (!vma || vma->vm_start > start) |
| 578 | return -EFAULT; | ||
| 579 | |||
| 580 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { | ||
| 574 | next = vma->vm_next; | 581 | next = vma->vm_next; |
| 575 | if (vma->vm_start < start) | 582 | vmstart = max(start, vma->vm_start); |
| 576 | err = split_vma(vma->vm_mm, vma, start, 1); | 583 | vmend = min(end, vma->vm_end); |
| 577 | if (!err && vma->vm_end > end) | 584 | |
| 578 | err = split_vma(vma->vm_mm, vma, end, 0); | 585 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
| 579 | if (!err) | 586 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
| 580 | err = policy_vma(vma, new); | 587 | vma->anon_vma, vma->vm_file, pgoff, new_pol); |
| 588 | if (prev) { | ||
| 589 | vma = prev; | ||
| 590 | next = vma->vm_next; | ||
| 591 | continue; | ||
| 592 | } | ||
| 593 | if (vma->vm_start != vmstart) { | ||
| 594 | err = split_vma(vma->vm_mm, vma, vmstart, 1); | ||
| 595 | if (err) | ||
| 596 | goto out; | ||
| 597 | } | ||
| 598 | if (vma->vm_end != vmend) { | ||
| 599 | err = split_vma(vma->vm_mm, vma, vmend, 0); | ||
| 600 | if (err) | ||
| 601 | goto out; | ||
| 602 | } | ||
| 603 | err = policy_vma(vma, new_pol); | ||
| 581 | if (err) | 604 | if (err) |
| 582 | break; | 605 | goto out; |
| 583 | } | 606 | } |
| 607 | |||
| 608 | out: | ||
| 584 | return err; | 609 | return err; |
| 585 | } | 610 | } |
| 586 | 611 | ||
| @@ -780,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 780 | 805 | ||
| 781 | err = 0; | 806 | err = 0; |
| 782 | if (nmask) { | 807 | if (nmask) { |
| 783 | task_lock(current); | 808 | if (mpol_store_user_nodemask(pol)) { |
| 784 | get_policy_nodemask(pol, nmask); | 809 | *nmask = pol->w.user_nodemask; |
| 785 | task_unlock(current); | 810 | } else { |
| 811 | task_lock(current); | ||
| 812 | get_policy_nodemask(pol, nmask); | ||
| 813 | task_unlock(current); | ||
| 814 | } | ||
| 786 | } | 815 | } |
| 787 | 816 | ||
| 788 | out: | 817 | out: |
| @@ -862,36 +891,36 @@ int do_migrate_pages(struct mm_struct *mm, | |||
| 862 | if (err) | 891 | if (err) |
| 863 | goto out; | 892 | goto out; |
| 864 | 893 | ||
| 865 | /* | 894 | /* |
| 866 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 895 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
| 867 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 896 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
| 868 | * bit in 'tmp', and return that <source, dest> pair for migration. | 897 | * bit in 'tmp', and return that <source, dest> pair for migration. |
| 869 | * The pair of nodemasks 'to' and 'from' define the map. | 898 | * The pair of nodemasks 'to' and 'from' define the map. |
| 870 | * | 899 | * |
| 871 | * If no pair of bits is found that way, fallback to picking some | 900 | * If no pair of bits is found that way, fallback to picking some |
| 872 | * pair of 'source' and 'dest' bits that are not the same. If the | 901 | * pair of 'source' and 'dest' bits that are not the same. If the |
| 873 | * 'source' and 'dest' bits are the same, this represents a node | 902 | * 'source' and 'dest' bits are the same, this represents a node |
| 874 | * that will be migrating to itself, so no pages need move. | 903 | * that will be migrating to itself, so no pages need move. |
| 875 | * | 904 | * |
| 876 | * If no bits are left in 'tmp', or if all remaining bits left | 905 | * If no bits are left in 'tmp', or if all remaining bits left |
| 877 | * in 'tmp' correspond to the same bit in 'to', return false | 906 | * in 'tmp' correspond to the same bit in 'to', return false |
| 878 | * (nothing left to migrate). | 907 | * (nothing left to migrate). |
| 879 | * | 908 | * |
| 880 | * This lets us pick a pair of nodes to migrate between, such that | 909 | * This lets us pick a pair of nodes to migrate between, such that |
| 881 | * if possible the dest node is not already occupied by some other | 910 | * if possible the dest node is not already occupied by some other |
| 882 | * source node, minimizing the risk of overloading the memory on a | 911 | * source node, minimizing the risk of overloading the memory on a |
| 884 | * node, which would happen if we migrated incoming memory to a node | 913 | * node, which would happen if we migrated incoming memory to a node |
| 885 | * before migrating the outgoing memory sourced from that same node. | 914 | * before migrating the outgoing memory sourced from that same node. |
| 885 | * | 914 | * |
| 886 | * A single scan of tmp is sufficient. As we go, we remember the | 915 | * A single scan of tmp is sufficient. As we go, we remember the |
| 887 | * most recent <s, d> pair that moved (s != d). If we find a pair | 916 | * most recent <s, d> pair that moved (s != d). If we find a pair |
| 888 | * that not only moved, but what's better, moved to an empty slot | 917 | * that not only moved, but what's better, moved to an empty slot |
| 890 | * (d is not set in tmp), then we break out immediately with that pair. | 919 | * (d is not set in tmp), then we break out immediately with that pair. |
| 891 | * Otherwise, when we finish scanning tmp, we at least have the | 920 | * Otherwise, when we finish scanning tmp, we at least have the |
| 891 | * most recent <s, d> pair that moved. If we get all the way through | 920 | * most recent <s, d> pair that moved. If we get all the way through |
| 892 | * the scan of tmp without finding any node that moved, much less | 921 | * the scan of tmp without finding any node that moved, much less |
| 893 | * moved to an empty node, then there is nothing left worth migrating. | 922 | * moved to an empty node, then there is nothing left worth migrating. |
| 894 | */ | 923 | */ |
| 895 | 924 | ||
| 896 | tmp = *from_nodes; | 925 | tmp = *from_nodes; |
| 897 | while (!nodes_empty(tmp)) { | 926 | while (!nodes_empty(tmp)) { |
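Concretely, the pairing described above maps the i-th node of 'from' to the i-th node of 'to' (wrapping when 'to' has fewer nodes), with the loop preferring destinations that are no longer pending in tmp. A tiny userspace illustration of just the static mapping, using bitmasks for node sets and omitting the empty-slot preference (assumes GCC/Clang for __builtin_popcount; not kernel code):

    #include <stdio.h>

    /* Return the bit index of the n-th set bit in mask, or -1. */
    static int nth_set_bit(unsigned mask, int n)
    {
        for (int b = 0; b < 32; b++)
            if ((mask & (1u << b)) && n-- == 0)
                return b;
        return -1;
    }

    int main(void)
    {
        unsigned from = 0x03;  /* migrate off nodes {0, 1} */
        unsigned to = 0x0c;    /* ...onto nodes {2, 3} */
        int nto = __builtin_popcount(to);

        for (int b = 0, i = 0; b < 32; b++) {
            if (!(from & (1u << b)))
                continue;
            printf("node %d -> node %d\n", b, nth_set_bit(to, i % nto));
            i++;
        }
        return 0;   /* prints 0 -> 2 and 1 -> 3 */
    }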
| @@ -1047,7 +1076,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 1047 | if (!IS_ERR(vma)) { | 1076 | if (!IS_ERR(vma)) { |
| 1048 | int nr_failed = 0; | 1077 | int nr_failed = 0; |
| 1049 | 1078 | ||
| 1050 | err = mbind_range(vma, start, end, new); | 1079 | err = mbind_range(mm, start, end, new); |
| 1051 | 1080 | ||
| 1052 | if (!list_empty(&pagelist)) | 1081 | if (!list_empty(&pagelist)) |
| 1053 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1082 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
| @@ -1730,10 +1759,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) | |||
| 1730 | 1759 | ||
| 1731 | if (!new) | 1760 | if (!new) |
| 1732 | return ERR_PTR(-ENOMEM); | 1761 | return ERR_PTR(-ENOMEM); |
| 1762 | rcu_read_lock(); | ||
| 1733 | if (current_cpuset_is_being_rebound()) { | 1763 | if (current_cpuset_is_being_rebound()) { |
| 1734 | nodemask_t mems = cpuset_mems_allowed(current); | 1764 | nodemask_t mems = cpuset_mems_allowed(current); |
| 1735 | mpol_rebind_policy(old, &mems); | 1765 | mpol_rebind_policy(old, &mems); |
| 1736 | } | 1766 | } |
| 1767 | rcu_read_unlock(); | ||
| 1737 | *new = *old; | 1768 | *new = *old; |
| 1738 | atomic_set(&new->refcnt, 1); | 1769 | atomic_set(&new->refcnt, 1); |
| 1739 | return new; | 1770 | return new; |
| @@ -2167,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2167 | char *rest = nodelist; | 2198 | char *rest = nodelist; |
| 2168 | while (isdigit(*rest)) | 2199 | while (isdigit(*rest)) |
| 2169 | rest++; | 2200 | rest++; |
| 2170 | if (!*rest) | 2201 | if (*rest) |
| 2171 | err = 0; | 2202 | goto out; |
| 2172 | } | 2203 | } |
| 2173 | break; | 2204 | break; |
| 2174 | case MPOL_INTERLEAVE: | 2205 | case MPOL_INTERLEAVE: |
| @@ -2177,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2177 | */ | 2208 | */ |
| 2178 | if (!nodelist) | 2209 | if (!nodelist) |
| 2179 | nodes = node_states[N_HIGH_MEMORY]; | 2210 | nodes = node_states[N_HIGH_MEMORY]; |
| 2180 | err = 0; | ||
| 2181 | break; | 2211 | break; |
| 2182 | case MPOL_LOCAL: | 2212 | case MPOL_LOCAL: |
| 2183 | /* | 2213 | /* |
| @@ -2187,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2187 | goto out; | 2217 | goto out; |
| 2188 | mode = MPOL_PREFERRED; | 2218 | mode = MPOL_PREFERRED; |
| 2189 | break; | 2219 | break; |
| 2190 | 2220 | case MPOL_DEFAULT: | |
| 2191 | /* | 2221 | /* |
| 2192 | * case MPOL_BIND: mpol_new() enforces non-empty nodemask. | 2222 | * Insist on an empty nodelist |
| 2193 | * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. | 2223 | */ |
| 2194 | */ | 2224 | if (!nodelist) |
| 2225 | err = 0; | ||
| 2226 | goto out; | ||
| 2227 | case MPOL_BIND: | ||
| 2228 | /* | ||
| 2229 | * Insist on a nodelist | ||
| 2230 | */ | ||
| 2231 | if (!nodelist) | ||
| 2232 | goto out; | ||
| 2195 | } | 2233 | } |
| 2196 | 2234 | ||
| 2197 | mode_flags = 0; | 2235 | mode_flags = 0; |
| @@ -2205,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2205 | else if (!strcmp(flags, "relative")) | 2243 | else if (!strcmp(flags, "relative")) |
| 2206 | mode_flags |= MPOL_F_RELATIVE_NODES; | 2244 | mode_flags |= MPOL_F_RELATIVE_NODES; |
| 2207 | else | 2245 | else |
| 2208 | err = 1; | 2246 | goto out; |
| 2209 | } | 2247 | } |
| 2210 | 2248 | ||
| 2211 | new = mpol_new(mode, mode_flags, &nodes); | 2249 | new = mpol_new(mode, mode_flags, &nodes); |
| 2212 | if (IS_ERR(new)) | 2250 | if (IS_ERR(new)) |
| 2213 | err = 1; | 2251 | goto out; |
| 2214 | else { | 2252 | |
| 2253 | { | ||
| 2215 | int ret; | 2254 | int ret; |
| 2216 | NODEMASK_SCRATCH(scratch); | 2255 | NODEMASK_SCRATCH(scratch); |
| 2217 | if (scratch) { | 2256 | if (scratch) { |
| @@ -2222,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2222 | ret = -ENOMEM; | 2261 | ret = -ENOMEM; |
| 2223 | NODEMASK_SCRATCH_FREE(scratch); | 2262 | NODEMASK_SCRATCH_FREE(scratch); |
| 2224 | if (ret) { | 2263 | if (ret) { |
| 2225 | err = 1; | ||
| 2226 | mpol_put(new); | 2264 | mpol_put(new); |
| 2227 | } else if (no_context) { | 2265 | goto out; |
| 2228 | /* save for contextualization */ | ||
| 2229 | new->w.user_nodemask = nodes; | ||
| 2230 | } | 2266 | } |
| 2231 | } | 2267 | } |
| 2268 | err = 0; | ||
| 2269 | if (no_context) { | ||
| 2270 | /* save for contextualization */ | ||
| 2271 | new->w.user_nodemask = nodes; | ||
| 2272 | } | ||
| 2232 | 2273 | ||
| 2233 | out: | 2274 | out: |
| 2234 | /* Restore string for error message */ | 2275 | /* Restore string for error message */ |
diff --git a/mm/migrate.c b/mm/migrate.c index 880bd592d38e..d3f3f7f81075 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -32,6 +32,7 @@ | |||
| 32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
| 33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
| 34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
| 35 | #include <linux/gfp.h> | ||
| 35 | 36 | ||
| 36 | #include "internal.h" | 37 | #include "internal.h" |
| 37 | 38 | ||
| @@ -134,7 +135,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
| 134 | page_add_file_rmap(new); | 135 | page_add_file_rmap(new); |
| 135 | 136 | ||
| 136 | /* No need to invalidate - it was non-present before */ | 137 | /* No need to invalidate - it was non-present before */ |
| 137 | update_mmu_cache(vma, addr, pte); | 138 | update_mmu_cache(vma, addr, ptep); |
| 138 | unlock: | 139 | unlock: |
| 139 | pte_unmap_unlock(ptep, ptl); | 140 | pte_unmap_unlock(ptep, ptl); |
| 140 | out: | 141 | out: |
| @@ -275,8 +276,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 275 | */ | 276 | */ |
| 276 | static void migrate_page_copy(struct page *newpage, struct page *page) | 277 | static void migrate_page_copy(struct page *newpage, struct page *page) |
| 277 | { | 278 | { |
| 278 | int anon; | ||
| 279 | |||
| 280 | copy_highpage(newpage, page); | 279 | copy_highpage(newpage, page); |
| 281 | 280 | ||
| 282 | if (PageError(page)) | 281 | if (PageError(page)) |
| @@ -313,8 +312,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 313 | ClearPageSwapCache(page); | 312 | ClearPageSwapCache(page); |
| 314 | ClearPagePrivate(page); | 313 | ClearPagePrivate(page); |
| 315 | set_page_private(page, 0); | 314 | set_page_private(page, 0); |
| 316 | /* page->mapping contains a flag for PageAnon() */ | ||
| 317 | anon = PageAnon(page); | ||
| 318 | page->mapping = NULL; | 315 | page->mapping = NULL; |
| 319 | 316 | ||
| 320 | /* | 317 | /* |
diff --git a/mm/mincore.c b/mm/mincore.c index 7a3436ef39eb..f77433c20279 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -7,8 +7,8 @@ | |||
| 7 | /* | 7 | /* |
| 8 | * The mincore() system call. | 8 | * The mincore() system call. |
| 9 | */ | 9 | */ |
| 10 | #include <linux/slab.h> | ||
| 11 | #include <linux/pagemap.h> | 10 | #include <linux/pagemap.h> |
| 11 | #include <linux/gfp.h> | ||
| 12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 13 | #include <linux/mman.h> | 13 | #include <linux/mman.h> |
| 14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
diff --git a/mm/mlock.c b/mm/mlock.c index 2b8335a89400..8f4e2dfceec1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -25,7 +25,7 @@ int can_do_mlock(void) | |||
| 25 | { | 25 | { |
| 26 | if (capable(CAP_IPC_LOCK)) | 26 | if (capable(CAP_IPC_LOCK)) |
| 27 | return 1; | 27 | return 1; |
| 28 | if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) | 28 | if (rlimit(RLIMIT_MEMLOCK) != 0) |
| 29 | return 1; | 29 | return 1; |
| 30 | return 0; | 30 | return 0; |
| 31 | } | 31 | } |
| @@ -487,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
| 487 | locked = len >> PAGE_SHIFT; | 487 | locked = len >> PAGE_SHIFT; |
| 488 | locked += current->mm->locked_vm; | 488 | locked += current->mm->locked_vm; |
| 489 | 489 | ||
| 490 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 490 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 491 | lock_limit >>= PAGE_SHIFT; | 491 | lock_limit >>= PAGE_SHIFT; |
| 492 | 492 | ||
| 493 | /* check against resource limits */ | 493 | /* check against resource limits */ |
| @@ -550,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
| 550 | 550 | ||
| 551 | down_write(¤t->mm->mmap_sem); | 551 | down_write(¤t->mm->mmap_sem); |
| 552 | 552 | ||
| 553 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 553 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 554 | lock_limit >>= PAGE_SHIFT; | 554 | lock_limit >>= PAGE_SHIFT; |
| 555 | 555 | ||
| 556 | ret = -ENOMEM; | 556 | ret = -ENOMEM; |
| @@ -584,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user) | |||
| 584 | int allowed = 0; | 584 | int allowed = 0; |
| 585 | 585 | ||
| 586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 587 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 587 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 588 | if (lock_limit == RLIM_INFINITY) | 588 | if (lock_limit == RLIM_INFINITY) |
| 589 | allowed = 1; | 589 | allowed = 1; |
| 590 | lock_limit >>= PAGE_SHIFT; | 590 | lock_limit >>= PAGE_SHIFT; |
| @@ -618,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | |||
| 618 | 618 | ||
| 619 | down_write(&mm->mmap_sem); | 619 | down_write(&mm->mmap_sem); |
| 620 | 620 | ||
| 621 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 621 | lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; |
| 622 | vm = mm->total_vm + pgsz; | 622 | vm = mm->total_vm + pgsz; |
| 623 | if (lim < vm) | 623 | if (lim < vm) |
| 624 | goto out; | 624 | goto out; |
| 625 | 625 | ||
| 626 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 626 | lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; |
| 627 | vm = mm->locked_vm + pgsz; | 627 | vm = mm->locked_vm + pgsz; |
| 628 | if (lim < vm) | 628 | if (lim < vm) |
| 629 | goto out; | 629 | goto out; |
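Note: the mlock.c hunks above swap open-coded reads of current->signal->rlim[...].rlim_cur for the rlimit() helper, and wrap the remaining direct reads (where an explicit rlim pointer is passed in) in ACCESS_ONCE() so the limit is loaded exactly once even though no lock is held. A hedged sketch of what the helper presumably boils down to; the real definition lives in the scheduler headers and may differ in detail:

    /* Sketch of the accessor assumed by these hunks. */
    static inline unsigned long task_rlimit(const struct task_struct *tsk,
                                            unsigned int limit)
    {
            /* single, non-reloaded read of the soft limit */
            return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
    }

    static inline unsigned long rlimit(unsigned int limit)
    {
            return task_rlimit(current, limit);
    }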
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 265 | * segment grow beyond its set limit in the case where the limit is | 265 | * segment grow beyond its set limit in the case where the limit is |
| 266 | * not page aligned -Ram Gupta | 266 | * not page aligned -Ram Gupta |
| 267 | */ | 267 | */ |
| 268 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | 268 | rlim = rlimit(RLIMIT_DATA); |
| 269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + |
| 270 | (mm->end_data - mm->start_data) > rlim) | 270 | (mm->end_data - mm->start_data) > rlim) |
| 271 | goto out; | 271 | goto out; |
| @@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 437 | { | 437 | { |
| 438 | __vma_link_list(mm, vma, prev, rb_parent); | 438 | __vma_link_list(mm, vma, prev, rb_parent); |
| 439 | __vma_link_rb(mm, vma, rb_link, rb_parent); | 439 | __vma_link_rb(mm, vma, rb_link, rb_parent); |
| 440 | __anon_vma_link(vma); | ||
| 441 | } | 440 | } |
| 442 | 441 | ||
| 443 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 442 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
| @@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 499 | * are necessary. The "insert" vma (if any) is to be inserted | 498 | * are necessary. The "insert" vma (if any) is to be inserted |
| 500 | * before we drop the necessary locks. | 499 | * before we drop the necessary locks. |
| 501 | */ | 500 | */ |
| 502 | void vma_adjust(struct vm_area_struct *vma, unsigned long start, | 501 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
| 503 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 502 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
| 504 | { | 503 | { |
| 505 | struct mm_struct *mm = vma->vm_mm; | 504 | struct mm_struct *mm = vma->vm_mm; |
| @@ -508,11 +507,12 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
| 508 | struct address_space *mapping = NULL; | 507 | struct address_space *mapping = NULL; |
| 509 | struct prio_tree_root *root = NULL; | 508 | struct prio_tree_root *root = NULL; |
| 510 | struct file *file = vma->vm_file; | 509 | struct file *file = vma->vm_file; |
| 511 | struct anon_vma *anon_vma = NULL; | ||
| 512 | long adjust_next = 0; | 510 | long adjust_next = 0; |
| 513 | int remove_next = 0; | 511 | int remove_next = 0; |
| 514 | 512 | ||
| 515 | if (next && !insert) { | 513 | if (next && !insert) { |
| 514 | struct vm_area_struct *exporter = NULL; | ||
| 515 | |||
| 516 | if (end >= next->vm_end) { | 516 | if (end >= next->vm_end) { |
| 517 | /* | 517 | /* |
| 518 | * vma expands, overlapping all the next, and | 518 | * vma expands, overlapping all the next, and |
| @@ -520,7 +520,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, | |||
| 520 | */ | 520 | */ |
| 521 | again: remove_next = 1 + (end > next->vm_end); | 521 | again: remove_next = 1 + (end > next->vm_end); |
| 522 | end = next->vm_end; | 522 | end = next->vm_end; |
| 523 | anon_vma = next->anon_vma; | 523 | exporter = next; |
| 524 | importer = vma; | 524 | importer = vma; |
| 525 | } else if (end > next->vm_start) { | 525 | } else if (end > next->vm_start) { |
| 526 | /* | 526 | /* |
| @@ -528,7 +528,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 528 | * mprotect case 5 shifting the boundary up. | 528 | * mprotect case 5 shifting the boundary up. |
| 529 | */ | 529 | */ |
| 530 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; | 530 | adjust_next = (end - next->vm_start) >> PAGE_SHIFT; |
| 531 | anon_vma = next->anon_vma; | 531 | exporter = next; |
| 532 | importer = vma; | 532 | importer = vma; |
| 533 | } else if (end < vma->vm_end) { | 533 | } else if (end < vma->vm_end) { |
| 534 | /* | 534 | /* |
| @@ -537,9 +537,20 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 537 | * mprotect case 4 shifting the boundary down. | 537 | * mprotect case 4 shifting the boundary down. |
| 538 | */ | 538 | */ |
| 539 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); | 539 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); |
| 540 | anon_vma = next->anon_vma; | 540 | exporter = vma; |
| 541 | importer = next; | 541 | importer = next; |
| 542 | } | 542 | } |
| 543 | |||
| 544 | /* | ||
| 545 | * Easily overlooked: when mprotect shifts the boundary, | ||
| 546 | * make sure the expanding vma has anon_vma set if the | ||
| 547 | * shrinking vma had, to cover any anon pages imported. | ||
| 548 | */ | ||
| 549 | if (exporter && exporter->anon_vma && !importer->anon_vma) { | ||
| 550 | if (anon_vma_clone(importer, exporter)) | ||
| 551 | return -ENOMEM; | ||
| 552 | importer->anon_vma = exporter->anon_vma; | ||
| 553 | } | ||
| 543 | } | 554 | } |
| 544 | 555 | ||
| 545 | if (file) { | 556 | if (file) { |
| @@ -567,25 +578,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 567 | } | 578 | } |
| 568 | } | 579 | } |
| 569 | 580 | ||
| 570 | /* | ||
| 571 | * When changing only vma->vm_end, we don't really need | ||
| 572 | * anon_vma lock. | ||
| 573 | */ | ||
| 574 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
| 575 | anon_vma = vma->anon_vma; | ||
| 576 | if (anon_vma) { | ||
| 577 | spin_lock(&anon_vma->lock); | ||
| 578 | /* | ||
| 579 | * Easily overlooked: when mprotect shifts the boundary, | ||
| 580 | * make sure the expanding vma has anon_vma set if the | ||
| 581 | * shrinking vma had, to cover any anon pages imported. | ||
| 582 | */ | ||
| 583 | if (importer && !importer->anon_vma) { | ||
| 584 | importer->anon_vma = anon_vma; | ||
| 585 | __anon_vma_link(importer); | ||
| 586 | } | ||
| 587 | } | ||
| 588 | |||
| 589 | if (root) { | 581 | if (root) { |
| 590 | flush_dcache_mmap_lock(mapping); | 582 | flush_dcache_mmap_lock(mapping); |
| 591 | vma_prio_tree_remove(vma, root); | 583 | vma_prio_tree_remove(vma, root); |
| @@ -616,8 +608,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 616 | __vma_unlink(mm, next, vma); | 608 | __vma_unlink(mm, next, vma); |
| 617 | if (file) | 609 | if (file) |
| 618 | __remove_shared_vm_struct(next, file, mapping); | 610 | __remove_shared_vm_struct(next, file, mapping); |
| 619 | if (next->anon_vma) | ||
| 620 | __anon_vma_merge(vma, next); | ||
| 621 | } else if (insert) { | 611 | } else if (insert) { |
| 622 | /* | 612 | /* |
| 623 | * split_vma has split insert from vma, and needs | 613 | * split_vma has split insert from vma, and needs |
| @@ -627,8 +617,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 627 | __insert_vm_struct(mm, insert); | 617 | __insert_vm_struct(mm, insert); |
| 628 | } | 618 | } |
| 629 | 619 | ||
| 630 | if (anon_vma) | ||
| 631 | spin_unlock(&anon_vma->lock); | ||
| 632 | if (mapping) | 620 | if (mapping) |
| 633 | spin_unlock(&mapping->i_mmap_lock); | 621 | spin_unlock(&mapping->i_mmap_lock); |
| 634 | 622 | ||
| @@ -638,6 +626,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 638 | if (next->vm_flags & VM_EXECUTABLE) | 626 | if (next->vm_flags & VM_EXECUTABLE) |
| 639 | removed_exe_file_vma(mm); | 627 | removed_exe_file_vma(mm); |
| 640 | } | 628 | } |
| 629 | if (next->anon_vma) | ||
| 630 | anon_vma_merge(vma, next); | ||
| 641 | mm->map_count--; | 631 | mm->map_count--; |
| 642 | mpol_put(vma_policy(next)); | 632 | mpol_put(vma_policy(next)); |
| 643 | kmem_cache_free(vm_area_cachep, next); | 633 | kmem_cache_free(vm_area_cachep, next); |
| @@ -653,6 +643,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 653 | } | 643 | } |
| 654 | 644 | ||
| 655 | validate_mm(mm); | 645 | validate_mm(mm); |
| 646 | |||
| 647 | return 0; | ||
| 656 | } | 648 | } |
| 657 | 649 | ||
| 658 | /* | 650 | /* |
| @@ -759,6 +751,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 759 | { | 751 | { |
| 760 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 752 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
| 761 | struct vm_area_struct *area, *next; | 753 | struct vm_area_struct *area, *next; |
| 754 | int err; | ||
| 762 | 755 | ||
| 763 | /* | 756 | /* |
| 764 | * We later require that vma->vm_flags == vm_flags, | 757 | * We later require that vma->vm_flags == vm_flags, |
| @@ -792,11 +785,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 792 | is_mergeable_anon_vma(prev->anon_vma, | 785 | is_mergeable_anon_vma(prev->anon_vma, |
| 793 | next->anon_vma)) { | 786 | next->anon_vma)) { |
| 794 | /* cases 1, 6 */ | 787 | /* cases 1, 6 */ |
| 795 | vma_adjust(prev, prev->vm_start, | 788 | err = vma_adjust(prev, prev->vm_start, |
| 796 | next->vm_end, prev->vm_pgoff, NULL); | 789 | next->vm_end, prev->vm_pgoff, NULL); |
| 797 | } else /* cases 2, 5, 7 */ | 790 | } else /* cases 2, 5, 7 */ |
| 798 | vma_adjust(prev, prev->vm_start, | 791 | err = vma_adjust(prev, prev->vm_start, |
| 799 | end, prev->vm_pgoff, NULL); | 792 | end, prev->vm_pgoff, NULL); |
| 793 | if (err) | ||
| 794 | return NULL; | ||
| 800 | return prev; | 795 | return prev; |
| 801 | } | 796 | } |
| 802 | 797 | ||
| @@ -808,11 +803,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 808 | can_vma_merge_before(next, vm_flags, | 803 | can_vma_merge_before(next, vm_flags, |
| 809 | anon_vma, file, pgoff+pglen)) { | 804 | anon_vma, file, pgoff+pglen)) { |
| 810 | if (prev && addr < prev->vm_end) /* case 4 */ | 805 | if (prev && addr < prev->vm_end) /* case 4 */ |
| 811 | vma_adjust(prev, prev->vm_start, | 806 | err = vma_adjust(prev, prev->vm_start, |
| 812 | addr, prev->vm_pgoff, NULL); | 807 | addr, prev->vm_pgoff, NULL); |
| 813 | else /* cases 3, 8 */ | 808 | else /* cases 3, 8 */ |
| 814 | vma_adjust(area, addr, next->vm_end, | 809 | err = vma_adjust(area, addr, next->vm_end, |
| 815 | next->vm_pgoff - pglen, NULL); | 810 | next->vm_pgoff - pglen, NULL); |
| 811 | if (err) | ||
| 812 | return NULL; | ||
| 816 | return area; | 813 | return area; |
| 817 | } | 814 | } |
| 818 | 815 | ||
| @@ -820,6 +817,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 820 | } | 817 | } |
| 821 | 818 | ||
| 822 | /* | 819 | /* |
| 820 | * Rough compatibility check to quickly see if it's even worth looking | ||
| 821 | * at sharing an anon_vma. | ||
| 822 | * | ||
| 823 | * They need to have the same vm_file, and the flags can only differ | ||
| 824 | * in things that mprotect may change. | ||
| 825 | * | ||
| 826 | * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that | ||
| 827 | * we can merge the two vma's. For example, we refuse to merge a vma if | ||
| 828 | * there is a vm_ops->close() function, because that indicates that the | ||
| 829 | * driver is doing some kind of reference counting. But that doesn't | ||
| 830 | * really matter for the anon_vma sharing case. | ||
| 831 | */ | ||
| 832 | static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) | ||
| 833 | { | ||
| 834 | return a->vm_end == b->vm_start && | ||
| 835 | mpol_equal(vma_policy(a), vma_policy(b)) && | ||
| 836 | a->vm_file == b->vm_file && | ||
| 837 | !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && | ||
| 838 | b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); | ||
| 839 | } | ||
| 840 | |||
| 841 | /* | ||
| 842 | * Do some basic sanity checking to see if we can re-use the anon_vma | ||
| 843 | * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be | ||
| 844 | * the same as 'old', the other will be the new one that is trying | ||
| 845 | * to share the anon_vma. | ||
| 846 | * | ||
| 847 | * NOTE! This runs with mm_sem held for reading, so it is possible that | ||
| 848 | * the anon_vma of 'old' is concurrently in the process of being set up | ||
| 849 | * by another page fault trying to merge _that_. But that's ok: if it | ||
| 850 | * is being set up, that automatically means that it will be a singleton | ||
| 851 | * acceptable for merging, so we can do all of this optimistically. But | ||
| 852 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. | ||
| 853 | * | ||
| 854 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only | ||
| 855 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid | ||
| 856 | * is to return an anon_vma that is "complex" due to having gone through | ||
| 857 | * a fork). | ||
| 858 | * | ||
| 859 | * We also make sure that the two vma's are compatible (adjacent, | ||
| 860 | * and with the same memory policies). That's all stable, even with just | ||
| 861 | * a read lock on the mm_sem. | ||
| 862 | */ | ||
| 863 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) | ||
| 864 | { | ||
| 865 | if (anon_vma_compatible(a, b)) { | ||
| 866 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); | ||
| 867 | |||
| 868 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) | ||
| 869 | return anon_vma; | ||
| 870 | } | ||
| 871 | return NULL; | ||
| 872 | } | ||
| 873 | |||
| 874 | /* | ||
| 823 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check | 875 | * find_mergeable_anon_vma is used by anon_vma_prepare, to check |
| 824 | * neighbouring vmas for a suitable anon_vma, before it goes off | 876 | * neighbouring vmas for a suitable anon_vma, before it goes off |
| 825 | * to allocate a new anon_vma. It checks because a repetitive | 877 | * to allocate a new anon_vma. It checks because a repetitive |
| @@ -829,28 +881,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 829 | */ | 881 | */ |
| 830 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) | 882 | struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) |
| 831 | { | 883 | { |
| 884 | struct anon_vma *anon_vma; | ||
| 832 | struct vm_area_struct *near; | 885 | struct vm_area_struct *near; |
| 833 | unsigned long vm_flags; | ||
| 834 | 886 | ||
| 835 | near = vma->vm_next; | 887 | near = vma->vm_next; |
| 836 | if (!near) | 888 | if (!near) |
| 837 | goto try_prev; | 889 | goto try_prev; |
| 838 | 890 | ||
| 839 | /* | 891 | anon_vma = reusable_anon_vma(near, vma, near); |
| 840 | * Since only mprotect tries to remerge vmas, match flags | 892 | if (anon_vma) |
| 841 | * which might be mprotected into each other later on. | 893 | return anon_vma; |
| 842 | * Neither mlock nor madvise tries to remerge at present, | ||
| 843 | * so leave their flags as obstructing a merge. | ||
| 844 | */ | ||
| 845 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | ||
| 846 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | ||
| 847 | |||
| 848 | if (near->anon_vma && vma->vm_end == near->vm_start && | ||
| 849 | mpol_equal(vma_policy(vma), vma_policy(near)) && | ||
| 850 | can_vma_merge_before(near, vm_flags, | ||
| 851 | NULL, vma->vm_file, vma->vm_pgoff + | ||
| 852 | ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) | ||
| 853 | return near->anon_vma; | ||
| 854 | try_prev: | 894 | try_prev: |
| 855 | /* | 895 | /* |
| 856 | * It is potentially slow to have to call find_vma_prev here. | 896 | * It is potentially slow to have to call find_vma_prev here. |
| @@ -863,14 +903,9 @@ try_prev: | |||
| 863 | if (!near) | 903 | if (!near) |
| 864 | goto none; | 904 | goto none; |
| 865 | 905 | ||
| 866 | vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); | 906 | anon_vma = reusable_anon_vma(near, near, vma); |
| 867 | vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); | 907 | if (anon_vma) |
| 868 | 908 | return anon_vma; | |
| 869 | if (near->anon_vma && near->vm_end == vma->vm_start && | ||
| 870 | mpol_equal(vma_policy(near), vma_policy(vma)) && | ||
| 871 | can_vma_merge_after(near, vm_flags, | ||
| 872 | NULL, vma->vm_file, vma->vm_pgoff)) | ||
| 873 | return near->anon_vma; | ||
| 874 | none: | 909 | none: |
| 875 | /* | 910 | /* |
| 876 | * There's no absolute need to look only at touching neighbours: | 911 | * There's no absolute need to look only at touching neighbours: |
| @@ -967,7 +1002,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 967 | unsigned long locked, lock_limit; | 1002 | unsigned long locked, lock_limit; |
| 968 | locked = len >> PAGE_SHIFT; | 1003 | locked = len >> PAGE_SHIFT; |
| 969 | locked += mm->locked_vm; | 1004 | locked += mm->locked_vm; |
| 970 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 1005 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 971 | lock_limit >>= PAGE_SHIFT; | 1006 | lock_limit >>= PAGE_SHIFT; |
| 972 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 1007 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 973 | return -EAGAIN; | 1008 | return -EAGAIN; |
| @@ -1083,6 +1118,30 @@ out: | |||
| 1083 | return retval; | 1118 | return retval; |
| 1084 | } | 1119 | } |
| 1085 | 1120 | ||
| 1121 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
| 1122 | struct mmap_arg_struct { | ||
| 1123 | unsigned long addr; | ||
| 1124 | unsigned long len; | ||
| 1125 | unsigned long prot; | ||
| 1126 | unsigned long flags; | ||
| 1127 | unsigned long fd; | ||
| 1128 | unsigned long offset; | ||
| 1129 | }; | ||
| 1130 | |||
| 1131 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
| 1132 | { | ||
| 1133 | struct mmap_arg_struct a; | ||
| 1134 | |||
| 1135 | if (copy_from_user(&a, arg, sizeof(a))) | ||
| 1136 | return -EFAULT; | ||
| 1137 | if (a.offset & ~PAGE_MASK) | ||
| 1138 | return -EINVAL; | ||
| 1139 | |||
| 1140 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
| 1141 | a.offset >> PAGE_SHIFT); | ||
| 1142 | } | ||
| 1143 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
| 1144 | |||
| 1086 | /* | 1145 | /* |
| 1087 | * Some shared mappings will want the pages marked read-only | 1146 | * Some shared mappings will want the pages marked read-only |
| 1088 | * to track write events. If so, we'll downgrade vm_page_prot | 1147 | * to track write events. If so, we'll downgrade vm_page_prot |
| @@ -1205,6 +1264,7 @@ munmap_back: | |||
| 1205 | vma->vm_flags = vm_flags; | 1264 | vma->vm_flags = vm_flags; |
| 1206 | vma->vm_page_prot = vm_get_page_prot(vm_flags); | 1265 | vma->vm_page_prot = vm_get_page_prot(vm_flags); |
| 1207 | vma->vm_pgoff = pgoff; | 1266 | vma->vm_pgoff = pgoff; |
| 1267 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
| 1208 | 1268 | ||
| 1209 | if (file) { | 1269 | if (file) { |
| 1210 | error = -EINVAL; | 1270 | error = -EINVAL; |
| @@ -1265,13 +1325,8 @@ out: | |||
| 1265 | mm->total_vm += len >> PAGE_SHIFT; | 1325 | mm->total_vm += len >> PAGE_SHIFT; |
| 1266 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1326 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
| 1267 | if (vm_flags & VM_LOCKED) { | 1327 | if (vm_flags & VM_LOCKED) { |
| 1268 | /* | 1328 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
| 1269 | * makes pages present; downgrades, drops, reacquires mmap_sem | 1329 | mm->locked_vm += (len >> PAGE_SHIFT); |
| 1270 | */ | ||
| 1271 | long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); | ||
| 1272 | if (nr_pages < 0) | ||
| 1273 | return nr_pages; /* vma gone! */ | ||
| 1274 | mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; | ||
| 1275 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1330 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
| 1276 | make_pages_present(addr, addr + len); | 1331 | make_pages_present(addr, addr + len); |
| 1277 | return addr; | 1332 | return addr; |
| @@ -1599,7 +1654,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 1599 | return -ENOMEM; | 1654 | return -ENOMEM; |
| 1600 | 1655 | ||
| 1601 | /* Stack limit test */ | 1656 | /* Stack limit test */ |
| 1602 | if (size > rlim[RLIMIT_STACK].rlim_cur) | 1657 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
| 1603 | return -ENOMEM; | 1658 | return -ENOMEM; |
| 1604 | 1659 | ||
| 1605 | /* mlock limit tests */ | 1660 | /* mlock limit tests */ |
| @@ -1607,7 +1662,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 1607 | unsigned long locked; | 1662 | unsigned long locked; |
| 1608 | unsigned long limit; | 1663 | unsigned long limit; |
| 1609 | locked = mm->locked_vm + grow; | 1664 | locked = mm->locked_vm + grow; |
| 1610 | limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 1665 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
| 1666 | limit >>= PAGE_SHIFT; | ||
| 1611 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 1667 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
| 1612 | return -ENOMEM; | 1668 | return -ENOMEM; |
| 1613 | } | 1669 | } |
| @@ -1754,8 +1810,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 1754 | if (!prev || expand_stack(prev, addr)) | 1810 | if (!prev || expand_stack(prev, addr)) |
| 1755 | return NULL; | 1811 | return NULL; |
| 1756 | if (prev->vm_flags & VM_LOCKED) { | 1812 | if (prev->vm_flags & VM_LOCKED) { |
| 1757 | if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) | 1813 | mlock_vma_pages_range(prev, addr, prev->vm_end); |
| 1758 | return NULL; /* vma gone! */ | ||
| 1759 | } | 1814 | } |
| 1760 | return prev; | 1815 | return prev; |
| 1761 | } | 1816 | } |
| @@ -1783,8 +1838,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
| 1783 | if (expand_stack(vma, addr)) | 1838 | if (expand_stack(vma, addr)) |
| 1784 | return NULL; | 1839 | return NULL; |
| 1785 | if (vma->vm_flags & VM_LOCKED) { | 1840 | if (vma->vm_flags & VM_LOCKED) { |
| 1786 | if (mlock_vma_pages_range(vma, addr, start) < 0) | 1841 | mlock_vma_pages_range(vma, addr, start); |
| 1787 | return NULL; /* vma gone! */ | ||
| 1788 | } | 1842 | } |
| 1789 | return vma; | 1843 | return vma; |
| 1790 | } | 1844 | } |
| @@ -1871,6 +1925,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1871 | { | 1925 | { |
| 1872 | struct mempolicy *pol; | 1926 | struct mempolicy *pol; |
| 1873 | struct vm_area_struct *new; | 1927 | struct vm_area_struct *new; |
| 1928 | int err = -ENOMEM; | ||
| 1874 | 1929 | ||
| 1875 | if (is_vm_hugetlb_page(vma) && (addr & | 1930 | if (is_vm_hugetlb_page(vma) && (addr & |
| 1876 | ~(huge_page_mask(hstate_vma(vma))))) | 1931 | ~(huge_page_mask(hstate_vma(vma))))) |
| @@ -1878,11 +1933,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1878 | 1933 | ||
| 1879 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 1934 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
| 1880 | if (!new) | 1935 | if (!new) |
| 1881 | return -ENOMEM; | 1936 | goto out_err; |
| 1882 | 1937 | ||
| 1883 | /* most fields are the same, copy all, and then fixup */ | 1938 | /* most fields are the same, copy all, and then fixup */ |
| 1884 | *new = *vma; | 1939 | *new = *vma; |
| 1885 | 1940 | ||
| 1941 | INIT_LIST_HEAD(&new->anon_vma_chain); | ||
| 1942 | |||
| 1886 | if (new_below) | 1943 | if (new_below) |
| 1887 | new->vm_end = addr; | 1944 | new->vm_end = addr; |
| 1888 | else { | 1945 | else { |
| @@ -1892,11 +1949,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1892 | 1949 | ||
| 1893 | pol = mpol_dup(vma_policy(vma)); | 1950 | pol = mpol_dup(vma_policy(vma)); |
| 1894 | if (IS_ERR(pol)) { | 1951 | if (IS_ERR(pol)) { |
| 1895 | kmem_cache_free(vm_area_cachep, new); | 1952 | err = PTR_ERR(pol); |
| 1896 | return PTR_ERR(pol); | 1953 | goto out_free_vma; |
| 1897 | } | 1954 | } |
| 1898 | vma_set_policy(new, pol); | 1955 | vma_set_policy(new, pol); |
| 1899 | 1956 | ||
| 1957 | if (anon_vma_clone(new, vma)) | ||
| 1958 | goto out_free_mpol; | ||
| 1959 | |||
| 1900 | if (new->vm_file) { | 1960 | if (new->vm_file) { |
| 1901 | get_file(new->vm_file); | 1961 | get_file(new->vm_file); |
| 1902 | if (vma->vm_flags & VM_EXECUTABLE) | 1962 | if (vma->vm_flags & VM_EXECUTABLE) |
| @@ -1907,12 +1967,29 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1907 | new->vm_ops->open(new); | 1967 | new->vm_ops->open(new); |
| 1908 | 1968 | ||
| 1909 | if (new_below) | 1969 | if (new_below) |
| 1910 | vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + | 1970 | err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + |
| 1911 | ((addr - new->vm_start) >> PAGE_SHIFT), new); | 1971 | ((addr - new->vm_start) >> PAGE_SHIFT), new); |
| 1912 | else | 1972 | else |
| 1913 | vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | 1973 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); |
| 1914 | 1974 | ||
| 1915 | return 0; | 1975 | /* Success. */ |
| 1976 | if (!err) | ||
| 1977 | return 0; | ||
| 1978 | |||
| 1979 | /* Clean everything up if vma_adjust failed. */ | ||
| 1980 | if (new->vm_ops && new->vm_ops->close) | ||
| 1981 | new->vm_ops->close(new); | ||
| 1982 | if (new->vm_file) { | ||
| 1983 | if (vma->vm_flags & VM_EXECUTABLE) | ||
| 1984 | removed_exe_file_vma(mm); | ||
| 1985 | fput(new->vm_file); | ||
| 1986 | } | ||
| 1987 | out_free_mpol: | ||
| 1988 | mpol_put(pol); | ||
| 1989 | out_free_vma: | ||
| 1990 | kmem_cache_free(vm_area_cachep, new); | ||
| 1991 | out_err: | ||
| 1992 | return err; | ||
| 1916 | } | 1993 | } |
| 1917 | 1994 | ||
| 1918 | /* | 1995 | /* |
| @@ -2074,7 +2151,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2074 | unsigned long locked, lock_limit; | 2151 | unsigned long locked, lock_limit; |
| 2075 | locked = len >> PAGE_SHIFT; | 2152 | locked = len >> PAGE_SHIFT; |
| 2076 | locked += mm->locked_vm; | 2153 | locked += mm->locked_vm; |
| 2077 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2154 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 2078 | lock_limit >>= PAGE_SHIFT; | 2155 | lock_limit >>= PAGE_SHIFT; |
| 2079 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 2156 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 2080 | return -EAGAIN; | 2157 | return -EAGAIN; |
| @@ -2122,6 +2199,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2122 | return -ENOMEM; | 2199 | return -ENOMEM; |
| 2123 | } | 2200 | } |
| 2124 | 2201 | ||
| 2202 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
| 2125 | vma->vm_mm = mm; | 2203 | vma->vm_mm = mm; |
| 2126 | vma->vm_start = addr; | 2204 | vma->vm_start = addr; |
| 2127 | vma->vm_end = addr + len; | 2205 | vma->vm_end = addr + len; |
| @@ -2258,10 +2336,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
| 2258 | if (new_vma) { | 2336 | if (new_vma) { |
| 2259 | *new_vma = *vma; | 2337 | *new_vma = *vma; |
| 2260 | pol = mpol_dup(vma_policy(vma)); | 2338 | pol = mpol_dup(vma_policy(vma)); |
| 2261 | if (IS_ERR(pol)) { | 2339 | if (IS_ERR(pol)) |
| 2262 | kmem_cache_free(vm_area_cachep, new_vma); | 2340 | goto out_free_vma; |
| 2263 | return NULL; | 2341 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
| 2264 | } | 2342 | if (anon_vma_clone(new_vma, vma)) |
| 2343 | goto out_free_mempol; | ||
| 2265 | vma_set_policy(new_vma, pol); | 2344 | vma_set_policy(new_vma, pol); |
| 2266 | new_vma->vm_start = addr; | 2345 | new_vma->vm_start = addr; |
| 2267 | new_vma->vm_end = addr + len; | 2346 | new_vma->vm_end = addr + len; |
| @@ -2277,6 +2356,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
| 2277 | } | 2356 | } |
| 2278 | } | 2357 | } |
| 2279 | return new_vma; | 2358 | return new_vma; |
| 2359 | |||
| 2360 | out_free_mempol: | ||
| 2361 | mpol_put(pol); | ||
| 2362 | out_free_vma: | ||
| 2363 | kmem_cache_free(vm_area_cachep, new_vma); | ||
| 2364 | return NULL; | ||
| 2280 | } | 2365 | } |
| 2281 | 2366 | ||
| 2282 | /* | 2367 | /* |
| @@ -2288,7 +2373,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) | |||
| 2288 | unsigned long cur = mm->total_vm; /* pages */ | 2373 | unsigned long cur = mm->total_vm; /* pages */ |
| 2289 | unsigned long lim; | 2374 | unsigned long lim; |
| 2290 | 2375 | ||
| 2291 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 2376 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; |
| 2292 | 2377 | ||
| 2293 | if (cur + npages > lim) | 2378 | if (cur + npages > lim) |
| 2294 | return 0; | 2379 | return 0; |
| @@ -2354,6 +2439,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
| 2354 | if (unlikely(vma == NULL)) | 2439 | if (unlikely(vma == NULL)) |
| 2355 | return -ENOMEM; | 2440 | return -ENOMEM; |
| 2356 | 2441 | ||
| 2442 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
| 2357 | vma->vm_mm = mm; | 2443 | vma->vm_mm = mm; |
| 2358 | vma->vm_start = addr; | 2444 | vma->vm_start = addr; |
| 2359 | vma->vm_end = addr + len; | 2445 | vma->vm_end = addr + len; |
| @@ -2454,6 +2540,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
| 2454 | int mm_take_all_locks(struct mm_struct *mm) | 2540 | int mm_take_all_locks(struct mm_struct *mm) |
| 2455 | { | 2541 | { |
| 2456 | struct vm_area_struct *vma; | 2542 | struct vm_area_struct *vma; |
| 2543 | struct anon_vma_chain *avc; | ||
| 2457 | int ret = -EINTR; | 2544 | int ret = -EINTR; |
| 2458 | 2545 | ||
| 2459 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2546 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
| @@ -2471,7 +2558,8 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
| 2471 | if (signal_pending(current)) | 2558 | if (signal_pending(current)) |
| 2472 | goto out_unlock; | 2559 | goto out_unlock; |
| 2473 | if (vma->anon_vma) | 2560 | if (vma->anon_vma) |
| 2474 | vm_lock_anon_vma(mm, vma->anon_vma); | 2561 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
| 2562 | vm_lock_anon_vma(mm, avc->anon_vma); | ||
| 2475 | } | 2563 | } |
| 2476 | 2564 | ||
| 2477 | ret = 0; | 2565 | ret = 0; |
| @@ -2526,13 +2614,15 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
| 2526 | void mm_drop_all_locks(struct mm_struct *mm) | 2614 | void mm_drop_all_locks(struct mm_struct *mm) |
| 2527 | { | 2615 | { |
| 2528 | struct vm_area_struct *vma; | 2616 | struct vm_area_struct *vma; |
| 2617 | struct anon_vma_chain *avc; | ||
| 2529 | 2618 | ||
| 2530 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2619 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
| 2531 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 2620 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
| 2532 | 2621 | ||
| 2533 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 2622 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| 2534 | if (vma->anon_vma) | 2623 | if (vma->anon_vma) |
| 2535 | vm_unlock_anon_vma(vma->anon_vma); | 2624 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
| 2625 | vm_unlock_anon_vma(avc->anon_vma); | ||
| 2536 | if (vma->vm_file && vma->vm_file->f_mapping) | 2626 | if (vma->vm_file && vma->vm_file->f_mapping) |
| 2537 | vm_unlock_mapping(vma->vm_file->f_mapping); | 2627 | vm_unlock_mapping(vma->vm_file->f_mapping); |
| 2538 | } | 2628 | } |
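Note: most of the mm/mmap.c churn above follows from anon_vmas now being attached to a vma through a chain of link objects rather than a single pointer: vma_adjust() can fail because anon_vma_clone() has to allocate links, and mm_take_all_locks()/mm_drop_all_locks() walk the chain instead of locking one anon_vma. A hedged sketch of the link structure these hunks assume; the anon_vma and same_vma fields appear in the diff, the other fields and comments are from memory and may not match the header exactly:

    struct anon_vma_chain {
            struct vm_area_struct *vma;     /* the vma this link belongs to */
            struct anon_vma *anon_vma;      /* one anon_vma the vma is tied to */
            struct list_head same_vma;      /* entry in vma->anon_vma_chain */
            struct list_head same_anon_vma; /* entry in the anon_vma's vma list */
    };

    /* Walking every anon_vma a vma is linked to, as mm_take_all_locks() now does. */
    struct anon_vma_chain *avc;
    list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
            spin_lock(&avc->anon_vma->lock);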
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081f4021..9e82e937000e 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | 5 | ||
| 6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
| 7 | #include <linux/mmu_context.h> | 7 | #include <linux/mmu_context.h> |
| 8 | #include <linux/module.h> | ||
| 8 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
| 9 | 10 | ||
| 10 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
| @@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) | |||
| 37 | if (active_mm != mm) | 38 | if (active_mm != mm) |
| 38 | mmdrop(active_mm); | 39 | mmdrop(active_mm); |
| 39 | } | 40 | } |
| 41 | EXPORT_SYMBOL_GPL(use_mm); | ||
| 40 | 42 | ||
| 41 | /* | 43 | /* |
| 42 | * unuse_mm | 44 | * unuse_mm |
| @@ -51,8 +53,10 @@ void unuse_mm(struct mm_struct *mm) | |||
| 51 | struct task_struct *tsk = current; | 53 | struct task_struct *tsk = current; |
| 52 | 54 | ||
| 53 | task_lock(tsk); | 55 | task_lock(tsk); |
| 56 | sync_mm_rss(tsk, mm); | ||
| 54 | tsk->mm = NULL; | 57 | tsk->mm = NULL; |
| 55 | /* active_mm is still 'mm' */ | 58 | /* active_mm is still 'mm' */ |
| 56 | enter_lazy_tlb(mm, tsk); | 59 | enter_lazy_tlb(mm, tsk); |
| 57 | task_unlock(tsk); | 60 | task_unlock(tsk); |
| 58 | } | 61 | } |
| 62 | EXPORT_SYMBOL_GPL(unuse_mm); | ||
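Note: exporting use_mm()/unuse_mm() lets modules borrow a user address space from a kernel thread, and the new sync_mm_rss() call folds per-task RSS deltas back into the mm before it is detached. A hedged usage sketch; the helper name and the work done inside it are illustrative, not part of this patch:

    #include <linux/mmu_context.h>

    /* Kernel-thread context: temporarily adopt a user mm, do work, release it. */
    static void run_on_user_mm(struct mm_struct *mm)
    {
            use_mm(mm);     /* this kthread's address space is now 'mm' */
            /* ... get_user_pages()/copy_to_user() against 'mm' go here ... */
            unuse_mm(mm);   /* detach again; sync_mm_rss() flushes the counters */
    }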
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 7e33f2cb3c77..438951d366f2 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/err.h> | 16 | #include <linux/err.h> |
| 17 | #include <linux/rcupdate.h> | 17 | #include <linux/rcupdate.h> |
| 18 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
| 19 | #include <linux/slab.h> | ||
| 19 | 20 | ||
| 20 | /* | 21 | /* |
| 21 | * This function can't run concurrently against mmu_notifier_register | 22 | * This function can't run concurrently against mmu_notifier_register |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 8bc969d8112d..2d1bf7cf8851 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -10,7 +10,6 @@ | |||
| 10 | 10 | ||
| 11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
| 12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
| 13 | #include <linux/slab.h> | ||
| 14 | #include <linux/shm.h> | 13 | #include <linux/shm.h> |
| 15 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
| 16 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
diff --git a/mm/mremap.c b/mm/mremap.c index 845190898d59..cde56ee51ef7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -9,7 +9,6 @@ | |||
| 9 | 9 | ||
| 10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
| 11 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/shm.h> | 12 | #include <linux/shm.h> |
| 14 | #include <linux/ksm.h> | 13 | #include <linux/ksm.h> |
| 15 | #include <linux/mman.h> | 14 | #include <linux/mman.h> |
| @@ -285,7 +284,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
| 285 | if (vma->vm_flags & VM_LOCKED) { | 284 | if (vma->vm_flags & VM_LOCKED) { |
| 286 | unsigned long locked, lock_limit; | 285 | unsigned long locked, lock_limit; |
| 287 | locked = mm->locked_vm << PAGE_SHIFT; | 286 | locked = mm->locked_vm << PAGE_SHIFT; |
| 288 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 287 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 289 | locked += new_len - old_len; | 288 | locked += new_len - old_len; |
| 290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 289 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 291 | goto Eagain; | 290 | goto Eagain; |
| @@ -460,8 +459,11 @@ unsigned long do_mremap(unsigned long addr, | |||
| 460 | if (vma_expandable(vma, new_len - old_len)) { | 459 | if (vma_expandable(vma, new_len - old_len)) { |
| 461 | int pages = (new_len - old_len) >> PAGE_SHIFT; | 460 | int pages = (new_len - old_len) >> PAGE_SHIFT; |
| 462 | 461 | ||
| 463 | vma_adjust(vma, vma->vm_start, | 462 | if (vma_adjust(vma, vma->vm_start, addr + new_len, |
| 464 | addr + new_len, vma->vm_pgoff, NULL); | 463 | vma->vm_pgoff, NULL)) { |
| 464 | ret = -ENOMEM; | ||
| 465 | goto out; | ||
| 466 | } | ||
| 465 | 467 | ||
| 466 | mm->total_vm += pages; | 468 | mm->total_vm += pages; |
| 467 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 469 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
diff --git a/mm/nommu.c b/mm/nommu.c index 48a2ecfaf059..63fa17d121f0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 162 | } | 162 | } |
| 163 | if (vmas) | 163 | if (vmas) |
| 164 | vmas[i] = vma; | 164 | vmas[i] = vma; |
| 165 | start += PAGE_SIZE; | 165 | start = (start + PAGE_SIZE) & PAGE_MASK; |
| 166 | } | 166 | } |
| 167 | 167 | ||
| 168 | return i; | 168 | return i; |
| @@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) | |||
| 1040 | if (ret != -ENOSYS) | 1040 | if (ret != -ENOSYS) |
| 1041 | return ret; | 1041 | return ret; |
| 1042 | 1042 | ||
| 1043 | /* getting an ENOSYS error indicates that direct mmap isn't | 1043 | /* getting -ENOSYS indicates that direct mmap isn't possible (as |
| 1044 | * possible (as opposed to tried but failed) so we'll fall | 1044 | * opposed to tried but failed) so we can only give a suitable error as |
| 1045 | * through to making a private copy of the data and mapping | 1045 | * it's not possible to make a private copy if MAP_SHARED was given */ |
| 1046 | * that if we can */ | ||
| 1047 | return -ENODEV; | 1046 | return -ENODEV; |
| 1048 | } | 1047 | } |
| 1049 | 1048 | ||
| @@ -1209,7 +1208,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1209 | region->vm_flags = vm_flags; | 1208 | region->vm_flags = vm_flags; |
| 1210 | region->vm_pgoff = pgoff; | 1209 | region->vm_pgoff = pgoff; |
| 1211 | 1210 | ||
| 1212 | INIT_LIST_HEAD(&vma->anon_vma_node); | 1211 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
| 1213 | vma->vm_flags = vm_flags; | 1212 | vma->vm_flags = vm_flags; |
| 1214 | vma->vm_pgoff = pgoff; | 1213 | vma->vm_pgoff = pgoff; |
| 1215 | 1214 | ||
| @@ -1428,6 +1427,30 @@ out: | |||
| 1428 | return retval; | 1427 | return retval; |
| 1429 | } | 1428 | } |
| 1430 | 1429 | ||
| 1430 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
| 1431 | struct mmap_arg_struct { | ||
| 1432 | unsigned long addr; | ||
| 1433 | unsigned long len; | ||
| 1434 | unsigned long prot; | ||
| 1435 | unsigned long flags; | ||
| 1436 | unsigned long fd; | ||
| 1437 | unsigned long offset; | ||
| 1438 | }; | ||
| 1439 | |||
| 1440 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
| 1441 | { | ||
| 1442 | struct mmap_arg_struct a; | ||
| 1443 | |||
| 1444 | if (copy_from_user(&a, arg, sizeof(a))) | ||
| 1445 | return -EFAULT; | ||
| 1446 | if (a.offset & ~PAGE_MASK) | ||
| 1447 | return -EINVAL; | ||
| 1448 | |||
| 1449 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
| 1450 | a.offset >> PAGE_SHIFT); | ||
| 1451 | } | ||
| 1452 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
| 1453 | |||
| 1431 | /* | 1454 | /* |
| 1432 | * split a vma into two pieces at address 'addr', a new vma is allocated either | 1455 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
| 1433 | * for the first part or the tail. | 1456 | * for the first part or the tail. |
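Note: the __get_user_pages() tweak above matters when the caller passes an unaligned start: simply adding PAGE_SIZE keeps the misalignment on every iteration, while masking snaps the iterator to the next page boundary. A small worked illustration, assuming 4 KiB pages; the values are made up:

    /* With PAGE_SIZE == 4096 and PAGE_MASK == ~0xfffUL: */
    unsigned long start = 0x1234;                              /* unaligned caller value */
    unsigned long old_next = start + PAGE_SIZE;                /* 0x2234: still offset by 0x234 */
    unsigned long new_next = (start + PAGE_SIZE) & PAGE_MASK;  /* 0x2000: page aligned */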
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 237050478f28..b68e802a7a7d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/oom.h> | 18 | #include <linux/oom.h> |
| 19 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
| 20 | #include <linux/err.h> | 20 | #include <linux/err.h> |
| 21 | #include <linux/gfp.h> | ||
| 21 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
| 22 | #include <linux/swap.h> | 23 | #include <linux/swap.h> |
| 23 | #include <linux/timex.h> | 24 | #include <linux/timex.h> |
| @@ -401,8 +402,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
| 401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 402 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
| 402 | task_pid_nr(p), p->comm, | 403 | task_pid_nr(p), p->comm, |
| 403 | K(p->mm->total_vm), | 404 | K(p->mm->total_vm), |
| 404 | K(get_mm_counter(p->mm, anon_rss)), | 405 | K(get_mm_counter(p->mm, MM_ANONPAGES)), |
| 405 | K(get_mm_counter(p->mm, file_rss))); | 406 | K(get_mm_counter(p->mm, MM_FILEPAGES))); |
| 406 | task_unlock(p); | 407 | task_unlock(p); |
| 407 | 408 | ||
| 408 | /* | 409 | /* |
| @@ -473,6 +474,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
| 473 | unsigned long points = 0; | 474 | unsigned long points = 0; |
| 474 | struct task_struct *p; | 475 | struct task_struct *p; |
| 475 | 476 | ||
| 477 | if (sysctl_panic_on_oom == 2) | ||
| 478 | panic("out of memory(memcg). panic_on_oom is selected.\n"); | ||
| 476 | read_lock(&tasklist_lock); | 479 | read_lock(&tasklist_lock); |
| 477 | retry: | 480 | retry: |
| 478 | p = select_bad_process(&points, mem); | 481 | p = select_bad_process(&points, mem); |
| @@ -601,13 +604,6 @@ void pagefault_out_of_memory(void) | |||
| 601 | /* Got some memory back in the last second. */ | 604 | /* Got some memory back in the last second. */ |
| 602 | return; | 605 | return; |
| 603 | 606 | ||
| 604 | /* | ||
| 605 | * If this is from memcg, oom-killer is already invoked. | ||
| 606 | * and not worth to go system-wide-oom. | ||
| 607 | */ | ||
| 608 | if (mem_cgroup_oom_called(current)) | ||
| 609 | goto rest_and_return; | ||
| 610 | |||
| 611 | if (sysctl_panic_on_oom) | 607 | if (sysctl_panic_on_oom) |
| 612 | panic("out of memory from page fault. panic_on_oom is selected.\n"); | 608 | panic("out of memory from page fault. panic_on_oom is selected.\n"); |
| 613 | 609 | ||
| @@ -619,7 +615,6 @@ void pagefault_out_of_memory(void) | |||
| 619 | * Give "p" a good chance of killing itself before we | 615 | * Give "p" a good chance of killing itself before we |
| 620 | * retry to allocate memory. | 616 | * retry to allocate memory. |
| 621 | */ | 617 | */ |
| 622 | rest_and_return: | ||
| 623 | if (!test_thread_flag(TIF_MEMDIE)) | 618 | if (!test_thread_flag(TIF_MEMDIE)) |
| 624 | schedule_timeout_uninterruptible(1); | 619 | schedule_timeout_uninterruptible(1); |
| 625 | } | 620 | } |
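Note: the OOM-report hunk above reflects the per-mm RSS counters now being addressed by enum (MM_ANONPAGES, MM_FILEPAGES) instead of by struct field. A short hedged sketch of reading them with the same accessor the diff uses; the printk and the kB conversion are illustrative:

    unsigned long anon = get_mm_counter(mm, MM_ANONPAGES);  /* anonymous pages */
    unsigned long file = get_mm_counter(mm, MM_FILEPAGES);  /* page-cache pages */

    printk(KERN_INFO "rss: anon %lukB file %lukB\n",
           anon << (PAGE_SHIFT - 10), file << (PAGE_SHIFT - 10));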
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8deb9d0fd5b1..d03c946d5566 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -50,6 +50,7 @@ | |||
| 50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
| 51 | #include <linux/memory.h> | 51 | #include <linux/memory.h> |
| 52 | #include <trace/events/kmem.h> | 52 | #include <trace/events/kmem.h> |
| 53 | #include <linux/ftrace_event.h> | ||
| 53 | 54 | ||
| 54 | #include <asm/tlbflush.h> | 55 | #include <asm/tlbflush.h> |
| 55 | #include <asm/div64.h> | 56 | #include <asm/div64.h> |
| @@ -76,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly; | |||
| 76 | int percpu_pagelist_fraction; | 77 | int percpu_pagelist_fraction; |
| 77 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 78 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
| 78 | 79 | ||
| 80 | #ifdef CONFIG_PM_SLEEP | ||
| 81 | /* | ||
| 82 | * The following functions are used by the suspend/hibernate code to temporarily | ||
| 83 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations | ||
| 84 | * while devices are suspended. To avoid races with the suspend/hibernate code, | ||
| 85 | * they should always be called with pm_mutex held (gfp_allowed_mask also should | ||
| 86 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | ||
| 87 | * guaranteed not to run in parallel with that modification). | ||
| 88 | */ | ||
| 89 | void set_gfp_allowed_mask(gfp_t mask) | ||
| 90 | { | ||
| 91 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
| 92 | gfp_allowed_mask = mask; | ||
| 93 | } | ||
| 94 | |||
| 95 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | ||
| 96 | { | ||
| 97 | gfp_t ret = gfp_allowed_mask; | ||
| 98 | |||
| 99 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
| 100 | gfp_allowed_mask &= ~mask; | ||
| 101 | return ret; | ||
| 102 | } | ||
| 103 | #endif /* CONFIG_PM_SLEEP */ | ||
| 104 | |||
| 79 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 105 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
| 80 | int pageblock_order __read_mostly; | 106 | int pageblock_order __read_mostly; |
| 81 | #endif | 107 | #endif |
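Note: set_gfp_allowed_mask()/clear_gfp_allowed_mask() above are meant to be driven from the suspend/hibernate core so that allocations cannot touch block I/O or the filesystem while devices are quiesced. A hedged sketch of the intended caller; this is not part of the hunk, and the mask choice and surrounding code are assumptions:

    gfp_t saved_mask;

    mutex_lock(&pm_mutex);                  /* the comment above requires pm_mutex */
    saved_mask = clear_gfp_allowed_mask(__GFP_IO | __GFP_FS);
    /* ... suspend devices, enter the sleep state, resume devices ... */
    set_gfp_allowed_mask(saved_mask);
    mutex_unlock(&pm_mutex);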
| @@ -263,10 +289,7 @@ static void bad_page(struct page *page) | |||
| 263 | 289 | ||
| 264 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 290 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
| 265 | current->comm, page_to_pfn(page)); | 291 | current->comm, page_to_pfn(page)); |
| 266 | printk(KERN_ALERT | 292 | dump_page(page); |
| 267 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
| 268 | page, (void *)page->flags, page_count(page), | ||
| 269 | page_mapcount(page), page->mapping, page->index); | ||
| 270 | 293 | ||
| 271 | dump_stack(); | 294 | dump_stack(); |
| 272 | out: | 295 | out: |
| @@ -530,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
| 530 | int batch_free = 0; | 553 | int batch_free = 0; |
| 531 | 554 | ||
| 532 | spin_lock(&zone->lock); | 555 | spin_lock(&zone->lock); |
| 533 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 556 | zone->all_unreclaimable = 0; |
| 534 | zone->pages_scanned = 0; | 557 | zone->pages_scanned = 0; |
| 535 | 558 | ||
| 536 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 559 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
| @@ -568,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
| 568 | int migratetype) | 591 | int migratetype) |
| 569 | { | 592 | { |
| 570 | spin_lock(&zone->lock); | 593 | spin_lock(&zone->lock); |
| 571 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 594 | zone->all_unreclaimable = 0; |
| 572 | zone->pages_scanned = 0; | 595 | zone->pages_scanned = 0; |
| 573 | 596 | ||
| 574 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 597 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
| @@ -583,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 583 | int bad = 0; | 606 | int bad = 0; |
| 584 | int wasMlocked = __TestClearPageMlocked(page); | 607 | int wasMlocked = __TestClearPageMlocked(page); |
| 585 | 608 | ||
| 609 | trace_mm_page_free_direct(page, order); | ||
| 586 | kmemcheck_free_shadow(page, order); | 610 | kmemcheck_free_shadow(page, order); |
| 587 | 611 | ||
| 588 | for (i = 0 ; i < (1 << order) ; ++i) | 612 | for (i = 0 ; i < (1 << order) ; ++i) |
| @@ -1009,10 +1033,10 @@ static void drain_pages(unsigned int cpu) | |||
| 1009 | struct per_cpu_pageset *pset; | 1033 | struct per_cpu_pageset *pset; |
| 1010 | struct per_cpu_pages *pcp; | 1034 | struct per_cpu_pages *pcp; |
| 1011 | 1035 | ||
| 1012 | pset = zone_pcp(zone, cpu); | 1036 | local_irq_save(flags); |
| 1037 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
| 1013 | 1038 | ||
| 1014 | pcp = &pset->pcp; | 1039 | pcp = &pset->pcp; |
| 1015 | local_irq_save(flags); | ||
| 1016 | free_pcppages_bulk(zone, pcp->count, pcp); | 1040 | free_pcppages_bulk(zone, pcp->count, pcp); |
| 1017 | pcp->count = 0; | 1041 | pcp->count = 0; |
| 1018 | local_irq_restore(flags); | 1042 | local_irq_restore(flags); |
| @@ -1073,8 +1097,9 @@ void mark_free_pages(struct zone *zone) | |||
| 1073 | 1097 | ||
| 1074 | /* | 1098 | /* |
| 1075 | * Free a 0-order page | 1099 | * Free a 0-order page |
| 1100 | * cold == 1 ? free a cold page : free a hot page | ||
| 1076 | */ | 1101 | */ |
| 1077 | static void free_hot_cold_page(struct page *page, int cold) | 1102 | void free_hot_cold_page(struct page *page, int cold) |
| 1078 | { | 1103 | { |
| 1079 | struct zone *zone = page_zone(page); | 1104 | struct zone *zone = page_zone(page); |
| 1080 | struct per_cpu_pages *pcp; | 1105 | struct per_cpu_pages *pcp; |
| @@ -1082,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1082 | int migratetype; | 1107 | int migratetype; |
| 1083 | int wasMlocked = __TestClearPageMlocked(page); | 1108 | int wasMlocked = __TestClearPageMlocked(page); |
| 1084 | 1109 | ||
| 1110 | trace_mm_page_free_direct(page, 0); | ||
| 1085 | kmemcheck_free_shadow(page, 0); | 1111 | kmemcheck_free_shadow(page, 0); |
| 1086 | 1112 | ||
| 1087 | if (PageAnon(page)) | 1113 | if (PageAnon(page)) |
| @@ -1096,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1096 | arch_free_page(page, 0); | 1122 | arch_free_page(page, 0); |
| 1097 | kernel_map_pages(page, 1, 0); | 1123 | kernel_map_pages(page, 1, 0); |
| 1098 | 1124 | ||
| 1099 | pcp = &zone_pcp(zone, get_cpu())->pcp; | ||
| 1100 | migratetype = get_pageblock_migratetype(page); | 1125 | migratetype = get_pageblock_migratetype(page); |
| 1101 | set_page_private(page, migratetype); | 1126 | set_page_private(page, migratetype); |
| 1102 | local_irq_save(flags); | 1127 | local_irq_save(flags); |
| @@ -1119,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1119 | migratetype = MIGRATE_MOVABLE; | 1144 | migratetype = MIGRATE_MOVABLE; |
| 1120 | } | 1145 | } |
| 1121 | 1146 | ||
| 1147 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
| 1122 | if (cold) | 1148 | if (cold) |
| 1123 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1149 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
| 1124 | else | 1150 | else |
| @@ -1131,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1131 | 1157 | ||
| 1132 | out: | 1158 | out: |
| 1133 | local_irq_restore(flags); | 1159 | local_irq_restore(flags); |
| 1134 | put_cpu(); | ||
| 1135 | } | 1160 | } |
| 1136 | 1161 | ||
| 1137 | void free_hot_page(struct page *page) | ||
| 1138 | { | ||
| 1139 | trace_mm_page_free_direct(page, 0); | ||
| 1140 | free_hot_cold_page(page, 0); | ||
| 1141 | } | ||
| 1142 | |||
| 1143 | /* | 1162 | /* |
| 1144 | * split_page takes a non-compound higher-order page, and splits it into | 1163 | * split_page takes a non-compound higher-order page, and splits it into |
| 1145 | * n (1<<order) sub-pages: page[0..n] | 1164 | * n (1<<order) sub-pages: page[0..n] |
| @@ -1181,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
| 1181 | unsigned long flags; | 1200 | unsigned long flags; |
| 1182 | struct page *page; | 1201 | struct page *page; |
| 1183 | int cold = !!(gfp_flags & __GFP_COLD); | 1202 | int cold = !!(gfp_flags & __GFP_COLD); |
| 1184 | int cpu; | ||
| 1185 | 1203 | ||
| 1186 | again: | 1204 | again: |
| 1187 | cpu = get_cpu(); | ||
| 1188 | if (likely(order == 0)) { | 1205 | if (likely(order == 0)) { |
| 1189 | struct per_cpu_pages *pcp; | 1206 | struct per_cpu_pages *pcp; |
| 1190 | struct list_head *list; | 1207 | struct list_head *list; |
| 1191 | 1208 | ||
| 1192 | pcp = &zone_pcp(zone, cpu)->pcp; | ||
| 1193 | list = &pcp->lists[migratetype]; | ||
| 1194 | local_irq_save(flags); | 1209 | local_irq_save(flags); |
| 1210 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
| 1211 | list = &pcp->lists[migratetype]; | ||
| 1195 | if (list_empty(list)) { | 1212 | if (list_empty(list)) { |
| 1196 | pcp->count += rmqueue_bulk(zone, 0, | 1213 | pcp->count += rmqueue_bulk(zone, 0, |
| 1197 | pcp->batch, list, | 1214 | pcp->batch, list, |
| @@ -1232,7 +1249,6 @@ again: | |||
| 1232 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1249 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
| 1233 | zone_statistics(preferred_zone, zone); | 1250 | zone_statistics(preferred_zone, zone); |
| 1234 | local_irq_restore(flags); | 1251 | local_irq_restore(flags); |
| 1235 | put_cpu(); | ||
| 1236 | 1252 | ||
| 1237 | VM_BUG_ON(bad_range(zone, page)); | 1253 | VM_BUG_ON(bad_range(zone, page)); |
| 1238 | if (prep_new_page(page, order, gfp_flags)) | 1254 | if (prep_new_page(page, order, gfp_flags)) |
| @@ -1241,7 +1257,6 @@ again: | |||
| 1241 | 1257 | ||
| 1242 | failed: | 1258 | failed: |
| 1243 | local_irq_restore(flags); | 1259 | local_irq_restore(flags); |
| 1244 | put_cpu(); | ||
| 1245 | return NULL; | 1260 | return NULL; |
| 1246 | } | 1261 | } |
| 1247 | 1262 | ||
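Note: the buffered_rmqueue()/free_hot_cold_page() hunks above drop the get_cpu()/put_cpu() pair because disabling interrupts already pins the code to one CPU, and the per-cpu pageset is then reached through zone->pageset with the generic percpu API. The resulting access pattern, as a hedged fragment; zone->pageset is the new per-cpu pointer introduced by this series:

    struct per_cpu_pages *pcp;
    unsigned long flags;

    local_irq_save(flags);                   /* no preemption, no reentry from IRQ */
    pcp = &this_cpu_ptr(zone->pageset)->pcp; /* this CPU's pageset for the zone */
    /* ... add to or drain pcp->lists[], adjust pcp->count ... */
    local_irq_restore(flags);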
| @@ -2013,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec) | |||
| 2013 | void __free_pages(struct page *page, unsigned int order) | 2028 | void __free_pages(struct page *page, unsigned int order) |
| 2014 | { | 2029 | { |
| 2015 | if (put_page_testzero(page)) { | 2030 | if (put_page_testzero(page)) { |
| 2016 | trace_mm_page_free_direct(page, order); | ||
| 2017 | if (order == 0) | 2031 | if (order == 0) |
| 2018 | free_hot_page(page); | 2032 | free_hot_cold_page(page, 0); |
| 2019 | else | 2033 | else |
| 2020 | __free_pages_ok(page, order); | 2034 | __free_pages_ok(page, order); |
| 2021 | } | 2035 | } |
| @@ -2180,7 +2194,7 @@ void show_free_areas(void) | |||
| 2180 | for_each_online_cpu(cpu) { | 2194 | for_each_online_cpu(cpu) { |
| 2181 | struct per_cpu_pageset *pageset; | 2195 | struct per_cpu_pageset *pageset; |
| 2182 | 2196 | ||
| 2183 | pageset = zone_pcp(zone, cpu); | 2197 | pageset = per_cpu_ptr(zone->pageset, cpu); |
| 2184 | 2198 | ||
| 2185 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2199 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
| 2186 | cpu, pageset->pcp.high, | 2200 | cpu, pageset->pcp.high, |
| @@ -2271,7 +2285,7 @@ void show_free_areas(void) | |||
| 2271 | K(zone_page_state(zone, NR_BOUNCE)), | 2285 | K(zone_page_state(zone, NR_BOUNCE)), |
| 2272 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2286 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
| 2273 | zone->pages_scanned, | 2287 | zone->pages_scanned, |
| 2274 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2288 | (zone->all_unreclaimable ? "yes" : "no") |
| 2275 | ); | 2289 | ); |
| 2276 | printk("lowmem_reserve[]:"); | 2290 | printk("lowmem_reserve[]:"); |
| 2277 | for (i = 0; i < MAX_NR_ZONES; i++) | 2291 | for (i = 0; i < MAX_NR_ZONES; i++) |
| @@ -2745,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
| 2745 | 2759 | ||
| 2746 | #endif /* CONFIG_NUMA */ | 2760 | #endif /* CONFIG_NUMA */ |
| 2747 | 2761 | ||
| 2762 | /* | ||
| 2763 | * Boot pageset table. One per cpu which is going to be used for all | ||
| 2764 | * zones and all nodes. The parameters will be set in such a way | ||
| 2765 | * that an item put on a list will immediately be handed over to | ||
| 2766 | * the buddy list. This is safe since pageset manipulation is done | ||
| 2767 | * with interrupts disabled. | ||
| 2768 | * | ||
| 2769 | * The boot_pagesets must be kept even after bootup is complete for | ||
| 2770 | * unused processors and/or zones. They do play a role for bootstrapping | ||
| 2771 | * hotplugged processors. | ||
| 2772 | * | ||
| 2773 | * zoneinfo_show() and maybe other functions do | ||
| 2774 | * not check if the processor is online before following the pageset pointer. | ||
| 2775 | * Other parts of the kernel may not check if the zone is available. | ||
| 2776 | */ | ||
| 2777 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | ||
| 2778 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | ||
| 2779 | |||
| 2748 | /* return values int ....just for stop_machine() */ | 2780 | /* return values int ....just for stop_machine() */ |
| 2749 | static int __build_all_zonelists(void *dummy) | 2781 | static int __build_all_zonelists(void *dummy) |
| 2750 | { | 2782 | { |
| 2751 | int nid; | 2783 | int nid; |
| 2784 | int cpu; | ||
| 2752 | 2785 | ||
| 2753 | #ifdef CONFIG_NUMA | 2786 | #ifdef CONFIG_NUMA |
| 2754 | memset(node_load, 0, sizeof(node_load)); | 2787 | memset(node_load, 0, sizeof(node_load)); |
| @@ -2759,6 +2792,23 @@ static int __build_all_zonelists(void *dummy) | |||
| 2759 | build_zonelists(pgdat); | 2792 | build_zonelists(pgdat); |
| 2760 | build_zonelist_cache(pgdat); | 2793 | build_zonelist_cache(pgdat); |
| 2761 | } | 2794 | } |
| 2795 | |||
| 2796 | /* | ||
| 2797 | * Initialize the boot_pagesets that are going to be used | ||
| 2798 | * for bootstrapping processors. The real pagesets for | ||
| 2799 | * each zone will be allocated later when the per cpu | ||
| 2800 | * allocator is available. | ||
| 2801 | * | ||
| 2802 | * boot_pagesets are used also for bootstrapping offline | ||
| 2803 | * cpus if the system is already booted because the pagesets | ||
| 2804 | * are needed to initialize allocators on a specific cpu too. | ||
| 2805 | * F.e. the percpu allocator needs the page allocator which | ||
| 2806 | * needs the percpu allocator in order to allocate its pagesets | ||
| 2807 | * (a chicken-egg dilemma). | ||
| 2808 | */ | ||
| 2809 | for_each_possible_cpu(cpu) | ||
| 2810 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
| 2811 | |||
| 2762 | return 0; | 2812 | return 0; |
| 2763 | } | 2813 | } |
| 2764 | 2814 | ||
| @@ -3096,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
| 3096 | pcp->batch = PAGE_SHIFT * 8; | 3146 | pcp->batch = PAGE_SHIFT * 8; |
| 3097 | } | 3147 | } |
| 3098 | 3148 | ||
| 3099 | |||
| 3100 | #ifdef CONFIG_NUMA | ||
| 3101 | /* | ||
| 3102 | * Boot pageset table. One per cpu which is going to be used for all | ||
| 3103 | * zones and all nodes. The parameters will be set in such a way | ||
| 3104 | * that an item put on a list will immediately be handed over to | ||
| 3105 | * the buddy list. This is safe since pageset manipulation is done | ||
| 3106 | * with interrupts disabled. | ||
| 3107 | * | ||
| 3108 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
| 3109 | * | ||
| 3110 | * The boot_pagesets must be kept even after bootup is complete for | ||
| 3111 | * unused processors and/or zones. They do play a role for bootstrapping | ||
| 3112 | * hotplugged processors. | ||
| 3113 | * | ||
| 3114 | * zoneinfo_show() and maybe other functions do | ||
| 3115 | * not check if the processor is online before following the pageset pointer. | ||
| 3116 | * Other parts of the kernel may not check if the zone is available. | ||
| 3117 | */ | ||
| 3118 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | ||
| 3119 | |||
| 3120 | /* | 3149 | /* |
| 3121 | * Dynamically allocate memory for the | 3150 | * Allocate per cpu pagesets and initialize them. |
| 3122 | * per cpu pageset array in struct zone. | 3151 | * Before this call only boot pagesets were available. |
| 3152 | * Boot pagesets will no longer be used by this processor | ||
| 3153 | * after setup_per_cpu_pageset(). | ||
| 3123 | */ | 3154 | */ |
| 3124 | static int __cpuinit process_zones(int cpu) | 3155 | void __init setup_per_cpu_pageset(void) |
| 3125 | { | 3156 | { |
| 3126 | struct zone *zone, *dzone; | 3157 | struct zone *zone; |
| 3127 | int node = cpu_to_node(cpu); | 3158 | int cpu; |
| 3128 | |||
| 3129 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
| 3130 | 3159 | ||
| 3131 | for_each_populated_zone(zone) { | 3160 | for_each_populated_zone(zone) { |
| 3132 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 3161 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
| 3133 | GFP_KERNEL, node); | ||
| 3134 | if (!zone_pcp(zone, cpu)) | ||
| 3135 | goto bad; | ||
| 3136 | 3162 | ||
| 3137 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 3163 | for_each_possible_cpu(cpu) { |
| 3164 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
| 3138 | 3165 | ||
| 3139 | if (percpu_pagelist_fraction) | 3166 | setup_pageset(pcp, zone_batchsize(zone)); |
| 3140 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
| 3141 | (zone->present_pages / percpu_pagelist_fraction)); | ||
| 3142 | } | ||
| 3143 | 3167 | ||
| 3144 | return 0; | 3168 | if (percpu_pagelist_fraction) |
| 3145 | bad: | 3169 | setup_pagelist_highmark(pcp, |
| 3146 | for_each_zone(dzone) { | 3170 | (zone->present_pages / |
| 3147 | if (!populated_zone(dzone)) | 3171 | percpu_pagelist_fraction)); |
| 3148 | continue; | 3172 | } |
| 3149 | if (dzone == zone) | ||
| 3150 | break; | ||
| 3151 | kfree(zone_pcp(dzone, cpu)); | ||
| 3152 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; | ||
| 3153 | } | ||
| 3154 | return -ENOMEM; | ||
| 3155 | } | ||
| 3156 | |||
| 3157 | static inline void free_zone_pagesets(int cpu) | ||
| 3158 | { | ||
| 3159 | struct zone *zone; | ||
| 3160 | |||
| 3161 | for_each_zone(zone) { | ||
| 3162 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
| 3163 | |||
| 3164 | /* Free per_cpu_pageset if it is slab allocated */ | ||
| 3165 | if (pset != &boot_pageset[cpu]) | ||
| 3166 | kfree(pset); | ||
| 3167 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
| 3168 | } | ||
| 3169 | } | ||
| 3170 | |||
| 3171 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | ||
| 3172 | unsigned long action, | ||
| 3173 | void *hcpu) | ||
| 3174 | { | ||
| 3175 | int cpu = (long)hcpu; | ||
| 3176 | int ret = NOTIFY_OK; | ||
| 3177 | |||
| 3178 | switch (action) { | ||
| 3179 | case CPU_UP_PREPARE: | ||
| 3180 | case CPU_UP_PREPARE_FROZEN: | ||
| 3181 | if (process_zones(cpu)) | ||
| 3182 | ret = NOTIFY_BAD; | ||
| 3183 | break; | ||
| 3184 | case CPU_UP_CANCELED: | ||
| 3185 | case CPU_UP_CANCELED_FROZEN: | ||
| 3186 | case CPU_DEAD: | ||
| 3187 | case CPU_DEAD_FROZEN: | ||
| 3188 | free_zone_pagesets(cpu); | ||
| 3189 | break; | ||
| 3190 | default: | ||
| 3191 | break; | ||
| 3192 | } | 3173 | } |
| 3193 | return ret; | ||
| 3194 | } | 3174 | } |
| 3195 | 3175 | ||
| 3196 | static struct notifier_block __cpuinitdata pageset_notifier = | ||
| 3197 | { &pageset_cpuup_callback, NULL, 0 }; | ||
| 3198 | |||
| 3199 | void __init setup_per_cpu_pageset(void) | ||
| 3200 | { | ||
| 3201 | int err; | ||
| 3202 | |||
| 3203 | /* Initialize per_cpu_pageset for cpu 0. | ||
| 3204 | * A cpuup callback will do this for every cpu | ||
| 3205 | * as it comes online | ||
| 3206 | */ | ||
| 3207 | err = process_zones(smp_processor_id()); | ||
| 3208 | BUG_ON(err); | ||
| 3209 | register_cpu_notifier(&pageset_notifier); | ||
| 3210 | } | ||
| 3211 | |||
| 3212 | #endif | ||
| 3213 | |||
| 3214 | static noinline __init_refok | 3176 | static noinline __init_refok |
| 3215 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3177 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
| 3216 | { | 3178 | { |
| @@ -3260,11 +3222,11 @@ static int __zone_pcp_update(void *data) | |||
| 3260 | int cpu; | 3222 | int cpu; |
| 3261 | unsigned long batch = zone_batchsize(zone), flags; | 3223 | unsigned long batch = zone_batchsize(zone), flags; |
| 3262 | 3224 | ||
| 3263 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 3225 | for_each_possible_cpu(cpu) { |
| 3264 | struct per_cpu_pageset *pset; | 3226 | struct per_cpu_pageset *pset; |
| 3265 | struct per_cpu_pages *pcp; | 3227 | struct per_cpu_pages *pcp; |
| 3266 | 3228 | ||
| 3267 | pset = zone_pcp(zone, cpu); | 3229 | pset = per_cpu_ptr(zone->pageset, cpu); |
| 3268 | pcp = &pset->pcp; | 3230 | pcp = &pset->pcp; |
| 3269 | 3231 | ||
| 3270 | local_irq_save(flags); | 3232 | local_irq_save(flags); |
| @@ -3282,21 +3244,17 @@ void zone_pcp_update(struct zone *zone) | |||
| 3282 | 3244 | ||
| 3283 | static __meminit void zone_pcp_init(struct zone *zone) | 3245 | static __meminit void zone_pcp_init(struct zone *zone) |
| 3284 | { | 3246 | { |
| 3285 | int cpu; | 3247 | /* |
| 3286 | unsigned long batch = zone_batchsize(zone); | 3248 | * per cpu subsystem is not up at this point. The following code |
| 3249 | * relies on the ability of the linker to provide the | ||
| 3250 | * offset of a (static) per cpu variable into the per cpu area. | ||
| 3251 | */ | ||
| 3252 | zone->pageset = &boot_pageset; | ||
| 3287 | 3253 | ||
| 3288 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 3289 | #ifdef CONFIG_NUMA | ||
| 3290 | /* Early boot. Slab allocator not functional yet */ | ||
| 3291 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
| 3292 | setup_pageset(&boot_pageset[cpu],0); | ||
| 3293 | #else | ||
| 3294 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
| 3295 | #endif | ||
| 3296 | } | ||
| 3297 | if (zone->present_pages) | 3254 | if (zone->present_pages) |
| 3298 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 3255 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
| 3299 | zone->name, zone->present_pages, batch); | 3256 | zone->name, zone->present_pages, |
| 3257 | zone_batchsize(zone)); | ||
| 3300 | } | 3258 | } |
| 3301 | 3259 | ||
| 3302 | __meminit int init_currently_empty_zone(struct zone *zone, | 3260 | __meminit int init_currently_empty_zone(struct zone *zone, |
| @@ -3435,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid, | |||
| 3435 | } | 3393 | } |
| 3436 | } | 3394 | } |
| 3437 | 3395 | ||
| 3396 | int __init add_from_early_node_map(struct range *range, int az, | ||
| 3397 | int nr_range, int nid) | ||
| 3398 | { | ||
| 3399 | int i; | ||
| 3400 | u64 start, end; | ||
| 3401 | |||
| 3402 | /* need to go over early_node_map to find out good range for node */ | ||
| 3403 | for_each_active_range_index_in_nid(i, nid) { | ||
| 3404 | start = early_node_map[i].start_pfn; | ||
| 3405 | end = early_node_map[i].end_pfn; | ||
| 3406 | nr_range = add_range(range, az, nr_range, start, end); | ||
| 3407 | } | ||
| 3408 | return nr_range; | ||
| 3409 | } | ||
| 3410 | |||
| 3411 | #ifdef CONFIG_NO_BOOTMEM | ||
| 3412 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
| 3413 | u64 goal, u64 limit) | ||
| 3414 | { | ||
| 3415 | int i; | ||
| 3416 | void *ptr; | ||
| 3417 | |||
| 3418 | /* need to go over early_node_map to find out good range for node */ | ||
| 3419 | for_each_active_range_index_in_nid(i, nid) { | ||
| 3420 | u64 addr; | ||
| 3421 | u64 ei_start, ei_last; | ||
| 3422 | |||
| 3423 | ei_last = early_node_map[i].end_pfn; | ||
| 3424 | ei_last <<= PAGE_SHIFT; | ||
| 3425 | ei_start = early_node_map[i].start_pfn; | ||
| 3426 | ei_start <<= PAGE_SHIFT; | ||
| 3427 | addr = find_early_area(ei_start, ei_last, | ||
| 3428 | goal, limit, size, align); | ||
| 3429 | |||
| 3430 | if (addr == -1ULL) | ||
| 3431 | continue; | ||
| 3432 | |||
| 3433 | #if 0 | ||
| 3434 | printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", | ||
| 3435 | nid, | ||
| 3436 | ei_start, ei_last, goal, limit, size, | ||
| 3437 | align, addr); | ||
| 3438 | #endif | ||
| 3439 | |||
| 3440 | ptr = phys_to_virt(addr); | ||
| 3441 | memset(ptr, 0, size); | ||
| 3442 | reserve_early_without_check(addr, addr + size, "BOOTMEM"); | ||
| 3443 | return ptr; | ||
| 3444 | } | ||
| 3445 | |||
| 3446 | return NULL; | ||
| 3447 | } | ||
| 3448 | #endif | ||
| 3449 | |||
| 3450 | |||
| 3438 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3451 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
| 3439 | { | 3452 | { |
| 3440 | int i; | 3453 | int i; |
| @@ -4377,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 4377 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4390 | for (i = 0; i < MAX_NR_ZONES; i++) { |
| 4378 | if (i == ZONE_MOVABLE) | 4391 | if (i == ZONE_MOVABLE) |
| 4379 | continue; | 4392 | continue; |
| 4380 | printk(" %-8s %0#10lx -> %0#10lx\n", | 4393 | printk(" %-8s ", zone_names[i]); |
| 4381 | zone_names[i], | 4394 | if (arch_zone_lowest_possible_pfn[i] == |
| 4395 | arch_zone_highest_possible_pfn[i]) | ||
| 4396 | printk("empty\n"); | ||
| 4397 | else | ||
| 4398 | printk("%0#10lx -> %0#10lx\n", | ||
| 4382 | arch_zone_lowest_possible_pfn[i], | 4399 | arch_zone_lowest_possible_pfn[i], |
| 4383 | arch_zone_highest_possible_pfn[i]); | 4400 | arch_zone_highest_possible_pfn[i]); |
| 4384 | } | 4401 | } |
| @@ -4467,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
| 4467 | } | 4484 | } |
| 4468 | 4485 | ||
| 4469 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4486 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 4470 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4487 | struct pglist_data __refdata contig_page_data = { |
| 4488 | #ifndef CONFIG_NO_BOOTMEM | ||
| 4489 | .bdata = &bootmem_node_data[0] | ||
| 4490 | #endif | ||
| 4491 | }; | ||
| 4471 | EXPORT_SYMBOL(contig_page_data); | 4492 | EXPORT_SYMBOL(contig_page_data); |
| 4472 | #endif | 4493 | #endif |
| 4473 | 4494 | ||
| @@ -4810,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
| 4810 | if (!write || (ret == -EINVAL)) | 4831 | if (!write || (ret == -EINVAL)) |
| 4811 | return ret; | 4832 | return ret; |
| 4812 | for_each_populated_zone(zone) { | 4833 | for_each_populated_zone(zone) { |
| 4813 | for_each_online_cpu(cpu) { | 4834 | for_each_possible_cpu(cpu) { |
| 4814 | unsigned long high; | 4835 | unsigned long high; |
| 4815 | high = zone->present_pages / percpu_pagelist_fraction; | 4836 | high = zone->present_pages / percpu_pagelist_fraction; |
| 4816 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 4837 | setup_pagelist_highmark( |
| 4838 | per_cpu_ptr(zone->pageset, cpu), high); | ||
| 4817 | } | 4839 | } |
| 4818 | } | 4840 | } |
| 4819 | return 0; | 4841 | return 0; |
| @@ -5159,3 +5181,80 @@ bool is_free_buddy_page(struct page *page) | |||
| 5159 | return order < MAX_ORDER; | 5181 | return order < MAX_ORDER; |
| 5160 | } | 5182 | } |
| 5161 | #endif | 5183 | #endif |
| 5184 | |||
| 5185 | static struct trace_print_flags pageflag_names[] = { | ||
| 5186 | {1UL << PG_locked, "locked" }, | ||
| 5187 | {1UL << PG_error, "error" }, | ||
| 5188 | {1UL << PG_referenced, "referenced" }, | ||
| 5189 | {1UL << PG_uptodate, "uptodate" }, | ||
| 5190 | {1UL << PG_dirty, "dirty" }, | ||
| 5191 | {1UL << PG_lru, "lru" }, | ||
| 5192 | {1UL << PG_active, "active" }, | ||
| 5193 | {1UL << PG_slab, "slab" }, | ||
| 5194 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
| 5195 | {1UL << PG_arch_1, "arch_1" }, | ||
| 5196 | {1UL << PG_reserved, "reserved" }, | ||
| 5197 | {1UL << PG_private, "private" }, | ||
| 5198 | {1UL << PG_private_2, "private_2" }, | ||
| 5199 | {1UL << PG_writeback, "writeback" }, | ||
| 5200 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
| 5201 | {1UL << PG_head, "head" }, | ||
| 5202 | {1UL << PG_tail, "tail" }, | ||
| 5203 | #else | ||
| 5204 | {1UL << PG_compound, "compound" }, | ||
| 5205 | #endif | ||
| 5206 | {1UL << PG_swapcache, "swapcache" }, | ||
| 5207 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
| 5208 | {1UL << PG_reclaim, "reclaim" }, | ||
| 5209 | {1UL << PG_buddy, "buddy" }, | ||
| 5210 | {1UL << PG_swapbacked, "swapbacked" }, | ||
| 5211 | {1UL << PG_unevictable, "unevictable" }, | ||
| 5212 | #ifdef CONFIG_MMU | ||
| 5213 | {1UL << PG_mlocked, "mlocked" }, | ||
| 5214 | #endif | ||
| 5215 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
| 5216 | {1UL << PG_uncached, "uncached" }, | ||
| 5217 | #endif | ||
| 5218 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 5219 | {1UL << PG_hwpoison, "hwpoison" }, | ||
| 5220 | #endif | ||
| 5221 | {-1UL, NULL }, | ||
| 5222 | }; | ||
| 5223 | |||
| 5224 | static void dump_page_flags(unsigned long flags) | ||
| 5225 | { | ||
| 5226 | const char *delim = ""; | ||
| 5227 | unsigned long mask; | ||
| 5228 | int i; | ||
| 5229 | |||
| 5230 | printk(KERN_ALERT "page flags: %#lx(", flags); | ||
| 5231 | |||
| 5232 | /* remove zone id */ | ||
| 5233 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
| 5234 | |||
| 5235 | for (i = 0; pageflag_names[i].name && flags; i++) { | ||
| 5236 | |||
| 5237 | mask = pageflag_names[i].mask; | ||
| 5238 | if ((flags & mask) != mask) | ||
| 5239 | continue; | ||
| 5240 | |||
| 5241 | flags &= ~mask; | ||
| 5242 | printk("%s%s", delim, pageflag_names[i].name); | ||
| 5243 | delim = "|"; | ||
| 5244 | } | ||
| 5245 | |||
| 5246 | /* check for left over flags */ | ||
| 5247 | if (flags) | ||
| 5248 | printk("%s%#lx", delim, flags); | ||
| 5249 | |||
| 5250 | printk(")\n"); | ||
| 5251 | } | ||
| 5252 | |||
| 5253 | void dump_page(struct page *page) | ||
| 5254 | { | ||
| 5255 | printk(KERN_ALERT | ||
| 5256 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
| 5257 | page, page_count(page), page_mapcount(page), | ||
| 5258 | page->mapping, page->index); | ||
| 5259 | dump_page_flags(page->flags); | ||
| 5260 | } | ||
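dump_page() gives debugging code a single place to print a page's state. A hedged usage sketch (the surrounding check is hypothetical, not from this diff):

	if (unlikely(page_count(page) == 0)) {
		dump_page(page);	/* count, mapcount, mapping, index, then decoded flag names */
		VM_BUG_ON(1);
	}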
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d594826..6c0081441a32 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
| @@ -284,6 +284,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex); | |||
| 284 | struct swap_cgroup_ctrl { | 284 | struct swap_cgroup_ctrl { |
| 285 | struct page **map; | 285 | struct page **map; |
| 286 | unsigned long length; | 286 | unsigned long length; |
| 287 | spinlock_t lock; | ||
| 287 | }; | 288 | }; |
| 288 | 289 | ||
| 289 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | 290 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; |
| @@ -335,6 +336,43 @@ not_enough_page: | |||
| 335 | } | 336 | } |
| 336 | 337 | ||
| 337 | /** | 338 | /** |
| 339 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
| 340 | * @ent: swap entry to be cmpxchged | ||
| 341 | * @old: old id | ||
| 342 | * @new: new id | ||
| 343 | * | ||
| 344 | * Returns old id on success, 0 on failure. | ||
| 345 | * (There is no mem_cgroup using 0 as its id) | ||
| 346 | */ | ||
| 347 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
| 348 | unsigned short old, unsigned short new) | ||
| 349 | { | ||
| 350 | int type = swp_type(ent); | ||
| 351 | unsigned long offset = swp_offset(ent); | ||
| 352 | unsigned long idx = offset / SC_PER_PAGE; | ||
| 353 | unsigned long pos = offset & SC_POS_MASK; | ||
| 354 | struct swap_cgroup_ctrl *ctrl; | ||
| 355 | struct page *mappage; | ||
| 356 | struct swap_cgroup *sc; | ||
| 357 | unsigned long flags; | ||
| 358 | unsigned short retval; | ||
| 359 | |||
| 360 | ctrl = &swap_cgroup_ctrl[type]; | ||
| 361 | |||
| 362 | mappage = ctrl->map[idx]; | ||
| 363 | sc = page_address(mappage); | ||
| 364 | sc += pos; | ||
| 365 | spin_lock_irqsave(&ctrl->lock, flags); | ||
| 366 | retval = sc->id; | ||
| 367 | if (retval == old) | ||
| 368 | sc->id = new; | ||
| 369 | else | ||
| 370 | retval = 0; | ||
| 371 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
| 372 | return retval; | ||
| 373 | } | ||
| 374 | |||
| 375 | /** | ||
| 338 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 376 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
| 339 | * @ent: swap entry to be recorded into | 377 | * @ent: swap entry to be recorded into |
| 340 | * @mem: mem_cgroup to be recorded | 378 | * @mem: mem_cgroup to be recorded |
| @@ -352,14 +390,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
| 352 | struct page *mappage; | 390 | struct page *mappage; |
| 353 | struct swap_cgroup *sc; | 391 | struct swap_cgroup *sc; |
| 354 | unsigned short old; | 392 | unsigned short old; |
| 393 | unsigned long flags; | ||
| 355 | 394 | ||
| 356 | ctrl = &swap_cgroup_ctrl[type]; | 395 | ctrl = &swap_cgroup_ctrl[type]; |
| 357 | 396 | ||
| 358 | mappage = ctrl->map[idx]; | 397 | mappage = ctrl->map[idx]; |
| 359 | sc = page_address(mappage); | 398 | sc = page_address(mappage); |
| 360 | sc += pos; | 399 | sc += pos; |
| 400 | spin_lock_irqsave(&ctrl->lock, flags); | ||
| 361 | old = sc->id; | 401 | old = sc->id; |
| 362 | sc->id = id; | 402 | sc->id = id; |
| 403 | spin_unlock_irqrestore(&ctrl->lock, flags); | ||
| 363 | 404 | ||
| 364 | return old; | 405 | return old; |
| 365 | } | 406 | } |
| @@ -411,6 +452,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
| 411 | mutex_lock(&swap_cgroup_mutex); | 452 | mutex_lock(&swap_cgroup_mutex); |
| 412 | ctrl->length = length; | 453 | ctrl->length = length; |
| 413 | ctrl->map = array; | 454 | ctrl->map = array; |
| 455 | spin_lock_init(&ctrl->lock); | ||
| 414 | if (swap_cgroup_prepare(type)) { | 456 | if (swap_cgroup_prepare(type)) { |
| 415 | /* memory shortage */ | 457 | /* memory shortage */ |
| 416 | ctrl->map = NULL; | 458 | ctrl->map = NULL; |
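swap_cgroup_cmpxchg() exists so the memcg move-charge code (the large mm/memcontrol.c change in this series) can reassign a swap entry's owner only when nobody raced with it. A hedged sketch of the calling pattern, with the wrapper name and ids purely illustrative:

	/* returns true if we reassigned ent from old_id to new_id */
	static bool move_swap_charge(swp_entry_t ent,
				     unsigned short old_id, unsigned short new_id)
	{
		return swap_cgroup_cmpxchg(ent, old_id, new_id) == old_id;
	}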
diff --git a/mm/page_io.c b/mm/page_io.c index a19af956ee1b..31a3b962230a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | 12 | ||
| 13 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
| 14 | #include <linux/kernel_stat.h> | 14 | #include <linux/kernel_stat.h> |
| 15 | #include <linux/gfp.h> | ||
| 15 | #include <linux/pagemap.h> | 16 | #include <linux/pagemap.h> |
| 16 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
| 17 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7b47a57b6646..8b1a2ce21ee5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
| @@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, | |||
| 80 | return err; | 80 | return err; |
| 81 | } | 81 | } |
| 82 | 82 | ||
| 83 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 84 | static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, | ||
| 85 | unsigned long end) | ||
| 86 | { | ||
| 87 | unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); | ||
| 88 | return boundary < end ? boundary : end; | ||
| 89 | } | ||
| 90 | |||
| 91 | static int walk_hugetlb_range(struct vm_area_struct *vma, | ||
| 92 | unsigned long addr, unsigned long end, | ||
| 93 | struct mm_walk *walk) | ||
| 94 | { | ||
| 95 | struct hstate *h = hstate_vma(vma); | ||
| 96 | unsigned long next; | ||
| 97 | unsigned long hmask = huge_page_mask(h); | ||
| 98 | pte_t *pte; | ||
| 99 | int err = 0; | ||
| 100 | |||
| 101 | do { | ||
| 102 | next = hugetlb_entry_end(h, addr, end); | ||
| 103 | pte = huge_pte_offset(walk->mm, addr & hmask); | ||
| 104 | if (pte && walk->hugetlb_entry) | ||
| 105 | err = walk->hugetlb_entry(pte, hmask, addr, next, walk); | ||
| 106 | if (err) | ||
| 107 | return err; | ||
| 108 | } while (addr = next, addr != end); | ||
| 109 | |||
| 110 | return 0; | ||
| 111 | } | ||
| 112 | #endif | ||
| 113 | |||
| 83 | /** | 114 | /** |
| 84 | * walk_page_range - walk a memory map's page tables with a callback | 115 | * walk_page_range - walk a memory map's page tables with a callback |
| 85 | * @mm: memory map to walk | 116 | * @mm: memory map to walk |
| @@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
| 128 | vma = find_vma(walk->mm, addr); | 159 | vma = find_vma(walk->mm, addr); |
| 129 | #ifdef CONFIG_HUGETLB_PAGE | 160 | #ifdef CONFIG_HUGETLB_PAGE |
| 130 | if (vma && is_vm_hugetlb_page(vma)) { | 161 | if (vma && is_vm_hugetlb_page(vma)) { |
| 131 | pte_t *pte; | ||
| 132 | struct hstate *hs; | ||
| 133 | |||
| 134 | if (vma->vm_end < next) | 162 | if (vma->vm_end < next) |
| 135 | next = vma->vm_end; | 163 | next = vma->vm_end; |
| 136 | hs = hstate_vma(vma); | 164 | /* |
| 137 | pte = huge_pte_offset(walk->mm, | 165 | * Hugepage is very tightly coupled with vma, so |
| 138 | addr & huge_page_mask(hs)); | 166 | * walk through hugetlb entries within a given vma. |
| 139 | if (pte && !huge_pte_none(huge_ptep_get(pte)) | 167 | */ |
| 140 | && walk->hugetlb_entry) | 168 | err = walk_hugetlb_range(vma, addr, next, walk); |
| 141 | err = walk->hugetlb_entry(pte, addr, | ||
| 142 | next, walk); | ||
| 143 | if (err) | 169 | if (err) |
| 144 | break; | 170 | break; |
| 171 | pgd = pgd_offset(walk->mm, next); | ||
| 145 | continue; | 172 | continue; |
| 146 | } | 173 | } |
| 147 | #endif | 174 | #endif |
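The hugetlb_entry callback now receives the huge page mask and is invoked once per hugepage within the VMA, rather than once per walk_page_range() chunk. A sketch of a walker that counts present hugetlb entries, assuming the usual mm_walk ->private cookie (the function is illustrative, not part of this patch):

	static int count_hugetlb(pte_t *pte, unsigned long hmask,
				 unsigned long addr, unsigned long next,
				 struct mm_walk *walk)
	{
		unsigned long *pages = walk->private;

		if (!huge_pte_none(huge_ptep_get(pte)))
			(*pages)++;
		return 0;
	}

It would be wired up as .hugetlb_entry = count_hugetlb in the struct mm_walk passed to walk_page_range(), with .private pointing at the counter.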
diff --git a/mm/percpu.c b/mm/percpu.c index 083e7c91e5f6..6e09741ddc62 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -80,13 +80,15 @@ | |||
| 80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ | 80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ |
| 81 | #ifndef __addr_to_pcpu_ptr | 81 | #ifndef __addr_to_pcpu_ptr |
| 82 | #define __addr_to_pcpu_ptr(addr) \ | 82 | #define __addr_to_pcpu_ptr(addr) \ |
| 83 | (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ | 83 | (void __percpu *)((unsigned long)(addr) - \ |
| 84 | + (unsigned long)__per_cpu_start) | 84 | (unsigned long)pcpu_base_addr + \ |
| 85 | (unsigned long)__per_cpu_start) | ||
| 85 | #endif | 86 | #endif |
| 86 | #ifndef __pcpu_ptr_to_addr | 87 | #ifndef __pcpu_ptr_to_addr |
| 87 | #define __pcpu_ptr_to_addr(ptr) \ | 88 | #define __pcpu_ptr_to_addr(ptr) \ |
| 88 | (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ | 89 | (void __force *)((unsigned long)(ptr) + \ |
| 89 | - (unsigned long)__per_cpu_start) | 90 | (unsigned long)pcpu_base_addr - \ |
| 91 | (unsigned long)__per_cpu_start) | ||
| 90 | #endif | 92 | #endif |
| 91 | 93 | ||
| 92 | struct pcpu_chunk { | 94 | struct pcpu_chunk { |
| @@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
| 913 | int rs, re; | 915 | int rs, re; |
| 914 | 916 | ||
| 915 | /* quick path, check whether it's empty already */ | 917 | /* quick path, check whether it's empty already */ |
| 916 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 918 | rs = page_start; |
| 917 | if (rs == page_start && re == page_end) | 919 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
| 918 | return; | 920 | if (rs == page_start && re == page_end) |
| 919 | break; | 921 | return; |
| 920 | } | ||
| 921 | 922 | ||
| 922 | /* immutable chunks can't be depopulated */ | 923 | /* immutable chunks can't be depopulated */ |
| 923 | WARN_ON(chunk->immutable); | 924 | WARN_ON(chunk->immutable); |
| @@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
| 968 | int rs, re, rc; | 969 | int rs, re, rc; |
| 969 | 970 | ||
| 970 | /* quick path, check whether all pages are already there */ | 971 | /* quick path, check whether all pages are already there */ |
| 971 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { | 972 | rs = page_start; |
| 972 | if (rs == page_start && re == page_end) | 973 | pcpu_next_pop(chunk, &rs, &re, page_end); |
| 973 | goto clear; | 974 | if (rs == page_start && re == page_end) |
| 974 | break; | 975 | goto clear; |
| 975 | } | ||
| 976 | 976 | ||
| 977 | /* need to allocate and map pages, this chunk can't be immutable */ | 977 | /* need to allocate and map pages, this chunk can't be immutable */ |
| 978 | WARN_ON(chunk->immutable); | 978 | WARN_ON(chunk->immutable); |
| @@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
| 1067 | * RETURNS: | 1067 | * RETURNS: |
| 1068 | * Percpu pointer to the allocated area on success, NULL on failure. | 1068 | * Percpu pointer to the allocated area on success, NULL on failure. |
| 1069 | */ | 1069 | */ |
| 1070 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) | 1070 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) |
| 1071 | { | 1071 | { |
| 1072 | static int warn_limit = 10; | 1072 | static int warn_limit = 10; |
| 1073 | struct pcpu_chunk *chunk; | 1073 | struct pcpu_chunk *chunk; |
| @@ -1196,7 +1196,7 @@ fail_unlock_mutex: | |||
| 1196 | * RETURNS: | 1196 | * RETURNS: |
| 1197 | * Percpu pointer to the allocated area on success, NULL on failure. | 1197 | * Percpu pointer to the allocated area on success, NULL on failure. |
| 1198 | */ | 1198 | */ |
| 1199 | void *__alloc_percpu(size_t size, size_t align) | 1199 | void __percpu *__alloc_percpu(size_t size, size_t align) |
| 1200 | { | 1200 | { |
| 1201 | return pcpu_alloc(size, align, false); | 1201 | return pcpu_alloc(size, align, false); |
| 1202 | } | 1202 | } |
| @@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); | |||
| 1217 | * RETURNS: | 1217 | * RETURNS: |
| 1218 | * Percpu pointer to the allocated area on success, NULL on failure. | 1218 | * Percpu pointer to the allocated area on success, NULL on failure. |
| 1219 | */ | 1219 | */ |
| 1220 | void *__alloc_reserved_percpu(size_t size, size_t align) | 1220 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) |
| 1221 | { | 1221 | { |
| 1222 | return pcpu_alloc(size, align, true); | 1222 | return pcpu_alloc(size, align, true); |
| 1223 | } | 1223 | } |
| @@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work) | |||
| 1269 | * CONTEXT: | 1269 | * CONTEXT: |
| 1270 | * Can be called from atomic context. | 1270 | * Can be called from atomic context. |
| 1271 | */ | 1271 | */ |
| 1272 | void free_percpu(void *ptr) | 1272 | void free_percpu(void __percpu *ptr) |
| 1273 | { | 1273 | { |
| 1274 | void *addr; | 1274 | void *addr; |
| 1275 | struct pcpu_chunk *chunk; | 1275 | struct pcpu_chunk *chunk; |
| @@ -1304,6 +1304,32 @@ void free_percpu(void *ptr) | |||
| 1304 | EXPORT_SYMBOL_GPL(free_percpu); | 1304 | EXPORT_SYMBOL_GPL(free_percpu); |
| 1305 | 1305 | ||
| 1306 | /** | 1306 | /** |
| 1307 | * is_kernel_percpu_address - test whether address is from static percpu area | ||
| 1308 | * @addr: address to test | ||
| 1309 | * | ||
| 1310 | * Test whether @addr belongs to in-kernel static percpu area. Module | ||
| 1311 | * static percpu areas are not considered. For those, use | ||
| 1312 | * is_module_percpu_address(). | ||
| 1313 | * | ||
| 1314 | * RETURNS: | ||
| 1315 | * %true if @addr is from in-kernel static percpu area, %false otherwise. | ||
| 1316 | */ | ||
| 1317 | bool is_kernel_percpu_address(unsigned long addr) | ||
| 1318 | { | ||
| 1319 | const size_t static_size = __per_cpu_end - __per_cpu_start; | ||
| 1320 | void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); | ||
| 1321 | unsigned int cpu; | ||
| 1322 | |||
| 1323 | for_each_possible_cpu(cpu) { | ||
| 1324 | void *start = per_cpu_ptr(base, cpu); | ||
| 1325 | |||
| 1326 | if ((void *)addr >= start && (void *)addr < start + static_size) | ||
| 1327 | return true; | ||
| 1328 | } | ||
| 1329 | return false; | ||
| 1330 | } | ||
| 1331 | |||
| 1332 | /** | ||
| 1307 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address | 1333 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address |
| 1308 | * @addr: the address to be converted to physical address | 1334 | * @addr: the address to be converted to physical address |
| 1309 | * | 1335 | * |
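is_kernel_percpu_address() lets address-classification code (lockdep-style static object checks, for example) recognize static percpu variables whose addresses fall outside the normal kernel data range. A hedged sketch of a caller:

	/* skip objects living in the kernel's static percpu area; module
	 * percpu areas would need is_module_percpu_address() instead */
	if (is_kernel_percpu_address((unsigned long)ptr))
		return;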
diff --git a/mm/percpu_up.c b/mm/percpu_up.c new file mode 100644 index 000000000000..c4351c7f57d2 --- /dev/null +++ b/mm/percpu_up.c | |||
| @@ -0,0 +1,30 @@ | |||
| 1 | /* | ||
| 2 | * mm/percpu_up.c - dummy percpu memory allocator implementation for UP | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <linux/module.h> | ||
| 6 | #include <linux/percpu.h> | ||
| 7 | #include <linux/slab.h> | ||
| 8 | |||
| 9 | void __percpu *__alloc_percpu(size_t size, size_t align) | ||
| 10 | { | ||
| 11 | /* | ||
| 12 | * Can't easily make larger alignment work with kmalloc. WARN | ||
| 13 | * on it. Larger alignment should only be used for module | ||
| 14 | * percpu sections on SMP for which this path isn't used. | ||
| 15 | */ | ||
| 16 | WARN_ON_ONCE(align > SMP_CACHE_BYTES); | ||
| 17 | return kzalloc(size, GFP_KERNEL); | ||
| 18 | } | ||
| 19 | EXPORT_SYMBOL_GPL(__alloc_percpu); | ||
| 20 | |||
| 21 | void free_percpu(void __percpu *p) | ||
| 22 | { | ||
| 23 | kfree(p); | ||
| 24 | } | ||
| 25 | EXPORT_SYMBOL_GPL(free_percpu); | ||
| 26 | |||
| 27 | phys_addr_t per_cpu_ptr_to_phys(void *addr) | ||
| 28 | { | ||
| 29 | return __pa(addr); | ||
| 30 | } | ||
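On UP the allocator degenerates to kzalloc()/kfree(), but callers keep the same calling convention as on SMP, so code built for either configuration reads identically. A hedged usage fragment (the struct and variable names are illustrative):

	struct counters {
		unsigned long events;
	};
	struct counters __percpu *c;

	c = __alloc_percpu(sizeof(*c), __alignof__(*c));
	if (!c)
		return -ENOMEM;
	/* per_cpu_ptr(c, cpu) and this_cpu_ptr(c) work as on SMP */
	free_percpu(c);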
diff --git a/mm/quicklist.c b/mm/quicklist.c index 6633965bb27b..2876349339a7 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | */ | 14 | */ |
| 15 | #include <linux/kernel.h> | 15 | #include <linux/kernel.h> |
| 16 | 16 | ||
| 17 | #include <linux/gfp.h> | ||
| 17 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
| 18 | #include <linux/mmzone.h> | 19 | #include <linux/mmzone.h> |
| 19 | #include <linux/module.h> | 20 | #include <linux/module.h> |
diff --git a/mm/readahead.c b/mm/readahead.c index 033bc135a41f..dfa9a1a03a11 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
| 11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
| 12 | #include <linux/gfp.h> | ||
| 12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
| 13 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 14 | #include <linux/blkdev.h> | 15 | #include <linux/blkdev.h> |
| @@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping, | |||
| 501 | if (!ra->ra_pages) | 502 | if (!ra->ra_pages) |
| 502 | return; | 503 | return; |
| 503 | 504 | ||
| 505 | /* be dumb */ | ||
| 506 | if (filp && (filp->f_mode & FMODE_RANDOM)) { | ||
| 507 | force_page_cache_readahead(mapping, filp, offset, req_size); | ||
| 508 | return; | ||
| 509 | } | ||
| 510 | |||
| 504 | /* do read-ahead */ | 511 | /* do read-ahead */ |
| 505 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); | 512 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); |
| 506 | } | 513 | } |
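FMODE_RANDOM is the per-file flag that the new "be dumb" branch tests; in this series it is presumably set from the mm/fadvise.c change when userspace calls posix_fadvise(POSIX_FADV_RANDOM). A hedged sketch of the assumed setter side:

	/* roughly what the fadvise(POSIX_FADV_RANDOM) path is expected to do */
	spin_lock(&filp->f_lock);
	filp->f_mode |= FMODE_RANDOM;
	spin_unlock(&filp->f_lock);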
| @@ -62,6 +62,7 @@ | |||
| 62 | #include "internal.h" | 62 | #include "internal.h" |
| 63 | 63 | ||
| 64 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
| 65 | static struct kmem_cache *anon_vma_chain_cachep; | ||
| 65 | 66 | ||
| 66 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
| 67 | { | 68 | { |
| @@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma) | |||
| 73 | kmem_cache_free(anon_vma_cachep, anon_vma); | 74 | kmem_cache_free(anon_vma_cachep, anon_vma); |
| 74 | } | 75 | } |
| 75 | 76 | ||
| 77 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | ||
| 78 | { | ||
| 79 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | ||
| 80 | } | ||
| 81 | |||
| 82 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | ||
| 83 | { | ||
| 84 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | ||
| 85 | } | ||
| 86 | |||
| 76 | /** | 87 | /** |
| 77 | * anon_vma_prepare - attach an anon_vma to a memory region | 88 | * anon_vma_prepare - attach an anon_vma to a memory region |
| 78 | * @vma: the memory region in question | 89 | * @vma: the memory region in question |
| @@ -103,73 +114,140 @@ void anon_vma_free(struct anon_vma *anon_vma) | |||
| 103 | int anon_vma_prepare(struct vm_area_struct *vma) | 114 | int anon_vma_prepare(struct vm_area_struct *vma) |
| 104 | { | 115 | { |
| 105 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
| 117 | struct anon_vma_chain *avc; | ||
| 106 | 118 | ||
| 107 | might_sleep(); | 119 | might_sleep(); |
| 108 | if (unlikely(!anon_vma)) { | 120 | if (unlikely(!anon_vma)) { |
| 109 | struct mm_struct *mm = vma->vm_mm; | 121 | struct mm_struct *mm = vma->vm_mm; |
| 110 | struct anon_vma *allocated; | 122 | struct anon_vma *allocated; |
| 111 | 123 | ||
| 124 | avc = anon_vma_chain_alloc(); | ||
| 125 | if (!avc) | ||
| 126 | goto out_enomem; | ||
| 127 | |||
| 112 | anon_vma = find_mergeable_anon_vma(vma); | 128 | anon_vma = find_mergeable_anon_vma(vma); |
| 113 | allocated = NULL; | 129 | allocated = NULL; |
| 114 | if (!anon_vma) { | 130 | if (!anon_vma) { |
| 115 | anon_vma = anon_vma_alloc(); | 131 | anon_vma = anon_vma_alloc(); |
| 116 | if (unlikely(!anon_vma)) | 132 | if (unlikely(!anon_vma)) |
| 117 | return -ENOMEM; | 133 | goto out_enomem_free_avc; |
| 118 | allocated = anon_vma; | 134 | allocated = anon_vma; |
| 119 | } | 135 | } |
| 120 | spin_lock(&anon_vma->lock); | ||
| 121 | 136 | ||
| 137 | spin_lock(&anon_vma->lock); | ||
| 122 | /* page_table_lock to protect against threads */ | 138 | /* page_table_lock to protect against threads */ |
| 123 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
| 124 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
| 125 | vma->anon_vma = anon_vma; | 141 | vma->anon_vma = anon_vma; |
| 126 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 142 | avc->anon_vma = anon_vma; |
| 143 | avc->vma = vma; | ||
| 144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
| 145 | list_add(&avc->same_anon_vma, &anon_vma->head); | ||
| 127 | allocated = NULL; | 146 | allocated = NULL; |
| 147 | avc = NULL; | ||
| 128 | } | 148 | } |
| 129 | spin_unlock(&mm->page_table_lock); | 149 | spin_unlock(&mm->page_table_lock); |
| 130 | |||
| 131 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
| 151 | |||
| 132 | if (unlikely(allocated)) | 152 | if (unlikely(allocated)) |
| 133 | anon_vma_free(allocated); | 153 | anon_vma_free(allocated); |
| 154 | if (unlikely(avc)) | ||
| 155 | anon_vma_chain_free(avc); | ||
| 134 | } | 156 | } |
| 135 | return 0; | 157 | return 0; |
| 158 | |||
| 159 | out_enomem_free_avc: | ||
| 160 | anon_vma_chain_free(avc); | ||
| 161 | out_enomem: | ||
| 162 | return -ENOMEM; | ||
| 136 | } | 163 | } |
| 137 | 164 | ||
| 138 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 165 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
| 166 | struct anon_vma_chain *avc, | ||
| 167 | struct anon_vma *anon_vma) | ||
| 139 | { | 168 | { |
| 140 | BUG_ON(vma->anon_vma != next->anon_vma); | 169 | avc->vma = vma; |
| 141 | list_del(&next->anon_vma_node); | 170 | avc->anon_vma = anon_vma; |
| 171 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
| 172 | |||
| 173 | spin_lock(&anon_vma->lock); | ||
| 174 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
| 175 | spin_unlock(&anon_vma->lock); | ||
| 142 | } | 176 | } |
| 143 | 177 | ||
| 144 | void __anon_vma_link(struct vm_area_struct *vma) | 178 | /* |
| 179 | * Attach the anon_vmas from src to dst. | ||
| 180 | * Returns 0 on success, -ENOMEM on failure. | ||
| 181 | */ | ||
| 182 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | ||
| 145 | { | 183 | { |
| 146 | struct anon_vma *anon_vma = vma->anon_vma; | 184 | struct anon_vma_chain *avc, *pavc; |
| 147 | 185 | ||
| 148 | if (anon_vma) | 186 | list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { |
| 149 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 187 | avc = anon_vma_chain_alloc(); |
| 188 | if (!avc) | ||
| 189 | goto enomem_failure; | ||
| 190 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | ||
| 191 | } | ||
| 192 | return 0; | ||
| 193 | |||
| 194 | enomem_failure: | ||
| 195 | unlink_anon_vmas(dst); | ||
| 196 | return -ENOMEM; | ||
| 150 | } | 197 | } |
| 151 | 198 | ||
| 152 | void anon_vma_link(struct vm_area_struct *vma) | 199 | /* |
| 200 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | ||
| 201 | * the corresponding VMA in the parent process is attached to. | ||
| 202 | * Returns 0 on success, non-zero on failure. | ||
| 203 | */ | ||
| 204 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | ||
| 153 | { | 205 | { |
| 154 | struct anon_vma *anon_vma = vma->anon_vma; | 206 | struct anon_vma_chain *avc; |
| 207 | struct anon_vma *anon_vma; | ||
| 155 | 208 | ||
| 156 | if (anon_vma) { | 209 | /* Don't bother if the parent process has no anon_vma here. */ |
| 157 | spin_lock(&anon_vma->lock); | 210 | if (!pvma->anon_vma) |
| 158 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 211 | return 0; |
| 159 | spin_unlock(&anon_vma->lock); | 212 | |
| 160 | } | 213 | /* |
| 214 | * First, attach the new VMA to the parent VMA's anon_vmas, | ||
| 215 | * so rmap can find non-COWed pages in child processes. | ||
| 216 | */ | ||
| 217 | if (anon_vma_clone(vma, pvma)) | ||
| 218 | return -ENOMEM; | ||
| 219 | |||
| 220 | /* Then add our own anon_vma. */ | ||
| 221 | anon_vma = anon_vma_alloc(); | ||
| 222 | if (!anon_vma) | ||
| 223 | goto out_error; | ||
| 224 | avc = anon_vma_chain_alloc(); | ||
| 225 | if (!avc) | ||
| 226 | goto out_error_free_anon_vma; | ||
| 227 | anon_vma_chain_link(vma, avc, anon_vma); | ||
| 228 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | ||
| 229 | vma->anon_vma = anon_vma; | ||
| 230 | |||
| 231 | return 0; | ||
| 232 | |||
| 233 | out_error_free_anon_vma: | ||
| 234 | anon_vma_free(anon_vma); | ||
| 235 | out_error: | ||
| 236 | unlink_anon_vmas(vma); | ||
| 237 | return -ENOMEM; | ||
| 161 | } | 238 | } |
| 162 | 239 | ||
| 163 | void anon_vma_unlink(struct vm_area_struct *vma) | 240 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) |
| 164 | { | 241 | { |
| 165 | struct anon_vma *anon_vma = vma->anon_vma; | 242 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; |
| 166 | int empty; | 243 | int empty; |
| 167 | 244 | ||
| 245 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
| 168 | if (!anon_vma) | 246 | if (!anon_vma) |
| 169 | return; | 247 | return; |
| 170 | 248 | ||
| 171 | spin_lock(&anon_vma->lock); | 249 | spin_lock(&anon_vma->lock); |
| 172 | list_del(&vma->anon_vma_node); | 250 | list_del(&anon_vma_chain->same_anon_vma); |
| 173 | 251 | ||
| 174 | /* We must garbage collect the anon_vma if it's empty */ | 252 | /* We must garbage collect the anon_vma if it's empty */ |
| 175 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); | 253 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
| @@ -179,6 +257,18 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
| 179 | anon_vma_free(anon_vma); | 257 | anon_vma_free(anon_vma); |
| 180 | } | 258 | } |
| 181 | 259 | ||
| 260 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
| 261 | { | ||
| 262 | struct anon_vma_chain *avc, *next; | ||
| 263 | |||
| 264 | /* Unlink each anon_vma chained to the VMA. */ | ||
| 265 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
| 266 | anon_vma_unlink(avc); | ||
| 267 | list_del(&avc->same_vma); | ||
| 268 | anon_vma_chain_free(avc); | ||
| 269 | } | ||
| 270 | } | ||
| 271 | |||
| 182 | static void anon_vma_ctor(void *data) | 272 | static void anon_vma_ctor(void *data) |
| 183 | { | 273 | { |
| 184 | struct anon_vma *anon_vma = data; | 274 | struct anon_vma *anon_vma = data; |
| @@ -192,6 +282,7 @@ void __init anon_vma_init(void) | |||
| 192 | { | 282 | { |
| 193 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 283 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
| 194 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 284 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
| 285 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | ||
| 195 | } | 286 | } |
| 196 | 287 | ||
| 197 | /* | 288 | /* |
| @@ -396,7 +487,7 @@ static int page_referenced_anon(struct page *page, | |||
| 396 | { | 487 | { |
| 397 | unsigned int mapcount; | 488 | unsigned int mapcount; |
| 398 | struct anon_vma *anon_vma; | 489 | struct anon_vma *anon_vma; |
| 399 | struct vm_area_struct *vma; | 490 | struct anon_vma_chain *avc; |
| 400 | int referenced = 0; | 491 | int referenced = 0; |
| 401 | 492 | ||
| 402 | anon_vma = page_lock_anon_vma(page); | 493 | anon_vma = page_lock_anon_vma(page); |
| @@ -404,7 +495,8 @@ static int page_referenced_anon(struct page *page, | |||
| 404 | return referenced; | 495 | return referenced; |
| 405 | 496 | ||
| 406 | mapcount = page_mapcount(page); | 497 | mapcount = page_mapcount(page); |
| 407 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 498 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
| 499 | struct vm_area_struct *vma = avc->vma; | ||
| 408 | unsigned long address = vma_address(page, vma); | 500 | unsigned long address = vma_address(page, vma); |
| 409 | if (address == -EFAULT) | 501 | if (address == -EFAULT) |
| 410 | continue; | 502 | continue; |
| @@ -511,9 +603,6 @@ int page_referenced(struct page *page, | |||
| 511 | int referenced = 0; | 603 | int referenced = 0; |
| 512 | int we_locked = 0; | 604 | int we_locked = 0; |
| 513 | 605 | ||
| 514 | if (TestClearPageReferenced(page)) | ||
| 515 | referenced++; | ||
| 516 | |||
| 517 | *vm_flags = 0; | 606 | *vm_flags = 0; |
| 518 | if (page_mapped(page) && page_rmapping(page)) { | 607 | if (page_mapped(page) && page_rmapping(page)) { |
| 519 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 608 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
| @@ -614,17 +703,57 @@ int page_mkclean(struct page *page) | |||
| 614 | EXPORT_SYMBOL_GPL(page_mkclean); | 703 | EXPORT_SYMBOL_GPL(page_mkclean); |
| 615 | 704 | ||
| 616 | /** | 705 | /** |
| 706 | * page_move_anon_rmap - move a page to our anon_vma | ||
| 707 | * @page: the page to move to our anon_vma | ||
| 708 | * @vma: the vma the page belongs to | ||
| 709 | * @address: the user virtual address mapped | ||
| 710 | * | ||
| 711 | * When a page belongs exclusively to one process after a COW event, | ||
| 712 | * that page can be moved into the anon_vma that belongs to just that | ||
| 713 | * process, so the rmap code will not search the parent or sibling | ||
| 714 | * processes. | ||
| 715 | */ | ||
| 716 | void page_move_anon_rmap(struct page *page, | ||
| 717 | struct vm_area_struct *vma, unsigned long address) | ||
| 718 | { | ||
| 719 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 720 | |||
| 721 | VM_BUG_ON(!PageLocked(page)); | ||
| 722 | VM_BUG_ON(!anon_vma); | ||
| 723 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | ||
| 724 | |||
| 725 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
| 726 | page->mapping = (struct address_space *) anon_vma; | ||
| 727 | } | ||
| 728 | |||
| 729 | /** | ||
| 617 | * __page_set_anon_rmap - setup new anonymous rmap | 730 | * __page_set_anon_rmap - setup new anonymous rmap |
| 618 | * @page: the page to add the mapping to | 731 | * @page: the page to add the mapping to |
| 619 | * @vma: the vm area in which the mapping is added | 732 | * @vma: the vm area in which the mapping is added |
| 620 | * @address: the user virtual address mapped | 733 | * @address: the user virtual address mapped |
| 734 | * @exclusive: the page is exclusively owned by the current process | ||
| 621 | */ | 735 | */ |
| 622 | static void __page_set_anon_rmap(struct page *page, | 736 | static void __page_set_anon_rmap(struct page *page, |
| 623 | struct vm_area_struct *vma, unsigned long address) | 737 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
| 624 | { | 738 | { |
| 625 | struct anon_vma *anon_vma = vma->anon_vma; | 739 | struct anon_vma *anon_vma = vma->anon_vma; |
| 626 | 740 | ||
| 627 | BUG_ON(!anon_vma); | 741 | BUG_ON(!anon_vma); |
| 742 | |||
| 743 | /* | ||
| 744 | * If the page isn't exclusively mapped into this vma, | ||
| 745 | * we must use the _oldest_ possible anon_vma for the | ||
| 746 | * page mapping! | ||
| 747 | * | ||
| 748 | * So take the last AVC chain entry in the vma, which is | ||
| 749 | * the deepest ancestor, and use the anon_vma from that. | ||
| 750 | */ | ||
| 751 | if (!exclusive) { | ||
| 752 | struct anon_vma_chain *avc; | ||
| 753 | avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); | ||
| 754 | anon_vma = avc->anon_vma; | ||
| 755 | } | ||
| 756 | |||
| 628 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 757 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
| 629 | page->mapping = (struct address_space *) anon_vma; | 758 | page->mapping = (struct address_space *) anon_vma; |
| 630 | page->index = linear_page_index(vma, address); | 759 | page->index = linear_page_index(vma, address); |
| @@ -652,9 +781,6 @@ static void __page_check_anon_rmap(struct page *page, | |||
| 652 | * are initially only visible via the pagetables, and the pte is locked | 781 | * are initially only visible via the pagetables, and the pte is locked |
| 653 | * over the call to page_add_new_anon_rmap. | 782 | * over the call to page_add_new_anon_rmap. |
| 654 | */ | 783 | */ |
| 655 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 656 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
| 657 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
| 658 | BUG_ON(page->index != linear_page_index(vma, address)); | 784 | BUG_ON(page->index != linear_page_index(vma, address)); |
| 659 | #endif | 785 | #endif |
| 660 | } | 786 | } |
| @@ -682,7 +808,7 @@ void page_add_anon_rmap(struct page *page, | |||
| 682 | VM_BUG_ON(!PageLocked(page)); | 808 | VM_BUG_ON(!PageLocked(page)); |
| 683 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 809 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 684 | if (first) | 810 | if (first) |
| 685 | __page_set_anon_rmap(page, vma, address); | 811 | __page_set_anon_rmap(page, vma, address, 0); |
| 686 | else | 812 | else |
| 687 | __page_check_anon_rmap(page, vma, address); | 813 | __page_check_anon_rmap(page, vma, address); |
| 688 | } | 814 | } |
| @@ -704,7 +830,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 704 | SetPageSwapBacked(page); | 830 | SetPageSwapBacked(page); |
| 705 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 831 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
| 706 | __inc_zone_page_state(page, NR_ANON_PAGES); | 832 | __inc_zone_page_state(page, NR_ANON_PAGES); |
| 707 | __page_set_anon_rmap(page, vma, address); | 833 | __page_set_anon_rmap(page, vma, address, 1); |
| 708 | if (page_evictable(page, vma)) | 834 | if (page_evictable(page, vma)) |
| 709 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 835 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
| 710 | else | 836 | else |
| @@ -815,9 +941,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 815 | 941 | ||
| 816 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 942 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
| 817 | if (PageAnon(page)) | 943 | if (PageAnon(page)) |
| 818 | dec_mm_counter(mm, anon_rss); | 944 | dec_mm_counter(mm, MM_ANONPAGES); |
| 819 | else | 945 | else |
| 820 | dec_mm_counter(mm, file_rss); | 946 | dec_mm_counter(mm, MM_FILEPAGES); |
| 821 | set_pte_at(mm, address, pte, | 947 | set_pte_at(mm, address, pte, |
| 822 | swp_entry_to_pte(make_hwpoison_entry(page))); | 948 | swp_entry_to_pte(make_hwpoison_entry(page))); |
| 823 | } else if (PageAnon(page)) { | 949 | } else if (PageAnon(page)) { |
| @@ -839,7 +965,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 839 | list_add(&mm->mmlist, &init_mm.mmlist); | 965 | list_add(&mm->mmlist, &init_mm.mmlist); |
| 840 | spin_unlock(&mmlist_lock); | 966 | spin_unlock(&mmlist_lock); |
| 841 | } | 967 | } |
| 842 | dec_mm_counter(mm, anon_rss); | 968 | dec_mm_counter(mm, MM_ANONPAGES); |
| 969 | inc_mm_counter(mm, MM_SWAPENTS); | ||
| 843 | } else if (PAGE_MIGRATION) { | 970 | } else if (PAGE_MIGRATION) { |
| 844 | /* | 971 | /* |
| 845 | * Store the pfn of the page in a special migration | 972 | * Store the pfn of the page in a special migration |
| @@ -857,7 +984,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 857 | entry = make_migration_entry(page, pte_write(pteval)); | 984 | entry = make_migration_entry(page, pte_write(pteval)); |
| 858 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 985 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
| 859 | } else | 986 | } else |
| 860 | dec_mm_counter(mm, file_rss); | 987 | dec_mm_counter(mm, MM_FILEPAGES); |
| 861 | 988 | ||
| 862 | page_remove_rmap(page); | 989 | page_remove_rmap(page); |
| 863 | page_cache_release(page); | 990 | page_cache_release(page); |
| @@ -996,7 +1123,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 996 | 1123 | ||
| 997 | page_remove_rmap(page); | 1124 | page_remove_rmap(page); |
| 998 | page_cache_release(page); | 1125 | page_cache_release(page); |
| 999 | dec_mm_counter(mm, file_rss); | 1126 | dec_mm_counter(mm, MM_FILEPAGES); |
| 1000 | (*mapcount)--; | 1127 | (*mapcount)--; |
| 1001 | } | 1128 | } |
| 1002 | pte_unmap_unlock(pte - 1, ptl); | 1129 | pte_unmap_unlock(pte - 1, ptl); |
| @@ -1024,14 +1151,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 1024 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1151 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
| 1025 | { | 1152 | { |
| 1026 | struct anon_vma *anon_vma; | 1153 | struct anon_vma *anon_vma; |
| 1027 | struct vm_area_struct *vma; | 1154 | struct anon_vma_chain *avc; |
| 1028 | int ret = SWAP_AGAIN; | 1155 | int ret = SWAP_AGAIN; |
| 1029 | 1156 | ||
| 1030 | anon_vma = page_lock_anon_vma(page); | 1157 | anon_vma = page_lock_anon_vma(page); |
| 1031 | if (!anon_vma) | 1158 | if (!anon_vma) |
| 1032 | return ret; | 1159 | return ret; |
| 1033 | 1160 | ||
| 1034 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1161 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
| 1162 | struct vm_area_struct *vma = avc->vma; | ||
| 1035 | unsigned long address = vma_address(page, vma); | 1163 | unsigned long address = vma_address(page, vma); |
| 1036 | if (address == -EFAULT) | 1164 | if (address == -EFAULT) |
| 1037 | continue; | 1165 | continue; |
| @@ -1222,7 +1350,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
| 1222 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1350 | struct vm_area_struct *, unsigned long, void *), void *arg) |
| 1223 | { | 1351 | { |
| 1224 | struct anon_vma *anon_vma; | 1352 | struct anon_vma *anon_vma; |
| 1225 | struct vm_area_struct *vma; | 1353 | struct anon_vma_chain *avc; |
| 1226 | int ret = SWAP_AGAIN; | 1354 | int ret = SWAP_AGAIN; |
| 1227 | 1355 | ||
| 1228 | /* | 1356 | /* |
| @@ -1237,7 +1365,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
| 1237 | if (!anon_vma) | 1365 | if (!anon_vma) |
| 1238 | return ret; | 1366 | return ret; |
| 1239 | spin_lock(&anon_vma->lock); | 1367 | spin_lock(&anon_vma->lock); |
| 1240 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1368 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
| 1369 | struct vm_area_struct *vma = avc->vma; | ||
| 1241 | unsigned long address = vma_address(page, vma); | 1370 | unsigned long address = vma_address(page, vma); |
| 1242 | if (address == -EFAULT) | 1371 | if (address == -EFAULT) |
| 1243 | continue; | 1372 | continue; |
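With anon_vma_chain, a VMA is linked to every anon_vma it can contain pages from, and fork wires the child up through anon_vma_fork(). A hedged sketch of the dup_mmap()-side caller (that change lives outside this mm/ diffstat; tmp, mpnt and the error label are illustrative names):

	INIT_LIST_HEAD(&tmp->anon_vma_chain);
	if (anon_vma_fork(tmp, mpnt))
		goto fail_nomem_anon_vma_fork;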
| @@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to, | |||
| 935 | 935 | ||
| 936 | from->avail -= nr; | 936 | from->avail -= nr; |
| 937 | to->avail += nr; | 937 | to->avail += nr; |
| 938 | to->touched = 1; | ||
| 939 | return nr; | 938 | return nr; |
| 940 | } | 939 | } |
| 941 | 940 | ||
| @@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | |||
| 983 | 982 | ||
| 984 | if (limit > 1) | 983 | if (limit > 1) |
| 985 | limit = 12; | 984 | limit = 12; |
| 986 | ac_ptr = kmalloc_node(memsize, gfp, node); | 985 | ac_ptr = kzalloc_node(memsize, gfp, node); |
| 987 | if (ac_ptr) { | 986 | if (ac_ptr) { |
| 988 | for_each_node(i) { | 987 | for_each_node(i) { |
| 989 | if (i == node || !node_online(i)) { | 988 | if (i == node || !node_online(i)) |
| 990 | ac_ptr[i] = NULL; | ||
| 991 | continue; | 989 | continue; |
| 992 | } | ||
| 993 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); | 990 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
| 994 | if (!ac_ptr[i]) { | 991 | if (!ac_ptr[i]) { |
| 995 | for (i--; i >= 0; i--) | 992 | for (i--; i >= 0; i--) |
| @@ -2963,8 +2960,10 @@ retry: | |||
| 2963 | spin_lock(&l3->list_lock); | 2960 | spin_lock(&l3->list_lock); |
| 2964 | 2961 | ||
| 2965 | /* See if we can refill from the shared array */ | 2962 | /* See if we can refill from the shared array */ |
| 2966 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) | 2963 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { |
| 2964 | l3->shared->touched = 1; | ||
| 2967 | goto alloc_done; | 2965 | goto alloc_done; |
| 2966 | } | ||
| 2968 | 2967 | ||
| 2969 | while (batchcount > 0) { | 2968 | while (batchcount > 0) { |
| 2970 | struct list_head *entry; | 2969 | struct list_head *entry; |
| @@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
| 3101 | if (cachep == &cache_cache) | 3100 | if (cachep == &cache_cache) |
| 3102 | return false; | 3101 | return false; |
| 3103 | 3102 | ||
| 3104 | return should_failslab(obj_size(cachep), flags); | 3103 | return should_failslab(obj_size(cachep), flags, cachep->flags); |
| 3105 | } | 3104 | } |
| 3106 | 3105 | ||
| 3107 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3106 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
| @@ -3603,21 +3602,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace); | |||
| 3603 | */ | 3602 | */ |
| 3604 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) | 3603 | int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) |
| 3605 | { | 3604 | { |
| 3606 | unsigned long addr = (unsigned long)ptr; | ||
| 3607 | unsigned long min_addr = PAGE_OFFSET; | ||
| 3608 | unsigned long align_mask = BYTES_PER_WORD - 1; | ||
| 3609 | unsigned long size = cachep->buffer_size; | 3605 | unsigned long size = cachep->buffer_size; |
| 3610 | struct page *page; | 3606 | struct page *page; |
| 3611 | 3607 | ||
| 3612 | if (unlikely(addr < min_addr)) | 3608 | if (unlikely(!kern_ptr_validate(ptr, size))) |
| 3613 | goto out; | ||
| 3614 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
| 3615 | goto out; | ||
| 3616 | if (unlikely(addr & align_mask)) | ||
| 3617 | goto out; | ||
| 3618 | if (unlikely(!kern_addr_valid(addr))) | ||
| 3619 | goto out; | ||
| 3620 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
| 3621 | goto out; | 3609 | goto out; |
| 3622 | page = virt_to_page(ptr); | 3610 | page = virt_to_page(ptr); |
| 3623 | if (unlikely(!PageSlab(page))) | 3611 | if (unlikely(!PageSlab(page))) |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -151,7 +151,8 @@ | |||
| 151 | * Set of flags that will prevent slab merging | 151 | * Set of flags that will prevent slab merging |
| 152 | */ | 152 | */ |
| 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
| 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) | 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ |
| 155 | SLAB_FAILSLAB) | ||
| 155 | 156 | ||
| 156 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 157 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
| 157 | SLAB_CACHE_DMA | SLAB_NOTRACK) | 158 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
| @@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) | |||
| 217 | 218 | ||
| 218 | #endif | 219 | #endif |
| 219 | 220 | ||
| 220 | static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) | 221 | static inline void stat(struct kmem_cache *s, enum stat_item si) |
| 221 | { | 222 | { |
| 222 | #ifdef CONFIG_SLUB_STATS | 223 | #ifdef CONFIG_SLUB_STATS |
| 223 | c->stat[si]++; | 224 | __this_cpu_inc(s->cpu_slab->stat[si]); |
| 224 | #endif | 225 | #endif |
| 225 | } | 226 | } |
| 226 | 227 | ||
| @@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
| 242 | #endif | 243 | #endif |
| 243 | } | 244 | } |
| 244 | 245 | ||
| 245 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
| 246 | { | ||
| 247 | #ifdef CONFIG_SMP | ||
| 248 | return s->cpu_slab[cpu]; | ||
| 249 | #else | ||
| 250 | return &s->cpu_slab; | ||
| 251 | #endif | ||
| 252 | } | ||
| 253 | |||
| 254 | /* Verify that a pointer has an address that is valid within a slab page */ | 246 | /* Verify that a pointer has an address that is valid within a slab page */ |
| 255 | static inline int check_valid_pointer(struct kmem_cache *s, | 247 | static inline int check_valid_pointer(struct kmem_cache *s, |
| 256 | struct page *page, const void *object) | 248 | struct page *page, const void *object) |
| @@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, | |||
| 269 | return 1; | 261 | return 1; |
| 270 | } | 262 | } |
| 271 | 263 | ||
| 272 | /* | ||
| 273 | * Slow version of get and set free pointer. | ||
| 274 | * | ||
| 275 | * This version requires touching the cache lines of kmem_cache which | ||
| 276 | * we avoid to do in the fast alloc free paths. There we obtain the offset | ||
| 277 | * from the page struct. | ||
| 278 | */ | ||
| 279 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | 264 | static inline void *get_freepointer(struct kmem_cache *s, void *object) |
| 280 | { | 265 | { |
| 281 | return *(void **)(object + s->offset); | 266 | return *(void **)(object + s->offset); |
| @@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str) | |||
| 1020 | case 't': | 1005 | case 't': |
| 1021 | slub_debug |= SLAB_TRACE; | 1006 | slub_debug |= SLAB_TRACE; |
| 1022 | break; | 1007 | break; |
| 1008 | case 'a': | ||
| 1009 | slub_debug |= SLAB_FAILSLAB; | ||
| 1010 | break; | ||
| 1023 | default: | 1011 | default: |
| 1024 | printk(KERN_ERR "slub_debug option '%c' " | 1012 | printk(KERN_ERR "slub_debug option '%c' " |
| 1025 | "unknown. skipped\n", *str); | 1013 | "unknown. skipped\n", *str); |
| @@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1124 | if (!page) | 1112 | if (!page) |
| 1125 | return NULL; | 1113 | return NULL; |
| 1126 | 1114 | ||
| 1127 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1115 | stat(s, ORDER_FALLBACK); |
| 1128 | } | 1116 | } |
| 1129 | 1117 | ||
| 1130 | if (kmemcheck_enabled | 1118 | if (kmemcheck_enabled |
| @@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1422 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | 1410 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) |
| 1423 | { | 1411 | { |
| 1424 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1412 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
| 1425 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); | ||
| 1426 | 1413 | ||
| 1427 | __ClearPageSlubFrozen(page); | 1414 | __ClearPageSlubFrozen(page); |
| 1428 | if (page->inuse) { | 1415 | if (page->inuse) { |
| 1429 | 1416 | ||
| 1430 | if (page->freelist) { | 1417 | if (page->freelist) { |
| 1431 | add_partial(n, page, tail); | 1418 | add_partial(n, page, tail); |
| 1432 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | 1419 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); |
| 1433 | } else { | 1420 | } else { |
| 1434 | stat(c, DEACTIVATE_FULL); | 1421 | stat(s, DEACTIVATE_FULL); |
| 1435 | if (SLABDEBUG && PageSlubDebug(page) && | 1422 | if (SLABDEBUG && PageSlubDebug(page) && |
| 1436 | (s->flags & SLAB_STORE_USER)) | 1423 | (s->flags & SLAB_STORE_USER)) |
| 1437 | add_full(n, page); | 1424 | add_full(n, page); |
| 1438 | } | 1425 | } |
| 1439 | slab_unlock(page); | 1426 | slab_unlock(page); |
| 1440 | } else { | 1427 | } else { |
| 1441 | stat(c, DEACTIVATE_EMPTY); | 1428 | stat(s, DEACTIVATE_EMPTY); |
| 1442 | if (n->nr_partial < s->min_partial) { | 1429 | if (n->nr_partial < s->min_partial) { |
| 1443 | /* | 1430 | /* |
| 1444 | * Adding an empty slab to the partial slabs in order | 1431 | * Adding an empty slab to the partial slabs in order |
| @@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
| 1454 | slab_unlock(page); | 1441 | slab_unlock(page); |
| 1455 | } else { | 1442 | } else { |
| 1456 | slab_unlock(page); | 1443 | slab_unlock(page); |
| 1457 | stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); | 1444 | stat(s, FREE_SLAB); |
| 1458 | discard_slab(s, page); | 1445 | discard_slab(s, page); |
| 1459 | } | 1446 | } |
| 1460 | } | 1447 | } |
| @@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1469 | int tail = 1; | 1456 | int tail = 1; |
| 1470 | 1457 | ||
| 1471 | if (page->freelist) | 1458 | if (page->freelist) |
| 1472 | stat(c, DEACTIVATE_REMOTE_FREES); | 1459 | stat(s, DEACTIVATE_REMOTE_FREES); |
| 1473 | /* | 1460 | /* |
| 1474 | * Merge cpu freelist into slab freelist. Typically we get here | 1461 | * Merge cpu freelist into slab freelist. Typically we get here |
| 1475 | * because both freelists are empty. So this is unlikely | 1462 | * because both freelists are empty. So this is unlikely |
| @@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1482 | 1469 | ||
| 1483 | /* Retrieve object from cpu_freelist */ | 1470 | /* Retrieve object from cpu_freelist */ |
| 1484 | object = c->freelist; | 1471 | object = c->freelist; |
| 1485 | c->freelist = c->freelist[c->offset]; | 1472 | c->freelist = get_freepointer(s, c->freelist); |
| 1486 | 1473 | ||
| 1487 | /* And put onto the regular freelist */ | 1474 | /* And put onto the regular freelist */ |
| 1488 | object[c->offset] = page->freelist; | 1475 | set_freepointer(s, object, page->freelist); |
| 1489 | page->freelist = object; | 1476 | page->freelist = object; |
| 1490 | page->inuse--; | 1477 | page->inuse--; |
| 1491 | } | 1478 | } |
| @@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1495 | 1482 | ||
| 1496 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1483 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
| 1497 | { | 1484 | { |
| 1498 | stat(c, CPUSLAB_FLUSH); | 1485 | stat(s, CPUSLAB_FLUSH); |
| 1499 | slab_lock(c->page); | 1486 | slab_lock(c->page); |
| 1500 | deactivate_slab(s, c); | 1487 | deactivate_slab(s, c); |
| 1501 | } | 1488 | } |
| @@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1507 | */ | 1494 | */ |
| 1508 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1495 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
| 1509 | { | 1496 | { |
| 1510 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 1497 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
| 1511 | 1498 | ||
| 1512 | if (likely(c && c->page)) | 1499 | if (likely(c && c->page)) |
| 1513 | flush_slab(s, c); | 1500 | flush_slab(s, c); |
| @@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
| 1635 | if (unlikely(!node_match(c, node))) | 1622 | if (unlikely(!node_match(c, node))) |
| 1636 | goto another_slab; | 1623 | goto another_slab; |
| 1637 | 1624 | ||
| 1638 | stat(c, ALLOC_REFILL); | 1625 | stat(s, ALLOC_REFILL); |
| 1639 | 1626 | ||
| 1640 | load_freelist: | 1627 | load_freelist: |
| 1641 | object = c->page->freelist; | 1628 | object = c->page->freelist; |
| @@ -1644,13 +1631,13 @@ load_freelist: | |||
| 1644 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) | 1631 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) |
| 1645 | goto debug; | 1632 | goto debug; |
| 1646 | 1633 | ||
| 1647 | c->freelist = object[c->offset]; | 1634 | c->freelist = get_freepointer(s, object); |
| 1648 | c->page->inuse = c->page->objects; | 1635 | c->page->inuse = c->page->objects; |
| 1649 | c->page->freelist = NULL; | 1636 | c->page->freelist = NULL; |
| 1650 | c->node = page_to_nid(c->page); | 1637 | c->node = page_to_nid(c->page); |
| 1651 | unlock_out: | 1638 | unlock_out: |
| 1652 | slab_unlock(c->page); | 1639 | slab_unlock(c->page); |
| 1653 | stat(c, ALLOC_SLOWPATH); | 1640 | stat(s, ALLOC_SLOWPATH); |
| 1654 | return object; | 1641 | return object; |
| 1655 | 1642 | ||
| 1656 | another_slab: | 1643 | another_slab: |
| @@ -1660,7 +1647,7 @@ new_slab: | |||
| 1660 | new = get_partial(s, gfpflags, node); | 1647 | new = get_partial(s, gfpflags, node); |
| 1661 | if (new) { | 1648 | if (new) { |
| 1662 | c->page = new; | 1649 | c->page = new; |
| 1663 | stat(c, ALLOC_FROM_PARTIAL); | 1650 | stat(s, ALLOC_FROM_PARTIAL); |
| 1664 | goto load_freelist; | 1651 | goto load_freelist; |
| 1665 | } | 1652 | } |
| 1666 | 1653 | ||
| @@ -1673,8 +1660,8 @@ new_slab: | |||
| 1673 | local_irq_disable(); | 1660 | local_irq_disable(); |
| 1674 | 1661 | ||
| 1675 | if (new) { | 1662 | if (new) { |
| 1676 | c = get_cpu_slab(s, smp_processor_id()); | 1663 | c = __this_cpu_ptr(s->cpu_slab); |
| 1677 | stat(c, ALLOC_SLAB); | 1664 | stat(s, ALLOC_SLAB); |
| 1678 | if (c->page) | 1665 | if (c->page) |
| 1679 | flush_slab(s, c); | 1666 | flush_slab(s, c); |
| 1680 | slab_lock(new); | 1667 | slab_lock(new); |
| @@ -1690,7 +1677,7 @@ debug: | |||
| 1690 | goto another_slab; | 1677 | goto another_slab; |
| 1691 | 1678 | ||
| 1692 | c->page->inuse++; | 1679 | c->page->inuse++; |
| 1693 | c->page->freelist = object[c->offset]; | 1680 | c->page->freelist = get_freepointer(s, object); |
| 1694 | c->node = -1; | 1681 | c->node = -1; |
| 1695 | goto unlock_out; | 1682 | goto unlock_out; |
| 1696 | } | 1683 | } |
| @@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
| 1711 | void **object; | 1698 | void **object; |
| 1712 | struct kmem_cache_cpu *c; | 1699 | struct kmem_cache_cpu *c; |
| 1713 | unsigned long flags; | 1700 | unsigned long flags; |
| 1714 | unsigned int objsize; | ||
| 1715 | 1701 | ||
| 1716 | gfpflags &= gfp_allowed_mask; | 1702 | gfpflags &= gfp_allowed_mask; |
| 1717 | 1703 | ||
| 1718 | lockdep_trace_alloc(gfpflags); | 1704 | lockdep_trace_alloc(gfpflags); |
| 1719 | might_sleep_if(gfpflags & __GFP_WAIT); | 1705 | might_sleep_if(gfpflags & __GFP_WAIT); |
| 1720 | 1706 | ||
| 1721 | if (should_failslab(s->objsize, gfpflags)) | 1707 | if (should_failslab(s->objsize, gfpflags, s->flags)) |
| 1722 | return NULL; | 1708 | return NULL; |
| 1723 | 1709 | ||
| 1724 | local_irq_save(flags); | 1710 | local_irq_save(flags); |
| 1725 | c = get_cpu_slab(s, smp_processor_id()); | 1711 | c = __this_cpu_ptr(s->cpu_slab); |
| 1726 | objsize = c->objsize; | 1712 | object = c->freelist; |
| 1727 | if (unlikely(!c->freelist || !node_match(c, node))) | 1713 | if (unlikely(!object || !node_match(c, node))) |
| 1728 | 1714 | ||
| 1729 | object = __slab_alloc(s, gfpflags, node, addr, c); | 1715 | object = __slab_alloc(s, gfpflags, node, addr, c); |
| 1730 | 1716 | ||
| 1731 | else { | 1717 | else { |
| 1732 | object = c->freelist; | 1718 | c->freelist = get_freepointer(s, object); |
| 1733 | c->freelist = object[c->offset]; | 1719 | stat(s, ALLOC_FASTPATH); |
| 1734 | stat(c, ALLOC_FASTPATH); | ||
| 1735 | } | 1720 | } |
| 1736 | local_irq_restore(flags); | 1721 | local_irq_restore(flags); |
| 1737 | 1722 | ||
| 1738 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 1723 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
| 1739 | memset(object, 0, objsize); | 1724 | memset(object, 0, s->objsize); |
| 1740 | 1725 | ||
| 1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | 1726 | kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); |
| 1742 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | 1727 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); |
| 1743 | 1728 | ||
| 1744 | return object; | 1729 | return object; |
| 1745 | } | 1730 | } |
| @@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | |||
| 1794 | * handling required then we can return immediately. | 1779 | * handling required then we can return immediately. |
| 1795 | */ | 1780 | */ |
| 1796 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1781 | static void __slab_free(struct kmem_cache *s, struct page *page, |
| 1797 | void *x, unsigned long addr, unsigned int offset) | 1782 | void *x, unsigned long addr) |
| 1798 | { | 1783 | { |
| 1799 | void *prior; | 1784 | void *prior; |
| 1800 | void **object = (void *)x; | 1785 | void **object = (void *)x; |
| 1801 | struct kmem_cache_cpu *c; | ||
| 1802 | 1786 | ||
| 1803 | c = get_cpu_slab(s, raw_smp_processor_id()); | 1787 | stat(s, FREE_SLOWPATH); |
| 1804 | stat(c, FREE_SLOWPATH); | ||
| 1805 | slab_lock(page); | 1788 | slab_lock(page); |
| 1806 | 1789 | ||
| 1807 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) | 1790 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) |
| 1808 | goto debug; | 1791 | goto debug; |
| 1809 | 1792 | ||
| 1810 | checks_ok: | 1793 | checks_ok: |
| 1811 | prior = object[offset] = page->freelist; | 1794 | prior = page->freelist; |
| 1795 | set_freepointer(s, object, prior); | ||
| 1812 | page->freelist = object; | 1796 | page->freelist = object; |
| 1813 | page->inuse--; | 1797 | page->inuse--; |
| 1814 | 1798 | ||
| 1815 | if (unlikely(PageSlubFrozen(page))) { | 1799 | if (unlikely(PageSlubFrozen(page))) { |
| 1816 | stat(c, FREE_FROZEN); | 1800 | stat(s, FREE_FROZEN); |
| 1817 | goto out_unlock; | 1801 | goto out_unlock; |
| 1818 | } | 1802 | } |
| 1819 | 1803 | ||
| @@ -1826,7 +1810,7 @@ checks_ok: | |||
| 1826 | */ | 1810 | */ |
| 1827 | if (unlikely(!prior)) { | 1811 | if (unlikely(!prior)) { |
| 1828 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 1812 | add_partial(get_node(s, page_to_nid(page)), page, 1); |
| 1829 | stat(c, FREE_ADD_PARTIAL); | 1813 | stat(s, FREE_ADD_PARTIAL); |
| 1830 | } | 1814 | } |
| 1831 | 1815 | ||
| 1832 | out_unlock: | 1816 | out_unlock: |
| @@ -1839,10 +1823,10 @@ slab_empty: | |||
| 1839 | * Slab still on the partial list. | 1823 | * Slab still on the partial list. |
| 1840 | */ | 1824 | */ |
| 1841 | remove_partial(s, page); | 1825 | remove_partial(s, page); |
| 1842 | stat(c, FREE_REMOVE_PARTIAL); | 1826 | stat(s, FREE_REMOVE_PARTIAL); |
| 1843 | } | 1827 | } |
| 1844 | slab_unlock(page); | 1828 | slab_unlock(page); |
| 1845 | stat(c, FREE_SLAB); | 1829 | stat(s, FREE_SLAB); |
| 1846 | discard_slab(s, page); | 1830 | discard_slab(s, page); |
| 1847 | return; | 1831 | return; |
| 1848 | 1832 | ||
| @@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
| 1872 | 1856 | ||
| 1873 | kmemleak_free_recursive(x, s->flags); | 1857 | kmemleak_free_recursive(x, s->flags); |
| 1874 | local_irq_save(flags); | 1858 | local_irq_save(flags); |
| 1875 | c = get_cpu_slab(s, smp_processor_id()); | 1859 | c = __this_cpu_ptr(s->cpu_slab); |
| 1876 | kmemcheck_slab_free(s, object, c->objsize); | 1860 | kmemcheck_slab_free(s, object, s->objsize); |
| 1877 | debug_check_no_locks_freed(object, c->objsize); | 1861 | debug_check_no_locks_freed(object, s->objsize); |
| 1878 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1862 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
| 1879 | debug_check_no_obj_freed(object, c->objsize); | 1863 | debug_check_no_obj_freed(object, s->objsize); |
| 1880 | if (likely(page == c->page && c->node >= 0)) { | 1864 | if (likely(page == c->page && c->node >= 0)) { |
| 1881 | object[c->offset] = c->freelist; | 1865 | set_freepointer(s, object, c->freelist); |
| 1882 | c->freelist = object; | 1866 | c->freelist = object; |
| 1883 | stat(c, FREE_FASTPATH); | 1867 | stat(s, FREE_FASTPATH); |
| 1884 | } else | 1868 | } else |
| 1885 | __slab_free(s, page, x, addr, c->offset); | 1869 | __slab_free(s, page, x, addr); |
| 1886 | 1870 | ||
| 1887 | local_irq_restore(flags); | 1871 | local_irq_restore(flags); |
| 1888 | } | 1872 | } |
| @@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
| 2069 | return ALIGN(align, sizeof(void *)); | 2053 | return ALIGN(align, sizeof(void *)); |
| 2070 | } | 2054 | } |
| 2071 | 2055 | ||
| 2072 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
| 2073 | struct kmem_cache_cpu *c) | ||
| 2074 | { | ||
| 2075 | c->page = NULL; | ||
| 2076 | c->freelist = NULL; | ||
| 2077 | c->node = 0; | ||
| 2078 | c->offset = s->offset / sizeof(void *); | ||
| 2079 | c->objsize = s->objsize; | ||
| 2080 | #ifdef CONFIG_SLUB_STATS | ||
| 2081 | memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); | ||
| 2082 | #endif | ||
| 2083 | } | ||
| 2084 | |||
| 2085 | static void | 2056 | static void |
| 2086 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | 2057 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) |
| 2087 | { | 2058 | { |
| @@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | |||
| 2095 | #endif | 2066 | #endif |
| 2096 | } | 2067 | } |
| 2097 | 2068 | ||
| 2098 | #ifdef CONFIG_SMP | 2069 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); |
| 2099 | /* | ||
| 2100 | * Per cpu array for per cpu structures. | ||
| 2101 | * | ||
| 2102 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
| 2103 | * close together meaning that it becomes possible that multiple per cpu | ||
| 2104 | * structures are contained in one cacheline. This may be particularly | ||
| 2105 | * beneficial for the kmalloc caches. | ||
| 2106 | * | ||
| 2107 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
| 2108 | * likely able to get per cpu structures for all caches from the array defined | ||
| 2109 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
| 2110 | * | ||
| 2111 | * If the per cpu array is exhausted then fall back to kmalloc | ||
| 2112 | * of individual cachelines. No sharing is possible then. | ||
| 2113 | */ | ||
| 2114 | #define NR_KMEM_CACHE_CPU 100 | ||
| 2115 | |||
| 2116 | static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], | ||
| 2117 | kmem_cache_cpu); | ||
| 2118 | |||
| 2119 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
| 2120 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); | ||
| 2121 | |||
| 2122 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
| 2123 | int cpu, gfp_t flags) | ||
| 2124 | { | ||
| 2125 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
| 2126 | |||
| 2127 | if (c) | ||
| 2128 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
| 2129 | (void *)c->freelist; | ||
| 2130 | else { | ||
| 2131 | /* Table overflow: So allocate ourselves */ | ||
| 2132 | c = kmalloc_node( | ||
| 2133 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
| 2134 | flags, cpu_to_node(cpu)); | ||
| 2135 | if (!c) | ||
| 2136 | return NULL; | ||
| 2137 | } | ||
| 2138 | |||
| 2139 | init_kmem_cache_cpu(s, c); | ||
| 2140 | return c; | ||
| 2141 | } | ||
| 2142 | |||
| 2143 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
| 2144 | { | ||
| 2145 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
| 2146 | c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
| 2147 | kfree(c); | ||
| 2148 | return; | ||
| 2149 | } | ||
| 2150 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
| 2151 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
| 2152 | } | ||
| 2153 | |||
| 2154 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
| 2155 | { | ||
| 2156 | int cpu; | ||
| 2157 | |||
| 2158 | for_each_online_cpu(cpu) { | ||
| 2159 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 2160 | |||
| 2161 | if (c) { | ||
| 2162 | s->cpu_slab[cpu] = NULL; | ||
| 2163 | free_kmem_cache_cpu(c, cpu); | ||
| 2164 | } | ||
| 2165 | } | ||
| 2166 | } | ||
| 2167 | |||
| 2168 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 2169 | { | ||
| 2170 | int cpu; | ||
| 2171 | |||
| 2172 | for_each_online_cpu(cpu) { | ||
| 2173 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 2174 | |||
| 2175 | if (c) | ||
| 2176 | continue; | ||
| 2177 | |||
| 2178 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
| 2179 | if (!c) { | ||
| 2180 | free_kmem_cache_cpus(s); | ||
| 2181 | return 0; | ||
| 2182 | } | ||
| 2183 | s->cpu_slab[cpu] = c; | ||
| 2184 | } | ||
| 2185 | return 1; | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * Initialize the per cpu array. | ||
| 2190 | */ | ||
| 2191 | static void init_alloc_cpu_cpu(int cpu) | ||
| 2192 | { | ||
| 2193 | int i; | ||
| 2194 | 2070 | ||
| 2195 | if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) | 2071 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) |
| 2196 | return; | ||
| 2197 | |||
| 2198 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
| 2199 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
| 2200 | |||
| 2201 | cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); | ||
| 2202 | } | ||
| 2203 | |||
| 2204 | static void __init init_alloc_cpu(void) | ||
| 2205 | { | 2072 | { |
| 2206 | int cpu; | 2073 | if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) |
| 2207 | 2074 | /* | |
| 2208 | for_each_online_cpu(cpu) | 2075 | * Boot time creation of the kmalloc array. Use static per cpu data |
| 2209 | init_alloc_cpu_cpu(cpu); | 2076 | * since the per cpu allocator is not available yet. |
| 2210 | } | 2077 | */ |
| 2078 | s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); | ||
| 2079 | else | ||
| 2080 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | ||
| 2211 | 2081 | ||
| 2212 | #else | 2082 | if (!s->cpu_slab) |
| 2213 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | 2083 | return 0; |
| 2214 | static inline void init_alloc_cpu(void) {} | ||
| 2215 | 2084 | ||
| 2216 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 2217 | { | ||
| 2218 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
| 2219 | return 1; | 2085 | return 1; |
| 2220 | } | 2086 | } |
| 2221 | #endif | ||
| 2222 | 2087 | ||
| 2223 | #ifdef CONFIG_NUMA | 2088 | #ifdef CONFIG_NUMA |
| 2224 | /* | 2089 | /* |
| @@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
| 2287 | int node; | 2152 | int node; |
| 2288 | int local_node; | 2153 | int local_node; |
| 2289 | 2154 | ||
| 2290 | if (slab_state >= UP) | 2155 | if (slab_state >= UP && (s < kmalloc_caches || |
| 2156 | s >= kmalloc_caches + KMALLOC_CACHES)) | ||
| 2291 | local_node = page_to_nid(virt_to_page(s)); | 2157 | local_node = page_to_nid(virt_to_page(s)); |
| 2292 | else | 2158 | else |
| 2293 | local_node = 0; | 2159 | local_node = 0; |
| @@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
| 2502 | 2368 | ||
| 2503 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) | 2369 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
| 2504 | return 1; | 2370 | return 1; |
| 2371 | |||
| 2505 | free_kmem_cache_nodes(s); | 2372 | free_kmem_cache_nodes(s); |
| 2506 | error: | 2373 | error: |
| 2507 | if (flags & SLAB_PANIC) | 2374 | if (flags & SLAB_PANIC) |
| @@ -2519,6 +2386,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) | |||
| 2519 | { | 2386 | { |
| 2520 | struct page *page; | 2387 | struct page *page; |
| 2521 | 2388 | ||
| 2389 | if (!kern_ptr_validate(object, s->size)) | ||
| 2390 | return 0; | ||
| 2391 | |||
| 2522 | page = get_object_page(object); | 2392 | page = get_object_page(object); |
| 2523 | 2393 | ||
| 2524 | if (!page || s != page->slab) | 2394 | if (!page || s != page->slab) |
| @@ -2609,9 +2479,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
| 2609 | int node; | 2479 | int node; |
| 2610 | 2480 | ||
| 2611 | flush_all(s); | 2481 | flush_all(s); |
| 2612 | 2482 | free_percpu(s->cpu_slab); | |
| 2613 | /* Attempt to free all objects */ | 2483 | /* Attempt to free all objects */ |
| 2614 | free_kmem_cache_cpus(s); | ||
| 2615 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2484 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2616 | struct kmem_cache_node *n = get_node(s, node); | 2485 | struct kmem_cache_node *n = get_node(s, node); |
| 2617 | 2486 | ||
| @@ -2651,7 +2520,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
| 2651 | * Kmalloc subsystem | 2520 | * Kmalloc subsystem |
| 2652 | *******************************************************************/ | 2521 | *******************************************************************/ |
| 2653 | 2522 | ||
| 2654 | struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; | 2523 | struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; |
| 2655 | EXPORT_SYMBOL(kmalloc_caches); | 2524 | EXPORT_SYMBOL(kmalloc_caches); |
| 2656 | 2525 | ||
| 2657 | static int __init setup_slub_min_order(char *str) | 2526 | static int __init setup_slub_min_order(char *str) |
| @@ -2741,6 +2610,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2741 | char *text; | 2610 | char *text; |
| 2742 | size_t realsize; | 2611 | size_t realsize; |
| 2743 | unsigned long slabflags; | 2612 | unsigned long slabflags; |
| 2613 | int i; | ||
| 2744 | 2614 | ||
| 2745 | s = kmalloc_caches_dma[index]; | 2615 | s = kmalloc_caches_dma[index]; |
| 2746 | if (s) | 2616 | if (s) |
| @@ -2760,7 +2630,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2760 | realsize = kmalloc_caches[index].objsize; | 2630 | realsize = kmalloc_caches[index].objsize; |
| 2761 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", | 2631 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", |
| 2762 | (unsigned int)realsize); | 2632 | (unsigned int)realsize); |
| 2763 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); | 2633 | |
| 2634 | s = NULL; | ||
| 2635 | for (i = 0; i < KMALLOC_CACHES; i++) | ||
| 2636 | if (!kmalloc_caches[i].size) | ||
| 2637 | break; | ||
| 2638 | |||
| 2639 | BUG_ON(i >= KMALLOC_CACHES); | ||
| 2640 | s = kmalloc_caches + i; | ||
| 2764 | 2641 | ||
| 2765 | /* | 2642 | /* |
| 2766 | * Must defer sysfs creation to a workqueue because we don't know | 2643 | * Must defer sysfs creation to a workqueue because we don't know |
| @@ -2772,9 +2649,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2772 | if (slab_state >= SYSFS) | 2649 | if (slab_state >= SYSFS) |
| 2773 | slabflags |= __SYSFS_ADD_DEFERRED; | 2650 | slabflags |= __SYSFS_ADD_DEFERRED; |
| 2774 | 2651 | ||
| 2775 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2652 | if (!text || !kmem_cache_open(s, flags, text, |
| 2776 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { | 2653 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { |
| 2777 | kfree(s); | 2654 | s->size = 0; |
| 2778 | kfree(text); | 2655 | kfree(text); |
| 2779 | goto unlock_out; | 2656 | goto unlock_out; |
| 2780 | } | 2657 | } |
| @@ -3086,7 +2963,7 @@ static void slab_mem_offline_callback(void *arg) | |||
| 3086 | /* | 2963 | /* |
| 3087 | * if n->nr_slabs > 0, slabs still exist on the node | 2964 | * if n->nr_slabs > 0, slabs still exist on the node |
| 3088 | * that is going down. We were unable to free them, | 2965 | * that is going down. We were unable to free them, |
| 3089 | * and offline_pages() function shoudn't call this | 2966 | * and offline_pages() function shouldn't call this |
| 3090 | * callback. So, we must fail. | 2967 | * callback. So, we must fail. |
| 3091 | */ | 2968 | */ |
| 3092 | BUG_ON(slabs_node(s, offline_node)); | 2969 | BUG_ON(slabs_node(s, offline_node)); |
| @@ -3176,8 +3053,6 @@ void __init kmem_cache_init(void) | |||
| 3176 | int i; | 3053 | int i; |
| 3177 | int caches = 0; | 3054 | int caches = 0; |
| 3178 | 3055 | ||
| 3179 | init_alloc_cpu(); | ||
| 3180 | |||
| 3181 | #ifdef CONFIG_NUMA | 3056 | #ifdef CONFIG_NUMA |
| 3182 | /* | 3057 | /* |
| 3183 | * Must first have the slab cache available for the allocations of the | 3058 | * Must first have the slab cache available for the allocations of the |
| @@ -3261,8 +3136,10 @@ void __init kmem_cache_init(void) | |||
| 3261 | 3136 | ||
| 3262 | #ifdef CONFIG_SMP | 3137 | #ifdef CONFIG_SMP |
| 3263 | register_cpu_notifier(&slab_notifier); | 3138 | register_cpu_notifier(&slab_notifier); |
| 3264 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | 3139 | #endif |
| 3265 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | 3140 | #ifdef CONFIG_NUMA |
| 3141 | kmem_size = offsetof(struct kmem_cache, node) + | ||
| 3142 | nr_node_ids * sizeof(struct kmem_cache_node *); | ||
| 3266 | #else | 3143 | #else |
| 3267 | kmem_size = sizeof(struct kmem_cache); | 3144 | kmem_size = sizeof(struct kmem_cache); |
| 3268 | #endif | 3145 | #endif |
| @@ -3351,22 +3228,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 3351 | down_write(&slub_lock); | 3228 | down_write(&slub_lock); |
| 3352 | s = find_mergeable(size, align, flags, name, ctor); | 3229 | s = find_mergeable(size, align, flags, name, ctor); |
| 3353 | if (s) { | 3230 | if (s) { |
| 3354 | int cpu; | ||
| 3355 | |||
| 3356 | s->refcount++; | 3231 | s->refcount++; |
| 3357 | /* | 3232 | /* |
| 3358 | * Adjust the object sizes so that we clear | 3233 | * Adjust the object sizes so that we clear |
| 3359 | * the complete object on kzalloc. | 3234 | * the complete object on kzalloc. |
| 3360 | */ | 3235 | */ |
| 3361 | s->objsize = max(s->objsize, (int)size); | 3236 | s->objsize = max(s->objsize, (int)size); |
| 3362 | |||
| 3363 | /* | ||
| 3364 | * And then we need to update the object size in the | ||
| 3365 | * per cpu structures | ||
| 3366 | */ | ||
| 3367 | for_each_online_cpu(cpu) | ||
| 3368 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
| 3369 | |||
| 3370 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3237 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
| 3371 | up_write(&slub_lock); | 3238 | up_write(&slub_lock); |
| 3372 | 3239 | ||
| @@ -3420,29 +3287,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
| 3420 | unsigned long flags; | 3287 | unsigned long flags; |
| 3421 | 3288 | ||
| 3422 | switch (action) { | 3289 | switch (action) { |
| 3423 | case CPU_UP_PREPARE: | ||
| 3424 | case CPU_UP_PREPARE_FROZEN: | ||
| 3425 | init_alloc_cpu_cpu(cpu); | ||
| 3426 | down_read(&slub_lock); | ||
| 3427 | list_for_each_entry(s, &slab_caches, list) | ||
| 3428 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
| 3429 | GFP_KERNEL); | ||
| 3430 | up_read(&slub_lock); | ||
| 3431 | break; | ||
| 3432 | |||
| 3433 | case CPU_UP_CANCELED: | 3290 | case CPU_UP_CANCELED: |
| 3434 | case CPU_UP_CANCELED_FROZEN: | 3291 | case CPU_UP_CANCELED_FROZEN: |
| 3435 | case CPU_DEAD: | 3292 | case CPU_DEAD: |
| 3436 | case CPU_DEAD_FROZEN: | 3293 | case CPU_DEAD_FROZEN: |
| 3437 | down_read(&slub_lock); | 3294 | down_read(&slub_lock); |
| 3438 | list_for_each_entry(s, &slab_caches, list) { | 3295 | list_for_each_entry(s, &slab_caches, list) { |
| 3439 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 3440 | |||
| 3441 | local_irq_save(flags); | 3296 | local_irq_save(flags); |
| 3442 | __flush_cpu_slab(s, cpu); | 3297 | __flush_cpu_slab(s, cpu); |
| 3443 | local_irq_restore(flags); | 3298 | local_irq_restore(flags); |
| 3444 | free_kmem_cache_cpu(c, cpu); | ||
| 3445 | s->cpu_slab[cpu] = NULL; | ||
| 3446 | } | 3299 | } |
| 3447 | up_read(&slub_lock); | 3300 | up_read(&slub_lock); |
| 3448 | break; | 3301 | break; |
| @@ -3928,7 +3781,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
| 3928 | int cpu; | 3781 | int cpu; |
| 3929 | 3782 | ||
| 3930 | for_each_possible_cpu(cpu) { | 3783 | for_each_possible_cpu(cpu) { |
| 3931 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 3784 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
| 3932 | 3785 | ||
| 3933 | if (!c || c->node < 0) | 3786 | if (!c || c->node < 0) |
| 3934 | continue; | 3787 | continue; |
| @@ -4171,6 +4024,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
| 4171 | } | 4024 | } |
| 4172 | SLAB_ATTR(trace); | 4025 | SLAB_ATTR(trace); |
| 4173 | 4026 | ||
| 4027 | #ifdef CONFIG_FAILSLAB | ||
| 4028 | static ssize_t failslab_show(struct kmem_cache *s, char *buf) | ||
| 4029 | { | ||
| 4030 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); | ||
| 4031 | } | ||
| 4032 | |||
| 4033 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | ||
| 4034 | size_t length) | ||
| 4035 | { | ||
| 4036 | s->flags &= ~SLAB_FAILSLAB; | ||
| 4037 | if (buf[0] == '1') | ||
| 4038 | s->flags |= SLAB_FAILSLAB; | ||
| 4039 | return length; | ||
| 4040 | } | ||
| 4041 | SLAB_ATTR(failslab); | ||
| 4042 | #endif | ||
| 4043 | |||
| 4174 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | 4044 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) |
| 4175 | { | 4045 | { |
| 4176 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | 4046 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); |
| @@ -4353,7 +4223,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
| 4353 | return -ENOMEM; | 4223 | return -ENOMEM; |
| 4354 | 4224 | ||
| 4355 | for_each_online_cpu(cpu) { | 4225 | for_each_online_cpu(cpu) { |
| 4356 | unsigned x = get_cpu_slab(s, cpu)->stat[si]; | 4226 | unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; |
| 4357 | 4227 | ||
| 4358 | data[cpu] = x; | 4228 | data[cpu] = x; |
| 4359 | sum += x; | 4229 | sum += x; |
| @@ -4376,7 +4246,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) | |||
| 4376 | int cpu; | 4246 | int cpu; |
| 4377 | 4247 | ||
| 4378 | for_each_online_cpu(cpu) | 4248 | for_each_online_cpu(cpu) |
| 4379 | get_cpu_slab(s, cpu)->stat[si] = 0; | 4249 | per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; |
| 4380 | } | 4250 | } |
| 4381 | 4251 | ||
| 4382 | #define STAT_ATTR(si, text) \ | 4252 | #define STAT_ATTR(si, text) \ |
| @@ -4467,6 +4337,10 @@ static struct attribute *slab_attrs[] = { | |||
| 4467 | &deactivate_remote_frees_attr.attr, | 4337 | &deactivate_remote_frees_attr.attr, |
| 4468 | &order_fallback_attr.attr, | 4338 | &order_fallback_attr.attr, |
| 4469 | #endif | 4339 | #endif |
| 4340 | #ifdef CONFIG_FAILSLAB | ||
| 4341 | &failslab_attr.attr, | ||
| 4342 | #endif | ||
| 4343 | |||
| 4470 | NULL | 4344 | NULL |
| 4471 | }; | 4345 | }; |
| 4472 | 4346 | ||
| @@ -4519,7 +4393,7 @@ static void kmem_cache_release(struct kobject *kobj) | |||
| 4519 | kfree(s); | 4393 | kfree(s); |
| 4520 | } | 4394 | } |
| 4521 | 4395 | ||
| 4522 | static struct sysfs_ops slab_sysfs_ops = { | 4396 | static const struct sysfs_ops slab_sysfs_ops = { |
| 4523 | .show = slab_attr_show, | 4397 | .show = slab_attr_show, |
| 4524 | .store = slab_attr_store, | 4398 | .store = slab_attr_store, |
| 4525 | }; | 4399 | }; |
| @@ -4538,7 +4412,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
| 4538 | return 0; | 4412 | return 0; |
| 4539 | } | 4413 | } |
| 4540 | 4414 | ||
| 4541 | static struct kset_uevent_ops slab_uevent_ops = { | 4415 | static const struct kset_uevent_ops slab_uevent_ops = { |
| 4542 | .filter = uevent_filter, | 4416 | .filter = uevent_filter, |
| 4543 | }; | 4417 | }; |
| 4544 | 4418 | ||
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a3..aa33fd67fa41 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
| 23 | #include <linux/highmem.h> | 23 | #include <linux/highmem.h> |
| 24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 25 | #include <linux/slab.h> | ||
| 25 | #include <linux/spinlock.h> | 26 | #include <linux/spinlock.h> |
| 26 | #include <linux/vmalloc.h> | 27 | #include <linux/vmalloc.h> |
| 27 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
| @@ -40,9 +41,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
| 40 | unsigned long align, | 41 | unsigned long align, |
| 41 | unsigned long goal) | 42 | unsigned long goal) |
| 42 | { | 43 | { |
| 43 | return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); | 44 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); |
| 44 | } | 45 | } |
| 45 | 46 | ||
| 47 | static void *vmemmap_buf; | ||
| 48 | static void *vmemmap_buf_end; | ||
| 46 | 49 | ||
| 47 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | 50 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) |
| 48 | { | 51 | { |
| @@ -64,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
| 64 | __pa(MAX_DMA_ADDRESS)); | 67 | __pa(MAX_DMA_ADDRESS)); |
| 65 | } | 68 | } |
| 66 | 69 | ||
| 70 | /* callers must all request the same size during the early boot stage */ | ||
| 71 | void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) | ||
| 72 | { | ||
| 73 | void *ptr; | ||
| 74 | |||
| 75 | if (!vmemmap_buf) | ||
| 76 | return vmemmap_alloc_block(size, node); | ||
| 77 | |||
| 78 | /* take the allocation from the buffer */ | ||
| 79 | ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); | ||
| 80 | if (ptr + size > vmemmap_buf_end) | ||
| 81 | return vmemmap_alloc_block(size, node); | ||
| 82 | |||
| 83 | vmemmap_buf = ptr + size; | ||
| 84 | |||
| 85 | return ptr; | ||
| 86 | } | ||
| 87 | |||
| 67 | void __meminit vmemmap_verify(pte_t *pte, int node, | 88 | void __meminit vmemmap_verify(pte_t *pte, int node, |
| 68 | unsigned long start, unsigned long end) | 89 | unsigned long start, unsigned long end) |
| 69 | { | 90 | { |
| @@ -80,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | |||
| 80 | pte_t *pte = pte_offset_kernel(pmd, addr); | 101 | pte_t *pte = pte_offset_kernel(pmd, addr); |
| 81 | if (pte_none(*pte)) { | 102 | if (pte_none(*pte)) { |
| 82 | pte_t entry; | 103 | pte_t entry; |
| 83 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 104 | void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); |
| 84 | if (!p) | 105 | if (!p) |
| 85 | return NULL; | 106 | return NULL; |
| 86 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | 107 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
| @@ -163,3 +184,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | |||
| 163 | 184 | ||
| 164 | return map; | 185 | return map; |
| 165 | } | 186 | } |
| 187 | |||
| 188 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
| 189 | unsigned long pnum_begin, | ||
| 190 | unsigned long pnum_end, | ||
| 191 | unsigned long map_count, int nodeid) | ||
| 192 | { | ||
| 193 | unsigned long pnum; | ||
| 194 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
| 195 | void *vmemmap_buf_start; | ||
| 196 | |||
| 197 | size = ALIGN(size, PMD_SIZE); | ||
| 198 | vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, | ||
| 199 | PMD_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
| 200 | |||
| 201 | if (vmemmap_buf_start) { | ||
| 202 | vmemmap_buf = vmemmap_buf_start; | ||
| 203 | vmemmap_buf_end = vmemmap_buf_start + size * map_count; | ||
| 204 | } | ||
| 205 | |||
| 206 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 207 | struct mem_section *ms; | ||
| 208 | |||
| 209 | if (!present_section_nr(pnum)) | ||
| 210 | continue; | ||
| 211 | |||
| 212 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
| 213 | if (map_map[pnum]) | ||
| 214 | continue; | ||
| 215 | ms = __nr_to_section(pnum); | ||
| 216 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
| 217 | "some memory will not be available.\n", __func__); | ||
| 218 | ms->section_mem_map = 0; | ||
| 219 | } | ||
| 220 | |||
| 221 | if (vmemmap_buf_start) { | ||
| 222 | /* need to free the leftover buffer */ | ||
| 223 | #ifdef CONFIG_NO_BOOTMEM | ||
| 224 | free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); | ||
| 225 | if (vmemmap_buf_start < vmemmap_buf) { | ||
| 226 | char name[15]; | ||
| 227 | |||
| 228 | snprintf(name, sizeof(name), "MEMMAP %d", nodeid); | ||
| 229 | reserve_early_without_check(__pa(vmemmap_buf_start), | ||
| 230 | __pa(vmemmap_buf), name); | ||
| 231 | } | ||
| 232 | #else | ||
| 233 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | ||
| 234 | #endif | ||
| 235 | vmemmap_buf = NULL; | ||
| 236 | vmemmap_buf_end = NULL; | ||
| 237 | } | ||
| 238 | } | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab69e99..dc0cc4d43ff3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | * sparse memory mappings. | 2 | * sparse memory mappings. |
| 3 | */ | 3 | */ |
| 4 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
| 5 | #include <linux/slab.h> | ||
| 5 | #include <linux/mmzone.h> | 6 | #include <linux/mmzone.h> |
| 6 | #include <linux/bootmem.h> | 7 | #include <linux/bootmem.h> |
| 7 | #include <linux/highmem.h> | 8 | #include <linux/highmem.h> |
| @@ -271,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
| 271 | 272 | ||
| 272 | #ifdef CONFIG_MEMORY_HOTREMOVE | 273 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 273 | static unsigned long * __init | 274 | static unsigned long * __init |
| 274 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
| 276 | unsigned long count) | ||
| 275 | { | 277 | { |
| 276 | unsigned long section_nr; | 278 | unsigned long section_nr; |
| 277 | 279 | ||
| @@ -286,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | |||
| 286 | * this problem. | 288 | * this problem. |
| 287 | */ | 289 | */ |
| 288 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 290 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); |
| 289 | return alloc_bootmem_section(usemap_size(), section_nr); | 291 | return alloc_bootmem_section(usemap_size() * count, section_nr); |
| 290 | } | 292 | } |
| 291 | 293 | ||
| 292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 294 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
| @@ -329,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
| 329 | } | 331 | } |
| 330 | #else | 332 | #else |
| 331 | static unsigned long * __init | 333 | static unsigned long * __init |
| 332 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 334 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
| 335 | unsigned long count) | ||
| 333 | { | 336 | { |
| 334 | return NULL; | 337 | return NULL; |
| 335 | } | 338 | } |
| @@ -339,27 +342,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
| 339 | } | 342 | } |
| 340 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 343 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
| 341 | 344 | ||
| 342 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 345 | static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, |
| 346 | unsigned long pnum_begin, | ||
| 347 | unsigned long pnum_end, | ||
| 348 | unsigned long usemap_count, int nodeid) | ||
| 343 | { | 349 | { |
| 344 | unsigned long *usemap; | 350 | void *usemap; |
| 345 | struct mem_section *ms = __nr_to_section(pnum); | 351 | unsigned long pnum; |
| 346 | int nid = sparse_early_nid(ms); | 352 | int size = usemap_size(); |
| 347 | |||
| 348 | usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); | ||
| 349 | if (usemap) | ||
| 350 | return usemap; | ||
| 351 | 353 | ||
| 352 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 354 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
| 355 | usemap_count); | ||
| 353 | if (usemap) { | 356 | if (usemap) { |
| 354 | check_usemap_section_nr(nid, usemap); | 357 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
| 355 | return usemap; | 358 | if (!present_section_nr(pnum)) |
| 359 | continue; | ||
| 360 | usemap_map[pnum] = usemap; | ||
| 361 | usemap += size; | ||
| 362 | } | ||
| 363 | return; | ||
| 356 | } | 364 | } |
| 357 | 365 | ||
| 358 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | 366 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
| 359 | nid = 0; | 367 | if (usemap) { |
| 368 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 369 | if (!present_section_nr(pnum)) | ||
| 370 | continue; | ||
| 371 | usemap_map[pnum] = usemap; | ||
| 372 | usemap += size; | ||
| 373 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
| 374 | } | ||
| 375 | return; | ||
| 376 | } | ||
| 360 | 377 | ||
| 361 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | 378 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
| 362 | return NULL; | ||
| 363 | } | 379 | } |
| 364 | 380 | ||
| 365 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 381 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
| @@ -375,8 +391,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
| 375 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); | 391 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); |
| 376 | return map; | 392 | return map; |
| 377 | } | 393 | } |
| 394 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
| 395 | unsigned long pnum_begin, | ||
| 396 | unsigned long pnum_end, | ||
| 397 | unsigned long map_count, int nodeid) | ||
| 398 | { | ||
| 399 | void *map; | ||
| 400 | unsigned long pnum; | ||
| 401 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
| 402 | |||
| 403 | map = alloc_remap(nodeid, size * map_count); | ||
| 404 | if (map) { | ||
| 405 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 406 | if (!present_section_nr(pnum)) | ||
| 407 | continue; | ||
| 408 | map_map[pnum] = map; | ||
| 409 | map += size; | ||
| 410 | } | ||
| 411 | return; | ||
| 412 | } | ||
| 413 | |||
| 414 | size = PAGE_ALIGN(size); | ||
| 415 | map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); | ||
| 416 | if (map) { | ||
| 417 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 418 | if (!present_section_nr(pnum)) | ||
| 419 | continue; | ||
| 420 | map_map[pnum] = map; | ||
| 421 | map += size; | ||
| 422 | } | ||
| 423 | return; | ||
| 424 | } | ||
| 425 | |||
| 426 | /* fallback */ | ||
| 427 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 428 | struct mem_section *ms; | ||
| 429 | |||
| 430 | if (!present_section_nr(pnum)) | ||
| 431 | continue; | ||
| 432 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
| 433 | if (map_map[pnum]) | ||
| 434 | continue; | ||
| 435 | ms = __nr_to_section(pnum); | ||
| 436 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
| 437 | "some memory will not be available.\n", __func__); | ||
| 438 | ms->section_mem_map = 0; | ||
| 439 | } | ||
| 440 | } | ||
| 378 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 441 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
| 379 | 442 | ||
| 443 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 444 | static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, | ||
| 445 | unsigned long pnum_begin, | ||
| 446 | unsigned long pnum_end, | ||
| 447 | unsigned long map_count, int nodeid) | ||
| 448 | { | ||
| 449 | sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, | ||
| 450 | map_count, nodeid); | ||
| 451 | } | ||
| 452 | #else | ||
| 380 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 453 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
| 381 | { | 454 | { |
| 382 | struct page *map; | 455 | struct page *map; |
| @@ -392,10 +465,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
| 392 | ms->section_mem_map = 0; | 465 | ms->section_mem_map = 0; |
| 393 | return NULL; | 466 | return NULL; |
| 394 | } | 467 | } |
| 468 | #endif | ||
| 395 | 469 | ||
| 396 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | 470 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) |
| 397 | { | 471 | { |
| 398 | } | 472 | } |
| 473 | |||
| 399 | /* | 474 | /* |
| 400 | * Allocate the accumulated non-linear sections, allocate a mem_map | 475 | * Allocate the accumulated non-linear sections, allocate a mem_map |
| 401 | * for each and record the physical to section mapping. | 476 | * for each and record the physical to section mapping. |
| @@ -407,6 +482,14 @@ void __init sparse_init(void) | |||
| 407 | unsigned long *usemap; | 482 | unsigned long *usemap; |
| 408 | unsigned long **usemap_map; | 483 | unsigned long **usemap_map; |
| 409 | int size; | 484 | int size; |
| 485 | int nodeid_begin = 0; | ||
| 486 | unsigned long pnum_begin = 0; | ||
| 487 | unsigned long usemap_count; | ||
| 488 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 489 | unsigned long map_count; | ||
| 490 | int size2; | ||
| 491 | struct page **map_map; | ||
| 492 | #endif | ||
| 410 | 493 | ||
| 411 | /* | 494 | /* |
| 412 | * map is using big page (aka 2M in x86 64 bit) | 495 | * map is using big page (aka 2M in x86 64 bit) |
| @@ -425,10 +508,81 @@ void __init sparse_init(void) | |||
| 425 | panic("can not allocate usemap_map\n"); | 508 | panic("can not allocate usemap_map\n"); |
| 426 | 509 | ||
| 427 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 510 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
| 511 | struct mem_section *ms; | ||
| 512 | |||
| 428 | if (!present_section_nr(pnum)) | 513 | if (!present_section_nr(pnum)) |
| 429 | continue; | 514 | continue; |
| 430 | usemap_map[pnum] = sparse_early_usemap_alloc(pnum); | 515 | ms = __nr_to_section(pnum); |
| 516 | nodeid_begin = sparse_early_nid(ms); | ||
| 517 | pnum_begin = pnum; | ||
| 518 | break; | ||
| 431 | } | 519 | } |
| 520 | usemap_count = 1; | ||
| 521 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
| 522 | struct mem_section *ms; | ||
| 523 | int nodeid; | ||
| 524 | |||
| 525 | if (!present_section_nr(pnum)) | ||
| 526 | continue; | ||
| 527 | ms = __nr_to_section(pnum); | ||
| 528 | nodeid = sparse_early_nid(ms); | ||
| 529 | if (nodeid == nodeid_begin) { | ||
| 530 | usemap_count++; | ||
| 531 | continue; | ||
| 532 | } | ||
| 533 | /* ok, we need to take care of pnum_begin to pnum - 1 */ | ||
| 534 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, | ||
| 535 | usemap_count, nodeid_begin); | ||
| 536 | /* new start, update count etc. */ | ||
| 537 | nodeid_begin = nodeid; | ||
| 538 | pnum_begin = pnum; | ||
| 539 | usemap_count = 1; | ||
| 540 | } | ||
| 541 | /* ok, last chunk */ | ||
| 542 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, | ||
| 543 | usemap_count, nodeid_begin); | ||
| 544 | |||
| 545 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 546 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | ||
| 547 | map_map = alloc_bootmem(size2); | ||
| 548 | if (!map_map) | ||
| 549 | panic("can not allocate map_map\n"); | ||
| 550 | |||
| 551 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
| 552 | struct mem_section *ms; | ||
| 553 | |||
| 554 | if (!present_section_nr(pnum)) | ||
| 555 | continue; | ||
| 556 | ms = __nr_to_section(pnum); | ||
| 557 | nodeid_begin = sparse_early_nid(ms); | ||
| 558 | pnum_begin = pnum; | ||
| 559 | break; | ||
| 560 | } | ||
| 561 | map_count = 1; | ||
| 562 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
| 563 | struct mem_section *ms; | ||
| 564 | int nodeid; | ||
| 565 | |||
| 566 | if (!present_section_nr(pnum)) | ||
| 567 | continue; | ||
| 568 | ms = __nr_to_section(pnum); | ||
| 569 | nodeid = sparse_early_nid(ms); | ||
| 570 | if (nodeid == nodeid_begin) { | ||
| 571 | map_count++; | ||
| 572 | continue; | ||
| 573 | } | ||
| 574 | /* ok, we need to take care of sections from pnum_begin to pnum - 1 */ | ||
| 575 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, | ||
| 576 | map_count, nodeid_begin); | ||
| 577 | /* new start, update count etc. */ | ||
| 578 | nodeid_begin = nodeid; | ||
| 579 | pnum_begin = pnum; | ||
| 580 | map_count = 1; | ||
| 581 | } | ||
| 582 | /* ok, last chunk */ | ||
| 583 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, | ||
| 584 | map_count, nodeid_begin); | ||
| 585 | #endif | ||
| 432 | 586 | ||
| 433 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 587 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
| 434 | if (!present_section_nr(pnum)) | 588 | if (!present_section_nr(pnum)) |
| @@ -438,7 +592,11 @@ void __init sparse_init(void) | |||
| 438 | if (!usemap) | 592 | if (!usemap) |
| 439 | continue; | 593 | continue; |
| 440 | 594 | ||
| 595 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 596 | map = map_map[pnum]; | ||
| 597 | #else | ||
| 441 | map = sparse_early_mem_map_alloc(pnum); | 598 | map = sparse_early_mem_map_alloc(pnum); |
| 599 | #endif | ||
| 442 | if (!map) | 600 | if (!map) |
| 443 | continue; | 601 | continue; |
| 444 | 602 | ||
| @@ -448,6 +606,9 @@ void __init sparse_init(void) | |||
| 448 | 606 | ||
| 449 | vmemmap_populate_print_last(); | 607 | vmemmap_populate_print_last(); |
| 450 | 608 | ||
| 609 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 610 | free_bootmem(__pa(map_map), size2); | ||
| 611 | #endif | ||
| 451 | free_bootmem(__pa(usemap_map), size); | 612 | free_bootmem(__pa(usemap_map), size); |
| 452 | } | 613 | } |
| 453 | 614 | ||
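Both scans added above use the same batching idiom: walk the present sections in order, extend the current run while the node id matches, and flush one node-local allocation for the whole run when it changes, plus a final flush for the last run. A minimal userspace model of that idiom, with a made-up section table standing in for present_section_nr()/sparse_early_nid() and the two-phase scan folded into a single loop:

#include <stdio.h>

#define NR_SECTIONS 10

/* -1 marks a hole; otherwise the owning node id (values invented) */
static int section_node[NR_SECTIONS] = { 0, 0, -1, 0, 1, 1, -1, 1, 2, 2 };

static void alloc_for_node(int begin, int end, int count, int node)
{
	printf("node %d: sections [%d, %d), %d present, one allocation\n",
	       node, begin, end, count);
}

int main(void)
{
	int pnum, pnum_begin = 0, node_begin = -1, count = 0;

	for (pnum = 0; pnum < NR_SECTIONS; pnum++) {
		if (section_node[pnum] < 0)
			continue;		/* like !present_section_nr() */
		if (node_begin < 0) {		/* first present section */
			node_begin = section_node[pnum];
			pnum_begin = pnum;
			count = 1;
			continue;
		}
		if (section_node[pnum] == node_begin) {
			count++;
			continue;
		}
		/* node changed: take care of sections pnum_begin .. pnum - 1 */
		alloc_for_node(pnum_begin, pnum, count, node_begin);
		node_begin = section_node[pnum];
		pnum_begin = pnum;
		count = 1;
	}
	if (node_begin >= 0)			/* last run */
		alloc_for_node(pnum_begin, NR_SECTIONS, count, node_begin);
	return 0;
}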
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
| 31 | #include <linux/backing-dev.h> | 31 | #include <linux/backing-dev.h> |
| 32 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
| 33 | #include <linux/gfp.h> | ||
| 33 | 34 | ||
| 34 | #include "internal.h" | 35 | #include "internal.h" |
| 35 | 36 | ||
| @@ -55,7 +56,7 @@ static void __page_cache_release(struct page *page) | |||
| 55 | del_page_from_lru(zone, page); | 56 | del_page_from_lru(zone, page); |
| 56 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 57 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 57 | } | 58 | } |
| 58 | free_hot_page(page); | 59 | free_hot_cold_page(page, 0); |
| 59 | } | 60 | } |
| 60 | 61 | ||
| 61 | static void put_compound_page(struct page *page) | 62 | static void put_compound_page(struct page *page) |
diff --git a/mm/swap_state.c b/mm/swap_state.c index 6d1daeb1cb4a..e10f5833167f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | */ | 8 | */ |
| 9 | #include <linux/module.h> | 9 | #include <linux/module.h> |
| 10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
| 11 | #include <linux/gfp.h> | ||
| 11 | #include <linux/kernel_stat.h> | 12 | #include <linux/kernel_stat.h> |
| 12 | #include <linux/swap.h> | 13 | #include <linux/swap.h> |
| 13 | #include <linux/swapops.h> | 14 | #include <linux/swapops.h> |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c0585b16418..6cd0a8f90dc7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) | |||
| 723 | return p != NULL; | 723 | return p != NULL; |
| 724 | } | 724 | } |
| 725 | 725 | ||
| 726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
| 727 | /** | ||
| 728 | * mem_cgroup_count_swap_user - count the users of a swap entry | ||
| 729 | * @ent: the swap entry to be checked | ||
| 730 | * @pagep: location to store the swap cache page of the entry, if any | ||
| 731 | * | ||
| 732 | * Returns the number of users of the swap entry. The number is valid only | ||
| 733 | * for swaps of anonymous pages. | ||
| 734 | * If the entry is found in the swap cache, the page is stored to *pagep | ||
| 735 | * with its refcount incremented. | ||
| 736 | */ | ||
| 737 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
| 738 | { | ||
| 739 | struct page *page; | ||
| 740 | struct swap_info_struct *p; | ||
| 741 | int count = 0; | ||
| 742 | |||
| 743 | page = find_get_page(&swapper_space, ent.val); | ||
| 744 | if (page) | ||
| 745 | count += page_mapcount(page); | ||
| 746 | p = swap_info_get(ent); | ||
| 747 | if (p) { | ||
| 748 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
| 749 | spin_unlock(&swap_lock); | ||
| 750 | } | ||
| 751 | |||
| 752 | *pagep = page; | ||
| 753 | return count; | ||
| 754 | } | ||
| 755 | #endif | ||
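The count returned above is the mapcount of the entry's swap-cache page, if one exists, plus the owner count stored in the swap map, so a value greater than one means the entry is shared. A hedged call-site sketch; the surrounding decision logic is invented for illustration, and the only firm requirement shown is dropping the page reference the helper takes:

	struct page *page;
	bool shared = mem_cgroup_count_swap_user(ent, &page) > 1;

	/* ... e.g. move a memcg charge only if the entry is not shared ... */

	if (page)
		put_page(page);	/* pairs with find_get_page() in the helper */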
| 756 | |||
| 726 | #ifdef CONFIG_HIBERNATION | 757 | #ifdef CONFIG_HIBERNATION |
| 727 | /* | 758 | /* |
| 728 | * Find the swap type that corresponds to given device (if any). | 759 | * Find the swap type that corresponds to given device (if any). |
| @@ -840,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 840 | goto out; | 871 | goto out; |
| 841 | } | 872 | } |
| 842 | 873 | ||
| 843 | inc_mm_counter(vma->vm_mm, anon_rss); | 874 | dec_mm_counter(vma->vm_mm, MM_SWAPENTS); |
| 875 | inc_mm_counter(vma->vm_mm, MM_ANONPAGES); | ||
| 844 | get_page(page); | 876 | get_page(page); |
| 845 | set_pte_at(vma->vm_mm, addr, pte, | 877 | set_pte_at(vma->vm_mm, addr, pte, |
| 846 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 878 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
| @@ -1759,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1759 | unsigned int type; | 1791 | unsigned int type; |
| 1760 | int i, prev; | 1792 | int i, prev; |
| 1761 | int error; | 1793 | int error; |
| 1762 | union swap_header *swap_header = NULL; | 1794 | union swap_header *swap_header; |
| 1763 | unsigned int nr_good_pages = 0; | 1795 | unsigned int nr_good_pages; |
| 1764 | int nr_extents = 0; | 1796 | int nr_extents = 0; |
| 1765 | sector_t span; | 1797 | sector_t span; |
| 1766 | unsigned long maxpages = 1; | 1798 | unsigned long maxpages; |
| 1767 | unsigned long swapfilepages; | 1799 | unsigned long swapfilepages; |
| 1768 | unsigned char *swap_map = NULL; | 1800 | unsigned char *swap_map = NULL; |
| 1769 | struct page *page = NULL; | 1801 | struct page *page = NULL; |
| @@ -1922,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1922 | * swap pte. | 1954 | * swap pte. |
| 1923 | */ | 1955 | */ |
| 1924 | maxpages = swp_offset(pte_to_swp_entry( | 1956 | maxpages = swp_offset(pte_to_swp_entry( |
| 1925 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1957 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
| 1926 | if (maxpages > swap_header->info.last_page) | 1958 | if (maxpages > swap_header->info.last_page) { |
| 1927 | maxpages = swap_header->info.last_page; | 1959 | maxpages = swap_header->info.last_page + 1; |
| 1960 | /* p->max is an unsigned int: don't overflow it */ | ||
| 1961 | if ((unsigned int)maxpages == 0) | ||
| 1962 | maxpages = UINT_MAX; | ||
| 1963 | } | ||
| 1928 | p->highest_bit = maxpages - 1; | 1964 | p->highest_bit = maxpages - 1; |
| 1929 | 1965 | ||
| 1930 | error = -EINVAL; | 1966 | error = -EINVAL; |
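Both sign flips in this hunk are off-by-one fixes: offsets 0 through the largest value a swap pte can encode represent that maximum plus one pages, and a header whose last_page is N describes N + 1 pages (indices 0..N). A standalone model of the corrected arithmetic, with invented stand-ins for the pte encoding limit and the header field:

#include <stdio.h>

int main(void)
{
	/* stand-in for swp_offset(pte_to_swp_entry(swp_entry_to_pte(
	 * swp_entry(0, ~0UL)))), the largest encodable swap offset */
	unsigned long max_encodable = (1UL << 50) - 1;
	unsigned long last_page = 1024;		/* from the swap header */
	unsigned long maxpages;

	maxpages = max_encodable + 1;		/* offsets 0 .. max */
	if (maxpages > last_page)
		maxpages = last_page + 1;	/* pages 0 .. last_page */

	printf("maxpages = %lu, highest_bit = %lu\n", maxpages, maxpages - 1);
	return 0;				/* prints 1025 and 1024 */
}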
| @@ -1948,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1948 | } | 1984 | } |
| 1949 | 1985 | ||
| 1950 | memset(swap_map, 0, maxpages); | 1986 | memset(swap_map, 0, maxpages); |
| 1987 | nr_good_pages = maxpages - 1; /* omit header page */ | ||
| 1988 | |||
| 1951 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1989 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
| 1952 | int page_nr = swap_header->info.badpages[i]; | 1990 | unsigned int page_nr = swap_header->info.badpages[i]; |
| 1953 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1991 | if (page_nr == 0 || page_nr > swap_header->info.last_page) { |
| 1954 | error = -EINVAL; | 1992 | error = -EINVAL; |
| 1955 | goto bad_swap; | 1993 | goto bad_swap; |
| 1956 | } | 1994 | } |
| 1957 | swap_map[page_nr] = SWAP_MAP_BAD; | 1995 | if (page_nr < maxpages) { |
| 1996 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
| 1997 | nr_good_pages--; | ||
| 1998 | } | ||
| 1958 | } | 1999 | } |
| 1959 | 2000 | ||
| 1960 | error = swap_cgroup_swapon(type, maxpages); | 2001 | error = swap_cgroup_swapon(type, maxpages); |
| 1961 | if (error) | 2002 | if (error) |
| 1962 | goto bad_swap; | 2003 | goto bad_swap; |
| 1963 | 2004 | ||
| 1964 | nr_good_pages = swap_header->info.last_page - | ||
| 1965 | swap_header->info.nr_badpages - | ||
| 1966 | 1 /* header page */; | ||
| 1967 | |||
| 1968 | if (nr_good_pages) { | 2005 | if (nr_good_pages) { |
| 1969 | swap_map[0] = SWAP_MAP_BAD; | 2006 | swap_map[0] = SWAP_MAP_BAD; |
| 1970 | p->max = maxpages; | 2007 | p->max = maxpages; |
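The accounting change above makes nr_good_pages start at maxpages - 1 (the header page is never usable) and drop only for bad pages that actually land inside the map, rather than trusting the header's nr_badpages wholesale. A small model of the new accounting; all values are invented, and maxpages is assumed to have been clamped below last_page by the swap-pte encoding limit so that one listed bad page falls outside the usable map:

#include <stdio.h>

int main(void)
{
	unsigned long maxpages = 1000;			/* clamped map size */
	unsigned int last_page = 1024;			/* from the header */
	unsigned int badpages[] = { 3, 7, 1010 };	/* 1010 >= maxpages */
	unsigned long nr_good = maxpages - 1;		/* omit header page */
	unsigned int i;

	for (i = 0; i < sizeof(badpages) / sizeof(badpages[0]); i++) {
		if (badpages[i] == 0 || badpages[i] > last_page)
			return 1;	/* corrupt header, like -EINVAL */
		if (badpages[i] < maxpages)
			nr_good--;	/* only in-map bad pages cost a slot */
	}
	printf("nr_good_pages = %lu\n", nr_good);	/* prints 997 */
	return 0;
}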
| @@ -2155,7 +2192,11 @@ void swap_shmem_alloc(swp_entry_t entry) | |||
| 2155 | } | 2192 | } |
| 2156 | 2193 | ||
| 2157 | /* | 2194 | /* |
| 2158 | * increase reference count of swap entry by 1. | 2195 | * Increase reference count of swap entry by 1. |
| 2196 | * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required | ||
| 2197 | * but could not be atomically allocated. Returns 0, just as if it succeeded, | ||
| 2198 | * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which | ||
| 2199 | * might occur if a page table entry has got corrupted. | ||
| 2159 | */ | 2200 | */ |
| 2160 | int swap_duplicate(swp_entry_t entry) | 2201 | int swap_duplicate(swp_entry_t entry) |
| 2161 | { | 2202 | { |
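A hedged sketch of the call-site pattern this return convention supports, modeled on the swap-count continuation API from the same kernel series; the control flow and the out_nomem label are assumptions for illustration, not taken from this diff:

	if (swap_duplicate(entry) < 0) {
		/* -ENOMEM: no continuation could be allocated atomically;
		 * allocate one with a sleeping GFP, then retry */
		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
			goto out_nomem;		/* hypothetical error path */
		swap_duplicate(entry);		/* expected to succeed now */
	}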
diff --git a/mm/truncate.c b/mm/truncate.c index e87e37244829..f42675a3615d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #include <linux/kernel.h> | 10 | #include <linux/kernel.h> |
| 11 | #include <linux/backing-dev.h> | 11 | #include <linux/backing-dev.h> |
| 12 | #include <linux/gfp.h> | ||
| 12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
| 13 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
| 14 | #include <linux/module.h> | 15 | #include <linux/module.h> |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -186,6 +186,27 @@ | |||
| 186 | } | 186 | } |
| 187 | EXPORT_SYMBOL(kzfree); | 187 | EXPORT_SYMBOL(kzfree); |
| 188 | 188 | ||
| 189 | int kern_ptr_validate(const void *ptr, unsigned long size) | ||
| 190 | { | ||
| 191 | unsigned long addr = (unsigned long)ptr; | ||
| 192 | unsigned long min_addr = PAGE_OFFSET; | ||
| 193 | unsigned long align_mask = sizeof(void *) - 1; | ||
| 194 | |||
| 195 | if (unlikely(addr < min_addr)) | ||
| 196 | goto out; | ||
| 197 | if (unlikely(addr > (unsigned long)high_memory - size)) | ||
| 198 | goto out; | ||
| 199 | if (unlikely(addr & align_mask)) | ||
| 200 | goto out; | ||
| 201 | if (unlikely(!kern_addr_valid(addr))) | ||
| 202 | goto out; | ||
| 203 | if (unlikely(!kern_addr_valid(addr + size - 1))) | ||
| 204 | goto out; | ||
| 205 | return 1; | ||
| 206 | out: | ||
| 207 | return 0; | ||
| 208 | } | ||
| 209 | |||
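kern_ptr_validate() rejects a pointer unless it lies within the kernel's directly mapped range, the whole object fits below high_memory, the pointer is aligned for a pointer-sized object, and both its first and last byte pass kern_addr_valid(). A userspace model of the bounds and alignment checks, with invented range limits (kern_addr_valid() has no userspace analog):

#include <stdint.h>
#include <stdio.h>

#define MIN_ADDR 0x1000UL	/* stand-in for PAGE_OFFSET */
#define MAX_ADDR 0x100000UL	/* stand-in for high_memory */

static int ptr_validate(const void *ptr, unsigned long size)
{
	uintptr_t addr = (uintptr_t)ptr;

	if (addr < MIN_ADDR)
		return 0;	/* below the mapped region */
	if (addr > MAX_ADDR - size)
		return 0;	/* object runs past the region end */
	if (addr & (sizeof(void *) - 1))
		return 0;	/* misaligned */
	return 1;
}

int main(void)
{
	printf("%d %d\n", ptr_validate((void *)0x2000, 64),
	       ptr_validate((void *)0x2001, 64));	/* prints 1 0 */
	return 0;
}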
| 189 | /* | 210 | /* |
| 190 | * strndup_user - duplicate an existing string from user space | 211 | * strndup_user - duplicate an existing string from user space |
| 191 | * @s: The string to duplicate | 212 | * @s: The string to duplicate |
diff --git a/mm/vmscan.c b/mm/vmscan.c index c26986c85ce0..3ff3311447f5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -13,7 +13,7 @@ | |||
| 13 | 13 | ||
| 14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
| 15 | #include <linux/module.h> | 15 | #include <linux/module.h> |
| 16 | #include <linux/slab.h> | 16 | #include <linux/gfp.h> |
| 17 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
| 18 | #include <linux/swap.h> | 18 | #include <linux/swap.h> |
| 19 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
| @@ -262,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
| 262 | return ret; | 262 | return ret; |
| 263 | } | 263 | } |
| 264 | 264 | ||
| 265 | /* Called without lock on whether page is mapped, so answer is unstable */ | ||
| 266 | static inline int page_mapping_inuse(struct page *page) | ||
| 267 | { | ||
| 268 | struct address_space *mapping; | ||
| 269 | |||
| 270 | /* Page is in somebody's page tables. */ | ||
| 271 | if (page_mapped(page)) | ||
| 272 | return 1; | ||
| 273 | |||
| 274 | /* Be more reluctant to reclaim swapcache than pagecache */ | ||
| 275 | if (PageSwapCache(page)) | ||
| 276 | return 1; | ||
| 277 | |||
| 278 | mapping = page_mapping(page); | ||
| 279 | if (!mapping) | ||
| 280 | return 0; | ||
| 281 | |||
| 282 | /* File is mmap'd by somebody? */ | ||
| 283 | return mapping_mapped(mapping); | ||
| 284 | } | ||
| 285 | |||
| 286 | static inline int is_page_cache_freeable(struct page *page) | 265 | static inline int is_page_cache_freeable(struct page *page) |
| 287 | { | 266 | { |
| 288 | /* | 267 | /* |
| @@ -579,6 +558,65 @@ redo: | |||
| 579 | put_page(page); /* drop ref from isolate */ | 558 | put_page(page); /* drop ref from isolate */ |
| 580 | } | 559 | } |
| 581 | 560 | ||
| 561 | enum page_references { | ||
| 562 | PAGEREF_RECLAIM, | ||
| 563 | PAGEREF_RECLAIM_CLEAN, | ||
| 564 | PAGEREF_KEEP, | ||
| 565 | PAGEREF_ACTIVATE, | ||
| 566 | }; | ||
| 567 | |||
| 568 | static enum page_references page_check_references(struct page *page, | ||
| 569 | struct scan_control *sc) | ||
| 570 | { | ||
| 571 | int referenced_ptes, referenced_page; | ||
| 572 | unsigned long vm_flags; | ||
| 573 | |||
| 574 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); | ||
| 575 | referenced_page = TestClearPageReferenced(page); | ||
| 576 | |||
| 577 | /* Lumpy reclaim - ignore references */ | ||
| 578 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 579 | return PAGEREF_RECLAIM; | ||
| 580 | |||
| 581 | /* | ||
| 582 | * Mlock lost the isolation race with us. Let try_to_unmap() | ||
| 583 | * move the page to the unevictable list. | ||
| 584 | */ | ||
| 585 | if (vm_flags & VM_LOCKED) | ||
| 586 | return PAGEREF_RECLAIM; | ||
| 587 | |||
| 588 | if (referenced_ptes) { | ||
| 589 | if (PageAnon(page)) | ||
| 590 | return PAGEREF_ACTIVATE; | ||
| 591 | /* | ||
| 592 | * All mapped pages start out with page table | ||
| 593 | * references from the instantiating fault, so we need | ||
| 594 | * to look twice if a mapped file page is used more | ||
| 595 | * than once. | ||
| 596 | * | ||
| 597 | * Mark it and spare it for another trip around the | ||
| 598 | * inactive list. Another page table reference will | ||
| 599 | * lead to its activation. | ||
| 600 | * | ||
| 601 | * Note: the mark is set for activated pages as well | ||
| 602 | * so that recently deactivated but used pages are | ||
| 603 | * quickly recovered. | ||
| 604 | */ | ||
| 605 | SetPageReferenced(page); | ||
| 606 | |||
| 607 | if (referenced_page) | ||
| 608 | return PAGEREF_ACTIVATE; | ||
| 609 | |||
| 610 | return PAGEREF_KEEP; | ||
| 611 | } | ||
| 612 | |||
| 613 | /* Reclaim if clean, defer dirty pages to writeback */ | ||
| 614 | if (referenced_page) | ||
| 615 | return PAGEREF_RECLAIM_CLEAN; | ||
| 616 | |||
| 617 | return PAGEREF_RECLAIM; | ||
| 618 | } | ||
| 619 | |||
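page_check_references() folds the old referenced/page_mapping_inuse() tests into one decision table: referenced anonymous pages activate immediately, referenced mapped file pages get one free trip around the inactive list (via SetPageReferenced) and activate on a second reference, and unreferenced pages are reclaimed, with clean ones preferred when PG_referenced was set. A runnable userspace model of the table, omitting the lumpy-reclaim and VM_LOCKED early returns and the SetPageReferenced side effect:

#include <stdio.h>

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references check(int pte_refs, int pg_referenced, int anon)
{
	if (pte_refs) {
		if (anon)
			return PAGEREF_ACTIVATE;
		if (pg_referenced)		/* second reference: activate */
			return PAGEREF_ACTIVATE;
		return PAGEREF_KEEP;		/* first reference: spare it */
	}
	if (pg_referenced)
		return PAGEREF_RECLAIM_CLEAN;	/* reclaim only if clean */
	return PAGEREF_RECLAIM;
}

int main(void)
{
	/* a mapped file page seen once survives; seen twice, it activates */
	printf("%d %d\n", check(1, 0, 0), check(1, 1, 0));	/* prints 2 3 */
	return 0;
}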
| 582 | /* | 620 | /* |
| 583 | * shrink_page_list() returns the number of reclaimed pages | 621 | * shrink_page_list() returns the number of reclaimed pages |
| 584 | */ | 622 | */ |
| @@ -590,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 590 | struct pagevec freed_pvec; | 628 | struct pagevec freed_pvec; |
| 591 | int pgactivate = 0; | 629 | int pgactivate = 0; |
| 592 | unsigned long nr_reclaimed = 0; | 630 | unsigned long nr_reclaimed = 0; |
| 593 | unsigned long vm_flags; | ||
| 594 | 631 | ||
| 595 | cond_resched(); | 632 | cond_resched(); |
| 596 | 633 | ||
| 597 | pagevec_init(&freed_pvec, 1); | 634 | pagevec_init(&freed_pvec, 1); |
| 598 | while (!list_empty(page_list)) { | 635 | while (!list_empty(page_list)) { |
| 636 | enum page_references references; | ||
| 599 | struct address_space *mapping; | 637 | struct address_space *mapping; |
| 600 | struct page *page; | 638 | struct page *page; |
| 601 | int may_enter_fs; | 639 | int may_enter_fs; |
| 602 | int referenced; | ||
| 603 | 640 | ||
| 604 | cond_resched(); | 641 | cond_resched(); |
| 605 | 642 | ||
| @@ -641,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 641 | goto keep_locked; | 678 | goto keep_locked; |
| 642 | } | 679 | } |
| 643 | 680 | ||
| 644 | referenced = page_referenced(page, 1, | 681 | references = page_check_references(page, sc); |
| 645 | sc->mem_cgroup, &vm_flags); | 682 | switch (references) { |
| 646 | /* | 683 | case PAGEREF_ACTIVATE: |
| 647 | * In active use or really unfreeable? Activate it. | ||
| 648 | * If a page with PG_mlocked lost the isolation race, | ||
| 649 | * try_to_unmap moves it to the unevictable list | ||
| 650 | */ | ||
| 651 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | ||
| 652 | referenced && page_mapping_inuse(page) | ||
| 653 | && !(vm_flags & VM_LOCKED)) | ||
| 654 | goto activate_locked; | 684 | goto activate_locked; |
| 685 | case PAGEREF_KEEP: | ||
| 686 | goto keep_locked; | ||
| 687 | case PAGEREF_RECLAIM: | ||
| 688 | case PAGEREF_RECLAIM_CLEAN: | ||
| 689 | ; /* try to reclaim the page below */ | ||
| 690 | } | ||
| 655 | 691 | ||
| 656 | /* | 692 | /* |
| 657 | * Anonymous process memory has backing store? | 693 | * Anonymous process memory has backing store? |
| @@ -685,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 685 | } | 721 | } |
| 686 | 722 | ||
| 687 | if (PageDirty(page)) { | 723 | if (PageDirty(page)) { |
| 688 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) | 724 | if (references == PAGEREF_RECLAIM_CLEAN) |
| 689 | goto keep_locked; | 725 | goto keep_locked; |
| 690 | if (!may_enter_fs) | 726 | if (!may_enter_fs) |
| 691 | goto keep_locked; | 727 | goto keep_locked; |
| @@ -1350,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1350 | continue; | 1386 | continue; |
| 1351 | } | 1387 | } |
| 1352 | 1388 | ||
| 1353 | /* page_referenced clears PageReferenced */ | 1389 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
| 1354 | if (page_mapping_inuse(page) && | ||
| 1355 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | ||
| 1356 | nr_rotated++; | 1390 | nr_rotated++; |
| 1357 | /* | 1391 | /* |
| 1358 | * Identify referenced, file-backed active pages and | 1392 | * Identify referenced, file-backed active pages and |
| @@ -1694,8 +1728,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
| 1694 | continue; | 1728 | continue; |
| 1695 | note_zone_scanning_priority(zone, priority); | 1729 | note_zone_scanning_priority(zone, priority); |
| 1696 | 1730 | ||
| 1697 | if (zone_is_all_unreclaimable(zone) && | 1731 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 1698 | priority != DEF_PRIORITY) | ||
| 1699 | continue; /* Let kswapd poll it */ | 1732 | continue; /* Let kswapd poll it */ |
| 1700 | sc->all_unreclaimable = 0; | 1733 | sc->all_unreclaimable = 0; |
| 1701 | } else { | 1734 | } else { |
| @@ -1922,7 +1955,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
| 1922 | if (!populated_zone(zone)) | 1955 | if (!populated_zone(zone)) |
| 1923 | continue; | 1956 | continue; |
| 1924 | 1957 | ||
| 1925 | if (zone_is_all_unreclaimable(zone)) | 1958 | if (zone->all_unreclaimable) |
| 1926 | continue; | 1959 | continue; |
| 1927 | 1960 | ||
| 1928 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 1961 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), |
| @@ -2012,8 +2045,7 @@ loop_again: | |||
| 2012 | if (!populated_zone(zone)) | 2045 | if (!populated_zone(zone)) |
| 2013 | continue; | 2046 | continue; |
| 2014 | 2047 | ||
| 2015 | if (zone_is_all_unreclaimable(zone) && | 2048 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 2016 | priority != DEF_PRIORITY) | ||
| 2017 | continue; | 2049 | continue; |
| 2018 | 2050 | ||
| 2019 | /* | 2051 | /* |
| @@ -2056,13 +2088,9 @@ loop_again: | |||
| 2056 | if (!populated_zone(zone)) | 2088 | if (!populated_zone(zone)) |
| 2057 | continue; | 2089 | continue; |
| 2058 | 2090 | ||
| 2059 | if (zone_is_all_unreclaimable(zone) && | 2091 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 2060 | priority != DEF_PRIORITY) | ||
| 2061 | continue; | 2092 | continue; |
| 2062 | 2093 | ||
| 2063 | if (!zone_watermark_ok(zone, order, | ||
| 2064 | high_wmark_pages(zone), end_zone, 0)) | ||
| 2065 | all_zones_ok = 0; | ||
| 2066 | temp_priority[i] = priority; | 2094 | temp_priority[i] = priority; |
| 2067 | sc.nr_scanned = 0; | 2095 | sc.nr_scanned = 0; |
| 2068 | note_zone_scanning_priority(zone, priority); | 2096 | note_zone_scanning_priority(zone, priority); |
| @@ -2087,12 +2115,11 @@ loop_again: | |||
| 2087 | lru_pages); | 2115 | lru_pages); |
| 2088 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2116 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
| 2089 | total_scanned += sc.nr_scanned; | 2117 | total_scanned += sc.nr_scanned; |
| 2090 | if (zone_is_all_unreclaimable(zone)) | 2118 | if (zone->all_unreclaimable) |
| 2091 | continue; | 2119 | continue; |
| 2092 | if (nr_slab == 0 && zone->pages_scanned >= | 2120 | if (nr_slab == 0 && |
| 2093 | (zone_reclaimable_pages(zone) * 6)) | 2121 | zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) |
| 2094 | zone_set_flag(zone, | 2122 | zone->all_unreclaimable = 1; |
| 2095 | ZONE_ALL_UNRECLAIMABLE); | ||
| 2096 | /* | 2123 | /* |
| 2097 | * If we've done a decent amount of scanning and | 2124 | * If we've done a decent amount of scanning and |
| 2098 | * the reclaim ratio is low, start doing writepage | 2125 | * the reclaim ratio is low, start doing writepage |
| @@ -2102,13 +2129,18 @@ loop_again: | |||
| 2102 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2129 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
| 2103 | sc.may_writepage = 1; | 2130 | sc.may_writepage = 1; |
| 2104 | 2131 | ||
| 2105 | /* | 2132 | if (!zone_watermark_ok(zone, order, |
| 2106 | * We are still under min water mark. it mean we have | 2133 | high_wmark_pages(zone), end_zone, 0)) { |
| 2107 | * GFP_ATOMIC allocation failure risk. Hurry up! | 2134 | all_zones_ok = 0; |
| 2108 | */ | 2135 | /* |
| 2109 | if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), | 2136 | * We are still under the min watermark. This
| 2110 | end_zone, 0)) | 2137 | * means that we have a GFP_ATOMIC allocation |
| 2111 | has_under_min_watermark_zone = 1; | 2138 | * failure risk. Hurry up! |
| 2139 | */ | ||
| 2140 | if (!zone_watermark_ok(zone, order, | ||
| 2141 | min_wmark_pages(zone), end_zone, 0)) | ||
| 2142 | has_under_min_watermark_zone = 1; | ||
| 2143 | } | ||
| 2112 | 2144 | ||
| 2113 | } | 2145 | } |
| 2114 | if (all_zones_ok) | 2146 | if (all_zones_ok) |
| @@ -2550,6 +2582,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2550 | * and RECLAIM_SWAP. | 2582 | * and RECLAIM_SWAP. |
| 2551 | */ | 2583 | */ |
| 2552 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 2584 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
| 2585 | lockdep_set_current_reclaim_state(gfp_mask); | ||
| 2553 | reclaim_state.reclaimed_slab = 0; | 2586 | reclaim_state.reclaimed_slab = 0; |
| 2554 | p->reclaim_state = &reclaim_state; | 2587 | p->reclaim_state = &reclaim_state; |
| 2555 | 2588 | ||
| @@ -2593,6 +2626,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2593 | 2626 | ||
| 2594 | p->reclaim_state = NULL; | 2627 | p->reclaim_state = NULL; |
| 2595 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2628 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
| 2629 | lockdep_clear_current_reclaim_state(); | ||
| 2596 | return sc.nr_reclaimed >= nr_pages; | 2630 | return sc.nr_reclaimed >= nr_pages; |
| 2597 | } | 2631 | } |
| 2598 | 2632 | ||
| @@ -2615,7 +2649,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2615 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 2649 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
| 2616 | return ZONE_RECLAIM_FULL; | 2650 | return ZONE_RECLAIM_FULL; |
| 2617 | 2651 | ||
| 2618 | if (zone_is_all_unreclaimable(zone)) | 2652 | if (zone->all_unreclaimable) |
| 2619 | return ZONE_RECLAIM_FULL; | 2653 | return ZONE_RECLAIM_FULL; |
| 2620 | 2654 | ||
| 2621 | /* | 2655 | /* |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fbab67ba..fa12ea3051fb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 13 | #include <linux/err.h> | 13 | #include <linux/err.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/slab.h> | ||
| 15 | #include <linux/cpu.h> | 16 | #include <linux/cpu.h> |
| 16 | #include <linux/vmstat.h> | 17 | #include <linux/vmstat.h> |
| 17 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
| @@ -139,7 +140,8 @@ static void refresh_zone_stat_thresholds(void) | |||
| 139 | threshold = calculate_threshold(zone); | 140 | threshold = calculate_threshold(zone); |
| 140 | 141 | ||
| 141 | for_each_online_cpu(cpu) | 142 | for_each_online_cpu(cpu) |
| 142 | zone_pcp(zone, cpu)->stat_threshold = threshold; | 143 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
| 144 | = threshold; | ||
| 143 | } | 145 | } |
| 144 | } | 146 | } |
| 145 | 147 | ||
| @@ -149,7 +151,8 @@ static void refresh_zone_stat_thresholds(void) | |||
| 149 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 151 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| 150 | int delta) | 152 | int delta) |
| 151 | { | 153 | { |
| 152 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 154 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
| 155 | |||
| 153 | s8 *p = pcp->vm_stat_diff + item; | 156 | s8 *p = pcp->vm_stat_diff + item; |
| 154 | long x; | 157 | long x; |
| 155 | 158 | ||
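The zone_pcp() to this_cpu_ptr(zone->pageset) conversions in this file change where the per-cpu pageset is found, not what it does: each CPU still accumulates counter deltas in a small signed per-cpu slot and folds them into the zone-wide counter once the stat threshold is crossed. A standalone single-CPU model of that delta folding; the threshold value is illustrative:

#include <stdio.h>

#define STAT_THRESHOLD 32

static long global_count;		/* the zone-wide counter */
static signed char cpu_diff;		/* one s8 vm_stat_diff slot */

static void mod_state(int delta)
{
	long x = cpu_diff + delta;

	if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
		global_count += x;	/* fold the batched delta */
		x = 0;
	}
	cpu_diff = x;
}

int main(void)
{
	int i;

	for (i = 0; i < 40; i++)
		mod_state(1);
	printf("global=%ld diff=%d\n", global_count, cpu_diff); /* 33 and 7 */
	return 0;
}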
| @@ -202,7 +205,7 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
| 202 | */ | 205 | */ |
| 203 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 206 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
| 204 | { | 207 | { |
| 205 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 208 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
| 206 | s8 *p = pcp->vm_stat_diff + item; | 209 | s8 *p = pcp->vm_stat_diff + item; |
| 207 | 210 | ||
| 208 | (*p)++; | 211 | (*p)++; |
| @@ -223,7 +226,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
| 223 | 226 | ||
| 224 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 227 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
| 225 | { | 228 | { |
| 226 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 229 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
| 227 | s8 *p = pcp->vm_stat_diff + item; | 230 | s8 *p = pcp->vm_stat_diff + item; |
| 228 | 231 | ||
| 229 | (*p)--; | 232 | (*p)--; |
| @@ -300,7 +303,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
| 300 | for_each_populated_zone(zone) { | 303 | for_each_populated_zone(zone) { |
| 301 | struct per_cpu_pageset *p; | 304 | struct per_cpu_pageset *p; |
| 302 | 305 | ||
| 303 | p = zone_pcp(zone, cpu); | 306 | p = per_cpu_ptr(zone->pageset, cpu); |
| 304 | 307 | ||
| 305 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 308 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
| 306 | if (p->vm_stat_diff[i]) { | 309 | if (p->vm_stat_diff[i]) { |
| @@ -741,7 +744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 741 | for_each_online_cpu(i) { | 744 | for_each_online_cpu(i) { |
| 742 | struct per_cpu_pageset *pageset; | 745 | struct per_cpu_pageset *pageset; |
| 743 | 746 | ||
| 744 | pageset = zone_pcp(zone, i); | 747 | pageset = per_cpu_ptr(zone->pageset, i); |
| 745 | seq_printf(m, | 748 | seq_printf(m, |
| 746 | "\n cpu: %i" | 749 | "\n cpu: %i" |
| 747 | "\n count: %i" | 750 | "\n count: %i" |
| @@ -761,7 +764,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 761 | "\n prev_priority: %i" | 764 | "\n prev_priority: %i" |
| 762 | "\n start_pfn: %lu" | 765 | "\n start_pfn: %lu" |
| 763 | "\n inactive_ratio: %u", | 766 | "\n inactive_ratio: %u", |
| 764 | zone_is_all_unreclaimable(zone), | 767 | zone->all_unreclaimable, |
| 765 | zone->prev_priority, | 768 | zone->prev_priority, |
| 766 | zone->zone_start_pfn, | 769 | zone->zone_start_pfn, |
| 767 | zone->inactive_ratio); | 770 | zone->inactive_ratio); |
| @@ -906,6 +909,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
| 906 | case CPU_ONLINE: | 909 | case CPU_ONLINE: |
| 907 | case CPU_ONLINE_FROZEN: | 910 | case CPU_ONLINE_FROZEN: |
| 908 | start_cpu_timer(cpu); | 911 | start_cpu_timer(cpu); |
| 912 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
| 909 | break; | 913 | break; |
| 910 | case CPU_DOWN_PREPARE: | 914 | case CPU_DOWN_PREPARE: |
| 911 | case CPU_DOWN_PREPARE_FROZEN: | 915 | case CPU_DOWN_PREPARE_FROZEN: |
