Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig           |    5
-rw-r--r--   mm/Makefile          |    3
-rw-r--r--   mm/allocpercpu.c     |   24
-rw-r--r--   mm/bootmem.c         |  935
-rw-r--r--   mm/filemap.c         |  422
-rw-r--r--   mm/filemap_xip.c     |    5
-rw-r--r--   mm/fremap.c          |    3
-rw-r--r--   mm/hugetlb.c         | 1681
-rw-r--r--   mm/internal.h        |   61
-rw-r--r--   mm/madvise.c         |    4
-rw-r--r--   mm/memcontrol.c      |  369
-rw-r--r--   mm/memory.c          |  322
-rw-r--r--   mm/memory_hotplug.c  |   80
-rw-r--r--   mm/mempolicy.c       |   10
-rw-r--r--   mm/migrate.c         |   53
-rw-r--r--   mm/mlock.c           |    2
-rw-r--r--   mm/mm_init.c         |  152
-rw-r--r--   mm/mmap.c            |  180
-rw-r--r--   mm/mmu_notifier.c    |  277
-rw-r--r--   mm/mprotect.c        |    9
-rw-r--r--   mm/mremap.c          |    6
-rw-r--r--   mm/nommu.c           |   25
-rw-r--r--   mm/page-writeback.c  |   12
-rw-r--r--   mm/page_alloc.c      |  175
-rw-r--r--   mm/pdflush.c         |    4
-rw-r--r--   mm/readahead.c       |    6
-rw-r--r--   mm/rmap.c            |   34
-rw-r--r--   mm/shmem.c           |  106
-rw-r--r--   mm/shmem_acl.c       |    2
-rw-r--r--   mm/slab.c            |   12
-rw-r--r--   mm/slob.c            |   20
-rw-r--r--   mm/slub.c            |  105
-rw-r--r--   mm/sparse.c          |  116
-rw-r--r--   mm/swap.c            |   17
-rw-r--r--   mm/swap_state.c      |   38
-rw-r--r--   mm/swapfile.c        |   65
-rw-r--r--   mm/truncate.c        |   12
-rw-r--r--   mm/util.c            |   70
-rw-r--r--   mm/vmalloc.c         |   26
-rw-r--r--   mm/vmscan.c          |   93
-rw-r--r--   mm/vmstat.c          |    3
41 files changed, 3918 insertions, 1626 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
 config MIGRATION
         bool "Page migration"
         def_bool y
-        depends on NUMA
+        depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
         help
           Allows the migration of the physical location of pages of processes
           while the virtual addresses are not changed. This is useful for
@@ -205,3 +205,6 @@ config NR_QUICK
 config VIRT_TO_BUS
         def_bool y
         depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+        bool
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
                 maccess.o page_alloc.o page-writeback.o pdflush.o \
                 readahead.o swap.o truncate.o vmscan.o \
                 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-                page_isolation.o $(mmu-y)
+                page_isolation.o mm_init.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE) += bounce.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
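
The two hunks above only wire the new option into the build: CONFIG_MMU_NOTIFIER becomes a silent bool in mm/Kconfig and mm/mmu_notifier.o joins the Makefile (the 277-line mm/mmu_notifier.c itself appears later in the full diff, per the diffstat). For orientation, here is a rough sketch of how a driver would consume the feature, assuming the mmu_notifier_register()/mmu_notifier_ops interface introduced by this series; the callback names and signatures are recalled from that era's API rather than taken from the hunks shown here, so treat them as assumptions.

#include <linux/mmu_notifier.h>
#include <linux/mm_types.h>
#include <linux/sched.h>

/* Hypothetical device state that mirrors a process address space. */
struct my_mirror {
        struct mmu_notifier mn;
};

/*
 * Called around kernel unmaps of [start, end) in @mm: a real driver would
 * shoot down its secondary (device/guest) mappings of that range here.
 */
static void my_invalidate_range_start(struct mmu_notifier *mn,
                                      struct mm_struct *mm,
                                      unsigned long start, unsigned long end)
{
        /* flush device TLBs / secondary page tables covering [start, end) */
}

static const struct mmu_notifier_ops my_mmu_notifier_ops = {
        .invalidate_range_start = my_invalidate_range_start,
};

/* Tie the notifier to the current process's address space. */
static int my_mirror_register(struct my_mirror *mirror)
{
        mirror->mn.ops = &my_mmu_notifier_ops;
        return mmu_notifier_register(&mirror->mn, current->mm);
}

Because the Kconfig entry is a bare bool with no prompt, users of the API are expected to select MMU_NOTIFIER from their own Kconfig entries rather than have it enabled by hand.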
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 05f2b4009ccc..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
  * Depopulating per-cpu data for a cpu going offline would be a typical
  * use case. You need to register a cpu hotplug handler for that purpose.
  */
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
 {
         struct percpu_data *pdata = __percpu_disguise(__pdata);
 
         kfree(pdata->ptrs[cpu]);
         pdata->ptrs[cpu] = NULL;
 }
-EXPORT_SYMBOL_GPL(percpu_depopulate);
 
 /**
  * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
  * @__pdata: per-cpu data to depopulate
  * @mask: depopulate per-cpu data for cpu's selected through mask bits
  */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
 {
         int cpu;
-        for_each_cpu_mask(cpu, *mask)
+        for_each_cpu_mask_nr(cpu, *mask)
                 percpu_depopulate(__pdata, cpu);
 }
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+#define percpu_depopulate_mask(__pdata, mask) \
+        __percpu_depopulate_mask((__pdata), &(mask))
 
 /**
  * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
  * use case. You need to register a cpu hotplug handler for that purpose.
  * Per-cpu object is populated with zeroed buffer.
  */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 {
         struct percpu_data *pdata = __percpu_disguise(__pdata);
         int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
                 pdata->ptrs[cpu] = kzalloc(size, gfp);
         return pdata->ptrs[cpu];
 }
-EXPORT_SYMBOL_GPL(percpu_populate);
 
 /**
  * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
  *
  * Per-cpu objects are populated with zeroed buffers.
  */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
                         cpumask_t *mask)
 {
         cpumask_t populated;
         int cpu;
 
         cpus_clear(populated);
-        for_each_cpu_mask(cpu, *mask)
+        for_each_cpu_mask_nr(cpu, *mask)
                 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
                         __percpu_depopulate_mask(__pdata, &populated);
                         return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
                         cpu_set(cpu, populated);
         return 0;
 }
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+        __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
  * percpu_alloc_mask - initial setup of per-cpu data
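
Nothing in this file changes behaviour: the single-cpu populate/depopulate helpers lose their EXPORTs and become static (with local convenience macros), presumably because nothing modular called them directly, leaving percpu_alloc_mask() and its wrappers as the only entry points. A minimal sketch of how that remaining interface is consumed, using the era's alloc_percpu()/free_percpu()/per_cpu_ptr() wrappers from <linux/percpu.h> (assumed, not shown in this hunk):

#include <linux/percpu.h>
#include <linux/init.h>

struct hit_counter {
        unsigned long hits;
};

static struct hit_counter *counters;

/* One zeroed counter per CPU; this ends up in percpu_alloc_mask() ->
 * __percpu_populate_mask() in the file patched above. */
static int __init counters_init(void)
{
        counters = alloc_percpu(struct hit_counter);
        return counters ? 0 : -ENOMEM;
}

/* Bump this CPU's slot without touching any other CPU's cache line. */
static void count_hit(void)
{
        per_cpu_ptr(counters, get_cpu())->hits++;
        put_cpu();
}

static void __exit counters_exit(void)
{
        free_percpu(counters);
}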
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..4af15d0340ad 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -1,12 +1,12 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * linux/mm/bootmem.c | 2 | * bootmem - A boot-time physical memory allocator and configurator |
| 3 | * | 3 | * |
| 4 | * Copyright (C) 1999 Ingo Molnar | 4 | * Copyright (C) 1999 Ingo Molnar |
| 5 | * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 | 5 | * 1999 Kanoj Sarcar, SGI |
| 6 | * 2008 Johannes Weiner | ||
| 6 | * | 7 | * |
| 7 | * simple boot-time physical memory area allocator and | 8 | * Access to this subsystem has to be serialized externally (which is true |
| 8 | * free memory collector. It's used to deal with reserved | 9 | * for the boot process anyway). |
| 9 | * system memory and memory holes as well. | ||
| 10 | */ | 10 | */ |
| 11 | #include <linux/init.h> | 11 | #include <linux/init.h> |
| 12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
| @@ -19,15 +19,10 @@ | |||
| 19 | 19 | ||
| 20 | #include "internal.h" | 20 | #include "internal.h" |
| 21 | 21 | ||
| 22 | /* | ||
| 23 | * Access to this subsystem has to be serialized externally. (this is | ||
| 24 | * true for the boot process anyway) | ||
| 25 | */ | ||
| 26 | unsigned long max_low_pfn; | 22 | unsigned long max_low_pfn; |
| 27 | unsigned long min_low_pfn; | 23 | unsigned long min_low_pfn; |
| 28 | unsigned long max_pfn; | 24 | unsigned long max_pfn; |
| 29 | 25 | ||
| 30 | static LIST_HEAD(bdata_list); | ||
| 31 | #ifdef CONFIG_CRASH_DUMP | 26 | #ifdef CONFIG_CRASH_DUMP |
| 32 | /* | 27 | /* |
| 33 | * If we have booted due to a crash, max_pfn will be a very low value. We need | 28 | * If we have booted due to a crash, max_pfn will be a very low value. We need |
| @@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list); | |||
| 36 | unsigned long saved_max_pfn; | 31 | unsigned long saved_max_pfn; |
| 37 | #endif | 32 | #endif |
| 38 | 33 | ||
| 39 | /* return the number of _pages_ that will be allocated for the boot bitmap */ | 34 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
| 40 | unsigned long __init bootmem_bootmap_pages(unsigned long pages) | 35 | |
| 36 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | ||
| 37 | |||
| 38 | static int bootmem_debug; | ||
| 39 | |||
| 40 | static int __init bootmem_debug_setup(char *buf) | ||
| 41 | { | 41 | { |
| 42 | unsigned long mapsize; | 42 | bootmem_debug = 1; |
| 43 | return 0; | ||
| 44 | } | ||
| 45 | early_param("bootmem_debug", bootmem_debug_setup); | ||
| 43 | 46 | ||
| 44 | mapsize = (pages+7)/8; | 47 | #define bdebug(fmt, args...) ({ \ |
| 45 | mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; | 48 | if (unlikely(bootmem_debug)) \ |
| 46 | mapsize >>= PAGE_SHIFT; | 49 | printk(KERN_INFO \ |
| 50 | "bootmem::%s " fmt, \ | ||
| 51 | __FUNCTION__, ## args); \ | ||
| 52 | }) | ||
| 47 | 53 | ||
| 48 | return mapsize; | 54 | static unsigned long __init bootmap_bytes(unsigned long pages) |
| 55 | { | ||
| 56 | unsigned long bytes = (pages + 7) / 8; | ||
| 57 | |||
| 58 | return ALIGN(bytes, sizeof(long)); | ||
| 49 | } | 59 | } |
| 50 | 60 | ||
| 51 | /* | 61 | /** |
| 52 | * link bdata in order | 62 | * bootmem_bootmap_pages - calculate bitmap size in pages |
| 63 | * @pages: number of pages the bitmap has to represent | ||
| 53 | */ | 64 | */ |
| 54 | static void __init link_bootmem(bootmem_data_t *bdata) | 65 | unsigned long __init bootmem_bootmap_pages(unsigned long pages) |
| 55 | { | 66 | { |
| 56 | bootmem_data_t *ent; | 67 | unsigned long bytes = bootmap_bytes(pages); |
| 57 | 68 | ||
| 58 | if (list_empty(&bdata_list)) { | 69 | return PAGE_ALIGN(bytes) >> PAGE_SHIFT; |
| 59 | list_add(&bdata->list, &bdata_list); | ||
| 60 | return; | ||
| 61 | } | ||
| 62 | /* insert in order */ | ||
| 63 | list_for_each_entry(ent, &bdata_list, list) { | ||
| 64 | if (bdata->node_boot_start < ent->node_boot_start) { | ||
| 65 | list_add_tail(&bdata->list, &ent->list); | ||
| 66 | return; | ||
| 67 | } | ||
| 68 | } | ||
| 69 | list_add_tail(&bdata->list, &bdata_list); | ||
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | /* | 72 | /* |
| 73 | * Given an initialised bdata, it returns the size of the boot bitmap | 73 | * link bdata in order |
| 74 | */ | 74 | */ |
| 75 | static unsigned long __init get_mapsize(bootmem_data_t *bdata) | 75 | static void __init link_bootmem(bootmem_data_t *bdata) |
| 76 | { | 76 | { |
| 77 | unsigned long mapsize; | 77 | struct list_head *iter; |
| 78 | unsigned long start = PFN_DOWN(bdata->node_boot_start); | ||
| 79 | unsigned long end = bdata->node_low_pfn; | ||
| 80 | 78 | ||
| 81 | mapsize = ((end - start) + 7) / 8; | 79 | list_for_each(iter, &bdata_list) { |
| 82 | return ALIGN(mapsize, sizeof(long)); | 80 | bootmem_data_t *ent; |
| 81 | |||
| 82 | ent = list_entry(iter, bootmem_data_t, list); | ||
| 83 | if (bdata->node_min_pfn < ent->node_min_pfn) | ||
| 84 | break; | ||
| 85 | } | ||
| 86 | list_add_tail(&bdata->list, iter); | ||
| 83 | } | 87 | } |
| 84 | 88 | ||
| 85 | /* | 89 | /* |
| 86 | * Called once to set up the allocator itself. | 90 | * Called once to set up the allocator itself. |
| 87 | */ | 91 | */ |
| 88 | static unsigned long __init init_bootmem_core(pg_data_t *pgdat, | 92 | static unsigned long __init init_bootmem_core(bootmem_data_t *bdata, |
| 89 | unsigned long mapstart, unsigned long start, unsigned long end) | 93 | unsigned long mapstart, unsigned long start, unsigned long end) |
| 90 | { | 94 | { |
| 91 | bootmem_data_t *bdata = pgdat->bdata; | ||
| 92 | unsigned long mapsize; | 95 | unsigned long mapsize; |
| 93 | 96 | ||
| 97 | mminit_validate_memmodel_limits(&start, &end); | ||
| 94 | bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); | 98 | bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); |
| 95 | bdata->node_boot_start = PFN_PHYS(start); | 99 | bdata->node_min_pfn = start; |
| 96 | bdata->node_low_pfn = end; | 100 | bdata->node_low_pfn = end; |
| 97 | link_bootmem(bdata); | 101 | link_bootmem(bdata); |
| 98 | 102 | ||
| @@ -100,429 +104,461 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat, | |||
| 100 | * Initially all pages are reserved - setup_arch() has to | 104 | * Initially all pages are reserved - setup_arch() has to |
| 101 | * register free RAM areas explicitly. | 105 | * register free RAM areas explicitly. |
| 102 | */ | 106 | */ |
| 103 | mapsize = get_mapsize(bdata); | 107 | mapsize = bootmap_bytes(end - start); |
| 104 | memset(bdata->node_bootmem_map, 0xff, mapsize); | 108 | memset(bdata->node_bootmem_map, 0xff, mapsize); |
| 105 | 109 | ||
| 110 | bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n", | ||
| 111 | bdata - bootmem_node_data, start, mapstart, end, mapsize); | ||
| 112 | |||
| 106 | return mapsize; | 113 | return mapsize; |
| 107 | } | 114 | } |
| 108 | 115 | ||
| 109 | /* | 116 | /** |
| 110 | * Marks a particular physical memory range as unallocatable. Usable RAM | 117 | * init_bootmem_node - register a node as boot memory |
| 111 | * might be used for boot-time allocations - or it might get added | 118 | * @pgdat: node to register |
| 112 | * to the free page pool later on. | 119 | * @freepfn: pfn where the bitmap for this node is to be placed |
| 120 | * @startpfn: first pfn on the node | ||
| 121 | * @endpfn: first pfn after the node | ||
| 122 | * | ||
| 123 | * Returns the number of bytes needed to hold the bitmap for this node. | ||
| 113 | */ | 124 | */ |
| 114 | static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, | 125 | unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, |
| 115 | unsigned long addr, unsigned long size, int flags) | 126 | unsigned long startpfn, unsigned long endpfn) |
| 116 | { | 127 | { |
| 117 | unsigned long sidx, eidx; | 128 | return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn); |
| 118 | unsigned long i; | 129 | } |
| 119 | 130 | ||
| 120 | BUG_ON(!size); | 131 | /** |
| 132 | * init_bootmem - register boot memory | ||
| 133 | * @start: pfn where the bitmap is to be placed | ||
| 134 | * @pages: number of available physical pages | ||
| 135 | * | ||
| 136 | * Returns the number of bytes needed to hold the bitmap. | ||
| 137 | */ | ||
| 138 | unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | ||
| 139 | { | ||
| 140 | max_low_pfn = pages; | ||
| 141 | min_low_pfn = start; | ||
| 142 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | ||
| 143 | } | ||
| 121 | 144 | ||
| 122 | /* out of range, don't hold other */ | 145 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
| 123 | if (addr + size < bdata->node_boot_start || | 146 | { |
| 124 | PFN_DOWN(addr) > bdata->node_low_pfn) | 147 | int aligned; |
| 148 | struct page *page; | ||
| 149 | unsigned long start, end, pages, count = 0; | ||
| 150 | |||
| 151 | if (!bdata->node_bootmem_map) | ||
| 125 | return 0; | 152 | return 0; |
| 126 | 153 | ||
| 154 | start = bdata->node_min_pfn; | ||
| 155 | end = bdata->node_low_pfn; | ||
| 156 | |||
| 127 | /* | 157 | /* |
| 128 | * Round up to index to the range. | 158 | * If the start is aligned to the machines wordsize, we might |
| 159 | * be able to free pages in bulks of that order. | ||
| 129 | */ | 160 | */ |
| 130 | if (addr > bdata->node_boot_start) | 161 | aligned = !(start & (BITS_PER_LONG - 1)); |
| 131 | sidx= PFN_DOWN(addr - bdata->node_boot_start); | ||
| 132 | else | ||
| 133 | sidx = 0; | ||
| 134 | 162 | ||
| 135 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | 163 | bdebug("nid=%td start=%lx end=%lx aligned=%d\n", |
| 136 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | 164 | bdata - bootmem_node_data, start, end, aligned); |
| 137 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | ||
| 138 | 165 | ||
| 139 | for (i = sidx; i < eidx; i++) { | 166 | while (start < end) { |
| 140 | if (test_bit(i, bdata->node_bootmem_map)) { | 167 | unsigned long *map, idx, vec; |
| 141 | if (flags & BOOTMEM_EXCLUSIVE) | ||
| 142 | return -EBUSY; | ||
| 143 | } | ||
| 144 | } | ||
| 145 | 168 | ||
| 146 | return 0; | 169 | map = bdata->node_bootmem_map; |
| 170 | idx = start - bdata->node_min_pfn; | ||
| 171 | vec = ~map[idx / BITS_PER_LONG]; | ||
| 147 | 172 | ||
| 148 | } | 173 | if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { |
| 174 | int order = ilog2(BITS_PER_LONG); | ||
| 149 | 175 | ||
| 150 | static void __init reserve_bootmem_core(bootmem_data_t *bdata, | 176 | __free_pages_bootmem(pfn_to_page(start), order); |
| 151 | unsigned long addr, unsigned long size, int flags) | 177 | count += BITS_PER_LONG; |
| 152 | { | 178 | } else { |
| 153 | unsigned long sidx, eidx; | 179 | unsigned long off = 0; |
| 154 | unsigned long i; | ||
| 155 | |||
| 156 | BUG_ON(!size); | ||
| 157 | 180 | ||
| 158 | /* out of range */ | 181 | while (vec && off < BITS_PER_LONG) { |
| 159 | if (addr + size < bdata->node_boot_start || | 182 | if (vec & 1) { |
| 160 | PFN_DOWN(addr) > bdata->node_low_pfn) | 183 | page = pfn_to_page(start + off); |
| 161 | return; | 184 | __free_pages_bootmem(page, 0); |
| 185 | count++; | ||
| 186 | } | ||
| 187 | vec >>= 1; | ||
| 188 | off++; | ||
| 189 | } | ||
| 190 | } | ||
| 191 | start += BITS_PER_LONG; | ||
| 192 | } | ||
| 162 | 193 | ||
| 163 | /* | 194 | page = virt_to_page(bdata->node_bootmem_map); |
| 164 | * Round up to index to the range. | 195 | pages = bdata->node_low_pfn - bdata->node_min_pfn; |
| 165 | */ | 196 | pages = bootmem_bootmap_pages(pages); |
| 166 | if (addr > bdata->node_boot_start) | 197 | count += pages; |
| 167 | sidx= PFN_DOWN(addr - bdata->node_boot_start); | 198 | while (pages--) |
| 168 | else | 199 | __free_pages_bootmem(page++, 0); |
| 169 | sidx = 0; | ||
| 170 | 200 | ||
| 171 | eidx = PFN_UP(addr + size - bdata->node_boot_start); | 201 | bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); |
| 172 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | ||
| 173 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | ||
| 174 | 202 | ||
| 175 | for (i = sidx; i < eidx; i++) { | 203 | return count; |
| 176 | if (test_and_set_bit(i, bdata->node_bootmem_map)) { | ||
| 177 | #ifdef CONFIG_DEBUG_BOOTMEM | ||
| 178 | printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); | ||
| 179 | #endif | ||
| 180 | } | ||
| 181 | } | ||
| 182 | } | 204 | } |
| 183 | 205 | ||
| 184 | static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, | 206 | /** |
| 185 | unsigned long size) | 207 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
| 208 | * @pgdat: node to be released | ||
| 209 | * | ||
| 210 | * Returns the number of pages actually released. | ||
| 211 | */ | ||
| 212 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | ||
| 186 | { | 213 | { |
| 187 | unsigned long sidx, eidx; | 214 | register_page_bootmem_info_node(pgdat); |
| 188 | unsigned long i; | 215 | return free_all_bootmem_core(pgdat->bdata); |
| 189 | 216 | } | |
| 190 | BUG_ON(!size); | ||
| 191 | 217 | ||
| 192 | /* out range */ | 218 | /** |
| 193 | if (addr + size < bdata->node_boot_start || | 219 | * free_all_bootmem - release free pages to the buddy allocator |
| 194 | PFN_DOWN(addr) > bdata->node_low_pfn) | 220 | * |
| 195 | return; | 221 | * Returns the number of pages actually released. |
| 196 | /* | 222 | */ |
| 197 | * round down end of usable mem, partially free pages are | 223 | unsigned long __init free_all_bootmem(void) |
| 198 | * considered reserved. | 224 | { |
| 199 | */ | 225 | return free_all_bootmem_core(NODE_DATA(0)->bdata); |
| 226 | } | ||
| 200 | 227 | ||
| 201 | if (addr >= bdata->node_boot_start && addr < bdata->last_success) | 228 | static void __init __free(bootmem_data_t *bdata, |
| 202 | bdata->last_success = addr; | 229 | unsigned long sidx, unsigned long eidx) |
| 230 | { | ||
| 231 | unsigned long idx; | ||
| 203 | 232 | ||
| 204 | /* | 233 | bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data, |
| 205 | * Round up to index to the range. | 234 | sidx + bdata->node_min_pfn, |
| 206 | */ | 235 | eidx + bdata->node_min_pfn); |
| 207 | if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start)) | ||
| 208 | sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start); | ||
| 209 | else | ||
| 210 | sidx = 0; | ||
| 211 | 236 | ||
| 212 | eidx = PFN_DOWN(addr + size - bdata->node_boot_start); | 237 | if (bdata->hint_idx > sidx) |
| 213 | if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) | 238 | bdata->hint_idx = sidx; |
| 214 | eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); | ||
| 215 | 239 | ||
| 216 | for (i = sidx; i < eidx; i++) { | 240 | for (idx = sidx; idx < eidx; idx++) |
| 217 | if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) | 241 | if (!test_and_clear_bit(idx, bdata->node_bootmem_map)) |
| 218 | BUG(); | 242 | BUG(); |
| 219 | } | ||
| 220 | } | 243 | } |
| 221 | 244 | ||
| 222 | /* | 245 | static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx, |
| 223 | * We 'merge' subsequent allocations to save space. We might 'lose' | 246 | unsigned long eidx, int flags) |
| 224 | * some fraction of a page if allocations cannot be satisfied due to | ||
| 225 | * size constraints on boxes where there is physical RAM space | ||
| 226 | * fragmentation - in these cases (mostly large memory boxes) this | ||
| 227 | * is not a problem. | ||
| 228 | * | ||
| 229 | * On low memory boxes we get it right in 100% of the cases. | ||
| 230 | * | ||
| 231 | * alignment has to be a power of 2 value. | ||
| 232 | * | ||
| 233 | * NOTE: This function is _not_ reentrant. | ||
| 234 | */ | ||
| 235 | void * __init | ||
| 236 | __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, | ||
| 237 | unsigned long align, unsigned long goal, unsigned long limit) | ||
| 238 | { | 247 | { |
| 239 | unsigned long areasize, preferred; | 248 | unsigned long idx; |
| 240 | unsigned long i, start = 0, incr, eidx, end_pfn; | 249 | int exclusive = flags & BOOTMEM_EXCLUSIVE; |
| 241 | void *ret; | 250 | |
| 242 | unsigned long node_boot_start; | 251 | bdebug("nid=%td start=%lx end=%lx flags=%x\n", |
| 243 | void *node_bootmem_map; | 252 | bdata - bootmem_node_data, |
| 244 | 253 | sidx + bdata->node_min_pfn, | |
| 245 | if (!size) { | 254 | eidx + bdata->node_min_pfn, |
| 246 | printk("__alloc_bootmem_core(): zero-sized request\n"); | 255 | flags); |
| 247 | BUG(); | 256 | |
| 248 | } | 257 | for (idx = sidx; idx < eidx; idx++) |
| 249 | BUG_ON(align & (align-1)); | 258 | if (test_and_set_bit(idx, bdata->node_bootmem_map)) { |
| 250 | 259 | if (exclusive) { | |
| 251 | /* on nodes without memory - bootmem_map is NULL */ | 260 | __free(bdata, sidx, idx); |
| 252 | if (!bdata->node_bootmem_map) | 261 | return -EBUSY; |
| 253 | return NULL; | 262 | } |
| 263 | bdebug("silent double reserve of PFN %lx\n", | ||
| 264 | idx + bdata->node_min_pfn); | ||
| 265 | } | ||
| 266 | return 0; | ||
| 267 | } | ||
| 254 | 268 | ||
| 255 | /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ | 269 | static int __init mark_bootmem_node(bootmem_data_t *bdata, |
| 256 | node_boot_start = bdata->node_boot_start; | 270 | unsigned long start, unsigned long end, |
| 257 | node_bootmem_map = bdata->node_bootmem_map; | 271 | int reserve, int flags) |
| 258 | if (align) { | 272 | { |
| 259 | node_boot_start = ALIGN(bdata->node_boot_start, align); | 273 | unsigned long sidx, eidx; |
| 260 | if (node_boot_start > bdata->node_boot_start) | ||
| 261 | node_bootmem_map = (unsigned long *)bdata->node_bootmem_map + | ||
| 262 | PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG; | ||
| 263 | } | ||
| 264 | 274 | ||
| 265 | if (limit && node_boot_start >= limit) | 275 | bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n", |
| 266 | return NULL; | 276 | bdata - bootmem_node_data, start, end, reserve, flags); |
| 267 | 277 | ||
| 268 | end_pfn = bdata->node_low_pfn; | 278 | BUG_ON(start < bdata->node_min_pfn); |
| 269 | limit = PFN_DOWN(limit); | 279 | BUG_ON(end > bdata->node_low_pfn); |
| 270 | if (limit && end_pfn > limit) | ||
| 271 | end_pfn = limit; | ||
| 272 | 280 | ||
| 273 | eidx = end_pfn - PFN_DOWN(node_boot_start); | 281 | sidx = start - bdata->node_min_pfn; |
| 282 | eidx = end - bdata->node_min_pfn; | ||
| 274 | 283 | ||
| 275 | /* | 284 | if (reserve) |
| 276 | * We try to allocate bootmem pages above 'goal' | 285 | return __reserve(bdata, sidx, eidx, flags); |
| 277 | * first, then we try to allocate lower pages. | 286 | else |
| 278 | */ | 287 | __free(bdata, sidx, eidx); |
| 279 | preferred = 0; | 288 | return 0; |
| 280 | if (goal && PFN_DOWN(goal) < end_pfn) { | 289 | } |
| 281 | if (goal > node_boot_start) | ||
| 282 | preferred = goal - node_boot_start; | ||
| 283 | |||
| 284 | if (bdata->last_success > node_boot_start && | ||
| 285 | bdata->last_success - node_boot_start >= preferred) | ||
| 286 | if (!limit || (limit && limit > bdata->last_success)) | ||
| 287 | preferred = bdata->last_success - node_boot_start; | ||
| 288 | } | ||
| 289 | 290 | ||
| 290 | preferred = PFN_DOWN(ALIGN(preferred, align)); | 291 | static int __init mark_bootmem(unsigned long start, unsigned long end, |
| 291 | areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; | 292 | int reserve, int flags) |
| 292 | incr = align >> PAGE_SHIFT ? : 1; | 293 | { |
| 294 | unsigned long pos; | ||
| 295 | bootmem_data_t *bdata; | ||
| 293 | 296 | ||
| 294 | restart_scan: | 297 | pos = start; |
| 295 | for (i = preferred; i < eidx;) { | 298 | list_for_each_entry(bdata, &bdata_list, list) { |
| 296 | unsigned long j; | 299 | int err; |
| 300 | unsigned long max; | ||
| 297 | 301 | ||
| 298 | i = find_next_zero_bit(node_bootmem_map, eidx, i); | 302 | if (pos < bdata->node_min_pfn || |
| 299 | i = ALIGN(i, incr); | 303 | pos >= bdata->node_low_pfn) { |
| 300 | if (i >= eidx) | 304 | BUG_ON(pos != start); |
| 301 | break; | ||
| 302 | if (test_bit(i, node_bootmem_map)) { | ||
| 303 | i += incr; | ||
| 304 | continue; | 305 | continue; |
| 305 | } | 306 | } |
| 306 | for (j = i + 1; j < i + areasize; ++j) { | ||
| 307 | if (j >= eidx) | ||
| 308 | goto fail_block; | ||
| 309 | if (test_bit(j, node_bootmem_map)) | ||
| 310 | goto fail_block; | ||
| 311 | } | ||
| 312 | start = i; | ||
| 313 | goto found; | ||
| 314 | fail_block: | ||
| 315 | i = ALIGN(j, incr); | ||
| 316 | if (i == j) | ||
| 317 | i += incr; | ||
| 318 | } | ||
| 319 | 307 | ||
| 320 | if (preferred > 0) { | 308 | max = min(bdata->node_low_pfn, end); |
| 321 | preferred = 0; | ||
| 322 | goto restart_scan; | ||
| 323 | } | ||
| 324 | return NULL; | ||
| 325 | 309 | ||
| 326 | found: | 310 | err = mark_bootmem_node(bdata, pos, max, reserve, flags); |
| 327 | bdata->last_success = PFN_PHYS(start) + node_boot_start; | 311 | if (reserve && err) { |
| 328 | BUG_ON(start >= eidx); | 312 | mark_bootmem(start, pos, 0, 0); |
| 329 | 313 | return err; | |
| 330 | /* | ||
| 331 | * Is the next page of the previous allocation-end the start | ||
| 332 | * of this allocation's buffer? If yes then we can 'merge' | ||
| 333 | * the previous partial page with this allocation. | ||
| 334 | */ | ||
| 335 | if (align < PAGE_SIZE && | ||
| 336 | bdata->last_offset && bdata->last_pos+1 == start) { | ||
| 337 | unsigned long offset, remaining_size; | ||
| 338 | offset = ALIGN(bdata->last_offset, align); | ||
| 339 | BUG_ON(offset > PAGE_SIZE); | ||
| 340 | remaining_size = PAGE_SIZE - offset; | ||
| 341 | if (size < remaining_size) { | ||
| 342 | areasize = 0; | ||
| 343 | /* last_pos unchanged */ | ||
| 344 | bdata->last_offset = offset + size; | ||
| 345 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | ||
| 346 | offset + node_boot_start); | ||
| 347 | } else { | ||
| 348 | remaining_size = size - remaining_size; | ||
| 349 | areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; | ||
| 350 | ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + | ||
| 351 | offset + node_boot_start); | ||
| 352 | bdata->last_pos = start + areasize - 1; | ||
| 353 | bdata->last_offset = remaining_size; | ||
| 354 | } | 314 | } |
| 355 | bdata->last_offset &= ~PAGE_MASK; | ||
| 356 | } else { | ||
| 357 | bdata->last_pos = start + areasize - 1; | ||
| 358 | bdata->last_offset = size & ~PAGE_MASK; | ||
| 359 | ret = phys_to_virt(start * PAGE_SIZE + node_boot_start); | ||
| 360 | } | ||
| 361 | 315 | ||
| 362 | /* | 316 | if (max == end) |
| 363 | * Reserve the area now: | 317 | return 0; |
| 364 | */ | 318 | pos = bdata->node_low_pfn; |
| 365 | for (i = start; i < start + areasize; i++) | 319 | } |
| 366 | if (unlikely(test_and_set_bit(i, node_bootmem_map))) | 320 | BUG(); |
| 367 | BUG(); | ||
| 368 | memset(ret, 0, size); | ||
| 369 | return ret; | ||
| 370 | } | 321 | } |
| 371 | 322 | ||
| 372 | static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) | 323 | /** |
| 324 | * free_bootmem_node - mark a page range as usable | ||
| 325 | * @pgdat: node the range resides on | ||
| 326 | * @physaddr: starting address of the range | ||
| 327 | * @size: size of the range in bytes | ||
| 328 | * | ||
| 329 | * Partial pages will be considered reserved and left as they are. | ||
| 330 | * | ||
| 331 | * The range must reside completely on the specified node. | ||
| 332 | */ | ||
| 333 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | ||
| 334 | unsigned long size) | ||
| 373 | { | 335 | { |
| 374 | struct page *page; | 336 | unsigned long start, end; |
| 375 | unsigned long pfn; | ||
| 376 | bootmem_data_t *bdata = pgdat->bdata; | ||
| 377 | unsigned long i, count, total = 0; | ||
| 378 | unsigned long idx; | ||
| 379 | unsigned long *map; | ||
| 380 | int gofast = 0; | ||
| 381 | |||
| 382 | BUG_ON(!bdata->node_bootmem_map); | ||
| 383 | |||
| 384 | count = 0; | ||
| 385 | /* first extant page of the node */ | ||
| 386 | pfn = PFN_DOWN(bdata->node_boot_start); | ||
| 387 | idx = bdata->node_low_pfn - pfn; | ||
| 388 | map = bdata->node_bootmem_map; | ||
| 389 | /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */ | ||
| 390 | if (bdata->node_boot_start == 0 || | ||
| 391 | ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG)) | ||
| 392 | gofast = 1; | ||
| 393 | for (i = 0; i < idx; ) { | ||
| 394 | unsigned long v = ~map[i / BITS_PER_LONG]; | ||
| 395 | |||
| 396 | if (gofast && v == ~0UL) { | ||
| 397 | int order; | ||
| 398 | |||
| 399 | page = pfn_to_page(pfn); | ||
| 400 | count += BITS_PER_LONG; | ||
| 401 | order = ffs(BITS_PER_LONG) - 1; | ||
| 402 | __free_pages_bootmem(page, order); | ||
| 403 | i += BITS_PER_LONG; | ||
| 404 | page += BITS_PER_LONG; | ||
| 405 | } else if (v) { | ||
| 406 | unsigned long m; | ||
| 407 | |||
| 408 | page = pfn_to_page(pfn); | ||
| 409 | for (m = 1; m && i < idx; m<<=1, page++, i++) { | ||
| 410 | if (v & m) { | ||
| 411 | count++; | ||
| 412 | __free_pages_bootmem(page, 0); | ||
| 413 | } | ||
| 414 | } | ||
| 415 | } else { | ||
| 416 | i += BITS_PER_LONG; | ||
| 417 | } | ||
| 418 | pfn += BITS_PER_LONG; | ||
| 419 | } | ||
| 420 | total += count; | ||
| 421 | 337 | ||
| 422 | /* | 338 | start = PFN_UP(physaddr); |
| 423 | * Now free the allocator bitmap itself, it's not | 339 | end = PFN_DOWN(physaddr + size); |
| 424 | * needed anymore: | ||
| 425 | */ | ||
| 426 | page = virt_to_page(bdata->node_bootmem_map); | ||
| 427 | count = 0; | ||
| 428 | idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT; | ||
| 429 | for (i = 0; i < idx; i++, page++) { | ||
| 430 | __free_pages_bootmem(page, 0); | ||
| 431 | count++; | ||
| 432 | } | ||
| 433 | total += count; | ||
| 434 | bdata->node_bootmem_map = NULL; | ||
| 435 | 340 | ||
| 436 | return total; | 341 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); |
| 437 | } | 342 | } |
| 438 | 343 | ||
| 439 | unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, | 344 | /** |
| 440 | unsigned long startpfn, unsigned long endpfn) | 345 | * free_bootmem - mark a page range as usable |
| 441 | { | 346 | * @addr: starting address of the range |
| 442 | return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); | 347 | * @size: size of the range in bytes |
| 443 | } | 348 | * |
| 444 | 349 | * Partial pages will be considered reserved and left as they are. | |
| 445 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 350 | * |
| 446 | unsigned long size, int flags) | 351 | * The range must be contiguous but may span node boundaries. |
| 352 | */ | ||
| 353 | void __init free_bootmem(unsigned long addr, unsigned long size) | ||
| 447 | { | 354 | { |
| 448 | int ret; | 355 | unsigned long start, end; |
| 449 | 356 | ||
| 450 | ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); | 357 | start = PFN_UP(addr); |
| 451 | if (ret < 0) | 358 | end = PFN_DOWN(addr + size); |
| 452 | return -ENOMEM; | ||
| 453 | reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); | ||
| 454 | 359 | ||
| 455 | return 0; | 360 | mark_bootmem(start, end, 0, 0); |
| 456 | } | 361 | } |
| 457 | 362 | ||
| 458 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 363 | /** |
| 459 | unsigned long size) | 364 | * reserve_bootmem_node - mark a page range as reserved |
| 365 | * @pgdat: node the range resides on | ||
| 366 | * @physaddr: starting address of the range | ||
| 367 | * @size: size of the range in bytes | ||
| 368 | * @flags: reservation flags (see linux/bootmem.h) | ||
| 369 | * | ||
| 370 | * Partial pages will be reserved. | ||
| 371 | * | ||
| 372 | * The range must reside completely on the specified node. | ||
| 373 | */ | ||
| 374 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | ||
| 375 | unsigned long size, int flags) | ||
| 460 | { | 376 | { |
| 461 | free_bootmem_core(pgdat->bdata, physaddr, size); | 377 | unsigned long start, end; |
| 462 | } | ||
| 463 | 378 | ||
| 464 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 379 | start = PFN_DOWN(physaddr); |
| 465 | { | 380 | end = PFN_UP(physaddr + size); |
| 466 | register_page_bootmem_info_node(pgdat); | ||
| 467 | return free_all_bootmem_core(pgdat); | ||
| 468 | } | ||
| 469 | 381 | ||
| 470 | unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | 382 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); |
| 471 | { | ||
| 472 | max_low_pfn = pages; | ||
| 473 | min_low_pfn = start; | ||
| 474 | return init_bootmem_core(NODE_DATA(0), start, 0, pages); | ||
| 475 | } | 383 | } |
| 476 | 384 | ||
| 477 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE | 385 | #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE |
| 386 | /** | ||
| 387 | * reserve_bootmem - mark a page range as usable | ||
| 388 | * @addr: starting address of the range | ||
| 389 | * @size: size of the range in bytes | ||
| 390 | * @flags: reservation flags (see linux/bootmem.h) | ||
| 391 | * | ||
| 392 | * Partial pages will be reserved. | ||
| 393 | * | ||
| 394 | * The range must be contiguous but may span node boundaries. | ||
| 395 | */ | ||
| 478 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 396 | int __init reserve_bootmem(unsigned long addr, unsigned long size, |
| 479 | int flags) | 397 | int flags) |
| 480 | { | 398 | { |
| 481 | bootmem_data_t *bdata; | 399 | unsigned long start, end; |
| 482 | int ret; | ||
| 483 | 400 | ||
| 484 | list_for_each_entry(bdata, &bdata_list, list) { | 401 | start = PFN_DOWN(addr); |
| 485 | ret = can_reserve_bootmem_core(bdata, addr, size, flags); | 402 | end = PFN_UP(addr + size); |
| 486 | if (ret < 0) | ||
| 487 | return ret; | ||
| 488 | } | ||
| 489 | list_for_each_entry(bdata, &bdata_list, list) | ||
| 490 | reserve_bootmem_core(bdata, addr, size, flags); | ||
| 491 | 403 | ||
| 492 | return 0; | 404 | return mark_bootmem(start, end, 1, flags); |
| 493 | } | 405 | } |
| 494 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ | 406 | #endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ |
| 495 | 407 | ||
| 496 | void __init free_bootmem(unsigned long addr, unsigned long size) | 408 | static void * __init alloc_bootmem_core(struct bootmem_data *bdata, |
| 409 | unsigned long size, unsigned long align, | ||
| 410 | unsigned long goal, unsigned long limit) | ||
| 497 | { | 411 | { |
| 498 | bootmem_data_t *bdata; | 412 | unsigned long fallback = 0; |
| 499 | list_for_each_entry(bdata, &bdata_list, list) | 413 | unsigned long min, max, start, sidx, midx, step; |
| 500 | free_bootmem_core(bdata, addr, size); | ||
| 501 | } | ||
| 502 | 414 | ||
| 503 | unsigned long __init free_all_bootmem(void) | 415 | BUG_ON(!size); |
| 504 | { | 416 | BUG_ON(align & (align - 1)); |
| 505 | return free_all_bootmem_core(NODE_DATA(0)); | 417 | BUG_ON(limit && goal + size > limit); |
| 418 | |||
| 419 | if (!bdata->node_bootmem_map) | ||
| 420 | return NULL; | ||
| 421 | |||
| 422 | bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n", | ||
| 423 | bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT, | ||
| 424 | align, goal, limit); | ||
| 425 | |||
| 426 | min = bdata->node_min_pfn; | ||
| 427 | max = bdata->node_low_pfn; | ||
| 428 | |||
| 429 | goal >>= PAGE_SHIFT; | ||
| 430 | limit >>= PAGE_SHIFT; | ||
| 431 | |||
| 432 | if (limit && max > limit) | ||
| 433 | max = limit; | ||
| 434 | if (max <= min) | ||
| 435 | return NULL; | ||
| 436 | |||
| 437 | step = max(align >> PAGE_SHIFT, 1UL); | ||
| 438 | |||
| 439 | if (goal && min < goal && goal < max) | ||
| 440 | start = ALIGN(goal, step); | ||
| 441 | else | ||
| 442 | start = ALIGN(min, step); | ||
| 443 | |||
| 444 | sidx = start - bdata->node_min_pfn;; | ||
| 445 | midx = max - bdata->node_min_pfn; | ||
| 446 | |||
| 447 | if (bdata->hint_idx > sidx) { | ||
| 448 | /* | ||
| 449 | * Handle the valid case of sidx being zero and still | ||
| 450 | * catch the fallback below. | ||
| 451 | */ | ||
| 452 | fallback = sidx + 1; | ||
| 453 | sidx = ALIGN(bdata->hint_idx, step); | ||
| 454 | } | ||
| 455 | |||
| 456 | while (1) { | ||
| 457 | int merge; | ||
| 458 | void *region; | ||
| 459 | unsigned long eidx, i, start_off, end_off; | ||
| 460 | find_block: | ||
| 461 | sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx); | ||
| 462 | sidx = ALIGN(sidx, step); | ||
| 463 | eidx = sidx + PFN_UP(size); | ||
| 464 | |||
| 465 | if (sidx >= midx || eidx > midx) | ||
| 466 | break; | ||
| 467 | |||
| 468 | for (i = sidx; i < eidx; i++) | ||
| 469 | if (test_bit(i, bdata->node_bootmem_map)) { | ||
| 470 | sidx = ALIGN(i, step); | ||
| 471 | if (sidx == i) | ||
| 472 | sidx += step; | ||
| 473 | goto find_block; | ||
| 474 | } | ||
| 475 | |||
| 476 | if (bdata->last_end_off && | ||
| 477 | PFN_DOWN(bdata->last_end_off) + 1 == sidx) | ||
| 478 | start_off = ALIGN(bdata->last_end_off, align); | ||
| 479 | else | ||
| 480 | start_off = PFN_PHYS(sidx); | ||
| 481 | |||
| 482 | merge = PFN_DOWN(start_off) < sidx; | ||
| 483 | end_off = start_off + size; | ||
| 484 | |||
| 485 | bdata->last_end_off = end_off; | ||
| 486 | bdata->hint_idx = PFN_UP(end_off); | ||
| 487 | |||
| 488 | /* | ||
| 489 | * Reserve the area now: | ||
| 490 | */ | ||
| 491 | if (__reserve(bdata, PFN_DOWN(start_off) + merge, | ||
| 492 | PFN_UP(end_off), BOOTMEM_EXCLUSIVE)) | ||
| 493 | BUG(); | ||
| 494 | |||
| 495 | region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + | ||
| 496 | start_off); | ||
| 497 | memset(region, 0, size); | ||
| 498 | return region; | ||
| 499 | } | ||
| 500 | |||
| 501 | if (fallback) { | ||
| 502 | sidx = ALIGN(fallback - 1, step); | ||
| 503 | fallback = 0; | ||
| 504 | goto find_block; | ||
| 505 | } | ||
| 506 | |||
| 507 | return NULL; | ||
| 506 | } | 508 | } |
| 507 | 509 | ||
| 508 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | 510 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, |
| 509 | unsigned long goal) | 511 | unsigned long align, |
| 512 | unsigned long goal, | ||
| 513 | unsigned long limit) | ||
| 510 | { | 514 | { |
| 511 | bootmem_data_t *bdata; | 515 | bootmem_data_t *bdata; |
| 512 | void *ptr; | ||
| 513 | 516 | ||
| 517 | restart: | ||
| 514 | list_for_each_entry(bdata, &bdata_list, list) { | 518 | list_for_each_entry(bdata, &bdata_list, list) { |
| 515 | ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); | 519 | void *region; |
| 516 | if (ptr) | 520 | |
| 517 | return ptr; | 521 | if (goal && bdata->node_low_pfn <= PFN_DOWN(goal)) |
| 522 | continue; | ||
| 523 | if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) | ||
| 524 | break; | ||
| 525 | |||
| 526 | region = alloc_bootmem_core(bdata, size, align, goal, limit); | ||
| 527 | if (region) | ||
| 528 | return region; | ||
| 529 | } | ||
| 530 | |||
| 531 | if (goal) { | ||
| 532 | goal = 0; | ||
| 533 | goto restart; | ||
| 518 | } | 534 | } |
| 535 | |||
| 519 | return NULL; | 536 | return NULL; |
| 520 | } | 537 | } |
| 521 | 538 | ||
| 522 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | 539 | /** |
| 523 | unsigned long goal) | 540 | * __alloc_bootmem_nopanic - allocate boot memory without panicking |
| 541 | * @size: size of the request in bytes | ||
| 542 | * @align: alignment of the region | ||
| 543 | * @goal: preferred starting address of the region | ||
| 544 | * | ||
| 545 | * The goal is dropped if it can not be satisfied and the allocation will | ||
| 546 | * fall back to memory below @goal. | ||
| 547 | * | ||
| 548 | * Allocation may happen on any node in the system. | ||
| 549 | * | ||
| 550 | * Returns NULL on failure. | ||
| 551 | */ | ||
| 552 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | ||
| 553 | unsigned long goal) | ||
| 524 | { | 554 | { |
| 525 | void *mem = __alloc_bootmem_nopanic(size,align,goal); | 555 | return ___alloc_bootmem_nopanic(size, align, goal, 0); |
| 556 | } | ||
| 557 | |||
| 558 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | ||
| 559 | unsigned long goal, unsigned long limit) | ||
| 560 | { | ||
| 561 | void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
| 526 | 562 | ||
| 527 | if (mem) | 563 | if (mem) |
| 528 | return mem; | 564 | return mem; |
| @@ -534,78 +570,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
| 534 | return NULL; | 570 | return NULL; |
| 535 | } | 571 | } |
| 536 | 572 | ||
| 573 | /** | ||
| 574 | * __alloc_bootmem - allocate boot memory | ||
| 575 | * @size: size of the request in bytes | ||
| 576 | * @align: alignment of the region | ||
| 577 | * @goal: preferred starting address of the region | ||
| 578 | * | ||
| 579 | * The goal is dropped if it can not be satisfied and the allocation will | ||
| 580 | * fall back to memory below @goal. | ||
| 581 | * | ||
| 582 | * Allocation may happen on any node in the system. | ||
| 583 | * | ||
| 584 | * The function panics if the request can not be satisfied. | ||
| 585 | */ | ||
| 586 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | ||
| 587 | unsigned long goal) | ||
| 588 | { | ||
| 589 | return ___alloc_bootmem(size, align, goal, 0); | ||
| 590 | } | ||
| 537 | 591 | ||
| 538 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 592 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, |
| 539 | unsigned long align, unsigned long goal) | 593 | unsigned long size, unsigned long align, |
| 594 | unsigned long goal, unsigned long limit) | ||
| 540 | { | 595 | { |
| 541 | void *ptr; | 596 | void *ptr; |
| 542 | 597 | ||
| 543 | ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 598 | ptr = alloc_bootmem_core(bdata, size, align, goal, limit); |
| 544 | if (ptr) | 599 | if (ptr) |
| 545 | return ptr; | 600 | return ptr; |
| 546 | 601 | ||
| 547 | return __alloc_bootmem(size, align, goal); | 602 | return ___alloc_bootmem(size, align, goal, limit); |
| 603 | } | ||
| 604 | |||
| 605 | /** | ||
| 606 | * __alloc_bootmem_node - allocate boot memory from a specific node | ||
| 607 | * @pgdat: node to allocate from | ||
| 608 | * @size: size of the request in bytes | ||
| 609 | * @align: alignment of the region | ||
| 610 | * @goal: preferred starting address of the region | ||
| 611 | * | ||
| 612 | * The goal is dropped if it can not be satisfied and the allocation will | ||
| 613 | * fall back to memory below @goal. | ||
| 614 | * | ||
| 615 | * Allocation may fall back to any node in the system if the specified node | ||
| 616 | * can not hold the requested memory. | ||
| 617 | * | ||
| 618 | * The function panics if the request can not be satisfied. | ||
| 619 | */ | ||
| 620 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | ||
| 621 | unsigned long align, unsigned long goal) | ||
| 622 | { | ||
| 623 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | ||
| 548 | } | 624 | } |
| 549 | 625 | ||
| 550 | #ifdef CONFIG_SPARSEMEM | 626 | #ifdef CONFIG_SPARSEMEM |
| 627 | /** | ||
| 628 | * alloc_bootmem_section - allocate boot memory from a specific section | ||
| 629 | * @size: size of the request in bytes | ||
| 630 | * @section_nr: sparse map section to allocate from | ||
| 631 | * | ||
| 632 | * Return NULL on failure. | ||
| 633 | */ | ||
| 551 | void * __init alloc_bootmem_section(unsigned long size, | 634 | void * __init alloc_bootmem_section(unsigned long size, |
| 552 | unsigned long section_nr) | 635 | unsigned long section_nr) |
| 553 | { | 636 | { |
| 554 | void *ptr; | 637 | bootmem_data_t *bdata; |
| 555 | unsigned long limit, goal, start_nr, end_nr, pfn; | 638 | unsigned long pfn, goal, limit; |
| 556 | struct pglist_data *pgdat; | ||
| 557 | 639 | ||
| 558 | pfn = section_nr_to_pfn(section_nr); | 640 | pfn = section_nr_to_pfn(section_nr); |
| 559 | goal = PFN_PHYS(pfn); | 641 | goal = pfn << PAGE_SHIFT; |
| 560 | limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; | 642 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; |
| 561 | pgdat = NODE_DATA(early_pfn_to_nid(pfn)); | 643 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
| 562 | ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal, | ||
| 563 | limit); | ||
| 564 | 644 | ||
| 565 | if (!ptr) | 645 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); |
| 566 | return NULL; | 646 | } |
| 647 | #endif | ||
| 567 | 648 | ||
| 568 | start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); | 649 | void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, |
| 569 | end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); | 650 | unsigned long align, unsigned long goal) |
| 570 | if (start_nr != section_nr || end_nr != section_nr) { | 651 | { |
| 571 | printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", | 652 | void *ptr; |
| 572 | section_nr); | ||
| 573 | free_bootmem_core(pgdat->bdata, __pa(ptr), size); | ||
| 574 | ptr = NULL; | ||
| 575 | } | ||
| 576 | 653 | ||
| 577 | return ptr; | 654 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
| 655 | if (ptr) | ||
| 656 | return ptr; | ||
| 657 | |||
| 658 | return __alloc_bootmem_nopanic(size, align, goal); | ||
| 578 | } | 659 | } |
| 579 | #endif | ||
| 580 | 660 | ||
| 581 | #ifndef ARCH_LOW_ADDRESS_LIMIT | 661 | #ifndef ARCH_LOW_ADDRESS_LIMIT |
| 582 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL | 662 | #define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL |
| 583 | #endif | 663 | #endif |
| 584 | 664 | ||
| 665 | /** | ||
| 666 | * __alloc_bootmem_low - allocate low boot memory | ||
| 667 | * @size: size of the request in bytes | ||
| 668 | * @align: alignment of the region | ||
| 669 | * @goal: preferred starting address of the region | ||
| 670 | * | ||
| 671 | * The goal is dropped if it can not be satisfied and the allocation will | ||
| 672 | * fall back to memory below @goal. | ||
| 673 | * | ||
| 674 | * Allocation may happen on any node in the system. | ||
| 675 | * | ||
| 676 | * The function panics if the request can not be satisfied. | ||
| 677 | */ | ||
| 585 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | 678 | void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, |
| 586 | unsigned long goal) | 679 | unsigned long goal) |
| 587 | { | 680 | { |
| 588 | bootmem_data_t *bdata; | 681 | return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT); |
| 589 | void *ptr; | ||
| 590 | |||
| 591 | list_for_each_entry(bdata, &bdata_list, list) { | ||
| 592 | ptr = __alloc_bootmem_core(bdata, size, align, goal, | ||
| 593 | ARCH_LOW_ADDRESS_LIMIT); | ||
| 594 | if (ptr) | ||
| 595 | return ptr; | ||
| 596 | } | ||
| 597 | |||
| 598 | /* | ||
| 599 | * Whoops, we cannot satisfy the allocation request. | ||
| 600 | */ | ||
| 601 | printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size); | ||
| 602 | panic("Out of low memory"); | ||
| 603 | return NULL; | ||
| 604 | } | 682 | } |
| 605 | 683 | ||
| 684 | /** | ||
| 685 | * __alloc_bootmem_low_node - allocate low boot memory from a specific node | ||
| 686 | * @pgdat: node to allocate from | ||
| 687 | * @size: size of the request in bytes | ||
| 688 | * @align: alignment of the region | ||
| 689 | * @goal: preferred starting address of the region | ||
| 690 | * | ||
| 691 | * The goal is dropped if it can not be satisfied and the allocation will | ||
| 692 | * fall back to memory below @goal. | ||
| 693 | * | ||
| 694 | * Allocation may fall back to any node in the system if the specified node | ||
| 695 | * can not hold the requested memory. | ||
| 696 | * | ||
| 697 | * The function panics if the request can not be satisfied. | ||
| 698 | */ | ||
| 606 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 699 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
| 607 | unsigned long align, unsigned long goal) | 700 | unsigned long align, unsigned long goal) |
| 608 | { | 701 | { |
| 609 | return __alloc_bootmem_core(pgdat->bdata, size, align, goal, | 702 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
| 610 | ARCH_LOW_ADDRESS_LIMIT); | 703 | goal, ARCH_LOW_ADDRESS_LIMIT); |
| 611 | } | 704 | } |
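
The rewrite above replaces the old node_boot_start/last_* bookkeeping with node_min_pfn plus a hint index, folds the reserve and free paths into mark_bootmem() and mark_bootmem_node(), and adds a bootmem_debug early parameter, while keeping the external interface that the new kernel-doc comments describe (init_bootmem_node(), free_bootmem(), reserve_bootmem(), the __alloc_bootmem*() family, free_all_bootmem()). As a reminder of how that interface is driven, here is a rough sketch of the usual arch-code sequence; the pfn values are invented for illustration, and real code derives them from the firmware memory map.

#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/pfn.h>

/* Illustrative boot-time flow for node 0 (pfn values are made up). */
static void __init example_bootmem_setup(void)
{
        unsigned long start_pfn  = 0x100;       /* first usable pfn           */
        unsigned long end_pfn    = 0x40000;     /* first pfn past the node    */
        unsigned long bitmap_pfn = start_pfn;   /* where the bitmap will live */
        unsigned long bitmap_size;
        void *ptr;

        /* Set up the bitmap; every page starts out reserved. */
        init_bootmem_node(NODE_DATA(0), bitmap_pfn, start_pfn, end_pfn);

        /* Hand usable RAM to the allocator, then re-reserve the bitmap itself. */
        free_bootmem(PFN_PHYS(start_pfn), PFN_PHYS(end_pfn - start_pfn));
        bitmap_size = bootmem_bootmap_pages(end_pfn - start_pfn) << PAGE_SHIFT;
        reserve_bootmem(PFN_PHYS(bitmap_pfn), bitmap_size, BOOTMEM_DEFAULT);

        /* Early allocations are served from the bitmap-managed range. */
        ptr = alloc_bootmem(PAGE_SIZE);
        (void)ptr;

        /* Later, mem_init() does: totalram_pages += free_all_bootmem(); */
}

Booting with bootmem_debug on the command line makes the new bdebug() calls log each of these steps, which is the main debugging aid this version adds.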
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..54e968650855 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -42,9 +42,6 @@ | |||
| 42 | 42 | ||
| 43 | #include <asm/mman.h> | 43 | #include <asm/mman.h> |
| 44 | 44 | ||
| 45 | static ssize_t | ||
| 46 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
| 47 | loff_t offset, unsigned long nr_segs); | ||
| 48 | 45 | ||
| 49 | /* | 46 | /* |
| 50 | * Shared mappings implemented 30.11.1994. It's not fully working yet, | 47 | * Shared mappings implemented 30.11.1994. It's not fully working yet, |
| @@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
| 112 | /* | 109 | /* |
| 113 | * Remove a page from the page cache and free it. Caller has to make | 110 | * Remove a page from the page cache and free it. Caller has to make |
| 114 | * sure the page is locked and that nobody else uses it - or that usage | 111 | * sure the page is locked and that nobody else uses it - or that usage |
| 115 | * is safe. The caller must hold a write_lock on the mapping's tree_lock. | 112 | * is safe. The caller must hold the mapping's tree_lock. |
| 116 | */ | 113 | */ |
| 117 | void __remove_from_page_cache(struct page *page) | 114 | void __remove_from_page_cache(struct page *page) |
| 118 | { | 115 | { |
| 119 | struct address_space *mapping = page->mapping; | 116 | struct address_space *mapping = page->mapping; |
| 120 | 117 | ||
| 121 | mem_cgroup_uncharge_page(page); | 118 | mem_cgroup_uncharge_cache_page(page); |
| 122 | radix_tree_delete(&mapping->page_tree, page->index); | 119 | radix_tree_delete(&mapping->page_tree, page->index); |
| 123 | page->mapping = NULL; | 120 | page->mapping = NULL; |
| 124 | mapping->nrpages--; | 121 | mapping->nrpages--; |
| @@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page) | |||
| 144 | 141 | ||
| 145 | BUG_ON(!PageLocked(page)); | 142 | BUG_ON(!PageLocked(page)); |
| 146 | 143 | ||
| 147 | write_lock_irq(&mapping->tree_lock); | 144 | spin_lock_irq(&mapping->tree_lock); |
| 148 | __remove_from_page_cache(page); | 145 | __remove_from_page_cache(page); |
| 149 | write_unlock_irq(&mapping->tree_lock); | 146 | spin_unlock_irq(&mapping->tree_lock); |
| 150 | } | 147 | } |
| 151 | 148 | ||
| 152 | static int sync_page(void *word) | 149 | static int sync_page(void *word) |
| @@ -445,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
| 445 | } | 442 | } |
| 446 | 443 | ||
| 447 | /** | 444 | /** |
| 448 | * add_to_page_cache - add newly allocated pagecache pages | 445 | * add_to_page_cache_locked - add a locked page to the pagecache |
| 449 | * @page: page to add | 446 | * @page: page to add |
| 450 | * @mapping: the page's address_space | 447 | * @mapping: the page's address_space |
| 451 | * @offset: page index | 448 | * @offset: page index |
| 452 | * @gfp_mask: page allocation mode | 449 | * @gfp_mask: page allocation mode |
| 453 | * | 450 | * |
| 454 | * This function is used to add newly allocated pagecache pages; | 451 | * This function is used to add a page to the pagecache. It must be locked. |
| 455 | * the page is new, so we can just run SetPageLocked() against it. | ||
| 456 | * The other page state flags were set by rmqueue(). | ||
| 457 | * | ||
| 458 | * This function does not add the page to the LRU. The caller must do that. | 452 | * This function does not add the page to the LRU. The caller must do that. |
| 459 | */ | 453 | */ |
| 460 | int add_to_page_cache(struct page *page, struct address_space *mapping, | 454 | int add_to_page_cache_locked(struct page *page, struct address_space *mapping, |
| 461 | pgoff_t offset, gfp_t gfp_mask) | 455 | pgoff_t offset, gfp_t gfp_mask) |
| 462 | { | 456 | { |
| 463 | int error = mem_cgroup_cache_charge(page, current->mm, | 457 | int error; |
| 458 | |||
| 459 | VM_BUG_ON(!PageLocked(page)); | ||
| 460 | |||
| 461 | error = mem_cgroup_cache_charge(page, current->mm, | ||
| 464 | gfp_mask & ~__GFP_HIGHMEM); | 462 | gfp_mask & ~__GFP_HIGHMEM); |
| 465 | if (error) | 463 | if (error) |
| 466 | goto out; | 464 | goto out; |
| 467 | 465 | ||
| 468 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); | 466 | error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); |
| 469 | if (error == 0) { | 467 | if (error == 0) { |
| 470 | write_lock_irq(&mapping->tree_lock); | 468 | page_cache_get(page); |
| 469 | page->mapping = mapping; | ||
| 470 | page->index = offset; | ||
| 471 | |||
| 472 | spin_lock_irq(&mapping->tree_lock); | ||
| 471 | error = radix_tree_insert(&mapping->page_tree, offset, page); | 473 | error = radix_tree_insert(&mapping->page_tree, offset, page); |
| 472 | if (!error) { | 474 | if (likely(!error)) { |
| 473 | page_cache_get(page); | ||
| 474 | SetPageLocked(page); | ||
| 475 | page->mapping = mapping; | ||
| 476 | page->index = offset; | ||
| 477 | mapping->nrpages++; | 475 | mapping->nrpages++; |
| 478 | __inc_zone_page_state(page, NR_FILE_PAGES); | 476 | __inc_zone_page_state(page, NR_FILE_PAGES); |
| 479 | } else | 477 | } else { |
| 480 | mem_cgroup_uncharge_page(page); | 478 | page->mapping = NULL; |
| 479 | mem_cgroup_uncharge_cache_page(page); | ||
| 480 | page_cache_release(page); | ||
| 481 | } | ||
| 481 | 482 | ||
| 482 | write_unlock_irq(&mapping->tree_lock); | 483 | spin_unlock_irq(&mapping->tree_lock); |
| 483 | radix_tree_preload_end(); | 484 | radix_tree_preload_end(); |
| 484 | } else | 485 | } else |
| 485 | mem_cgroup_uncharge_page(page); | 486 | mem_cgroup_uncharge_cache_page(page); |
| 486 | out: | 487 | out: |
| 487 | return error; | 488 | return error; |
| 488 | } | 489 | } |
| 489 | EXPORT_SYMBOL(add_to_page_cache); | 490 | EXPORT_SYMBOL(add_to_page_cache_locked); |
| 490 | 491 | ||
| 491 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 492 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
| 492 | pgoff_t offset, gfp_t gfp_mask) | 493 | pgoff_t offset, gfp_t gfp_mask) |
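The reordering above takes the page reference and fills in page->mapping and page->index before tree_lock is taken, and unwinds exactly those steps if the radix-tree insert fails. Below is a minimal userspace sketch of that prepare, insert-under-lock, unwind-on-error pattern; every name in it (cache_insert, struct obj, slots) is illustrative rather than kernel API.

#include <errno.h>
#include <pthread.h>
#include <stddef.h>

#define NSLOTS 64

struct obj {
	int refcount;
	void *owner;		/* models page->mapping */
	unsigned long index;	/* models page->index */
};

static struct obj *slots[NSLOTS];
static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

static int cache_insert(void *owner, struct obj *obj, unsigned long index)
{
	int err = 0;

	/* Prepare the object while we still own it exclusively. */
	obj->refcount++;
	obj->owner = owner;
	obj->index = index;

	pthread_mutex_lock(&tree_lock);
	if (slots[index % NSLOTS] == NULL) {
		slots[index % NSLOTS] = obj;
	} else {
		/* Insertion failed: unwind the preparation above. */
		obj->owner = NULL;
		obj->refcount--;
		err = -EEXIST;
	}
	pthread_mutex_unlock(&tree_lock);
	return err;
}

Doing the preparation outside the lock keeps the critical section down to the insert itself, which is why the error path has to undo it by hand.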
| @@ -557,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit); | |||
| 557 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. | 558 | * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. |
| 558 | * | 559 | * |
| 559 | * The first mb is necessary to safely close the critical section opened by the | 560 | * The first mb is necessary to safely close the critical section opened by the |
| 560 | * TestSetPageLocked(), the second mb is necessary to enforce ordering between | 561 | * test_and_set_bit() to lock the page; the second mb is necessary to enforce |
| 561 | * the clear_bit and the read of the waitqueue (to avoid SMP races with a | 562 | * ordering between the clear_bit and the read of the waitqueue (to avoid SMP |
| 562 | * parallel wait_on_page_locked()). | 563 | * races with a parallel wait_on_page_locked()). |
| 563 | */ | 564 | */ |
| 564 | void unlock_page(struct page *page) | 565 | void unlock_page(struct page *page) |
| 565 | { | 566 | { |
| 566 | smp_mb__before_clear_bit(); | 567 | smp_mb__before_clear_bit(); |
| 567 | if (!TestClearPageLocked(page)) | 568 | if (!test_and_clear_bit(PG_locked, &page->flags)) |
| 568 | BUG(); | 569 | BUG(); |
| 569 | smp_mb__after_clear_bit(); | 570 | smp_mb__after_clear_bit(); |
| 570 | wake_up_page(page, PG_locked); | 571 | wake_up_page(page, PG_locked); |
| @@ -636,15 +637,35 @@ void __lock_page_nosync(struct page *page) | |||
| 636 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 637 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
| 637 | * If yes, increment its refcount and return it; if no, return NULL. | 638 | * If yes, increment its refcount and return it; if no, return NULL. |
| 638 | */ | 639 | */ |
| 639 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) | 640 | struct page *find_get_page(struct address_space *mapping, pgoff_t offset) |
| 640 | { | 641 | { |
| 642 | void **pagep; | ||
| 641 | struct page *page; | 643 | struct page *page; |
| 642 | 644 | ||
| 643 | read_lock_irq(&mapping->tree_lock); | 645 | rcu_read_lock(); |
| 644 | page = radix_tree_lookup(&mapping->page_tree, offset); | 646 | repeat: |
| 645 | if (page) | 647 | page = NULL; |
| 646 | page_cache_get(page); | 648 | pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); |
| 647 | read_unlock_irq(&mapping->tree_lock); | 649 | if (pagep) { |
| 650 | page = radix_tree_deref_slot(pagep); | ||
| 651 | if (unlikely(!page || page == RADIX_TREE_RETRY)) | ||
| 652 | goto repeat; | ||
| 653 | |||
| 654 | if (!page_cache_get_speculative(page)) | ||
| 655 | goto repeat; | ||
| 656 | |||
| 657 | /* | ||
| 658 | * Has the page moved? | ||
| 659 | * This is part of the lockless pagecache protocol. See | ||
| 660 | * include/linux/pagemap.h for details. | ||
| 661 | */ | ||
| 662 | if (unlikely(page != *pagep)) { | ||
| 663 | page_cache_release(page); | ||
| 664 | goto repeat; | ||
| 665 | } | ||
| 666 | } | ||
| 667 | rcu_read_unlock(); | ||
| 668 | |||
| 648 | return page; | 669 | return page; |
| 649 | } | 670 | } |
| 650 | EXPORT_SYMBOL(find_get_page); | 671 | EXPORT_SYMBOL(find_get_page); |
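find_get_page() now walks the radix tree under rcu_read_lock() instead of tree_lock: it takes a speculative reference on the page and then re-checks that the slot still points at the same page, retrying if it moved. The following is a stripped-down userspace model of that retry protocol using C11 atomics; the kernel additionally depends on RCU to keep the struct page memory valid across the race window, and all names here are illustrative.

#include <stdatomic.h>
#include <stddef.h>

struct obj {
	atomic_int refcount;		/* 0 means the object is being freed */
};

/* Take a reference only if the object still has one; fail otherwise. */
static int get_speculative(struct obj *obj)
{
	int old = atomic_load(&obj->refcount);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
			return 1;
	}
	return 0;
}

static void put_ref(struct obj *obj)
{
	atomic_fetch_sub(&obj->refcount, 1);
}

static struct obj *lookup(struct obj *_Atomic *slot)
{
	struct obj *obj;

repeat:
	obj = atomic_load(slot);
	if (!obj)
		return NULL;
	/* A real implementation must keep the memory valid here (RCU). */
	if (!get_speculative(obj))
		goto repeat;
	/* Has the slot been repopulated while we took the reference? */
	if (obj != atomic_load(slot)) {
		put_ref(obj);
		goto repeat;
	}
	return obj;
}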
| @@ -659,32 +680,22 @@ EXPORT_SYMBOL(find_get_page); | |||
| 659 | * | 680 | * |
| 660 | * Returns zero if the page was not present. find_lock_page() may sleep. | 681 | * Returns zero if the page was not present. find_lock_page() may sleep. |
| 661 | */ | 682 | */ |
| 662 | struct page *find_lock_page(struct address_space *mapping, | 683 | struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) |
| 663 | pgoff_t offset) | ||
| 664 | { | 684 | { |
| 665 | struct page *page; | 685 | struct page *page; |
| 666 | 686 | ||
| 667 | repeat: | 687 | repeat: |
| 668 | read_lock_irq(&mapping->tree_lock); | 688 | page = find_get_page(mapping, offset); |
| 669 | page = radix_tree_lookup(&mapping->page_tree, offset); | ||
| 670 | if (page) { | 689 | if (page) { |
| 671 | page_cache_get(page); | 690 | lock_page(page); |
| 672 | if (TestSetPageLocked(page)) { | 691 | /* Has the page been truncated? */ |
| 673 | read_unlock_irq(&mapping->tree_lock); | 692 | if (unlikely(page->mapping != mapping)) { |
| 674 | __lock_page(page); | 693 | unlock_page(page); |
| 675 | 694 | page_cache_release(page); | |
| 676 | /* Has the page been truncated while we slept? */ | 695 | goto repeat; |
| 677 | if (unlikely(page->mapping != mapping)) { | ||
| 678 | unlock_page(page); | ||
| 679 | page_cache_release(page); | ||
| 680 | goto repeat; | ||
| 681 | } | ||
| 682 | VM_BUG_ON(page->index != offset); | ||
| 683 | goto out; | ||
| 684 | } | 696 | } |
| 697 | VM_BUG_ON(page->index != offset); | ||
| 685 | } | 698 | } |
| 686 | read_unlock_irq(&mapping->tree_lock); | ||
| 687 | out: | ||
| 688 | return page; | 699 | return page; |
| 689 | } | 700 | } |
| 690 | EXPORT_SYMBOL(find_lock_page); | 701 | EXPORT_SYMBOL(find_lock_page); |
| @@ -750,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, | |||
| 750 | { | 761 | { |
| 751 | unsigned int i; | 762 | unsigned int i; |
| 752 | unsigned int ret; | 763 | unsigned int ret; |
| 764 | unsigned int nr_found; | ||
| 765 | |||
| 766 | rcu_read_lock(); | ||
| 767 | restart: | ||
| 768 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
| 769 | (void ***)pages, start, nr_pages); | ||
| 770 | ret = 0; | ||
| 771 | for (i = 0; i < nr_found; i++) { | ||
| 772 | struct page *page; | ||
| 773 | repeat: | ||
| 774 | page = radix_tree_deref_slot((void **)pages[i]); | ||
| 775 | if (unlikely(!page)) | ||
| 776 | continue; | ||
| 777 | /* | ||
| 778 | * this can only trigger if nr_found == 1, making livelock | ||
| 779 | * a non issue. | ||
| 780 | */ | ||
| 781 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
| 782 | goto restart; | ||
| 753 | 783 | ||
| 754 | read_lock_irq(&mapping->tree_lock); | 784 | if (!page_cache_get_speculative(page)) |
| 755 | ret = radix_tree_gang_lookup(&mapping->page_tree, | 785 | goto repeat; |
| 756 | (void **)pages, start, nr_pages); | 786 | |
| 757 | for (i = 0; i < ret; i++) | 787 | /* Has the page moved? */ |
| 758 | page_cache_get(pages[i]); | 788 | if (unlikely(page != *((void **)pages[i]))) { |
| 759 | read_unlock_irq(&mapping->tree_lock); | 789 | page_cache_release(page); |
| 790 | goto repeat; | ||
| 791 | } | ||
| 792 | |||
| 793 | pages[ret] = page; | ||
| 794 | ret++; | ||
| 795 | } | ||
| 796 | rcu_read_unlock(); | ||
| 760 | return ret; | 797 | return ret; |
| 761 | } | 798 | } |
| 762 | 799 | ||
| @@ -777,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
| 777 | { | 814 | { |
| 778 | unsigned int i; | 815 | unsigned int i; |
| 779 | unsigned int ret; | 816 | unsigned int ret; |
| 817 | unsigned int nr_found; | ||
| 818 | |||
| 819 | rcu_read_lock(); | ||
| 820 | restart: | ||
| 821 | nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, | ||
| 822 | (void ***)pages, index, nr_pages); | ||
| 823 | ret = 0; | ||
| 824 | for (i = 0; i < nr_found; i++) { | ||
| 825 | struct page *page; | ||
| 826 | repeat: | ||
| 827 | page = radix_tree_deref_slot((void **)pages[i]); | ||
| 828 | if (unlikely(!page)) | ||
| 829 | continue; | ||
| 830 | /* | ||
| 831 | * this can only trigger if nr_found == 1, making livelock | ||
| 832 | * a non-issue. | ||
| 833 | */ | ||
| 834 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
| 835 | goto restart; | ||
| 780 | 836 | ||
| 781 | read_lock_irq(&mapping->tree_lock); | 837 | if (page->mapping == NULL || page->index != index) |
| 782 | ret = radix_tree_gang_lookup(&mapping->page_tree, | ||
| 783 | (void **)pages, index, nr_pages); | ||
| 784 | for (i = 0; i < ret; i++) { | ||
| 785 | if (pages[i]->mapping == NULL || pages[i]->index != index) | ||
| 786 | break; | 838 | break; |
| 787 | 839 | ||
| 788 | page_cache_get(pages[i]); | 840 | if (!page_cache_get_speculative(page)) |
| 841 | goto repeat; | ||
| 842 | |||
| 843 | /* Has the page moved? */ | ||
| 844 | if (unlikely(page != *((void **)pages[i]))) { | ||
| 845 | page_cache_release(page); | ||
| 846 | goto repeat; | ||
| 847 | } | ||
| 848 | |||
| 849 | pages[ret] = page; | ||
| 850 | ret++; | ||
| 789 | index++; | 851 | index++; |
| 790 | } | 852 | } |
| 791 | read_unlock_irq(&mapping->tree_lock); | 853 | rcu_read_unlock(); |
| 792 | return i; | 854 | return ret; |
| 793 | } | 855 | } |
| 794 | EXPORT_SYMBOL(find_get_pages_contig); | 856 | EXPORT_SYMBOL(find_get_pages_contig); |
| 795 | 857 | ||
| @@ -809,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
| 809 | { | 871 | { |
| 810 | unsigned int i; | 872 | unsigned int i; |
| 811 | unsigned int ret; | 873 | unsigned int ret; |
| 874 | unsigned int nr_found; | ||
| 875 | |||
| 876 | rcu_read_lock(); | ||
| 877 | restart: | ||
| 878 | nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, | ||
| 879 | (void ***)pages, *index, nr_pages, tag); | ||
| 880 | ret = 0; | ||
| 881 | for (i = 0; i < nr_found; i++) { | ||
| 882 | struct page *page; | ||
| 883 | repeat: | ||
| 884 | page = radix_tree_deref_slot((void **)pages[i]); | ||
| 885 | if (unlikely(!page)) | ||
| 886 | continue; | ||
| 887 | /* | ||
| 888 | * this can only trigger if nr_found == 1, making livelock | ||
| 889 | * a non-issue. | ||
| 890 | */ | ||
| 891 | if (unlikely(page == RADIX_TREE_RETRY)) | ||
| 892 | goto restart; | ||
| 893 | |||
| 894 | if (!page_cache_get_speculative(page)) | ||
| 895 | goto repeat; | ||
| 896 | |||
| 897 | /* Has the page moved? */ | ||
| 898 | if (unlikely(page != *((void **)pages[i]))) { | ||
| 899 | page_cache_release(page); | ||
| 900 | goto repeat; | ||
| 901 | } | ||
| 902 | |||
| 903 | pages[ret] = page; | ||
| 904 | ret++; | ||
| 905 | } | ||
| 906 | rcu_read_unlock(); | ||
| 812 | 907 | ||
| 813 | read_lock_irq(&mapping->tree_lock); | ||
| 814 | ret = radix_tree_gang_lookup_tag(&mapping->page_tree, | ||
| 815 | (void **)pages, *index, nr_pages, tag); | ||
| 816 | for (i = 0; i < ret; i++) | ||
| 817 | page_cache_get(pages[i]); | ||
| 818 | if (ret) | 908 | if (ret) |
| 819 | *index = pages[ret - 1]->index + 1; | 909 | *index = pages[ret - 1]->index + 1; |
| 820 | read_unlock_irq(&mapping->tree_lock); | 910 | |
| 821 | return ret; | 911 | return ret; |
| 822 | } | 912 | } |
| 823 | EXPORT_SYMBOL(find_get_pages_tag); | 913 | EXPORT_SYMBOL(find_get_pages_tag); |
| @@ -841,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) | |||
| 841 | struct page *page = find_get_page(mapping, index); | 931 | struct page *page = find_get_page(mapping, index); |
| 842 | 932 | ||
| 843 | if (page) { | 933 | if (page) { |
| 844 | if (!TestSetPageLocked(page)) | 934 | if (trylock_page(page)) |
| 845 | return page; | 935 | return page; |
| 846 | page_cache_release(page); | 936 | page_cache_release(page); |
| 847 | return NULL; | 937 | return NULL; |
| @@ -933,8 +1023,17 @@ find_page: | |||
| 933 | ra, filp, page, | 1023 | ra, filp, page, |
| 934 | index, last_index - index); | 1024 | index, last_index - index); |
| 935 | } | 1025 | } |
| 936 | if (!PageUptodate(page)) | 1026 | if (!PageUptodate(page)) { |
| 937 | goto page_not_up_to_date; | 1027 | if (inode->i_blkbits == PAGE_CACHE_SHIFT || |
| 1028 | !mapping->a_ops->is_partially_uptodate) | ||
| 1029 | goto page_not_up_to_date; | ||
| 1030 | if (!trylock_page(page)) | ||
| 1031 | goto page_not_up_to_date; | ||
| 1032 | if (!mapping->a_ops->is_partially_uptodate(page, | ||
| 1033 | desc, offset)) | ||
| 1034 | goto page_not_up_to_date_locked; | ||
| 1035 | unlock_page(page); | ||
| 1036 | } | ||
| 938 | page_ok: | 1037 | page_ok: |
| 939 | /* | 1038 | /* |
| 940 | * i_size must be checked after we know the page is Uptodate. | 1039 | * i_size must be checked after we know the page is Uptodate. |
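The new branch above lets the read proceed from a page that is not fully up to date, provided the filesystem's ->is_partially_uptodate() hook confirms that the bytes being asked for are. The check such a hook performs amounts to walking per-block uptodate state; the sketch below is only a model of that idea, and the page and block sizes in it are assumptions, not values taken from the diff.

#include <stdbool.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL
#define BLOCK_SIZE	512UL
#define BLOCKS_PER_PAGE	(PAGE_SIZE / BLOCK_SIZE)

struct page_state {
	bool block_uptodate[BLOCKS_PER_PAGE];
};

/* Is every block touched by [offset, offset + count) up to date?
 * offset is assumed to lie within the page. */
static bool range_is_uptodate(const struct page_state *ps,
			      unsigned long offset, unsigned long count)
{
	unsigned long first, last, i;

	if (count == 0)
		return true;
	if (offset + count > PAGE_SIZE)
		count = PAGE_SIZE - offset;	/* clamp to this page */

	first = offset / BLOCK_SIZE;
	last = (offset + count - 1) / BLOCK_SIZE;
	for (i = first; i <= last; i++)
		if (!ps->block_uptodate[i])
			return false;
	return true;
}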
| @@ -1004,6 +1103,7 @@ page_not_up_to_date: | |||
| 1004 | if (lock_page_killable(page)) | 1103 | if (lock_page_killable(page)) |
| 1005 | goto readpage_eio; | 1104 | goto readpage_eio; |
| 1006 | 1105 | ||
| 1106 | page_not_up_to_date_locked: | ||
| 1007 | /* Did it get truncated before we got the lock? */ | 1107 | /* Did it get truncated before we got the lock? */ |
| 1008 | if (!page->mapping) { | 1108 | if (!page->mapping) { |
| 1009 | unlock_page(page); | 1109 | unlock_page(page); |
| @@ -1200,42 +1300,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
| 1200 | 1300 | ||
| 1201 | mapping = filp->f_mapping; | 1301 | mapping = filp->f_mapping; |
| 1202 | inode = mapping->host; | 1302 | inode = mapping->host; |
| 1203 | retval = 0; | ||
| 1204 | if (!count) | 1303 | if (!count) |
| 1205 | goto out; /* skip atime */ | 1304 | goto out; /* skip atime */ |
| 1206 | size = i_size_read(inode); | 1305 | size = i_size_read(inode); |
| 1207 | if (pos < size) { | 1306 | if (pos < size) { |
| 1208 | retval = generic_file_direct_IO(READ, iocb, | 1307 | retval = filemap_write_and_wait(mapping); |
| 1209 | iov, pos, nr_segs); | 1308 | if (!retval) { |
| 1309 | retval = mapping->a_ops->direct_IO(READ, iocb, | ||
| 1310 | iov, pos, nr_segs); | ||
| 1311 | } | ||
| 1210 | if (retval > 0) | 1312 | if (retval > 0) |
| 1211 | *ppos = pos + retval; | 1313 | *ppos = pos + retval; |
| 1212 | } | 1314 | if (retval) { |
| 1213 | if (likely(retval != 0)) { | 1315 | file_accessed(filp); |
| 1214 | file_accessed(filp); | 1316 | goto out; |
| 1215 | goto out; | 1317 | } |
| 1216 | } | 1318 | } |
| 1217 | } | 1319 | } |
| 1218 | 1320 | ||
| 1219 | retval = 0; | 1321 | for (seg = 0; seg < nr_segs; seg++) { |
| 1220 | if (count) { | 1322 | read_descriptor_t desc; |
| 1221 | for (seg = 0; seg < nr_segs; seg++) { | ||
| 1222 | read_descriptor_t desc; | ||
| 1223 | 1323 | ||
| 1224 | desc.written = 0; | 1324 | desc.written = 0; |
| 1225 | desc.arg.buf = iov[seg].iov_base; | 1325 | desc.arg.buf = iov[seg].iov_base; |
| 1226 | desc.count = iov[seg].iov_len; | 1326 | desc.count = iov[seg].iov_len; |
| 1227 | if (desc.count == 0) | 1327 | if (desc.count == 0) |
| 1228 | continue; | 1328 | continue; |
| 1229 | desc.error = 0; | 1329 | desc.error = 0; |
| 1230 | do_generic_file_read(filp,ppos,&desc,file_read_actor); | 1330 | do_generic_file_read(filp, ppos, &desc, file_read_actor); |
| 1231 | retval += desc.written; | 1331 | retval += desc.written; |
| 1232 | if (desc.error) { | 1332 | if (desc.error) { |
| 1233 | retval = retval ?: desc.error; | 1333 | retval = retval ?: desc.error; |
| 1234 | break; | 1334 | break; |
| 1235 | } | ||
| 1236 | if (desc.count > 0) | ||
| 1237 | break; | ||
| 1238 | } | 1335 | } |
| 1336 | if (desc.count > 0) | ||
| 1337 | break; | ||
| 1239 | } | 1338 | } |
| 1240 | out: | 1339 | out: |
| 1241 | return retval; | 1340 | return retval; |
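The restructured loop above drives one read_descriptor_t per iovec segment and stops at the first error or the first segment that could not be filled completely. Roughly the same control flow is shown below in a self-contained userspace form, with pread() standing in for the page-cache copy step; this is a sketch of the loop's shape, not of the kernel path.

#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

/* Stand-in for the page-cache copy: just pread() from an open fd. */
static ssize_t read_segment(int fd, void *buf, size_t len, off_t *ppos)
{
	ssize_t n = pread(fd, buf, len, *ppos);

	if (n > 0)
		*ppos += n;
	return n;
}

static ssize_t read_iovec(int fd, const struct iovec *iov,
			  unsigned long nr_segs, off_t *ppos)
{
	ssize_t retval = 0;
	unsigned long seg;

	for (seg = 0; seg < nr_segs; seg++) {
		ssize_t done;

		if (iov[seg].iov_len == 0)
			continue;
		done = read_segment(fd, iov[seg].iov_base,
				    iov[seg].iov_len, ppos);
		if (done < 0) {
			/* Report the error only if nothing was read yet. */
			retval = retval ? retval : done;
			break;
		}
		retval += done;
		if ((size_t)done < iov[seg].iov_len)
			break;	/* short read: stop, like desc.count > 0 */
	}
	return retval;
}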
| @@ -1669,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill) | |||
| 1669 | return notify_change(dentry, &newattrs); | 1768 | return notify_change(dentry, &newattrs); |
| 1670 | } | 1769 | } |
| 1671 | 1770 | ||
| 1672 | int remove_suid(struct dentry *dentry) | 1771 | int file_remove_suid(struct file *file) |
| 1673 | { | 1772 | { |
| 1773 | struct dentry *dentry = file->f_path.dentry; | ||
| 1674 | int killsuid = should_remove_suid(dentry); | 1774 | int killsuid = should_remove_suid(dentry); |
| 1675 | int killpriv = security_inode_need_killpriv(dentry); | 1775 | int killpriv = security_inode_need_killpriv(dentry); |
| 1676 | int error = 0; | 1776 | int error = 0; |
| @@ -1684,7 +1784,7 @@ int remove_suid(struct dentry *dentry) | |||
| 1684 | 1784 | ||
| 1685 | return error; | 1785 | return error; |
| 1686 | } | 1786 | } |
| 1687 | EXPORT_SYMBOL(remove_suid); | 1787 | EXPORT_SYMBOL(file_remove_suid); |
| 1688 | 1788 | ||
| 1689 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, | 1789 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
| 1690 | const struct iovec *iov, size_t base, size_t bytes) | 1790 | const struct iovec *iov, size_t base, size_t bytes) |
| @@ -1779,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
| 1779 | * The !iov->iov_len check ensures we skip over unlikely | 1879 | * The !iov->iov_len check ensures we skip over unlikely |
| 1780 | * zero-length segments (without overrunning the iovec). | 1880 | * zero-length segments (without overrunning the iovec). |
| 1781 | */ | 1881 | */ |
| 1782 | while (bytes || unlikely(!iov->iov_len && i->count)) { | 1882 | while (bytes || unlikely(i->count && !iov->iov_len)) { |
| 1783 | int copy; | 1883 | int copy; |
| 1784 | 1884 | ||
| 1785 | copy = min(bytes, iov->iov_len - base); | 1885 | copy = min(bytes, iov->iov_len - base); |
| @@ -2004,11 +2104,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2004 | struct address_space *mapping = file->f_mapping; | 2104 | struct address_space *mapping = file->f_mapping; |
| 2005 | struct inode *inode = mapping->host; | 2105 | struct inode *inode = mapping->host; |
| 2006 | ssize_t written; | 2106 | ssize_t written; |
| 2107 | size_t write_len; | ||
| 2108 | pgoff_t end; | ||
| 2007 | 2109 | ||
| 2008 | if (count != ocount) | 2110 | if (count != ocount) |
| 2009 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); | 2111 | *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); |
| 2010 | 2112 | ||
| 2011 | written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); | 2113 | /* |
| 2114 | * Unmap all mmappings of the file up-front. | ||
| 2115 | * | ||
| 2116 | * This will cause any pte dirty bits to be propagated into the | ||
| 2117 | * pageframes for the subsequent filemap_write_and_wait(). | ||
| 2118 | */ | ||
| 2119 | write_len = iov_length(iov, *nr_segs); | ||
| 2120 | end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; | ||
| 2121 | if (mapping_mapped(mapping)) | ||
| 2122 | unmap_mapping_range(mapping, pos, write_len, 0); | ||
| 2123 | |||
| 2124 | written = filemap_write_and_wait(mapping); | ||
| 2125 | if (written) | ||
| 2126 | goto out; | ||
| 2127 | |||
| 2128 | /* | ||
| 2129 | * After a write we want buffered reads to be sure to go to disk to get | ||
| 2130 | * the new data. We invalidate clean cached page from the region we're | ||
| 2131 | * about to write. We do this *before* the write so that we can return | ||
| 2132 | * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). | ||
| 2133 | */ | ||
| 2134 | if (mapping->nrpages) { | ||
| 2135 | written = invalidate_inode_pages2_range(mapping, | ||
| 2136 | pos >> PAGE_CACHE_SHIFT, end); | ||
| 2137 | if (written) | ||
| 2138 | goto out; | ||
| 2139 | } | ||
| 2140 | |||
| 2141 | written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs); | ||
| 2142 | |||
| 2143 | /* | ||
| 2144 | * Finally, try again to invalidate clean pages which might have been | ||
| 2145 | * cached by non-direct readahead, or faulted in by get_user_pages() | ||
| 2146 | * if the source of the write was an mmap'ed region of the file | ||
| 2147 | * we're writing. Either one is a pretty crazy thing to do, | ||
| 2148 | * so we don't support it 100%. If this invalidation | ||
| 2149 | * fails, tough, the write still worked... | ||
| 2150 | */ | ||
| 2151 | if (mapping->nrpages) { | ||
| 2152 | invalidate_inode_pages2_range(mapping, | ||
| 2153 | pos >> PAGE_CACHE_SHIFT, end); | ||
| 2154 | } | ||
| 2155 | |||
| 2012 | if (written > 0) { | 2156 | if (written > 0) { |
| 2013 | loff_t end = pos + written; | 2157 | loff_t end = pos + written; |
| 2014 | if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { | 2158 | if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { |
| @@ -2024,6 +2168,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2024 | * i_mutex is held, which protects generic_osync_inode() from | 2168 | * i_mutex is held, which protects generic_osync_inode() from |
| 2025 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. | 2169 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. |
| 2026 | */ | 2170 | */ |
| 2171 | out: | ||
| 2027 | if ((written >= 0 || written == -EIOCBQUEUED) && | 2172 | if ((written >= 0 || written == -EIOCBQUEUED) && |
| 2028 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2173 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 2029 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); | 2174 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); |
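Both invalidate_inode_pages2_range() calls above operate on page-cache indices derived from the byte range being written: the first index is pos >> PAGE_CACHE_SHIFT and the last is (pos + write_len - 1) >> PAGE_CACHE_SHIFT. A tiny self-contained example of that arithmetic follows, assuming a 4KB page size purely for illustration (the length must be non-zero, as it is on this path).

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

struct page_range {
	unsigned long first;	/* pos >> PAGE_SHIFT */
	unsigned long last;	/* (pos + len - 1) >> PAGE_SHIFT */
};

static struct page_range bytes_to_pages(unsigned long long pos,
					unsigned long long len)
{
	struct page_range r;

	r.first = pos >> PAGE_SHIFT;
	r.last = (pos + len - 1) >> PAGE_SHIFT;
	return r;
}

int main(void)
{
	/* A 3-byte write straddling a page boundary touches two pages. */
	struct page_range r = bytes_to_pages(PAGE_SIZE - 1, 3);

	printf("first=%lu last=%lu\n", r.first, r.last);	/* 0 and 1 */
	return 0;
}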
| @@ -2395,7 +2540,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
| 2395 | if (count == 0) | 2540 | if (count == 0) |
| 2396 | goto out; | 2541 | goto out; |
| 2397 | 2542 | ||
| 2398 | err = remove_suid(file->f_path.dentry); | 2543 | err = file_remove_suid(file); |
| 2399 | if (err) | 2544 | if (err) |
| 2400 | goto out; | 2545 | goto out; |
| 2401 | 2546 | ||
| @@ -2511,66 +2656,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2511 | } | 2656 | } |
| 2512 | EXPORT_SYMBOL(generic_file_aio_write); | 2657 | EXPORT_SYMBOL(generic_file_aio_write); |
| 2513 | 2658 | ||
| 2514 | /* | ||
| 2515 | * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something | ||
| 2516 | * went wrong during pagecache shootdown. | ||
| 2517 | */ | ||
| 2518 | static ssize_t | ||
| 2519 | generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | ||
| 2520 | loff_t offset, unsigned long nr_segs) | ||
| 2521 | { | ||
| 2522 | struct file *file = iocb->ki_filp; | ||
| 2523 | struct address_space *mapping = file->f_mapping; | ||
| 2524 | ssize_t retval; | ||
| 2525 | size_t write_len; | ||
| 2526 | pgoff_t end = 0; /* silence gcc */ | ||
| 2527 | |||
| 2528 | /* | ||
| 2529 | * If it's a write, unmap all mmappings of the file up-front. This | ||
| 2530 | * will cause any pte dirty bits to be propagated into the pageframes | ||
| 2531 | * for the subsequent filemap_write_and_wait(). | ||
| 2532 | */ | ||
| 2533 | if (rw == WRITE) { | ||
| 2534 | write_len = iov_length(iov, nr_segs); | ||
| 2535 | end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT; | ||
| 2536 | if (mapping_mapped(mapping)) | ||
| 2537 | unmap_mapping_range(mapping, offset, write_len, 0); | ||
| 2538 | } | ||
| 2539 | |||
| 2540 | retval = filemap_write_and_wait(mapping); | ||
| 2541 | if (retval) | ||
| 2542 | goto out; | ||
| 2543 | |||
| 2544 | /* | ||
| 2545 | * After a write we want buffered reads to be sure to go to disk to get | ||
| 2546 | * the new data. We invalidate clean cached page from the region we're | ||
| 2547 | * about to write. We do this *before* the write so that we can return | ||
| 2548 | * -EIO without clobbering -EIOCBQUEUED from ->direct_IO(). | ||
| 2549 | */ | ||
| 2550 | if (rw == WRITE && mapping->nrpages) { | ||
| 2551 | retval = invalidate_inode_pages2_range(mapping, | ||
| 2552 | offset >> PAGE_CACHE_SHIFT, end); | ||
| 2553 | if (retval) | ||
| 2554 | goto out; | ||
| 2555 | } | ||
| 2556 | |||
| 2557 | retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); | ||
| 2558 | |||
| 2559 | /* | ||
| 2560 | * Finally, try again to invalidate clean pages which might have been | ||
| 2561 | * cached by non-direct readahead, or faulted in by get_user_pages() | ||
| 2562 | * if the source of the write was an mmap'ed region of the file | ||
| 2563 | * we're writing. Either one is a pretty crazy thing to do, | ||
| 2564 | * so we don't support it 100%. If this invalidation | ||
| 2565 | * fails, tough, the write still worked... | ||
| 2566 | */ | ||
| 2567 | if (rw == WRITE && mapping->nrpages) { | ||
| 2568 | invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end); | ||
| 2569 | } | ||
| 2570 | out: | ||
| 2571 | return retval; | ||
| 2572 | } | ||
| 2573 | |||
| 2574 | /** | 2659 | /** |
| 2575 | * try_to_release_page() - release old fs-specific metadata on a page | 2660 | * try_to_release_page() - release old fs-specific metadata on a page |
| 2576 | * | 2661 | * |
| @@ -2582,9 +2667,8 @@ out: | |||
| 2582 | * Otherwise return zero. | 2667 | * Otherwise return zero. |
| 2583 | * | 2668 | * |
| 2584 | * The @gfp_mask argument specifies whether I/O may be performed to release | 2669 | * The @gfp_mask argument specifies whether I/O may be performed to release |
| 2585 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). | 2670 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). |
| 2586 | * | 2671 | * |
| 2587 | * NOTE: @gfp_mask may go away, and this function may become non-blocking. | ||
| 2588 | */ | 2672 | */ |
| 2589 | int try_to_release_page(struct page *page, gfp_t gfp_mask) | 2673 | int try_to_release_page(struct page *page, gfp_t gfp_mask) |
| 2590 | { | 2674 | { |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 3e744abcce9d..380ab402d711 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
| 15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
| 16 | #include <linux/mmu_notifier.h> | ||
| 16 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
| 17 | #include <asm/tlbflush.h> | 18 | #include <asm/tlbflush.h> |
| 18 | #include <asm/io.h> | 19 | #include <asm/io.h> |
| @@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping, | |||
| 188 | if (pte) { | 189 | if (pte) { |
| 189 | /* Nuke the page table entry. */ | 190 | /* Nuke the page table entry. */ |
| 190 | flush_cache_page(vma, address, pte_pfn(*pte)); | 191 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| 191 | pteval = ptep_clear_flush(vma, address, pte); | 192 | pteval = ptep_clear_flush_notify(vma, address, pte); |
| 192 | page_remove_rmap(page, vma); | 193 | page_remove_rmap(page, vma); |
| 193 | dec_mm_counter(mm, file_rss); | 194 | dec_mm_counter(mm, file_rss); |
| 194 | BUG_ON(pte_dirty(pteval)); | 195 | BUG_ON(pte_dirty(pteval)); |
| @@ -380,7 +381,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len, | |||
| 380 | if (count == 0) | 381 | if (count == 0) |
| 381 | goto out_backing; | 382 | goto out_backing; |
| 382 | 383 | ||
| 383 | ret = remove_suid(filp->f_path.dentry); | 384 | ret = file_remove_suid(filp); |
| 384 | if (ret) | 385 | if (ret) |
| 385 | goto out_backing; | 386 | goto out_backing; |
| 386 | 387 | ||
diff --git a/mm/fremap.c b/mm/fremap.c index 07a9c82ce1a3..7881638e4a12 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
| 16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
| 17 | #include <linux/syscalls.h> | 17 | #include <linux/syscalls.h> |
| 18 | #include <linux/mmu_notifier.h> | ||
| 18 | 19 | ||
| 19 | #include <asm/mmu_context.h> | 20 | #include <asm/mmu_context.h> |
| 20 | #include <asm/cacheflush.h> | 21 | #include <asm/cacheflush.h> |
| @@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, | |||
| 214 | spin_unlock(&mapping->i_mmap_lock); | 215 | spin_unlock(&mapping->i_mmap_lock); |
| 215 | } | 216 | } |
| 216 | 217 | ||
| 218 | mmu_notifier_invalidate_range_start(mm, start, start + size); | ||
| 217 | err = populate_range(mm, vma, start, size, pgoff); | 219 | err = populate_range(mm, vma, start, size, pgoff); |
| 220 | mmu_notifier_invalidate_range_end(mm, start, start + size); | ||
| 218 | if (!err && !(flags & MAP_NONBLOCK)) { | 221 | if (!err && !(flags & MAP_NONBLOCK)) { |
| 219 | if (unlikely(has_write_lock)) { | 222 | if (unlikely(has_write_lock)) { |
| 220 | downgrade_write(&mm->mmap_sem); | 223 | downgrade_write(&mm->mmap_sem); |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab171274ef21..67a71191136e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -9,43 +9,357 @@ | |||
| 9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
| 10 | #include <linux/sysctl.h> | 10 | #include <linux/sysctl.h> |
| 11 | #include <linux/highmem.h> | 11 | #include <linux/highmem.h> |
| 12 | #include <linux/mmu_notifier.h> | ||
| 12 | #include <linux/nodemask.h> | 13 | #include <linux/nodemask.h> |
| 13 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
| 14 | #include <linux/mempolicy.h> | 15 | #include <linux/mempolicy.h> |
| 15 | #include <linux/cpuset.h> | 16 | #include <linux/cpuset.h> |
| 16 | #include <linux/mutex.h> | 17 | #include <linux/mutex.h> |
| 18 | #include <linux/bootmem.h> | ||
| 19 | #include <linux/sysfs.h> | ||
| 17 | 20 | ||
| 18 | #include <asm/page.h> | 21 | #include <asm/page.h> |
| 19 | #include <asm/pgtable.h> | 22 | #include <asm/pgtable.h> |
| 23 | #include <asm/io.h> | ||
| 20 | 24 | ||
| 21 | #include <linux/hugetlb.h> | 25 | #include <linux/hugetlb.h> |
| 22 | #include "internal.h" | 26 | #include "internal.h" |
| 23 | 27 | ||
| 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 28 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
| 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | ||
| 26 | static unsigned long surplus_huge_pages; | ||
| 27 | static unsigned long nr_overcommit_huge_pages; | ||
| 28 | unsigned long max_huge_pages; | ||
| 29 | unsigned long sysctl_overcommit_huge_pages; | ||
| 30 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | ||
| 31 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | ||
| 32 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | ||
| 33 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
| 34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 29 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
| 35 | unsigned long hugepages_treat_as_movable; | 30 | unsigned long hugepages_treat_as_movable; |
| 36 | static int hugetlb_next_nid; | 31 | |
| 32 | static int max_hstate; | ||
| 33 | unsigned int default_hstate_idx; | ||
| 34 | struct hstate hstates[HUGE_MAX_HSTATE]; | ||
| 35 | |||
| 36 | __initdata LIST_HEAD(huge_boot_pages); | ||
| 37 | |||
| 38 | /* for command line parsing */ | ||
| 39 | static struct hstate * __initdata parsed_hstate; | ||
| 40 | static unsigned long __initdata default_hstate_max_huge_pages; | ||
| 41 | static unsigned long __initdata default_hstate_size; | ||
| 42 | |||
| 43 | #define for_each_hstate(h) \ | ||
| 44 | for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) | ||
| 37 | 45 | ||
| 38 | /* | 46 | /* |
| 39 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 47 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
| 40 | */ | 48 | */ |
| 41 | static DEFINE_SPINLOCK(hugetlb_lock); | 49 | static DEFINE_SPINLOCK(hugetlb_lock); |
| 42 | 50 | ||
| 43 | static void clear_huge_page(struct page *page, unsigned long addr) | 51 | /* |
| 52 | * Region tracking -- allows tracking of reservations and instantiated pages | ||
| 53 | * across the pages in a mapping. | ||
| 54 | * | ||
| 55 | * The region data structures are protected by a combination of the mmap_sem | ||
| 56 | * and the hugetlb_instantiation_mutex. To access or modify a region the caller | ||
| 57 | * must either hold the mmap_sem for write, or the mmap_sem for read and | ||
| 58 | * the hugetlb_instantiation mutex: | ||
| 59 | * | ||
| 60 | * down_write(&mm->mmap_sem); | ||
| 61 | * or | ||
| 62 | * down_read(&mm->mmap_sem); | ||
| 63 | * mutex_lock(&hugetlb_instantiation_mutex); | ||
| 64 | */ | ||
| 65 | struct file_region { | ||
| 66 | struct list_head link; | ||
| 67 | long from; | ||
| 68 | long to; | ||
| 69 | }; | ||
| 70 | |||
| 71 | static long region_add(struct list_head *head, long f, long t) | ||
| 72 | { | ||
| 73 | struct file_region *rg, *nrg, *trg; | ||
| 74 | |||
| 75 | /* Locate the region we are either in or before. */ | ||
| 76 | list_for_each_entry(rg, head, link) | ||
| 77 | if (f <= rg->to) | ||
| 78 | break; | ||
| 79 | |||
| 80 | /* Round our left edge to the current segment if it encloses us. */ | ||
| 81 | if (f > rg->from) | ||
| 82 | f = rg->from; | ||
| 83 | |||
| 84 | /* Check for and consume any regions we now overlap with. */ | ||
| 85 | nrg = rg; | ||
| 86 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
| 87 | if (&rg->link == head) | ||
| 88 | break; | ||
| 89 | if (rg->from > t) | ||
| 90 | break; | ||
| 91 | |||
| 92 | /* If this area reaches higher, then extend our area to | ||
| 93 | * include it completely. If this is not the first area | ||
| 94 | * which we intend to reuse, free it. */ | ||
| 95 | if (rg->to > t) | ||
| 96 | t = rg->to; | ||
| 97 | if (rg != nrg) { | ||
| 98 | list_del(&rg->link); | ||
| 99 | kfree(rg); | ||
| 100 | } | ||
| 101 | } | ||
| 102 | nrg->from = f; | ||
| 103 | nrg->to = t; | ||
| 104 | return 0; | ||
| 105 | } | ||
| 106 | |||
| 107 | static long region_chg(struct list_head *head, long f, long t) | ||
| 108 | { | ||
| 109 | struct file_region *rg, *nrg; | ||
| 110 | long chg = 0; | ||
| 111 | |||
| 112 | /* Locate the region we are before or in. */ | ||
| 113 | list_for_each_entry(rg, head, link) | ||
| 114 | if (f <= rg->to) | ||
| 115 | break; | ||
| 116 | |||
| 117 | /* If we are below the current region then a new region is required. | ||
| 118 | * Subtle, allocate a new region at the position but make it zero | ||
| 119 | * size such that we can guarantee to record the reservation. */ | ||
| 120 | if (&rg->link == head || t < rg->from) { | ||
| 121 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
| 122 | if (!nrg) | ||
| 123 | return -ENOMEM; | ||
| 124 | nrg->from = f; | ||
| 125 | nrg->to = f; | ||
| 126 | INIT_LIST_HEAD(&nrg->link); | ||
| 127 | list_add(&nrg->link, rg->link.prev); | ||
| 128 | |||
| 129 | return t - f; | ||
| 130 | } | ||
| 131 | |||
| 132 | /* Round our left edge to the current segment if it encloses us. */ | ||
| 133 | if (f > rg->from) | ||
| 134 | f = rg->from; | ||
| 135 | chg = t - f; | ||
| 136 | |||
| 137 | /* Check for and consume any regions we now overlap with. */ | ||
| 138 | list_for_each_entry(rg, rg->link.prev, link) { | ||
| 139 | if (&rg->link == head) | ||
| 140 | break; | ||
| 141 | if (rg->from > t) | ||
| 142 | return chg; | ||
| 143 | |||
| 144 | /* We overlap with this area, if it extends further than | ||
| 145 | * us then we must extend ourselves. Account for its | ||
| 146 | * existing reservation. */ | ||
| 147 | if (rg->to > t) { | ||
| 148 | chg += rg->to - t; | ||
| 149 | t = rg->to; | ||
| 150 | } | ||
| 151 | chg -= rg->to - rg->from; | ||
| 152 | } | ||
| 153 | return chg; | ||
| 154 | } | ||
| 155 | |||
| 156 | static long region_truncate(struct list_head *head, long end) | ||
| 157 | { | ||
| 158 | struct file_region *rg, *trg; | ||
| 159 | long chg = 0; | ||
| 160 | |||
| 161 | /* Locate the region we are either in or before. */ | ||
| 162 | list_for_each_entry(rg, head, link) | ||
| 163 | if (end <= rg->to) | ||
| 164 | break; | ||
| 165 | if (&rg->link == head) | ||
| 166 | return 0; | ||
| 167 | |||
| 168 | /* If we are in the middle of a region then adjust it. */ | ||
| 169 | if (end > rg->from) { | ||
| 170 | chg = rg->to - end; | ||
| 171 | rg->to = end; | ||
| 172 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
| 173 | } | ||
| 174 | |||
| 175 | /* Drop any remaining regions. */ | ||
| 176 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
| 177 | if (&rg->link == head) | ||
| 178 | break; | ||
| 179 | chg += rg->to - rg->from; | ||
| 180 | list_del(&rg->link); | ||
| 181 | kfree(rg); | ||
| 182 | } | ||
| 183 | return chg; | ||
| 184 | } | ||
| 185 | |||
| 186 | static long region_count(struct list_head *head, long f, long t) | ||
| 187 | { | ||
| 188 | struct file_region *rg; | ||
| 189 | long chg = 0; | ||
| 190 | |||
| 191 | /* Locate each segment we overlap with, and count that overlap. */ | ||
| 192 | list_for_each_entry(rg, head, link) { | ||
| 193 | int seg_from; | ||
| 194 | int seg_to; | ||
| 195 | |||
| 196 | if (rg->to <= f) | ||
| 197 | continue; | ||
| 198 | if (rg->from >= t) | ||
| 199 | break; | ||
| 200 | |||
| 201 | seg_from = max(rg->from, f); | ||
| 202 | seg_to = min(rg->to, t); | ||
| 203 | |||
| 204 | chg += seg_to - seg_from; | ||
| 205 | } | ||
| 206 | |||
| 207 | return chg; | ||
| 208 | } | ||
| 209 | |||
| 210 | /* | ||
| 211 | * Convert the address within this vma to the page offset within | ||
| 212 | * the mapping, in pagecache page units; huge pages here. | ||
| 213 | */ | ||
| 214 | static pgoff_t vma_hugecache_offset(struct hstate *h, | ||
| 215 | struct vm_area_struct *vma, unsigned long address) | ||
| 216 | { | ||
| 217 | return ((address - vma->vm_start) >> huge_page_shift(h)) + | ||
| 218 | (vma->vm_pgoff >> huge_page_order(h)); | ||
| 219 | } | ||
| 220 | |||
| 221 | /* | ||
| 222 | * Flags for MAP_PRIVATE reservations. These are stored in the bottom | ||
| 223 | * bits of the reservation map pointer, which are always clear due to | ||
| 224 | * alignment. | ||
| 225 | */ | ||
| 226 | #define HPAGE_RESV_OWNER (1UL << 0) | ||
| 227 | #define HPAGE_RESV_UNMAPPED (1UL << 1) | ||
| 228 | #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED) | ||
| 229 | |||
| 230 | /* | ||
| 231 | * These helpers are used to track how many pages are reserved for | ||
| 232 | * faults in a MAP_PRIVATE mapping. Only the process that called mmap() | ||
| 233 | * is guaranteed to have its future faults succeed. | ||
| 234 | * | ||
| 235 | * With the exception of reset_vma_resv_huge_pages() which is called at fork(), | ||
| 236 | * the reserve counters are updated with the hugetlb_lock held. It is safe | ||
| 237 | * to reset the VMA at fork() time as it is not in use yet and there is no | ||
| 238 | * chance of the global counters getting corrupted as a result of the values. | ||
| 239 | * | ||
| 240 | * The private mapping reservation is represented in a subtly different | ||
| 241 | * manner to a shared mapping. A shared mapping has a region map associated | ||
| 242 | * with the underlying file, this region map represents the backing file | ||
| 243 | * pages which have ever had a reservation assigned; this persists even | ||
| 244 | * after the page is instantiated. A private mapping has a region map | ||
| 245 | * associated with the original mmap which is attached to all VMAs which | ||
| 246 | * reference it, this region map represents those offsets which have consumed | ||
| 247 | * reservation ie. where pages have been instantiated. | ||
| 248 | */ | ||
| 249 | static unsigned long get_vma_private_data(struct vm_area_struct *vma) | ||
| 250 | { | ||
| 251 | return (unsigned long)vma->vm_private_data; | ||
| 252 | } | ||
| 253 | |||
| 254 | static void set_vma_private_data(struct vm_area_struct *vma, | ||
| 255 | unsigned long value) | ||
| 256 | { | ||
| 257 | vma->vm_private_data = (void *)value; | ||
| 258 | } | ||
| 259 | |||
| 260 | struct resv_map { | ||
| 261 | struct kref refs; | ||
| 262 | struct list_head regions; | ||
| 263 | }; | ||
| 264 | |||
| 265 | struct resv_map *resv_map_alloc(void) | ||
| 266 | { | ||
| 267 | struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); | ||
| 268 | if (!resv_map) | ||
| 269 | return NULL; | ||
| 270 | |||
| 271 | kref_init(&resv_map->refs); | ||
| 272 | INIT_LIST_HEAD(&resv_map->regions); | ||
| 273 | |||
| 274 | return resv_map; | ||
| 275 | } | ||
| 276 | |||
| 277 | void resv_map_release(struct kref *ref) | ||
| 278 | { | ||
| 279 | struct resv_map *resv_map = container_of(ref, struct resv_map, refs); | ||
| 280 | |||
| 281 | /* Clear out any active regions before we release the map. */ | ||
| 282 | region_truncate(&resv_map->regions, 0); | ||
| 283 | kfree(resv_map); | ||
| 284 | } | ||
| 285 | |||
| 286 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | ||
| 287 | { | ||
| 288 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
| 289 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 290 | return (struct resv_map *)(get_vma_private_data(vma) & | ||
| 291 | ~HPAGE_RESV_MASK); | ||
| 292 | return 0; | ||
| 293 | } | ||
| 294 | |||
| 295 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | ||
| 296 | { | ||
| 297 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
| 298 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | ||
| 299 | |||
| 300 | set_vma_private_data(vma, (get_vma_private_data(vma) & | ||
| 301 | HPAGE_RESV_MASK) | (unsigned long)map); | ||
| 302 | } | ||
| 303 | |||
| 304 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) | ||
| 305 | { | ||
| 306 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
| 307 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | ||
| 308 | |||
| 309 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); | ||
| 310 | } | ||
| 311 | |||
| 312 | static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | ||
| 313 | { | ||
| 314 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
| 315 | |||
| 316 | return (get_vma_private_data(vma) & flag) != 0; | ||
| 317 | } | ||
| 318 | |||
| 319 | /* Decrement the reserved pages in the hugepage pool by one */ | ||
| 320 | static void decrement_hugepage_resv_vma(struct hstate *h, | ||
| 321 | struct vm_area_struct *vma) | ||
| 322 | { | ||
| 323 | if (vma->vm_flags & VM_NORESERVE) | ||
| 324 | return; | ||
| 325 | |||
| 326 | if (vma->vm_flags & VM_SHARED) { | ||
| 327 | /* Shared mappings always use reserves */ | ||
| 328 | h->resv_huge_pages--; | ||
| 329 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | ||
| 330 | /* | ||
| 331 | * Only the process that called mmap() has reserves for | ||
| 332 | * private mappings. | ||
| 333 | */ | ||
| 334 | h->resv_huge_pages--; | ||
| 335 | } | ||
| 336 | } | ||
| 337 | |||
| 338 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ | ||
| 339 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | ||
| 340 | { | ||
| 341 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | ||
| 342 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 343 | vma->vm_private_data = (void *)0; | ||
| 344 | } | ||
| 345 | |||
| 346 | /* Returns true if the VMA has associated reserve pages */ | ||
| 347 | static int vma_has_reserves(struct vm_area_struct *vma) | ||
| 348 | { | ||
| 349 | if (vma->vm_flags & VM_SHARED) | ||
| 350 | return 1; | ||
| 351 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | ||
| 352 | return 1; | ||
| 353 | return 0; | ||
| 354 | } | ||
| 355 | |||
| 356 | static void clear_huge_page(struct page *page, | ||
| 357 | unsigned long addr, unsigned long sz) | ||
| 44 | { | 358 | { |
| 45 | int i; | 359 | int i; |
| 46 | 360 | ||
| 47 | might_sleep(); | 361 | might_sleep(); |
| 48 | for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { | 362 | for (i = 0; i < sz/PAGE_SIZE; i++) { |
| 49 | cond_resched(); | 363 | cond_resched(); |
| 50 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); | 364 | clear_user_highpage(page + i, addr + i * PAGE_SIZE); |
| 51 | } | 365 | } |
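The region_*() helpers introduced above keep a merged list of [from, to) intervals per mapping, so that region_chg() can report how many new huge-page reservations a range needs and region_add() can commit them once pages are instantiated. The accounting they implement can be shown with a much simpler one-flag-per-offset model; the sketch below models only the bookkeeping, not the kernel's interval list.

#include <stdbool.h>
#include <stdio.h>

#define MAX_OFFSETS 1024

struct resv_map_model {
	bool reserved[MAX_OFFSETS];	/* one flag per huge-page offset */
};

/* How many additional reservations would [from, to) consume? */
static long reserve_chg(const struct resv_map_model *map, long from, long to)
{
	long off, chg = 0;

	for (off = from; off < to; off++)
		if (!map->reserved[off])
			chg++;
	return chg;
}

/* Commit the reservation for [from, to). */
static void reserve_add(struct resv_map_model *map, long from, long to)
{
	long off;

	for (off = from; off < to; off++)
		map->reserved[off] = true;
}

int main(void)
{
	struct resv_map_model map = { { false } };

	reserve_add(&map, 0, 4);			/* first mapping */
	printf("%ld\n", reserve_chg(&map, 2, 6));	/* overlap: only 2 new */
	return 0;
}

The interval list exists so the map stays small for large, sparse mappings; the flat array above trades that away for clarity.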
| @@ -55,42 +369,44 @@ static void copy_huge_page(struct page *dst, struct page *src, | |||
| 55 | unsigned long addr, struct vm_area_struct *vma) | 369 | unsigned long addr, struct vm_area_struct *vma) |
| 56 | { | 370 | { |
| 57 | int i; | 371 | int i; |
| 372 | struct hstate *h = hstate_vma(vma); | ||
| 58 | 373 | ||
| 59 | might_sleep(); | 374 | might_sleep(); |
| 60 | for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { | 375 | for (i = 0; i < pages_per_huge_page(h); i++) { |
| 61 | cond_resched(); | 376 | cond_resched(); |
| 62 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); | 377 | copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); |
| 63 | } | 378 | } |
| 64 | } | 379 | } |
| 65 | 380 | ||
| 66 | static void enqueue_huge_page(struct page *page) | 381 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
| 67 | { | 382 | { |
| 68 | int nid = page_to_nid(page); | 383 | int nid = page_to_nid(page); |
| 69 | list_add(&page->lru, &hugepage_freelists[nid]); | 384 | list_add(&page->lru, &h->hugepage_freelists[nid]); |
| 70 | free_huge_pages++; | 385 | h->free_huge_pages++; |
| 71 | free_huge_pages_node[nid]++; | 386 | h->free_huge_pages_node[nid]++; |
| 72 | } | 387 | } |
| 73 | 388 | ||
| 74 | static struct page *dequeue_huge_page(void) | 389 | static struct page *dequeue_huge_page(struct hstate *h) |
| 75 | { | 390 | { |
| 76 | int nid; | 391 | int nid; |
| 77 | struct page *page = NULL; | 392 | struct page *page = NULL; |
| 78 | 393 | ||
| 79 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { | 394 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { |
| 80 | if (!list_empty(&hugepage_freelists[nid])) { | 395 | if (!list_empty(&h->hugepage_freelists[nid])) { |
| 81 | page = list_entry(hugepage_freelists[nid].next, | 396 | page = list_entry(h->hugepage_freelists[nid].next, |
| 82 | struct page, lru); | 397 | struct page, lru); |
| 83 | list_del(&page->lru); | 398 | list_del(&page->lru); |
| 84 | free_huge_pages--; | 399 | h->free_huge_pages--; |
| 85 | free_huge_pages_node[nid]--; | 400 | h->free_huge_pages_node[nid]--; |
| 86 | break; | 401 | break; |
| 87 | } | 402 | } |
| 88 | } | 403 | } |
| 89 | return page; | 404 | return page; |
| 90 | } | 405 | } |
| 91 | 406 | ||
| 92 | static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | 407 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
| 93 | unsigned long address) | 408 | struct vm_area_struct *vma, |
| 409 | unsigned long address, int avoid_reserve) | ||
| 94 | { | 410 | { |
| 95 | int nid; | 411 | int nid; |
| 96 | struct page *page = NULL; | 412 | struct page *page = NULL; |
| @@ -101,18 +417,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
| 101 | struct zone *zone; | 417 | struct zone *zone; |
| 102 | struct zoneref *z; | 418 | struct zoneref *z; |
| 103 | 419 | ||
| 420 | /* | ||
| 421 | * A child process with MAP_PRIVATE mappings created by its parent | ||
| 422 | * has no page reserves. This check ensures that reservations are | ||
| 423 | * not "stolen". The child may still get SIGKILLed. | ||
| 424 | */ | ||
| 425 | if (!vma_has_reserves(vma) && | ||
| 426 | h->free_huge_pages - h->resv_huge_pages == 0) | ||
| 427 | return NULL; | ||
| 428 | |||
| 429 | /* If reserves cannot be used, ensure enough pages are in the pool */ | ||
| 430 | if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) | ||
| 431 | return NULL; | ||
| 432 | |||
| 104 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 433 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
| 105 | MAX_NR_ZONES - 1, nodemask) { | 434 | MAX_NR_ZONES - 1, nodemask) { |
| 106 | nid = zone_to_nid(zone); | 435 | nid = zone_to_nid(zone); |
| 107 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && | 436 | if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && |
| 108 | !list_empty(&hugepage_freelists[nid])) { | 437 | !list_empty(&h->hugepage_freelists[nid])) { |
| 109 | page = list_entry(hugepage_freelists[nid].next, | 438 | page = list_entry(h->hugepage_freelists[nid].next, |
| 110 | struct page, lru); | 439 | struct page, lru); |
| 111 | list_del(&page->lru); | 440 | list_del(&page->lru); |
| 112 | free_huge_pages--; | 441 | h->free_huge_pages--; |
| 113 | free_huge_pages_node[nid]--; | 442 | h->free_huge_pages_node[nid]--; |
| 114 | if (vma && vma->vm_flags & VM_MAYSHARE) | 443 | |
| 115 | resv_huge_pages--; | 444 | if (!avoid_reserve) |
| 445 | decrement_hugepage_resv_vma(h, vma); | ||
| 446 | |||
| 116 | break; | 447 | break; |
| 117 | } | 448 | } |
| 118 | } | 449 | } |
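dequeue_huge_page_vma() now refuses to hand out the last reserved pages to callers that are not entitled to them, which is what keeps a child of a MAP_PRIVATE mapper from consuming its parent's reservation. The decision reduces to comparing the free and reserved counters; the sketch below uses illustrative names and collapses the free-list walk into a single check.

#include <stdbool.h>

struct pool_model {
	unsigned long free_huge_pages;
	unsigned long resv_huge_pages;
};

static bool may_dequeue(const struct pool_model *p,
			bool vma_has_reserves, bool avoid_reserve)
{
	unsigned long unreserved = p->free_huge_pages - p->resv_huge_pages;

	/* Callers without a reservation may only take unreserved pages. */
	if (!vma_has_reserves && unreserved == 0)
		return false;
	/* Callers told to avoid reserves get the same treatment. */
	if (avoid_reserve && unreserved == 0)
		return false;
	/* The kernel discovers this by finding the free lists empty. */
	return p->free_huge_pages > 0;
}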
| @@ -120,12 +451,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, | |||
| 120 | return page; | 451 | return page; |
| 121 | } | 452 | } |
| 122 | 453 | ||
| 123 | static void update_and_free_page(struct page *page) | 454 | static void update_and_free_page(struct hstate *h, struct page *page) |
| 124 | { | 455 | { |
| 125 | int i; | 456 | int i; |
| 126 | nr_huge_pages--; | 457 | |
| 127 | nr_huge_pages_node[page_to_nid(page)]--; | 458 | h->nr_huge_pages--; |
| 128 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | 459 | h->nr_huge_pages_node[page_to_nid(page)]--; |
| 460 | for (i = 0; i < pages_per_huge_page(h); i++) { | ||
| 129 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | 461 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | |
| 130 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | 462 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | |
| 131 | 1 << PG_private | 1<< PG_writeback); | 463 | 1 << PG_private | 1<< PG_writeback); |
| @@ -133,11 +465,27 @@ static void update_and_free_page(struct page *page) | |||
| 133 | set_compound_page_dtor(page, NULL); | 465 | set_compound_page_dtor(page, NULL); |
| 134 | set_page_refcounted(page); | 466 | set_page_refcounted(page); |
| 135 | arch_release_hugepage(page); | 467 | arch_release_hugepage(page); |
| 136 | __free_pages(page, HUGETLB_PAGE_ORDER); | 468 | __free_pages(page, huge_page_order(h)); |
| 469 | } | ||
| 470 | |||
| 471 | struct hstate *size_to_hstate(unsigned long size) | ||
| 472 | { | ||
| 473 | struct hstate *h; | ||
| 474 | |||
| 475 | for_each_hstate(h) { | ||
| 476 | if (huge_page_size(h) == size) | ||
| 477 | return h; | ||
| 478 | } | ||
| 479 | return NULL; | ||
| 137 | } | 480 | } |
| 138 | 481 | ||
| 139 | static void free_huge_page(struct page *page) | 482 | static void free_huge_page(struct page *page) |
| 140 | { | 483 | { |
| 484 | /* | ||
| 485 | * Can't pass hstate in here because it is called from the | ||
| 486 | * compound page destructor. | ||
| 487 | */ | ||
| 488 | struct hstate *h = page_hstate(page); | ||
| 141 | int nid = page_to_nid(page); | 489 | int nid = page_to_nid(page); |
| 142 | struct address_space *mapping; | 490 | struct address_space *mapping; |
| 143 | 491 | ||
| @@ -147,12 +495,12 @@ static void free_huge_page(struct page *page) | |||
| 147 | INIT_LIST_HEAD(&page->lru); | 495 | INIT_LIST_HEAD(&page->lru); |
| 148 | 496 | ||
| 149 | spin_lock(&hugetlb_lock); | 497 | spin_lock(&hugetlb_lock); |
| 150 | if (surplus_huge_pages_node[nid]) { | 498 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
| 151 | update_and_free_page(page); | 499 | update_and_free_page(h, page); |
| 152 | surplus_huge_pages--; | 500 | h->surplus_huge_pages--; |
| 153 | surplus_huge_pages_node[nid]--; | 501 | h->surplus_huge_pages_node[nid]--; |
| 154 | } else { | 502 | } else { |
| 155 | enqueue_huge_page(page); | 503 | enqueue_huge_page(h, page); |
| 156 | } | 504 | } |
| 157 | spin_unlock(&hugetlb_lock); | 505 | spin_unlock(&hugetlb_lock); |
| 158 | if (mapping) | 506 | if (mapping) |
| @@ -164,7 +512,7 @@ static void free_huge_page(struct page *page) | |||
| 164 | * balanced by operating on them in a round-robin fashion. | 512 | * balanced by operating on them in a round-robin fashion. |
| 165 | * Returns 1 if an adjustment was made. | 513 | * Returns 1 if an adjustment was made. |
| 166 | */ | 514 | */ |
| 167 | static int adjust_pool_surplus(int delta) | 515 | static int adjust_pool_surplus(struct hstate *h, int delta) |
| 168 | { | 516 | { |
| 169 | static int prev_nid; | 517 | static int prev_nid; |
| 170 | int nid = prev_nid; | 518 | int nid = prev_nid; |
| @@ -177,15 +525,15 @@ static int adjust_pool_surplus(int delta) | |||
| 177 | nid = first_node(node_online_map); | 525 | nid = first_node(node_online_map); |
| 178 | 526 | ||
| 179 | /* To shrink on this node, there must be a surplus page */ | 527 | /* To shrink on this node, there must be a surplus page */ |
| 180 | if (delta < 0 && !surplus_huge_pages_node[nid]) | 528 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) |
| 181 | continue; | 529 | continue; |
| 182 | /* Surplus cannot exceed the total number of pages */ | 530 | /* Surplus cannot exceed the total number of pages */ |
| 183 | if (delta > 0 && surplus_huge_pages_node[nid] >= | 531 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= |
| 184 | nr_huge_pages_node[nid]) | 532 | h->nr_huge_pages_node[nid]) |
| 185 | continue; | 533 | continue; |
| 186 | 534 | ||
| 187 | surplus_huge_pages += delta; | 535 | h->surplus_huge_pages += delta; |
| 188 | surplus_huge_pages_node[nid] += delta; | 536 | h->surplus_huge_pages_node[nid] += delta; |
| 189 | ret = 1; | 537 | ret = 1; |
| 190 | break; | 538 | break; |
| 191 | } while (nid != prev_nid); | 539 | } while (nid != prev_nid); |
| @@ -194,59 +542,74 @@ static int adjust_pool_surplus(int delta) | |||
| 194 | return ret; | 542 | return ret; |
| 195 | } | 543 | } |
| 196 | 544 | ||
| 197 | static struct page *alloc_fresh_huge_page_node(int nid) | 545 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
| 546 | { | ||
| 547 | set_compound_page_dtor(page, free_huge_page); | ||
| 548 | spin_lock(&hugetlb_lock); | ||
| 549 | h->nr_huge_pages++; | ||
| 550 | h->nr_huge_pages_node[nid]++; | ||
| 551 | spin_unlock(&hugetlb_lock); | ||
| 552 | put_page(page); /* free it into the hugepage allocator */ | ||
| 553 | } | ||
| 554 | |||
| 555 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | ||
| 198 | { | 556 | { |
| 199 | struct page *page; | 557 | struct page *page; |
| 200 | 558 | ||
| 559 | if (h->order >= MAX_ORDER) | ||
| 560 | return NULL; | ||
| 561 | |||
| 201 | page = alloc_pages_node(nid, | 562 | page = alloc_pages_node(nid, |
| 202 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 563 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
| 203 | __GFP_REPEAT|__GFP_NOWARN, | 564 | __GFP_REPEAT|__GFP_NOWARN, |
| 204 | HUGETLB_PAGE_ORDER); | 565 | huge_page_order(h)); |
| 205 | if (page) { | 566 | if (page) { |
| 206 | if (arch_prepare_hugepage(page)) { | 567 | if (arch_prepare_hugepage(page)) { |
| 207 | __free_pages(page, HUGETLB_PAGE_ORDER); | 568 | __free_pages(page, huge_page_order(h)); |
| 208 | return NULL; | 569 | return NULL; |
| 209 | } | 570 | } |
| 210 | set_compound_page_dtor(page, free_huge_page); | 571 | prep_new_huge_page(h, page, nid); |
| 211 | spin_lock(&hugetlb_lock); | ||
| 212 | nr_huge_pages++; | ||
| 213 | nr_huge_pages_node[nid]++; | ||
| 214 | spin_unlock(&hugetlb_lock); | ||
| 215 | put_page(page); /* free it into the hugepage allocator */ | ||
| 216 | } | 572 | } |
| 217 | 573 | ||
| 218 | return page; | 574 | return page; |
| 219 | } | 575 | } |
| 220 | 576 | ||
| 221 | static int alloc_fresh_huge_page(void) | 577 | /* |
| 578 | * Use a helper variable to find the next node and then | ||
| 579 | * copy it back to hugetlb_next_nid afterwards: | ||
| 580 | * otherwise there's a window in which a racer might | ||
| 581 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
| 582 | * But we don't need to use a spin_lock here: it really | ||
| 583 | * doesn't matter if occasionally a racer chooses the | ||
| 584 | * same nid as we do. Move nid forward in the mask even | ||
| 585 | * if we just successfully allocated a hugepage so that | ||
| 586 | * the next caller gets hugepages on the next node. | ||
| 587 | */ | ||
| 588 | static int hstate_next_node(struct hstate *h) | ||
| 589 | { | ||
| 590 | int next_nid; | ||
| 591 | next_nid = next_node(h->hugetlb_next_nid, node_online_map); | ||
| 592 | if (next_nid == MAX_NUMNODES) | ||
| 593 | next_nid = first_node(node_online_map); | ||
| 594 | h->hugetlb_next_nid = next_nid; | ||
| 595 | return next_nid; | ||
| 596 | } | ||
| 597 | |||
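The block comment above explains why the successor node is computed into a local variable before being published: a concurrent allocator must never read MAX_NUMNODES out of hugetlb_next_nid. A minimal user-space sketch of the same wrap-around walk, assuming four always-online nodes instead of the kernel's node_online_map (the *_model names are made up for illustration):

#include <stdio.h>

#define NR_NODES 4                     /* stand-in for the online nodemask */

static int hugetlb_next_nid;           /* models h->hugetlb_next_nid */

/* Advance the round-robin cursor; only ever publish a valid node id. */
static int hstate_next_node_model(void)
{
        int next_nid = hugetlb_next_nid + 1;   /* compute locally first */

        if (next_nid == NR_NODES)              /* wrap past the last node */
                next_nid = 0;
        hugetlb_next_nid = next_nid;           /* publish only a valid value */
        return next_nid;
}

int main(void)
{
        for (int i = 0; i < 6; i++)
                printf("next allocation targets node %d\n",
                       hstate_next_node_model());
        return 0;
}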
| 598 | static int alloc_fresh_huge_page(struct hstate *h) | ||
| 222 | { | 599 | { |
| 223 | struct page *page; | 600 | struct page *page; |
| 224 | int start_nid; | 601 | int start_nid; |
| 225 | int next_nid; | 602 | int next_nid; |
| 226 | int ret = 0; | 603 | int ret = 0; |
| 227 | 604 | ||
| 228 | start_nid = hugetlb_next_nid; | 605 | start_nid = h->hugetlb_next_nid; |
| 229 | 606 | ||
| 230 | do { | 607 | do { |
| 231 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | 608 | page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); |
| 232 | if (page) | 609 | if (page) |
| 233 | ret = 1; | 610 | ret = 1; |
| 234 | /* | 611 | next_nid = hstate_next_node(h); |
| 235 | * Use a helper variable to find the next node and then | 612 | } while (!page && h->hugetlb_next_nid != start_nid); |
| 236 | * copy it back to hugetlb_next_nid afterwards: | ||
| 237 | * otherwise there's a window in which a racer might | ||
| 238 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
| 239 | * But we don't need to use a spin_lock here: it really | ||
| 240 | * doesn't matter if occasionally a racer chooses the | ||
| 241 | * same nid as we do. Move nid forward in the mask even | ||
| 242 | * if we just successfully allocated a hugepage so that | ||
| 243 | * the next caller gets hugepages on the next node. | ||
| 244 | */ | ||
| 245 | next_nid = next_node(hugetlb_next_nid, node_online_map); | ||
| 246 | if (next_nid == MAX_NUMNODES) | ||
| 247 | next_nid = first_node(node_online_map); | ||
| 248 | hugetlb_next_nid = next_nid; | ||
| 249 | } while (!page && hugetlb_next_nid != start_nid); | ||
| 250 | 613 | ||
| 251 | if (ret) | 614 | if (ret) |
| 252 | count_vm_event(HTLB_BUDDY_PGALLOC); | 615 | count_vm_event(HTLB_BUDDY_PGALLOC); |
| @@ -256,12 +619,15 @@ static int alloc_fresh_huge_page(void) | |||
| 256 | return ret; | 619 | return ret; |
| 257 | } | 620 | } |
| 258 | 621 | ||
| 259 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | 622 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
| 260 | unsigned long address) | 623 | struct vm_area_struct *vma, unsigned long address) |
| 261 | { | 624 | { |
| 262 | struct page *page; | 625 | struct page *page; |
| 263 | unsigned int nid; | 626 | unsigned int nid; |
| 264 | 627 | ||
| 628 | if (h->order >= MAX_ORDER) | ||
| 629 | return NULL; | ||
| 630 | |||
| 265 | /* | 631 | /* |
| 266 | * Assume we will successfully allocate the surplus page to | 632 | * Assume we will successfully allocate the surplus page to |
| 267 | * prevent racing processes from causing the surplus to exceed | 633 | * prevent racing processes from causing the surplus to exceed |
| @@ -286,18 +652,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
| 286 | * per-node value is checked there. | 652 | * per-node value is checked there. |
| 287 | */ | 653 | */ |
| 288 | spin_lock(&hugetlb_lock); | 654 | spin_lock(&hugetlb_lock); |
| 289 | if (surplus_huge_pages >= nr_overcommit_huge_pages) { | 655 | if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { |
| 290 | spin_unlock(&hugetlb_lock); | 656 | spin_unlock(&hugetlb_lock); |
| 291 | return NULL; | 657 | return NULL; |
| 292 | } else { | 658 | } else { |
| 293 | nr_huge_pages++; | 659 | h->nr_huge_pages++; |
| 294 | surplus_huge_pages++; | 660 | h->surplus_huge_pages++; |
| 295 | } | 661 | } |
| 296 | spin_unlock(&hugetlb_lock); | 662 | spin_unlock(&hugetlb_lock); |
| 297 | 663 | ||
| 298 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| | 664 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP| |
| 299 | __GFP_REPEAT|__GFP_NOWARN, | 665 | __GFP_REPEAT|__GFP_NOWARN, |
| 300 | HUGETLB_PAGE_ORDER); | 666 | huge_page_order(h)); |
| 667 | |||
| 668 | if (page && arch_prepare_hugepage(page)) { | ||
| 669 | __free_pages(page, huge_page_order(h)); | ||
| 670 | return NULL; | ||
| 671 | } | ||
| 301 | 672 | ||
| 302 | spin_lock(&hugetlb_lock); | 673 | spin_lock(&hugetlb_lock); |
| 303 | if (page) { | 674 | if (page) { |
| @@ -312,12 +683,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
| 312 | /* | 683 | /* |
| 313 | * We incremented the global counters already | 684 | * We incremented the global counters already |
| 314 | */ | 685 | */ |
| 315 | nr_huge_pages_node[nid]++; | 686 | h->nr_huge_pages_node[nid]++; |
| 316 | surplus_huge_pages_node[nid]++; | 687 | h->surplus_huge_pages_node[nid]++; |
| 317 | __count_vm_event(HTLB_BUDDY_PGALLOC); | 688 | __count_vm_event(HTLB_BUDDY_PGALLOC); |
| 318 | } else { | 689 | } else { |
| 319 | nr_huge_pages--; | 690 | h->nr_huge_pages--; |
| 320 | surplus_huge_pages--; | 691 | h->surplus_huge_pages--; |
| 321 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); | 692 | __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); |
| 322 | } | 693 | } |
| 323 | spin_unlock(&hugetlb_lock); | 694 | spin_unlock(&hugetlb_lock); |
| @@ -329,16 +700,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | |||
| 329 | * Increase the hugetlb pool such that it can accommodate a reservation | 700 | * Increase the hugetlb pool such that it can accommodate a reservation |
| 330 | * of size 'delta'. | 701 | * of size 'delta'. |
| 331 | */ | 702 | */ |
| 332 | static int gather_surplus_pages(int delta) | 703 | static int gather_surplus_pages(struct hstate *h, int delta) |
| 333 | { | 704 | { |
| 334 | struct list_head surplus_list; | 705 | struct list_head surplus_list; |
| 335 | struct page *page, *tmp; | 706 | struct page *page, *tmp; |
| 336 | int ret, i; | 707 | int ret, i; |
| 337 | int needed, allocated; | 708 | int needed, allocated; |
| 338 | 709 | ||
| 339 | needed = (resv_huge_pages + delta) - free_huge_pages; | 710 | needed = (h->resv_huge_pages + delta) - h->free_huge_pages; |
| 340 | if (needed <= 0) { | 711 | if (needed <= 0) { |
| 341 | resv_huge_pages += delta; | 712 | h->resv_huge_pages += delta; |
| 342 | return 0; | 713 | return 0; |
| 343 | } | 714 | } |
| 344 | 715 | ||
| @@ -349,7 +720,7 @@ static int gather_surplus_pages(int delta) | |||
| 349 | retry: | 720 | retry: |
| 350 | spin_unlock(&hugetlb_lock); | 721 | spin_unlock(&hugetlb_lock); |
| 351 | for (i = 0; i < needed; i++) { | 722 | for (i = 0; i < needed; i++) { |
| 352 | page = alloc_buddy_huge_page(NULL, 0); | 723 | page = alloc_buddy_huge_page(h, NULL, 0); |
| 353 | if (!page) { | 724 | if (!page) { |
| 354 | /* | 725 | /* |
| 355 | * We were not able to allocate enough pages to | 726 | * We were not able to allocate enough pages to |
| @@ -370,7 +741,8 @@ retry: | |||
| 370 | * because either resv_huge_pages or free_huge_pages may have changed. | 741 | * because either resv_huge_pages or free_huge_pages may have changed. |
| 371 | */ | 742 | */ |
| 372 | spin_lock(&hugetlb_lock); | 743 | spin_lock(&hugetlb_lock); |
| 373 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | 744 | needed = (h->resv_huge_pages + delta) - |
| 745 | (h->free_huge_pages + allocated); | ||
| 374 | if (needed > 0) | 746 | if (needed > 0) |
| 375 | goto retry; | 747 | goto retry; |
| 376 | 748 | ||
| @@ -383,7 +755,7 @@ retry: | |||
| 383 | * before they are reserved. | 755 | * before they are reserved. |
| 384 | */ | 756 | */ |
| 385 | needed += allocated; | 757 | needed += allocated; |
| 386 | resv_huge_pages += delta; | 758 | h->resv_huge_pages += delta; |
| 387 | ret = 0; | 759 | ret = 0; |
| 388 | free: | 760 | free: |
| 389 | /* Free the needed pages to the hugetlb pool */ | 761 | /* Free the needed pages to the hugetlb pool */ |
| @@ -391,7 +763,7 @@ free: | |||
| 391 | if ((--needed) < 0) | 763 | if ((--needed) < 0) |
| 392 | break; | 764 | break; |
| 393 | list_del(&page->lru); | 765 | list_del(&page->lru); |
| 394 | enqueue_huge_page(page); | 766 | enqueue_huge_page(h, page); |
| 395 | } | 767 | } |
| 396 | 768 | ||
| 397 | /* Free unnecessary surplus pages to the buddy allocator */ | 769 | /* Free unnecessary surplus pages to the buddy allocator */ |
| @@ -419,7 +791,8 @@ free: | |||
| 419 | * allocated to satisfy the reservation must be explicitly freed if they were | 791 | * allocated to satisfy the reservation must be explicitly freed if they were |
| 420 | * never used. | 792 | * never used. |
| 421 | */ | 793 | */ |
| 422 | static void return_unused_surplus_pages(unsigned long unused_resv_pages) | 794 | static void return_unused_surplus_pages(struct hstate *h, |
| 795 | unsigned long unused_resv_pages) | ||
| 423 | { | 796 | { |
| 424 | static int nid = -1; | 797 | static int nid = -1; |
| 425 | struct page *page; | 798 | struct page *page; |
| @@ -434,157 +807,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages) | |||
| 434 | unsigned long remaining_iterations = num_online_nodes(); | 807 | unsigned long remaining_iterations = num_online_nodes(); |
| 435 | 808 | ||
| 436 | /* Uncommit the reservation */ | 809 | /* Uncommit the reservation */ |
| 437 | resv_huge_pages -= unused_resv_pages; | 810 | h->resv_huge_pages -= unused_resv_pages; |
| 438 | 811 | ||
| 439 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | 812 | /* Cannot return gigantic pages currently */ |
| 813 | if (h->order >= MAX_ORDER) | ||
| 814 | return; | ||
| 815 | |||
| 816 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | ||
| 440 | 817 | ||
| 441 | while (remaining_iterations-- && nr_pages) { | 818 | while (remaining_iterations-- && nr_pages) { |
| 442 | nid = next_node(nid, node_online_map); | 819 | nid = next_node(nid, node_online_map); |
| 443 | if (nid == MAX_NUMNODES) | 820 | if (nid == MAX_NUMNODES) |
| 444 | nid = first_node(node_online_map); | 821 | nid = first_node(node_online_map); |
| 445 | 822 | ||
| 446 | if (!surplus_huge_pages_node[nid]) | 823 | if (!h->surplus_huge_pages_node[nid]) |
| 447 | continue; | 824 | continue; |
| 448 | 825 | ||
| 449 | if (!list_empty(&hugepage_freelists[nid])) { | 826 | if (!list_empty(&h->hugepage_freelists[nid])) { |
| 450 | page = list_entry(hugepage_freelists[nid].next, | 827 | page = list_entry(h->hugepage_freelists[nid].next, |
| 451 | struct page, lru); | 828 | struct page, lru); |
| 452 | list_del(&page->lru); | 829 | list_del(&page->lru); |
| 453 | update_and_free_page(page); | 830 | update_and_free_page(h, page); |
| 454 | free_huge_pages--; | 831 | h->free_huge_pages--; |
| 455 | free_huge_pages_node[nid]--; | 832 | h->free_huge_pages_node[nid]--; |
| 456 | surplus_huge_pages--; | 833 | h->surplus_huge_pages--; |
| 457 | surplus_huge_pages_node[nid]--; | 834 | h->surplus_huge_pages_node[nid]--; |
| 458 | nr_pages--; | 835 | nr_pages--; |
| 459 | remaining_iterations = num_online_nodes(); | 836 | remaining_iterations = num_online_nodes(); |
| 460 | } | 837 | } |
| 461 | } | 838 | } |
| 462 | } | 839 | } |
| 463 | 840 | ||
| 841 | /* | ||
| 842 | * Determine if the huge page at addr within the vma has an associated | ||
| 843 | * reservation. Where it does not, we will need to logically increase the | ||
| 844 | * reservation and actually increase the quota before an allocation can occur. | ||
| 845 | * Where any new reservation would be required, the reservation change is | ||
| 846 | * prepared, but not committed. Once the page has been quota'd, allocated | ||
| 847 | * and instantiated, the change should be committed via vma_commit_reservation. | ||
| 848 | * No action is required on failure. | ||
| 849 | */ | ||
| 850 | static int vma_needs_reservation(struct hstate *h, | ||
| 851 | struct vm_area_struct *vma, unsigned long addr) | ||
| 852 | { | ||
| 853 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
| 854 | struct inode *inode = mapping->host; | ||
| 855 | |||
| 856 | if (vma->vm_flags & VM_SHARED) { | ||
| 857 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | ||
| 858 | return region_chg(&inode->i_mapping->private_list, | ||
| 859 | idx, idx + 1); | ||
| 860 | |||
| 861 | } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | ||
| 862 | return 1; | ||
| 863 | |||
| 864 | } else { | ||
| 865 | int err; | ||
| 866 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | ||
| 867 | struct resv_map *reservations = vma_resv_map(vma); | ||
| 464 | 868 | ||
| 465 | static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, | 869 | err = region_chg(&reservations->regions, idx, idx + 1); |
| 466 | unsigned long addr) | 870 | if (err < 0) |
| 871 | return err; | ||
| 872 | return 0; | ||
| 873 | } | ||
| 874 | } | ||
| 875 | static void vma_commit_reservation(struct hstate *h, | ||
| 876 | struct vm_area_struct *vma, unsigned long addr) | ||
| 467 | { | 877 | { |
| 468 | struct page *page; | 878 | struct address_space *mapping = vma->vm_file->f_mapping; |
| 879 | struct inode *inode = mapping->host; | ||
| 469 | 880 | ||
| 470 | spin_lock(&hugetlb_lock); | 881 | if (vma->vm_flags & VM_SHARED) { |
| 471 | page = dequeue_huge_page_vma(vma, addr); | 882 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
| 472 | spin_unlock(&hugetlb_lock); | 883 | region_add(&inode->i_mapping->private_list, idx, idx + 1); |
| 473 | return page ? page : ERR_PTR(-VM_FAULT_OOM); | 884 | |
| 885 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | ||
| 886 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | ||
| 887 | struct resv_map *reservations = vma_resv_map(vma); | ||
| 888 | |||
| 889 | /* Mark this page used in the map. */ | ||
| 890 | region_add(&reservations->regions, idx, idx + 1); | ||
| 891 | } | ||
| 474 | } | 892 | } |
| 475 | 893 | ||
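vma_needs_reservation() and vma_commit_reservation() form a prepare/commit pair: the charge is computed (and quota taken) before a page exists, but the reservation map is only updated once the page has actually been instantiated, so a failed allocation needs no rollback of the map. A simplified user-space model of that two-phase pattern (it stands in for the kernel's region_chg()/region_add() bookkeeping; it is not that code):

#include <stdio.h>

static int page_reserved;      /* models "is this index already in the map" */
static long quota_left = 8;

/* Phase 1: how many new pages would this request add to the map? */
static int needs_reservation(void)
{
        return page_reserved ? 0 : 1;
}

/* Phase 2: record the page in the map, only after it exists. */
static void commit_reservation(void)
{
        page_reserved = 1;
}

int main(void)
{
        int chg = needs_reservation();

        if (chg && quota_left < chg)
                return 1;              /* fail before touching any state */
        quota_left -= chg;             /* charge quota up front */
        /* ... allocate and instantiate the huge page here ... */
        commit_reservation();          /* a repeat fault now charges nothing */
        printf("charge=%d quota_left=%ld\n", chg, quota_left);
        return 0;
}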
| 476 | static struct page *alloc_huge_page_private(struct vm_area_struct *vma, | 894 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
| 477 | unsigned long addr) | 895 | unsigned long addr, int avoid_reserve) |
| 478 | { | 896 | { |
| 479 | struct page *page = NULL; | 897 | struct hstate *h = hstate_vma(vma); |
| 898 | struct page *page; | ||
| 899 | struct address_space *mapping = vma->vm_file->f_mapping; | ||
| 900 | struct inode *inode = mapping->host; | ||
| 901 | long chg; | ||
| 480 | 902 | ||
| 481 | if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) | 903 | /* |
| 482 | return ERR_PTR(-VM_FAULT_SIGBUS); | 904 | * Processes that did not create the mapping will have no reserves and |
| 905 | * will not have been accounted against quota. Check that the quota can be | ||
| 906 | * charged before satisfying the allocation. | ||
| 907 | * MAP_NORESERVE mappings may also need pages and quota allocated | ||
| 908 | * if no reserve mapping overlaps. | ||
| 909 | */ | ||
| 910 | chg = vma_needs_reservation(h, vma, addr); | ||
| 911 | if (chg < 0) | ||
| 912 | return ERR_PTR(chg); | ||
| 913 | if (chg) | ||
| 914 | if (hugetlb_get_quota(inode->i_mapping, chg)) | ||
| 915 | return ERR_PTR(-ENOSPC); | ||
| 483 | 916 | ||
| 484 | spin_lock(&hugetlb_lock); | 917 | spin_lock(&hugetlb_lock); |
| 485 | if (free_huge_pages > resv_huge_pages) | 918 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
| 486 | page = dequeue_huge_page_vma(vma, addr); | ||
| 487 | spin_unlock(&hugetlb_lock); | 919 | spin_unlock(&hugetlb_lock); |
| 920 | |||
| 488 | if (!page) { | 921 | if (!page) { |
| 489 | page = alloc_buddy_huge_page(vma, addr); | 922 | page = alloc_buddy_huge_page(h, vma, addr); |
| 490 | if (!page) { | 923 | if (!page) { |
| 491 | hugetlb_put_quota(vma->vm_file->f_mapping, 1); | 924 | hugetlb_put_quota(inode->i_mapping, chg); |
| 492 | return ERR_PTR(-VM_FAULT_OOM); | 925 | return ERR_PTR(-VM_FAULT_OOM); |
| 493 | } | 926 | } |
| 494 | } | 927 | } |
| 928 | |||
| 929 | set_page_refcounted(page); | ||
| 930 | set_page_private(page, (unsigned long) mapping); | ||
| 931 | |||
| 932 | vma_commit_reservation(h, vma, addr); | ||
| 933 | |||
| 495 | return page; | 934 | return page; |
| 496 | } | 935 | } |
| 497 | 936 | ||
| 498 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 937 | __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) |
| 499 | unsigned long addr) | ||
| 500 | { | 938 | { |
| 501 | struct page *page; | 939 | struct huge_bootmem_page *m; |
| 502 | struct address_space *mapping = vma->vm_file->f_mapping; | 940 | int nr_nodes = nodes_weight(node_online_map); |
| 503 | 941 | ||
| 504 | if (vma->vm_flags & VM_MAYSHARE) | 942 | while (nr_nodes) { |
| 505 | page = alloc_huge_page_shared(vma, addr); | 943 | void *addr; |
| 506 | else | 944 | |
| 507 | page = alloc_huge_page_private(vma, addr); | 945 | addr = __alloc_bootmem_node_nopanic( |
| 946 | NODE_DATA(h->hugetlb_next_nid), | ||
| 947 | huge_page_size(h), huge_page_size(h), 0); | ||
| 508 | 948 | ||
| 509 | if (!IS_ERR(page)) { | 949 | if (addr) { |
| 510 | set_page_refcounted(page); | 950 | /* |
| 511 | set_page_private(page, (unsigned long) mapping); | 951 | * Use the beginning of the huge page to store the |
| 952 | * huge_bootmem_page struct (until gather_bootmem | ||
| 953 | * puts them into the mem_map). | ||
| 954 | */ | ||
| 955 | m = addr; | ||
| 956 | if (m) | ||
| 957 | goto found; | ||
| 958 | } | ||
| 959 | hstate_next_node(h); | ||
| 960 | nr_nodes--; | ||
| 512 | } | 961 | } |
| 513 | return page; | 962 | return 0; |
| 963 | |||
| 964 | found: | ||
| 965 | BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); | ||
| 966 | /* Put them into a private list first because mem_map is not up yet */ | ||
| 967 | list_add(&m->list, &huge_boot_pages); | ||
| 968 | m->hstate = h; | ||
| 969 | return 1; | ||
| 514 | } | 970 | } |
| 515 | 971 | ||
| 516 | static int __init hugetlb_init(void) | 972 | /* Put bootmem huge pages into the standard lists after mem_map is up */ |
| 973 | static void __init gather_bootmem_prealloc(void) | ||
| 517 | { | 974 | { |
| 518 | unsigned long i; | 975 | struct huge_bootmem_page *m; |
| 519 | 976 | ||
| 520 | if (HPAGE_SHIFT == 0) | 977 | list_for_each_entry(m, &huge_boot_pages, list) { |
| 521 | return 0; | 978 | struct page *page = virt_to_page(m); |
| 522 | 979 | struct hstate *h = m->hstate; | |
| 523 | for (i = 0; i < MAX_NUMNODES; ++i) | 980 | __ClearPageReserved(page); |
| 524 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 981 | WARN_ON(page_count(page) != 1); |
| 982 | prep_compound_page(page, h->order); | ||
| 983 | prep_new_huge_page(h, page, page_to_nid(page)); | ||
| 984 | } | ||
| 985 | } | ||
| 525 | 986 | ||
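alloc_bootmem_huge_page() has nowhere to put per-page bookkeeping that early in boot, so it stores the huge_bootmem_page descriptor in the first bytes of the huge page itself and strings those descriptors onto huge_boot_pages; gather_bootmem_prealloc() later turns each region into a real compound page. A small stand-alone sketch of that intrusive-descriptor trick, using a made-up boot_desc type and malloc() in place of the bootmem allocator:

#include <stdio.h>
#include <stdlib.h>

struct boot_desc {                     /* hypothetical huge_bootmem_page stand-in */
        struct boot_desc *next;
        size_t size;
};

static struct boot_desc *boot_list;    /* models huge_boot_pages */

/* Allocate a region and keep its descriptor inside the region itself. */
static void *alloc_region(size_t size)
{
        struct boot_desc *d = malloc(size);

        if (!d)
                return NULL;
        d->size = size;                /* descriptor occupies the first bytes */
        d->next = boot_list;
        boot_list = d;
        return d;
}

int main(void)
{
        alloc_region(2UL << 20);
        alloc_region(2UL << 20);
        for (struct boot_desc *d = boot_list; d; d = d->next)
                printf("region of %zu bytes waiting to be initialised\n",
                       d->size);
        return 0;
}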
| 526 | hugetlb_next_nid = first_node(node_online_map); | 987 | static void __init hugetlb_hstate_alloc_pages(struct hstate *h) |
| 988 | { | ||
| 989 | unsigned long i; | ||
| 527 | 990 | ||
| 528 | for (i = 0; i < max_huge_pages; ++i) { | 991 | for (i = 0; i < h->max_huge_pages; ++i) { |
| 529 | if (!alloc_fresh_huge_page()) | 992 | if (h->order >= MAX_ORDER) { |
| 993 | if (!alloc_bootmem_huge_page(h)) | ||
| 994 | break; | ||
| 995 | } else if (!alloc_fresh_huge_page(h)) | ||
| 530 | break; | 996 | break; |
| 531 | } | 997 | } |
| 532 | max_huge_pages = free_huge_pages = nr_huge_pages = i; | 998 | h->max_huge_pages = i; |
| 533 | printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); | ||
| 534 | return 0; | ||
| 535 | } | 999 | } |
| 536 | module_init(hugetlb_init); | ||
| 537 | 1000 | ||
| 538 | static int __init hugetlb_setup(char *s) | 1001 | static void __init hugetlb_init_hstates(void) |
| 539 | { | 1002 | { |
| 540 | if (sscanf(s, "%lu", &max_huge_pages) <= 0) | 1003 | struct hstate *h; |
| 541 | max_huge_pages = 0; | 1004 | |
| 542 | return 1; | 1005 | for_each_hstate(h) { |
| 1006 | /* oversize hugepages were init'ed in early boot */ | ||
| 1007 | if (h->order < MAX_ORDER) | ||
| 1008 | hugetlb_hstate_alloc_pages(h); | ||
| 1009 | } | ||
| 543 | } | 1010 | } |
| 544 | __setup("hugepages=", hugetlb_setup); | ||
| 545 | 1011 | ||
| 546 | static unsigned int cpuset_mems_nr(unsigned int *array) | 1012 | static char * __init memfmt(char *buf, unsigned long n) |
| 547 | { | 1013 | { |
| 548 | int node; | 1014 | if (n >= (1UL << 30)) |
| 549 | unsigned int nr = 0; | 1015 | sprintf(buf, "%lu GB", n >> 30); |
| 550 | 1016 | else if (n >= (1UL << 20)) | |
| 551 | for_each_node_mask(node, cpuset_current_mems_allowed) | 1017 | sprintf(buf, "%lu MB", n >> 20); |
| 552 | nr += array[node]; | 1018 | else |
| 1019 | sprintf(buf, "%lu KB", n >> 10); | ||
| 1020 | return buf; | ||
| 1021 | } | ||
| 553 | 1022 | ||
| 554 | return nr; | 1023 | static void __init report_hugepages(void) |
| 1024 | { | ||
| 1025 | struct hstate *h; | ||
| 1026 | |||
| 1027 | for_each_hstate(h) { | ||
| 1028 | char buf[32]; | ||
| 1029 | printk(KERN_INFO "HugeTLB registered %s page size, " | ||
| 1030 | "pre-allocated %ld pages\n", | ||
| 1031 | memfmt(buf, huge_page_size(h)), | ||
| 1032 | h->free_huge_pages); | ||
| 1033 | } | ||
| 555 | } | 1034 | } |
| 556 | 1035 | ||
| 557 | #ifdef CONFIG_SYSCTL | ||
| 558 | #ifdef CONFIG_HIGHMEM | 1036 | #ifdef CONFIG_HIGHMEM |
| 559 | static void try_to_free_low(unsigned long count) | 1037 | static void try_to_free_low(struct hstate *h, unsigned long count) |
| 560 | { | 1038 | { |
| 561 | int i; | 1039 | int i; |
| 562 | 1040 | ||
| 1041 | if (h->order >= MAX_ORDER) | ||
| 1042 | return; | ||
| 1043 | |||
| 563 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1044 | for (i = 0; i < MAX_NUMNODES; ++i) { |
| 564 | struct page *page, *next; | 1045 | struct page *page, *next; |
| 565 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 1046 | struct list_head *freel = &h->hugepage_freelists[i]; |
| 566 | if (count >= nr_huge_pages) | 1047 | list_for_each_entry_safe(page, next, freel, lru) { |
| 1048 | if (count >= h->nr_huge_pages) | ||
| 567 | return; | 1049 | return; |
| 568 | if (PageHighMem(page)) | 1050 | if (PageHighMem(page)) |
| 569 | continue; | 1051 | continue; |
| 570 | list_del(&page->lru); | 1052 | list_del(&page->lru); |
| 571 | update_and_free_page(page); | 1053 | update_and_free_page(h, page); |
| 572 | free_huge_pages--; | 1054 | h->free_huge_pages--; |
| 573 | free_huge_pages_node[page_to_nid(page)]--; | 1055 | h->free_huge_pages_node[page_to_nid(page)]--; |
| 574 | } | 1056 | } |
| 575 | } | 1057 | } |
| 576 | } | 1058 | } |
| 577 | #else | 1059 | #else |
| 578 | static inline void try_to_free_low(unsigned long count) | 1060 | static inline void try_to_free_low(struct hstate *h, unsigned long count) |
| 579 | { | 1061 | { |
| 580 | } | 1062 | } |
| 581 | #endif | 1063 | #endif |
| 582 | 1064 | ||
| 583 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | 1065 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
| 584 | static unsigned long set_max_huge_pages(unsigned long count) | 1066 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) |
| 585 | { | 1067 | { |
| 586 | unsigned long min_count, ret; | 1068 | unsigned long min_count, ret; |
| 587 | 1069 | ||
| 1070 | if (h->order >= MAX_ORDER) | ||
| 1071 | return h->max_huge_pages; | ||
| 1072 | |||
| 588 | /* | 1073 | /* |
| 589 | * Increase the pool size | 1074 | * Increase the pool size |
| 590 | * First take pages out of surplus state. Then make up the | 1075 | * First take pages out of surplus state. Then make up the |
| @@ -597,20 +1082,19 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
| 597 | * within all the constraints specified by the sysctls. | 1082 | * within all the constraints specified by the sysctls. |
| 598 | */ | 1083 | */ |
| 599 | spin_lock(&hugetlb_lock); | 1084 | spin_lock(&hugetlb_lock); |
| 600 | while (surplus_huge_pages && count > persistent_huge_pages) { | 1085 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
| 601 | if (!adjust_pool_surplus(-1)) | 1086 | if (!adjust_pool_surplus(h, -1)) |
| 602 | break; | 1087 | break; |
| 603 | } | 1088 | } |
| 604 | 1089 | ||
| 605 | while (count > persistent_huge_pages) { | 1090 | while (count > persistent_huge_pages(h)) { |
| 606 | int ret; | ||
| 607 | /* | 1091 | /* |
| 608 | * If this allocation races such that we no longer need the | 1092 | * If this allocation races such that we no longer need the |
| 609 | * page, free_huge_page will handle it by freeing the page | 1093 | * page, free_huge_page will handle it by freeing the page |
| 610 | * and reducing the surplus. | 1094 | * and reducing the surplus. |
| 611 | */ | 1095 | */ |
| 612 | spin_unlock(&hugetlb_lock); | 1096 | spin_unlock(&hugetlb_lock); |
| 613 | ret = alloc_fresh_huge_page(); | 1097 | ret = alloc_fresh_huge_page(h); |
| 614 | spin_lock(&hugetlb_lock); | 1098 | spin_lock(&hugetlb_lock); |
| 615 | if (!ret) | 1099 | if (!ret) |
| 616 | goto out; | 1100 | goto out; |
| @@ -632,31 +1116,305 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
| 632 | * and won't grow the pool anywhere else. Not until one of the | 1116 | * and won't grow the pool anywhere else. Not until one of the |
| 633 | * sysctls are changed, or the surplus pages go out of use. | 1117 | * sysctls are changed, or the surplus pages go out of use. |
| 634 | */ | 1118 | */ |
| 635 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | 1119 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
| 636 | min_count = max(count, min_count); | 1120 | min_count = max(count, min_count); |
| 637 | try_to_free_low(min_count); | 1121 | try_to_free_low(h, min_count); |
| 638 | while (min_count < persistent_huge_pages) { | 1122 | while (min_count < persistent_huge_pages(h)) { |
| 639 | struct page *page = dequeue_huge_page(); | 1123 | struct page *page = dequeue_huge_page(h); |
| 640 | if (!page) | 1124 | if (!page) |
| 641 | break; | 1125 | break; |
| 642 | update_and_free_page(page); | 1126 | update_and_free_page(h, page); |
| 643 | } | 1127 | } |
| 644 | while (count < persistent_huge_pages) { | 1128 | while (count < persistent_huge_pages(h)) { |
| 645 | if (!adjust_pool_surplus(1)) | 1129 | if (!adjust_pool_surplus(h, 1)) |
| 646 | break; | 1130 | break; |
| 647 | } | 1131 | } |
| 648 | out: | 1132 | out: |
| 649 | ret = persistent_huge_pages; | 1133 | ret = persistent_huge_pages(h); |
| 650 | spin_unlock(&hugetlb_lock); | 1134 | spin_unlock(&hugetlb_lock); |
| 651 | return ret; | 1135 | return ret; |
| 652 | } | 1136 | } |
| 653 | 1137 | ||
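As a worked example of the shrink path above, assume a single online node, no reservations, and 8 persistent huge pages of which 2 are currently mapped by a process, so free_huge_pages is 6 and surplus_huge_pages is 0 (the numbers are illustrative). Writing 1 to nr_hugepages gives min_count = 0 + 8 - 6 = 2, so the dequeue loop frees the 6 idle pages but stops at the 2 busy ones, and the final adjust_pool_surplus(h, 1) pass converts one busy page to surplus so that free_huge_page() returns it to the buddy allocator once its user releases it; the pool then settles at the single requested persistent page.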
| 1138 | #define HSTATE_ATTR_RO(_name) \ | ||
| 1139 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | ||
| 1140 | |||
| 1141 | #define HSTATE_ATTR(_name) \ | ||
| 1142 | static struct kobj_attribute _name##_attr = \ | ||
| 1143 | __ATTR(_name, 0644, _name##_show, _name##_store) | ||
| 1144 | |||
| 1145 | static struct kobject *hugepages_kobj; | ||
| 1146 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
| 1147 | |||
| 1148 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | ||
| 1149 | { | ||
| 1150 | int i; | ||
| 1151 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
| 1152 | if (hstate_kobjs[i] == kobj) | ||
| 1153 | return &hstates[i]; | ||
| 1154 | BUG(); | ||
| 1155 | return NULL; | ||
| 1156 | } | ||
| 1157 | |||
| 1158 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
| 1159 | struct kobj_attribute *attr, char *buf) | ||
| 1160 | { | ||
| 1161 | struct hstate *h = kobj_to_hstate(kobj); | ||
| 1162 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | ||
| 1163 | } | ||
| 1164 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
| 1165 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
| 1166 | { | ||
| 1167 | int err; | ||
| 1168 | unsigned long input; | ||
| 1169 | struct hstate *h = kobj_to_hstate(kobj); | ||
| 1170 | |||
| 1171 | err = strict_strtoul(buf, 10, &input); | ||
| 1172 | if (err) | ||
| 1173 | return 0; | ||
| 1174 | |||
| 1175 | h->max_huge_pages = set_max_huge_pages(h, input); | ||
| 1176 | |||
| 1177 | return count; | ||
| 1178 | } | ||
| 1179 | HSTATE_ATTR(nr_hugepages); | ||
| 1180 | |||
| 1181 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | ||
| 1182 | struct kobj_attribute *attr, char *buf) | ||
| 1183 | { | ||
| 1184 | struct hstate *h = kobj_to_hstate(kobj); | ||
| 1185 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | ||
| 1186 | } | ||
| 1187 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | ||
| 1188 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
| 1189 | { | ||
| 1190 | int err; | ||
| 1191 | unsigned long input; | ||
| 1192 | struct hstate *h = kobj_to_hstate(kobj); | ||
| 1193 | |||
| 1194 | err = strict_strtoul(buf, 10, &input); | ||
| 1195 | if (err) | ||
| 1196 | return 0; | ||
| 1197 | |||
| 1198 | spin_lock(&hugetlb_lock); | ||
| 1199 | h->nr_overcommit_huge_pages = input; | ||
| 1200 | spin_unlock(&hugetlb_lock); | ||
| 1201 | |||
| 1202 | return count; | ||
| 1203 | } | ||
| 1204 | HSTATE_ATTR(nr_overcommit_hugepages); | ||
| 1205 | |||
| 1206 | static ssize_t free_hugepages_show(struct kobject *kobj, | ||
| 1207 | struct kobj_attribute *attr, char *buf) | ||
| 1208 | { | ||
| 1209 | struct hstate *h = kobj_to_hstate(kobj); | ||
| 1210 | return sprintf(buf, "%lu\n", h->free_huge_pages); | ||
| 1211 | } | ||
| 1212 | HSTATE_ATTR_RO(free_hugepages); | ||
| 1213 | |||
| 1214 | static ssize_t resv_hugepages_show(struct kobject *kobj, | ||
| 1215 | struct kobj_attribute *attr, char *buf) | ||
| 1216 | { | ||
| 1217 | struct hstate *h = kobj_to_hstate(kobj); | ||
| 1218 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | ||
| 1219 | } | ||
| 1220 | HSTATE_ATTR_RO(resv_hugepages); | ||
| 1221 | |||
| 1222 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | ||
| 1223 | struct kobj_attribute *attr, char *buf) | ||
| 1224 | { | ||
| 1225 | struct hstate *h = kobj_to_hstate(kobj); | ||
| 1226 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | ||
| 1227 | } | ||
| 1228 | HSTATE_ATTR_RO(surplus_hugepages); | ||
| 1229 | |||
| 1230 | static struct attribute *hstate_attrs[] = { | ||
| 1231 | &nr_hugepages_attr.attr, | ||
| 1232 | &nr_overcommit_hugepages_attr.attr, | ||
| 1233 | &free_hugepages_attr.attr, | ||
| 1234 | &resv_hugepages_attr.attr, | ||
| 1235 | &surplus_hugepages_attr.attr, | ||
| 1236 | NULL, | ||
| 1237 | }; | ||
| 1238 | |||
| 1239 | static struct attribute_group hstate_attr_group = { | ||
| 1240 | .attrs = hstate_attrs, | ||
| 1241 | }; | ||
| 1242 | |||
| 1243 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | ||
| 1244 | { | ||
| 1245 | int retval; | ||
| 1246 | |||
| 1247 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | ||
| 1248 | hugepages_kobj); | ||
| 1249 | if (!hstate_kobjs[h - hstates]) | ||
| 1250 | return -ENOMEM; | ||
| 1251 | |||
| 1252 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | ||
| 1253 | &hstate_attr_group); | ||
| 1254 | if (retval) | ||
| 1255 | kobject_put(hstate_kobjs[h - hstates]); | ||
| 1256 | |||
| 1257 | return retval; | ||
| 1258 | } | ||
| 1259 | |||
| 1260 | static void __init hugetlb_sysfs_init(void) | ||
| 1261 | { | ||
| 1262 | struct hstate *h; | ||
| 1263 | int err; | ||
| 1264 | |||
| 1265 | hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); | ||
| 1266 | if (!hugepages_kobj) | ||
| 1267 | return; | ||
| 1268 | |||
| 1269 | for_each_hstate(h) { | ||
| 1270 | err = hugetlb_sysfs_add_hstate(h); | ||
| 1271 | if (err) | ||
| 1272 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | ||
| 1273 | h->name); | ||
| 1274 | } | ||
| 1275 | } | ||
| 1276 | |||
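With the kobjects above in place, every hstate exports its counters as one sysfs directory per page size. Assuming mm_kobj corresponds to /sys/kernel/mm and a 2 MB default huge page (both are assumptions, not stated in this hunk), the files land under /sys/kernel/mm/hugepages/hugepages-2048kB/ and can be read like any other sysfs attribute:

#include <stdio.h>

/* Illustrative path; the directory name is built from h->name above. */
#define HSTATE_DIR "/sys/kernel/mm/hugepages/hugepages-2048kB/"

static long read_attr(const char *name)
{
        char path[256];
        FILE *f;
        long val = -1;

        snprintf(path, sizeof(path), HSTATE_DIR "%s", name);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        printf("nr_hugepages      = %ld\n", read_attr("nr_hugepages"));
        printf("free_hugepages    = %ld\n", read_attr("free_hugepages"));
        printf("surplus_hugepages = %ld\n", read_attr("surplus_hugepages"));
        return 0;
}

Writing a number into nr_hugepages goes through nr_hugepages_store() and resizes that hstate's pool via set_max_huge_pages().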
| 1277 | static void __exit hugetlb_exit(void) | ||
| 1278 | { | ||
| 1279 | struct hstate *h; | ||
| 1280 | |||
| 1281 | for_each_hstate(h) { | ||
| 1282 | kobject_put(hstate_kobjs[h - hstates]); | ||
| 1283 | } | ||
| 1284 | |||
| 1285 | kobject_put(hugepages_kobj); | ||
| 1286 | } | ||
| 1287 | module_exit(hugetlb_exit); | ||
| 1288 | |||
| 1289 | static int __init hugetlb_init(void) | ||
| 1290 | { | ||
| 1291 | /* Some platforms decide whether they support huge pages at boot | ||
| 1292 | * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when | ||
| 1293 | * there is no such support. | ||
| 1294 | */ | ||
| 1295 | if (HPAGE_SHIFT == 0) | ||
| 1296 | return 0; | ||
| 1297 | |||
| 1298 | if (!size_to_hstate(default_hstate_size)) { | ||
| 1299 | default_hstate_size = HPAGE_SIZE; | ||
| 1300 | if (!size_to_hstate(default_hstate_size)) | ||
| 1301 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | ||
| 1302 | } | ||
| 1303 | default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; | ||
| 1304 | if (default_hstate_max_huge_pages) | ||
| 1305 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | ||
| 1306 | |||
| 1307 | hugetlb_init_hstates(); | ||
| 1308 | |||
| 1309 | gather_bootmem_prealloc(); | ||
| 1310 | |||
| 1311 | report_hugepages(); | ||
| 1312 | |||
| 1313 | hugetlb_sysfs_init(); | ||
| 1314 | |||
| 1315 | return 0; | ||
| 1316 | } | ||
| 1317 | module_init(hugetlb_init); | ||
| 1318 | |||
| 1319 | /* Should be called on processing a hugepagesz=... option */ | ||
| 1320 | void __init hugetlb_add_hstate(unsigned order) | ||
| 1321 | { | ||
| 1322 | struct hstate *h; | ||
| 1323 | unsigned long i; | ||
| 1324 | |||
| 1325 | if (size_to_hstate(PAGE_SIZE << order)) { | ||
| 1326 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | ||
| 1327 | return; | ||
| 1328 | } | ||
| 1329 | BUG_ON(max_hstate >= HUGE_MAX_HSTATE); | ||
| 1330 | BUG_ON(order == 0); | ||
| 1331 | h = &hstates[max_hstate++]; | ||
| 1332 | h->order = order; | ||
| 1333 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | ||
| 1334 | h->nr_huge_pages = 0; | ||
| 1335 | h->free_huge_pages = 0; | ||
| 1336 | for (i = 0; i < MAX_NUMNODES; ++i) | ||
| 1337 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | ||
| 1338 | h->hugetlb_next_nid = first_node(node_online_map); | ||
| 1339 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | ||
| 1340 | huge_page_size(h)/1024); | ||
| 1341 | |||
| 1342 | parsed_hstate = h; | ||
| 1343 | } | ||
| 1344 | |||
| 1345 | static int __init hugetlb_nrpages_setup(char *s) | ||
| 1346 | { | ||
| 1347 | unsigned long *mhp; | ||
| 1348 | static unsigned long *last_mhp; | ||
| 1349 | |||
| 1350 | /* | ||
| 1351 | * !max_hstate means we haven't parsed a hugepagesz= parameter yet, | ||
| 1352 | * so this hugepages= parameter goes to the "default hstate". | ||
| 1353 | */ | ||
| 1354 | if (!max_hstate) | ||
| 1355 | mhp = &default_hstate_max_huge_pages; | ||
| 1356 | else | ||
| 1357 | mhp = &parsed_hstate->max_huge_pages; | ||
| 1358 | |||
| 1359 | if (mhp == last_mhp) { | ||
| 1360 | printk(KERN_WARNING "hugepages= specified twice without " | ||
| 1361 | "interleaving hugepagesz=, ignoring\n"); | ||
| 1362 | return 1; | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | if (sscanf(s, "%lu", mhp) <= 0) | ||
| 1366 | *mhp = 0; | ||
| 1367 | |||
| 1368 | /* | ||
| 1369 | * Global state is always initialized later in hugetlb_init. | ||
| 1370 | * But we need to allocate >= MAX_ORDER hstates here early to still | ||
| 1371 | * use the bootmem allocator. | ||
| 1372 | */ | ||
| 1373 | if (max_hstate && parsed_hstate->order >= MAX_ORDER) | ||
| 1374 | hugetlb_hstate_alloc_pages(parsed_hstate); | ||
| 1375 | |||
| 1376 | last_mhp = mhp; | ||
| 1377 | |||
| 1378 | return 1; | ||
| 1379 | } | ||
| 1380 | __setup("hugepages=", hugetlb_nrpages_setup); | ||
| 1381 | |||
| 1382 | static int __init hugetlb_default_setup(char *s) | ||
| 1383 | { | ||
| 1384 | default_hstate_size = memparse(s, &s); | ||
| 1385 | return 1; | ||
| 1386 | } | ||
| 1387 | __setup("default_hugepagesz=", hugetlb_default_setup); | ||
| 1388 | |||
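Taken together, the three __setup() handlers mean that each hugepages= count binds to the hstate registered by the most recent hugepagesz= (parsed_hstate), or to the default hstate if no size was given first, and that a second hugepages= for the same hstate is rejected. As an illustrative x86_64-style command line (the sizes are an assumption, not something this hunk defines), hugepagesz=1G hugepages=2 hugepagesz=2M hugepages=256 default_hugepagesz=2M would reserve the two 1 GB pages from bootmem right here at parse time, because their order is at least MAX_ORDER, while the 256 2 MB pages are allocated later from the buddy allocator in hugetlb_init_hstates().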
| 1389 | static unsigned int cpuset_mems_nr(unsigned int *array) | ||
| 1390 | { | ||
| 1391 | int node; | ||
| 1392 | unsigned int nr = 0; | ||
| 1393 | |||
| 1394 | for_each_node_mask(node, cpuset_current_mems_allowed) | ||
| 1395 | nr += array[node]; | ||
| 1396 | |||
| 1397 | return nr; | ||
| 1398 | } | ||
| 1399 | |||
| 1400 | #ifdef CONFIG_SYSCTL | ||
| 654 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1401 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
| 655 | struct file *file, void __user *buffer, | 1402 | struct file *file, void __user *buffer, |
| 656 | size_t *length, loff_t *ppos) | 1403 | size_t *length, loff_t *ppos) |
| 657 | { | 1404 | { |
| 1405 | struct hstate *h = &default_hstate; | ||
| 1406 | unsigned long tmp; | ||
| 1407 | |||
| 1408 | if (!write) | ||
| 1409 | tmp = h->max_huge_pages; | ||
| 1410 | |||
| 1411 | table->data = &tmp; | ||
| 1412 | table->maxlen = sizeof(unsigned long); | ||
| 658 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1413 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); |
| 659 | max_huge_pages = set_max_huge_pages(max_huge_pages); | 1414 | |
| 1415 | if (write) | ||
| 1416 | h->max_huge_pages = set_max_huge_pages(h, tmp); | ||
| 1417 | |||
| 660 | return 0; | 1418 | return 0; |
| 661 | } | 1419 | } |
| 662 | 1420 | ||
| @@ -676,10 +1434,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
| 676 | struct file *file, void __user *buffer, | 1434 | struct file *file, void __user *buffer, |
| 677 | size_t *length, loff_t *ppos) | 1435 | size_t *length, loff_t *ppos) |
| 678 | { | 1436 | { |
| 1437 | struct hstate *h = &default_hstate; | ||
| 1438 | unsigned long tmp; | ||
| 1439 | |||
| 1440 | if (!write) | ||
| 1441 | tmp = h->nr_overcommit_huge_pages; | ||
| 1442 | |||
| 1443 | table->data = &tmp; | ||
| 1444 | table->maxlen = sizeof(unsigned long); | ||
| 679 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1445 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); |
| 680 | spin_lock(&hugetlb_lock); | 1446 | |
| 681 | nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; | 1447 | if (write) { |
| 682 | spin_unlock(&hugetlb_lock); | 1448 | spin_lock(&hugetlb_lock); |
| 1449 | h->nr_overcommit_huge_pages = tmp; | ||
| 1450 | spin_unlock(&hugetlb_lock); | ||
| 1451 | } | ||
| 1452 | |||
| 683 | return 0; | 1453 | return 0; |
| 684 | } | 1454 | } |
| 685 | 1455 | ||
| @@ -687,34 +1457,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
| 687 | 1457 | ||
| 688 | int hugetlb_report_meminfo(char *buf) | 1458 | int hugetlb_report_meminfo(char *buf) |
| 689 | { | 1459 | { |
| 1460 | struct hstate *h = &default_hstate; | ||
| 690 | return sprintf(buf, | 1461 | return sprintf(buf, |
| 691 | "HugePages_Total: %5lu\n" | 1462 | "HugePages_Total: %5lu\n" |
| 692 | "HugePages_Free: %5lu\n" | 1463 | "HugePages_Free: %5lu\n" |
| 693 | "HugePages_Rsvd: %5lu\n" | 1464 | "HugePages_Rsvd: %5lu\n" |
| 694 | "HugePages_Surp: %5lu\n" | 1465 | "HugePages_Surp: %5lu\n" |
| 695 | "Hugepagesize: %5lu kB\n", | 1466 | "Hugepagesize: %5lu kB\n", |
| 696 | nr_huge_pages, | 1467 | h->nr_huge_pages, |
| 697 | free_huge_pages, | 1468 | h->free_huge_pages, |
| 698 | resv_huge_pages, | 1469 | h->resv_huge_pages, |
| 699 | surplus_huge_pages, | 1470 | h->surplus_huge_pages, |
| 700 | HPAGE_SIZE/1024); | 1471 | 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); |
| 701 | } | 1472 | } |
| 702 | 1473 | ||
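The Hugepagesize line is now derived from the default hstate rather than the HPAGE_SIZE constant. As a quick check of the arithmetic, assuming 4 KB base pages (PAGE_SHIFT 12, an architectural assumption) and an order-9 default hstate, 1UL << (9 + 12 - 10) = 1UL << 11 = 2048, i.e. the familiar "Hugepagesize: 2048 kB":

#include <stdio.h>

int main(void)
{
        unsigned int order = 9, page_shift = 12;   /* assumed x86-style values */

        printf("Hugepagesize: %5lu kB\n", 1UL << (order + page_shift - 10));
        return 0;
}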
| 703 | int hugetlb_report_node_meminfo(int nid, char *buf) | 1474 | int hugetlb_report_node_meminfo(int nid, char *buf) |
| 704 | { | 1475 | { |
| 1476 | struct hstate *h = &default_hstate; | ||
| 705 | return sprintf(buf, | 1477 | return sprintf(buf, |
| 706 | "Node %d HugePages_Total: %5u\n" | 1478 | "Node %d HugePages_Total: %5u\n" |
| 707 | "Node %d HugePages_Free: %5u\n" | 1479 | "Node %d HugePages_Free: %5u\n" |
| 708 | "Node %d HugePages_Surp: %5u\n", | 1480 | "Node %d HugePages_Surp: %5u\n", |
| 709 | nid, nr_huge_pages_node[nid], | 1481 | nid, h->nr_huge_pages_node[nid], |
| 710 | nid, free_huge_pages_node[nid], | 1482 | nid, h->free_huge_pages_node[nid], |
| 711 | nid, surplus_huge_pages_node[nid]); | 1483 | nid, h->surplus_huge_pages_node[nid]); |
| 712 | } | 1484 | } |
| 713 | 1485 | ||
| 714 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ | 1486 | /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ |
| 715 | unsigned long hugetlb_total_pages(void) | 1487 | unsigned long hugetlb_total_pages(void) |
| 716 | { | 1488 | { |
| 717 | return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); | 1489 | struct hstate *h = &default_hstate; |
| 1490 | return h->nr_huge_pages * pages_per_huge_page(h); | ||
| 1491 | } | ||
| 1492 | |||
| 1493 | static int hugetlb_acct_memory(struct hstate *h, long delta) | ||
| 1494 | { | ||
| 1495 | int ret = -ENOMEM; | ||
| 1496 | |||
| 1497 | spin_lock(&hugetlb_lock); | ||
| 1498 | /* | ||
| 1499 | * When cpuset is configured, it breaks the strict hugetlb page | ||
| 1500 | * reservation as the accounting is done on a global variable. Such | ||
| 1501 | * reservation is completely rubbish in the presence of cpuset because | ||
| 1502 | * the reservation is not checked against page availability for the | ||
| 1503 | * current cpuset. The application can still potentially be OOM'ed by the | ||
| 1504 | * kernel for lack of a free htlb page in the cpuset the task is in. | ||
| 1505 | * Attempting to enforce strict accounting with cpusets is almost | ||
| 1506 | * impossible (or too ugly) because cpusets are so fluid that a | ||
| 1507 | * task or memory node can be dynamically moved between cpusets. | ||
| 1508 | * | ||
| 1509 | * The change of semantics for shared hugetlb mapping with cpuset is | ||
| 1510 | * undesirable. However, in order to preserve some of the semantics, | ||
| 1511 | * we fall back to checking against the current free page availability as | ||
| 1512 | * a best attempt, hopefully minimizing the impact of the semantic | ||
| 1513 | * change that cpusets introduce. | ||
| 1514 | */ | ||
| 1515 | if (delta > 0) { | ||
| 1516 | if (gather_surplus_pages(h, delta) < 0) | ||
| 1517 | goto out; | ||
| 1518 | |||
| 1519 | if (delta > cpuset_mems_nr(h->free_huge_pages_node)) { | ||
| 1520 | return_unused_surplus_pages(h, delta); | ||
| 1521 | goto out; | ||
| 1522 | } | ||
| 1523 | } | ||
| 1524 | |||
| 1525 | ret = 0; | ||
| 1526 | if (delta < 0) | ||
| 1527 | return_unused_surplus_pages(h, (unsigned long) -delta); | ||
| 1528 | |||
| 1529 | out: | ||
| 1530 | spin_unlock(&hugetlb_lock); | ||
| 1531 | return ret; | ||
| 1532 | } | ||
| 1533 | |||
| 1534 | static void hugetlb_vm_op_open(struct vm_area_struct *vma) | ||
| 1535 | { | ||
| 1536 | struct resv_map *reservations = vma_resv_map(vma); | ||
| 1537 | |||
| 1538 | /* | ||
| 1539 | * This new VMA should share its sibling's reservation map if present. | ||
| 1540 | * The VMA will only ever have a valid reservation map pointer where | ||
| 1541 | * it is being copied for another still existing VMA. As that VMA | ||
| 1542 | * has a reference to the reservation map it cannot disappear until | ||
| 1543 | * after this open call completes. It is therefore safe to take a | ||
| 1544 | * new reference here without additional locking. | ||
| 1545 | */ | ||
| 1546 | if (reservations) | ||
| 1547 | kref_get(&reservations->refs); | ||
| 1548 | } | ||
| 1549 | |||
| 1550 | static void hugetlb_vm_op_close(struct vm_area_struct *vma) | ||
| 1551 | { | ||
| 1552 | struct hstate *h = hstate_vma(vma); | ||
| 1553 | struct resv_map *reservations = vma_resv_map(vma); | ||
| 1554 | unsigned long reserve; | ||
| 1555 | unsigned long start; | ||
| 1556 | unsigned long end; | ||
| 1557 | |||
| 1558 | if (reservations) { | ||
| 1559 | start = vma_hugecache_offset(h, vma, vma->vm_start); | ||
| 1560 | end = vma_hugecache_offset(h, vma, vma->vm_end); | ||
| 1561 | |||
| 1562 | reserve = (end - start) - | ||
| 1563 | region_count(&reservations->regions, start, end); | ||
| 1564 | |||
| 1565 | kref_put(&reservations->refs, resv_map_release); | ||
| 1566 | |||
| 1567 | if (reserve) { | ||
| 1568 | hugetlb_acct_memory(h, -reserve); | ||
| 1569 | hugetlb_put_quota(vma->vm_file->f_mapping, reserve); | ||
| 1570 | } | ||
| 1571 | } | ||
| 718 | } | 1572 | } |
| 719 | 1573 | ||
| 720 | /* | 1574 | /* |
| @@ -731,6 +1585,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 731 | 1585 | ||
| 732 | struct vm_operations_struct hugetlb_vm_ops = { | 1586 | struct vm_operations_struct hugetlb_vm_ops = { |
| 733 | .fault = hugetlb_vm_op_fault, | 1587 | .fault = hugetlb_vm_op_fault, |
| 1588 | .open = hugetlb_vm_op_open, | ||
| 1589 | .close = hugetlb_vm_op_close, | ||
| 734 | }; | 1590 | }; |
| 735 | 1591 | ||
| 736 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, | 1592 | static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, |
| @@ -769,14 +1625,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
| 769 | struct page *ptepage; | 1625 | struct page *ptepage; |
| 770 | unsigned long addr; | 1626 | unsigned long addr; |
| 771 | int cow; | 1627 | int cow; |
| 1628 | struct hstate *h = hstate_vma(vma); | ||
| 1629 | unsigned long sz = huge_page_size(h); | ||
| 772 | 1630 | ||
| 773 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 1631 | cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
| 774 | 1632 | ||
| 775 | for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { | 1633 | for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { |
| 776 | src_pte = huge_pte_offset(src, addr); | 1634 | src_pte = huge_pte_offset(src, addr); |
| 777 | if (!src_pte) | 1635 | if (!src_pte) |
| 778 | continue; | 1636 | continue; |
| 779 | dst_pte = huge_pte_alloc(dst, addr); | 1637 | dst_pte = huge_pte_alloc(dst, addr, sz); |
| 780 | if (!dst_pte) | 1638 | if (!dst_pte) |
| 781 | goto nomem; | 1639 | goto nomem; |
| 782 | 1640 | ||
| @@ -804,7 +1662,7 @@ nomem: | |||
| 804 | } | 1662 | } |
| 805 | 1663 | ||
| 806 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 1664 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
| 807 | unsigned long end) | 1665 | unsigned long end, struct page *ref_page) |
| 808 | { | 1666 | { |
| 809 | struct mm_struct *mm = vma->vm_mm; | 1667 | struct mm_struct *mm = vma->vm_mm; |
| 810 | unsigned long address; | 1668 | unsigned long address; |
| @@ -812,6 +1670,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 812 | pte_t pte; | 1670 | pte_t pte; |
| 813 | struct page *page; | 1671 | struct page *page; |
| 814 | struct page *tmp; | 1672 | struct page *tmp; |
| 1673 | struct hstate *h = hstate_vma(vma); | ||
| 1674 | unsigned long sz = huge_page_size(h); | ||
| 1675 | |||
| 815 | /* | 1676 | /* |
| 816 | * A page gathering list, protected by per file i_mmap_lock. The | 1677 | * A page gathering list, protected by per file i_mmap_lock. The |
| 817 | * lock is used to avoid list corruption from multiple unmapping | 1678 | * lock is used to avoid list corruption from multiple unmapping |
| @@ -820,11 +1681,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 820 | LIST_HEAD(page_list); | 1681 | LIST_HEAD(page_list); |
| 821 | 1682 | ||
| 822 | WARN_ON(!is_vm_hugetlb_page(vma)); | 1683 | WARN_ON(!is_vm_hugetlb_page(vma)); |
| 823 | BUG_ON(start & ~HPAGE_MASK); | 1684 | BUG_ON(start & ~huge_page_mask(h)); |
| 824 | BUG_ON(end & ~HPAGE_MASK); | 1685 | BUG_ON(end & ~huge_page_mask(h)); |
| 825 | 1686 | ||
| 1687 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
| 826 | spin_lock(&mm->page_table_lock); | 1688 | spin_lock(&mm->page_table_lock); |
| 827 | for (address = start; address < end; address += HPAGE_SIZE) { | 1689 | for (address = start; address < end; address += sz) { |
| 828 | ptep = huge_pte_offset(mm, address); | 1690 | ptep = huge_pte_offset(mm, address); |
| 829 | if (!ptep) | 1691 | if (!ptep) |
| 830 | continue; | 1692 | continue; |
| @@ -832,6 +1694,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 832 | if (huge_pmd_unshare(mm, &address, ptep)) | 1694 | if (huge_pmd_unshare(mm, &address, ptep)) |
| 833 | continue; | 1695 | continue; |
| 834 | 1696 | ||
| 1697 | /* | ||
| 1698 | * If a reference page is supplied, it is because a specific | ||
| 1699 | * page is being unmapped, not a range. Ensure the page we | ||
| 1700 | * are about to unmap is the actual page of interest. | ||
| 1701 | */ | ||
| 1702 | if (ref_page) { | ||
| 1703 | pte = huge_ptep_get(ptep); | ||
| 1704 | if (huge_pte_none(pte)) | ||
| 1705 | continue; | ||
| 1706 | page = pte_page(pte); | ||
| 1707 | if (page != ref_page) | ||
| 1708 | continue; | ||
| 1709 | |||
| 1710 | /* | ||
| 1711 | * Mark the VMA as having unmapped its page so that | ||
| 1712 | * future faults in this VMA will fail rather than | ||
| 1713 | * looking like data was lost | ||
| 1714 | */ | ||
| 1715 | set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); | ||
| 1716 | } | ||
| 1717 | |||
| 835 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 1718 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
| 836 | if (huge_pte_none(pte)) | 1719 | if (huge_pte_none(pte)) |
| 837 | continue; | 1720 | continue; |
| @@ -843,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 843 | } | 1726 | } |
| 844 | spin_unlock(&mm->page_table_lock); | 1727 | spin_unlock(&mm->page_table_lock); |
| 845 | flush_tlb_range(vma, start, end); | 1728 | flush_tlb_range(vma, start, end); |
| 1729 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
| 846 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 1730 | list_for_each_entry_safe(page, tmp, &page_list, lru) { |
| 847 | list_del(&page->lru); | 1731 | list_del(&page->lru); |
| 848 | put_page(page); | 1732 | put_page(page); |
| @@ -850,31 +1734,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
| 850 | } | 1734 | } |
| 851 | 1735 | ||
| 852 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 1736 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
| 853 | unsigned long end) | 1737 | unsigned long end, struct page *ref_page) |
| 854 | { | 1738 | { |
| 1739 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
| 1740 | __unmap_hugepage_range(vma, start, end, ref_page); | ||
| 1741 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | ||
| 1742 | } | ||
| 1743 | |||
| 1744 | /* | ||
| 1745 | * This is called when the original mapper is failing to COW a MAP_PRIVATE | ||
| 1746 | * mapping it owns the reserve page for. The intention is to unmap the page | ||
| 1747 | * from other VMAs and let the children be SIGKILLed if they are faulting the | ||
| 1748 | * same region. | ||
| 1749 | */ | ||
| 1750 | int unmap_ref_private(struct mm_struct *mm, | ||
| 1751 | struct vm_area_struct *vma, | ||
| 1752 | struct page *page, | ||
| 1753 | unsigned long address) | ||
| 1754 | { | ||
| 1755 | struct vm_area_struct *iter_vma; | ||
| 1756 | struct address_space *mapping; | ||
| 1757 | struct prio_tree_iter iter; | ||
| 1758 | pgoff_t pgoff; | ||
| 1759 | |||
| 855 | /* | 1760 | /* |
| 856 | * It is undesirable to test vma->vm_file as it should be non-null | 1761 | * vm_pgoff is in PAGE_SIZE units, hence the different calculation |
| 857 | * for valid hugetlb area. However, vm_file will be NULL in the error | 1762 | * from page cache lookup which is in HPAGE_SIZE units. |
| 858 | * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, | ||
| 859 | * do_mmap_pgoff() nullifies vma->vm_file before calling this function | ||
| 860 | * to clean up. Since no pte has actually been setup, it is safe to | ||
| 861 | * do nothing in this case. | ||
| 862 | */ | 1763 | */ |
| 863 | if (vma->vm_file) { | 1764 | address = address & huge_page_mask(hstate_vma(vma)); |
| 864 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 1765 | pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) |
| 865 | __unmap_hugepage_range(vma, start, end); | 1766 | + (vma->vm_pgoff >> PAGE_SHIFT); |
| 866 | spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); | 1767 | mapping = (struct address_space *)page_private(page); |
| 1768 | |||
| 1769 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
| 1770 | /* Do not unmap the current VMA */ | ||
| 1771 | if (iter_vma == vma) | ||
| 1772 | continue; | ||
| 1773 | |||
| 1774 | /* | ||
| 1775 | * Unmap the page from other VMAs without their own reserves. | ||
| 1776 | * They get marked to be SIGKILLed if they fault in these | ||
| 1777 | * areas. This is because a future no-page fault on this VMA | ||
| 1778 | * could insert a zeroed page instead of the data existing | ||
| 1779 | * from the time of fork. This would look like data corruption | ||
| 1780 | */ | ||
| 1781 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | ||
| 1782 | unmap_hugepage_range(iter_vma, | ||
| 1783 | address, address + HPAGE_SIZE, | ||
| 1784 | page); | ||
| 867 | } | 1785 | } |
| 1786 | |||
| 1787 | return 1; | ||
| 868 | } | 1788 | } |
| 869 | 1789 | ||
| 870 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | 1790 | static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, |
| 871 | unsigned long address, pte_t *ptep, pte_t pte) | 1791 | unsigned long address, pte_t *ptep, pte_t pte, |
| 1792 | struct page *pagecache_page) | ||
| 872 | { | 1793 | { |
| 1794 | struct hstate *h = hstate_vma(vma); | ||
| 873 | struct page *old_page, *new_page; | 1795 | struct page *old_page, *new_page; |
| 874 | int avoidcopy; | 1796 | int avoidcopy; |
| 1797 | int outside_reserve = 0; | ||
| 875 | 1798 | ||
| 876 | old_page = pte_page(pte); | 1799 | old_page = pte_page(pte); |
| 877 | 1800 | ||
| 1801 | retry_avoidcopy: | ||
| 878 | /* If no-one else is actually using this page, avoid the copy | 1802 | /* If no-one else is actually using this page, avoid the copy |
| 879 | * and just make the page writable */ | 1803 | * and just make the page writable */ |
| 880 | avoidcopy = (page_count(old_page) == 1); | 1804 | avoidcopy = (page_count(old_page) == 1); |
| @@ -883,11 +1807,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 883 | return 0; | 1807 | return 0; |
| 884 | } | 1808 | } |
| 885 | 1809 | ||
| 1810 | /* | ||
| 1811 | * If the process that created a MAP_PRIVATE mapping is about to | ||
| 1812 | * perform a COW due to a shared page count, attempt to satisfy | ||
| 1813 | * the allocation without using the existing reserves. The pagecache | ||
| 1814 | * page is used to determine if the reserve at this address was | ||
| 1815 | * consumed or not. If reserves were used, a partially faulted mapping | ||
| 1816 | * at the time of fork() could consume its reserves on COW instead | ||
| 1817 | * of the full address range. | ||
| 1818 | */ | ||
| 1819 | if (!(vma->vm_flags & VM_SHARED) && | ||
| 1820 | is_vma_resv_set(vma, HPAGE_RESV_OWNER) && | ||
| 1821 | old_page != pagecache_page) | ||
| 1822 | outside_reserve = 1; | ||
| 1823 | |||
| 886 | page_cache_get(old_page); | 1824 | page_cache_get(old_page); |
| 887 | new_page = alloc_huge_page(vma, address); | 1825 | new_page = alloc_huge_page(vma, address, outside_reserve); |
| 888 | 1826 | ||
| 889 | if (IS_ERR(new_page)) { | 1827 | if (IS_ERR(new_page)) { |
| 890 | page_cache_release(old_page); | 1828 | page_cache_release(old_page); |
| 1829 | |||
| 1830 | /* | ||
| 1831 | * If a process owning a MAP_PRIVATE mapping fails to COW, | ||
| 1832 | * it is due to references held by a child and an insufficient | ||
| 1833 | * huge page pool. To guarantee the original mapper's | ||
| 1834 | * reliability, unmap the page from child processes. The child | ||
| 1835 | * may get SIGKILLed if it later faults. | ||
| 1836 | */ | ||
| 1837 | if (outside_reserve) { | ||
| 1838 | BUG_ON(huge_pte_none(pte)); | ||
| 1839 | if (unmap_ref_private(mm, vma, old_page, address)) { | ||
| 1840 | BUG_ON(page_count(old_page) != 1); | ||
| 1841 | BUG_ON(huge_pte_none(pte)); | ||
| 1842 | goto retry_avoidcopy; | ||
| 1843 | } | ||
| 1844 | WARN_ON_ONCE(1); | ||
| 1845 | } | ||
| 1846 | |||
| 891 | return -PTR_ERR(new_page); | 1847 | return -PTR_ERR(new_page); |
| 892 | } | 1848 | } |
| 893 | 1849 | ||
| @@ -896,7 +1852,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 896 | __SetPageUptodate(new_page); | 1852 | __SetPageUptodate(new_page); |
| 897 | spin_lock(&mm->page_table_lock); | 1853 | spin_lock(&mm->page_table_lock); |
| 898 | 1854 | ||
| 899 | ptep = huge_pte_offset(mm, address & HPAGE_MASK); | 1855 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
| 900 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 1856 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
| 901 | /* Break COW */ | 1857 | /* Break COW */ |
| 902 | huge_ptep_clear_flush(vma, address, ptep); | 1858 | huge_ptep_clear_flush(vma, address, ptep); |
| @@ -910,19 +1866,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 910 | return 0; | 1866 | return 0; |
| 911 | } | 1867 | } |
| 912 | 1868 | ||
| 1869 | /* Return the pagecache page at a given address within a VMA */ | ||
| 1870 | static struct page *hugetlbfs_pagecache_page(struct hstate *h, | ||
| 1871 | struct vm_area_struct *vma, unsigned long address) | ||
| 1872 | { | ||
| 1873 | struct address_space *mapping; | ||
| 1874 | pgoff_t idx; | ||
| 1875 | |||
| 1876 | mapping = vma->vm_file->f_mapping; | ||
| 1877 | idx = vma_hugecache_offset(h, vma, address); | ||
| 1878 | |||
| 1879 | return find_lock_page(mapping, idx); | ||
| 1880 | } | ||
| 1881 | |||
| 913 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1882 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 914 | unsigned long address, pte_t *ptep, int write_access) | 1883 | unsigned long address, pte_t *ptep, int write_access) |
| 915 | { | 1884 | { |
| 1885 | struct hstate *h = hstate_vma(vma); | ||
| 916 | int ret = VM_FAULT_SIGBUS; | 1886 | int ret = VM_FAULT_SIGBUS; |
| 917 | unsigned long idx; | 1887 | pgoff_t idx; |
| 918 | unsigned long size; | 1888 | unsigned long size; |
| 919 | struct page *page; | 1889 | struct page *page; |
| 920 | struct address_space *mapping; | 1890 | struct address_space *mapping; |
| 921 | pte_t new_pte; | 1891 | pte_t new_pte; |
| 922 | 1892 | ||
| 1893 | /* | ||
| 1894 | * Currently, we are forced to kill the process in the event the | ||
| 1895 | * original mapper has unmapped pages from the child due to a failed | ||
| 1896 | * COW. Warn that such a situation has occurred as it may not be obvious | ||
| 1897 | */ | ||
| 1898 | if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { | ||
| 1899 | printk(KERN_WARNING | ||
| 1900 | "PID %d killed due to inadequate hugepage pool\n", | ||
| 1901 | current->pid); | ||
| 1902 | return ret; | ||
| 1903 | } | ||
| 1904 | |||
| 923 | mapping = vma->vm_file->f_mapping; | 1905 | mapping = vma->vm_file->f_mapping; |
| 924 | idx = ((address - vma->vm_start) >> HPAGE_SHIFT) | 1906 | idx = vma_hugecache_offset(h, vma, address); |
| 925 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
| 926 | 1907 | ||
| 927 | /* | 1908 | /* |
| 928 | * Use page lock to guard against racing truncation | 1909 | * Use page lock to guard against racing truncation |
| @@ -931,15 +1912,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 931 | retry: | 1912 | retry: |
| 932 | page = find_lock_page(mapping, idx); | 1913 | page = find_lock_page(mapping, idx); |
| 933 | if (!page) { | 1914 | if (!page) { |
| 934 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1915 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
| 935 | if (idx >= size) | 1916 | if (idx >= size) |
| 936 | goto out; | 1917 | goto out; |
| 937 | page = alloc_huge_page(vma, address); | 1918 | page = alloc_huge_page(vma, address, 0); |
| 938 | if (IS_ERR(page)) { | 1919 | if (IS_ERR(page)) { |
| 939 | ret = -PTR_ERR(page); | 1920 | ret = -PTR_ERR(page); |
| 940 | goto out; | 1921 | goto out; |
| 941 | } | 1922 | } |
| 942 | clear_huge_page(page, address); | 1923 | clear_huge_page(page, address, huge_page_size(h)); |
| 943 | __SetPageUptodate(page); | 1924 | __SetPageUptodate(page); |
| 944 | 1925 | ||
| 945 | if (vma->vm_flags & VM_SHARED) { | 1926 | if (vma->vm_flags & VM_SHARED) { |
| @@ -955,14 +1936,26 @@ retry: | |||
| 955 | } | 1936 | } |
| 956 | 1937 | ||
| 957 | spin_lock(&inode->i_lock); | 1938 | spin_lock(&inode->i_lock); |
| 958 | inode->i_blocks += BLOCKS_PER_HUGEPAGE; | 1939 | inode->i_blocks += blocks_per_huge_page(h); |
| 959 | spin_unlock(&inode->i_lock); | 1940 | spin_unlock(&inode->i_lock); |
| 960 | } else | 1941 | } else |
| 961 | lock_page(page); | 1942 | lock_page(page); |
| 962 | } | 1943 | } |
| 963 | 1944 | ||
| 1945 | /* | ||
| 1946 | * If we are going to COW a private mapping later, we examine the | ||
| 1947 | * pending reservations for this page now. This will ensure that | ||
| 1948 | * any allocations necessary to record that reservation occur outside | ||
| 1949 | * the spinlock. | ||
| 1950 | */ | ||
| 1951 | if (write_access && !(vma->vm_flags & VM_SHARED)) | ||
| 1952 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
| 1953 | ret = VM_FAULT_OOM; | ||
| 1954 | goto backout_unlocked; | ||
| 1955 | } | ||
| 1956 | |||
| 964 | spin_lock(&mm->page_table_lock); | 1957 | spin_lock(&mm->page_table_lock); |
| 965 | size = i_size_read(mapping->host) >> HPAGE_SHIFT; | 1958 | size = i_size_read(mapping->host) >> huge_page_shift(h); |
| 966 | if (idx >= size) | 1959 | if (idx >= size) |
| 967 | goto backout; | 1960 | goto backout; |
| 968 | 1961 | ||
| @@ -976,7 +1969,7 @@ retry: | |||
| 976 | 1969 | ||
| 977 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | 1970 | if (write_access && !(vma->vm_flags & VM_SHARED)) { |
| 978 | /* Optimization, do the COW without a second fault */ | 1971 | /* Optimization, do the COW without a second fault */ |
| 979 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte); | 1972 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); |
| 980 | } | 1973 | } |
| 981 | 1974 | ||
| 982 | spin_unlock(&mm->page_table_lock); | 1975 | spin_unlock(&mm->page_table_lock); |
| @@ -986,6 +1979,7 @@ out: | |||
| 986 | 1979 | ||
| 987 | backout: | 1980 | backout: |
| 988 | spin_unlock(&mm->page_table_lock); | 1981 | spin_unlock(&mm->page_table_lock); |
| 1982 | backout_unlocked: | ||
| 989 | unlock_page(page); | 1983 | unlock_page(page); |
| 990 | put_page(page); | 1984 | put_page(page); |
| 991 | goto out; | 1985 | goto out; |
| @@ -997,9 +1991,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 997 | pte_t *ptep; | 1991 | pte_t *ptep; |
| 998 | pte_t entry; | 1992 | pte_t entry; |
| 999 | int ret; | 1993 | int ret; |
| 1994 | struct page *pagecache_page = NULL; | ||
| 1000 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); | 1995 | static DEFINE_MUTEX(hugetlb_instantiation_mutex); |
| 1996 | struct hstate *h = hstate_vma(vma); | ||
| 1001 | 1997 | ||
| 1002 | ptep = huge_pte_alloc(mm, address); | 1998 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
| 1003 | if (!ptep) | 1999 | if (!ptep) |
| 1004 | return VM_FAULT_OOM; | 2000 | return VM_FAULT_OOM; |
| 1005 | 2001 | ||
| @@ -1012,23 +2008,58 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1012 | entry = huge_ptep_get(ptep); | 2008 | entry = huge_ptep_get(ptep); |
| 1013 | if (huge_pte_none(entry)) { | 2009 | if (huge_pte_none(entry)) { |
| 1014 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); | 2010 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); |
| 1015 | mutex_unlock(&hugetlb_instantiation_mutex); | 2011 | goto out_unlock; |
| 1016 | return ret; | ||
| 1017 | } | 2012 | } |
| 1018 | 2013 | ||
| 1019 | ret = 0; | 2014 | ret = 0; |
| 1020 | 2015 | ||
| 2016 | /* | ||
| 2017 | * If we are going to COW the mapping later, we examine the pending | ||
| 2018 | * reservations for this page now. This will ensure that any | ||
| 2019 | * allocations necessary to record that reservation occur outside the | ||
| 2020 | * spinlock. For private mappings, we also look up the pagecache | ||
| 2021 | * page now as it is used to determine if a reservation has been | ||
| 2022 | * consumed. | ||
| 2023 | */ | ||
| 2024 | if (write_access && !pte_write(entry)) { | ||
| 2025 | if (vma_needs_reservation(h, vma, address) < 0) { | ||
| 2026 | ret = VM_FAULT_OOM; | ||
| 2027 | goto out_unlock; | ||
| 2028 | } | ||
| 2029 | |||
| 2030 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 2031 | pagecache_page = hugetlbfs_pagecache_page(h, | ||
| 2032 | vma, address); | ||
| 2033 | } | ||
| 2034 | |||
| 1021 | spin_lock(&mm->page_table_lock); | 2035 | spin_lock(&mm->page_table_lock); |
| 1022 | /* Check for a racing update before calling hugetlb_cow */ | 2036 | /* Check for a racing update before calling hugetlb_cow */ |
| 1023 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) | 2037 | if (likely(pte_same(entry, huge_ptep_get(ptep)))) |
| 1024 | if (write_access && !pte_write(entry)) | 2038 | if (write_access && !pte_write(entry)) |
| 1025 | ret = hugetlb_cow(mm, vma, address, ptep, entry); | 2039 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
| 2040 | pagecache_page); | ||
| 1026 | spin_unlock(&mm->page_table_lock); | 2041 | spin_unlock(&mm->page_table_lock); |
| 2042 | |||
| 2043 | if (pagecache_page) { | ||
| 2044 | unlock_page(pagecache_page); | ||
| 2045 | put_page(pagecache_page); | ||
| 2046 | } | ||
| 2047 | |||
| 2048 | out_unlock: | ||
| 1027 | mutex_unlock(&hugetlb_instantiation_mutex); | 2049 | mutex_unlock(&hugetlb_instantiation_mutex); |
| 1028 | 2050 | ||
| 1029 | return ret; | 2051 | return ret; |
| 1030 | } | 2052 | } |
| 1031 | 2053 | ||
| 2054 | /* Can be overridden by architectures */ | ||
| 2055 | __attribute__((weak)) struct page * | ||
| 2056 | follow_huge_pud(struct mm_struct *mm, unsigned long address, | ||
| 2057 | pud_t *pud, int write) | ||
| 2058 | { | ||
| 2059 | BUG(); | ||
| 2060 | return NULL; | ||
| 2061 | } | ||
| 2062 | |||
| 1032 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2063 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 1033 | struct page **pages, struct vm_area_struct **vmas, | 2064 | struct page **pages, struct vm_area_struct **vmas, |
| 1034 | unsigned long *position, int *length, int i, | 2065 | unsigned long *position, int *length, int i, |
| @@ -1037,6 +2068,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1037 | unsigned long pfn_offset; | 2068 | unsigned long pfn_offset; |
| 1038 | unsigned long vaddr = *position; | 2069 | unsigned long vaddr = *position; |
| 1039 | int remainder = *length; | 2070 | int remainder = *length; |
| 2071 | struct hstate *h = hstate_vma(vma); | ||
| 1040 | 2072 | ||
| 1041 | spin_lock(&mm->page_table_lock); | 2073 | spin_lock(&mm->page_table_lock); |
| 1042 | while (vaddr < vma->vm_end && remainder) { | 2074 | while (vaddr < vma->vm_end && remainder) { |
| @@ -1048,7 +2080,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1048 | * each hugepage. We have to make sure we get the | 2080 | * each hugepage. We have to make sure we get the |
| 1049 | * first, for the page indexing below to work. | 2081 | * first, for the page indexing below to work. |
| 1050 | */ | 2082 | */ |
| 1051 | pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); | 2083 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
| 1052 | 2084 | ||
| 1053 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || | 2085 | if (!pte || huge_pte_none(huge_ptep_get(pte)) || |
| 1054 | (write && !pte_write(huge_ptep_get(pte)))) { | 2086 | (write && !pte_write(huge_ptep_get(pte)))) { |
| @@ -1066,7 +2098,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1066 | break; | 2098 | break; |
| 1067 | } | 2099 | } |
| 1068 | 2100 | ||
| 1069 | pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; | 2101 | pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; |
| 1070 | page = pte_page(huge_ptep_get(pte)); | 2102 | page = pte_page(huge_ptep_get(pte)); |
| 1071 | same_page: | 2103 | same_page: |
| 1072 | if (pages) { | 2104 | if (pages) { |
| @@ -1082,7 +2114,7 @@ same_page: | |||
| 1082 | --remainder; | 2114 | --remainder; |
| 1083 | ++i; | 2115 | ++i; |
| 1084 | if (vaddr < vma->vm_end && remainder && | 2116 | if (vaddr < vma->vm_end && remainder && |
| 1085 | pfn_offset < HPAGE_SIZE/PAGE_SIZE) { | 2117 | pfn_offset < pages_per_huge_page(h)) { |
| 1086 | /* | 2118 | /* |
| 1087 | * We use pfn_offset to avoid touching the pageframes | 2119 | * We use pfn_offset to avoid touching the pageframes |
| 1088 | * of this compound page. | 2120 | * of this compound page. |
| @@ -1104,13 +2136,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 1104 | unsigned long start = address; | 2136 | unsigned long start = address; |
| 1105 | pte_t *ptep; | 2137 | pte_t *ptep; |
| 1106 | pte_t pte; | 2138 | pte_t pte; |
| 2139 | struct hstate *h = hstate_vma(vma); | ||
| 1107 | 2140 | ||
| 1108 | BUG_ON(address >= end); | 2141 | BUG_ON(address >= end); |
| 1109 | flush_cache_range(vma, address, end); | 2142 | flush_cache_range(vma, address, end); |
| 1110 | 2143 | ||
| 1111 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); | 2144 | spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); |
| 1112 | spin_lock(&mm->page_table_lock); | 2145 | spin_lock(&mm->page_table_lock); |
| 1113 | for (; address < end; address += HPAGE_SIZE) { | 2146 | for (; address < end; address += huge_page_size(h)) { |
| 1114 | ptep = huge_pte_offset(mm, address); | 2147 | ptep = huge_pte_offset(mm, address); |
| 1115 | if (!ptep) | 2148 | if (!ptep) |
| 1116 | continue; | 2149 | continue; |
| @@ -1128,195 +2161,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 1128 | flush_tlb_range(vma, start, end); | 2161 | flush_tlb_range(vma, start, end); |
| 1129 | } | 2162 | } |
| 1130 | 2163 | ||
| 1131 | struct file_region { | 2164 | int hugetlb_reserve_pages(struct inode *inode, |
| 1132 | struct list_head link; | 2165 | long from, long to, |
| 1133 | long from; | 2166 | struct vm_area_struct *vma) |
| 1134 | long to; | ||
| 1135 | }; | ||
| 1136 | |||
| 1137 | static long region_add(struct list_head *head, long f, long t) | ||
| 1138 | { | ||
| 1139 | struct file_region *rg, *nrg, *trg; | ||
| 1140 | |||
| 1141 | /* Locate the region we are either in or before. */ | ||
| 1142 | list_for_each_entry(rg, head, link) | ||
| 1143 | if (f <= rg->to) | ||
| 1144 | break; | ||
| 1145 | |||
| 1146 | /* Round our left edge to the current segment if it encloses us. */ | ||
| 1147 | if (f > rg->from) | ||
| 1148 | f = rg->from; | ||
| 1149 | |||
| 1150 | /* Check for and consume any regions we now overlap with. */ | ||
| 1151 | nrg = rg; | ||
| 1152 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
| 1153 | if (&rg->link == head) | ||
| 1154 | break; | ||
| 1155 | if (rg->from > t) | ||
| 1156 | break; | ||
| 1157 | |||
| 1158 | /* If this area reaches higher, then extend our area to | ||
| 1159 | * include it completely. If this is not the first area | ||
| 1160 | * which we intend to reuse, free it. */ | ||
| 1161 | if (rg->to > t) | ||
| 1162 | t = rg->to; | ||
| 1163 | if (rg != nrg) { | ||
| 1164 | list_del(&rg->link); | ||
| 1165 | kfree(rg); | ||
| 1166 | } | ||
| 1167 | } | ||
| 1168 | nrg->from = f; | ||
| 1169 | nrg->to = t; | ||
| 1170 | return 0; | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | static long region_chg(struct list_head *head, long f, long t) | ||
| 1174 | { | 2167 | { |
| 1175 | struct file_region *rg, *nrg; | 2168 | long ret, chg; |
| 1176 | long chg = 0; | 2169 | struct hstate *h = hstate_inode(inode); |
| 1177 | |||
| 1178 | /* Locate the region we are before or in. */ | ||
| 1179 | list_for_each_entry(rg, head, link) | ||
| 1180 | if (f <= rg->to) | ||
| 1181 | break; | ||
| 1182 | |||
| 1183 | /* If we are below the current region then a new region is required. | ||
| 1184 | * Subtle, allocate a new region at the position but make it zero | ||
| 1185 | * size such that we can guarantee to record the reservation. */ | ||
| 1186 | if (&rg->link == head || t < rg->from) { | ||
| 1187 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
| 1188 | if (!nrg) | ||
| 1189 | return -ENOMEM; | ||
| 1190 | nrg->from = f; | ||
| 1191 | nrg->to = f; | ||
| 1192 | INIT_LIST_HEAD(&nrg->link); | ||
| 1193 | list_add(&nrg->link, rg->link.prev); | ||
| 1194 | |||
| 1195 | return t - f; | ||
| 1196 | } | ||
| 1197 | |||
| 1198 | /* Round our left edge to the current segment if it encloses us. */ | ||
| 1199 | if (f > rg->from) | ||
| 1200 | f = rg->from; | ||
| 1201 | chg = t - f; | ||
| 1202 | |||
| 1203 | /* Check for and consume any regions we now overlap with. */ | ||
| 1204 | list_for_each_entry(rg, rg->link.prev, link) { | ||
| 1205 | if (&rg->link == head) | ||
| 1206 | break; | ||
| 1207 | if (rg->from > t) | ||
| 1208 | return chg; | ||
| 1209 | |||
| 1210 | /* We overlap with this area, if it extends further than | ||
| 1211 | * us then we must extend ourselves. Account for its | ||
| 1212 | * existing reservation. */ | ||
| 1213 | if (rg->to > t) { | ||
| 1214 | chg += rg->to - t; | ||
| 1215 | t = rg->to; | ||
| 1216 | } | ||
| 1217 | chg -= rg->to - rg->from; | ||
| 1218 | } | ||
| 1219 | return chg; | ||
| 1220 | } | ||
| 1221 | |||
| 1222 | static long region_truncate(struct list_head *head, long end) | ||
| 1223 | { | ||
| 1224 | struct file_region *rg, *trg; | ||
| 1225 | long chg = 0; | ||
| 1226 | 2170 | ||
| 1227 | /* Locate the region we are either in or before. */ | 2171 | if (vma && vma->vm_flags & VM_NORESERVE) |
| 1228 | list_for_each_entry(rg, head, link) | ||
| 1229 | if (end <= rg->to) | ||
| 1230 | break; | ||
| 1231 | if (&rg->link == head) | ||
| 1232 | return 0; | 2172 | return 0; |
| 1233 | 2173 | ||
| 1234 | /* If we are in the middle of a region then adjust it. */ | ||
| 1235 | if (end > rg->from) { | ||
| 1236 | chg = rg->to - end; | ||
| 1237 | rg->to = end; | ||
| 1238 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | /* Drop any remaining regions. */ | ||
| 1242 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
| 1243 | if (&rg->link == head) | ||
| 1244 | break; | ||
| 1245 | chg += rg->to - rg->from; | ||
| 1246 | list_del(&rg->link); | ||
| 1247 | kfree(rg); | ||
| 1248 | } | ||
| 1249 | return chg; | ||
| 1250 | } | ||
| 1251 | |||
| 1252 | static int hugetlb_acct_memory(long delta) | ||
| 1253 | { | ||
| 1254 | int ret = -ENOMEM; | ||
| 1255 | |||
| 1256 | spin_lock(&hugetlb_lock); | ||
| 1257 | /* | 2174 | /* |
| 1258 | * When cpuset is configured, it breaks the strict hugetlb page | 2175 | * Shared mappings base their reservation on the number of pages that |
| 1259 | * reservation as the accounting is done on a global variable. Such | 2176 | * are already allocated on behalf of the file. Private mappings need |
| 1260 | * reservation is completely rubbish in the presence of cpuset because | 2177 | * to reserve the full area even if read-only as mprotect() may be |
| 1261 | * the reservation is not checked against page availability for the | 2178 | * called to make the mapping read-write. Assume !vma is a shm mapping |
| 1262 | * current cpuset. Application can still potentially OOM'ed by kernel | ||
| 1263 | * with lack of free htlb page in cpuset that the task is in. | ||
| 1264 | * Attempt to enforce strict accounting with cpuset is almost | ||
| 1265 | * impossible (or too ugly) because cpuset is too fluid that | ||
| 1266 | * task or memory node can be dynamically moved between cpusets. | ||
| 1267 | * | ||
| 1268 | * The change of semantics for shared hugetlb mapping with cpuset is | ||
| 1269 | * undesirable. However, in order to preserve some of the semantics, | ||
| 1270 | * we fall back to check against current free page availability as | ||
| 1271 | * a best attempt and hopefully to minimize the impact of changing | ||
| 1272 | * semantics that cpuset has. | ||
| 1273 | */ | 2179 | */ |
| 1274 | if (delta > 0) { | 2180 | if (!vma || vma->vm_flags & VM_SHARED) |
| 1275 | if (gather_surplus_pages(delta) < 0) | 2181 | chg = region_chg(&inode->i_mapping->private_list, from, to); |
| 1276 | goto out; | 2182 | else { |
| 1277 | 2183 | struct resv_map *resv_map = resv_map_alloc(); | |
| 1278 | if (delta > cpuset_mems_nr(free_huge_pages_node)) { | 2184 | if (!resv_map) |
| 1279 | return_unused_surplus_pages(delta); | 2185 | return -ENOMEM; |
| 1280 | goto out; | ||
| 1281 | } | ||
| 1282 | } | ||
| 1283 | |||
| 1284 | ret = 0; | ||
| 1285 | if (delta < 0) | ||
| 1286 | return_unused_surplus_pages((unsigned long) -delta); | ||
| 1287 | 2186 | ||
| 1288 | out: | 2187 | chg = to - from; |
| 1289 | spin_unlock(&hugetlb_lock); | ||
| 1290 | return ret; | ||
| 1291 | } | ||
| 1292 | 2188 | ||
| 1293 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | 2189 | set_vma_resv_map(vma, resv_map); |
| 1294 | { | 2190 | set_vma_resv_flags(vma, HPAGE_RESV_OWNER); |
| 1295 | long ret, chg; | 2191 | } |
| 1296 | 2192 | ||
| 1297 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
| 1298 | if (chg < 0) | 2193 | if (chg < 0) |
| 1299 | return chg; | 2194 | return chg; |
| 1300 | 2195 | ||
| 1301 | if (hugetlb_get_quota(inode->i_mapping, chg)) | 2196 | if (hugetlb_get_quota(inode->i_mapping, chg)) |
| 1302 | return -ENOSPC; | 2197 | return -ENOSPC; |
| 1303 | ret = hugetlb_acct_memory(chg); | 2198 | ret = hugetlb_acct_memory(h, chg); |
| 1304 | if (ret < 0) { | 2199 | if (ret < 0) { |
| 1305 | hugetlb_put_quota(inode->i_mapping, chg); | 2200 | hugetlb_put_quota(inode->i_mapping, chg); |
| 1306 | return ret; | 2201 | return ret; |
| 1307 | } | 2202 | } |
| 1308 | region_add(&inode->i_mapping->private_list, from, to); | 2203 | if (!vma || vma->vm_flags & VM_SHARED) |
| 2204 | region_add(&inode->i_mapping->private_list, from, to); | ||
| 1309 | return 0; | 2205 | return 0; |
| 1310 | } | 2206 | } |
| 1311 | 2207 | ||
| 1312 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | 2208 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) |
| 1313 | { | 2209 | { |
| 2210 | struct hstate *h = hstate_inode(inode); | ||
| 1314 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 2211 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
| 1315 | 2212 | ||
| 1316 | spin_lock(&inode->i_lock); | 2213 | spin_lock(&inode->i_lock); |
| 1317 | inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; | 2214 | inode->i_blocks -= blocks_per_huge_page(h); |
| 1318 | spin_unlock(&inode->i_lock); | 2215 | spin_unlock(&inode->i_lock); |
| 1319 | 2216 | ||
| 1320 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 2217 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); |
| 1321 | hugetlb_acct_memory(-(chg - freed)); | 2218 | hugetlb_acct_memory(h, -(chg - freed)); |
| 1322 | } | 2219 | } |
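The hugetlb.c hunks above convert the HPAGE_* constants to per-hstate helpers and make hugetlb_reserve_pages() treat shared and private mappings differently: shared mappings keep their reservations on the inode's private_list via region_chg()/region_add(), while private mappings get a vma-owned resv_map flagged with HPAGE_RESV_OWNER. As a rough illustration of the interval accounting the shared path relies on, the user-space sketch below computes the same quantity as the region_chg() removed from this file: how many new huge pages a reservation of [f, t) needs once already-reserved regions are discounted. The array of non-overlapping regions and all names here are stand-ins for the kernel's file_region list, not kernel API.

#include <stdio.h>

/* [from, to) in huge-page units; the regions are assumed non-overlapping, */
/* as the kernel's sorted file_region list guarantees.                     */
struct region { long from, to; };

/* How many pages of [f, t) are not yet covered by an existing region? */
static long region_chg(const struct region *rg, int nr, long f, long t)
{
	long chg = t - f;
	int i;

	for (i = 0; i < nr; i++) {
		long lo = rg[i].from > f ? rg[i].from : f;
		long hi = rg[i].to < t ? rg[i].to : t;

		if (hi > lo)		/* this overlap is already reserved */
			chg -= hi - lo;
	}
	return chg;
}

int main(void)
{
	/* the file already holds reservations for pages [0,4) and [10,12) */
	struct region rg[] = { { 0, 4 }, { 10, 12 } };

	/* a mapping of pages [2,11) needs 9 - 2 - 1 = 6 new huge pages */
	printf("chg = %ld\n", region_chg(rg, 2, 2, 11));
	return 0;
}

Only after quota and hugetlb_acct_memory() succeed for that count does the kernel commit the interval with region_add(); the private-mapping path skips the interval list entirely and reserves the full (to - from) range up front, as the new hugetlb_reserve_pages() above shows.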
diff --git a/mm/internal.h b/mm/internal.h index 0034e947e4bc..1f43f7416972 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -13,6 +13,11 @@ | |||
| 13 | 13 | ||
| 14 | #include <linux/mm.h> | 14 | #include <linux/mm.h> |
| 15 | 15 | ||
| 16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | ||
| 17 | unsigned long floor, unsigned long ceiling); | ||
| 18 | |||
| 19 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
| 20 | |||
| 16 | static inline void set_page_count(struct page *page, int v) | 21 | static inline void set_page_count(struct page *page, int v) |
| 17 | { | 22 | { |
| 18 | atomic_set(&page->_count, v); | 23 | atomic_set(&page->_count, v); |
| @@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page) | |||
| 59 | #define __paginginit __init | 64 | #define __paginginit __init |
| 60 | #endif | 65 | #endif |
| 61 | 66 | ||
| 67 | /* Memory initialisation debug and verification */ | ||
| 68 | enum mminit_level { | ||
| 69 | MMINIT_WARNING, | ||
| 70 | MMINIT_VERIFY, | ||
| 71 | MMINIT_TRACE | ||
| 72 | }; | ||
| 73 | |||
| 74 | #ifdef CONFIG_DEBUG_MEMORY_INIT | ||
| 75 | |||
| 76 | extern int mminit_loglevel; | ||
| 77 | |||
| 78 | #define mminit_dprintk(level, prefix, fmt, arg...) \ | ||
| 79 | do { \ | ||
| 80 | if (level < mminit_loglevel) { \ | ||
| 81 | printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ | ||
| 82 | printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ | ||
| 83 | } \ | ||
| 84 | } while (0) | ||
| 85 | |||
| 86 | extern void mminit_verify_pageflags_layout(void); | ||
| 87 | extern void mminit_verify_page_links(struct page *page, | ||
| 88 | enum zone_type zone, unsigned long nid, unsigned long pfn); | ||
| 89 | extern void mminit_verify_zonelist(void); | ||
| 90 | |||
| 91 | #else | ||
| 92 | |||
| 93 | static inline void mminit_dprintk(enum mminit_level level, | ||
| 94 | const char *prefix, const char *fmt, ...) | ||
| 95 | { | ||
| 96 | } | ||
| 97 | |||
| 98 | static inline void mminit_verify_pageflags_layout(void) | ||
| 99 | { | ||
| 100 | } | ||
| 101 | |||
| 102 | static inline void mminit_verify_page_links(struct page *page, | ||
| 103 | enum zone_type zone, unsigned long nid, unsigned long pfn) | ||
| 104 | { | ||
| 105 | } | ||
| 106 | |||
| 107 | static inline void mminit_verify_zonelist(void) | ||
| 108 | { | ||
| 109 | } | ||
| 110 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ | ||
| 111 | |||
| 112 | /* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */ | ||
| 113 | #if defined(CONFIG_SPARSEMEM) | ||
| 114 | extern void mminit_validate_memmodel_limits(unsigned long *start_pfn, | ||
| 115 | unsigned long *end_pfn); | ||
| 116 | #else | ||
| 117 | static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | ||
| 118 | unsigned long *end_pfn) | ||
| 119 | { | ||
| 120 | } | ||
| 121 | #endif /* CONFIG_SPARSEMEM */ | ||
| 122 | |||
| 62 | #endif | 123 | #endif |
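The mminit_dprintk() macro added to mm/internal.h above only emits a message when its level is strictly below mminit_loglevel, and anything at MMINIT_WARNING or below is printed at the warning printk level rather than debug. The fragment below is a user-space mock of that filter, with fprintf() and string tags standing in for printk() and KERN_WARNING/KERN_DEBUG; the message texts and prefixes used in main() are invented for illustration and are not taken from mm_init.c.

#include <stdio.h>

enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

/* The kernel sets this from the mminit_loglevel= boot parameter. */
static int mminit_loglevel = MMINIT_VERIFY;

/* Same filtering rule as the kernel macro: emit only if level < loglevel. */
#define mminit_dprintk(level, prefix, fmt, ...)					\
	do {									\
		if ((level) < mminit_loglevel)					\
			fprintf(stderr, "%s mminit::" prefix " " fmt,		\
				(level) <= MMINIT_WARNING ? "<warn>" : "<dbg>",	\
				##__VA_ARGS__);					\
	} while (0)

int main(void)
{
	/* printed: MMINIT_WARNING (0) is below MMINIT_VERIFY (1) */
	mminit_dprintk(MMINIT_WARNING, "pageflags_layout",
		       "only %d bits left for page flags\n", 20);

	/* suppressed: MMINIT_TRACE (2) is not below the current loglevel */
	mminit_dprintk(MMINIT_TRACE, "zonelist", "per-zone detail\n");
	return 0;
}

The stub versions under the #else branch keep every call site compilable when CONFIG_DEBUG_MEMORY_INIT is off, so callers never need their own #ifdefs.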
diff --git a/mm/madvise.c b/mm/madvise.c index 23a0ec3e0ea0..f9349c18a1b5 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
| 132 | * Application no longer needs these pages. If the pages are dirty, | 132 | * Application no longer needs these pages. If the pages are dirty, |
| 133 | * it's OK to just throw them away. The app will be more careful about | 133 | * it's OK to just throw them away. The app will be more careful about |
| 134 | * data it wants to keep. Be sure to free swap resources too. The | 134 | * data it wants to keep. Be sure to free swap resources too. The |
| 135 | * zap_page_range call sets things up for refill_inactive to actually free | 135 | * zap_page_range call sets things up for shrink_active_list to actually free |
| 136 | * these pages later if no one else has touched them in the meantime, | 136 | * these pages later if no one else has touched them in the meantime, |
| 137 | * although we could add these pages to a global reuse list for | 137 | * although we could add these pages to a global reuse list for |
| 138 | * refill_inactive to pick up before reclaiming other pages. | 138 | * shrink_active_list to pick up before reclaiming other pages. |
| 139 | * | 139 | * |
| 140 | * NB: This interface discards data rather than pushes it out to swap, | 140 | * NB: This interface discards data rather than pushes it out to swap, |
| 141 | * as some implementations do. This has performance implications for | 141 | * as some implementations do. This has performance implications for |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e46451e1d9b7..0f1f7a7374ba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -35,9 +35,9 @@ | |||
| 35 | 35 | ||
| 36 | #include <asm/uaccess.h> | 36 | #include <asm/uaccess.h> |
| 37 | 37 | ||
| 38 | struct cgroup_subsys mem_cgroup_subsys; | 38 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
| 39 | static const int MEM_CGROUP_RECLAIM_RETRIES = 5; | 39 | static struct kmem_cache *page_cgroup_cache __read_mostly; |
| 40 | static struct kmem_cache *page_cgroup_cache; | 40 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
| 41 | 41 | ||
| 42 | /* | 42 | /* |
| 43 | * Statistics for memory cgroup. | 43 | * Statistics for memory cgroup. |
| @@ -166,7 +166,6 @@ struct page_cgroup { | |||
| 166 | struct list_head lru; /* per cgroup LRU list */ | 166 | struct list_head lru; /* per cgroup LRU list */ |
| 167 | struct page *page; | 167 | struct page *page; |
| 168 | struct mem_cgroup *mem_cgroup; | 168 | struct mem_cgroup *mem_cgroup; |
| 169 | int ref_cnt; /* cached, mapped, migrating */ | ||
| 170 | int flags; | 169 | int flags; |
| 171 | }; | 170 | }; |
| 172 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ | 171 | #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ |
| @@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc) | |||
| 185 | enum charge_type { | 184 | enum charge_type { |
| 186 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 185 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
| 187 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 186 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
| 187 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | ||
| 188 | }; | 188 | }; |
| 189 | 189 | ||
| 190 | /* | 190 | /* |
| @@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, | |||
| 296 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; | 296 | MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; |
| 297 | 297 | ||
| 298 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); | 298 | mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); |
| 299 | list_del_init(&pc->lru); | 299 | list_del(&pc->lru); |
| 300 | } | 300 | } |
| 301 | 301 | ||
| 302 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, | 302 | static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, |
| @@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active) | |||
| 354 | struct mem_cgroup_per_zone *mz; | 354 | struct mem_cgroup_per_zone *mz; |
| 355 | unsigned long flags; | 355 | unsigned long flags; |
| 356 | 356 | ||
| 357 | if (mem_cgroup_subsys.disabled) | ||
| 358 | return; | ||
| 359 | |||
| 357 | /* | 360 | /* |
| 358 | * We cannot lock_page_cgroup while holding zone's lru_lock, | 361 | * We cannot lock_page_cgroup while holding zone's lru_lock, |
| 359 | * because other holders of lock_page_cgroup can be interrupted | 362 | * because other holders of lock_page_cgroup can be interrupted |
| @@ -524,7 +527,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 524 | * < 0 if the cgroup is over its limit | 527 | * < 0 if the cgroup is over its limit |
| 525 | */ | 528 | */ |
| 526 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | 529 | static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, |
| 527 | gfp_t gfp_mask, enum charge_type ctype) | 530 | gfp_t gfp_mask, enum charge_type ctype, |
| 531 | struct mem_cgroup *memcg) | ||
| 528 | { | 532 | { |
| 529 | struct mem_cgroup *mem; | 533 | struct mem_cgroup *mem; |
| 530 | struct page_cgroup *pc; | 534 | struct page_cgroup *pc; |
| @@ -532,35 +536,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 532 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 536 | unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
| 533 | struct mem_cgroup_per_zone *mz; | 537 | struct mem_cgroup_per_zone *mz; |
| 534 | 538 | ||
| 535 | if (mem_cgroup_subsys.disabled) | 539 | pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); |
| 536 | return 0; | 540 | if (unlikely(pc == NULL)) |
| 537 | |||
| 538 | /* | ||
| 539 | * Should page_cgroup's go to their own slab? | ||
| 540 | * One could optimize the performance of the charging routine | ||
| 541 | * by saving a bit in the page_flags and using it as a lock | ||
| 542 | * to see if the cgroup page already has a page_cgroup associated | ||
| 543 | * with it | ||
| 544 | */ | ||
| 545 | retry: | ||
| 546 | lock_page_cgroup(page); | ||
| 547 | pc = page_get_page_cgroup(page); | ||
| 548 | /* | ||
| 549 | * The page_cgroup exists and | ||
| 550 | * the page has already been accounted. | ||
| 551 | */ | ||
| 552 | if (pc) { | ||
| 553 | VM_BUG_ON(pc->page != page); | ||
| 554 | VM_BUG_ON(pc->ref_cnt <= 0); | ||
| 555 | |||
| 556 | pc->ref_cnt++; | ||
| 557 | unlock_page_cgroup(page); | ||
| 558 | goto done; | ||
| 559 | } | ||
| 560 | unlock_page_cgroup(page); | ||
| 561 | |||
| 562 | pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask); | ||
| 563 | if (pc == NULL) | ||
| 564 | goto err; | 541 | goto err; |
| 565 | 542 | ||
| 566 | /* | 543 | /* |
| @@ -569,16 +546,18 @@ retry: | |||
| 569 | * thread group leader migrates. It's possible that mm is not | 546 | * thread group leader migrates. It's possible that mm is not |
| 570 | * set, if so charge the init_mm (happens for pagecache usage). | 547 | * set, if so charge the init_mm (happens for pagecache usage). |
| 571 | */ | 548 | */ |
| 572 | if (!mm) | 549 | if (likely(!memcg)) { |
| 573 | mm = &init_mm; | 550 | rcu_read_lock(); |
| 574 | 551 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | |
| 575 | rcu_read_lock(); | 552 | /* |
| 576 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 553 | * For every charge from the cgroup, increment reference count |
| 577 | /* | 554 | */ |
| 578 | * For every charge from the cgroup, increment reference count | 555 | css_get(&mem->css); |
| 579 | */ | 556 | rcu_read_unlock(); |
| 580 | css_get(&mem->css); | 557 | } else { |
| 581 | rcu_read_unlock(); | 558 | mem = memcg; |
| 559 | css_get(&memcg->css); | ||
| 560 | } | ||
| 582 | 561 | ||
| 583 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { | 562 | while (res_counter_charge(&mem->res, PAGE_SIZE)) { |
| 584 | if (!(gfp_mask & __GFP_WAIT)) | 563 | if (!(gfp_mask & __GFP_WAIT)) |
| @@ -603,25 +582,24 @@ retry: | |||
| 603 | } | 582 | } |
| 604 | } | 583 | } |
| 605 | 584 | ||
| 606 | pc->ref_cnt = 1; | ||
| 607 | pc->mem_cgroup = mem; | 585 | pc->mem_cgroup = mem; |
| 608 | pc->page = page; | 586 | pc->page = page; |
| 609 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | 587 | /* |
| 588 | * If a page is accounted as a page cache, insert to inactive list. | ||
| 589 | * If anon, insert to active list. | ||
| 590 | */ | ||
| 610 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) | 591 | if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) |
| 611 | pc->flags = PAGE_CGROUP_FLAG_CACHE; | 592 | pc->flags = PAGE_CGROUP_FLAG_CACHE; |
| 593 | else | ||
| 594 | pc->flags = PAGE_CGROUP_FLAG_ACTIVE; | ||
| 612 | 595 | ||
| 613 | lock_page_cgroup(page); | 596 | lock_page_cgroup(page); |
| 614 | if (page_get_page_cgroup(page)) { | 597 | if (unlikely(page_get_page_cgroup(page))) { |
| 615 | unlock_page_cgroup(page); | 598 | unlock_page_cgroup(page); |
| 616 | /* | ||
| 617 | * Another charge has been added to this page already. | ||
| 618 | * We take lock_page_cgroup(page) again and read | ||
| 619 | * page->cgroup, increment refcnt.... just retry is OK. | ||
| 620 | */ | ||
| 621 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 599 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 622 | css_put(&mem->css); | 600 | css_put(&mem->css); |
| 623 | kmem_cache_free(page_cgroup_cache, pc); | 601 | kmem_cache_free(page_cgroup_cache, pc); |
| 624 | goto retry; | 602 | goto done; |
| 625 | } | 603 | } |
| 626 | page_assign_page_cgroup(page, pc); | 604 | page_assign_page_cgroup(page, pc); |
| 627 | 605 | ||
| @@ -642,24 +620,65 @@ err: | |||
| 642 | 620 | ||
| 643 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) | 621 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) |
| 644 | { | 622 | { |
| 623 | if (mem_cgroup_subsys.disabled) | ||
| 624 | return 0; | ||
| 625 | |||
| 626 | /* | ||
| 627 | * If already mapped, we don't have to account. | ||
| 628 | * If page cache, page->mapping has address_space. | ||
| 629 | * But page->mapping may have out-of-use anon_vma pointer, | ||
| 630 | * detect it with a PageAnon() check. A newly-mapped anon page's page->mapping | ||
| 631 | * is NULL. | ||
| 632 | */ | ||
| 633 | if (page_mapped(page) || (page->mapping && !PageAnon(page))) | ||
| 634 | return 0; | ||
| 635 | if (unlikely(!mm)) | ||
| 636 | mm = &init_mm; | ||
| 645 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 637 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
| 646 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 638 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); |
| 647 | } | 639 | } |
| 648 | 640 | ||
| 649 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 641 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
| 650 | gfp_t gfp_mask) | 642 | gfp_t gfp_mask) |
| 651 | { | 643 | { |
| 652 | if (!mm) | 644 | if (mem_cgroup_subsys.disabled) |
| 645 | return 0; | ||
| 646 | |||
| 647 | /* | ||
| 648 | * Corner case handling. This is called from add_to_page_cache() | ||
| 649 | * in the usual case. But some FS (shmem) precharges this page before calling it | ||
| 650 | * and call add_to_page_cache() with GFP_NOWAIT. | ||
| 651 | * | ||
| 652 | * For GFP_NOWAIT case, the page may be pre-charged before calling | ||
| 653 | * add_to_page_cache(). (See shmem.c.) Check it here and avoid charging | ||
| 654 | * twice. (It works, but at a slightly higher cost.) | ||
| 655 | */ | ||
| 656 | if (!(gfp_mask & __GFP_WAIT)) { | ||
| 657 | struct page_cgroup *pc; | ||
| 658 | |||
| 659 | lock_page_cgroup(page); | ||
| 660 | pc = page_get_page_cgroup(page); | ||
| 661 | if (pc) { | ||
| 662 | VM_BUG_ON(pc->page != page); | ||
| 663 | VM_BUG_ON(!pc->mem_cgroup); | ||
| 664 | unlock_page_cgroup(page); | ||
| 665 | return 0; | ||
| 666 | } | ||
| 667 | unlock_page_cgroup(page); | ||
| 668 | } | ||
| 669 | |||
| 670 | if (unlikely(!mm)) | ||
| 653 | mm = &init_mm; | 671 | mm = &init_mm; |
| 672 | |||
| 654 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 673 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
| 655 | MEM_CGROUP_CHARGE_TYPE_CACHE); | 674 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
| 656 | } | 675 | } |
| 657 | 676 | ||
| 658 | /* | 677 | /* |
| 659 | * Uncharging is always a welcome operation, we never complain, simply | 678 | * uncharge if !page_mapped(page) |
| 660 | * uncharge. | ||
| 661 | */ | 679 | */ |
| 662 | void mem_cgroup_uncharge_page(struct page *page) | 680 | static void |
| 681 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | ||
| 663 | { | 682 | { |
| 664 | struct page_cgroup *pc; | 683 | struct page_cgroup *pc; |
| 665 | struct mem_cgroup *mem; | 684 | struct mem_cgroup *mem; |
| @@ -674,98 +693,153 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
| 674 | */ | 693 | */ |
| 675 | lock_page_cgroup(page); | 694 | lock_page_cgroup(page); |
| 676 | pc = page_get_page_cgroup(page); | 695 | pc = page_get_page_cgroup(page); |
| 677 | if (!pc) | 696 | if (unlikely(!pc)) |
| 678 | goto unlock; | 697 | goto unlock; |
| 679 | 698 | ||
| 680 | VM_BUG_ON(pc->page != page); | 699 | VM_BUG_ON(pc->page != page); |
| 681 | VM_BUG_ON(pc->ref_cnt <= 0); | ||
| 682 | 700 | ||
| 683 | if (--(pc->ref_cnt) == 0) { | 701 | if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
| 684 | mz = page_cgroup_zoneinfo(pc); | 702 | && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) |
| 685 | spin_lock_irqsave(&mz->lru_lock, flags); | 703 | || page_mapped(page))) |
| 686 | __mem_cgroup_remove_list(mz, pc); | 704 | goto unlock; |
| 687 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
| 688 | 705 | ||
| 689 | page_assign_page_cgroup(page, NULL); | 706 | mz = page_cgroup_zoneinfo(pc); |
| 690 | unlock_page_cgroup(page); | 707 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 708 | __mem_cgroup_remove_list(mz, pc); | ||
| 709 | spin_unlock_irqrestore(&mz->lru_lock, flags); | ||
| 691 | 710 | ||
| 692 | mem = pc->mem_cgroup; | 711 | page_assign_page_cgroup(page, NULL); |
| 693 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 712 | unlock_page_cgroup(page); |
| 694 | css_put(&mem->css); | ||
| 695 | 713 | ||
| 696 | kmem_cache_free(page_cgroup_cache, pc); | 714 | mem = pc->mem_cgroup; |
| 697 | return; | 715 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 698 | } | 716 | css_put(&mem->css); |
| 699 | 717 | ||
| 718 | kmem_cache_free(page_cgroup_cache, pc); | ||
| 719 | return; | ||
| 700 | unlock: | 720 | unlock: |
| 701 | unlock_page_cgroup(page); | 721 | unlock_page_cgroup(page); |
| 702 | } | 722 | } |
| 703 | 723 | ||
| 724 | void mem_cgroup_uncharge_page(struct page *page) | ||
| 725 | { | ||
| 726 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
| 727 | } | ||
| 728 | |||
| 729 | void mem_cgroup_uncharge_cache_page(struct page *page) | ||
| 730 | { | ||
| 731 | VM_BUG_ON(page_mapped(page)); | ||
| 732 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
| 733 | } | ||
| 734 | |||
| 704 | /* | 735 | /* |
| 705 | * Returns non-zero if a page (under migration) has valid page_cgroup member. | 736 | * Before starting migration, account against new page. |
| 706 | * Refcnt of page_cgroup is incremented. | ||
| 707 | */ | 737 | */ |
| 708 | int mem_cgroup_prepare_migration(struct page *page) | 738 | int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) |
| 709 | { | 739 | { |
| 710 | struct page_cgroup *pc; | 740 | struct page_cgroup *pc; |
| 741 | struct mem_cgroup *mem = NULL; | ||
| 742 | enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
| 743 | int ret = 0; | ||
| 711 | 744 | ||
| 712 | if (mem_cgroup_subsys.disabled) | 745 | if (mem_cgroup_subsys.disabled) |
| 713 | return 0; | 746 | return 0; |
| 714 | 747 | ||
| 715 | lock_page_cgroup(page); | 748 | lock_page_cgroup(page); |
| 716 | pc = page_get_page_cgroup(page); | 749 | pc = page_get_page_cgroup(page); |
| 717 | if (pc) | 750 | if (pc) { |
| 718 | pc->ref_cnt++; | 751 | mem = pc->mem_cgroup; |
| 752 | css_get(&mem->css); | ||
| 753 | if (pc->flags & PAGE_CGROUP_FLAG_CACHE) | ||
| 754 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
| 755 | } | ||
| 719 | unlock_page_cgroup(page); | 756 | unlock_page_cgroup(page); |
| 720 | return pc != NULL; | 757 | if (mem) { |
| 758 | ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, | ||
| 759 | ctype, mem); | ||
| 760 | css_put(&mem->css); | ||
| 761 | } | ||
| 762 | return ret; | ||
| 721 | } | 763 | } |
| 722 | 764 | ||
| 723 | void mem_cgroup_end_migration(struct page *page) | 765 | /* remove the redundant charge if migration failed */ |
| 766 | void mem_cgroup_end_migration(struct page *newpage) | ||
| 724 | { | 767 | { |
| 725 | mem_cgroup_uncharge_page(page); | 768 | /* |
| 769 | * On success, page->mapping is not NULL. | ||
| 770 | * special rollback care is necessary when | ||
| 771 | * 1. at migration failure. (newpage->mapping is cleared in this case) | ||
| 772 | * 2. the newpage was moved but not remapped again because the task | ||
| 773 | * exits and the newpage is obsolete. In this case, the new page | ||
| 774 | * may be a swapcache. So, we just call mem_cgroup_uncharge_page() | ||
| 775 | * always for avoiding mess. The page_cgroup will be removed if | ||
| 776 | * unnecessary. File cache pages are still on the radix tree; don't | ||
| 777 | * worry about them. | ||
| 778 | */ | ||
| 779 | if (!newpage->mapping) | ||
| 780 | __mem_cgroup_uncharge_common(newpage, | ||
| 781 | MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
| 782 | else if (PageAnon(newpage)) | ||
| 783 | mem_cgroup_uncharge_page(newpage); | ||
| 726 | } | 784 | } |
| 727 | 785 | ||
| 728 | /* | 786 | /* |
| 729 | * We know both *page* and *newpage* are now not-on-LRU and PG_locked. | 787 | * A call to try to shrink memory usage under the specified resource controller. |
| 730 | * And no race with uncharge() routines because page_cgroup for *page* | 788 | * This is typically used for page reclaim on behalf of shmem, to reduce the side |
| 731 | * has extra one reference by mem_cgroup_prepare_migration. | 789 | * effects of page allocation from shmem, which some mem_cgroups use. |
| 732 | */ | 790 | */ |
| 733 | void mem_cgroup_page_migration(struct page *page, struct page *newpage) | 791 | int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) |
| 734 | { | 792 | { |
| 735 | struct page_cgroup *pc; | 793 | struct mem_cgroup *mem; |
| 736 | struct mem_cgroup_per_zone *mz; | 794 | int progress = 0; |
| 737 | unsigned long flags; | 795 | int retry = MEM_CGROUP_RECLAIM_RETRIES; |
| 738 | 796 | ||
| 739 | lock_page_cgroup(page); | 797 | if (mem_cgroup_subsys.disabled) |
| 740 | pc = page_get_page_cgroup(page); | 798 | return 0; |
| 741 | if (!pc) { | 799 | if (!mm) |
| 742 | unlock_page_cgroup(page); | 800 | return 0; |
| 743 | return; | ||
| 744 | } | ||
| 745 | 801 | ||
| 746 | mz = page_cgroup_zoneinfo(pc); | 802 | rcu_read_lock(); |
| 747 | spin_lock_irqsave(&mz->lru_lock, flags); | 803 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
| 748 | __mem_cgroup_remove_list(mz, pc); | 804 | css_get(&mem->css); |
| 749 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 805 | rcu_read_unlock(); |
| 750 | 806 | ||
| 751 | page_assign_page_cgroup(page, NULL); | 807 | do { |
| 752 | unlock_page_cgroup(page); | 808 | progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); |
| 809 | } while (!progress && --retry); | ||
| 753 | 810 | ||
| 754 | pc->page = newpage; | 811 | css_put(&mem->css); |
| 755 | lock_page_cgroup(newpage); | 812 | if (!retry) |
| 756 | page_assign_page_cgroup(newpage, pc); | 813 | return -ENOMEM; |
| 814 | return 0; | ||
| 815 | } | ||
| 757 | 816 | ||
| 758 | mz = page_cgroup_zoneinfo(pc); | 817 | int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) |
| 759 | spin_lock_irqsave(&mz->lru_lock, flags); | 818 | { |
| 760 | __mem_cgroup_add_list(mz, pc); | 819 | |
| 761 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 820 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; |
| 821 | int progress; | ||
| 822 | int ret = 0; | ||
| 762 | 823 | ||
| 763 | unlock_page_cgroup(newpage); | 824 | while (res_counter_set_limit(&memcg->res, val)) { |
| 825 | if (signal_pending(current)) { | ||
| 826 | ret = -EINTR; | ||
| 827 | break; | ||
| 828 | } | ||
| 829 | if (!retry_count) { | ||
| 830 | ret = -EBUSY; | ||
| 831 | break; | ||
| 832 | } | ||
| 833 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); | ||
| 834 | if (!progress) | ||
| 835 | retry_count--; | ||
| 836 | } | ||
| 837 | return ret; | ||
| 764 | } | 838 | } |
| 765 | 839 | ||
| 840 | |||
| 766 | /* | 841 | /* |
| 767 | * This routine traverses the page_cgroups in the given list and drops them all. | 842 | * This routine traverses the page_cgroups in the given list and drops them all. |
| 768 | * This routine ignores page_cgroup->ref_cnt. | ||
| 769 | * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. | 843 | * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. |
| 770 | */ | 844 | */ |
| 771 | #define FORCE_UNCHARGE_BATCH (128) | 845 | #define FORCE_UNCHARGE_BATCH (128) |
| @@ -790,12 +864,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
| 790 | page = pc->page; | 864 | page = pc->page; |
| 791 | get_page(page); | 865 | get_page(page); |
| 792 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 866 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
| 793 | mem_cgroup_uncharge_page(page); | 867 | /* |
| 794 | put_page(page); | 868 | * Check if this page is on LRU. !LRU page can be found |
| 795 | if (--count <= 0) { | 869 | * if it's under page migration. |
| 796 | count = FORCE_UNCHARGE_BATCH; | 870 | */ |
| 871 | if (PageLRU(page)) { | ||
| 872 | __mem_cgroup_uncharge_common(page, | ||
| 873 | MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
| 874 | put_page(page); | ||
| 875 | if (--count <= 0) { | ||
| 876 | count = FORCE_UNCHARGE_BATCH; | ||
| 877 | cond_resched(); | ||
| 878 | } | ||
| 879 | } else | ||
| 797 | cond_resched(); | 880 | cond_resched(); |
| 798 | } | ||
| 799 | spin_lock_irqsave(&mz->lru_lock, flags); | 881 | spin_lock_irqsave(&mz->lru_lock, flags); |
| 800 | } | 882 | } |
| 801 | spin_unlock_irqrestore(&mz->lru_lock, flags); | 883 | spin_unlock_irqrestore(&mz->lru_lock, flags); |
| @@ -810,9 +892,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) | |||
| 810 | int ret = -EBUSY; | 892 | int ret = -EBUSY; |
| 811 | int node, zid; | 893 | int node, zid; |
| 812 | 894 | ||
| 813 | if (mem_cgroup_subsys.disabled) | ||
| 814 | return 0; | ||
| 815 | |||
| 816 | css_get(&mem->css); | 895 | css_get(&mem->css); |
| 817 | /* | 896 | /* |
| 818 | * page reclaim code (kswapd etc..) will move pages between | 897 | * page reclaim code (kswapd etc..) will move pages between |
| @@ -838,32 +917,34 @@ out: | |||
| 838 | return ret; | 917 | return ret; |
| 839 | } | 918 | } |
| 840 | 919 | ||
| 841 | static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp) | ||
| 842 | { | ||
| 843 | *tmp = memparse(buf, &buf); | ||
| 844 | if (*buf != '\0') | ||
| 845 | return -EINVAL; | ||
| 846 | |||
| 847 | /* | ||
| 848 | * Round up the value to the closest page size | ||
| 849 | */ | ||
| 850 | *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT; | ||
| 851 | return 0; | ||
| 852 | } | ||
| 853 | |||
| 854 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 920 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
| 855 | { | 921 | { |
| 856 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, | 922 | return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, |
| 857 | cft->private); | 923 | cft->private); |
| 858 | } | 924 | } |
| 859 | 925 | /* | |
| 860 | static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | 926 | * The user of this function is... |
| 861 | struct file *file, const char __user *userbuf, | 927 | * RES_LIMIT. |
| 862 | size_t nbytes, loff_t *ppos) | 928 | */ |
| 929 | static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | ||
| 930 | const char *buffer) | ||
| 863 | { | 931 | { |
| 864 | return res_counter_write(&mem_cgroup_from_cont(cont)->res, | 932 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
| 865 | cft->private, userbuf, nbytes, ppos, | 933 | unsigned long long val; |
| 866 | mem_cgroup_write_strategy); | 934 | int ret; |
| 935 | |||
| 936 | switch (cft->private) { | ||
| 937 | case RES_LIMIT: | ||
| 938 | /* This function does all necessary parse...reuse it */ | ||
| 939 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
| 940 | if (!ret) | ||
| 941 | ret = mem_cgroup_resize_limit(memcg, val); | ||
| 942 | break; | ||
| 943 | default: | ||
| 944 | ret = -EINVAL; /* should be BUG() ? */ | ||
| 945 | break; | ||
| 946 | } | ||
| 947 | return ret; | ||
| 867 | } | 948 | } |
| 868 | 949 | ||
| 869 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 950 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
| @@ -940,7 +1021,7 @@ static struct cftype mem_cgroup_files[] = { | |||
| 940 | { | 1021 | { |
| 941 | .name = "limit_in_bytes", | 1022 | .name = "limit_in_bytes", |
| 942 | .private = RES_LIMIT, | 1023 | .private = RES_LIMIT, |
| 943 | .write = mem_cgroup_write, | 1024 | .write_string = mem_cgroup_write, |
| 944 | .read_u64 = mem_cgroup_read, | 1025 | .read_u64 = mem_cgroup_read, |
| 945 | }, | 1026 | }, |
| 946 | { | 1027 | { |
| @@ -1070,8 +1151,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss, | |||
| 1070 | static int mem_cgroup_populate(struct cgroup_subsys *ss, | 1151 | static int mem_cgroup_populate(struct cgroup_subsys *ss, |
| 1071 | struct cgroup *cont) | 1152 | struct cgroup *cont) |
| 1072 | { | 1153 | { |
| 1073 | if (mem_cgroup_subsys.disabled) | ||
| 1074 | return 0; | ||
| 1075 | return cgroup_add_files(cont, ss, mem_cgroup_files, | 1154 | return cgroup_add_files(cont, ss, mem_cgroup_files, |
| 1076 | ARRAY_SIZE(mem_cgroup_files)); | 1155 | ARRAY_SIZE(mem_cgroup_files)); |
| 1077 | } | 1156 | } |
| @@ -1084,9 +1163,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
| 1084 | struct mm_struct *mm; | 1163 | struct mm_struct *mm; |
| 1085 | struct mem_cgroup *mem, *old_mem; | 1164 | struct mem_cgroup *mem, *old_mem; |
| 1086 | 1165 | ||
| 1087 | if (mem_cgroup_subsys.disabled) | ||
| 1088 | return; | ||
| 1089 | |||
| 1090 | mm = get_task_mm(p); | 1166 | mm = get_task_mm(p); |
| 1091 | if (mm == NULL) | 1167 | if (mm == NULL) |
| 1092 | return; | 1168 | return; |
| @@ -1094,9 +1170,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
| 1094 | mem = mem_cgroup_from_cont(cont); | 1170 | mem = mem_cgroup_from_cont(cont); |
| 1095 | old_mem = mem_cgroup_from_cont(old_cont); | 1171 | old_mem = mem_cgroup_from_cont(old_cont); |
| 1096 | 1172 | ||
| 1097 | if (mem == old_mem) | ||
| 1098 | goto out; | ||
| 1099 | |||
| 1100 | /* | 1173 | /* |
| 1101 | * Only thread group leaders are allowed to migrate, the mm_struct is | 1174 | * Only thread group leaders are allowed to migrate, the mm_struct is |
| 1102 | * in effect owned by the leader | 1175 | * in effect owned by the leader |
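Among the memcontrol.c changes above, writes to limit_in_bytes now go through mem_cgroup_resize_limit(), which keeps retrying res_counter_set_limit() and reclaims from the group whenever the new limit is still below current usage, giving up with -EBUSY once MEM_CGROUP_RECLAIM_RETRIES reclaim passes have made no progress. The sketch below is a user-space model of that loop only; it omits the signal_pending()/-EINTR exit, and the counter struct and toy reclaim function are stand-ins, not the res_counter or reclaim APIs.

#include <errno.h>
#include <stdio.h>

#define MEM_CGROUP_RECLAIM_RETRIES 5

struct counter { unsigned long long usage, limit; };	/* toy resource counter */

/* Lowering the limit "fails" while current usage still exceeds it. */
static int counter_set_limit(struct counter *c, unsigned long long val)
{
	if (c->usage > val)
		return -EBUSY;
	c->limit = val;
	return 0;
}

/* Pretend reclaim: frees up to 16 pages and reports how many were freed. */
static int try_to_free_pages(struct counter *c)
{
	unsigned long long freed = c->usage >= 16 ? 16 : c->usage;

	c->usage -= freed;
	return (int)freed;
}

static int resize_limit(struct counter *c, unsigned long long val)
{
	int retry_count = MEM_CGROUP_RECLAIM_RETRIES;

	while (counter_set_limit(c, val)) {
		if (!retry_count)
			return -EBUSY;
		if (!try_to_free_pages(c))	/* a retry is only consumed   */
			retry_count--;		/* when reclaim makes no gain */
	}
	return 0;
}

int main(void)
{
	struct counter c = { .usage = 40, .limit = ~0ULL };
	int ret = resize_limit(&c, 32);

	printf("resize to 32 -> %d, usage now %llu\n", ret, c.usage);
	return 0;
}

mem_cgroup_shrink_usage() in the same hunk follows the same bounded-retry idea: it retries try_to_free_mem_cgroup_pages() until reclaim makes progress and returns -ENOMEM after MEM_CGROUP_RECLAIM_RETRIES attempts without any.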
diff --git a/mm/memory.c b/mm/memory.c index 2302d228fe04..1002f473f497 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -51,6 +51,7 @@ | |||
| 51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
| 52 | #include <linux/writeback.h> | 52 | #include <linux/writeback.h> |
| 53 | #include <linux/memcontrol.h> | 53 | #include <linux/memcontrol.h> |
| 54 | #include <linux/mmu_notifier.h> | ||
| 54 | 55 | ||
| 55 | #include <asm/pgalloc.h> | 56 | #include <asm/pgalloc.h> |
| 56 | #include <asm/uaccess.h> | 57 | #include <asm/uaccess.h> |
| @@ -61,6 +62,8 @@ | |||
| 61 | #include <linux/swapops.h> | 62 | #include <linux/swapops.h> |
| 62 | #include <linux/elf.h> | 63 | #include <linux/elf.h> |
| 63 | 64 | ||
| 65 | #include "internal.h" | ||
| 66 | |||
| 64 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 67 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 65 | /* use the per-pgdat data instead for discontigmem - mbligh */ | 68 | /* use the per-pgdat data instead for discontigmem - mbligh */ |
| 66 | unsigned long max_mapnr; | 69 | unsigned long max_mapnr; |
| @@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
| 211 | * | 214 | * |
| 212 | * Must be called with pagetable lock held. | 215 | * Must be called with pagetable lock held. |
| 213 | */ | 216 | */ |
| 214 | void free_pgd_range(struct mmu_gather **tlb, | 217 | void free_pgd_range(struct mmu_gather *tlb, |
| 215 | unsigned long addr, unsigned long end, | 218 | unsigned long addr, unsigned long end, |
| 216 | unsigned long floor, unsigned long ceiling) | 219 | unsigned long floor, unsigned long ceiling) |
| 217 | { | 220 | { |
| @@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb, | |||
| 262 | return; | 265 | return; |
| 263 | 266 | ||
| 264 | start = addr; | 267 | start = addr; |
| 265 | pgd = pgd_offset((*tlb)->mm, addr); | 268 | pgd = pgd_offset(tlb->mm, addr); |
| 266 | do { | 269 | do { |
| 267 | next = pgd_addr_end(addr, end); | 270 | next = pgd_addr_end(addr, end); |
| 268 | if (pgd_none_or_clear_bad(pgd)) | 271 | if (pgd_none_or_clear_bad(pgd)) |
| 269 | continue; | 272 | continue; |
| 270 | free_pud_range(*tlb, pgd, addr, next, floor, ceiling); | 273 | free_pud_range(tlb, pgd, addr, next, floor, ceiling); |
| 271 | } while (pgd++, addr = next, addr != end); | 274 | } while (pgd++, addr = next, addr != end); |
| 272 | } | 275 | } |
| 273 | 276 | ||
| 274 | void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, | 277 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, |
| 275 | unsigned long floor, unsigned long ceiling) | 278 | unsigned long floor, unsigned long ceiling) |
| 276 | { | 279 | { |
| 277 | while (vma) { | 280 | while (vma) { |
| @@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | |||
| 372 | * | 375 | * |
| 373 | * The calling function must still handle the error. | 376 | * The calling function must still handle the error. |
| 374 | */ | 377 | */ |
| 375 | void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) | 378 | static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, |
| 379 | unsigned long vaddr) | ||
| 376 | { | 380 | { |
| 377 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " | 381 | printk(KERN_ERR "Bad pte = %08llx, process = %s, " |
| 378 | "vm_flags = %lx, vaddr = %lx\n", | 382 | "vm_flags = %lx, vaddr = %lx\n", |
| @@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 649 | unsigned long next; | 653 | unsigned long next; |
| 650 | unsigned long addr = vma->vm_start; | 654 | unsigned long addr = vma->vm_start; |
| 651 | unsigned long end = vma->vm_end; | 655 | unsigned long end = vma->vm_end; |
| 656 | int ret; | ||
| 652 | 657 | ||
| 653 | /* | 658 | /* |
| 654 | * Don't copy ptes where a page fault will fill them correctly. | 659 | * Don't copy ptes where a page fault will fill them correctly. |
| @@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 664 | if (is_vm_hugetlb_page(vma)) | 669 | if (is_vm_hugetlb_page(vma)) |
| 665 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); | 670 | return copy_hugetlb_page_range(dst_mm, src_mm, vma); |
| 666 | 671 | ||
| 672 | /* | ||
| 673 | * We need to invalidate the secondary MMU mappings only when | ||
| 674 | * there could be a permission downgrade on the ptes of the | ||
| 675 | * parent mm. And a permission downgrade will only happen if | ||
| 676 | * is_cow_mapping() returns true. | ||
| 677 | */ | ||
| 678 | if (is_cow_mapping(vma->vm_flags)) | ||
| 679 | mmu_notifier_invalidate_range_start(src_mm, addr, end); | ||
| 680 | |||
| 681 | ret = 0; | ||
| 667 | dst_pgd = pgd_offset(dst_mm, addr); | 682 | dst_pgd = pgd_offset(dst_mm, addr); |
| 668 | src_pgd = pgd_offset(src_mm, addr); | 683 | src_pgd = pgd_offset(src_mm, addr); |
| 669 | do { | 684 | do { |
| 670 | next = pgd_addr_end(addr, end); | 685 | next = pgd_addr_end(addr, end); |
| 671 | if (pgd_none_or_clear_bad(src_pgd)) | 686 | if (pgd_none_or_clear_bad(src_pgd)) |
| 672 | continue; | 687 | continue; |
| 673 | if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, | 688 | if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, |
| 674 | vma, addr, next)) | 689 | vma, addr, next))) { |
| 675 | return -ENOMEM; | 690 | ret = -ENOMEM; |
| 691 | break; | ||
| 692 | } | ||
| 676 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); | 693 | } while (dst_pgd++, src_pgd++, addr = next, addr != end); |
| 677 | return 0; | 694 | |
| 695 | if (is_cow_mapping(vma->vm_flags)) | ||
| 696 | mmu_notifier_invalidate_range_end(src_mm, | ||
| 697 | vma->vm_start, end); | ||
| 698 | return ret; | ||
| 678 | } | 699 | } |
| 679 | 700 | ||
| 680 | static unsigned long zap_pte_range(struct mmu_gather *tlb, | 701 | static unsigned long zap_pte_range(struct mmu_gather *tlb, |
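The mmu_notifier_invalidate_range_start/end pair added to copy_page_range() brackets the COW permission downgrade so that a secondary MMU (for example a KVM-style shadow pagetable) can tear down its mappings before the parent's ptes are write-protected. A rough, hypothetical sketch of the consumer side follows; the foo_* names are invented, and only the mmu_notifier_ops callbacks themselves come from the API this series introduces.

    /* Hypothetical notifier consumer; the real API lives in linux/mmu_notifier.h. */
    static void foo_invalidate_range_start(struct mmu_notifier *mn,
                                           struct mm_struct *mm,
                                           unsigned long start, unsigned long end)
    {
            /* Drop or write-protect secondary-MMU mappings covering [start, end). */
    }

    static void foo_invalidate_range_end(struct mmu_notifier *mn,
                                         struct mm_struct *mm,
                                         unsigned long start, unsigned long end)
    {
            /* Primary ptes are stable again; secondary faults may repopulate. */
    }

    static const struct mmu_notifier_ops foo_mmu_notifier_ops = {
            .invalidate_range_start = foo_invalidate_range_start,
            .invalidate_range_end   = foo_invalidate_range_end,
    };

Such a notifier is attached with mmu_notifier_register(&notifier, mm); that registration path is what mm_take_all_locks(), added later in this series, exists to support.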
| @@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
| 878 | unsigned long start = start_addr; | 899 | unsigned long start = start_addr; |
| 879 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; | 900 | spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; |
| 880 | int fullmm = (*tlbp)->fullmm; | 901 | int fullmm = (*tlbp)->fullmm; |
| 902 | struct mm_struct *mm = vma->vm_mm; | ||
| 881 | 903 | ||
| 904 | mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); | ||
| 882 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { | 905 | for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { |
| 883 | unsigned long end; | 906 | unsigned long end; |
| 884 | 907 | ||
| @@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
| 899 | } | 922 | } |
| 900 | 923 | ||
| 901 | if (unlikely(is_vm_hugetlb_page(vma))) { | 924 | if (unlikely(is_vm_hugetlb_page(vma))) { |
| 902 | unmap_hugepage_range(vma, start, end); | 925 | /* |
| 903 | zap_work -= (end - start) / | 926 | * It is undesirable to test vma->vm_file as it |
| 904 | (HPAGE_SIZE / PAGE_SIZE); | 927 | * should be non-NULL for a valid hugetlb area. |
| 928 | * However, vm_file will be NULL in the error | ||
| 929 | * cleanup path of do_mmap_pgoff. When | ||
| 930 | * hugetlbfs ->mmap method fails, | ||
| 931 | * do_mmap_pgoff() nullifies vma->vm_file | ||
| 932 | * before calling this function to clean up. | ||
| 933 | * Since no pte has actually been setup, it is | ||
| 934 | * safe to do nothing in this case. | ||
| 935 | */ | ||
| 936 | if (vma->vm_file) { | ||
| 937 | unmap_hugepage_range(vma, start, end, NULL); | ||
| 938 | zap_work -= (end - start) / | ||
| 939 | pages_per_huge_page(hstate_vma(vma)); | ||
| 940 | } | ||
| 941 | |||
| 905 | start = end; | 942 | start = end; |
| 906 | } else | 943 | } else |
| 907 | start = unmap_page_range(*tlbp, vma, | 944 | start = unmap_page_range(*tlbp, vma, |
| @@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp, | |||
| 929 | } | 966 | } |
| 930 | } | 967 | } |
| 931 | out: | 968 | out: |
| 969 | mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); | ||
| 932 | return start; /* which is now the end (or restart) address */ | 970 | return start; /* which is now the end (or restart) address */ |
| 933 | } | 971 | } |
| 934 | 972 | ||
| @@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, | |||
| 956 | return end; | 994 | return end; |
| 957 | } | 995 | } |
| 958 | 996 | ||
| 997 | /** | ||
| 998 | * zap_vma_ptes - remove ptes mapping the vma | ||
| 999 | * @vma: vm_area_struct holding ptes to be zapped | ||
| 1000 | * @address: starting address of pages to zap | ||
| 1001 | * @size: number of bytes to zap | ||
| 1002 | * | ||
| 1003 | * This function only unmaps ptes assigned to VM_PFNMAP vmas. | ||
| 1004 | * | ||
| 1005 | * The entire address range must be fully contained within the vma. | ||
| 1006 | * | ||
| 1007 | * Returns 0 if successful. | ||
| 1008 | */ | ||
| 1009 | int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, | ||
| 1010 | unsigned long size) | ||
| 1011 | { | ||
| 1012 | if (address < vma->vm_start || address + size > vma->vm_end || | ||
| 1013 | !(vma->vm_flags & VM_PFNMAP)) | ||
| 1014 | return -1; | ||
| 1015 | zap_page_range(vma, address, size, NULL); | ||
| 1016 | return 0; | ||
| 1017 | } | ||
| 1018 | EXPORT_SYMBOL_GPL(zap_vma_ptes); | ||
| 1019 | |||
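zap_vma_ptes() gives drivers that manage VM_PFNMAP mappings a sanctioned way to tear down the ptes they previously inserted, for instance when revoking access to a device BAR. A minimal sketch of such a caller; the my_dev naming and the revoke path are assumptions, not part of this patch.

    /* Sketch: revoke a PFN mapping a driver set up with remap_pfn_range(). */
    static void my_dev_revoke_mapping(struct vm_area_struct *vma)
    {
            if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
                    printk(KERN_WARNING "my_dev: vma is not a PFN mapping\n");
            /* Subsequent faults on the range will no longer find a pte. */
    }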
| 959 | /* | 1020 | /* |
| 960 | * Do a quick page-table lookup for a single page. | 1021 | * Do a quick page-table lookup for a single page. |
| 961 | */ | 1022 | */ |
| @@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 982 | goto no_page_table; | 1043 | goto no_page_table; |
| 983 | 1044 | ||
| 984 | pud = pud_offset(pgd, address); | 1045 | pud = pud_offset(pgd, address); |
| 985 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | 1046 | if (pud_none(*pud)) |
| 986 | goto no_page_table; | 1047 | goto no_page_table; |
| 987 | 1048 | if (pud_huge(*pud)) { | |
| 1049 | BUG_ON(flags & FOLL_GET); | ||
| 1050 | page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); | ||
| 1051 | goto out; | ||
| 1052 | } | ||
| 1053 | if (unlikely(pud_bad(*pud))) | ||
| 1054 | goto no_page_table; | ||
| 1055 | |||
| 988 | pmd = pmd_offset(pud, address); | 1056 | pmd = pmd_offset(pud, address); |
| 989 | if (pmd_none(*pmd)) | 1057 | if (pmd_none(*pmd)) |
| 990 | goto no_page_table; | 1058 | goto no_page_table; |
| 991 | |||
| 992 | if (pmd_huge(*pmd)) { | 1059 | if (pmd_huge(*pmd)) { |
| 993 | BUG_ON(flags & FOLL_GET); | 1060 | BUG_ON(flags & FOLL_GET); |
| 994 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); | 1061 | page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); |
| 995 | goto out; | 1062 | goto out; |
| 996 | } | 1063 | } |
| 997 | |||
| 998 | if (unlikely(pmd_bad(*pmd))) | 1064 | if (unlikely(pmd_bad(*pmd))) |
| 999 | goto no_page_table; | 1065 | goto no_page_table; |
| 1000 | 1066 | ||
| @@ -1058,11 +1124,9 @@ static inline int use_zero_page(struct vm_area_struct *vma) | |||
| 1058 | if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) | 1124 | if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) |
| 1059 | return 0; | 1125 | return 0; |
| 1060 | /* | 1126 | /* |
| 1061 | * And if we have a fault or a nopfn routine, it's not an | 1127 | * And if we have a fault routine, it's not an anonymous region. |
| 1062 | * anonymous region. | ||
| 1063 | */ | 1128 | */ |
| 1064 | return !vma->vm_ops || | 1129 | return !vma->vm_ops || !vma->vm_ops->fault; |
| 1065 | (!vma->vm_ops->fault && !vma->vm_ops->nopfn); | ||
| 1066 | } | 1130 | } |
| 1067 | 1131 | ||
| 1068 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1132 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| @@ -1338,6 +1402,11 @@ out: | |||
| 1338 | * | 1402 | * |
| 1339 | * This function should only be called from a vm_ops->fault handler, and | 1403 | * This function should only be called from a vm_ops->fault handler, and |
| 1340 | * in that case the handler should return NULL. | 1404 | * in that case the handler should return NULL. |
| 1405 | * | ||
| 1406 | * vma cannot be a COW mapping. | ||
| 1407 | * | ||
| 1408 | * As this is called only for pages that do not currently exist, we | ||
| 1409 | * do not need to flush old virtual caches or the TLB. | ||
| 1341 | */ | 1410 | */ |
| 1342 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1411 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
| 1343 | unsigned long pfn) | 1412 | unsigned long pfn) |
| @@ -1548,6 +1617,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, | |||
| 1548 | unsigned long next; | 1617 | unsigned long next; |
| 1549 | int err; | 1618 | int err; |
| 1550 | 1619 | ||
| 1620 | BUG_ON(pud_huge(*pud)); | ||
| 1621 | |||
| 1551 | pmd = pmd_alloc(mm, pud, addr); | 1622 | pmd = pmd_alloc(mm, pud, addr); |
| 1552 | if (!pmd) | 1623 | if (!pmd) |
| 1553 | return -ENOMEM; | 1624 | return -ENOMEM; |
| @@ -1589,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
| 1589 | { | 1660 | { |
| 1590 | pgd_t *pgd; | 1661 | pgd_t *pgd; |
| 1591 | unsigned long next; | 1662 | unsigned long next; |
| 1592 | unsigned long end = addr + size; | 1663 | unsigned long start = addr, end = addr + size; |
| 1593 | int err; | 1664 | int err; |
| 1594 | 1665 | ||
| 1595 | BUG_ON(addr >= end); | 1666 | BUG_ON(addr >= end); |
| 1667 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
| 1596 | pgd = pgd_offset(mm, addr); | 1668 | pgd = pgd_offset(mm, addr); |
| 1597 | do { | 1669 | do { |
| 1598 | next = pgd_addr_end(addr, end); | 1670 | next = pgd_addr_end(addr, end); |
| @@ -1600,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, | |||
| 1600 | if (err) | 1672 | if (err) |
| 1601 | break; | 1673 | break; |
| 1602 | } while (pgd++, addr = next, addr != end); | 1674 | } while (pgd++, addr = next, addr != end); |
| 1675 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
| 1603 | return err; | 1676 | return err; |
| 1604 | } | 1677 | } |
| 1605 | EXPORT_SYMBOL_GPL(apply_to_page_range); | 1678 | EXPORT_SYMBOL_GPL(apply_to_page_range); |
| @@ -1716,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1716 | * not dirty accountable. | 1789 | * not dirty accountable. |
| 1717 | */ | 1790 | */ |
| 1718 | if (PageAnon(old_page)) { | 1791 | if (PageAnon(old_page)) { |
| 1719 | if (!TestSetPageLocked(old_page)) { | 1792 | if (trylock_page(old_page)) { |
| 1720 | reuse = can_share_swap_page(old_page); | 1793 | reuse = can_share_swap_page(old_page); |
| 1721 | unlock_page(old_page); | 1794 | unlock_page(old_page); |
| 1722 | } | 1795 | } |
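Note that the trylock_page() conversions in this series flip the sense of the test: TestSetPageLocked() returned non-zero when the page was already locked, whereas trylock_page() returns non-zero when the lock was acquired. Schematically, using the do_wp_page() lines above (illustration only):

    /* Old form: TestSetPageLocked() == 0 means we took the lock. */
    if (!TestSetPageLocked(old_page)) {
            reuse = can_share_swap_page(old_page);
            unlock_page(old_page);
    }

    /* New form: trylock_page() != 0 means we took the lock. */
    if (trylock_page(old_page)) {
            reuse = can_share_swap_page(old_page);
            unlock_page(old_page);
    }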
| @@ -1812,7 +1885,7 @@ gotten: | |||
| 1812 | * seen in the presence of one thread doing SMC and another | 1885 | * seen in the presence of one thread doing SMC and another |
| 1813 | * thread doing COW. | 1886 | * thread doing COW. |
| 1814 | */ | 1887 | */ |
| 1815 | ptep_clear_flush(vma, address, page_table); | 1888 | ptep_clear_flush_notify(vma, address, page_table); |
| 1816 | set_pte_at(mm, address, page_table, entry); | 1889 | set_pte_at(mm, address, page_table, entry); |
| 1817 | update_mmu_cache(vma, address, entry); | 1890 | update_mmu_cache(vma, address, entry); |
| 1818 | lru_cache_add_active(new_page); | 1891 | lru_cache_add_active(new_page); |
| @@ -2501,59 +2574,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2501 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2574 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
| 2502 | } | 2575 | } |
| 2503 | 2576 | ||
| 2504 | |||
| 2505 | /* | ||
| 2506 | * do_no_pfn() tries to create a new page mapping for a page without | ||
| 2507 | * a struct_page backing it | ||
| 2508 | * | ||
| 2509 | * As this is called only for pages that do not currently exist, we | ||
| 2510 | * do not need to flush old virtual caches or the TLB. | ||
| 2511 | * | ||
| 2512 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 2513 | * but allow concurrent faults), and pte mapped but not yet locked. | ||
| 2514 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 2515 | * | ||
| 2516 | * It is expected that the ->nopfn handler always returns the same pfn | ||
| 2517 | * for a given virtual mapping. | ||
| 2518 | * | ||
| 2519 | * Mark this `noinline' to prevent it from bloating the main pagefault code. | ||
| 2520 | */ | ||
| 2521 | static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2522 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 2523 | int write_access) | ||
| 2524 | { | ||
| 2525 | spinlock_t *ptl; | ||
| 2526 | pte_t entry; | ||
| 2527 | unsigned long pfn; | ||
| 2528 | |||
| 2529 | pte_unmap(page_table); | ||
| 2530 | BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); | ||
| 2531 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); | ||
| 2532 | |||
| 2533 | pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); | ||
| 2534 | |||
| 2535 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); | ||
| 2536 | |||
| 2537 | if (unlikely(pfn == NOPFN_OOM)) | ||
| 2538 | return VM_FAULT_OOM; | ||
| 2539 | else if (unlikely(pfn == NOPFN_SIGBUS)) | ||
| 2540 | return VM_FAULT_SIGBUS; | ||
| 2541 | else if (unlikely(pfn == NOPFN_REFAULT)) | ||
| 2542 | return 0; | ||
| 2543 | |||
| 2544 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2545 | |||
| 2546 | /* Only go through if we didn't race with anybody else... */ | ||
| 2547 | if (pte_none(*page_table)) { | ||
| 2548 | entry = pfn_pte(pfn, vma->vm_page_prot); | ||
| 2549 | if (write_access) | ||
| 2550 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 2551 | set_pte_at(mm, address, page_table, entry); | ||
| 2552 | } | ||
| 2553 | pte_unmap_unlock(page_table, ptl); | ||
| 2554 | return 0; | ||
| 2555 | } | ||
| 2556 | |||
| 2557 | /* | 2577 | /* |
| 2558 | * Fault of a previously existing named mapping. Repopulate the pte | 2578 | * Fault of a previously existing named mapping. Repopulate the pte |
| 2559 | * from the encoded file_pte if possible. This enables swappable | 2579 | * from the encoded file_pte if possible. This enables swappable |
| @@ -2614,9 +2634,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2614 | if (likely(vma->vm_ops->fault)) | 2634 | if (likely(vma->vm_ops->fault)) |
| 2615 | return do_linear_fault(mm, vma, address, | 2635 | return do_linear_fault(mm, vma, address, |
| 2616 | pte, pmd, write_access, entry); | 2636 | pte, pmd, write_access, entry); |
| 2617 | if (unlikely(vma->vm_ops->nopfn)) | ||
| 2618 | return do_no_pfn(mm, vma, address, pte, | ||
| 2619 | pmd, write_access); | ||
| 2620 | } | 2637 | } |
| 2621 | return do_anonymous_page(mm, vma, address, | 2638 | return do_anonymous_page(mm, vma, address, |
| 2622 | pte, pmd, write_access); | 2639 | pte, pmd, write_access); |
| @@ -2748,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end) | |||
| 2748 | 2765 | ||
| 2749 | vma = find_vma(current->mm, addr); | 2766 | vma = find_vma(current->mm, addr); |
| 2750 | if (!vma) | 2767 | if (!vma) |
| 2751 | return -1; | 2768 | return -ENOMEM; |
| 2752 | write = (vma->vm_flags & VM_WRITE) != 0; | 2769 | write = (vma->vm_flags & VM_WRITE) != 0; |
| 2753 | BUG_ON(addr >= end); | 2770 | BUG_ON(addr >= end); |
| 2754 | BUG_ON(end > vma->vm_end); | 2771 | BUG_ON(end > vma->vm_end); |
| 2755 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; | 2772 | len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; |
| 2756 | ret = get_user_pages(current, current->mm, addr, | 2773 | ret = get_user_pages(current, current->mm, addr, |
| 2757 | len, write, 0, NULL, NULL); | 2774 | len, write, 0, NULL, NULL); |
| 2758 | if (ret < 0) | 2775 | if (ret < 0) { |
| 2776 | /* | ||
| 2777 | SUS requires mlock to return somewhat unusual error values: | ||
| 2778 | - an invalid address should generate ENOMEM. | ||
| 2779 | - running out of memory should generate EAGAIN. | ||
| 2780 | */ | ||
| 2781 | if (ret == -EFAULT) | ||
| 2782 | ret = -ENOMEM; | ||
| 2783 | else if (ret == -ENOMEM) | ||
| 2784 | ret = -EAGAIN; | ||
| 2759 | return ret; | 2785 | return ret; |
| 2760 | return ret == len ? 0 : -1; | 2786 | } |
| 2787 | return ret == len ? 0 : -ENOMEM; | ||
| 2761 | } | 2788 | } |
| 2762 | 2789 | ||
| 2763 | #if !defined(__HAVE_ARCH_GATE_AREA) | 2790 | #if !defined(__HAVE_ARCH_GATE_AREA) |
| @@ -2804,6 +2831,86 @@ int in_gate_area_no_task(unsigned long addr) | |||
| 2804 | 2831 | ||
| 2805 | #endif /* __HAVE_ARCH_GATE_AREA */ | 2832 | #endif /* __HAVE_ARCH_GATE_AREA */ |
| 2806 | 2833 | ||
| 2834 | #ifdef CONFIG_HAVE_IOREMAP_PROT | ||
| 2835 | static resource_size_t follow_phys(struct vm_area_struct *vma, | ||
| 2836 | unsigned long address, unsigned int flags, | ||
| 2837 | unsigned long *prot) | ||
| 2838 | { | ||
| 2839 | pgd_t *pgd; | ||
| 2840 | pud_t *pud; | ||
| 2841 | pmd_t *pmd; | ||
| 2842 | pte_t *ptep, pte; | ||
| 2843 | spinlock_t *ptl; | ||
| 2844 | resource_size_t phys_addr = 0; | ||
| 2845 | struct mm_struct *mm = vma->vm_mm; | ||
| 2846 | |||
| 2847 | VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP))); | ||
| 2848 | |||
| 2849 | pgd = pgd_offset(mm, address); | ||
| 2850 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | ||
| 2851 | goto no_page_table; | ||
| 2852 | |||
| 2853 | pud = pud_offset(pgd, address); | ||
| 2854 | if (pud_none(*pud) || unlikely(pud_bad(*pud))) | ||
| 2855 | goto no_page_table; | ||
| 2856 | |||
| 2857 | pmd = pmd_offset(pud, address); | ||
| 2858 | if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) | ||
| 2859 | goto no_page_table; | ||
| 2860 | |||
| 2861 | /* We cannot handle huge page PFN maps. Luckily they don't exist. */ | ||
| 2862 | if (pmd_huge(*pmd)) | ||
| 2863 | goto no_page_table; | ||
| 2864 | |||
| 2865 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2866 | if (!ptep) | ||
| 2867 | goto out; | ||
| 2868 | |||
| 2869 | pte = *ptep; | ||
| 2870 | if (!pte_present(pte)) | ||
| 2871 | goto unlock; | ||
| 2872 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | ||
| 2873 | goto unlock; | ||
| 2874 | phys_addr = pte_pfn(pte); | ||
| 2875 | phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ | ||
| 2876 | |||
| 2877 | *prot = pgprot_val(pte_pgprot(pte)); | ||
| 2878 | |||
| 2879 | unlock: | ||
| 2880 | pte_unmap_unlock(ptep, ptl); | ||
| 2881 | out: | ||
| 2882 | return phys_addr; | ||
| 2883 | no_page_table: | ||
| 2884 | return 0; | ||
| 2885 | } | ||
| 2886 | |||
| 2887 | int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, | ||
| 2888 | void *buf, int len, int write) | ||
| 2889 | { | ||
| 2890 | resource_size_t phys_addr; | ||
| 2891 | unsigned long prot = 0; | ||
| 2892 | void *maddr; | ||
| 2893 | int offset = addr & (PAGE_SIZE-1); | ||
| 2894 | |||
| 2895 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
| 2896 | return -EINVAL; | ||
| 2897 | |||
| 2898 | phys_addr = follow_phys(vma, addr, write, &prot); | ||
| 2899 | |||
| 2900 | if (!phys_addr) | ||
| 2901 | return -EINVAL; | ||
| 2902 | |||
| 2903 | maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); | ||
| 2904 | if (write) | ||
| 2905 | memcpy_toio(maddr + offset, buf, len); | ||
| 2906 | else | ||
| 2907 | memcpy_fromio(buf, maddr + offset, len); | ||
| 2908 | iounmap(maddr); | ||
| 2909 | |||
| 2910 | return len; | ||
| 2911 | } | ||
| 2912 | #endif | ||
| 2913 | |||
| 2807 | /* | 2914 | /* |
| 2808 | * Access another process' address space. | 2915 | * Access another process' address space. |
| 2809 | * Source/target buffer must be kernel space, | 2916 | * Source/target buffer must be kernel space, |
| @@ -2813,7 +2920,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
| 2813 | { | 2920 | { |
| 2814 | struct mm_struct *mm; | 2921 | struct mm_struct *mm; |
| 2815 | struct vm_area_struct *vma; | 2922 | struct vm_area_struct *vma; |
| 2816 | struct page *page; | ||
| 2817 | void *old_buf = buf; | 2923 | void *old_buf = buf; |
| 2818 | 2924 | ||
| 2819 | mm = get_task_mm(tsk); | 2925 | mm = get_task_mm(tsk); |
| @@ -2825,28 +2931,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
| 2825 | while (len) { | 2931 | while (len) { |
| 2826 | int bytes, ret, offset; | 2932 | int bytes, ret, offset; |
| 2827 | void *maddr; | 2933 | void *maddr; |
| 2934 | struct page *page = NULL; | ||
| 2828 | 2935 | ||
| 2829 | ret = get_user_pages(tsk, mm, addr, 1, | 2936 | ret = get_user_pages(tsk, mm, addr, 1, |
| 2830 | write, 1, &page, &vma); | 2937 | write, 1, &page, &vma); |
| 2831 | if (ret <= 0) | 2938 | if (ret <= 0) { |
| 2832 | break; | 2939 | /* |
| 2833 | 2940 | * Check if this is a VM_IO | VM_PFNMAP VMA, which | |
| 2834 | bytes = len; | 2941 | * we can access using slightly different code. |
| 2835 | offset = addr & (PAGE_SIZE-1); | 2942 | */ |
| 2836 | if (bytes > PAGE_SIZE-offset) | 2943 | #ifdef CONFIG_HAVE_IOREMAP_PROT |
| 2837 | bytes = PAGE_SIZE-offset; | 2944 | vma = find_vma(mm, addr); |
| 2838 | 2945 | if (!vma) | |
| 2839 | maddr = kmap(page); | 2946 | break; |
| 2840 | if (write) { | 2947 | if (vma->vm_ops && vma->vm_ops->access) |
| 2841 | copy_to_user_page(vma, page, addr, | 2948 | ret = vma->vm_ops->access(vma, addr, buf, |
| 2842 | maddr + offset, buf, bytes); | 2949 | len, write); |
| 2843 | set_page_dirty_lock(page); | 2950 | if (ret <= 0) |
| 2951 | #endif | ||
| 2952 | break; | ||
| 2953 | bytes = ret; | ||
| 2844 | } else { | 2954 | } else { |
| 2845 | copy_from_user_page(vma, page, addr, | 2955 | bytes = len; |
| 2846 | buf, maddr + offset, bytes); | 2956 | offset = addr & (PAGE_SIZE-1); |
| 2957 | if (bytes > PAGE_SIZE-offset) | ||
| 2958 | bytes = PAGE_SIZE-offset; | ||
| 2959 | |||
| 2960 | maddr = kmap(page); | ||
| 2961 | if (write) { | ||
| 2962 | copy_to_user_page(vma, page, addr, | ||
| 2963 | maddr + offset, buf, bytes); | ||
| 2964 | set_page_dirty_lock(page); | ||
| 2965 | } else { | ||
| 2966 | copy_from_user_page(vma, page, addr, | ||
| 2967 | buf, maddr + offset, bytes); | ||
| 2968 | } | ||
| 2969 | kunmap(page); | ||
| 2970 | page_cache_release(page); | ||
| 2847 | } | 2971 | } |
| 2848 | kunmap(page); | ||
| 2849 | page_cache_release(page); | ||
| 2850 | len -= bytes; | 2972 | len -= bytes; |
| 2851 | buf += bytes; | 2973 | buf += bytes; |
| 2852 | addr += bytes; | 2974 | addr += bytes; |
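The fallback through vma->vm_ops->access() lets access_process_vm(), and therefore ptrace and /proc/<pid>/mem, reach VM_IO/VM_PFNMAP areas that get_user_pages() cannot pin. Under CONFIG_HAVE_IOREMAP_PROT a driver that remaps device memory can simply point the hook at the generic_access_phys() helper added above; the mydev_* code below is a hypothetical example of the wiring, not something taken from this patch.

    static struct vm_operations_struct mydev_mem_ops = {
    #ifdef CONFIG_HAVE_IOREMAP_PROT
            .access = generic_access_phys,
    #endif
    };

    static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
    {
            vma->vm_ops = &mydev_mem_ops;
            /* remap_pfn_range() marks the vma VM_IO | VM_PFNMAP itself. */
            return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
                                   vma->vm_end - vma->vm_start,
                                   vma->vm_page_prot);
    }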
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 833f854eabe5..89fee2dcb039 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res) | |||
| 62 | 62 | ||
| 63 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE | 63 | #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE |
| 64 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 64 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
| 65 | static void get_page_bootmem(unsigned long info, struct page *page, int magic) | 65 | static void get_page_bootmem(unsigned long info, struct page *page, int type) |
| 66 | { | 66 | { |
| 67 | atomic_set(&page->_mapcount, magic); | 67 | atomic_set(&page->_mapcount, type); |
| 68 | SetPagePrivate(page); | 68 | SetPagePrivate(page); |
| 69 | set_page_private(page, info); | 69 | set_page_private(page, info); |
| 70 | atomic_inc(&page->_count); | 70 | atomic_inc(&page->_count); |
| @@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic) | |||
| 72 | 72 | ||
| 73 | void put_page_bootmem(struct page *page) | 73 | void put_page_bootmem(struct page *page) |
| 74 | { | 74 | { |
| 75 | int magic; | 75 | int type; |
| 76 | 76 | ||
| 77 | magic = atomic_read(&page->_mapcount); | 77 | type = atomic_read(&page->_mapcount); |
| 78 | BUG_ON(magic >= -1); | 78 | BUG_ON(type >= -1); |
| 79 | 79 | ||
| 80 | if (atomic_dec_return(&page->_count) == 1) { | 80 | if (atomic_dec_return(&page->_count) == 1) { |
| 81 | ClearPagePrivate(page); | 81 | ClearPagePrivate(page); |
| @@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page) | |||
| 86 | 86 | ||
| 87 | } | 87 | } |
| 88 | 88 | ||
| 89 | void register_page_bootmem_info_section(unsigned long start_pfn) | 89 | static void register_page_bootmem_info_section(unsigned long start_pfn) |
| 90 | { | 90 | { |
| 91 | unsigned long *usemap, mapsize, section_nr, i; | 91 | unsigned long *usemap, mapsize, section_nr, i; |
| 92 | struct mem_section *ms; | 92 | struct mem_section *ms; |
| @@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn) | |||
| 119 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; | 119 | mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; |
| 120 | 120 | ||
| 121 | for (i = 0; i < mapsize; i++, page++) | 121 | for (i = 0; i < mapsize; i++, page++) |
| 122 | get_page_bootmem(section_nr, page, MIX_INFO); | 122 | get_page_bootmem(section_nr, page, MIX_SECTION_INFO); |
| 123 | 123 | ||
| 124 | } | 124 | } |
| 125 | 125 | ||
| @@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 429 | 429 | ||
| 430 | if (need_zonelists_rebuild) | 430 | if (need_zonelists_rebuild) |
| 431 | build_all_zonelists(); | 431 | build_all_zonelists(); |
| 432 | vm_total_pages = nr_free_pagecache_pages(); | 432 | else |
| 433 | vm_total_pages = nr_free_pagecache_pages(); | ||
| 434 | |||
| 433 | writeback_set_ratelimit(); | 435 | writeback_set_ratelimit(); |
| 434 | 436 | ||
| 435 | if (onlined_pages) | 437 | if (onlined_pages) |
| @@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | |||
| 455 | /* we can use NODE_DATA(nid) from here */ | 457 | /* we can use NODE_DATA(nid) from here */ |
| 456 | 458 | ||
| 457 | /* init node's zones as empty zones, we don't have any present pages.*/ | 459 | /* init node's zones as empty zones, we don't have any present pages.*/ |
| 458 | free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); | 460 | free_area_init_node(nid, zones_size, start_pfn, zholes_size); |
| 459 | 461 | ||
| 460 | return pgdat; | 462 | return pgdat; |
| 461 | } | 463 | } |
| @@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory); | |||
| 521 | 523 | ||
| 522 | #ifdef CONFIG_MEMORY_HOTREMOVE | 524 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 523 | /* | 525 | /* |
| 526 | * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy | ||
| 527 | * set and the size of the free page is given by page_order(). Using this, | ||
| 528 | * the function determines if the pageblock contains only free pages. | ||
| 529 | * Due to buddy constraints, a free page at least the size of a pageblock will | ||
| 530 | * be located at the start of the pageblock. | ||
| 531 | */ | ||
| 532 | static inline int pageblock_free(struct page *page) | ||
| 533 | { | ||
| 534 | return PageBuddy(page) && page_order(page) >= pageblock_order; | ||
| 535 | } | ||
| 536 | |||
| 537 | /* Return the start of the next active pageblock after a given page */ | ||
| 538 | static struct page *next_active_pageblock(struct page *page) | ||
| 539 | { | ||
| 540 | int pageblocks_stride; | ||
| 541 | |||
| 542 | /* Ensure the starting page is pageblock-aligned */ | ||
| 543 | BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); | ||
| 544 | |||
| 545 | /* Move forward by at least 1 * pageblock_nr_pages */ | ||
| 546 | pageblocks_stride = 1; | ||
| 547 | |||
| 548 | /* If the entire pageblock is free, move to the end of free page */ | ||
| 549 | if (pageblock_free(page)) | ||
| 550 | pageblocks_stride += page_order(page) - pageblock_order; | ||
| 551 | |||
| 552 | return page + (pageblocks_stride * pageblock_nr_pages); | ||
| 553 | } | ||
| 554 | |||
| 555 | /* Checks if this range of memory is likely to be hot-removable. */ | ||
| 556 | int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) | ||
| 557 | { | ||
| 558 | int type; | ||
| 559 | struct page *page = pfn_to_page(start_pfn); | ||
| 560 | struct page *end_page = page + nr_pages; | ||
| 561 | |||
| 562 | /* Check the starting page of each pageblock within the range */ | ||
| 563 | for (; page < end_page; page = next_active_pageblock(page)) { | ||
| 564 | type = get_pageblock_migratetype(page); | ||
| 565 | |||
| 566 | /* | ||
| 567 | * A pageblock containing MOVABLE or free pages is considered | ||
| 568 | * removable | ||
| 569 | */ | ||
| 570 | if (type != MIGRATE_MOVABLE && !pageblock_free(page)) | ||
| 571 | return 0; | ||
| 572 | |||
| 573 | /* | ||
| 574 | * A pageblock starting with a PageReserved page is not | ||
| 575 | * considered removable. | ||
| 576 | */ | ||
| 577 | if (PageReserved(page)) | ||
| 578 | return 0; | ||
| 579 | } | ||
| 580 | |||
| 581 | /* All pageblocks in the memory block are likely to be hot-removable */ | ||
| 582 | return 1; | ||
| 583 | } | ||
| 584 | |||
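is_mem_section_removable() is only a heuristic ("likely to be hot-removable"); pages can still become pinned between the check and an actual offline attempt. A hedged sketch of how a hotplug interface might consult it for one memory section; the wrapper below is illustrative, not code from this patch.

    /* Sketch: report whether one sparsemem section looks removable. */
    static int section_looks_removable(unsigned long section_nr)
    {
            unsigned long start_pfn = section_nr_to_pfn(section_nr);

            if (!present_section_nr(section_nr))
                    return 0;
            return is_mem_section_removable(start_pfn, PAGES_PER_SECTION);
    }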
| 585 | /* | ||
| 524 | * Confirm that all pages in a range [start, end) belong to the same zone. | 586 | * Confirm that all pages in a range [start, end) belong to the same zone. |
| 525 | */ | 587 | */ |
| 526 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | 588 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c94e58b192c3..83369058ec13 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
| 803 | int do_migrate_pages(struct mm_struct *mm, | 803 | int do_migrate_pages(struct mm_struct *mm, |
| 804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 804 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) |
| 805 | { | 805 | { |
| 806 | LIST_HEAD(pagelist); | ||
| 807 | int busy = 0; | 806 | int busy = 0; |
| 808 | int err = 0; | 807 | int err = 0; |
| 809 | nodemask_t tmp; | 808 | nodemask_t tmp; |
| @@ -1481,7 +1480,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
| 1481 | 1480 | ||
| 1482 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { | 1481 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
| 1483 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, | 1482 | zl = node_zonelist(interleave_nid(*mpol, vma, addr, |
| 1484 | HPAGE_SHIFT), gfp_flags); | 1483 | huge_page_shift(hstate_vma(vma))), gfp_flags); |
| 1485 | } else { | 1484 | } else { |
| 1486 | zl = policy_zonelist(gfp_flags, *mpol); | 1485 | zl = policy_zonelist(gfp_flags, *mpol); |
| 1487 | if ((*mpol)->mode == MPOL_BIND) | 1486 | if ((*mpol)->mode == MPOL_BIND) |
| @@ -2220,9 +2219,12 @@ static void check_huge_range(struct vm_area_struct *vma, | |||
| 2220 | { | 2219 | { |
| 2221 | unsigned long addr; | 2220 | unsigned long addr; |
| 2222 | struct page *page; | 2221 | struct page *page; |
| 2222 | struct hstate *h = hstate_vma(vma); | ||
| 2223 | unsigned long sz = huge_page_size(h); | ||
| 2223 | 2224 | ||
| 2224 | for (addr = start; addr < end; addr += HPAGE_SIZE) { | 2225 | for (addr = start; addr < end; addr += sz) { |
| 2225 | pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); | 2226 | pte_t *ptep = huge_pte_offset(vma->vm_mm, |
| 2227 | addr & huge_page_mask(h)); | ||
| 2226 | pte_t pte; | 2228 | pte_t pte; |
| 2227 | 2229 | ||
| 2228 | if (!ptep) | 2230 | if (!ptep) |
diff --git a/mm/migrate.c b/mm/migrate.c index 55bd355d170d..2a80136b23bb 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/vmalloc.h> | 30 | #include <linux/vmalloc.h> |
| 31 | #include <linux/security.h> | 31 | #include <linux/security.h> |
| 32 | #include <linux/memcontrol.h> | 32 | #include <linux/memcontrol.h> |
| 33 | #include <linux/syscalls.h> | ||
| 33 | 34 | ||
| 34 | #include "internal.h" | 35 | #include "internal.h" |
| 35 | 36 | ||
| @@ -284,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | |||
| 284 | 285 | ||
| 285 | page = migration_entry_to_page(entry); | 286 | page = migration_entry_to_page(entry); |
| 286 | 287 | ||
| 287 | get_page(page); | 288 | /* |
| 289 | * Once radix-tree replacement for page migration has started, the | ||
| 290 | * page_count *must* be zero. And we don't want to call | ||
| 291 | * wait_on_page_locked() against a page without having taken a | ||
| 292 | * reference, so we use get_page_unless_zero() here. Even if it | ||
| 293 | * fails, the page fault will simply occur again. | ||
| 294 | */ | ||
| 295 | if (!get_page_unless_zero(page)) | ||
| 296 | goto out; | ||
| 288 | pte_unmap_unlock(ptep, ptl); | 297 | pte_unmap_unlock(ptep, ptl); |
| 289 | wait_on_page_locked(page); | 298 | wait_on_page_locked(page); |
| 290 | put_page(page); | 299 | put_page(page); |
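get_page_unless_zero() is the speculative-reference primitive this depends on: it only takes a reference if the page count has not already dropped to zero, so a page frozen by the migration path is skipped and the fault simply retries. Its shape is roughly the following (see include/linux/mm.h for the real definition):

    /* Approximate shape of the helper; the canonical version is in linux/mm.h. */
    static inline int get_page_unless_zero(struct page *page)
    {
            return atomic_inc_not_zero(&page->_count);
    }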
| @@ -304,6 +313,7 @@ out: | |||
| 304 | static int migrate_page_move_mapping(struct address_space *mapping, | 313 | static int migrate_page_move_mapping(struct address_space *mapping, |
| 305 | struct page *newpage, struct page *page) | 314 | struct page *newpage, struct page *page) |
| 306 | { | 315 | { |
| 316 | int expected_count; | ||
| 307 | void **pslot; | 317 | void **pslot; |
| 308 | 318 | ||
| 309 | if (!mapping) { | 319 | if (!mapping) { |
| @@ -313,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 313 | return 0; | 323 | return 0; |
| 314 | } | 324 | } |
| 315 | 325 | ||
| 316 | write_lock_irq(&mapping->tree_lock); | 326 | spin_lock_irq(&mapping->tree_lock); |
| 317 | 327 | ||
| 318 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | 328 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
| 319 | page_index(page)); | 329 | page_index(page)); |
| 320 | 330 | ||
| 321 | if (page_count(page) != 2 + !!PagePrivate(page) || | 331 | expected_count = 2 + !!PagePrivate(page); |
| 332 | if (page_count(page) != expected_count || | ||
| 322 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 333 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
| 323 | write_unlock_irq(&mapping->tree_lock); | 334 | spin_unlock_irq(&mapping->tree_lock); |
| 335 | return -EAGAIN; | ||
| 336 | } | ||
| 337 | |||
| 338 | if (!page_freeze_refs(page, expected_count)) { | ||
| 339 | spin_unlock_irq(&mapping->tree_lock); | ||
| 324 | return -EAGAIN; | 340 | return -EAGAIN; |
| 325 | } | 341 | } |
| 326 | 342 | ||
| @@ -337,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 337 | 353 | ||
| 338 | radix_tree_replace_slot(pslot, newpage); | 354 | radix_tree_replace_slot(pslot, newpage); |
| 339 | 355 | ||
| 356 | page_unfreeze_refs(page, expected_count); | ||
| 340 | /* | 357 | /* |
| 341 | * Drop cache reference from old page. | 358 | * Drop cache reference from old page. |
| 342 | * We know this isn't the last reference. | 359 | * We know this isn't the last reference. |
| @@ -356,7 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 356 | __dec_zone_page_state(page, NR_FILE_PAGES); | 373 | __dec_zone_page_state(page, NR_FILE_PAGES); |
| 357 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 374 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
| 358 | 375 | ||
| 359 | write_unlock_irq(&mapping->tree_lock); | 376 | spin_unlock_irq(&mapping->tree_lock); |
| 377 | if (!PageSwapCache(newpage)) | ||
| 378 | mem_cgroup_uncharge_cache_page(page); | ||
| 360 | 379 | ||
| 361 | return 0; | 380 | return 0; |
| 362 | } | 381 | } |
| @@ -586,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
| 586 | * establishing additional references. We are the only one | 605 | * establishing additional references. We are the only one |
| 587 | * holding a reference to the new page at this point. | 606 | * holding a reference to the new page at this point. |
| 588 | */ | 607 | */ |
| 589 | if (TestSetPageLocked(newpage)) | 608 | if (!trylock_page(newpage)) |
| 590 | BUG(); | 609 | BUG(); |
| 591 | 610 | ||
| 592 | /* Prepare mapping for the new page.*/ | 611 | /* Prepare mapping for the new page.*/ |
| @@ -610,7 +629,6 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
| 610 | rc = fallback_migrate_page(mapping, newpage, page); | 629 | rc = fallback_migrate_page(mapping, newpage, page); |
| 611 | 630 | ||
| 612 | if (!rc) { | 631 | if (!rc) { |
| 613 | mem_cgroup_page_migration(page, newpage); | ||
| 614 | remove_migration_ptes(page, newpage); | 632 | remove_migration_ptes(page, newpage); |
| 615 | } else | 633 | } else |
| 616 | newpage->mapping = NULL; | 634 | newpage->mapping = NULL; |
| @@ -640,8 +658,16 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 640 | /* page was freed from under us. So we are done. */ | 658 | /* page was freed from under us. So we are done. */ |
| 641 | goto move_newpage; | 659 | goto move_newpage; |
| 642 | 660 | ||
| 661 | charge = mem_cgroup_prepare_migration(page, newpage); | ||
| 662 | if (charge == -ENOMEM) { | ||
| 663 | rc = -ENOMEM; | ||
| 664 | goto move_newpage; | ||
| 665 | } | ||
| 666 | /* prepare cgroup just returns 0 or -ENOMEM */ | ||
| 667 | BUG_ON(charge); | ||
| 668 | |||
| 643 | rc = -EAGAIN; | 669 | rc = -EAGAIN; |
| 644 | if (TestSetPageLocked(page)) { | 670 | if (!trylock_page(page)) { |
| 645 | if (!force) | 671 | if (!force) |
| 646 | goto move_newpage; | 672 | goto move_newpage; |
| 647 | lock_page(page); | 673 | lock_page(page); |
| @@ -691,19 +717,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 691 | goto rcu_unlock; | 717 | goto rcu_unlock; |
| 692 | } | 718 | } |
| 693 | 719 | ||
| 694 | charge = mem_cgroup_prepare_migration(page); | ||
| 695 | /* Establish migration ptes or remove ptes */ | 720 | /* Establish migration ptes or remove ptes */ |
| 696 | try_to_unmap(page, 1); | 721 | try_to_unmap(page, 1); |
| 697 | 722 | ||
| 698 | if (!page_mapped(page)) | 723 | if (!page_mapped(page)) |
| 699 | rc = move_to_new_page(newpage, page); | 724 | rc = move_to_new_page(newpage, page); |
| 700 | 725 | ||
| 701 | if (rc) { | 726 | if (rc) |
| 702 | remove_migration_ptes(page, page); | 727 | remove_migration_ptes(page, page); |
| 703 | if (charge) | ||
| 704 | mem_cgroup_end_migration(page); | ||
| 705 | } else if (charge) | ||
| 706 | mem_cgroup_end_migration(newpage); | ||
| 707 | rcu_unlock: | 728 | rcu_unlock: |
| 708 | if (rcu_locked) | 729 | if (rcu_locked) |
| 709 | rcu_read_unlock(); | 730 | rcu_read_unlock(); |
| @@ -724,6 +745,8 @@ unlock: | |||
| 724 | } | 745 | } |
| 725 | 746 | ||
| 726 | move_newpage: | 747 | move_newpage: |
| 748 | if (!charge) | ||
| 749 | mem_cgroup_end_migration(newpage); | ||
| 727 | /* | 750 | /* |
| 728 | * Move the new page to the LRU. If migration was not successful | 751 | * Move the new page to the LRU. If migration was not successful |
| 729 | * then this will free the page. | 752 | * then this will free the page. |
| @@ -1070,7 +1093,6 @@ out2: | |||
| 1070 | mmput(mm); | 1093 | mmput(mm); |
| 1071 | return err; | 1094 | return err; |
| 1072 | } | 1095 | } |
| 1073 | #endif | ||
| 1074 | 1096 | ||
| 1075 | /* | 1097 | /* |
| 1076 | * Call migration functions in the vma_ops that may prepare | 1098 | * Call migration functions in the vma_ops that may prepare |
| @@ -1092,3 +1114,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, | |||
| 1092 | } | 1114 | } |
| 1093 | return err; | 1115 | return err; |
| 1094 | } | 1116 | } |
| 1117 | #endif | ||
diff --git a/mm/mlock.c b/mm/mlock.c index 7b2656055d6a..01fbe93eff5c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -78,8 +78,6 @@ success: | |||
| 78 | 78 | ||
| 79 | mm->locked_vm -= pages; | 79 | mm->locked_vm -= pages; |
| 80 | out: | 80 | out: |
| 81 | if (ret == -ENOMEM) | ||
| 82 | ret = -EAGAIN; | ||
| 83 | return ret; | 81 | return ret; |
| 84 | } | 82 | } |
| 85 | 83 | ||
diff --git a/mm/mm_init.c b/mm/mm_init.c new file mode 100644 index 000000000000..936ef2efd892 --- /dev/null +++ b/mm/mm_init.c | |||
| @@ -0,0 +1,152 @@ | |||
| 1 | /* | ||
| 2 | * mm_init.c - Memory initialisation verification and debugging | ||
| 3 | * | ||
| 4 | * Copyright 2008 IBM Corporation, 2008 | ||
| 5 | * Author Mel Gorman <mel@csn.ul.ie> | ||
| 6 | * | ||
| 7 | */ | ||
| 8 | #include <linux/kernel.h> | ||
| 9 | #include <linux/init.h> | ||
| 10 | #include <linux/kobject.h> | ||
| 11 | #include <linux/module.h> | ||
| 12 | #include "internal.h" | ||
| 13 | |||
| 14 | #ifdef CONFIG_DEBUG_MEMORY_INIT | ||
| 15 | int __meminitdata mminit_loglevel; | ||
| 16 | |||
| 17 | #ifndef SECTIONS_SHIFT | ||
| 18 | #define SECTIONS_SHIFT 0 | ||
| 19 | #endif | ||
| 20 | |||
| 21 | /* The zonelists are simply reported; validation is manual. */ | ||
| 22 | void mminit_verify_zonelist(void) | ||
| 23 | { | ||
| 24 | int nid; | ||
| 25 | |||
| 26 | if (mminit_loglevel < MMINIT_VERIFY) | ||
| 27 | return; | ||
| 28 | |||
| 29 | for_each_online_node(nid) { | ||
| 30 | pg_data_t *pgdat = NODE_DATA(nid); | ||
| 31 | struct zone *zone; | ||
| 32 | struct zoneref *z; | ||
| 33 | struct zonelist *zonelist; | ||
| 34 | int i, listid, zoneid; | ||
| 35 | |||
| 36 | BUG_ON(MAX_ZONELISTS > 2); | ||
| 37 | for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { | ||
| 38 | |||
| 39 | /* Identify the zone and nodelist */ | ||
| 40 | zoneid = i % MAX_NR_ZONES; | ||
| 41 | listid = i / MAX_NR_ZONES; | ||
| 42 | zonelist = &pgdat->node_zonelists[listid]; | ||
| 43 | zone = &pgdat->node_zones[zoneid]; | ||
| 44 | if (!populated_zone(zone)) | ||
| 45 | continue; | ||
| 46 | |||
| 47 | /* Print information about the zonelist */ | ||
| 48 | printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ", | ||
| 49 | listid > 0 ? "thisnode" : "general", nid, | ||
| 50 | zone->name); | ||
| 51 | |||
| 52 | /* Iterate the zonelist */ | ||
| 53 | for_each_zone_zonelist(zone, z, zonelist, zoneid) { | ||
| 54 | #ifdef CONFIG_NUMA | ||
| 55 | printk(KERN_CONT "%d:%s ", | ||
| 56 | zone->node, zone->name); | ||
| 57 | #else | ||
| 58 | printk(KERN_CONT "0:%s ", zone->name); | ||
| 59 | #endif /* CONFIG_NUMA */ | ||
| 60 | } | ||
| 61 | printk(KERN_CONT "\n"); | ||
| 62 | } | ||
| 63 | } | ||
| 64 | } | ||
| 65 | |||
| 66 | void __init mminit_verify_pageflags_layout(void) | ||
| 67 | { | ||
| 68 | int shift, width; | ||
| 69 | unsigned long or_mask, add_mask; | ||
| 70 | |||
| 71 | shift = 8 * sizeof(unsigned long); | ||
| 72 | width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH; | ||
| 73 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", | ||
| 74 | "Section %d Node %d Zone %d Flags %d\n", | ||
| 75 | SECTIONS_WIDTH, | ||
| 76 | NODES_WIDTH, | ||
| 77 | ZONES_WIDTH, | ||
| 78 | NR_PAGEFLAGS); | ||
| 79 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", | ||
| 80 | "Section %d Node %d Zone %d\n", | ||
| 81 | SECTIONS_SHIFT, | ||
| 82 | NODES_SHIFT, | ||
| 83 | ZONES_SHIFT); | ||
| 84 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets", | ||
| 85 | "Section %lu Node %lu Zone %lu\n", | ||
| 86 | (unsigned long)SECTIONS_PGSHIFT, | ||
| 87 | (unsigned long)NODES_PGSHIFT, | ||
| 88 | (unsigned long)ZONES_PGSHIFT); | ||
| 89 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid", | ||
| 90 | "Zone ID: %lu -> %lu\n", | ||
| 91 | (unsigned long)ZONEID_PGOFF, | ||
| 92 | (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT)); | ||
| 93 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage", | ||
| 94 | "location: %d -> %d unused %d -> %d flags %d -> %d\n", | ||
| 95 | shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0); | ||
| 96 | #ifdef NODE_NOT_IN_PAGE_FLAGS | ||
| 97 | mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", | ||
| 98 | "Node not in page flags"); | ||
| 99 | #endif | ||
| 100 | |||
| 101 | if (SECTIONS_WIDTH) { | ||
| 102 | shift -= SECTIONS_WIDTH; | ||
| 103 | BUG_ON(shift != SECTIONS_PGSHIFT); | ||
| 104 | } | ||
| 105 | if (NODES_WIDTH) { | ||
| 106 | shift -= NODES_WIDTH; | ||
| 107 | BUG_ON(shift != NODES_PGSHIFT); | ||
| 108 | } | ||
| 109 | if (ZONES_WIDTH) { | ||
| 110 | shift -= ZONES_WIDTH; | ||
| 111 | BUG_ON(shift != ZONES_PGSHIFT); | ||
| 112 | } | ||
| 113 | |||
| 114 | /* Check for bitmask overlaps */ | ||
| 115 | or_mask = (ZONES_MASK << ZONES_PGSHIFT) | | ||
| 116 | (NODES_MASK << NODES_PGSHIFT) | | ||
| 117 | (SECTIONS_MASK << SECTIONS_PGSHIFT); | ||
| 118 | add_mask = (ZONES_MASK << ZONES_PGSHIFT) + | ||
| 119 | (NODES_MASK << NODES_PGSHIFT) + | ||
| 120 | (SECTIONS_MASK << SECTIONS_PGSHIFT); | ||
| 121 | BUG_ON(or_mask != add_mask); | ||
| 122 | } | ||
| 123 | |||
| 124 | void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone, | ||
| 125 | unsigned long nid, unsigned long pfn) | ||
| 126 | { | ||
| 127 | BUG_ON(page_to_nid(page) != nid); | ||
| 128 | BUG_ON(page_zonenum(page) != zone); | ||
| 129 | BUG_ON(page_to_pfn(page) != pfn); | ||
| 130 | } | ||
| 131 | |||
| 132 | static __init int set_mminit_loglevel(char *str) | ||
| 133 | { | ||
| 134 | get_option(&str, &mminit_loglevel); | ||
| 135 | return 0; | ||
| 136 | } | ||
| 137 | early_param("mminit_loglevel", set_mminit_loglevel); | ||
| 138 | #endif /* CONFIG_DEBUG_MEMORY_INIT */ | ||
| 139 | |||
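mminit_loglevel gates all of the verification output above; mminit_dprintk() (declared in mm/internal.h by this series) only prints when its level argument is below the configured loglevel. A small sketch of a gated caller, with the function name and message text invented for illustration:

    /* Sketch: emit a trace-level message that mminit_loglevel can suppress. */
    static void __meminit report_node_range(int nid, unsigned long start_pfn,
                                            unsigned long end_pfn)
    {
            mminit_dprintk(MMINIT_TRACE, "memmap_init",
                            "node %d pfns %lu -> %lu\n",
                            nid, start_pfn, end_pfn);
    }

The level is raised from the kernel command line, e.g. booting with mminit_loglevel=4 for the most verbose output described for this option.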
| 140 | struct kobject *mm_kobj; | ||
| 141 | EXPORT_SYMBOL_GPL(mm_kobj); | ||
| 142 | |||
| 143 | static int __init mm_sysfs_init(void) | ||
| 144 | { | ||
| 145 | mm_kobj = kobject_create_and_add("mm", kernel_kobj); | ||
| 146 | if (!mm_kobj) | ||
| 147 | return -ENOMEM; | ||
| 148 | |||
| 149 | return 0; | ||
| 150 | } | ||
| 151 | |||
| 152 | __initcall(mm_sysfs_init); | ||
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -26,12 +26,15 @@ | |||
| 26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
| 27 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
| 28 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
| 29 | #include <linux/mmu_notifier.h> | ||
| 29 | 30 | ||
| 30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
| 31 | #include <asm/cacheflush.h> | 32 | #include <asm/cacheflush.h> |
| 32 | #include <asm/tlb.h> | 33 | #include <asm/tlb.h> |
| 33 | #include <asm/mmu_context.h> | 34 | #include <asm/mmu_context.h> |
| 34 | 35 | ||
| 36 | #include "internal.h" | ||
| 37 | |||
| 35 | #ifndef arch_mmap_check | 38 | #ifndef arch_mmap_check |
| 36 | #define arch_mmap_check(addr, len, flags) (0) | 39 | #define arch_mmap_check(addr, len, flags) (0) |
| 37 | #endif | 40 | #endif |
| @@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr, | |||
| 367 | if (vma_tmp->vm_end > addr) { | 370 | if (vma_tmp->vm_end > addr) { |
| 368 | vma = vma_tmp; | 371 | vma = vma_tmp; |
| 369 | if (vma_tmp->vm_start <= addr) | 372 | if (vma_tmp->vm_start <= addr) |
| 370 | return vma; | 373 | break; |
| 371 | __rb_link = &__rb_parent->rb_left; | 374 | __rb_link = &__rb_parent->rb_left; |
| 372 | } else { | 375 | } else { |
| 373 | rb_prev = __rb_parent; | 376 | rb_prev = __rb_parent; |
| @@ -1108,6 +1111,9 @@ munmap_back: | |||
| 1108 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) | 1111 | if (!may_expand_vm(mm, len >> PAGE_SHIFT)) |
| 1109 | return -ENOMEM; | 1112 | return -ENOMEM; |
| 1110 | 1113 | ||
| 1114 | if (flags & MAP_NORESERVE) | ||
| 1115 | vm_flags |= VM_NORESERVE; | ||
| 1116 | |||
| 1111 | if (accountable && (!(flags & MAP_NORESERVE) || | 1117 | if (accountable && (!(flags & MAP_NORESERVE) || |
| 1112 | sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { | 1118 | sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { |
| 1113 | if (vm_flags & VM_SHARED) { | 1119 | if (vm_flags & VM_SHARED) { |
| @@ -1763,7 +1769,7 @@ static void unmap_region(struct mm_struct *mm, | |||
| 1763 | update_hiwater_rss(mm); | 1769 | update_hiwater_rss(mm); |
| 1764 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); | 1770 | unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); |
| 1765 | vm_unacct_memory(nr_accounted); | 1771 | vm_unacct_memory(nr_accounted); |
| 1766 | free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, | 1772 | free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, |
| 1767 | next? next->vm_start: 0); | 1773 | next? next->vm_start: 0); |
| 1768 | tlb_finish_mmu(tlb, start, end); | 1774 | tlb_finish_mmu(tlb, start, end); |
| 1769 | } | 1775 | } |
| @@ -1807,7 +1813,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1807 | struct mempolicy *pol; | 1813 | struct mempolicy *pol; |
| 1808 | struct vm_area_struct *new; | 1814 | struct vm_area_struct *new; |
| 1809 | 1815 | ||
| 1810 | if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) | 1816 | if (is_vm_hugetlb_page(vma) && (addr & |
| 1817 | ~(huge_page_mask(hstate_vma(vma))))) | ||
| 1811 | return -EINVAL; | 1818 | return -EINVAL; |
| 1812 | 1819 | ||
| 1813 | if (mm->map_count >= sysctl_max_map_count) | 1820 | if (mm->map_count >= sysctl_max_map_count) |
| @@ -2055,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm) | |||
| 2055 | 2062 | ||
| 2056 | /* mm's last user has gone, and it's about to be pulled down */ | 2063 | /* mm's last user has gone, and it's about to be pulled down */ |
| 2057 | arch_exit_mmap(mm); | 2064 | arch_exit_mmap(mm); |
| 2065 | mmu_notifier_release(mm); | ||
| 2058 | 2066 | ||
| 2059 | lru_add_drain(); | 2067 | lru_add_drain(); |
| 2060 | flush_cache_mm(mm); | 2068 | flush_cache_mm(mm); |
| @@ -2063,7 +2071,7 @@ void exit_mmap(struct mm_struct *mm) | |||
| 2063 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2071 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
| 2064 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2072 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
| 2065 | vm_unacct_memory(nr_accounted); | 2073 | vm_unacct_memory(nr_accounted); |
| 2066 | free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); | 2074 | free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); |
| 2067 | tlb_finish_mmu(tlb, 0, end); | 2075 | tlb_finish_mmu(tlb, 0, end); |
| 2068 | 2076 | ||
| 2069 | /* | 2077 | /* |
| @@ -2262,3 +2270,167 @@ int install_special_mapping(struct mm_struct *mm, | |||
| 2262 | 2270 | ||
| 2263 | return 0; | 2271 | return 0; |
| 2264 | } | 2272 | } |
| 2273 | |||
| 2274 | static DEFINE_MUTEX(mm_all_locks_mutex); | ||
| 2275 | |||
| 2276 | static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) | ||
| 2277 | { | ||
| 2278 | if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { | ||
| 2279 | /* | ||
| 2280 | * The LSB of head.next can't change from under us | ||
| 2281 | * because we hold the mm_all_locks_mutex. | ||
| 2282 | */ | ||
| 2283 | spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); | ||
| 2284 | /* | ||
| 2285 | * We can safely modify head.next after taking the | ||
| 2286 | * anon_vma->lock. If some other vma in this mm shares | ||
| 2287 | * the same anon_vma we won't take it again. | ||
| 2288 | * | ||
| 2289 | * No need of atomic instructions here, head.next | ||
| 2290 | * can't change from under us thanks to the | ||
| 2291 | * anon_vma->lock. | ||
| 2292 | */ | ||
| 2293 | if (__test_and_set_bit(0, (unsigned long *) | ||
| 2294 | &anon_vma->head.next)) | ||
| 2295 | BUG(); | ||
| 2296 | } | ||
| 2297 | } | ||
| 2298 | |||
| 2299 | static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | ||
| 2300 | { | ||
| 2301 | if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | ||
| 2302 | /* | ||
| 2303 | * AS_MM_ALL_LOCKS can't change from under us because | ||
| 2304 | * we hold the mm_all_locks_mutex. | ||
| 2305 | * | ||
| 2306 | * Operations on ->flags have to be atomic because | ||
| 2307 | * even if AS_MM_ALL_LOCKS is stable thanks to the | ||
| 2308 | * mm_all_locks_mutex, there may be other cpus | ||
| 2309 | * changing other bitflags in parallel to us. | ||
| 2310 | */ | ||
| 2311 | if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) | ||
| 2312 | BUG(); | ||
| 2313 | spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); | ||
| 2314 | } | ||
| 2315 | } | ||
| 2316 | |||
| 2317 | /* | ||
| 2318 | * This operation locks against the VM for all pte/vma/mm related | ||
| 2319 | * operations that could ever happen on a certain mm. This includes | ||
| 2320 | * vmtruncate, try_to_unmap, and all page faults. | ||
| 2321 | * | ||
| 2322 | * The caller must take the mmap_sem in write mode before calling | ||
| 2323 | * mm_take_all_locks(). The caller isn't allowed to release the | ||
| 2324 | * mmap_sem until mm_drop_all_locks() returns. | ||
| 2325 | * | ||
| 2326 | * mmap_sem in write mode is required in order to block all operations | ||
| 2327 | * that could modify pagetables and free pages without needing to | ||
| 2328 | * alter the vma layout (for example populate_range() with | ||
| 2329 | * nonlinear vmas). It's also needed in write mode to prevent new | ||
| 2330 | * anon_vmas from being associated with existing vmas. | ||
| 2331 | * | ||
| 2332 | * A single task can't take more than one mm_take_all_locks() in a row | ||
| 2333 | * or it would deadlock. | ||
| 2334 | * | ||
| 2335 | * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in | ||
| 2336 | * mapping->flags avoid taking the same lock twice, if more than one | ||
| 2337 | * vma in this mm is backed by the same anon_vma or address_space. | ||
| 2338 | * | ||
| 2339 | * We can take all the locks in random order because the VM code | ||
| 2340 | * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never | ||
| 2341 | * takes more than one of them in a row. Secondly we're protected | ||
| 2342 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | ||
| 2343 | * | ||
| 2344 | * mm_take_all_locks() and mm_drop_all_locks() are expensive operations | ||
| 2345 | * that may have to take thousands of locks. | ||
| 2346 | * | ||
| 2347 | * mm_take_all_locks() can fail if it's interrupted by signals. | ||
| 2348 | */ | ||
| 2349 | int mm_take_all_locks(struct mm_struct *mm) | ||
| 2350 | { | ||
| 2351 | struct vm_area_struct *vma; | ||
| 2352 | int ret = -EINTR; | ||
| 2353 | |||
| 2354 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | ||
| 2355 | |||
| 2356 | mutex_lock(&mm_all_locks_mutex); | ||
| 2357 | |||
| 2358 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 2359 | if (signal_pending(current)) | ||
| 2360 | goto out_unlock; | ||
| 2361 | if (vma->vm_file && vma->vm_file->f_mapping) | ||
| 2362 | vm_lock_mapping(mm, vma->vm_file->f_mapping); | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 2366 | if (signal_pending(current)) | ||
| 2367 | goto out_unlock; | ||
| 2368 | if (vma->anon_vma) | ||
| 2369 | vm_lock_anon_vma(mm, vma->anon_vma); | ||
| 2370 | } | ||
| 2371 | |||
| 2372 | ret = 0; | ||
| 2373 | |||
| 2374 | out_unlock: | ||
| 2375 | if (ret) | ||
| 2376 | mm_drop_all_locks(mm); | ||
| 2377 | |||
| 2378 | return ret; | ||
| 2379 | } | ||
| 2380 | |||
| 2381 | static void vm_unlock_anon_vma(struct anon_vma *anon_vma) | ||
| 2382 | { | ||
| 2383 | if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { | ||
| 2384 | /* | ||
| 2385 | * The LSB of head.next can't change to 0 from under | ||
| 2386 | * us because we hold the mm_all_locks_mutex. | ||
| 2387 | * | ||
| 2388 | * We must however clear the bitflag before unlocking | ||
| 2389 | * the vma so the users using the anon_vma->head will | ||
| 2390 | * never see our bitflag. | ||
| 2391 | * | ||
| 2392 | * No need of atomic instructions here, head.next | ||
| 2393 | * can't change from under us until we release the | ||
| 2394 | * anon_vma->lock. | ||
| 2395 | */ | ||
| 2396 | if (!__test_and_clear_bit(0, (unsigned long *) | ||
| 2397 | &anon_vma->head.next)) | ||
| 2398 | BUG(); | ||
| 2399 | spin_unlock(&anon_vma->lock); | ||
| 2400 | } | ||
| 2401 | } | ||
| 2402 | |||
| 2403 | static void vm_unlock_mapping(struct address_space *mapping) | ||
| 2404 | { | ||
| 2405 | if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { | ||
| 2406 | /* | ||
| 2407 | * AS_MM_ALL_LOCKS can't change to 0 from under us | ||
| 2408 | * because we hold the mm_all_locks_mutex. | ||
| 2409 | */ | ||
| 2410 | spin_unlock(&mapping->i_mmap_lock); | ||
| 2411 | if (!test_and_clear_bit(AS_MM_ALL_LOCKS, | ||
| 2412 | &mapping->flags)) | ||
| 2413 | BUG(); | ||
| 2414 | } | ||
| 2415 | } | ||
| 2416 | |||
| 2417 | /* | ||
| 2418 | * The mmap_sem cannot be released by the caller until | ||
| 2419 | * mm_drop_all_locks() returns. | ||
| 2420 | */ | ||
| 2421 | void mm_drop_all_locks(struct mm_struct *mm) | ||
| 2422 | { | ||
| 2423 | struct vm_area_struct *vma; | ||
| 2424 | |||
| 2425 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | ||
| 2426 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | ||
| 2427 | |||
| 2428 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 2429 | if (vma->anon_vma) | ||
| 2430 | vm_unlock_anon_vma(vma->anon_vma); | ||
| 2431 | if (vma->vm_file && vma->vm_file->f_mapping) | ||
| 2432 | vm_unlock_mapping(vma->vm_file->f_mapping); | ||
| 2433 | } | ||
| 2434 | |||
| 2435 | mutex_unlock(&mm_all_locks_mutex); | ||
| 2436 | } | ||
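The comment block above states the ordering rules; the sketch below shows how a caller is expected to use the pair. It is a hedged illustration only, not part of this patch: example_update_all_vmas() is a hypothetical function, and mmap_sem is taken for writing because both helpers BUG_ON() when it is not held.

/* Hedged sketch; example_update_all_vmas() is hypothetical. */
static int example_update_all_vmas(struct mm_struct *mm)
{
	int ret;

	down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);	/* -EINTR if a signal is pending */
	if (ret)
		goto out;		/* the helper already dropped its locks */

	/* ... walk mm->mmap and update per-vma state here ... */

	mm_drop_all_locks(mm);		/* must run before mmap_sem is released */
out:
	up_write(&mm->mmap_sem);
	return ret;
}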
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c new file mode 100644 index 000000000000..5f4ef0250bee --- /dev/null +++ b/mm/mmu_notifier.c | |||
| @@ -0,0 +1,277 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/mmu_notifier.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008 Qumranet, Inc. | ||
| 5 | * Copyright (C) 2008 SGI | ||
| 6 | * Christoph Lameter <clameter@sgi.com> | ||
| 7 | * | ||
| 8 | * This work is licensed under the terms of the GNU GPL, version 2. See | ||
| 9 | * the COPYING file in the top-level directory. | ||
| 10 | */ | ||
| 11 | |||
| 12 | #include <linux/rculist.h> | ||
| 13 | #include <linux/mmu_notifier.h> | ||
| 14 | #include <linux/module.h> | ||
| 15 | #include <linux/mm.h> | ||
| 16 | #include <linux/err.h> | ||
| 17 | #include <linux/rcupdate.h> | ||
| 18 | #include <linux/sched.h> | ||
| 19 | |||
| 20 | /* | ||
| 21 | * This function can't run concurrently with mmu_notifier_register | ||
| 22 | * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap | ||
| 23 | * runs with mm_users == 0. Other tasks may still invoke mmu notifiers | ||
| 24 | * in parallel despite there being no task using this mm any more, | ||
| 25 | * through the vmas outside of the exit_mmap context, such as with | ||
| 26 | * vmtruncate. This serializes against mmu_notifier_unregister with | ||
| 27 | * the mmu_notifier_mm->lock in addition to RCU and it serializes | ||
| 28 | * against the other mmu notifiers with RCU. struct mmu_notifier_mm | ||
| 29 | * can't go away from under us as exit_mmap holds an mm_count pin | ||
| 30 | * itself. | ||
| 31 | */ | ||
| 32 | void __mmu_notifier_release(struct mm_struct *mm) | ||
| 33 | { | ||
| 34 | struct mmu_notifier *mn; | ||
| 35 | |||
| 36 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
| 37 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | ||
| 38 | mn = hlist_entry(mm->mmu_notifier_mm->list.first, | ||
| 39 | struct mmu_notifier, | ||
| 40 | hlist); | ||
| 41 | /* | ||
| 42 | * We arrived before mmu_notifier_unregister so | ||
| 43 | * mmu_notifier_unregister will do nothing other than | ||
| 44 | * wait for ->release to finish and | ||
| 45 | * then return. | ||
| 46 | */ | ||
| 47 | hlist_del_init_rcu(&mn->hlist); | ||
| 48 | /* | ||
| 49 | * RCU here will block mmu_notifier_unregister until | ||
| 50 | * ->release returns. | ||
| 51 | */ | ||
| 52 | rcu_read_lock(); | ||
| 53 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
| 54 | /* | ||
| 55 | * if ->release runs before mmu_notifier_unregister it | ||
| 56 | * must be handled as it's the only way for the driver | ||
| 57 | * to flush all existing sptes and stop the driver | ||
| 58 | * from establishing any more sptes before all the | ||
| 59 | * pages in the mm are freed. | ||
| 60 | */ | ||
| 61 | if (mn->ops->release) | ||
| 62 | mn->ops->release(mn, mm); | ||
| 63 | rcu_read_unlock(); | ||
| 64 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
| 65 | } | ||
| 66 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
| 67 | |||
| 68 | /* | ||
| 69 | * synchronize_rcu here prevents mmu_notifier_release from | ||
| 70 | * returning to exit_mmap (which would proceed to free all pages | ||
| 71 | * in the mm) until the ->release method returns, if it was | ||
| 72 | * invoked by mmu_notifier_unregister. | ||
| 73 | * | ||
| 74 | * The mmu_notifier_mm can't go away from under us because one | ||
| 75 | * mm_count is held by exit_mmap. | ||
| 76 | */ | ||
| 77 | synchronize_rcu(); | ||
| 78 | } | ||
| 79 | |||
| 80 | /* | ||
| 81 | * If no young bitflag is supported by the hardware, ->clear_flush_young can | ||
| 82 | * unmap the address and return 1 or 0 depending on whether the mapping | ||
| 83 | * previously existed or not. | ||
| 84 | */ | ||
| 85 | int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | ||
| 86 | unsigned long address) | ||
| 87 | { | ||
| 88 | struct mmu_notifier *mn; | ||
| 89 | struct hlist_node *n; | ||
| 90 | int young = 0; | ||
| 91 | |||
| 92 | rcu_read_lock(); | ||
| 93 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
| 94 | if (mn->ops->clear_flush_young) | ||
| 95 | young |= mn->ops->clear_flush_young(mn, mm, address); | ||
| 96 | } | ||
| 97 | rcu_read_unlock(); | ||
| 98 | |||
| 99 | return young; | ||
| 100 | } | ||
| 101 | |||
| 102 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | ||
| 103 | unsigned long address) | ||
| 104 | { | ||
| 105 | struct mmu_notifier *mn; | ||
| 106 | struct hlist_node *n; | ||
| 107 | |||
| 108 | rcu_read_lock(); | ||
| 109 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
| 110 | if (mn->ops->invalidate_page) | ||
| 111 | mn->ops->invalidate_page(mn, mm, address); | ||
| 112 | } | ||
| 113 | rcu_read_unlock(); | ||
| 114 | } | ||
| 115 | |||
| 116 | void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, | ||
| 117 | unsigned long start, unsigned long end) | ||
| 118 | { | ||
| 119 | struct mmu_notifier *mn; | ||
| 120 | struct hlist_node *n; | ||
| 121 | |||
| 122 | rcu_read_lock(); | ||
| 123 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
| 124 | if (mn->ops->invalidate_range_start) | ||
| 125 | mn->ops->invalidate_range_start(mn, mm, start, end); | ||
| 126 | } | ||
| 127 | rcu_read_unlock(); | ||
| 128 | } | ||
| 129 | |||
| 130 | void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, | ||
| 131 | unsigned long start, unsigned long end) | ||
| 132 | { | ||
| 133 | struct mmu_notifier *mn; | ||
| 134 | struct hlist_node *n; | ||
| 135 | |||
| 136 | rcu_read_lock(); | ||
| 137 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
| 138 | if (mn->ops->invalidate_range_end) | ||
| 139 | mn->ops->invalidate_range_end(mn, mm, start, end); | ||
| 140 | } | ||
| 141 | rcu_read_unlock(); | ||
| 142 | } | ||
| 143 | |||
| 144 | static int do_mmu_notifier_register(struct mmu_notifier *mn, | ||
| 145 | struct mm_struct *mm, | ||
| 146 | int take_mmap_sem) | ||
| 147 | { | ||
| 148 | struct mmu_notifier_mm *mmu_notifier_mm; | ||
| 149 | int ret; | ||
| 150 | |||
| 151 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
| 152 | |||
| 153 | ret = -ENOMEM; | ||
| 154 | mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); | ||
| 155 | if (unlikely(!mmu_notifier_mm)) | ||
| 156 | goto out; | ||
| 157 | |||
| 158 | if (take_mmap_sem) | ||
| 159 | down_write(&mm->mmap_sem); | ||
| 160 | ret = mm_take_all_locks(mm); | ||
| 161 | if (unlikely(ret)) | ||
| 162 | goto out_cleanup; | ||
| 163 | |||
| 164 | if (!mm_has_notifiers(mm)) { | ||
| 165 | INIT_HLIST_HEAD(&mmu_notifier_mm->list); | ||
| 166 | spin_lock_init(&mmu_notifier_mm->lock); | ||
| 167 | mm->mmu_notifier_mm = mmu_notifier_mm; | ||
| 168 | mmu_notifier_mm = NULL; | ||
| 169 | } | ||
| 170 | atomic_inc(&mm->mm_count); | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Serialize the update against mmu_notifier_unregister. A | ||
| 174 | * side note: mmu_notifier_release can't run concurrently with | ||
| 175 | * us because we hold the mm_users pin (either implicitly as | ||
| 176 | * current->mm or explicitly with get_task_mm() or similar). | ||
| 177 | * We can't race against any other mmu notifier method either | ||
| 178 | * thanks to mm_take_all_locks(). | ||
| 179 | */ | ||
| 180 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
| 181 | hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); | ||
| 182 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
| 183 | |||
| 184 | mm_drop_all_locks(mm); | ||
| 185 | out_cleanup: | ||
| 186 | if (take_mmap_sem) | ||
| 187 | up_write(&mm->mmap_sem); | ||
| 188 | /* kfree() does nothing if mmu_notifier_mm is NULL */ | ||
| 189 | kfree(mmu_notifier_mm); | ||
| 190 | out: | ||
| 191 | BUG_ON(atomic_read(&mm->mm_users) <= 0); | ||
| 192 | return ret; | ||
| 193 | } | ||
| 194 | |||
| 195 | /* | ||
| 196 | * Must not hold mmap_sem nor any other VM related lock when calling | ||
| 197 | * this registration function. Must also ensure mm_users can't go down | ||
| 198 | * to zero while this runs to avoid races with mmu_notifier_release, | ||
| 199 | * so mm has to be current->mm or the mm should be pinned safely such | ||
| 200 | * as with get_task_mm(). If the mm is not current->mm, the mm_users | ||
| 201 | * pin should be released by calling mmput after mmu_notifier_register | ||
| 202 | * returns. mmu_notifier_unregister must always be called to | ||
| 203 | * unregister the notifier. mm_count is automatically pinned to allow | ||
| 204 | * mmu_notifier_unregister to safely run at any time later, before or | ||
| 205 | * after exit_mmap. ->release will always be called before exit_mmap | ||
| 206 | * frees the pages. | ||
| 207 | */ | ||
| 208 | int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
| 209 | { | ||
| 210 | return do_mmu_notifier_register(mn, mm, 1); | ||
| 211 | } | ||
| 212 | EXPORT_SYMBOL_GPL(mmu_notifier_register); | ||
| 213 | |||
| 214 | /* | ||
| 215 | * Same as mmu_notifier_register but here the caller must hold the | ||
| 216 | * mmap_sem in write mode. | ||
| 217 | */ | ||
| 218 | int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) | ||
| 219 | { | ||
| 220 | return do_mmu_notifier_register(mn, mm, 0); | ||
| 221 | } | ||
| 222 | EXPORT_SYMBOL_GPL(__mmu_notifier_register); | ||
| 223 | |||
| 224 | /* this is called after the last mmu_notifier_unregister() returned */ | ||
| 225 | void __mmu_notifier_mm_destroy(struct mm_struct *mm) | ||
| 226 | { | ||
| 227 | BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list)); | ||
| 228 | kfree(mm->mmu_notifier_mm); | ||
| 229 | mm->mmu_notifier_mm = LIST_POISON1; /* debug */ | ||
| 230 | } | ||
| 231 | |||
| 232 | /* | ||
| 233 | * This releases the mm_count pin automatically and frees the mm | ||
| 234 | * structure if it was the last user of it. It serializes against | ||
| 235 | * running mmu notifiers with RCU and against mmu_notifier_unregister | ||
| 236 | * with the unregister lock + RCU. All sptes must be dropped before | ||
| 237 | * calling mmu_notifier_unregister. ->release or any other notifier | ||
| 238 | * method may be invoked concurrently with mmu_notifier_unregister, | ||
| 239 | * and only after mmu_notifier_unregister has returned are we guaranteed | ||
| 240 | * that ->release or any other method can no longer run. | ||
| 241 | */ | ||
| 242 | void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | ||
| 243 | { | ||
| 244 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
| 245 | |||
| 246 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
| 247 | if (!hlist_unhashed(&mn->hlist)) { | ||
| 248 | hlist_del_rcu(&mn->hlist); | ||
| 249 | |||
| 250 | /* | ||
| 251 | * RCU here will force exit_mmap to wait for ->release to finish | ||
| 252 | * before freeing the pages. | ||
| 253 | */ | ||
| 254 | rcu_read_lock(); | ||
| 255 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
| 256 | /* | ||
| 257 | * exit_mmap will block in mmu_notifier_release to | ||
| 258 | * guarantee ->release is called before freeing the | ||
| 259 | * pages. | ||
| 260 | */ | ||
| 261 | if (mn->ops->release) | ||
| 262 | mn->ops->release(mn, mm); | ||
| 263 | rcu_read_unlock(); | ||
| 264 | } else | ||
| 265 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
| 266 | |||
| 267 | /* | ||
| 268 | * Wait for any running method to finish, including | ||
| 269 | * ->release if it was run by mmu_notifier_release instead of us. | ||
| 270 | */ | ||
| 271 | synchronize_rcu(); | ||
| 272 | |||
| 273 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | ||
| 274 | |||
| 275 | mmdrop(mm); | ||
| 276 | } | ||
| 277 | EXPORT_SYMBOL_GPL(mmu_notifier_unregister); | ||
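For orientation only, here is a hedged sketch of how a secondary-MMU driver would consume this API; it is not part of this patch and all example_* names are hypothetical. The driver embeds a struct mmu_notifier, fills in just the callbacks it needs (the dispatchers above skip NULL entries), and registers it while it holds an mm_users reference such as current->mm or one taken with get_task_mm().

/* Hedged sketch of a hypothetical secondary-MMU driver. */
static void example_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* Tear down every secondary mapping (spte) for this mm. */
}

static void example_invalidate_range_start(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	/* Drop secondary mappings covering [start, end) before the
	 * primary ptes change; do not establish new ones until
	 * ->invalidate_range_end runs. */
}

static void example_invalidate_range_end(struct mmu_notifier *mn,
					 struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	/* Secondary mappings may be re-established from here on. */
}

static struct mmu_notifier_ops example_ops = {
	.release		= example_release,
	.invalidate_range_start	= example_invalidate_range_start,
	.invalidate_range_end	= example_invalidate_range_end,
};

static struct mmu_notifier example_mn = {
	.ops = &example_ops,
};

/* Called while current->mm is guaranteed alive (mm_users > 0). */
static int example_attach(void)
{
	return mmu_notifier_register(&example_mn, current->mm);
}

static void example_detach(struct mm_struct *mm)
{
	/* Safe before or after exit_mmap; releases the mm_count pin. */
	mmu_notifier_unregister(&example_mn, mm);
}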
diff --git a/mm/mprotect.c b/mm/mprotect.c index 360d9cc8b38c..fded06f923f4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
| 22 | #include <linux/swap.h> | 22 | #include <linux/swap.h> |
| 23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
| 24 | #include <linux/mmu_notifier.h> | ||
| 24 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
| 25 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
| 26 | #include <asm/cacheflush.h> | 27 | #include <asm/cacheflush.h> |
| @@ -153,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
| 153 | * If we make a private mapping writable we increase our commit; | 154 | * If we make a private mapping writable we increase our commit; |
| 154 | * but (without finer accounting) cannot reduce our commit if we | 155 | * but (without finer accounting) cannot reduce our commit if we |
| 155 | * make it unwritable again. | 156 | * make it unwritable again. |
| 156 | * | ||
| 157 | * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting | ||
| 158 | * a MAP_NORESERVE private mapping to writable will now reserve. | ||
| 159 | */ | 157 | */ |
| 160 | if (newflags & VM_WRITE) { | 158 | if (newflags & VM_WRITE) { |
| 161 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { | 159 | if (!(oldflags & (VM_ACCOUNT|VM_WRITE| |
| 160 | VM_SHARED|VM_NORESERVE))) { | ||
| 162 | charged = nrpages; | 161 | charged = nrpages; |
| 163 | if (security_vm_enough_memory(charged)) | 162 | if (security_vm_enough_memory(charged)) |
| 164 | return -ENOMEM; | 163 | return -ENOMEM; |
| @@ -205,10 +204,12 @@ success: | |||
| 205 | dirty_accountable = 1; | 204 | dirty_accountable = 1; |
| 206 | } | 205 | } |
| 207 | 206 | ||
| 207 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
| 208 | if (is_vm_hugetlb_page(vma)) | 208 | if (is_vm_hugetlb_page(vma)) |
| 209 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); | 209 | hugetlb_change_protection(vma, start, end, vma->vm_page_prot); |
| 210 | else | 210 | else |
| 211 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); | 211 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); |
| 212 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
| 212 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 213 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
| 213 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 214 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
| 214 | return 0; | 215 | return 0; |
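The producer-side rule introduced by this hunk (and repeated in mm/mremap.c just below) is to bracket every batch of primary page-table updates with the range hooks. A hedged sketch of that pattern, with a hypothetical helper name:

/* Hedged sketch; example_change_range() is hypothetical. */
static void example_change_range(struct vm_area_struct *vma,
				 unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start, end);
	/* ... rewrite the primary page tables for [start, end) ... */
	mmu_notifier_invalidate_range_end(mm, start, end);
}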
diff --git a/mm/mremap.c b/mm/mremap.c index 08e3c7f2bd15..1a7743923c8c 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/highmem.h> | 18 | #include <linux/highmem.h> |
| 19 | #include <linux/security.h> | 19 | #include <linux/security.h> |
| 20 | #include <linux/syscalls.h> | 20 | #include <linux/syscalls.h> |
| 21 | #include <linux/mmu_notifier.h> | ||
| 21 | 22 | ||
| 22 | #include <asm/uaccess.h> | 23 | #include <asm/uaccess.h> |
| 23 | #include <asm/cacheflush.h> | 24 | #include <asm/cacheflush.h> |
| @@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 74 | struct mm_struct *mm = vma->vm_mm; | 75 | struct mm_struct *mm = vma->vm_mm; |
| 75 | pte_t *old_pte, *new_pte, pte; | 76 | pte_t *old_pte, *new_pte, pte; |
| 76 | spinlock_t *old_ptl, *new_ptl; | 77 | spinlock_t *old_ptl, *new_ptl; |
| 78 | unsigned long old_start; | ||
| 77 | 79 | ||
| 80 | old_start = old_addr; | ||
| 81 | mmu_notifier_invalidate_range_start(vma->vm_mm, | ||
| 82 | old_start, old_end); | ||
| 78 | if (vma->vm_file) { | 83 | if (vma->vm_file) { |
| 79 | /* | 84 | /* |
| 80 | * Subtle point from Rajesh Venkatasubramanian: before | 85 | * Subtle point from Rajesh Venkatasubramanian: before |
| @@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 116 | pte_unmap_unlock(old_pte - 1, old_ptl); | 121 | pte_unmap_unlock(old_pte - 1, old_ptl); |
| 117 | if (mapping) | 122 | if (mapping) |
| 118 | spin_unlock(&mapping->i_mmap_lock); | 123 | spin_unlock(&mapping->i_mmap_lock); |
| 124 | mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); | ||
| 119 | } | 125 | } |
| 120 | 126 | ||
| 121 | #define LATENCY_LIMIT (64 * PAGE_SIZE) | 127 | #define LATENCY_LIMIT (64 * PAGE_SIZE) |
diff --git a/mm/nommu.c b/mm/nommu.c index 4462b6a3fcb9..ed75bc962fbe 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -22,7 +22,7 @@ | |||
| 22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
| 23 | #include <linux/slab.h> | 23 | #include <linux/slab.h> |
| 24 | #include <linux/vmalloc.h> | 24 | #include <linux/vmalloc.h> |
| 25 | #include <linux/ptrace.h> | 25 | #include <linux/tracehook.h> |
| 26 | #include <linux/blkdev.h> | 26 | #include <linux/blkdev.h> |
| 27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
| 28 | #include <linux/mount.h> | 28 | #include <linux/mount.h> |
| @@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node) | |||
| 266 | } | 266 | } |
| 267 | EXPORT_SYMBOL(vmalloc_node); | 267 | EXPORT_SYMBOL(vmalloc_node); |
| 268 | 268 | ||
| 269 | #ifndef PAGE_KERNEL_EXEC | ||
| 270 | # define PAGE_KERNEL_EXEC PAGE_KERNEL | ||
| 271 | #endif | ||
| 272 | |||
| 273 | /** | ||
| 274 | * vmalloc_exec - allocate virtually contiguous, executable memory | ||
| 275 | * @size: allocation size | ||
| 276 | * | ||
| 277 | * Kernel-internal function to allocate enough pages to cover @size | ||
| 278 | * from the page level allocator and map them into contiguous and | ||
| 279 | * executable kernel virtual space. | ||
| 280 | * | ||
| 281 | * For tight control over page level allocator and protection flags | ||
| 282 | * use __vmalloc() instead. | ||
| 283 | */ | ||
| 284 | |||
| 285 | void *vmalloc_exec(unsigned long size) | ||
| 286 | { | ||
| 287 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); | ||
| 288 | } | ||
| 289 | |||
| 269 | /** | 290 | /** |
| 270 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) | 291 | * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) |
| 271 | * @size: allocation size | 292 | * @size: allocation size |
| @@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file, | |||
| 745 | * it's being traced - otherwise breakpoints set in it may interfere | 766 | * it's being traced - otherwise breakpoints set in it may interfere |
| 746 | * with another untraced process | 767 | * with another untraced process |
| 747 | */ | 768 | */ |
| 748 | if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) | 769 | if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current)) |
| 749 | vm_flags &= ~VM_MAYSHARE; | 770 | vm_flags &= ~VM_MAYSHARE; |
| 750 | 771 | ||
| 751 | return vm_flags; | 772 | return vm_flags; |
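The new nommu vmalloc_exec() above simply forwards to __vmalloc() with PAGE_KERNEL_EXEC. A hedged usage sketch follows; the caller is hypothetical and the buffer is released with vfree() like any other __vmalloc() allocation.

/* Hedged sketch; example_exec_buffer() is hypothetical. */
static int example_exec_buffer(void)
{
	void *code = vmalloc_exec(PAGE_SIZE);	/* executable mapping on MMU kernels */

	if (!code)
		return -ENOMEM;

	/* ... copy generated instructions into 'code' and run them ... */

	vfree(code);
	return 0;
}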
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 94c6d8988ab3..24de8b65fdbd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
| 1088 | if (!mapping) | 1088 | if (!mapping) |
| 1089 | return 1; | 1089 | return 1; |
| 1090 | 1090 | ||
| 1091 | write_lock_irq(&mapping->tree_lock); | 1091 | spin_lock_irq(&mapping->tree_lock); |
| 1092 | mapping2 = page_mapping(page); | 1092 | mapping2 = page_mapping(page); |
| 1093 | if (mapping2) { /* Race with truncate? */ | 1093 | if (mapping2) { /* Race with truncate? */ |
| 1094 | BUG_ON(mapping2 != mapping); | 1094 | BUG_ON(mapping2 != mapping); |
| @@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
| 1102 | radix_tree_tag_set(&mapping->page_tree, | 1102 | radix_tree_tag_set(&mapping->page_tree, |
| 1103 | page_index(page), PAGECACHE_TAG_DIRTY); | 1103 | page_index(page), PAGECACHE_TAG_DIRTY); |
| 1104 | } | 1104 | } |
| 1105 | write_unlock_irq(&mapping->tree_lock); | 1105 | spin_unlock_irq(&mapping->tree_lock); |
| 1106 | if (mapping->host) { | 1106 | if (mapping->host) { |
| 1107 | /* !PageAnon && !swapper_space */ | 1107 | /* !PageAnon && !swapper_space */ |
| 1108 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 1108 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
| @@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page) | |||
| 1258 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1258 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
| 1259 | unsigned long flags; | 1259 | unsigned long flags; |
| 1260 | 1260 | ||
| 1261 | write_lock_irqsave(&mapping->tree_lock, flags); | 1261 | spin_lock_irqsave(&mapping->tree_lock, flags); |
| 1262 | ret = TestClearPageWriteback(page); | 1262 | ret = TestClearPageWriteback(page); |
| 1263 | if (ret) { | 1263 | if (ret) { |
| 1264 | radix_tree_tag_clear(&mapping->page_tree, | 1264 | radix_tree_tag_clear(&mapping->page_tree, |
| @@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page) | |||
| 1269 | __bdi_writeout_inc(bdi); | 1269 | __bdi_writeout_inc(bdi); |
| 1270 | } | 1270 | } |
| 1271 | } | 1271 | } |
| 1272 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1272 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
| 1273 | } else { | 1273 | } else { |
| 1274 | ret = TestClearPageWriteback(page); | 1274 | ret = TestClearPageWriteback(page); |
| 1275 | } | 1275 | } |
| @@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page) | |||
| 1287 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 1287 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
| 1288 | unsigned long flags; | 1288 | unsigned long flags; |
| 1289 | 1289 | ||
| 1290 | write_lock_irqsave(&mapping->tree_lock, flags); | 1290 | spin_lock_irqsave(&mapping->tree_lock, flags); |
| 1291 | ret = TestSetPageWriteback(page); | 1291 | ret = TestSetPageWriteback(page); |
| 1292 | if (!ret) { | 1292 | if (!ret) { |
| 1293 | radix_tree_tag_set(&mapping->page_tree, | 1293 | radix_tree_tag_set(&mapping->page_tree, |
| @@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page) | |||
| 1300 | radix_tree_tag_clear(&mapping->page_tree, | 1300 | radix_tree_tag_clear(&mapping->page_tree, |
| 1301 | page_index(page), | 1301 | page_index(page), |
| 1302 | PAGECACHE_TAG_DIRTY); | 1302 | PAGECACHE_TAG_DIRTY); |
| 1303 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 1303 | spin_unlock_irqrestore(&mapping->tree_lock, flags); |
| 1304 | } else { | 1304 | } else { |
| 1305 | ret = TestSetPageWriteback(page); | 1305 | ret = TestSetPageWriteback(page); |
| 1306 | } | 1306 | } |
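These hunks are part of the series-wide switch of mapping->tree_lock from an rwlock to a spinlock: writers now take the plain spinlock, while some lookups run under rcu_read_lock() instead (see the mm/readahead.c hunk further down). A hedged sketch of the writer-side pattern, with a hypothetical helper:

/* Hedged sketch; example_tag_dirty() is hypothetical. */
static void example_tag_dirty(struct address_space *mapping, pgoff_t index)
{
	unsigned long flags;

	spin_lock_irqsave(&mapping->tree_lock, flags);	/* exclusive writer */
	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
	spin_unlock_irqrestore(&mapping->tree_lock, flags);
}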
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79ac4afc908c..af982f7cdb2a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve; | |||
| 153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; | 153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; |
| 154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
| 155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
| 156 | unsigned long __initdata required_kernelcore; | 156 | static unsigned long __initdata required_kernelcore; |
| 157 | static unsigned long __initdata required_movablecore; | 157 | static unsigned long __initdata required_movablecore; |
| 158 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 158 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
| 159 | 159 | ||
| 160 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 160 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
| 161 | int movable_zone; | 161 | int movable_zone; |
| @@ -264,7 +264,7 @@ static void free_compound_page(struct page *page) | |||
| 264 | __free_pages_ok(page, compound_order(page)); | 264 | __free_pages_ok(page, compound_order(page)); |
| 265 | } | 265 | } |
| 266 | 266 | ||
| 267 | static void prep_compound_page(struct page *page, unsigned long order) | 267 | void prep_compound_page(struct page *page, unsigned long order) |
| 268 | { | 268 | { |
| 269 | int i; | 269 | int i; |
| 270 | int nr_pages = 1 << order; | 270 | int nr_pages = 1 << order; |
| @@ -432,8 +432,9 @@ static inline void __free_one_page(struct page *page, | |||
| 432 | 432 | ||
| 433 | buddy = __page_find_buddy(page, page_idx, order); | 433 | buddy = __page_find_buddy(page, page_idx, order); |
| 434 | if (!page_is_buddy(page, buddy, order)) | 434 | if (!page_is_buddy(page, buddy, order)) |
| 435 | break; /* Move the buddy up one level. */ | 435 | break; |
| 436 | 436 | ||
| 437 | /* Our buddy is free, merge with it and move up one order. */ | ||
| 437 | list_del(&buddy->lru); | 438 | list_del(&buddy->lru); |
| 438 | zone->free_area[order].nr_free--; | 439 | zone->free_area[order].nr_free--; |
| 439 | rmv_page_order(buddy); | 440 | rmv_page_order(buddy); |
| @@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 532 | /* | 533 | /* |
| 533 | * permit the bootmem allocator to evade page validation on high-order frees | 534 | * permit the bootmem allocator to evade page validation on high-order frees |
| 534 | */ | 535 | */ |
| 535 | void __free_pages_bootmem(struct page *page, unsigned int order) | 536 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
| 536 | { | 537 | { |
| 537 | if (order == 0) { | 538 | if (order == 0) { |
| 538 | __ClearPageReserved(page); | 539 | __ClearPageReserved(page); |
| @@ -673,9 +674,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | |||
| 673 | * Note that start_page and end_pages are not aligned on a pageblock | 674 | * Note that start_page and end_pages are not aligned on a pageblock |
| 674 | * boundary. If alignment is required, use move_freepages_block() | 675 | * boundary. If alignment is required, use move_freepages_block() |
| 675 | */ | 676 | */ |
| 676 | int move_freepages(struct zone *zone, | 677 | static int move_freepages(struct zone *zone, |
| 677 | struct page *start_page, struct page *end_page, | 678 | struct page *start_page, struct page *end_page, |
| 678 | int migratetype) | 679 | int migratetype) |
| 679 | { | 680 | { |
| 680 | struct page *page; | 681 | struct page *page; |
| 681 | unsigned long order; | 682 | unsigned long order; |
| @@ -714,7 +715,8 @@ int move_freepages(struct zone *zone, | |||
| 714 | return pages_moved; | 715 | return pages_moved; |
| 715 | } | 716 | } |
| 716 | 717 | ||
| 717 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | 718 | static int move_freepages_block(struct zone *zone, struct page *page, |
| 719 | int migratetype) | ||
| 718 | { | 720 | { |
| 719 | unsigned long start_pfn, end_pfn; | 721 | unsigned long start_pfn, end_pfn; |
| 720 | struct page *start_page, *end_page; | 722 | struct page *start_page, *end_page; |
| @@ -1429,7 +1431,7 @@ try_next_zone: | |||
| 1429 | /* | 1431 | /* |
| 1430 | * This is the 'heart' of the zoned buddy allocator. | 1432 | * This is the 'heart' of the zoned buddy allocator. |
| 1431 | */ | 1433 | */ |
| 1432 | static struct page * | 1434 | struct page * |
| 1433 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1435 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, |
| 1434 | struct zonelist *zonelist, nodemask_t *nodemask) | 1436 | struct zonelist *zonelist, nodemask_t *nodemask) |
| 1435 | { | 1437 | { |
| @@ -1632,22 +1634,7 @@ nopage: | |||
| 1632 | got_pg: | 1634 | got_pg: |
| 1633 | return page; | 1635 | return page; |
| 1634 | } | 1636 | } |
| 1635 | 1637 | EXPORT_SYMBOL(__alloc_pages_internal); | |
| 1636 | struct page * | ||
| 1637 | __alloc_pages(gfp_t gfp_mask, unsigned int order, | ||
| 1638 | struct zonelist *zonelist) | ||
| 1639 | { | ||
| 1640 | return __alloc_pages_internal(gfp_mask, order, zonelist, NULL); | ||
| 1641 | } | ||
| 1642 | |||
| 1643 | struct page * | ||
| 1644 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
| 1645 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
| 1646 | { | ||
| 1647 | return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask); | ||
| 1648 | } | ||
| 1649 | |||
| 1650 | EXPORT_SYMBOL(__alloc_pages); | ||
| 1651 | 1638 | ||
| 1652 | /* | 1639 | /* |
| 1653 | * Common helper functions. | 1640 | * Common helper functions. |
| @@ -1711,6 +1698,59 @@ void free_pages(unsigned long addr, unsigned int order) | |||
| 1711 | 1698 | ||
| 1712 | EXPORT_SYMBOL(free_pages); | 1699 | EXPORT_SYMBOL(free_pages); |
| 1713 | 1700 | ||
| 1701 | /** | ||
| 1702 | * alloc_pages_exact - allocate an exact number of physically-contiguous pages. | ||
| 1703 | * @size: the number of bytes to allocate | ||
| 1704 | * @gfp_mask: GFP flags for the allocation | ||
| 1705 | * | ||
| 1706 | * This function is similar to alloc_pages(), except that it allocates the | ||
| 1707 | * minimum number of pages to satisfy the request. alloc_pages() can only | ||
| 1708 | * allocate memory in power-of-two pages. | ||
| 1709 | * | ||
| 1710 | * This function is also limited by MAX_ORDER. | ||
| 1711 | * | ||
| 1712 | * Memory allocated by this function must be released by free_pages_exact(). | ||
| 1713 | */ | ||
| 1714 | void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | ||
| 1715 | { | ||
| 1716 | unsigned int order = get_order(size); | ||
| 1717 | unsigned long addr; | ||
| 1718 | |||
| 1719 | addr = __get_free_pages(gfp_mask, order); | ||
| 1720 | if (addr) { | ||
| 1721 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | ||
| 1722 | unsigned long used = addr + PAGE_ALIGN(size); | ||
| 1723 | |||
| 1724 | split_page(virt_to_page(addr), order); | ||
| 1725 | while (used < alloc_end) { | ||
| 1726 | free_page(used); | ||
| 1727 | used += PAGE_SIZE; | ||
| 1728 | } | ||
| 1729 | } | ||
| 1730 | |||
| 1731 | return (void *)addr; | ||
| 1732 | } | ||
| 1733 | EXPORT_SYMBOL(alloc_pages_exact); | ||
| 1734 | |||
| 1735 | /** | ||
| 1736 | * free_pages_exact - release memory allocated via alloc_pages_exact() | ||
| 1737 | * @virt: the value returned by alloc_pages_exact. | ||
| 1738 | * @size: size of allocation, same value as passed to alloc_pages_exact(). | ||
| 1739 | * | ||
| 1740 | * Release the memory allocated by a previous call to alloc_pages_exact. | ||
| 1741 | */ | ||
| 1742 | void free_pages_exact(void *virt, size_t size) | ||
| 1743 | { | ||
| 1744 | unsigned long addr = (unsigned long)virt; | ||
| 1745 | unsigned long end = addr + PAGE_ALIGN(size); | ||
| 1746 | |||
| 1747 | while (addr < end) { | ||
| 1748 | free_page(addr); | ||
| 1749 | addr += PAGE_SIZE; | ||
| 1750 | } | ||
| 1751 | } | ||
| 1752 | EXPORT_SYMBOL(free_pages_exact); | ||
| 1753 | |||
| 1714 | static unsigned int nr_free_zone_pages(int offset) | 1754 | static unsigned int nr_free_zone_pages(int offset) |
| 1715 | { | 1755 | { |
| 1716 | struct zoneref *z; | 1756 | struct zoneref *z; |
| @@ -2332,7 +2372,7 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
| 2332 | 2372 | ||
| 2333 | #endif /* CONFIG_NUMA */ | 2373 | #endif /* CONFIG_NUMA */ |
| 2334 | 2374 | ||
| 2335 | /* return values int ....just for stop_machine_run() */ | 2375 | /* return values int ....just for stop_machine() */ |
| 2336 | static int __build_all_zonelists(void *dummy) | 2376 | static int __build_all_zonelists(void *dummy) |
| 2337 | { | 2377 | { |
| 2338 | int nid; | 2378 | int nid; |
| @@ -2352,11 +2392,12 @@ void build_all_zonelists(void) | |||
| 2352 | 2392 | ||
| 2353 | if (system_state == SYSTEM_BOOTING) { | 2393 | if (system_state == SYSTEM_BOOTING) { |
| 2354 | __build_all_zonelists(NULL); | 2394 | __build_all_zonelists(NULL); |
| 2395 | mminit_verify_zonelist(); | ||
| 2355 | cpuset_init_current_mems_allowed(); | 2396 | cpuset_init_current_mems_allowed(); |
| 2356 | } else { | 2397 | } else { |
| 2357 | /* we have to stop all cpus to guarantee there is no user | 2398 | /* we have to stop all cpus to guarantee there is no user |
| 2358 | of zonelist */ | 2399 | of zonelist */ |
| 2359 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | 2400 | stop_machine(__build_all_zonelists, NULL, NULL); |
| 2360 | /* cpuset refresh routine should be here */ | 2401 | /* cpuset refresh routine should be here */ |
| 2361 | } | 2402 | } |
| 2362 | vm_total_pages = nr_free_pagecache_pages(); | 2403 | vm_total_pages = nr_free_pagecache_pages(); |
| @@ -2534,6 +2575,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 2534 | } | 2575 | } |
| 2535 | page = pfn_to_page(pfn); | 2576 | page = pfn_to_page(pfn); |
| 2536 | set_page_links(page, zone, nid, pfn); | 2577 | set_page_links(page, zone, nid, pfn); |
| 2578 | mminit_verify_page_links(page, zone, nid, pfn); | ||
| 2537 | init_page_count(page); | 2579 | init_page_count(page); |
| 2538 | reset_page_mapcount(page); | 2580 | reset_page_mapcount(page); |
| 2539 | SetPageReserved(page); | 2581 | SetPageReserved(page); |
| @@ -2611,7 +2653,7 @@ static int zone_batchsize(struct zone *zone) | |||
| 2611 | return batch; | 2653 | return batch; |
| 2612 | } | 2654 | } |
| 2613 | 2655 | ||
| 2614 | inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 2656 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
| 2615 | { | 2657 | { |
| 2616 | struct per_cpu_pages *pcp; | 2658 | struct per_cpu_pages *pcp; |
| 2617 | 2659 | ||
| @@ -2836,6 +2878,12 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
| 2836 | 2878 | ||
| 2837 | zone->zone_start_pfn = zone_start_pfn; | 2879 | zone->zone_start_pfn = zone_start_pfn; |
| 2838 | 2880 | ||
| 2881 | mminit_dprintk(MMINIT_TRACE, "memmap_init", | ||
| 2882 | "Initialising map node %d zone %lu pfns %lu -> %lu\n", | ||
| 2883 | pgdat->node_id, | ||
| 2884 | (unsigned long)zone_idx(zone), | ||
| 2885 | zone_start_pfn, (zone_start_pfn + size)); | ||
| 2886 | |||
| 2839 | zone_init_free_lists(zone); | 2887 | zone_init_free_lists(zone); |
| 2840 | 2888 | ||
| 2841 | return 0; | 2889 | return 0; |
| @@ -2975,7 +3023,8 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
| 2975 | void __init push_node_boundaries(unsigned int nid, | 3023 | void __init push_node_boundaries(unsigned int nid, |
| 2976 | unsigned long start_pfn, unsigned long end_pfn) | 3024 | unsigned long start_pfn, unsigned long end_pfn) |
| 2977 | { | 3025 | { |
| 2978 | printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", | 3026 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", |
| 3027 | "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
| 2979 | nid, start_pfn, end_pfn); | 3028 | nid, start_pfn, end_pfn); |
| 2980 | 3029 | ||
| 2981 | /* Initialise the boundary for this node if necessary */ | 3030 | /* Initialise the boundary for this node if necessary */ |
| @@ -2993,7 +3042,8 @@ void __init push_node_boundaries(unsigned int nid, | |||
| 2993 | static void __meminit account_node_boundary(unsigned int nid, | 3042 | static void __meminit account_node_boundary(unsigned int nid, |
| 2994 | unsigned long *start_pfn, unsigned long *end_pfn) | 3043 | unsigned long *start_pfn, unsigned long *end_pfn) |
| 2995 | { | 3044 | { |
| 2996 | printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", | 3045 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", |
| 3046 | "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
| 2997 | nid, *start_pfn, *end_pfn); | 3047 | nid, *start_pfn, *end_pfn); |
| 2998 | 3048 | ||
| 2999 | /* Return if boundary information has not been provided */ | 3049 | /* Return if boundary information has not been provided */ |
| @@ -3050,7 +3100,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
| 3050 | * assumption is made that zones within a node are ordered in monotonic | 3100 | * assumption is made that zones within a node are ordered in monotonic |
| 3051 | * increasing memory addresses so that the "highest" populated zone is used | 3101 | * increasing memory addresses so that the "highest" populated zone is used |
| 3052 | */ | 3102 | */ |
| 3053 | void __init find_usable_zone_for_movable(void) | 3103 | static void __init find_usable_zone_for_movable(void) |
| 3054 | { | 3104 | { |
| 3055 | int zone_index; | 3105 | int zone_index; |
| 3056 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { | 3106 | for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { |
| @@ -3076,7 +3126,7 @@ void __init find_usable_zone_for_movable(void) | |||
| 3076 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that | 3126 | * highest usable zone for ZONE_MOVABLE. This preserves the assumption that |
| 3077 | * zones within a node are in order of monotonic increases memory addresses | 3127 | * zones within a node are in order of monotonic increases memory addresses |
| 3078 | */ | 3128 | */ |
| 3079 | void __meminit adjust_zone_range_for_zone_movable(int nid, | 3129 | static void __meminit adjust_zone_range_for_zone_movable(int nid, |
| 3080 | unsigned long zone_type, | 3130 | unsigned long zone_type, |
| 3081 | unsigned long node_start_pfn, | 3131 | unsigned long node_start_pfn, |
| 3082 | unsigned long node_end_pfn, | 3132 | unsigned long node_end_pfn, |
| @@ -3137,7 +3187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
| 3137 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3187 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
| 3138 | * then all holes in the requested range will be accounted for. | 3188 | * then all holes in the requested range will be accounted for. |
| 3139 | */ | 3189 | */ |
| 3140 | unsigned long __meminit __absent_pages_in_range(int nid, | 3190 | static unsigned long __meminit __absent_pages_in_range(int nid, |
| 3141 | unsigned long range_start_pfn, | 3191 | unsigned long range_start_pfn, |
| 3142 | unsigned long range_end_pfn) | 3192 | unsigned long range_end_pfn) |
| 3143 | { | 3193 | { |
| @@ -3368,8 +3418,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3368 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 3418 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; |
| 3369 | if (realsize >= memmap_pages) { | 3419 | if (realsize >= memmap_pages) { |
| 3370 | realsize -= memmap_pages; | 3420 | realsize -= memmap_pages; |
| 3371 | printk(KERN_DEBUG | 3421 | mminit_dprintk(MMINIT_TRACE, "memmap_init", |
| 3372 | " %s zone: %lu pages used for memmap\n", | 3422 | "%s zone: %lu pages used for memmap\n", |
| 3373 | zone_names[j], memmap_pages); | 3423 | zone_names[j], memmap_pages); |
| 3374 | } else | 3424 | } else |
| 3375 | printk(KERN_WARNING | 3425 | printk(KERN_WARNING |
| @@ -3379,7 +3429,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3379 | /* Account for reserved pages */ | 3429 | /* Account for reserved pages */ |
| 3380 | if (j == 0 && realsize > dma_reserve) { | 3430 | if (j == 0 && realsize > dma_reserve) { |
| 3381 | realsize -= dma_reserve; | 3431 | realsize -= dma_reserve; |
| 3382 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 3432 | mminit_dprintk(MMINIT_TRACE, "memmap_init", |
| 3433 | "%s zone: %lu pages reserved\n", | ||
| 3383 | zone_names[0], dma_reserve); | 3434 | zone_names[0], dma_reserve); |
| 3384 | } | 3435 | } |
| 3385 | 3436 | ||
| @@ -3464,10 +3515,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) | |||
| 3464 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 3515 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
| 3465 | } | 3516 | } |
| 3466 | 3517 | ||
| 3467 | void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, | 3518 | void __paginginit free_area_init_node(int nid, unsigned long *zones_size, |
| 3468 | unsigned long *zones_size, unsigned long node_start_pfn, | 3519 | unsigned long node_start_pfn, unsigned long *zholes_size) |
| 3469 | unsigned long *zholes_size) | ||
| 3470 | { | 3520 | { |
| 3521 | pg_data_t *pgdat = NODE_DATA(nid); | ||
| 3522 | |||
| 3471 | pgdat->node_id = nid; | 3523 | pgdat->node_id = nid; |
| 3472 | pgdat->node_start_pfn = node_start_pfn; | 3524 | pgdat->node_start_pfn = node_start_pfn; |
| 3473 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 3525 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
| @@ -3520,10 +3572,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, | |||
| 3520 | { | 3572 | { |
| 3521 | int i; | 3573 | int i; |
| 3522 | 3574 | ||
| 3523 | printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " | 3575 | mminit_dprintk(MMINIT_TRACE, "memory_register", |
| 3524 | "%d entries of %d used\n", | 3576 | "Entering add_active_range(%d, %#lx, %#lx) " |
| 3525 | nid, start_pfn, end_pfn, | 3577 | "%d entries of %d used\n", |
| 3526 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | 3578 | nid, start_pfn, end_pfn, |
| 3579 | nr_nodemap_entries, MAX_ACTIVE_REGIONS); | ||
| 3580 | |||
| 3581 | mminit_validate_memmodel_limits(&start_pfn, &end_pfn); | ||
| 3527 | 3582 | ||
| 3528 | /* Merge with existing active regions if possible */ | 3583 | /* Merge with existing active regions if possible */ |
| 3529 | for (i = 0; i < nr_nodemap_entries; i++) { | 3584 | for (i = 0; i < nr_nodemap_entries; i++) { |
| @@ -3669,7 +3724,7 @@ static void __init sort_node_map(void) | |||
| 3669 | } | 3724 | } |
| 3670 | 3725 | ||
| 3671 | /* Find the lowest pfn for a node */ | 3726 | /* Find the lowest pfn for a node */ |
| 3672 | unsigned long __init find_min_pfn_for_node(int nid) | 3727 | static unsigned long __init find_min_pfn_for_node(int nid) |
| 3673 | { | 3728 | { |
| 3674 | int i; | 3729 | int i; |
| 3675 | unsigned long min_pfn = ULONG_MAX; | 3730 | unsigned long min_pfn = ULONG_MAX; |
| @@ -3698,23 +3753,6 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
| 3698 | return find_min_pfn_for_node(MAX_NUMNODES); | 3753 | return find_min_pfn_for_node(MAX_NUMNODES); |
| 3699 | } | 3754 | } |
| 3700 | 3755 | ||
| 3701 | /** | ||
| 3702 | * find_max_pfn_with_active_regions - Find the maximum PFN registered | ||
| 3703 | * | ||
| 3704 | * It returns the maximum PFN based on information provided via | ||
| 3705 | * add_active_range(). | ||
| 3706 | */ | ||
| 3707 | unsigned long __init find_max_pfn_with_active_regions(void) | ||
| 3708 | { | ||
| 3709 | int i; | ||
| 3710 | unsigned long max_pfn = 0; | ||
| 3711 | |||
| 3712 | for (i = 0; i < nr_nodemap_entries; i++) | ||
| 3713 | max_pfn = max(max_pfn, early_node_map[i].end_pfn); | ||
| 3714 | |||
| 3715 | return max_pfn; | ||
| 3716 | } | ||
| 3717 | |||
| 3718 | /* | 3756 | /* |
| 3719 | * early_calculate_totalpages() | 3757 | * early_calculate_totalpages() |
| 3720 | * Sum pages in active regions for movable zone. | 3758 | * Sum pages in active regions for movable zone. |
| @@ -3741,7 +3779,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
| 3741 | * memory. When they don't, some nodes will have more kernelcore than | 3779 | * memory. When they don't, some nodes will have more kernelcore than |
| 3742 | * others | 3780 | * others |
| 3743 | */ | 3781 | */ |
| 3744 | void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | 3782 | static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) |
| 3745 | { | 3783 | { |
| 3746 | int i, nid; | 3784 | int i, nid; |
| 3747 | unsigned long usable_startpfn; | 3785 | unsigned long usable_startpfn; |
| @@ -3957,10 +3995,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 3957 | early_node_map[i].end_pfn); | 3995 | early_node_map[i].end_pfn); |
| 3958 | 3996 | ||
| 3959 | /* Initialise every node */ | 3997 | /* Initialise every node */ |
| 3998 | mminit_verify_pageflags_layout(); | ||
| 3960 | setup_nr_node_ids(); | 3999 | setup_nr_node_ids(); |
| 3961 | for_each_online_node(nid) { | 4000 | for_each_online_node(nid) { |
| 3962 | pg_data_t *pgdat = NODE_DATA(nid); | 4001 | pg_data_t *pgdat = NODE_DATA(nid); |
| 3963 | free_area_init_node(nid, pgdat, NULL, | 4002 | free_area_init_node(nid, NULL, |
| 3964 | find_min_pfn_for_node(nid), NULL); | 4003 | find_min_pfn_for_node(nid), NULL); |
| 3965 | 4004 | ||
| 3966 | /* Any memory on that node */ | 4005 | /* Any memory on that node */ |
| @@ -4025,15 +4064,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
| 4025 | } | 4064 | } |
| 4026 | 4065 | ||
| 4027 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4066 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 4028 | static bootmem_data_t contig_bootmem_data; | 4067 | struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] }; |
| 4029 | struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; | ||
| 4030 | |||
| 4031 | EXPORT_SYMBOL(contig_page_data); | 4068 | EXPORT_SYMBOL(contig_page_data); |
| 4032 | #endif | 4069 | #endif |
| 4033 | 4070 | ||
| 4034 | void __init free_area_init(unsigned long *zones_size) | 4071 | void __init free_area_init(unsigned long *zones_size) |
| 4035 | { | 4072 | { |
| 4036 | free_area_init_node(0, NODE_DATA(0), zones_size, | 4073 | free_area_init_node(0, zones_size, |
| 4037 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 4074 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
| 4038 | } | 4075 | } |
| 4039 | 4076 | ||
| @@ -4400,7 +4437,7 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 4400 | do { | 4437 | do { |
| 4401 | size = bucketsize << log2qty; | 4438 | size = bucketsize << log2qty; |
| 4402 | if (flags & HASH_EARLY) | 4439 | if (flags & HASH_EARLY) |
| 4403 | table = alloc_bootmem(size); | 4440 | table = alloc_bootmem_nopanic(size); |
| 4404 | else if (hashdist) | 4441 | else if (hashdist) |
| 4405 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4442 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
| 4406 | else { | 4443 | else { |
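The alloc_pages_exact()/free_pages_exact() pair added earlier in this file avoids the power-of-two rounding of __get_free_pages() by splitting the high-order allocation and freeing the unused tail pages. A hedged usage sketch, with a hypothetical caller and size:

/* Hedged sketch; example_use_exact_pages() is hypothetical. */
static int example_use_exact_pages(void)
{
	size_t len = 5 * PAGE_SIZE;	/* not a power-of-two number of pages */
	void *buf;

	buf = alloc_pages_exact(len, GFP_KERNEL);	/* backs exactly 5 pages, not 8 */
	if (!buf)
		return -ENOMEM;

	memset(buf, 0, len);
	/* ... use buf as a physically contiguous buffer ... */

	free_pages_exact(buf, len);	/* pass the same size as the allocation */
	return 0;
}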
diff --git a/mm/pdflush.c b/mm/pdflush.c index 9d834aa4b979..0cbe0c60c6bf 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
| @@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work) | |||
| 130 | * Thread creation: For how long have there been zero | 130 | * Thread creation: For how long have there been zero |
| 131 | * available threads? | 131 | * available threads? |
| 132 | */ | 132 | */ |
| 133 | if (jiffies - last_empty_jifs > 1 * HZ) { | 133 | if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { |
| 134 | /* unlocked list_empty() test is OK here */ | 134 | /* unlocked list_empty() test is OK here */ |
| 135 | if (list_empty(&pdflush_list)) { | 135 | if (list_empty(&pdflush_list)) { |
| 136 | /* unlocked test is OK here */ | 136 | /* unlocked test is OK here */ |
| @@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work) | |||
| 151 | if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) | 151 | if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) |
| 152 | continue; | 152 | continue; |
| 153 | pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); | 153 | pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); |
| 154 | if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { | 154 | if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { |
| 155 | /* Limit exit rate */ | 155 | /* Limit exit rate */ |
| 156 | pdf->when_i_went_to_sleep = jiffies; | 156 | pdf->when_i_went_to_sleep = jiffies; |
| 157 | break; /* exeunt */ | 157 | break; /* exeunt */ |
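Replacing the open-coded jiffies subtraction with time_after() keeps the comparison correct even when jiffies wraps around. A hedged sketch of the idiom, with a hypothetical helper:

/* Hedged sketch; example_expired() is hypothetical. */
static int example_expired(unsigned long went_to_sleep)
{
	/* True once at least one second has passed, even across a jiffies wrap. */
	return time_after(jiffies, went_to_sleep + 1 * HZ);
}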
diff --git a/mm/readahead.c b/mm/readahead.c index d8723a5f6496..77e8ddf945e9 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping, | |||
| 382 | if (hit_readahead_marker) { | 382 | if (hit_readahead_marker) { |
| 383 | pgoff_t start; | 383 | pgoff_t start; |
| 384 | 384 | ||
| 385 | read_lock_irq(&mapping->tree_lock); | 385 | rcu_read_lock(); |
| 386 | start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); | 386 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); |
| 387 | read_unlock_irq(&mapping->tree_lock); | 387 | rcu_read_unlock(); |
| 388 | 388 | ||
| 389 | if (!start || start - offset > max) | 389 | if (!start || start - offset > max) |
| 390 | return 0; | 390 | return 0; |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -49,6 +49,7 @@ | |||
| 49 | #include <linux/module.h> | 49 | #include <linux/module.h> |
| 50 | #include <linux/kallsyms.h> | 50 | #include <linux/kallsyms.h> |
| 51 | #include <linux/memcontrol.h> | 51 | #include <linux/memcontrol.h> |
| 52 | #include <linux/mmu_notifier.h> | ||
| 52 | 53 | ||
| 53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
| 54 | 55 | ||
| @@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
| 138 | anon_vma_free(anon_vma); | 139 | anon_vma_free(anon_vma); |
| 139 | } | 140 | } |
| 140 | 141 | ||
| 141 | static void anon_vma_ctor(struct kmem_cache *cachep, void *data) | 142 | static void anon_vma_ctor(void *data) |
| 142 | { | 143 | { |
| 143 | struct anon_vma *anon_vma = data; | 144 | struct anon_vma *anon_vma = data; |
| 144 | 145 | ||
| @@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page, | |||
| 287 | if (vma->vm_flags & VM_LOCKED) { | 288 | if (vma->vm_flags & VM_LOCKED) { |
| 288 | referenced++; | 289 | referenced++; |
| 289 | *mapcount = 1; /* break early from loop */ | 290 | *mapcount = 1; /* break early from loop */ |
| 290 | } else if (ptep_clear_flush_young(vma, address, pte)) | 291 | } else if (ptep_clear_flush_young_notify(vma, address, pte)) |
| 291 | referenced++; | 292 | referenced++; |
| 292 | 293 | ||
| 293 | /* Pretend the page is referenced if the task has the | 294 | /* Pretend the page is referenced if the task has the |
| @@ -421,7 +422,7 @@ int page_referenced(struct page *page, int is_locked, | |||
| 421 | referenced += page_referenced_anon(page, mem_cont); | 422 | referenced += page_referenced_anon(page, mem_cont); |
| 422 | else if (is_locked) | 423 | else if (is_locked) |
| 423 | referenced += page_referenced_file(page, mem_cont); | 424 | referenced += page_referenced_file(page, mem_cont); |
| 424 | else if (TestSetPageLocked(page)) | 425 | else if (!trylock_page(page)) |
| 425 | referenced++; | 426 | referenced++; |
| 426 | else { | 427 | else { |
| 427 | if (page->mapping) | 428 | if (page->mapping) |
| @@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
| 457 | pte_t entry; | 458 | pte_t entry; |
| 458 | 459 | ||
| 459 | flush_cache_page(vma, address, pte_pfn(*pte)); | 460 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| 460 | entry = ptep_clear_flush(vma, address, pte); | 461 | entry = ptep_clear_flush_notify(vma, address, pte); |
| 461 | entry = pte_wrprotect(entry); | 462 | entry = pte_wrprotect(entry); |
| 462 | entry = pte_mkclean(entry); | 463 | entry = pte_mkclean(entry); |
| 463 | set_pte_at(mm, address, pte, entry); | 464 | set_pte_at(mm, address, pte, entry); |
| @@ -576,14 +577,8 @@ void page_add_anon_rmap(struct page *page, | |||
| 576 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 577 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 577 | if (atomic_inc_and_test(&page->_mapcount)) | 578 | if (atomic_inc_and_test(&page->_mapcount)) |
| 578 | __page_set_anon_rmap(page, vma, address); | 579 | __page_set_anon_rmap(page, vma, address); |
| 579 | else { | 580 | else |
| 580 | __page_check_anon_rmap(page, vma, address); | 581 | __page_check_anon_rmap(page, vma, address); |
| 581 | /* | ||
| 582 | * We unconditionally charged during prepare, we uncharge here | ||
| 583 | * This takes care of balancing the reference counts | ||
| 584 | */ | ||
| 585 | mem_cgroup_uncharge_page(page); | ||
| 586 | } | ||
| 587 | } | 582 | } |
| 588 | 583 | ||
| 589 | /** | 584 | /** |
| @@ -614,12 +609,6 @@ void page_add_file_rmap(struct page *page) | |||
| 614 | { | 609 | { |
| 615 | if (atomic_inc_and_test(&page->_mapcount)) | 610 | if (atomic_inc_and_test(&page->_mapcount)) |
| 616 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 611 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
| 617 | else | ||
| 618 | /* | ||
| 619 | * We unconditionally charged during prepare, we uncharge here | ||
| 620 | * This takes care of balancing the reference counts | ||
| 621 | */ | ||
| 622 | mem_cgroup_uncharge_page(page); | ||
| 623 | } | 612 | } |
| 624 | 613 | ||
| 625 | #ifdef CONFIG_DEBUG_VM | 614 | #ifdef CONFIG_DEBUG_VM |
| @@ -678,7 +667,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) | |||
| 678 | * Leaving it set also helps swapoff to reinstate ptes | 667 | * Leaving it set also helps swapoff to reinstate ptes |
| 679 | * faster for those pages still in swapcache. | 668 | * faster for those pages still in swapcache. |
| 680 | */ | 669 | */ |
| 681 | if (page_test_dirty(page)) { | 670 | if ((!PageAnon(page) || PageSwapCache(page)) && |
| 671 | page_test_dirty(page)) { | ||
| 682 | page_clear_dirty(page); | 672 | page_clear_dirty(page); |
| 683 | set_page_dirty(page); | 673 | set_page_dirty(page); |
| 684 | } | 674 | } |
| @@ -717,14 +707,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 717 | * skipped over this mm) then we should reactivate it. | 707 | * skipped over this mm) then we should reactivate it. |
| 718 | */ | 708 | */ |
| 719 | if (!migration && ((vma->vm_flags & VM_LOCKED) || | 709 | if (!migration && ((vma->vm_flags & VM_LOCKED) || |
| 720 | (ptep_clear_flush_young(vma, address, pte)))) { | 710 | (ptep_clear_flush_young_notify(vma, address, pte)))) { |
| 721 | ret = SWAP_FAIL; | 711 | ret = SWAP_FAIL; |
| 722 | goto out_unmap; | 712 | goto out_unmap; |
| 723 | } | 713 | } |
| 724 | 714 | ||
| 725 | /* Nuke the page table entry. */ | 715 | /* Nuke the page table entry. */ |
| 726 | flush_cache_page(vma, address, page_to_pfn(page)); | 716 | flush_cache_page(vma, address, page_to_pfn(page)); |
| 727 | pteval = ptep_clear_flush(vma, address, pte); | 717 | pteval = ptep_clear_flush_notify(vma, address, pte); |
| 728 | 718 | ||
| 729 | /* Move the dirty bit to the physical page now the pte is gone. */ | 719 | /* Move the dirty bit to the physical page now the pte is gone. */ |
| 730 | if (pte_dirty(pteval)) | 720 | if (pte_dirty(pteval)) |
| @@ -849,12 +839,12 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
| 849 | page = vm_normal_page(vma, address, *pte); | 839 | page = vm_normal_page(vma, address, *pte); |
| 850 | BUG_ON(!page || PageAnon(page)); | 840 | BUG_ON(!page || PageAnon(page)); |
| 851 | 841 | ||
| 852 | if (ptep_clear_flush_young(vma, address, pte)) | 842 | if (ptep_clear_flush_young_notify(vma, address, pte)) |
| 853 | continue; | 843 | continue; |
| 854 | 844 | ||
| 855 | /* Nuke the page table entry. */ | 845 | /* Nuke the page table entry. */ |
| 856 | flush_cache_page(vma, address, pte_pfn(*pte)); | 846 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| 857 | pteval = ptep_clear_flush(vma, address, pte); | 847 | pteval = ptep_clear_flush_notify(vma, address, pte); |
| 858 | 848 | ||
| 859 | /* If nonlinear, store the file page offset in the pte. */ | 849 | /* If nonlinear, store the file page offset in the pte. */ |
| 860 | if (page->index != linear_page_index(vma, address)) | 850 | if (page->index != linear_page_index(vma, address)) |
diff --git a/mm/shmem.c b/mm/shmem.c index e2a6ae1a44e9..04fb4f1ab88e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -922,20 +922,26 @@ found: | |||
| 922 | error = 1; | 922 | error = 1; |
| 923 | if (!inode) | 923 | if (!inode) |
| 924 | goto out; | 924 | goto out; |
| 925 | /* Precharge page while we can wait, compensate afterwards */ | 925 | /* Precharge page using GFP_KERNEL while we can wait */ |
| 926 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); | 926 | error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); |
| 927 | if (error) | 927 | if (error) |
| 928 | goto out; | 928 | goto out; |
| 929 | error = radix_tree_preload(GFP_KERNEL); | 929 | error = radix_tree_preload(GFP_KERNEL); |
| 930 | if (error) | 930 | if (error) { |
| 931 | goto uncharge; | 931 | mem_cgroup_uncharge_cache_page(page); |
| 932 | goto out; | ||
| 933 | } | ||
| 932 | error = 1; | 934 | error = 1; |
| 933 | 935 | ||
| 934 | spin_lock(&info->lock); | 936 | spin_lock(&info->lock); |
| 935 | ptr = shmem_swp_entry(info, idx, NULL); | 937 | ptr = shmem_swp_entry(info, idx, NULL); |
| 936 | if (ptr && ptr->val == entry.val) | 938 | if (ptr && ptr->val == entry.val) { |
| 937 | error = add_to_page_cache(page, inode->i_mapping, | 939 | error = add_to_page_cache_locked(page, inode->i_mapping, |
| 938 | idx, GFP_NOWAIT); | 940 | idx, GFP_NOWAIT); |
| 941 | /* does mem_cgroup_uncharge_cache_page on error */ | ||
| 942 | } else /* we must compensate for our precharge above */ | ||
| 943 | mem_cgroup_uncharge_cache_page(page); | ||
| 944 | |||
| 939 | if (error == -EEXIST) { | 945 | if (error == -EEXIST) { |
| 940 | struct page *filepage = find_get_page(inode->i_mapping, idx); | 946 | struct page *filepage = find_get_page(inode->i_mapping, idx); |
| 941 | error = 1; | 947 | error = 1; |
| @@ -961,8 +967,6 @@ found: | |||
| 961 | shmem_swp_unmap(ptr); | 967 | shmem_swp_unmap(ptr); |
| 962 | spin_unlock(&info->lock); | 968 | spin_unlock(&info->lock); |
| 963 | radix_tree_preload_end(); | 969 | radix_tree_preload_end(); |
| 964 | uncharge: | ||
| 965 | mem_cgroup_uncharge_page(page); | ||
| 966 | out: | 970 | out: |
| 967 | unlock_page(page); | 971 | unlock_page(page); |
| 968 | page_cache_release(page); | 972 | page_cache_release(page); |
| @@ -1261,7 +1265,7 @@ repeat: | |||
| 1261 | } | 1265 | } |
| 1262 | 1266 | ||
| 1263 | /* We have to do this with page locked to prevent races */ | 1267 | /* We have to do this with page locked to prevent races */ |
| 1264 | if (TestSetPageLocked(swappage)) { | 1268 | if (!trylock_page(swappage)) { |
| 1265 | shmem_swp_unmap(entry); | 1269 | shmem_swp_unmap(entry); |
| 1266 | spin_unlock(&info->lock); | 1270 | spin_unlock(&info->lock); |
| 1267 | wait_on_page_locked(swappage); | 1271 | wait_on_page_locked(swappage); |
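Note the inverted sense in the trylock conversions: TestSetPageLocked() returned non-zero when the page was already locked, whereas trylock_page() returns non-zero when the lock was acquired, hence the added negations. A minimal sketch of the helper, assuming it wraps the same PG_locked bit:

	static inline int trylock_page(struct page *page)
	{
		/* non-zero on success, zero if someone else holds PG_locked */
		return !test_and_set_bit(PG_locked, &page->flags);
	}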
| @@ -1297,8 +1301,8 @@ repeat: | |||
| 1297 | SetPageUptodate(filepage); | 1301 | SetPageUptodate(filepage); |
| 1298 | set_page_dirty(filepage); | 1302 | set_page_dirty(filepage); |
| 1299 | swap_free(swap); | 1303 | swap_free(swap); |
| 1300 | } else if (!(error = add_to_page_cache( | 1304 | } else if (!(error = add_to_page_cache_locked(swappage, mapping, |
| 1301 | swappage, mapping, idx, GFP_NOWAIT))) { | 1305 | idx, GFP_NOWAIT))) { |
| 1302 | info->flags |= SHMEM_PAGEIN; | 1306 | info->flags |= SHMEM_PAGEIN; |
| 1303 | shmem_swp_set(info, entry, 0); | 1307 | shmem_swp_set(info, entry, 0); |
| 1304 | shmem_swp_unmap(entry); | 1308 | shmem_swp_unmap(entry); |
| @@ -1311,24 +1315,21 @@ repeat: | |||
| 1311 | shmem_swp_unmap(entry); | 1315 | shmem_swp_unmap(entry); |
| 1312 | spin_unlock(&info->lock); | 1316 | spin_unlock(&info->lock); |
| 1313 | unlock_page(swappage); | 1317 | unlock_page(swappage); |
| 1318 | page_cache_release(swappage); | ||
| 1314 | if (error == -ENOMEM) { | 1319 | if (error == -ENOMEM) { |
| 1315 | /* allow reclaim from this memory cgroup */ | 1320 | /* allow reclaim from this memory cgroup */ |
| 1316 | error = mem_cgroup_cache_charge(swappage, | 1321 | error = mem_cgroup_shrink_usage(current->mm, |
| 1317 | current->mm, gfp & ~__GFP_HIGHMEM); | 1322 | gfp); |
| 1318 | if (error) { | 1323 | if (error) |
| 1319 | page_cache_release(swappage); | ||
| 1320 | goto failed; | 1324 | goto failed; |
| 1321 | } | ||
| 1322 | mem_cgroup_uncharge_page(swappage); | ||
| 1323 | } | 1325 | } |
| 1324 | page_cache_release(swappage); | ||
| 1325 | goto repeat; | 1326 | goto repeat; |
| 1326 | } | 1327 | } |
| 1327 | } else if (sgp == SGP_READ && !filepage) { | 1328 | } else if (sgp == SGP_READ && !filepage) { |
| 1328 | shmem_swp_unmap(entry); | 1329 | shmem_swp_unmap(entry); |
| 1329 | filepage = find_get_page(mapping, idx); | 1330 | filepage = find_get_page(mapping, idx); |
| 1330 | if (filepage && | 1331 | if (filepage && |
| 1331 | (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { | 1332 | (!PageUptodate(filepage) || !trylock_page(filepage))) { |
| 1332 | spin_unlock(&info->lock); | 1333 | spin_unlock(&info->lock); |
| 1333 | wait_on_page_locked(filepage); | 1334 | wait_on_page_locked(filepage); |
| 1334 | page_cache_release(filepage); | 1335 | page_cache_release(filepage); |
| @@ -1358,6 +1359,8 @@ repeat: | |||
| 1358 | } | 1359 | } |
| 1359 | 1360 | ||
| 1360 | if (!filepage) { | 1361 | if (!filepage) { |
| 1362 | int ret; | ||
| 1363 | |||
| 1361 | spin_unlock(&info->lock); | 1364 | spin_unlock(&info->lock); |
| 1362 | filepage = shmem_alloc_page(gfp, info, idx); | 1365 | filepage = shmem_alloc_page(gfp, info, idx); |
| 1363 | if (!filepage) { | 1366 | if (!filepage) { |
| @@ -1386,10 +1389,18 @@ repeat: | |||
| 1386 | swap = *entry; | 1389 | swap = *entry; |
| 1387 | shmem_swp_unmap(entry); | 1390 | shmem_swp_unmap(entry); |
| 1388 | } | 1391 | } |
| 1389 | if (error || swap.val || 0 != add_to_page_cache_lru( | 1392 | ret = error || swap.val; |
| 1390 | filepage, mapping, idx, GFP_NOWAIT)) { | 1393 | if (ret) |
| 1394 | mem_cgroup_uncharge_cache_page(filepage); | ||
| 1395 | else | ||
| 1396 | ret = add_to_page_cache_lru(filepage, mapping, | ||
| 1397 | idx, GFP_NOWAIT); | ||
| 1398 | /* | ||
| 1399 | * At add_to_page_cache_lru() failure, uncharge will | ||
| 1400 | * be done automatically. | ||
| 1401 | */ | ||
| 1402 | if (ret) { | ||
| 1391 | spin_unlock(&info->lock); | 1403 | spin_unlock(&info->lock); |
| 1392 | mem_cgroup_uncharge_page(filepage); | ||
| 1393 | page_cache_release(filepage); | 1404 | page_cache_release(filepage); |
| 1394 | shmem_unacct_blocks(info->flags, 1); | 1405 | shmem_unacct_blocks(info->flags, 1); |
| 1395 | shmem_free_blocks(inode, 1); | 1406 | shmem_free_blocks(inode, 1); |
| @@ -1398,7 +1409,6 @@ repeat: | |||
| 1398 | goto failed; | 1409 | goto failed; |
| 1399 | goto repeat; | 1410 | goto repeat; |
| 1400 | } | 1411 | } |
| 1401 | mem_cgroup_uncharge_page(filepage); | ||
| 1402 | info->flags |= SHMEM_PAGEIN; | 1412 | info->flags |= SHMEM_PAGEIN; |
| 1403 | } | 1413 | } |
| 1404 | 1414 | ||
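The shmem hunks above all follow one memcg contract: the caller precharges the page up front, compensates itself only when the page never reaches the page cache, and otherwise relies on add_to_page_cache_locked()/add_to_page_cache_lru() to drop the charge on their own failure paths. A hypothetical helper (not in the tree) condensing that pattern:

	static int shmem_insert_precharged(struct page *page,
					   struct address_space *mapping,
					   pgoff_t idx, int insert)
	{
		int err = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);

		if (err)
			return err;
		if (!insert) {
			/* page never reaches the page cache: compensate ourselves */
			mem_cgroup_uncharge_cache_page(page);
			return 0;
		}
		err = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
		/* on failure the charge is assumed to be dropped for us */
		return err;
	}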
| @@ -1503,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
| 1503 | inode->i_uid = current->fsuid; | 1513 | inode->i_uid = current->fsuid; |
| 1504 | inode->i_gid = current->fsgid; | 1514 | inode->i_gid = current->fsgid; |
| 1505 | inode->i_blocks = 0; | 1515 | inode->i_blocks = 0; |
| 1506 | inode->i_mapping->a_ops = &shmem_aops; | ||
| 1507 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1516 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; |
| 1508 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1517 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
| 1509 | inode->i_generation = get_seconds(); | 1518 | inode->i_generation = get_seconds(); |
| @@ -1518,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev) | |||
| 1518 | init_special_inode(inode, mode, dev); | 1527 | init_special_inode(inode, mode, dev); |
| 1519 | break; | 1528 | break; |
| 1520 | case S_IFREG: | 1529 | case S_IFREG: |
| 1530 | inode->i_mapping->a_ops = &shmem_aops; | ||
| 1521 | inode->i_op = &shmem_inode_operations; | 1531 | inode->i_op = &shmem_inode_operations; |
| 1522 | inode->i_fop = &shmem_file_operations; | 1532 | inode->i_fop = &shmem_file_operations; |
| 1523 | mpol_shared_policy_init(&info->policy, | 1533 | mpol_shared_policy_init(&info->policy, |
| @@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ | |||
| 1690 | file_accessed(filp); | 1700 | file_accessed(filp); |
| 1691 | } | 1701 | } |
| 1692 | 1702 | ||
| 1693 | static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) | 1703 | static ssize_t shmem_file_aio_read(struct kiocb *iocb, |
| 1704 | const struct iovec *iov, unsigned long nr_segs, loff_t pos) | ||
| 1694 | { | 1705 | { |
| 1695 | read_descriptor_t desc; | 1706 | struct file *filp = iocb->ki_filp; |
| 1707 | ssize_t retval; | ||
| 1708 | unsigned long seg; | ||
| 1709 | size_t count; | ||
| 1710 | loff_t *ppos = &iocb->ki_pos; | ||
| 1696 | 1711 | ||
| 1697 | if ((ssize_t) count < 0) | 1712 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
| 1698 | return -EINVAL; | 1713 | if (retval) |
| 1699 | if (!access_ok(VERIFY_WRITE, buf, count)) | 1714 | return retval; |
| 1700 | return -EFAULT; | ||
| 1701 | if (!count) | ||
| 1702 | return 0; | ||
| 1703 | 1715 | ||
| 1704 | desc.written = 0; | 1716 | for (seg = 0; seg < nr_segs; seg++) { |
| 1705 | desc.count = count; | 1717 | read_descriptor_t desc; |
| 1706 | desc.arg.buf = buf; | ||
| 1707 | desc.error = 0; | ||
| 1708 | 1718 | ||
| 1709 | do_shmem_file_read(filp, ppos, &desc, file_read_actor); | 1719 | desc.written = 0; |
| 1710 | if (desc.written) | 1720 | desc.arg.buf = iov[seg].iov_base; |
| 1711 | return desc.written; | 1721 | desc.count = iov[seg].iov_len; |
| 1712 | return desc.error; | 1722 | if (desc.count == 0) |
| 1723 | continue; | ||
| 1724 | desc.error = 0; | ||
| 1725 | do_shmem_file_read(filp, ppos, &desc, file_read_actor); | ||
| 1726 | retval += desc.written; | ||
| 1727 | if (desc.error) { | ||
| 1728 | retval = retval ?: desc.error; | ||
| 1729 | break; | ||
| 1730 | } | ||
| 1731 | if (desc.count > 0) | ||
| 1732 | break; | ||
| 1733 | } | ||
| 1734 | return retval; | ||
| 1713 | } | 1735 | } |
| 1714 | 1736 | ||
| 1715 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) | 1737 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
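This aio_read conversion pairs with the file_operations change further down (.read becomes do_sync_read): the VFS bridges the synchronous read() path to ->aio_read() by packing the user buffer into a one-segment iovec. A simplified, hypothetical sketch of that bridge (the real helper is do_sync_read() in fs/read_write.c, which also handles retries):

	static ssize_t sync_read_via_aio(struct file *filp, char __user *buf,
					 size_t len, loff_t *ppos)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		struct kiocb kiocb;
		ssize_t ret;

		init_sync_kiocb(&kiocb, filp);
		kiocb.ki_pos = *ppos;
		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
		if (ret == -EIOCBQUEUED)
			ret = wait_on_sync_kiocb(&kiocb);
		*ppos = kiocb.ki_pos;
		return ret;
	}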
| @@ -1907,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
| 1907 | return error; | 1929 | return error; |
| 1908 | } | 1930 | } |
| 1909 | unlock_page(page); | 1931 | unlock_page(page); |
| 1932 | inode->i_mapping->a_ops = &shmem_aops; | ||
| 1910 | inode->i_op = &shmem_symlink_inode_operations; | 1933 | inode->i_op = &shmem_symlink_inode_operations; |
| 1911 | kaddr = kmap_atomic(page, KM_USER0); | 1934 | kaddr = kmap_atomic(page, KM_USER0); |
| 1912 | memcpy(kaddr, symname, len); | 1935 | memcpy(kaddr, symname, len); |
| @@ -2330,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode) | |||
| 2330 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2353 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
| 2331 | } | 2354 | } |
| 2332 | 2355 | ||
| 2333 | static void init_once(struct kmem_cache *cachep, void *foo) | 2356 | static void init_once(void *foo) |
| 2334 | { | 2357 | { |
| 2335 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2358 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
| 2336 | 2359 | ||
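The constructor prototype change is tree-wide: ->ctor() no longer receives the kmem_cache pointer. A minimal usage sketch under the new signature, with hypothetical names:

	struct foo {
		spinlock_t lock;
	};

	static void foo_ctor(void *obj)	/* was: (struct kmem_cache *, void *) */
	{
		struct foo *f = obj;

		spin_lock_init(&f->lock);
	}

	static struct kmem_cache *foo_cachep;

	static int __init foo_cache_init(void)
	{
		foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
					       0, SLAB_HWCACHE_ALIGN, foo_ctor);
		return foo_cachep ? 0 : -ENOMEM;
	}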
| @@ -2369,8 +2392,9 @@ static const struct file_operations shmem_file_operations = { | |||
| 2369 | .mmap = shmem_mmap, | 2392 | .mmap = shmem_mmap, |
| 2370 | #ifdef CONFIG_TMPFS | 2393 | #ifdef CONFIG_TMPFS |
| 2371 | .llseek = generic_file_llseek, | 2394 | .llseek = generic_file_llseek, |
| 2372 | .read = shmem_file_read, | 2395 | .read = do_sync_read, |
| 2373 | .write = do_sync_write, | 2396 | .write = do_sync_write, |
| 2397 | .aio_read = shmem_file_aio_read, | ||
| 2374 | .aio_write = generic_file_aio_write, | 2398 | .aio_write = generic_file_aio_write, |
| 2375 | .fsync = simple_sync_file, | 2399 | .fsync = simple_sync_file, |
| 2376 | .splice_read = generic_file_splice_read, | 2400 | .splice_read = generic_file_splice_read, |
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
| @@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask) | |||
| 191 | * shmem_permission - permission() inode operation | 191 | * shmem_permission - permission() inode operation |
| 192 | */ | 192 | */ |
| 193 | int | 193 | int |
| 194 | shmem_permission(struct inode *inode, int mask, struct nameidata *nd) | 194 | shmem_permission(struct inode *inode, int mask) |
| 195 | { | 195 | { |
| 196 | return generic_permission(inode, mask, shmem_check_acl); | 196 | return generic_permission(inode, mask, shmem_check_acl); |
| 197 | } | 197 | } |
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
| @@ -406,7 +406,7 @@ struct kmem_cache { | |||
| 406 | unsigned int dflags; /* dynamic flags */ | 406 | unsigned int dflags; /* dynamic flags */ |
| 407 | 407 | ||
| 408 | /* constructor func */ | 408 | /* constructor func */ |
| 409 | void (*ctor)(struct kmem_cache *, void *); | 409 | void (*ctor)(void *obj); |
| 410 | 410 | ||
| 411 | /* 5) cache creation/removal */ | 411 | /* 5) cache creation/removal */ |
| 412 | const char *name; | 412 | const char *name; |
| @@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
| 2137 | */ | 2137 | */ |
| 2138 | struct kmem_cache * | 2138 | struct kmem_cache * |
| 2139 | kmem_cache_create (const char *name, size_t size, size_t align, | 2139 | kmem_cache_create (const char *name, size_t size, size_t align, |
| 2140 | unsigned long flags, | 2140 | unsigned long flags, void (*ctor)(void *)) |
| 2141 | void (*ctor)(struct kmem_cache *, void *)) | ||
| 2142 | { | 2141 | { |
| 2143 | size_t left_over, slab_size, ralign; | 2142 | size_t left_over, slab_size, ralign; |
| 2144 | struct kmem_cache *cachep = NULL, *pc; | 2143 | struct kmem_cache *cachep = NULL, *pc; |
| @@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2653 | * They must also be threaded. | 2652 | * They must also be threaded. |
| 2654 | */ | 2653 | */ |
| 2655 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2654 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
| 2656 | cachep->ctor(cachep, objp + obj_offset(cachep)); | 2655 | cachep->ctor(objp + obj_offset(cachep)); |
| 2657 | 2656 | ||
| 2658 | if (cachep->flags & SLAB_RED_ZONE) { | 2657 | if (cachep->flags & SLAB_RED_ZONE) { |
| 2659 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2658 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
| @@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
| 2669 | cachep->buffer_size / PAGE_SIZE, 0); | 2668 | cachep->buffer_size / PAGE_SIZE, 0); |
| 2670 | #else | 2669 | #else |
| 2671 | if (cachep->ctor) | 2670 | if (cachep->ctor) |
| 2672 | cachep->ctor(cachep, objp); | 2671 | cachep->ctor(objp); |
| 2673 | #endif | 2672 | #endif |
| 2674 | slab_bufctl(slabp)[i] = i + 1; | 2673 | slab_bufctl(slabp)[i] = i + 1; |
| 2675 | } | 2674 | } |
| @@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
| 3093 | #endif | 3092 | #endif |
| 3094 | objp += obj_offset(cachep); | 3093 | objp += obj_offset(cachep); |
| 3095 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3094 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
| 3096 | cachep->ctor(cachep, objp); | 3095 | cachep->ctor(objp); |
| 3097 | #if ARCH_SLAB_MINALIGN | 3096 | #if ARCH_SLAB_MINALIGN |
| 3098 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3097 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
| 3099 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3098 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
| @@ -4473,4 +4472,3 @@ size_t ksize(const void *objp) | |||
| 4473 | 4472 | ||
| 4474 | return obj_size(virt_to_cache(objp)); | 4473 | return obj_size(virt_to_cache(objp)); |
| 4475 | } | 4474 | } |
| 4476 | EXPORT_SYMBOL(ksize); | ||
diff --git a/mm/slob.c b/mm/slob.c
--- a/mm/slob.c
+++ b/mm/slob.c
| @@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large); | |||
| 130 | */ | 130 | */ |
| 131 | static inline int slob_page(struct slob_page *sp) | 131 | static inline int slob_page(struct slob_page *sp) |
| 132 | { | 132 | { |
| 133 | return test_bit(PG_active, &sp->flags); | 133 | return PageSlobPage((struct page *)sp); |
| 134 | } | 134 | } |
| 135 | 135 | ||
| 136 | static inline void set_slob_page(struct slob_page *sp) | 136 | static inline void set_slob_page(struct slob_page *sp) |
| 137 | { | 137 | { |
| 138 | __set_bit(PG_active, &sp->flags); | 138 | __SetPageSlobPage((struct page *)sp); |
| 139 | } | 139 | } |
| 140 | 140 | ||
| 141 | static inline void clear_slob_page(struct slob_page *sp) | 141 | static inline void clear_slob_page(struct slob_page *sp) |
| 142 | { | 142 | { |
| 143 | __clear_bit(PG_active, &sp->flags); | 143 | __ClearPageSlobPage((struct page *)sp); |
| 144 | } | 144 | } |
| 145 | 145 | ||
| 146 | /* | 146 | /* |
| @@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp) | |||
| 148 | */ | 148 | */ |
| 149 | static inline int slob_page_free(struct slob_page *sp) | 149 | static inline int slob_page_free(struct slob_page *sp) |
| 150 | { | 150 | { |
| 151 | return test_bit(PG_private, &sp->flags); | 151 | return PageSlobFree((struct page *)sp); |
| 152 | } | 152 | } |
| 153 | 153 | ||
| 154 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) | 154 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) |
| 155 | { | 155 | { |
| 156 | list_add(&sp->list, list); | 156 | list_add(&sp->list, list); |
| 157 | __set_bit(PG_private, &sp->flags); | 157 | __SetPageSlobFree((struct page *)sp); |
| 158 | } | 158 | } |
| 159 | 159 | ||
| 160 | static inline void clear_slob_page_free(struct slob_page *sp) | 160 | static inline void clear_slob_page_free(struct slob_page *sp) |
| 161 | { | 161 | { |
| 162 | list_del(&sp->list); | 162 | list_del(&sp->list); |
| 163 | __clear_bit(PG_private, &sp->flags); | 163 | __ClearPageSlobFree((struct page *)sp); |
| 164 | } | 164 | } |
| 165 | 165 | ||
| 166 | #define SLOB_UNIT sizeof(slob_t) | 166 | #define SLOB_UNIT sizeof(slob_t) |
| @@ -519,18 +519,16 @@ size_t ksize(const void *block) | |||
| 519 | else | 519 | else |
| 520 | return sp->page.private; | 520 | return sp->page.private; |
| 521 | } | 521 | } |
| 522 | EXPORT_SYMBOL(ksize); | ||
| 523 | 522 | ||
| 524 | struct kmem_cache { | 523 | struct kmem_cache { |
| 525 | unsigned int size, align; | 524 | unsigned int size, align; |
| 526 | unsigned long flags; | 525 | unsigned long flags; |
| 527 | const char *name; | 526 | const char *name; |
| 528 | void (*ctor)(struct kmem_cache *, void *); | 527 | void (*ctor)(void *); |
| 529 | }; | 528 | }; |
| 530 | 529 | ||
| 531 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 530 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
| 532 | size_t align, unsigned long flags, | 531 | size_t align, unsigned long flags, void (*ctor)(void *)) |
| 533 | void (*ctor)(struct kmem_cache *, void *)) | ||
| 534 | { | 532 | { |
| 535 | struct kmem_cache *c; | 533 | struct kmem_cache *c; |
| 536 | 534 | ||
| @@ -575,7 +573,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
| 575 | b = slob_new_page(flags, get_order(c->size), node); | 573 | b = slob_new_page(flags, get_order(c->size), node); |
| 576 | 574 | ||
| 577 | if (c->ctor) | 575 | if (c->ctor) |
| 578 | c->ctor(c, b); | 576 | c->ctor(b); |
| 579 | 577 | ||
| 580 | return b; | 578 | return b; |
| 581 | } | 579 | } |
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
| @@ -102,44 +102,12 @@ | |||
| 102 | * the fast path and disables lockless freelists. | 102 | * the fast path and disables lockless freelists. |
| 103 | */ | 103 | */ |
| 104 | 104 | ||
| 105 | #define FROZEN (1 << PG_active) | ||
| 106 | |||
| 107 | #ifdef CONFIG_SLUB_DEBUG | 105 | #ifdef CONFIG_SLUB_DEBUG |
| 108 | #define SLABDEBUG (1 << PG_error) | 106 | #define SLABDEBUG 1 |
| 109 | #else | 107 | #else |
| 110 | #define SLABDEBUG 0 | 108 | #define SLABDEBUG 0 |
| 111 | #endif | 109 | #endif |
| 112 | 110 | ||
| 113 | static inline int SlabFrozen(struct page *page) | ||
| 114 | { | ||
| 115 | return page->flags & FROZEN; | ||
| 116 | } | ||
| 117 | |||
| 118 | static inline void SetSlabFrozen(struct page *page) | ||
| 119 | { | ||
| 120 | page->flags |= FROZEN; | ||
| 121 | } | ||
| 122 | |||
| 123 | static inline void ClearSlabFrozen(struct page *page) | ||
| 124 | { | ||
| 125 | page->flags &= ~FROZEN; | ||
| 126 | } | ||
| 127 | |||
| 128 | static inline int SlabDebug(struct page *page) | ||
| 129 | { | ||
| 130 | return page->flags & SLABDEBUG; | ||
| 131 | } | ||
| 132 | |||
| 133 | static inline void SetSlabDebug(struct page *page) | ||
| 134 | { | ||
| 135 | page->flags |= SLABDEBUG; | ||
| 136 | } | ||
| 137 | |||
| 138 | static inline void ClearSlabDebug(struct page *page) | ||
| 139 | { | ||
| 140 | page->flags &= ~SLABDEBUG; | ||
| 141 | } | ||
| 142 | |||
| 143 | /* | 111 | /* |
| 144 | * Issues still to be resolved: | 112 | * Issues still to be resolved: |
| 145 | * | 113 | * |
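Both the SLOB and SLUB hunks above replace open-coded bit operations on page->flags with named helpers. A hedged sketch of what those helpers are assumed to expand to (the real declarations come from the __PAGEFLAG() machinery in include/linux/page-flags.h, with PG_slub_frozen and PG_slub_debug assumed to alias the PG_active and PG_error bits the old masks used; the SLOB variants follow the same pattern):

	static inline int PageSlubFrozen(struct page *page)
	{
		return test_bit(PG_slub_frozen, &page->flags);
	}

	static inline void __SetPageSlubFrozen(struct page *page)
	{
		/* non-atomic: only safe while the slab is locked or not yet visible */
		__set_bit(PG_slub_frozen, &page->flags);
	}

	static inline void __ClearPageSlubFrozen(struct page *page)
	{
		__clear_bit(PG_slub_frozen, &page->flags);
	}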
| @@ -971,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, | |||
| 971 | } | 939 | } |
| 972 | 940 | ||
| 973 | /* Special debug activities for freeing objects */ | 941 | /* Special debug activities for freeing objects */ |
| 974 | if (!SlabFrozen(page) && !page->freelist) | 942 | if (!PageSlubFrozen(page) && !page->freelist) |
| 975 | remove_full(s, page); | 943 | remove_full(s, page); |
| 976 | if (s->flags & SLAB_STORE_USER) | 944 | if (s->flags & SLAB_STORE_USER) |
| 977 | set_track(s, object, TRACK_FREE, addr); | 945 | set_track(s, object, TRACK_FREE, addr); |
| @@ -1044,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug); | |||
| 1044 | 1012 | ||
| 1045 | static unsigned long kmem_cache_flags(unsigned long objsize, | 1013 | static unsigned long kmem_cache_flags(unsigned long objsize, |
| 1046 | unsigned long flags, const char *name, | 1014 | unsigned long flags, const char *name, |
| 1047 | void (*ctor)(struct kmem_cache *, void *)) | 1015 | void (*ctor)(void *)) |
| 1048 | { | 1016 | { |
| 1049 | /* | 1017 | /* |
| 1050 | * Enable debugging if selected on the kernel commandline. | 1018 | * Enable debugging if selected on the kernel commandline. |
| @@ -1072,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
| 1072 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1040 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} |
| 1073 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1041 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
| 1074 | unsigned long flags, const char *name, | 1042 | unsigned long flags, const char *name, |
| 1075 | void (*ctor)(struct kmem_cache *, void *)) | 1043 | void (*ctor)(void *)) |
| 1076 | { | 1044 | { |
| 1077 | return flags; | 1045 | return flags; |
| 1078 | } | 1046 | } |
| @@ -1135,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page, | |||
| 1135 | { | 1103 | { |
| 1136 | setup_object_debug(s, page, object); | 1104 | setup_object_debug(s, page, object); |
| 1137 | if (unlikely(s->ctor)) | 1105 | if (unlikely(s->ctor)) |
| 1138 | s->ctor(s, object); | 1106 | s->ctor(object); |
| 1139 | } | 1107 | } |
| 1140 | 1108 | ||
| 1141 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1109 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
| @@ -1157,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1157 | page->flags |= 1 << PG_slab; | 1125 | page->flags |= 1 << PG_slab; |
| 1158 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1126 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
| 1159 | SLAB_STORE_USER | SLAB_TRACE)) | 1127 | SLAB_STORE_USER | SLAB_TRACE)) |
| 1160 | SetSlabDebug(page); | 1128 | __SetPageSlubDebug(page); |
| 1161 | 1129 | ||
| 1162 | start = page_address(page); | 1130 | start = page_address(page); |
| 1163 | 1131 | ||
| @@ -1184,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
| 1184 | int order = compound_order(page); | 1152 | int order = compound_order(page); |
| 1185 | int pages = 1 << order; | 1153 | int pages = 1 << order; |
| 1186 | 1154 | ||
| 1187 | if (unlikely(SlabDebug(page))) { | 1155 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) { |
| 1188 | void *p; | 1156 | void *p; |
| 1189 | 1157 | ||
| 1190 | slab_pad_check(s, page); | 1158 | slab_pad_check(s, page); |
| 1191 | for_each_object(p, s, page_address(page), | 1159 | for_each_object(p, s, page_address(page), |
| 1192 | page->objects) | 1160 | page->objects) |
| 1193 | check_object(s, page, p, 0); | 1161 | check_object(s, page, p, 0); |
| 1194 | ClearSlabDebug(page); | 1162 | __ClearPageSlubDebug(page); |
| 1195 | } | 1163 | } |
| 1196 | 1164 | ||
| 1197 | mod_zone_page_state(page_zone(page), | 1165 | mod_zone_page_state(page_zone(page), |
| @@ -1288,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, | |||
| 1288 | if (slab_trylock(page)) { | 1256 | if (slab_trylock(page)) { |
| 1289 | list_del(&page->lru); | 1257 | list_del(&page->lru); |
| 1290 | n->nr_partial--; | 1258 | n->nr_partial--; |
| 1291 | SetSlabFrozen(page); | 1259 | __SetPageSlubFrozen(page); |
| 1292 | return 1; | 1260 | return 1; |
| 1293 | } | 1261 | } |
| 1294 | return 0; | 1262 | return 0; |
| @@ -1361,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
| 1361 | n = get_node(s, zone_to_nid(zone)); | 1329 | n = get_node(s, zone_to_nid(zone)); |
| 1362 | 1330 | ||
| 1363 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1331 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
| 1364 | n->nr_partial > MIN_PARTIAL) { | 1332 | n->nr_partial > n->min_partial) { |
| 1365 | page = get_partial_node(n); | 1333 | page = get_partial_node(n); |
| 1366 | if (page) | 1334 | if (page) |
| 1367 | return page; | 1335 | return page; |
| @@ -1398,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
| 1398 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1366 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
| 1399 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); | 1367 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); |
| 1400 | 1368 | ||
| 1401 | ClearSlabFrozen(page); | 1369 | __ClearPageSlubFrozen(page); |
| 1402 | if (page->inuse) { | 1370 | if (page->inuse) { |
| 1403 | 1371 | ||
| 1404 | if (page->freelist) { | 1372 | if (page->freelist) { |
| @@ -1406,13 +1374,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
| 1406 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | 1374 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); |
| 1407 | } else { | 1375 | } else { |
| 1408 | stat(c, DEACTIVATE_FULL); | 1376 | stat(c, DEACTIVATE_FULL); |
| 1409 | if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) | 1377 | if (SLABDEBUG && PageSlubDebug(page) && |
| 1378 | (s->flags & SLAB_STORE_USER)) | ||
| 1410 | add_full(n, page); | 1379 | add_full(n, page); |
| 1411 | } | 1380 | } |
| 1412 | slab_unlock(page); | 1381 | slab_unlock(page); |
| 1413 | } else { | 1382 | } else { |
| 1414 | stat(c, DEACTIVATE_EMPTY); | 1383 | stat(c, DEACTIVATE_EMPTY); |
| 1415 | if (n->nr_partial < MIN_PARTIAL) { | 1384 | if (n->nr_partial < n->min_partial) { |
| 1416 | /* | 1385 | /* |
| 1417 | * Adding an empty slab to the partial slabs in order | 1386 | * Adding an empty slab to the partial slabs in order |
| 1418 | * to avoid page allocator overhead. This slab needs | 1387 | * to avoid page allocator overhead. This slab needs |
| @@ -1551,7 +1520,7 @@ load_freelist: | |||
| 1551 | object = c->page->freelist; | 1520 | object = c->page->freelist; |
| 1552 | if (unlikely(!object)) | 1521 | if (unlikely(!object)) |
| 1553 | goto another_slab; | 1522 | goto another_slab; |
| 1554 | if (unlikely(SlabDebug(c->page))) | 1523 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) |
| 1555 | goto debug; | 1524 | goto debug; |
| 1556 | 1525 | ||
| 1557 | c->freelist = object[c->offset]; | 1526 | c->freelist = object[c->offset]; |
| @@ -1588,7 +1557,7 @@ new_slab: | |||
| 1588 | if (c->page) | 1557 | if (c->page) |
| 1589 | flush_slab(s, c); | 1558 | flush_slab(s, c); |
| 1590 | slab_lock(new); | 1559 | slab_lock(new); |
| 1591 | SetSlabFrozen(new); | 1560 | __SetPageSlubFrozen(new); |
| 1592 | c->page = new; | 1561 | c->page = new; |
| 1593 | goto load_freelist; | 1562 | goto load_freelist; |
| 1594 | } | 1563 | } |
| @@ -1674,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
| 1674 | stat(c, FREE_SLOWPATH); | 1643 | stat(c, FREE_SLOWPATH); |
| 1675 | slab_lock(page); | 1644 | slab_lock(page); |
| 1676 | 1645 | ||
| 1677 | if (unlikely(SlabDebug(page))) | 1646 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) |
| 1678 | goto debug; | 1647 | goto debug; |
| 1679 | 1648 | ||
| 1680 | checks_ok: | 1649 | checks_ok: |
| @@ -1682,7 +1651,7 @@ checks_ok: | |||
| 1682 | page->freelist = object; | 1651 | page->freelist = object; |
| 1683 | page->inuse--; | 1652 | page->inuse--; |
| 1684 | 1653 | ||
| 1685 | if (unlikely(SlabFrozen(page))) { | 1654 | if (unlikely(PageSlubFrozen(page))) { |
| 1686 | stat(c, FREE_FROZEN); | 1655 | stat(c, FREE_FROZEN); |
| 1687 | goto out_unlock; | 1656 | goto out_unlock; |
| 1688 | } | 1657 | } |
| @@ -1944,9 +1913,21 @@ static void init_kmem_cache_cpu(struct kmem_cache *s, | |||
| 1944 | #endif | 1913 | #endif |
| 1945 | } | 1914 | } |
| 1946 | 1915 | ||
| 1947 | static void init_kmem_cache_node(struct kmem_cache_node *n) | 1916 | static void |
| 1917 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | ||
| 1948 | { | 1918 | { |
| 1949 | n->nr_partial = 0; | 1919 | n->nr_partial = 0; |
| 1920 | |||
| 1921 | /* | ||
| 1922 | * The larger the object size is, the more pages we want on the partial | ||
| 1923 | * list to avoid pounding the page allocator excessively. | ||
| 1924 | */ | ||
| 1925 | n->min_partial = ilog2(s->size); | ||
| 1926 | if (n->min_partial < MIN_PARTIAL) | ||
| 1927 | n->min_partial = MIN_PARTIAL; | ||
| 1928 | else if (n->min_partial > MAX_PARTIAL) | ||
| 1929 | n->min_partial = MAX_PARTIAL; | ||
| 1930 | |||
| 1950 | spin_lock_init(&n->list_lock); | 1931 | spin_lock_init(&n->list_lock); |
| 1951 | INIT_LIST_HEAD(&n->partial); | 1932 | INIT_LIST_HEAD(&n->partial); |
| 1952 | #ifdef CONFIG_SLUB_DEBUG | 1933 | #ifdef CONFIG_SLUB_DEBUG |
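The new n->min_partial replaces the single global MIN_PARTIAL cutoff with a per-cache floor that grows with object size, so large-object caches keep more partial slabs around instead of going back to the page allocator. An equivalent standalone sketch, assuming MIN_PARTIAL and MAX_PARTIAL are 5 and 10:

	static unsigned long slub_min_partial(unsigned long size)
	{
		unsigned long n = ilog2(size);	/* 16 -> 4, 64 -> 6, 4096 -> 12 */

		if (n < MIN_PARTIAL)
			n = MIN_PARTIAL;	/* 16-byte objects clamp up to 5 */
		else if (n > MAX_PARTIAL)
			n = MAX_PARTIAL;	/* 4 KiB objects clamp down to 10 */
		return n;
	}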
| @@ -2118,7 +2099,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, | |||
| 2118 | init_object(kmalloc_caches, n, 1); | 2099 | init_object(kmalloc_caches, n, 1); |
| 2119 | init_tracking(kmalloc_caches, n); | 2100 | init_tracking(kmalloc_caches, n); |
| 2120 | #endif | 2101 | #endif |
| 2121 | init_kmem_cache_node(n); | 2102 | init_kmem_cache_node(n, kmalloc_caches); |
| 2122 | inc_slabs_node(kmalloc_caches, node, page->objects); | 2103 | inc_slabs_node(kmalloc_caches, node, page->objects); |
| 2123 | 2104 | ||
| 2124 | /* | 2105 | /* |
| @@ -2175,7 +2156,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
| 2175 | 2156 | ||
| 2176 | } | 2157 | } |
| 2177 | s->node[node] = n; | 2158 | s->node[node] = n; |
| 2178 | init_kmem_cache_node(n); | 2159 | init_kmem_cache_node(n, s); |
| 2179 | } | 2160 | } |
| 2180 | return 1; | 2161 | return 1; |
| 2181 | } | 2162 | } |
| @@ -2186,7 +2167,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
| 2186 | 2167 | ||
| 2187 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | 2168 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) |
| 2188 | { | 2169 | { |
| 2189 | init_kmem_cache_node(&s->local_node); | 2170 | init_kmem_cache_node(&s->local_node, s); |
| 2190 | return 1; | 2171 | return 1; |
| 2191 | } | 2172 | } |
| 2192 | #endif | 2173 | #endif |
| @@ -2317,7 +2298,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
| 2317 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 2298 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
| 2318 | const char *name, size_t size, | 2299 | const char *name, size_t size, |
| 2319 | size_t align, unsigned long flags, | 2300 | size_t align, unsigned long flags, |
| 2320 | void (*ctor)(struct kmem_cache *, void *)) | 2301 | void (*ctor)(void *)) |
| 2321 | { | 2302 | { |
| 2322 | memset(s, 0, kmem_size); | 2303 | memset(s, 0, kmem_size); |
| 2323 | s->name = name; | 2304 | s->name = name; |
| @@ -2746,7 +2727,6 @@ size_t ksize(const void *object) | |||
| 2746 | */ | 2727 | */ |
| 2747 | return s->size; | 2728 | return s->size; |
| 2748 | } | 2729 | } |
| 2749 | EXPORT_SYMBOL(ksize); | ||
| 2750 | 2730 | ||
| 2751 | void kfree(const void *x) | 2731 | void kfree(const void *x) |
| 2752 | { | 2732 | { |
| @@ -2921,7 +2901,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
| 2921 | ret = -ENOMEM; | 2901 | ret = -ENOMEM; |
| 2922 | goto out; | 2902 | goto out; |
| 2923 | } | 2903 | } |
| 2924 | init_kmem_cache_node(n); | 2904 | init_kmem_cache_node(n, s); |
| 2925 | s->node[nid] = n; | 2905 | s->node[nid] = n; |
| 2926 | } | 2906 | } |
| 2927 | out: | 2907 | out: |
| @@ -3073,7 +3053,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
| 3073 | 3053 | ||
| 3074 | static struct kmem_cache *find_mergeable(size_t size, | 3054 | static struct kmem_cache *find_mergeable(size_t size, |
| 3075 | size_t align, unsigned long flags, const char *name, | 3055 | size_t align, unsigned long flags, const char *name, |
| 3076 | void (*ctor)(struct kmem_cache *, void *)) | 3056 | void (*ctor)(void *)) |
| 3077 | { | 3057 | { |
| 3078 | struct kmem_cache *s; | 3058 | struct kmem_cache *s; |
| 3079 | 3059 | ||
| @@ -3113,8 +3093,7 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
| 3113 | } | 3093 | } |
| 3114 | 3094 | ||
| 3115 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 3095 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
| 3116 | size_t align, unsigned long flags, | 3096 | size_t align, unsigned long flags, void (*ctor)(void *)) |
| 3117 | void (*ctor)(struct kmem_cache *, void *)) | ||
| 3118 | { | 3097 | { |
| 3119 | struct kmem_cache *s; | 3098 | struct kmem_cache *s; |
| 3120 | 3099 | ||
| @@ -3317,12 +3296,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, | |||
| 3317 | s->name, page); | 3296 | s->name, page); |
| 3318 | 3297 | ||
| 3319 | if (s->flags & DEBUG_DEFAULT_FLAGS) { | 3298 | if (s->flags & DEBUG_DEFAULT_FLAGS) { |
| 3320 | if (!SlabDebug(page)) | 3299 | if (!PageSlubDebug(page)) |
| 3321 | printk(KERN_ERR "SLUB %s: SlabDebug not set " | 3300 | printk(KERN_ERR "SLUB %s: SlubDebug not set " |
| 3322 | "on slab 0x%p\n", s->name, page); | 3301 | "on slab 0x%p\n", s->name, page); |
| 3323 | } else { | 3302 | } else { |
| 3324 | if (SlabDebug(page)) | 3303 | if (PageSlubDebug(page)) |
| 3325 | printk(KERN_ERR "SLUB %s: SlabDebug set on " | 3304 | printk(KERN_ERR "SLUB %s: SlubDebug set on " |
| 3326 | "slab 0x%p\n", s->name, page); | 3305 | "slab 0x%p\n", s->name, page); |
| 3327 | } | 3306 | } |
| 3328 | } | 3307 | } |
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
| @@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section) | |||
| 147 | return (section->section_mem_map >> SECTION_NID_SHIFT); | 147 | return (section->section_mem_map >> SECTION_NID_SHIFT); |
| 148 | } | 148 | } |
| 149 | 149 | ||
| 150 | /* Record a memory area against a node. */ | 150 | /* Validate the physical addressing limitations of the model */ |
| 151 | void __init memory_present(int nid, unsigned long start, unsigned long end) | 151 | void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, |
| 152 | unsigned long *end_pfn) | ||
| 152 | { | 153 | { |
| 153 | unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); | 154 | unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); |
| 154 | unsigned long pfn; | ||
| 155 | 155 | ||
| 156 | /* | 156 | /* |
| 157 | * Sanity checks - do not allow an architecture to pass | 157 | * Sanity checks - do not allow an architecture to pass |
| 158 | * in larger pfns than the maximum scope of sparsemem: | 158 | * in larger pfns than the maximum scope of sparsemem: |
| 159 | */ | 159 | */ |
| 160 | if (start >= max_arch_pfn) | 160 | if (*start_pfn > max_sparsemem_pfn) { |
| 161 | return; | 161 | mminit_dprintk(MMINIT_WARNING, "pfnvalidation", |
| 162 | if (end >= max_arch_pfn) | 162 | "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n", |
| 163 | end = max_arch_pfn; | 163 | *start_pfn, *end_pfn, max_sparsemem_pfn); |
| 164 | WARN_ON_ONCE(1); | ||
| 165 | *start_pfn = max_sparsemem_pfn; | ||
| 166 | *end_pfn = max_sparsemem_pfn; | ||
| 167 | } | ||
| 168 | |||
| 169 | if (*end_pfn > max_sparsemem_pfn) { | ||
| 170 | mminit_dprintk(MMINIT_WARNING, "pfnvalidation", | ||
| 171 | "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", | ||
| 172 | *start_pfn, *end_pfn, max_sparsemem_pfn); | ||
| 173 | WARN_ON_ONCE(1); | ||
| 174 | *end_pfn = max_sparsemem_pfn; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | |||
| 178 | /* Record a memory area against a node. */ | ||
| 179 | void __init memory_present(int nid, unsigned long start, unsigned long end) | ||
| 180 | { | ||
| 181 | unsigned long pfn; | ||
| 164 | 182 | ||
| 165 | start &= PAGE_SECTION_MASK; | 183 | start &= PAGE_SECTION_MASK; |
| 184 | mminit_validate_memmodel_limits(&start, &end); | ||
| 166 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { | 185 | for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { |
| 167 | unsigned long section = pfn_to_section_nr(pfn); | 186 | unsigned long section = pfn_to_section_nr(pfn); |
| 168 | struct mem_section *ms; | 187 | struct mem_section *ms; |
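The clamping above is easiest to see with concrete numbers. A hypothetical usage sketch, assuming MAX_PHYSMEM_BITS=36 and PAGE_SHIFT=12 so that max_sparsemem_pfn is 1UL << 24 (0x1000000):

	static void __init example_validate(void)
	{
		unsigned long start_pfn = 0x1800000;	/* beyond the model limit */
		unsigned long end_pfn   = 0x2000000;

		mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
		/* both pfns are now clamped to 0x1000000, and an
		 * mminit "pfnvalidation" warning has been logged */
	}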
| @@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
| 187 | unsigned long pfn; | 206 | unsigned long pfn; |
| 188 | unsigned long nr_pages = 0; | 207 | unsigned long nr_pages = 0; |
| 189 | 208 | ||
| 209 | mminit_validate_memmodel_limits(&start_pfn, &end_pfn); | ||
| 190 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { | 210 | for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { |
| 191 | if (nid != early_pfn_to_nid(pfn)) | 211 | if (nid != early_pfn_to_nid(pfn)) |
| 192 | continue; | 212 | continue; |
| @@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
| 248 | } | 268 | } |
| 249 | #endif /* CONFIG_MEMORY_HOTPLUG */ | 269 | #endif /* CONFIG_MEMORY_HOTPLUG */ |
| 250 | 270 | ||
| 271 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 272 | static unsigned long * __init | ||
| 273 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | ||
| 274 | { | ||
| 275 | unsigned long section_nr; | ||
| 276 | |||
| 277 | /* | ||
| 278 | * A page may contain usemaps for other sections preventing the | ||
| 279 | * page being freed and making a section unremovable while | ||
| 280 | * other sections referencing the usemap remain active. Similarly, | ||
| 281 | * a pgdat can prevent a section being removed. If section A | ||
| 282 | * contains a pgdat and section B contains the usemap, both | ||
| 283 | * sections become inter-dependent. This allocates usemaps | ||
| 284 | * from the same section as the pgdat where possible to avoid | ||
| 285 | * this problem. | ||
| 286 | */ | ||
| 287 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | ||
| 288 | return alloc_bootmem_section(usemap_size(), section_nr); | ||
| 289 | } | ||
| 290 | |||
| 291 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | ||
| 292 | { | ||
| 293 | unsigned long usemap_snr, pgdat_snr; | ||
| 294 | static unsigned long old_usemap_snr = NR_MEM_SECTIONS; | ||
| 295 | static unsigned long old_pgdat_snr = NR_MEM_SECTIONS; | ||
| 296 | struct pglist_data *pgdat = NODE_DATA(nid); | ||
| 297 | int usemap_nid; | ||
| 298 | |||
| 299 | usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); | ||
| 300 | pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | ||
| 301 | if (usemap_snr == pgdat_snr) | ||
| 302 | return; | ||
| 303 | |||
| 304 | if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr) | ||
| 305 | /* skip redundant message */ | ||
| 306 | return; | ||
| 307 | |||
| 308 | old_usemap_snr = usemap_snr; | ||
| 309 | old_pgdat_snr = pgdat_snr; | ||
| 310 | |||
| 311 | usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr)); | ||
| 312 | if (usemap_nid != nid) { | ||
| 313 | printk(KERN_INFO | ||
| 314 | "node %d must be removed before remove section %ld\n", | ||
| 315 | nid, usemap_snr); | ||
| 316 | return; | ||
| 317 | } | ||
| 318 | /* | ||
| 319 | * There is a circular dependency. | ||
| 320 | * Some platforms allow an un-removable section because they will just | ||
| 321 | * gather other removable sections for dynamic partitioning. | ||
| 322 | * Just report the un-removable section's number here. | ||
| 323 | */ | ||
| 324 | printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr, | ||
| 325 | pgdat_snr, nid); | ||
| 326 | printk(KERN_CONT | ||
| 327 | " have a circular dependency on usemap and pgdat allocations\n"); | ||
| 328 | } | ||
| 329 | #else | ||
| 330 | static unsigned long * __init | ||
| 331 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | ||
| 332 | { | ||
| 333 | return NULL; | ||
| 334 | } | ||
| 335 | |||
| 336 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | ||
| 337 | { | ||
| 338 | } | ||
| 339 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
| 340 | |||
| 251 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 341 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) |
| 252 | { | 342 | { |
| 253 | unsigned long *usemap; | 343 | unsigned long *usemap; |
| 254 | struct mem_section *ms = __nr_to_section(pnum); | 344 | struct mem_section *ms = __nr_to_section(pnum); |
| 255 | int nid = sparse_early_nid(ms); | 345 | int nid = sparse_early_nid(ms); |
| 256 | 346 | ||
| 257 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 347 | usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); |
| 258 | if (usemap) | 348 | if (usemap) |
| 259 | return usemap; | 349 | return usemap; |
| 260 | 350 | ||
| 351 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | ||
| 352 | if (usemap) { | ||
| 353 | check_usemap_section_nr(nid, usemap); | ||
| 354 | return usemap; | ||
| 355 | } | ||
| 356 | |||
| 261 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | 357 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ |
| 262 | nid = 0; | 358 | nid = 0; |
| 263 | 359 | ||
| @@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
| 280 | } | 376 | } |
| 281 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 377 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
| 282 | 378 | ||
| 283 | struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 379 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
| 284 | { | 380 | { |
| 285 | struct page *map; | 381 | struct page *map; |
| 286 | struct mem_section *ms = __nr_to_section(pnum); | 382 | struct mem_section *ms = __nr_to_section(pnum); |
diff --git a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c
+++ b/mm/swap.c
| @@ -34,9 +34,9 @@ | |||
| 34 | /* How many pages do we try to swap or page in/out together? */ | 34 | /* How many pages do we try to swap or page in/out together? */ |
| 35 | int page_cluster; | 35 | int page_cluster; |
| 36 | 36 | ||
| 37 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | 37 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs); |
| 38 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | 38 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs); |
| 39 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; | 39 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
| 40 | 40 | ||
| 41 | /* | 41 | /* |
| 42 | * This path almost never happens for VM activity - pages are normally | 42 | * This path almost never happens for VM activity - pages are normally |
| @@ -278,9 +278,10 @@ int lru_add_drain_all(void) | |||
| 278 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it | 278 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it |
| 279 | * for the remainder of the operation. | 279 | * for the remainder of the operation. |
| 280 | * | 280 | * |
| 281 | * The locking in this function is against shrink_cache(): we recheck the | 281 | * The locking in this function is against shrink_inactive_list(): we recheck |
| 282 | * page count inside the lock to see whether shrink_cache grabbed the page | 282 | * the page count inside the lock to see whether shrink_inactive_list() |
| 283 | * via the LRU. If it did, give up: shrink_cache will free it. | 283 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() |
| 284 | * will free it. | ||
| 284 | */ | 285 | */ |
| 285 | void release_pages(struct page **pages, int nr, int cold) | 286 | void release_pages(struct page **pages, int nr, int cold) |
| 286 | { | 287 | { |
| @@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec) | |||
| 443 | for (i = 0; i < pagevec_count(pvec); i++) { | 444 | for (i = 0; i < pagevec_count(pvec); i++) { |
| 444 | struct page *page = pvec->pages[i]; | 445 | struct page *page = pvec->pages[i]; |
| 445 | 446 | ||
| 446 | if (PagePrivate(page) && !TestSetPageLocked(page)) { | 447 | if (PagePrivate(page) && trylock_page(page)) { |
| 447 | if (PagePrivate(page)) | 448 | if (PagePrivate(page)) |
| 448 | try_to_release_page(page, 0); | 449 | try_to_release_page(page, 0); |
| 449 | unlock_page(page); | 450 | unlock_page(page); |
| @@ -493,7 +494,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); | |||
| 493 | */ | 494 | */ |
| 494 | #define ACCT_THRESHOLD max(16, NR_CPUS * 2) | 495 | #define ACCT_THRESHOLD max(16, NR_CPUS * 2) |
| 495 | 496 | ||
| 496 | static DEFINE_PER_CPU(long, committed_space) = 0; | 497 | static DEFINE_PER_CPU(long, committed_space); |
| 497 | 498 | ||
| 498 | void vm_acct_memory(long pages) | 499 | void vm_acct_memory(long pages) |
| 499 | { | 500 | { |
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..167cf2dc8a03 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
| @@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = { | |||
| 39 | 39 | ||
| 40 | struct address_space swapper_space = { | 40 | struct address_space swapper_space = { |
| 41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), | 41 | .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), |
| 42 | .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), | 42 | .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock), |
| 43 | .a_ops = &swap_aops, | 43 | .a_ops = &swap_aops, |
| 44 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), | 44 | .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), |
| 45 | .backing_dev_info = &swap_backing_dev_info, | 45 | .backing_dev_info = &swap_backing_dev_info, |
| @@ -56,7 +56,8 @@ static struct { | |||
| 56 | 56 | ||
| 57 | void show_swap_cache_info(void) | 57 | void show_swap_cache_info(void) |
| 58 | { | 58 | { |
| 59 | printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", | 59 | printk("%lu pages in swap cache\n", total_swapcache_pages); |
| 60 | printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", | ||
| 60 | swap_cache_info.add_total, swap_cache_info.del_total, | 61 | swap_cache_info.add_total, swap_cache_info.del_total, |
| 61 | swap_cache_info.find_success, swap_cache_info.find_total); | 62 | swap_cache_info.find_success, swap_cache_info.find_total); |
| 62 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); | 63 | printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); |
| @@ -64,7 +65,7 @@ void show_swap_cache_info(void) | |||
| 64 | } | 65 | } |
| 65 | 66 | ||
| 66 | /* | 67 | /* |
| 67 | * add_to_swap_cache resembles add_to_page_cache on swapper_space, | 68 | * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, |
| 68 | * but sets SwapCache flag and private instead of mapping and index. | 69 | * but sets SwapCache flag and private instead of mapping and index. |
| 69 | */ | 70 | */ |
| 70 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | 71 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) |
| @@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
| 76 | BUG_ON(PagePrivate(page)); | 77 | BUG_ON(PagePrivate(page)); |
| 77 | error = radix_tree_preload(gfp_mask); | 78 | error = radix_tree_preload(gfp_mask); |
| 78 | if (!error) { | 79 | if (!error) { |
| 79 | write_lock_irq(&swapper_space.tree_lock); | 80 | page_cache_get(page); |
| 81 | SetPageSwapCache(page); | ||
| 82 | set_page_private(page, entry.val); | ||
| 83 | |||
| 84 | spin_lock_irq(&swapper_space.tree_lock); | ||
| 80 | error = radix_tree_insert(&swapper_space.page_tree, | 85 | error = radix_tree_insert(&swapper_space.page_tree, |
| 81 | entry.val, page); | 86 | entry.val, page); |
| 82 | if (!error) { | 87 | if (likely(!error)) { |
| 83 | page_cache_get(page); | ||
| 84 | SetPageSwapCache(page); | ||
| 85 | set_page_private(page, entry.val); | ||
| 86 | total_swapcache_pages++; | 88 | total_swapcache_pages++; |
| 87 | __inc_zone_page_state(page, NR_FILE_PAGES); | 89 | __inc_zone_page_state(page, NR_FILE_PAGES); |
| 88 | INC_CACHE_INFO(add_total); | 90 | INC_CACHE_INFO(add_total); |
| 89 | } | 91 | } |
| 90 | write_unlock_irq(&swapper_space.tree_lock); | 92 | spin_unlock_irq(&swapper_space.tree_lock); |
| 91 | radix_tree_preload_end(); | 93 | radix_tree_preload_end(); |
| 94 | |||
| 95 | if (unlikely(error)) { | ||
| 96 | set_page_private(page, 0UL); | ||
| 97 | ClearPageSwapCache(page); | ||
| 98 | page_cache_release(page); | ||
| 99 | } | ||
| 92 | } | 100 | } |
| 93 | return error; | 101 | return error; |
| 94 | } | 102 | } |
| @@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page) | |||
| 175 | 183 | ||
| 176 | entry.val = page_private(page); | 184 | entry.val = page_private(page); |
| 177 | 185 | ||
| 178 | write_lock_irq(&swapper_space.tree_lock); | 186 | spin_lock_irq(&swapper_space.tree_lock); |
| 179 | __delete_from_swap_cache(page); | 187 | __delete_from_swap_cache(page); |
| 180 | write_unlock_irq(&swapper_space.tree_lock); | 188 | spin_unlock_irq(&swapper_space.tree_lock); |
| 181 | 189 | ||
| 182 | swap_free(entry); | 190 | swap_free(entry); |
| 183 | page_cache_release(page); | 191 | page_cache_release(page); |
| @@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page) | |||
| 193 | */ | 201 | */ |
| 194 | static inline void free_swap_cache(struct page *page) | 202 | static inline void free_swap_cache(struct page *page) |
| 195 | { | 203 | { |
| 196 | if (PageSwapCache(page) && !TestSetPageLocked(page)) { | 204 | if (PageSwapCache(page) && trylock_page(page)) { |
| 197 | remove_exclusive_swap_page(page); | 205 | remove_exclusive_swap_page(page); |
| 198 | unlock_page(page); | 206 | unlock_page(page); |
| 199 | } | 207 | } |
| @@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 294 | * re-using the just freed swap entry for an existing page. | 302 | * re-using the just freed swap entry for an existing page. |
| 295 | * May fail (-ENOMEM) if radix-tree node allocation failed. | 303 | * May fail (-ENOMEM) if radix-tree node allocation failed. |
| 296 | */ | 304 | */ |
| 297 | SetPageLocked(new_page); | 305 | set_page_locked(new_page); |
| 298 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); | 306 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); |
| 299 | if (!err) { | 307 | if (likely(!err)) { |
| 300 | /* | 308 | /* |
| 301 | * Initiate read into locked page and return. | 309 | * Initiate read into locked page and return. |
| 302 | */ | 310 | */ |
| @@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 304 | swap_readpage(NULL, new_page); | 312 | swap_readpage(NULL, new_page); |
| 305 | return new_page; | 313 | return new_page; |
| 306 | } | 314 | } |
| 307 | ClearPageLocked(new_page); | 315 | clear_page_locked(new_page); |
| 308 | swap_free(entry); | 316 | swap_free(entry); |
| 309 | } while (err != -ENOMEM); | 317 | } while (err != -ENOMEM); |
| 310 | 318 | ||
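For a page that is not yet visible to anyone else (the freshly allocated new_page above), the generic SetPageLocked()/ClearPageLocked() macros are replaced by dedicated helpers, keeping "lock without contention" spelled differently from unlock_page(). A hedged sketch of what these wrappers are assumed to do, i.e. plain operations on the PG_locked bit with no waiter wake-up:

	static inline void set_page_locked(struct page *page)
	{
		set_bit(PG_locked, &page->flags);
	}

	static inline void clear_page_locked(struct page *page)
	{
		/* unlike unlock_page(), no waiters can exist here, so no wake-up */
		clear_bit(PG_locked, &page->flags);
	}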
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..1e330f2998fa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
| @@ -33,17 +33,18 @@ | |||
| 33 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
| 34 | #include <linux/swapops.h> | 34 | #include <linux/swapops.h> |
| 35 | 35 | ||
| 36 | DEFINE_SPINLOCK(swap_lock); | 36 | static DEFINE_SPINLOCK(swap_lock); |
| 37 | unsigned int nr_swapfiles; | 37 | static unsigned int nr_swapfiles; |
| 38 | long total_swap_pages; | 38 | long total_swap_pages; |
| 39 | static int swap_overflow; | 39 | static int swap_overflow; |
| 40 | static int least_priority; | ||
| 40 | 41 | ||
| 41 | static const char Bad_file[] = "Bad swap file entry "; | 42 | static const char Bad_file[] = "Bad swap file entry "; |
| 42 | static const char Unused_file[] = "Unused swap file entry "; | 43 | static const char Unused_file[] = "Unused swap file entry "; |
| 43 | static const char Bad_offset[] = "Bad swap offset entry "; | 44 | static const char Bad_offset[] = "Bad swap offset entry "; |
| 44 | static const char Unused_offset[] = "Unused swap offset entry "; | 45 | static const char Unused_offset[] = "Unused swap offset entry "; |
| 45 | 46 | ||
| 46 | struct swap_list_t swap_list = {-1, -1}; | 47 | static struct swap_list_t swap_list = {-1, -1}; |
| 47 | 48 | ||
| 48 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 49 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; |
| 49 | 50 | ||
| @@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page) | |||
| 368 | retval = 0; | 369 | retval = 0; |
| 369 | if (p->swap_map[swp_offset(entry)] == 1) { | 370 | if (p->swap_map[swp_offset(entry)] == 1) { |
| 370 | /* Recheck the page count with the swapcache lock held.. */ | 371 | /* Recheck the page count with the swapcache lock held.. */ |
| 371 | write_lock_irq(&swapper_space.tree_lock); | 372 | spin_lock_irq(&swapper_space.tree_lock); |
| 372 | if ((page_count(page) == 2) && !PageWriteback(page)) { | 373 | if ((page_count(page) == 2) && !PageWriteback(page)) { |
| 373 | __delete_from_swap_cache(page); | 374 | __delete_from_swap_cache(page); |
| 374 | SetPageDirty(page); | 375 | SetPageDirty(page); |
| 375 | retval = 1; | 376 | retval = 1; |
| 376 | } | 377 | } |
| 377 | write_unlock_irq(&swapper_space.tree_lock); | 378 | spin_unlock_irq(&swapper_space.tree_lock); |
| 378 | } | 379 | } |
| 379 | spin_unlock(&swap_lock); | 380 | spin_unlock(&swap_lock); |
| 380 | 381 | ||
| @@ -402,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry) | |||
| 402 | if (p) { | 403 | if (p) { |
| 403 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 404 | if (swap_entry_free(p, swp_offset(entry)) == 1) { |
| 404 | page = find_get_page(&swapper_space, entry.val); | 405 | page = find_get_page(&swapper_space, entry.val); |
| 405 | if (page && unlikely(TestSetPageLocked(page))) { | 406 | if (page && unlikely(!trylock_page(page))) { |
| 406 | page_cache_release(page); | 407 | page_cache_release(page); |
| 407 | page = NULL; | 408 | page = NULL; |
| 408 | } | 409 | } |
| @@ -655,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm, | |||
| 655 | 656 | ||
| 656 | if (!down_read_trylock(&mm->mmap_sem)) { | 657 | if (!down_read_trylock(&mm->mmap_sem)) { |
| 657 | /* | 658 | /* |
| 658 | * Activate page so shrink_cache is unlikely to unmap its | 659 | * Activate page so shrink_inactive_list is unlikely to unmap |
| 659 | * ptes while lock is dropped, so swapoff can make progress. | 660 | * its ptes while lock is dropped, so swapoff can make progress. |
| 660 | */ | 661 | */ |
| 661 | activate_page(page); | 662 | activate_page(page); |
| 662 | unlock_page(page); | 663 | unlock_page(page); |
| @@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
| 1260 | /* just pick something that's safe... */ | 1261 | /* just pick something that's safe... */ |
| 1261 | swap_list.next = swap_list.head; | 1262 | swap_list.next = swap_list.head; |
| 1262 | } | 1263 | } |
| 1264 | if (p->prio < 0) { | ||
| 1265 | for (i = p->next; i >= 0; i = swap_info[i].next) | ||
| 1266 | swap_info[i].prio = p->prio--; | ||
| 1267 | least_priority++; | ||
| 1268 | } | ||
| 1263 | nr_swap_pages -= p->pages; | 1269 | nr_swap_pages -= p->pages; |
| 1264 | total_swap_pages -= p->pages; | 1270 | total_swap_pages -= p->pages; |
| 1265 | p->flags &= ~SWP_WRITEOK; | 1271 | p->flags &= ~SWP_WRITEOK; |
| @@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile) | |||
| 1272 | if (err) { | 1278 | if (err) { |
| 1273 | /* re-insert swap space back into swap_list */ | 1279 | /* re-insert swap space back into swap_list */ |
| 1274 | spin_lock(&swap_lock); | 1280 | spin_lock(&swap_lock); |
| 1275 | for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) | 1281 | if (p->prio < 0) |
| 1282 | p->prio = --least_priority; | ||
| 1283 | prev = -1; | ||
| 1284 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | ||
| 1276 | if (p->prio >= swap_info[i].prio) | 1285 | if (p->prio >= swap_info[i].prio) |
| 1277 | break; | 1286 | break; |
| 1287 | prev = i; | ||
| 1288 | } | ||
| 1278 | p->next = i; | 1289 | p->next = i; |
| 1279 | if (prev < 0) | 1290 | if (prev < 0) |
| 1280 | swap_list.head = swap_list.next = p - swap_info; | 1291 | swap_list.head = swap_list.next = p - swap_info; |
| @@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1447 | unsigned int type; | 1458 | unsigned int type; |
| 1448 | int i, prev; | 1459 | int i, prev; |
| 1449 | int error; | 1460 | int error; |
| 1450 | static int least_priority; | ||
| 1451 | union swap_header *swap_header = NULL; | 1461 | union swap_header *swap_header = NULL; |
| 1452 | int swap_header_version; | 1462 | int swap_header_version; |
| 1453 | unsigned int nr_good_pages = 0; | 1463 | unsigned int nr_good_pages = 0; |
| @@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1455 | sector_t span; | 1465 | sector_t span; |
| 1456 | unsigned long maxpages = 1; | 1466 | unsigned long maxpages = 1; |
| 1457 | int swapfilesize; | 1467 | int swapfilesize; |
| 1458 | unsigned short *swap_map; | 1468 | unsigned short *swap_map = NULL; |
| 1459 | struct page *page = NULL; | 1469 | struct page *page = NULL; |
| 1460 | struct inode *inode = NULL; | 1470 | struct inode *inode = NULL; |
| 1461 | int did_down = 0; | 1471 | int did_down = 0; |
| @@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1474 | } | 1484 | } |
| 1475 | if (type >= nr_swapfiles) | 1485 | if (type >= nr_swapfiles) |
| 1476 | nr_swapfiles = type+1; | 1486 | nr_swapfiles = type+1; |
| 1487 | memset(p, 0, sizeof(*p)); | ||
| 1477 | INIT_LIST_HEAD(&p->extent_list); | 1488 | INIT_LIST_HEAD(&p->extent_list); |
| 1478 | p->flags = SWP_USED; | 1489 | p->flags = SWP_USED; |
| 1479 | p->swap_file = NULL; | ||
| 1480 | p->old_block_size = 0; | ||
| 1481 | p->swap_map = NULL; | ||
| 1482 | p->lowest_bit = 0; | ||
| 1483 | p->highest_bit = 0; | ||
| 1484 | p->cluster_nr = 0; | ||
| 1485 | p->inuse_pages = 0; | ||
| 1486 | p->next = -1; | 1490 | p->next = -1; |
| 1487 | if (swap_flags & SWAP_FLAG_PREFER) { | ||
| 1488 | p->prio = | ||
| 1489 | (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; | ||
| 1490 | } else { | ||
| 1491 | p->prio = --least_priority; | ||
| 1492 | } | ||
| 1493 | spin_unlock(&swap_lock); | 1491 | spin_unlock(&swap_lock); |
| 1494 | name = getname(specialfile); | 1492 | name = getname(specialfile); |
| 1495 | error = PTR_ERR(name); | 1493 | error = PTR_ERR(name); |
| @@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1632 | goto bad_swap; | 1630 | goto bad_swap; |
| 1633 | 1631 | ||
| 1634 | /* OK, set up the swap map and apply the bad block list */ | 1632 | /* OK, set up the swap map and apply the bad block list */ |
| 1635 | if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { | 1633 | swap_map = vmalloc(maxpages * sizeof(short)); |
| 1634 | if (!swap_map) { | ||
| 1636 | error = -ENOMEM; | 1635 | error = -ENOMEM; |
| 1637 | goto bad_swap; | 1636 | goto bad_swap; |
| 1638 | } | 1637 | } |
| 1639 | 1638 | ||
| 1640 | error = 0; | 1639 | error = 0; |
| 1641 | memset(p->swap_map, 0, maxpages * sizeof(short)); | 1640 | memset(swap_map, 0, maxpages * sizeof(short)); |
| 1642 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1641 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
| 1643 | int page_nr = swap_header->info.badpages[i]; | 1642 | int page_nr = swap_header->info.badpages[i]; |
| 1644 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) | 1643 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) |
| 1645 | error = -EINVAL; | 1644 | error = -EINVAL; |
| 1646 | else | 1645 | else |
| 1647 | p->swap_map[page_nr] = SWAP_MAP_BAD; | 1646 | swap_map[page_nr] = SWAP_MAP_BAD; |
| 1648 | } | 1647 | } |
| 1649 | nr_good_pages = swap_header->info.last_page - | 1648 | nr_good_pages = swap_header->info.last_page - |
| 1650 | swap_header->info.nr_badpages - | 1649 | swap_header->info.nr_badpages - |
| @@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1654 | } | 1653 | } |
| 1655 | 1654 | ||
| 1656 | if (nr_good_pages) { | 1655 | if (nr_good_pages) { |
| 1657 | p->swap_map[0] = SWAP_MAP_BAD; | 1656 | swap_map[0] = SWAP_MAP_BAD; |
| 1658 | p->max = maxpages; | 1657 | p->max = maxpages; |
| 1659 | p->pages = nr_good_pages; | 1658 | p->pages = nr_good_pages; |
| 1660 | nr_extents = setup_swap_extents(p, &span); | 1659 | nr_extents = setup_swap_extents(p, &span); |
| @@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
| 1672 | 1671 | ||
| 1673 | mutex_lock(&swapon_mutex); | 1672 | mutex_lock(&swapon_mutex); |
| 1674 | spin_lock(&swap_lock); | 1673 | spin_lock(&swap_lock); |
| 1674 | if (swap_flags & SWAP_FLAG_PREFER) | ||
| 1675 | p->prio = | ||
| 1676 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | ||
| 1677 | else | ||
| 1678 | p->prio = --least_priority; | ||
| 1679 | p->swap_map = swap_map; | ||
| 1675 | p->flags = SWP_ACTIVE; | 1680 | p->flags = SWP_ACTIVE; |
| 1676 | nr_swap_pages += nr_good_pages; | 1681 | nr_swap_pages += nr_good_pages; |
| 1677 | total_swap_pages += nr_good_pages; | 1682 | total_swap_pages += nr_good_pages; |
| @@ -1707,12 +1712,8 @@ bad_swap: | |||
| 1707 | destroy_swap_extents(p); | 1712 | destroy_swap_extents(p); |
| 1708 | bad_swap_2: | 1713 | bad_swap_2: |
| 1709 | spin_lock(&swap_lock); | 1714 | spin_lock(&swap_lock); |
| 1710 | swap_map = p->swap_map; | ||
| 1711 | p->swap_file = NULL; | 1715 | p->swap_file = NULL; |
| 1712 | p->swap_map = NULL; | ||
| 1713 | p->flags = 0; | 1716 | p->flags = 0; |
| 1714 | if (!(swap_flags & SWAP_FLAG_PREFER)) | ||
| 1715 | ++least_priority; | ||
| 1716 | spin_unlock(&swap_lock); | 1717 | spin_unlock(&swap_lock); |
| 1717 | vfree(swap_map); | 1718 | vfree(swap_map); |
| 1718 | if (swap_file) | 1719 | if (swap_file) |
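Illustrative sketch, not part of the patch: the SWAP_FLAG_PREFER decoding that the hunks above move under swapon_mutex/swap_lock corresponds to the flags userspace passes to swapon(2). A minimal example, assuming a hypothetical swap device path and priority value:

	#include <stdio.h>
	#include <sys/swap.h>

	int main(void)
	{
		int prio = 5;	/* hypothetical priority */
		int flags = SWAP_FLAG_PREFER |
			    ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

		if (swapon("/dev/sdb2", flags) != 0)	/* hypothetical device */
			perror("swapon");
		return 0;
	}

As the diff shows, the kernel now applies this priority (or a default negative one) only once the swap map is fully set up, rather than at the top of sys_swapon().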
diff --git a/mm/truncate.c b/mm/truncate.c index b8961cb63414..250505091d37 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 104 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 104 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
| 105 | 105 | ||
| 106 | remove_from_page_cache(page); | 106 | remove_from_page_cache(page); |
| 107 | ClearPageUptodate(page); | ||
| 108 | ClearPageMappedToDisk(page); | 107 | ClearPageMappedToDisk(page); |
| 109 | page_cache_release(page); /* pagecache ref */ | 108 | page_cache_release(page); /* pagecache ref */ |
| 110 | } | 109 | } |
| @@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
| 188 | if (page_index > next) | 187 | if (page_index > next) |
| 189 | next = page_index; | 188 | next = page_index; |
| 190 | next++; | 189 | next++; |
| 191 | if (TestSetPageLocked(page)) | 190 | if (!trylock_page(page)) |
| 192 | continue; | 191 | continue; |
| 193 | if (PageWriteback(page)) { | 192 | if (PageWriteback(page)) { |
| 194 | unlock_page(page); | 193 | unlock_page(page); |
| @@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping, | |||
| 281 | pgoff_t index; | 280 | pgoff_t index; |
| 282 | int lock_failed; | 281 | int lock_failed; |
| 283 | 282 | ||
| 284 | lock_failed = TestSetPageLocked(page); | 283 | lock_failed = !trylock_page(page); |
| 285 | 284 | ||
| 286 | /* | 285 | /* |
| 287 | * We really shouldn't be looking at the ->index of an | 286 | * We really shouldn't be looking at the ->index of an |
| @@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
| 349 | if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) | 348 | if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) |
| 350 | return 0; | 349 | return 0; |
| 351 | 350 | ||
| 352 | write_lock_irq(&mapping->tree_lock); | 351 | spin_lock_irq(&mapping->tree_lock); |
| 353 | if (PageDirty(page)) | 352 | if (PageDirty(page)) |
| 354 | goto failed; | 353 | goto failed; |
| 355 | 354 | ||
| 356 | BUG_ON(PagePrivate(page)); | 355 | BUG_ON(PagePrivate(page)); |
| 357 | __remove_from_page_cache(page); | 356 | __remove_from_page_cache(page); |
| 358 | write_unlock_irq(&mapping->tree_lock); | 357 | spin_unlock_irq(&mapping->tree_lock); |
| 359 | ClearPageUptodate(page); | ||
| 360 | page_cache_release(page); /* pagecache ref */ | 358 | page_cache_release(page); /* pagecache ref */ |
| 361 | return 1; | 359 | return 1; |
| 362 | failed: | 360 | failed: |
| 363 | write_unlock_irq(&mapping->tree_lock); | 361 | spin_unlock_irq(&mapping->tree_lock); |
| 364 | return 0; | 362 | return 0; |
| 365 | } | 363 | } |
| 366 | 364 | ||
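Illustrative sketch, not part of the patch: the TestSetPageLocked() call sites converted above now use trylock_page(), which returns true only when the page lock was actually taken. A hedged kernel-style sketch of that lock-or-skip idiom, using a hypothetical helper name:

	#include <linux/pagemap.h>
	#include <linux/page-flags.h>

	/* Hypothetical helper: lock a page opportunistically, skipping pages
	 * that are already locked or currently under writeback. */
	static int try_lock_idle_page(struct page *page)
	{
		if (!trylock_page(page))	/* contended: let the owner finish */
			return 0;
		if (PageWriteback(page)) {
			unlock_page(page);
			return 0;
		}
		return 1;			/* caller must unlock_page() later */
	}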
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -1,7 +1,9 @@ | |||
| 1 | #include <linux/mm.h> | ||
| 1 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
| 2 | #include <linux/string.h> | 3 | #include <linux/string.h> |
| 3 | #include <linux/module.h> | 4 | #include <linux/module.h> |
| 4 | #include <linux/err.h> | 5 | #include <linux/err.h> |
| 6 | #include <linux/sched.h> | ||
| 5 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
| 6 | 8 | ||
| 7 | /** | 9 | /** |
| @@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp) | |||
| 68 | EXPORT_SYMBOL(kmemdup); | 70 | EXPORT_SYMBOL(kmemdup); |
| 69 | 71 | ||
| 70 | /** | 72 | /** |
| 71 | * krealloc - reallocate memory. The contents will remain unchanged. | 73 | * __krealloc - like krealloc() but don't free @p. |
| 72 | * @p: object to reallocate memory for. | 74 | * @p: object to reallocate memory for. |
| 73 | * @new_size: how many bytes of memory are required. | 75 | * @new_size: how many bytes of memory are required. |
| 74 | * @flags: the type of memory to allocate. | 76 | * @flags: the type of memory to allocate. |
| 75 | * | 77 | * |
| 76 | * The contents of the object pointed to are preserved up to the | 78 | * This function is like krealloc() except it never frees the originally |
| 77 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | 79 | * allocated buffer. Use this if you don't want to free the buffer immediately |
| 78 | * behaves exactly like kmalloc(). If @size is 0 and @p is not a | 80 | * like, for example, with RCU. |
| 79 | * %NULL pointer, the object pointed to is freed. | ||
| 80 | */ | 81 | */ |
| 81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 82 | void *__krealloc(const void *p, size_t new_size, gfp_t flags) |
| 82 | { | 83 | { |
| 83 | void *ret; | 84 | void *ret; |
| 84 | size_t ks = 0; | 85 | size_t ks = 0; |
| 85 | 86 | ||
| 86 | if (unlikely(!new_size)) { | 87 | if (unlikely(!new_size)) |
| 87 | kfree(p); | ||
| 88 | return ZERO_SIZE_PTR; | 88 | return ZERO_SIZE_PTR; |
| 89 | } | ||
| 90 | 89 | ||
| 91 | if (p) | 90 | if (p) |
| 92 | ks = ksize(p); | 91 | ks = ksize(p); |
| @@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
| 95 | return (void *)p; | 94 | return (void *)p; |
| 96 | 95 | ||
| 97 | ret = kmalloc_track_caller(new_size, flags); | 96 | ret = kmalloc_track_caller(new_size, flags); |
| 98 | if (ret && p) { | 97 | if (ret && p) |
| 99 | memcpy(ret, p, ks); | 98 | memcpy(ret, p, ks); |
| 99 | |||
| 100 | return ret; | ||
| 101 | } | ||
| 102 | EXPORT_SYMBOL(__krealloc); | ||
| 103 | |||
| 104 | /** | ||
| 105 | * krealloc - reallocate memory. The contents will remain unchanged. | ||
| 106 | * @p: object to reallocate memory for. | ||
| 107 | * @new_size: how many bytes of memory are required. | ||
| 108 | * @flags: the type of memory to allocate. | ||
| 109 | * | ||
| 110 | * The contents of the object pointed to are preserved up to the | ||
| 111 | * lesser of the new and old sizes. If @p is %NULL, krealloc() | ||
| 112 | * behaves exactly like kmalloc(). If @size is 0 and @p is not a | ||
| 113 | * %NULL pointer, the object pointed to is freed. | ||
| 114 | */ | ||
| 115 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | ||
| 116 | { | ||
| 117 | void *ret; | ||
| 118 | |||
| 119 | if (unlikely(!new_size)) { | ||
| 100 | kfree(p); | 120 | kfree(p); |
| 121 | return ZERO_SIZE_PTR; | ||
| 101 | } | 122 | } |
| 123 | |||
| 124 | ret = __krealloc(p, new_size, flags); | ||
| 125 | if (ret && p != ret) | ||
| 126 | kfree(p); | ||
| 127 | |||
| 102 | return ret; | 128 | return ret; |
| 103 | } | 129 | } |
| 104 | EXPORT_SYMBOL(krealloc); | 130 | EXPORT_SYMBOL(krealloc); |
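Illustrative sketch, not part of the patch: the comment above names RCU as the motivating user of __krealloc(). A hedged example with hypothetical struct and function names, where the old buffer may still be read under rcu_read_lock() and is therefore freed only after a grace period:

	#include <linux/kernel.h>
	#include <linux/slab.h>
	#include <linux/rcupdate.h>

	struct rcu_buf {			/* hypothetical RCU-managed buffer */
		struct rcu_head rcu;
		size_t len;
		char data[];
	};

	static void rcu_buf_free(struct rcu_head *head)
	{
		kfree(container_of(head, struct rcu_buf, rcu));
	}

	/* Caller holds the update-side lock protecting *slot. */
	static int rcu_buf_grow(struct rcu_buf **slot, size_t len, gfp_t gfp)
	{
		struct rcu_buf *old = *slot;
		struct rcu_buf *new = __krealloc(old, sizeof(*new) + len, gfp);

		if (!new)
			return -ENOMEM;
		new->len = len;
		rcu_assign_pointer(*slot, new);
		if (old && old != new)
			call_rcu(&old->rcu, rcu_buf_free);	/* defer the free */
		return 0;
	}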
| @@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n) | |||
| 136 | return p; | 162 | return p; |
| 137 | } | 163 | } |
| 138 | EXPORT_SYMBOL(strndup_user); | 164 | EXPORT_SYMBOL(strndup_user); |
| 165 | |||
| 166 | #ifndef HAVE_ARCH_PICK_MMAP_LAYOUT | ||
| 167 | void arch_pick_mmap_layout(struct mm_struct *mm) | ||
| 168 | { | ||
| 169 | mm->mmap_base = TASK_UNMAPPED_BASE; | ||
| 170 | mm->get_unmapped_area = arch_get_unmapped_area; | ||
| 171 | mm->unmap_area = arch_unmap_area; | ||
| 172 | } | ||
| 173 | #endif | ||
| 174 | |||
| 175 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | ||
| 176 | int nr_pages, int write, struct page **pages) | ||
| 177 | { | ||
| 178 | struct mm_struct *mm = current->mm; | ||
| 179 | int ret; | ||
| 180 | |||
| 181 | down_read(&mm->mmap_sem); | ||
| 182 | ret = get_user_pages(current, mm, start, nr_pages, | ||
| 183 | write, 0, pages, NULL); | ||
| 184 | up_read(&mm->mmap_sem); | ||
| 185 | |||
| 186 | return ret; | ||
| 187 | } | ||
| 188 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | ||
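Illustrative sketch, not part of the patch: a typical caller of the weak get_user_pages_fast() fallback defined above, with a hypothetical function name. It pins a user buffer for writing and drops the page references again if the pin was only partial:

	#include <linux/mm.h>
	#include <linux/errno.h>

	static int pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
	{
		int got = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);

		if (got < nr_pages) {
			while (got > 0)			/* undo a partial pin */
				put_page(pages[--got]);
			return -EFAULT;
		}
		return 0;				/* caller put_page()s when done */
	}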
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d125..85b9a0d2c877 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
| 381 | return; | 381 | return; |
| 382 | 382 | ||
| 383 | if ((PAGE_SIZE-1) & (unsigned long)addr) { | 383 | if ((PAGE_SIZE-1) & (unsigned long)addr) { |
| 384 | printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); | 384 | WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); |
| 385 | WARN_ON(1); | ||
| 386 | return; | 385 | return; |
| 387 | } | 386 | } |
| 388 | 387 | ||
| 389 | area = remove_vm_area(addr); | 388 | area = remove_vm_area(addr); |
| 390 | if (unlikely(!area)) { | 389 | if (unlikely(!area)) { |
| 391 | printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", | 390 | WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", |
| 392 | addr); | 391 | addr); |
| 393 | WARN_ON(1); | ||
| 394 | return; | 392 | return; |
| 395 | } | 393 | } |
| 396 | 394 | ||
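Illustrative sketch, not part of the patch: the conversion above relies on WARN() taking a printk-style format, so a single call emits both the message and the backtrace that previously needed printk() plus WARN_ON(1). A minimal sketch with a hypothetical helper:

	#include <linux/kernel.h>
	#include <linux/mm.h>

	static void check_vfree_addr(const void *addr)	/* hypothetical helper */
	{
		if ((PAGE_SIZE - 1) & (unsigned long)addr)
			WARN(1, KERN_ERR "misaligned vfree() address (%p)\n", addr);
	}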
| @@ -931,6 +929,25 @@ static void s_stop(struct seq_file *m, void *p) | |||
| 931 | read_unlock(&vmlist_lock); | 929 | read_unlock(&vmlist_lock); |
| 932 | } | 930 | } |
| 933 | 931 | ||
| 932 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) | ||
| 933 | { | ||
| 934 | if (NUMA_BUILD) { | ||
| 935 | unsigned int nr, *counters = m->private; | ||
| 936 | |||
| 937 | if (!counters) | ||
| 938 | return; | ||
| 939 | |||
| 940 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | ||
| 941 | |||
| 942 | for (nr = 0; nr < v->nr_pages; nr++) | ||
| 943 | counters[page_to_nid(v->pages[nr])]++; | ||
| 944 | |||
| 945 | for_each_node_state(nr, N_HIGH_MEMORY) | ||
| 946 | if (counters[nr]) | ||
| 947 | seq_printf(m, " N%u=%u", nr, counters[nr]); | ||
| 948 | } | ||
| 949 | } | ||
| 950 | |||
| 934 | static int s_show(struct seq_file *m, void *p) | 951 | static int s_show(struct seq_file *m, void *p) |
| 935 | { | 952 | { |
| 936 | struct vm_struct *v = p; | 953 | struct vm_struct *v = p; |
| @@ -967,6 +984,7 @@ static int s_show(struct seq_file *m, void *p) | |||
| 967 | if (v->flags & VM_VPAGES) | 984 | if (v->flags & VM_VPAGES) |
| 968 | seq_printf(m, " vpages"); | 985 | seq_printf(m, " vpages"); |
| 969 | 986 | ||
| 987 | show_numa_info(m, v); | ||
| 970 | seq_putc(m, '\n'); | 988 | seq_putc(m, '\n'); |
| 971 | return 0; | 989 | return 0; |
| 972 | } | 990 | } |
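Illustrative sketch, not part of the patch: on NUMA builds the new show_numa_info() appends " N<node>=<pages>" fields to each /proc/vmallocinfo line. A small userspace reader that picks out areas with pages on node 0:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[512];
		FILE *f = fopen("/proc/vmallocinfo", "r");

		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			if (strstr(line, " N0="))	/* pages on node 0 */
				fputs(line, stdout);
		fclose(f);
		return 0;
	}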
diff --git a/mm/vmscan.c b/mm/vmscan.c index 967d30ccd92b..1ff1a58e7c10 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/kthread.h> | 38 | #include <linux/kthread.h> |
| 39 | #include <linux/freezer.h> | 39 | #include <linux/freezer.h> |
| 40 | #include <linux/memcontrol.h> | 40 | #include <linux/memcontrol.h> |
| 41 | #include <linux/delayacct.h> | ||
| 41 | 42 | ||
| 42 | #include <asm/tlbflush.h> | 43 | #include <asm/tlbflush.h> |
| 43 | #include <asm/div64.h> | 44 | #include <asm/div64.h> |
| @@ -390,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 390 | } | 391 | } |
| 391 | 392 | ||
| 392 | /* | 393 | /* |
| 393 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | 394 | * Same as remove_mapping, but if the page is removed from the mapping, it |
| 394 | * someone else has a ref on the page, abort and return 0. If it was | 395 | * gets returned with a refcount of 0. |
| 395 | * successfully detached, return 1. Assumes the caller has a single ref on | ||
| 396 | * this page. | ||
| 397 | */ | 396 | */ |
| 398 | int remove_mapping(struct address_space *mapping, struct page *page) | 397 | static int __remove_mapping(struct address_space *mapping, struct page *page) |
| 399 | { | 398 | { |
| 400 | BUG_ON(!PageLocked(page)); | 399 | BUG_ON(!PageLocked(page)); |
| 401 | BUG_ON(mapping != page_mapping(page)); | 400 | BUG_ON(mapping != page_mapping(page)); |
| 402 | 401 | ||
| 403 | write_lock_irq(&mapping->tree_lock); | 402 | spin_lock_irq(&mapping->tree_lock); |
| 404 | /* | 403 | /* |
| 405 | * The non racy check for a busy page. | 404 | * The non racy check for a busy page. |
| 406 | * | 405 | * |
| @@ -426,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
| 426 | * Note that if SetPageDirty is always performed via set_page_dirty, | 425 | * Note that if SetPageDirty is always performed via set_page_dirty, |
| 427 | * and thus under tree_lock, then this ordering is not required. | 426 | * and thus under tree_lock, then this ordering is not required. |
| 428 | */ | 427 | */ |
| 429 | if (unlikely(page_count(page) != 2)) | 428 | if (!page_freeze_refs(page, 2)) |
| 430 | goto cannot_free; | 429 | goto cannot_free; |
| 431 | smp_rmb(); | 430 | /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ |
| 432 | if (unlikely(PageDirty(page))) | 431 | if (unlikely(PageDirty(page))) { |
| 432 | page_unfreeze_refs(page, 2); | ||
| 433 | goto cannot_free; | 433 | goto cannot_free; |
| 434 | } | ||
| 434 | 435 | ||
| 435 | if (PageSwapCache(page)) { | 436 | if (PageSwapCache(page)) { |
| 436 | swp_entry_t swap = { .val = page_private(page) }; | 437 | swp_entry_t swap = { .val = page_private(page) }; |
| 437 | __delete_from_swap_cache(page); | 438 | __delete_from_swap_cache(page); |
| 438 | write_unlock_irq(&mapping->tree_lock); | 439 | spin_unlock_irq(&mapping->tree_lock); |
| 439 | swap_free(swap); | 440 | swap_free(swap); |
| 440 | __put_page(page); /* The pagecache ref */ | 441 | } else { |
| 441 | return 1; | 442 | __remove_from_page_cache(page); |
| 443 | spin_unlock_irq(&mapping->tree_lock); | ||
| 442 | } | 444 | } |
| 443 | 445 | ||
| 444 | __remove_from_page_cache(page); | ||
| 445 | write_unlock_irq(&mapping->tree_lock); | ||
| 446 | __put_page(page); | ||
| 447 | return 1; | 446 | return 1; |
| 448 | 447 | ||
| 449 | cannot_free: | 448 | cannot_free: |
| 450 | write_unlock_irq(&mapping->tree_lock); | 449 | spin_unlock_irq(&mapping->tree_lock); |
| 450 | return 0; | ||
| 451 | } | ||
| 452 | |||
| 453 | /* | ||
| 454 | * Attempt to detach a locked page from its ->mapping. If it is dirty or if | ||
| 455 | * someone else has a ref on the page, abort and return 0. If it was | ||
| 456 | * successfully detached, return 1. Assumes the caller has a single ref on | ||
| 457 | * this page. | ||
| 458 | */ | ||
| 459 | int remove_mapping(struct address_space *mapping, struct page *page) | ||
| 460 | { | ||
| 461 | if (__remove_mapping(mapping, page)) { | ||
| 462 | /* | ||
| 463 | * Unfreezing the refcount with 1 rather than 2 effectively | ||
| 464 | * drops the pagecache ref for us without requiring another | ||
| 465 | * atomic operation. | ||
| 466 | */ | ||
| 467 | page_unfreeze_refs(page, 1); | ||
| 468 | return 1; | ||
| 469 | } | ||
| 451 | return 0; | 470 | return 0; |
| 452 | } | 471 | } |
| 453 | 472 | ||
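Illustrative sketch, not part of the patch: __remove_mapping() above depends on the freeze/unfreeze helpers added outside mm/ (in include/linux/pagemap.h) by this series. Simplified demo versions, with hypothetical names, of what they do: freeze the refcount to zero only if it currently equals the expected count, so no new speculative reference can be taken while the page is being removed.

	#include <linux/mm.h>
	#include <asm/atomic.h>

	static inline int demo_page_freeze_refs(struct page *page, int count)
	{
		return atomic_cmpxchg(&page->_count, count, 0) == count;
	}

	static inline void demo_page_unfreeze_refs(struct page *page, int count)
	{
		atomic_set(&page->_count, count);
	}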
| @@ -477,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 477 | page = lru_to_page(page_list); | 496 | page = lru_to_page(page_list); |
| 478 | list_del(&page->lru); | 497 | list_del(&page->lru); |
| 479 | 498 | ||
| 480 | if (TestSetPageLocked(page)) | 499 | if (!trylock_page(page)) |
| 481 | goto keep; | 500 | goto keep; |
| 482 | 501 | ||
| 483 | VM_BUG_ON(PageActive(page)); | 502 | VM_BUG_ON(PageActive(page)); |
| @@ -563,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 563 | * A synchronous write - probably a ramdisk. Go | 582 | * A synchronous write - probably a ramdisk. Go |
| 564 | * ahead and try to reclaim the page. | 583 | * ahead and try to reclaim the page. |
| 565 | */ | 584 | */ |
| 566 | if (TestSetPageLocked(page)) | 585 | if (!trylock_page(page)) |
| 567 | goto keep; | 586 | goto keep; |
| 568 | if (PageDirty(page) || PageWriteback(page)) | 587 | if (PageDirty(page) || PageWriteback(page)) |
| 569 | goto keep_locked; | 588 | goto keep_locked; |
| @@ -597,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 597 | if (PagePrivate(page)) { | 616 | if (PagePrivate(page)) { |
| 598 | if (!try_to_release_page(page, sc->gfp_mask)) | 617 | if (!try_to_release_page(page, sc->gfp_mask)) |
| 599 | goto activate_locked; | 618 | goto activate_locked; |
| 600 | if (!mapping && page_count(page) == 1) | 619 | if (!mapping && page_count(page) == 1) { |
| 601 | goto free_it; | 620 | unlock_page(page); |
| 621 | if (put_page_testzero(page)) | ||
| 622 | goto free_it; | ||
| 623 | else { | ||
| 624 | /* | ||
| 625 | * rare race with speculative reference. | ||
| 626 | * the speculative reference will free | ||
| 627 | * this page shortly, so we may | ||
| 628 | * increment nr_reclaimed here (and | ||
| 629 | * leave it off the LRU). | ||
| 630 | */ | ||
| 631 | nr_reclaimed++; | ||
| 632 | continue; | ||
| 633 | } | ||
| 634 | } | ||
| 602 | } | 635 | } |
| 603 | 636 | ||
| 604 | if (!mapping || !remove_mapping(mapping, page)) | 637 | if (!mapping || !__remove_mapping(mapping, page)) |
| 605 | goto keep_locked; | 638 | goto keep_locked; |
| 606 | 639 | ||
| 607 | free_it: | ||
| 608 | unlock_page(page); | 640 | unlock_page(page); |
| 641 | free_it: | ||
| 609 | nr_reclaimed++; | 642 | nr_reclaimed++; |
| 610 | if (!pagevec_add(&freed_pvec, page)) | 643 | if (!pagevec_add(&freed_pvec, page)) { |
| 611 | __pagevec_release_nonlru(&freed_pvec); | 644 | __pagevec_free(&freed_pvec); |
| 645 | pagevec_reinit(&freed_pvec); | ||
| 646 | } | ||
| 612 | continue; | 647 | continue; |
| 613 | 648 | ||
| 614 | activate_locked: | 649 | activate_locked: |
| @@ -622,7 +657,7 @@ keep: | |||
| 622 | } | 657 | } |
| 623 | list_splice(&ret_pages, page_list); | 658 | list_splice(&ret_pages, page_list); |
| 624 | if (pagevec_count(&freed_pvec)) | 659 | if (pagevec_count(&freed_pvec)) |
| 625 | __pagevec_release_nonlru(&freed_pvec); | 660 | __pagevec_free(&freed_pvec); |
| 626 | count_vm_events(PGACTIVATE, pgactivate); | 661 | count_vm_events(PGACTIVATE, pgactivate); |
| 627 | return nr_reclaimed; | 662 | return nr_reclaimed; |
| 628 | } | 663 | } |
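Illustrative sketch, not part of the patch: shrink_page_list() above now batches reclaimed pages and hands them back to the page allocator with __pagevec_free() instead of releasing them through __pagevec_release_nonlru(). The batching pattern, pulled out into a hypothetical standalone helper for pages whose reference count has already been dropped to zero (as __remove_mapping() arranges):

	#include <linux/list.h>
	#include <linux/mm.h>
	#include <linux/pagevec.h>

	static void free_reclaimed_pages(struct list_head *pages)
	{
		struct pagevec freed_pvec;
		struct page *page, *next;

		pagevec_init(&freed_pvec, 1);		/* treat pages as cold */
		list_for_each_entry_safe(page, next, pages, lru) {
			list_del(&page->lru);
			if (!pagevec_add(&freed_pvec, page)) {	/* pagevec full */
				__pagevec_free(&freed_pvec);
				pagevec_reinit(&freed_pvec);
			}
		}
		if (pagevec_count(&freed_pvec))
			__pagevec_free(&freed_pvec);
	}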
| @@ -1316,6 +1351,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1316 | struct zone *zone; | 1351 | struct zone *zone; |
| 1317 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1352 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
| 1318 | 1353 | ||
| 1354 | delayacct_freepages_start(); | ||
| 1355 | |||
| 1319 | if (scan_global_lru(sc)) | 1356 | if (scan_global_lru(sc)) |
| 1320 | count_vm_event(ALLOCSTALL); | 1357 | count_vm_event(ALLOCSTALL); |
| 1321 | /* | 1358 | /* |
| @@ -1371,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1371 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1408 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) |
| 1372 | congestion_wait(WRITE, HZ/10); | 1409 | congestion_wait(WRITE, HZ/10); |
| 1373 | } | 1410 | } |
| 1374 | /* top priority shrink_caches still had more to do? don't OOM, then */ | 1411 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
| 1375 | if (!sc->all_unreclaimable && scan_global_lru(sc)) | 1412 | if (!sc->all_unreclaimable && scan_global_lru(sc)) |
| 1376 | ret = nr_reclaimed; | 1413 | ret = nr_reclaimed; |
| 1377 | out: | 1414 | out: |
| @@ -1396,6 +1433,8 @@ out: | |||
| 1396 | } else | 1433 | } else |
| 1397 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); | 1434 | mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); |
| 1398 | 1435 | ||
| 1436 | delayacct_freepages_end(); | ||
| 1437 | |||
| 1399 | return ret; | 1438 | return ret; |
| 1400 | } | 1439 | } |
| 1401 | 1440 | ||
| @@ -1940,7 +1979,7 @@ module_init(kswapd_init) | |||
| 1940 | int zone_reclaim_mode __read_mostly; | 1979 | int zone_reclaim_mode __read_mostly; |
| 1941 | 1980 | ||
| 1942 | #define RECLAIM_OFF 0 | 1981 | #define RECLAIM_OFF 0 |
| 1943 | #define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ | 1982 | #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ |
| 1944 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ | 1983 | #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ |
| 1945 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ | 1984 | #define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ |
| 1946 | 1985 | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index db9eabb2c5b3..b0d08e667ece 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/err.h> | 13 | #include <linux/err.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/cpu.h> | 15 | #include <linux/cpu.h> |
| 16 | #include <linux/vmstat.h> | ||
| 16 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
| 17 | 18 | ||
| 18 | #ifdef CONFIG_VM_EVENT_COUNTERS | 19 | #ifdef CONFIG_VM_EVENT_COUNTERS |
| @@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | |||
| 26 | 27 | ||
| 27 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); | 28 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); |
| 28 | 29 | ||
| 29 | for_each_cpu_mask(cpu, *cpumask) { | 30 | for_each_cpu_mask_nr(cpu, *cpumask) { |
| 30 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); | 31 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); |
| 31 | 32 | ||
| 32 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) | 33 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) |
