Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  28
-rw-r--r--  mm/Makefile          |   5
-rw-r--r--  mm/allocpercpu.c     | 177
-rw-r--r--  mm/backing-dev.c     |   2
-rw-r--r--  mm/bootmem.c         | 225
-rw-r--r--  mm/fadvise.c         |  10
-rw-r--r--  mm/failslab.c        |  18
-rw-r--r--  mm/filemap.c         | 173
-rw-r--r--  mm/filemap_xip.c     |   2
-rw-r--r--  mm/fremap.c          |   2
-rw-r--r--  mm/hugetlb.c         | 556
-rw-r--r--  mm/hwpoison-inject.c | 113
-rw-r--r--  mm/internal.h        |  35
-rw-r--r--  mm/kmemleak.c        | 192
-rw-r--r--  mm/ksm.c             | 959
-rw-r--r--  mm/maccess.c         |  11
-rw-r--r--  mm/madvise.c         |  21
-rw-r--r--  mm/memcontrol.c      | 461
-rw-r--r--  mm/memory-failure.c  | 578
-rw-r--r--  mm/memory.c          | 210
-rw-r--r--  mm/memory_hotplug.c  |  20
-rw-r--r--  mm/mempolicy.c       | 181
-rw-r--r--  mm/migrate.c         | 176
-rw-r--r--  mm/mincore.c         |  37
-rw-r--r--  mm/mlock.c           |  57
-rw-r--r--  mm/mmap.c            | 287
-rw-r--r--  mm/mmu_context.c     |   3
-rw-r--r--  mm/mremap.c          | 248
-rw-r--r--  mm/nommu.c           | 158
-rw-r--r--  mm/oom_kill.c        | 105
-rw-r--r--  mm/page-writeback.c  |  12
-rw-r--r--  mm/page_alloc.c      | 442
-rw-r--r--  mm/page_io.c         |  17
-rw-r--r--  mm/pagewalk.c        |  32
-rw-r--r--  mm/percpu.c          |  64
-rw-r--r--  mm/readahead.c       |  18
-rw-r--r--  mm/rmap.c            | 535
-rw-r--r--  mm/shmem.c           |  84
-rw-r--r--  mm/shmem_acl.c       | 171
-rw-r--r--  mm/slab.c            | 173
-rw-r--r--  mm/slub.c            | 363
-rw-r--r--  mm/sparse-vmemmap.c  |  76
-rw-r--r--  mm/sparse.c          | 196
-rw-r--r--  mm/swap.c            |   2
-rw-r--r--  mm/swapfile.c        | 887
-rw-r--r--  mm/truncate.c        |  38
-rw-r--r--  mm/util.c            |   2
-rw-r--r--  mm/vmalloc.c         | 125
-rw-r--r--  mm/vmscan.c          | 485
-rw-r--r--  mm/vmstat.c          |  27
50 files changed, 5686 insertions, 3113 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 44cf6f0a3a6d..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME | |||
| 115 | config SPARSEMEM_VMEMMAP_ENABLE | 115 | config SPARSEMEM_VMEMMAP_ENABLE |
| 116 | bool | 116 | bool |
| 117 | 117 | ||
| 118 | config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 119 | def_bool y | ||
| 120 | depends on SPARSEMEM && X86_64 | ||
| 121 | |||
| 118 | config SPARSEMEM_VMEMMAP | 122 | config SPARSEMEM_VMEMMAP |
| 119 | bool "Sparse Memory virtual memmap" | 123 | bool "Sparse Memory virtual memmap" |
| 120 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE | 124 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE |
| @@ -158,11 +162,13 @@ config PAGEFLAGS_EXTENDED | |||
| 158 | # Default to 4 for wider testing, though 8 might be more appropriate. | 162 | # Default to 4 for wider testing, though 8 might be more appropriate. |
| 159 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. | 163 | # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. |
| 160 | # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. | 164 | # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. |
| 165 | # DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. | ||
| 161 | # | 166 | # |
| 162 | config SPLIT_PTLOCK_CPUS | 167 | config SPLIT_PTLOCK_CPUS |
| 163 | int | 168 | int |
| 164 | default "4096" if ARM && !CPU_CACHE_VIPT | 169 | default "999999" if ARM && !CPU_CACHE_VIPT |
| 165 | default "4096" if PARISC && !PA20 | 170 | default "999999" if PARISC && !PA20 |
| 171 | default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC | ||
| 166 | default "4" | 172 | default "4" |
| 167 | 173 | ||
| 168 | # | 174 | # |
| @@ -193,21 +199,13 @@ config BOUNCE | |||
| 193 | config NR_QUICK | 199 | config NR_QUICK |
| 194 | int | 200 | int |
| 195 | depends on QUICKLIST | 201 | depends on QUICKLIST |
| 196 | default "2" if SUPERH || AVR32 | 202 | default "2" if AVR32 |
| 197 | default "1" | 203 | default "1" |
| 198 | 204 | ||
| 199 | config VIRT_TO_BUS | 205 | config VIRT_TO_BUS |
| 200 | def_bool y | 206 | def_bool y |
| 201 | depends on !ARCH_NO_VIRT_TO_BUS | 207 | depends on !ARCH_NO_VIRT_TO_BUS |
| 202 | 208 | ||
| 203 | config HAVE_MLOCK | ||
| 204 | bool | ||
| 205 | default y if MMU=y | ||
| 206 | |||
| 207 | config HAVE_MLOCKED_PAGE_BIT | ||
| 208 | bool | ||
| 209 | default y if HAVE_MLOCK=y | ||
| 210 | |||
| 211 | config MMU_NOTIFIER | 209 | config MMU_NOTIFIER |
| 212 | bool | 210 | bool |
| 213 | 211 | ||
| @@ -218,7 +216,7 @@ config KSM | |||
| 218 | Enable Kernel Samepage Merging: KSM periodically scans those areas | 216 | Enable Kernel Samepage Merging: KSM periodically scans those areas |
| 219 | of an application's address space that an app has advised may be | 217 | of an application's address space that an app has advised may be |
| 220 | mergeable. When it finds pages of identical content, it replaces | 218 | mergeable. When it finds pages of identical content, it replaces |
| 221 | the many instances by a single resident page with that content, so | 219 | the many instances by a single page with that content, so |
| 222 | saving memory until one or another app needs to modify the content. | 220 | saving memory until one or another app needs to modify the content. |
| 223 | Recommended for use with KVM, or with other duplicative applications. | 221 | Recommended for use with KVM, or with other duplicative applications. |
| 224 | See Documentation/vm/ksm.txt for more information: KSM is inactive | 222 | See Documentation/vm/ksm.txt for more information: KSM is inactive |
| @@ -227,6 +225,7 @@ config KSM | |||
| 227 | 225 | ||
| 228 | config DEFAULT_MMAP_MIN_ADDR | 226 | config DEFAULT_MMAP_MIN_ADDR |
| 229 | int "Low address space to protect from user allocation" | 227 | int "Low address space to protect from user allocation" |
| 228 | depends on MMU | ||
| 230 | default 4096 | 229 | default 4096 |
| 231 | help | 230 | help |
| 232 | This is the portion of low virtual memory which should be protected | 231 | This is the portion of low virtual memory which should be protected |
| @@ -257,8 +256,9 @@ config MEMORY_FAILURE | |||
| 257 | special hardware support and typically ECC memory. | 256 | special hardware support and typically ECC memory. |
| 258 | 257 | ||
| 259 | config HWPOISON_INJECT | 258 | config HWPOISON_INJECT |
| 260 | tristate "Poison pages injector" | 259 | tristate "HWPoison pages injector" |
| 261 | depends on MEMORY_FAILURE && DEBUG_KERNEL | 260 | depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS |
| 261 | select PROC_PAGE_MONITOR | ||
| 262 | 262 | ||
| 263 | config NOMMU_INITIAL_TRIM_EXCESS | 263 | config NOMMU_INITIAL_TRIM_EXCESS |
| 264 | int "Turn on mmap() excess space trimming before booting" | 264 | int "Turn on mmap() excess space trimming before booting" |
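One hunk above raises the SPLIT_PTLOCK_CPUS defaults to "999999" whenever DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC is set, because a debug-enlarged spinlock_t would bloat struct page. The value only matters as a threshold; here is a minimal sketch of the consumer side, assuming the contemporary definition in include/linux/mm_types.h (not part of this patch):

```c
/*
 * Sketch: split page table locks are compiled in only when the kernel is
 * built for at least CONFIG_SPLIT_PTLOCK_CPUS CPUs, so "999999" effectively
 * disables them.
 */
#define USE_SPLIT_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)

#if USE_SPLIT_PTLOCKS
/* each pte page carries its own spinlock_t inside struct page */
#else
/* every page table in an mm serializes on mm->page_table_lock */
#endif
```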
diff --git a/mm/Makefile b/mm/Makefile
index ebf849042ed3..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o | |||
| 22 | obj-$(CONFIG_NUMA) += mempolicy.o | 22 | obj-$(CONFIG_NUMA) += mempolicy.o |
| 23 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 23 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
| 24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | 24 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o |
| 25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | ||
| 26 | obj-$(CONFIG_SLOB) += slob.o | 25 | obj-$(CONFIG_SLOB) += slob.o |
| 27 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
| 28 | obj-$(CONFIG_KSM) += ksm.o | 27 | obj-$(CONFIG_KSM) += ksm.o |
| @@ -34,11 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o | |||
| 34 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 33 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 35 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 34 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
| 36 | obj-$(CONFIG_MIGRATION) += migrate.o | 35 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 37 | ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA | ||
| 38 | obj-$(CONFIG_SMP) += percpu.o | 36 | obj-$(CONFIG_SMP) += percpu.o |
| 39 | else | ||
| 40 | obj-$(CONFIG_SMP) += allocpercpu.o | ||
| 41 | endif | ||
| 42 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 37 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 43 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 38 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
| 44 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 39 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
deleted file mode 100644
index df34ceae0c67..000000000000
--- a/mm/allocpercpu.c
+++ /dev/null
| @@ -1,177 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/allocpercpu.c | ||
| 3 | * | ||
| 4 | * Separated from slab.c August 11, 2006 Christoph Lameter | ||
| 5 | */ | ||
| 6 | #include <linux/mm.h> | ||
| 7 | #include <linux/module.h> | ||
| 8 | #include <linux/bootmem.h> | ||
| 9 | #include <asm/sections.h> | ||
| 10 | |||
| 11 | #ifndef cache_line_size | ||
| 12 | #define cache_line_size() L1_CACHE_BYTES | ||
| 13 | #endif | ||
| 14 | |||
| 15 | /** | ||
| 16 | * percpu_depopulate - depopulate per-cpu data for given cpu | ||
| 17 | * @__pdata: per-cpu data to depopulate | ||
| 18 | * @cpu: depopulate per-cpu data for this cpu | ||
| 19 | * | ||
| 20 | * Depopulating per-cpu data for a cpu going offline would be a typical | ||
| 21 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
| 22 | */ | ||
| 23 | static void percpu_depopulate(void *__pdata, int cpu) | ||
| 24 | { | ||
| 25 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
| 26 | |||
| 27 | kfree(pdata->ptrs[cpu]); | ||
| 28 | pdata->ptrs[cpu] = NULL; | ||
| 29 | } | ||
| 30 | |||
| 31 | /** | ||
| 32 | * percpu_depopulate_mask - depopulate per-cpu data for some cpu's | ||
| 33 | * @__pdata: per-cpu data to depopulate | ||
| 34 | * @mask: depopulate per-cpu data for cpu's selected through mask bits | ||
| 35 | */ | ||
| 36 | static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) | ||
| 37 | { | ||
| 38 | int cpu; | ||
| 39 | for_each_cpu_mask_nr(cpu, *mask) | ||
| 40 | percpu_depopulate(__pdata, cpu); | ||
| 41 | } | ||
| 42 | |||
| 43 | #define percpu_depopulate_mask(__pdata, mask) \ | ||
| 44 | __percpu_depopulate_mask((__pdata), &(mask)) | ||
| 45 | |||
| 46 | /** | ||
| 47 | * percpu_populate - populate per-cpu data for given cpu | ||
| 48 | * @__pdata: per-cpu data to populate further | ||
| 49 | * @size: size of per-cpu object | ||
| 50 | * @gfp: may sleep or not etc. | ||
| 51 | * @cpu: populate per-data for this cpu | ||
| 52 | * | ||
| 53 | * Populating per-cpu data for a cpu coming online would be a typical | ||
| 54 | * use case. You need to register a cpu hotplug handler for that purpose. | ||
| 55 | * Per-cpu object is populated with zeroed buffer. | ||
| 56 | */ | ||
| 57 | static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) | ||
| 58 | { | ||
| 59 | struct percpu_data *pdata = __percpu_disguise(__pdata); | ||
| 60 | int node = cpu_to_node(cpu); | ||
| 61 | |||
| 62 | /* | ||
| 63 | * We should make sure each CPU gets private memory. | ||
| 64 | */ | ||
| 65 | size = roundup(size, cache_line_size()); | ||
| 66 | |||
| 67 | BUG_ON(pdata->ptrs[cpu]); | ||
| 68 | if (node_online(node)) | ||
| 69 | pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); | ||
| 70 | else | ||
| 71 | pdata->ptrs[cpu] = kzalloc(size, gfp); | ||
| 72 | return pdata->ptrs[cpu]; | ||
| 73 | } | ||
| 74 | |||
| 75 | /** | ||
| 76 | * percpu_populate_mask - populate per-cpu data for more cpu's | ||
| 77 | * @__pdata: per-cpu data to populate further | ||
| 78 | * @size: size of per-cpu object | ||
| 79 | * @gfp: may sleep or not etc. | ||
| 80 | * @mask: populate per-cpu data for cpu's selected through mask bits | ||
| 81 | * | ||
| 82 | * Per-cpu objects are populated with zeroed buffers. | ||
| 83 | */ | ||
| 84 | static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, | ||
| 85 | cpumask_t *mask) | ||
| 86 | { | ||
| 87 | cpumask_t populated; | ||
| 88 | int cpu; | ||
| 89 | |||
| 90 | cpus_clear(populated); | ||
| 91 | for_each_cpu_mask_nr(cpu, *mask) | ||
| 92 | if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { | ||
| 93 | __percpu_depopulate_mask(__pdata, &populated); | ||
| 94 | return -ENOMEM; | ||
| 95 | } else | ||
| 96 | cpu_set(cpu, populated); | ||
| 97 | return 0; | ||
| 98 | } | ||
| 99 | |||
| 100 | #define percpu_populate_mask(__pdata, size, gfp, mask) \ | ||
| 101 | __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) | ||
| 102 | |||
| 103 | /** | ||
| 104 | * alloc_percpu - initial setup of per-cpu data | ||
| 105 | * @size: size of per-cpu object | ||
| 106 | * @align: alignment | ||
| 107 | * | ||
| 108 | * Allocate dynamic percpu area. Percpu objects are populated with | ||
| 109 | * zeroed buffers. | ||
| 110 | */ | ||
| 111 | void *__alloc_percpu(size_t size, size_t align) | ||
| 112 | { | ||
| 113 | /* | ||
| 114 | * We allocate whole cache lines to avoid false sharing | ||
| 115 | */ | ||
| 116 | size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); | ||
| 117 | void *pdata = kzalloc(sz, GFP_KERNEL); | ||
| 118 | void *__pdata = __percpu_disguise(pdata); | ||
| 119 | |||
| 120 | /* | ||
| 121 | * Can't easily make larger alignment work with kmalloc. WARN | ||
| 122 | * on it. Larger alignment should only be used for module | ||
| 123 | * percpu sections on SMP for which this path isn't used. | ||
| 124 | */ | ||
| 125 | WARN_ON_ONCE(align > SMP_CACHE_BYTES); | ||
| 126 | |||
| 127 | if (unlikely(!pdata)) | ||
| 128 | return NULL; | ||
| 129 | if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, | ||
| 130 | &cpu_possible_map))) | ||
| 131 | return __pdata; | ||
| 132 | kfree(pdata); | ||
| 133 | return NULL; | ||
| 134 | } | ||
| 135 | EXPORT_SYMBOL_GPL(__alloc_percpu); | ||
| 136 | |||
| 137 | /** | ||
| 138 | * free_percpu - final cleanup of per-cpu data | ||
| 139 | * @__pdata: object to clean up | ||
| 140 | * | ||
| 141 | * We simply clean up any per-cpu object left. No need for the client to | ||
| 142 | * track and specify through a bis mask which per-cpu objects are to free. | ||
| 143 | */ | ||
| 144 | void free_percpu(void *__pdata) | ||
| 145 | { | ||
| 146 | if (unlikely(!__pdata)) | ||
| 147 | return; | ||
| 148 | __percpu_depopulate_mask(__pdata, cpu_possible_mask); | ||
| 149 | kfree(__percpu_disguise(__pdata)); | ||
| 150 | } | ||
| 151 | EXPORT_SYMBOL_GPL(free_percpu); | ||
| 152 | |||
| 153 | /* | ||
| 154 | * Generic percpu area setup. | ||
| 155 | */ | ||
| 156 | #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA | ||
| 157 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | ||
| 158 | |||
| 159 | EXPORT_SYMBOL(__per_cpu_offset); | ||
| 160 | |||
| 161 | void __init setup_per_cpu_areas(void) | ||
| 162 | { | ||
| 163 | unsigned long size, i; | ||
| 164 | char *ptr; | ||
| 165 | unsigned long nr_possible_cpus = num_possible_cpus(); | ||
| 166 | |||
| 167 | /* Copy section for each CPU (we discard the original) */ | ||
| 168 | size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); | ||
| 169 | ptr = alloc_bootmem_pages(size * nr_possible_cpus); | ||
| 170 | |||
| 171 | for_each_possible_cpu(i) { | ||
| 172 | __per_cpu_offset[i] = ptr - __per_cpu_start; | ||
| 173 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
| 174 | ptr += size; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ | ||
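With the legacy allocator removed, mm/percpu.c backs all SMP configurations, but the caller-visible interface is unchanged. A hedged sketch of typical usage (the struct and function names below are invented for illustration):

```c
#include <linux/percpu.h>

struct demo_stats {
	unsigned long hits;
	unsigned long misses;
};

static struct demo_stats *demo_stats;

static int __init demo_stats_init(void)
{
	/* alloc_percpu() returns zeroed per-CPU copies of the object */
	demo_stats = alloc_percpu(struct demo_stats);
	return demo_stats ? 0 : -ENOMEM;
}

static unsigned long demo_stats_sum_hits(void)
{
	unsigned long total = 0;
	int cpu;

	/* per_cpu_ptr() resolves one CPU's copy */
	for_each_possible_cpu(cpu)
		total += per_cpu_ptr(demo_stats, cpu)->hits;

	return total;
}

static void demo_stats_exit(void)
{
	free_percpu(demo_stats);
}
```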
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 67a33a5a1a93..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
| @@ -609,7 +609,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) | |||
| 609 | * it would never exet if it is currently stuck in the refrigerator. | 609 | * it would never exet if it is currently stuck in the refrigerator. |
| 610 | */ | 610 | */ |
| 611 | list_for_each_entry(wb, &bdi->wb_list, list) { | 611 | list_for_each_entry(wb, &bdi->wb_list, list) { |
| 612 | wb->task->flags &= ~PF_FROZEN; | 612 | thaw_process(wb->task); |
| 613 | kthread_stop(wb->task); | 613 | kthread_stop(wb->task); |
| 614 | } | 614 | } |
| 615 | } | 615 | } |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..d7c791ef0036 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/kmemleak.h> | 15 | #include <linux/kmemleak.h> |
| 16 | #include <linux/range.h> | ||
| 16 | 17 | ||
| 17 | #include <asm/bug.h> | 18 | #include <asm/bug.h> |
| 18 | #include <asm/io.h> | 19 | #include <asm/io.h> |
| @@ -32,6 +33,7 @@ unsigned long max_pfn; | |||
| 32 | unsigned long saved_max_pfn; | 33 | unsigned long saved_max_pfn; |
| 33 | #endif | 34 | #endif |
| 34 | 35 | ||
| 36 | #ifndef CONFIG_NO_BOOTMEM | ||
| 35 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; | 37 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
| 36 | 38 | ||
| 37 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | 39 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); |
| @@ -142,7 +144,85 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
| 142 | min_low_pfn = start; | 144 | min_low_pfn = start; |
| 143 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | 145 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); |
| 144 | } | 146 | } |
| 147 | #endif | ||
| 148 | /* | ||
| 149 | * free_bootmem_late - free bootmem pages directly to page allocator | ||
| 150 | * @addr: starting address of the range | ||
| 151 | * @size: size of the range in bytes | ||
| 152 | * | ||
| 153 | * This is only useful when the bootmem allocator has already been torn | ||
| 154 | * down, but we are still initializing the system. Pages are given directly | ||
| 155 | * to the page allocator, no bootmem metadata is updated because it is gone. | ||
| 156 | */ | ||
| 157 | void __init free_bootmem_late(unsigned long addr, unsigned long size) | ||
| 158 | { | ||
| 159 | unsigned long cursor, end; | ||
| 160 | |||
| 161 | kmemleak_free_part(__va(addr), size); | ||
| 162 | |||
| 163 | cursor = PFN_UP(addr); | ||
| 164 | end = PFN_DOWN(addr + size); | ||
| 165 | |||
| 166 | for (; cursor < end; cursor++) { | ||
| 167 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
| 168 | totalram_pages++; | ||
| 169 | } | ||
| 170 | } | ||
| 171 | |||
| 172 | #ifdef CONFIG_NO_BOOTMEM | ||
| 173 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | ||
| 174 | { | ||
| 175 | int i; | ||
| 176 | unsigned long start_aligned, end_aligned; | ||
| 177 | int order = ilog2(BITS_PER_LONG); | ||
| 178 | |||
| 179 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | ||
| 180 | end_aligned = end & ~(BITS_PER_LONG - 1); | ||
| 181 | |||
| 182 | if (end_aligned <= start_aligned) { | ||
| 183 | #if 1 | ||
| 184 | printk(KERN_DEBUG " %lx - %lx\n", start, end); | ||
| 185 | #endif | ||
| 186 | for (i = start; i < end; i++) | ||
| 187 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
| 188 | |||
| 189 | return; | ||
| 190 | } | ||
| 191 | |||
| 192 | #if 1 | ||
| 193 | printk(KERN_DEBUG " %lx %lx - %lx %lx\n", | ||
| 194 | start, start_aligned, end_aligned, end); | ||
| 195 | #endif | ||
| 196 | for (i = start; i < start_aligned; i++) | ||
| 197 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
| 198 | |||
| 199 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | ||
| 200 | __free_pages_bootmem(pfn_to_page(i), order); | ||
| 145 | 201 | ||
| 202 | for (i = end_aligned; i < end; i++) | ||
| 203 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
| 204 | } | ||
| 205 | |||
| 206 | unsigned long __init free_all_memory_core_early(int nodeid) | ||
| 207 | { | ||
| 208 | int i; | ||
| 209 | u64 start, end; | ||
| 210 | unsigned long count = 0; | ||
| 211 | struct range *range = NULL; | ||
| 212 | int nr_range; | ||
| 213 | |||
| 214 | nr_range = get_free_all_memory_range(&range, nodeid); | ||
| 215 | |||
| 216 | for (i = 0; i < nr_range; i++) { | ||
| 217 | start = range[i].start; | ||
| 218 | end = range[i].end; | ||
| 219 | count += end - start; | ||
| 220 | __free_pages_memory(start, end); | ||
| 221 | } | ||
| 222 | |||
| 223 | return count; | ||
| 224 | } | ||
| 225 | #else | ||
| 146 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 226 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
| 147 | { | 227 | { |
| 148 | int aligned; | 228 | int aligned; |
| @@ -203,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
| 203 | 283 | ||
| 204 | return count; | 284 | return count; |
| 205 | } | 285 | } |
| 286 | #endif | ||
| 206 | 287 | ||
| 207 | /** | 288 | /** |
| 208 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 289 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
| @@ -213,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
| 213 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 294 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
| 214 | { | 295 | { |
| 215 | register_page_bootmem_info_node(pgdat); | 296 | register_page_bootmem_info_node(pgdat); |
| 297 | #ifdef CONFIG_NO_BOOTMEM | ||
| 298 | /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ | ||
| 299 | return 0; | ||
| 300 | #else | ||
| 216 | return free_all_bootmem_core(pgdat->bdata); | 301 | return free_all_bootmem_core(pgdat->bdata); |
| 302 | #endif | ||
| 217 | } | 303 | } |
| 218 | 304 | ||
| 219 | /** | 305 | /** |
| @@ -223,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
| 223 | */ | 309 | */ |
| 224 | unsigned long __init free_all_bootmem(void) | 310 | unsigned long __init free_all_bootmem(void) |
| 225 | { | 311 | { |
| 312 | #ifdef CONFIG_NO_BOOTMEM | ||
| 313 | return free_all_memory_core_early(NODE_DATA(0)->node_id); | ||
| 314 | #else | ||
| 226 | return free_all_bootmem_core(NODE_DATA(0)->bdata); | 315 | return free_all_bootmem_core(NODE_DATA(0)->bdata); |
| 316 | #endif | ||
| 227 | } | 317 | } |
| 228 | 318 | ||
| 319 | #ifndef CONFIG_NO_BOOTMEM | ||
| 229 | static void __init __free(bootmem_data_t *bdata, | 320 | static void __init __free(bootmem_data_t *bdata, |
| 230 | unsigned long sidx, unsigned long eidx) | 321 | unsigned long sidx, unsigned long eidx) |
| 231 | { | 322 | { |
| @@ -320,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
| 320 | } | 411 | } |
| 321 | BUG(); | 412 | BUG(); |
| 322 | } | 413 | } |
| 414 | #endif | ||
| 323 | 415 | ||
| 324 | /** | 416 | /** |
| 325 | * free_bootmem_node - mark a page range as usable | 417 | * free_bootmem_node - mark a page range as usable |
| @@ -334,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
| 334 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 426 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
| 335 | unsigned long size) | 427 | unsigned long size) |
| 336 | { | 428 | { |
| 429 | #ifdef CONFIG_NO_BOOTMEM | ||
| 430 | free_early(physaddr, physaddr + size); | ||
| 431 | #if 0 | ||
| 432 | printk(KERN_DEBUG "free %lx %lx\n", physaddr, size); | ||
| 433 | #endif | ||
| 434 | #else | ||
| 337 | unsigned long start, end; | 435 | unsigned long start, end; |
| 338 | 436 | ||
| 339 | kmemleak_free_part(__va(physaddr), size); | 437 | kmemleak_free_part(__va(physaddr), size); |
| @@ -342,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
| 342 | end = PFN_DOWN(physaddr + size); | 440 | end = PFN_DOWN(physaddr + size); |
| 343 | 441 | ||
| 344 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); | 442 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); |
| 443 | #endif | ||
| 345 | } | 444 | } |
| 346 | 445 | ||
| 347 | /** | 446 | /** |
| @@ -355,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
| 355 | */ | 454 | */ |
| 356 | void __init free_bootmem(unsigned long addr, unsigned long size) | 455 | void __init free_bootmem(unsigned long addr, unsigned long size) |
| 357 | { | 456 | { |
| 457 | #ifdef CONFIG_NO_BOOTMEM | ||
| 458 | free_early(addr, addr + size); | ||
| 459 | #if 0 | ||
| 460 | printk(KERN_DEBUG "free %lx %lx\n", addr, size); | ||
| 461 | #endif | ||
| 462 | #else | ||
| 358 | unsigned long start, end; | 463 | unsigned long start, end; |
| 359 | 464 | ||
| 360 | kmemleak_free_part(__va(addr), size); | 465 | kmemleak_free_part(__va(addr), size); |
| @@ -363,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
| 363 | end = PFN_DOWN(addr + size); | 468 | end = PFN_DOWN(addr + size); |
| 364 | 469 | ||
| 365 | mark_bootmem(start, end, 0, 0); | 470 | mark_bootmem(start, end, 0, 0); |
| 471 | #endif | ||
| 366 | } | 472 | } |
| 367 | 473 | ||
| 368 | /** | 474 | /** |
| @@ -379,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
| 379 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 485 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
| 380 | unsigned long size, int flags) | 486 | unsigned long size, int flags) |
| 381 | { | 487 | { |
| 488 | #ifdef CONFIG_NO_BOOTMEM | ||
| 489 | panic("no bootmem"); | ||
| 490 | return 0; | ||
| 491 | #else | ||
| 382 | unsigned long start, end; | 492 | unsigned long start, end; |
| 383 | 493 | ||
| 384 | start = PFN_DOWN(physaddr); | 494 | start = PFN_DOWN(physaddr); |
| 385 | end = PFN_UP(physaddr + size); | 495 | end = PFN_UP(physaddr + size); |
| 386 | 496 | ||
| 387 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); | 497 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); |
| 498 | #endif | ||
| 388 | } | 499 | } |
| 389 | 500 | ||
| 390 | /** | 501 | /** |
| @@ -400,16 +511,22 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
| 400 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 511 | int __init reserve_bootmem(unsigned long addr, unsigned long size, |
| 401 | int flags) | 512 | int flags) |
| 402 | { | 513 | { |
| 514 | #ifdef CONFIG_NO_BOOTMEM | ||
| 515 | panic("no bootmem"); | ||
| 516 | return 0; | ||
| 517 | #else | ||
| 403 | unsigned long start, end; | 518 | unsigned long start, end; |
| 404 | 519 | ||
| 405 | start = PFN_DOWN(addr); | 520 | start = PFN_DOWN(addr); |
| 406 | end = PFN_UP(addr + size); | 521 | end = PFN_UP(addr + size); |
| 407 | 522 | ||
| 408 | return mark_bootmem(start, end, 1, flags); | 523 | return mark_bootmem(start, end, 1, flags); |
| 524 | #endif | ||
| 409 | } | 525 | } |
| 410 | 526 | ||
| 411 | static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | 527 | #ifndef CONFIG_NO_BOOTMEM |
| 412 | unsigned long step) | 528 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
| 529 | unsigned long idx, unsigned long step) | ||
| 413 | { | 530 | { |
| 414 | unsigned long base = bdata->node_min_pfn; | 531 | unsigned long base = bdata->node_min_pfn; |
| 415 | 532 | ||
| @@ -421,8 +538,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, | |||
| 421 | return ALIGN(base + idx, step) - base; | 538 | return ALIGN(base + idx, step) - base; |
| 422 | } | 539 | } |
| 423 | 540 | ||
| 424 | static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, | 541 | static unsigned long __init align_off(struct bootmem_data *bdata, |
| 425 | unsigned long align) | 542 | unsigned long off, unsigned long align) |
| 426 | { | 543 | { |
| 427 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); | 544 | unsigned long base = PFN_PHYS(bdata->node_min_pfn); |
| 428 | 545 | ||
| @@ -558,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
| 558 | #endif | 675 | #endif |
| 559 | return NULL; | 676 | return NULL; |
| 560 | } | 677 | } |
| 678 | #endif | ||
| 561 | 679 | ||
| 562 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 680 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, |
| 563 | unsigned long align, | 681 | unsigned long align, |
| 564 | unsigned long goal, | 682 | unsigned long goal, |
| 565 | unsigned long limit) | 683 | unsigned long limit) |
| 566 | { | 684 | { |
| 685 | #ifdef CONFIG_NO_BOOTMEM | ||
| 686 | void *ptr; | ||
| 687 | |||
| 688 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 689 | return kzalloc(size, GFP_NOWAIT); | ||
| 690 | |||
| 691 | restart: | ||
| 692 | |||
| 693 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | ||
| 694 | |||
| 695 | if (ptr) | ||
| 696 | return ptr; | ||
| 697 | |||
| 698 | if (goal != 0) { | ||
| 699 | goal = 0; | ||
| 700 | goto restart; | ||
| 701 | } | ||
| 702 | |||
| 703 | return NULL; | ||
| 704 | #else | ||
| 567 | bootmem_data_t *bdata; | 705 | bootmem_data_t *bdata; |
| 568 | void *region; | 706 | void *region; |
| 569 | 707 | ||
| @@ -589,6 +727,7 @@ restart: | |||
| 589 | } | 727 | } |
| 590 | 728 | ||
| 591 | return NULL; | 729 | return NULL; |
| 730 | #endif | ||
| 592 | } | 731 | } |
| 593 | 732 | ||
| 594 | /** | 733 | /** |
| @@ -607,7 +746,13 @@ restart: | |||
| 607 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | 746 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, |
| 608 | unsigned long goal) | 747 | unsigned long goal) |
| 609 | { | 748 | { |
| 610 | return ___alloc_bootmem_nopanic(size, align, goal, 0); | 749 | unsigned long limit = 0; |
| 750 | |||
| 751 | #ifdef CONFIG_NO_BOOTMEM | ||
| 752 | limit = -1UL; | ||
| 753 | #endif | ||
| 754 | |||
| 755 | return ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
| 611 | } | 756 | } |
| 612 | 757 | ||
| 613 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | 758 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, |
| @@ -641,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | |||
| 641 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | 786 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, |
| 642 | unsigned long goal) | 787 | unsigned long goal) |
| 643 | { | 788 | { |
| 644 | return ___alloc_bootmem(size, align, goal, 0); | 789 | unsigned long limit = 0; |
| 790 | |||
| 791 | #ifdef CONFIG_NO_BOOTMEM | ||
| 792 | limit = -1UL; | ||
| 793 | #endif | ||
| 794 | |||
| 795 | return ___alloc_bootmem(size, align, goal, limit); | ||
| 645 | } | 796 | } |
| 646 | 797 | ||
| 798 | #ifndef CONFIG_NO_BOOTMEM | ||
| 647 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 799 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, |
| 648 | unsigned long size, unsigned long align, | 800 | unsigned long size, unsigned long align, |
| 649 | unsigned long goal, unsigned long limit) | 801 | unsigned long goal, unsigned long limit) |
| @@ -660,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
| 660 | 812 | ||
| 661 | return ___alloc_bootmem(size, align, goal, limit); | 813 | return ___alloc_bootmem(size, align, goal, limit); |
| 662 | } | 814 | } |
| 815 | #endif | ||
| 663 | 816 | ||
| 664 | /** | 817 | /** |
| 665 | * __alloc_bootmem_node - allocate boot memory from a specific node | 818 | * __alloc_bootmem_node - allocate boot memory from a specific node |
| @@ -682,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
| 682 | if (WARN_ON_ONCE(slab_is_available())) | 835 | if (WARN_ON_ONCE(slab_is_available())) |
| 683 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 836 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
| 684 | 837 | ||
| 838 | #ifdef CONFIG_NO_BOOTMEM | ||
| 839 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 840 | goal, -1ULL); | ||
| 841 | #else | ||
| 685 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 842 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
| 843 | #endif | ||
| 844 | } | ||
| 845 | |||
| 846 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | ||
| 847 | unsigned long align, unsigned long goal) | ||
| 848 | { | ||
| 849 | #ifdef MAX_DMA32_PFN | ||
| 850 | unsigned long end_pfn; | ||
| 851 | |||
| 852 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 853 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
| 854 | |||
| 855 | /* update goal according ...MAX_DMA32_PFN */ | ||
| 856 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
| 857 | |||
| 858 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | ||
| 859 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | ||
| 860 | void *ptr; | ||
| 861 | unsigned long new_goal; | ||
| 862 | |||
| 863 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | ||
| 864 | #ifdef CONFIG_NO_BOOTMEM | ||
| 865 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 866 | new_goal, -1ULL); | ||
| 867 | #else | ||
| 868 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | ||
| 869 | new_goal, 0); | ||
| 870 | #endif | ||
| 871 | if (ptr) | ||
| 872 | return ptr; | ||
| 873 | } | ||
| 874 | #endif | ||
| 875 | |||
| 876 | return __alloc_bootmem_node(pgdat, size, align, goal); | ||
| 877 | |||
| 686 | } | 878 | } |
| 687 | 879 | ||
| 688 | #ifdef CONFIG_SPARSEMEM | 880 | #ifdef CONFIG_SPARSEMEM |
| @@ -696,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
| 696 | void * __init alloc_bootmem_section(unsigned long size, | 888 | void * __init alloc_bootmem_section(unsigned long size, |
| 697 | unsigned long section_nr) | 889 | unsigned long section_nr) |
| 698 | { | 890 | { |
| 891 | #ifdef CONFIG_NO_BOOTMEM | ||
| 892 | unsigned long pfn, goal, limit; | ||
| 893 | |||
| 894 | pfn = section_nr_to_pfn(section_nr); | ||
| 895 | goal = pfn << PAGE_SHIFT; | ||
| 896 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
| 897 | |||
| 898 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | ||
| 899 | SMP_CACHE_BYTES, goal, limit); | ||
| 900 | #else | ||
| 699 | bootmem_data_t *bdata; | 901 | bootmem_data_t *bdata; |
| 700 | unsigned long pfn, goal, limit; | 902 | unsigned long pfn, goal, limit; |
| 701 | 903 | ||
| @@ -705,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
| 705 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 907 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
| 706 | 908 | ||
| 707 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 909 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); |
| 910 | #endif | ||
| 708 | } | 911 | } |
| 709 | #endif | 912 | #endif |
| 710 | 913 | ||
| @@ -716,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | |||
| 716 | if (WARN_ON_ONCE(slab_is_available())) | 919 | if (WARN_ON_ONCE(slab_is_available())) |
| 717 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 920 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
| 718 | 921 | ||
| 922 | #ifdef CONFIG_NO_BOOTMEM | ||
| 923 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 924 | goal, -1ULL); | ||
| 925 | #else | ||
| 719 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 926 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
| 720 | if (ptr) | 927 | if (ptr) |
| 721 | return ptr; | 928 | return ptr; |
| 722 | 929 | ||
| 723 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 930 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
| 931 | #endif | ||
| 724 | if (ptr) | 932 | if (ptr) |
| 725 | return ptr; | 933 | return ptr; |
| 726 | 934 | ||
| @@ -771,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | |||
| 771 | if (WARN_ON_ONCE(slab_is_available())) | 979 | if (WARN_ON_ONCE(slab_is_available())) |
| 772 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 980 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
| 773 | 981 | ||
| 982 | #ifdef CONFIG_NO_BOOTMEM | ||
| 983 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
| 984 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
| 985 | #else | ||
| 774 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 986 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
| 775 | goal, ARCH_LOW_ADDRESS_LIMIT); | 987 | goal, ARCH_LOW_ADDRESS_LIMIT); |
| 988 | #endif | ||
| 776 | } | 989 | } |
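free_bootmem_late(), added above, covers ranges that only become free after the bootmem allocator has been torn down; the pages are fed straight to the buddy allocator. A hypothetical early-boot caller (the variable names and the idea of a firmware scratch buffer are invented):

```c
#include <linux/bootmem.h>

static unsigned long fw_buf_phys;	/* hypothetical physical address */
static unsigned long fw_buf_size;	/* hypothetical size in bytes */

static void __init demo_release_fw_buffer(void)
{
	/*
	 * No bootmem bitmap is updated here because, by this point in boot,
	 * there is none left; partial pages at either end are skipped.
	 */
	free_bootmem_late(fw_buf_phys, fw_buf_size);
}
```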
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
| @@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
| 77 | switch (advice) { | 77 | switch (advice) { |
| 78 | case POSIX_FADV_NORMAL: | 78 | case POSIX_FADV_NORMAL: |
| 79 | file->f_ra.ra_pages = bdi->ra_pages; | 79 | file->f_ra.ra_pages = bdi->ra_pages; |
| 80 | spin_lock(&file->f_lock); | ||
| 81 | file->f_mode &= ~FMODE_RANDOM; | ||
| 82 | spin_unlock(&file->f_lock); | ||
| 80 | break; | 83 | break; |
| 81 | case POSIX_FADV_RANDOM: | 84 | case POSIX_FADV_RANDOM: |
| 82 | file->f_ra.ra_pages = 0; | 85 | spin_lock(&file->f_lock); |
| 86 | file->f_mode |= FMODE_RANDOM; | ||
| 87 | spin_unlock(&file->f_lock); | ||
| 83 | break; | 88 | break; |
| 84 | case POSIX_FADV_SEQUENTIAL: | 89 | case POSIX_FADV_SEQUENTIAL: |
| 85 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 90 | file->f_ra.ra_pages = bdi->ra_pages * 2; |
| 91 | spin_lock(&file->f_lock); | ||
| 92 | file->f_mode &= ~FMODE_RANDOM; | ||
| 93 | spin_unlock(&file->f_lock); | ||
| 86 | break; | 94 | break; |
| 87 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
| 88 | if (!mapping->a_ops->readpage) { | 96 | if (!mapping->a_ops->readpage) { |
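The userspace trigger for the hunk above stays the same; what changes is the kernel-side bookkeeping, from zeroing the readahead window to setting or clearing FMODE_RANDOM under f_lock. A minimal userspace illustration of the call whose behaviour this affects:

```c
#include <fcntl.h>

/* Advise the kernel that fd will be read in random order; with this patch
 * that sets FMODE_RANDOM on the struct file rather than forcing ra_pages
 * to zero. An offset/len of 0/0 covers the whole file. */
static int hint_random_access(int fd)
{
	return posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
}
```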
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..bb41f98dd8b7 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
| @@ -1,18 +1,22 @@ | |||
| 1 | #include <linux/fault-inject.h> | 1 | #include <linux/fault-inject.h> |
| 2 | #include <linux/gfp.h> | 2 | #include <linux/gfp.h> |
| 3 | #include <linux/slab.h> | ||
| 3 | 4 | ||
| 4 | static struct { | 5 | static struct { |
| 5 | struct fault_attr attr; | 6 | struct fault_attr attr; |
| 6 | u32 ignore_gfp_wait; | 7 | u32 ignore_gfp_wait; |
| 8 | int cache_filter; | ||
| 7 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 9 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
| 8 | struct dentry *ignore_gfp_wait_file; | 10 | struct dentry *ignore_gfp_wait_file; |
| 11 | struct dentry *cache_filter_file; | ||
| 9 | #endif | 12 | #endif |
| 10 | } failslab = { | 13 | } failslab = { |
| 11 | .attr = FAULT_ATTR_INITIALIZER, | 14 | .attr = FAULT_ATTR_INITIALIZER, |
| 12 | .ignore_gfp_wait = 1, | 15 | .ignore_gfp_wait = 1, |
| 16 | .cache_filter = 0, | ||
| 13 | }; | 17 | }; |
| 14 | 18 | ||
| 15 | bool should_failslab(size_t size, gfp_t gfpflags) | 19 | bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) |
| 16 | { | 20 | { |
| 17 | if (gfpflags & __GFP_NOFAIL) | 21 | if (gfpflags & __GFP_NOFAIL) |
| 18 | return false; | 22 | return false; |
| @@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) | |||
| 20 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) | 24 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) |
| 21 | return false; | 25 | return false; |
| 22 | 26 | ||
| 27 | if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) | ||
| 28 | return false; | ||
| 29 | |||
| 23 | return should_fail(&failslab.attr, size); | 30 | return should_fail(&failslab.attr, size); |
| 24 | } | 31 | } |
| 25 | 32 | ||
| @@ -30,7 +37,6 @@ static int __init setup_failslab(char *str) | |||
| 30 | __setup("failslab=", setup_failslab); | 37 | __setup("failslab=", setup_failslab); |
| 31 | 38 | ||
| 32 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 39 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
| 33 | |||
| 34 | static int __init failslab_debugfs_init(void) | 40 | static int __init failslab_debugfs_init(void) |
| 35 | { | 41 | { |
| 36 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 42 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
| @@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void) | |||
| 46 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 52 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
| 47 | &failslab.ignore_gfp_wait); | 53 | &failslab.ignore_gfp_wait); |
| 48 | 54 | ||
| 49 | if (!failslab.ignore_gfp_wait_file) { | 55 | failslab.cache_filter_file = |
| 56 | debugfs_create_bool("cache-filter", mode, dir, | ||
| 57 | &failslab.cache_filter); | ||
| 58 | |||
| 59 | if (!failslab.ignore_gfp_wait_file || | ||
| 60 | !failslab.cache_filter_file) { | ||
| 50 | err = -ENOMEM; | 61 | err = -ENOMEM; |
| 62 | debugfs_remove(failslab.cache_filter_file); | ||
| 51 | debugfs_remove(failslab.ignore_gfp_wait_file); | 63 | debugfs_remove(failslab.ignore_gfp_wait_file); |
| 52 | cleanup_fault_attr_dentries(&failslab.attr); | 64 | cleanup_fault_attr_dentries(&failslab.attr); |
| 53 | } | 65 | } |
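The new cache-filter knob narrows failslab to caches that explicitly opt in: should_failslab() now skips any cache created without SLAB_FAILSLAB when the filter is enabled. A hedged sketch of an opted-in cache (the cache name and object size are made up):

```c
#include <linux/slab.h>

static struct kmem_cache *demo_cachep;

static int __init demo_cache_init(void)
{
	/* SLAB_FAILSLAB marks this cache as a fault-injection target when
	 * failslab's cache-filter debugfs flag is set. */
	demo_cachep = kmem_cache_create("demo_cache", 256, 0,
					SLAB_FAILSLAB, NULL);
	return demo_cachep ? 0 : -ENOMEM;
}
```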
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..045b31c37653 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping) | |||
| 260 | EXPORT_SYMBOL(filemap_flush); | 260 | EXPORT_SYMBOL(filemap_flush); |
| 261 | 261 | ||
| 262 | /** | 262 | /** |
| 263 | * wait_on_page_writeback_range - wait for writeback to complete | 263 | * filemap_fdatawait_range - wait for writeback to complete |
| 264 | * @mapping: target address_space | 264 | * @mapping: address space structure to wait for |
| 265 | * @start: beginning page index | 265 | * @start_byte: offset in bytes where the range starts |
| 266 | * @end: ending page index | 266 | * @end_byte: offset in bytes where the range ends (inclusive) |
| 267 | * | 267 | * |
| 268 | * Wait for writeback to complete against pages indexed by start->end | 268 | * Walk the list of under-writeback pages of the given address space |
| 269 | * inclusive | 269 | * in the given range and wait for all of them. |
| 270 | */ | 270 | */ |
| 271 | int wait_on_page_writeback_range(struct address_space *mapping, | 271 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, |
| 272 | pgoff_t start, pgoff_t end) | 272 | loff_t end_byte) |
| 273 | { | 273 | { |
| 274 | pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; | ||
| 275 | pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; | ||
| 274 | struct pagevec pvec; | 276 | struct pagevec pvec; |
| 275 | int nr_pages; | 277 | int nr_pages; |
| 276 | int ret = 0; | 278 | int ret = 0; |
| 277 | pgoff_t index; | ||
| 278 | 279 | ||
| 279 | if (end < start) | 280 | if (end_byte < start_byte) |
| 280 | return 0; | 281 | return 0; |
| 281 | 282 | ||
| 282 | pagevec_init(&pvec, 0); | 283 | pagevec_init(&pvec, 0); |
| 283 | index = start; | ||
| 284 | while ((index <= end) && | 284 | while ((index <= end) && |
| 285 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, | 285 | (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, |
| 286 | PAGECACHE_TAG_WRITEBACK, | 286 | PAGECACHE_TAG_WRITEBACK, |
| @@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping, | |||
| 310 | 310 | ||
| 311 | return ret; | 311 | return ret; |
| 312 | } | 312 | } |
| 313 | |||
| 314 | /** | ||
| 315 | * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range | ||
| 316 | * @mapping: address space structure to wait for | ||
| 317 | * @start: offset in bytes where the range starts | ||
| 318 | * @end: offset in bytes where the range ends (inclusive) | ||
| 319 | * | ||
| 320 | * Walk the list of under-writeback pages of the given address space | ||
| 321 | * in the given range and wait for all of them. | ||
| 322 | * | ||
| 323 | * This is just a simple wrapper so that callers don't have to convert offsets | ||
| 324 | * to page indexes themselves | ||
| 325 | */ | ||
| 326 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start, | ||
| 327 | loff_t end) | ||
| 328 | { | ||
| 329 | return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, | ||
| 330 | end >> PAGE_CACHE_SHIFT); | ||
| 331 | } | ||
| 332 | EXPORT_SYMBOL(filemap_fdatawait_range); | 313 | EXPORT_SYMBOL(filemap_fdatawait_range); |
| 333 | 314 | ||
| 334 | /** | 315 | /** |
| @@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping) | |||
| 345 | if (i_size == 0) | 326 | if (i_size == 0) |
| 346 | return 0; | 327 | return 0; |
| 347 | 328 | ||
| 348 | return wait_on_page_writeback_range(mapping, 0, | 329 | return filemap_fdatawait_range(mapping, 0, i_size - 1); |
| 349 | (i_size - 1) >> PAGE_CACHE_SHIFT); | ||
| 350 | } | 330 | } |
| 351 | EXPORT_SYMBOL(filemap_fdatawait); | 331 | EXPORT_SYMBOL(filemap_fdatawait); |
| 352 | 332 | ||
| @@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
| 393 | WB_SYNC_ALL); | 373 | WB_SYNC_ALL); |
| 394 | /* See comment of filemap_write_and_wait() */ | 374 | /* See comment of filemap_write_and_wait() */ |
| 395 | if (err != -EIO) { | 375 | if (err != -EIO) { |
| 396 | int err2 = wait_on_page_writeback_range(mapping, | 376 | int err2 = filemap_fdatawait_range(mapping, |
| 397 | lstart >> PAGE_CACHE_SHIFT, | 377 | lstart, lend); |
| 398 | lend >> PAGE_CACHE_SHIFT); | ||
| 399 | if (!err) | 378 | if (!err) |
| 400 | err = err2; | 379 | err = err2; |
| 401 | } | 380 | } |
| @@ -1138,7 +1117,7 @@ readpage: | |||
| 1138 | if (!PageUptodate(page)) { | 1117 | if (!PageUptodate(page)) { |
| 1139 | if (page->mapping == NULL) { | 1118 | if (page->mapping == NULL) { |
| 1140 | /* | 1119 | /* |
| 1141 | * invalidate_inode_pages got it | 1120 | * invalidate_mapping_pages got it |
| 1142 | */ | 1121 | */ |
| 1143 | unlock_page(page); | 1122 | unlock_page(page); |
| 1144 | page_cache_release(page); | 1123 | page_cache_release(page); |
| @@ -1655,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); | |||
| 1655 | static struct page *__read_cache_page(struct address_space *mapping, | 1634 | static struct page *__read_cache_page(struct address_space *mapping, |
| 1656 | pgoff_t index, | 1635 | pgoff_t index, |
| 1657 | int (*filler)(void *,struct page*), | 1636 | int (*filler)(void *,struct page*), |
| 1658 | void *data) | 1637 | void *data, |
| 1638 | gfp_t gfp) | ||
| 1659 | { | 1639 | { |
| 1660 | struct page *page; | 1640 | struct page *page; |
| 1661 | int err; | 1641 | int err; |
| 1662 | repeat: | 1642 | repeat: |
| 1663 | page = find_get_page(mapping, index); | 1643 | page = find_get_page(mapping, index); |
| 1664 | if (!page) { | 1644 | if (!page) { |
| 1665 | page = page_cache_alloc_cold(mapping); | 1645 | page = __page_cache_alloc(gfp | __GFP_COLD); |
| 1666 | if (!page) | 1646 | if (!page) |
| 1667 | return ERR_PTR(-ENOMEM); | 1647 | return ERR_PTR(-ENOMEM); |
| 1668 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); | 1648 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
| @@ -1682,31 +1662,18 @@ repeat: | |||
| 1682 | return page; | 1662 | return page; |
| 1683 | } | 1663 | } |
| 1684 | 1664 | ||
| 1685 | /** | 1665 | static struct page *do_read_cache_page(struct address_space *mapping, |
| 1686 | * read_cache_page_async - read into page cache, fill it if needed | ||
| 1687 | * @mapping: the page's address_space | ||
| 1688 | * @index: the page index | ||
| 1689 | * @filler: function to perform the read | ||
| 1690 | * @data: destination for read data | ||
| 1691 | * | ||
| 1692 | * Same as read_cache_page, but don't wait for page to become unlocked | ||
| 1693 | * after submitting it to the filler. | ||
| 1694 | * | ||
| 1695 | * Read into the page cache. If a page already exists, and PageUptodate() is | ||
| 1696 | * not set, try to fill the page but don't wait for it to become unlocked. | ||
| 1697 | * | ||
| 1698 | * If the page does not get brought uptodate, return -EIO. | ||
| 1699 | */ | ||
| 1700 | struct page *read_cache_page_async(struct address_space *mapping, | ||
| 1701 | pgoff_t index, | 1666 | pgoff_t index, |
| 1702 | int (*filler)(void *,struct page*), | 1667 | int (*filler)(void *,struct page*), |
| 1703 | void *data) | 1668 | void *data, |
| 1669 | gfp_t gfp) | ||
| 1670 | |||
| 1704 | { | 1671 | { |
| 1705 | struct page *page; | 1672 | struct page *page; |
| 1706 | int err; | 1673 | int err; |
| 1707 | 1674 | ||
| 1708 | retry: | 1675 | retry: |
| 1709 | page = __read_cache_page(mapping, index, filler, data); | 1676 | page = __read_cache_page(mapping, index, filler, data, gfp); |
| 1710 | if (IS_ERR(page)) | 1677 | if (IS_ERR(page)) |
| 1711 | return page; | 1678 | return page; |
| 1712 | if (PageUptodate(page)) | 1679 | if (PageUptodate(page)) |
| @@ -1731,8 +1698,67 @@ out: | |||
| 1731 | mark_page_accessed(page); | 1698 | mark_page_accessed(page); |
| 1732 | return page; | 1699 | return page; |
| 1733 | } | 1700 | } |
| 1701 | |||
| 1702 | /** | ||
| 1703 | * read_cache_page_async - read into page cache, fill it if needed | ||
| 1704 | * @mapping: the page's address_space | ||
| 1705 | * @index: the page index | ||
| 1706 | * @filler: function to perform the read | ||
| 1707 | * @data: destination for read data | ||
| 1708 | * | ||
| 1709 | * Same as read_cache_page, but don't wait for page to become unlocked | ||
| 1710 | * after submitting it to the filler. | ||
| 1711 | * | ||
| 1712 | * Read into the page cache. If a page already exists, and PageUptodate() is | ||
| 1713 | * not set, try to fill the page but don't wait for it to become unlocked. | ||
| 1714 | * | ||
| 1715 | * If the page does not get brought uptodate, return -EIO. | ||
| 1716 | */ | ||
| 1717 | struct page *read_cache_page_async(struct address_space *mapping, | ||
| 1718 | pgoff_t index, | ||
| 1719 | int (*filler)(void *,struct page*), | ||
| 1720 | void *data) | ||
| 1721 | { | ||
| 1722 | return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); | ||
| 1723 | } | ||
| 1734 | EXPORT_SYMBOL(read_cache_page_async); | 1724 | EXPORT_SYMBOL(read_cache_page_async); |
| 1735 | 1725 | ||
| 1726 | static struct page *wait_on_page_read(struct page *page) | ||
| 1727 | { | ||
| 1728 | if (!IS_ERR(page)) { | ||
| 1729 | wait_on_page_locked(page); | ||
| 1730 | if (!PageUptodate(page)) { | ||
| 1731 | page_cache_release(page); | ||
| 1732 | page = ERR_PTR(-EIO); | ||
| 1733 | } | ||
| 1734 | } | ||
| 1735 | return page; | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | /** | ||
| 1739 | * read_cache_page_gfp - read into page cache, using specified page allocation flags. | ||
| 1740 | * @mapping: the page's address_space | ||
| 1741 | * @index: the page index | ||
| 1742 | * @gfp: the page allocator flags to use if allocating | ||
| 1743 | * | ||
| 1744 | * This is the same as "read_mapping_page(mapping, index, NULL)", but with | ||
| 1745 | * any new page allocations done using the specified allocation flags. Note | ||
| 1746 | * that the Radix tree operations will still use GFP_KERNEL, so you can't | ||
| 1747 | * expect to do this atomically or anything like that - but you can pass in | ||
| 1748 | * other page requirements. | ||
| 1749 | * | ||
| 1750 | * If the page does not get brought uptodate, return -EIO. | ||
| 1751 | */ | ||
| 1752 | struct page *read_cache_page_gfp(struct address_space *mapping, | ||
| 1753 | pgoff_t index, | ||
| 1754 | gfp_t gfp) | ||
| 1755 | { | ||
| 1756 | filler_t *filler = (filler_t *)mapping->a_ops->readpage; | ||
| 1757 | |||
| 1758 | return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); | ||
| 1759 | } | ||
| 1760 | EXPORT_SYMBOL(read_cache_page_gfp); | ||
| 1761 | |||
| 1736 | /** | 1762 | /** |
| 1737 | * read_cache_page - read into page cache, fill it if needed | 1763 | * read_cache_page - read into page cache, fill it if needed |
| 1738 | * @mapping: the page's address_space | 1764 | * @mapping: the page's address_space |
| @@ -1750,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping, | |||
| 1750 | int (*filler)(void *,struct page*), | 1776 | int (*filler)(void *,struct page*), |
| 1751 | void *data) | 1777 | void *data) |
| 1752 | { | 1778 | { |
| 1753 | struct page *page; | 1779 | return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); |
| 1754 | |||
| 1755 | page = read_cache_page_async(mapping, index, filler, data); | ||
| 1756 | if (IS_ERR(page)) | ||
| 1757 | goto out; | ||
| 1758 | wait_on_page_locked(page); | ||
| 1759 | if (!PageUptodate(page)) { | ||
| 1760 | page_cache_release(page); | ||
| 1761 | page = ERR_PTR(-EIO); | ||
| 1762 | } | ||
| 1763 | out: | ||
| 1764 | return page; | ||
| 1765 | } | 1780 | } |
| 1766 | EXPORT_SYMBOL(read_cache_page); | 1781 | EXPORT_SYMBOL(read_cache_page); |
| 1767 | 1782 | ||
| @@ -1844,7 +1859,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr, | |||
| 1844 | 1859 | ||
| 1845 | /* | 1860 | /* |
| 1846 | * Copy as much as we can into the page and return the number of bytes which | 1861 | * Copy as much as we can into the page and return the number of bytes which |
| 1847 | * were sucessfully copied. If a fault is encountered then return the number of | 1862 | * were successfully copied. If a fault is encountered then return the number of |
| 1848 | * bytes which were copied. | 1863 | * bytes which were copied. |
| 1849 | */ | 1864 | */ |
| 1850 | size_t iov_iter_copy_from_user_atomic(struct page *page, | 1865 | size_t iov_iter_copy_from_user_atomic(struct page *page, |
| @@ -1971,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); | |||
| 1971 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) | 1986 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) |
| 1972 | { | 1987 | { |
| 1973 | struct inode *inode = file->f_mapping->host; | 1988 | struct inode *inode = file->f_mapping->host; |
| 1974 | unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 1989 | unsigned long limit = rlimit(RLIMIT_FSIZE); |
| 1975 | 1990 | ||
| 1976 | if (unlikely(*pos < 0)) | 1991 | if (unlikely(*pos < 0)) |
| 1977 | return -EINVAL; | 1992 | return -EINVAL; |
| @@ -2217,6 +2232,9 @@ again: | |||
| 2217 | if (unlikely(status)) | 2232 | if (unlikely(status)) |
| 2218 | break; | 2233 | break; |
| 2219 | 2234 | ||
| 2235 | if (mapping_writably_mapped(mapping)) | ||
| 2236 | flush_dcache_page(page); | ||
| 2237 | |||
| 2220 | pagefault_disable(); | 2238 | pagefault_disable(); |
| 2221 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | 2239 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); |
| 2222 | pagefault_enable(); | 2240 | pagefault_enable(); |
| @@ -2261,7 +2279,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2261 | size_t count, ssize_t written) | 2279 | size_t count, ssize_t written) |
| 2262 | { | 2280 | { |
| 2263 | struct file *file = iocb->ki_filp; | 2281 | struct file *file = iocb->ki_filp; |
| 2264 | struct address_space *mapping = file->f_mapping; | ||
| 2265 | ssize_t status; | 2282 | ssize_t status; |
| 2266 | struct iov_iter i; | 2283 | struct iov_iter i; |
| 2267 | 2284 | ||
| @@ -2273,15 +2290,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2273 | *ppos = pos + status; | 2290 | *ppos = pos + status; |
| 2274 | } | 2291 | } |
| 2275 | 2292 | ||
| 2276 | /* | ||
| 2277 | * If we get here for O_DIRECT writes then we must have fallen through | ||
| 2278 | * to buffered writes (block instantiation inside i_size). So we sync | ||
| 2279 | * the file data here, to try to honour O_DIRECT expectations. | ||
| 2280 | */ | ||
| 2281 | if (unlikely(file->f_flags & O_DIRECT) && written) | ||
| 2282 | status = filemap_write_and_wait_range(mapping, | ||
| 2283 | pos, pos + written - 1); | ||
| 2284 | |||
| 2285 | return written ? written : status; | 2293 | return written ? written : status; |
| 2286 | } | 2294 | } |
| 2287 | EXPORT_SYMBOL(generic_file_buffered_write); | 2295 | EXPORT_SYMBOL(generic_file_buffered_write); |
| @@ -2380,10 +2388,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2380 | * semantics. | 2388 | * semantics. |
| 2381 | */ | 2389 | */ |
| 2382 | endbyte = pos + written_buffered - written - 1; | 2390 | endbyte = pos + written_buffered - written - 1; |
| 2383 | err = do_sync_mapping_range(file->f_mapping, pos, endbyte, | 2391 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); |
| 2384 | SYNC_FILE_RANGE_WAIT_BEFORE| | ||
| 2385 | SYNC_FILE_RANGE_WRITE| | ||
| 2386 | SYNC_FILE_RANGE_WAIT_AFTER); | ||
| 2387 | if (err == 0) { | 2392 | if (err == 0) { |
| 2388 | written = written_buffered; | 2393 | written = written_buffered; |
| 2389 | invalidate_mapping_pages(mapping, | 2394 | invalidate_mapping_pages(mapping, |
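
The replacement of do_sync_mapping_range(..., SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER) with filemap_write_and_wait_range() keeps the same "write this byte range back and wait for it" semantics in one call. For readers who know the interface only from userspace, that flag combination corresponds to the sync_file_range(2) syscall; a hedged sketch (the file name and sizes below are made up):

/* Userspace sketch of the "write back this range and wait" semantics that
 * the removed kernel call requested. Assumes Linux with sync_file_range(2);
 * error handling is trimmed for brevity. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int flush_range(int fd, off_t pos, off_t len)
{
        return sync_file_range(fd, pos, len,
                               SYNC_FILE_RANGE_WAIT_BEFORE |
                               SYNC_FILE_RANGE_WRITE |
                               SYNC_FILE_RANGE_WAIT_AFTER);
}

int main(void)
{
        int fd = open("testfile", O_RDWR | O_CREAT, 0644);

        if (fd < 0 || write(fd, "data", 4) != 4)
                return 1;
        if (flush_range(fd, 0, 4) != 0)
                perror("sync_file_range");
        close(fd);
        return 0;
}
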
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 1888b2d71bb8..78b94f0b6d5d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
| @@ -194,7 +194,7 @@ retry: | |||
| 194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
| 195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
| 196 | page_remove_rmap(page); | 196 | page_remove_rmap(page); |
| 197 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, MM_FILEPAGES); |
| 198 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
| 199 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
| 200 | page_cache_release(page); | 200 | page_cache_release(page); |
diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85abbb39..46f5dacf90a2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c | |||
| @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 40 | page_remove_rmap(page); | 40 | page_remove_rmap(page); |
| 41 | page_cache_release(page); | 41 | page_cache_release(page); |
| 42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
| 43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, MM_FILEPAGES); |
| 44 | } | 44 | } |
| 45 | } else { | 45 | } else { |
| 46 | if (!pte_file(pte)) | 46 | if (!pte_file(pte)) |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d7601b02874..3a5aeb37c110 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <asm/io.h> | 24 | #include <asm/io.h> |
| 25 | 25 | ||
| 26 | #include <linux/hugetlb.h> | 26 | #include <linux/hugetlb.h> |
| 27 | #include <linux/node.h> | ||
| 27 | #include "internal.h" | 28 | #include "internal.h" |
| 28 | 29 | ||
| 29 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 30 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
| @@ -401,7 +402,7 @@ static void clear_huge_page(struct page *page, | |||
| 401 | { | 402 | { |
| 402 | int i; | 403 | int i; |
| 403 | 404 | ||
| 404 | if (unlikely(sz > MAX_ORDER_NR_PAGES)) { | 405 | if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { |
| 405 | clear_gigantic_page(page, addr, sz); | 406 | clear_gigantic_page(page, addr, sz); |
| 406 | return; | 407 | return; |
| 407 | } | 408 | } |
| @@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
| 622 | } | 623 | } |
| 623 | 624 | ||
| 624 | /* | 625 | /* |
| 625 | * Use a helper variable to find the next node and then | 626 | * common helper functions for hstate_next_node_to_{alloc|free}. |
| 626 | * copy it back to next_nid_to_alloc afterwards: | 627 | * We may have allocated or freed a huge page based on a different |
| 627 | * otherwise there's a window in which a racer might | 628 | * nodes_allowed previously, so h->next_node_to_{alloc|free} might |
| 628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 629 | * be outside of *nodes_allowed. Ensure that we use an allowed |
| 629 | * But we don't need to use a spin_lock here: it really | 630 | * node for alloc or free. |
| 630 | * doesn't matter if occasionally a racer chooses the | ||
| 631 | * same nid as we do. Move nid forward in the mask even | ||
| 632 | * if we just successfully allocated a hugepage so that | ||
| 633 | * the next caller gets hugepages on the next node. | ||
| 634 | */ | 631 | */ |
| 635 | static int hstate_next_node_to_alloc(struct hstate *h) | 632 | static int next_node_allowed(int nid, nodemask_t *nodes_allowed) |
| 636 | { | 633 | { |
| 637 | int next_nid; | 634 | nid = next_node(nid, *nodes_allowed); |
| 638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); | 635 | if (nid == MAX_NUMNODES) |
| 639 | if (next_nid == MAX_NUMNODES) | 636 | nid = first_node(*nodes_allowed); |
| 640 | next_nid = first_node(node_online_map); | 637 | VM_BUG_ON(nid >= MAX_NUMNODES); |
| 641 | h->next_nid_to_alloc = next_nid; | 638 | |
| 642 | return next_nid; | 639 | return nid; |
| 640 | } | ||
| 641 | |||
| 642 | static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) | ||
| 643 | { | ||
| 644 | if (!node_isset(nid, *nodes_allowed)) | ||
| 645 | nid = next_node_allowed(nid, nodes_allowed); | ||
| 646 | return nid; | ||
| 647 | } | ||
| 648 | |||
| 649 | /* | ||
| 650 | * returns the previously saved node ["this node"] from which to | ||
| 651 | * allocate a persistent huge page for the pool and advance the | ||
| 652 | * next node from which to allocate, handling wrap at end of node | ||
| 653 | * mask. | ||
| 654 | */ | ||
| 655 | static int hstate_next_node_to_alloc(struct hstate *h, | ||
| 656 | nodemask_t *nodes_allowed) | ||
| 657 | { | ||
| 658 | int nid; | ||
| 659 | |||
| 660 | VM_BUG_ON(!nodes_allowed); | ||
| 661 | |||
| 662 | nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); | ||
| 663 | h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); | ||
| 664 | |||
| 665 | return nid; | ||
| 643 | } | 666 | } |
| 644 | 667 | ||
| 645 | static int alloc_fresh_huge_page(struct hstate *h) | 668 | static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) |
| 646 | { | 669 | { |
| 647 | struct page *page; | 670 | struct page *page; |
| 648 | int start_nid; | 671 | int start_nid; |
| 649 | int next_nid; | 672 | int next_nid; |
| 650 | int ret = 0; | 673 | int ret = 0; |
| 651 | 674 | ||
| 652 | start_nid = h->next_nid_to_alloc; | 675 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
| 653 | next_nid = start_nid; | 676 | next_nid = start_nid; |
| 654 | 677 | ||
| 655 | do { | 678 | do { |
| 656 | page = alloc_fresh_huge_page_node(h, next_nid); | 679 | page = alloc_fresh_huge_page_node(h, next_nid); |
| 657 | if (page) | 680 | if (page) { |
| 658 | ret = 1; | 681 | ret = 1; |
| 659 | next_nid = hstate_next_node_to_alloc(h); | 682 | break; |
| 660 | } while (!page && next_nid != start_nid); | 683 | } |
| 684 | next_nid = hstate_next_node_to_alloc(h, nodes_allowed); | ||
| 685 | } while (next_nid != start_nid); | ||
| 661 | 686 | ||
| 662 | if (ret) | 687 | if (ret) |
| 663 | count_vm_event(HTLB_BUDDY_PGALLOC); | 688 | count_vm_event(HTLB_BUDDY_PGALLOC); |
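
The helpers added above (next_node_allowed(), get_valid_node_allowed(), hstate_next_node_to_alloc()) amount to a round-robin cursor over an allowed node mask with wrap-around: return the saved "this node", then advance for the next caller. A minimal userspace sketch of that logic, with a plain bool array standing in for nodemask_t and made-up function names:

/* Minimal sketch of round-robin selection over an "allowed" mask with
 * wrap-around, mirroring next_node_allowed()/hstate_next_node_to_alloc().
 * MAX_NODES and the bool array stand in for MAX_NUMNODES and nodemask_t. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 8

/* return the next allowed id strictly after nid, wrapping at the end */
static int next_node_allowed(int nid, const bool *allowed)
{
        int i;

        for (i = 1; i <= MAX_NODES; i++) {
                int candidate = (nid + i) % MAX_NODES;
                if (allowed[candidate])
                        return candidate;
        }
        return -1;              /* empty mask: callers must not pass one */
}

/* return the saved "this node" and advance the cursor for the next caller */
static int next_alloc_node(int *cursor, const bool *allowed)
{
        int nid = allowed[*cursor] ? *cursor
                                   : next_node_allowed(*cursor, allowed);

        *cursor = next_node_allowed(nid, allowed);
        return nid;
}

int main(void)
{
        bool allowed[MAX_NODES] = { [1] = true, [3] = true, [6] = true };
        int cursor = 0, i;

        for (i = 0; i < 6; i++)
                printf("allocate on node %d\n",
                       next_alloc_node(&cursor, allowed));
        return 0;               /* prints 1 3 6 1 3 6 */
}
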
| @@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
| 668 | } | 693 | } |
| 669 | 694 | ||
| 670 | /* | 695 | /* |
| 671 | * helper for free_pool_huge_page() - find next node | 696 | * helper for free_pool_huge_page() - return the previously saved |
| 672 | * from which to free a huge page | 697 | * node ["this node"] from which to free a huge page. Advance the |
| 698 | * next node id whether or not we find a free huge page to free so | ||
| 699 | * that the next attempt to free addresses the next node. | ||
| 673 | */ | 700 | */ |
| 674 | static int hstate_next_node_to_free(struct hstate *h) | 701 | static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) |
| 675 | { | 702 | { |
| 676 | int next_nid; | 703 | int nid; |
| 677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | 704 | |
| 678 | if (next_nid == MAX_NUMNODES) | 705 | VM_BUG_ON(!nodes_allowed); |
| 679 | next_nid = first_node(node_online_map); | 706 | |
| 680 | h->next_nid_to_free = next_nid; | 707 | nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); |
| 681 | return next_nid; | 708 | h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); |
| 709 | |||
| 710 | return nid; | ||
| 682 | } | 711 | } |
| 683 | 712 | ||
| 684 | /* | 713 | /* |
| @@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h) | |||
| 687 | * balanced over allowed nodes. | 716 | * balanced over allowed nodes. |
| 688 | * Called with hugetlb_lock locked. | 717 | * Called with hugetlb_lock locked. |
| 689 | */ | 718 | */ |
| 690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | 719 | static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, |
| 720 | bool acct_surplus) | ||
| 691 | { | 721 | { |
| 692 | int start_nid; | 722 | int start_nid; |
| 693 | int next_nid; | 723 | int next_nid; |
| 694 | int ret = 0; | 724 | int ret = 0; |
| 695 | 725 | ||
| 696 | start_nid = h->next_nid_to_free; | 726 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
| 697 | next_nid = start_nid; | 727 | next_nid = start_nid; |
| 698 | 728 | ||
| 699 | do { | 729 | do { |
| @@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | |||
| 715 | } | 745 | } |
| 716 | update_and_free_page(h, page); | 746 | update_and_free_page(h, page); |
| 717 | ret = 1; | 747 | ret = 1; |
| 748 | break; | ||
| 718 | } | 749 | } |
| 719 | next_nid = hstate_next_node_to_free(h); | 750 | next_nid = hstate_next_node_to_free(h, nodes_allowed); |
| 720 | } while (!ret && next_nid != start_nid); | 751 | } while (next_nid != start_nid); |
| 721 | 752 | ||
| 722 | return ret; | 753 | return ret; |
| 723 | } | 754 | } |
| @@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
| 911 | 942 | ||
| 912 | /* | 943 | /* |
| 913 | * We want to release as many surplus pages as possible, spread | 944 | * We want to release as many surplus pages as possible, spread |
| 914 | * evenly across all nodes. Iterate across all nodes until we | 945 | * evenly across all nodes with memory. Iterate across these nodes |
| 915 | * can no longer free unreserved surplus pages. This occurs when | 946 | * until we can no longer free unreserved surplus pages. This occurs |
| 916 | * the nodes with surplus pages have no free pages. | 947 | * when the nodes with surplus pages have no free pages. |
| 917 | * free_pool_huge_page() will balance the frees across the | 948 | * free_pool_huge_page() will balance the freed pages across the |
| 918 | * on-line nodes for us and will handle the hstate accounting. | 949 | * on-line nodes with memory and will handle the hstate accounting. |
| 919 | */ | 950 | */ |
| 920 | while (nr_pages--) { | 951 | while (nr_pages--) { |
| 921 | if (!free_pool_huge_page(h, 1)) | 952 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) |
| 922 | break; | 953 | break; |
| 923 | } | 954 | } |
| 924 | } | 955 | } |
| @@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1022 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1053 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
| 1023 | { | 1054 | { |
| 1024 | struct huge_bootmem_page *m; | 1055 | struct huge_bootmem_page *m; |
| 1025 | int nr_nodes = nodes_weight(node_online_map); | 1056 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
| 1026 | 1057 | ||
| 1027 | while (nr_nodes) { | 1058 | while (nr_nodes) { |
| 1028 | void *addr; | 1059 | void *addr; |
| 1029 | 1060 | ||
| 1030 | addr = __alloc_bootmem_node_nopanic( | 1061 | addr = __alloc_bootmem_node_nopanic( |
| 1031 | NODE_DATA(h->next_nid_to_alloc), | 1062 | NODE_DATA(hstate_next_node_to_alloc(h, |
| 1063 | &node_states[N_HIGH_MEMORY])), | ||
| 1032 | huge_page_size(h), huge_page_size(h), 0); | 1064 | huge_page_size(h), huge_page_size(h), 0); |
| 1033 | 1065 | ||
| 1034 | hstate_next_node_to_alloc(h); | ||
| 1035 | if (addr) { | 1066 | if (addr) { |
| 1036 | /* | 1067 | /* |
| 1037 | * Use the beginning of the huge page to store the | 1068 | * Use the beginning of the huge page to store the |
| @@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
| 1084 | if (h->order >= MAX_ORDER) { | 1115 | if (h->order >= MAX_ORDER) { |
| 1085 | if (!alloc_bootmem_huge_page(h)) | 1116 | if (!alloc_bootmem_huge_page(h)) |
| 1086 | break; | 1117 | break; |
| 1087 | } else if (!alloc_fresh_huge_page(h)) | 1118 | } else if (!alloc_fresh_huge_page(h, |
| 1119 | &node_states[N_HIGH_MEMORY])) | ||
| 1088 | break; | 1120 | break; |
| 1089 | } | 1121 | } |
| 1090 | h->max_huge_pages = i; | 1122 | h->max_huge_pages = i; |
| @@ -1126,14 +1158,15 @@ static void __init report_hugepages(void) | |||
| 1126 | } | 1158 | } |
| 1127 | 1159 | ||
| 1128 | #ifdef CONFIG_HIGHMEM | 1160 | #ifdef CONFIG_HIGHMEM |
| 1129 | static void try_to_free_low(struct hstate *h, unsigned long count) | 1161 | static void try_to_free_low(struct hstate *h, unsigned long count, |
| 1162 | nodemask_t *nodes_allowed) | ||
| 1130 | { | 1163 | { |
| 1131 | int i; | 1164 | int i; |
| 1132 | 1165 | ||
| 1133 | if (h->order >= MAX_ORDER) | 1166 | if (h->order >= MAX_ORDER) |
| 1134 | return; | 1167 | return; |
| 1135 | 1168 | ||
| 1136 | for (i = 0; i < MAX_NUMNODES; ++i) { | 1169 | for_each_node_mask(i, *nodes_allowed) { |
| 1137 | struct page *page, *next; | 1170 | struct page *page, *next; |
| 1138 | struct list_head *freel = &h->hugepage_freelists[i]; | 1171 | struct list_head *freel = &h->hugepage_freelists[i]; |
| 1139 | list_for_each_entry_safe(page, next, freel, lru) { | 1172 | list_for_each_entry_safe(page, next, freel, lru) { |
| @@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) | |||
| 1149 | } | 1182 | } |
| 1150 | } | 1183 | } |
| 1151 | #else | 1184 | #else |
| 1152 | static inline void try_to_free_low(struct hstate *h, unsigned long count) | 1185 | static inline void try_to_free_low(struct hstate *h, unsigned long count, |
| 1186 | nodemask_t *nodes_allowed) | ||
| 1153 | { | 1187 | { |
| 1154 | } | 1188 | } |
| 1155 | #endif | 1189 | #endif |
| @@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
| 1159 | * balanced by operating on them in a round-robin fashion. | 1193 | * balanced by operating on them in a round-robin fashion. |
| 1160 | * Returns 1 if an adjustment was made. | 1194 | * Returns 1 if an adjustment was made. |
| 1161 | */ | 1195 | */ |
| 1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1196 | static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, |
| 1197 | int delta) | ||
| 1163 | { | 1198 | { |
| 1164 | int start_nid, next_nid; | 1199 | int start_nid, next_nid; |
| 1165 | int ret = 0; | 1200 | int ret = 0; |
| @@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
| 1167 | VM_BUG_ON(delta != -1 && delta != 1); | 1202 | VM_BUG_ON(delta != -1 && delta != 1); |
| 1168 | 1203 | ||
| 1169 | if (delta < 0) | 1204 | if (delta < 0) |
| 1170 | start_nid = h->next_nid_to_alloc; | 1205 | start_nid = hstate_next_node_to_alloc(h, nodes_allowed); |
| 1171 | else | 1206 | else |
| 1172 | start_nid = h->next_nid_to_free; | 1207 | start_nid = hstate_next_node_to_free(h, nodes_allowed); |
| 1173 | next_nid = start_nid; | 1208 | next_nid = start_nid; |
| 1174 | 1209 | ||
| 1175 | do { | 1210 | do { |
| 1176 | int nid = next_nid; | 1211 | int nid = next_nid; |
| 1177 | if (delta < 0) { | 1212 | if (delta < 0) { |
| 1178 | next_nid = hstate_next_node_to_alloc(h); | ||
| 1179 | /* | 1213 | /* |
| 1180 | * To shrink on this node, there must be a surplus page | 1214 | * To shrink on this node, there must be a surplus page |
| 1181 | */ | 1215 | */ |
| 1182 | if (!h->surplus_huge_pages_node[nid]) | 1216 | if (!h->surplus_huge_pages_node[nid]) { |
| 1217 | next_nid = hstate_next_node_to_alloc(h, | ||
| 1218 | nodes_allowed); | ||
| 1183 | continue; | 1219 | continue; |
| 1220 | } | ||
| 1184 | } | 1221 | } |
| 1185 | if (delta > 0) { | 1222 | if (delta > 0) { |
| 1186 | next_nid = hstate_next_node_to_free(h); | ||
| 1187 | /* | 1223 | /* |
| 1188 | * Surplus cannot exceed the total number of pages | 1224 | * Surplus cannot exceed the total number of pages |
| 1189 | */ | 1225 | */ |
| 1190 | if (h->surplus_huge_pages_node[nid] >= | 1226 | if (h->surplus_huge_pages_node[nid] >= |
| 1191 | h->nr_huge_pages_node[nid]) | 1227 | h->nr_huge_pages_node[nid]) { |
| 1228 | next_nid = hstate_next_node_to_free(h, | ||
| 1229 | nodes_allowed); | ||
| 1192 | continue; | 1230 | continue; |
| 1231 | } | ||
| 1193 | } | 1232 | } |
| 1194 | 1233 | ||
| 1195 | h->surplus_huge_pages += delta; | 1234 | h->surplus_huge_pages += delta; |
| @@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) | |||
| 1202 | } | 1241 | } |
| 1203 | 1242 | ||
| 1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1243 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
| 1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1244 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, |
| 1245 | nodemask_t *nodes_allowed) | ||
| 1206 | { | 1246 | { |
| 1207 | unsigned long min_count, ret; | 1247 | unsigned long min_count, ret; |
| 1208 | 1248 | ||
| @@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
| 1222 | */ | 1262 | */ |
| 1223 | spin_lock(&hugetlb_lock); | 1263 | spin_lock(&hugetlb_lock); |
| 1224 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { | 1264 | while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { |
| 1225 | if (!adjust_pool_surplus(h, -1)) | 1265 | if (!adjust_pool_surplus(h, nodes_allowed, -1)) |
| 1226 | break; | 1266 | break; |
| 1227 | } | 1267 | } |
| 1228 | 1268 | ||
| @@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
| 1233 | * and reducing the surplus. | 1273 | * and reducing the surplus. |
| 1234 | */ | 1274 | */ |
| 1235 | spin_unlock(&hugetlb_lock); | 1275 | spin_unlock(&hugetlb_lock); |
| 1236 | ret = alloc_fresh_huge_page(h); | 1276 | ret = alloc_fresh_huge_page(h, nodes_allowed); |
| 1237 | spin_lock(&hugetlb_lock); | 1277 | spin_lock(&hugetlb_lock); |
| 1238 | if (!ret) | 1278 | if (!ret) |
| 1239 | goto out; | 1279 | goto out; |
| 1240 | 1280 | ||
| 1281 | /* Bail for signals. Probably ctrl-c from user */ | ||
| 1282 | if (signal_pending(current)) | ||
| 1283 | goto out; | ||
| 1241 | } | 1284 | } |
| 1242 | 1285 | ||
| 1243 | /* | 1286 | /* |
| @@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
| 1257 | */ | 1300 | */ |
| 1258 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; | 1301 | min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; |
| 1259 | min_count = max(count, min_count); | 1302 | min_count = max(count, min_count); |
| 1260 | try_to_free_low(h, min_count); | 1303 | try_to_free_low(h, min_count, nodes_allowed); |
| 1261 | while (min_count < persistent_huge_pages(h)) { | 1304 | while (min_count < persistent_huge_pages(h)) { |
| 1262 | if (!free_pool_huge_page(h, 0)) | 1305 | if (!free_pool_huge_page(h, nodes_allowed, 0)) |
| 1263 | break; | 1306 | break; |
| 1264 | } | 1307 | } |
| 1265 | while (count < persistent_huge_pages(h)) { | 1308 | while (count < persistent_huge_pages(h)) { |
| 1266 | if (!adjust_pool_surplus(h, 1)) | 1309 | if (!adjust_pool_surplus(h, nodes_allowed, 1)) |
| 1267 | break; | 1310 | break; |
| 1268 | } | 1311 | } |
| 1269 | out: | 1312 | out: |
| @@ -1282,43 +1325,117 @@ out: | |||
| 1282 | static struct kobject *hugepages_kobj; | 1325 | static struct kobject *hugepages_kobj; |
| 1283 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | 1326 | static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; |
| 1284 | 1327 | ||
| 1285 | static struct hstate *kobj_to_hstate(struct kobject *kobj) | 1328 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); |
| 1329 | |||
| 1330 | static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) | ||
| 1286 | { | 1331 | { |
| 1287 | int i; | 1332 | int i; |
| 1333 | |||
| 1288 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | 1334 | for (i = 0; i < HUGE_MAX_HSTATE; i++) |
| 1289 | if (hstate_kobjs[i] == kobj) | 1335 | if (hstate_kobjs[i] == kobj) { |
| 1336 | if (nidp) | ||
| 1337 | *nidp = NUMA_NO_NODE; | ||
| 1290 | return &hstates[i]; | 1338 | return &hstates[i]; |
| 1291 | BUG(); | 1339 | } |
| 1292 | return NULL; | 1340 | |
| 1341 | return kobj_to_node_hstate(kobj, nidp); | ||
| 1293 | } | 1342 | } |
| 1294 | 1343 | ||
| 1295 | static ssize_t nr_hugepages_show(struct kobject *kobj, | 1344 | static ssize_t nr_hugepages_show_common(struct kobject *kobj, |
| 1296 | struct kobj_attribute *attr, char *buf) | 1345 | struct kobj_attribute *attr, char *buf) |
| 1297 | { | 1346 | { |
| 1298 | struct hstate *h = kobj_to_hstate(kobj); | 1347 | struct hstate *h; |
| 1299 | return sprintf(buf, "%lu\n", h->nr_huge_pages); | 1348 | unsigned long nr_huge_pages; |
| 1349 | int nid; | ||
| 1350 | |||
| 1351 | h = kobj_to_hstate(kobj, &nid); | ||
| 1352 | if (nid == NUMA_NO_NODE) | ||
| 1353 | nr_huge_pages = h->nr_huge_pages; | ||
| 1354 | else | ||
| 1355 | nr_huge_pages = h->nr_huge_pages_node[nid]; | ||
| 1356 | |||
| 1357 | return sprintf(buf, "%lu\n", nr_huge_pages); | ||
| 1300 | } | 1358 | } |
| 1301 | static ssize_t nr_hugepages_store(struct kobject *kobj, | 1359 | static ssize_t nr_hugepages_store_common(bool obey_mempolicy, |
| 1302 | struct kobj_attribute *attr, const char *buf, size_t count) | 1360 | struct kobject *kobj, struct kobj_attribute *attr, |
| 1361 | const char *buf, size_t len) | ||
| 1303 | { | 1362 | { |
| 1304 | int err; | 1363 | int err; |
| 1305 | unsigned long input; | 1364 | int nid; |
| 1306 | struct hstate *h = kobj_to_hstate(kobj); | 1365 | unsigned long count; |
| 1366 | struct hstate *h; | ||
| 1367 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); | ||
| 1307 | 1368 | ||
| 1308 | err = strict_strtoul(buf, 10, &input); | 1369 | err = strict_strtoul(buf, 10, &count); |
| 1309 | if (err) | 1370 | if (err) |
| 1310 | return 0; | 1371 | return 0; |
| 1311 | 1372 | ||
| 1312 | h->max_huge_pages = set_max_huge_pages(h, input); | 1373 | h = kobj_to_hstate(kobj, &nid); |
| 1374 | if (nid == NUMA_NO_NODE) { | ||
| 1375 | /* | ||
| 1376 | * global hstate attribute | ||
| 1377 | */ | ||
| 1378 | if (!(obey_mempolicy && | ||
| 1379 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
| 1380 | NODEMASK_FREE(nodes_allowed); | ||
| 1381 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
| 1382 | } | ||
| 1383 | } else if (nodes_allowed) { | ||
| 1384 | /* | ||
| 1385 | * per node hstate attribute: adjust count to global, | ||
| 1386 | * but restrict alloc/free to the specified node. | ||
| 1387 | */ | ||
| 1388 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | ||
| 1389 | init_nodemask_of_node(nodes_allowed, nid); | ||
| 1390 | } else | ||
| 1391 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
| 1392 | |||
| 1393 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | ||
| 1313 | 1394 | ||
| 1314 | return count; | 1395 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) |
| 1396 | NODEMASK_FREE(nodes_allowed); | ||
| 1397 | |||
| 1398 | return len; | ||
| 1399 | } | ||
| 1400 | |||
| 1401 | static ssize_t nr_hugepages_show(struct kobject *kobj, | ||
| 1402 | struct kobj_attribute *attr, char *buf) | ||
| 1403 | { | ||
| 1404 | return nr_hugepages_show_common(kobj, attr, buf); | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | static ssize_t nr_hugepages_store(struct kobject *kobj, | ||
| 1408 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
| 1409 | { | ||
| 1410 | return nr_hugepages_store_common(false, kobj, attr, buf, len); | ||
| 1315 | } | 1411 | } |
| 1316 | HSTATE_ATTR(nr_hugepages); | 1412 | HSTATE_ATTR(nr_hugepages); |
| 1317 | 1413 | ||
| 1414 | #ifdef CONFIG_NUMA | ||
| 1415 | |||
| 1416 | /* | ||
| 1417 | * hstate attribute for optionally mempolicy-based constraint on persistent | ||
| 1418 | * huge page alloc/free. | ||
| 1419 | */ | ||
| 1420 | static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, | ||
| 1421 | struct kobj_attribute *attr, char *buf) | ||
| 1422 | { | ||
| 1423 | return nr_hugepages_show_common(kobj, attr, buf); | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, | ||
| 1427 | struct kobj_attribute *attr, const char *buf, size_t len) | ||
| 1428 | { | ||
| 1429 | return nr_hugepages_store_common(true, kobj, attr, buf, len); | ||
| 1430 | } | ||
| 1431 | HSTATE_ATTR(nr_hugepages_mempolicy); | ||
| 1432 | #endif | ||
| 1433 | |||
| 1434 | |||
| 1318 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, | 1435 | static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, |
| 1319 | struct kobj_attribute *attr, char *buf) | 1436 | struct kobj_attribute *attr, char *buf) |
| 1320 | { | 1437 | { |
| 1321 | struct hstate *h = kobj_to_hstate(kobj); | 1438 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1322 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); | 1439 | return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); |
| 1323 | } | 1440 | } |
| 1324 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | 1441 | static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, |
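
For a per-node nr_hugepages attribute, nr_hugepages_store_common() above converts the per-node request into a global target with count += h->nr_huge_pages - h->nr_huge_pages_node[nid] and then restricts alloc/free to that node. A worked example of that arithmetic with made-up numbers:

/* Worked example of the per-node -> global target conversion used by
 * nr_hugepages_store_common(). All numbers below are made up. */
#include <stdio.h>

int main(void)
{
        unsigned long nr_huge_pages = 100;       /* global pool size          */
        unsigned long nr_huge_pages_node1 = 30;  /* of which 30 live on node 1 */
        unsigned long requested_on_node1 = 50;   /* per-node write asks for 50 */

        /* keep the other nodes' pages as-is and retarget only node 1 */
        unsigned long global_target = requested_on_node1 +
                                      (nr_huge_pages - nr_huge_pages_node1);

        printf("set_max_huge_pages() target = %lu\n", global_target); /* 120 */
        return 0;
}
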
| @@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, | |||
| 1326 | { | 1443 | { |
| 1327 | int err; | 1444 | int err; |
| 1328 | unsigned long input; | 1445 | unsigned long input; |
| 1329 | struct hstate *h = kobj_to_hstate(kobj); | 1446 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1330 | 1447 | ||
| 1331 | err = strict_strtoul(buf, 10, &input); | 1448 | err = strict_strtoul(buf, 10, &input); |
| 1332 | if (err) | 1449 | if (err) |
| @@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); | |||
| 1343 | static ssize_t free_hugepages_show(struct kobject *kobj, | 1460 | static ssize_t free_hugepages_show(struct kobject *kobj, |
| 1344 | struct kobj_attribute *attr, char *buf) | 1461 | struct kobj_attribute *attr, char *buf) |
| 1345 | { | 1462 | { |
| 1346 | struct hstate *h = kobj_to_hstate(kobj); | 1463 | struct hstate *h; |
| 1347 | return sprintf(buf, "%lu\n", h->free_huge_pages); | 1464 | unsigned long free_huge_pages; |
| 1465 | int nid; | ||
| 1466 | |||
| 1467 | h = kobj_to_hstate(kobj, &nid); | ||
| 1468 | if (nid == NUMA_NO_NODE) | ||
| 1469 | free_huge_pages = h->free_huge_pages; | ||
| 1470 | else | ||
| 1471 | free_huge_pages = h->free_huge_pages_node[nid]; | ||
| 1472 | |||
| 1473 | return sprintf(buf, "%lu\n", free_huge_pages); | ||
| 1348 | } | 1474 | } |
| 1349 | HSTATE_ATTR_RO(free_hugepages); | 1475 | HSTATE_ATTR_RO(free_hugepages); |
| 1350 | 1476 | ||
| 1351 | static ssize_t resv_hugepages_show(struct kobject *kobj, | 1477 | static ssize_t resv_hugepages_show(struct kobject *kobj, |
| 1352 | struct kobj_attribute *attr, char *buf) | 1478 | struct kobj_attribute *attr, char *buf) |
| 1353 | { | 1479 | { |
| 1354 | struct hstate *h = kobj_to_hstate(kobj); | 1480 | struct hstate *h = kobj_to_hstate(kobj, NULL); |
| 1355 | return sprintf(buf, "%lu\n", h->resv_huge_pages); | 1481 | return sprintf(buf, "%lu\n", h->resv_huge_pages); |
| 1356 | } | 1482 | } |
| 1357 | HSTATE_ATTR_RO(resv_hugepages); | 1483 | HSTATE_ATTR_RO(resv_hugepages); |
| @@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages); | |||
| 1359 | static ssize_t surplus_hugepages_show(struct kobject *kobj, | 1485 | static ssize_t surplus_hugepages_show(struct kobject *kobj, |
| 1360 | struct kobj_attribute *attr, char *buf) | 1486 | struct kobj_attribute *attr, char *buf) |
| 1361 | { | 1487 | { |
| 1362 | struct hstate *h = kobj_to_hstate(kobj); | 1488 | struct hstate *h; |
| 1363 | return sprintf(buf, "%lu\n", h->surplus_huge_pages); | 1489 | unsigned long surplus_huge_pages; |
| 1490 | int nid; | ||
| 1491 | |||
| 1492 | h = kobj_to_hstate(kobj, &nid); | ||
| 1493 | if (nid == NUMA_NO_NODE) | ||
| 1494 | surplus_huge_pages = h->surplus_huge_pages; | ||
| 1495 | else | ||
| 1496 | surplus_huge_pages = h->surplus_huge_pages_node[nid]; | ||
| 1497 | |||
| 1498 | return sprintf(buf, "%lu\n", surplus_huge_pages); | ||
| 1364 | } | 1499 | } |
| 1365 | HSTATE_ATTR_RO(surplus_hugepages); | 1500 | HSTATE_ATTR_RO(surplus_hugepages); |
| 1366 | 1501 | ||
| @@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = { | |||
| 1370 | &free_hugepages_attr.attr, | 1505 | &free_hugepages_attr.attr, |
| 1371 | &resv_hugepages_attr.attr, | 1506 | &resv_hugepages_attr.attr, |
| 1372 | &surplus_hugepages_attr.attr, | 1507 | &surplus_hugepages_attr.attr, |
| 1508 | #ifdef CONFIG_NUMA | ||
| 1509 | &nr_hugepages_mempolicy_attr.attr, | ||
| 1510 | #endif | ||
| 1373 | NULL, | 1511 | NULL, |
| 1374 | }; | 1512 | }; |
| 1375 | 1513 | ||
| @@ -1377,19 +1515,20 @@ static struct attribute_group hstate_attr_group = { | |||
| 1377 | .attrs = hstate_attrs, | 1515 | .attrs = hstate_attrs, |
| 1378 | }; | 1516 | }; |
| 1379 | 1517 | ||
| 1380 | static int __init hugetlb_sysfs_add_hstate(struct hstate *h) | 1518 | static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, |
| 1519 | struct kobject **hstate_kobjs, | ||
| 1520 | struct attribute_group *hstate_attr_group) | ||
| 1381 | { | 1521 | { |
| 1382 | int retval; | 1522 | int retval; |
| 1523 | int hi = h - hstates; | ||
| 1383 | 1524 | ||
| 1384 | hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, | 1525 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
| 1385 | hugepages_kobj); | 1526 | if (!hstate_kobjs[hi]) |
| 1386 | if (!hstate_kobjs[h - hstates]) | ||
| 1387 | return -ENOMEM; | 1527 | return -ENOMEM; |
| 1388 | 1528 | ||
| 1389 | retval = sysfs_create_group(hstate_kobjs[h - hstates], | 1529 | retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); |
| 1390 | &hstate_attr_group); | ||
| 1391 | if (retval) | 1530 | if (retval) |
| 1392 | kobject_put(hstate_kobjs[h - hstates]); | 1531 | kobject_put(hstate_kobjs[hi]); |
| 1393 | 1532 | ||
| 1394 | return retval; | 1533 | return retval; |
| 1395 | } | 1534 | } |
| @@ -1404,17 +1543,184 @@ static void __init hugetlb_sysfs_init(void) | |||
| 1404 | return; | 1543 | return; |
| 1405 | 1544 | ||
| 1406 | for_each_hstate(h) { | 1545 | for_each_hstate(h) { |
| 1407 | err = hugetlb_sysfs_add_hstate(h); | 1546 | err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, |
| 1547 | hstate_kobjs, &hstate_attr_group); | ||
| 1408 | if (err) | 1548 | if (err) |
| 1409 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", | 1549 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s", |
| 1410 | h->name); | 1550 | h->name); |
| 1411 | } | 1551 | } |
| 1412 | } | 1552 | } |
| 1413 | 1553 | ||
| 1554 | #ifdef CONFIG_NUMA | ||
| 1555 | |||
| 1556 | /* | ||
| 1557 | * node_hstate/s - associate per node hstate attributes, via their kobjects, | ||
| 1558 | * with node sysdevs in node_devices[] using a parallel array. The array | ||
| 1559 | * index of a node sysdev or _hstate == node id. | ||
| 1560 | * This is here to avoid any static dependency of the node sysdev driver, in | ||
| 1561 | * the base kernel, on the hugetlb module. | ||
| 1562 | */ | ||
| 1563 | struct node_hstate { | ||
| 1564 | struct kobject *hugepages_kobj; | ||
| 1565 | struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; | ||
| 1566 | }; | ||
| 1567 | struct node_hstate node_hstates[MAX_NUMNODES]; | ||
| 1568 | |||
| 1569 | /* | ||
| 1570 | * A subset of global hstate attributes for node sysdevs | ||
| 1571 | */ | ||
| 1572 | static struct attribute *per_node_hstate_attrs[] = { | ||
| 1573 | &nr_hugepages_attr.attr, | ||
| 1574 | &free_hugepages_attr.attr, | ||
| 1575 | &surplus_hugepages_attr.attr, | ||
| 1576 | NULL, | ||
| 1577 | }; | ||
| 1578 | |||
| 1579 | static struct attribute_group per_node_hstate_attr_group = { | ||
| 1580 | .attrs = per_node_hstate_attrs, | ||
| 1581 | }; | ||
| 1582 | |||
| 1583 | /* | ||
| 1584 | * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. | ||
| 1585 | * Returns node id via non-NULL nidp. | ||
| 1586 | */ | ||
| 1587 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
| 1588 | { | ||
| 1589 | int nid; | ||
| 1590 | |||
| 1591 | for (nid = 0; nid < nr_node_ids; nid++) { | ||
| 1592 | struct node_hstate *nhs = &node_hstates[nid]; | ||
| 1593 | int i; | ||
| 1594 | for (i = 0; i < HUGE_MAX_HSTATE; i++) | ||
| 1595 | if (nhs->hstate_kobjs[i] == kobj) { | ||
| 1596 | if (nidp) | ||
| 1597 | *nidp = nid; | ||
| 1598 | return &hstates[i]; | ||
| 1599 | } | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | BUG(); | ||
| 1603 | return NULL; | ||
| 1604 | } | ||
| 1605 | |||
| 1606 | /* | ||
| 1607 | * Unregister hstate attributes from a single node sysdev. | ||
| 1608 | * No-op if no hstate attributes attached. | ||
| 1609 | */ | ||
| 1610 | void hugetlb_unregister_node(struct node *node) | ||
| 1611 | { | ||
| 1612 | struct hstate *h; | ||
| 1613 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
| 1614 | |||
| 1615 | if (!nhs->hugepages_kobj) | ||
| 1616 | return; /* no hstate attributes */ | ||
| 1617 | |||
| 1618 | for_each_hstate(h) | ||
| 1619 | if (nhs->hstate_kobjs[h - hstates]) { | ||
| 1620 | kobject_put(nhs->hstate_kobjs[h - hstates]); | ||
| 1621 | nhs->hstate_kobjs[h - hstates] = NULL; | ||
| 1622 | } | ||
| 1623 | |||
| 1624 | kobject_put(nhs->hugepages_kobj); | ||
| 1625 | nhs->hugepages_kobj = NULL; | ||
| 1626 | } | ||
| 1627 | |||
| 1628 | /* | ||
| 1629 | * hugetlb module exit: unregister hstate attributes from node sysdevs | ||
| 1630 | * that have them. | ||
| 1631 | */ | ||
| 1632 | static void hugetlb_unregister_all_nodes(void) | ||
| 1633 | { | ||
| 1634 | int nid; | ||
| 1635 | |||
| 1636 | /* | ||
| 1637 | * disable node sysdev registrations. | ||
| 1638 | */ | ||
| 1639 | register_hugetlbfs_with_node(NULL, NULL); | ||
| 1640 | |||
| 1641 | /* | ||
| 1642 | * remove hstate attributes from any nodes that have them. | ||
| 1643 | */ | ||
| 1644 | for (nid = 0; nid < nr_node_ids; nid++) | ||
| 1645 | hugetlb_unregister_node(&node_devices[nid]); | ||
| 1646 | } | ||
| 1647 | |||
| 1648 | /* | ||
| 1649 | * Register hstate attributes for a single node sysdev. | ||
| 1650 | * No-op if attributes already registered. | ||
| 1651 | */ | ||
| 1652 | void hugetlb_register_node(struct node *node) | ||
| 1653 | { | ||
| 1654 | struct hstate *h; | ||
| 1655 | struct node_hstate *nhs = &node_hstates[node->sysdev.id]; | ||
| 1656 | int err; | ||
| 1657 | |||
| 1658 | if (nhs->hugepages_kobj) | ||
| 1659 | return; /* already allocated */ | ||
| 1660 | |||
| 1661 | nhs->hugepages_kobj = kobject_create_and_add("hugepages", | ||
| 1662 | &node->sysdev.kobj); | ||
| 1663 | if (!nhs->hugepages_kobj) | ||
| 1664 | return; | ||
| 1665 | |||
| 1666 | for_each_hstate(h) { | ||
| 1667 | err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, | ||
| 1668 | nhs->hstate_kobjs, | ||
| 1669 | &per_node_hstate_attr_group); | ||
| 1670 | if (err) { | ||
| 1671 | printk(KERN_ERR "Hugetlb: Unable to add hstate %s" | ||
| 1672 | " for node %d\n", | ||
| 1673 | h->name, node->sysdev.id); | ||
| 1674 | hugetlb_unregister_node(node); | ||
| 1675 | break; | ||
| 1676 | } | ||
| 1677 | } | ||
| 1678 | } | ||
| 1679 | |||
| 1680 | /* | ||
| 1681 | * hugetlb init time: register hstate attributes for all registered node | ||
| 1682 | * sysdevs of nodes that have memory. All on-line nodes should have | ||
| 1683 | * registered their associated sysdev by this time. | ||
| 1684 | */ | ||
| 1685 | static void hugetlb_register_all_nodes(void) | ||
| 1686 | { | ||
| 1687 | int nid; | ||
| 1688 | |||
| 1689 | for_each_node_state(nid, N_HIGH_MEMORY) { | ||
| 1690 | struct node *node = &node_devices[nid]; | ||
| 1691 | if (node->sysdev.id == nid) | ||
| 1692 | hugetlb_register_node(node); | ||
| 1693 | } | ||
| 1694 | |||
| 1695 | /* | ||
| 1696 | * Let the node sysdev driver know we're here so it can | ||
| 1697 | * [un]register hstate attributes on node hotplug. | ||
| 1698 | */ | ||
| 1699 | register_hugetlbfs_with_node(hugetlb_register_node, | ||
| 1700 | hugetlb_unregister_node); | ||
| 1701 | } | ||
| 1702 | #else /* !CONFIG_NUMA */ | ||
| 1703 | |||
| 1704 | static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) | ||
| 1705 | { | ||
| 1706 | BUG(); | ||
| 1707 | if (nidp) | ||
| 1708 | *nidp = -1; | ||
| 1709 | return NULL; | ||
| 1710 | } | ||
| 1711 | |||
| 1712 | static void hugetlb_unregister_all_nodes(void) { } | ||
| 1713 | |||
| 1714 | static void hugetlb_register_all_nodes(void) { } | ||
| 1715 | |||
| 1716 | #endif | ||
| 1717 | |||
| 1414 | static void __exit hugetlb_exit(void) | 1718 | static void __exit hugetlb_exit(void) |
| 1415 | { | 1719 | { |
| 1416 | struct hstate *h; | 1720 | struct hstate *h; |
| 1417 | 1721 | ||
| 1722 | hugetlb_unregister_all_nodes(); | ||
| 1723 | |||
| 1418 | for_each_hstate(h) { | 1724 | for_each_hstate(h) { |
| 1419 | kobject_put(hstate_kobjs[h - hstates]); | 1725 | kobject_put(hstate_kobjs[h - hstates]); |
| 1420 | } | 1726 | } |
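
kobj_to_node_hstate() above recovers both the hstate and the node id from a per-node attribute kobject by scanning the node_hstates[] parallel array. A condensed sketch of that reverse-lookup pattern; the struct name, array sizes, and return convention below are placeholders, not the kernel's:

/* Sketch of the parallel-array reverse lookup done by kobj_to_node_hstate():
 * given an attribute object pointer, find which (node, hstate index) slot
 * registered it. "struct kobj" and the sizes are placeholders. */
#include <stdio.h>

#define MAX_NODES  4
#define MAX_HSTATE 2

struct kobj { int dummy; };

static struct kobj *node_kobjs[MAX_NODES][MAX_HSTATE];

/* returns hstate index, or -1 if not found; node id via *nidp */
static int kobj_to_node_hstate(const struct kobj *kobj, int *nidp)
{
        int nid, i;

        for (nid = 0; nid < MAX_NODES; nid++)
                for (i = 0; i < MAX_HSTATE; i++)
                        if (node_kobjs[nid][i] == kobj) {
                                if (nidp)
                                        *nidp = nid;
                                return i;
                        }
        return -1;
}

int main(void)
{
        static struct kobj some_attr;
        int nid, hi;

        node_kobjs[2][1] = &some_attr;  /* pretend node 2 registered it */
        hi = kobj_to_node_hstate(&some_attr, &nid);
        printf("kobject belongs to node %d, hstate %d\n", nid, hi);
        return 0;
}
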
| @@ -1449,6 +1755,8 @@ static int __init hugetlb_init(void) | |||
| 1449 | 1755 | ||
| 1450 | hugetlb_sysfs_init(); | 1756 | hugetlb_sysfs_init(); |
| 1451 | 1757 | ||
| 1758 | hugetlb_register_all_nodes(); | ||
| 1759 | |||
| 1452 | return 0; | 1760 | return 0; |
| 1453 | } | 1761 | } |
| 1454 | module_init(hugetlb_init); | 1762 | module_init(hugetlb_init); |
| @@ -1472,8 +1780,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
| 1472 | h->free_huge_pages = 0; | 1780 | h->free_huge_pages = 0; |
| 1473 | for (i = 0; i < MAX_NUMNODES; ++i) | 1781 | for (i = 0; i < MAX_NUMNODES; ++i) |
| 1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1782 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
| 1475 | h->next_nid_to_alloc = first_node(node_online_map); | 1783 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
| 1476 | h->next_nid_to_free = first_node(node_online_map); | 1784 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
| 1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1785 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
| 1478 | huge_page_size(h)/1024); | 1786 | huge_page_size(h)/1024); |
| 1479 | 1787 | ||
| @@ -1536,9 +1844,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
| 1536 | } | 1844 | } |
| 1537 | 1845 | ||
| 1538 | #ifdef CONFIG_SYSCTL | 1846 | #ifdef CONFIG_SYSCTL |
| 1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1847 | static int hugetlb_sysctl_handler_common(bool obey_mempolicy, |
| 1540 | void __user *buffer, | 1848 | struct ctl_table *table, int write, |
| 1541 | size_t *length, loff_t *ppos) | 1849 | void __user *buffer, size_t *length, loff_t *ppos) |
| 1542 | { | 1850 | { |
| 1543 | struct hstate *h = &default_hstate; | 1851 | struct hstate *h = &default_hstate; |
| 1544 | unsigned long tmp; | 1852 | unsigned long tmp; |
| @@ -1550,12 +1858,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
| 1550 | table->maxlen = sizeof(unsigned long); | 1858 | table->maxlen = sizeof(unsigned long); |
| 1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | 1859 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 1552 | 1860 | ||
| 1553 | if (write) | 1861 | if (write) { |
| 1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1862 | NODEMASK_ALLOC(nodemask_t, nodes_allowed, |
| 1863 | GFP_KERNEL | __GFP_NORETRY); | ||
| 1864 | if (!(obey_mempolicy && | ||
| 1865 | init_nodemask_of_mempolicy(nodes_allowed))) { | ||
| 1866 | NODEMASK_FREE(nodes_allowed); | ||
| 1867 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | ||
| 1868 | } | ||
| 1869 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | ||
| 1870 | |||
| 1871 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | ||
| 1872 | NODEMASK_FREE(nodes_allowed); | ||
| 1873 | } | ||
| 1555 | 1874 | ||
| 1556 | return 0; | 1875 | return 0; |
| 1557 | } | 1876 | } |
| 1558 | 1877 | ||
| 1878 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | ||
| 1879 | void __user *buffer, size_t *length, loff_t *ppos) | ||
| 1880 | { | ||
| 1881 | |||
| 1882 | return hugetlb_sysctl_handler_common(false, table, write, | ||
| 1883 | buffer, length, ppos); | ||
| 1884 | } | ||
| 1885 | |||
| 1886 | #ifdef CONFIG_NUMA | ||
| 1887 | int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, | ||
| 1888 | void __user *buffer, size_t *length, loff_t *ppos) | ||
| 1889 | { | ||
| 1890 | return hugetlb_sysctl_handler_common(true, table, write, | ||
| 1891 | buffer, length, ppos); | ||
| 1892 | } | ||
| 1893 | #endif /* CONFIG_NUMA */ | ||
| 1894 | |||
| 1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1895 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
| 1560 | void __user *buffer, | 1896 | void __user *buffer, |
| 1561 | size_t *length, loff_t *ppos) | 1897 | size_t *length, loff_t *ppos) |
| @@ -1751,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
| 1751 | 2087 | ||
| 1752 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2088 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
| 1753 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2089 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
| 1754 | update_mmu_cache(vma, address, entry); | 2090 | update_mmu_cache(vma, address, ptep); |
| 1755 | } | 2091 | } |
| 1756 | } | 2092 | } |
| 1757 | 2093 | ||
| @@ -1903,6 +2239,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1903 | + (vma->vm_pgoff >> PAGE_SHIFT); | 2239 | + (vma->vm_pgoff >> PAGE_SHIFT); |
| 1904 | mapping = (struct address_space *)page_private(page); | 2240 | mapping = (struct address_space *)page_private(page); |
| 1905 | 2241 | ||
| 2242 | /* | ||
| 2243 | * Take the mapping lock for the duration of the table walk. As | ||
| 2244 | * this mapping should be shared between all the VMAs, | ||
| 2245 | * __unmap_hugepage_range() is called as the lock is already held | ||
| 2246 | */ | ||
| 2247 | spin_lock(&mapping->i_mmap_lock); | ||
| 1906 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 2248 | vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 1907 | /* Do not unmap the current VMA */ | 2249 | /* Do not unmap the current VMA */ |
| 1908 | if (iter_vma == vma) | 2250 | if (iter_vma == vma) |
| @@ -1916,10 +2258,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1916 | * from the time of fork. This would look like data corruption | 2258 | * from the time of fork. This would look like data corruption |
| 1917 | */ | 2259 | */ |
| 1918 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2260 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
| 1919 | unmap_hugepage_range(iter_vma, | 2261 | __unmap_hugepage_range(iter_vma, |
| 1920 | address, address + huge_page_size(h), | 2262 | address, address + huge_page_size(h), |
| 1921 | page); | 2263 | page); |
| 1922 | } | 2264 | } |
| 2265 | spin_unlock(&mapping->i_mmap_lock); | ||
| 1923 | 2266 | ||
| 1924 | return 1; | 2267 | return 1; |
| 1925 | } | 2268 | } |
| @@ -1959,6 +2302,9 @@ retry_avoidcopy: | |||
| 1959 | outside_reserve = 1; | 2302 | outside_reserve = 1; |
| 1960 | 2303 | ||
| 1961 | page_cache_get(old_page); | 2304 | page_cache_get(old_page); |
| 2305 | |||
| 2306 | /* Drop page_table_lock as buddy allocator may be called */ | ||
| 2307 | spin_unlock(&mm->page_table_lock); | ||
| 1962 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2308 | new_page = alloc_huge_page(vma, address, outside_reserve); |
| 1963 | 2309 | ||
| 1964 | if (IS_ERR(new_page)) { | 2310 | if (IS_ERR(new_page)) { |
| @@ -1976,19 +2322,25 @@ retry_avoidcopy: | |||
| 1976 | if (unmap_ref_private(mm, vma, old_page, address)) { | 2322 | if (unmap_ref_private(mm, vma, old_page, address)) { |
| 1977 | BUG_ON(page_count(old_page) != 1); | 2323 | BUG_ON(page_count(old_page) != 1); |
| 1978 | BUG_ON(huge_pte_none(pte)); | 2324 | BUG_ON(huge_pte_none(pte)); |
| 2325 | spin_lock(&mm->page_table_lock); | ||
| 1979 | goto retry_avoidcopy; | 2326 | goto retry_avoidcopy; |
| 1980 | } | 2327 | } |
| 1981 | WARN_ON_ONCE(1); | 2328 | WARN_ON_ONCE(1); |
| 1982 | } | 2329 | } |
| 1983 | 2330 | ||
| 2331 | /* Caller expects lock to be held */ | ||
| 2332 | spin_lock(&mm->page_table_lock); | ||
| 1984 | return -PTR_ERR(new_page); | 2333 | return -PTR_ERR(new_page); |
| 1985 | } | 2334 | } |
| 1986 | 2335 | ||
| 1987 | spin_unlock(&mm->page_table_lock); | ||
| 1988 | copy_huge_page(new_page, old_page, address, vma); | 2336 | copy_huge_page(new_page, old_page, address, vma); |
| 1989 | __SetPageUptodate(new_page); | 2337 | __SetPageUptodate(new_page); |
| 1990 | spin_lock(&mm->page_table_lock); | ||
| 1991 | 2338 | ||
| 2339 | /* | ||
| 2340 | * Retake the page_table_lock to check for racing updates | ||
| 2341 | * before the page tables are altered | ||
| 2342 | */ | ||
| 2343 | spin_lock(&mm->page_table_lock); | ||
| 1992 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); | 2344 | ptep = huge_pte_offset(mm, address & huge_page_mask(h)); |
| 1993 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { | 2345 | if (likely(pte_same(huge_ptep_get(ptep), pte))) { |
| 1994 | /* Break COW */ | 2346 | /* Break COW */ |
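
The COW changes above drop page_table_lock around alloc_huge_page() (which may sleep in the buddy allocator) and retake it before touching the page tables, re-reading the PTE to catch racing updates. The general shape of that "unlock, do slow work, relock, revalidate" pattern, sketched with a pthread mutex and invented names:

/* Generic "drop the lock around slow work, then relock and revalidate"
 * pattern, analogous to the page_table_lock handling in the hugetlb COW
 * path. The pthread mutex and the shared counter are stand-ins. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int shared_generation;           /* bumped whenever state changes */

static void *slow_allocation(void)
{
        return malloc(4096);            /* may "sleep"; must not hold lock */
}

static void cow_like_update(void)
{
        pthread_mutex_lock(&lock);
        int seen = shared_generation;   /* snapshot the protected state */
        pthread_mutex_unlock(&lock);    /* drop the lock for the slow part */

        void *newbuf = slow_allocation();

        pthread_mutex_lock(&lock);      /* caller expects the lock held again */
        if (seen != shared_generation) {
                /* somebody raced with us: discard our work and bail */
                pthread_mutex_unlock(&lock);
                free(newbuf);
                return;
        }
        shared_generation++;            /* install the update */
        pthread_mutex_unlock(&lock);
        free(newbuf);
}

int main(void)
{
        cow_like_update();
        printf("generation = %d\n", shared_generation);
        return 0;
}
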
| @@ -2206,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2206 | entry = pte_mkyoung(entry); | 2558 | entry = pte_mkyoung(entry); |
| 2207 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 2559 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
| 2208 | flags & FAULT_FLAG_WRITE)) | 2560 | flags & FAULT_FLAG_WRITE)) |
| 2209 | update_mmu_cache(vma, address, entry); | 2561 | update_mmu_cache(vma, address, ptep); |
| 2210 | 2562 | ||
| 2211 | out_page_table_lock: | 2563 | out_page_table_lock: |
| 2212 | spin_unlock(&mm->page_table_lock); | 2564 | spin_unlock(&mm->page_table_lock); |
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index e1d85137f086..10ea71905c1f 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
| @@ -3,18 +3,68 @@ | |||
| 3 | #include <linux/debugfs.h> | 3 | #include <linux/debugfs.h> |
| 4 | #include <linux/kernel.h> | 4 | #include <linux/kernel.h> |
| 5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
| 6 | #include <linux/swap.h> | ||
| 7 | #include <linux/pagemap.h> | ||
| 8 | #include "internal.h" | ||
| 6 | 9 | ||
| 7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | 10 | static struct dentry *hwpoison_dir; |
| 8 | 11 | ||
| 9 | static int hwpoison_inject(void *data, u64 val) | 12 | static int hwpoison_inject(void *data, u64 val) |
| 10 | { | 13 | { |
| 14 | unsigned long pfn = val; | ||
| 15 | struct page *p; | ||
| 16 | int err; | ||
| 17 | |||
| 18 | if (!capable(CAP_SYS_ADMIN)) | ||
| 19 | return -EPERM; | ||
| 20 | |||
| 21 | if (!hwpoison_filter_enable) | ||
| 22 | goto inject; | ||
| 23 | if (!pfn_valid(pfn)) | ||
| 24 | return -ENXIO; | ||
| 25 | |||
| 26 | p = pfn_to_page(pfn); | ||
| 27 | /* | ||
| 28 | * This implies unable to support free buddy pages. | ||
| 29 | */ | ||
| 30 | if (!get_page_unless_zero(p)) | ||
| 31 | return 0; | ||
| 32 | |||
| 33 | if (!PageLRU(p)) | ||
| 34 | shake_page(p, 0); | ||
| 35 | /* | ||
| 36 | * This implies unable to support non-LRU pages. | ||
| 37 | */ | ||
| 38 | if (!PageLRU(p)) | ||
| 39 | return 0; | ||
| 40 | |||
| 41 | /* | ||
| 42 | * do a racy check with elevated page count, to make sure PG_hwpoison | ||
| 43 | * will only be set for the targeted owner (or on a free page). | ||
| 44 | * We temporarily take page lock for try_get_mem_cgroup_from_page(). | ||
| 45 | * __memory_failure() will redo the check reliably inside page lock. | ||
| 46 | */ | ||
| 47 | lock_page(p); | ||
| 48 | err = hwpoison_filter(p); | ||
| 49 | unlock_page(p); | ||
| 50 | if (err) | ||
| 51 | return 0; | ||
| 52 | |||
| 53 | inject: | ||
| 54 | printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); | ||
| 55 | return __memory_failure(pfn, 18, MF_COUNT_INCREASED); | ||
| 56 | } | ||
| 57 | |||
| 58 | static int hwpoison_unpoison(void *data, u64 val) | ||
| 59 | { | ||
| 11 | if (!capable(CAP_SYS_ADMIN)) | 60 | if (!capable(CAP_SYS_ADMIN)) |
| 12 | return -EPERM; | 61 | return -EPERM; |
| 13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | 62 | |
| 14 | return __memory_failure(val, 18, 0); | 63 | return unpoison_memory(val); |
| 15 | } | 64 | } |
| 16 | 65 | ||
| 17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | 66 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); |
| 67 | DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); | ||
| 18 | 68 | ||
| 19 | static void pfn_inject_exit(void) | 69 | static void pfn_inject_exit(void) |
| 20 | { | 70 | { |
| @@ -24,16 +74,63 @@ static void pfn_inject_exit(void) | |||
| 24 | 74 | ||
| 25 | static int pfn_inject_init(void) | 75 | static int pfn_inject_init(void) |
| 26 | { | 76 | { |
| 77 | struct dentry *dentry; | ||
| 78 | |||
| 27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | 79 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); |
| 28 | if (hwpoison_dir == NULL) | 80 | if (hwpoison_dir == NULL) |
| 29 | return -ENOMEM; | 81 | return -ENOMEM; |
| 30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | 82 | |
| 83 | /* | ||
| 84 | * Note that the below poison/unpoison interfaces do not involve | ||
| 85 | * hardware status change, hence do not require hardware support. | ||
| 86 | * They are mainly for testing hwpoison in software level. | ||
| 87 | */ | ||
| 88 | dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
| 31 | NULL, &hwpoison_fops); | 89 | NULL, &hwpoison_fops); |
| 32 | if (corrupt_pfn == NULL) { | 90 | if (!dentry) |
| 33 | pfn_inject_exit(); | 91 | goto fail; |
| 34 | return -ENOMEM; | 92 | |
| 35 | } | 93 | dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, |
| 94 | NULL, &unpoison_fops); | ||
| 95 | if (!dentry) | ||
| 96 | goto fail; | ||
| 97 | |||
| 98 | dentry = debugfs_create_u32("corrupt-filter-enable", 0600, | ||
| 99 | hwpoison_dir, &hwpoison_filter_enable); | ||
| 100 | if (!dentry) | ||
| 101 | goto fail; | ||
| 102 | |||
| 103 | dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, | ||
| 104 | hwpoison_dir, &hwpoison_filter_dev_major); | ||
| 105 | if (!dentry) | ||
| 106 | goto fail; | ||
| 107 | |||
| 108 | dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, | ||
| 109 | hwpoison_dir, &hwpoison_filter_dev_minor); | ||
| 110 | if (!dentry) | ||
| 111 | goto fail; | ||
| 112 | |||
| 113 | dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, | ||
| 114 | hwpoison_dir, &hwpoison_filter_flags_mask); | ||
| 115 | if (!dentry) | ||
| 116 | goto fail; | ||
| 117 | |||
| 118 | dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, | ||
| 119 | hwpoison_dir, &hwpoison_filter_flags_value); | ||
| 120 | if (!dentry) | ||
| 121 | goto fail; | ||
| 122 | |||
| 123 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
| 124 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | ||
| 125 | hwpoison_dir, &hwpoison_filter_memcg); | ||
| 126 | if (!dentry) | ||
| 127 | goto fail; | ||
| 128 | #endif | ||
| 129 | |||
| 36 | return 0; | 130 | return 0; |
| 131 | fail: | ||
| 132 | pfn_inject_exit(); | ||
| 133 | return -ENOMEM; | ||
| 37 | } | 134 | } |
| 38 | 135 | ||
| 39 | module_init(pfn_inject_init); | 136 | module_init(pfn_inject_init); |
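
With this patch, debugfs gains /sys/kernel/debug/hwpoison/ entries: corrupt-pfn, unpoison-pfn, and the corrupt-filter-* controls created above. A sketch of driving them from a test program; the debugfs mount point and the pfn value are assumptions, and poisoning a page is destructive, so this is for throwaway test machines only:

/* Sketch of exercising the hwpoison debugfs interface added above.
 * Assumes debugfs is mounted at /sys/kernel/debug and requires root;
 * the pfn below is a placeholder and poisoning it is destructive. */
#include <stdio.h>

static int write_u64(const char *path, unsigned long long val)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        fprintf(f, "%llu\n", val);
        return fclose(f);
}

int main(void)
{
        unsigned long long pfn = 0x12345;       /* placeholder pfn */

        /* only poison pages that pass the configured filters */
        write_u64("/sys/kernel/debug/hwpoison/corrupt-filter-enable", 1);
        /* inject a (software) memory failure at the chosen pfn */
        write_u64("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn);
        /* and undo it again via the new unpoison interface */
        write_u64("/sys/kernel/debug/hwpoison/unpoison-pfn", pfn);
        return 0;
}
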
diff --git a/mm/internal.h b/mm/internal.h index 22ec8d2b0fb8..6a697bb97fc5 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page); | |||
| 50 | */ | 50 | */ |
| 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
| 52 | extern void prep_compound_page(struct page *page, unsigned long order); | 52 | extern void prep_compound_page(struct page *page, unsigned long order); |
| 53 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 54 | extern bool is_free_buddy_page(struct page *page); | ||
| 55 | #endif | ||
| 53 | 56 | ||
| 54 | 57 | ||
| 55 | /* | 58 | /* |
| @@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page) | |||
| 63 | return page_private(page); | 66 | return page_private(page); |
| 64 | } | 67 | } |
| 65 | 68 | ||
| 66 | #ifdef CONFIG_HAVE_MLOCK | 69 | #ifdef CONFIG_MMU |
| 67 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, | 70 | extern long mlock_vma_pages_range(struct vm_area_struct *vma, |
| 68 | unsigned long start, unsigned long end); | 71 | unsigned long start, unsigned long end); |
| 69 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 72 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
| @@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
| 72 | { | 75 | { |
| 73 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); | 76 | munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); |
| 74 | } | 77 | } |
| 75 | #endif | ||
| 76 | 78 | ||
| 77 | /* | 79 | /* |
| 78 | * unevictable_migrate_page() called only from migrate_page_copy() to | ||
| 79 | * migrate unevictable flag to new page. | ||
| 80 | * Note that the old page has been isolated from the LRU lists at this | ||
| 81 | * point so we don't need to worry about LRU statistics. | ||
| 82 | */ | ||
| 83 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
| 84 | { | ||
| 85 | if (TestClearPageUnevictable(old)) | ||
| 86 | SetPageUnevictable(new); | ||
| 87 | } | ||
| 88 | |||
| 89 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
| 90 | /* | ||
| 91 | * Called only in fault path via page_evictable() for a new page | 80 | * Called only in fault path via page_evictable() for a new page |
| 92 | * to determine if it's being mapped into a LOCKED vma. | 81 | * to determine if it's being mapped into a LOCKED vma. |
| 93 | * If so, mark page as mlocked. | 82 | * If so, mark page as mlocked. |
| @@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) | |||
| 107 | } | 96 | } |
| 108 | 97 | ||
| 109 | /* | 98 | /* |
| 110 | * must be called with vma's mmap_sem held for read, and page locked. | 99 | * must be called with vma's mmap_sem held for read or write, and page locked. |
| 111 | */ | 100 | */ |
| 112 | extern void mlock_vma_page(struct page *page); | 101 | extern void mlock_vma_page(struct page *page); |
| 102 | extern void munlock_vma_page(struct page *page); | ||
| 113 | 103 | ||
| 114 | /* | 104 | /* |
| 115 | * Clear the page's PageMlocked(). This can be useful in a situation where | 105 | * Clear the page's PageMlocked(). This can be useful in a situation where |
| @@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
| 144 | } | 134 | } |
| 145 | } | 135 | } |
| 146 | 136 | ||
| 147 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 137 | #else /* !CONFIG_MMU */ |
| 148 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 138 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
| 149 | { | 139 | { |
| 150 | return 0; | 140 | return 0; |
| @@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { } | |||
| 153 | static inline void mlock_vma_page(struct page *page) { } | 143 | static inline void mlock_vma_page(struct page *page) { } |
| 154 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 144 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
| 155 | 145 | ||
| 156 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 146 | #endif /* !CONFIG_MMU */ |
| 157 | 147 | ||
| 158 | /* | 148 | /* |
| 159 | * Return the mem_map entry representing the 'offset' subpage within | 149 | * Return the mem_map entry representing the 'offset' subpage within |
| @@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 260 | #define ZONE_RECLAIM_SOME 0 | 250 | #define ZONE_RECLAIM_SOME 0 |
| 261 | #define ZONE_RECLAIM_SUCCESS 1 | 251 | #define ZONE_RECLAIM_SUCCESS 1 |
| 262 | #endif | 252 | #endif |
| 253 | |||
| 254 | extern int hwpoison_filter(struct page *p); | ||
| 255 | |||
| 256 | extern u32 hwpoison_filter_dev_major; | ||
| 257 | extern u32 hwpoison_filter_dev_minor; | ||
| 258 | extern u64 hwpoison_filter_flags_mask; | ||
| 259 | extern u64 hwpoison_filter_flags_value; | ||
| 260 | extern u64 hwpoison_filter_memcg; | ||
| 261 | extern u32 hwpoison_filter_enable; | ||
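The hwpoison_filter_* knobs declared above let hardware-poison handling be restricted to pages that match a given block device, a page-flags mask/value pair, or a memory cgroup. The flags test is a plain mask-and-compare; the stand-alone sketch below illustrates only that idea — the bit positions and the helper name are invented here and are not the kernel's implementation.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: a page passes the filter when the selected
 * flag bits (mask) carry exactly the expected values (value). */
static int flags_filter_match(uint64_t page_flags, uint64_t mask, uint64_t value)
{
        return (page_flags & mask) == value;
}

int main(void)
{
        uint64_t lru = 1ULL << 5, dirty = 1ULL << 4;    /* made-up bit positions */
        uint64_t page_flags = lru | dirty;

        /* only act on pages that are on the LRU and not dirty */
        uint64_t mask = lru | dirty, value = lru;

        printf("match: %d\n", flags_filter_match(page_flags, mask, value)); /* prints 0 */
        return 0;
}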
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8bf765c4f58d..5b069e4f5e48 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c | |||
| @@ -93,6 +93,7 @@ | |||
| 93 | #include <linux/nodemask.h> | 93 | #include <linux/nodemask.h> |
| 94 | #include <linux/mm.h> | 94 | #include <linux/mm.h> |
| 95 | #include <linux/workqueue.h> | 95 | #include <linux/workqueue.h> |
| 96 | #include <linux/crc32.h> | ||
| 96 | 97 | ||
| 97 | #include <asm/sections.h> | 98 | #include <asm/sections.h> |
| 98 | #include <asm/processor.h> | 99 | #include <asm/processor.h> |
| @@ -108,7 +109,6 @@ | |||
| 108 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | 109 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ |
| 109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | 110 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ |
| 110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | 111 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ |
| 111 | #define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ | ||
| 112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ | 112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ |
| 113 | 113 | ||
| 114 | #define BYTES_PER_POINTER sizeof(void *) | 114 | #define BYTES_PER_POINTER sizeof(void *) |
| @@ -119,8 +119,8 @@ | |||
| 119 | /* scanning area inside a memory block */ | 119 | /* scanning area inside a memory block */ |
| 120 | struct kmemleak_scan_area { | 120 | struct kmemleak_scan_area { |
| 121 | struct hlist_node node; | 121 | struct hlist_node node; |
| 122 | unsigned long offset; | 122 | unsigned long start; |
| 123 | size_t length; | 123 | size_t size; |
| 124 | }; | 124 | }; |
| 125 | 125 | ||
| 126 | #define KMEMLEAK_GREY 0 | 126 | #define KMEMLEAK_GREY 0 |
| @@ -149,6 +149,8 @@ struct kmemleak_object { | |||
| 149 | int min_count; | 149 | int min_count; |
| 150 | /* the total number of pointers found pointing to this object */ | 150 | /* the total number of pointers found pointing to this object */ |
| 151 | int count; | 151 | int count; |
| 152 | /* checksum for detecting modified objects */ | ||
| 153 | u32 checksum; | ||
| 152 | /* memory ranges to be scanned inside an object (empty for all) */ | 154 | /* memory ranges to be scanned inside an object (empty for all) */ |
| 153 | struct hlist_head area_list; | 155 | struct hlist_head area_list; |
| 154 | unsigned long trace[MAX_TRACE]; | 156 | unsigned long trace[MAX_TRACE]; |
| @@ -164,8 +166,6 @@ struct kmemleak_object { | |||
| 164 | #define OBJECT_REPORTED (1 << 1) | 166 | #define OBJECT_REPORTED (1 << 1) |
| 165 | /* flag set to not scan the object */ | 167 | /* flag set to not scan the object */ |
| 166 | #define OBJECT_NO_SCAN (1 << 2) | 168 | #define OBJECT_NO_SCAN (1 << 2) |
| 167 | /* flag set on newly allocated objects */ | ||
| 168 | #define OBJECT_NEW (1 << 3) | ||
| 169 | 169 | ||
| 170 | /* number of bytes to print per line; must be 16 or 32 */ | 170 | /* number of bytes to print per line; must be 16 or 32 */ |
| 171 | #define HEX_ROW_SIZE 16 | 171 | #define HEX_ROW_SIZE 16 |
| @@ -241,8 +241,6 @@ struct early_log { | |||
| 241 | const void *ptr; /* allocated/freed memory block */ | 241 | const void *ptr; /* allocated/freed memory block */ |
| 242 | size_t size; /* memory block size */ | 242 | size_t size; /* memory block size */ |
| 243 | int min_count; /* minimum reference count */ | 243 | int min_count; /* minimum reference count */ |
| 244 | unsigned long offset; /* scan area offset */ | ||
| 245 | size_t length; /* scan area length */ | ||
| 246 | unsigned long trace[MAX_TRACE]; /* stack trace */ | 244 | unsigned long trace[MAX_TRACE]; /* stack trace */ |
| 247 | unsigned int trace_len; /* stack trace length */ | 245 | unsigned int trace_len; /* stack trace length */ |
| 248 | }; | 246 | }; |
| @@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object) | |||
| 323 | object->count >= object->min_count; | 321 | object->count >= object->min_count; |
| 324 | } | 322 | } |
| 325 | 323 | ||
| 326 | static bool color_black(const struct kmemleak_object *object) | ||
| 327 | { | ||
| 328 | return object->min_count == KMEMLEAK_BLACK; | ||
| 329 | } | ||
| 330 | |||
| 331 | /* | 324 | /* |
| 332 | * Objects are considered unreferenced only if their color is white, they have | 325 | * Objects are considered unreferenced only if their color is white, they have |
| 333 | * not be deleted and have a minimum age to avoid false positives caused by | 326 | * not be deleted and have a minimum age to avoid false positives caused by |
| @@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object) | |||
| 335 | */ | 328 | */ |
| 336 | static bool unreferenced_object(struct kmemleak_object *object) | 329 | static bool unreferenced_object(struct kmemleak_object *object) |
| 337 | { | 330 | { |
| 338 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | 331 | return (color_white(object) && object->flags & OBJECT_ALLOCATED) && |
| 339 | time_before_eq(object->jiffies + jiffies_min_age, | 332 | time_before_eq(object->jiffies + jiffies_min_age, |
| 340 | jiffies_last_scan); | 333 | jiffies_last_scan); |
| 341 | } | 334 | } |
| @@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq, | |||
| 348 | struct kmemleak_object *object) | 341 | struct kmemleak_object *object) |
| 349 | { | 342 | { |
| 350 | int i; | 343 | int i; |
| 344 | unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); | ||
| 351 | 345 | ||
| 352 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", | 346 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", |
| 353 | object->pointer, object->size); | 347 | object->pointer, object->size); |
| 354 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", | 348 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", |
| 355 | object->comm, object->pid, object->jiffies); | 349 | object->comm, object->pid, object->jiffies, |
| 350 | msecs_age / 1000, msecs_age % 1000); | ||
| 356 | hex_dump_object(seq, object); | 351 | hex_dump_object(seq, object); |
| 357 | seq_printf(seq, " backtrace:\n"); | 352 | seq_printf(seq, " backtrace:\n"); |
| 358 | 353 | ||
| @@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object) | |||
| 381 | pr_notice(" min_count = %d\n", object->min_count); | 376 | pr_notice(" min_count = %d\n", object->min_count); |
| 382 | pr_notice(" count = %d\n", object->count); | 377 | pr_notice(" count = %d\n", object->count); |
| 383 | pr_notice(" flags = 0x%lx\n", object->flags); | 378 | pr_notice(" flags = 0x%lx\n", object->flags); |
| 379 | pr_notice(" checksum = %d\n", object->checksum); | ||
| 384 | pr_notice(" backtrace:\n"); | 380 | pr_notice(" backtrace:\n"); |
| 385 | print_stack_trace(&trace, 4); | 381 | print_stack_trace(&trace, 4); |
| 386 | } | 382 | } |
| @@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | |||
| 522 | INIT_HLIST_HEAD(&object->area_list); | 518 | INIT_HLIST_HEAD(&object->area_list); |
| 523 | spin_lock_init(&object->lock); | 519 | spin_lock_init(&object->lock); |
| 524 | atomic_set(&object->use_count, 1); | 520 | atomic_set(&object->use_count, 1); |
| 525 | object->flags = OBJECT_ALLOCATED | OBJECT_NEW; | 521 | object->flags = OBJECT_ALLOCATED; |
| 526 | object->pointer = ptr; | 522 | object->pointer = ptr; |
| 527 | object->size = size; | 523 | object->size = size; |
| 528 | object->min_count = min_count; | 524 | object->min_count = min_count; |
| 529 | object->count = -1; /* no color initially */ | 525 | object->count = 0; /* white color initially */ |
| 530 | object->jiffies = jiffies; | 526 | object->jiffies = jiffies; |
| 527 | object->checksum = 0; | ||
| 531 | 528 | ||
| 532 | /* task information */ | 529 | /* task information */ |
| 533 | if (in_irq()) { | 530 | if (in_irq()) { |
| @@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr) | |||
| 720 | * Add a scanning area to the object. If at least one such area is added, | 717 | * Add a scanning area to the object. If at least one such area is added, |
| 721 | * kmemleak will only scan these ranges rather than the whole memory block. | 718 | * kmemleak will only scan these ranges rather than the whole memory block. |
| 722 | */ | 719 | */ |
| 723 | static void add_scan_area(unsigned long ptr, unsigned long offset, | 720 | static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) |
| 724 | size_t length, gfp_t gfp) | ||
| 725 | { | 721 | { |
| 726 | unsigned long flags; | 722 | unsigned long flags; |
| 727 | struct kmemleak_object *object; | 723 | struct kmemleak_object *object; |
| 728 | struct kmemleak_scan_area *area; | 724 | struct kmemleak_scan_area *area; |
| 729 | 725 | ||
| 730 | object = find_and_get_object(ptr, 0); | 726 | object = find_and_get_object(ptr, 1); |
| 731 | if (!object) { | 727 | if (!object) { |
| 732 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", | 728 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", |
| 733 | ptr); | 729 | ptr); |
| @@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
| 741 | } | 737 | } |
| 742 | 738 | ||
| 743 | spin_lock_irqsave(&object->lock, flags); | 739 | spin_lock_irqsave(&object->lock, flags); |
| 744 | if (offset + length > object->size) { | 740 | if (ptr + size > object->pointer + object->size) { |
| 745 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | 741 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); |
| 746 | dump_object_info(object); | 742 | dump_object_info(object); |
| 747 | kmem_cache_free(scan_area_cache, area); | 743 | kmem_cache_free(scan_area_cache, area); |
| @@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset, | |||
| 749 | } | 745 | } |
| 750 | 746 | ||
| 751 | INIT_HLIST_NODE(&area->node); | 747 | INIT_HLIST_NODE(&area->node); |
| 752 | area->offset = offset; | 748 | area->start = ptr; |
| 753 | area->length = length; | 749 | area->size = size; |
| 754 | 750 | ||
| 755 | hlist_add_head(&area->node, &object->area_list); | 751 | hlist_add_head(&area->node, &object->area_list); |
| 756 | out_unlock: | 752 | out_unlock: |
| @@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr) | |||
| 786 | * processed later once kmemleak is fully initialized. | 782 | * processed later once kmemleak is fully initialized. |
| 787 | */ | 783 | */ |
| 788 | static void __init log_early(int op_type, const void *ptr, size_t size, | 784 | static void __init log_early(int op_type, const void *ptr, size_t size, |
| 789 | int min_count, unsigned long offset, size_t length) | 785 | int min_count) |
| 790 | { | 786 | { |
| 791 | unsigned long flags; | 787 | unsigned long flags; |
| 792 | struct early_log *log; | 788 | struct early_log *log; |
| @@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size, | |||
| 808 | log->ptr = ptr; | 804 | log->ptr = ptr; |
| 809 | log->size = size; | 805 | log->size = size; |
| 810 | log->min_count = min_count; | 806 | log->min_count = min_count; |
| 811 | log->offset = offset; | ||
| 812 | log->length = length; | ||
| 813 | if (op_type == KMEMLEAK_ALLOC) | 807 | if (op_type == KMEMLEAK_ALLOC) |
| 814 | log->trace_len = __save_stack_trace(log->trace); | 808 | log->trace_len = __save_stack_trace(log->trace); |
| 815 | crt_early_log++; | 809 | crt_early_log++; |
| @@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, | |||
| 858 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 852 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
| 859 | create_object((unsigned long)ptr, size, min_count, gfp); | 853 | create_object((unsigned long)ptr, size, min_count, gfp); |
| 860 | else if (atomic_read(&kmemleak_early_log)) | 854 | else if (atomic_read(&kmemleak_early_log)) |
| 861 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | 855 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count); |
| 862 | } | 856 | } |
| 863 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | 857 | EXPORT_SYMBOL_GPL(kmemleak_alloc); |
| 864 | 858 | ||
| @@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr) | |||
| 873 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 867 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
| 874 | delete_object_full((unsigned long)ptr); | 868 | delete_object_full((unsigned long)ptr); |
| 875 | else if (atomic_read(&kmemleak_early_log)) | 869 | else if (atomic_read(&kmemleak_early_log)) |
| 876 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | 870 | log_early(KMEMLEAK_FREE, ptr, 0, 0); |
| 877 | } | 871 | } |
| 878 | EXPORT_SYMBOL_GPL(kmemleak_free); | 872 | EXPORT_SYMBOL_GPL(kmemleak_free); |
| 879 | 873 | ||
| @@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size) | |||
| 888 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 882 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
| 889 | delete_object_part((unsigned long)ptr, size); | 883 | delete_object_part((unsigned long)ptr, size); |
| 890 | else if (atomic_read(&kmemleak_early_log)) | 884 | else if (atomic_read(&kmemleak_early_log)) |
| 891 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); | 885 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0); |
| 892 | } | 886 | } |
| 893 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | 887 | EXPORT_SYMBOL_GPL(kmemleak_free_part); |
| 894 | 888 | ||
| @@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr) | |||
| 903 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 897 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
| 904 | make_gray_object((unsigned long)ptr); | 898 | make_gray_object((unsigned long)ptr); |
| 905 | else if (atomic_read(&kmemleak_early_log)) | 899 | else if (atomic_read(&kmemleak_early_log)) |
| 906 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | 900 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); |
| 907 | } | 901 | } |
| 908 | EXPORT_SYMBOL(kmemleak_not_leak); | 902 | EXPORT_SYMBOL(kmemleak_not_leak); |
| 909 | 903 | ||
| @@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr) | |||
| 919 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 913 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
| 920 | make_black_object((unsigned long)ptr); | 914 | make_black_object((unsigned long)ptr); |
| 921 | else if (atomic_read(&kmemleak_early_log)) | 915 | else if (atomic_read(&kmemleak_early_log)) |
| 922 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | 916 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0); |
| 923 | } | 917 | } |
| 924 | EXPORT_SYMBOL(kmemleak_ignore); | 918 | EXPORT_SYMBOL(kmemleak_ignore); |
| 925 | 919 | ||
| 926 | /* | 920 | /* |
| 927 | * Limit the range to be scanned in an allocated memory block. | 921 | * Limit the range to be scanned in an allocated memory block. |
| 928 | */ | 922 | */ |
| 929 | void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, | 923 | void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) |
| 930 | size_t length, gfp_t gfp) | ||
| 931 | { | 924 | { |
| 932 | pr_debug("%s(0x%p)\n", __func__, ptr); | 925 | pr_debug("%s(0x%p)\n", __func__, ptr); |
| 933 | 926 | ||
| 934 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 927 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
| 935 | add_scan_area((unsigned long)ptr, offset, length, gfp); | 928 | add_scan_area((unsigned long)ptr, size, gfp); |
| 936 | else if (atomic_read(&kmemleak_early_log)) | 929 | else if (atomic_read(&kmemleak_early_log)) |
| 937 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | 930 | log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); |
| 938 | } | 931 | } |
| 939 | EXPORT_SYMBOL(kmemleak_scan_area); | 932 | EXPORT_SYMBOL(kmemleak_scan_area); |
| 940 | 933 | ||
| @@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr) | |||
| 948 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | 941 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) |
| 949 | object_no_scan((unsigned long)ptr); | 942 | object_no_scan((unsigned long)ptr); |
| 950 | else if (atomic_read(&kmemleak_early_log)) | 943 | else if (atomic_read(&kmemleak_early_log)) |
| 951 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | 944 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); |
| 952 | } | 945 | } |
| 953 | EXPORT_SYMBOL(kmemleak_no_scan); | 946 | EXPORT_SYMBOL(kmemleak_no_scan); |
| 954 | 947 | ||
| 955 | /* | 948 | /* |
| 949 | * Update an object's checksum and return true if it was modified. | ||
| 950 | */ | ||
| 951 | static bool update_checksum(struct kmemleak_object *object) | ||
| 952 | { | ||
| 953 | u32 old_csum = object->checksum; | ||
| 954 | |||
| 955 | if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) | ||
| 956 | return false; | ||
| 957 | |||
| 958 | object->checksum = crc32(0, (void *)object->pointer, object->size); | ||
| 959 | return object->checksum != old_csum; | ||
| 960 | } | ||
| 961 | |||
| 962 | /* | ||
| 956 | * Memory scanning is a long process and it needs to be interruptable. This | 963 | * Memory scanning is a long process and it needs to be interruptable. This |
| 957 | * function checks whether such interrupt condition occured. | 964 | * function checks whether such interrupt condition occured. |
| 958 | */ | 965 | */ |
| @@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end, | |||
| 1031 | * added to the gray_list. | 1038 | * added to the gray_list. |
| 1032 | */ | 1039 | */ |
| 1033 | object->count++; | 1040 | object->count++; |
| 1034 | if (color_gray(object)) | 1041 | if (color_gray(object)) { |
| 1035 | list_add_tail(&object->gray_list, &gray_list); | 1042 | list_add_tail(&object->gray_list, &gray_list); |
| 1036 | else | 1043 | spin_unlock_irqrestore(&object->lock, flags); |
| 1037 | put_object(object); | 1044 | continue; |
| 1045 | } | ||
| 1046 | |||
| 1038 | spin_unlock_irqrestore(&object->lock, flags); | 1047 | spin_unlock_irqrestore(&object->lock, flags); |
| 1048 | put_object(object); | ||
| 1039 | } | 1049 | } |
| 1040 | } | 1050 | } |
| 1041 | 1051 | ||
| @@ -1050,8 +1060,8 @@ static void scan_object(struct kmemleak_object *object) | |||
| 1050 | unsigned long flags; | 1060 | unsigned long flags; |
| 1051 | 1061 | ||
| 1052 | /* | 1062 | /* |
| 1053 | * Once the object->lock is aquired, the corresponding memory block | 1063 | * Once the object->lock is acquired, the corresponding memory block |
| 1054 | * cannot be freed (the same lock is aquired in delete_object). | 1064 | * cannot be freed (the same lock is acquired in delete_object). |
| 1055 | */ | 1065 | */ |
| 1056 | spin_lock_irqsave(&object->lock, flags); | 1066 | spin_lock_irqsave(&object->lock, flags); |
| 1057 | if (object->flags & OBJECT_NO_SCAN) | 1067 | if (object->flags & OBJECT_NO_SCAN) |
| @@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object) | |||
| 1075 | } | 1085 | } |
| 1076 | } else | 1086 | } else |
| 1077 | hlist_for_each_entry(area, elem, &object->area_list, node) | 1087 | hlist_for_each_entry(area, elem, &object->area_list, node) |
| 1078 | scan_block((void *)(object->pointer + area->offset), | 1088 | scan_block((void *)area->start, |
| 1079 | (void *)(object->pointer + area->offset | 1089 | (void *)(area->start + area->size), |
| 1080 | + area->length), object, 0); | 1090 | object, 0); |
| 1081 | out: | 1091 | out: |
| 1082 | spin_unlock_irqrestore(&object->lock, flags); | 1092 | spin_unlock_irqrestore(&object->lock, flags); |
| 1083 | } | 1093 | } |
| 1084 | 1094 | ||
| 1085 | /* | 1095 | /* |
| 1096 | * Scan the objects already referenced (gray objects). More objects will be | ||
| 1097 | * referenced and, if there are no memory leaks, all the objects are scanned. | ||
| 1098 | */ | ||
| 1099 | static void scan_gray_list(void) | ||
| 1100 | { | ||
| 1101 | struct kmemleak_object *object, *tmp; | ||
| 1102 | |||
| 1103 | /* | ||
| 1104 | * The list traversal is safe for both tail additions and removals | ||
| 1105 | * from inside the loop. The kmemleak objects cannot be freed from | ||
| 1106 | * outside the loop because their use_count was incremented. | ||
| 1107 | */ | ||
| 1108 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
| 1109 | while (&object->gray_list != &gray_list) { | ||
| 1110 | cond_resched(); | ||
| 1111 | |||
| 1112 | /* may add new objects to the list */ | ||
| 1113 | if (!scan_should_stop()) | ||
| 1114 | scan_object(object); | ||
| 1115 | |||
| 1116 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
| 1117 | gray_list); | ||
| 1118 | |||
| 1119 | /* remove the object from the list and release it */ | ||
| 1120 | list_del(&object->gray_list); | ||
| 1121 | put_object(object); | ||
| 1122 | |||
| 1123 | object = tmp; | ||
| 1124 | } | ||
| 1125 | WARN_ON(!list_empty(&gray_list)); | ||
| 1126 | } | ||
| 1127 | |||
| 1128 | /* | ||
| 1086 | * Scan data sections and all the referenced memory blocks allocated via the | 1129 | * Scan data sections and all the referenced memory blocks allocated via the |
| 1087 | * kernel's standard allocators. This function must be called with the | 1130 | * kernel's standard allocators. This function must be called with the |
| 1088 | * scan_mutex held. | 1131 | * scan_mutex held. |
| @@ -1090,10 +1133,9 @@ out: | |||
| 1090 | static void kmemleak_scan(void) | 1133 | static void kmemleak_scan(void) |
| 1091 | { | 1134 | { |
| 1092 | unsigned long flags; | 1135 | unsigned long flags; |
| 1093 | struct kmemleak_object *object, *tmp; | 1136 | struct kmemleak_object *object; |
| 1094 | int i; | 1137 | int i; |
| 1095 | int new_leaks = 0; | 1138 | int new_leaks = 0; |
| 1096 | int gray_list_pass = 0; | ||
| 1097 | 1139 | ||
| 1098 | jiffies_last_scan = jiffies; | 1140 | jiffies_last_scan = jiffies; |
| 1099 | 1141 | ||
| @@ -1114,7 +1156,6 @@ static void kmemleak_scan(void) | |||
| 1114 | #endif | 1156 | #endif |
| 1115 | /* reset the reference count (whiten the object) */ | 1157 | /* reset the reference count (whiten the object) */ |
| 1116 | object->count = 0; | 1158 | object->count = 0; |
| 1117 | object->flags &= ~OBJECT_NEW; | ||
| 1118 | if (color_gray(object) && get_object(object)) | 1159 | if (color_gray(object) && get_object(object)) |
| 1119 | list_add_tail(&object->gray_list, &gray_list); | 1160 | list_add_tail(&object->gray_list, &gray_list); |
| 1120 | 1161 | ||
| @@ -1172,62 +1213,36 @@ static void kmemleak_scan(void) | |||
| 1172 | 1213 | ||
| 1173 | /* | 1214 | /* |
| 1174 | * Scan the objects already referenced from the sections scanned | 1215 | * Scan the objects already referenced from the sections scanned |
| 1175 | * above. More objects will be referenced and, if there are no memory | 1216 | * above. |
| 1176 | * leaks, all the objects will be scanned. The list traversal is safe | ||
| 1177 | * for both tail additions and removals from inside the loop. The | ||
| 1178 | * kmemleak objects cannot be freed from outside the loop because their | ||
| 1179 | * use_count was increased. | ||
| 1180 | */ | 1217 | */ |
| 1181 | repeat: | 1218 | scan_gray_list(); |
| 1182 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
| 1183 | while (&object->gray_list != &gray_list) { | ||
| 1184 | cond_resched(); | ||
| 1185 | |||
| 1186 | /* may add new objects to the list */ | ||
| 1187 | if (!scan_should_stop()) | ||
| 1188 | scan_object(object); | ||
| 1189 | |||
| 1190 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
| 1191 | gray_list); | ||
| 1192 | |||
| 1193 | /* remove the object from the list and release it */ | ||
| 1194 | list_del(&object->gray_list); | ||
| 1195 | put_object(object); | ||
| 1196 | |||
| 1197 | object = tmp; | ||
| 1198 | } | ||
| 1199 | |||
| 1200 | if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) | ||
| 1201 | goto scan_end; | ||
| 1202 | 1219 | ||
| 1203 | /* | 1220 | /* |
| 1204 | * Check for new objects allocated during this scanning and add them | 1221 | * Check for new or unreferenced objects modified since the previous |
| 1205 | * to the gray list. | 1222 | * scan and color them gray until the next scan. |
| 1206 | */ | 1223 | */ |
| 1207 | rcu_read_lock(); | 1224 | rcu_read_lock(); |
| 1208 | list_for_each_entry_rcu(object, &object_list, object_list) { | 1225 | list_for_each_entry_rcu(object, &object_list, object_list) { |
| 1209 | spin_lock_irqsave(&object->lock, flags); | 1226 | spin_lock_irqsave(&object->lock, flags); |
| 1210 | if ((object->flags & OBJECT_NEW) && !color_black(object) && | 1227 | if (color_white(object) && (object->flags & OBJECT_ALLOCATED) |
| 1211 | get_object(object)) { | 1228 | && update_checksum(object) && get_object(object)) { |
| 1212 | object->flags &= ~OBJECT_NEW; | 1229 | /* color it gray temporarily */ |
| 1230 | object->count = object->min_count; | ||
| 1213 | list_add_tail(&object->gray_list, &gray_list); | 1231 | list_add_tail(&object->gray_list, &gray_list); |
| 1214 | } | 1232 | } |
| 1215 | spin_unlock_irqrestore(&object->lock, flags); | 1233 | spin_unlock_irqrestore(&object->lock, flags); |
| 1216 | } | 1234 | } |
| 1217 | rcu_read_unlock(); | 1235 | rcu_read_unlock(); |
| 1218 | 1236 | ||
| 1219 | if (!list_empty(&gray_list)) | 1237 | /* |
| 1220 | goto repeat; | 1238 | * Re-scan the gray list for modified unreferenced objects. |
| 1221 | 1239 | */ | |
| 1222 | scan_end: | 1240 | scan_gray_list(); |
| 1223 | WARN_ON(!list_empty(&gray_list)); | ||
| 1224 | 1241 | ||
| 1225 | /* | 1242 | /* |
| 1226 | * If scanning was stopped or new objects were being allocated at a | 1243 | * If scanning was stopped do not report any new unreferenced objects. |
| 1227 | * higher rate than gray list scanning, do not report any new | ||
| 1228 | * unreferenced objects. | ||
| 1229 | */ | 1244 | */ |
| 1230 | if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) | 1245 | if (scan_should_stop()) |
| 1231 | return; | 1246 | return; |
| 1232 | 1247 | ||
| 1233 | /* | 1248 | /* |
| @@ -1642,8 +1657,7 @@ void __init kmemleak_init(void) | |||
| 1642 | kmemleak_ignore(log->ptr); | 1657 | kmemleak_ignore(log->ptr); |
| 1643 | break; | 1658 | break; |
| 1644 | case KMEMLEAK_SCAN_AREA: | 1659 | case KMEMLEAK_SCAN_AREA: |
| 1645 | kmemleak_scan_area(log->ptr, log->offset, log->length, | 1660 | kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); |
| 1646 | GFP_KERNEL); | ||
| 1647 | break; | 1661 | break; |
| 1648 | case KMEMLEAK_NO_SCAN: | 1662 | case KMEMLEAK_NO_SCAN: |
| 1649 | kmemleak_no_scan(log->ptr); | 1663 | kmemleak_no_scan(log->ptr); |
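With OBJECT_NEW and the bounded GRAY_LIST_PASSES loop gone, kmemleak instead defers reporting of any white object whose contents changed between scans: update_checksum() notices the modification and the object is colored gray for one more pass. Below is a rough user-space sketch of that report-or-defer decision, using zlib's crc32() as a stand-in for the kernel's crc32() (build with -lz); the structure and function names are illustrative, not kmemleak's API.

#include <stdio.h>
#include <string.h>
#include <zlib.h>

struct tracked {
        unsigned char data[32];
        uLong checksum;          /* checksum recorded by the previous scan */
        int referenced;          /* was a pointer to it found this scan? */
};

/* Returns 1 when the block should be reported as a leak now,
 * 0 when it should instead be rescanned on the next pass. */
static int report_now(struct tracked *obj)
{
        uLong csum = crc32(0L, obj->data, sizeof(obj->data));
        int modified = (csum != obj->checksum);

        obj->checksum = csum;
        if (obj->referenced)
                return 0;        /* gray: not a leak candidate */
        return !modified;        /* a changed block gets one more scan */
}

int main(void)
{
        struct tracked obj = { .referenced = 0 };

        memset(obj.data, 0, sizeof(obj.data));
        printf("first scan:  report=%d\n", report_now(&obj));  /* deferred */
        printf("second scan: report=%d\n", report_now(&obj));  /* reported */
        return 0;
}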
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
| @@ -29,11 +29,13 @@ | |||
| 29 | #include <linux/wait.h> | 29 | #include <linux/wait.h> |
| 30 | #include <linux/slab.h> | 30 | #include <linux/slab.h> |
| 31 | #include <linux/rbtree.h> | 31 | #include <linux/rbtree.h> |
| 32 | #include <linux/memory.h> | ||
| 32 | #include <linux/mmu_notifier.h> | 33 | #include <linux/mmu_notifier.h> |
| 33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
| 34 | #include <linux/ksm.h> | 35 | #include <linux/ksm.h> |
| 35 | 36 | ||
| 36 | #include <asm/tlbflush.h> | 37 | #include <asm/tlbflush.h> |
| 38 | #include "internal.h" | ||
| 37 | 39 | ||
| 38 | /* | 40 | /* |
| 39 | * A few notes about the KSM scanning process, | 41 | * A few notes about the KSM scanning process, |
| @@ -79,13 +81,13 @@ | |||
| 79 | * struct mm_slot - ksm information per mm that is being scanned | 81 | * struct mm_slot - ksm information per mm that is being scanned |
| 80 | * @link: link to the mm_slots hash list | 82 | * @link: link to the mm_slots hash list |
| 81 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head | 83 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head |
| 82 | * @rmap_list: head for this mm_slot's list of rmap_items | 84 | * @rmap_list: head for this mm_slot's singly-linked list of rmap_items |
| 83 | * @mm: the mm that this information is valid for | 85 | * @mm: the mm that this information is valid for |
| 84 | */ | 86 | */ |
| 85 | struct mm_slot { | 87 | struct mm_slot { |
| 86 | struct hlist_node link; | 88 | struct hlist_node link; |
| 87 | struct list_head mm_list; | 89 | struct list_head mm_list; |
| 88 | struct list_head rmap_list; | 90 | struct rmap_item *rmap_list; |
| 89 | struct mm_struct *mm; | 91 | struct mm_struct *mm; |
| 90 | }; | 92 | }; |
| 91 | 93 | ||
| @@ -93,7 +95,7 @@ struct mm_slot { | |||
| 93 | * struct ksm_scan - cursor for scanning | 95 | * struct ksm_scan - cursor for scanning |
| 94 | * @mm_slot: the current mm_slot we are scanning | 96 | * @mm_slot: the current mm_slot we are scanning |
| 95 | * @address: the next address inside that to be scanned | 97 | * @address: the next address inside that to be scanned |
| 96 | * @rmap_item: the current rmap that we are scanning inside the rmap_list | 98 | * @rmap_list: link to the next rmap to be scanned in the rmap_list |
| 97 | * @seqnr: count of completed full scans (needed when removing unstable node) | 99 | * @seqnr: count of completed full scans (needed when removing unstable node) |
| 98 | * | 100 | * |
| 99 | * There is only the one ksm_scan instance of this cursor structure. | 101 | * There is only the one ksm_scan instance of this cursor structure. |
| @@ -101,37 +103,51 @@ struct mm_slot { | |||
| 101 | struct ksm_scan { | 103 | struct ksm_scan { |
| 102 | struct mm_slot *mm_slot; | 104 | struct mm_slot *mm_slot; |
| 103 | unsigned long address; | 105 | unsigned long address; |
| 104 | struct rmap_item *rmap_item; | 106 | struct rmap_item **rmap_list; |
| 105 | unsigned long seqnr; | 107 | unsigned long seqnr; |
| 106 | }; | 108 | }; |
| 107 | 109 | ||
| 108 | /** | 110 | /** |
| 111 | * struct stable_node - node of the stable rbtree | ||
| 112 | * @node: rb node of this ksm page in the stable tree | ||
| 113 | * @hlist: hlist head of rmap_items using this ksm page | ||
| 114 | * @kpfn: page frame number of this ksm page | ||
| 115 | */ | ||
| 116 | struct stable_node { | ||
| 117 | struct rb_node node; | ||
| 118 | struct hlist_head hlist; | ||
| 119 | unsigned long kpfn; | ||
| 120 | }; | ||
| 121 | |||
| 122 | /** | ||
| 109 | * struct rmap_item - reverse mapping item for virtual addresses | 123 | * struct rmap_item - reverse mapping item for virtual addresses |
| 110 | * @link: link into mm_slot's rmap_list (rmap_list is per mm) | 124 | * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list |
| 125 | * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree | ||
| 111 | * @mm: the memory structure this rmap_item is pointing into | 126 | * @mm: the memory structure this rmap_item is pointing into |
| 112 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | 127 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) |
| 113 | * @oldchecksum: previous checksum of the page at that virtual address | 128 | * @oldchecksum: previous checksum of the page at that virtual address |
| 114 | * @node: rb_node of this rmap_item in either unstable or stable tree | 129 | * @node: rb node of this rmap_item in the unstable tree |
| 115 | * @next: next rmap_item hanging off the same node of the stable tree | 130 | * @head: pointer to stable_node heading this list in the stable tree |
| 116 | * @prev: previous rmap_item hanging off the same node of the stable tree | 131 | * @hlist: link into hlist of rmap_items hanging off that stable_node |
| 117 | */ | 132 | */ |
| 118 | struct rmap_item { | 133 | struct rmap_item { |
| 119 | struct list_head link; | 134 | struct rmap_item *rmap_list; |
| 135 | struct anon_vma *anon_vma; /* when stable */ | ||
| 120 | struct mm_struct *mm; | 136 | struct mm_struct *mm; |
| 121 | unsigned long address; /* + low bits used for flags below */ | 137 | unsigned long address; /* + low bits used for flags below */ |
| 138 | unsigned int oldchecksum; /* when unstable */ | ||
| 122 | union { | 139 | union { |
| 123 | unsigned int oldchecksum; /* when unstable */ | 140 | struct rb_node node; /* when node of unstable tree */ |
| 124 | struct rmap_item *next; /* when stable */ | 141 | struct { /* when listed from stable tree */ |
| 125 | }; | 142 | struct stable_node *head; |
| 126 | union { | 143 | struct hlist_node hlist; |
| 127 | struct rb_node node; /* when tree node */ | 144 | }; |
| 128 | struct rmap_item *prev; /* in stable list */ | ||
| 129 | }; | 145 | }; |
| 130 | }; | 146 | }; |
| 131 | 147 | ||
| 132 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ | 148 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ |
| 133 | #define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ | 149 | #define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ |
| 134 | #define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ | 150 | #define STABLE_FLAG 0x200 /* is listed from the stable tree */ |
| 135 | 151 | ||
| 136 | /* The stable and unstable tree heads */ | 152 | /* The stable and unstable tree heads */ |
| 137 | static struct rb_root root_stable_tree = RB_ROOT; | 153 | static struct rb_root root_stable_tree = RB_ROOT; |
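As the UNSTABLE_FLAG/STABLE_FLAG/SEQNR_MASK definitions above show, rmap_item reuses the low bits of its page-aligned address field to record which tree the item is on and the scan sequence number it was added under. The stand-alone sketch below shows only that low-bit tagging trick; the constants and values are invented for the example.

#include <stdio.h>

#define EX_PAGE_SIZE   4096UL
#define EX_PAGE_MASK   (~(EX_PAGE_SIZE - 1))
#define EX_SEQNR_MASK  0x0ffUL   /* low bits reused for a sequence number */
#define EX_UNSTABLE    0x100UL
#define EX_STABLE      0x200UL

int main(void)
{
        unsigned long address = 0x7f0000410000UL;   /* page aligned */
        unsigned long seqnr = 3;

        /* joining the unstable tree during scan number 'seqnr' */
        address |= EX_UNSTABLE | (seqnr & EX_SEQNR_MASK);

        printf("in unstable tree: %d\n", !!(address & EX_UNSTABLE));
        printf("seqnr: %lu\n", address & EX_SEQNR_MASK);
        printf("virtual address: %#lx\n", address & EX_PAGE_MASK);
        return 0;
}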
| @@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = { | |||
| 148 | }; | 164 | }; |
| 149 | 165 | ||
| 150 | static struct kmem_cache *rmap_item_cache; | 166 | static struct kmem_cache *rmap_item_cache; |
| 167 | static struct kmem_cache *stable_node_cache; | ||
| 151 | static struct kmem_cache *mm_slot_cache; | 168 | static struct kmem_cache *mm_slot_cache; |
| 152 | 169 | ||
| 153 | /* The number of nodes in the stable tree */ | 170 | /* The number of nodes in the stable tree */ |
| @@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared; | |||
| 162 | /* The number of rmap_items in use: to calculate pages_volatile */ | 179 | /* The number of rmap_items in use: to calculate pages_volatile */ |
| 163 | static unsigned long ksm_rmap_items; | 180 | static unsigned long ksm_rmap_items; |
| 164 | 181 | ||
| 165 | /* Limit on the number of unswappable pages used */ | ||
| 166 | static unsigned long ksm_max_kernel_pages; | ||
| 167 | |||
| 168 | /* Number of pages ksmd should scan in one batch */ | 182 | /* Number of pages ksmd should scan in one batch */ |
| 169 | static unsigned int ksm_thread_pages_to_scan = 100; | 183 | static unsigned int ksm_thread_pages_to_scan = 100; |
| 170 | 184 | ||
| @@ -190,13 +204,19 @@ static int __init ksm_slab_init(void) | |||
| 190 | if (!rmap_item_cache) | 204 | if (!rmap_item_cache) |
| 191 | goto out; | 205 | goto out; |
| 192 | 206 | ||
| 207 | stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); | ||
| 208 | if (!stable_node_cache) | ||
| 209 | goto out_free1; | ||
| 210 | |||
| 193 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); | 211 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); |
| 194 | if (!mm_slot_cache) | 212 | if (!mm_slot_cache) |
| 195 | goto out_free; | 213 | goto out_free2; |
| 196 | 214 | ||
| 197 | return 0; | 215 | return 0; |
| 198 | 216 | ||
| 199 | out_free: | 217 | out_free2: |
| 218 | kmem_cache_destroy(stable_node_cache); | ||
| 219 | out_free1: | ||
| 200 | kmem_cache_destroy(rmap_item_cache); | 220 | kmem_cache_destroy(rmap_item_cache); |
| 201 | out: | 221 | out: |
| 202 | return -ENOMEM; | 222 | return -ENOMEM; |
| @@ -205,6 +225,7 @@ out: | |||
| 205 | static void __init ksm_slab_free(void) | 225 | static void __init ksm_slab_free(void) |
| 206 | { | 226 | { |
| 207 | kmem_cache_destroy(mm_slot_cache); | 227 | kmem_cache_destroy(mm_slot_cache); |
| 228 | kmem_cache_destroy(stable_node_cache); | ||
| 208 | kmem_cache_destroy(rmap_item_cache); | 229 | kmem_cache_destroy(rmap_item_cache); |
| 209 | mm_slot_cache = NULL; | 230 | mm_slot_cache = NULL; |
| 210 | } | 231 | } |
| @@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) | |||
| 226 | kmem_cache_free(rmap_item_cache, rmap_item); | 247 | kmem_cache_free(rmap_item_cache, rmap_item); |
| 227 | } | 248 | } |
| 228 | 249 | ||
| 250 | static inline struct stable_node *alloc_stable_node(void) | ||
| 251 | { | ||
| 252 | return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); | ||
| 253 | } | ||
| 254 | |||
| 255 | static inline void free_stable_node(struct stable_node *stable_node) | ||
| 256 | { | ||
| 257 | kmem_cache_free(stable_node_cache, stable_node); | ||
| 258 | } | ||
| 259 | |||
| 229 | static inline struct mm_slot *alloc_mm_slot(void) | 260 | static inline struct mm_slot *alloc_mm_slot(void) |
| 230 | { | 261 | { |
| 231 | if (!mm_slot_cache) /* initialization failed */ | 262 | if (!mm_slot_cache) /* initialization failed */ |
| @@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, | |||
| 275 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | 306 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) |
| 276 | % MM_SLOTS_HASH_HEADS]; | 307 | % MM_SLOTS_HASH_HEADS]; |
| 277 | mm_slot->mm = mm; | 308 | mm_slot->mm = mm; |
| 278 | INIT_LIST_HEAD(&mm_slot->rmap_list); | ||
| 279 | hlist_add_head(&mm_slot->link, bucket); | 309 | hlist_add_head(&mm_slot->link, bucket); |
| 280 | } | 310 | } |
| 281 | 311 | ||
| @@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item) | |||
| 284 | return rmap_item->address & STABLE_FLAG; | 314 | return rmap_item->address & STABLE_FLAG; |
| 285 | } | 315 | } |
| 286 | 316 | ||
| 317 | static void hold_anon_vma(struct rmap_item *rmap_item, | ||
| 318 | struct anon_vma *anon_vma) | ||
| 319 | { | ||
| 320 | rmap_item->anon_vma = anon_vma; | ||
| 321 | atomic_inc(&anon_vma->ksm_refcount); | ||
| 322 | } | ||
| 323 | |||
| 324 | static void drop_anon_vma(struct rmap_item *rmap_item) | ||
| 325 | { | ||
| 326 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
| 327 | |||
| 328 | if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) { | ||
| 329 | int empty = list_empty(&anon_vma->head); | ||
| 330 | spin_unlock(&anon_vma->lock); | ||
| 331 | if (empty) | ||
| 332 | anon_vma_free(anon_vma); | ||
| 333 | } | ||
| 334 | } | ||
| 335 | |||
| 287 | /* | 336 | /* |
| 288 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | 337 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's |
| 289 | * page tables after it has passed through ksm_exit() - which, if necessary, | 338 | * page tables after it has passed through ksm_exit() - which, if necessary, |
| @@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
| 356 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | 405 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; |
| 357 | } | 406 | } |
| 358 | 407 | ||
| 359 | static void break_cow(struct mm_struct *mm, unsigned long addr) | 408 | static void break_cow(struct rmap_item *rmap_item) |
| 360 | { | 409 | { |
| 410 | struct mm_struct *mm = rmap_item->mm; | ||
| 411 | unsigned long addr = rmap_item->address; | ||
| 361 | struct vm_area_struct *vma; | 412 | struct vm_area_struct *vma; |
| 362 | 413 | ||
| 414 | /* | ||
| 415 | * It is not an accident that whenever we want to break COW | ||
| 416 | * to undo, we also need to drop a reference to the anon_vma. | ||
| 417 | */ | ||
| 418 | drop_anon_vma(rmap_item); | ||
| 419 | |||
| 363 | down_read(&mm->mmap_sem); | 420 | down_read(&mm->mmap_sem); |
| 364 | if (ksm_test_exit(mm)) | 421 | if (ksm_test_exit(mm)) |
| 365 | goto out; | 422 | goto out; |
| @@ -403,21 +460,77 @@ out: page = NULL; | |||
| 403 | return page; | 460 | return page; |
| 404 | } | 461 | } |
| 405 | 462 | ||
| 463 | static void remove_node_from_stable_tree(struct stable_node *stable_node) | ||
| 464 | { | ||
| 465 | struct rmap_item *rmap_item; | ||
| 466 | struct hlist_node *hlist; | ||
| 467 | |||
| 468 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
| 469 | if (rmap_item->hlist.next) | ||
| 470 | ksm_pages_sharing--; | ||
| 471 | else | ||
| 472 | ksm_pages_shared--; | ||
| 473 | drop_anon_vma(rmap_item); | ||
| 474 | rmap_item->address &= PAGE_MASK; | ||
| 475 | cond_resched(); | ||
| 476 | } | ||
| 477 | |||
| 478 | rb_erase(&stable_node->node, &root_stable_tree); | ||
| 479 | free_stable_node(stable_node); | ||
| 480 | } | ||
| 481 | |||
| 406 | /* | 482 | /* |
| 407 | * get_ksm_page: checks if the page at the virtual address in rmap_item | 483 | * get_ksm_page: checks if the page indicated by the stable node |
| 408 | * is still PageKsm, in which case we can trust the content of the page, | 484 | * is still its ksm page, despite having held no reference to it. |
| 409 | * and it returns the gotten page; but NULL if the page has been zapped. | 485 | * In which case we can trust the content of the page, and it |
| 486 | * returns the gotten page; but if the page has now been zapped, | ||
| 487 | * remove the stale node from the stable tree and return NULL. | ||
| 488 | * | ||
| 489 | * You would expect the stable_node to hold a reference to the ksm page. | ||
| 490 | * But if it increments the page's count, swapping out has to wait for | ||
| 491 | * ksmd to come around again before it can free the page, which may take | ||
| 492 | * seconds or even minutes: much too unresponsive. So instead we use a | ||
| 493 | * "keyhole reference": access to the ksm page from the stable node peeps | ||
| 494 | * out through its keyhole to see if that page still holds the right key, | ||
| 495 | * pointing back to this stable node. This relies on freeing a PageAnon | ||
| 496 | * page to reset its page->mapping to NULL, and relies on no other use of | ||
| 497 | * a page to put something that might look like our key in page->mapping. | ||
| 498 | * | ||
| 499 | * include/linux/pagemap.h page_cache_get_speculative() is a good reference, | ||
| 500 | * but this is different - made simpler by ksm_thread_mutex being held, but | ||
| 501 | * interesting for assuming that no other use of the struct page could ever | ||
| 502 | * put our expected_mapping into page->mapping (or a field of the union which | ||
| 503 | * coincides with page->mapping). The RCU calls are not for KSM at all, but | ||
| 504 | * to keep the page_count protocol described with page_cache_get_speculative. | ||
| 505 | * | ||
| 506 | * Note: it is possible that get_ksm_page() will return NULL one moment, | ||
| 507 | * then page the next, if the page is in between page_freeze_refs() and | ||
| 508 | * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page | ||
| 509 | * is on its way to being freed; but it is an anomaly to bear in mind. | ||
| 410 | */ | 510 | */ |
| 411 | static struct page *get_ksm_page(struct rmap_item *rmap_item) | 511 | static struct page *get_ksm_page(struct stable_node *stable_node) |
| 412 | { | 512 | { |
| 413 | struct page *page; | 513 | struct page *page; |
| 414 | 514 | void *expected_mapping; | |
| 415 | page = get_mergeable_page(rmap_item); | 515 | |
| 416 | if (page && !PageKsm(page)) { | 516 | page = pfn_to_page(stable_node->kpfn); |
| 517 | expected_mapping = (void *)stable_node + | ||
| 518 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | ||
| 519 | rcu_read_lock(); | ||
| 520 | if (page->mapping != expected_mapping) | ||
| 521 | goto stale; | ||
| 522 | if (!get_page_unless_zero(page)) | ||
| 523 | goto stale; | ||
| 524 | if (page->mapping != expected_mapping) { | ||
| 417 | put_page(page); | 525 | put_page(page); |
| 418 | page = NULL; | 526 | goto stale; |
| 419 | } | 527 | } |
| 528 | rcu_read_unlock(); | ||
| 420 | return page; | 529 | return page; |
| 530 | stale: | ||
| 531 | rcu_read_unlock(); | ||
| 532 | remove_node_from_stable_tree(stable_node); | ||
| 533 | return NULL; | ||
| 421 | } | 534 | } |
| 422 | 535 | ||
| 423 | /* | 536 | /* |
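The comment block above describes the "keyhole reference": get_ksm_page() peeks at page->mapping, takes a reference only if the page count is not already zero, and then re-checks the mapping before trusting the page. The stand-alone sketch below reproduces that check / get-unless-zero / re-check shape with C11 atomics; the types and names are invented for illustration and none of it is kernel code.

#include <stdatomic.h>
#include <stdio.h>

struct ex_page {
        atomic_int count;        /* 0 means the page is being freed */
        void *mapping;           /* reset to NULL once freed */
};

/* Like get_page_unless_zero(): take a reference only if one still exists. */
static int get_unless_zero(struct ex_page *page)
{
        int old = atomic_load(&page->count);
        while (old != 0)
                if (atomic_compare_exchange_weak(&page->count, &old, old + 1))
                        return 1;
        return 0;
}

static struct ex_page *lookup(struct ex_page *page, void *expected_mapping)
{
        if (page->mapping != expected_mapping)
                return NULL;                         /* already reused or freed */
        if (!get_unless_zero(page))
                return NULL;                         /* on its way to being freed */
        if (page->mapping != expected_mapping) {
                atomic_fetch_sub(&page->count, 1);   /* lost the race: drop our ref */
                return NULL;
        }
        return page;                                 /* safe to use; caller drops ref later */
}

int main(void)
{
        struct ex_page page = { .count = 1, .mapping = (void *)0x1234 };

        printf("got page: %d\n", lookup(&page, (void *)0x1234) != NULL);
        return 0;
}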
| @@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item) | |||
| 426 | */ | 539 | */ |
| 427 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | 540 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) |
| 428 | { | 541 | { |
| 429 | if (in_stable_tree(rmap_item)) { | 542 | if (rmap_item->address & STABLE_FLAG) { |
| 430 | struct rmap_item *next_item = rmap_item->next; | 543 | struct stable_node *stable_node; |
| 431 | 544 | struct page *page; | |
| 432 | if (rmap_item->address & NODE_FLAG) { | ||
| 433 | if (next_item) { | ||
| 434 | rb_replace_node(&rmap_item->node, | ||
| 435 | &next_item->node, | ||
| 436 | &root_stable_tree); | ||
| 437 | next_item->address |= NODE_FLAG; | ||
| 438 | ksm_pages_sharing--; | ||
| 439 | } else { | ||
| 440 | rb_erase(&rmap_item->node, &root_stable_tree); | ||
| 441 | ksm_pages_shared--; | ||
| 442 | } | ||
| 443 | } else { | ||
| 444 | struct rmap_item *prev_item = rmap_item->prev; | ||
| 445 | 545 | ||
| 446 | BUG_ON(prev_item->next != rmap_item); | 546 | stable_node = rmap_item->head; |
| 447 | prev_item->next = next_item; | 547 | page = get_ksm_page(stable_node); |
| 448 | if (next_item) { | 548 | if (!page) |
| 449 | BUG_ON(next_item->prev != rmap_item); | 549 | goto out; |
| 450 | next_item->prev = rmap_item->prev; | 550 | |
| 451 | } | 551 | lock_page(page); |
| 552 | hlist_del(&rmap_item->hlist); | ||
| 553 | unlock_page(page); | ||
| 554 | put_page(page); | ||
| 555 | |||
| 556 | if (stable_node->hlist.first) | ||
| 452 | ksm_pages_sharing--; | 557 | ksm_pages_sharing--; |
| 453 | } | 558 | else |
| 559 | ksm_pages_shared--; | ||
| 454 | 560 | ||
| 455 | rmap_item->next = NULL; | 561 | drop_anon_vma(rmap_item); |
| 562 | rmap_item->address &= PAGE_MASK; | ||
| 456 | 563 | ||
| 457 | } else if (rmap_item->address & NODE_FLAG) { | 564 | } else if (rmap_item->address & UNSTABLE_FLAG) { |
| 458 | unsigned char age; | 565 | unsigned char age; |
| 459 | /* | 566 | /* |
| 460 | * Usually ksmd can and must skip the rb_erase, because | 567 | * Usually ksmd can and must skip the rb_erase, because |
| @@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | |||
| 467 | BUG_ON(age > 1); | 574 | BUG_ON(age > 1); |
| 468 | if (!age) | 575 | if (!age) |
| 469 | rb_erase(&rmap_item->node, &root_unstable_tree); | 576 | rb_erase(&rmap_item->node, &root_unstable_tree); |
| 577 | |||
| 470 | ksm_pages_unshared--; | 578 | ksm_pages_unshared--; |
| 579 | rmap_item->address &= PAGE_MASK; | ||
| 471 | } | 580 | } |
| 472 | 581 | out: | |
| 473 | rmap_item->address &= PAGE_MASK; | ||
| 474 | |||
| 475 | cond_resched(); /* we're called from many long loops */ | 582 | cond_resched(); /* we're called from many long loops */ |
| 476 | } | 583 | } |
| 477 | 584 | ||
| 478 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | 585 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, |
| 479 | struct list_head *cur) | 586 | struct rmap_item **rmap_list) |
| 480 | { | 587 | { |
| 481 | struct rmap_item *rmap_item; | 588 | while (*rmap_list) { |
| 482 | 589 | struct rmap_item *rmap_item = *rmap_list; | |
| 483 | while (cur != &mm_slot->rmap_list) { | 590 | *rmap_list = rmap_item->rmap_list; |
| 484 | rmap_item = list_entry(cur, struct rmap_item, link); | ||
| 485 | cur = cur->next; | ||
| 486 | remove_rmap_item_from_tree(rmap_item); | 591 | remove_rmap_item_from_tree(rmap_item); |
| 487 | list_del(&rmap_item->link); | ||
| 488 | free_rmap_item(rmap_item); | 592 | free_rmap_item(rmap_item); |
| 489 | } | 593 | } |
| 490 | } | 594 | } |
| @@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void) | |||
| 550 | goto error; | 654 | goto error; |
| 551 | } | 655 | } |
| 552 | 656 | ||
| 553 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); | 657 | remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); |
| 554 | 658 | ||
| 555 | spin_lock(&ksm_mmlist_lock); | 659 | spin_lock(&ksm_mmlist_lock); |
| 556 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | 660 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, |
| @@ -646,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, | |||
| 646 | * Check that no O_DIRECT or similar I/O is in progress on the | 750 | * Check that no O_DIRECT or similar I/O is in progress on the |
| 647 | * page | 751 | * page |
| 648 | */ | 752 | */ |
| 649 | if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { | 753 | if (page_mapcount(page) + 1 + swapped != page_count(page)) { |
| 650 | set_pte_at_notify(mm, addr, ptep, entry); | 754 | set_pte_at_notify(mm, addr, ptep, entry); |
| 651 | goto out_unlock; | 755 | goto out_unlock; |
| 652 | } | 756 | } |
| @@ -664,15 +768,15 @@ out: | |||
| 664 | 768 | ||
| 665 | /** | 769 | /** |
| 666 | * replace_page - replace page in vma by new ksm page | 770 | * replace_page - replace page in vma by new ksm page |
| 667 | * @vma: vma that holds the pte pointing to oldpage | 771 | * @vma: vma that holds the pte pointing to page |
| 668 | * @oldpage: the page we are replacing by newpage | 772 | * @page: the page we are replacing by kpage |
| 669 | * @newpage: the ksm page we replace oldpage by | 773 | * @kpage: the ksm page we replace page by |
| 670 | * @orig_pte: the original value of the pte | 774 | * @orig_pte: the original value of the pte |
| 671 | * | 775 | * |
| 672 | * Returns 0 on success, -EFAULT on failure. | 776 | * Returns 0 on success, -EFAULT on failure. |
| 673 | */ | 777 | */ |
| 674 | static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | 778 | static int replace_page(struct vm_area_struct *vma, struct page *page, |
| 675 | struct page *newpage, pte_t orig_pte) | 779 | struct page *kpage, pte_t orig_pte) |
| 676 | { | 780 | { |
| 677 | struct mm_struct *mm = vma->vm_mm; | 781 | struct mm_struct *mm = vma->vm_mm; |
| 678 | pgd_t *pgd; | 782 | pgd_t *pgd; |
| @@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
| 681 | pte_t *ptep; | 785 | pte_t *ptep; |
| 682 | spinlock_t *ptl; | 786 | spinlock_t *ptl; |
| 683 | unsigned long addr; | 787 | unsigned long addr; |
| 684 | pgprot_t prot; | ||
| 685 | int err = -EFAULT; | 788 | int err = -EFAULT; |
| 686 | 789 | ||
| 687 | prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); | 790 | addr = page_address_in_vma(page, vma); |
| 688 | |||
| 689 | addr = page_address_in_vma(oldpage, vma); | ||
| 690 | if (addr == -EFAULT) | 791 | if (addr == -EFAULT) |
| 691 | goto out; | 792 | goto out; |
| 692 | 793 | ||
| @@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | |||
| 708 | goto out; | 809 | goto out; |
| 709 | } | 810 | } |
| 710 | 811 | ||
| 711 | get_page(newpage); | 812 | get_page(kpage); |
| 712 | page_add_ksm_rmap(newpage); | 813 | page_add_anon_rmap(kpage, vma, addr); |
| 713 | 814 | ||
| 714 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 815 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
| 715 | ptep_clear_flush(vma, addr, ptep); | 816 | ptep_clear_flush(vma, addr, ptep); |
| 716 | set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); | 817 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
| 717 | 818 | ||
| 718 | page_remove_rmap(oldpage); | 819 | page_remove_rmap(page); |
| 719 | put_page(oldpage); | 820 | put_page(page); |
| 720 | 821 | ||
| 721 | pte_unmap_unlock(ptep, ptl); | 822 | pte_unmap_unlock(ptep, ptl); |
| 722 | err = 0; | 823 | err = 0; |
| @@ -726,32 +827,27 @@ out: | |||
| 726 | 827 | ||
| 727 | /* | 828 | /* |
| 728 | * try_to_merge_one_page - take two pages and merge them into one | 829 | * try_to_merge_one_page - take two pages and merge them into one |
| 729 | * @vma: the vma that hold the pte pointing into oldpage | 830 | * @vma: the vma that holds the pte pointing to page |
| 730 | * @oldpage: the page that we want to replace with newpage | 831 | * @page: the PageAnon page that we want to replace with kpage |
| 731 | * @newpage: the page that we want to map instead of oldpage | 832 | * @kpage: the PageKsm page that we want to map instead of page, |
| 732 | * | 833 | * or NULL the first time when we want to use page as kpage. |
| 733 | * Note: | ||
| 734 | * oldpage should be a PageAnon page, while newpage should be a PageKsm page, | ||
| 735 | * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. | ||
| 736 | * | 834 | * |
| 737 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | 835 | * This function returns 0 if the pages were merged, -EFAULT otherwise. |
| 738 | */ | 836 | */ |
| 739 | static int try_to_merge_one_page(struct vm_area_struct *vma, | 837 | static int try_to_merge_one_page(struct vm_area_struct *vma, |
| 740 | struct page *oldpage, | 838 | struct page *page, struct page *kpage) |
| 741 | struct page *newpage) | ||
| 742 | { | 839 | { |
| 743 | pte_t orig_pte = __pte(0); | 840 | pte_t orig_pte = __pte(0); |
| 744 | int err = -EFAULT; | 841 | int err = -EFAULT; |
| 745 | 842 | ||
| 843 | if (page == kpage) /* ksm page forked */ | ||
| 844 | return 0; | ||
| 845 | |||
| 746 | if (!(vma->vm_flags & VM_MERGEABLE)) | 846 | if (!(vma->vm_flags & VM_MERGEABLE)) |
| 747 | goto out; | 847 | goto out; |
| 748 | 848 | if (!PageAnon(page)) | |
| 749 | if (!PageAnon(oldpage)) | ||
| 750 | goto out; | 849 | goto out; |
| 751 | 850 | ||
| 752 | get_page(newpage); | ||
| 753 | get_page(oldpage); | ||
| 754 | |||
| 755 | /* | 851 | /* |
| 756 | * We need the page lock to read a stable PageSwapCache in | 852 | * We need the page lock to read a stable PageSwapCache in |
| 757 | * write_protect_page(). We use trylock_page() instead of | 853 | * write_protect_page(). We use trylock_page() instead of |
| @@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
| 759 | * prefer to continue scanning and merging different pages, | 855 | * prefer to continue scanning and merging different pages, |
| 760 | * then come back to this page when it is unlocked. | 856 | * then come back to this page when it is unlocked. |
| 761 | */ | 857 | */ |
| 762 | if (!trylock_page(oldpage)) | 858 | if (!trylock_page(page)) |
| 763 | goto out_putpage; | 859 | goto out; |
| 764 | /* | 860 | /* |
| 765 | * If this anonymous page is mapped only here, its pte may need | 861 | * If this anonymous page is mapped only here, its pte may need |
| 766 | * to be write-protected. If it's mapped elsewhere, all of its | 862 | * to be write-protected. If it's mapped elsewhere, all of its |
| 767 | * ptes are necessarily already write-protected. But in either | 863 | * ptes are necessarily already write-protected. But in either |
| 768 | * case, we need to lock and check page_count is not raised. | 864 | * case, we need to lock and check page_count is not raised. |
| 769 | */ | 865 | */ |
| 770 | if (write_protect_page(vma, oldpage, &orig_pte)) { | 866 | if (write_protect_page(vma, page, &orig_pte) == 0) { |
| 771 | unlock_page(oldpage); | 867 | if (!kpage) { |
| 772 | goto out_putpage; | 868 | /* |
| 869 | * While we hold page lock, upgrade page from | ||
| 870 | * PageAnon+anon_vma to PageKsm+NULL stable_node: | ||
| 871 | * stable_tree_insert() will update stable_node. | ||
| 872 | */ | ||
| 873 | set_page_stable_node(page, NULL); | ||
| 874 | mark_page_accessed(page); | ||
| 875 | err = 0; | ||
| 876 | } else if (pages_identical(page, kpage)) | ||
| 877 | err = replace_page(vma, page, kpage, orig_pte); | ||
| 773 | } | 878 | } |
| 774 | unlock_page(oldpage); | ||
| 775 | 879 | ||
| 776 | if (pages_identical(oldpage, newpage)) | 880 | if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { |
| 777 | err = replace_page(vma, oldpage, newpage, orig_pte); | 881 | munlock_vma_page(page); |
| 882 | if (!PageMlocked(kpage)) { | ||
| 883 | unlock_page(page); | ||
| 884 | lock_page(kpage); | ||
| 885 | mlock_vma_page(kpage); | ||
| 886 | page = kpage; /* for final unlock */ | ||
| 887 | } | ||
| 888 | } | ||
| 778 | 889 | ||
| 779 | out_putpage: | 890 | unlock_page(page); |
| 780 | put_page(oldpage); | ||
| 781 | put_page(newpage); | ||
| 782 | out: | 891 | out: |
| 783 | return err; | 892 | return err; |
| 784 | } | 893 | } |
| @@ -786,26 +895,31 @@ out: | |||
| 786 | /* | 895 | /* |
| 787 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, | 896 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, |
| 788 | * but no new kernel page is allocated: kpage must already be a ksm page. | 897 | * but no new kernel page is allocated: kpage must already be a ksm page. |
| 898 | * | ||
| 899 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | ||
| 789 | */ | 900 | */ |
| 790 | static int try_to_merge_with_ksm_page(struct mm_struct *mm1, | 901 | static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, |
| 791 | unsigned long addr1, | 902 | struct page *page, struct page *kpage) |
| 792 | struct page *page1, | ||
| 793 | struct page *kpage) | ||
| 794 | { | 903 | { |
| 904 | struct mm_struct *mm = rmap_item->mm; | ||
| 795 | struct vm_area_struct *vma; | 905 | struct vm_area_struct *vma; |
| 796 | int err = -EFAULT; | 906 | int err = -EFAULT; |
| 797 | 907 | ||
| 798 | down_read(&mm1->mmap_sem); | 908 | down_read(&mm->mmap_sem); |
| 799 | if (ksm_test_exit(mm1)) | 909 | if (ksm_test_exit(mm)) |
| 910 | goto out; | ||
| 911 | vma = find_vma(mm, rmap_item->address); | ||
| 912 | if (!vma || vma->vm_start > rmap_item->address) | ||
| 800 | goto out; | 913 | goto out; |
| 801 | 914 | ||
| 802 | vma = find_vma(mm1, addr1); | 915 | err = try_to_merge_one_page(vma, page, kpage); |
| 803 | if (!vma || vma->vm_start > addr1) | 916 | if (err) |
| 804 | goto out; | 917 | goto out; |
| 805 | 918 | ||
| 806 | err = try_to_merge_one_page(vma, page1, kpage); | 919 | /* Must get reference to anon_vma while still holding mmap_sem */ |
| 920 | hold_anon_vma(rmap_item, vma->anon_vma); | ||
| 807 | out: | 921 | out: |
| 808 | up_read(&mm1->mmap_sem); | 922 | up_read(&mm->mmap_sem); |
| 809 | return err; | 923 | return err; |
| 810 | } | 924 | } |
| 811 | 925 | ||
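For reference, the mmap_sem/find_vma pattern that try_to_merge_with_ksm_page depends on, reduced to a stand-alone helper. This is an illustrative sketch with made-up names, not code from this patch:

    #include <linux/mm.h>

    /* Resolve addr to the vma containing it; returns with mmap_sem held
     * for read on success, or NULL with mmap_sem released if addr is not
     * covered by any vma. */
    static struct vm_area_struct *lookup_vma_read_locked(struct mm_struct *mm,
                                                         unsigned long addr)
    {
            struct vm_area_struct *vma;

            down_read(&mm->mmap_sem);
            vma = find_vma(mm, addr);               /* first vma with vm_end > addr */
            if (!vma || vma->vm_start > addr) {     /* addr falls in a hole */
                    up_read(&mm->mmap_sem);
                    return NULL;
            }
            return vma;
    }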
| @@ -813,109 +927,73 @@ out: | |||
| 813 | * try_to_merge_two_pages - take two identical pages and prepare them | 927 | * try_to_merge_two_pages - take two identical pages and prepare them |
| 814 | * to be merged into one page. | 928 | * to be merged into one page. |
| 815 | * | 929 | * |
| 816 | * This function returns 0 if we successfully mapped two identical pages | 930 | * This function returns the kpage if we successfully merged two identical |
| 817 | * into one page, -EFAULT otherwise. | 931 | * pages into one ksm page, NULL otherwise. |
| 818 | * | 932 | * |
| 819 | * Note that this function allocates a new kernel page: if one of the pages | 933 | * Note that this function upgrades page to ksm page: if one of the pages |
| 820 | * is already a ksm page, try_to_merge_with_ksm_page should be used. | 934 | * is already a ksm page, try_to_merge_with_ksm_page should be used. |
| 821 | */ | 935 | */ |
| 822 | static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, | 936 | static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, |
| 823 | struct page *page1, struct mm_struct *mm2, | 937 | struct page *page, |
| 824 | unsigned long addr2, struct page *page2) | 938 | struct rmap_item *tree_rmap_item, |
| 939 | struct page *tree_page) | ||
| 825 | { | 940 | { |
| 826 | struct vm_area_struct *vma; | 941 | int err; |
| 827 | struct page *kpage; | ||
| 828 | int err = -EFAULT; | ||
| 829 | |||
| 830 | /* | ||
| 831 | * The number of nodes in the stable tree | ||
| 832 | * is the number of kernel pages that we hold. | ||
| 833 | */ | ||
| 834 | if (ksm_max_kernel_pages && | ||
| 835 | ksm_max_kernel_pages <= ksm_pages_shared) | ||
| 836 | return err; | ||
| 837 | |||
| 838 | kpage = alloc_page(GFP_HIGHUSER); | ||
| 839 | if (!kpage) | ||
| 840 | return err; | ||
| 841 | |||
| 842 | down_read(&mm1->mmap_sem); | ||
| 843 | if (ksm_test_exit(mm1)) { | ||
| 844 | up_read(&mm1->mmap_sem); | ||
| 845 | goto out; | ||
| 846 | } | ||
| 847 | vma = find_vma(mm1, addr1); | ||
| 848 | if (!vma || vma->vm_start > addr1) { | ||
| 849 | up_read(&mm1->mmap_sem); | ||
| 850 | goto out; | ||
| 851 | } | ||
| 852 | |||
| 853 | copy_user_highpage(kpage, page1, addr1, vma); | ||
| 854 | err = try_to_merge_one_page(vma, page1, kpage); | ||
| 855 | up_read(&mm1->mmap_sem); | ||
| 856 | 942 | ||
| 943 | err = try_to_merge_with_ksm_page(rmap_item, page, NULL); | ||
| 857 | if (!err) { | 944 | if (!err) { |
| 858 | err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); | 945 | err = try_to_merge_with_ksm_page(tree_rmap_item, |
| 946 | tree_page, page); | ||
| 859 | /* | 947 | /* |
| 860 | * If that fails, we have a ksm page with only one pte | 948 | * If that fails, we have a ksm page with only one pte |
| 861 | * pointing to it: so break it. | 949 | * pointing to it: so break it. |
| 862 | */ | 950 | */ |
| 863 | if (err) | 951 | if (err) |
| 864 | break_cow(mm1, addr1); | 952 | break_cow(rmap_item); |
| 865 | } | 953 | } |
| 866 | out: | 954 | return err ? NULL : page; |
| 867 | put_page(kpage); | ||
| 868 | return err; | ||
| 869 | } | 955 | } |
| 870 | 956 | ||
| 871 | /* | 957 | /* |
| 872 | * stable_tree_search - search page inside the stable tree | 958 | * stable_tree_search - search for page inside the stable tree |
| 873 | * @page: the page that we are searching identical pages to. | ||
| 874 | * @page2: pointer into identical page that we are holding inside the stable | ||
| 875 | * tree that we have found. | ||
| 876 | * @rmap_item: the reverse mapping item | ||
| 877 | * | 959 | * |
| 878 | * This function checks if there is a page inside the stable tree | 960 | * This function checks if there is a page inside the stable tree |
| 879 | * with identical content to the page that we are scanning right now. | 961 | * with identical content to the page that we are scanning right now. |
| 880 | * | 962 | * |
| 881 | * This function return rmap_item pointer to the identical item if found, | 963 | * This function returns the stable tree node of identical content if found, |
| 882 | * NULL otherwise. | 964 | * NULL otherwise. |
| 883 | */ | 965 | */ |
| 884 | static struct rmap_item *stable_tree_search(struct page *page, | 966 | static struct page *stable_tree_search(struct page *page) |
| 885 | struct page **page2, | ||
| 886 | struct rmap_item *rmap_item) | ||
| 887 | { | 967 | { |
| 888 | struct rb_node *node = root_stable_tree.rb_node; | 968 | struct rb_node *node = root_stable_tree.rb_node; |
| 969 | struct stable_node *stable_node; | ||
| 970 | |||
| 971 | stable_node = page_stable_node(page); | ||
| 972 | if (stable_node) { /* ksm page forked */ | ||
| 973 | get_page(page); | ||
| 974 | return page; | ||
| 975 | } | ||
| 889 | 976 | ||
| 890 | while (node) { | 977 | while (node) { |
| 891 | struct rmap_item *tree_rmap_item, *next_rmap_item; | 978 | struct page *tree_page; |
| 892 | int ret; | 979 | int ret; |
| 893 | 980 | ||
| 894 | tree_rmap_item = rb_entry(node, struct rmap_item, node); | 981 | cond_resched(); |
| 895 | while (tree_rmap_item) { | 982 | stable_node = rb_entry(node, struct stable_node, node); |
| 896 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 983 | tree_page = get_ksm_page(stable_node); |
| 897 | cond_resched(); | 984 | if (!tree_page) |
| 898 | page2[0] = get_ksm_page(tree_rmap_item); | ||
| 899 | if (page2[0]) | ||
| 900 | break; | ||
| 901 | next_rmap_item = tree_rmap_item->next; | ||
| 902 | remove_rmap_item_from_tree(tree_rmap_item); | ||
| 903 | tree_rmap_item = next_rmap_item; | ||
| 904 | } | ||
| 905 | if (!tree_rmap_item) | ||
| 906 | return NULL; | 985 | return NULL; |
| 907 | 986 | ||
| 908 | ret = memcmp_pages(page, page2[0]); | 987 | ret = memcmp_pages(page, tree_page); |
| 909 | 988 | ||
| 910 | if (ret < 0) { | 989 | if (ret < 0) { |
| 911 | put_page(page2[0]); | 990 | put_page(tree_page); |
| 912 | node = node->rb_left; | 991 | node = node->rb_left; |
| 913 | } else if (ret > 0) { | 992 | } else if (ret > 0) { |
| 914 | put_page(page2[0]); | 993 | put_page(tree_page); |
| 915 | node = node->rb_right; | 994 | node = node->rb_right; |
| 916 | } else { | 995 | } else |
| 917 | return tree_rmap_item; | 996 | return tree_page; |
| 918 | } | ||
| 919 | } | 997 | } |
| 920 | 998 | ||
| 921 | return NULL; | 999 | return NULL; |
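stable_tree_search() above is the standard rbtree descent keyed by a three-way comparison, with memcmp_pages() as the comparator and get_ksm_page() fetching each node's page. The bare descent pattern, with placeholder names and types, shown for illustration only:

    #include <linux/rbtree.h>

    struct item {
            struct rb_node rb;
            /* payload that cmp() orders by */
    };

    static struct item *tree_search(struct rb_root *root, const void *key,
                                    int (*cmp)(const void *key, const struct item *it))
    {
            struct rb_node *node = root->rb_node;

            while (node) {
                    struct item *it = rb_entry(node, struct item, rb);
                    int ret = cmp(key, it);

                    if (ret < 0)
                            node = node->rb_left;
                    else if (ret > 0)
                            node = node->rb_right;
                    else
                            return it;      /* same ordering key: a match */
            }
            return NULL;
    }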
| @@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page, | |||
| 925 | * stable_tree_insert - insert rmap_item pointing to new ksm page | 1003 | * stable_tree_insert - insert rmap_item pointing to new ksm page |
| 926 | * into the stable tree. | 1004 | * into the stable tree. |
| 927 | * | 1005 | * |
| 928 | * @page: the page that we are searching identical page to inside the stable | 1006 | * This function returns the stable tree node just allocated on success, |
| 929 | * tree. | 1007 | * NULL otherwise. |
| 930 | * @rmap_item: pointer to the reverse mapping item. | ||
| 931 | * | ||
| 932 | * This function returns rmap_item if success, NULL otherwise. | ||
| 933 | */ | 1008 | */ |
| 934 | static struct rmap_item *stable_tree_insert(struct page *page, | 1009 | static struct stable_node *stable_tree_insert(struct page *kpage) |
| 935 | struct rmap_item *rmap_item) | ||
| 936 | { | 1010 | { |
| 937 | struct rb_node **new = &root_stable_tree.rb_node; | 1011 | struct rb_node **new = &root_stable_tree.rb_node; |
| 938 | struct rb_node *parent = NULL; | 1012 | struct rb_node *parent = NULL; |
| 1013 | struct stable_node *stable_node; | ||
| 939 | 1014 | ||
| 940 | while (*new) { | 1015 | while (*new) { |
| 941 | struct rmap_item *tree_rmap_item, *next_rmap_item; | ||
| 942 | struct page *tree_page; | 1016 | struct page *tree_page; |
| 943 | int ret; | 1017 | int ret; |
| 944 | 1018 | ||
| 945 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1019 | cond_resched(); |
| 946 | while (tree_rmap_item) { | 1020 | stable_node = rb_entry(*new, struct stable_node, node); |
| 947 | BUG_ON(!in_stable_tree(tree_rmap_item)); | 1021 | tree_page = get_ksm_page(stable_node); |
| 948 | cond_resched(); | 1022 | if (!tree_page) |
| 949 | tree_page = get_ksm_page(tree_rmap_item); | ||
| 950 | if (tree_page) | ||
| 951 | break; | ||
| 952 | next_rmap_item = tree_rmap_item->next; | ||
| 953 | remove_rmap_item_from_tree(tree_rmap_item); | ||
| 954 | tree_rmap_item = next_rmap_item; | ||
| 955 | } | ||
| 956 | if (!tree_rmap_item) | ||
| 957 | return NULL; | 1023 | return NULL; |
| 958 | 1024 | ||
| 959 | ret = memcmp_pages(page, tree_page); | 1025 | ret = memcmp_pages(kpage, tree_page); |
| 960 | put_page(tree_page); | 1026 | put_page(tree_page); |
| 961 | 1027 | ||
| 962 | parent = *new; | 1028 | parent = *new; |
| @@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
| 974 | } | 1040 | } |
| 975 | } | 1041 | } |
| 976 | 1042 | ||
| 977 | rmap_item->address |= NODE_FLAG | STABLE_FLAG; | 1043 | stable_node = alloc_stable_node(); |
| 978 | rmap_item->next = NULL; | 1044 | if (!stable_node) |
| 979 | rb_link_node(&rmap_item->node, parent, new); | 1045 | return NULL; |
| 980 | rb_insert_color(&rmap_item->node, &root_stable_tree); | ||
| 981 | 1046 | ||
| 982 | ksm_pages_shared++; | 1047 | rb_link_node(&stable_node->node, parent, new); |
| 983 | return rmap_item; | 1048 | rb_insert_color(&stable_node->node, &root_stable_tree); |
| 1049 | |||
| 1050 | INIT_HLIST_HEAD(&stable_node->hlist); | ||
| 1051 | |||
| 1052 | stable_node->kpfn = page_to_pfn(kpage); | ||
| 1053 | set_page_stable_node(kpage, stable_node); | ||
| 1054 | |||
| 1055 | return stable_node; | ||
| 984 | } | 1056 | } |
| 985 | 1057 | ||
| 986 | /* | 1058 | /* |
| 987 | * unstable_tree_search_insert - search and insert items into the unstable tree. | 1059 | * unstable_tree_search_insert - search for identical page, |
| 988 | * | 1060 | * else insert rmap_item into the unstable tree. |
| 989 | * @page: the page that we are going to search for identical page or to insert | ||
| 990 | * into the unstable tree | ||
| 991 | * @page2: pointer into identical page that was found inside the unstable tree | ||
| 992 | * @rmap_item: the reverse mapping item of page | ||
| 993 | * | 1061 | * |
| 994 | * This function searches for a page in the unstable tree identical to the | 1062 | * This function searches for a page in the unstable tree identical to the |
| 995 | * page currently being scanned; and if no identical page is found in the | 1063 | * page currently being scanned; and if no identical page is found in the |
| @@ -1001,47 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page, | |||
| 1001 | * This function does both searching and inserting, because they share | 1069 | * This function does both searching and inserting, because they share |
| 1002 | * the same walking algorithm in an rbtree. | 1070 | * the same walking algorithm in an rbtree. |
| 1003 | */ | 1071 | */ |
| 1004 | static struct rmap_item *unstable_tree_search_insert(struct page *page, | 1072 | static |
| 1005 | struct page **page2, | 1073 | struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, |
| 1006 | struct rmap_item *rmap_item) | 1074 | struct page *page, |
| 1075 | struct page **tree_pagep) | ||
| 1076 | |||
| 1007 | { | 1077 | { |
| 1008 | struct rb_node **new = &root_unstable_tree.rb_node; | 1078 | struct rb_node **new = &root_unstable_tree.rb_node; |
| 1009 | struct rb_node *parent = NULL; | 1079 | struct rb_node *parent = NULL; |
| 1010 | 1080 | ||
| 1011 | while (*new) { | 1081 | while (*new) { |
| 1012 | struct rmap_item *tree_rmap_item; | 1082 | struct rmap_item *tree_rmap_item; |
| 1083 | struct page *tree_page; | ||
| 1013 | int ret; | 1084 | int ret; |
| 1014 | 1085 | ||
| 1015 | cond_resched(); | 1086 | cond_resched(); |
| 1016 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
| 1017 | page2[0] = get_mergeable_page(tree_rmap_item); | 1088 | tree_page = get_mergeable_page(tree_rmap_item); |
| 1018 | if (!page2[0]) | 1089 | if (!tree_page) |
| 1019 | return NULL; | 1090 | return NULL; |
| 1020 | 1091 | ||
| 1021 | /* | 1092 | /* |
| 1022 | * Don't substitute an unswappable ksm page | 1093 | * Don't substitute a ksm page for a forked page. |
| 1023 | * just for one good swappable forked page. | ||
| 1024 | */ | 1094 | */ |
| 1025 | if (page == page2[0]) { | 1095 | if (page == tree_page) { |
| 1026 | put_page(page2[0]); | 1096 | put_page(tree_page); |
| 1027 | return NULL; | 1097 | return NULL; |
| 1028 | } | 1098 | } |
| 1029 | 1099 | ||
| 1030 | ret = memcmp_pages(page, page2[0]); | 1100 | ret = memcmp_pages(page, tree_page); |
| 1031 | 1101 | ||
| 1032 | parent = *new; | 1102 | parent = *new; |
| 1033 | if (ret < 0) { | 1103 | if (ret < 0) { |
| 1034 | put_page(page2[0]); | 1104 | put_page(tree_page); |
| 1035 | new = &parent->rb_left; | 1105 | new = &parent->rb_left; |
| 1036 | } else if (ret > 0) { | 1106 | } else if (ret > 0) { |
| 1037 | put_page(page2[0]); | 1107 | put_page(tree_page); |
| 1038 | new = &parent->rb_right; | 1108 | new = &parent->rb_right; |
| 1039 | } else { | 1109 | } else { |
| 1110 | *tree_pagep = tree_page; | ||
| 1040 | return tree_rmap_item; | 1111 | return tree_rmap_item; |
| 1041 | } | 1112 | } |
| 1042 | } | 1113 | } |
| 1043 | 1114 | ||
| 1044 | rmap_item->address |= NODE_FLAG; | 1115 | rmap_item->address |= UNSTABLE_FLAG; |
| 1045 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | 1116 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); |
| 1046 | rb_link_node(&rmap_item->node, parent, new); | 1117 | rb_link_node(&rmap_item->node, parent, new); |
| 1047 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | 1118 | rb_insert_color(&rmap_item->node, &root_unstable_tree); |
| @@ -1056,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page, | |||
| 1056 | * the same ksm page. | 1127 | * the same ksm page. |
| 1057 | */ | 1128 | */ |
| 1058 | static void stable_tree_append(struct rmap_item *rmap_item, | 1129 | static void stable_tree_append(struct rmap_item *rmap_item, |
| 1059 | struct rmap_item *tree_rmap_item) | 1130 | struct stable_node *stable_node) |
| 1060 | { | 1131 | { |
| 1061 | rmap_item->next = tree_rmap_item->next; | 1132 | rmap_item->head = stable_node; |
| 1062 | rmap_item->prev = tree_rmap_item; | ||
| 1063 | |||
| 1064 | if (tree_rmap_item->next) | ||
| 1065 | tree_rmap_item->next->prev = rmap_item; | ||
| 1066 | |||
| 1067 | tree_rmap_item->next = rmap_item; | ||
| 1068 | rmap_item->address |= STABLE_FLAG; | 1133 | rmap_item->address |= STABLE_FLAG; |
| 1134 | hlist_add_head(&rmap_item->hlist, &stable_node->hlist); | ||
| 1069 | 1135 | ||
| 1070 | ksm_pages_sharing++; | 1136 | if (rmap_item->hlist.next) |
| 1137 | ksm_pages_sharing++; | ||
| 1138 | else | ||
| 1139 | ksm_pages_shared++; | ||
| 1071 | } | 1140 | } |
| 1072 | 1141 | ||
| 1073 | /* | 1142 | /* |
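stable_tree_append() above hangs the rmap_item off the stable_node's hlist and counts the first sharer in pages_shared, every further one in pages_sharing. The approximate shape of the two structures, as inferred from the hunks in this patch; field names and layout are reconstructed for illustration and may not match mm/ksm.c exactly:

    struct stable_node {
            struct rb_node node;            /* node of root_stable_tree */
            struct hlist_head hlist;        /* rmap_items mapping this ksm page */
            unsigned long kpfn;             /* pfn of the ksm page */
    };

    struct rmap_item {
            struct rmap_item *rmap_list;    /* next item in the mm_slot's list */
            struct anon_vma *anon_vma;      /* held while the item is stable */
            struct mm_struct *mm;
            unsigned long address;          /* low bits reused as flags/seqnr */
            unsigned int oldchecksum;       /* checksum while unstable */
            union {
                    struct rb_node node;    /* when node of the unstable tree */
                    struct {                /* when listed from a stable_node */
                            struct stable_node *head;
                            struct hlist_node hlist;
                    };
            };
    };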
| @@ -1081,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item, | |||
| 1081 | */ | 1150 | */ |
| 1082 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | 1151 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) |
| 1083 | { | 1152 | { |
| 1084 | struct page *page2[1]; | ||
| 1085 | struct rmap_item *tree_rmap_item; | 1153 | struct rmap_item *tree_rmap_item; |
| 1154 | struct page *tree_page = NULL; | ||
| 1155 | struct stable_node *stable_node; | ||
| 1156 | struct page *kpage; | ||
| 1086 | unsigned int checksum; | 1157 | unsigned int checksum; |
| 1087 | int err; | 1158 | int err; |
| 1088 | 1159 | ||
| 1089 | if (in_stable_tree(rmap_item)) | 1160 | remove_rmap_item_from_tree(rmap_item); |
| 1090 | remove_rmap_item_from_tree(rmap_item); | ||
| 1091 | 1161 | ||
| 1092 | /* We first start with searching the page inside the stable tree */ | 1162 | /* We first start with searching the page inside the stable tree */ |
| 1093 | tree_rmap_item = stable_tree_search(page, page2, rmap_item); | 1163 | kpage = stable_tree_search(page); |
| 1094 | if (tree_rmap_item) { | 1164 | if (kpage) { |
| 1095 | if (page == page2[0]) /* forked */ | 1165 | err = try_to_merge_with_ksm_page(rmap_item, page, kpage); |
| 1096 | err = 0; | ||
| 1097 | else | ||
| 1098 | err = try_to_merge_with_ksm_page(rmap_item->mm, | ||
| 1099 | rmap_item->address, | ||
| 1100 | page, page2[0]); | ||
| 1101 | put_page(page2[0]); | ||
| 1102 | |||
| 1103 | if (!err) { | 1166 | if (!err) { |
| 1104 | /* | 1167 | /* |
| 1105 | * The page was successfully merged: | 1168 | * The page was successfully merged: |
| 1106 | * add its rmap_item to the stable tree. | 1169 | * add its rmap_item to the stable tree. |
| 1107 | */ | 1170 | */ |
| 1108 | stable_tree_append(rmap_item, tree_rmap_item); | 1171 | lock_page(kpage); |
| 1172 | stable_tree_append(rmap_item, page_stable_node(kpage)); | ||
| 1173 | unlock_page(kpage); | ||
| 1109 | } | 1174 | } |
| 1175 | put_page(kpage); | ||
| 1110 | return; | 1176 | return; |
| 1111 | } | 1177 | } |
| 1112 | 1178 | ||
| 1113 | /* | 1179 | /* |
| 1114 | * A ksm page might have got here by fork, but its other | 1180 | * If the hash value of the page has changed from the last time |
| 1115 | * references have already been removed from the stable tree. | 1181 | * we calculated it, this page is changing frequently: therefore we |
| 1116 | * Or it might be left over from a break_ksm which failed | 1182 | * don't want to insert it in the unstable tree, and we don't want |
| 1117 | * when the mem_cgroup had reached its limit: try again now. | 1183 | * to waste our time searching for something identical to it there. |
| 1118 | */ | ||
| 1119 | if (PageKsm(page)) | ||
| 1120 | break_cow(rmap_item->mm, rmap_item->address); | ||
| 1121 | |||
| 1122 | /* | ||
| 1123 | * In case the hash value of the page was changed from the last time we | ||
| 1124 | * have calculated it, this page to be changed frequely, therefore we | ||
| 1125 | * don't want to insert it to the unstable tree, and we don't want to | ||
| 1126 | * waste our time to search if there is something identical to it there. | ||
| 1127 | */ | 1184 | */ |
| 1128 | checksum = calc_checksum(page); | 1185 | checksum = calc_checksum(page); |
| 1129 | if (rmap_item->oldchecksum != checksum) { | 1186 | if (rmap_item->oldchecksum != checksum) { |
| @@ -1131,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
| 1131 | return; | 1188 | return; |
| 1132 | } | 1189 | } |
| 1133 | 1190 | ||
| 1134 | tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); | 1191 | tree_rmap_item = |
| 1192 | unstable_tree_search_insert(rmap_item, page, &tree_page); | ||
| 1135 | if (tree_rmap_item) { | 1193 | if (tree_rmap_item) { |
| 1136 | err = try_to_merge_two_pages(rmap_item->mm, | 1194 | kpage = try_to_merge_two_pages(rmap_item, page, |
| 1137 | rmap_item->address, page, | 1195 | tree_rmap_item, tree_page); |
| 1138 | tree_rmap_item->mm, | 1196 | put_page(tree_page); |
| 1139 | tree_rmap_item->address, page2[0]); | ||
| 1140 | /* | 1197 | /* |
| 1141 | * As soon as we merge this page, we want to remove the | 1198 | * As soon as we merge this page, we want to remove the |
| 1142 | * rmap_item of the page we have merged with from the unstable | 1199 | * rmap_item of the page we have merged with from the unstable |
| 1143 | * tree, and insert it instead as new node in the stable tree. | 1200 | * tree, and insert it instead as new node in the stable tree. |
| 1144 | */ | 1201 | */ |
| 1145 | if (!err) { | 1202 | if (kpage) { |
| 1146 | rb_erase(&tree_rmap_item->node, &root_unstable_tree); | 1203 | remove_rmap_item_from_tree(tree_rmap_item); |
| 1147 | tree_rmap_item->address &= ~NODE_FLAG; | 1204 | |
| 1148 | ksm_pages_unshared--; | 1205 | lock_page(kpage); |
| 1206 | stable_node = stable_tree_insert(kpage); | ||
| 1207 | if (stable_node) { | ||
| 1208 | stable_tree_append(tree_rmap_item, stable_node); | ||
| 1209 | stable_tree_append(rmap_item, stable_node); | ||
| 1210 | } | ||
| 1211 | unlock_page(kpage); | ||
| 1149 | 1212 | ||
| 1150 | /* | 1213 | /* |
| 1151 | * If we fail to insert the page into the stable tree, | 1214 | * If we fail to insert the page into the stable tree, |
| @@ -1153,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | |||
| 1153 | * to a ksm page left outside the stable tree, | 1216 | * to a ksm page left outside the stable tree, |
| 1154 | * in which case we need to break_cow on both. | 1217 | * in which case we need to break_cow on both. |
| 1155 | */ | 1218 | */ |
| 1156 | if (stable_tree_insert(page2[0], tree_rmap_item)) | 1219 | if (!stable_node) { |
| 1157 | stable_tree_append(rmap_item, tree_rmap_item); | 1220 | break_cow(tree_rmap_item); |
| 1158 | else { | 1221 | break_cow(rmap_item); |
| 1159 | break_cow(tree_rmap_item->mm, | ||
| 1160 | tree_rmap_item->address); | ||
| 1161 | break_cow(rmap_item->mm, rmap_item->address); | ||
| 1162 | } | 1222 | } |
| 1163 | } | 1223 | } |
| 1164 | |||
| 1165 | put_page(page2[0]); | ||
| 1166 | } | 1224 | } |
| 1167 | } | 1225 | } |
| 1168 | 1226 | ||
| 1169 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | 1227 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, |
| 1170 | struct list_head *cur, | 1228 | struct rmap_item **rmap_list, |
| 1171 | unsigned long addr) | 1229 | unsigned long addr) |
| 1172 | { | 1230 | { |
| 1173 | struct rmap_item *rmap_item; | 1231 | struct rmap_item *rmap_item; |
| 1174 | 1232 | ||
| 1175 | while (cur != &mm_slot->rmap_list) { | 1233 | while (*rmap_list) { |
| 1176 | rmap_item = list_entry(cur, struct rmap_item, link); | 1234 | rmap_item = *rmap_list; |
| 1177 | if ((rmap_item->address & PAGE_MASK) == addr) { | 1235 | if ((rmap_item->address & PAGE_MASK) == addr) |
| 1178 | if (!in_stable_tree(rmap_item)) | ||
| 1179 | remove_rmap_item_from_tree(rmap_item); | ||
| 1180 | return rmap_item; | 1236 | return rmap_item; |
| 1181 | } | ||
| 1182 | if (rmap_item->address > addr) | 1237 | if (rmap_item->address > addr) |
| 1183 | break; | 1238 | break; |
| 1184 | cur = cur->next; | 1239 | *rmap_list = rmap_item->rmap_list; |
| 1185 | remove_rmap_item_from_tree(rmap_item); | 1240 | remove_rmap_item_from_tree(rmap_item); |
| 1186 | list_del(&rmap_item->link); | ||
| 1187 | free_rmap_item(rmap_item); | 1241 | free_rmap_item(rmap_item); |
| 1188 | } | 1242 | } |
| 1189 | 1243 | ||
| @@ -1192,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | |||
| 1192 | /* It has already been zeroed */ | 1246 | /* It has already been zeroed */ |
| 1193 | rmap_item->mm = mm_slot->mm; | 1247 | rmap_item->mm = mm_slot->mm; |
| 1194 | rmap_item->address = addr; | 1248 | rmap_item->address = addr; |
| 1195 | list_add_tail(&rmap_item->link, cur); | 1249 | rmap_item->rmap_list = *rmap_list; |
| 1250 | *rmap_list = rmap_item; | ||
| 1196 | } | 1251 | } |
| 1197 | return rmap_item; | 1252 | return rmap_item; |
| 1198 | } | 1253 | } |
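get_next_rmap_item() above walks the new singly linked rmap_list through a struct rmap_item **, so the caller always holds the address of the link to rewrite and insertion needs no special case for the head. The idiom in isolation, with placeholder types, for illustration:

    struct node {
            struct node *next;
            unsigned long key;
    };

    /* Insert 'new' keeping the list sorted by key; 'link' points either at
     * the head pointer or at some node's next pointer. */
    static void insert_sorted(struct node **link, struct node *new)
    {
            while (*link && (*link)->key < new->key)
                    link = &(*link)->next;  /* advance to the link to rewrite */
            new->next = *link;
            *link = new;
    }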
| @@ -1217,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) | |||
| 1217 | spin_unlock(&ksm_mmlist_lock); | 1272 | spin_unlock(&ksm_mmlist_lock); |
| 1218 | next_mm: | 1273 | next_mm: |
| 1219 | ksm_scan.address = 0; | 1274 | ksm_scan.address = 0; |
| 1220 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1275 | ksm_scan.rmap_list = &slot->rmap_list; |
| 1221 | struct rmap_item, link); | ||
| 1222 | } | 1276 | } |
| 1223 | 1277 | ||
| 1224 | mm = slot->mm; | 1278 | mm = slot->mm; |
| @@ -1244,10 +1298,10 @@ next_mm: | |||
| 1244 | flush_anon_page(vma, *page, ksm_scan.address); | 1298 | flush_anon_page(vma, *page, ksm_scan.address); |
| 1245 | flush_dcache_page(*page); | 1299 | flush_dcache_page(*page); |
| 1246 | rmap_item = get_next_rmap_item(slot, | 1300 | rmap_item = get_next_rmap_item(slot, |
| 1247 | ksm_scan.rmap_item->link.next, | 1301 | ksm_scan.rmap_list, ksm_scan.address); |
| 1248 | ksm_scan.address); | ||
| 1249 | if (rmap_item) { | 1302 | if (rmap_item) { |
| 1250 | ksm_scan.rmap_item = rmap_item; | 1303 | ksm_scan.rmap_list = |
| 1304 | &rmap_item->rmap_list; | ||
| 1251 | ksm_scan.address += PAGE_SIZE; | 1305 | ksm_scan.address += PAGE_SIZE; |
| 1252 | } else | 1306 | } else |
| 1253 | put_page(*page); | 1307 | put_page(*page); |
| @@ -1263,14 +1317,13 @@ next_mm: | |||
| 1263 | 1317 | ||
| 1264 | if (ksm_test_exit(mm)) { | 1318 | if (ksm_test_exit(mm)) { |
| 1265 | ksm_scan.address = 0; | 1319 | ksm_scan.address = 0; |
| 1266 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | 1320 | ksm_scan.rmap_list = &slot->rmap_list; |
| 1267 | struct rmap_item, link); | ||
| 1268 | } | 1321 | } |
| 1269 | /* | 1322 | /* |
| 1270 | * Nuke all the rmap_items that are above this current rmap: | 1323 | * Nuke all the rmap_items that are above this current rmap: |
| 1271 | * because there were no VM_MERGEABLE vmas with such addresses. | 1324 | * because there were no VM_MERGEABLE vmas with such addresses. |
| 1272 | */ | 1325 | */ |
| 1273 | remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); | 1326 | remove_trailing_rmap_items(slot, ksm_scan.rmap_list); |
| 1274 | 1327 | ||
| 1275 | spin_lock(&ksm_mmlist_lock); | 1328 | spin_lock(&ksm_mmlist_lock); |
| 1276 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, | 1329 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, |
| @@ -1323,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages) | |||
| 1323 | return; | 1376 | return; |
| 1324 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | 1377 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) |
| 1325 | cmp_and_merge_page(page, rmap_item); | 1378 | cmp_and_merge_page(page, rmap_item); |
| 1326 | else if (page_mapcount(page) == 1) { | ||
| 1327 | /* | ||
| 1328 | * Replace now-unshared ksm page by ordinary page. | ||
| 1329 | */ | ||
| 1330 | break_cow(rmap_item->mm, rmap_item->address); | ||
| 1331 | remove_rmap_item_from_tree(rmap_item); | ||
| 1332 | rmap_item->oldchecksum = calc_checksum(page); | ||
| 1333 | } | ||
| 1334 | put_page(page); | 1379 | put_page(page); |
| 1335 | } | 1380 | } |
| 1336 | } | 1381 | } |
| @@ -1375,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | |||
| 1375 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | 1420 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | |
| 1376 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | 1421 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | |
| 1377 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | 1422 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | |
| 1378 | VM_MIXEDMAP | VM_SAO)) | 1423 | VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) |
| 1379 | return 0; /* just ignore the advice */ | 1424 | return 0; /* just ignore the advice */ |
| 1380 | 1425 | ||
| 1381 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | 1426 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { |
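ksm_madvise() above is reached from userspace through madvise(MADV_MERGEABLE) on a private anonymous mapping. A minimal userspace sketch; it assumes a kernel with CONFIG_KSM and defines the constant itself in case the libc headers predate it (the value is 12 in asm-generic/mman-common.h):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    #ifndef MADV_MERGEABLE
    #define MADV_MERGEABLE 12               /* asm-generic/mman-common.h */
    #endif

    int main(void)
    {
            size_t len = 64UL << 20;        /* 64MB of anonymous memory */
            void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED || madvise(buf, len, MADV_MERGEABLE)) {
                    perror("ksm setup");
                    return 1;
            }
            /* fill buf with duplicate pages; ksmd may merge them over time */
            return 0;
    }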
| @@ -1452,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm) | |||
| 1452 | spin_lock(&ksm_mmlist_lock); | 1497 | spin_lock(&ksm_mmlist_lock); |
| 1453 | mm_slot = get_mm_slot(mm); | 1498 | mm_slot = get_mm_slot(mm); |
| 1454 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | 1499 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { |
| 1455 | if (list_empty(&mm_slot->rmap_list)) { | 1500 | if (!mm_slot->rmap_list) { |
| 1456 | hlist_del(&mm_slot->link); | 1501 | hlist_del(&mm_slot->link); |
| 1457 | list_del(&mm_slot->mm_list); | 1502 | list_del(&mm_slot->mm_list); |
| 1458 | easy_to_free = 1; | 1503 | easy_to_free = 1; |
| @@ -1473,6 +1518,255 @@ void __ksm_exit(struct mm_struct *mm) | |||
| 1473 | } | 1518 | } |
| 1474 | } | 1519 | } |
| 1475 | 1520 | ||
| 1521 | struct page *ksm_does_need_to_copy(struct page *page, | ||
| 1522 | struct vm_area_struct *vma, unsigned long address) | ||
| 1523 | { | ||
| 1524 | struct page *new_page; | ||
| 1525 | |||
| 1526 | unlock_page(page); /* any racers will COW it, not modify it */ | ||
| 1527 | |||
| 1528 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
| 1529 | if (new_page) { | ||
| 1530 | copy_user_highpage(new_page, page, address, vma); | ||
| 1531 | |||
| 1532 | SetPageDirty(new_page); | ||
| 1533 | __SetPageUptodate(new_page); | ||
| 1534 | SetPageSwapBacked(new_page); | ||
| 1535 | __set_page_locked(new_page); | ||
| 1536 | |||
| 1537 | if (page_evictable(new_page, vma)) | ||
| 1538 | lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); | ||
| 1539 | else | ||
| 1540 | add_page_to_unevictable_list(new_page); | ||
| 1541 | } | ||
| 1542 | |||
| 1543 | page_cache_release(page); | ||
| 1544 | return new_page; | ||
| 1545 | } | ||
| 1546 | |||
| 1547 | int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | ||
| 1548 | unsigned long *vm_flags) | ||
| 1549 | { | ||
| 1550 | struct stable_node *stable_node; | ||
| 1551 | struct rmap_item *rmap_item; | ||
| 1552 | struct hlist_node *hlist; | ||
| 1553 | unsigned int mapcount = page_mapcount(page); | ||
| 1554 | int referenced = 0; | ||
| 1555 | int search_new_forks = 0; | ||
| 1556 | |||
| 1557 | VM_BUG_ON(!PageKsm(page)); | ||
| 1558 | VM_BUG_ON(!PageLocked(page)); | ||
| 1559 | |||
| 1560 | stable_node = page_stable_node(page); | ||
| 1561 | if (!stable_node) | ||
| 1562 | return 0; | ||
| 1563 | again: | ||
| 1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
| 1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
| 1566 | struct anon_vma_chain *vmac; | ||
| 1567 | struct vm_area_struct *vma; | ||
| 1568 | |||
| 1569 | spin_lock(&anon_vma->lock); | ||
| 1570 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | ||
| 1571 | vma = vmac->vma; | ||
| 1572 | if (rmap_item->address < vma->vm_start || | ||
| 1573 | rmap_item->address >= vma->vm_end) | ||
| 1574 | continue; | ||
| 1575 | /* | ||
| 1576 | * Initially we examine only the vma which covers this | ||
| 1577 | * rmap_item; but later, if there is still work to do, | ||
| 1578 | * we examine covering vmas in other mms: in case they | ||
| 1579 | * were forked from the original since ksmd passed. | ||
| 1580 | */ | ||
| 1581 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
| 1582 | continue; | ||
| 1583 | |||
| 1584 | if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) | ||
| 1585 | continue; | ||
| 1586 | |||
| 1587 | referenced += page_referenced_one(page, vma, | ||
| 1588 | rmap_item->address, &mapcount, vm_flags); | ||
| 1589 | if (!search_new_forks || !mapcount) | ||
| 1590 | break; | ||
| 1591 | } | ||
| 1592 | spin_unlock(&anon_vma->lock); | ||
| 1593 | if (!mapcount) | ||
| 1594 | goto out; | ||
| 1595 | } | ||
| 1596 | if (!search_new_forks++) | ||
| 1597 | goto again; | ||
| 1598 | out: | ||
| 1599 | return referenced; | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | ||
| 1603 | { | ||
| 1604 | struct stable_node *stable_node; | ||
| 1605 | struct hlist_node *hlist; | ||
| 1606 | struct rmap_item *rmap_item; | ||
| 1607 | int ret = SWAP_AGAIN; | ||
| 1608 | int search_new_forks = 0; | ||
| 1609 | |||
| 1610 | VM_BUG_ON(!PageKsm(page)); | ||
| 1611 | VM_BUG_ON(!PageLocked(page)); | ||
| 1612 | |||
| 1613 | stable_node = page_stable_node(page); | ||
| 1614 | if (!stable_node) | ||
| 1615 | return SWAP_FAIL; | ||
| 1616 | again: | ||
| 1617 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
| 1618 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
| 1619 | struct anon_vma_chain *vmac; | ||
| 1620 | struct vm_area_struct *vma; | ||
| 1621 | |||
| 1622 | spin_lock(&anon_vma->lock); | ||
| 1623 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | ||
| 1624 | vma = vmac->vma; | ||
| 1625 | if (rmap_item->address < vma->vm_start || | ||
| 1626 | rmap_item->address >= vma->vm_end) | ||
| 1627 | continue; | ||
| 1628 | /* | ||
| 1629 | * Initially we examine only the vma which covers this | ||
| 1630 | * rmap_item; but later, if there is still work to do, | ||
| 1631 | * we examine covering vmas in other mms: in case they | ||
| 1632 | * were forked from the original since ksmd passed. | ||
| 1633 | */ | ||
| 1634 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
| 1635 | continue; | ||
| 1636 | |||
| 1637 | ret = try_to_unmap_one(page, vma, | ||
| 1638 | rmap_item->address, flags); | ||
| 1639 | if (ret != SWAP_AGAIN || !page_mapped(page)) { | ||
| 1640 | spin_unlock(&anon_vma->lock); | ||
| 1641 | goto out; | ||
| 1642 | } | ||
| 1643 | } | ||
| 1644 | spin_unlock(&anon_vma->lock); | ||
| 1645 | } | ||
| 1646 | if (!search_new_forks++) | ||
| 1647 | goto again; | ||
| 1648 | out: | ||
| 1649 | return ret; | ||
| 1650 | } | ||
| 1651 | |||
| 1652 | #ifdef CONFIG_MIGRATION | ||
| 1653 | int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | ||
| 1654 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
| 1655 | { | ||
| 1656 | struct stable_node *stable_node; | ||
| 1657 | struct hlist_node *hlist; | ||
| 1658 | struct rmap_item *rmap_item; | ||
| 1659 | int ret = SWAP_AGAIN; | ||
| 1660 | int search_new_forks = 0; | ||
| 1661 | |||
| 1662 | VM_BUG_ON(!PageKsm(page)); | ||
| 1663 | VM_BUG_ON(!PageLocked(page)); | ||
| 1664 | |||
| 1665 | stable_node = page_stable_node(page); | ||
| 1666 | if (!stable_node) | ||
| 1667 | return ret; | ||
| 1668 | again: | ||
| 1669 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | ||
| 1670 | struct anon_vma *anon_vma = rmap_item->anon_vma; | ||
| 1671 | struct anon_vma_chain *vmac; | ||
| 1672 | struct vm_area_struct *vma; | ||
| 1673 | |||
| 1674 | spin_lock(&anon_vma->lock); | ||
| 1675 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { | ||
| 1676 | vma = vmac->vma; | ||
| 1677 | if (rmap_item->address < vma->vm_start || | ||
| 1678 | rmap_item->address >= vma->vm_end) | ||
| 1679 | continue; | ||
| 1680 | /* | ||
| 1681 | * Initially we examine only the vma which covers this | ||
| 1682 | * rmap_item; but later, if there is still work to do, | ||
| 1683 | * we examine covering vmas in other mms: in case they | ||
| 1684 | * were forked from the original since ksmd passed. | ||
| 1685 | */ | ||
| 1686 | if ((rmap_item->mm == vma->vm_mm) == search_new_forks) | ||
| 1687 | continue; | ||
| 1688 | |||
| 1689 | ret = rmap_one(page, vma, rmap_item->address, arg); | ||
| 1690 | if (ret != SWAP_AGAIN) { | ||
| 1691 | spin_unlock(&anon_vma->lock); | ||
| 1692 | goto out; | ||
| 1693 | } | ||
| 1694 | } | ||
| 1695 | spin_unlock(&anon_vma->lock); | ||
| 1696 | } | ||
| 1697 | if (!search_new_forks++) | ||
| 1698 | goto again; | ||
| 1699 | out: | ||
| 1700 | return ret; | ||
| 1701 | } | ||
| 1702 | |||
| 1703 | void ksm_migrate_page(struct page *newpage, struct page *oldpage) | ||
| 1704 | { | ||
| 1705 | struct stable_node *stable_node; | ||
| 1706 | |||
| 1707 | VM_BUG_ON(!PageLocked(oldpage)); | ||
| 1708 | VM_BUG_ON(!PageLocked(newpage)); | ||
| 1709 | VM_BUG_ON(newpage->mapping != oldpage->mapping); | ||
| 1710 | |||
| 1711 | stable_node = page_stable_node(newpage); | ||
| 1712 | if (stable_node) { | ||
| 1713 | VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); | ||
| 1714 | stable_node->kpfn = page_to_pfn(newpage); | ||
| 1715 | } | ||
| 1716 | } | ||
| 1717 | #endif /* CONFIG_MIGRATION */ | ||
| 1718 | |||
| 1719 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 1720 | static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, | ||
| 1721 | unsigned long end_pfn) | ||
| 1722 | { | ||
| 1723 | struct rb_node *node; | ||
| 1724 | |||
| 1725 | for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { | ||
| 1726 | struct stable_node *stable_node; | ||
| 1727 | |||
| 1728 | stable_node = rb_entry(node, struct stable_node, node); | ||
| 1729 | if (stable_node->kpfn >= start_pfn && | ||
| 1730 | stable_node->kpfn < end_pfn) | ||
| 1731 | return stable_node; | ||
| 1732 | } | ||
| 1733 | return NULL; | ||
| 1734 | } | ||
| 1735 | |||
| 1736 | static int ksm_memory_callback(struct notifier_block *self, | ||
| 1737 | unsigned long action, void *arg) | ||
| 1738 | { | ||
| 1739 | struct memory_notify *mn = arg; | ||
| 1740 | struct stable_node *stable_node; | ||
| 1741 | |||
| 1742 | switch (action) { | ||
| 1743 | case MEM_GOING_OFFLINE: | ||
| 1744 | /* | ||
| 1745 | * Keep it very simple for now: just lock out ksmd and | ||
| 1746 | * MADV_UNMERGEABLE while any memory is going offline. | ||
| 1747 | */ | ||
| 1748 | mutex_lock(&ksm_thread_mutex); | ||
| 1749 | break; | ||
| 1750 | |||
| 1751 | case MEM_OFFLINE: | ||
| 1752 | /* | ||
| 1753 | * Most of the work is done by page migration; but there might | ||
| 1754 | * be a few stable_nodes left over, still pointing to struct | ||
| 1755 | * pages which have been offlined: prune those from the tree. | ||
| 1756 | */ | ||
| 1757 | while ((stable_node = ksm_check_stable_tree(mn->start_pfn, | ||
| 1758 | mn->start_pfn + mn->nr_pages)) != NULL) | ||
| 1759 | remove_node_from_stable_tree(stable_node); | ||
| 1760 | /* fallthrough */ | ||
| 1761 | |||
| 1762 | case MEM_CANCEL_OFFLINE: | ||
| 1763 | mutex_unlock(&ksm_thread_mutex); | ||
| 1764 | break; | ||
| 1765 | } | ||
| 1766 | return NOTIFY_OK; | ||
| 1767 | } | ||
| 1768 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
| 1769 | |||
| 1476 | #ifdef CONFIG_SYSFS | 1770 | #ifdef CONFIG_SYSFS |
| 1477 | /* | 1771 | /* |
| 1478 | * This all compiles without CONFIG_SYSFS, but is a waste of space. | 1772 | * This all compiles without CONFIG_SYSFS, but is a waste of space. |
| @@ -1551,8 +1845,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 1551 | /* | 1845 | /* |
| 1552 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. | 1846 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. |
| 1553 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, | 1847 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, |
| 1554 | * breaking COW to free the unswappable pages_shared (but leaves | 1848 | * breaking COW to free the pages_shared (but leaves mm_slots |
| 1555 | * mm_slots on the list for when ksmd may be set running again). | 1849 | * on the list for when ksmd may be set running again). |
| 1556 | */ | 1850 | */ |
| 1557 | 1851 | ||
| 1558 | mutex_lock(&ksm_thread_mutex); | 1852 | mutex_lock(&ksm_thread_mutex); |
| @@ -1577,29 +1871,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | |||
| 1577 | } | 1871 | } |
| 1578 | KSM_ATTR(run); | 1872 | KSM_ATTR(run); |
| 1579 | 1873 | ||
| 1580 | static ssize_t max_kernel_pages_store(struct kobject *kobj, | ||
| 1581 | struct kobj_attribute *attr, | ||
| 1582 | const char *buf, size_t count) | ||
| 1583 | { | ||
| 1584 | int err; | ||
| 1585 | unsigned long nr_pages; | ||
| 1586 | |||
| 1587 | err = strict_strtoul(buf, 10, &nr_pages); | ||
| 1588 | if (err) | ||
| 1589 | return -EINVAL; | ||
| 1590 | |||
| 1591 | ksm_max_kernel_pages = nr_pages; | ||
| 1592 | |||
| 1593 | return count; | ||
| 1594 | } | ||
| 1595 | |||
| 1596 | static ssize_t max_kernel_pages_show(struct kobject *kobj, | ||
| 1597 | struct kobj_attribute *attr, char *buf) | ||
| 1598 | { | ||
| 1599 | return sprintf(buf, "%lu\n", ksm_max_kernel_pages); | ||
| 1600 | } | ||
| 1601 | KSM_ATTR(max_kernel_pages); | ||
| 1602 | |||
| 1603 | static ssize_t pages_shared_show(struct kobject *kobj, | 1874 | static ssize_t pages_shared_show(struct kobject *kobj, |
| 1604 | struct kobj_attribute *attr, char *buf) | 1875 | struct kobj_attribute *attr, char *buf) |
| 1605 | { | 1876 | { |
| @@ -1649,7 +1920,6 @@ static struct attribute *ksm_attrs[] = { | |||
| 1649 | &sleep_millisecs_attr.attr, | 1920 | &sleep_millisecs_attr.attr, |
| 1650 | &pages_to_scan_attr.attr, | 1921 | &pages_to_scan_attr.attr, |
| 1651 | &run_attr.attr, | 1922 | &run_attr.attr, |
| 1652 | &max_kernel_pages_attr.attr, | ||
| 1653 | &pages_shared_attr.attr, | 1923 | &pages_shared_attr.attr, |
| 1654 | &pages_sharing_attr.attr, | 1924 | &pages_sharing_attr.attr, |
| 1655 | &pages_unshared_attr.attr, | 1925 | &pages_unshared_attr.attr, |
| @@ -1669,8 +1939,6 @@ static int __init ksm_init(void) | |||
| 1669 | struct task_struct *ksm_thread; | 1939 | struct task_struct *ksm_thread; |
| 1670 | int err; | 1940 | int err; |
| 1671 | 1941 | ||
| 1672 | ksm_max_kernel_pages = totalram_pages / 4; | ||
| 1673 | |||
| 1674 | err = ksm_slab_init(); | 1942 | err = ksm_slab_init(); |
| 1675 | if (err) | 1943 | if (err) |
| 1676 | goto out; | 1944 | goto out; |
| @@ -1698,6 +1966,13 @@ static int __init ksm_init(void) | |||
| 1698 | 1966 | ||
| 1699 | #endif /* CONFIG_SYSFS */ | 1967 | #endif /* CONFIG_SYSFS */ |
| 1700 | 1968 | ||
| 1969 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 1970 | /* | ||
| 1971 | * Choose a high priority since the callback takes ksm_thread_mutex: | ||
| 1972 | * later callbacks could only be taking locks which nest within that. | ||
| 1973 | */ | ||
| 1974 | hotplug_memory_notifier(ksm_memory_callback, 100); | ||
| 1975 | #endif | ||
| 1701 | return 0; | 1976 | return 0; |
| 1702 | 1977 | ||
| 1703 | out_free2: | 1978 | out_free2: |
diff --git a/mm/maccess.c b/mm/maccess.c index 9073695ff25f..4e348dbaecd7 100644 --- a/mm/maccess.c +++ b/mm/maccess.c | |||
| @@ -14,7 +14,11 @@ | |||
| 14 | * Safely read from address @src to the buffer at @dst. If a kernel fault | 14 | * Safely read from address @src to the buffer at @dst. If a kernel fault |
| 15 | * happens, handle that and return -EFAULT. | 15 | * happens, handle that and return -EFAULT. |
| 16 | */ | 16 | */ |
| 17 | long probe_kernel_read(void *dst, void *src, size_t size) | 17 | |
| 18 | long __weak probe_kernel_read(void *dst, void *src, size_t size) | ||
| 19 | __attribute__((alias("__probe_kernel_read"))); | ||
| 20 | |||
| 21 | long __probe_kernel_read(void *dst, void *src, size_t size) | ||
| 18 | { | 22 | { |
| 19 | long ret; | 23 | long ret; |
| 20 | mm_segment_t old_fs = get_fs(); | 24 | mm_segment_t old_fs = get_fs(); |
| @@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); | |||
| 39 | * Safely write to address @dst from the buffer at @src. If a kernel fault | 43 | * Safely write to address @dst from the buffer at @src. If a kernel fault |
| 40 | * happens, handle that and return -EFAULT. | 44 | * happens, handle that and return -EFAULT. |
| 41 | */ | 45 | */ |
| 42 | long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) | 46 | long __weak probe_kernel_write(void *dst, void *src, size_t size) |
| 47 | __attribute__((alias("__probe_kernel_write"))); | ||
| 48 | |||
| 49 | long __probe_kernel_write(void *dst, void *src, size_t size) | ||
| 43 | { | 50 | { |
| 44 | long ret; | 51 | long ret; |
| 45 | mm_segment_t old_fs = get_fs(); | 52 | mm_segment_t old_fs = get_fs(); |
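The maccess.c change above introduces __probe_kernel_read/__probe_kernel_write and makes the old names weak aliases of them, so an architecture can supply its own strong probe_kernel_*() while the generic version stays callable under the double-underscore name. The GCC weak-alias idiom in general form, with made-up names, shown here only for reference:

    #include <stddef.h>

    /* generic fallback implementation */
    long __probe_default(void *dst, const void *src, size_t size)
    {
            return 0;       /* pretend the copy always succeeds */
    }

    /* probe_op() resolves to __probe_default() unless some other object
     * file provides a strong definition of probe_op() */
    long probe_op(void *dst, const void *src, size_t size)
            __attribute__((weak, alias("__probe_default")));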
diff --git a/mm/madvise.c b/mm/madvise.c index 35b1479b7c9d..319528b8db74 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <linux/pagemap.h> | 9 | #include <linux/pagemap.h> |
| 10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
| 11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
| 12 | #include <linux/page-isolation.h> | ||
| 12 | #include <linux/hugetlb.h> | 13 | #include <linux/hugetlb.h> |
| 13 | #include <linux/sched.h> | 14 | #include <linux/sched.h> |
| 14 | #include <linux/ksm.h> | 15 | #include <linux/ksm.h> |
| @@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 222 | /* | 223 | /* |
| 223 | * Error injection support for memory error handling. | 224 | * Error injection support for memory error handling. |
| 224 | */ | 225 | */ |
| 225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | 226 | static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) |
| 226 | { | 227 | { |
| 227 | int ret = 0; | 228 | int ret = 0; |
| 228 | 229 | ||
| @@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end) | |||
| 230 | return -EPERM; | 231 | return -EPERM; |
| 231 | for (; start < end; start += PAGE_SIZE) { | 232 | for (; start < end; start += PAGE_SIZE) { |
| 232 | struct page *p; | 233 | struct page *p; |
| 233 | int ret = get_user_pages(current, current->mm, start, 1, | 234 | int ret = get_user_pages_fast(start, 1, 0, &p); |
| 234 | 0, 0, &p, NULL); | ||
| 235 | if (ret != 1) | 235 | if (ret != 1) |
| 236 | return ret; | 236 | return ret; |
| 237 | if (bhv == MADV_SOFT_OFFLINE) { | ||
| 238 | printk(KERN_INFO "Soft offlining page %lx at %lx\n", | ||
| 239 | page_to_pfn(p), start); | ||
| 240 | ret = soft_offline_page(p, MF_COUNT_INCREASED); | ||
| 241 | if (ret) | ||
| 242 | break; | ||
| 243 | continue; | ||
| 244 | } | ||
| 237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | 245 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", |
| 238 | page_to_pfn(p), start); | 246 | page_to_pfn(p), start); |
| 239 | /* Ignore return value for now */ | 247 | /* Ignore return value for now */ |
| 240 | __memory_failure(page_to_pfn(p), 0, 1); | 248 | __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); |
| 241 | put_page(p); | ||
| 242 | } | 249 | } |
| 243 | return ret; | 250 | return ret; |
| 244 | } | 251 | } |
| @@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
| 335 | size_t len; | 342 | size_t len; |
| 336 | 343 | ||
| 337 | #ifdef CONFIG_MEMORY_FAILURE | 344 | #ifdef CONFIG_MEMORY_FAILURE |
| 338 | if (behavior == MADV_HWPOISON) | 345 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
| 339 | return madvise_hwpoison(start, start+len_in); | 346 | return madvise_hwpoison(behavior, start, start+len_in); |
| 340 | #endif | 347 | #endif |
| 341 | if (!madvise_behavior_valid(behavior)) | 348 | if (!madvise_behavior_valid(behavior)) |
| 342 | return error; | 349 | return error; |
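The new MADV_SOFT_OFFLINE branch above is driven from userspace the same way as MADV_HWPOISON: a privileged test program madvise()s a page it owns and the kernel tries to migrate its contents away before retiring the physical page. A sketch only; it assumes CONFIG_MEMORY_FAILURE, CAP_SYS_ADMIN, and the constant value 101 from asm-generic/mman-common.h:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <unistd.h>
    #include <stdio.h>

    #ifndef MADV_SOFT_OFFLINE
    #define MADV_SOFT_OFFLINE 101           /* asm-generic/mman-common.h */
    #endif

    int main(void)
    {
            long pagesz = sysconf(_SC_PAGESIZE);
            char *p = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;
            p[0] = 1;                       /* make sure the page is populated */
            if (madvise(p, pagesz, MADV_SOFT_OFFLINE)) {
                    perror("MADV_SOFT_OFFLINE");
                    return 1;
            }
            return 0;
    }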
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f99f5991d6bb..d813823ab08f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -38,6 +38,7 @@ | |||
| 38 | #include <linux/vmalloc.h> | 38 | #include <linux/vmalloc.h> |
| 39 | #include <linux/mm_inline.h> | 39 | #include <linux/mm_inline.h> |
| 40 | #include <linux/page_cgroup.h> | 40 | #include <linux/page_cgroup.h> |
| 41 | #include <linux/cpu.h> | ||
| 41 | #include "internal.h" | 42 | #include "internal.h" |
| 42 | 43 | ||
| 43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
| @@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
| 54 | #define do_swap_account (0) | 55 | #define do_swap_account (0) |
| 55 | #endif | 56 | #endif |
| 56 | 57 | ||
| 57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | ||
| 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) |
| 59 | 59 | ||
| 60 | /* | 60 | /* |
| @@ -66,7 +66,7 @@ enum mem_cgroup_stat_index { | |||
| 66 | */ | 66 | */ |
| 67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
| 68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
| 69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
| 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
| 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
| 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ |
| @@ -209,7 +209,7 @@ struct mem_cgroup { | |||
| 209 | int prev_priority; /* for recording reclaim priority */ | 209 | int prev_priority; /* for recording reclaim priority */ |
| 210 | 210 | ||
| 211 | /* | 211 | /* |
| 212 | * While reclaiming in a hiearchy, we cache the last child we | 212 | * While reclaiming in a hierarchy, we cache the last child we |
| 213 | * reclaimed from. | 213 | * reclaimed from. |
| 214 | */ | 214 | */ |
| 215 | int last_scanned_child; | 215 | int last_scanned_child; |
| @@ -275,6 +275,7 @@ enum charge_type { | |||
| 275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
| 276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
| 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
| 278 | static void drain_all_stock_async(void); | ||
| 278 | 279 | ||
| 279 | static struct mem_cgroup_per_zone * | 280 | static struct mem_cgroup_per_zone * |
| 280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 281 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
| @@ -282,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
| 282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 283 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
| 283 | } | 284 | } |
| 284 | 285 | ||
| 286 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
| 287 | { | ||
| 288 | return &mem->css; | ||
| 289 | } | ||
| 290 | |||
| 285 | static struct mem_cgroup_per_zone * | 291 | static struct mem_cgroup_per_zone * |
| 286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 292 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
| 287 | { | 293 | { |
| @@ -758,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
| 758 | task_unlock(task); | 764 | task_unlock(task); |
| 759 | if (!curr) | 765 | if (!curr) |
| 760 | return 0; | 766 | return 0; |
| 761 | if (curr->use_hierarchy) | 767 | /* |
| 768 | * We should check use_hierarchy of "mem", not "curr". Checking | ||
| 769 | * use_hierarchy of "curr" here would make this function return true if | ||
| 770 | * hierarchy is enabled in "curr" and "curr" is a child of "mem" in the | ||
| 771 | * *cgroup* hierarchy (even if use_hierarchy is disabled in "mem"). | ||
| 772 | */ | ||
| 773 | if (mem->use_hierarchy) | ||
| 762 | ret = css_is_ancestor(&curr->css, &mem->css); | 774 | ret = css_is_ancestor(&curr->css, &mem->css); |
| 763 | else | 775 | else |
| 764 | ret = (curr == mem); | 776 | ret = (curr == mem); |
| @@ -1007,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
| 1007 | static char memcg_name[PATH_MAX]; | 1019 | static char memcg_name[PATH_MAX]; |
| 1008 | int ret; | 1020 | int ret; |
| 1009 | 1021 | ||
| 1010 | if (!memcg) | 1022 | if (!memcg || !p) |
| 1011 | return; | 1023 | return; |
| 1012 | 1024 | ||
| 1013 | 1025 | ||
| @@ -1137,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
| 1137 | victim = mem_cgroup_select_victim(root_mem); | 1149 | victim = mem_cgroup_select_victim(root_mem); |
| 1138 | if (victim == root_mem) { | 1150 | if (victim == root_mem) { |
| 1139 | loop++; | 1151 | loop++; |
| 1152 | if (loop >= 1) | ||
| 1153 | drain_all_stock_async(); | ||
| 1140 | if (loop >= 2) { | 1154 | if (loop >= 2) { |
| 1141 | /* | 1155 | /* |
| 1142 | * If we have not been able to reclaim | 1156 | * If we have not been able to reclaim |
| @@ -1223,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
| 1223 | * Currently used to update mapped file statistics, but the routine can be | 1237 | * Currently used to update mapped file statistics, but the routine can be |
| 1224 | * generalized to update other statistics as well. | 1238 | * generalized to update other statistics as well. |
| 1225 | */ | 1239 | */ |
| 1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | 1240 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
| 1227 | { | 1241 | { |
| 1228 | struct mem_cgroup *mem; | 1242 | struct mem_cgroup *mem; |
| 1229 | struct mem_cgroup_stat *stat; | 1243 | struct mem_cgroup_stat *stat; |
| @@ -1231,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
| 1231 | int cpu; | 1245 | int cpu; |
| 1232 | struct page_cgroup *pc; | 1246 | struct page_cgroup *pc; |
| 1233 | 1247 | ||
| 1234 | if (!page_is_file_cache(page)) | ||
| 1235 | return; | ||
| 1236 | |||
| 1237 | pc = lookup_page_cgroup(page); | 1248 | pc = lookup_page_cgroup(page); |
| 1238 | if (unlikely(!pc)) | 1249 | if (unlikely(!pc)) |
| 1239 | return; | 1250 | return; |
| @@ -1253,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | |||
| 1253 | stat = &mem->stat; | 1264 | stat = &mem->stat; |
| 1254 | cpustat = &stat->cpustat[cpu]; | 1265 | cpustat = &stat->cpustat[cpu]; |
| 1255 | 1266 | ||
| 1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | 1267 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); |
| 1257 | done: | 1268 | done: |
| 1258 | unlock_page_cgroup(pc); | 1269 | unlock_page_cgroup(pc); |
| 1259 | } | 1270 | } |
| 1260 | 1271 | ||
| 1261 | /* | 1272 | /* |
| 1273 | * size of first charge trial. "32" comes from vmscan.c's magic value. | ||
| 1274 | * TODO: bigger charge sizes may be necessary on very large machines. | ||
| 1275 | */ | ||
| 1276 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
| 1277 | struct memcg_stock_pcp { | ||
| 1278 | struct mem_cgroup *cached; /* this never be root cgroup */ | ||
| 1279 | int charge; | ||
| 1280 | struct work_struct work; | ||
| 1281 | }; | ||
| 1282 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
| 1283 | static atomic_t memcg_drain_count; | ||
| 1284 | |||
| 1285 | /* | ||
| 1286 | * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed | ||
| 1287 | * from the local stock and true is returned. If the stock is 0 or holds charges | ||
| 1288 | * from a cgroup other than the current target, returns false. This stock will be | ||
| 1289 | * refilled. | ||
| 1290 | */ | ||
| 1291 | static bool consume_stock(struct mem_cgroup *mem) | ||
| 1292 | { | ||
| 1293 | struct memcg_stock_pcp *stock; | ||
| 1294 | bool ret = true; | ||
| 1295 | |||
| 1296 | stock = &get_cpu_var(memcg_stock); | ||
| 1297 | if (mem == stock->cached && stock->charge) | ||
| 1298 | stock->charge -= PAGE_SIZE; | ||
| 1299 | else /* need to call res_counter_charge */ | ||
| 1300 | ret = false; | ||
| 1301 | put_cpu_var(memcg_stock); | ||
| 1302 | return ret; | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | /* | ||
| 1306 | * Returns stocks cached in percpu to res_counter and reset cached information. | ||
| 1307 | */ | ||
| 1308 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
| 1309 | { | ||
| 1310 | struct mem_cgroup *old = stock->cached; | ||
| 1311 | |||
| 1312 | if (stock->charge) { | ||
| 1313 | res_counter_uncharge(&old->res, stock->charge); | ||
| 1314 | if (do_swap_account) | ||
| 1315 | res_counter_uncharge(&old->memsw, stock->charge); | ||
| 1316 | } | ||
| 1317 | stock->cached = NULL; | ||
| 1318 | stock->charge = 0; | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | /* | ||
| 1322 | * This must be called under preempt disabled or must be called by | ||
| 1323 | * a thread which is pinned to local cpu. | ||
| 1324 | */ | ||
| 1325 | static void drain_local_stock(struct work_struct *dummy) | ||
| 1326 | { | ||
| 1327 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
| 1328 | drain_stock(stock); | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | /* | ||
| 1332 | * Cache charges (val) taken from the res_counter in the local per-cpu area. | ||
| 1333 | * They will be consumed by consume_stock() later. | ||
| 1334 | */ | ||
| 1335 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
| 1336 | { | ||
| 1337 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
| 1338 | |||
| 1339 | if (stock->cached != mem) { /* reset if necessary */ | ||
| 1340 | drain_stock(stock); | ||
| 1341 | stock->cached = mem; | ||
| 1342 | } | ||
| 1343 | stock->charge += val; | ||
| 1344 | put_cpu_var(memcg_stock); | ||
| 1345 | } | ||
| 1346 | |||
| 1347 | /* | ||
| 1348 | * Tries to drain stocked charges on other cpus. This function is asynchronous | ||
| 1349 | * and just queues a work item per cpu to drain locally on each cpu. The caller | ||
| 1350 | * can expect some charges to be returned to the res_counter later, but cannot | ||
| 1351 | * wait for that. | ||
| 1352 | */ | ||
| 1353 | static void drain_all_stock_async(void) | ||
| 1354 | { | ||
| 1355 | int cpu; | ||
| 1356 | /* This function schedules "drain" asynchronously. | ||
| 1357 | * The result of the drain is not directly handled by callers, so | ||
| 1358 | * if someone is already draining, we don't have to drain again. | ||
| 1359 | * In any case, the WORK_STRUCT_PENDING check in queue_work_on() will catch | ||
| 1360 | * a race if there is one. We only do a loose check here. | ||
| 1361 | */ | ||
| 1362 | if (atomic_read(&memcg_drain_count)) | ||
| 1363 | return; | ||
| 1364 | /* Notify other cpus that system-wide "drain" is running */ | ||
| 1365 | atomic_inc(&memcg_drain_count); | ||
| 1366 | get_online_cpus(); | ||
| 1367 | for_each_online_cpu(cpu) { | ||
| 1368 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
| 1369 | schedule_work_on(cpu, &stock->work); | ||
| 1370 | } | ||
| 1371 | put_online_cpus(); | ||
| 1372 | atomic_dec(&memcg_drain_count); | ||
| 1373 | /* We don't wait for flush_work */ | ||
| 1374 | } | ||
| 1375 | |||
| 1376 | /* This is a synchronous drain interface. */ | ||
| 1377 | static void drain_all_stock_sync(void) | ||
| 1378 | { | ||
| 1379 | /* called when force_empty is called */ | ||
| 1380 | atomic_inc(&memcg_drain_count); | ||
| 1381 | schedule_on_each_cpu(drain_local_stock); | ||
| 1382 | atomic_dec(&memcg_drain_count); | ||
| 1383 | } | ||
| 1384 | |||
| 1385 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
| 1386 | unsigned long action, | ||
| 1387 | void *hcpu) | ||
| 1388 | { | ||
| 1389 | int cpu = (unsigned long)hcpu; | ||
| 1390 | struct memcg_stock_pcp *stock; | ||
| 1391 | |||
| 1392 | if (action != CPU_DEAD) | ||
| 1393 | return NOTIFY_OK; | ||
| 1394 | stock = &per_cpu(memcg_stock, cpu); | ||
| 1395 | drain_stock(stock); | ||
| 1396 | return NOTIFY_OK; | ||
| 1397 | } | ||
| 1398 | |||
| 1399 | /* | ||
| 1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1400 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
| 1263 | * oom-killer can be invoked. | 1401 | * oom-killer can be invoked. |
| 1264 | */ | 1402 | */ |
| @@ -1269,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1269 | struct mem_cgroup *mem, *mem_over_limit; | 1407 | struct mem_cgroup *mem, *mem_over_limit; |
| 1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1408 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
| 1271 | struct res_counter *fail_res; | 1409 | struct res_counter *fail_res; |
| 1410 | int csize = CHARGE_SIZE; | ||
| 1272 | 1411 | ||
| 1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1412 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
| 1274 | /* Don't account this! */ | 1413 | /* Don't account this! */ |
| @@ -1293,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1293 | return 0; | 1432 | return 0; |
| 1294 | 1433 | ||
| 1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1434 | VM_BUG_ON(css_is_removed(&mem->css)); |
| 1435 | if (mem_cgroup_is_root(mem)) | ||
| 1436 | goto done; | ||
| 1296 | 1437 | ||
| 1297 | while (1) { | 1438 | while (1) { |
| 1298 | int ret = 0; | 1439 | int ret = 0; |
| 1299 | unsigned long flags = 0; | 1440 | unsigned long flags = 0; |
| 1300 | 1441 | ||
| 1301 | if (mem_cgroup_is_root(mem)) | 1442 | if (consume_stock(mem)) |
| 1302 | goto done; | 1443 | goto charged; |
| 1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1444 | |
| 1445 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
| 1304 | if (likely(!ret)) { | 1446 | if (likely(!ret)) { |
| 1305 | if (!do_swap_account) | 1447 | if (!do_swap_account) |
| 1306 | break; | 1448 | break; |
| 1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1449 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
| 1308 | &fail_res); | ||
| 1309 | if (likely(!ret)) | 1450 | if (likely(!ret)) |
| 1310 | break; | 1451 | break; |
| 1311 | /* mem+swap counter fails */ | 1452 | /* mem+swap counter fails */ |
| 1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1453 | res_counter_uncharge(&mem->res, csize); |
| 1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1454 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
| 1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1455 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
| 1315 | memsw); | 1456 | memsw); |
| @@ -1318,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1459 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
| 1319 | res); | 1460 | res); |
| 1320 | 1461 | ||
| 1462 | /* reduce request size and retry */ | ||
| 1463 | if (csize > PAGE_SIZE) { | ||
| 1464 | csize = PAGE_SIZE; | ||
| 1465 | continue; | ||
| 1466 | } | ||
| 1321 | if (!(gfp_mask & __GFP_WAIT)) | 1467 | if (!(gfp_mask & __GFP_WAIT)) |
| 1322 | goto nomem; | 1468 | goto nomem; |
| 1323 | 1469 | ||
| @@ -1339,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1339 | 1485 | ||
| 1340 | if (!nr_retries--) { | 1486 | if (!nr_retries--) { |
| 1341 | if (oom) { | 1487 | if (oom) { |
| 1342 | mutex_lock(&memcg_tasklist); | ||
| 1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1488 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); |
| 1344 | mutex_unlock(&memcg_tasklist); | ||
| 1345 | record_last_oom(mem_over_limit); | 1489 | record_last_oom(mem_over_limit); |
| 1346 | } | 1490 | } |
| 1347 | goto nomem; | 1491 | goto nomem; |
| 1348 | } | 1492 | } |
| 1349 | } | 1493 | } |
| 1494 | if (csize > PAGE_SIZE) | ||
| 1495 | refill_stock(mem, csize - PAGE_SIZE); | ||
| 1496 | charged: | ||
| 1350 | /* | 1497 | /* |
| 1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1498 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. |
| 1352 | * if they exceeds softlimit. | 1499 | * if they exceeds softlimit. |
| @@ -1361,6 +1508,21 @@ nomem: | |||
| 1361 | } | 1508 | } |
| 1362 | 1509 | ||
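
The loop above first tries to consume the local stock, then charges CHARGE_SIZE in one go and, when that bulk request hits the limit, shrinks the request back to a single page before falling into reclaim/OOM handling; a successful bulk charge parks the surplus in the stock. A compilable toy of that ladder is sketched below — limit, counter_charge() and try_charge_one_page() are invented stand-ins, and the reclaim/OOM steps are deliberately left out.

/* Toy version of the charge fallback ladder (illustrative, not kernel code). */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SZ   4096L
#define CHARGE_SZ (32 * PAGE_SZ)

static long limit = 8 * PAGE_SZ;    /* pretend hard limit: 8 pages */
static long usage;

/* stands in for res_counter_charge(): all-or-nothing against the limit */
static bool counter_charge(long bytes)
{
    if (usage + bytes > limit)
        return false;
    usage += bytes;
    return true;
}

/* charge one page, trying the batched size first */
static bool try_charge_one_page(long *stock)
{
    long csize = CHARGE_SZ;

    if (*stock >= PAGE_SZ) {        /* fast path: consume local stock */
        *stock -= PAGE_SZ;
        return true;
    }
    while (!counter_charge(csize)) {
        if (csize > PAGE_SZ) {      /* reduce request size and retry */
            csize = PAGE_SZ;
            continue;
        }
        return false;               /* the kernel would reclaim or OOM here */
    }
    *stock += csize - PAGE_SZ;      /* a successful bulk charge feeds the stock */
    return true;
}

int main(void)
{
    long stock = 0;

    for (int i = 0; i < 10; i++) {
        bool ok = try_charge_one_page(&stock);
        printf("page %d: %s (usage=%ld, stock=%ld)\n",
               i, ok ? "charged" : "failed", usage, stock);
    }
    return 0;
}

With a limit well above CHARGE_SZ, the first call would instead park 31 pages in the stock, as in the previous sketch.
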
| 1363 | /* | 1510 | /* |
| 1511 | * Sometimes we have to undo a charge we got by try_charge(). | ||
| 1512 | * This function does the uncharge and puts the css refcount | ||
| 1513 | * taken by try_charge(). | ||
| 1514 | */ | ||
| 1515 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
| 1516 | { | ||
| 1517 | if (!mem_cgroup_is_root(mem)) { | ||
| 1518 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
| 1519 | if (do_swap_account) | ||
| 1520 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1521 | } | ||
| 1522 | css_put(&mem->css); | ||
| 1523 | } | ||
| 1524 | |||
| 1525 | /* | ||
| 1364 | * A helper function to get mem_cgroup from ID. must be called under | 1526 | * A helper function to get mem_cgroup from ID. must be called under |
| 1365 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 1527 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
| 1366 | * it's concern. (dropping refcnt from swap can be called against removed | 1528 | * it's concern. (dropping refcnt from swap can be called against removed |
| @@ -1379,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
| 1379 | return container_of(css, struct mem_cgroup, css); | 1541 | return container_of(css, struct mem_cgroup, css); |
| 1380 | } | 1542 | } |
| 1381 | 1543 | ||
| 1382 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1544 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
| 1383 | { | 1545 | { |
| 1384 | struct mem_cgroup *mem; | 1546 | struct mem_cgroup *mem = NULL; |
| 1385 | struct page_cgroup *pc; | 1547 | struct page_cgroup *pc; |
| 1386 | unsigned short id; | 1548 | unsigned short id; |
| 1387 | swp_entry_t ent; | 1549 | swp_entry_t ent; |
| 1388 | 1550 | ||
| 1389 | VM_BUG_ON(!PageLocked(page)); | 1551 | VM_BUG_ON(!PageLocked(page)); |
| 1390 | 1552 | ||
| 1391 | if (!PageSwapCache(page)) | ||
| 1392 | return NULL; | ||
| 1393 | |||
| 1394 | pc = lookup_page_cgroup(page); | 1553 | pc = lookup_page_cgroup(page); |
| 1395 | lock_page_cgroup(pc); | 1554 | lock_page_cgroup(pc); |
| 1396 | if (PageCgroupUsed(pc)) { | 1555 | if (PageCgroupUsed(pc)) { |
| 1397 | mem = pc->mem_cgroup; | 1556 | mem = pc->mem_cgroup; |
| 1398 | if (mem && !css_tryget(&mem->css)) | 1557 | if (mem && !css_tryget(&mem->css)) |
| 1399 | mem = NULL; | 1558 | mem = NULL; |
| 1400 | } else { | 1559 | } else if (PageSwapCache(page)) { |
| 1401 | ent.val = page_private(page); | 1560 | ent.val = page_private(page); |
| 1402 | id = lookup_swap_cgroup(ent); | 1561 | id = lookup_swap_cgroup(ent); |
| 1403 | rcu_read_lock(); | 1562 | rcu_read_lock(); |
| @@ -1426,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
| 1426 | lock_page_cgroup(pc); | 1585 | lock_page_cgroup(pc); |
| 1427 | if (unlikely(PageCgroupUsed(pc))) { | 1586 | if (unlikely(PageCgroupUsed(pc))) { |
| 1428 | unlock_page_cgroup(pc); | 1587 | unlock_page_cgroup(pc); |
| 1429 | if (!mem_cgroup_is_root(mem)) { | 1588 | mem_cgroup_cancel_charge(mem); |
| 1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
| 1431 | if (do_swap_account) | ||
| 1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1433 | } | ||
| 1434 | css_put(&mem->css); | ||
| 1435 | return; | 1589 | return; |
| 1436 | } | 1590 | } |
| 1437 | 1591 | ||
| @@ -1464,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
| 1464 | } | 1618 | } |
| 1465 | 1619 | ||
| 1466 | /** | 1620 | /** |
| 1467 | * mem_cgroup_move_account - move account of the page | 1621 | * __mem_cgroup_move_account - move account of the page |
| 1468 | * @pc: page_cgroup of the page. | 1622 | * @pc: page_cgroup of the page. |
| 1469 | * @from: mem_cgroup which the page is moved from. | 1623 | * @from: mem_cgroup which the page is moved from. |
| 1470 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1624 | * @to: mem_cgroup which the page is moved to. @from != @to. |
| 1471 | * | 1625 | * |
| 1472 | * The caller must confirm following. | 1626 | * The caller must confirm following. |
| 1473 | * - page is not on LRU (isolate_page() is useful.) | 1627 | * - page is not on LRU (isolate_page() is useful.) |
| 1474 | * | 1628 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
| 1475 | * returns 0 at success, | ||
| 1476 | * returns -EBUSY when lock is busy or "pc" is unstable. | ||
| 1477 | * | 1629 | * |
| 1478 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1630 | * This function does "uncharge" from old cgroup but doesn't do "charge" to |
| 1479 | * new cgroup. It should be done by a caller. | 1631 | * new cgroup. It should be done by a caller. |
| 1480 | */ | 1632 | */ |
| 1481 | 1633 | ||
| 1482 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1634 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
| 1483 | struct mem_cgroup *from, struct mem_cgroup *to) | 1635 | struct mem_cgroup *from, struct mem_cgroup *to) |
| 1484 | { | 1636 | { |
| 1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
| 1486 | int nid, zid; | ||
| 1487 | int ret = -EBUSY; | ||
| 1488 | struct page *page; | 1637 | struct page *page; |
| 1489 | int cpu; | 1638 | int cpu; |
| 1490 | struct mem_cgroup_stat *stat; | 1639 | struct mem_cgroup_stat *stat; |
| @@ -1492,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 1492 | 1641 | ||
| 1493 | VM_BUG_ON(from == to); | 1642 | VM_BUG_ON(from == to); |
| 1494 | VM_BUG_ON(PageLRU(pc->page)); | 1643 | VM_BUG_ON(PageLRU(pc->page)); |
| 1495 | 1644 | VM_BUG_ON(!PageCgroupLocked(pc)); | |
| 1496 | nid = page_cgroup_nid(pc); | 1645 | VM_BUG_ON(!PageCgroupUsed(pc)); |
| 1497 | zid = page_cgroup_zid(pc); | 1646 | VM_BUG_ON(pc->mem_cgroup != from); |
| 1498 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | ||
| 1499 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | ||
| 1500 | |||
| 1501 | if (!trylock_page_cgroup(pc)) | ||
| 1502 | return ret; | ||
| 1503 | |||
| 1504 | if (!PageCgroupUsed(pc)) | ||
| 1505 | goto out; | ||
| 1506 | |||
| 1507 | if (pc->mem_cgroup != from) | ||
| 1508 | goto out; | ||
| 1509 | 1647 | ||
| 1510 | if (!mem_cgroup_is_root(from)) | 1648 | if (!mem_cgroup_is_root(from)) |
| 1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1649 | res_counter_uncharge(&from->res, PAGE_SIZE); |
| 1512 | mem_cgroup_charge_statistics(from, pc, false); | 1650 | mem_cgroup_charge_statistics(from, pc, false); |
| 1513 | 1651 | ||
| 1514 | page = pc->page; | 1652 | page = pc->page; |
| 1515 | if (page_is_file_cache(page) && page_mapped(page)) { | 1653 | if (page_mapped(page) && !PageAnon(page)) { |
| 1516 | cpu = smp_processor_id(); | 1654 | cpu = smp_processor_id(); |
| 1517 | /* Update mapped_file data for mem_cgroup "from" */ | 1655 | /* Update mapped_file data for mem_cgroup "from" */ |
| 1518 | stat = &from->stat; | 1656 | stat = &from->stat; |
| 1519 | cpustat = &stat->cpustat[cpu]; | 1657 | cpustat = &stat->cpustat[cpu]; |
| 1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1658 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
| 1521 | -1); | 1659 | -1); |
| 1522 | 1660 | ||
| 1523 | /* Update mapped_file data for mem_cgroup "to" */ | 1661 | /* Update mapped_file data for mem_cgroup "to" */ |
| 1524 | stat = &to->stat; | 1662 | stat = &to->stat; |
| 1525 | cpustat = &stat->cpustat[cpu]; | 1663 | cpustat = &stat->cpustat[cpu]; |
| 1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | 1664 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, |
| 1527 | 1); | 1665 | 1); |
| 1528 | } | 1666 | } |
| 1529 | 1667 | ||
| @@ -1534,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 1534 | css_get(&to->css); | 1672 | css_get(&to->css); |
| 1535 | pc->mem_cgroup = to; | 1673 | pc->mem_cgroup = to; |
| 1536 | mem_cgroup_charge_statistics(to, pc, true); | 1674 | mem_cgroup_charge_statistics(to, pc, true); |
| 1537 | ret = 0; | ||
| 1538 | out: | ||
| 1539 | unlock_page_cgroup(pc); | ||
| 1540 | /* | 1675 | /* |
| 1541 | * We charges against "to" which may not have any tasks. Then, "to" | 1676 | * We charges against "to" which may not have any tasks. Then, "to" |
| 1542 | * can be under rmdir(). But in current implementation, caller of | 1677 | * can be under rmdir(). But in current implementation, caller of |
| 1543 | * this function is just force_empty() and it's garanteed that | 1678 | * this function is just force_empty() and it's garanteed that |
| 1544 | * "to" is never removed. So, we don't check rmdir status here. | 1679 | * "to" is never removed. So, we don't check rmdir status here. |
| 1545 | */ | 1680 | */ |
| 1681 | } | ||
| 1682 | |||
| 1683 | /* | ||
| 1684 | * Check whether @pc is valid for moving the account, and if so call | ||
| 1685 | * __mem_cgroup_move_account(). | ||
| 1686 | */ | ||
| 1687 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
| 1688 | struct mem_cgroup *from, struct mem_cgroup *to) | ||
| 1689 | { | ||
| 1690 | int ret = -EINVAL; | ||
| 1691 | lock_page_cgroup(pc); | ||
| 1692 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
| 1693 | __mem_cgroup_move_account(pc, from, to); | ||
| 1694 | ret = 0; | ||
| 1695 | } | ||
| 1696 | unlock_page_cgroup(pc); | ||
| 1546 | return ret; | 1697 | return ret; |
| 1547 | } | 1698 | } |
| 1548 | 1699 | ||
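
The refactoring above splits the old function into a locked wrapper that validates the page_cgroup and a double-underscore worker that merely asserts those preconditions. A rough user-space analogue of that wrapper/worker split is sketched below; page_acct, move_account() and the pthread mutex are all invented for illustration (build with -pthread).

/* User-space analogue of the locked-wrapper / asserting-worker split. */
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page_acct {
    pthread_mutex_t lock;           /* stands in for lock_page_cgroup() */
    bool used;                      /* stands in for PageCgroupUsed */
    int  owner;                     /* stands in for pc->mem_cgroup */
};

/* worker: caller must hold the lock and have validated the preconditions */
static void __move_account(struct page_acct *pc, int from, int to)
{
    assert(pc->used && pc->owner == from);  /* mirrors the VM_BUG_ONs */
    pc->owner = to;                         /* "uncharge" from, "charge" to */
}

/* wrapper: take the lock, check validity, then call the worker */
static int move_account(struct page_acct *pc, int from, int to)
{
    int ret = -1;                           /* -EINVAL in the patch */

    pthread_mutex_lock(&pc->lock);
    if (pc->used && pc->owner == from) {
        __move_account(pc, from, to);
        ret = 0;
    }
    pthread_mutex_unlock(&pc->lock);
    return ret;
}

int main(void)
{
    struct page_acct pc = { PTHREAD_MUTEX_INITIALIZER, true, 1 };
    int ret;

    ret = move_account(&pc, 1, 2);
    printf("move 1->2: ret=%d, owner now %d\n", ret, pc.owner);
    ret = move_account(&pc, 1, 2);          /* page no longer owned by 1 */
    printf("move 1->2 again: ret=%d\n", ret);
    return 0;
}
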
| @@ -1564,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
| 1564 | if (!pcg) | 1715 | if (!pcg) |
| 1565 | return -EINVAL; | 1716 | return -EINVAL; |
| 1566 | 1717 | ||
| 1718 | ret = -EBUSY; | ||
| 1719 | if (!get_page_unless_zero(page)) | ||
| 1720 | goto out; | ||
| 1721 | if (isolate_lru_page(page)) | ||
| 1722 | goto put; | ||
| 1567 | 1723 | ||
| 1568 | parent = mem_cgroup_from_cont(pcg); | 1724 | parent = mem_cgroup_from_cont(pcg); |
| 1569 | |||
| 1570 | |||
| 1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1725 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
| 1572 | if (ret || !parent) | 1726 | if (ret || !parent) |
| 1573 | return ret; | 1727 | goto put_back; |
| 1574 | |||
| 1575 | if (!get_page_unless_zero(page)) { | ||
| 1576 | ret = -EBUSY; | ||
| 1577 | goto uncharge; | ||
| 1578 | } | ||
| 1579 | |||
| 1580 | ret = isolate_lru_page(page); | ||
| 1581 | |||
| 1582 | if (ret) | ||
| 1583 | goto cancel; | ||
| 1584 | 1728 | ||
| 1585 | ret = mem_cgroup_move_account(pc, child, parent); | 1729 | ret = mem_cgroup_move_account(pc, child, parent); |
| 1586 | 1730 | if (!ret) | |
| 1731 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | ||
| 1732 | else | ||
| 1733 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
| 1734 | put_back: | ||
| 1587 | putback_lru_page(page); | 1735 | putback_lru_page(page); |
| 1588 | if (!ret) { | 1736 | put: |
| 1589 | put_page(page); | ||
| 1590 | /* drop extra refcnt by try_charge() */ | ||
| 1591 | css_put(&parent->css); | ||
| 1592 | return 0; | ||
| 1593 | } | ||
| 1594 | |||
| 1595 | cancel: | ||
| 1596 | put_page(page); | 1737 | put_page(page); |
| 1597 | uncharge: | 1738 | out: |
| 1598 | /* drop extra refcnt by try_charge() */ | ||
| 1599 | css_put(&parent->css); | ||
| 1600 | /* uncharge if move fails */ | ||
| 1601 | if (!mem_cgroup_is_root(parent)) { | ||
| 1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
| 1603 | if (do_swap_account) | ||
| 1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
| 1605 | } | ||
| 1606 | return ret; | 1739 | return ret; |
| 1607 | } | 1740 | } |
| 1608 | 1741 | ||
| @@ -1720,7 +1853,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
| 1720 | /* | 1853 | /* |
| 1721 | * While swap-in, try_charge -> commit or cancel, the page is locked. | 1854 | * While swap-in, try_charge -> commit or cancel, the page is locked. |
| 1722 | * And when try_charge() successfully returns, one refcnt to memcg without | 1855 | * And when try_charge() successfully returns, one refcnt to memcg without |
| 1723 | * struct page_cgroup is aquired. This refcnt will be cumsumed by | 1856 | * struct page_cgroup is acquired. This refcnt will be consumed by |
| 1724 | * "commit()" or removed by "cancel()" | 1857 | * "commit()" or removed by "cancel()" |
| 1725 | */ | 1858 | */ |
| 1726 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 1859 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
| @@ -1737,12 +1870,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
| 1737 | goto charge_cur_mm; | 1870 | goto charge_cur_mm; |
| 1738 | /* | 1871 | /* |
| 1739 | * A racing thread's fault, or swapoff, may have already updated | 1872 | * A racing thread's fault, or swapoff, may have already updated |
| 1740 | * the pte, and even removed page from swap cache: return success | 1873 | * the pte, and even removed page from swap cache: in those cases |
| 1741 | * to go on to do_swap_page()'s pte_same() test, which should fail. | 1874 | * do_swap_page()'s pte_same() test will fail; but there's also a |
| 1875 | * KSM case which does need to charge the page. | ||
| 1742 | */ | 1876 | */ |
| 1743 | if (!PageSwapCache(page)) | 1877 | if (!PageSwapCache(page)) |
| 1744 | return 0; | 1878 | goto charge_cur_mm; |
| 1745 | mem = try_get_mem_cgroup_from_swapcache(page); | 1879 | mem = try_get_mem_cgroup_from_page(page); |
| 1746 | if (!mem) | 1880 | if (!mem) |
| 1747 | goto charge_cur_mm; | 1881 | goto charge_cur_mm; |
| 1748 | *ptr = mem; | 1882 | *ptr = mem; |
| @@ -1818,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
| 1818 | return; | 1952 | return; |
| 1819 | if (!mem) | 1953 | if (!mem) |
| 1820 | return; | 1954 | return; |
| 1821 | if (!mem_cgroup_is_root(mem)) { | 1955 | mem_cgroup_cancel_charge(mem); |
| 1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
| 1823 | if (do_swap_account) | ||
| 1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1825 | } | ||
| 1826 | css_put(&mem->css); | ||
| 1827 | } | 1956 | } |
| 1828 | 1957 | ||
| 1958 | static void | ||
| 1959 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
| 1960 | { | ||
| 1961 | struct memcg_batch_info *batch = NULL; | ||
| 1962 | bool uncharge_memsw = true; | ||
| 1963 | /* If swapout, usage of swap doesn't decrease */ | ||
| 1964 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
| 1965 | uncharge_memsw = false; | ||
| 1966 | /* | ||
| 1967 | * do_batch > 0 when unmapping pages or during inode invalidate/truncate. | ||
| 1968 | * In those cases, all pages freed continuously can be expected to be in | ||
| 1969 | * the same cgroup and we have a chance to coalesce uncharges. | ||
| 1970 | * But we uncharge one by one if the task is being OOM-killed (TIF_MEMDIE) | ||
| 1971 | * because we want to do the uncharge as soon as possible. | ||
| 1972 | */ | ||
| 1973 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
| 1974 | goto direct_uncharge; | ||
| 1975 | |||
| 1976 | batch = ¤t->memcg_batch; | ||
| 1977 | /* | ||
| 1978 | * Usually we do css_get() when we remember a memcg pointer. | ||
| 1979 | * But in this case, we keep res->usage until the end of a series of | ||
| 1980 | * uncharges, so it is ok to ignore the memcg's refcount. | ||
| 1981 | */ | ||
| 1982 | if (!batch->memcg) | ||
| 1983 | batch->memcg = mem; | ||
| 1984 | /* | ||
| 1985 | * In the typical case, batch->memcg == mem. This means we can | ||
| 1986 | * merge a series of uncharges into one res_counter uncharge. | ||
| 1987 | * If not, we uncharge the res_counter one by one. | ||
| 1988 | */ | ||
| 1989 | if (batch->memcg != mem) | ||
| 1990 | goto direct_uncharge; | ||
| 1991 | /* remember freed charge and uncharge it later */ | ||
| 1992 | batch->bytes += PAGE_SIZE; | ||
| 1993 | if (uncharge_memsw) | ||
| 1994 | batch->memsw_bytes += PAGE_SIZE; | ||
| 1995 | return; | ||
| 1996 | direct_uncharge: | ||
| 1997 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
| 1998 | if (uncharge_memsw) | ||
| 1999 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 2000 | return; | ||
| 2001 | } | ||
| 1829 | 2002 | ||
| 1830 | /* | 2003 | /* |
| 1831 | * uncharge if !page_mapped(page) | 2004 | * uncharge if !page_mapped(page) |
| @@ -1874,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 1874 | break; | 2047 | break; |
| 1875 | } | 2048 | } |
| 1876 | 2049 | ||
| 1877 | if (!mem_cgroup_is_root(mem)) { | 2050 | if (!mem_cgroup_is_root(mem)) |
| 1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2051 | __do_uncharge(mem, ctype); |
| 1879 | if (do_swap_account && | ||
| 1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
| 1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1882 | } | ||
| 1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2052 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
| 1884 | mem_cgroup_swap_statistics(mem, true); | 2053 | mem_cgroup_swap_statistics(mem, true); |
| 1885 | mem_cgroup_charge_statistics(mem, pc, false); | 2054 | mem_cgroup_charge_statistics(mem, pc, false); |
| @@ -1925,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
| 1925 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2094 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
| 1926 | } | 2095 | } |
| 1927 | 2096 | ||
| 2097 | /* | ||
| 2098 | * uncharge_start/uncharge_end is called from unmap_page_range/invalidate/truncate. | ||
| 2099 | * In those cases, pages are freed continuously and we can expect them | ||
| 2100 | * to be in the same memcg. Each of these callers itself limits the number of | ||
| 2101 | * pages freed at once, so uncharge_start/end() pairs up properly. | ||
| 2102 | * The pair may be called more than once (nested) in one context. | ||
| 2103 | */ | ||
| 2104 | |||
| 2105 | void mem_cgroup_uncharge_start(void) | ||
| 2106 | { | ||
| 2107 | current->memcg_batch.do_batch++; | ||
| 2108 | /* Nesting is allowed. */ | ||
| 2109 | if (current->memcg_batch.do_batch == 1) { | ||
| 2110 | current->memcg_batch.memcg = NULL; | ||
| 2111 | current->memcg_batch.bytes = 0; | ||
| 2112 | current->memcg_batch.memsw_bytes = 0; | ||
| 2113 | } | ||
| 2114 | } | ||
| 2115 | |||
| 2116 | void mem_cgroup_uncharge_end(void) | ||
| 2117 | { | ||
| 2118 | struct memcg_batch_info *batch = ¤t->memcg_batch; | ||
| 2119 | |||
| 2120 | if (!batch->do_batch) | ||
| 2121 | return; | ||
| 2122 | |||
| 2123 | batch->do_batch--; | ||
| 2124 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
| 2125 | return; | ||
| 2126 | |||
| 2127 | if (!batch->memcg) | ||
| 2128 | return; | ||
| 2129 | /* | ||
| 2130 | * This "batch->memcg" is valid without any css_get/put etc... | ||
| 2131 | * because we hide charges behind us. | ||
| 2132 | */ | ||
| 2133 | if (batch->bytes) | ||
| 2134 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
| 2135 | if (batch->memsw_bytes) | ||
| 2136 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
| 2137 | /* forget this pointer (for sanity check) */ | ||
| 2138 | batch->memcg = NULL; | ||
| 2139 | } | ||
| 2140 | |||
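
mem_cgroup_uncharge_start()/end() above simply bracket a run of page frees so the res_counter is updated once per run instead of once per page, with a nesting counter so the pair can stack. The sketch below shows only that contract; uncharge_one_page(), res_usage and the local batch struct are invented, and the real code additionally tracks memsw bytes and falls back to direct uncharge for TIF_MEMDIE.

/* Stand-alone sketch of the uncharge batching contract (illustrative). */
#include <stdio.h>

#define PAGE_SZ 4096L

static long res_usage = 64 * PAGE_SZ;   /* pretend group usage: 64 pages */

static struct {                         /* like current->memcg_batch */
    int  do_batch;                      /* nesting depth */
    long bytes;                         /* deferred uncharge */
} batch;

static void uncharge_start(void)
{
    if (batch.do_batch++ == 0)          /* outermost start resets the batch */
        batch.bytes = 0;
}

static void uncharge_one_page(void)
{
    if (batch.do_batch)
        batch.bytes += PAGE_SZ;         /* remember now, settle later */
    else
        res_usage -= PAGE_SZ;           /* no batch: direct uncharge */
}

static void uncharge_end(void)
{
    if (--batch.do_batch)
        return;                         /* still nested: keep accumulating */
    res_usage -= batch.bytes;           /* one update for the whole run */
    batch.bytes = 0;
}

int main(void)
{
    uncharge_start();                   /* e.g. around a truncate or unmap */
    for (int i = 0; i < 16; i++)
        uncharge_one_page();
    uncharge_end();

    printf("usage after batched free: %ld pages\n", res_usage / PAGE_SZ);
    return 0;
}
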
| 1928 | #ifdef CONFIG_SWAP | 2141 | #ifdef CONFIG_SWAP |
| 1929 | /* | 2142 | /* |
| 1930 | * called after __delete_from_swap_cache() and drop "page" account. | 2143 | * called after __delete_from_swap_cache() and drop "page" account. |
| @@ -2100,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 2100 | unsigned long long val) | 2313 | unsigned long long val) |
| 2101 | { | 2314 | { |
| 2102 | int retry_count; | 2315 | int retry_count; |
| 2103 | int progress; | ||
| 2104 | u64 memswlimit; | 2316 | u64 memswlimit; |
| 2105 | int ret = 0; | 2317 | int ret = 0; |
| 2106 | int children = mem_cgroup_count_children(memcg); | 2318 | int children = mem_cgroup_count_children(memcg); |
| @@ -2144,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 2144 | if (!ret) | 2356 | if (!ret) |
| 2145 | break; | 2357 | break; |
| 2146 | 2358 | ||
| 2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, | 2359 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
| 2148 | GFP_KERNEL, | ||
| 2149 | MEM_CGROUP_RECLAIM_SHRINK); | 2360 | MEM_CGROUP_RECLAIM_SHRINK); |
| 2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2361 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
| 2151 | /* Usage is reduced ? */ | 2362 | /* Usage is reduced ? */ |
| @@ -2334,7 +2545,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
| 2334 | pc = list_entry(list->prev, struct page_cgroup, lru); | 2545 | pc = list_entry(list->prev, struct page_cgroup, lru); |
| 2335 | if (busy == pc) { | 2546 | if (busy == pc) { |
| 2336 | list_move(&pc->lru, list); | 2547 | list_move(&pc->lru, list); |
| 2337 | busy = 0; | 2548 | busy = NULL; |
| 2338 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2549 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 2339 | continue; | 2550 | continue; |
| 2340 | } | 2551 | } |
| @@ -2375,7 +2586,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | |||
| 2375 | if (free_all) | 2586 | if (free_all) |
| 2376 | goto try_to_free; | 2587 | goto try_to_free; |
| 2377 | move_account: | 2588 | move_account: |
| 2378 | while (mem->res.usage > 0) { | 2589 | do { |
| 2379 | ret = -EBUSY; | 2590 | ret = -EBUSY; |
| 2380 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 2591 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
| 2381 | goto out; | 2592 | goto out; |
| @@ -2384,6 +2595,7 @@ move_account: | |||
| 2384 | goto out; | 2595 | goto out; |
| 2385 | /* This is for making all *used* pages to be on LRU. */ | 2596 | /* This is for making all *used* pages to be on LRU. */ |
| 2386 | lru_add_drain_all(); | 2597 | lru_add_drain_all(); |
| 2598 | drain_all_stock_sync(); | ||
| 2387 | ret = 0; | 2599 | ret = 0; |
| 2388 | for_each_node_state(node, N_HIGH_MEMORY) { | 2600 | for_each_node_state(node, N_HIGH_MEMORY) { |
| 2389 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2601 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
| @@ -2402,8 +2614,8 @@ move_account: | |||
| 2402 | if (ret == -ENOMEM) | 2614 | if (ret == -ENOMEM) |
| 2403 | goto try_to_free; | 2615 | goto try_to_free; |
| 2404 | cond_resched(); | 2616 | cond_resched(); |
| 2405 | } | 2617 | /* "ret" should also be checked to ensure all lists are empty. */ |
| 2406 | ret = 0; | 2618 | } while (mem->res.usage > 0 || ret); |
| 2407 | out: | 2619 | out: |
| 2408 | css_put(&mem->css); | 2620 | css_put(&mem->css); |
| 2409 | return ret; | 2621 | return ret; |
| @@ -2436,10 +2648,7 @@ try_to_free: | |||
| 2436 | } | 2648 | } |
| 2437 | lru_add_drain(); | 2649 | lru_add_drain(); |
| 2438 | /* try move_account...there may be some *locked* pages. */ | 2650 | /* try move_account...there may be some *locked* pages. */ |
| 2439 | if (mem->res.usage) | 2651 | goto move_account; |
| 2440 | goto move_account; | ||
| 2441 | ret = 0; | ||
| 2442 | goto out; | ||
| 2443 | } | 2652 | } |
| 2444 | 2653 | ||
| 2445 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 2654 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
| @@ -2466,7 +2675,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
| 2466 | 2675 | ||
| 2467 | cgroup_lock(); | 2676 | cgroup_lock(); |
| 2468 | /* | 2677 | /* |
| 2469 | * If parent's use_hiearchy is set, we can't make any modifications | 2678 | * If parent's use_hierarchy is set, we can't make any modifications |
| 2470 | * in the child subtrees. If it is unset, then the change can | 2679 | * in the child subtrees. If it is unset, then the change can |
| 2471 | * occur, provided the current cgroup has no children. | 2680 | * occur, provided the current cgroup has no children. |
| 2472 | * | 2681 | * |
| @@ -2541,6 +2750,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | |||
| 2541 | val += idx_val; | 2750 | val += idx_val; |
| 2542 | mem_cgroup_get_recursive_idx_stat(mem, | 2751 | mem_cgroup_get_recursive_idx_stat(mem, |
| 2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | 2752 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); |
| 2753 | val += idx_val; | ||
| 2544 | val <<= PAGE_SHIFT; | 2754 | val <<= PAGE_SHIFT; |
| 2545 | } else | 2755 | } else |
| 2546 | val = res_counter_read_u64(&mem->memsw, name); | 2756 | val = res_counter_read_u64(&mem->memsw, name); |
| @@ -2660,7 +2870,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
| 2660 | enum { | 2870 | enum { |
| 2661 | MCS_CACHE, | 2871 | MCS_CACHE, |
| 2662 | MCS_RSS, | 2872 | MCS_RSS, |
| 2663 | MCS_MAPPED_FILE, | 2873 | MCS_FILE_MAPPED, |
| 2664 | MCS_PGPGIN, | 2874 | MCS_PGPGIN, |
| 2665 | MCS_PGPGOUT, | 2875 | MCS_PGPGOUT, |
| 2666 | MCS_SWAP, | 2876 | MCS_SWAP, |
| @@ -2704,8 +2914,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
| 2704 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 2914 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
| 2705 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 2915 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); |
| 2706 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 2916 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
| 2707 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | 2917 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); |
| 2708 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | 2918 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
| 2709 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 2919 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); |
| 2710 | s->stat[MCS_PGPGIN] += val; | 2920 | s->stat[MCS_PGPGIN] += val; |
| 2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2921 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
| @@ -3097,11 +3307,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 3097 | 3307 | ||
| 3098 | /* root ? */ | 3308 | /* root ? */ |
| 3099 | if (cont->parent == NULL) { | 3309 | if (cont->parent == NULL) { |
| 3310 | int cpu; | ||
| 3100 | enable_swap_cgroup(); | 3311 | enable_swap_cgroup(); |
| 3101 | parent = NULL; | 3312 | parent = NULL; |
| 3102 | root_mem_cgroup = mem; | 3313 | root_mem_cgroup = mem; |
| 3103 | if (mem_cgroup_soft_limit_tree_init()) | 3314 | if (mem_cgroup_soft_limit_tree_init()) |
| 3104 | goto free_out; | 3315 | goto free_out; |
| 3316 | for_each_possible_cpu(cpu) { | ||
| 3317 | struct memcg_stock_pcp *stock = | ||
| 3318 | &per_cpu(memcg_stock, cpu); | ||
| 3319 | INIT_WORK(&stock->work, drain_local_stock); | ||
| 3320 | } | ||
| 3321 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
| 3105 | 3322 | ||
| 3106 | } else { | 3323 | } else { |
| 3107 | parent = mem_cgroup_from_cont(cont->parent); | 3324 | parent = mem_cgroup_from_cont(cont->parent); |
| @@ -3170,12 +3387,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, | |||
| 3170 | struct task_struct *p, | 3387 | struct task_struct *p, |
| 3171 | bool threadgroup) | 3388 | bool threadgroup) |
| 3172 | { | 3389 | { |
| 3173 | mutex_lock(&memcg_tasklist); | ||
| 3174 | /* | 3390 | /* |
| 3175 | * FIXME: It's better to move charges of this process from old | 3391 | * FIXME: It's better to move charges of this process from old |
| 3176 | * memcg to new memcg. But it's just on TODO-List now. | 3392 | * memcg to new memcg. But it's just on TODO-List now. |
| 3177 | */ | 3393 | */ |
| 3178 | mutex_unlock(&memcg_tasklist); | ||
| 3179 | } | 3394 | } |
| 3180 | 3395 | ||
| 3181 | struct cgroup_subsys mem_cgroup_subsys = { | 3396 | struct cgroup_subsys mem_cgroup_subsys = { |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index dacc64183874..d1f335162976 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -34,12 +34,16 @@ | |||
| 34 | #include <linux/kernel.h> | 34 | #include <linux/kernel.h> |
| 35 | #include <linux/mm.h> | 35 | #include <linux/mm.h> |
| 36 | #include <linux/page-flags.h> | 36 | #include <linux/page-flags.h> |
| 37 | #include <linux/kernel-page-flags.h> | ||
| 37 | #include <linux/sched.h> | 38 | #include <linux/sched.h> |
| 38 | #include <linux/ksm.h> | 39 | #include <linux/ksm.h> |
| 39 | #include <linux/rmap.h> | 40 | #include <linux/rmap.h> |
| 40 | #include <linux/pagemap.h> | 41 | #include <linux/pagemap.h> |
| 41 | #include <linux/swap.h> | 42 | #include <linux/swap.h> |
| 42 | #include <linux/backing-dev.h> | 43 | #include <linux/backing-dev.h> |
| 44 | #include <linux/migrate.h> | ||
| 45 | #include <linux/page-isolation.h> | ||
| 46 | #include <linux/suspend.h> | ||
| 43 | #include "internal.h" | 47 | #include "internal.h" |
| 44 | 48 | ||
| 45 | int sysctl_memory_failure_early_kill __read_mostly = 0; | 49 | int sysctl_memory_failure_early_kill __read_mostly = 0; |
| @@ -48,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1; | |||
| 48 | 52 | ||
| 49 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | 53 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); |
| 50 | 54 | ||
| 55 | #if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) | ||
| 56 | |||
| 57 | u32 hwpoison_filter_enable = 0; | ||
| 58 | u32 hwpoison_filter_dev_major = ~0U; | ||
| 59 | u32 hwpoison_filter_dev_minor = ~0U; | ||
| 60 | u64 hwpoison_filter_flags_mask; | ||
| 61 | u64 hwpoison_filter_flags_value; | ||
| 62 | EXPORT_SYMBOL_GPL(hwpoison_filter_enable); | ||
| 63 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); | ||
| 64 | EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); | ||
| 65 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); | ||
| 66 | EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); | ||
| 67 | |||
| 68 | static int hwpoison_filter_dev(struct page *p) | ||
| 69 | { | ||
| 70 | struct address_space *mapping; | ||
| 71 | dev_t dev; | ||
| 72 | |||
| 73 | if (hwpoison_filter_dev_major == ~0U && | ||
| 74 | hwpoison_filter_dev_minor == ~0U) | ||
| 75 | return 0; | ||
| 76 | |||
| 77 | /* | ||
| 78 | * page_mapping() does not accept slab pages | ||
| 79 | */ | ||
| 80 | if (PageSlab(p)) | ||
| 81 | return -EINVAL; | ||
| 82 | |||
| 83 | mapping = page_mapping(p); | ||
| 84 | if (mapping == NULL || mapping->host == NULL) | ||
| 85 | return -EINVAL; | ||
| 86 | |||
| 87 | dev = mapping->host->i_sb->s_dev; | ||
| 88 | if (hwpoison_filter_dev_major != ~0U && | ||
| 89 | hwpoison_filter_dev_major != MAJOR(dev)) | ||
| 90 | return -EINVAL; | ||
| 91 | if (hwpoison_filter_dev_minor != ~0U && | ||
| 92 | hwpoison_filter_dev_minor != MINOR(dev)) | ||
| 93 | return -EINVAL; | ||
| 94 | |||
| 95 | return 0; | ||
| 96 | } | ||
| 97 | |||
| 98 | static int hwpoison_filter_flags(struct page *p) | ||
| 99 | { | ||
| 100 | if (!hwpoison_filter_flags_mask) | ||
| 101 | return 0; | ||
| 102 | |||
| 103 | if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == | ||
| 104 | hwpoison_filter_flags_value) | ||
| 105 | return 0; | ||
| 106 | else | ||
| 107 | return -EINVAL; | ||
| 108 | } | ||
| 109 | |||
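
hwpoison_filter_flags() above accepts a page only when the bits selected by hwpoison_filter_flags_mask equal hwpoison_filter_flags_value. The toy below demonstrates just that mask/value test; the KPF_* constants are invented bit positions for illustration and do not follow the real kernel-page-flags numbering.

/* Sketch of the flags mask/value match used by the filter (illustrative). */
#include <stdint.h>
#include <stdio.h>

/* invented flag bits; the real kernel-page-flags numbering differs */
#define KPF_DIRTY (UINT64_C(1) << 0)
#define KPF_LRU   (UINT64_C(1) << 1)
#define KPF_SLAB  (UINT64_C(1) << 2)

/* pass only pages whose selected bits equal the wanted pattern */
static int flags_match(uint64_t page_flags, uint64_t mask, uint64_t value)
{
    if (!mask)
        return 1;                       /* filter disabled: everything passes */
    return (page_flags & mask) == value;
}

int main(void)
{
    /* "inject only into clean LRU pages": LRU must be set, DIRTY clear */
    uint64_t mask  = KPF_LRU | KPF_DIRTY;
    uint64_t value = KPF_LRU;

    printf("clean LRU page: %d\n", flags_match(KPF_LRU, mask, value));
    printf("dirty LRU page: %d\n", flags_match(KPF_LRU | KPF_DIRTY, mask, value));
    printf("slab page:      %d\n", flags_match(KPF_SLAB, mask, value));
    return 0;
}
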
| 110 | /* | ||
| 111 | * This allows stress tests to limit test scope to a collection of tasks | ||
| 112 | * by putting them under some memcg. This prevents killing unrelated/important | ||
| 113 | * processes such as /sbin/init. Note that the target task may share clean | ||
| 114 | * pages with init (e.g. libc text), which is harmless. If the target task | ||
| 115 | * shares _dirty_ pages with another task B, the test scheme must make sure B | ||
| 116 | * is also included in the memcg. Finally, due to race conditions this filter | ||
| 117 | * can only guarantee that the page either belongs to the memcg tasks, or is | ||
| 118 | * a freed page. | ||
| 119 | */ | ||
| 120 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | ||
| 121 | u64 hwpoison_filter_memcg; | ||
| 122 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | ||
| 123 | static int hwpoison_filter_task(struct page *p) | ||
| 124 | { | ||
| 125 | struct mem_cgroup *mem; | ||
| 126 | struct cgroup_subsys_state *css; | ||
| 127 | unsigned long ino; | ||
| 128 | |||
| 129 | if (!hwpoison_filter_memcg) | ||
| 130 | return 0; | ||
| 131 | |||
| 132 | mem = try_get_mem_cgroup_from_page(p); | ||
| 133 | if (!mem) | ||
| 134 | return -EINVAL; | ||
| 135 | |||
| 136 | css = mem_cgroup_css(mem); | ||
| 137 | /* root_mem_cgroup has NULL dentries */ | ||
| 138 | if (!css->cgroup->dentry) | ||
| 139 | return -EINVAL; | ||
| 140 | |||
| 141 | ino = css->cgroup->dentry->d_inode->i_ino; | ||
| 142 | css_put(css); | ||
| 143 | |||
| 144 | if (ino != hwpoison_filter_memcg) | ||
| 145 | return -EINVAL; | ||
| 146 | |||
| 147 | return 0; | ||
| 148 | } | ||
| 149 | #else | ||
| 150 | static int hwpoison_filter_task(struct page *p) { return 0; } | ||
| 151 | #endif | ||
| 152 | |||
| 153 | int hwpoison_filter(struct page *p) | ||
| 154 | { | ||
| 155 | if (!hwpoison_filter_enable) | ||
| 156 | return 0; | ||
| 157 | |||
| 158 | if (hwpoison_filter_dev(p)) | ||
| 159 | return -EINVAL; | ||
| 160 | |||
| 161 | if (hwpoison_filter_flags(p)) | ||
| 162 | return -EINVAL; | ||
| 163 | |||
| 164 | if (hwpoison_filter_task(p)) | ||
| 165 | return -EINVAL; | ||
| 166 | |||
| 167 | return 0; | ||
| 168 | } | ||
| 169 | #else | ||
| 170 | int hwpoison_filter(struct page *p) | ||
| 171 | { | ||
| 172 | return 0; | ||
| 173 | } | ||
| 174 | #endif | ||
| 175 | |||
| 176 | EXPORT_SYMBOL_GPL(hwpoison_filter); | ||
| 177 | |||
| 51 | /* | 178 | /* |
| 52 | * Send all the processes who have the page mapped an ``action optional'' | 179 | * Send all the processes who have the page mapped an ``action optional'' |
| 53 | * signal. | 180 | * signal. |
| @@ -83,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | |||
| 83 | } | 210 | } |
| 84 | 211 | ||
| 85 | /* | 212 | /* |
| 213 | * When an unknown page type is encountered, drain as many buffers as possible | ||
| 214 | * in the hope of turning the page into an LRU or free page, which we can handle. | ||
| 215 | */ | ||
| 216 | void shake_page(struct page *p, int access) | ||
| 217 | { | ||
| 218 | if (!PageSlab(p)) { | ||
| 219 | lru_add_drain_all(); | ||
| 220 | if (PageLRU(p)) | ||
| 221 | return; | ||
| 222 | drain_all_pages(); | ||
| 223 | if (PageLRU(p) || is_free_buddy_page(p)) | ||
| 224 | return; | ||
| 225 | } | ||
| 226 | |||
| 227 | /* | ||
| 228 | * Only call shrink_slab here (which would also | ||
| 229 | * shrink other caches) if the access is not potentially fatal. | ||
| 230 | */ | ||
| 231 | if (access) { | ||
| 232 | int nr; | ||
| 233 | do { | ||
| 234 | nr = shrink_slab(1000, GFP_KERNEL, 1000); | ||
| 235 | if (page_count(p) == 0) | ||
| 236 | break; | ||
| 237 | } while (nr > 10); | ||
| 238 | } | ||
| 239 | } | ||
| 240 | EXPORT_SYMBOL_GPL(shake_page); | ||
| 241 | |||
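
shake_page() above escalates from the cheapest remedy (draining LRU pagevecs) through per-cpu page lists to shrinking caches, and stops as soon as the page turns into something the handler can deal with. The stub sketch below mirrors only that escalation order; none of the helpers are real kernel calls.

/* Sketch of shake_page()'s escalation order (stubs, not kernel calls). */
#include <stdbool.h>
#include <stdio.h>

static bool on_lru, is_free;                            /* pretend page state */

static void drain_lru_pagevecs(void) { on_lru = true; } /* cheapest remedy */
static void drain_percpu_pages(void) { }                /* may free the page */
static int  shrink_caches(void)      { return 0; }      /* heaviest remedy */

static void shake(int access)
{
    drain_lru_pagevecs();
    if (on_lru)
        return;                  /* page is on the LRU: handleable */
    drain_percpu_pages();
    if (on_lru || is_free)
        return;                  /* freed meanwhile: also handleable */
    if (access) {                /* heavier work only when it is worth it */
        while (shrink_caches() > 10)
            ;                    /* keep shrinking while progress is made */
    }
}

int main(void)
{
    shake(1);
    printf("on LRU after shaking: %d\n", on_lru);
    return 0;
}
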
| 242 | /* | ||
| 86 | * Kill all processes that have a poisoned page mapped and then isolate | 243 | * Kill all processes that have a poisoned page mapped and then isolate |
| 87 | * the page. | 244 | * the page. |
| 88 | * | 245 | * |
| @@ -174,10 +331,9 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | |||
| 174 | list_for_each_entry_safe (tk, next, to_kill, nd) { | 331 | list_for_each_entry_safe (tk, next, to_kill, nd) { |
| 175 | if (doit) { | 332 | if (doit) { |
| 176 | /* | 333 | /* |
| 177 | * In case something went wrong with munmaping | 334 | * In case something went wrong with munmapping |
| 178 | * make sure the process doesn't catch the | 335 | * make sure the process doesn't catch the |
| 179 | * signal and then access the memory. Just kill it. | 336 | * signal and then access the memory. Just kill it. |
| 180 | * the signal handlers | ||
| 181 | */ | 337 | */ |
| 182 | if (fail || tk->addr_valid == 0) { | 338 | if (fail || tk->addr_valid == 0) { |
| 183 | printk(KERN_ERR | 339 | printk(KERN_ERR |
| @@ -227,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
| 227 | if (av == NULL) /* Not actually mapped anymore */ | 383 | if (av == NULL) /* Not actually mapped anymore */ |
| 228 | goto out; | 384 | goto out; |
| 229 | for_each_process (tsk) { | 385 | for_each_process (tsk) { |
| 386 | struct anon_vma_chain *vmac; | ||
| 387 | |||
| 230 | if (!task_early_kill(tsk)) | 388 | if (!task_early_kill(tsk)) |
| 231 | continue; | 389 | continue; |
| 232 | list_for_each_entry (vma, &av->head, anon_vma_node) { | 390 | list_for_each_entry(vmac, &av->head, same_anon_vma) { |
| 391 | vma = vmac->vma; | ||
| 233 | if (!page_mapped_in_vma(page, vma)) | 392 | if (!page_mapped_in_vma(page, vma)) |
| 234 | continue; | 393 | continue; |
| 235 | if (vma->vm_mm == tsk->mm) | 394 | if (vma->vm_mm == tsk->mm) |
| @@ -314,33 +473,49 @@ static void collect_procs(struct page *page, struct list_head *tokill) | |||
| 314 | */ | 473 | */ |
| 315 | 474 | ||
| 316 | enum outcome { | 475 | enum outcome { |
| 317 | FAILED, /* Error handling failed */ | 476 | IGNORED, /* Error: cannot be handled */ |
| 477 | FAILED, /* Error: handling failed */ | ||
| 318 | DELAYED, /* Will be handled later */ | 478 | DELAYED, /* Will be handled later */ |
| 319 | IGNORED, /* Error safely ignored */ | ||
| 320 | RECOVERED, /* Successfully recovered */ | 479 | RECOVERED, /* Successfully recovered */ |
| 321 | }; | 480 | }; |
| 322 | 481 | ||
| 323 | static const char *action_name[] = { | 482 | static const char *action_name[] = { |
| 483 | [IGNORED] = "Ignored", | ||
| 324 | [FAILED] = "Failed", | 484 | [FAILED] = "Failed", |
| 325 | [DELAYED] = "Delayed", | 485 | [DELAYED] = "Delayed", |
| 326 | [IGNORED] = "Ignored", | ||
| 327 | [RECOVERED] = "Recovered", | 486 | [RECOVERED] = "Recovered", |
| 328 | }; | 487 | }; |
| 329 | 488 | ||
| 330 | /* | 489 | /* |
| 331 | * Error hit kernel page. | 490 | * XXX: It is possible that a page is isolated from LRU cache, |
| 332 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | 491 | * and then kept in swap cache or failed to remove from page cache. |
| 333 | * could be more sophisticated. | 492 | * The page count will stop it from being freed by unpoison. |
| 493 | * Stress tests should be aware of this memory leak problem. | ||
| 334 | */ | 494 | */ |
| 335 | static int me_kernel(struct page *p, unsigned long pfn) | 495 | static int delete_from_lru_cache(struct page *p) |
| 336 | { | 496 | { |
| 337 | return DELAYED; | 497 | if (!isolate_lru_page(p)) { |
| 498 | /* | ||
| 499 | * Clear sensitive page flags, so that the buddy system won't | ||
| 500 | * complain when the page is unpoison-and-freed. | ||
| 501 | */ | ||
| 502 | ClearPageActive(p); | ||
| 503 | ClearPageUnevictable(p); | ||
| 504 | /* | ||
| 505 | * drop the page count elevated by isolate_lru_page() | ||
| 506 | */ | ||
| 507 | page_cache_release(p); | ||
| 508 | return 0; | ||
| 509 | } | ||
| 510 | return -EIO; | ||
| 338 | } | 511 | } |
| 339 | 512 | ||
| 340 | /* | 513 | /* |
| 341 | * Already poisoned page. | 514 | * Error hit kernel page. |
| 515 | * Do nothing, try to be lucky and not touch this instead. For a few cases we | ||
| 516 | * could be more sophisticated. | ||
| 342 | */ | 517 | */ |
| 343 | static int me_ignore(struct page *p, unsigned long pfn) | 518 | static int me_kernel(struct page *p, unsigned long pfn) |
| 344 | { | 519 | { |
| 345 | return IGNORED; | 520 | return IGNORED; |
| 346 | } | 521 | } |
| @@ -355,14 +530,6 @@ static int me_unknown(struct page *p, unsigned long pfn) | |||
| 355 | } | 530 | } |
| 356 | 531 | ||
| 357 | /* | 532 | /* |
| 358 | * Free memory | ||
| 359 | */ | ||
| 360 | static int me_free(struct page *p, unsigned long pfn) | ||
| 361 | { | ||
| 362 | return DELAYED; | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * Clean (or cleaned) page cache page. | 533 | * Clean (or cleaned) page cache page. |
| 367 | */ | 534 | */ |
| 368 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | 535 | static int me_pagecache_clean(struct page *p, unsigned long pfn) |
| @@ -371,6 +538,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) | |||
| 371 | int ret = FAILED; | 538 | int ret = FAILED; |
| 372 | struct address_space *mapping; | 539 | struct address_space *mapping; |
| 373 | 540 | ||
| 541 | delete_from_lru_cache(p); | ||
| 542 | |||
| 374 | /* | 543 | /* |
| 375 | * For anonymous pages we're done the only reference left | 544 | * For anonymous pages we're done the only reference left |
| 376 | * should be the one m_f() holds. | 545 | * should be the one m_f() holds. |
| @@ -500,14 +669,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn) | |||
| 500 | /* Trigger EIO in shmem: */ | 669 | /* Trigger EIO in shmem: */ |
| 501 | ClearPageUptodate(p); | 670 | ClearPageUptodate(p); |
| 502 | 671 | ||
| 503 | return DELAYED; | 672 | if (!delete_from_lru_cache(p)) |
| 673 | return DELAYED; | ||
| 674 | else | ||
| 675 | return FAILED; | ||
| 504 | } | 676 | } |
| 505 | 677 | ||
| 506 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | 678 | static int me_swapcache_clean(struct page *p, unsigned long pfn) |
| 507 | { | 679 | { |
| 508 | delete_from_swap_cache(p); | 680 | delete_from_swap_cache(p); |
| 509 | 681 | ||
| 510 | return RECOVERED; | 682 | if (!delete_from_lru_cache(p)) |
| 683 | return RECOVERED; | ||
| 684 | else | ||
| 685 | return FAILED; | ||
| 511 | } | 686 | } |
| 512 | 687 | ||
| 513 | /* | 688 | /* |
| @@ -550,7 +725,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
| 550 | #define tail (1UL << PG_tail) | 725 | #define tail (1UL << PG_tail) |
| 551 | #define compound (1UL << PG_compound) | 726 | #define compound (1UL << PG_compound) |
| 552 | #define slab (1UL << PG_slab) | 727 | #define slab (1UL << PG_slab) |
| 553 | #define buddy (1UL << PG_buddy) | ||
| 554 | #define reserved (1UL << PG_reserved) | 728 | #define reserved (1UL << PG_reserved) |
| 555 | 729 | ||
| 556 | static struct page_state { | 730 | static struct page_state { |
| @@ -559,8 +733,11 @@ static struct page_state { | |||
| 559 | char *msg; | 733 | char *msg; |
| 560 | int (*action)(struct page *p, unsigned long pfn); | 734 | int (*action)(struct page *p, unsigned long pfn); |
| 561 | } error_states[] = { | 735 | } error_states[] = { |
| 562 | { reserved, reserved, "reserved kernel", me_ignore }, | 736 | { reserved, reserved, "reserved kernel", me_kernel }, |
| 563 | { buddy, buddy, "free kernel", me_free }, | 737 | /* |
| 738 | * free pages are specially detected outside this table: | ||
| 739 | * PG_buddy pages only make a small fraction of all free pages. | ||
| 740 | */ | ||
| 564 | 741 | ||
| 565 | /* | 742 | /* |
| 566 | * Could in theory check if slab page is free or if we can drop | 743 | * Could in theory check if slab page is free or if we can drop |
| @@ -582,14 +759,11 @@ static struct page_state { | |||
| 582 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | 759 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, |
| 583 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | 760 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, |
| 584 | 761 | ||
| 585 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
| 586 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | 762 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, |
| 587 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | 763 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, |
| 588 | #endif | ||
| 589 | 764 | ||
| 590 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | 765 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, |
| 591 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 766 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, |
| 592 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
| 593 | 767 | ||
| 594 | /* | 768 | /* |
| 595 | * Catchall entry: must be at end. | 769 | * Catchall entry: must be at end. |
| @@ -597,20 +771,31 @@ static struct page_state { | |||
| 597 | { 0, 0, "unknown page state", me_unknown }, | 771 | { 0, 0, "unknown page state", me_unknown }, |
| 598 | }; | 772 | }; |
| 599 | 773 | ||
| 774 | #undef dirty | ||
| 775 | #undef sc | ||
| 776 | #undef unevict | ||
| 777 | #undef mlock | ||
| 778 | #undef writeback | ||
| 779 | #undef lru | ||
| 780 | #undef swapbacked | ||
| 781 | #undef head | ||
| 782 | #undef tail | ||
| 783 | #undef compound | ||
| 784 | #undef slab | ||
| 785 | #undef reserved | ||
| 786 | |||
| 600 | static void action_result(unsigned long pfn, char *msg, int result) | 787 | static void action_result(unsigned long pfn, char *msg, int result) |
| 601 | { | 788 | { |
| 602 | struct page *page = NULL; | 789 | struct page *page = pfn_to_page(pfn); |
| 603 | if (pfn_valid(pfn)) | ||
| 604 | page = pfn_to_page(pfn); | ||
| 605 | 790 | ||
| 606 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | 791 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", |
| 607 | pfn, | 792 | pfn, |
| 608 | page && PageDirty(page) ? "dirty " : "", | 793 | PageDirty(page) ? "dirty " : "", |
| 609 | msg, action_name[result]); | 794 | msg, action_name[result]); |
| 610 | } | 795 | } |
| 611 | 796 | ||
| 612 | static int page_action(struct page_state *ps, struct page *p, | 797 | static int page_action(struct page_state *ps, struct page *p, |
| 613 | unsigned long pfn, int ref) | 798 | unsigned long pfn) |
| 614 | { | 799 | { |
| 615 | int result; | 800 | int result; |
| 616 | int count; | 801 | int count; |
| @@ -618,18 +803,22 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 618 | result = ps->action(p, pfn); | 803 | result = ps->action(p, pfn); |
| 619 | action_result(pfn, ps->msg, result); | 804 | action_result(pfn, ps->msg, result); |
| 620 | 805 | ||
| 621 | count = page_count(p) - 1 - ref; | 806 | count = page_count(p) - 1; |
| 622 | if (count != 0) | 807 | if (ps->action == me_swapcache_dirty && result == DELAYED) |
| 808 | count--; | ||
| 809 | if (count != 0) { | ||
| 623 | printk(KERN_ERR | 810 | printk(KERN_ERR |
| 624 | "MCE %#lx: %s page still referenced by %d users\n", | 811 | "MCE %#lx: %s page still referenced by %d users\n", |
| 625 | pfn, ps->msg, count); | 812 | pfn, ps->msg, count); |
| 813 | result = FAILED; | ||
| 814 | } | ||
| 626 | 815 | ||
| 627 | /* Could do more checks here if page looks ok */ | 816 | /* Could do more checks here if page looks ok */ |
| 628 | /* | 817 | /* |
| 629 | * Could adjust zone counters here to correct for the missing page. | 818 | * Could adjust zone counters here to correct for the missing page. |
| 630 | */ | 819 | */ |
| 631 | 820 | ||
| 632 | return result == RECOVERED ? 0 : -EBUSY; | 821 | return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; |
| 633 | } | 822 | } |
| 634 | 823 | ||
| 635 | #define N_UNMAP_TRIES 5 | 824 | #define N_UNMAP_TRIES 5 |
| @@ -638,7 +827,7 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 638 | * Do all that is necessary to remove user space mappings. Unmap | 827 | * Do all that is necessary to remove user space mappings. Unmap |
| 639 | * the pages and send SIGBUS to the processes if the data was dirty. | 828 | * the pages and send SIGBUS to the processes if the data was dirty. |
| 640 | */ | 829 | */ |
| 641 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | 830 | static int hwpoison_user_mappings(struct page *p, unsigned long pfn, |
| 642 | int trapno) | 831 | int trapno) |
| 643 | { | 832 | { |
| 644 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | 833 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; |
| @@ -648,15 +837,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 648 | int i; | 837 | int i; |
| 649 | int kill = 1; | 838 | int kill = 1; |
| 650 | 839 | ||
| 651 | if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) | 840 | if (PageReserved(p) || PageSlab(p)) |
| 652 | return; | 841 | return SWAP_SUCCESS; |
| 653 | 842 | ||
| 654 | /* | 843 | /* |
| 655 | * This check implies we don't kill processes if their pages | 844 | * This check implies we don't kill processes if their pages |
| 656 | * are in the swap cache early. Those are always late kills. | 845 | * are in the swap cache early. Those are always late kills. |
| 657 | */ | 846 | */ |
| 658 | if (!page_mapped(p)) | 847 | if (!page_mapped(p)) |
| 659 | return; | 848 | return SWAP_SUCCESS; |
| 849 | |||
| 850 | if (PageCompound(p) || PageKsm(p)) | ||
| 851 | return SWAP_FAIL; | ||
| 660 | 852 | ||
| 661 | if (PageSwapCache(p)) { | 853 | if (PageSwapCache(p)) { |
| 662 | printk(KERN_ERR | 854 | printk(KERN_ERR |
| @@ -667,6 +859,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 667 | /* | 859 | /* |
| 668 | * Propagate the dirty bit from PTEs to struct page first, because we | 860 | * Propagate the dirty bit from PTEs to struct page first, because we |
| 669 | * need this to decide if we should kill or just drop the page. | 861 | * need this to decide if we should kill or just drop the page. |
| 862 | * XXX: the dirty test could be racy: set_page_dirty() may not always | ||
| 863 | * be called inside page lock (it's recommended but not enforced). | ||
| 670 | */ | 864 | */ |
| 671 | mapping = page_mapping(p); | 865 | mapping = page_mapping(p); |
| 672 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | 866 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { |
| @@ -718,11 +912,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
| 718 | */ | 912 | */ |
| 719 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | 913 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, |
| 720 | ret != SWAP_SUCCESS, pfn); | 914 | ret != SWAP_SUCCESS, pfn); |
| 915 | |||
| 916 | return ret; | ||
| 721 | } | 917 | } |
| 722 | 918 | ||
| 723 | int __memory_failure(unsigned long pfn, int trapno, int ref) | 919 | int __memory_failure(unsigned long pfn, int trapno, int flags) |
| 724 | { | 920 | { |
| 725 | unsigned long lru_flag; | ||
| 726 | struct page_state *ps; | 921 | struct page_state *ps; |
| 727 | struct page *p; | 922 | struct page *p; |
| 728 | int res; | 923 | int res; |
| @@ -731,13 +926,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 731 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | 926 | panic("Memory failure from trap %d on page %lx", trapno, pfn); |
| 732 | 927 | ||
| 733 | if (!pfn_valid(pfn)) { | 928 | if (!pfn_valid(pfn)) { |
| 734 | action_result(pfn, "memory outside kernel control", IGNORED); | 929 | printk(KERN_ERR |
| 735 | return -EIO; | 930 | "MCE %#lx: memory outside kernel control\n", |
| 931 | pfn); | ||
| 932 | return -ENXIO; | ||
| 736 | } | 933 | } |
| 737 | 934 | ||
| 738 | p = pfn_to_page(pfn); | 935 | p = pfn_to_page(pfn); |
| 739 | if (TestSetPageHWPoison(p)) { | 936 | if (TestSetPageHWPoison(p)) { |
| 740 | action_result(pfn, "already hardware poisoned", IGNORED); | 937 | printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); |
| 741 | return 0; | 938 | return 0; |
| 742 | } | 939 | } |
| 743 | 940 | ||
| @@ -754,9 +951,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 754 | * In fact it's dangerous to directly bump up page count from 0, | 951 | * In fact it's dangerous to directly bump up page count from 0, |
| 755 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | 952 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. |
| 756 | */ | 953 | */ |
| 757 | if (!get_page_unless_zero(compound_head(p))) { | 954 | if (!(flags & MF_COUNT_INCREASED) && |
| 758 | action_result(pfn, "free or high order kernel", IGNORED); | 955 | !get_page_unless_zero(compound_head(p))) { |
| 759 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | 956 | if (is_free_buddy_page(p)) { |
| 957 | action_result(pfn, "free buddy", DELAYED); | ||
| 958 | return 0; | ||
| 959 | } else { | ||
| 960 | action_result(pfn, "high order kernel", IGNORED); | ||
| 961 | return -EBUSY; | ||
| 962 | } | ||
| 760 | } | 963 | } |
| 761 | 964 | ||
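The comment in this hunk is the key constraint: a page whose refcount has already dropped to zero must not be resurrected, which is why the code takes its reference with get_page_unless_zero() (a wrapper around atomic_inc_not_zero() on page->_count) rather than a plain get_page(). The user-space sketch below illustrates only that "increment unless already zero" pattern; the names and the CAS loop are illustrative, not the kernel implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct object {
	atomic_int refcount;		/* 0 means the object is being freed */
};

/* Take a reference only if someone else still holds one. */
static bool get_unless_zero(struct object *obj)
{
	int old = atomic_load(&obj->refcount);

	while (old != 0) {
		/* CAS retries with the refreshed value if another thread raced us. */
		if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
			return true;
	}
	/* Count already hit zero: the object may be freed, leave it alone. */
	return false;
}

int main(void)
{
	struct object live = { .refcount = 2 };
	struct object dying = { .refcount = 0 };

	printf("live:  %s\n", get_unless_zero(&live) ? "got ref" : "refused");
	printf("dying: %s\n", get_unless_zero(&dying) ? "got ref" : "refused");
	return 0;
}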
| 762 | /* | 965 | /* |
| @@ -768,14 +971,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 768 | * walked by the page reclaim code, however that's not a big loss. | 971 | * walked by the page reclaim code, however that's not a big loss. |
| 769 | */ | 972 | */ |
| 770 | if (!PageLRU(p)) | 973 | if (!PageLRU(p)) |
| 771 | lru_add_drain_all(); | 974 | shake_page(p, 0); |
| 772 | lru_flag = p->flags & lru; | 975 | if (!PageLRU(p)) { |
| 773 | if (isolate_lru_page(p)) { | 976 | /* |
| 977 | * shake_page could have turned it free. | ||
| 978 | */ | ||
| 979 | if (is_free_buddy_page(p)) { | ||
| 980 | action_result(pfn, "free buddy, 2nd try", DELAYED); | ||
| 981 | return 0; | ||
| 982 | } | ||
| 774 | action_result(pfn, "non LRU", IGNORED); | 983 | action_result(pfn, "non LRU", IGNORED); |
| 775 | put_page(p); | 984 | put_page(p); |
| 776 | return -EBUSY; | 985 | return -EBUSY; |
| 777 | } | 986 | } |
| 778 | page_cache_release(p); | ||
| 779 | 987 | ||
| 780 | /* | 988 | /* |
| 781 | * Lock the page and wait for writeback to finish. | 989 | * Lock the page and wait for writeback to finish. |
| @@ -783,26 +991,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref) | |||
| 783 | * and in many cases impossible, so we just avoid it here. | 991 | * and in many cases impossible, so we just avoid it here. |
| 784 | */ | 992 | */ |
| 785 | lock_page_nosync(p); | 993 | lock_page_nosync(p); |
| 994 | |||
| 995 | /* | ||
| 996 | * unpoison always clear PG_hwpoison inside page lock | ||
| 997 | */ | ||
| 998 | if (!PageHWPoison(p)) { | ||
| 999 | printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); | ||
| 1000 | res = 0; | ||
| 1001 | goto out; | ||
| 1002 | } | ||
| 1003 | if (hwpoison_filter(p)) { | ||
| 1004 | if (TestClearPageHWPoison(p)) | ||
| 1005 | atomic_long_dec(&mce_bad_pages); | ||
| 1006 | unlock_page(p); | ||
| 1007 | put_page(p); | ||
| 1008 | return 0; | ||
| 1009 | } | ||
| 1010 | |||
| 786 | wait_on_page_writeback(p); | 1011 | wait_on_page_writeback(p); |
| 787 | 1012 | ||
| 788 | /* | 1013 | /* |
| 789 | * Now take care of user space mappings. | 1014 | * Now take care of user space mappings. |
| 1015 | * Abort on failure: __remove_from_page_cache() assumes an unmapped page. | ||
| 790 | */ | 1016 | */ |
| 791 | hwpoison_user_mappings(p, pfn, trapno); | 1017 | if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { |
| 1018 | printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); | ||
| 1019 | res = -EBUSY; | ||
| 1020 | goto out; | ||
| 1021 | } | ||
| 792 | 1022 | ||
| 793 | /* | 1023 | /* |
| 794 | * Torn down by someone else? | 1024 | * Torn down by someone else? |
| 795 | */ | 1025 | */ |
| 796 | if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { | 1026 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
| 797 | action_result(pfn, "already truncated LRU", IGNORED); | 1027 | action_result(pfn, "already truncated LRU", IGNORED); |
| 798 | res = 0; | 1028 | res = -EBUSY; |
| 799 | goto out; | 1029 | goto out; |
| 800 | } | 1030 | } |
| 801 | 1031 | ||
| 802 | res = -EBUSY; | 1032 | res = -EBUSY; |
| 803 | for (ps = error_states;; ps++) { | 1033 | for (ps = error_states;; ps++) { |
| 804 | if (((p->flags | lru_flag)& ps->mask) == ps->res) { | 1034 | if ((p->flags & ps->mask) == ps->res) { |
| 805 | res = page_action(ps, p, pfn, ref); | 1035 | res = page_action(ps, p, pfn); |
| 806 | break; | 1036 | break; |
| 807 | } | 1037 | } |
| 808 | } | 1038 | } |
| @@ -833,3 +1063,235 @@ void memory_failure(unsigned long pfn, int trapno) | |||
| 833 | { | 1063 | { |
| 834 | __memory_failure(pfn, trapno, 0); | 1064 | __memory_failure(pfn, trapno, 0); |
| 835 | } | 1065 | } |
| 1066 | |||
| 1067 | /** | ||
| 1068 | * unpoison_memory - Unpoison a previously poisoned page | ||
| 1069 | * @pfn: Page number of the to be unpoisoned page | ||
| 1070 | * | ||
| 1071 | * Software-unpoison a page that has been poisoned by | ||
| 1072 | * memory_failure() earlier. | ||
| 1073 | * | ||
| 1074 | * This is only done at the software level, so it only works | ||
| 1075 | * for Linux-injected failures, not real hardware failures. | ||
| 1076 | * | ||
| 1077 | * Returns 0 for success, otherwise -errno. | ||
| 1078 | */ | ||
| 1079 | int unpoison_memory(unsigned long pfn) | ||
| 1080 | { | ||
| 1081 | struct page *page; | ||
| 1082 | struct page *p; | ||
| 1083 | int freeit = 0; | ||
| 1084 | |||
| 1085 | if (!pfn_valid(pfn)) | ||
| 1086 | return -ENXIO; | ||
| 1087 | |||
| 1088 | p = pfn_to_page(pfn); | ||
| 1089 | page = compound_head(p); | ||
| 1090 | |||
| 1091 | if (!PageHWPoison(p)) { | ||
| 1092 | pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); | ||
| 1093 | return 0; | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | if (!get_page_unless_zero(page)) { | ||
| 1097 | if (TestClearPageHWPoison(p)) | ||
| 1098 | atomic_long_dec(&mce_bad_pages); | ||
| 1099 | pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); | ||
| 1100 | return 0; | ||
| 1101 | } | ||
| 1102 | |||
| 1103 | lock_page_nosync(page); | ||
| 1104 | /* | ||
| 1105 | * This test is racy because PG_hwpoison is set outside of the page lock. | ||
| 1106 | * That's acceptable because it won't trigger a kernel panic. Instead, | ||
| 1107 | * the PG_hwpoison page will be caught and isolated on entry to | ||
| 1108 | * the free buddy page pool. | ||
| 1109 | */ | ||
| 1110 | if (TestClearPageHWPoison(p)) { | ||
| 1111 | pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); | ||
| 1112 | atomic_long_dec(&mce_bad_pages); | ||
| 1113 | freeit = 1; | ||
| 1114 | } | ||
| 1115 | unlock_page(page); | ||
| 1116 | |||
| 1117 | put_page(page); | ||
| 1118 | if (freeit) | ||
| 1119 | put_page(page); | ||
| 1120 | |||
| 1121 | return 0; | ||
| 1122 | } | ||
| 1123 | EXPORT_SYMBOL(unpoison_memory); | ||
| 1124 | |||
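unpoison_memory() is only meant for software-injected poison, and it is normally exercised through the hwpoison injector (mm/hwpoison-inject.c in this same series) rather than called directly. The sketch below drives it from user space by writing a pfn to the injector's debugfs files; the paths assume debugfs mounted at /sys/kernel/debug and the hwpoison_inject module loaded, so treat the file names and paths as an example, not a stable ABI.

#include <stdio.h>
#include <stdlib.h>

static int write_pfn(const char *path, unsigned long pfn)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%lu\n", pfn);
	return fclose(f);
}

int main(int argc, char **argv)
{
	unsigned long pfn;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	pfn = strtoul(argv[1], NULL, 0);

	/* Inject a software poison, then immediately take it back. */
	if (write_pfn("/sys/kernel/debug/hwpoison/corrupt-pfn", pfn))
		return 1;
	return write_pfn("/sys/kernel/debug/hwpoison/unpoison-pfn", pfn) ? 1 : 0;
}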
| 1125 | static struct page *new_page(struct page *p, unsigned long private, int **x) | ||
| 1126 | { | ||
| 1127 | int nid = page_to_nid(p); | ||
| 1128 | return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | /* | ||
| 1132 | * Safely get reference count of an arbitrary page. | ||
| 1133 | * Returns 0 for a free page, -EIO for a zero refcount page | ||
| 1134 | * that is not free, and 1 for any other page type. | ||
| 1135 | * For 1 the page is returned with increased page count, otherwise not. | ||
| 1136 | */ | ||
| 1137 | static int get_any_page(struct page *p, unsigned long pfn, int flags) | ||
| 1138 | { | ||
| 1139 | int ret; | ||
| 1140 | |||
| 1141 | if (flags & MF_COUNT_INCREASED) | ||
| 1142 | return 1; | ||
| 1143 | |||
| 1144 | /* | ||
| 1145 | * The lock_system_sleep prevents a race with memory hotplug, | ||
| 1146 | * because the isolation assumes there's only a single user. | ||
| 1147 | * This is a big hammer; a better solution would be nicer. | ||
| 1148 | */ | ||
| 1149 | lock_system_sleep(); | ||
| 1150 | |||
| 1151 | /* | ||
| 1152 | * Isolate the page, so that it doesn't get reallocated if it | ||
| 1153 | * was free. | ||
| 1154 | */ | ||
| 1155 | set_migratetype_isolate(p); | ||
| 1156 | if (!get_page_unless_zero(compound_head(p))) { | ||
| 1157 | if (is_free_buddy_page(p)) { | ||
| 1158 | pr_debug("get_any_page: %#lx free buddy page\n", pfn); | ||
| 1159 | /* Set hwpoison bit while page is still isolated */ | ||
| 1160 | SetPageHWPoison(p); | ||
| 1161 | ret = 0; | ||
| 1162 | } else { | ||
| 1163 | pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", | ||
| 1164 | pfn, p->flags); | ||
| 1165 | ret = -EIO; | ||
| 1166 | } | ||
| 1167 | } else { | ||
| 1168 | /* Not a free page */ | ||
| 1169 | ret = 1; | ||
| 1170 | } | ||
| 1171 | unset_migratetype_isolate(p); | ||
| 1172 | unlock_system_sleep(); | ||
| 1173 | return ret; | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | /** | ||
| 1177 | * soft_offline_page - Soft offline a page. | ||
| 1178 | * @page: page to offline | ||
| 1179 | * @flags: flags. Same as memory_failure(). | ||
| 1180 | * | ||
| 1181 | * Returns 0 on success, otherwise negated errno. | ||
| 1182 | * | ||
| 1183 | * Soft offline a page, by migration or invalidation, | ||
| 1184 | * without killing anything. This is for the case when | ||
| 1185 | * a page is not corrupted yet (so it's still valid to access), | ||
| 1186 | * but has had a number of corrected errors and is better taken | ||
| 1187 | * out. | ||
| 1188 | * | ||
| 1189 | * The actual policy on when to do that is maintained by | ||
| 1190 | * user space. | ||
| 1191 | * | ||
| 1192 | * This should never impact any application or cause data loss, | ||
| 1193 | * however it might take some time. | ||
| 1194 | * | ||
| 1195 | * This is not a 100% solution for all memory, but tries to be | ||
| 1196 | * ``good enough'' for the majority of memory. | ||
| 1197 | */ | ||
| 1198 | int soft_offline_page(struct page *page, int flags) | ||
| 1199 | { | ||
| 1200 | int ret; | ||
| 1201 | unsigned long pfn = page_to_pfn(page); | ||
| 1202 | |||
| 1203 | ret = get_any_page(page, pfn, flags); | ||
| 1204 | if (ret < 0) | ||
| 1205 | return ret; | ||
| 1206 | if (ret == 0) | ||
| 1207 | goto done; | ||
| 1208 | |||
| 1209 | /* | ||
| 1210 | * Page cache page we can handle? | ||
| 1211 | */ | ||
| 1212 | if (!PageLRU(page)) { | ||
| 1213 | /* | ||
| 1214 | * Try to free it. | ||
| 1215 | */ | ||
| 1216 | put_page(page); | ||
| 1217 | shake_page(page, 1); | ||
| 1218 | |||
| 1219 | /* | ||
| 1220 | * Did it turn free? | ||
| 1221 | */ | ||
| 1222 | ret = get_any_page(page, pfn, 0); | ||
| 1223 | if (ret < 0) | ||
| 1224 | return ret; | ||
| 1225 | if (ret == 0) | ||
| 1226 | goto done; | ||
| 1227 | } | ||
| 1228 | if (!PageLRU(page)) { | ||
| 1229 | pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", | ||
| 1230 | pfn, page->flags); | ||
| 1231 | return -EIO; | ||
| 1232 | } | ||
| 1233 | |||
| 1234 | lock_page(page); | ||
| 1235 | wait_on_page_writeback(page); | ||
| 1236 | |||
| 1237 | /* | ||
| 1238 | * Synchronized using the page lock with memory_failure() | ||
| 1239 | */ | ||
| 1240 | if (PageHWPoison(page)) { | ||
| 1241 | unlock_page(page); | ||
| 1242 | put_page(page); | ||
| 1243 | pr_debug("soft offline: %#lx page already poisoned\n", pfn); | ||
| 1244 | return -EBUSY; | ||
| 1245 | } | ||
| 1246 | |||
| 1247 | /* | ||
| 1248 | * Try to invalidate first. This should work for | ||
| 1249 | * non dirty unmapped page cache pages. | ||
| 1250 | */ | ||
| 1251 | ret = invalidate_inode_page(page); | ||
| 1252 | unlock_page(page); | ||
| 1253 | |||
| 1254 | /* | ||
| 1255 | * Drop count because page migration doesn't like raised | ||
| 1256 | * counts. The page could get re-allocated, but if it becomes | ||
| 1257 | * LRU the isolation will just fail. | ||
| 1258 | * RED-PEN would be better to keep it isolated here, but we | ||
| 1259 | * would need to fix isolation locking first. | ||
| 1260 | */ | ||
| 1261 | put_page(page); | ||
| 1262 | if (ret == 1) { | ||
| 1263 | ret = 0; | ||
| 1264 | pr_debug("soft_offline: %#lx: invalidated\n", pfn); | ||
| 1265 | goto done; | ||
| 1266 | } | ||
| 1267 | |||
| 1268 | /* | ||
| 1269 | * Simple invalidation didn't work. | ||
| 1270 | * Try to migrate to a new page instead. migrate.c | ||
| 1271 | * handles a large number of cases for us. | ||
| 1272 | */ | ||
| 1273 | ret = isolate_lru_page(page); | ||
| 1274 | if (!ret) { | ||
| 1275 | LIST_HEAD(pagelist); | ||
| 1276 | |||
| 1277 | list_add(&page->lru, &pagelist); | ||
| 1278 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); | ||
| 1279 | if (ret) { | ||
| 1280 | pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", | ||
| 1281 | pfn, ret, page->flags); | ||
| 1282 | if (ret > 0) | ||
| 1283 | ret = -EIO; | ||
| 1284 | } | ||
| 1285 | } else { | ||
| 1286 | pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", | ||
| 1287 | pfn, ret, page_count(page), page->flags); | ||
| 1288 | } | ||
| 1289 | if (ret) | ||
| 1290 | return ret; | ||
| 1291 | |||
| 1292 | done: | ||
| 1293 | atomic_long_add(1, &mce_bad_pages); | ||
| 1294 | SetPageHWPoison(page); | ||
| 1295 | /* keep elevated page count for bad page */ | ||
| 1296 | return ret; | ||
| 1297 | } | ||
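soft_offline_page() is reachable from user space through the madvise(MADV_SOFT_OFFLINE) injector that accompanies this work (it requires CONFIG_MEMORY_FAILURE and CAP_SYS_ADMIN). A rough test sketch follows; the fallback constant is the asm-generic value and may differ elsewhere, so it is an assumption rather than a guarantee.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* asm-generic/mman-common.h value */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf;

	/* Back the region with a real page so there is something to offline. */
	buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(buf, 0xaa, page);

	/* Ask the kernel to migrate or invalidate the page, then mark it bad. */
	if (madvise(buf, page, MADV_SOFT_OFFLINE) != 0)
		perror("madvise(MADV_SOFT_OFFLINE)");
	else
		printf("soft-offlined; data survived the migration: %#x\n",
		       (unsigned char)buf[0]);

	munmap(buf, page);
	return 0;
}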
diff --git a/mm/memory.c b/mm/memory.c index 6ab19dd4a199..d1153e37e9ba 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -121,6 +121,80 @@ static int __init init_zero_pfn(void) | |||
| 121 | } | 121 | } |
| 122 | core_initcall(init_zero_pfn); | 122 | core_initcall(init_zero_pfn); |
| 123 | 123 | ||
| 124 | |||
| 125 | #if defined(SPLIT_RSS_COUNTING) | ||
| 126 | |||
| 127 | void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | ||
| 128 | { | ||
| 129 | int i; | ||
| 130 | |||
| 131 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
| 132 | if (task->rss_stat.count[i]) { | ||
| 133 | add_mm_counter(mm, i, task->rss_stat.count[i]); | ||
| 134 | task->rss_stat.count[i] = 0; | ||
| 135 | } | ||
| 136 | } | ||
| 137 | task->rss_stat.events = 0; | ||
| 138 | } | ||
| 139 | |||
| 140 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | ||
| 141 | { | ||
| 142 | struct task_struct *task = current; | ||
| 143 | |||
| 144 | if (likely(task->mm == mm)) | ||
| 145 | task->rss_stat.count[member] += val; | ||
| 146 | else | ||
| 147 | add_mm_counter(mm, member, val); | ||
| 148 | } | ||
| 149 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) | ||
| 150 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) | ||
| 151 | |||
| 152 | /* sync counter once per 64 page faults */ | ||
| 153 | #define TASK_RSS_EVENTS_THRESH (64) | ||
| 154 | static void check_sync_rss_stat(struct task_struct *task) | ||
| 155 | { | ||
| 156 | if (unlikely(task != current)) | ||
| 157 | return; | ||
| 158 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | ||
| 159 | __sync_task_rss_stat(task, task->mm); | ||
| 160 | } | ||
| 161 | |||
| 162 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
| 163 | { | ||
| 164 | long val = 0; | ||
| 165 | |||
| 166 | /* | ||
| 167 | * Don't use task->mm here, to avoid having to use get_task_mm(). | ||
| 168 | * The caller must guarantee that task->mm is valid. | ||
| 169 | */ | ||
| 170 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
| 171 | /* | ||
| 172 | * The counter is updated asynchronously and may temporarily go negative, | ||
| 173 | * but a negative value is never what callers expect, so clamp it to zero. | ||
| 174 | */ | ||
| 175 | if (val < 0) | ||
| 176 | return 0; | ||
| 177 | return (unsigned long)val; | ||
| 178 | } | ||
| 179 | |||
| 180 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
| 181 | { | ||
| 182 | __sync_task_rss_stat(task, mm); | ||
| 183 | } | ||
| 184 | #else | ||
| 185 | |||
| 186 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | ||
| 187 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | ||
| 188 | |||
| 189 | static void check_sync_rss_stat(struct task_struct *task) | ||
| 190 | { | ||
| 191 | } | ||
| 192 | |||
| 193 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
| 194 | { | ||
| 195 | } | ||
| 196 | #endif | ||
| 197 | |||
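The SPLIT_RSS_COUNTING code above batches RSS deltas per task and folds them into the mm-wide atomic counters only once per TASK_RSS_EVENTS_THRESH faults (or at explicit sync points), which is also why get_mm_counter() has to clamp transiently negative readings. The user-space sketch below shows the same batching idea with made-up names; it illustrates the pattern, it is not the kernel code.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define THRESH 64			/* mirrors TASK_RSS_EVENTS_THRESH */

static atomic_long shared_rss;		/* the "mm-wide" counter */
static __thread long local_rss;		/* per-task batched delta */
static __thread int local_events;

static void flush_local(void)		/* like __sync_task_rss_stat() */
{
	if (local_rss) {
		atomic_fetch_add(&shared_rss, local_rss);
		local_rss = 0;
	}
	local_events = 0;
}

static void account_page(long delta)	/* like add_mm_counter_fast() */
{
	local_rss += delta;
	if (++local_events > THRESH)	/* sync once per 64 "faults" */
		flush_local();
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 1000; i++)
		account_page(+1);
	flush_local();			/* like sync_mm_rss() on exit */
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);

	/* Readers can see stale (even transiently negative) values while
	 * deltas are still batched; the kernel clamps such reads to zero. */
	printf("rss = %ld pages\n", atomic_load(&shared_rss));
	return 0;
}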
| 124 | /* | 198 | /* |
| 125 | * If a p?d_bad entry is found while walking page tables, report | 199 | * If a p?d_bad entry is found while walking page tables, report |
| 126 | * the error, before resetting entry to p?d_none. Usually (but | 200 | * the error, before resetting entry to p?d_none. Usually (but |
| @@ -300,7 +374,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 300 | * Hide vma from rmap and truncate_pagecache before freeing | 374 | * Hide vma from rmap and truncate_pagecache before freeing |
| 301 | * pgtables | 375 | * pgtables |
| 302 | */ | 376 | */ |
| 303 | anon_vma_unlink(vma); | 377 | unlink_anon_vmas(vma); |
| 304 | unlink_file_vma(vma); | 378 | unlink_file_vma(vma); |
| 305 | 379 | ||
| 306 | if (is_vm_hugetlb_page(vma)) { | 380 | if (is_vm_hugetlb_page(vma)) { |
| @@ -314,7 +388,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 314 | && !is_vm_hugetlb_page(next)) { | 388 | && !is_vm_hugetlb_page(next)) { |
| 315 | vma = next; | 389 | vma = next; |
| 316 | next = vma->vm_next; | 390 | next = vma->vm_next; |
| 317 | anon_vma_unlink(vma); | 391 | unlink_anon_vmas(vma); |
| 318 | unlink_file_vma(vma); | 392 | unlink_file_vma(vma); |
| 319 | } | 393 | } |
| 320 | free_pgd_range(tlb, addr, vma->vm_end, | 394 | free_pgd_range(tlb, addr, vma->vm_end, |
| @@ -376,12 +450,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
| 376 | return 0; | 450 | return 0; |
| 377 | } | 451 | } |
| 378 | 452 | ||
| 379 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | 453 | static inline void init_rss_vec(int *rss) |
| 380 | { | 454 | { |
| 381 | if (file_rss) | 455 | memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); |
| 382 | add_mm_counter(mm, file_rss, file_rss); | 456 | } |
| 383 | if (anon_rss) | 457 | |
| 384 | add_mm_counter(mm, anon_rss, anon_rss); | 458 | static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) |
| 459 | { | ||
| 460 | int i; | ||
| 461 | |||
| 462 | if (current->mm == mm) | ||
| 463 | sync_mm_rss(current, mm); | ||
| 464 | for (i = 0; i < NR_MM_COUNTERS; i++) | ||
| 465 | if (rss[i]) | ||
| 466 | add_mm_counter(mm, i, rss[i]); | ||
| 385 | } | 467 | } |
| 386 | 468 | ||
| 387 | /* | 469 | /* |
| @@ -572,7 +654,7 @@ out: | |||
| 572 | * covered by this vma. | 654 | * covered by this vma. |
| 573 | */ | 655 | */ |
| 574 | 656 | ||
| 575 | static inline void | 657 | static inline unsigned long |
| 576 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 658 | copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
| 577 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, | 659 | pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, |
| 578 | unsigned long addr, int *rss) | 660 | unsigned long addr, int *rss) |
| @@ -586,7 +668,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 586 | if (!pte_file(pte)) { | 668 | if (!pte_file(pte)) { |
| 587 | swp_entry_t entry = pte_to_swp_entry(pte); | 669 | swp_entry_t entry = pte_to_swp_entry(pte); |
| 588 | 670 | ||
| 589 | swap_duplicate(entry); | 671 | if (swap_duplicate(entry) < 0) |
| 672 | return entry.val; | ||
| 673 | |||
| 590 | /* make sure dst_mm is on swapoff's mmlist. */ | 674 | /* make sure dst_mm is on swapoff's mmlist. */ |
| 591 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 675 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
| 592 | spin_lock(&mmlist_lock); | 676 | spin_lock(&mmlist_lock); |
| @@ -595,7 +679,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 595 | &src_mm->mmlist); | 679 | &src_mm->mmlist); |
| 596 | spin_unlock(&mmlist_lock); | 680 | spin_unlock(&mmlist_lock); |
| 597 | } | 681 | } |
| 598 | if (is_write_migration_entry(entry) && | 682 | if (likely(!non_swap_entry(entry))) |
| 683 | rss[MM_SWAPENTS]++; | ||
| 684 | else if (is_write_migration_entry(entry) && | ||
| 599 | is_cow_mapping(vm_flags)) { | 685 | is_cow_mapping(vm_flags)) { |
| 600 | /* | 686 | /* |
| 601 | * COW mappings require pages in both parent | 687 | * COW mappings require pages in both parent |
| @@ -630,11 +716,15 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 630 | if (page) { | 716 | if (page) { |
| 631 | get_page(page); | 717 | get_page(page); |
| 632 | page_dup_rmap(page); | 718 | page_dup_rmap(page); |
| 633 | rss[PageAnon(page)]++; | 719 | if (PageAnon(page)) |
| 720 | rss[MM_ANONPAGES]++; | ||
| 721 | else | ||
| 722 | rss[MM_FILEPAGES]++; | ||
| 634 | } | 723 | } |
| 635 | 724 | ||
| 636 | out_set_pte: | 725 | out_set_pte: |
| 637 | set_pte_at(dst_mm, addr, dst_pte, pte); | 726 | set_pte_at(dst_mm, addr, dst_pte, pte); |
| 727 | return 0; | ||
| 638 | } | 728 | } |
| 639 | 729 | ||
| 640 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 730 | static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
| @@ -645,10 +735,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 645 | pte_t *src_pte, *dst_pte; | 735 | pte_t *src_pte, *dst_pte; |
| 646 | spinlock_t *src_ptl, *dst_ptl; | 736 | spinlock_t *src_ptl, *dst_ptl; |
| 647 | int progress = 0; | 737 | int progress = 0; |
| 648 | int rss[2]; | 738 | int rss[NR_MM_COUNTERS]; |
| 739 | swp_entry_t entry = (swp_entry_t){0}; | ||
| 649 | 740 | ||
| 650 | again: | 741 | again: |
| 651 | rss[1] = rss[0] = 0; | 742 | init_rss_vec(rss); |
| 743 | |||
| 652 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 744 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
| 653 | if (!dst_pte) | 745 | if (!dst_pte) |
| 654 | return -ENOMEM; | 746 | return -ENOMEM; |
| @@ -674,16 +766,25 @@ again: | |||
| 674 | progress++; | 766 | progress++; |
| 675 | continue; | 767 | continue; |
| 676 | } | 768 | } |
| 677 | copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); | 769 | entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, |
| 770 | vma, addr, rss); | ||
| 771 | if (entry.val) | ||
| 772 | break; | ||
| 678 | progress += 8; | 773 | progress += 8; |
| 679 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); | 774 | } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); |
| 680 | 775 | ||
| 681 | arch_leave_lazy_mmu_mode(); | 776 | arch_leave_lazy_mmu_mode(); |
| 682 | spin_unlock(src_ptl); | 777 | spin_unlock(src_ptl); |
| 683 | pte_unmap_nested(orig_src_pte); | 778 | pte_unmap_nested(orig_src_pte); |
| 684 | add_mm_rss(dst_mm, rss[0], rss[1]); | 779 | add_mm_rss_vec(dst_mm, rss); |
| 685 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 780 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
| 686 | cond_resched(); | 781 | cond_resched(); |
| 782 | |||
| 783 | if (entry.val) { | ||
| 784 | if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) | ||
| 785 | return -ENOMEM; | ||
| 786 | progress = 0; | ||
| 787 | } | ||
| 687 | if (addr != end) | 788 | if (addr != end) |
| 688 | goto again; | 789 | goto again; |
| 689 | return 0; | 790 | return 0; |
| @@ -803,8 +904,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 803 | struct mm_struct *mm = tlb->mm; | 904 | struct mm_struct *mm = tlb->mm; |
| 804 | pte_t *pte; | 905 | pte_t *pte; |
| 805 | spinlock_t *ptl; | 906 | spinlock_t *ptl; |
| 806 | int file_rss = 0; | 907 | int rss[NR_MM_COUNTERS]; |
| 807 | int anon_rss = 0; | 908 | |
| 909 | init_rss_vec(rss); | ||
| 808 | 910 | ||
| 809 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 911 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
| 810 | arch_enter_lazy_mmu_mode(); | 912 | arch_enter_lazy_mmu_mode(); |
| @@ -850,14 +952,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 850 | set_pte_at(mm, addr, pte, | 952 | set_pte_at(mm, addr, pte, |
| 851 | pgoff_to_pte(page->index)); | 953 | pgoff_to_pte(page->index)); |
| 852 | if (PageAnon(page)) | 954 | if (PageAnon(page)) |
| 853 | anon_rss--; | 955 | rss[MM_ANONPAGES]--; |
| 854 | else { | 956 | else { |
| 855 | if (pte_dirty(ptent)) | 957 | if (pte_dirty(ptent)) |
| 856 | set_page_dirty(page); | 958 | set_page_dirty(page); |
| 857 | if (pte_young(ptent) && | 959 | if (pte_young(ptent) && |
| 858 | likely(!VM_SequentialReadHint(vma))) | 960 | likely(!VM_SequentialReadHint(vma))) |
| 859 | mark_page_accessed(page); | 961 | mark_page_accessed(page); |
| 860 | file_rss--; | 962 | rss[MM_FILEPAGES]--; |
| 861 | } | 963 | } |
| 862 | page_remove_rmap(page); | 964 | page_remove_rmap(page); |
| 863 | if (unlikely(page_mapcount(page) < 0)) | 965 | if (unlikely(page_mapcount(page) < 0)) |
| @@ -874,13 +976,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
| 874 | if (pte_file(ptent)) { | 976 | if (pte_file(ptent)) { |
| 875 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | 977 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
| 876 | print_bad_pte(vma, addr, ptent, NULL); | 978 | print_bad_pte(vma, addr, ptent, NULL); |
| 877 | } else if | 979 | } else { |
| 878 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | 980 | swp_entry_t entry = pte_to_swp_entry(ptent); |
| 879 | print_bad_pte(vma, addr, ptent, NULL); | 981 | |
| 982 | if (!non_swap_entry(entry)) | ||
| 983 | rss[MM_SWAPENTS]--; | ||
| 984 | if (unlikely(!free_swap_and_cache(entry))) | ||
| 985 | print_bad_pte(vma, addr, ptent, NULL); | ||
| 986 | } | ||
| 880 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 987 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
| 881 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 988 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
| 882 | 989 | ||
| 883 | add_mm_rss(mm, file_rss, anon_rss); | 990 | add_mm_rss_vec(mm, rss); |
| 884 | arch_leave_lazy_mmu_mode(); | 991 | arch_leave_lazy_mmu_mode(); |
| 885 | pte_unmap_unlock(pte - 1, ptl); | 992 | pte_unmap_unlock(pte - 1, ptl); |
| 886 | 993 | ||
| @@ -943,6 +1050,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
| 943 | details = NULL; | 1050 | details = NULL; |
| 944 | 1051 | ||
| 945 | BUG_ON(addr >= end); | 1052 | BUG_ON(addr >= end); |
| 1053 | mem_cgroup_uncharge_start(); | ||
| 946 | tlb_start_vma(tlb, vma); | 1054 | tlb_start_vma(tlb, vma); |
| 947 | pgd = pgd_offset(vma->vm_mm, addr); | 1055 | pgd = pgd_offset(vma->vm_mm, addr); |
| 948 | do { | 1056 | do { |
| @@ -955,6 +1063,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, | |||
| 955 | zap_work, details); | 1063 | zap_work, details); |
| 956 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); | 1064 | } while (pgd++, addr = next, (addr != end && *zap_work > 0)); |
| 957 | tlb_end_vma(tlb, vma); | 1065 | tlb_end_vma(tlb, vma); |
| 1066 | mem_cgroup_uncharge_end(); | ||
| 958 | 1067 | ||
| 959 | return addr; | 1068 | return addr; |
| 960 | } | 1069 | } |
| @@ -1512,7 +1621,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 1512 | 1621 | ||
| 1513 | /* Ok, finally just insert the thing.. */ | 1622 | /* Ok, finally just insert the thing.. */ |
| 1514 | get_page(page); | 1623 | get_page(page); |
| 1515 | inc_mm_counter(mm, file_rss); | 1624 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
| 1516 | page_add_file_rmap(page); | 1625 | page_add_file_rmap(page); |
| 1517 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1626 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
| 1518 | 1627 | ||
| @@ -1578,7 +1687,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
| 1578 | /* Ok, finally just insert the thing.. */ | 1687 | /* Ok, finally just insert the thing.. */ |
| 1579 | entry = pte_mkspecial(pfn_pte(pfn, prot)); | 1688 | entry = pte_mkspecial(pfn_pte(pfn, prot)); |
| 1580 | set_pte_at(mm, addr, pte, entry); | 1689 | set_pte_at(mm, addr, pte, entry); |
| 1581 | update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ | 1690 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
| 1582 | 1691 | ||
| 1583 | retval = 0; | 1692 | retval = 0; |
| 1584 | out_unlock: | 1693 | out_unlock: |
| @@ -2029,6 +2138,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2029 | page_cache_release(old_page); | 2138 | page_cache_release(old_page); |
| 2030 | } | 2139 | } |
| 2031 | reuse = reuse_swap_page(old_page); | 2140 | reuse = reuse_swap_page(old_page); |
| 2141 | if (reuse) | ||
| 2142 | /* | ||
| 2143 | * The page is all ours. Move it to our anon_vma so | ||
| 2144 | * the rmap code will not search our parent or siblings. | ||
| 2145 | * Protected against the rmap code by the page lock. | ||
| 2146 | */ | ||
| 2147 | page_move_anon_rmap(old_page, vma, address); | ||
| 2032 | unlock_page(old_page); | 2148 | unlock_page(old_page); |
| 2033 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2149 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
| 2034 | (VM_WRITE|VM_SHARED))) { | 2150 | (VM_WRITE|VM_SHARED))) { |
| @@ -2101,7 +2217,7 @@ reuse: | |||
| 2101 | entry = pte_mkyoung(orig_pte); | 2217 | entry = pte_mkyoung(orig_pte); |
| 2102 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2218 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2103 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2219 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
| 2104 | update_mmu_cache(vma, address, entry); | 2220 | update_mmu_cache(vma, address, page_table); |
| 2105 | ret |= VM_FAULT_WRITE; | 2221 | ret |= VM_FAULT_WRITE; |
| 2106 | goto unlock; | 2222 | goto unlock; |
| 2107 | } | 2223 | } |
| @@ -2148,11 +2264,11 @@ gotten: | |||
| 2148 | if (likely(pte_same(*page_table, orig_pte))) { | 2264 | if (likely(pte_same(*page_table, orig_pte))) { |
| 2149 | if (old_page) { | 2265 | if (old_page) { |
| 2150 | if (!PageAnon(old_page)) { | 2266 | if (!PageAnon(old_page)) { |
| 2151 | dec_mm_counter(mm, file_rss); | 2267 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
| 2152 | inc_mm_counter(mm, anon_rss); | 2268 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2153 | } | 2269 | } |
| 2154 | } else | 2270 | } else |
| 2155 | inc_mm_counter(mm, anon_rss); | 2271 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2156 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2272 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 2157 | entry = mk_pte(new_page, vma->vm_page_prot); | 2273 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 2158 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2274 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| @@ -2170,7 +2286,7 @@ gotten: | |||
| 2170 | * new page to be mapped directly into the secondary page table. | 2286 | * new page to be mapped directly into the secondary page table. |
| 2171 | */ | 2287 | */ |
| 2172 | set_pte_at_notify(mm, address, page_table, entry); | 2288 | set_pte_at_notify(mm, address, page_table, entry); |
| 2173 | update_mmu_cache(vma, address, entry); | 2289 | update_mmu_cache(vma, address, page_table); |
| 2174 | if (old_page) { | 2290 | if (old_page) { |
| 2175 | /* | 2291 | /* |
| 2176 | * Only after switching the pte to the new page may | 2292 | * Only after switching the pte to the new page may |
| @@ -2514,7 +2630,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2514 | ret = VM_FAULT_HWPOISON; | 2630 | ret = VM_FAULT_HWPOISON; |
| 2515 | } else { | 2631 | } else { |
| 2516 | print_bad_pte(vma, address, orig_pte, NULL); | 2632 | print_bad_pte(vma, address, orig_pte, NULL); |
| 2517 | ret = VM_FAULT_OOM; | 2633 | ret = VM_FAULT_SIGBUS; |
| 2518 | } | 2634 | } |
| 2519 | goto out; | 2635 | goto out; |
| 2520 | } | 2636 | } |
| @@ -2540,6 +2656,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2540 | ret = VM_FAULT_MAJOR; | 2656 | ret = VM_FAULT_MAJOR; |
| 2541 | count_vm_event(PGMAJFAULT); | 2657 | count_vm_event(PGMAJFAULT); |
| 2542 | } else if (PageHWPoison(page)) { | 2658 | } else if (PageHWPoison(page)) { |
| 2659 | /* | ||
| 2660 | * hwpoisoned dirty swapcache pages are kept for killing | ||
| 2661 | * owner processes (which may be unknown at hwpoison time) | ||
| 2662 | */ | ||
| 2543 | ret = VM_FAULT_HWPOISON; | 2663 | ret = VM_FAULT_HWPOISON; |
| 2544 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2664 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
| 2545 | goto out_release; | 2665 | goto out_release; |
| @@ -2548,6 +2668,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2548 | lock_page(page); | 2668 | lock_page(page); |
| 2549 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | 2669 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); |
| 2550 | 2670 | ||
| 2671 | page = ksm_might_need_to_copy(page, vma, address); | ||
| 2672 | if (!page) { | ||
| 2673 | ret = VM_FAULT_OOM; | ||
| 2674 | goto out; | ||
| 2675 | } | ||
| 2676 | |||
| 2551 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 2677 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
| 2552 | ret = VM_FAULT_OOM; | 2678 | ret = VM_FAULT_OOM; |
| 2553 | goto out_page; | 2679 | goto out_page; |
| @@ -2579,7 +2705,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2579 | * discarded at swap_free(). | 2705 | * discarded at swap_free(). |
| 2580 | */ | 2706 | */ |
| 2581 | 2707 | ||
| 2582 | inc_mm_counter(mm, anon_rss); | 2708 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2709 | dec_mm_counter_fast(mm, MM_SWAPENTS); | ||
| 2583 | pte = mk_pte(page, vma->vm_page_prot); | 2710 | pte = mk_pte(page, vma->vm_page_prot); |
| 2584 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { | 2711 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
| 2585 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2712 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
| @@ -2604,7 +2731,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2604 | } | 2731 | } |
| 2605 | 2732 | ||
| 2606 | /* No need to invalidate - it was non-present before */ | 2733 | /* No need to invalidate - it was non-present before */ |
| 2607 | update_mmu_cache(vma, address, pte); | 2734 | update_mmu_cache(vma, address, page_table); |
| 2608 | unlock: | 2735 | unlock: |
| 2609 | pte_unmap_unlock(page_table, ptl); | 2736 | pte_unmap_unlock(page_table, ptl); |
| 2610 | out: | 2737 | out: |
| @@ -2663,13 +2790,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2663 | if (!pte_none(*page_table)) | 2790 | if (!pte_none(*page_table)) |
| 2664 | goto release; | 2791 | goto release; |
| 2665 | 2792 | ||
| 2666 | inc_mm_counter(mm, anon_rss); | 2793 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2667 | page_add_new_anon_rmap(page, vma, address); | 2794 | page_add_new_anon_rmap(page, vma, address); |
| 2668 | setpte: | 2795 | setpte: |
| 2669 | set_pte_at(mm, address, page_table, entry); | 2796 | set_pte_at(mm, address, page_table, entry); |
| 2670 | 2797 | ||
| 2671 | /* No need to invalidate - it was non-present before */ | 2798 | /* No need to invalidate - it was non-present before */ |
| 2672 | update_mmu_cache(vma, address, entry); | 2799 | update_mmu_cache(vma, address, page_table); |
| 2673 | unlock: | 2800 | unlock: |
| 2674 | pte_unmap_unlock(page_table, ptl); | 2801 | pte_unmap_unlock(page_table, ptl); |
| 2675 | return 0; | 2802 | return 0; |
| @@ -2817,10 +2944,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2817 | if (flags & FAULT_FLAG_WRITE) | 2944 | if (flags & FAULT_FLAG_WRITE) |
| 2818 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2945 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2819 | if (anon) { | 2946 | if (anon) { |
| 2820 | inc_mm_counter(mm, anon_rss); | 2947 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2821 | page_add_new_anon_rmap(page, vma, address); | 2948 | page_add_new_anon_rmap(page, vma, address); |
| 2822 | } else { | 2949 | } else { |
| 2823 | inc_mm_counter(mm, file_rss); | 2950 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
| 2824 | page_add_file_rmap(page); | 2951 | page_add_file_rmap(page); |
| 2825 | if (flags & FAULT_FLAG_WRITE) { | 2952 | if (flags & FAULT_FLAG_WRITE) { |
| 2826 | dirty_page = page; | 2953 | dirty_page = page; |
| @@ -2830,7 +2957,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2830 | set_pte_at(mm, address, page_table, entry); | 2957 | set_pte_at(mm, address, page_table, entry); |
| 2831 | 2958 | ||
| 2832 | /* no need to invalidate: a not-present page won't be cached */ | 2959 | /* no need to invalidate: a not-present page won't be cached */ |
| 2833 | update_mmu_cache(vma, address, entry); | 2960 | update_mmu_cache(vma, address, page_table); |
| 2834 | } else { | 2961 | } else { |
| 2835 | if (charged) | 2962 | if (charged) |
| 2836 | mem_cgroup_uncharge_page(page); | 2963 | mem_cgroup_uncharge_page(page); |
| @@ -2910,7 +3037,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2910 | * Page table corrupted: show pte and kill process. | 3037 | * Page table corrupted: show pte and kill process. |
| 2911 | */ | 3038 | */ |
| 2912 | print_bad_pte(vma, address, orig_pte, NULL); | 3039 | print_bad_pte(vma, address, orig_pte, NULL); |
| 2913 | return VM_FAULT_OOM; | 3040 | return VM_FAULT_SIGBUS; |
| 2914 | } | 3041 | } |
| 2915 | 3042 | ||
| 2916 | pgoff = pte_to_pgoff(orig_pte); | 3043 | pgoff = pte_to_pgoff(orig_pte); |
| @@ -2967,7 +3094,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2967 | } | 3094 | } |
| 2968 | entry = pte_mkyoung(entry); | 3095 | entry = pte_mkyoung(entry); |
| 2969 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3096 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
| 2970 | update_mmu_cache(vma, address, entry); | 3097 | update_mmu_cache(vma, address, pte); |
| 2971 | } else { | 3098 | } else { |
| 2972 | /* | 3099 | /* |
| 2973 | * This is needed only for protection faults but the arch code | 3100 | * This is needed only for protection faults but the arch code |
| @@ -2998,6 +3125,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2998 | 3125 | ||
| 2999 | count_vm_event(PGFAULT); | 3126 | count_vm_event(PGFAULT); |
| 3000 | 3127 | ||
| 3128 | /* do counter updates before entering the really critical section. */ | ||
| 3129 | check_sync_rss_stat(current); | ||
| 3130 | |||
| 3001 | if (unlikely(is_vm_hugetlb_page(vma))) | 3131 | if (unlikely(is_vm_hugetlb_page(vma))) |
| 3002 | return hugetlb_fault(mm, vma, address, flags); | 3132 | return hugetlb_fault(mm, vma, address, flags); |
| 3003 | 3133 | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2047465cd27c..78e34e63c7b8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -27,6 +27,8 @@ | |||
| 27 | #include <linux/page-isolation.h> | 27 | #include <linux/page-isolation.h> |
| 28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
| 29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
| 30 | #include <linux/mm_inline.h> | ||
| 31 | #include <linux/firmware-map.h> | ||
| 30 | 32 | ||
| 31 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
| 32 | 34 | ||
| @@ -71,7 +73,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) | |||
| 71 | atomic_inc(&page->_count); | 73 | atomic_inc(&page->_count); |
| 72 | } | 74 | } |
| 73 | 75 | ||
| 74 | void put_page_bootmem(struct page *page) | 76 | /* reference to __meminit __free_pages_bootmem is valid |
| 77 | * so use __ref to tell modpost not to generate a warning */ | ||
| 78 | void __ref put_page_bootmem(struct page *page) | ||
| 75 | { | 79 | { |
| 76 | int type; | 80 | int type; |
| 77 | 81 | ||
| @@ -520,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 520 | BUG_ON(ret); | 524 | BUG_ON(ret); |
| 521 | } | 525 | } |
| 522 | 526 | ||
| 527 | /* create new memmap entry */ | ||
| 528 | firmware_map_add_hotplug(start, start + size, "System RAM"); | ||
| 529 | |||
| 523 | goto out; | 530 | goto out; |
| 524 | 531 | ||
| 525 | error: | 532 | error: |
| @@ -672,6 +679,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
| 672 | if (!ret) { /* Success */ | 679 | if (!ret) { /* Success */ |
| 673 | list_add_tail(&page->lru, &source); | 680 | list_add_tail(&page->lru, &source); |
| 674 | move_pages--; | 681 | move_pages--; |
| 682 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 683 | page_is_file_cache(page)); | ||
| 684 | |||
| 675 | } else { | 685 | } else { |
| 676 | /* Because we don't have a big zone->lock, we should | 686 | /* Because we don't have a big zone->lock, we should |
| 677 | check this again here. */ | 687 | check this again here. */ |
| @@ -694,7 +704,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
| 694 | if (list_empty(&source)) | 704 | if (list_empty(&source)) |
| 695 | goto out; | 705 | goto out; |
| 696 | /* this function returns # of failed pages */ | 706 | /* this function returns # of failed pages */ |
| 697 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | 707 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); |
| 698 | 708 | ||
| 699 | out: | 709 | out: |
| 700 | return ret; | 710 | return ret; |
| @@ -747,7 +757,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
| 747 | return offlined; | 757 | return offlined; |
| 748 | } | 758 | } |
| 749 | 759 | ||
| 750 | int offline_pages(unsigned long start_pfn, | 760 | static int offline_pages(unsigned long start_pfn, |
| 751 | unsigned long end_pfn, unsigned long timeout) | 761 | unsigned long end_pfn, unsigned long timeout) |
| 752 | { | 762 | { |
| 753 | unsigned long pfn, nr_pages, expire; | 763 | unsigned long pfn, nr_pages, expire; |
| @@ -849,6 +859,10 @@ repeat: | |||
| 849 | 859 | ||
| 850 | setup_per_zone_wmarks(); | 860 | setup_per_zone_wmarks(); |
| 851 | calculate_zone_inactive_ratio(zone); | 861 | calculate_zone_inactive_ratio(zone); |
| 862 | if (!node_present_pages(node)) { | ||
| 863 | node_clear_state(node, N_HIGH_MEMORY); | ||
| 864 | kswapd_stop(node); | ||
| 865 | } | ||
| 852 | 866 | ||
| 853 | vm_total_pages = nr_free_pagecache_pages(); | 867 | vm_total_pages = nr_free_pagecache_pages(); |
| 854 | writeback_set_ratelimit(); | 868 | writeback_set_ratelimit(); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4545d5944243..bda230e52acd 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -85,10 +85,12 @@ | |||
| 85 | #include <linux/seq_file.h> | 85 | #include <linux/seq_file.h> |
| 86 | #include <linux/proc_fs.h> | 86 | #include <linux/proc_fs.h> |
| 87 | #include <linux/migrate.h> | 87 | #include <linux/migrate.h> |
| 88 | #include <linux/ksm.h> | ||
| 88 | #include <linux/rmap.h> | 89 | #include <linux/rmap.h> |
| 89 | #include <linux/security.h> | 90 | #include <linux/security.h> |
| 90 | #include <linux/syscalls.h> | 91 | #include <linux/syscalls.h> |
| 91 | #include <linux/ctype.h> | 92 | #include <linux/ctype.h> |
| 93 | #include <linux/mm_inline.h> | ||
| 92 | 94 | ||
| 93 | #include <asm/tlbflush.h> | 95 | #include <asm/tlbflush.h> |
| 94 | #include <asm/uaccess.h> | 96 | #include <asm/uaccess.h> |
| @@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 412 | if (!page) | 414 | if (!page) |
| 413 | continue; | 415 | continue; |
| 414 | /* | 416 | /* |
| 415 | * The check for PageReserved here is important to avoid | 417 | * vm_normal_page() filters out zero pages, but there might |
| 416 | * handling zero pages and other pages that may have been | 418 | * still be PageReserved pages to skip, perhaps in a VDSO. |
| 417 | * marked special by the system. | 419 | * And we cannot move PageKsm pages sensibly or safely yet. |
| 418 | * | ||
| 419 | * If the PageReserved would not be checked here then f.e. | ||
| 420 | * the location of the zero page could have an influence | ||
| 421 | * on MPOL_MF_STRICT, zero pages would be counted for | ||
| 422 | * the per node stats, and there would be useless attempts | ||
| 423 | * to put zero pages on the migration list. | ||
| 424 | */ | 420 | */ |
| 425 | if (PageReserved(page)) | 421 | if (PageReserved(page) || PageKsm(page)) |
| 426 | continue; | 422 | continue; |
| 427 | nid = page_to_nid(page); | 423 | nid = page_to_nid(page); |
| 428 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) | 424 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
| @@ -567,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | |||
| 567 | } | 563 | } |
| 568 | 564 | ||
| 569 | /* Step 2: apply policy to a range and do splits. */ | 565 | /* Step 2: apply policy to a range and do splits. */ |
| 570 | static int mbind_range(struct vm_area_struct *vma, unsigned long start, | 566 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
| 571 | unsigned long end, struct mempolicy *new) | 567 | unsigned long end, struct mempolicy *new_pol) |
| 572 | { | 568 | { |
| 573 | struct vm_area_struct *next; | 569 | struct vm_area_struct *next; |
| 574 | int err; | 570 | struct vm_area_struct *prev; |
| 571 | struct vm_area_struct *vma; | ||
| 572 | int err = 0; | ||
| 573 | pgoff_t pgoff; | ||
| 574 | unsigned long vmstart; | ||
| 575 | unsigned long vmend; | ||
| 575 | 576 | ||
| 576 | err = 0; | 577 | vma = find_vma_prev(mm, start, &prev); |
| 577 | for (; vma && vma->vm_start < end; vma = next) { | 578 | if (!vma || vma->vm_start > start) |
| 579 | return -EFAULT; | ||
| 580 | |||
| 581 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { | ||
| 578 | next = vma->vm_next; | 582 | next = vma->vm_next; |
| 579 | if (vma->vm_start < start) | 583 | vmstart = max(start, vma->vm_start); |
| 580 | err = split_vma(vma->vm_mm, vma, start, 1); | 584 | vmend = min(end, vma->vm_end); |
| 581 | if (!err && vma->vm_end > end) | 585 | |
| 582 | err = split_vma(vma->vm_mm, vma, end, 0); | 586 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
| 583 | if (!err) | 587 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
| 584 | err = policy_vma(vma, new); | 588 | vma->anon_vma, vma->vm_file, pgoff, new_pol); |
| 589 | if (prev) { | ||
| 590 | vma = prev; | ||
| 591 | next = vma->vm_next; | ||
| 592 | continue; | ||
| 593 | } | ||
| 594 | if (vma->vm_start != vmstart) { | ||
| 595 | err = split_vma(vma->vm_mm, vma, vmstart, 1); | ||
| 596 | if (err) | ||
| 597 | goto out; | ||
| 598 | } | ||
| 599 | if (vma->vm_end != vmend) { | ||
| 600 | err = split_vma(vma->vm_mm, vma, vmend, 0); | ||
| 601 | if (err) | ||
| 602 | goto out; | ||
| 603 | } | ||
| 604 | err = policy_vma(vma, new_pol); | ||
| 585 | if (err) | 605 | if (err) |
| 586 | break; | 606 | goto out; |
| 587 | } | 607 | } |
| 608 | |||
| 609 | out: | ||
| 588 | return err; | 610 | return err; |
| 589 | } | 611 | } |
| 590 | 612 | ||
| @@ -809,6 +831,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
| 809 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { | 831 | if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { |
| 810 | if (!isolate_lru_page(page)) { | 832 | if (!isolate_lru_page(page)) { |
| 811 | list_add_tail(&page->lru, pagelist); | 833 | list_add_tail(&page->lru, pagelist); |
| 834 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 835 | page_is_file_cache(page)); | ||
| 812 | } | 836 | } |
| 813 | } | 837 | } |
| 814 | } | 838 | } |
| @@ -836,7 +860,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, | |||
| 836 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 860 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
| 837 | 861 | ||
| 838 | if (!list_empty(&pagelist)) | 862 | if (!list_empty(&pagelist)) |
| 839 | err = migrate_pages(&pagelist, new_node_page, dest); | 863 | err = migrate_pages(&pagelist, new_node_page, dest, 0); |
| 840 | 864 | ||
| 841 | return err; | 865 | return err; |
| 842 | } | 866 | } |
| @@ -864,36 +888,36 @@ int do_migrate_pages(struct mm_struct *mm, | |||
| 864 | if (err) | 888 | if (err) |
| 865 | goto out; | 889 | goto out; |
| 866 | 890 | ||
| 867 | /* | 891 | /* |
| 868 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 892 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
| 869 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 893 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
| 870 | * bit in 'tmp', and return that <source, dest> pair for migration. | 894 | * bit in 'tmp', and return that <source, dest> pair for migration. |
| 871 | * The pair of nodemasks 'to' and 'from' define the map. | 895 | * The pair of nodemasks 'to' and 'from' define the map. |
| 872 | * | 896 | * |
| 873 | * If no pair of bits is found that way, fallback to picking some | 897 | * If no pair of bits is found that way, fallback to picking some |
| 874 | * pair of 'source' and 'dest' bits that are not the same. If the | 898 | * pair of 'source' and 'dest' bits that are not the same. If the |
| 875 | * 'source' and 'dest' bits are the same, this represents a node | 899 | * 'source' and 'dest' bits are the same, this represents a node |
| 876 | * that will be migrating to itself, so no pages need move. | 900 | * that will be migrating to itself, so no pages need move. |
| 877 | * | 901 | * |
| 878 | * If no bits are left in 'tmp', or if all remaining bits left | 902 | * If no bits are left in 'tmp', or if all remaining bits left |
| 879 | * in 'tmp' correspond to the same bit in 'to', return false | 903 | * in 'tmp' correspond to the same bit in 'to', return false |
| 880 | * (nothing left to migrate). | 904 | * (nothing left to migrate). |
| 881 | * | 905 | * |
| 882 | * This lets us pick a pair of nodes to migrate between, such that | 906 | * This lets us pick a pair of nodes to migrate between, such that |
| 883 | * if possible the dest node is not already occupied by some other | 907 | * if possible the dest node is not already occupied by some other |
| 884 | * source node, minimizing the risk of overloading the memory on a | 908 | * source node, minimizing the risk of overloading the memory on a |
| 885 | * node that would happen if we migrated incoming memory to a node | 909 | * node that would happen if we migrated incoming memory to a node |
| 886 | * before migrating outgoing memory source that same node. | 910 | * before migrating outgoing memory source that same node. |
| 887 | * | 911 | * |
| 888 | * A single scan of tmp is sufficient. As we go, we remember the | 912 | * A single scan of tmp is sufficient. As we go, we remember the |
| 889 | * most recent <s, d> pair that moved (s != d). If we find a pair | 913 | * most recent <s, d> pair that moved (s != d). If we find a pair |
| 890 | * that not only moved, but what's better, moved to an empty slot | 914 | * that not only moved, but what's better, moved to an empty slot |
| 891 | * (d is not set in tmp), then we break out then, with that pair. | 915 | * (d is not set in tmp), then we break out then, with that pair. |
| 892 | * Otherwise when we finish scannng from_tmp, we at least have the | 916 | * Otherwise when we finish scannng from_tmp, we at least have the |
| 893 | * most recent <s, d> pair that moved. If we get all the way through | 917 | * most recent <s, d> pair that moved. If we get all the way through |
| 894 | * the scan of tmp without finding any node that moved, much less | 918 | * the scan of tmp without finding any node that moved, much less |
| 895 | * moved to an empty node, then there is nothing left worth migrating. | 919 | * moved to an empty node, then there is nothing left worth migrating. |
| 896 | */ | 920 | */ |
| 897 | 921 | ||
| 898 | tmp = *from_nodes; | 922 | tmp = *from_nodes; |
| 899 | while (!nodes_empty(tmp)) { | 923 | while (!nodes_empty(tmp)) { |
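As a toy model of the pair-picking heuristic described in the comment above: map the k-th source node to the k-th destination node, prefer a destination that is no longer a pending source (an "empty slot"), otherwise settle for the last pair that moves at all. Plain unsigned long bitmasks stand in for nodemask_t and GCC builtins do the bit counting; this is an illustration, not the kernel's node_remap()-based code.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 64

/* Position (0-based) of node 'n' among the set bits of 'mask'. */
static int bit_index(unsigned long mask, int n)
{
	return __builtin_popcountl(mask & ((1UL << n) - 1));
}

/* k-th set bit of 'mask', or -1 if there is none. */
static int nth_bit(unsigned long mask, int k)
{
	for (int n = 0; n < MAX_NODES; n++)
		if ((mask & (1UL << n)) && k-- == 0)
			return n;
	return -1;
}

/* Pick one <s, d> pair out of 'tmp' (pending sources); clear s on success. */
static bool pick_pair(unsigned long *tmp, unsigned long from, unsigned long to,
		      int *s_out, int *d_out)
{
	int s = -1, d = -1;

	if (!to)
		return false;
	for (int n = 0; n < MAX_NODES; n++) {
		if (!(*tmp & (1UL << n)))
			continue;
		int dest = nth_bit(to, bit_index(from, n) % __builtin_popcountl(to));
		if (n == dest)
			continue;		/* would migrate to itself: skip */
		s = n;
		d = dest;
		if (!(*tmp & (1UL << dest)))
			break;			/* dest is an empty slot: best case */
	}
	if (s < 0)
		return false;			/* nothing left worth migrating */
	*tmp &= ~(1UL << s);
	*s_out = s;
	*d_out = d;
	return true;
}

int main(void)
{
	unsigned long from = 0x0f;		/* sources: nodes 0-3 */
	unsigned long to = 0xf0;		/* destinations: nodes 4-7 */
	unsigned long tmp = from;
	int s, d;

	while (pick_pair(&tmp, from, to, &s, &d))
		printf("migrate node %d -> node %d\n", s, d);
	return 0;
}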
| @@ -1049,11 +1073,11 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 1049 | if (!IS_ERR(vma)) { | 1073 | if (!IS_ERR(vma)) { |
| 1050 | int nr_failed = 0; | 1074 | int nr_failed = 0; |
| 1051 | 1075 | ||
| 1052 | err = mbind_range(vma, start, end, new); | 1076 | err = mbind_range(mm, start, end, new); |
| 1053 | 1077 | ||
| 1054 | if (!list_empty(&pagelist)) | 1078 | if (!list_empty(&pagelist)) |
| 1055 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1079 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
| 1056 | (unsigned long)vma); | 1080 | (unsigned long)vma, 0); |
| 1057 | 1081 | ||
| 1058 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1082 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
| 1059 | err = -EIO; | 1083 | err = -EIO; |
| @@ -1565,6 +1589,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
| 1565 | } | 1589 | } |
| 1566 | return zl; | 1590 | return zl; |
| 1567 | } | 1591 | } |
| 1592 | |||
| 1593 | /* | ||
| 1594 | * init_nodemask_of_mempolicy | ||
| 1595 | * | ||
| 1596 | * If the current task's mempolicy is "default" [NULL], return 'false' | ||
| 1597 | * to indicate default policy. Otherwise, extract the policy nodemask | ||
| 1598 | * for 'bind' or 'interleave' policy into the argument nodemask, or | ||
| 1599 | * initialize the argument nodemask to contain the single node for | ||
| 1600 | * 'preferred' or 'local' policy and return 'true' to indicate presence | ||
| 1601 | * of non-default mempolicy. | ||
| 1602 | * | ||
| 1603 | * We don't bother with reference counting the mempolicy [mpol_get/put] | ||
| 1604 | * because the current task is examining its own mempolicy and a task's | ||
| 1605 | * mempolicy is only ever changed by the task itself. | ||
| 1606 | * | ||
| 1607 | * N.B., it is the caller's responsibility to free a returned nodemask. | ||
| 1608 | */ | ||
| 1609 | bool init_nodemask_of_mempolicy(nodemask_t *mask) | ||
| 1610 | { | ||
| 1611 | struct mempolicy *mempolicy; | ||
| 1612 | int nid; | ||
| 1613 | |||
| 1614 | if (!(mask && current->mempolicy)) | ||
| 1615 | return false; | ||
| 1616 | |||
| 1617 | mempolicy = current->mempolicy; | ||
| 1618 | switch (mempolicy->mode) { | ||
| 1619 | case MPOL_PREFERRED: | ||
| 1620 | if (mempolicy->flags & MPOL_F_LOCAL) | ||
| 1621 | nid = numa_node_id(); | ||
| 1622 | else | ||
| 1623 | nid = mempolicy->v.preferred_node; | ||
| 1624 | init_nodemask_of_node(mask, nid); | ||
| 1625 | break; | ||
| 1626 | |||
| 1627 | case MPOL_BIND: | ||
| 1628 | /* Fall through */ | ||
| 1629 | case MPOL_INTERLEAVE: | ||
| 1630 | *mask = mempolicy->v.nodes; | ||
| 1631 | break; | ||
| 1632 | |||
| 1633 | default: | ||
| 1634 | BUG(); | ||
| 1635 | } | ||
| 1636 | |||
| 1637 | return true; | ||
| 1638 | } | ||
| 1568 | #endif | 1639 | #endif |
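init_nodemask_of_mempolicy() is the in-kernel helper (added for use by the hugetlb changes elsewhere in this series); the nearest user-space view of the same information is get_mempolicy(2), which reports the calling task's policy mode and nodemask. A small sketch using libnuma's <numaif.h> (link with -lnuma):

#include <numaif.h>
#include <stdio.h>

#define MAX_NODES 1024
#define BITS_PER_LONG (8 * sizeof(unsigned long))

int main(void)
{
	int mode;
	unsigned long mask[MAX_NODES / BITS_PER_LONG] = { 0 };

	if (get_mempolicy(&mode, mask, MAX_NODES, NULL, 0) != 0) {
		perror("get_mempolicy");
		return 1;
	}

	/* mode == MPOL_DEFAULT corresponds to the "return false" case above. */
	printf("mode = %d, nodes:", mode);
	for (unsigned n = 0; n < MAX_NODES; n++)
		if (mask[n / BITS_PER_LONG] & (1UL << (n % BITS_PER_LONG)))
			printf(" %u", n);
	printf("\n");
	return 0;
}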
| 1569 | 1640 | ||
| 1570 | /* Allocate a page in interleaved policy. | 1641 | /* Allocate a page in interleaved policy. |
diff --git a/mm/migrate.c b/mm/migrate.c index 7dbcb22316d2..88000b89fc9a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
| 22 | #include <linux/nsproxy.h> | 22 | #include <linux/nsproxy.h> |
| 23 | #include <linux/pagevec.h> | 23 | #include <linux/pagevec.h> |
| 24 | #include <linux/ksm.h> | ||
| 24 | #include <linux/rmap.h> | 25 | #include <linux/rmap.h> |
| 25 | #include <linux/topology.h> | 26 | #include <linux/topology.h> |
| 26 | #include <linux/cpu.h> | 27 | #include <linux/cpu.h> |
| @@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l) | |||
| 78 | /* | 79 | /* |
| 79 | * Restore a potential migration pte to a working pte entry | 80 | * Restore a potential migration pte to a working pte entry |
| 80 | */ | 81 | */ |
| 81 | static void remove_migration_pte(struct vm_area_struct *vma, | 82 | static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, |
| 82 | struct page *old, struct page *new) | 83 | unsigned long addr, void *old) |
| 83 | { | 84 | { |
| 84 | struct mm_struct *mm = vma->vm_mm; | 85 | struct mm_struct *mm = vma->vm_mm; |
| 85 | swp_entry_t entry; | 86 | swp_entry_t entry; |
| @@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
| 88 | pmd_t *pmd; | 89 | pmd_t *pmd; |
| 89 | pte_t *ptep, pte; | 90 | pte_t *ptep, pte; |
| 90 | spinlock_t *ptl; | 91 | spinlock_t *ptl; |
| 91 | unsigned long addr = page_address_in_vma(new, vma); | ||
| 92 | |||
| 93 | if (addr == -EFAULT) | ||
| 94 | return; | ||
| 95 | 92 | ||
| 96 | pgd = pgd_offset(mm, addr); | 93 | pgd = pgd_offset(mm, addr); |
| 97 | if (!pgd_present(*pgd)) | 94 | if (!pgd_present(*pgd)) |
| 98 | return; | 95 | goto out; |
| 99 | 96 | ||
| 100 | pud = pud_offset(pgd, addr); | 97 | pud = pud_offset(pgd, addr); |
| 101 | if (!pud_present(*pud)) | 98 | if (!pud_present(*pud)) |
| 102 | return; | 99 | goto out; |
| 103 | 100 | ||
| 104 | pmd = pmd_offset(pud, addr); | 101 | pmd = pmd_offset(pud, addr); |
| 105 | if (!pmd_present(*pmd)) | 102 | if (!pmd_present(*pmd)) |
| 106 | return; | 103 | goto out; |
| 107 | 104 | ||
| 108 | ptep = pte_offset_map(pmd, addr); | 105 | ptep = pte_offset_map(pmd, addr); |
| 109 | 106 | ||
| 110 | if (!is_swap_pte(*ptep)) { | 107 | if (!is_swap_pte(*ptep)) { |
| 111 | pte_unmap(ptep); | 108 | pte_unmap(ptep); |
| 112 | return; | 109 | goto out; |
| 113 | } | 110 | } |
| 114 | 111 | ||
| 115 | ptl = pte_lockptr(mm, pmd); | 112 | ptl = pte_lockptr(mm, pmd); |
| 116 | spin_lock(ptl); | 113 | spin_lock(ptl); |
| 117 | pte = *ptep; | 114 | pte = *ptep; |
| 118 | if (!is_swap_pte(pte)) | 115 | if (!is_swap_pte(pte)) |
| 119 | goto out; | 116 | goto unlock; |
| 120 | 117 | ||
| 121 | entry = pte_to_swp_entry(pte); | 118 | entry = pte_to_swp_entry(pte); |
| 122 | 119 | ||
| 123 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) | 120 | if (!is_migration_entry(entry) || |
| 124 | goto out; | 121 | migration_entry_to_page(entry) != old) |
| 122 | goto unlock; | ||
| 125 | 123 | ||
| 126 | get_page(new); | 124 | get_page(new); |
| 127 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 125 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
| @@ -136,59 +134,11 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
| 136 | page_add_file_rmap(new); | 134 | page_add_file_rmap(new); |
| 137 | 135 | ||
| 138 | /* No need to invalidate - it was non-present before */ | 136 | /* No need to invalidate - it was non-present before */ |
| 139 | update_mmu_cache(vma, addr, pte); | 137 | update_mmu_cache(vma, addr, ptep); |
| 140 | 138 | unlock: | |
| 141 | out: | ||
| 142 | pte_unmap_unlock(ptep, ptl); | 139 | pte_unmap_unlock(ptep, ptl); |
| 143 | } | 140 | out: |
| 144 | 141 | return SWAP_AGAIN; | |
| 145 | /* | ||
| 146 | * Note that remove_file_migration_ptes will only work on regular mappings, | ||
| 147 | * Nonlinear mappings do not use migration entries. | ||
| 148 | */ | ||
| 149 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
| 150 | { | ||
| 151 | struct vm_area_struct *vma; | ||
| 152 | struct address_space *mapping = new->mapping; | ||
| 153 | struct prio_tree_iter iter; | ||
| 154 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 155 | |||
| 156 | if (!mapping) | ||
| 157 | return; | ||
| 158 | |||
| 159 | spin_lock(&mapping->i_mmap_lock); | ||
| 160 | |||
| 161 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
| 162 | remove_migration_pte(vma, old, new); | ||
| 163 | |||
| 164 | spin_unlock(&mapping->i_mmap_lock); | ||
| 165 | } | ||
| 166 | |||
| 167 | /* | ||
| 168 | * Must hold mmap_sem lock on at least one of the vmas containing | ||
| 169 | * the page so that the anon_vma cannot vanish. | ||
| 170 | */ | ||
| 171 | static void remove_anon_migration_ptes(struct page *old, struct page *new) | ||
| 172 | { | ||
| 173 | struct anon_vma *anon_vma; | ||
| 174 | struct vm_area_struct *vma; | ||
| 175 | unsigned long mapping; | ||
| 176 | |||
| 177 | mapping = (unsigned long)new->mapping; | ||
| 178 | |||
| 179 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
| 180 | return; | ||
| 181 | |||
| 182 | /* | ||
| 183 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
| 184 | */ | ||
| 185 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
| 186 | spin_lock(&anon_vma->lock); | ||
| 187 | |||
| 188 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
| 189 | remove_migration_pte(vma, old, new); | ||
| 190 | |||
| 191 | spin_unlock(&anon_vma->lock); | ||
| 192 | } | 142 | } |
| 193 | 143 | ||
| 194 | /* | 144 | /* |
| @@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new) | |||
| 197 | */ | 147 | */ |
| 198 | static void remove_migration_ptes(struct page *old, struct page *new) | 148 | static void remove_migration_ptes(struct page *old, struct page *new) |
| 199 | { | 149 | { |
| 200 | if (PageAnon(new)) | 150 | rmap_walk(new, remove_migration_pte, old); |
| 201 | remove_anon_migration_ptes(old, new); | ||
| 202 | else | ||
| 203 | remove_file_migration_ptes(old, new); | ||
| 204 | } | 151 | } |
| 205 | 152 | ||
| 206 | /* | 153 | /* |
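The two per-mapping-type walkers removed above are folded into a single rmap_walk() traversal, so remove_migration_pte() now has the shape of a generic rmap_walk() callback: it is called once per vma that maps the page and returns SWAP_AGAIN to continue the walk. A trivial callback with the same contract, shown only as an illustration (dump_one_mapping() is not from this patch), would be driven with rmap_walk(page, dump_one_mapping, NULL):

	/* Illustrative rmap_walk() callback: returning anything other than
	 * SWAP_AGAIN is expected to end the walk early. */
	static int dump_one_mapping(struct page *page, struct vm_area_struct *vma,
				    unsigned long addr, void *arg)
	{
		pr_debug("page %p mapped at %#lx in mm %p\n", page, addr, vma->vm_mm);
		return SWAP_AGAIN;
	}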
| @@ -328,8 +275,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 328 | */ | 275 | */ |
| 329 | static void migrate_page_copy(struct page *newpage, struct page *page) | 276 | static void migrate_page_copy(struct page *newpage, struct page *page) |
| 330 | { | 277 | { |
| 331 | int anon; | ||
| 332 | |||
| 333 | copy_highpage(newpage, page); | 278 | copy_highpage(newpage, page); |
| 334 | 279 | ||
| 335 | if (PageError(page)) | 280 | if (PageError(page)) |
| @@ -341,8 +286,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 341 | if (TestClearPageActive(page)) { | 286 | if (TestClearPageActive(page)) { |
| 342 | VM_BUG_ON(PageUnevictable(page)); | 287 | VM_BUG_ON(PageUnevictable(page)); |
| 343 | SetPageActive(newpage); | 288 | SetPageActive(newpage); |
| 344 | } else | 289 | } else if (TestClearPageUnevictable(page)) |
| 345 | unevictable_migrate_page(newpage, page); | 290 | SetPageUnevictable(newpage); |
| 346 | if (PageChecked(page)) | 291 | if (PageChecked(page)) |
| 347 | SetPageChecked(newpage); | 292 | SetPageChecked(newpage); |
| 348 | if (PageMappedToDisk(page)) | 293 | if (PageMappedToDisk(page)) |
| @@ -361,12 +306,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 361 | } | 306 | } |
| 362 | 307 | ||
| 363 | mlock_migrate_page(newpage, page); | 308 | mlock_migrate_page(newpage, page); |
| 309 | ksm_migrate_page(newpage, page); | ||
| 364 | 310 | ||
| 365 | ClearPageSwapCache(page); | 311 | ClearPageSwapCache(page); |
| 366 | ClearPagePrivate(page); | 312 | ClearPagePrivate(page); |
| 367 | set_page_private(page, 0); | 313 | set_page_private(page, 0); |
| 368 | /* page->mapping contains a flag for PageAnon() */ | ||
| 369 | anon = PageAnon(page); | ||
| 370 | page->mapping = NULL; | 314 | page->mapping = NULL; |
| 371 | 315 | ||
| 372 | /* | 316 | /* |
| @@ -580,9 +524,9 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
| 580 | else | 524 | else |
| 581 | rc = fallback_migrate_page(mapping, newpage, page); | 525 | rc = fallback_migrate_page(mapping, newpage, page); |
| 582 | 526 | ||
| 583 | if (!rc) { | 527 | if (!rc) |
| 584 | remove_migration_ptes(page, newpage); | 528 | remove_migration_ptes(page, newpage); |
| 585 | } else | 529 | else |
| 586 | newpage->mapping = NULL; | 530 | newpage->mapping = NULL; |
| 587 | 531 | ||
| 588 | unlock_page(newpage); | 532 | unlock_page(newpage); |
| @@ -595,7 +539,7 @@ static int move_to_new_page(struct page *newpage, struct page *page) | |||
| 595 | * to the newly allocated page in newpage. | 539 | * to the newly allocated page in newpage. |
| 596 | */ | 540 | */ |
| 597 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | 541 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, |
| 598 | struct page *page, int force) | 542 | struct page *page, int force, int offlining) |
| 599 | { | 543 | { |
| 600 | int rc = 0; | 544 | int rc = 0; |
| 601 | int *result = NULL; | 545 | int *result = NULL; |
| @@ -621,6 +565,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 621 | lock_page(page); | 565 | lock_page(page); |
| 622 | } | 566 | } |
| 623 | 567 | ||
| 568 | /* | ||
| 569 | * Only memory hotplug's offline_pages() caller has locked out KSM, | ||
| 570 | * and can safely migrate a KSM page. The other cases have skipped | ||
| 571 | * PageKsm along with PageReserved - but it is only now when we have | ||
| 572 | * the page lock that we can be certain it will not go KSM beneath us | ||
| 573 | * (KSM will not upgrade a page from PageAnon to PageKsm when it sees | ||
| 574 | * its pagecount raised, but only here do we take the page lock which | ||
| 575 | * serializes that). | ||
| 576 | */ | ||
| 577 | if (PageKsm(page) && !offlining) { | ||
| 578 | rc = -EBUSY; | ||
| 579 | goto unlock; | ||
| 580 | } | ||
| 581 | |||
| 624 | /* charge against new page */ | 582 | /* charge against new page */ |
| 625 | charge = mem_cgroup_prepare_migration(page, &mem); | 583 | charge = mem_cgroup_prepare_migration(page, &mem); |
| 626 | if (charge == -ENOMEM) { | 584 | if (charge == -ENOMEM) { |
| @@ -737,7 +695,7 @@ move_newpage: | |||
| 737 | * Return: Number of pages not migrated or error code. | 695 | * Return: Number of pages not migrated or error code. |
| 738 | */ | 696 | */ |
| 739 | int migrate_pages(struct list_head *from, | 697 | int migrate_pages(struct list_head *from, |
| 740 | new_page_t get_new_page, unsigned long private) | 698 | new_page_t get_new_page, unsigned long private, int offlining) |
| 741 | { | 699 | { |
| 742 | int retry = 1; | 700 | int retry = 1; |
| 743 | int nr_failed = 0; | 701 | int nr_failed = 0; |
| @@ -746,13 +704,6 @@ int migrate_pages(struct list_head *from, | |||
| 746 | struct page *page2; | 704 | struct page *page2; |
| 747 | int swapwrite = current->flags & PF_SWAPWRITE; | 705 | int swapwrite = current->flags & PF_SWAPWRITE; |
| 748 | int rc; | 706 | int rc; |
| 749 | unsigned long flags; | ||
| 750 | |||
| 751 | local_irq_save(flags); | ||
| 752 | list_for_each_entry(page, from, lru) | ||
| 753 | __inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 754 | page_is_file_cache(page)); | ||
| 755 | local_irq_restore(flags); | ||
| 756 | 707 | ||
| 757 | if (!swapwrite) | 708 | if (!swapwrite) |
| 758 | current->flags |= PF_SWAPWRITE; | 709 | current->flags |= PF_SWAPWRITE; |
| @@ -764,7 +715,7 @@ int migrate_pages(struct list_head *from, | |||
| 764 | cond_resched(); | 715 | cond_resched(); |
| 765 | 716 | ||
| 766 | rc = unmap_and_move(get_new_page, private, | 717 | rc = unmap_and_move(get_new_page, private, |
| 767 | page, pass > 2); | 718 | page, pass > 2, offlining); |
| 768 | 719 | ||
| 769 | switch(rc) { | 720 | switch(rc) { |
| 770 | case -ENOMEM: | 721 | case -ENOMEM: |
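migrate_pages() now takes an extra "offlining" argument; per the comment added to unmap_and_move(), only memory hotplug's offline_pages() path, which has locked out KSM, may pass it as non-zero. A hedged sketch of such a caller follows; drain_isolated_range() and the hotremove_alloc_page allocator are hypothetical, not code from this patch.

	/* Hypothetical hotplug-style caller: offlining == 1 is legitimate only
	 * because KSM has already been excluded by offline_pages(). */
	static int drain_isolated_range(struct list_head *isolated)
	{
		int nr_failed;

		if (list_empty(isolated))
			return 0;

		nr_failed = migrate_pages(isolated, hotremove_alloc_page,
					  0 /* private */, 1 /* offlining */);
		if (nr_failed)
			putback_lru_pages(isolated);
		return nr_failed;
	}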
| @@ -860,7 +811,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 860 | if (!page) | 811 | if (!page) |
| 861 | goto set_status; | 812 | goto set_status; |
| 862 | 813 | ||
| 863 | if (PageReserved(page)) /* Check for zero page */ | 814 | /* Use PageReserved to check for zero page */ |
| 815 | if (PageReserved(page) || PageKsm(page)) | ||
| 864 | goto put_and_set; | 816 | goto put_and_set; |
| 865 | 817 | ||
| 866 | pp->page = page; | 818 | pp->page = page; |
| @@ -878,8 +830,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 878 | goto put_and_set; | 830 | goto put_and_set; |
| 879 | 831 | ||
| 880 | err = isolate_lru_page(page); | 832 | err = isolate_lru_page(page); |
| 881 | if (!err) | 833 | if (!err) { |
| 882 | list_add_tail(&page->lru, &pagelist); | 834 | list_add_tail(&page->lru, &pagelist); |
| 835 | inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 836 | page_is_file_cache(page)); | ||
| 837 | } | ||
| 883 | put_and_set: | 838 | put_and_set: |
| 884 | /* | 839 | /* |
| 885 | * Either remove the duplicate refcount from | 840 | * Either remove the duplicate refcount from |
| @@ -894,7 +849,7 @@ set_status: | |||
| 894 | err = 0; | 849 | err = 0; |
| 895 | if (!list_empty(&pagelist)) | 850 | if (!list_empty(&pagelist)) |
| 896 | err = migrate_pages(&pagelist, new_page_node, | 851 | err = migrate_pages(&pagelist, new_page_node, |
| 897 | (unsigned long)pm); | 852 | (unsigned long)pm, 0); |
| 898 | 853 | ||
| 899 | up_read(&mm->mmap_sem); | 854 | up_read(&mm->mmap_sem); |
| 900 | return err; | 855 | return err; |
| @@ -953,6 +908,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
| 953 | goto out_pm; | 908 | goto out_pm; |
| 954 | 909 | ||
| 955 | err = -ENODEV; | 910 | err = -ENODEV; |
| 911 | if (node < 0 || node >= MAX_NUMNODES) | ||
| 912 | goto out_pm; | ||
| 913 | |||
| 956 | if (!node_state(node, N_HIGH_MEMORY)) | 914 | if (!node_state(node, N_HIGH_MEMORY)) |
| 957 | goto out_pm; | 915 | goto out_pm; |
| 958 | 916 | ||
| @@ -1015,7 +973,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, | |||
| 1015 | 973 | ||
| 1016 | err = -ENOENT; | 974 | err = -ENOENT; |
| 1017 | /* Use PageReserved to check for zero page */ | 975 | /* Use PageReserved to check for zero page */ |
| 1018 | if (!page || PageReserved(page)) | 976 | if (!page || PageReserved(page) || PageKsm(page)) |
| 1019 | goto set_status; | 977 | goto set_status; |
| 1020 | 978 | ||
| 1021 | err = page_to_nid(page); | 979 | err = page_to_nid(page); |
| @@ -1040,33 +998,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, | |||
| 1040 | #define DO_PAGES_STAT_CHUNK_NR 16 | 998 | #define DO_PAGES_STAT_CHUNK_NR 16 |
| 1041 | const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; | 999 | const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; |
| 1042 | int chunk_status[DO_PAGES_STAT_CHUNK_NR]; | 1000 | int chunk_status[DO_PAGES_STAT_CHUNK_NR]; |
| 1043 | unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; | ||
| 1044 | int err; | ||
| 1045 | 1001 | ||
| 1046 | for (i = 0; i < nr_pages; i += chunk_nr) { | 1002 | while (nr_pages) { |
| 1047 | if (chunk_nr + i > nr_pages) | 1003 | unsigned long chunk_nr; |
| 1048 | chunk_nr = nr_pages - i; | ||
| 1049 | 1004 | ||
| 1050 | err = copy_from_user(chunk_pages, &pages[i], | 1005 | chunk_nr = nr_pages; |
| 1051 | chunk_nr * sizeof(*chunk_pages)); | 1006 | if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) |
| 1052 | if (err) { | 1007 | chunk_nr = DO_PAGES_STAT_CHUNK_NR; |
| 1053 | err = -EFAULT; | 1008 | |
| 1054 | goto out; | 1009 | if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) |
| 1055 | } | 1010 | break; |
| 1056 | 1011 | ||
| 1057 | do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); | 1012 | do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); |
| 1058 | 1013 | ||
| 1059 | err = copy_to_user(&status[i], chunk_status, | 1014 | if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) |
| 1060 | chunk_nr * sizeof(*chunk_status)); | 1015 | break; |
| 1061 | if (err) { | ||
| 1062 | err = -EFAULT; | ||
| 1063 | goto out; | ||
| 1064 | } | ||
| 1065 | } | ||
| 1066 | err = 0; | ||
| 1067 | 1016 | ||
| 1068 | out: | 1017 | pages += chunk_nr; |
| 1069 | return err; | 1018 | status += chunk_nr; |
| 1019 | nr_pages -= chunk_nr; | ||
| 1020 | } | ||
| 1021 | return nr_pages ? -EFAULT : 0; | ||
| 1070 | } | 1022 | } |
| 1071 | 1023 | ||
| 1072 | /* | 1024 | /* |
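The rewritten chunking loop above is what services a status-only query from userspace (nodes == NULL). A hedged userspace example, assuming libnuma's <numaif.h> wrapper for the move_pages() syscall (link with -lnuma) and a 4 KB page size:

	#include <numaif.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		long psz = sysconf(_SC_PAGESIZE);
		unsigned long count = 4, i;
		void *pages[4];
		int status[4];
		char *buf = malloc(count * psz);

		if (!buf)
			return 1;
		for (i = 0; i < count; i++) {
			buf[i * psz] = 1;		/* fault each page in */
			pages[i] = buf + i * psz;
		}
		/* nodes == NULL: query placement only, do not migrate */
		if (move_pages(0, count, pages, NULL, status, 0) == 0)
			for (i = 0; i < count; i++)
				printf("page %lu on node %d\n", i, status[i]);
		free(buf);
		return 0;
	}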
diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f84ea4..7a3436ef39eb 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
| @@ -14,6 +14,7 @@ | |||
| 14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
| 15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
| 16 | #include <linux/swapops.h> | 16 | #include <linux/swapops.h> |
| 17 | #include <linux/hugetlb.h> | ||
| 17 | 18 | ||
| 18 | #include <asm/uaccess.h> | 19 | #include <asm/uaccess.h> |
| 19 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
| @@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag | |||
| 72 | if (!vma || addr < vma->vm_start) | 73 | if (!vma || addr < vma->vm_start) |
| 73 | return -ENOMEM; | 74 | return -ENOMEM; |
| 74 | 75 | ||
| 76 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 77 | if (is_vm_hugetlb_page(vma)) { | ||
| 78 | struct hstate *h; | ||
| 79 | unsigned long nr_huge; | ||
| 80 | unsigned char present; | ||
| 81 | |||
| 82 | i = 0; | ||
| 83 | nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT); | ||
| 84 | h = hstate_vma(vma); | ||
| 85 | nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h)) | ||
| 86 | - (addr >> huge_page_shift(h)) + 1; | ||
| 87 | nr_huge = min(nr_huge, | ||
| 88 | (vma->vm_end - addr) >> huge_page_shift(h)); | ||
| 89 | while (1) { | ||
| 90 | /* hugepages are always in RAM for now, | ||
| 91 | * but in general this needs to be checked */ | ||
| 92 | ptep = huge_pte_offset(current->mm, | ||
| 93 | addr & huge_page_mask(h)); | ||
| 94 | present = !!(ptep && | ||
| 95 | !huge_pte_none(huge_ptep_get(ptep))); | ||
| 96 | while (1) { | ||
| 97 | vec[i++] = present; | ||
| 98 | addr += PAGE_SIZE; | ||
| 99 | /* reach buffer limit */ | ||
| 100 | if (i == nr) | ||
| 101 | return nr; | ||
| 102 | /* check hugepage border */ | ||
| 103 | if (!((addr & ~huge_page_mask(h)) | ||
| 104 | >> PAGE_SHIFT)) | ||
| 105 | break; | ||
| 106 | } | ||
| 107 | } | ||
| 108 | return nr; | ||
| 109 | } | ||
| 110 | #endif | ||
| 111 | |||
| 75 | /* | 112 | /* |
| 76 | * Calculate how many pages there are left in the last level of the | 113 | * Calculate how many pages there are left in the last level of the |
| 77 | * PTE array for our address. | 114 | * PTE array for our address. |
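A hedged userspace illustration of the new behaviour: mincore() on a hugepage-backed range now reports one byte per small page, each mirroring the presence of the containing hugepage. The example assumes a 4 KB base page, a configured 2 MB hugepage pool, and that MAP_HUGETLB is available on the running kernel (a hugetlbfs-backed mapping works equally well).

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 2 * 1024 * 1024;	/* one 2 MB hugepage */
		unsigned char vec[512];		/* len / 4 KB entries */
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 0, len);		/* fault the hugepage in */
		if (mincore(p, len, vec) == 0)
			printf("first small page resident: %d\n", vec[0]);
		munmap(p, len);
		return 0;
	}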
diff --git a/mm/mlock.c b/mm/mlock.c index bd6f0e466f6c..8f4e2dfceec1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -25,7 +25,7 @@ int can_do_mlock(void) | |||
| 25 | { | 25 | { |
| 26 | if (capable(CAP_IPC_LOCK)) | 26 | if (capable(CAP_IPC_LOCK)) |
| 27 | return 1; | 27 | return 1; |
| 28 | if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) | 28 | if (rlimit(RLIMIT_MEMLOCK) != 0) |
| 29 | return 1; | 29 | return 1; |
| 30 | return 0; | 30 | return 0; |
| 31 | } | 31 | } |
| @@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page) | |||
| 88 | } | 88 | } |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | /* | 91 | /** |
| 92 | * called from munlock()/munmap() path with page supposedly on the LRU. | 92 | * munlock_vma_page - munlock a vma page |
| 93 | * @page: page to be unlocked | ||
| 93 | * | 94 | * |
| 94 | * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked | 95 | * called from munlock()/munmap() path with page supposedly on the LRU. |
| 95 | * [in try_to_munlock()] and then attempt to isolate the page. We must | 96 | * When we munlock a page, because the vma where we found the page is being |
| 96 | * isolate the page to keep others from messing with its unevictable | 97 | * munlock()ed or munmap()ed, we want to check whether other vmas hold the |
| 97 | * and mlocked state while trying to munlock. However, we pre-clear the | 98 | * page locked so that we can leave it on the unevictable lru list and not |
| 98 | * mlocked state anyway as we might lose the isolation race and we might | 99 | * bother vmscan with it. However, to walk the page's rmap list in |
| 99 | * not get another chance to clear PageMlocked. If we successfully | 100 | * try_to_munlock() we must isolate the page from the LRU. If some other |
| 100 | * isolate the page and try_to_munlock() detects other VM_LOCKED vmas | 101 | * task has removed the page from the LRU, we won't be able to do that. |
| 101 | * mapping the page, it will restore the PageMlocked state, unless the page | 102 | * So we clear the PageMlocked as we might not get another chance. If we |
| 102 | * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), | 103 | * can't isolate the page, we leave it for putback_lru_page() and vmscan |
| 103 | * perhaps redundantly. | 104 | * [page_referenced()/try_to_unmap()] to deal with. |
| 104 | * If we lose the isolation race, and the page is mapped by other VM_LOCKED | ||
| 105 | * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() | ||
| 106 | * either of which will restore the PageMlocked state by calling | ||
| 107 | * mlock_vma_page() above, if it can grab the vma's mmap sem. | ||
| 108 | */ | 105 | */ |
| 109 | static void munlock_vma_page(struct page *page) | 106 | void munlock_vma_page(struct page *page) |
| 110 | { | 107 | { |
| 111 | BUG_ON(!PageLocked(page)); | 108 | BUG_ON(!PageLocked(page)); |
| 112 | 109 | ||
| @@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page) | |||
| 117 | /* | 114 | /* |
| 118 | * did try_to_unlock() succeed or punt? | 115 | * did try_to_unlock() succeed or punt? |
| 119 | */ | 116 | */ |
| 120 | if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) | 117 | if (ret != SWAP_MLOCK) |
| 121 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); | 118 | count_vm_event(UNEVICTABLE_PGMUNLOCKED); |
| 122 | 119 | ||
| 123 | putback_lru_page(page); | 120 | putback_lru_page(page); |
| 124 | } else { | 121 | } else { |
| 125 | /* | 122 | /* |
| 126 | * We lost the race. let try_to_unmap() deal | 123 | * Some other task has removed the page from the LRU. |
| 127 | * with it. At least we get the page state and | 124 | * putback_lru_page() will take care of removing the |
| 128 | * mlock stats right. However, page is still on | 125 | * page from the unevictable list, if necessary. |
| 129 | * the noreclaim list. We'll fix that up when | 126 | * vmscan [page_referenced()] will move the page back |
| 130 | * the page is eventually freed or we scan the | 127 | * to the unevictable list if some other vma has it |
| 131 | * noreclaim list. | 128 | * mlocked. |
| 132 | */ | 129 | */ |
| 133 | if (PageUnevictable(page)) | 130 | if (PageUnevictable(page)) |
| 134 | count_vm_event(UNEVICTABLE_PGSTRANDED); | 131 | count_vm_event(UNEVICTABLE_PGSTRANDED); |
| @@ -490,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
| 490 | locked = len >> PAGE_SHIFT; | 487 | locked = len >> PAGE_SHIFT; |
| 491 | locked += current->mm->locked_vm; | 488 | locked += current->mm->locked_vm; |
| 492 | 489 | ||
| 493 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 490 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 494 | lock_limit >>= PAGE_SHIFT; | 491 | lock_limit >>= PAGE_SHIFT; |
| 495 | 492 | ||
| 496 | /* check against resource limits */ | 493 | /* check against resource limits */ |
| @@ -553,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
| 553 | 550 | ||
| 554 | down_write(¤t->mm->mmap_sem); | 551 | down_write(¤t->mm->mmap_sem); |
| 555 | 552 | ||
| 556 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 553 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 557 | lock_limit >>= PAGE_SHIFT; | 554 | lock_limit >>= PAGE_SHIFT; |
| 558 | 555 | ||
| 559 | ret = -ENOMEM; | 556 | ret = -ENOMEM; |
| @@ -587,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user) | |||
| 587 | int allowed = 0; | 584 | int allowed = 0; |
| 588 | 585 | ||
| 589 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 590 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 587 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 591 | if (lock_limit == RLIM_INFINITY) | 588 | if (lock_limit == RLIM_INFINITY) |
| 592 | allowed = 1; | 589 | allowed = 1; |
| 593 | lock_limit >>= PAGE_SHIFT; | 590 | lock_limit >>= PAGE_SHIFT; |
| @@ -621,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | |||
| 621 | 618 | ||
| 622 | down_write(&mm->mmap_sem); | 619 | down_write(&mm->mmap_sem); |
| 623 | 620 | ||
| 624 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 621 | lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; |
| 625 | vm = mm->total_vm + pgsz; | 622 | vm = mm->total_vm + pgsz; |
| 626 | if (lim < vm) | 623 | if (lim < vm) |
| 627 | goto out; | 624 | goto out; |
| 628 | 625 | ||
| 629 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 626 | lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; |
| 630 | vm = mm->locked_vm + pgsz; | 627 | vm = mm->locked_vm + pgsz; |
| 631 | if (lim < vm) | 628 | if (lim < vm) |
| 632 | goto out; | 629 | goto out; |
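The open-coded reads of current->signal->rlim[...].rlim_cur above are being converted to an rlimit() helper defined in a header elsewhere in this series. As a sketch of the intent only (not the exact upstream definition), the helper reduces to something like:

	/* Sketch: read the soft limit exactly once, since another thread can
	 * change it concurrently via setrlimit(). */
	static inline unsigned long rlimit(unsigned int limit)
	{
		return ACCESS_ONCE(current->signal->rlim[limit].rlim_cur);
	}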
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
| 21 | #include <linux/personality.h> | 21 | #include <linux/personality.h> |
| 22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
| 23 | #include <linux/ima.h> | ||
| 24 | #include <linux/hugetlb.h> | 23 | #include <linux/hugetlb.h> |
| 25 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
| 26 | #include <linux/module.h> | 25 | #include <linux/module.h> |
| @@ -266,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 266 | * segment grow beyond its set limit in the case where the limit is | 265 | * segment grow beyond its set limit in the case where the limit is |
| 267 | * not page aligned -Ram Gupta | 266 | * not page aligned -Ram Gupta |
| 268 | */ | 267 | */ |
| 269 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | 268 | rlim = rlimit(RLIMIT_DATA); |
| 270 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + |
| 271 | (mm->end_data - mm->start_data) > rlim) | 270 | (mm->end_data - mm->start_data) > rlim) |
| 272 | goto out; | 271 | goto out; |
| @@ -438,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 438 | { | 437 | { |
| 439 | __vma_link_list(mm, vma, prev, rb_parent); | 438 | __vma_link_list(mm, vma, prev, rb_parent); |
| 440 | __vma_link_rb(mm, vma, rb_link, rb_parent); | 439 | __vma_link_rb(mm, vma, rb_link, rb_parent); |
| 441 | __anon_vma_link(vma); | ||
| 442 | } | 440 | } |
| 443 | 441 | ||
| 444 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 442 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
| @@ -500,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 500 | * are necessary. The "insert" vma (if any) is to be inserted | 498 | * are necessary. The "insert" vma (if any) is to be inserted |
| 501 | * before we drop the necessary locks. | 499 | * before we drop the necessary locks. |
| 502 | */ | 500 | */ |
| 503 | void vma_adjust(struct vm_area_struct *vma, unsigned long start, | 501 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
| 504 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 502 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
| 505 | { | 503 | { |
| 506 | struct mm_struct *mm = vma->vm_mm; | 504 | struct mm_struct *mm = vma->vm_mm; |
| @@ -543,6 +541,26 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 543 | } | 541 | } |
| 544 | } | 542 | } |
| 545 | 543 | ||
| 544 | /* | ||
| 545 | * When changing only vma->vm_end, we don't really need anon_vma lock. | ||
| 546 | */ | ||
| 547 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
| 548 | anon_vma = vma->anon_vma; | ||
| 549 | if (anon_vma) { | ||
| 550 | /* | ||
| 551 | * Easily overlooked: when mprotect shifts the boundary, | ||
| 552 | * make sure the expanding vma has anon_vma set if the | ||
| 553 | * shrinking vma had, to cover any anon pages imported. | ||
| 554 | */ | ||
| 555 | if (importer && !importer->anon_vma) { | ||
| 556 | /* Block reverse map lookups until things are set up. */ | ||
| 557 | if (anon_vma_clone(importer, vma)) { | ||
| 558 | return -ENOMEM; | ||
| 559 | } | ||
| 560 | importer->anon_vma = anon_vma; | ||
| 561 | } | ||
| 562 | } | ||
| 563 | |||
| 546 | if (file) { | 564 | if (file) { |
| 547 | mapping = file->f_mapping; | 565 | mapping = file->f_mapping; |
| 548 | if (!(vma->vm_flags & VM_NONLINEAR)) | 566 | if (!(vma->vm_flags & VM_NONLINEAR)) |
| @@ -568,25 +586,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 568 | } | 586 | } |
| 569 | } | 587 | } |
| 570 | 588 | ||
| 571 | /* | ||
| 572 | * When changing only vma->vm_end, we don't really need | ||
| 573 | * anon_vma lock. | ||
| 574 | */ | ||
| 575 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
| 576 | anon_vma = vma->anon_vma; | ||
| 577 | if (anon_vma) { | ||
| 578 | spin_lock(&anon_vma->lock); | ||
| 579 | /* | ||
| 580 | * Easily overlooked: when mprotect shifts the boundary, | ||
| 581 | * make sure the expanding vma has anon_vma set if the | ||
| 582 | * shrinking vma had, to cover any anon pages imported. | ||
| 583 | */ | ||
| 584 | if (importer && !importer->anon_vma) { | ||
| 585 | importer->anon_vma = anon_vma; | ||
| 586 | __anon_vma_link(importer); | ||
| 587 | } | ||
| 588 | } | ||
| 589 | |||
| 590 | if (root) { | 589 | if (root) { |
| 591 | flush_dcache_mmap_lock(mapping); | 590 | flush_dcache_mmap_lock(mapping); |
| 592 | vma_prio_tree_remove(vma, root); | 591 | vma_prio_tree_remove(vma, root); |
| @@ -617,8 +616,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 617 | __vma_unlink(mm, next, vma); | 616 | __vma_unlink(mm, next, vma); |
| 618 | if (file) | 617 | if (file) |
| 619 | __remove_shared_vm_struct(next, file, mapping); | 618 | __remove_shared_vm_struct(next, file, mapping); |
| 620 | if (next->anon_vma) | ||
| 621 | __anon_vma_merge(vma, next); | ||
| 622 | } else if (insert) { | 619 | } else if (insert) { |
| 623 | /* | 620 | /* |
| 624 | * split_vma has split insert from vma, and needs | 621 | * split_vma has split insert from vma, and needs |
| @@ -628,8 +625,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 628 | __insert_vm_struct(mm, insert); | 625 | __insert_vm_struct(mm, insert); |
| 629 | } | 626 | } |
| 630 | 627 | ||
| 631 | if (anon_vma) | ||
| 632 | spin_unlock(&anon_vma->lock); | ||
| 633 | if (mapping) | 628 | if (mapping) |
| 634 | spin_unlock(&mapping->i_mmap_lock); | 629 | spin_unlock(&mapping->i_mmap_lock); |
| 635 | 630 | ||
| @@ -639,6 +634,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 639 | if (next->vm_flags & VM_EXECUTABLE) | 634 | if (next->vm_flags & VM_EXECUTABLE) |
| 640 | removed_exe_file_vma(mm); | 635 | removed_exe_file_vma(mm); |
| 641 | } | 636 | } |
| 637 | if (next->anon_vma) | ||
| 638 | anon_vma_merge(vma, next); | ||
| 642 | mm->map_count--; | 639 | mm->map_count--; |
| 643 | mpol_put(vma_policy(next)); | 640 | mpol_put(vma_policy(next)); |
| 644 | kmem_cache_free(vm_area_cachep, next); | 641 | kmem_cache_free(vm_area_cachep, next); |
| @@ -654,6 +651,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 654 | } | 651 | } |
| 655 | 652 | ||
| 656 | validate_mm(mm); | 653 | validate_mm(mm); |
| 654 | |||
| 655 | return 0; | ||
| 657 | } | 656 | } |
| 658 | 657 | ||
| 659 | /* | 658 | /* |
| @@ -760,6 +759,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 760 | { | 759 | { |
| 761 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 760 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
| 762 | struct vm_area_struct *area, *next; | 761 | struct vm_area_struct *area, *next; |
| 762 | int err; | ||
| 763 | 763 | ||
| 764 | /* | 764 | /* |
| 765 | * We later require that vma->vm_flags == vm_flags, | 765 | * We later require that vma->vm_flags == vm_flags, |
| @@ -793,11 +793,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 793 | is_mergeable_anon_vma(prev->anon_vma, | 793 | is_mergeable_anon_vma(prev->anon_vma, |
| 794 | next->anon_vma)) { | 794 | next->anon_vma)) { |
| 795 | /* cases 1, 6 */ | 795 | /* cases 1, 6 */ |
| 796 | vma_adjust(prev, prev->vm_start, | 796 | err = vma_adjust(prev, prev->vm_start, |
| 797 | next->vm_end, prev->vm_pgoff, NULL); | 797 | next->vm_end, prev->vm_pgoff, NULL); |
| 798 | } else /* cases 2, 5, 7 */ | 798 | } else /* cases 2, 5, 7 */ |
| 799 | vma_adjust(prev, prev->vm_start, | 799 | err = vma_adjust(prev, prev->vm_start, |
| 800 | end, prev->vm_pgoff, NULL); | 800 | end, prev->vm_pgoff, NULL); |
| 801 | if (err) | ||
| 802 | return NULL; | ||
| 801 | return prev; | 803 | return prev; |
| 802 | } | 804 | } |
| 803 | 805 | ||
| @@ -809,11 +811,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
| 809 | can_vma_merge_before(next, vm_flags, | 811 | can_vma_merge_before(next, vm_flags, |
| 810 | anon_vma, file, pgoff+pglen)) { | 812 | anon_vma, file, pgoff+pglen)) { |
| 811 | if (prev && addr < prev->vm_end) /* case 4 */ | 813 | if (prev && addr < prev->vm_end) /* case 4 */ |
| 812 | vma_adjust(prev, prev->vm_start, | 814 | err = vma_adjust(prev, prev->vm_start, |
| 813 | addr, prev->vm_pgoff, NULL); | 815 | addr, prev->vm_pgoff, NULL); |
| 814 | else /* cases 3, 8 */ | 816 | else /* cases 3, 8 */ |
| 815 | vma_adjust(area, addr, next->vm_end, | 817 | err = vma_adjust(area, addr, next->vm_end, |
| 816 | next->vm_pgoff - pglen, NULL); | 818 | next->vm_pgoff - pglen, NULL); |
| 819 | if (err) | ||
| 820 | return NULL; | ||
| 817 | return area; | 821 | return area; |
| 818 | } | 822 | } |
| 819 | 823 | ||
| @@ -932,13 +936,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 932 | if (!(flags & MAP_FIXED)) | 936 | if (!(flags & MAP_FIXED)) |
| 933 | addr = round_hint_to_min(addr); | 937 | addr = round_hint_to_min(addr); |
| 934 | 938 | ||
| 935 | error = arch_mmap_check(addr, len, flags); | ||
| 936 | if (error) | ||
| 937 | return error; | ||
| 938 | |||
| 939 | /* Careful about overflows.. */ | 939 | /* Careful about overflows.. */ |
| 940 | len = PAGE_ALIGN(len); | 940 | len = PAGE_ALIGN(len); |
| 941 | if (!len || len > TASK_SIZE) | 941 | if (!len) |
| 942 | return -ENOMEM; | 942 | return -ENOMEM; |
| 943 | 943 | ||
| 944 | /* offset overflow? */ | 944 | /* offset overflow? */ |
| @@ -949,24 +949,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 949 | if (mm->map_count > sysctl_max_map_count) | 949 | if (mm->map_count > sysctl_max_map_count) |
| 950 | return -ENOMEM; | 950 | return -ENOMEM; |
| 951 | 951 | ||
| 952 | if (flags & MAP_HUGETLB) { | ||
| 953 | struct user_struct *user = NULL; | ||
| 954 | if (file) | ||
| 955 | return -EINVAL; | ||
| 956 | |||
| 957 | /* | ||
| 958 | * VM_NORESERVE is used because the reservations will be | ||
| 959 | * taken when vm_ops->mmap() is called | ||
| 960 | * A dummy user value is used because we are not locking | ||
| 961 | * memory so no accounting is necessary | ||
| 962 | */ | ||
| 963 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
| 964 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
| 965 | &user, HUGETLB_ANONHUGE_INODE); | ||
| 966 | if (IS_ERR(file)) | ||
| 967 | return PTR_ERR(file); | ||
| 968 | } | ||
| 969 | |||
| 970 | /* Obtain the address to map to. we verify (or select) it and ensure | 952 | /* Obtain the address to map to. we verify (or select) it and ensure |
| 971 | * that it represents a valid section of the address space. | 953 | * that it represents a valid section of the address space. |
| 972 | */ | 954 | */ |
| @@ -990,7 +972,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 990 | unsigned long locked, lock_limit; | 972 | unsigned long locked, lock_limit; |
| 991 | locked = len >> PAGE_SHIFT; | 973 | locked = len >> PAGE_SHIFT; |
| 992 | locked += mm->locked_vm; | 974 | locked += mm->locked_vm; |
| 993 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 975 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 994 | lock_limit >>= PAGE_SHIFT; | 976 | lock_limit >>= PAGE_SHIFT; |
| 995 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 977 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 996 | return -EAGAIN; | 978 | return -EAGAIN; |
| @@ -1061,14 +1043,51 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 1061 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 1043 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); |
| 1062 | if (error) | 1044 | if (error) |
| 1063 | return error; | 1045 | return error; |
| 1064 | error = ima_file_mmap(file, prot); | ||
| 1065 | if (error) | ||
| 1066 | return error; | ||
| 1067 | 1046 | ||
| 1068 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1047 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); |
| 1069 | } | 1048 | } |
| 1070 | EXPORT_SYMBOL(do_mmap_pgoff); | 1049 | EXPORT_SYMBOL(do_mmap_pgoff); |
| 1071 | 1050 | ||
| 1051 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
| 1052 | unsigned long, prot, unsigned long, flags, | ||
| 1053 | unsigned long, fd, unsigned long, pgoff) | ||
| 1054 | { | ||
| 1055 | struct file *file = NULL; | ||
| 1056 | unsigned long retval = -EBADF; | ||
| 1057 | |||
| 1058 | if (!(flags & MAP_ANONYMOUS)) { | ||
| 1059 | if (unlikely(flags & MAP_HUGETLB)) | ||
| 1060 | return -EINVAL; | ||
| 1061 | file = fget(fd); | ||
| 1062 | if (!file) | ||
| 1063 | goto out; | ||
| 1064 | } else if (flags & MAP_HUGETLB) { | ||
| 1065 | struct user_struct *user = NULL; | ||
| 1066 | /* | ||
| 1067 | * VM_NORESERVE is used because the reservations will be | ||
| 1068 | * taken when vm_ops->mmap() is called | ||
| 1069 | * A dummy user value is used because we are not locking | ||
| 1070 | * memory so no accounting is necessary | ||
| 1071 | */ | ||
| 1072 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
| 1073 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
| 1074 | &user, HUGETLB_ANONHUGE_INODE); | ||
| 1075 | if (IS_ERR(file)) | ||
| 1076 | return PTR_ERR(file); | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
| 1080 | |||
| 1081 | down_write(¤t->mm->mmap_sem); | ||
| 1082 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
| 1083 | up_write(¤t->mm->mmap_sem); | ||
| 1084 | |||
| 1085 | if (file) | ||
| 1086 | fput(file); | ||
| 1087 | out: | ||
| 1088 | return retval; | ||
| 1089 | } | ||
| 1090 | |||
| 1072 | /* | 1091 | /* |
| 1073 | * Some shared mappigns will want the pages marked read-only | 1092 | * Some shared mappigns will want the pages marked read-only |
| 1074 | * to track write events. If so, we'll downgrade vm_page_prot | 1093 | * to track write events. If so, we'll downgrade vm_page_prot |
| @@ -1191,6 +1210,7 @@ munmap_back: | |||
| 1191 | vma->vm_flags = vm_flags; | 1210 | vma->vm_flags = vm_flags; |
| 1192 | vma->vm_page_prot = vm_get_page_prot(vm_flags); | 1211 | vma->vm_page_prot = vm_get_page_prot(vm_flags); |
| 1193 | vma->vm_pgoff = pgoff; | 1212 | vma->vm_pgoff = pgoff; |
| 1213 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
| 1194 | 1214 | ||
| 1195 | if (file) { | 1215 | if (file) { |
| 1196 | error = -EINVAL; | 1216 | error = -EINVAL; |
| @@ -1224,8 +1244,20 @@ munmap_back: | |||
| 1224 | goto free_vma; | 1244 | goto free_vma; |
| 1225 | } | 1245 | } |
| 1226 | 1246 | ||
| 1227 | if (vma_wants_writenotify(vma)) | 1247 | if (vma_wants_writenotify(vma)) { |
| 1248 | pgprot_t pprot = vma->vm_page_prot; | ||
| 1249 | |||
| 1250 | /* Can vma->vm_page_prot have changed?? | ||
| 1251 | * | ||
| 1252 | * Answer: Yes, drivers may have changed it in their | ||
| 1253 | * f_op->mmap method. | ||
| 1254 | * | ||
| 1255 | * Ensures that vmas marked as uncached stay that way. | ||
| 1256 | */ | ||
| 1228 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1257 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
| 1258 | if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) | ||
| 1259 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); | ||
| 1260 | } | ||
| 1229 | 1261 | ||
| 1230 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1262 | vma_link(mm, vma, prev, rb_link, rb_parent); |
| 1231 | file = vma->vm_file; | 1263 | file = vma->vm_file; |
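The vm_page_prot re-check above exists because a driver's ->mmap method may already have marked the mapping uncached, and the write-notify downgrade must not undo that. A hedged sketch of the driver pattern in question (mydev_mmap() and mydev_vm_ops are hypothetical, not from this patch):

	static const struct vm_operations_struct mydev_vm_ops = {
		/* fault handler for the device mapping would go here */
	};

	/* Hypothetical driver ->mmap: the vma is marked uncached here, before
	 * mmap_region() applies its write-notify protection change. */
	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
		vma->vm_ops = &mydev_vm_ops;
		return 0;
	}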
| @@ -1239,13 +1271,8 @@ out: | |||
| 1239 | mm->total_vm += len >> PAGE_SHIFT; | 1271 | mm->total_vm += len >> PAGE_SHIFT; |
| 1240 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1272 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
| 1241 | if (vm_flags & VM_LOCKED) { | 1273 | if (vm_flags & VM_LOCKED) { |
| 1242 | /* | 1274 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
| 1243 | * makes pages present; downgrades, drops, reacquires mmap_sem | 1275 | mm->locked_vm += (len >> PAGE_SHIFT); |
| 1244 | */ | ||
| 1245 | long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); | ||
| 1246 | if (nr_pages < 0) | ||
| 1247 | return nr_pages; /* vma gone! */ | ||
| 1248 | mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; | ||
| 1249 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1276 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
| 1250 | make_pages_present(addr, addr + len); | 1277 | make_pages_present(addr, addr + len); |
| 1251 | return addr; | 1278 | return addr; |
| @@ -1459,6 +1486,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | |||
| 1459 | unsigned long (*get_area)(struct file *, unsigned long, | 1486 | unsigned long (*get_area)(struct file *, unsigned long, |
| 1460 | unsigned long, unsigned long, unsigned long); | 1487 | unsigned long, unsigned long, unsigned long); |
| 1461 | 1488 | ||
| 1489 | unsigned long error = arch_mmap_check(addr, len, flags); | ||
| 1490 | if (error) | ||
| 1491 | return error; | ||
| 1492 | |||
| 1493 | /* Careful about overflows.. */ | ||
| 1494 | if (len > TASK_SIZE) | ||
| 1495 | return -ENOMEM; | ||
| 1496 | |||
| 1462 | get_area = current->mm->get_unmapped_area; | 1497 | get_area = current->mm->get_unmapped_area; |
| 1463 | if (file && file->f_op && file->f_op->get_unmapped_area) | 1498 | if (file && file->f_op && file->f_op->get_unmapped_area) |
| 1464 | get_area = file->f_op->get_unmapped_area; | 1499 | get_area = file->f_op->get_unmapped_area; |
| @@ -1565,7 +1600,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 1565 | return -ENOMEM; | 1600 | return -ENOMEM; |
| 1566 | 1601 | ||
| 1567 | /* Stack limit test */ | 1602 | /* Stack limit test */ |
| 1568 | if (size > rlim[RLIMIT_STACK].rlim_cur) | 1603 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
| 1569 | return -ENOMEM; | 1604 | return -ENOMEM; |
| 1570 | 1605 | ||
| 1571 | /* mlock limit tests */ | 1606 | /* mlock limit tests */ |
| @@ -1573,7 +1608,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 1573 | unsigned long locked; | 1608 | unsigned long locked; |
| 1574 | unsigned long limit; | 1609 | unsigned long limit; |
| 1575 | locked = mm->locked_vm + grow; | 1610 | locked = mm->locked_vm + grow; |
| 1576 | limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 1611 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
| 1612 | limit >>= PAGE_SHIFT; | ||
| 1577 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 1613 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
| 1578 | return -ENOMEM; | 1614 | return -ENOMEM; |
| 1579 | } | 1615 | } |
| @@ -1720,8 +1756,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 1720 | if (!prev || expand_stack(prev, addr)) | 1756 | if (!prev || expand_stack(prev, addr)) |
| 1721 | return NULL; | 1757 | return NULL; |
| 1722 | if (prev->vm_flags & VM_LOCKED) { | 1758 | if (prev->vm_flags & VM_LOCKED) { |
| 1723 | if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) | 1759 | mlock_vma_pages_range(prev, addr, prev->vm_end); |
| 1724 | return NULL; /* vma gone! */ | ||
| 1725 | } | 1760 | } |
| 1726 | return prev; | 1761 | return prev; |
| 1727 | } | 1762 | } |
| @@ -1749,8 +1784,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
| 1749 | if (expand_stack(vma, addr)) | 1784 | if (expand_stack(vma, addr)) |
| 1750 | return NULL; | 1785 | return NULL; |
| 1751 | if (vma->vm_flags & VM_LOCKED) { | 1786 | if (vma->vm_flags & VM_LOCKED) { |
| 1752 | if (mlock_vma_pages_range(vma, addr, start) < 0) | 1787 | mlock_vma_pages_range(vma, addr, start); |
| 1753 | return NULL; /* vma gone! */ | ||
| 1754 | } | 1788 | } |
| 1755 | return vma; | 1789 | return vma; |
| 1756 | } | 1790 | } |
| @@ -1829,29 +1863,29 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1829 | } | 1863 | } |
| 1830 | 1864 | ||
| 1831 | /* | 1865 | /* |
| 1832 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 1866 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the |
| 1833 | * either for the first part or the tail. | 1867 | * munmap path where it doesn't make sense to fail. |
| 1834 | */ | 1868 | */ |
| 1835 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 1869 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
| 1836 | unsigned long addr, int new_below) | 1870 | unsigned long addr, int new_below) |
| 1837 | { | 1871 | { |
| 1838 | struct mempolicy *pol; | 1872 | struct mempolicy *pol; |
| 1839 | struct vm_area_struct *new; | 1873 | struct vm_area_struct *new; |
| 1874 | int err = -ENOMEM; | ||
| 1840 | 1875 | ||
| 1841 | if (is_vm_hugetlb_page(vma) && (addr & | 1876 | if (is_vm_hugetlb_page(vma) && (addr & |
| 1842 | ~(huge_page_mask(hstate_vma(vma))))) | 1877 | ~(huge_page_mask(hstate_vma(vma))))) |
| 1843 | return -EINVAL; | 1878 | return -EINVAL; |
| 1844 | 1879 | ||
| 1845 | if (mm->map_count >= sysctl_max_map_count) | ||
| 1846 | return -ENOMEM; | ||
| 1847 | |||
| 1848 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 1880 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
| 1849 | if (!new) | 1881 | if (!new) |
| 1850 | return -ENOMEM; | 1882 | goto out_err; |
| 1851 | 1883 | ||
| 1852 | /* most fields are the same, copy all, and then fixup */ | 1884 | /* most fields are the same, copy all, and then fixup */ |
| 1853 | *new = *vma; | 1885 | *new = *vma; |
| 1854 | 1886 | ||
| 1887 | INIT_LIST_HEAD(&new->anon_vma_chain); | ||
| 1888 | |||
| 1855 | if (new_below) | 1889 | if (new_below) |
| 1856 | new->vm_end = addr; | 1890 | new->vm_end = addr; |
| 1857 | else { | 1891 | else { |
| @@ -1861,11 +1895,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1861 | 1895 | ||
| 1862 | pol = mpol_dup(vma_policy(vma)); | 1896 | pol = mpol_dup(vma_policy(vma)); |
| 1863 | if (IS_ERR(pol)) { | 1897 | if (IS_ERR(pol)) { |
| 1864 | kmem_cache_free(vm_area_cachep, new); | 1898 | err = PTR_ERR(pol); |
| 1865 | return PTR_ERR(pol); | 1899 | goto out_free_vma; |
| 1866 | } | 1900 | } |
| 1867 | vma_set_policy(new, pol); | 1901 | vma_set_policy(new, pol); |
| 1868 | 1902 | ||
| 1903 | if (anon_vma_clone(new, vma)) | ||
| 1904 | goto out_free_mpol; | ||
| 1905 | |||
| 1869 | if (new->vm_file) { | 1906 | if (new->vm_file) { |
| 1870 | get_file(new->vm_file); | 1907 | get_file(new->vm_file); |
| 1871 | if (vma->vm_flags & VM_EXECUTABLE) | 1908 | if (vma->vm_flags & VM_EXECUTABLE) |
| @@ -1876,12 +1913,41 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
| 1876 | new->vm_ops->open(new); | 1913 | new->vm_ops->open(new); |
| 1877 | 1914 | ||
| 1878 | if (new_below) | 1915 | if (new_below) |
| 1879 | vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + | 1916 | err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + |
| 1880 | ((addr - new->vm_start) >> PAGE_SHIFT), new); | 1917 | ((addr - new->vm_start) >> PAGE_SHIFT), new); |
| 1881 | else | 1918 | else |
| 1882 | vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | 1919 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); |
| 1883 | 1920 | ||
| 1884 | return 0; | 1921 | /* Success. */ |
| 1922 | if (!err) | ||
| 1923 | return 0; | ||
| 1924 | |||
| 1925 | /* Clean everything up if vma_adjust failed. */ | ||
| 1926 | new->vm_ops->close(new); | ||
| 1927 | if (new->vm_file) { | ||
| 1928 | if (vma->vm_flags & VM_EXECUTABLE) | ||
| 1929 | removed_exe_file_vma(mm); | ||
| 1930 | fput(new->vm_file); | ||
| 1931 | } | ||
| 1932 | out_free_mpol: | ||
| 1933 | mpol_put(pol); | ||
| 1934 | out_free_vma: | ||
| 1935 | kmem_cache_free(vm_area_cachep, new); | ||
| 1936 | out_err: | ||
| 1937 | return err; | ||
| 1938 | } | ||
| 1939 | |||
| 1940 | /* | ||
| 1941 | * Split a vma into two pieces at address 'addr', a new vma is allocated | ||
| 1942 | * either for the first part or the tail. | ||
| 1943 | */ | ||
| 1944 | int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 1945 | unsigned long addr, int new_below) | ||
| 1946 | { | ||
| 1947 | if (mm->map_count >= sysctl_max_map_count) | ||
| 1948 | return -ENOMEM; | ||
| 1949 | |||
| 1950 | return __split_vma(mm, vma, addr, new_below); | ||
| 1885 | } | 1951 | } |
| 1886 | 1952 | ||
| 1887 | /* Munmap is split into 2 main parts -- this part which finds | 1953 | /* Munmap is split into 2 main parts -- this part which finds |
| @@ -1919,7 +1985,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
| 1919 | * places tmp vma above, and higher split_vma places tmp vma below. | 1985 | * places tmp vma above, and higher split_vma places tmp vma below. |
| 1920 | */ | 1986 | */ |
| 1921 | if (start > vma->vm_start) { | 1987 | if (start > vma->vm_start) { |
| 1922 | int error = split_vma(mm, vma, start, 0); | 1988 | int error; |
| 1989 | |||
| 1990 | /* | ||
| 1991 | * Make sure that map_count on return from munmap() will | ||
| 1992 | * not exceed its limit; but let map_count go just above | ||
| 1993 | * its limit temporarily, to help free resources as expected. | ||
| 1994 | */ | ||
| 1995 | if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) | ||
| 1996 | return -ENOMEM; | ||
| 1997 | |||
| 1998 | error = __split_vma(mm, vma, start, 0); | ||
| 1923 | if (error) | 1999 | if (error) |
| 1924 | return error; | 2000 | return error; |
| 1925 | prev = vma; | 2001 | prev = vma; |
| @@ -1928,7 +2004,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
| 1928 | /* Does it split the last one? */ | 2004 | /* Does it split the last one? */ |
| 1929 | last = find_vma(mm, end); | 2005 | last = find_vma(mm, end); |
| 1930 | if (last && end > last->vm_start) { | 2006 | if (last && end > last->vm_start) { |
| 1931 | int error = split_vma(mm, last, end, 1); | 2007 | int error = __split_vma(mm, last, end, 1); |
| 1932 | if (error) | 2008 | if (error) |
| 1933 | return error; | 2009 | return error; |
| 1934 | } | 2010 | } |
| @@ -2003,20 +2079,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2003 | if (!len) | 2079 | if (!len) |
| 2004 | return addr; | 2080 | return addr; |
| 2005 | 2081 | ||
| 2006 | if ((addr + len) > TASK_SIZE || (addr + len) < addr) | ||
| 2007 | return -EINVAL; | ||
| 2008 | |||
| 2009 | if (is_hugepage_only_range(mm, addr, len)) | ||
| 2010 | return -EINVAL; | ||
| 2011 | |||
| 2012 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); | 2082 | error = security_file_mmap(NULL, 0, 0, 0, addr, 1); |
| 2013 | if (error) | 2083 | if (error) |
| 2014 | return error; | 2084 | return error; |
| 2015 | 2085 | ||
| 2016 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; | 2086 | flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; |
| 2017 | 2087 | ||
| 2018 | error = arch_mmap_check(addr, len, flags); | 2088 | error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); |
| 2019 | if (error) | 2089 | if (error & ~PAGE_MASK) |
| 2020 | return error; | 2090 | return error; |
| 2021 | 2091 | ||
| 2022 | /* | 2092 | /* |
| @@ -2026,7 +2096,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2026 | unsigned long locked, lock_limit; | 2096 | unsigned long locked, lock_limit; |
| 2027 | locked = len >> PAGE_SHIFT; | 2097 | locked = len >> PAGE_SHIFT; |
| 2028 | locked += mm->locked_vm; | 2098 | locked += mm->locked_vm; |
| 2029 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2099 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 2030 | lock_limit >>= PAGE_SHIFT; | 2100 | lock_limit >>= PAGE_SHIFT; |
| 2031 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 2101 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 2032 | return -EAGAIN; | 2102 | return -EAGAIN; |
| @@ -2074,6 +2144,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2074 | return -ENOMEM; | 2144 | return -ENOMEM; |
| 2075 | } | 2145 | } |
| 2076 | 2146 | ||
| 2147 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
| 2077 | vma->vm_mm = mm; | 2148 | vma->vm_mm = mm; |
| 2078 | vma->vm_start = addr; | 2149 | vma->vm_start = addr; |
| 2079 | vma->vm_end = addr + len; | 2150 | vma->vm_end = addr + len; |
| @@ -2210,10 +2281,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
| 2210 | if (new_vma) { | 2281 | if (new_vma) { |
| 2211 | *new_vma = *vma; | 2282 | *new_vma = *vma; |
| 2212 | pol = mpol_dup(vma_policy(vma)); | 2283 | pol = mpol_dup(vma_policy(vma)); |
| 2213 | if (IS_ERR(pol)) { | 2284 | if (IS_ERR(pol)) |
| 2214 | kmem_cache_free(vm_area_cachep, new_vma); | 2285 | goto out_free_vma; |
| 2215 | return NULL; | 2286 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
| 2216 | } | 2287 | if (anon_vma_clone(new_vma, vma)) |
| 2288 | goto out_free_mempol; | ||
| 2217 | vma_set_policy(new_vma, pol); | 2289 | vma_set_policy(new_vma, pol); |
| 2218 | new_vma->vm_start = addr; | 2290 | new_vma->vm_start = addr; |
| 2219 | new_vma->vm_end = addr + len; | 2291 | new_vma->vm_end = addr + len; |
| @@ -2229,6 +2301,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
| 2229 | } | 2301 | } |
| 2230 | } | 2302 | } |
| 2231 | return new_vma; | 2303 | return new_vma; |
| 2304 | |||
| 2305 | out_free_mempol: | ||
| 2306 | mpol_put(pol); | ||
| 2307 | out_free_vma: | ||
| 2308 | kmem_cache_free(vm_area_cachep, new_vma); | ||
| 2309 | return NULL; | ||
| 2232 | } | 2310 | } |
| 2233 | 2311 | ||
| 2234 | /* | 2312 | /* |
| @@ -2240,7 +2318,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) | |||
| 2240 | unsigned long cur = mm->total_vm; /* pages */ | 2318 | unsigned long cur = mm->total_vm; /* pages */ |
| 2241 | unsigned long lim; | 2319 | unsigned long lim; |
| 2242 | 2320 | ||
| 2243 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 2321 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; |
| 2244 | 2322 | ||
| 2245 | if (cur + npages > lim) | 2323 | if (cur + npages > lim) |
| 2246 | return 0; | 2324 | return 0; |
| @@ -2306,6 +2384,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
| 2306 | if (unlikely(vma == NULL)) | 2384 | if (unlikely(vma == NULL)) |
| 2307 | return -ENOMEM; | 2385 | return -ENOMEM; |
| 2308 | 2386 | ||
| 2387 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
| 2309 | vma->vm_mm = mm; | 2388 | vma->vm_mm = mm; |
| 2310 | vma->vm_start = addr; | 2389 | vma->vm_start = addr; |
| 2311 | vma->vm_end = addr + len; | 2390 | vma->vm_end = addr + len; |
| @@ -2406,6 +2485,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
| 2406 | int mm_take_all_locks(struct mm_struct *mm) | 2485 | int mm_take_all_locks(struct mm_struct *mm) |
| 2407 | { | 2486 | { |
| 2408 | struct vm_area_struct *vma; | 2487 | struct vm_area_struct *vma; |
| 2488 | struct anon_vma_chain *avc; | ||
| 2409 | int ret = -EINTR; | 2489 | int ret = -EINTR; |
| 2410 | 2490 | ||
| 2411 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2491 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
| @@ -2423,7 +2503,8 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
| 2423 | if (signal_pending(current)) | 2503 | if (signal_pending(current)) |
| 2424 | goto out_unlock; | 2504 | goto out_unlock; |
| 2425 | if (vma->anon_vma) | 2505 | if (vma->anon_vma) |
| 2426 | vm_lock_anon_vma(mm, vma->anon_vma); | 2506 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
| 2507 | vm_lock_anon_vma(mm, avc->anon_vma); | ||
| 2427 | } | 2508 | } |
| 2428 | 2509 | ||
| 2429 | ret = 0; | 2510 | ret = 0; |
| @@ -2478,13 +2559,15 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
| 2478 | void mm_drop_all_locks(struct mm_struct *mm) | 2559 | void mm_drop_all_locks(struct mm_struct *mm) |
| 2479 | { | 2560 | { |
| 2480 | struct vm_area_struct *vma; | 2561 | struct vm_area_struct *vma; |
| 2562 | struct anon_vma_chain *avc; | ||
| 2481 | 2563 | ||
| 2482 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2564 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
| 2483 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 2565 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
| 2484 | 2566 | ||
| 2485 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 2567 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
| 2486 | if (vma->anon_vma) | 2568 | if (vma->anon_vma) |
| 2487 | vm_unlock_anon_vma(vma->anon_vma); | 2569 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
| 2570 | vm_unlock_anon_vma(avc->anon_vma); | ||
| 2488 | if (vma->vm_file && vma->vm_file->f_mapping) | 2571 | if (vma->vm_file && vma->vm_file->f_mapping) |
| 2489 | vm_unlock_mapping(vma->vm_file->f_mapping); | 2572 | vm_unlock_mapping(vma->vm_file->f_mapping); |
| 2490 | } | 2573 | } |
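Both loops above now walk a VMA's anon_vma_chain instead of the single vma->anon_vma pointer. A minimal sketch of that iteration pattern, assuming only the fields visible in these hunks (an `anon_vma` pointer plus a `same_vma` list node per chain entry):

```c
#include <linux/mm_types.h>
#include <linux/rmap.h>

/* Illustrative helper (not part of the patch): apply fn to every anon_vma
 * reachable from one VMA via its anon_vma_chain. */
static void for_each_vma_anon_vma(struct vm_area_struct *vma,
				  void (*fn)(struct anon_vma *anon_vma))
{
	struct anon_vma_chain *avc;

	/* same_vma links all chain entries belonging to this VMA */
	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
		fn(avc->anon_vma);
}
```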
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081f4021..0777654147c9 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
| @@ -5,6 +5,7 @@ | |||
| 5 | 5 | ||
| 6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
| 7 | #include <linux/mmu_context.h> | 7 | #include <linux/mmu_context.h> |
| 8 | #include <linux/module.h> | ||
| 8 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
| 9 | 10 | ||
| 10 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
| @@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) | |||
| 37 | if (active_mm != mm) | 38 | if (active_mm != mm) |
| 38 | mmdrop(active_mm); | 39 | mmdrop(active_mm); |
| 39 | } | 40 | } |
| 41 | EXPORT_SYMBOL_GPL(use_mm); | ||
| 40 | 42 | ||
| 41 | /* | 43 | /* |
| 42 | * unuse_mm | 44 | * unuse_mm |
| @@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm) | |||
| 56 | enter_lazy_tlb(mm, tsk); | 58 | enter_lazy_tlb(mm, tsk); |
| 57 | task_unlock(tsk); | 59 | task_unlock(tsk); |
| 58 | } | 60 | } |
| 61 | EXPORT_SYMBOL_GPL(unuse_mm); | ||
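With use_mm() and unuse_mm() now exported (GPL-only), a module-side kernel thread can temporarily adopt a user process's address space. A hypothetical caller sketch; the function name and arguments are illustrative and not taken from this patch:

```c
#include <linux/errno.h>
#include <linux/mm_types.h>
#include <linux/mmu_context.h>
#include <linux/uaccess.h>

/* Hypothetical module helper: read an int from a user address belonging to
 * the mm we were handed, from kernel-thread context. */
static int read_user_int(struct mm_struct *mm, int __user *uptr, int *out)
{
	int ret = 0;

	use_mm(mm);		/* make mm this kthread's current mm */
	if (copy_from_user(out, uptr, sizeof(*out)))
		ret = -EFAULT;
	unuse_mm(mm);		/* detach again; mm remains the lazy active_mm */
	return ret;
}
```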
diff --git a/mm/mremap.c b/mm/mremap.c index 97bff2547719..e9c75efce609 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 261 | return new_addr; | 261 | return new_addr; |
| 262 | } | 262 | } |
| 263 | 263 | ||
| 264 | static struct vm_area_struct *vma_to_resize(unsigned long addr, | ||
| 265 | unsigned long old_len, unsigned long new_len, unsigned long *p) | ||
| 266 | { | ||
| 267 | struct mm_struct *mm = current->mm; | ||
| 268 | struct vm_area_struct *vma = find_vma(mm, addr); | ||
| 269 | |||
| 270 | if (!vma || vma->vm_start > addr) | ||
| 271 | goto Efault; | ||
| 272 | |||
| 273 | if (is_vm_hugetlb_page(vma)) | ||
| 274 | goto Einval; | ||
| 275 | |||
| 276 | /* We can't remap across vm area boundaries */ | ||
| 277 | if (old_len > vma->vm_end - addr) | ||
| 278 | goto Efault; | ||
| 279 | |||
| 280 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
| 281 | if (new_len > old_len) | ||
| 282 | goto Efault; | ||
| 283 | } | ||
| 284 | |||
| 285 | if (vma->vm_flags & VM_LOCKED) { | ||
| 286 | unsigned long locked, lock_limit; | ||
| 287 | locked = mm->locked_vm << PAGE_SHIFT; | ||
| 288 | lock_limit = rlimit(RLIMIT_MEMLOCK); | ||
| 289 | locked += new_len - old_len; | ||
| 290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 291 | goto Eagain; | ||
| 292 | } | ||
| 293 | |||
| 294 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) | ||
| 295 | goto Enomem; | ||
| 296 | |||
| 297 | if (vma->vm_flags & VM_ACCOUNT) { | ||
| 298 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | ||
| 299 | if (security_vm_enough_memory(charged)) | ||
| 300 | goto Efault; | ||
| 301 | *p = charged; | ||
| 302 | } | ||
| 303 | |||
| 304 | return vma; | ||
| 305 | |||
| 306 | Efault: /* very odd choice for most of the cases, but... */ | ||
| 307 | return ERR_PTR(-EFAULT); | ||
| 308 | Einval: | ||
| 309 | return ERR_PTR(-EINVAL); | ||
| 310 | Enomem: | ||
| 311 | return ERR_PTR(-ENOMEM); | ||
| 312 | Eagain: | ||
| 313 | return ERR_PTR(-EAGAIN); | ||
| 314 | } | ||
| 315 | |||
| 316 | static unsigned long mremap_to(unsigned long addr, | ||
| 317 | unsigned long old_len, unsigned long new_addr, | ||
| 318 | unsigned long new_len) | ||
| 319 | { | ||
| 320 | struct mm_struct *mm = current->mm; | ||
| 321 | struct vm_area_struct *vma; | ||
| 322 | unsigned long ret = -EINVAL; | ||
| 323 | unsigned long charged = 0; | ||
| 324 | unsigned long map_flags; | ||
| 325 | |||
| 326 | if (new_addr & ~PAGE_MASK) | ||
| 327 | goto out; | ||
| 328 | |||
| 329 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
| 330 | goto out; | ||
| 331 | |||
| 332 | /* Check if the location we're moving into overlaps the | ||
| 333 | * old location at all, and fail if it does. | ||
| 334 | */ | ||
| 335 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
| 336 | goto out; | ||
| 337 | |||
| 338 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
| 339 | goto out; | ||
| 340 | |||
| 341 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
| 342 | if (ret) | ||
| 343 | goto out; | ||
| 344 | |||
| 345 | ret = do_munmap(mm, new_addr, new_len); | ||
| 346 | if (ret) | ||
| 347 | goto out; | ||
| 348 | |||
| 349 | if (old_len >= new_len) { | ||
| 350 | ret = do_munmap(mm, addr+new_len, old_len - new_len); | ||
| 351 | if (ret && old_len != new_len) | ||
| 352 | goto out; | ||
| 353 | old_len = new_len; | ||
| 354 | } | ||
| 355 | |||
| 356 | vma = vma_to_resize(addr, old_len, new_len, &charged); | ||
| 357 | if (IS_ERR(vma)) { | ||
| 358 | ret = PTR_ERR(vma); | ||
| 359 | goto out; | ||
| 360 | } | ||
| 361 | |||
| 362 | map_flags = MAP_FIXED; | ||
| 363 | if (vma->vm_flags & VM_MAYSHARE) | ||
| 364 | map_flags |= MAP_SHARED; | ||
| 365 | |||
| 366 | ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + | ||
| 367 | ((addr - vma->vm_start) >> PAGE_SHIFT), | ||
| 368 | map_flags); | ||
| 369 | if (ret & ~PAGE_MASK) | ||
| 370 | goto out1; | ||
| 371 | |||
| 372 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | ||
| 373 | if (!(ret & ~PAGE_MASK)) | ||
| 374 | goto out; | ||
| 375 | out1: | ||
| 376 | vm_unacct_memory(charged); | ||
| 377 | |||
| 378 | out: | ||
| 379 | return ret; | ||
| 380 | } | ||
| 381 | |||
| 382 | static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) | ||
| 383 | { | ||
| 384 | unsigned long end = vma->vm_end + delta; | ||
| 385 | if (end < vma->vm_end) /* overflow */ | ||
| 386 | return 0; | ||
| 387 | if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ | ||
| 388 | return 0; | ||
| 389 | if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, | ||
| 390 | 0, MAP_FIXED) & ~PAGE_MASK) | ||
| 391 | return 0; | ||
| 392 | return 1; | ||
| 393 | } | ||
| 394 | |||
| 264 | /* | 395 | /* |
| 265 | * Expand (or shrink) an existing mapping, potentially moving it at the | 396 | * Expand (or shrink) an existing mapping, potentially moving it at the |
| 266 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) | 397 | * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) |
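The new vma_to_resize() reports failures through the returned pointer itself rather than via a separate status out-parameter. A generic sketch of that error-pointer convention (ERR_PTR() on the callee side, IS_ERR()/PTR_ERR() on the caller side), which the mremap_to() and do_mremap() callers in the hunks below rely on; this is illustration, not code from the patch:

```c
#include <linux/err.h>
#include <linux/mm_types.h>

static struct vm_area_struct dummy_vma;	/* stand-in for a real lookup result */

static struct vm_area_struct *lookup_or_fail(int fail)
{
	if (fail)
		return ERR_PTR(-EINVAL);	/* error code encoded in the pointer */
	return &dummy_vma;
}

static long caller_example(void)
{
	struct vm_area_struct *vma = lookup_or_fail(0);

	if (IS_ERR(vma))
		return PTR_ERR(vma);		/* decode back to -EINVAL etc. */
	return 0;				/* vma is a valid pointer here */
}
```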
| @@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr, | |||
| 294 | if (!new_len) | 425 | if (!new_len) |
| 295 | goto out; | 426 | goto out; |
| 296 | 427 | ||
| 297 | /* new_addr is only valid if MREMAP_FIXED is specified */ | ||
| 298 | if (flags & MREMAP_FIXED) { | 428 | if (flags & MREMAP_FIXED) { |
| 299 | if (new_addr & ~PAGE_MASK) | 429 | if (flags & MREMAP_MAYMOVE) |
| 300 | goto out; | 430 | ret = mremap_to(addr, old_len, new_addr, new_len); |
| 301 | if (!(flags & MREMAP_MAYMOVE)) | 431 | goto out; |
| 302 | goto out; | ||
| 303 | |||
| 304 | if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) | ||
| 305 | goto out; | ||
| 306 | |||
| 307 | /* Check if the location we're moving into overlaps the | ||
| 308 | * old location at all, and fail if it does. | ||
| 309 | */ | ||
| 310 | if ((new_addr <= addr) && (new_addr+new_len) > addr) | ||
| 311 | goto out; | ||
| 312 | |||
| 313 | if ((addr <= new_addr) && (addr+old_len) > new_addr) | ||
| 314 | goto out; | ||
| 315 | |||
| 316 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
| 317 | if (ret) | ||
| 318 | goto out; | ||
| 319 | |||
| 320 | ret = do_munmap(mm, new_addr, new_len); | ||
| 321 | if (ret) | ||
| 322 | goto out; | ||
| 323 | } | 432 | } |
| 324 | 433 | ||
| 325 | /* | 434 | /* |
| @@ -332,64 +441,30 @@ unsigned long do_mremap(unsigned long addr, | |||
| 332 | if (ret && old_len != new_len) | 441 | if (ret && old_len != new_len) |
| 333 | goto out; | 442 | goto out; |
| 334 | ret = addr; | 443 | ret = addr; |
| 335 | if (!(flags & MREMAP_FIXED) || (new_addr == addr)) | 444 | goto out; |
| 336 | goto out; | ||
| 337 | old_len = new_len; | ||
| 338 | } | 445 | } |
| 339 | 446 | ||
| 340 | /* | 447 | /* |
| 341 | * Ok, we need to grow.. or relocate. | 448 | * Ok, we need to grow.. |
| 342 | */ | 449 | */ |
| 343 | ret = -EFAULT; | 450 | vma = vma_to_resize(addr, old_len, new_len, &charged); |
| 344 | vma = find_vma(mm, addr); | 451 | if (IS_ERR(vma)) { |
| 345 | if (!vma || vma->vm_start > addr) | 452 | ret = PTR_ERR(vma); |
| 346 | goto out; | ||
| 347 | if (is_vm_hugetlb_page(vma)) { | ||
| 348 | ret = -EINVAL; | ||
| 349 | goto out; | ||
| 350 | } | ||
| 351 | /* We can't remap across vm area boundaries */ | ||
| 352 | if (old_len > vma->vm_end - addr) | ||
| 353 | goto out; | ||
| 354 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { | ||
| 355 | if (new_len > old_len) | ||
| 356 | goto out; | ||
| 357 | } | ||
| 358 | if (vma->vm_flags & VM_LOCKED) { | ||
| 359 | unsigned long locked, lock_limit; | ||
| 360 | locked = mm->locked_vm << PAGE_SHIFT; | ||
| 361 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | ||
| 362 | locked += new_len - old_len; | ||
| 363 | ret = -EAGAIN; | ||
| 364 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | ||
| 365 | goto out; | ||
| 366 | } | ||
| 367 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { | ||
| 368 | ret = -ENOMEM; | ||
| 369 | goto out; | 453 | goto out; |
| 370 | } | 454 | } |
| 371 | 455 | ||
| 372 | if (vma->vm_flags & VM_ACCOUNT) { | ||
| 373 | charged = (new_len - old_len) >> PAGE_SHIFT; | ||
| 374 | if (security_vm_enough_memory(charged)) | ||
| 375 | goto out_nc; | ||
| 376 | } | ||
| 377 | |||
| 378 | /* old_len exactly to the end of the area.. | 456 | /* old_len exactly to the end of the area.. |
| 379 | * And we're not relocating the area. | ||
| 380 | */ | 457 | */ |
| 381 | if (old_len == vma->vm_end - addr && | 458 | if (old_len == vma->vm_end - addr) { |
| 382 | !((flags & MREMAP_FIXED) && (addr != new_addr)) && | ||
| 383 | (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { | ||
| 384 | unsigned long max_addr = TASK_SIZE; | ||
| 385 | if (vma->vm_next) | ||
| 386 | max_addr = vma->vm_next->vm_start; | ||
| 387 | /* can we just expand the current mapping? */ | 459 | /* can we just expand the current mapping? */ |
| 388 | if (max_addr - addr >= new_len) { | 460 | if (vma_expandable(vma, new_len - old_len)) { |
| 389 | int pages = (new_len - old_len) >> PAGE_SHIFT; | 461 | int pages = (new_len - old_len) >> PAGE_SHIFT; |
| 390 | 462 | ||
| 391 | vma_adjust(vma, vma->vm_start, | 463 | if (vma_adjust(vma, vma->vm_start, addr + new_len, |
| 392 | addr + new_len, vma->vm_pgoff, NULL); | 464 | vma->vm_pgoff, NULL)) { |
| 465 | ret = -ENOMEM; | ||
| 466 | goto out; | ||
| 467 | } | ||
| 393 | 468 | ||
| 394 | mm->total_vm += pages; | 469 | mm->total_vm += pages; |
| 395 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 470 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
| @@ -409,28 +484,27 @@ unsigned long do_mremap(unsigned long addr, | |||
| 409 | */ | 484 | */ |
| 410 | ret = -ENOMEM; | 485 | ret = -ENOMEM; |
| 411 | if (flags & MREMAP_MAYMOVE) { | 486 | if (flags & MREMAP_MAYMOVE) { |
| 412 | if (!(flags & MREMAP_FIXED)) { | 487 | unsigned long map_flags = 0; |
| 413 | unsigned long map_flags = 0; | 488 | if (vma->vm_flags & VM_MAYSHARE) |
| 414 | if (vma->vm_flags & VM_MAYSHARE) | 489 | map_flags |= MAP_SHARED; |
| 415 | map_flags |= MAP_SHARED; | 490 | |
| 416 | 491 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | |
| 417 | new_addr = get_unmapped_area(vma->vm_file, 0, new_len, | 492 | vma->vm_pgoff + |
| 418 | vma->vm_pgoff, map_flags); | 493 | ((addr - vma->vm_start) >> PAGE_SHIFT), |
| 419 | if (new_addr & ~PAGE_MASK) { | 494 | map_flags); |
| 420 | ret = new_addr; | 495 | if (new_addr & ~PAGE_MASK) { |
| 421 | goto out; | 496 | ret = new_addr; |
| 422 | } | 497 | goto out; |
| 423 | |||
| 424 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
| 425 | if (ret) | ||
| 426 | goto out; | ||
| 427 | } | 498 | } |
| 499 | |||
| 500 | ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); | ||
| 501 | if (ret) | ||
| 502 | goto out; | ||
| 428 | ret = move_vma(vma, addr, old_len, new_len, new_addr); | 503 | ret = move_vma(vma, addr, old_len, new_len, new_addr); |
| 429 | } | 504 | } |
| 430 | out: | 505 | out: |
| 431 | if (ret & ~PAGE_MASK) | 506 | if (ret & ~PAGE_MASK) |
| 432 | vm_unacct_memory(charged); | 507 | vm_unacct_memory(charged); |
| 433 | out_nc: | ||
| 434 | return ret; | 508 | return ret; |
| 435 | } | 509 | } |
| 436 | 510 | ||
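Taken together, the mremap.c changes above funnel the MREMAP_FIXED case through mremap_to(). An illustrative userspace program exercising exactly that path (a fixed, page-aligned, non-overlapping destination paired with MREMAP_MAYMOVE); it demonstrates the syscall semantics and is not part of the patch:

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);

	/* Reserve a destination window so new_addr is known to be usable. */
	void *dst = mmap(NULL, 4 * pg, PROT_NONE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *old = mmap(NULL, pg, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (dst == MAP_FAILED || old == MAP_FAILED)
		return 1;
	strcpy(old, "contents survive the move");

	/* MREMAP_FIXED must be paired with MREMAP_MAYMOVE, and the target
	 * range must not overlap the old one -- the checks mremap_to() makes. */
	char *moved = mremap(old, pg, 2 * pg,
			     MREMAP_MAYMOVE | MREMAP_FIXED, dst);
	if (moved == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("moved to %p: %s\n", (void *)moved, moved);
	return 0;
}
```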
diff --git a/mm/nommu.c b/mm/nommu.c index 9876fa0c3ad3..b9b5cceb1b68 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 146 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 146 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
| 147 | 147 | ||
| 148 | for (i = 0; i < nr_pages; i++) { | 148 | for (i = 0; i < nr_pages; i++) { |
| 149 | vma = find_vma(mm, start); | 149 | vma = find_extend_vma(mm, start); |
| 150 | if (!vma) | 150 | if (!vma) |
| 151 | goto finish_or_fault; | 151 | goto finish_or_fault; |
| 152 | 152 | ||
| @@ -432,6 +432,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 432 | /* | 432 | /* |
| 433 | * Ok, looks good - let it rip. | 433 | * Ok, looks good - let it rip. |
| 434 | */ | 434 | */ |
| 435 | flush_icache_range(mm->brk, brk); | ||
| 435 | return mm->brk = brk; | 436 | return mm->brk = brk; |
| 436 | } | 437 | } |
| 437 | 438 | ||
| @@ -551,11 +552,11 @@ static void free_page_series(unsigned long from, unsigned long to) | |||
| 551 | static void __put_nommu_region(struct vm_region *region) | 552 | static void __put_nommu_region(struct vm_region *region) |
| 552 | __releases(nommu_region_sem) | 553 | __releases(nommu_region_sem) |
| 553 | { | 554 | { |
| 554 | kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); | 555 | kenter("%p{%d}", region, region->vm_usage); |
| 555 | 556 | ||
| 556 | BUG_ON(!nommu_region_tree.rb_node); | 557 | BUG_ON(!nommu_region_tree.rb_node); |
| 557 | 558 | ||
| 558 | if (atomic_dec_and_test(®ion->vm_usage)) { | 559 | if (--region->vm_usage == 0) { |
| 559 | if (region->vm_top > region->vm_start) | 560 | if (region->vm_top > region->vm_start) |
| 560 | delete_nommu_region(region); | 561 | delete_nommu_region(region); |
| 561 | up_write(&nommu_region_sem); | 562 | up_write(&nommu_region_sem); |
| @@ -763,7 +764,7 @@ EXPORT_SYMBOL(find_vma); | |||
| 763 | */ | 764 | */ |
| 764 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | 765 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) |
| 765 | { | 766 | { |
| 766 | return find_vma(mm, addr); | 767 | return find_vma(mm, addr & PAGE_MASK); |
| 767 | } | 768 | } |
| 768 | 769 | ||
| 769 | /* | 770 | /* |
| @@ -1143,9 +1144,6 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1143 | if (ret < rlen) | 1144 | if (ret < rlen) |
| 1144 | memset(base + ret, 0, rlen - ret); | 1145 | memset(base + ret, 0, rlen - ret); |
| 1145 | 1146 | ||
| 1146 | } else { | ||
| 1147 | /* if it's an anonymous mapping, then just clear it */ | ||
| 1148 | memset(base, 0, rlen); | ||
| 1149 | } | 1147 | } |
| 1150 | 1148 | ||
| 1151 | return 0; | 1149 | return 0; |
| @@ -1207,11 +1205,11 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1207 | if (!vma) | 1205 | if (!vma) |
| 1208 | goto error_getting_vma; | 1206 | goto error_getting_vma; |
| 1209 | 1207 | ||
| 1210 | atomic_set(®ion->vm_usage, 1); | 1208 | region->vm_usage = 1; |
| 1211 | region->vm_flags = vm_flags; | 1209 | region->vm_flags = vm_flags; |
| 1212 | region->vm_pgoff = pgoff; | 1210 | region->vm_pgoff = pgoff; |
| 1213 | 1211 | ||
| 1214 | INIT_LIST_HEAD(&vma->anon_vma_node); | 1212 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
| 1215 | vma->vm_flags = vm_flags; | 1213 | vma->vm_flags = vm_flags; |
| 1216 | vma->vm_pgoff = pgoff; | 1214 | vma->vm_pgoff = pgoff; |
| 1217 | 1215 | ||
| @@ -1274,7 +1272,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1274 | } | 1272 | } |
| 1275 | 1273 | ||
| 1276 | /* we've found a region we can share */ | 1274 | /* we've found a region we can share */ |
| 1277 | atomic_inc(&pregion->vm_usage); | 1275 | pregion->vm_usage++; |
| 1278 | vma->vm_region = pregion; | 1276 | vma->vm_region = pregion; |
| 1279 | start = pregion->vm_start; | 1277 | start = pregion->vm_start; |
| 1280 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; | 1278 | start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; |
| @@ -1291,7 +1289,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1291 | vma->vm_region = NULL; | 1289 | vma->vm_region = NULL; |
| 1292 | vma->vm_start = 0; | 1290 | vma->vm_start = 0; |
| 1293 | vma->vm_end = 0; | 1291 | vma->vm_end = 0; |
| 1294 | atomic_dec(&pregion->vm_usage); | 1292 | pregion->vm_usage--; |
| 1295 | pregion = NULL; | 1293 | pregion = NULL; |
| 1296 | goto error_just_free; | 1294 | goto error_just_free; |
| 1297 | } | 1295 | } |
| @@ -1343,6 +1341,11 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1343 | goto error_just_free; | 1341 | goto error_just_free; |
| 1344 | add_nommu_region(region); | 1342 | add_nommu_region(region); |
| 1345 | 1343 | ||
| 1344 | /* clear anonymous mappings that don't ask for uninitialized data */ | ||
| 1345 | if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) | ||
| 1346 | memset((void *)region->vm_start, 0, | ||
| 1347 | region->vm_end - region->vm_start); | ||
| 1348 | |||
| 1346 | /* okay... we have a mapping; now we have to register it */ | 1349 | /* okay... we have a mapping; now we have to register it */ |
| 1347 | result = vma->vm_start; | 1350 | result = vma->vm_start; |
| 1348 | 1351 | ||
| @@ -1351,10 +1354,14 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1351 | share: | 1354 | share: |
| 1352 | add_vma_to_mm(current->mm, vma); | 1355 | add_vma_to_mm(current->mm, vma); |
| 1353 | 1356 | ||
| 1354 | up_write(&nommu_region_sem); | 1357 | /* we flush the region from the icache only when the first executable |
| 1358 | * mapping of it is made */ | ||
| 1359 | if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { | ||
| 1360 | flush_icache_range(region->vm_start, region->vm_end); | ||
| 1361 | region->vm_icache_flushed = true; | ||
| 1362 | } | ||
| 1355 | 1363 | ||
| 1356 | if (prot & PROT_EXEC) | 1364 | up_write(&nommu_region_sem); |
| 1357 | flush_icache_range(result, result + len); | ||
| 1358 | 1365 | ||
| 1359 | kleave(" = %lx", result); | 1366 | kleave(" = %lx", result); |
| 1360 | return result; | 1367 | return result; |
| @@ -1396,6 +1403,31 @@ error_getting_region: | |||
| 1396 | } | 1403 | } |
| 1397 | EXPORT_SYMBOL(do_mmap_pgoff); | 1404 | EXPORT_SYMBOL(do_mmap_pgoff); |
| 1398 | 1405 | ||
| 1406 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | ||
| 1407 | unsigned long, prot, unsigned long, flags, | ||
| 1408 | unsigned long, fd, unsigned long, pgoff) | ||
| 1409 | { | ||
| 1410 | struct file *file = NULL; | ||
| 1411 | unsigned long retval = -EBADF; | ||
| 1412 | |||
| 1413 | if (!(flags & MAP_ANONYMOUS)) { | ||
| 1414 | file = fget(fd); | ||
| 1415 | if (!file) | ||
| 1416 | goto out; | ||
| 1417 | } | ||
| 1418 | |||
| 1419 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | ||
| 1420 | |||
| 1421 | down_write(¤t->mm->mmap_sem); | ||
| 1422 | retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); | ||
| 1423 | up_write(¤t->mm->mmap_sem); | ||
| 1424 | |||
| 1425 | if (file) | ||
| 1426 | fput(file); | ||
| 1427 | out: | ||
| 1428 | return retval; | ||
| 1429 | } | ||
| 1430 | |||
| 1399 | /* | 1431 | /* |
| 1400 | * split a vma into two pieces at address 'addr', a new vma is allocated either | 1432 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
| 1401 | * for the first part or the tail. | 1433 | * for the first part or the tail. |
| @@ -1409,10 +1441,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1409 | 1441 | ||
| 1410 | kenter(""); | 1442 | kenter(""); |
| 1411 | 1443 | ||
| 1412 | /* we're only permitted to split anonymous regions that have a single | 1444 | /* we're only permitted to split anonymous regions (these should have |
| 1413 | * owner */ | 1445 | * only a single usage on the region) */ |
| 1414 | if (vma->vm_file || | 1446 | if (vma->vm_file) |
| 1415 | atomic_read(&vma->vm_region->vm_usage) != 1) | ||
| 1416 | return -ENOMEM; | 1447 | return -ENOMEM; |
| 1417 | 1448 | ||
| 1418 | if (mm->map_count >= sysctl_max_map_count) | 1449 | if (mm->map_count >= sysctl_max_map_count) |
| @@ -1486,7 +1517,7 @@ static int shrink_vma(struct mm_struct *mm, | |||
| 1486 | 1517 | ||
| 1487 | /* cut the backing region down to size */ | 1518 | /* cut the backing region down to size */ |
| 1488 | region = vma->vm_region; | 1519 | region = vma->vm_region; |
| 1489 | BUG_ON(atomic_read(®ion->vm_usage) != 1); | 1520 | BUG_ON(region->vm_usage != 1); |
| 1490 | 1521 | ||
| 1491 | down_write(&nommu_region_sem); | 1522 | down_write(&nommu_region_sem); |
| 1492 | delete_nommu_region(region); | 1523 | delete_nommu_region(region); |
| @@ -1730,27 +1761,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 1730 | EXPORT_SYMBOL(unmap_mapping_range); | 1761 | EXPORT_SYMBOL(unmap_mapping_range); |
| 1731 | 1762 | ||
| 1732 | /* | 1763 | /* |
| 1733 | * ask for an unmapped area at which to create a mapping on a file | ||
| 1734 | */ | ||
| 1735 | unsigned long get_unmapped_area(struct file *file, unsigned long addr, | ||
| 1736 | unsigned long len, unsigned long pgoff, | ||
| 1737 | unsigned long flags) | ||
| 1738 | { | ||
| 1739 | unsigned long (*get_area)(struct file *, unsigned long, unsigned long, | ||
| 1740 | unsigned long, unsigned long); | ||
| 1741 | |||
| 1742 | get_area = current->mm->get_unmapped_area; | ||
| 1743 | if (file && file->f_op && file->f_op->get_unmapped_area) | ||
| 1744 | get_area = file->f_op->get_unmapped_area; | ||
| 1745 | |||
| 1746 | if (!get_area) | ||
| 1747 | return -ENOSYS; | ||
| 1748 | |||
| 1749 | return get_area(file, addr, len, pgoff, flags); | ||
| 1750 | } | ||
| 1751 | EXPORT_SYMBOL(get_unmapped_area); | ||
| 1752 | |||
| 1753 | /* | ||
| 1754 | * Check that a process has enough memory to allocate a new virtual | 1764 | * Check that a process has enough memory to allocate a new virtual |
| 1755 | * mapping. 0 means there is enough memory for the allocation to | 1765 | * mapping. 0 means there is enough memory for the allocation to |
| 1756 | * succeed and -ENOMEM implies there is not. | 1766 | * succeed and -ENOMEM implies there is not. |
| @@ -1889,9 +1899,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
| 1889 | 1899 | ||
| 1890 | /* only read or write mappings where it is permitted */ | 1900 | /* only read or write mappings where it is permitted */ |
| 1891 | if (write && vma->vm_flags & VM_MAYWRITE) | 1901 | if (write && vma->vm_flags & VM_MAYWRITE) |
| 1892 | len -= copy_to_user((void *) addr, buf, len); | 1902 | copy_to_user_page(vma, NULL, addr, |
| 1903 | (void *) addr, buf, len); | ||
| 1893 | else if (!write && vma->vm_flags & VM_MAYREAD) | 1904 | else if (!write && vma->vm_flags & VM_MAYREAD) |
| 1894 | len -= copy_from_user(buf, (void *) addr, len); | 1905 | copy_from_user_page(vma, NULL, addr, |
| 1906 | buf, (void *) addr, len); | ||
| 1895 | else | 1907 | else |
| 1896 | len = 0; | 1908 | len = 0; |
| 1897 | } else { | 1909 | } else { |
| @@ -1902,3 +1914,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in | |||
| 1902 | mmput(mm); | 1914 | mmput(mm); |
| 1903 | return len; | 1915 | return len; |
| 1904 | } | 1916 | } |
| 1917 | |||
| 1918 | /** | ||
| 1919 | * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode | ||
| 1920 | * @inode: The inode to check | ||
| 1921 | * @size: The current filesize of the inode | ||
| 1922 | * @newsize: The proposed filesize of the inode | ||
| 1923 | * | ||
| 1924 | * Check the shared mappings on an inode on behalf of a shrinking truncate to | ||
| 1925 | * make sure that any outstanding VMAs aren't broken and then shrink the | ||
| 1926 | * vm_regions that extend beyond it so that do_mmap_pgoff() doesn't | ||
| 1927 | * automatically grant mappings that are too large. | ||
| 1928 | */ | ||
| 1929 | int nommu_shrink_inode_mappings(struct inode *inode, size_t size, | ||
| 1930 | size_t newsize) | ||
| 1931 | { | ||
| 1932 | struct vm_area_struct *vma; | ||
| 1933 | struct prio_tree_iter iter; | ||
| 1934 | struct vm_region *region; | ||
| 1935 | pgoff_t low, high; | ||
| 1936 | size_t r_size, r_top; | ||
| 1937 | |||
| 1938 | low = newsize >> PAGE_SHIFT; | ||
| 1939 | high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 1940 | |||
| 1941 | down_write(&nommu_region_sem); | ||
| 1942 | |||
| 1943 | /* search for VMAs that fall within the dead zone */ | ||
| 1944 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | ||
| 1945 | low, high) { | ||
| 1946 | /* found one - only interested if it's shared out of the page | ||
| 1947 | * cache */ | ||
| 1948 | if (vma->vm_flags & VM_SHARED) { | ||
| 1949 | up_write(&nommu_region_sem); | ||
| 1950 | return -ETXTBSY; /* not quite true, but near enough */ | ||
| 1951 | } | ||
| 1952 | } | ||
| 1953 | |||
| 1954 | /* reduce any regions that overlap the dead zone - if in existence, | ||
| 1955 | * these will be pointed to by VMAs that don't overlap the dead zone | ||
| 1956 | * | ||
| 1957 | * we don't check for any regions that start beyond the EOF as there | ||
| 1958 | * shouldn't be any | ||
| 1959 | */ | ||
| 1960 | vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, | ||
| 1961 | 0, ULONG_MAX) { | ||
| 1962 | if (!(vma->vm_flags & VM_SHARED)) | ||
| 1963 | continue; | ||
| 1964 | |||
| 1965 | region = vma->vm_region; | ||
| 1966 | r_size = region->vm_top - region->vm_start; | ||
| 1967 | r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; | ||
| 1968 | |||
| 1969 | if (r_top > newsize) { | ||
| 1970 | region->vm_top -= r_top - newsize; | ||
| 1971 | if (region->vm_end > region->vm_top) | ||
| 1972 | region->vm_end = region->vm_top; | ||
| 1973 | } | ||
| 1974 | } | ||
| 1975 | |||
| 1976 | up_write(&nommu_region_sem); | ||
| 1977 | return 0; | ||
| 1978 | } | ||
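nommu_shrink_inode_mappings() is intended to be called from a filesystem's shrinking-truncate path before the new size is committed. A hypothetical caller sketch; the function name and surrounding logic are illustrative and not taken from any particular filesystem, and the declaration is assumed to sit alongside the other nommu helpers in <linux/mm.h>:

```c
#include <linux/fs.h>
#include <linux/mm.h>

/* Hypothetical nommu filesystem hook: refuse the shrink if shared mappings
 * would break, otherwise trim the regions and commit the new size. */
static int example_shrink_isize(struct inode *inode, loff_t newsize)
{
	int ret = 0;

	if (newsize < inode->i_size)
		ret = nommu_shrink_inode_mappings(inode, inode->i_size,
						  newsize);
	if (ret == 0)
		i_size_write(inode, newsize);
	return ret;
}
```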
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ea2147dabba6..35755a4156d6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 196 | /* | 196 | /* |
| 197 | * Determine the type of allocation constraint. | 197 | * Determine the type of allocation constraint. |
| 198 | */ | 198 | */ |
| 199 | static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
| 200 | gfp_t gfp_mask) | ||
| 201 | { | ||
| 202 | #ifdef CONFIG_NUMA | 199 | #ifdef CONFIG_NUMA |
| 200 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
| 201 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
| 202 | { | ||
| 203 | struct zone *zone; | 203 | struct zone *zone; |
| 204 | struct zoneref *z; | 204 | struct zoneref *z; |
| 205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 205 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
| 206 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; | ||
| 207 | 206 | ||
| 208 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 207 | /* |
| 209 | if (cpuset_zone_allowed_softwall(zone, gfp_mask)) | 208 | * We only reach here when __GFP_NOFAIL is used, so we should avoid |
| 210 | node_clear(zone_to_nid(zone), nodes); | 209 | * killing current; we have to fall back to a random task kill here. |
| 211 | else | 210 | * CONSTRAINT_THISNODE would be the ideal constraint, but there is no way to handle it yet. |
| 212 | return CONSTRAINT_CPUSET; | 211 | */ |
| 212 | if (gfp_mask & __GFP_THISNODE) | ||
| 213 | return CONSTRAINT_NONE; | ||
| 213 | 214 | ||
| 214 | if (!nodes_empty(nodes)) | 215 | /* |
| 216 | * The nodemask here is a nodemask passed to alloc_pages(). Now, | ||
| 217 | * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy | ||
| 218 | * feature. mempolicy is the only user of the nodemask here. | ||
| 219 | * Check whether mempolicy's nodemask contains all of N_HIGH_MEMORY. | ||
| 220 | */ | ||
| 221 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) | ||
| 215 | return CONSTRAINT_MEMORY_POLICY; | 222 | return CONSTRAINT_MEMORY_POLICY; |
| 216 | #endif | ||
| 217 | 223 | ||
| 224 | /* Check whether this allocation failure is caused by cpuset's wall function */ | ||
| 225 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | ||
| 226 | high_zoneidx, nodemask) | ||
| 227 | if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) | ||
| 228 | return CONSTRAINT_CPUSET; | ||
| 229 | |||
| 230 | return CONSTRAINT_NONE; | ||
| 231 | } | ||
| 232 | #else | ||
| 233 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | ||
| 234 | gfp_t gfp_mask, nodemask_t *nodemask) | ||
| 235 | { | ||
| 218 | return CONSTRAINT_NONE; | 236 | return CONSTRAINT_NONE; |
| 219 | } | 237 | } |
| 238 | #endif | ||
| 220 | 239 | ||
| 221 | /* | 240 | /* |
| 222 | * Simple selection loop. We chose the process with the highest | 241 | * Simple selection loop. We chose the process with the highest |
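For orientation, the constraint classes that constrained_alloc() distinguishes, reconstructed from how the values are used in the hunk above; the authoritative enum lives elsewhere in this kernel tree and may carry additional values:

```c
/* Sketch only -- reconstructed from the surrounding code, not copied. */
enum oom_constraint {
	CONSTRAINT_NONE,		/* unconstrained: treat as a global OOM */
	CONSTRAINT_CPUSET,		/* allocation walled in by a cpuset */
	CONSTRAINT_MEMORY_POLICY,	/* allocation restricted by a mempolicy nodemask */
};
```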
| @@ -337,6 +356,24 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
| 337 | } while_each_thread(g, p); | 356 | } while_each_thread(g, p); |
| 338 | } | 357 | } |
| 339 | 358 | ||
| 359 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | ||
| 360 | struct mem_cgroup *mem) | ||
| 361 | { | ||
| 362 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | ||
| 363 | "oom_adj=%d\n", | ||
| 364 | current->comm, gfp_mask, order, current->signal->oom_adj); | ||
| 365 | task_lock(current); | ||
| 366 | cpuset_print_task_mems_allowed(current); | ||
| 367 | task_unlock(current); | ||
| 368 | dump_stack(); | ||
| 369 | mem_cgroup_print_oom_info(mem, p); | ||
| 370 | show_mem(); | ||
| 371 | if (sysctl_oom_dump_tasks) | ||
| 372 | dump_tasks(mem); | ||
| 373 | } | ||
| 374 | |||
| 375 | #define K(x) ((x) << (PAGE_SHIFT-10)) | ||
| 376 | |||
| 340 | /* | 377 | /* |
| 341 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO | 378 | * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO |
| 342 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO | 379 | * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO |
| @@ -350,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
| 350 | return; | 387 | return; |
| 351 | } | 388 | } |
| 352 | 389 | ||
| 390 | task_lock(p); | ||
| 353 | if (!p->mm) { | 391 | if (!p->mm) { |
| 354 | WARN_ON(1); | 392 | WARN_ON(1); |
| 355 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | 393 | printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n", |
| 394 | task_pid_nr(p), p->comm); | ||
| 395 | task_unlock(p); | ||
| 356 | return; | 396 | return; |
| 357 | } | 397 | } |
| 358 | 398 | ||
| 359 | if (verbose) | 399 | if (verbose) |
| 360 | printk(KERN_ERR "Killed process %d (%s)\n", | 400 | printk(KERN_ERR "Killed process %d (%s) " |
| 361 | task_pid_nr(p), p->comm); | 401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
| 402 | task_pid_nr(p), p->comm, | ||
| 403 | K(p->mm->total_vm), | ||
| 404 | K(get_mm_counter(p->mm, MM_ANONPAGES)), | ||
| 405 | K(get_mm_counter(p->mm, MM_FILEPAGES))); | ||
| 406 | task_unlock(p); | ||
| 362 | 407 | ||
| 363 | /* | 408 | /* |
| 364 | * We give our sacrificial lamb high priority and access to | 409 | * We give our sacrificial lamb high priority and access to |
| @@ -395,20 +440,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 395 | { | 440 | { |
| 396 | struct task_struct *c; | 441 | struct task_struct *c; |
| 397 | 442 | ||
| 398 | if (printk_ratelimit()) { | 443 | if (printk_ratelimit()) |
| 399 | printk(KERN_WARNING "%s invoked oom-killer: " | 444 | dump_header(p, gfp_mask, order, mem); |
| 400 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
| 401 | current->comm, gfp_mask, order, | ||
| 402 | current->signal->oom_adj); | ||
| 403 | task_lock(current); | ||
| 404 | cpuset_print_task_mems_allowed(current); | ||
| 405 | task_unlock(current); | ||
| 406 | dump_stack(); | ||
| 407 | mem_cgroup_print_oom_info(mem, current); | ||
| 408 | show_mem(); | ||
| 409 | if (sysctl_oom_dump_tasks) | ||
| 410 | dump_tasks(mem); | ||
| 411 | } | ||
| 412 | 445 | ||
| 413 | /* | 446 | /* |
| 414 | * If the task is already exiting, don't alarm the sysadmin or kill | 447 | * If the task is already exiting, don't alarm the sysadmin or kill |
| @@ -426,6 +459,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 426 | list_for_each_entry(c, &p->children, sibling) { | 459 | list_for_each_entry(c, &p->children, sibling) { |
| 427 | if (c->mm == p->mm) | 460 | if (c->mm == p->mm) |
| 428 | continue; | 461 | continue; |
| 462 | if (mem && !task_in_mem_cgroup(c, mem)) | ||
| 463 | continue; | ||
| 429 | if (!oom_kill_task(c)) | 464 | if (!oom_kill_task(c)) |
| 430 | return 0; | 465 | return 0; |
| 431 | } | 466 | } |
| @@ -544,6 +579,7 @@ retry: | |||
| 544 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 579 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
| 545 | if (!p) { | 580 | if (!p) { |
| 546 | read_unlock(&tasklist_lock); | 581 | read_unlock(&tasklist_lock); |
| 582 | dump_header(NULL, gfp_mask, order, NULL); | ||
| 547 | panic("Out of memory and no killable processes...\n"); | 583 | panic("Out of memory and no killable processes...\n"); |
| 548 | } | 584 | } |
| 549 | 585 | ||
| @@ -599,7 +635,8 @@ rest_and_return: | |||
| 599 | * OR try to be smart about which process to kill. Note that we | 635 | * OR try to be smart about which process to kill. Note that we |
| 600 | * don't have to be perfect here, we just have to be good. | 636 | * don't have to be perfect here, we just have to be good. |
| 601 | */ | 637 | */ |
| 602 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | 638 | void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, |
| 639 | int order, nodemask_t *nodemask) | ||
| 603 | { | 640 | { |
| 604 | unsigned long freed = 0; | 641 | unsigned long freed = 0; |
| 605 | enum oom_constraint constraint; | 642 | enum oom_constraint constraint; |
| @@ -609,14 +646,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
| 609 | /* Got some memory back in the last second. */ | 646 | /* Got some memory back in the last second. */ |
| 610 | return; | 647 | return; |
| 611 | 648 | ||
| 612 | if (sysctl_panic_on_oom == 2) | 649 | if (sysctl_panic_on_oom == 2) { |
| 650 | dump_header(NULL, gfp_mask, order, NULL); | ||
| 613 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); | 651 | panic("out of memory. Compulsory panic_on_oom is selected.\n"); |
| 652 | } | ||
| 614 | 653 | ||
| 615 | /* | 654 | /* |
| 616 | * Check if there were limitations on the allocation (only relevant for | 655 | * Check if there were limitations on the allocation (only relevant for |
| 617 | * NUMA) that may require different handling. | 656 | * NUMA) that may require different handling. |
| 618 | */ | 657 | */ |
| 619 | constraint = constrained_alloc(zonelist, gfp_mask); | 658 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask); |
| 620 | read_lock(&tasklist_lock); | 659 | read_lock(&tasklist_lock); |
| 621 | 660 | ||
| 622 | switch (constraint) { | 661 | switch (constraint) { |
| @@ -626,8 +665,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
| 626 | break; | 665 | break; |
| 627 | 666 | ||
| 628 | case CONSTRAINT_NONE: | 667 | case CONSTRAINT_NONE: |
| 629 | if (sysctl_panic_on_oom) | 668 | if (sysctl_panic_on_oom) { |
| 669 | dump_header(NULL, gfp_mask, order, NULL); | ||
| 630 | panic("out of memory. panic_on_oom is selected\n"); | 670 | panic("out of memory. panic_on_oom is selected\n"); |
| 671 | } | ||
| 631 | /* Fall-through */ | 672 | /* Fall-through */ |
| 632 | case CONSTRAINT_CPUSET: | 673 | case CONSTRAINT_CPUSET: |
| 633 | __out_of_memory(gfp_mask, order); | 674 | __out_of_memory(gfp_mask, order); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2c5d79236ead..0b19943ecf8b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping, | |||
| 821 | struct writeback_control *wbc, writepage_t writepage, | 821 | struct writeback_control *wbc, writepage_t writepage, |
| 822 | void *data) | 822 | void *data) |
| 823 | { | 823 | { |
| 824 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
| 825 | int ret = 0; | 824 | int ret = 0; |
| 826 | int done = 0; | 825 | int done = 0; |
| 827 | struct pagevec pvec; | 826 | struct pagevec pvec; |
| @@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping, | |||
| 834 | int range_whole = 0; | 833 | int range_whole = 0; |
| 835 | long nr_to_write = wbc->nr_to_write; | 834 | long nr_to_write = wbc->nr_to_write; |
| 836 | 835 | ||
| 837 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
| 838 | wbc->encountered_congestion = 1; | ||
| 839 | return 0; | ||
| 840 | } | ||
| 841 | |||
| 842 | pagevec_init(&pvec, 0); | 836 | pagevec_init(&pvec, 0); |
| 843 | if (wbc->range_cyclic) { | 837 | if (wbc->range_cyclic) { |
| 844 | writeback_index = mapping->writeback_index; /* prev offset */ | 838 | writeback_index = mapping->writeback_index; /* prev offset */ |
| @@ -957,12 +951,6 @@ continue_unlock: | |||
| 957 | break; | 951 | break; |
| 958 | } | 952 | } |
| 959 | } | 953 | } |
| 960 | |||
| 961 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
| 962 | wbc->encountered_congestion = 1; | ||
| 963 | done = 1; | ||
| 964 | break; | ||
| 965 | } | ||
| 966 | } | 954 | } |
| 967 | pagevec_release(&pvec); | 955 | pagevec_release(&pvec); |
| 968 | cond_resched(); | 956 | cond_resched(); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2bc2ac63f41e..a8182c89de59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -48,6 +48,7 @@ | |||
| 48 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
| 49 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
| 50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
| 51 | #include <linux/memory.h> | ||
| 51 | #include <trace/events/kmem.h> | 52 | #include <trace/events/kmem.h> |
| 52 | 53 | ||
| 53 | #include <asm/tlbflush.h> | 54 | #include <asm/tlbflush.h> |
| @@ -75,6 +76,31 @@ unsigned long totalreserve_pages __read_mostly; | |||
| 75 | int percpu_pagelist_fraction; | 76 | int percpu_pagelist_fraction; |
| 76 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 77 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
| 77 | 78 | ||
| 79 | #ifdef CONFIG_PM_SLEEP | ||
| 80 | /* | ||
| 81 | * The following functions are used by the suspend/hibernate code to temporarily | ||
| 82 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations | ||
| 83 | * while devices are suspended. To avoid races with the suspend/hibernate code, | ||
| 84 | * they should always be called with pm_mutex held (gfp_allowed_mask also should | ||
| 85 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | ||
| 86 | * guaranteed not to run in parallel with that modification). | ||
| 87 | */ | ||
| 88 | void set_gfp_allowed_mask(gfp_t mask) | ||
| 89 | { | ||
| 90 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
| 91 | gfp_allowed_mask = mask; | ||
| 92 | } | ||
| 93 | |||
| 94 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | ||
| 95 | { | ||
| 96 | gfp_t ret = gfp_allowed_mask; | ||
| 97 | |||
| 98 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
| 99 | gfp_allowed_mask &= ~mask; | ||
| 100 | return ret; | ||
| 101 | } | ||
| 102 | #endif /* CONFIG_PM_SLEEP */ | ||
| 103 | |||
| 78 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 104 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
| 79 | int pageblock_order __read_mostly; | 105 | int pageblock_order __read_mostly; |
| 80 | #endif | 106 | #endif |
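A hypothetical sketch of how the suspend/hibernate core is expected to pair the two helpers above around the device-suspend window; only the helper names come from the hunk, the caller names are illustrative:

```c
#include <linux/gfp.h>

/* Illustrative callers only; pm_mutex is assumed to be held by the suspend
 * core around both calls, as the comment in the hunk requires. */
static gfp_t saved_gfp_mask;

static void pm_block_io_allocations(void)
{
	/* forbid __GFP_IO/__GFP_FS allocations while devices are suspended */
	saved_gfp_mask = clear_gfp_allowed_mask(__GFP_IO | __GFP_FS);
}

static void pm_restore_allocations(void)
{
	set_gfp_allowed_mask(saved_gfp_mask);
}
```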
| @@ -486,7 +512,6 @@ static inline void __free_one_page(struct page *page, | |||
| 486 | zone->free_area[order].nr_free++; | 512 | zone->free_area[order].nr_free++; |
| 487 | } | 513 | } |
| 488 | 514 | ||
| 489 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
| 490 | /* | 515 | /* |
| 491 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | 516 | * free_page_mlock() -- clean up attempts to free and mlocked() page. |
| 492 | * Page should not be on lru, so no need to fix that up. | 517 | * Page should not be on lru, so no need to fix that up. |
| @@ -497,9 +522,6 @@ static inline void free_page_mlock(struct page *page) | |||
| 497 | __dec_zone_page_state(page, NR_MLOCK); | 522 | __dec_zone_page_state(page, NR_MLOCK); |
| 498 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | 523 | __count_vm_event(UNEVICTABLE_MLOCKFREED); |
| 499 | } | 524 | } |
| 500 | #else | ||
| 501 | static void free_page_mlock(struct page *page) { } | ||
| 502 | #endif | ||
| 503 | 525 | ||
| 504 | static inline int free_pages_check(struct page *page) | 526 | static inline int free_pages_check(struct page *page) |
| 505 | { | 527 | { |
| @@ -533,7 +555,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
| 533 | int batch_free = 0; | 555 | int batch_free = 0; |
| 534 | 556 | ||
| 535 | spin_lock(&zone->lock); | 557 | spin_lock(&zone->lock); |
| 536 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 558 | zone->all_unreclaimable = 0; |
| 537 | zone->pages_scanned = 0; | 559 | zone->pages_scanned = 0; |
| 538 | 560 | ||
| 539 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 561 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
| @@ -559,8 +581,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
| 559 | page = list_entry(list->prev, struct page, lru); | 581 | page = list_entry(list->prev, struct page, lru); |
| 560 | /* must delete as __free_one_page list manipulates */ | 582 | /* must delete as __free_one_page list manipulates */ |
| 561 | list_del(&page->lru); | 583 | list_del(&page->lru); |
| 562 | __free_one_page(page, zone, 0, migratetype); | 584 | /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ |
| 563 | trace_mm_page_pcpu_drain(page, 0, migratetype); | 585 | __free_one_page(page, zone, 0, page_private(page)); |
| 586 | trace_mm_page_pcpu_drain(page, 0, page_private(page)); | ||
| 564 | } while (--count && --batch_free && !list_empty(list)); | 587 | } while (--count && --batch_free && !list_empty(list)); |
| 565 | } | 588 | } |
| 566 | spin_unlock(&zone->lock); | 589 | spin_unlock(&zone->lock); |
| @@ -570,7 +593,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
| 570 | int migratetype) | 593 | int migratetype) |
| 571 | { | 594 | { |
| 572 | spin_lock(&zone->lock); | 595 | spin_lock(&zone->lock); |
| 573 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 596 | zone->all_unreclaimable = 0; |
| 574 | zone->pages_scanned = 0; | 597 | zone->pages_scanned = 0; |
| 575 | 598 | ||
| 576 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 599 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
| @@ -585,6 +608,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 585 | int bad = 0; | 608 | int bad = 0; |
| 586 | int wasMlocked = __TestClearPageMlocked(page); | 609 | int wasMlocked = __TestClearPageMlocked(page); |
| 587 | 610 | ||
| 611 | trace_mm_page_free_direct(page, order); | ||
| 588 | kmemcheck_free_shadow(page, order); | 612 | kmemcheck_free_shadow(page, order); |
| 589 | 613 | ||
| 590 | for (i = 0 ; i < (1 << order) ; ++i) | 614 | for (i = 0 ; i < (1 << order) ; ++i) |
| @@ -1011,10 +1035,10 @@ static void drain_pages(unsigned int cpu) | |||
| 1011 | struct per_cpu_pageset *pset; | 1035 | struct per_cpu_pageset *pset; |
| 1012 | struct per_cpu_pages *pcp; | 1036 | struct per_cpu_pages *pcp; |
| 1013 | 1037 | ||
| 1014 | pset = zone_pcp(zone, cpu); | 1038 | local_irq_save(flags); |
| 1039 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
| 1015 | 1040 | ||
| 1016 | pcp = &pset->pcp; | 1041 | pcp = &pset->pcp; |
| 1017 | local_irq_save(flags); | ||
| 1018 | free_pcppages_bulk(zone, pcp->count, pcp); | 1042 | free_pcppages_bulk(zone, pcp->count, pcp); |
| 1019 | pcp->count = 0; | 1043 | pcp->count = 0; |
| 1020 | local_irq_restore(flags); | 1044 | local_irq_restore(flags); |
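The conversion above (and in the hunks that follow) turns zone->pageset into a per-cpu pointer: remote CPUs are reached with per_cpu_ptr() and the local CPU, with interrupts disabled, with this_cpu_ptr(). A minimal sketch of that access pattern, assuming only the fields visible in this diff:

```c
#include <linux/mmzone.h>
#include <linux/percpu.h>

/* Illustrative only: contrast remote and local access to zone->pageset. */
static void inspect_pagesets(struct zone *zone, int remote_cpu)
{
	struct per_cpu_pageset *pset;
	unsigned long flags;

	/* any CPU's copy, e.g. while walking for_each_online_cpu() */
	pset = per_cpu_ptr(zone->pageset, remote_cpu);
	(void)pset->pcp.count;

	/* this CPU's copy; interrupts off, matching drain_pages() above */
	local_irq_save(flags);
	pset = this_cpu_ptr(zone->pageset);
	(void)pset->pcp.count;
	local_irq_restore(flags);
}
```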
| @@ -1075,8 +1099,9 @@ void mark_free_pages(struct zone *zone) | |||
| 1075 | 1099 | ||
| 1076 | /* | 1100 | /* |
| 1077 | * Free a 0-order page | 1101 | * Free a 0-order page |
| 1102 | * cold == 1 ? free a cold page : free a hot page | ||
| 1078 | */ | 1103 | */ |
| 1079 | static void free_hot_cold_page(struct page *page, int cold) | 1104 | void free_hot_cold_page(struct page *page, int cold) |
| 1080 | { | 1105 | { |
| 1081 | struct zone *zone = page_zone(page); | 1106 | struct zone *zone = page_zone(page); |
| 1082 | struct per_cpu_pages *pcp; | 1107 | struct per_cpu_pages *pcp; |
| @@ -1084,6 +1109,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1084 | int migratetype; | 1109 | int migratetype; |
| 1085 | int wasMlocked = __TestClearPageMlocked(page); | 1110 | int wasMlocked = __TestClearPageMlocked(page); |
| 1086 | 1111 | ||
| 1112 | trace_mm_page_free_direct(page, 0); | ||
| 1087 | kmemcheck_free_shadow(page, 0); | 1113 | kmemcheck_free_shadow(page, 0); |
| 1088 | 1114 | ||
| 1089 | if (PageAnon(page)) | 1115 | if (PageAnon(page)) |
| @@ -1098,7 +1124,6 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1098 | arch_free_page(page, 0); | 1124 | arch_free_page(page, 0); |
| 1099 | kernel_map_pages(page, 1, 0); | 1125 | kernel_map_pages(page, 1, 0); |
| 1100 | 1126 | ||
| 1101 | pcp = &zone_pcp(zone, get_cpu())->pcp; | ||
| 1102 | migratetype = get_pageblock_migratetype(page); | 1127 | migratetype = get_pageblock_migratetype(page); |
| 1103 | set_page_private(page, migratetype); | 1128 | set_page_private(page, migratetype); |
| 1104 | local_irq_save(flags); | 1129 | local_irq_save(flags); |
| @@ -1121,6 +1146,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1121 | migratetype = MIGRATE_MOVABLE; | 1146 | migratetype = MIGRATE_MOVABLE; |
| 1122 | } | 1147 | } |
| 1123 | 1148 | ||
| 1149 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
| 1124 | if (cold) | 1150 | if (cold) |
| 1125 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1151 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
| 1126 | else | 1152 | else |
| @@ -1133,15 +1159,8 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1133 | 1159 | ||
| 1134 | out: | 1160 | out: |
| 1135 | local_irq_restore(flags); | 1161 | local_irq_restore(flags); |
| 1136 | put_cpu(); | ||
| 1137 | } | 1162 | } |
| 1138 | 1163 | ||
| 1139 | void free_hot_page(struct page *page) | ||
| 1140 | { | ||
| 1141 | trace_mm_page_free_direct(page, 0); | ||
| 1142 | free_hot_cold_page(page, 0); | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | /* | 1164 | /* |
| 1146 | * split_page takes a non-compound higher-order page, and splits it into | 1165 | * split_page takes a non-compound higher-order page, and splits it into |
| 1147 | * n (1<<order) sub-pages: page[0..n] | 1166 | * n (1<<order) sub-pages: page[0..n] |
| @@ -1183,17 +1202,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
| 1183 | unsigned long flags; | 1202 | unsigned long flags; |
| 1184 | struct page *page; | 1203 | struct page *page; |
| 1185 | int cold = !!(gfp_flags & __GFP_COLD); | 1204 | int cold = !!(gfp_flags & __GFP_COLD); |
| 1186 | int cpu; | ||
| 1187 | 1205 | ||
| 1188 | again: | 1206 | again: |
| 1189 | cpu = get_cpu(); | ||
| 1190 | if (likely(order == 0)) { | 1207 | if (likely(order == 0)) { |
| 1191 | struct per_cpu_pages *pcp; | 1208 | struct per_cpu_pages *pcp; |
| 1192 | struct list_head *list; | 1209 | struct list_head *list; |
| 1193 | 1210 | ||
| 1194 | pcp = &zone_pcp(zone, cpu)->pcp; | ||
| 1195 | list = &pcp->lists[migratetype]; | ||
| 1196 | local_irq_save(flags); | 1211 | local_irq_save(flags); |
| 1212 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
| 1213 | list = &pcp->lists[migratetype]; | ||
| 1197 | if (list_empty(list)) { | 1214 | if (list_empty(list)) { |
| 1198 | pcp->count += rmqueue_bulk(zone, 0, | 1215 | pcp->count += rmqueue_bulk(zone, 0, |
| 1199 | pcp->batch, list, | 1216 | pcp->batch, list, |
| @@ -1225,16 +1242,15 @@ again: | |||
| 1225 | } | 1242 | } |
| 1226 | spin_lock_irqsave(&zone->lock, flags); | 1243 | spin_lock_irqsave(&zone->lock, flags); |
| 1227 | page = __rmqueue(zone, order, migratetype); | 1244 | page = __rmqueue(zone, order, migratetype); |
| 1228 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
| 1229 | spin_unlock(&zone->lock); | 1245 | spin_unlock(&zone->lock); |
| 1230 | if (!page) | 1246 | if (!page) |
| 1231 | goto failed; | 1247 | goto failed; |
| 1248 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
| 1232 | } | 1249 | } |
| 1233 | 1250 | ||
| 1234 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1251 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
| 1235 | zone_statistics(preferred_zone, zone); | 1252 | zone_statistics(preferred_zone, zone); |
| 1236 | local_irq_restore(flags); | 1253 | local_irq_restore(flags); |
| 1237 | put_cpu(); | ||
| 1238 | 1254 | ||
| 1239 | VM_BUG_ON(bad_range(zone, page)); | 1255 | VM_BUG_ON(bad_range(zone, page)); |
| 1240 | if (prep_new_page(page, order, gfp_flags)) | 1256 | if (prep_new_page(page, order, gfp_flags)) |
| @@ -1243,7 +1259,6 @@ again: | |||
| 1243 | 1259 | ||
| 1244 | failed: | 1260 | failed: |
| 1245 | local_irq_restore(flags); | 1261 | local_irq_restore(flags); |
| 1246 | put_cpu(); | ||
| 1247 | return NULL; | 1262 | return NULL; |
| 1248 | } | 1263 | } |
| 1249 | 1264 | ||
| @@ -1658,12 +1673,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 1658 | if (page) | 1673 | if (page) |
| 1659 | goto out; | 1674 | goto out; |
| 1660 | 1675 | ||
| 1661 | /* The OOM killer will not help higher order allocs */ | 1676 | if (!(gfp_mask & __GFP_NOFAIL)) { |
| 1662 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | 1677 | /* The OOM killer will not help higher order allocs */ |
| 1663 | goto out; | 1678 | if (order > PAGE_ALLOC_COSTLY_ORDER) |
| 1664 | 1679 | goto out; | |
| 1680 | /* | ||
| 1681 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | ||
| 1682 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | ||
| 1683 | * The caller should handle page allocation failure by itself if | ||
| 1684 | * it specifies __GFP_THISNODE. | ||
| 1685 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | ||
| 1686 | */ | ||
| 1687 | if (gfp_mask & __GFP_THISNODE) | ||
| 1688 | goto out; | ||
| 1689 | } | ||
| 1665 | /* Exhausted what can be done so it's blamo time */ | 1690 | /* Exhausted what can be done so it's blamo time */ |
| 1666 | out_of_memory(zonelist, gfp_mask, order); | 1691 | out_of_memory(zonelist, gfp_mask, order, nodemask); |
| 1667 | 1692 | ||
| 1668 | out: | 1693 | out: |
| 1669 | clear_zonelist_oom(zonelist, gfp_mask); | 1694 | clear_zonelist_oom(zonelist, gfp_mask); |
| @@ -2005,9 +2030,8 @@ void __pagevec_free(struct pagevec *pvec) | |||
| 2005 | void __free_pages(struct page *page, unsigned int order) | 2030 | void __free_pages(struct page *page, unsigned int order) |
| 2006 | { | 2031 | { |
| 2007 | if (put_page_testzero(page)) { | 2032 | if (put_page_testzero(page)) { |
| 2008 | trace_mm_page_free_direct(page, order); | ||
| 2009 | if (order == 0) | 2033 | if (order == 0) |
| 2010 | free_hot_page(page); | 2034 | free_hot_cold_page(page, 0); |
| 2011 | else | 2035 | else |
| 2012 | __free_pages_ok(page, order); | 2036 | __free_pages_ok(page, order); |
| 2013 | } | 2037 | } |
| @@ -2172,7 +2196,7 @@ void show_free_areas(void) | |||
| 2172 | for_each_online_cpu(cpu) { | 2196 | for_each_online_cpu(cpu) { |
| 2173 | struct per_cpu_pageset *pageset; | 2197 | struct per_cpu_pageset *pageset; |
| 2174 | 2198 | ||
| 2175 | pageset = zone_pcp(zone, cpu); | 2199 | pageset = per_cpu_ptr(zone->pageset, cpu); |
| 2176 | 2200 | ||
| 2177 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2201 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
| 2178 | cpu, pageset->pcp.high, | 2202 | cpu, pageset->pcp.high, |
| @@ -2263,7 +2287,7 @@ void show_free_areas(void) | |||
| 2263 | K(zone_page_state(zone, NR_BOUNCE)), | 2287 | K(zone_page_state(zone, NR_BOUNCE)), |
| 2264 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2288 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
| 2265 | zone->pages_scanned, | 2289 | zone->pages_scanned, |
| 2266 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2290 | (zone->all_unreclaimable ? "yes" : "no") |
| 2267 | ); | 2291 | ); |
| 2268 | printk("lowmem_reserve[]:"); | 2292 | printk("lowmem_reserve[]:"); |
| 2269 | for (i = 0; i < MAX_NR_ZONES; i++) | 2293 | for (i = 0; i < MAX_NR_ZONES; i++) |
| @@ -2395,13 +2419,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
| 2395 | { | 2419 | { |
| 2396 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 2420 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
| 2397 | int ret; | 2421 | int ret; |
| 2422 | static DEFINE_MUTEX(zl_order_mutex); | ||
| 2398 | 2423 | ||
| 2424 | mutex_lock(&zl_order_mutex); | ||
| 2399 | if (write) | 2425 | if (write) |
| 2400 | strncpy(saved_string, (char*)table->data, | 2426 | strcpy(saved_string, (char*)table->data); |
| 2401 | NUMA_ZONELIST_ORDER_LEN); | ||
| 2402 | ret = proc_dostring(table, write, buffer, length, ppos); | 2427 | ret = proc_dostring(table, write, buffer, length, ppos); |
| 2403 | if (ret) | 2428 | if (ret) |
| 2404 | return ret; | 2429 | goto out; |
| 2405 | if (write) { | 2430 | if (write) { |
| 2406 | int oldval = user_zonelist_order; | 2431 | int oldval = user_zonelist_order; |
| 2407 | if (__parse_numa_zonelist_order((char*)table->data)) { | 2432 | if (__parse_numa_zonelist_order((char*)table->data)) { |
| @@ -2414,7 +2439,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
| 2414 | } else if (oldval != user_zonelist_order) | 2439 | } else if (oldval != user_zonelist_order) |
| 2415 | build_all_zonelists(); | 2440 | build_all_zonelists(); |
| 2416 | } | 2441 | } |
| 2417 | return 0; | 2442 | out: |
| 2443 | mutex_unlock(&zl_order_mutex); | ||
| 2444 | return ret; | ||
| 2418 | } | 2445 | } |
| 2419 | 2446 | ||
| 2420 | 2447 | ||
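The hunk above closes a race between concurrent writers by serializing the whole read-modify-write of the zonelist-order string behind one static mutex. A generic sketch of that pattern for a string sysctl handler; the handler name is illustrative, and only the proc_dostring() call mirrors the patched code:

```c
#include <linux/mutex.h>
#include <linux/sysctl.h>

/* Generic sketch: hold one static mutex across parsing and acting on the
 * sysctl string so concurrent writes cannot interleave. */
static int example_string_handler(ctl_table *table, int write,
				  void __user *buffer, size_t *length,
				  loff_t *ppos)
{
	static DEFINE_MUTEX(handler_mutex);
	int ret;

	mutex_lock(&handler_mutex);
	ret = proc_dostring(table, write, buffer, length, ppos);
	/* ... validate and act on table->data while still holding the lock ... */
	mutex_unlock(&handler_mutex);
	return ret;
}
```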
| @@ -2734,10 +2761,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
| 2734 | 2761 | ||
| 2735 | #endif /* CONFIG_NUMA */ | 2762 | #endif /* CONFIG_NUMA */ |
| 2736 | 2763 | ||
| 2764 | /* | ||
| 2765 | * Boot pageset table. One per cpu which is going to be used for all | ||
| 2766 | * zones and all nodes. The parameters will be set in such a way | ||
| 2767 | * that an item put on a list will immediately be handed over to | ||
| 2768 | * the buddy list. This is safe since pageset manipulation is done | ||
| 2769 | * with interrupts disabled. | ||
| 2770 | * | ||
| 2771 | * The boot_pagesets must be kept even after bootup is complete for | ||
| 2772 | * unused processors and/or zones. They do play a role for bootstrapping | ||
| 2773 | * hotplugged processors. | ||
| 2774 | * | ||
| 2775 | * zoneinfo_show() and maybe other functions do | ||
| 2776 | * not check if the processor is online before following the pageset pointer. | ||
| 2777 | * Other parts of the kernel may not check if the zone is available. | ||
| 2778 | */ | ||
| 2779 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | ||
| 2780 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | ||
| 2781 | |||
| 2737 | /* return values int ....just for stop_machine() */ | 2782 | /* return values int ....just for stop_machine() */ |
| 2738 | static int __build_all_zonelists(void *dummy) | 2783 | static int __build_all_zonelists(void *dummy) |
| 2739 | { | 2784 | { |
| 2740 | int nid; | 2785 | int nid; |
| 2786 | int cpu; | ||
| 2741 | 2787 | ||
| 2742 | #ifdef CONFIG_NUMA | 2788 | #ifdef CONFIG_NUMA |
| 2743 | memset(node_load, 0, sizeof(node_load)); | 2789 | memset(node_load, 0, sizeof(node_load)); |
| @@ -2748,6 +2794,23 @@ static int __build_all_zonelists(void *dummy) | |||
| 2748 | build_zonelists(pgdat); | 2794 | build_zonelists(pgdat); |
| 2749 | build_zonelist_cache(pgdat); | 2795 | build_zonelist_cache(pgdat); |
| 2750 | } | 2796 | } |
| 2797 | |||
| 2798 | /* | ||
| 2799 | * Initialize the boot_pagesets that are going to be used | ||
| 2800 | * for bootstrapping processors. The real pagesets for | ||
| 2801 | * each zone will be allocated later when the per cpu | ||
| 2802 | * allocator is available. | ||
| 2803 | * | ||
| 2804 | * boot_pagesets are used also for bootstrapping offline | ||
| 2805 | * cpus if the system is already booted because the pagesets | ||
| 2806 | * are needed to initialize allocators on a specific cpu too. | ||
| 2807 | * F.e. the percpu allocator needs the page allocator which | ||
| 2808 | * needs the percpu allocator in order to allocate its pagesets | ||
| 2809 | * (a chicken-egg dilemma). | ||
| 2810 | */ | ||
| 2811 | for_each_possible_cpu(cpu) | ||
| 2812 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
| 2813 | |||
| 2751 | return 0; | 2814 | return 0; |
| 2752 | } | 2815 | } |
| 2753 | 2816 | ||
| @@ -3085,121 +3148,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
| 3085 | pcp->batch = PAGE_SHIFT * 8; | 3148 | pcp->batch = PAGE_SHIFT * 8; |
| 3086 | } | 3149 | } |
| 3087 | 3150 | ||
| 3088 | |||
| 3089 | #ifdef CONFIG_NUMA | ||
| 3090 | /* | 3151 | /* |
| 3091 | * Boot pageset table. One per cpu which is going to be used for all | 3152 | * Allocate per cpu pagesets and initialize them. |
| 3092 | * zones and all nodes. The parameters will be set in such a way | 3153 | * Before this call only boot pagesets were available. |
| 3093 | * that an item put on a list will immediately be handed over to | 3154 | * Boot pagesets will no longer be used by this processor |
| 3094 | * the buddy list. This is safe since pageset manipulation is done | 3155 | * after setup_per_cpu_pageset(). |
| 3095 | * with interrupts disabled. | ||
| 3096 | * | ||
| 3097 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
| 3098 | * | ||
| 3099 | * The boot_pagesets must be kept even after bootup is complete for | ||
| 3100 | * unused processors and/or zones. They do play a role for bootstrapping | ||
| 3101 | * hotplugged processors. | ||
| 3102 | * | ||
| 3103 | * zoneinfo_show() and maybe other functions do | ||
| 3104 | * not check if the processor is online before following the pageset pointer. | ||
| 3105 | * Other parts of the kernel may not check if the zone is available. | ||
| 3106 | */ | ||
| 3107 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | ||
| 3108 | |||
| 3109 | /* | ||
| 3110 | * Dynamically allocate memory for the | ||
| 3111 | * per cpu pageset array in struct zone. | ||
| 3112 | */ | 3156 | */ |
| 3113 | static int __cpuinit process_zones(int cpu) | 3157 | void __init setup_per_cpu_pageset(void) |
| 3114 | { | 3158 | { |
| 3115 | struct zone *zone, *dzone; | 3159 | struct zone *zone; |
| 3116 | int node = cpu_to_node(cpu); | 3160 | int cpu; |
| 3117 | |||
| 3118 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
| 3119 | 3161 | ||
| 3120 | for_each_populated_zone(zone) { | 3162 | for_each_populated_zone(zone) { |
| 3121 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 3163 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
| 3122 | GFP_KERNEL, node); | ||
| 3123 | if (!zone_pcp(zone, cpu)) | ||
| 3124 | goto bad; | ||
| 3125 | |||
| 3126 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | ||
| 3127 | |||
| 3128 | if (percpu_pagelist_fraction) | ||
| 3129 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
| 3130 | (zone->present_pages / percpu_pagelist_fraction)); | ||
| 3131 | } | ||
| 3132 | |||
| 3133 | return 0; | ||
| 3134 | bad: | ||
| 3135 | for_each_zone(dzone) { | ||
| 3136 | if (!populated_zone(dzone)) | ||
| 3137 | continue; | ||
| 3138 | if (dzone == zone) | ||
| 3139 | break; | ||
| 3140 | kfree(zone_pcp(dzone, cpu)); | ||
| 3141 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; | ||
| 3142 | } | ||
| 3143 | return -ENOMEM; | ||
| 3144 | } | ||
| 3145 | 3164 | ||
| 3146 | static inline void free_zone_pagesets(int cpu) | 3165 | for_each_possible_cpu(cpu) { |
| 3147 | { | 3166 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); |
| 3148 | struct zone *zone; | ||
| 3149 | 3167 | ||
| 3150 | for_each_zone(zone) { | 3168 | setup_pageset(pcp, zone_batchsize(zone)); |
| 3151 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
| 3152 | 3169 | ||
| 3153 | /* Free per_cpu_pageset if it is slab allocated */ | 3170 | if (percpu_pagelist_fraction) |
| 3154 | if (pset != &boot_pageset[cpu]) | 3171 | setup_pagelist_highmark(pcp, |
| 3155 | kfree(pset); | 3172 | (zone->present_pages / |
| 3156 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | 3173 | percpu_pagelist_fraction)); |
| 3157 | } | 3174 | } |
| 3158 | } | ||
| 3159 | |||
| 3160 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | ||
| 3161 | unsigned long action, | ||
| 3162 | void *hcpu) | ||
| 3163 | { | ||
| 3164 | int cpu = (long)hcpu; | ||
| 3165 | int ret = NOTIFY_OK; | ||
| 3166 | |||
| 3167 | switch (action) { | ||
| 3168 | case CPU_UP_PREPARE: | ||
| 3169 | case CPU_UP_PREPARE_FROZEN: | ||
| 3170 | if (process_zones(cpu)) | ||
| 3171 | ret = NOTIFY_BAD; | ||
| 3172 | break; | ||
| 3173 | case CPU_UP_CANCELED: | ||
| 3174 | case CPU_UP_CANCELED_FROZEN: | ||
| 3175 | case CPU_DEAD: | ||
| 3176 | case CPU_DEAD_FROZEN: | ||
| 3177 | free_zone_pagesets(cpu); | ||
| 3178 | break; | ||
| 3179 | default: | ||
| 3180 | break; | ||
| 3181 | } | 3175 | } |
| 3182 | return ret; | ||
| 3183 | } | 3176 | } |
| 3184 | 3177 | ||
| 3185 | static struct notifier_block __cpuinitdata pageset_notifier = | ||
| 3186 | { &pageset_cpuup_callback, NULL, 0 }; | ||
| 3187 | |||
| 3188 | void __init setup_per_cpu_pageset(void) | ||
| 3189 | { | ||
| 3190 | int err; | ||
| 3191 | |||
| 3192 | /* Initialize per_cpu_pageset for cpu 0. | ||
| 3193 | * A cpuup callback will do this for every cpu | ||
| 3194 | * as it comes online | ||
| 3195 | */ | ||
| 3196 | err = process_zones(smp_processor_id()); | ||
| 3197 | BUG_ON(err); | ||
| 3198 | register_cpu_notifier(&pageset_notifier); | ||
| 3199 | } | ||
| 3200 | |||
| 3201 | #endif | ||
| 3202 | |||
| 3203 | static noinline __init_refok | 3178 | static noinline __init_refok |
| 3204 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3179 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
| 3205 | { | 3180 | { |
| @@ -3253,7 +3228,7 @@ static int __zone_pcp_update(void *data) | |||
| 3253 | struct per_cpu_pageset *pset; | 3228 | struct per_cpu_pageset *pset; |
| 3254 | struct per_cpu_pages *pcp; | 3229 | struct per_cpu_pages *pcp; |
| 3255 | 3230 | ||
| 3256 | pset = zone_pcp(zone, cpu); | 3231 | pset = per_cpu_ptr(zone->pageset, cpu); |
| 3257 | pcp = &pset->pcp; | 3232 | pcp = &pset->pcp; |
| 3258 | 3233 | ||
| 3259 | local_irq_save(flags); | 3234 | local_irq_save(flags); |
| @@ -3271,21 +3246,17 @@ void zone_pcp_update(struct zone *zone) | |||
| 3271 | 3246 | ||
| 3272 | static __meminit void zone_pcp_init(struct zone *zone) | 3247 | static __meminit void zone_pcp_init(struct zone *zone) |
| 3273 | { | 3248 | { |
| 3274 | int cpu; | 3249 | /* |
| 3275 | unsigned long batch = zone_batchsize(zone); | 3250 | * per cpu subsystem is not up at this point. The following code |
| 3251 | * relies on the ability of the linker to provide the | ||
| 3252 | * offset of a (static) per cpu variable into the per cpu area. | ||
| 3253 | */ | ||
| 3254 | zone->pageset = &boot_pageset; | ||
| 3276 | 3255 | ||
| 3277 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 3278 | #ifdef CONFIG_NUMA | ||
| 3279 | /* Early boot. Slab allocator not functional yet */ | ||
| 3280 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
| 3281 | setup_pageset(&boot_pageset[cpu],0); | ||
| 3282 | #else | ||
| 3283 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
| 3284 | #endif | ||
| 3285 | } | ||
| 3286 | if (zone->present_pages) | 3256 | if (zone->present_pages) |
| 3287 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 3257 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
| 3288 | zone->name, zone->present_pages, batch); | 3258 | zone->name, zone->present_pages, |
| 3259 | zone_batchsize(zone)); | ||
| 3289 | } | 3260 | } |
| 3290 | 3261 | ||
| 3291 | __meminit int init_currently_empty_zone(struct zone *zone, | 3262 | __meminit int init_currently_empty_zone(struct zone *zone, |
| @@ -3424,6 +3395,61 @@ void __init free_bootmem_with_active_regions(int nid, | |||
| 3424 | } | 3395 | } |
| 3425 | } | 3396 | } |
| 3426 | 3397 | ||
| 3398 | int __init add_from_early_node_map(struct range *range, int az, | ||
| 3399 | int nr_range, int nid) | ||
| 3400 | { | ||
| 3401 | int i; | ||
| 3402 | u64 start, end; | ||
| 3403 | |||
| 3404 | /* need to go over early_node_map to find out good range for node */ | ||
| 3405 | for_each_active_range_index_in_nid(i, nid) { | ||
| 3406 | start = early_node_map[i].start_pfn; | ||
| 3407 | end = early_node_map[i].end_pfn; | ||
| 3408 | nr_range = add_range(range, az, nr_range, start, end); | ||
| 3409 | } | ||
| 3410 | return nr_range; | ||
| 3411 | } | ||
| 3412 | |||
| 3413 | #ifdef CONFIG_NO_BOOTMEM | ||
| 3414 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
| 3415 | u64 goal, u64 limit) | ||
| 3416 | { | ||
| 3417 | int i; | ||
| 3418 | void *ptr; | ||
| 3419 | |||
| 3420 | /* need to go over early_node_map to find out good range for node */ | ||
| 3421 | for_each_active_range_index_in_nid(i, nid) { | ||
| 3422 | u64 addr; | ||
| 3423 | u64 ei_start, ei_last; | ||
| 3424 | |||
| 3425 | ei_last = early_node_map[i].end_pfn; | ||
| 3426 | ei_last <<= PAGE_SHIFT; | ||
| 3427 | ei_start = early_node_map[i].start_pfn; | ||
| 3428 | ei_start <<= PAGE_SHIFT; | ||
| 3429 | addr = find_early_area(ei_start, ei_last, | ||
| 3430 | goal, limit, size, align); | ||
| 3431 | |||
| 3432 | if (addr == -1ULL) | ||
| 3433 | continue; | ||
| 3434 | |||
| 3435 | #if 0 | ||
| 3436 | printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", | ||
| 3437 | nid, | ||
| 3438 | ei_start, ei_last, goal, limit, size, | ||
| 3439 | align, addr); | ||
| 3440 | #endif | ||
| 3441 | |||
| 3442 | ptr = phys_to_virt(addr); | ||
| 3443 | memset(ptr, 0, size); | ||
| 3444 | reserve_early_without_check(addr, addr + size, "BOOTMEM"); | ||
| 3445 | return ptr; | ||
| 3446 | } | ||
| 3447 | |||
| 3448 | return NULL; | ||
| 3449 | } | ||
| 3450 | #endif | ||
| 3451 | |||
| 3452 | |||
| 3427 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3453 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
| 3428 | { | 3454 | { |
| 3429 | int i; | 3455 | int i; |
| @@ -3573,7 +3599,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, | |||
| 3573 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 3599 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
| 3574 | * then all holes in the requested range will be accounted for. | 3600 | * then all holes in the requested range will be accounted for. |
| 3575 | */ | 3601 | */ |
| 3576 | static unsigned long __meminit __absent_pages_in_range(int nid, | 3602 | unsigned long __meminit __absent_pages_in_range(int nid, |
| 3577 | unsigned long range_start_pfn, | 3603 | unsigned long range_start_pfn, |
| 3578 | unsigned long range_end_pfn) | 3604 | unsigned long range_end_pfn) |
| 3579 | { | 3605 | { |
| @@ -3988,7 +4014,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, | |||
| 3988 | } | 4014 | } |
| 3989 | 4015 | ||
| 3990 | /* Merge backward if suitable */ | 4016 | /* Merge backward if suitable */ |
| 3991 | if (start_pfn < early_node_map[i].end_pfn && | 4017 | if (start_pfn < early_node_map[i].start_pfn && |
| 3992 | end_pfn >= early_node_map[i].start_pfn) { | 4018 | end_pfn >= early_node_map[i].start_pfn) { |
| 3993 | early_node_map[i].start_pfn = start_pfn; | 4019 | early_node_map[i].start_pfn = start_pfn; |
| 3994 | return; | 4020 | return; |
| @@ -4102,7 +4128,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) | |||
| 4102 | } | 4128 | } |
| 4103 | 4129 | ||
| 4104 | /* sort the node_map by start_pfn */ | 4130 | /* sort the node_map by start_pfn */ |
| 4105 | static void __init sort_node_map(void) | 4131 | void __init sort_node_map(void) |
| 4106 | { | 4132 | { |
| 4107 | sort(early_node_map, (size_t)nr_nodemap_entries, | 4133 | sort(early_node_map, (size_t)nr_nodemap_entries, |
| 4108 | sizeof(struct node_active_region), | 4134 | sizeof(struct node_active_region), |
| @@ -4366,8 +4392,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 4366 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4392 | for (i = 0; i < MAX_NR_ZONES; i++) { |
| 4367 | if (i == ZONE_MOVABLE) | 4393 | if (i == ZONE_MOVABLE) |
| 4368 | continue; | 4394 | continue; |
| 4369 | printk(" %-8s %0#10lx -> %0#10lx\n", | 4395 | printk(" %-8s ", zone_names[i]); |
| 4370 | zone_names[i], | 4396 | if (arch_zone_lowest_possible_pfn[i] == |
| 4397 | arch_zone_highest_possible_pfn[i]) | ||
| 4398 | printk("empty\n"); | ||
| 4399 | else | ||
| 4400 | printk("%0#10lx -> %0#10lx\n", | ||
| 4371 | arch_zone_lowest_possible_pfn[i], | 4401 | arch_zone_lowest_possible_pfn[i], |
| 4372 | arch_zone_highest_possible_pfn[i]); | 4402 | arch_zone_highest_possible_pfn[i]); |
| 4373 | } | 4403 | } |
| @@ -4456,7 +4486,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
| 4456 | } | 4486 | } |
| 4457 | 4487 | ||
| 4458 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4488 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
| 4459 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4489 | struct pglist_data __refdata contig_page_data = { |
| 4490 | #ifndef CONFIG_NO_BOOTMEM | ||
| 4491 | .bdata = &bootmem_node_data[0] | ||
| 4492 | #endif | ||
| 4493 | }; | ||
| 4460 | EXPORT_SYMBOL(contig_page_data); | 4494 | EXPORT_SYMBOL(contig_page_data); |
| 4461 | #endif | 4495 | #endif |
| 4462 | 4496 | ||
| @@ -4799,10 +4833,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
| 4799 | if (!write || (ret == -EINVAL)) | 4833 | if (!write || (ret == -EINVAL)) |
| 4800 | return ret; | 4834 | return ret; |
| 4801 | for_each_populated_zone(zone) { | 4835 | for_each_populated_zone(zone) { |
| 4802 | for_each_online_cpu(cpu) { | 4836 | for_each_possible_cpu(cpu) { |
| 4803 | unsigned long high; | 4837 | unsigned long high; |
| 4804 | high = zone->present_pages / percpu_pagelist_fraction; | 4838 | high = zone->present_pages / percpu_pagelist_fraction; |
| 4805 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 4839 | setup_pagelist_highmark( |
| 4840 | per_cpu_ptr(zone->pageset, cpu), high); | ||
| 4806 | } | 4841 | } |
| 4807 | } | 4842 | } |
| 4808 | return 0; | 4843 | return 0; |
| @@ -5002,23 +5037,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
| 5002 | int set_migratetype_isolate(struct page *page) | 5037 | int set_migratetype_isolate(struct page *page) |
| 5003 | { | 5038 | { |
| 5004 | struct zone *zone; | 5039 | struct zone *zone; |
| 5005 | unsigned long flags; | 5040 | struct page *curr_page; |
| 5041 | unsigned long flags, pfn, iter; | ||
| 5042 | unsigned long immobile = 0; | ||
| 5043 | struct memory_isolate_notify arg; | ||
| 5044 | int notifier_ret; | ||
| 5006 | int ret = -EBUSY; | 5045 | int ret = -EBUSY; |
| 5007 | int zone_idx; | 5046 | int zone_idx; |
| 5008 | 5047 | ||
| 5009 | zone = page_zone(page); | 5048 | zone = page_zone(page); |
| 5010 | zone_idx = zone_idx(zone); | 5049 | zone_idx = zone_idx(zone); |
| 5050 | |||
| 5011 | spin_lock_irqsave(&zone->lock, flags); | 5051 | spin_lock_irqsave(&zone->lock, flags); |
| 5052 | if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE || | ||
| 5053 | zone_idx == ZONE_MOVABLE) { | ||
| 5054 | ret = 0; | ||
| 5055 | goto out; | ||
| 5056 | } | ||
| 5057 | |||
| 5058 | pfn = page_to_pfn(page); | ||
| 5059 | arg.start_pfn = pfn; | ||
| 5060 | arg.nr_pages = pageblock_nr_pages; | ||
| 5061 | arg.pages_found = 0; | ||
| 5062 | |||
| 5012 | /* | 5063 | /* |
| 5013 | * In future, more migrate types will be able to be isolation target. | 5064 | * It may be possible to isolate a pageblock even if the |
| 5065 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
| 5066 | * notifier chain is used by balloon drivers to return the | ||
| 5067 | * number of pages in a range that are held by the balloon | ||
| 5068 | * driver to shrink memory. If all the pages are accounted for | ||
| 5069 | * by balloons, are free, or on the LRU, isolation can continue. | ||
| 5070 | * Later, for example, when memory hotplug notifier runs, these | ||
| 5071 | * pages reported as "can be isolated" should be isolated(freed) | ||
| 5072 | * by the balloon driver through the memory notifier chain. | ||
| 5014 | */ | 5073 | */ |
| 5015 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && | 5074 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); |
| 5016 | zone_idx != ZONE_MOVABLE) | 5075 | notifier_ret = notifier_to_errno(notifier_ret); |
| 5076 | if (notifier_ret || !arg.pages_found) | ||
| 5017 | goto out; | 5077 | goto out; |
| 5018 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5078 | |
| 5019 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5079 | for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { |
| 5020 | ret = 0; | 5080 | if (!pfn_valid_within(pfn)) |
| 5081 | continue; | ||
| 5082 | |||
| 5083 | curr_page = pfn_to_page(iter); | ||
| 5084 | if (!page_count(curr_page) || PageLRU(curr_page)) | ||
| 5085 | continue; | ||
| 5086 | |||
| 5087 | immobile++; | ||
| 5088 | } | ||
| 5089 | |||
| 5090 | if (arg.pages_found == immobile) | ||
| 5091 | ret = 0; | ||
| 5092 | |||
| 5021 | out: | 5093 | out: |
| 5094 | if (!ret) { | ||
| 5095 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
| 5096 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
| 5097 | } | ||
| 5098 | |||
| 5022 | spin_unlock_irqrestore(&zone->lock, flags); | 5099 | spin_unlock_irqrestore(&zone->lock, flags); |
| 5023 | if (!ret) | 5100 | if (!ret) |
| 5024 | drain_all_pages(); | 5101 | drain_all_pages(); |
| @@ -5085,3 +5162,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | |||
| 5085 | spin_unlock_irqrestore(&zone->lock, flags); | 5162 | spin_unlock_irqrestore(&zone->lock, flags); |
| 5086 | } | 5163 | } |
| 5087 | #endif | 5164 | #endif |
| 5165 | |||
| 5166 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 5167 | bool is_free_buddy_page(struct page *page) | ||
| 5168 | { | ||
| 5169 | struct zone *zone = page_zone(page); | ||
| 5170 | unsigned long pfn = page_to_pfn(page); | ||
| 5171 | unsigned long flags; | ||
| 5172 | int order; | ||
| 5173 | |||
| 5174 | spin_lock_irqsave(&zone->lock, flags); | ||
| 5175 | for (order = 0; order < MAX_ORDER; order++) { | ||
| 5176 | struct page *page_head = page - (pfn & ((1 << order) - 1)); | ||
| 5177 | |||
| 5178 | if (PageBuddy(page_head) && page_order(page_head) >= order) | ||
| 5179 | break; | ||
| 5180 | } | ||
| 5181 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 5182 | |||
| 5183 | return order < MAX_ORDER; | ||
| 5184 | } | ||
| 5185 | #endif | ||
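The page_alloc.c hunks above replace the per-zone zone_pcp()/NR_CPUS pageset arrays with dynamically allocated per-cpu data (alloc_percpu() plus per_cpu_ptr()), while a single static boot_pageset carries every zone through early boot until setup_per_cpu_pageset() runs. The userspace C sketch below illustrates only that bootstrap-then-switch pattern; fake_zone, zone_pcp_ptr() and NCPUS are illustrative names, not kernel APIs.

/* Minimal userspace sketch (not kernel code): a zone-like object first
 * points at a single static bootstrap pageset, then switches to a
 * dynamically allocated per-CPU array once the allocator is available,
 * mirroring the boot_pageset -> alloc_percpu() transition in the diff. */
#include <stdio.h>
#include <stdlib.h>

#define NCPUS 4

struct pageset { int high, batch; };

struct fake_zone {
	struct pageset *pcp;	/* either &boot_pageset or the per-CPU array */
	int dynamic;		/* 0 while still on the bootstrap pageset */
};

static struct pageset boot_pageset;	/* shared early-boot slot */

static struct pageset *zone_pcp_ptr(struct fake_zone *z, int cpu)
{
	return z->dynamic ? &z->pcp[cpu] : &boot_pageset;
}

int main(void)
{
	struct fake_zone zone = { .pcp = &boot_pageset, .dynamic = 0 };

	/* early boot: every CPU shares the bootstrap pageset */
	zone_pcp_ptr(&zone, 0)->batch = 1;

	/* later: allocate real per-CPU pagesets, one per possible CPU */
	zone.pcp = calloc(NCPUS, sizeof(struct pageset));
	if (!zone.pcp)
		return 1;
	zone.dynamic = 1;
	for (int cpu = 0; cpu < NCPUS; cpu++)
		zone_pcp_ptr(&zone, cpu)->batch = 16;

	printf("cpu0 batch=%d\n", zone_pcp_ptr(&zone, 0)->batch);
	free(zone.pcp);
	return 0;
}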
diff --git a/mm/page_io.c b/mm/page_io.c index c6f3e5071de3..a19af956ee1b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -19,20 +19,15 @@ | |||
| 19 | #include <linux/writeback.h> | 19 | #include <linux/writeback.h> |
| 20 | #include <asm/pgtable.h> | 20 | #include <asm/pgtable.h> |
| 21 | 21 | ||
| 22 | static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, | 22 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
| 23 | struct page *page, bio_end_io_t end_io) | 23 | struct page *page, bio_end_io_t end_io) |
| 24 | { | 24 | { |
| 25 | struct bio *bio; | 25 | struct bio *bio; |
| 26 | 26 | ||
| 27 | bio = bio_alloc(gfp_flags, 1); | 27 | bio = bio_alloc(gfp_flags, 1); |
| 28 | if (bio) { | 28 | if (bio) { |
| 29 | struct swap_info_struct *sis; | 29 | bio->bi_sector = map_swap_page(page, &bio->bi_bdev); |
| 30 | swp_entry_t entry = { .val = index, }; | 30 | bio->bi_sector <<= PAGE_SHIFT - 9; |
| 31 | |||
| 32 | sis = get_swap_info_struct(swp_type(entry)); | ||
| 33 | bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * | ||
| 34 | (PAGE_SIZE >> 9); | ||
| 35 | bio->bi_bdev = sis->bdev; | ||
| 36 | bio->bi_io_vec[0].bv_page = page; | 31 | bio->bi_io_vec[0].bv_page = page; |
| 37 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; | 32 | bio->bi_io_vec[0].bv_len = PAGE_SIZE; |
| 38 | bio->bi_io_vec[0].bv_offset = 0; | 33 | bio->bi_io_vec[0].bv_offset = 0; |
| @@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
| 102 | unlock_page(page); | 97 | unlock_page(page); |
| 103 | goto out; | 98 | goto out; |
| 104 | } | 99 | } |
| 105 | bio = get_swap_bio(GFP_NOIO, page_private(page), page, | 100 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
| 106 | end_swap_bio_write); | ||
| 107 | if (bio == NULL) { | 101 | if (bio == NULL) { |
| 108 | set_page_dirty(page); | 102 | set_page_dirty(page); |
| 109 | unlock_page(page); | 103 | unlock_page(page); |
| @@ -127,8 +121,7 @@ int swap_readpage(struct page *page) | |||
| 127 | 121 | ||
| 128 | VM_BUG_ON(!PageLocked(page)); | 122 | VM_BUG_ON(!PageLocked(page)); |
| 129 | VM_BUG_ON(PageUptodate(page)); | 123 | VM_BUG_ON(PageUptodate(page)); |
| 130 | bio = get_swap_bio(GFP_KERNEL, page_private(page), page, | 124 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
| 131 | end_swap_bio_read); | ||
| 132 | if (bio == NULL) { | 125 | if (bio == NULL) { |
| 133 | unlock_page(page); | 126 | unlock_page(page); |
| 134 | ret = -ENOMEM; | 127 | ret = -ENOMEM; |
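In the page_io.c hunk above, get_swap_bio() no longer derives the device and sector from a swp_entry_t index; map_swap_page() now fills in bi_bdev and returns a page-granular offset that is shifted into 512-byte sectors with PAGE_SHIFT - 9. The small C program below only demonstrates that sector arithmetic under an assumed 4 KiB page size; it is an illustration, not kernel code.

/* Sketch of the sector arithmetic used by the new get_swap_bio():
 * a swap slot index expressed in pages becomes a 512-byte sector
 * number by shifting left by PAGE_SHIFT - 9 (8 sectors per 4 KiB page). */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12		/* 4 KiB pages assumed for this example */

static uint64_t swap_page_to_sector(uint64_t page_index)
{
	return page_index << (PAGE_SHIFT - 9);
}

int main(void)
{
	for (uint64_t idx = 0; idx < 4; idx++)
		printf("swap page %llu -> sector %llu\n",
		       (unsigned long long)idx,
		       (unsigned long long)swap_page_to_sector(idx));
	return 0;
}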
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878bed7841..7b47a57b6646 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
| @@ -1,6 +1,7 @@ | |||
| 1 | #include <linux/mm.h> | 1 | #include <linux/mm.h> |
| 2 | #include <linux/highmem.h> | 2 | #include <linux/highmem.h> |
| 3 | #include <linux/sched.h> | 3 | #include <linux/sched.h> |
| 4 | #include <linux/hugetlb.h> | ||
| 4 | 5 | ||
| 5 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | 6 | static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, |
| 6 | struct mm_walk *walk) | 7 | struct mm_walk *walk) |
| @@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
| 107 | pgd_t *pgd; | 108 | pgd_t *pgd; |
| 108 | unsigned long next; | 109 | unsigned long next; |
| 109 | int err = 0; | 110 | int err = 0; |
| 111 | struct vm_area_struct *vma; | ||
| 110 | 112 | ||
| 111 | if (addr >= end) | 113 | if (addr >= end) |
| 112 | return err; | 114 | return err; |
| @@ -117,11 +119,38 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
| 117 | pgd = pgd_offset(walk->mm, addr); | 119 | pgd = pgd_offset(walk->mm, addr); |
| 118 | do { | 120 | do { |
| 119 | next = pgd_addr_end(addr, end); | 121 | next = pgd_addr_end(addr, end); |
| 122 | |||
| 123 | /* | ||
| 124 | * handle hugetlb vma individually because pagetable walk for | ||
| 125 | * the hugetlb page is dependent on the architecture and | ||
| 126 | * we can't handled it in the same manner as non-huge pages. | ||
| 127 | */ | ||
| 128 | vma = find_vma(walk->mm, addr); | ||
| 129 | #ifdef CONFIG_HUGETLB_PAGE | ||
| 130 | if (vma && is_vm_hugetlb_page(vma)) { | ||
| 131 | pte_t *pte; | ||
| 132 | struct hstate *hs; | ||
| 133 | |||
| 134 | if (vma->vm_end < next) | ||
| 135 | next = vma->vm_end; | ||
| 136 | hs = hstate_vma(vma); | ||
| 137 | pte = huge_pte_offset(walk->mm, | ||
| 138 | addr & huge_page_mask(hs)); | ||
| 139 | if (pte && !huge_pte_none(huge_ptep_get(pte)) | ||
| 140 | && walk->hugetlb_entry) | ||
| 141 | err = walk->hugetlb_entry(pte, addr, | ||
| 142 | next, walk); | ||
| 143 | if (err) | ||
| 144 | break; | ||
| 145 | continue; | ||
| 146 | } | ||
| 147 | #endif | ||
| 120 | if (pgd_none_or_clear_bad(pgd)) { | 148 | if (pgd_none_or_clear_bad(pgd)) { |
| 121 | if (walk->pte_hole) | 149 | if (walk->pte_hole) |
| 122 | err = walk->pte_hole(addr, next, walk); | 150 | err = walk->pte_hole(addr, next, walk); |
| 123 | if (err) | 151 | if (err) |
| 124 | break; | 152 | break; |
| 153 | pgd++; | ||
| 125 | continue; | 154 | continue; |
| 126 | } | 155 | } |
| 127 | if (walk->pgd_entry) | 156 | if (walk->pgd_entry) |
| @@ -131,7 +160,8 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
| 131 | err = walk_pud_range(pgd, addr, next, walk); | 160 | err = walk_pud_range(pgd, addr, next, walk); |
| 132 | if (err) | 161 | if (err) |
| 133 | break; | 162 | break; |
| 134 | } while (pgd++, addr = next, addr != end); | 163 | pgd++; |
| 164 | } while (addr = next, addr != end); | ||
| 135 | 165 | ||
| 136 | return err; | 166 | return err; |
| 137 | } | 167 | } |
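The pagewalk.c hunk above special-cases hugetlb VMAs: the walk clips next to the VMA end, calls walk->hugetlb_entry(), and continues without descending the normal pgd/pud/pmd levels, so pgd is now advanced explicitly only on the paths that finished a full top-level range. The userspace sketch below mimics just that control-flow shape; CHUNK, special_end and cursor are illustrative stand-ins, not kernel structures.

/* Userspace sketch of the walk_page_range() control flow after the diff:
 * the outer cursor (standing in for the pgd pointer) advances only on
 * paths that consumed a full top-level chunk, while the "hugetlb"
 * special case may shorten `next` to the VMA end and continue without
 * advancing it. */
#include <stdio.h>

#define CHUNK 0x100UL			/* stand-in for a pgd-covered range */

static unsigned long chunk_end(unsigned long addr, unsigned long end)
{
	unsigned long next = (addr + CHUNK) & ~(CHUNK - 1);
	return next < end ? next : end;
}

int main(void)
{
	unsigned long addr = 0x000, end = 0x300, next;
	unsigned long special_end = 0x180;	/* pretend a hugetlb VMA ends here */
	int cursor = 0;				/* plays the role of pgd */

	do {
		next = chunk_end(addr, end);
		if (addr < special_end) {	/* hugetlb-like branch */
			if (special_end < next)
				next = special_end;	/* stop at the VMA end */
			printf("special [%#lx,%#lx) cursor=%d\n", addr, next, cursor);
			continue;		/* cursor NOT advanced */
		}
		printf("normal  [%#lx,%#lx) cursor=%d\n", addr, next, cursor);
		cursor++;			/* advanced only on normal paths */
	} while ((addr = next) != end);

	return 0;
}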
diff --git a/mm/percpu.c b/mm/percpu.c index 5adfc268b408..768419d44ad7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -46,8 +46,6 @@ | |||
| 46 | * | 46 | * |
| 47 | * To use this allocator, arch code should do the following. | 47 | * To use this allocator, arch code should do the following. |
| 48 | * | 48 | * |
| 49 | * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA | ||
| 50 | * | ||
| 51 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate | 49 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate |
| 52 | * regular address to percpu pointer and back if they need to be | 50 | * regular address to percpu pointer and back if they need to be |
| 53 | * different from the default | 51 | * different from the default |
| @@ -74,6 +72,7 @@ | |||
| 74 | #include <asm/cacheflush.h> | 72 | #include <asm/cacheflush.h> |
| 75 | #include <asm/sections.h> | 73 | #include <asm/sections.h> |
| 76 | #include <asm/tlbflush.h> | 74 | #include <asm/tlbflush.h> |
| 75 | #include <asm/io.h> | ||
| 77 | 76 | ||
| 78 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ | 77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ |
| 79 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ | 78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ |
| @@ -81,13 +80,15 @@ | |||
| 81 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ | 80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ |
| 82 | #ifndef __addr_to_pcpu_ptr | 81 | #ifndef __addr_to_pcpu_ptr |
| 83 | #define __addr_to_pcpu_ptr(addr) \ | 82 | #define __addr_to_pcpu_ptr(addr) \ |
| 84 | (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ | 83 | (void __percpu *)((unsigned long)(addr) - \ |
| 85 | + (unsigned long)__per_cpu_start) | 84 | (unsigned long)pcpu_base_addr + \ |
| 85 | (unsigned long)__per_cpu_start) | ||
| 86 | #endif | 86 | #endif |
| 87 | #ifndef __pcpu_ptr_to_addr | 87 | #ifndef __pcpu_ptr_to_addr |
| 88 | #define __pcpu_ptr_to_addr(ptr) \ | 88 | #define __pcpu_ptr_to_addr(ptr) \ |
| 89 | (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ | 89 | (void __force *)((unsigned long)(ptr) + \ |
| 90 | - (unsigned long)__per_cpu_start) | 90 | (unsigned long)pcpu_base_addr - \ |
| 91 | (unsigned long)__per_cpu_start) | ||
| 91 | #endif | 92 | #endif |
| 92 | 93 | ||
| 93 | struct pcpu_chunk { | 94 | struct pcpu_chunk { |
| @@ -914,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
| 914 | int rs, re; | 915 | int rs, re; |
| 915 | 916 | ||
| 916 | /* quick path, check whether it's empty already */ | 917 | /* quick path, check whether it's empty already */ |
| 917 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 918 | rs = page_start; |
| 918 | if (rs == page_start && re == page_end) | 919 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
| 919 | return; | 920 | if (rs == page_start && re == page_end) |
| 920 | break; | 921 | return; |
| 921 | } | ||
| 922 | 922 | ||
| 923 | /* immutable chunks can't be depopulated */ | 923 | /* immutable chunks can't be depopulated */ |
| 924 | WARN_ON(chunk->immutable); | 924 | WARN_ON(chunk->immutable); |
| @@ -969,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
| 969 | int rs, re, rc; | 969 | int rs, re, rc; |
| 970 | 970 | ||
| 971 | /* quick path, check whether all pages are already there */ | 971 | /* quick path, check whether all pages are already there */ |
| 972 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { | 972 | rs = page_start; |
| 973 | if (rs == page_start && re == page_end) | 973 | pcpu_next_pop(chunk, &rs, &re, page_end); |
| 974 | goto clear; | 974 | if (rs == page_start && re == page_end) |
| 975 | break; | 975 | goto clear; |
| 976 | } | ||
| 977 | 976 | ||
| 978 | /* need to allocate and map pages, this chunk can't be immutable */ | 977 | /* need to allocate and map pages, this chunk can't be immutable */ |
| 979 | WARN_ON(chunk->immutable); | 978 | WARN_ON(chunk->immutable); |
| @@ -1068,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
| 1068 | * RETURNS: | 1067 | * RETURNS: |
| 1069 | * Percpu pointer to the allocated area on success, NULL on failure. | 1068 | * Percpu pointer to the allocated area on success, NULL on failure. |
| 1070 | */ | 1069 | */ |
| 1071 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) | 1070 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) |
| 1072 | { | 1071 | { |
| 1073 | static int warn_limit = 10; | 1072 | static int warn_limit = 10; |
| 1074 | struct pcpu_chunk *chunk; | 1073 | struct pcpu_chunk *chunk; |
| @@ -1197,7 +1196,7 @@ fail_unlock_mutex: | |||
| 1197 | * RETURNS: | 1196 | * RETURNS: |
| 1198 | * Percpu pointer to the allocated area on success, NULL on failure. | 1197 | * Percpu pointer to the allocated area on success, NULL on failure. |
| 1199 | */ | 1198 | */ |
| 1200 | void *__alloc_percpu(size_t size, size_t align) | 1199 | void __percpu *__alloc_percpu(size_t size, size_t align) |
| 1201 | { | 1200 | { |
| 1202 | return pcpu_alloc(size, align, false); | 1201 | return pcpu_alloc(size, align, false); |
| 1203 | } | 1202 | } |
| @@ -1218,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); | |||
| 1218 | * RETURNS: | 1217 | * RETURNS: |
| 1219 | * Percpu pointer to the allocated area on success, NULL on failure. | 1218 | * Percpu pointer to the allocated area on success, NULL on failure. |
| 1220 | */ | 1219 | */ |
| 1221 | void *__alloc_reserved_percpu(size_t size, size_t align) | 1220 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) |
| 1222 | { | 1221 | { |
| 1223 | return pcpu_alloc(size, align, true); | 1222 | return pcpu_alloc(size, align, true); |
| 1224 | } | 1223 | } |
| @@ -1270,9 +1269,9 @@ static void pcpu_reclaim(struct work_struct *work) | |||
| 1270 | * CONTEXT: | 1269 | * CONTEXT: |
| 1271 | * Can be called from atomic context. | 1270 | * Can be called from atomic context. |
| 1272 | */ | 1271 | */ |
| 1273 | void free_percpu(void *ptr) | 1272 | void free_percpu(void __percpu *ptr) |
| 1274 | { | 1273 | { |
| 1275 | void *addr = __pcpu_ptr_to_addr(ptr); | 1274 | void *addr; |
| 1276 | struct pcpu_chunk *chunk; | 1275 | struct pcpu_chunk *chunk; |
| 1277 | unsigned long flags; | 1276 | unsigned long flags; |
| 1278 | int off; | 1277 | int off; |
| @@ -1280,6 +1279,8 @@ void free_percpu(void *ptr) | |||
| 1280 | if (!ptr) | 1279 | if (!ptr) |
| 1281 | return; | 1280 | return; |
| 1282 | 1281 | ||
| 1282 | addr = __pcpu_ptr_to_addr(ptr); | ||
| 1283 | |||
| 1283 | spin_lock_irqsave(&pcpu_lock, flags); | 1284 | spin_lock_irqsave(&pcpu_lock, flags); |
| 1284 | 1285 | ||
| 1285 | chunk = pcpu_chunk_addr_search(addr); | 1286 | chunk = pcpu_chunk_addr_search(addr); |
| @@ -1302,6 +1303,27 @@ void free_percpu(void *ptr) | |||
| 1302 | } | 1303 | } |
| 1303 | EXPORT_SYMBOL_GPL(free_percpu); | 1304 | EXPORT_SYMBOL_GPL(free_percpu); |
| 1304 | 1305 | ||
| 1306 | /** | ||
| 1307 | * per_cpu_ptr_to_phys - convert translated percpu address to physical address | ||
| 1308 | * @addr: the address to be converted to physical address | ||
| 1309 | * | ||
| 1310 | * Given @addr which is dereferenceable address obtained via one of | ||
| 1311 | * percpu access macros, this function translates it into its physical | ||
| 1312 | * address. The caller is responsible for ensuring @addr stays valid | ||
| 1313 | * until this function finishes. | ||
| 1314 | * | ||
| 1315 | * RETURNS: | ||
| 1316 | * The physical address for @addr. | ||
| 1317 | */ | ||
| 1318 | phys_addr_t per_cpu_ptr_to_phys(void *addr) | ||
| 1319 | { | ||
| 1320 | if ((unsigned long)addr < VMALLOC_START || | ||
| 1321 | (unsigned long)addr >= VMALLOC_END) | ||
| 1322 | return __pa(addr); | ||
| 1323 | else | ||
| 1324 | return page_to_phys(vmalloc_to_page(addr)); | ||
| 1325 | } | ||
| 1326 | |||
| 1305 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, | 1327 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, |
| 1306 | size_t reserved_size, | 1328 | size_t reserved_size, |
| 1307 | ssize_t *dyn_sizep) | 1329 | ssize_t *dyn_sizep) |
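The percpu.c changes above add __percpu address-space annotations and keep the __addr_to_pcpu_ptr()/__pcpu_ptr_to_addr() pair, which treats a percpu pointer as an offset relative to a per-CPU base address. The userspace sketch below shows that offset-plus-base translation in miniature; pcpu_base, UNIT_SIZE and the cookie type are illustrative assumptions rather than the kernel's actual layout.

/* Userspace sketch of the pointer<->address translation performed by the
 * percpu macros in this hunk: a "percpu pointer" is really an offset,
 * and per_cpu_ptr() adds the chosen CPU's base back in. */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define NCPUS     4
#define UNIT_SIZE 4096			/* size of one CPU's copy of the area */

static char *pcpu_base[NCPUS];		/* per-CPU base addresses */

/* turn a CPU-0 address into an opaque offset-style cookie */
static uintptr_t addr_to_pcpu_ptr(void *addr)
{
	return (uintptr_t)addr - (uintptr_t)pcpu_base[0];
}

/* turn the cookie back into a real address for a given CPU */
static void *pcpu_ptr_to_addr(uintptr_t ptr, int cpu)
{
	return pcpu_base[cpu] + ptr;
}

int main(void)
{
	for (int cpu = 0; cpu < NCPUS; cpu++)
		pcpu_base[cpu] = malloc(UNIT_SIZE);

	/* "allocate" an int at offset 128 within each CPU's unit */
	uintptr_t cookie = addr_to_pcpu_ptr(pcpu_base[0] + 128);

	for (int cpu = 0; cpu < NCPUS; cpu++)
		*(int *)pcpu_ptr_to_addr(cookie, cpu) = cpu * 10;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		printf("cpu%d value=%d\n", cpu,
		       *(int *)pcpu_ptr_to_addr(cookie, cpu));

	for (int cpu = 0; cpu < NCPUS; cpu++)
		free(pcpu_base[cpu]);
	return 0;
}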
diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa2345235..337b20e946f6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -501,6 +501,12 @@ void page_cache_sync_readahead(struct address_space *mapping, | |||
| 501 | if (!ra->ra_pages) | 501 | if (!ra->ra_pages) |
| 502 | return; | 502 | return; |
| 503 | 503 | ||
| 504 | /* be dumb */ | ||
| 505 | if (filp->f_mode & FMODE_RANDOM) { | ||
| 506 | force_page_cache_readahead(mapping, filp, offset, req_size); | ||
| 507 | return; | ||
| 508 | } | ||
| 509 | |||
| 504 | /* do read-ahead */ | 510 | /* do read-ahead */ |
| 505 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); | 511 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); |
| 506 | } | 512 | } |
| @@ -547,5 +553,17 @@ page_cache_async_readahead(struct address_space *mapping, | |||
| 547 | 553 | ||
| 548 | /* do read-ahead */ | 554 | /* do read-ahead */ |
| 549 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); | 555 | ondemand_readahead(mapping, ra, filp, true, offset, req_size); |
| 556 | |||
| 557 | #ifdef CONFIG_BLOCK | ||
| 558 | /* | ||
| 559 | * Normally the current page is !uptodate and lock_page() will be | ||
| 560 | * immediately called to implicitly unplug the device. However this | ||
| 561 | * is not always true for RAID configurations, where data arrives | ||
| 562 | * not strictly in their submission order. In this case we need to | ||
| 563 | * explicitly kick off the IO. | ||
| 564 | */ | ||
| 565 | if (PageUptodate(page)) | ||
| 566 | blk_run_backing_dev(mapping->backing_dev_info, NULL); | ||
| 567 | #endif | ||
| 550 | } | 568 | } |
| 551 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); | 569 | EXPORT_SYMBOL_GPL(page_cache_async_readahead); |
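The readahead.c hunks above add two behaviors: files flagged FMODE_RANDOM (presumably set via posix_fadvise(POSIX_FADV_RANDOM)) bypass the heuristics and get a forced read of exactly the requested size, and an already-uptodate page triggers blk_run_backing_dev() so out-of-order RAID completions still get their I/O kicked. The C sketch below models only the first decision; heuristic_window() and struct file_hint are hypothetical stand-ins for the kernel's state.

/* Sketch of the readahead decision added in this hunk: when a file is
 * hinted as random-access, the heuristic window logic is bypassed and
 * only the requested pages are read ("be dumb"). */
#include <stdio.h>
#include <stdbool.h>

struct file_hint { bool random; };

static unsigned long heuristic_window(unsigned long req)
{
	return req * 4;			/* pretend heuristic: grow the window */
}

static unsigned long readahead_pages(const struct file_hint *f,
				     unsigned long req)
{
	if (f->random)
		return req;		/* exactly what was asked for */
	return heuristic_window(req);	/* normal on-demand readahead */
}

int main(void)
{
	struct file_hint seq = { .random = false };
	struct file_hint rnd = { .random = true };

	printf("sequential: request 8 -> readahead %lu pages\n",
	       readahead_pages(&seq, 8));
	printf("random:     request 8 -> readahead %lu pages\n",
	       readahead_pages(&rnd, 8));
	return 0;
}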
| @@ -49,6 +49,7 @@ | |||
| 49 | #include <linux/swapops.h> | 49 | #include <linux/swapops.h> |
| 50 | #include <linux/slab.h> | 50 | #include <linux/slab.h> |
| 51 | #include <linux/init.h> | 51 | #include <linux/init.h> |
| 52 | #include <linux/ksm.h> | ||
| 52 | #include <linux/rmap.h> | 53 | #include <linux/rmap.h> |
| 53 | #include <linux/rcupdate.h> | 54 | #include <linux/rcupdate.h> |
| 54 | #include <linux/module.h> | 55 | #include <linux/module.h> |
| @@ -61,17 +62,28 @@ | |||
| 61 | #include "internal.h" | 62 | #include "internal.h" |
| 62 | 63 | ||
| 63 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
| 65 | static struct kmem_cache *anon_vma_chain_cachep; | ||
| 64 | 66 | ||
| 65 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
| 66 | { | 68 | { |
| 67 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); | 69 | return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); |
| 68 | } | 70 | } |
| 69 | 71 | ||
| 70 | static inline void anon_vma_free(struct anon_vma *anon_vma) | 72 | void anon_vma_free(struct anon_vma *anon_vma) |
| 71 | { | 73 | { |
| 72 | kmem_cache_free(anon_vma_cachep, anon_vma); | 74 | kmem_cache_free(anon_vma_cachep, anon_vma); |
| 73 | } | 75 | } |
| 74 | 76 | ||
| 77 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | ||
| 78 | { | ||
| 79 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | ||
| 80 | } | ||
| 81 | |||
| 82 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | ||
| 83 | { | ||
| 84 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | ||
| 85 | } | ||
| 86 | |||
| 75 | /** | 87 | /** |
| 76 | * anon_vma_prepare - attach an anon_vma to a memory region | 88 | * anon_vma_prepare - attach an anon_vma to a memory region |
| 77 | * @vma: the memory region in question | 89 | * @vma: the memory region in question |
| @@ -102,18 +114,23 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) | |||
| 102 | int anon_vma_prepare(struct vm_area_struct *vma) | 114 | int anon_vma_prepare(struct vm_area_struct *vma) |
| 103 | { | 115 | { |
| 104 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
| 117 | struct anon_vma_chain *avc; | ||
| 105 | 118 | ||
| 106 | might_sleep(); | 119 | might_sleep(); |
| 107 | if (unlikely(!anon_vma)) { | 120 | if (unlikely(!anon_vma)) { |
| 108 | struct mm_struct *mm = vma->vm_mm; | 121 | struct mm_struct *mm = vma->vm_mm; |
| 109 | struct anon_vma *allocated; | 122 | struct anon_vma *allocated; |
| 110 | 123 | ||
| 124 | avc = anon_vma_chain_alloc(); | ||
| 125 | if (!avc) | ||
| 126 | goto out_enomem; | ||
| 127 | |||
| 111 | anon_vma = find_mergeable_anon_vma(vma); | 128 | anon_vma = find_mergeable_anon_vma(vma); |
| 112 | allocated = NULL; | 129 | allocated = NULL; |
| 113 | if (!anon_vma) { | 130 | if (!anon_vma) { |
| 114 | anon_vma = anon_vma_alloc(); | 131 | anon_vma = anon_vma_alloc(); |
| 115 | if (unlikely(!anon_vma)) | 132 | if (unlikely(!anon_vma)) |
| 116 | return -ENOMEM; | 133 | goto out_enomem_free_avc; |
| 117 | allocated = anon_vma; | 134 | allocated = anon_vma; |
| 118 | } | 135 | } |
| 119 | spin_lock(&anon_vma->lock); | 136 | spin_lock(&anon_vma->lock); |
| @@ -122,67 +139,140 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
| 122 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
| 123 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
| 124 | vma->anon_vma = anon_vma; | 141 | vma->anon_vma = anon_vma; |
| 125 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 142 | avc->anon_vma = anon_vma; |
| 143 | avc->vma = vma; | ||
| 144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
| 145 | list_add(&avc->same_anon_vma, &anon_vma->head); | ||
| 126 | allocated = NULL; | 146 | allocated = NULL; |
| 127 | } | 147 | } |
| 128 | spin_unlock(&mm->page_table_lock); | 148 | spin_unlock(&mm->page_table_lock); |
| 129 | 149 | ||
| 130 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
| 131 | if (unlikely(allocated)) | 151 | if (unlikely(allocated)) { |
| 132 | anon_vma_free(allocated); | 152 | anon_vma_free(allocated); |
| 153 | anon_vma_chain_free(avc); | ||
| 154 | } | ||
| 133 | } | 155 | } |
| 134 | return 0; | 156 | return 0; |
| 157 | |||
| 158 | out_enomem_free_avc: | ||
| 159 | anon_vma_chain_free(avc); | ||
| 160 | out_enomem: | ||
| 161 | return -ENOMEM; | ||
| 135 | } | 162 | } |
| 136 | 163 | ||
| 137 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 164 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
| 165 | struct anon_vma_chain *avc, | ||
| 166 | struct anon_vma *anon_vma) | ||
| 138 | { | 167 | { |
| 139 | BUG_ON(vma->anon_vma != next->anon_vma); | 168 | avc->vma = vma; |
| 140 | list_del(&next->anon_vma_node); | 169 | avc->anon_vma = anon_vma; |
| 170 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
| 171 | |||
| 172 | spin_lock(&anon_vma->lock); | ||
| 173 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
| 174 | spin_unlock(&anon_vma->lock); | ||
| 141 | } | 175 | } |
| 142 | 176 | ||
| 143 | void __anon_vma_link(struct vm_area_struct *vma) | 177 | /* |
| 178 | * Attach the anon_vmas from src to dst. | ||
| 179 | * Returns 0 on success, -ENOMEM on failure. | ||
| 180 | */ | ||
| 181 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | ||
| 144 | { | 182 | { |
| 145 | struct anon_vma *anon_vma = vma->anon_vma; | 183 | struct anon_vma_chain *avc, *pavc; |
| 184 | |||
| 185 | list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { | ||
| 186 | avc = anon_vma_chain_alloc(); | ||
| 187 | if (!avc) | ||
| 188 | goto enomem_failure; | ||
| 189 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | ||
| 190 | } | ||
| 191 | return 0; | ||
| 146 | 192 | ||
| 147 | if (anon_vma) | 193 | enomem_failure: |
| 148 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 194 | unlink_anon_vmas(dst); |
| 195 | return -ENOMEM; | ||
| 149 | } | 196 | } |
| 150 | 197 | ||
| 151 | void anon_vma_link(struct vm_area_struct *vma) | 198 | /* |
| 199 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | ||
| 200 | * the corresponding VMA in the parent process is attached to. | ||
| 201 | * Returns 0 on success, non-zero on failure. | ||
| 202 | */ | ||
| 203 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | ||
| 152 | { | 204 | { |
| 153 | struct anon_vma *anon_vma = vma->anon_vma; | 205 | struct anon_vma_chain *avc; |
| 206 | struct anon_vma *anon_vma; | ||
| 154 | 207 | ||
| 155 | if (anon_vma) { | 208 | /* Don't bother if the parent process has no anon_vma here. */ |
| 156 | spin_lock(&anon_vma->lock); | 209 | if (!pvma->anon_vma) |
| 157 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 210 | return 0; |
| 158 | spin_unlock(&anon_vma->lock); | 211 | |
| 159 | } | 212 | /* |
| 213 | * First, attach the new VMA to the parent VMA's anon_vmas, | ||
| 214 | * so rmap can find non-COWed pages in child processes. | ||
| 215 | */ | ||
| 216 | if (anon_vma_clone(vma, pvma)) | ||
| 217 | return -ENOMEM; | ||
| 218 | |||
| 219 | /* Then add our own anon_vma. */ | ||
| 220 | anon_vma = anon_vma_alloc(); | ||
| 221 | if (!anon_vma) | ||
| 222 | goto out_error; | ||
| 223 | avc = anon_vma_chain_alloc(); | ||
| 224 | if (!avc) | ||
| 225 | goto out_error_free_anon_vma; | ||
| 226 | anon_vma_chain_link(vma, avc, anon_vma); | ||
| 227 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | ||
| 228 | vma->anon_vma = anon_vma; | ||
| 229 | |||
| 230 | return 0; | ||
| 231 | |||
| 232 | out_error_free_anon_vma: | ||
| 233 | anon_vma_free(anon_vma); | ||
| 234 | out_error: | ||
| 235 | return -ENOMEM; | ||
| 160 | } | 236 | } |
| 161 | 237 | ||
| 162 | void anon_vma_unlink(struct vm_area_struct *vma) | 238 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) |
| 163 | { | 239 | { |
| 164 | struct anon_vma *anon_vma = vma->anon_vma; | 240 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; |
| 165 | int empty; | 241 | int empty; |
| 166 | 242 | ||
| 243 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
| 167 | if (!anon_vma) | 244 | if (!anon_vma) |
| 168 | return; | 245 | return; |
| 169 | 246 | ||
| 170 | spin_lock(&anon_vma->lock); | 247 | spin_lock(&anon_vma->lock); |
| 171 | list_del(&vma->anon_vma_node); | 248 | list_del(&anon_vma_chain->same_anon_vma); |
| 172 | 249 | ||
| 173 | /* We must garbage collect the anon_vma if it's empty */ | 250 | /* We must garbage collect the anon_vma if it's empty */ |
| 174 | empty = list_empty(&anon_vma->head); | 251 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
| 175 | spin_unlock(&anon_vma->lock); | 252 | spin_unlock(&anon_vma->lock); |
| 176 | 253 | ||
| 177 | if (empty) | 254 | if (empty) |
| 178 | anon_vma_free(anon_vma); | 255 | anon_vma_free(anon_vma); |
| 179 | } | 256 | } |
| 180 | 257 | ||
| 258 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
| 259 | { | ||
| 260 | struct anon_vma_chain *avc, *next; | ||
| 261 | |||
| 262 | /* Unlink each anon_vma chained to the VMA. */ | ||
| 263 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
| 264 | anon_vma_unlink(avc); | ||
| 265 | list_del(&avc->same_vma); | ||
| 266 | anon_vma_chain_free(avc); | ||
| 267 | } | ||
| 268 | } | ||
| 269 | |||
| 181 | static void anon_vma_ctor(void *data) | 270 | static void anon_vma_ctor(void *data) |
| 182 | { | 271 | { |
| 183 | struct anon_vma *anon_vma = data; | 272 | struct anon_vma *anon_vma = data; |
| 184 | 273 | ||
| 185 | spin_lock_init(&anon_vma->lock); | 274 | spin_lock_init(&anon_vma->lock); |
| 275 | ksm_refcount_init(anon_vma); | ||
| 186 | INIT_LIST_HEAD(&anon_vma->head); | 276 | INIT_LIST_HEAD(&anon_vma->head); |
| 187 | } | 277 | } |
| 188 | 278 | ||
| @@ -190,6 +280,7 @@ void __init anon_vma_init(void) | |||
| 190 | { | 280 | { |
| 191 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 281 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
| 192 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 282 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
| 283 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | ||
| 193 | } | 284 | } |
| 194 | 285 | ||
| 195 | /* | 286 | /* |
| @@ -202,8 +293,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page) | |||
| 202 | unsigned long anon_mapping; | 293 | unsigned long anon_mapping; |
| 203 | 294 | ||
| 204 | rcu_read_lock(); | 295 | rcu_read_lock(); |
| 205 | anon_mapping = (unsigned long) page->mapping; | 296 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); |
| 206 | if (!(anon_mapping & PAGE_MAPPING_ANON)) | 297 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
| 207 | goto out; | 298 | goto out; |
| 208 | if (!page_mapped(page)) | 299 | if (!page_mapped(page)) |
| 209 | goto out; | 300 | goto out; |
| @@ -248,8 +339,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
| 248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 339 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
| 249 | { | 340 | { |
| 250 | if (PageAnon(page)) { | 341 | if (PageAnon(page)) { |
| 251 | if ((void *)vma->anon_vma != | 342 | if (vma->anon_vma != page_anon_vma(page)) |
| 252 | (void *)page->mapping - PAGE_MAPPING_ANON) | ||
| 253 | return -EFAULT; | 343 | return -EFAULT; |
| 254 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | 344 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
| 255 | if (!vma->vm_file || | 345 | if (!vma->vm_file || |
| @@ -337,21 +427,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 337 | * Subfunctions of page_referenced: page_referenced_one called | 427 | * Subfunctions of page_referenced: page_referenced_one called |
| 338 | * repeatedly from either page_referenced_anon or page_referenced_file. | 428 | * repeatedly from either page_referenced_anon or page_referenced_file. |
| 339 | */ | 429 | */ |
| 340 | static int page_referenced_one(struct page *page, | 430 | int page_referenced_one(struct page *page, struct vm_area_struct *vma, |
| 341 | struct vm_area_struct *vma, | 431 | unsigned long address, unsigned int *mapcount, |
| 342 | unsigned int *mapcount, | 432 | unsigned long *vm_flags) |
| 343 | unsigned long *vm_flags) | ||
| 344 | { | 433 | { |
| 345 | struct mm_struct *mm = vma->vm_mm; | 434 | struct mm_struct *mm = vma->vm_mm; |
| 346 | unsigned long address; | ||
| 347 | pte_t *pte; | 435 | pte_t *pte; |
| 348 | spinlock_t *ptl; | 436 | spinlock_t *ptl; |
| 349 | int referenced = 0; | 437 | int referenced = 0; |
| 350 | 438 | ||
| 351 | address = vma_address(page, vma); | ||
| 352 | if (address == -EFAULT) | ||
| 353 | goto out; | ||
| 354 | |||
| 355 | pte = page_check_address(page, mm, address, &ptl, 0); | 439 | pte = page_check_address(page, mm, address, &ptl, 0); |
| 356 | if (!pte) | 440 | if (!pte) |
| 357 | goto out; | 441 | goto out; |
| @@ -388,9 +472,10 @@ static int page_referenced_one(struct page *page, | |||
| 388 | out_unmap: | 472 | out_unmap: |
| 389 | (*mapcount)--; | 473 | (*mapcount)--; |
| 390 | pte_unmap_unlock(pte, ptl); | 474 | pte_unmap_unlock(pte, ptl); |
| 391 | out: | 475 | |
| 392 | if (referenced) | 476 | if (referenced) |
| 393 | *vm_flags |= vma->vm_flags; | 477 | *vm_flags |= vma->vm_flags; |
| 478 | out: | ||
| 394 | return referenced; | 479 | return referenced; |
| 395 | } | 480 | } |
| 396 | 481 | ||
| @@ -400,7 +485,7 @@ static int page_referenced_anon(struct page *page, | |||
| 400 | { | 485 | { |
| 401 | unsigned int mapcount; | 486 | unsigned int mapcount; |
| 402 | struct anon_vma *anon_vma; | 487 | struct anon_vma *anon_vma; |
| 403 | struct vm_area_struct *vma; | 488 | struct anon_vma_chain *avc; |
| 404 | int referenced = 0; | 489 | int referenced = 0; |
| 405 | 490 | ||
| 406 | anon_vma = page_lock_anon_vma(page); | 491 | anon_vma = page_lock_anon_vma(page); |
| @@ -408,7 +493,11 @@ static int page_referenced_anon(struct page *page, | |||
| 408 | return referenced; | 493 | return referenced; |
| 409 | 494 | ||
| 410 | mapcount = page_mapcount(page); | 495 | mapcount = page_mapcount(page); |
| 411 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 496 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
| 497 | struct vm_area_struct *vma = avc->vma; | ||
| 498 | unsigned long address = vma_address(page, vma); | ||
| 499 | if (address == -EFAULT) | ||
| 500 | continue; | ||
| 412 | /* | 501 | /* |
| 413 | * If we are reclaiming on behalf of a cgroup, skip | 502 | * If we are reclaiming on behalf of a cgroup, skip |
| 414 | * counting on behalf of references from different | 503 | * counting on behalf of references from different |
| @@ -416,7 +505,7 @@ static int page_referenced_anon(struct page *page, | |||
| 416 | */ | 505 | */ |
| 417 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 506 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
| 418 | continue; | 507 | continue; |
| 419 | referenced += page_referenced_one(page, vma, | 508 | referenced += page_referenced_one(page, vma, address, |
| 420 | &mapcount, vm_flags); | 509 | &mapcount, vm_flags); |
| 421 | if (!mapcount) | 510 | if (!mapcount) |
| 422 | break; | 511 | break; |
| @@ -474,6 +563,9 @@ static int page_referenced_file(struct page *page, | |||
| 474 | mapcount = page_mapcount(page); | 563 | mapcount = page_mapcount(page); |
| 475 | 564 | ||
| 476 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 565 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 566 | unsigned long address = vma_address(page, vma); | ||
| 567 | if (address == -EFAULT) | ||
| 568 | continue; | ||
| 477 | /* | 569 | /* |
| 478 | * If we are reclaiming on behalf of a cgroup, skip | 570 | * If we are reclaiming on behalf of a cgroup, skip |
| 479 | * counting on behalf of references from different | 571 | * counting on behalf of references from different |
| @@ -481,7 +573,7 @@ static int page_referenced_file(struct page *page, | |||
| 481 | */ | 573 | */ |
| 482 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 574 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
| 483 | continue; | 575 | continue; |
| 484 | referenced += page_referenced_one(page, vma, | 576 | referenced += page_referenced_one(page, vma, address, |
| 485 | &mapcount, vm_flags); | 577 | &mapcount, vm_flags); |
| 486 | if (!mapcount) | 578 | if (!mapcount) |
| 487 | break; | 579 | break; |
| @@ -507,46 +599,44 @@ int page_referenced(struct page *page, | |||
| 507 | unsigned long *vm_flags) | 599 | unsigned long *vm_flags) |
| 508 | { | 600 | { |
| 509 | int referenced = 0; | 601 | int referenced = 0; |
| 510 | 602 | int we_locked = 0; | |
| 511 | if (TestClearPageReferenced(page)) | ||
| 512 | referenced++; | ||
| 513 | 603 | ||
| 514 | *vm_flags = 0; | 604 | *vm_flags = 0; |
| 515 | if (page_mapped(page) && page->mapping) { | 605 | if (page_mapped(page) && page_rmapping(page)) { |
| 516 | if (PageAnon(page)) | 606 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
| 607 | we_locked = trylock_page(page); | ||
| 608 | if (!we_locked) { | ||
| 609 | referenced++; | ||
| 610 | goto out; | ||
| 611 | } | ||
| 612 | } | ||
| 613 | if (unlikely(PageKsm(page))) | ||
| 614 | referenced += page_referenced_ksm(page, mem_cont, | ||
| 615 | vm_flags); | ||
| 616 | else if (PageAnon(page)) | ||
| 517 | referenced += page_referenced_anon(page, mem_cont, | 617 | referenced += page_referenced_anon(page, mem_cont, |
| 518 | vm_flags); | 618 | vm_flags); |
| 519 | else if (is_locked) | 619 | else if (page->mapping) |
| 520 | referenced += page_referenced_file(page, mem_cont, | 620 | referenced += page_referenced_file(page, mem_cont, |
| 521 | vm_flags); | 621 | vm_flags); |
| 522 | else if (!trylock_page(page)) | 622 | if (we_locked) |
| 523 | referenced++; | ||
| 524 | else { | ||
| 525 | if (page->mapping) | ||
| 526 | referenced += page_referenced_file(page, | ||
| 527 | mem_cont, vm_flags); | ||
| 528 | unlock_page(page); | 623 | unlock_page(page); |
| 529 | } | ||
| 530 | } | 624 | } |
| 531 | 625 | out: | |
| 532 | if (page_test_and_clear_young(page)) | 626 | if (page_test_and_clear_young(page)) |
| 533 | referenced++; | 627 | referenced++; |
| 534 | 628 | ||
| 535 | return referenced; | 629 | return referenced; |
| 536 | } | 630 | } |
| 537 | 631 | ||
| 538 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | 632 | static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, |
| 633 | unsigned long address) | ||
| 539 | { | 634 | { |
| 540 | struct mm_struct *mm = vma->vm_mm; | 635 | struct mm_struct *mm = vma->vm_mm; |
| 541 | unsigned long address; | ||
| 542 | pte_t *pte; | 636 | pte_t *pte; |
| 543 | spinlock_t *ptl; | 637 | spinlock_t *ptl; |
| 544 | int ret = 0; | 638 | int ret = 0; |
| 545 | 639 | ||
| 546 | address = vma_address(page, vma); | ||
| 547 | if (address == -EFAULT) | ||
| 548 | goto out; | ||
| 549 | |||
| 550 | pte = page_check_address(page, mm, address, &ptl, 1); | 640 | pte = page_check_address(page, mm, address, &ptl, 1); |
| 551 | if (!pte) | 641 | if (!pte) |
| 552 | goto out; | 642 | goto out; |
| @@ -578,8 +668,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) | |||
| 578 | 668 | ||
| 579 | spin_lock(&mapping->i_mmap_lock); | 669 | spin_lock(&mapping->i_mmap_lock); |
| 580 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 670 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 581 | if (vma->vm_flags & VM_SHARED) | 671 | if (vma->vm_flags & VM_SHARED) { |
| 582 | ret += page_mkclean_one(page, vma); | 672 | unsigned long address = vma_address(page, vma); |
| 673 | if (address == -EFAULT) | ||
| 674 | continue; | ||
| 675 | ret += page_mkclean_one(page, vma, address); | ||
| 676 | } | ||
| 583 | } | 677 | } |
| 584 | spin_unlock(&mapping->i_mmap_lock); | 678 | spin_unlock(&mapping->i_mmap_lock); |
| 585 | return ret; | 679 | return ret; |
| @@ -607,6 +701,30 @@ int page_mkclean(struct page *page) | |||
| 607 | EXPORT_SYMBOL_GPL(page_mkclean); | 701 | EXPORT_SYMBOL_GPL(page_mkclean); |
| 608 | 702 | ||
| 609 | /** | 703 | /** |
| 704 | * page_move_anon_rmap - move a page to our anon_vma | ||
| 705 | * @page: the page to move to our anon_vma | ||
| 706 | * @vma: the vma the page belongs to | ||
| 707 | * @address: the user virtual address mapped | ||
| 708 | * | ||
| 709 | * When a page belongs exclusively to one process after a COW event, | ||
| 710 | * that page can be moved into the anon_vma that belongs to just that | ||
| 711 | * process, so the rmap code will not search the parent or sibling | ||
| 712 | * processes. | ||
| 713 | */ | ||
| 714 | void page_move_anon_rmap(struct page *page, | ||
| 715 | struct vm_area_struct *vma, unsigned long address) | ||
| 716 | { | ||
| 717 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 718 | |||
| 719 | VM_BUG_ON(!PageLocked(page)); | ||
| 720 | VM_BUG_ON(!anon_vma); | ||
| 721 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | ||
| 722 | |||
| 723 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
| 724 | page->mapping = (struct address_space *) anon_vma; | ||
| 725 | } | ||
| 726 | |||
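The helper above targets the COW reuse path: once a write fault finds that the page is exclusively owned by the faulting process, repointing page->mapping at that process's own anon_vma stops later rmap walks from scanning the parent's and siblings' vmas. A minimal caller sketch, assuming the page lock and pte lock are already held; the function name and the ownership test are illustrative, not part of this change:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Hypothetical caller, not from this diff: relies on the page lock and
 * pte lock being held, as page_move_anon_rmap()'s VM_BUG_ON()s expect. */
static void reuse_exclusive_cow_page(struct page *page,
				     struct vm_area_struct *vma,
				     unsigned long address)
{
	if (PageAnon(page) && page_mapcount(page) == 1)
		page_move_anon_rmap(page, vma, address);
}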
| 727 | /** | ||
| 610 | * __page_set_anon_rmap - setup new anonymous rmap | 728 | * __page_set_anon_rmap - setup new anonymous rmap |
| 611 | * @page: the page to add the mapping to | 729 | * @page: the page to add the mapping to |
| 612 | * @vma: the vm area in which the mapping is added | 730 | * @vma: the vm area in which the mapping is added |
| @@ -620,14 +738,7 @@ static void __page_set_anon_rmap(struct page *page, | |||
| 620 | BUG_ON(!anon_vma); | 738 | BUG_ON(!anon_vma); |
| 621 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 739 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
| 622 | page->mapping = (struct address_space *) anon_vma; | 740 | page->mapping = (struct address_space *) anon_vma; |
| 623 | |||
| 624 | page->index = linear_page_index(vma, address); | 741 | page->index = linear_page_index(vma, address); |
| 625 | |||
| 626 | /* | ||
| 627 | * nr_mapped state can be updated without turning off | ||
| 628 | * interrupts because it is not modified via interrupt. | ||
| 629 | */ | ||
| 630 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
| 631 | } | 742 | } |
| 632 | 743 | ||
| 633 | /** | 744 | /** |
| @@ -652,9 +763,6 @@ static void __page_check_anon_rmap(struct page *page, | |||
| 652 | * are initially only visible via the pagetables, and the pte is locked | 763 | * are initially only visible via the pagetables, and the pte is locked |
| 653 | * over the call to page_add_new_anon_rmap. | 764 | * over the call to page_add_new_anon_rmap. |
| 654 | */ | 765 | */ |
| 655 | struct anon_vma *anon_vma = vma->anon_vma; | ||
| 656 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
| 657 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
| 658 | BUG_ON(page->index != linear_page_index(vma, address)); | 766 | BUG_ON(page->index != linear_page_index(vma, address)); |
| 659 | #endif | 767 | #endif |
| 660 | } | 768 | } |
| @@ -665,14 +773,23 @@ static void __page_check_anon_rmap(struct page *page, | |||
| 665 | * @vma: the vm area in which the mapping is added | 773 | * @vma: the vm area in which the mapping is added |
| 666 | * @address: the user virtual address mapped | 774 | * @address: the user virtual address mapped |
| 667 | * | 775 | * |
| 668 | * The caller needs to hold the pte lock and the page must be locked. | 776 | * The caller needs to hold the pte lock, and the page must be locked in |
| 777 | * the anon_vma case: to serialize mapping,index checking after setting, | ||
| 778 | * and to ensure that PageAnon is not being upgraded racily to PageKsm | ||
| 779 | * (but PageKsm is never downgraded to PageAnon). | ||
| 669 | */ | 780 | */ |
| 670 | void page_add_anon_rmap(struct page *page, | 781 | void page_add_anon_rmap(struct page *page, |
| 671 | struct vm_area_struct *vma, unsigned long address) | 782 | struct vm_area_struct *vma, unsigned long address) |
| 672 | { | 783 | { |
| 784 | int first = atomic_inc_and_test(&page->_mapcount); | ||
| 785 | if (first) | ||
| 786 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
| 787 | if (unlikely(PageKsm(page))) | ||
| 788 | return; | ||
| 789 | |||
| 673 | VM_BUG_ON(!PageLocked(page)); | 790 | VM_BUG_ON(!PageLocked(page)); |
| 674 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 791 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 675 | if (atomic_inc_and_test(&page->_mapcount)) | 792 | if (first) |
| 676 | __page_set_anon_rmap(page, vma, address); | 793 | __page_set_anon_rmap(page, vma, address); |
| 677 | else | 794 | else |
| 678 | __page_check_anon_rmap(page, vma, address); | 795 | __page_check_anon_rmap(page, vma, address); |
| @@ -694,6 +811,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 694 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 811 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
| 695 | SetPageSwapBacked(page); | 812 | SetPageSwapBacked(page); |
| 696 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 813 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
| 814 | __inc_zone_page_state(page, NR_ANON_PAGES); | ||
| 697 | __page_set_anon_rmap(page, vma, address); | 815 | __page_set_anon_rmap(page, vma, address); |
| 698 | if (page_evictable(page, vma)) | 816 | if (page_evictable(page, vma)) |
| 699 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 817 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
| @@ -711,7 +829,7 @@ void page_add_file_rmap(struct page *page) | |||
| 711 | { | 829 | { |
| 712 | if (atomic_inc_and_test(&page->_mapcount)) { | 830 | if (atomic_inc_and_test(&page->_mapcount)) { |
| 713 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 831 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
| 714 | mem_cgroup_update_mapped_file_stat(page, 1); | 832 | mem_cgroup_update_file_mapped(page, 1); |
| 715 | } | 833 | } |
| 716 | } | 834 | } |
| 717 | 835 | ||
| @@ -743,8 +861,8 @@ void page_remove_rmap(struct page *page) | |||
| 743 | __dec_zone_page_state(page, NR_ANON_PAGES); | 861 | __dec_zone_page_state(page, NR_ANON_PAGES); |
| 744 | } else { | 862 | } else { |
| 745 | __dec_zone_page_state(page, NR_FILE_MAPPED); | 863 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
| 864 | mem_cgroup_update_file_mapped(page, -1); | ||
| 746 | } | 865 | } |
| 747 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
| 748 | /* | 866 | /* |
| 749 | * It would be tidy to reset the PageAnon mapping here, | 867 | * It would be tidy to reset the PageAnon mapping here, |
| 750 | * but that might overwrite a racing page_add_anon_rmap | 868 | * but that might overwrite a racing page_add_anon_rmap |
| @@ -760,20 +878,15 @@ void page_remove_rmap(struct page *page) | |||
| 760 | * Subfunctions of try_to_unmap: try_to_unmap_one called | 878 | * Subfunctions of try_to_unmap: try_to_unmap_one called |
| 761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 879 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
| 762 | */ | 880 | */ |
| 763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 881 | int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
| 764 | enum ttu_flags flags) | 882 | unsigned long address, enum ttu_flags flags) |
| 765 | { | 883 | { |
| 766 | struct mm_struct *mm = vma->vm_mm; | 884 | struct mm_struct *mm = vma->vm_mm; |
| 767 | unsigned long address; | ||
| 768 | pte_t *pte; | 885 | pte_t *pte; |
| 769 | pte_t pteval; | 886 | pte_t pteval; |
| 770 | spinlock_t *ptl; | 887 | spinlock_t *ptl; |
| 771 | int ret = SWAP_AGAIN; | 888 | int ret = SWAP_AGAIN; |
| 772 | 889 | ||
| 773 | address = vma_address(page, vma); | ||
| 774 | if (address == -EFAULT) | ||
| 775 | goto out; | ||
| 776 | |||
| 777 | pte = page_check_address(page, mm, address, &ptl, 0); | 890 | pte = page_check_address(page, mm, address, &ptl, 0); |
| 778 | if (!pte) | 891 | if (!pte) |
| 779 | goto out; | 892 | goto out; |
| @@ -784,10 +897,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 784 | * skipped over this mm) then we should reactivate it. | 897 | * skipped over this mm) then we should reactivate it. |
| 785 | */ | 898 | */ |
| 786 | if (!(flags & TTU_IGNORE_MLOCK)) { | 899 | if (!(flags & TTU_IGNORE_MLOCK)) { |
| 787 | if (vma->vm_flags & VM_LOCKED) { | 900 | if (vma->vm_flags & VM_LOCKED) |
| 788 | ret = SWAP_MLOCK; | 901 | goto out_mlock; |
| 902 | |||
| 903 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
| 789 | goto out_unmap; | 904 | goto out_unmap; |
| 790 | } | ||
| 791 | } | 905 | } |
| 792 | if (!(flags & TTU_IGNORE_ACCESS)) { | 906 | if (!(flags & TTU_IGNORE_ACCESS)) { |
| 793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 907 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
| @@ -809,9 +923,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 809 | 923 | ||
| 810 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 924 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
| 811 | if (PageAnon(page)) | 925 | if (PageAnon(page)) |
| 812 | dec_mm_counter(mm, anon_rss); | 926 | dec_mm_counter(mm, MM_ANONPAGES); |
| 813 | else | 927 | else |
| 814 | dec_mm_counter(mm, file_rss); | 928 | dec_mm_counter(mm, MM_FILEPAGES); |
| 815 | set_pte_at(mm, address, pte, | 929 | set_pte_at(mm, address, pte, |
| 816 | swp_entry_to_pte(make_hwpoison_entry(page))); | 930 | swp_entry_to_pte(make_hwpoison_entry(page))); |
| 817 | } else if (PageAnon(page)) { | 931 | } else if (PageAnon(page)) { |
| @@ -822,14 +936,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 822 | * Store the swap location in the pte. | 936 | * Store the swap location in the pte. |
| 823 | * See handle_pte_fault() ... | 937 | * See handle_pte_fault() ... |
| 824 | */ | 938 | */ |
| 825 | swap_duplicate(entry); | 939 | if (swap_duplicate(entry) < 0) { |
| 940 | set_pte_at(mm, address, pte, pteval); | ||
| 941 | ret = SWAP_FAIL; | ||
| 942 | goto out_unmap; | ||
| 943 | } | ||
| 826 | if (list_empty(&mm->mmlist)) { | 944 | if (list_empty(&mm->mmlist)) { |
| 827 | spin_lock(&mmlist_lock); | 945 | spin_lock(&mmlist_lock); |
| 828 | if (list_empty(&mm->mmlist)) | 946 | if (list_empty(&mm->mmlist)) |
| 829 | list_add(&mm->mmlist, &init_mm.mmlist); | 947 | list_add(&mm->mmlist, &init_mm.mmlist); |
| 830 | spin_unlock(&mmlist_lock); | 948 | spin_unlock(&mmlist_lock); |
| 831 | } | 949 | } |
| 832 | dec_mm_counter(mm, anon_rss); | 950 | dec_mm_counter(mm, MM_ANONPAGES); |
| 951 | inc_mm_counter(mm, MM_SWAPENTS); | ||
| 833 | } else if (PAGE_MIGRATION) { | 952 | } else if (PAGE_MIGRATION) { |
| 834 | /* | 953 | /* |
| 835 | * Store the pfn of the page in a special migration | 954 | * Store the pfn of the page in a special migration |
| @@ -847,8 +966,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 847 | entry = make_migration_entry(page, pte_write(pteval)); | 966 | entry = make_migration_entry(page, pte_write(pteval)); |
| 848 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 967 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
| 849 | } else | 968 | } else |
| 850 | dec_mm_counter(mm, file_rss); | 969 | dec_mm_counter(mm, MM_FILEPAGES); |
| 851 | |||
| 852 | 970 | ||
| 853 | page_remove_rmap(page); | 971 | page_remove_rmap(page); |
| 854 | page_cache_release(page); | 972 | page_cache_release(page); |
| @@ -857,6 +975,27 @@ out_unmap: | |||
| 857 | pte_unmap_unlock(pte, ptl); | 975 | pte_unmap_unlock(pte, ptl); |
| 858 | out: | 976 | out: |
| 859 | return ret; | 977 | return ret; |
| 978 | |||
| 979 | out_mlock: | ||
| 980 | pte_unmap_unlock(pte, ptl); | ||
| 981 | |||
| 982 | |||
| 983 | /* | ||
| 984 | * We need mmap_sem locking here: otherwise the VM_LOCKED check is | ||
| 985 | * racy and gives an unstable result. We also can't wait here, because | ||
| 986 | * we already hold anon_vma->lock or mapping->i_mmap_lock. | ||
| 987 | * If the trylock fails, the page stays on the evictable LRU, and | ||
| 988 | * vmscan can later retry moving it to the unevictable LRU once it | ||
| 989 | * sees that the page really is mlocked. | ||
| 990 | */ | ||
| 991 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
| 992 | if (vma->vm_flags & VM_LOCKED) { | ||
| 993 | mlock_vma_page(page); | ||
| 994 | ret = SWAP_MLOCK; | ||
| 995 | } | ||
| 996 | up_read(&vma->vm_mm->mmap_sem); | ||
| 997 | } | ||
| 998 | return ret; | ||
| 860 | } | 999 | } |
| 861 | 1000 | ||
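With the new out_mlock path and the SWAP_FAIL return when swap_duplicate() fails, try_to_unmap() callers now see four distinct outcomes. A hedged sketch of how a reclaim-side caller might dispatch on them; it mirrors, but is not copied from, shrink_page_list():

#include <linux/rmap.h>
#include <linux/errno.h>

/* Illustrative only; the real policy lives in mm/vmscan.c. */
static int example_unmap_for_reclaim(struct page *page)
{
	switch (try_to_unmap(page, TTU_UNMAP)) {
	case SWAP_SUCCESS:
		return 0;	/* fully unmapped: safe to write out or free */
	case SWAP_MLOCK:
		return -EBUSY;	/* confirmed mlocked: belongs on the unevictable LRU */
	case SWAP_FAIL:
		return -ENOMEM;	/* e.g. swap_duplicate() failed above: give up for now */
	case SWAP_AGAIN:
	default:
		return -EAGAIN;	/* pte already gone or a trylock raced: retry later */
	}
}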
| 862 | /* | 1001 | /* |
| @@ -922,11 +1061,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 922 | return ret; | 1061 | return ret; |
| 923 | 1062 | ||
| 924 | /* | 1063 | /* |
| 925 | * MLOCK_PAGES => feature is configured. | 1064 | * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, |
| 926 | * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, | ||
| 927 | * keep the sem while scanning the cluster for mlocking pages. | 1065 | * keep the sem while scanning the cluster for mlocking pages. |
| 928 | */ | 1066 | */ |
| 929 | if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { | 1067 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { |
| 930 | locked_vma = (vma->vm_flags & VM_LOCKED); | 1068 | locked_vma = (vma->vm_flags & VM_LOCKED); |
| 931 | if (!locked_vma) | 1069 | if (!locked_vma) |
| 932 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ | 1070 | up_read(&vma->vm_mm->mmap_sem); /* don't need it */ |
| @@ -967,7 +1105,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 967 | 1105 | ||
| 968 | page_remove_rmap(page); | 1106 | page_remove_rmap(page); |
| 969 | page_cache_release(page); | 1107 | page_cache_release(page); |
| 970 | dec_mm_counter(mm, file_rss); | 1108 | dec_mm_counter(mm, MM_FILEPAGES); |
| 971 | (*mapcount)--; | 1109 | (*mapcount)--; |
| 972 | } | 1110 | } |
| 973 | pte_unmap_unlock(pte - 1, ptl); | 1111 | pte_unmap_unlock(pte - 1, ptl); |
| @@ -976,29 +1114,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
| 976 | return ret; | 1114 | return ret; |
| 977 | } | 1115 | } |
| 978 | 1116 | ||
| 979 | /* | ||
| 980 | * common handling for pages mapped in VM_LOCKED vmas | ||
| 981 | */ | ||
| 982 | static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | ||
| 983 | { | ||
| 984 | int mlocked = 0; | ||
| 985 | |||
| 986 | if (down_read_trylock(&vma->vm_mm->mmap_sem)) { | ||
| 987 | if (vma->vm_flags & VM_LOCKED) { | ||
| 988 | mlock_vma_page(page); | ||
| 989 | mlocked++; /* really mlocked the page */ | ||
| 990 | } | ||
| 991 | up_read(&vma->vm_mm->mmap_sem); | ||
| 992 | } | ||
| 993 | return mlocked; | ||
| 994 | } | ||
| 995 | |||
| 996 | /** | 1117 | /** |
| 997 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based | 1118 | * try_to_unmap_anon - unmap or unlock anonymous page using the object-based |
| 998 | * rmap method | 1119 | * rmap method |
| 999 | * @page: the page to unmap/unlock | 1120 | * @page: the page to unmap/unlock |
| 1000 | * @unlock: request for unlock rather than unmap [unlikely] | 1121 | * @flags: action and flags |
| 1001 | * @migration: unmapping for migration - ignored if @unlock | ||
| 1002 | * | 1122 | * |
| 1003 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1123 | * Find all the mappings of a page using the mapping pointer and the vma chains |
| 1004 | * contained in the anon_vma struct it points to. | 1124 | * contained in the anon_vma struct it points to. |
| @@ -1013,43 +1133,24 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | |||
| 1013 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1133 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
| 1014 | { | 1134 | { |
| 1015 | struct anon_vma *anon_vma; | 1135 | struct anon_vma *anon_vma; |
| 1016 | struct vm_area_struct *vma; | 1136 | struct anon_vma_chain *avc; |
| 1017 | unsigned int mlocked = 0; | ||
| 1018 | int ret = SWAP_AGAIN; | 1137 | int ret = SWAP_AGAIN; |
| 1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
| 1020 | |||
| 1021 | if (MLOCK_PAGES && unlikely(unlock)) | ||
| 1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
| 1023 | 1138 | ||
| 1024 | anon_vma = page_lock_anon_vma(page); | 1139 | anon_vma = page_lock_anon_vma(page); |
| 1025 | if (!anon_vma) | 1140 | if (!anon_vma) |
| 1026 | return ret; | 1141 | return ret; |
| 1027 | 1142 | ||
| 1028 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1143 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
| 1029 | if (MLOCK_PAGES && unlikely(unlock)) { | 1144 | struct vm_area_struct *vma = avc->vma; |
| 1030 | if (!((vma->vm_flags & VM_LOCKED) && | 1145 | unsigned long address = vma_address(page, vma); |
| 1031 | page_mapped_in_vma(page, vma))) | 1146 | if (address == -EFAULT) |
| 1032 | continue; /* must visit all unlocked vmas */ | 1147 | continue; |
| 1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1148 | ret = try_to_unmap_one(page, vma, address, flags); |
| 1034 | } else { | 1149 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
| 1035 | ret = try_to_unmap_one(page, vma, flags); | 1150 | break; |
| 1036 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
| 1037 | break; | ||
| 1038 | } | ||
| 1039 | if (ret == SWAP_MLOCK) { | ||
| 1040 | mlocked = try_to_mlock_page(page, vma); | ||
| 1041 | if (mlocked) | ||
| 1042 | break; /* stop if actually mlocked page */ | ||
| 1043 | } | ||
| 1044 | } | 1151 | } |
| 1045 | 1152 | ||
| 1046 | page_unlock_anon_vma(anon_vma); | 1153 | page_unlock_anon_vma(anon_vma); |
| 1047 | |||
| 1048 | if (mlocked) | ||
| 1049 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
| 1050 | else if (ret == SWAP_MLOCK) | ||
| 1051 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
| 1052 | |||
| 1053 | return ret; | 1154 | return ret; |
| 1054 | } | 1155 | } |
| 1055 | 1156 | ||
| @@ -1079,48 +1180,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1079 | unsigned long max_nl_cursor = 0; | 1180 | unsigned long max_nl_cursor = 0; |
| 1080 | unsigned long max_nl_size = 0; | 1181 | unsigned long max_nl_size = 0; |
| 1081 | unsigned int mapcount; | 1182 | unsigned int mapcount; |
| 1082 | unsigned int mlocked = 0; | ||
| 1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
| 1084 | |||
| 1085 | if (MLOCK_PAGES && unlikely(unlock)) | ||
| 1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | ||
| 1087 | 1183 | ||
| 1088 | spin_lock(&mapping->i_mmap_lock); | 1184 | spin_lock(&mapping->i_mmap_lock); |
| 1089 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 1185 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
| 1090 | if (MLOCK_PAGES && unlikely(unlock)) { | 1186 | unsigned long address = vma_address(page, vma); |
| 1091 | if (!((vma->vm_flags & VM_LOCKED) && | 1187 | if (address == -EFAULT) |
| 1092 | page_mapped_in_vma(page, vma))) | 1188 | continue; |
| 1093 | continue; /* must visit all vmas */ | 1189 | ret = try_to_unmap_one(page, vma, address, flags); |
| 1094 | ret = SWAP_MLOCK; | 1190 | if (ret != SWAP_AGAIN || !page_mapped(page)) |
| 1095 | } else { | 1191 | goto out; |
| 1096 | ret = try_to_unmap_one(page, vma, flags); | ||
| 1097 | if (ret == SWAP_FAIL || !page_mapped(page)) | ||
| 1098 | goto out; | ||
| 1099 | } | ||
| 1100 | if (ret == SWAP_MLOCK) { | ||
| 1101 | mlocked = try_to_mlock_page(page, vma); | ||
| 1102 | if (mlocked) | ||
| 1103 | break; /* stop if actually mlocked page */ | ||
| 1104 | } | ||
| 1105 | } | 1192 | } |
| 1106 | 1193 | ||
| 1107 | if (mlocked) | 1194 | if (list_empty(&mapping->i_mmap_nonlinear)) |
| 1108 | goto out; | 1195 | goto out; |
| 1109 | 1196 | ||
| 1110 | if (list_empty(&mapping->i_mmap_nonlinear)) | 1197 | /* |
| 1198 | * We don't bother to try to find the munlocked page in nonlinears. | ||
| 1199 | * It's costly. Instead, later, page reclaim logic may call | ||
| 1200 | * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. | ||
| 1201 | */ | ||
| 1202 | if (TTU_ACTION(flags) == TTU_MUNLOCK) | ||
| 1111 | goto out; | 1203 | goto out; |
| 1112 | 1204 | ||
| 1113 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1205 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
| 1114 | shared.vm_set.list) { | 1206 | shared.vm_set.list) { |
| 1115 | if (MLOCK_PAGES && unlikely(unlock)) { | ||
| 1116 | if (!(vma->vm_flags & VM_LOCKED)) | ||
| 1117 | continue; /* must visit all vmas */ | ||
| 1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | ||
| 1119 | goto out; /* no need to look further */ | ||
| 1120 | } | ||
| 1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
| 1122 | (vma->vm_flags & VM_LOCKED)) | ||
| 1123 | continue; | ||
| 1124 | cursor = (unsigned long) vma->vm_private_data; | 1207 | cursor = (unsigned long) vma->vm_private_data; |
| 1125 | if (cursor > max_nl_cursor) | 1208 | if (cursor > max_nl_cursor) |
| 1126 | max_nl_cursor = cursor; | 1209 | max_nl_cursor = cursor; |
| @@ -1153,16 +1236,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1153 | do { | 1236 | do { |
| 1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1237 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
| 1155 | shared.vm_set.list) { | 1238 | shared.vm_set.list) { |
| 1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && | ||
| 1157 | (vma->vm_flags & VM_LOCKED)) | ||
| 1158 | continue; | ||
| 1159 | cursor = (unsigned long) vma->vm_private_data; | 1239 | cursor = (unsigned long) vma->vm_private_data; |
| 1160 | while ( cursor < max_nl_cursor && | 1240 | while ( cursor < max_nl_cursor && |
| 1161 | cursor < vma->vm_end - vma->vm_start) { | 1241 | cursor < vma->vm_end - vma->vm_start) { |
| 1162 | ret = try_to_unmap_cluster(cursor, &mapcount, | 1242 | if (try_to_unmap_cluster(cursor, &mapcount, |
| 1163 | vma, page); | 1243 | vma, page) == SWAP_MLOCK) |
| 1164 | if (ret == SWAP_MLOCK) | 1244 | ret = SWAP_MLOCK; |
| 1165 | mlocked = 2; /* to return below */ | ||
| 1166 | cursor += CLUSTER_SIZE; | 1245 | cursor += CLUSTER_SIZE; |
| 1167 | vma->vm_private_data = (void *) cursor; | 1246 | vma->vm_private_data = (void *) cursor; |
| 1168 | if ((int)mapcount <= 0) | 1247 | if ((int)mapcount <= 0) |
| @@ -1183,10 +1262,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags) | |||
| 1183 | vma->vm_private_data = NULL; | 1262 | vma->vm_private_data = NULL; |
| 1184 | out: | 1263 | out: |
| 1185 | spin_unlock(&mapping->i_mmap_lock); | 1264 | spin_unlock(&mapping->i_mmap_lock); |
| 1186 | if (mlocked) | ||
| 1187 | ret = SWAP_MLOCK; /* actually mlocked the page */ | ||
| 1188 | else if (ret == SWAP_MLOCK) | ||
| 1189 | ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ | ||
| 1190 | return ret; | 1265 | return ret; |
| 1191 | } | 1266 | } |
| 1192 | 1267 | ||
| @@ -1210,7 +1285,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
| 1210 | 1285 | ||
| 1211 | BUG_ON(!PageLocked(page)); | 1286 | BUG_ON(!PageLocked(page)); |
| 1212 | 1287 | ||
| 1213 | if (PageAnon(page)) | 1288 | if (unlikely(PageKsm(page))) |
| 1289 | ret = try_to_unmap_ksm(page, flags); | ||
| 1290 | else if (PageAnon(page)) | ||
| 1214 | ret = try_to_unmap_anon(page, flags); | 1291 | ret = try_to_unmap_anon(page, flags); |
| 1215 | else | 1292 | else |
| 1216 | ret = try_to_unmap_file(page, flags); | 1293 | ret = try_to_unmap_file(page, flags); |
| @@ -1229,17 +1306,99 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
| 1229 | * | 1306 | * |
| 1230 | * Return values are: | 1307 | * Return values are: |
| 1231 | * | 1308 | * |
| 1232 | * SWAP_SUCCESS - no vma's holding page mlocked. | 1309 | * SWAP_AGAIN - no vma is holding page mlocked, or, |
| 1233 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem | 1310 | * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem |
| 1311 | * SWAP_FAIL - page cannot be located at present | ||
| 1234 | * SWAP_MLOCK - page is now mlocked. | 1312 | * SWAP_MLOCK - page is now mlocked. |
| 1235 | */ | 1313 | */ |
| 1236 | int try_to_munlock(struct page *page) | 1314 | int try_to_munlock(struct page *page) |
| 1237 | { | 1315 | { |
| 1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1316 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
| 1239 | 1317 | ||
| 1240 | if (PageAnon(page)) | 1318 | if (unlikely(PageKsm(page))) |
| 1319 | return try_to_unmap_ksm(page, TTU_MUNLOCK); | ||
| 1320 | else if (PageAnon(page)) | ||
| 1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); | 1321 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
| 1242 | else | 1322 | else |
| 1243 | return try_to_unmap_file(page, TTU_MUNLOCK); | 1323 | return try_to_unmap_file(page, TTU_MUNLOCK); |
| 1244 | } | 1324 | } |
| 1245 | 1325 | ||
| 1326 | #ifdef CONFIG_MIGRATION | ||
| 1327 | /* | ||
| 1328 | * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): | ||
| 1329 | * Called by migrate.c to remove migration ptes, but might be used more later. | ||
| 1330 | */ | ||
| 1331 | static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | ||
| 1332 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
| 1333 | { | ||
| 1334 | struct anon_vma *anon_vma; | ||
| 1335 | struct anon_vma_chain *avc; | ||
| 1336 | int ret = SWAP_AGAIN; | ||
| 1337 | |||
| 1338 | /* | ||
| 1339 | * Note: remove_migration_ptes() cannot use page_lock_anon_vma() | ||
| 1340 | * because that depends on page_mapped(); but not all its usages | ||
| 1341 | * are holding mmap_sem, which also gave the necessary guarantee | ||
| 1342 | * (that this anon_vma's slab has not already been destroyed). | ||
| 1343 | * This needs to be reviewed later: avoiding page_lock_anon_vma() | ||
| 1344 | * is risky, and currently limits the usefulness of rmap_walk(). | ||
| 1345 | */ | ||
| 1346 | anon_vma = page_anon_vma(page); | ||
| 1347 | if (!anon_vma) | ||
| 1348 | return ret; | ||
| 1349 | spin_lock(&anon_vma->lock); | ||
| 1350 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { | ||
| 1351 | struct vm_area_struct *vma = avc->vma; | ||
| 1352 | unsigned long address = vma_address(page, vma); | ||
| 1353 | if (address == -EFAULT) | ||
| 1354 | continue; | ||
| 1355 | ret = rmap_one(page, vma, address, arg); | ||
| 1356 | if (ret != SWAP_AGAIN) | ||
| 1357 | break; | ||
| 1358 | } | ||
| 1359 | spin_unlock(&anon_vma->lock); | ||
| 1360 | return ret; | ||
| 1361 | } | ||
| 1362 | |||
| 1363 | static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, | ||
| 1364 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
| 1365 | { | ||
| 1366 | struct address_space *mapping = page->mapping; | ||
| 1367 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 1368 | struct vm_area_struct *vma; | ||
| 1369 | struct prio_tree_iter iter; | ||
| 1370 | int ret = SWAP_AGAIN; | ||
| 1371 | |||
| 1372 | if (!mapping) | ||
| 1373 | return ret; | ||
| 1374 | spin_lock(&mapping->i_mmap_lock); | ||
| 1375 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | ||
| 1376 | unsigned long address = vma_address(page, vma); | ||
| 1377 | if (address == -EFAULT) | ||
| 1378 | continue; | ||
| 1379 | ret = rmap_one(page, vma, address, arg); | ||
| 1380 | if (ret != SWAP_AGAIN) | ||
| 1381 | break; | ||
| 1382 | } | ||
| 1383 | /* | ||
| 1384 | * No nonlinear handling: being always shared, nonlinear vmas | ||
| 1385 | * never contain migration ptes. Decide what to do about this | ||
| 1386 | * limitation to linear when we need rmap_walk() on nonlinear. | ||
| 1387 | */ | ||
| 1388 | spin_unlock(&mapping->i_mmap_lock); | ||
| 1389 | return ret; | ||
| 1390 | } | ||
| 1391 | |||
| 1392 | int rmap_walk(struct page *page, int (*rmap_one)(struct page *, | ||
| 1393 | struct vm_area_struct *, unsigned long, void *), void *arg) | ||
| 1394 | { | ||
| 1395 | VM_BUG_ON(!PageLocked(page)); | ||
| 1396 | |||
| 1397 | if (unlikely(PageKsm(page))) | ||
| 1398 | return rmap_walk_ksm(page, rmap_one, arg); | ||
| 1399 | else if (PageAnon(page)) | ||
| 1400 | return rmap_walk_anon(page, rmap_one, arg); | ||
| 1401 | else | ||
| 1402 | return rmap_walk_file(page, rmap_one, arg); | ||
| 1403 | } | ||
| 1404 | #endif /* CONFIG_MIGRATION */ | ||
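rmap_walk() is introduced for migrate.c's remove_migration_pte(), but the interface is generic: the callback runs once per vma that may map the page, with the user virtual address precomputed via vma_address(), and returning anything other than SWAP_AGAIN stops the walk. A small, invented user (not from this diff) showing the calling convention; note the whole API is only built under CONFIG_MIGRATION:

#include <linux/mm.h>
#include <linux/rmap.h>

/* invented callback: counts the vmas the walk visits for this page */
static int count_one_mapping(struct page *page, struct vm_area_struct *vma,
			     unsigned long address, void *arg)
{
	unsigned long *nr = arg;

	(*nr)++;
	return SWAP_AGAIN;	/* keep walking */
}

static unsigned long count_page_mappings(struct page *page)
{
	unsigned long nr = 0;

	/* rmap_walk() insists on a locked page (see the VM_BUG_ON above) */
	rmap_walk(page, count_one_mapping, &nr);
	return nr;
}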
diff --git a/mm/shmem.c b/mm/shmem.c index 356dd99566ec..eef4ebea5158 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -29,7 +29,6 @@ | |||
| 29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
| 30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
| 31 | #include <linux/swap.h> | 31 | #include <linux/swap.h> |
| 32 | #include <linux/ima.h> | ||
| 33 | 32 | ||
| 34 | static struct vfsmount *shm_mnt; | 33 | static struct vfsmount *shm_mnt; |
| 35 | 34 | ||
| @@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt; | |||
| 42 | 41 | ||
| 43 | #include <linux/xattr.h> | 42 | #include <linux/xattr.h> |
| 44 | #include <linux/exportfs.h> | 43 | #include <linux/exportfs.h> |
| 44 | #include <linux/posix_acl.h> | ||
| 45 | #include <linux/generic_acl.h> | 45 | #include <linux/generic_acl.h> |
| 46 | #include <linux/mman.h> | 46 | #include <linux/mman.h> |
| 47 | #include <linux/string.h> | 47 | #include <linux/string.h> |
| @@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
| 810 | error = inode_setattr(inode, attr); | 810 | error = inode_setattr(inode, attr); |
| 811 | #ifdef CONFIG_TMPFS_POSIX_ACL | 811 | #ifdef CONFIG_TMPFS_POSIX_ACL |
| 812 | if (!error && (attr->ia_valid & ATTR_MODE)) | 812 | if (!error && (attr->ia_valid & ATTR_MODE)) |
| 813 | error = generic_acl_chmod(inode, &shmem_acl_ops); | 813 | error = generic_acl_chmod(inode); |
| 814 | #endif | 814 | #endif |
| 815 | if (page) | 815 | if (page) |
| 816 | page_cache_release(page); | 816 | page_cache_release(page); |
| @@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
| 1017 | goto out; | 1017 | goto out; |
| 1018 | } | 1018 | } |
| 1019 | mutex_unlock(&shmem_swaplist_mutex); | 1019 | mutex_unlock(&shmem_swaplist_mutex); |
| 1020 | out: return found; /* 0 or 1 or -ENOMEM */ | 1020 | /* |
| 1021 | * Can some race bring us here? We've been holding page lock, | ||
| 1022 | * so I think not; but would rather try again later than BUG() | ||
| 1023 | */ | ||
| 1024 | unlock_page(page); | ||
| 1025 | page_cache_release(page); | ||
| 1026 | out: | ||
| 1027 | return (found < 0) ? found : 0; | ||
| 1021 | } | 1028 | } |
| 1022 | 1029 | ||
| 1023 | /* | 1030 | /* |
| @@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
| 1080 | else | 1087 | else |
| 1081 | inode = NULL; | 1088 | inode = NULL; |
| 1082 | spin_unlock(&info->lock); | 1089 | spin_unlock(&info->lock); |
| 1083 | swap_duplicate(swap); | 1090 | swap_shmem_alloc(swap); |
| 1084 | BUG_ON(page_mapped(page)); | 1091 | BUG_ON(page_mapped(page)); |
| 1085 | page_cache_release(page); /* pagecache ref */ | 1092 | page_cache_release(page); /* pagecache ref */ |
| 1086 | swap_writepage(page, wbc); | 1093 | swap_writepage(page, wbc); |
| @@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
| 1817 | return error; | 1824 | return error; |
| 1818 | } | 1825 | } |
| 1819 | } | 1826 | } |
| 1820 | error = shmem_acl_init(inode, dir); | 1827 | #ifdef CONFIG_TMPFS_POSIX_ACL |
| 1828 | error = generic_acl_init(inode, dir); | ||
| 1821 | if (error) { | 1829 | if (error) { |
| 1822 | iput(inode); | 1830 | iput(inode); |
| 1823 | return error; | 1831 | return error; |
| 1824 | } | 1832 | } |
| 1833 | #else | ||
| 1834 | error = 0; | ||
| 1835 | #endif | ||
| 1825 | if (dir->i_mode & S_ISGID) { | 1836 | if (dir->i_mode & S_ISGID) { |
| 1826 | inode->i_gid = dir->i_gid; | 1837 | inode->i_gid = dir->i_gid; |
| 1827 | if (S_ISDIR(mode)) | 1838 | if (S_ISDIR(mode)) |
| @@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { | |||
| 2036 | * filesystem level, though. | 2047 | * filesystem level, though. |
| 2037 | */ | 2048 | */ |
| 2038 | 2049 | ||
| 2039 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | 2050 | static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, |
| 2040 | size_t list_len, const char *name, | 2051 | size_t list_len, const char *name, |
| 2041 | size_t name_len) | 2052 | size_t name_len, int handler_flags) |
| 2042 | { | 2053 | { |
| 2043 | return security_inode_listsecurity(inode, list, list_len); | 2054 | return security_inode_listsecurity(dentry->d_inode, list, list_len); |
| 2044 | } | 2055 | } |
| 2045 | 2056 | ||
| 2046 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | 2057 | static int shmem_xattr_security_get(struct dentry *dentry, const char *name, |
| 2047 | void *buffer, size_t size) | 2058 | void *buffer, size_t size, int handler_flags) |
| 2048 | { | 2059 | { |
| 2049 | if (strcmp(name, "") == 0) | 2060 | if (strcmp(name, "") == 0) |
| 2050 | return -EINVAL; | 2061 | return -EINVAL; |
| 2051 | return xattr_getsecurity(inode, name, buffer, size); | 2062 | return xattr_getsecurity(dentry->d_inode, name, buffer, size); |
| 2052 | } | 2063 | } |
| 2053 | 2064 | ||
| 2054 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | 2065 | static int shmem_xattr_security_set(struct dentry *dentry, const char *name, |
| 2055 | const void *value, size_t size, int flags) | 2066 | const void *value, size_t size, int flags, int handler_flags) |
| 2056 | { | 2067 | { |
| 2057 | if (strcmp(name, "") == 0) | 2068 | if (strcmp(name, "") == 0) |
| 2058 | return -EINVAL; | 2069 | return -EINVAL; |
| 2059 | return security_inode_setsecurity(inode, name, value, size, flags); | 2070 | return security_inode_setsecurity(dentry->d_inode, name, value, |
| 2071 | size, flags); | ||
| 2060 | } | 2072 | } |
| 2061 | 2073 | ||
| 2062 | static struct xattr_handler shmem_xattr_security_handler = { | 2074 | static struct xattr_handler shmem_xattr_security_handler = { |
| @@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = { | |||
| 2067 | }; | 2079 | }; |
| 2068 | 2080 | ||
| 2069 | static struct xattr_handler *shmem_xattr_handlers[] = { | 2081 | static struct xattr_handler *shmem_xattr_handlers[] = { |
| 2070 | &shmem_xattr_acl_access_handler, | 2082 | &generic_acl_access_handler, |
| 2071 | &shmem_xattr_acl_default_handler, | 2083 | &generic_acl_default_handler, |
| 2072 | &shmem_xattr_security_handler, | 2084 | &shmem_xattr_security_handler, |
| 2073 | NULL | 2085 | NULL |
| 2074 | }; | 2086 | }; |
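These handlers now follow the tree-wide conversion of struct xattr_handler callbacks from inode-based to dentry-based prototypes, with an extra handler_flags argument; each handler derives the inode itself via dentry->d_inode. A minimal handler in the new style, purely illustrative (the attribute name and the exported value are invented):

#include <linux/fs.h>
#include <linux/xattr.h>
#include <linux/string.h>

/* imaginary "user.example.gen" attribute exposing i_generation */
static int example_xattr_get(struct dentry *dentry, const char *name,
			     void *buffer, size_t size, int handler_flags)
{
	u32 gen = dentry->d_inode->i_generation;

	if (strcmp(name, "gen") != 0)	/* prefix already stripped by the VFS */
		return -EINVAL;
	if (!buffer)
		return sizeof(gen);	/* size probe */
	if (size < sizeof(gen))
		return -ERANGE;
	memcpy(buffer, &gen, sizeof(gen));
	return sizeof(gen);
}

static struct xattr_handler example_xattr_handler = {
	.prefix	= "user.example.",
	.get	= example_xattr_get,
};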
| @@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
| 2447 | .getxattr = generic_getxattr, | 2459 | .getxattr = generic_getxattr, |
| 2448 | .listxattr = generic_listxattr, | 2460 | .listxattr = generic_listxattr, |
| 2449 | .removexattr = generic_removexattr, | 2461 | .removexattr = generic_removexattr, |
| 2450 | .check_acl = shmem_check_acl, | 2462 | .check_acl = generic_check_acl, |
| 2451 | #endif | 2463 | #endif |
| 2452 | 2464 | ||
| 2453 | }; | 2465 | }; |
| @@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
| 2470 | .getxattr = generic_getxattr, | 2482 | .getxattr = generic_getxattr, |
| 2471 | .listxattr = generic_listxattr, | 2483 | .listxattr = generic_listxattr, |
| 2472 | .removexattr = generic_removexattr, | 2484 | .removexattr = generic_removexattr, |
| 2473 | .check_acl = shmem_check_acl, | 2485 | .check_acl = generic_check_acl, |
| 2474 | #endif | 2486 | #endif |
| 2475 | }; | 2487 | }; |
| 2476 | 2488 | ||
| @@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
| 2481 | .getxattr = generic_getxattr, | 2493 | .getxattr = generic_getxattr, |
| 2482 | .listxattr = generic_listxattr, | 2494 | .listxattr = generic_listxattr, |
| 2483 | .removexattr = generic_removexattr, | 2495 | .removexattr = generic_removexattr, |
| 2484 | .check_acl = shmem_check_acl, | 2496 | .check_acl = generic_check_acl, |
| 2485 | #endif | 2497 | #endif |
| 2486 | }; | 2498 | }; |
| 2487 | 2499 | ||
| @@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
| 2619 | int error; | 2631 | int error; |
| 2620 | struct file *file; | 2632 | struct file *file; |
| 2621 | struct inode *inode; | 2633 | struct inode *inode; |
| 2622 | struct dentry *dentry, *root; | 2634 | struct path path; |
| 2635 | struct dentry *root; | ||
| 2623 | struct qstr this; | 2636 | struct qstr this; |
| 2624 | 2637 | ||
| 2625 | if (IS_ERR(shm_mnt)) | 2638 | if (IS_ERR(shm_mnt)) |
| @@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
| 2636 | this.len = strlen(name); | 2649 | this.len = strlen(name); |
| 2637 | this.hash = 0; /* will go */ | 2650 | this.hash = 0; /* will go */ |
| 2638 | root = shm_mnt->mnt_root; | 2651 | root = shm_mnt->mnt_root; |
| 2639 | dentry = d_alloc(root, &this); | 2652 | path.dentry = d_alloc(root, &this); |
| 2640 | if (!dentry) | 2653 | if (!path.dentry) |
| 2641 | goto put_memory; | 2654 | goto put_memory; |
| 2642 | 2655 | path.mnt = mntget(shm_mnt); | |
| 2643 | error = -ENFILE; | ||
| 2644 | file = get_empty_filp(); | ||
| 2645 | if (!file) | ||
| 2646 | goto put_dentry; | ||
| 2647 | 2656 | ||
| 2648 | error = -ENOSPC; | 2657 | error = -ENOSPC; |
| 2649 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); | 2658 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); |
| 2650 | if (!inode) | 2659 | if (!inode) |
| 2651 | goto close_file; | 2660 | goto put_dentry; |
| 2652 | 2661 | ||
| 2653 | d_instantiate(dentry, inode); | 2662 | d_instantiate(path.dentry, inode); |
| 2654 | inode->i_size = size; | 2663 | inode->i_size = size; |
| 2655 | inode->i_nlink = 0; /* It is unlinked */ | 2664 | inode->i_nlink = 0; /* It is unlinked */ |
| 2656 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
| 2657 | &shmem_file_operations); | ||
| 2658 | |||
| 2659 | #ifndef CONFIG_MMU | 2665 | #ifndef CONFIG_MMU |
| 2660 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2666 | error = ramfs_nommu_expand_for_mapping(inode, size); |
| 2661 | if (error) | 2667 | if (error) |
| 2662 | goto close_file; | 2668 | goto put_dentry; |
| 2663 | #endif | 2669 | #endif |
| 2664 | ima_counts_get(file); | 2670 | |
| 2671 | error = -ENFILE; | ||
| 2672 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | ||
| 2673 | &shmem_file_operations); | ||
| 2674 | if (!file) | ||
| 2675 | goto put_dentry; | ||
| 2676 | |||
| 2665 | return file; | 2677 | return file; |
| 2666 | 2678 | ||
| 2667 | close_file: | ||
| 2668 | put_filp(file); | ||
| 2669 | put_dentry: | 2679 | put_dentry: |
| 2670 | dput(dentry); | 2680 | path_put(&path); |
| 2671 | put_memory: | 2681 | put_memory: |
| 2672 | shmem_unacct_size(flags, size); | 2682 | shmem_unacct_size(flags, size); |
| 2673 | return ERR_PTR(error); | 2683 | return ERR_PTR(error); |
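shmem_file_setup() now assembles the file with alloc_file() and a struct path, so the error paths unwind with path_put() instead of juggling a pre-allocated filp (and the ima_counts_get() call is gone along with the ima.h include). Its signature is unchanged for callers; a simplified, hedged usage sketch in the style of the SysV shm or DRM callers, not copied from them:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* invented helper: create an unlinked tmpfs file as anonymous backing */
static int example_create_shmem_backing(loff_t size, struct file **filp)
{
	struct file *file;

	file = shmem_file_setup("example-shmem", size, 0);	/* name is arbitrary */
	if (IS_ERR(file))
		return PTR_ERR(file);

	/* the inode is unlinked (i_nlink == 0); release with fput() when done */
	*filp = file;
	return 0;
}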
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index df2c87fdae50..000000000000 --- a/mm/shmem_acl.c +++ /dev/null | |||
| @@ -1,171 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * mm/shmem_acl.c | ||
| 3 | * | ||
| 4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
| 5 | * | ||
| 6 | * This file is released under the GPL. | ||
| 7 | */ | ||
| 8 | |||
| 9 | #include <linux/fs.h> | ||
| 10 | #include <linux/shmem_fs.h> | ||
| 11 | #include <linux/xattr.h> | ||
| 12 | #include <linux/generic_acl.h> | ||
| 13 | |||
| 14 | /** | ||
| 15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
| 16 | */ | ||
| 17 | static struct posix_acl * | ||
| 18 | shmem_get_acl(struct inode *inode, int type) | ||
| 19 | { | ||
| 20 | struct posix_acl *acl = NULL; | ||
| 21 | |||
| 22 | spin_lock(&inode->i_lock); | ||
| 23 | switch(type) { | ||
| 24 | case ACL_TYPE_ACCESS: | ||
| 25 | acl = posix_acl_dup(inode->i_acl); | ||
| 26 | break; | ||
| 27 | |||
| 28 | case ACL_TYPE_DEFAULT: | ||
| 29 | acl = posix_acl_dup(inode->i_default_acl); | ||
| 30 | break; | ||
| 31 | } | ||
| 32 | spin_unlock(&inode->i_lock); | ||
| 33 | |||
| 34 | return acl; | ||
| 35 | } | ||
| 36 | |||
| 37 | /** | ||
| 38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
| 39 | */ | ||
| 40 | static void | ||
| 41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
| 42 | { | ||
| 43 | struct posix_acl *free = NULL; | ||
| 44 | |||
| 45 | spin_lock(&inode->i_lock); | ||
| 46 | switch(type) { | ||
| 47 | case ACL_TYPE_ACCESS: | ||
| 48 | free = inode->i_acl; | ||
| 49 | inode->i_acl = posix_acl_dup(acl); | ||
| 50 | break; | ||
| 51 | |||
| 52 | case ACL_TYPE_DEFAULT: | ||
| 53 | free = inode->i_default_acl; | ||
| 54 | inode->i_default_acl = posix_acl_dup(acl); | ||
| 55 | break; | ||
| 56 | } | ||
| 57 | spin_unlock(&inode->i_lock); | ||
| 58 | posix_acl_release(free); | ||
| 59 | } | ||
| 60 | |||
| 61 | struct generic_acl_operations shmem_acl_ops = { | ||
| 62 | .getacl = shmem_get_acl, | ||
| 63 | .setacl = shmem_set_acl, | ||
| 64 | }; | ||
| 65 | |||
| 66 | /** | ||
| 67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
| 68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
| 69 | * system.posix_acl_access xattr using the generic acl functions. | ||
| 70 | */ | ||
| 71 | |||
| 72 | static size_t | ||
| 73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
| 74 | const char *name, size_t name_len) | ||
| 75 | { | ||
| 76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
| 77 | list, list_size); | ||
| 78 | } | ||
| 79 | |||
| 80 | static int | ||
| 81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
| 82 | size_t size) | ||
| 83 | { | ||
| 84 | if (strcmp(name, "") != 0) | ||
| 85 | return -EINVAL; | ||
| 86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
| 87 | size); | ||
| 88 | } | ||
| 89 | |||
| 90 | static int | ||
| 91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
| 92 | size_t size, int flags) | ||
| 93 | { | ||
| 94 | if (strcmp(name, "") != 0) | ||
| 95 | return -EINVAL; | ||
| 96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
| 97 | size); | ||
| 98 | } | ||
| 99 | |||
| 100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
| 101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
| 102 | .list = shmem_list_acl_access, | ||
| 103 | .get = shmem_get_acl_access, | ||
| 104 | .set = shmem_set_acl_access, | ||
| 105 | }; | ||
| 106 | |||
| 107 | /** | ||
| 108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
| 109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
| 110 | * system.posix_acl_default xattr using the generic acl functions. | ||
| 111 | */ | ||
| 112 | |||
| 113 | static size_t | ||
| 114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
| 115 | const char *name, size_t name_len) | ||
| 116 | { | ||
| 117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
| 118 | list, list_size); | ||
| 119 | } | ||
| 120 | |||
| 121 | static int | ||
| 122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
| 123 | size_t size) | ||
| 124 | { | ||
| 125 | if (strcmp(name, "") != 0) | ||
| 126 | return -EINVAL; | ||
| 127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
| 128 | size); | ||
| 129 | } | ||
| 130 | |||
| 131 | static int | ||
| 132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
| 133 | size_t size, int flags) | ||
| 134 | { | ||
| 135 | if (strcmp(name, "") != 0) | ||
| 136 | return -EINVAL; | ||
| 137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
| 138 | size); | ||
| 139 | } | ||
| 140 | |||
| 141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
| 142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
| 143 | .list = shmem_list_acl_default, | ||
| 144 | .get = shmem_get_acl_default, | ||
| 145 | .set = shmem_set_acl_default, | ||
| 146 | }; | ||
| 147 | |||
| 148 | /** | ||
| 149 | * shmem_acl_init - Inizialize the acl(s) of a new inode | ||
| 150 | */ | ||
| 151 | int | ||
| 152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
| 153 | { | ||
| 154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
| 155 | } | ||
| 156 | |||
| 157 | /** | ||
| 158 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
| 159 | */ | ||
| 160 | int | ||
| 161 | shmem_check_acl(struct inode *inode, int mask) | ||
| 162 | { | ||
| 163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
| 164 | |||
| 165 | if (acl) { | ||
| 166 | int error = posix_acl_permission(inode, acl, mask); | ||
| 167 | posix_acl_release(acl); | ||
| 168 | return error; | ||
| 169 | } | ||
| 170 | return -EAGAIN; | ||
| 171 | } | ||
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
| 490 | 490 | ||
| 491 | #endif | 491 | #endif |
| 492 | 492 | ||
| 493 | #ifdef CONFIG_KMEMTRACE | 493 | #ifdef CONFIG_TRACING |
| 494 | size_t slab_buffer_size(struct kmem_cache *cachep) | 494 | size_t slab_buffer_size(struct kmem_cache *cachep) |
| 495 | { | 495 | { |
| 496 | return cachep->buffer_size; | 496 | return cachep->buffer_size; |
| @@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = { | |||
| 604 | 604 | ||
| 605 | #define BAD_ALIEN_MAGIC 0x01020304ul | 605 | #define BAD_ALIEN_MAGIC 0x01020304ul |
| 606 | 606 | ||
| 607 | /* | ||
| 608 | * chicken and egg problem: delay the per-cpu array allocation | ||
| 609 | * until the general caches are up. | ||
| 610 | */ | ||
| 611 | static enum { | ||
| 612 | NONE, | ||
| 613 | PARTIAL_AC, | ||
| 614 | PARTIAL_L3, | ||
| 615 | EARLY, | ||
| 616 | FULL | ||
| 617 | } g_cpucache_up; | ||
| 618 | |||
| 619 | /* | ||
| 620 | * used by boot code to determine if it can use slab based allocator | ||
| 621 | */ | ||
| 622 | int slab_is_available(void) | ||
| 623 | { | ||
| 624 | return g_cpucache_up >= EARLY; | ||
| 625 | } | ||
| 626 | |||
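Hoisting g_cpucache_up and slab_is_available() above the lockdep helpers lets the new init_node_lock_keys() bail out until the cache bootstrap is FULL; the helper's meaning is unchanged: once initialization reaches EARLY, kmalloc() works. The usual boot-time pattern it serves looks roughly like this (a sketch, not taken from this diff):

#include <linux/slab.h>
#include <linux/bootmem.h>

/* fall back to the bootmem allocator until the slab allocator is usable */
static void *early_or_slab_alloc(size_t size)
{
	if (slab_is_available())
		return kmalloc(size, GFP_NOWAIT);
	return alloc_bootmem(size);
}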
| 607 | #ifdef CONFIG_LOCKDEP | 627 | #ifdef CONFIG_LOCKDEP |
| 608 | 628 | ||
| 609 | /* | 629 | /* |
| @@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = { | |||
| 620 | static struct lock_class_key on_slab_l3_key; | 640 | static struct lock_class_key on_slab_l3_key; |
| 621 | static struct lock_class_key on_slab_alc_key; | 641 | static struct lock_class_key on_slab_alc_key; |
| 622 | 642 | ||
| 623 | static inline void init_lock_keys(void) | 643 | static void init_node_lock_keys(int q) |
| 624 | |||
| 625 | { | 644 | { |
| 626 | int q; | ||
| 627 | struct cache_sizes *s = malloc_sizes; | 645 | struct cache_sizes *s = malloc_sizes; |
| 628 | 646 | ||
| 629 | while (s->cs_size != ULONG_MAX) { | 647 | if (g_cpucache_up != FULL) |
| 630 | for_each_node(q) { | 648 | return; |
| 631 | struct array_cache **alc; | 649 | |
| 632 | int r; | 650 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
| 633 | struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; | 651 | struct array_cache **alc; |
| 634 | if (!l3 || OFF_SLAB(s->cs_cachep)) | 652 | struct kmem_list3 *l3; |
| 635 | continue; | 653 | int r; |
| 636 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); | 654 | |
| 637 | alc = l3->alien; | 655 | l3 = s->cs_cachep->nodelists[q]; |
| 638 | /* | 656 | if (!l3 || OFF_SLAB(s->cs_cachep)) |
| 639 | * FIXME: This check for BAD_ALIEN_MAGIC | 657 | continue; |
| 640 | * should go away when common slab code is taught to | 658 | lockdep_set_class(&l3->list_lock, &on_slab_l3_key); |
| 641 | * work even without alien caches. | 659 | alc = l3->alien; |
| 642 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC | 660 | /* |
| 643 | * for alloc_alien_cache, | 661 | * FIXME: This check for BAD_ALIEN_MAGIC |
| 644 | */ | 662 | * should go away when common slab code is taught to |
| 645 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) | 663 | * work even without alien caches. |
| 646 | continue; | 664 | * Currently, non NUMA code returns BAD_ALIEN_MAGIC |
| 647 | for_each_node(r) { | 665 | * for alloc_alien_cache, |
| 648 | if (alc[r]) | 666 | */ |
| 649 | lockdep_set_class(&alc[r]->lock, | 667 | if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) |
| 650 | &on_slab_alc_key); | 668 | continue; |
| 651 | } | 669 | for_each_node(r) { |
| 670 | if (alc[r]) | ||
| 671 | lockdep_set_class(&alc[r]->lock, | ||
| 672 | &on_slab_alc_key); | ||
| 652 | } | 673 | } |
| 653 | s++; | ||
| 654 | } | 674 | } |
| 655 | } | 675 | } |
| 676 | |||
| 677 | static inline void init_lock_keys(void) | ||
| 678 | { | ||
| 679 | int node; | ||
| 680 | |||
| 681 | for_each_node(node) | ||
| 682 | init_node_lock_keys(node); | ||
| 683 | } | ||
| 656 | #else | 684 | #else |
| 685 | static void init_node_lock_keys(int q) | ||
| 686 | { | ||
| 687 | } | ||
| 688 | |||
| 657 | static inline void init_lock_keys(void) | 689 | static inline void init_lock_keys(void) |
| 658 | { | 690 | { |
| 659 | } | 691 | } |
| @@ -665,27 +697,7 @@ static inline void init_lock_keys(void) | |||
| 665 | static DEFINE_MUTEX(cache_chain_mutex); | 697 | static DEFINE_MUTEX(cache_chain_mutex); |
| 666 | static struct list_head cache_chain; | 698 | static struct list_head cache_chain; |
| 667 | 699 | ||
| 668 | /* | 700 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
| 669 | * chicken and egg problem: delay the per-cpu array allocation | ||
| 670 | * until the general caches are up. | ||
| 671 | */ | ||
| 672 | static enum { | ||
| 673 | NONE, | ||
| 674 | PARTIAL_AC, | ||
| 675 | PARTIAL_L3, | ||
| 676 | EARLY, | ||
| 677 | FULL | ||
| 678 | } g_cpucache_up; | ||
| 679 | |||
| 680 | /* | ||
| 681 | * used by boot code to determine if it can use slab based allocator | ||
| 682 | */ | ||
| 683 | int slab_is_available(void) | ||
| 684 | { | ||
| 685 | return g_cpucache_up >= EARLY; | ||
| 686 | } | ||
| 687 | |||
| 688 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | ||
| 689 | 701 | ||
| 690 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 702 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
| 691 | { | 703 | { |
| @@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup); | |||
| 826 | * objects freed on different nodes from which they were allocated) and the | 838 | * objects freed on different nodes from which they were allocated) and the |
| 827 | * flushing of remote pcps by calling drain_node_pages. | 839 | * flushing of remote pcps by calling drain_node_pages. |
| 828 | */ | 840 | */ |
| 829 | static DEFINE_PER_CPU(unsigned long, reap_node); | 841 | static DEFINE_PER_CPU(unsigned long, slab_reap_node); |
| 830 | 842 | ||
| 831 | static void init_reap_node(int cpu) | 843 | static void init_reap_node(int cpu) |
| 832 | { | 844 | { |
| @@ -836,17 +848,17 @@ static void init_reap_node(int cpu) | |||
| 836 | if (node == MAX_NUMNODES) | 848 | if (node == MAX_NUMNODES) |
| 837 | node = first_node(node_online_map); | 849 | node = first_node(node_online_map); |
| 838 | 850 | ||
| 839 | per_cpu(reap_node, cpu) = node; | 851 | per_cpu(slab_reap_node, cpu) = node; |
| 840 | } | 852 | } |
| 841 | 853 | ||
| 842 | static void next_reap_node(void) | 854 | static void next_reap_node(void) |
| 843 | { | 855 | { |
| 844 | int node = __get_cpu_var(reap_node); | 856 | int node = __get_cpu_var(slab_reap_node); |
| 845 | 857 | ||
| 846 | node = next_node(node, node_online_map); | 858 | node = next_node(node, node_online_map); |
| 847 | if (unlikely(node >= MAX_NUMNODES)) | 859 | if (unlikely(node >= MAX_NUMNODES)) |
| 848 | node = first_node(node_online_map); | 860 | node = first_node(node_online_map); |
| 849 | __get_cpu_var(reap_node) = node; | 861 | __get_cpu_var(slab_reap_node) = node; |
| 850 | } | 862 | } |
| 851 | 863 | ||
| 852 | #else | 864 | #else |
| @@ -863,7 +875,7 @@ static void next_reap_node(void) | |||
| 863 | */ | 875 | */ |
| 864 | static void __cpuinit start_cpu_timer(int cpu) | 876 | static void __cpuinit start_cpu_timer(int cpu) |
| 865 | { | 877 | { |
| 866 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | 878 | struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); |
| 867 | 879 | ||
| 868 | /* | 880 | /* |
| 869 | * When this gets called from do_initcalls via cpucache_init(), | 881 | * When this gets called from do_initcalls via cpucache_init(), |
| @@ -923,7 +935,6 @@ static int transfer_objects(struct array_cache *to, | |||
| 923 | 935 | ||
| 924 | from->avail -= nr; | 936 | from->avail -= nr; |
| 925 | to->avail += nr; | 937 | to->avail += nr; |
| 926 | to->touched = 1; | ||
| 927 | return nr; | 938 | return nr; |
| 928 | } | 939 | } |
| 929 | 940 | ||
| @@ -971,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | |||
| 971 | 982 | ||
| 972 | if (limit > 1) | 983 | if (limit > 1) |
| 973 | limit = 12; | 984 | limit = 12; |
| 974 | ac_ptr = kmalloc_node(memsize, gfp, node); | 985 | ac_ptr = kzalloc_node(memsize, gfp, node); |
| 975 | if (ac_ptr) { | 986 | if (ac_ptr) { |
| 976 | for_each_node(i) { | 987 | for_each_node(i) { |
| 977 | if (i == node || !node_online(i)) { | 988 | if (i == node || !node_online(i)) |
| 978 | ac_ptr[i] = NULL; | ||
| 979 | continue; | 989 | continue; |
| 980 | } | ||
| 981 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); | 990 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
| 982 | if (!ac_ptr[i]) { | 991 | if (!ac_ptr[i]) { |
| 983 | for (i--; i >= 0; i--) | 992 | for (i--; i >= 0; i--) |
| @@ -1027,7 +1036,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, | |||
| 1027 | */ | 1036 | */ |
| 1028 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) | 1037 | static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) |
| 1029 | { | 1038 | { |
| 1030 | int node = __get_cpu_var(reap_node); | 1039 | int node = __get_cpu_var(slab_reap_node); |
| 1031 | 1040 | ||
| 1032 | if (l3->alien) { | 1041 | if (l3->alien) { |
| 1033 | struct array_cache *ac = l3->alien[node]; | 1042 | struct array_cache *ac = l3->alien[node]; |
| @@ -1120,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
| 1120 | if (nc) | 1129 | if (nc) |
| 1121 | free_block(cachep, nc->entry, nc->avail, node); | 1130 | free_block(cachep, nc->entry, nc->avail, node); |
| 1122 | 1131 | ||
| 1123 | if (!cpus_empty(*mask)) { | 1132 | if (!cpumask_empty(mask)) { |
| 1124 | spin_unlock_irq(&l3->list_lock); | 1133 | spin_unlock_irq(&l3->list_lock); |
| 1125 | goto free_array_cache; | 1134 | goto free_array_cache; |
| 1126 | } | 1135 | } |
| @@ -1254,6 +1263,8 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
| 1254 | kfree(shared); | 1263 | kfree(shared); |
| 1255 | free_alien_cache(alien); | 1264 | free_alien_cache(alien); |
| 1256 | } | 1265 | } |
| 1266 | init_node_lock_keys(node); | ||
| 1267 | |||
| 1257 | return 0; | 1268 | return 0; |
| 1258 | bad: | 1269 | bad: |
| 1259 | cpuup_canceled(cpu); | 1270 | cpuup_canceled(cpu); |
| @@ -1286,9 +1297,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
| 1286 | * anything expensive but will only modify reap_work | 1297 | * anything expensive but will only modify reap_work |
| 1287 | * and reschedule the timer. | 1298 | * and reschedule the timer. |
| 1288 | */ | 1299 | */ |
| 1289 | cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); | 1300 | cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); |
| 1290 | /* Now the cache_reaper is guaranteed to be not running. */ | 1301 | /* Now the cache_reaper is guaranteed to be not running. */ |
| 1291 | per_cpu(reap_work, cpu).work.func = NULL; | 1302 | per_cpu(slab_reap_work, cpu).work.func = NULL; |
| 1292 | break; | 1303 | break; |
| 1293 | case CPU_DOWN_FAILED: | 1304 | case CPU_DOWN_FAILED: |
| 1294 | case CPU_DOWN_FAILED_FROZEN: | 1305 | case CPU_DOWN_FAILED_FROZEN: |
| @@ -2261,9 +2272,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2261 | /* | 2272 | /* |
| 2262 | * Determine if the slab management is 'on' or 'off' slab. | 2273 | * Determine if the slab management is 'on' or 'off' slab. |
| 2263 | * (bootstrapping cannot cope with offslab caches so don't do | 2274 | * (bootstrapping cannot cope with offslab caches so don't do |
| 2264 | * it too early on.) | 2275 | * it too early on. Always use on-slab management when |
| 2276 | * SLAB_NOLEAKTRACE is set, to avoid recursive calls into kmemleak) | ||
| 2265 | */ | 2277 | */ |
| 2266 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | 2278 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && |
| 2279 | !(flags & SLAB_NOLEAKTRACE)) | ||
| 2267 | /* | 2280 | /* |
| 2268 | * Size is large, assume best to place the slab management obj | 2281 | * Size is large, assume best to place the slab management obj |
| 2269 | * off-slab (should allow better packing of objs). | 2282 | * off-slab (should allow better packing of objs). |
| @@ -2582,8 +2595,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
| 2582 | * kmemleak does not treat the ->s_mem pointer as a reference | 2595 | * kmemleak does not treat the ->s_mem pointer as a reference |
| 2583 | * to the object. Otherwise we will not report the leak. | 2596 | * to the object. Otherwise we will not report the leak. |
| 2584 | */ | 2597 | */ |
| 2585 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | 2598 | kmemleak_scan_area(&slabp->list, sizeof(struct list_head), |
| 2586 | sizeof(struct list_head), local_flags); | 2599 | local_flags); |
| 2587 | if (!slabp) | 2600 | if (!slabp) |
| 2588 | return NULL; | 2601 | return NULL; |
| 2589 | } else { | 2602 | } else { |
| @@ -2947,8 +2960,10 @@ retry: | |||
| 2947 | spin_lock(&l3->list_lock); | 2960 | spin_lock(&l3->list_lock); |
| 2948 | 2961 | ||
| 2949 | /* See if we can refill from the shared array */ | 2962 | /* See if we can refill from the shared array */ |
| 2950 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) | 2963 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { |
| 2964 | l3->shared->touched = 1; | ||
| 2951 | goto alloc_done; | 2965 | goto alloc_done; |
| 2966 | } | ||
| 2952 | 2967 | ||
| 2953 | while (batchcount > 0) { | 2968 | while (batchcount > 0) { |
| 2954 | struct list_head *entry; | 2969 | struct list_head *entry; |
| @@ -3085,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
| 3085 | if (cachep == &cache_cache) | 3100 | if (cachep == &cache_cache) |
| 3086 | return false; | 3101 | return false; |
| 3087 | 3102 | ||
| 3088 | return should_failslab(obj_size(cachep), flags); | 3103 | return should_failslab(obj_size(cachep), flags, cachep->flags); |
| 3089 | } | 3104 | } |
| 3090 | 3105 | ||
| 3091 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3106 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
| @@ -3103,13 +3118,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3103 | } else { | 3118 | } else { |
| 3104 | STATS_INC_ALLOCMISS(cachep); | 3119 | STATS_INC_ALLOCMISS(cachep); |
| 3105 | objp = cache_alloc_refill(cachep, flags); | 3120 | objp = cache_alloc_refill(cachep, flags); |
| 3121 | /* | ||
| 3122 | * the 'ac' may be updated by cache_alloc_refill(), | ||
| 3123 | * and kmemleak_erase() requires its correct value. | ||
| 3124 | */ | ||
| 3125 | ac = cpu_cache_get(cachep); | ||
| 3106 | } | 3126 | } |
| 3107 | /* | 3127 | /* |
| 3108 | * To avoid a false negative, if an object that is in one of the | 3128 | * To avoid a false negative, if an object that is in one of the |
| 3109 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3129 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
| 3110 | * treat the array pointers as a reference to the object. | 3130 | * treat the array pointers as a reference to the object. |
| 3111 | */ | 3131 | */ |
| 3112 | kmemleak_erase(&ac->entry[ac->avail]); | 3132 | if (objp) |
| 3133 | kmemleak_erase(&ac->entry[ac->avail]); | ||
| 3113 | return objp; | 3134 | return objp; |
| 3114 | } | 3135 | } |
| 3115 | 3136 | ||
| @@ -3306,7 +3327,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
| 3306 | cache_alloc_debugcheck_before(cachep, flags); | 3327 | cache_alloc_debugcheck_before(cachep, flags); |
| 3307 | local_irq_save(save_flags); | 3328 | local_irq_save(save_flags); |
| 3308 | 3329 | ||
| 3309 | if (unlikely(nodeid == -1)) | 3330 | if (nodeid == -1) |
| 3310 | nodeid = numa_node_id(); | 3331 | nodeid = numa_node_id(); |
| 3311 | 3332 | ||
| 3312 | if (unlikely(!cachep->nodelists[nodeid])) { | 3333 | if (unlikely(!cachep->nodelists[nodeid])) { |
| @@ -3558,7 +3579,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3558 | } | 3579 | } |
| 3559 | EXPORT_SYMBOL(kmem_cache_alloc); | 3580 | EXPORT_SYMBOL(kmem_cache_alloc); |
| 3560 | 3581 | ||
| 3561 | #ifdef CONFIG_KMEMTRACE | 3582 | #ifdef CONFIG_TRACING |
| 3562 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | 3583 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) |
| 3563 | { | 3584 | { |
| 3564 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3585 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); |
| @@ -3621,7 +3642,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 3621 | } | 3642 | } |
| 3622 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3643 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
| 3623 | 3644 | ||
| 3624 | #ifdef CONFIG_KMEMTRACE | 3645 | #ifdef CONFIG_TRACING |
| 3625 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | 3646 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, |
| 3626 | gfp_t flags, | 3647 | gfp_t flags, |
| 3627 | int nodeid) | 3648 | int nodeid) |
| @@ -3649,7 +3670,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | |||
| 3649 | return ret; | 3670 | return ret; |
| 3650 | } | 3671 | } |
| 3651 | 3672 | ||
| 3652 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3673 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
| 3653 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3674 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
| 3654 | { | 3675 | { |
| 3655 | return __do_kmalloc_node(size, flags, node, | 3676 | return __do_kmalloc_node(size, flags, node, |
| @@ -3669,7 +3690,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) | |||
| 3669 | return __do_kmalloc_node(size, flags, node, NULL); | 3690 | return __do_kmalloc_node(size, flags, node, NULL); |
| 3670 | } | 3691 | } |
| 3671 | EXPORT_SYMBOL(__kmalloc_node); | 3692 | EXPORT_SYMBOL(__kmalloc_node); |
| 3672 | #endif /* CONFIG_DEBUG_SLAB */ | 3693 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ |
| 3673 | #endif /* CONFIG_NUMA */ | 3694 | #endif /* CONFIG_NUMA */ |
| 3674 | 3695 | ||
| 3675 | /** | 3696 | /** |
| @@ -3701,7 +3722,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
| 3701 | } | 3722 | } |
| 3702 | 3723 | ||
| 3703 | 3724 | ||
| 3704 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) | 3725 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) |
| 3705 | void *__kmalloc(size_t size, gfp_t flags) | 3726 | void *__kmalloc(size_t size, gfp_t flags) |
| 3706 | { | 3727 | { |
| 3707 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | 3728 | return __do_kmalloc(size, flags, __builtin_return_address(0)); |
| @@ -151,7 +151,8 @@ | |||
| 151 | * Set of flags that will prevent slab merging | 151 | * Set of flags that will prevent slab merging |
| 152 | */ | 152 | */ |
| 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
| 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) | 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ |
| 155 | SLAB_FAILSLAB) | ||
| 155 | 156 | ||
| 156 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 157 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
| 157 | SLAB_CACHE_DMA | SLAB_NOTRACK) | 158 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
| @@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) | |||
| 217 | 218 | ||
| 218 | #endif | 219 | #endif |
| 219 | 220 | ||
| 220 | static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) | 221 | static inline void stat(struct kmem_cache *s, enum stat_item si) |
| 221 | { | 222 | { |
| 222 | #ifdef CONFIG_SLUB_STATS | 223 | #ifdef CONFIG_SLUB_STATS |
| 223 | c->stat[si]++; | 224 | __this_cpu_inc(s->cpu_slab->stat[si]); |
| 224 | #endif | 225 | #endif |
| 225 | } | 226 | } |
| 226 | 227 | ||
| @@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
| 242 | #endif | 243 | #endif |
| 243 | } | 244 | } |
| 244 | 245 | ||
| 245 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
| 246 | { | ||
| 247 | #ifdef CONFIG_SMP | ||
| 248 | return s->cpu_slab[cpu]; | ||
| 249 | #else | ||
| 250 | return &s->cpu_slab; | ||
| 251 | #endif | ||
| 252 | } | ||
| 253 | |||
| 254 | /* Verify that a pointer has an address that is valid within a slab page */ | 246 | /* Verify that a pointer has an address that is valid within a slab page */ |
| 255 | static inline int check_valid_pointer(struct kmem_cache *s, | 247 | static inline int check_valid_pointer(struct kmem_cache *s, |
| 256 | struct page *page, const void *object) | 248 | struct page *page, const void *object) |
| @@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, | |||
| 269 | return 1; | 261 | return 1; |
| 270 | } | 262 | } |
| 271 | 263 | ||
| 272 | /* | ||
| 273 | * Slow version of get and set free pointer. | ||
| 274 | * | ||
| 275 | * This version requires touching the cache lines of kmem_cache which | ||
| 276 | * we avoid to do in the fast alloc free paths. There we obtain the offset | ||
| 277 | * from the page struct. | ||
| 278 | */ | ||
| 279 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | 264 | static inline void *get_freepointer(struct kmem_cache *s, void *object) |
| 280 | { | 265 | { |
| 281 | return *(void **)(object + s->offset); | 266 | return *(void **)(object + s->offset); |
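
get_freepointer() (and its set_freepointer() counterpart) read and write the next-free pointer that SLUB keeps inside each free object at s->offset. The following self-contained sketch shows the same free-list-inside-the-objects technique with a made-up pool type; it illustrates the idea and is not the kernel implementation.

#include <stdio.h>

/*
 * Minimal sketch of a free list threaded through the free objects
 * themselves, at a configurable offset.  Types and names are invented.
 */
struct pool {
	size_t size;		/* object size */
	size_t offset;		/* where the next-free pointer is stored */
	void *freelist;		/* first free object, or NULL */
};

static void *get_fp(struct pool *p, void *object)
{
	return *(void **)((char *)object + p->offset);
}

static void set_fp(struct pool *p, void *object, void *fp)
{
	*(void **)((char *)object + p->offset) = fp;
}

static void *pool_alloc(struct pool *p)
{
	void *object = p->freelist;

	if (object)
		p->freelist = get_fp(p, object);	/* pop */
	return object;
}

static void pool_free(struct pool *p, void *object)
{
	set_fp(p, object, p->freelist);			/* push */
	p->freelist = object;
}

int main(void)
{
	struct pool p = { .size = 64, .offset = 0, .freelist = NULL };
	_Alignas(void *) char slab[4 * 64];
	int i;

	for (i = 0; i < 4; i++)				/* seed the free list */
		pool_free(&p, slab + i * p.size);

	void *a = pool_alloc(&p);
	void *b = pool_alloc(&p);
	printf("allocated %p and %p\n", a, b);
	pool_free(&p, b);
	pool_free(&p, a);
	return 0;
}
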
| @@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str) | |||
| 1020 | case 't': | 1005 | case 't': |
| 1021 | slub_debug |= SLAB_TRACE; | 1006 | slub_debug |= SLAB_TRACE; |
| 1022 | break; | 1007 | break; |
| 1008 | case 'a': | ||
| 1009 | slub_debug |= SLAB_FAILSLAB; | ||
| 1010 | break; | ||
| 1023 | default: | 1011 | default: |
| 1024 | printk(KERN_ERR "slub_debug option '%c' " | 1012 | printk(KERN_ERR "slub_debug option '%c' " |
| 1025 | "unknown. skipped\n", *str); | 1013 | "unknown. skipped\n", *str); |
| @@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1124 | if (!page) | 1112 | if (!page) |
| 1125 | return NULL; | 1113 | return NULL; |
| 1126 | 1114 | ||
| 1127 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1115 | stat(s, ORDER_FALLBACK); |
| 1128 | } | 1116 | } |
| 1129 | 1117 | ||
| 1130 | if (kmemcheck_enabled | 1118 | if (kmemcheck_enabled |
| @@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1422 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | 1410 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) |
| 1423 | { | 1411 | { |
| 1424 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1412 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
| 1425 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); | ||
| 1426 | 1413 | ||
| 1427 | __ClearPageSlubFrozen(page); | 1414 | __ClearPageSlubFrozen(page); |
| 1428 | if (page->inuse) { | 1415 | if (page->inuse) { |
| 1429 | 1416 | ||
| 1430 | if (page->freelist) { | 1417 | if (page->freelist) { |
| 1431 | add_partial(n, page, tail); | 1418 | add_partial(n, page, tail); |
| 1432 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | 1419 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); |
| 1433 | } else { | 1420 | } else { |
| 1434 | stat(c, DEACTIVATE_FULL); | 1421 | stat(s, DEACTIVATE_FULL); |
| 1435 | if (SLABDEBUG && PageSlubDebug(page) && | 1422 | if (SLABDEBUG && PageSlubDebug(page) && |
| 1436 | (s->flags & SLAB_STORE_USER)) | 1423 | (s->flags & SLAB_STORE_USER)) |
| 1437 | add_full(n, page); | 1424 | add_full(n, page); |
| 1438 | } | 1425 | } |
| 1439 | slab_unlock(page); | 1426 | slab_unlock(page); |
| 1440 | } else { | 1427 | } else { |
| 1441 | stat(c, DEACTIVATE_EMPTY); | 1428 | stat(s, DEACTIVATE_EMPTY); |
| 1442 | if (n->nr_partial < s->min_partial) { | 1429 | if (n->nr_partial < s->min_partial) { |
| 1443 | /* | 1430 | /* |
| 1444 | * Adding an empty slab to the partial slabs in order | 1431 | * Adding an empty slab to the partial slabs in order |
| @@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
| 1454 | slab_unlock(page); | 1441 | slab_unlock(page); |
| 1455 | } else { | 1442 | } else { |
| 1456 | slab_unlock(page); | 1443 | slab_unlock(page); |
| 1457 | stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); | 1444 | stat(s, FREE_SLAB); |
| 1458 | discard_slab(s, page); | 1445 | discard_slab(s, page); |
| 1459 | } | 1446 | } |
| 1460 | } | 1447 | } |
| @@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1469 | int tail = 1; | 1456 | int tail = 1; |
| 1470 | 1457 | ||
| 1471 | if (page->freelist) | 1458 | if (page->freelist) |
| 1472 | stat(c, DEACTIVATE_REMOTE_FREES); | 1459 | stat(s, DEACTIVATE_REMOTE_FREES); |
| 1473 | /* | 1460 | /* |
| 1474 | * Merge cpu freelist into slab freelist. Typically we get here | 1461 | * Merge cpu freelist into slab freelist. Typically we get here |
| 1475 | * because both freelists are empty. So this is unlikely | 1462 | * because both freelists are empty. So this is unlikely |
| @@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1482 | 1469 | ||
| 1483 | /* Retrieve object from cpu_freelist */ | 1470 | /* Retrieve object from cpu_freelist */ |
| 1484 | object = c->freelist; | 1471 | object = c->freelist; |
| 1485 | c->freelist = c->freelist[c->offset]; | 1472 | c->freelist = get_freepointer(s, c->freelist); |
| 1486 | 1473 | ||
| 1487 | /* And put onto the regular freelist */ | 1474 | /* And put onto the regular freelist */ |
| 1488 | object[c->offset] = page->freelist; | 1475 | set_freepointer(s, object, page->freelist); |
| 1489 | page->freelist = object; | 1476 | page->freelist = object; |
| 1490 | page->inuse--; | 1477 | page->inuse--; |
| 1491 | } | 1478 | } |
| @@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1495 | 1482 | ||
| 1496 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1483 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
| 1497 | { | 1484 | { |
| 1498 | stat(c, CPUSLAB_FLUSH); | 1485 | stat(s, CPUSLAB_FLUSH); |
| 1499 | slab_lock(c->page); | 1486 | slab_lock(c->page); |
| 1500 | deactivate_slab(s, c); | 1487 | deactivate_slab(s, c); |
| 1501 | } | 1488 | } |
| @@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
| 1507 | */ | 1494 | */ |
| 1508 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1495 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
| 1509 | { | 1496 | { |
| 1510 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 1497 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
| 1511 | 1498 | ||
| 1512 | if (likely(c && c->page)) | 1499 | if (likely(c && c->page)) |
| 1513 | flush_slab(s, c); | 1500 | flush_slab(s, c); |
| @@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
| 1635 | if (unlikely(!node_match(c, node))) | 1622 | if (unlikely(!node_match(c, node))) |
| 1636 | goto another_slab; | 1623 | goto another_slab; |
| 1637 | 1624 | ||
| 1638 | stat(c, ALLOC_REFILL); | 1625 | stat(s, ALLOC_REFILL); |
| 1639 | 1626 | ||
| 1640 | load_freelist: | 1627 | load_freelist: |
| 1641 | object = c->page->freelist; | 1628 | object = c->page->freelist; |
| @@ -1644,13 +1631,13 @@ load_freelist: | |||
| 1644 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) | 1631 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) |
| 1645 | goto debug; | 1632 | goto debug; |
| 1646 | 1633 | ||
| 1647 | c->freelist = object[c->offset]; | 1634 | c->freelist = get_freepointer(s, object); |
| 1648 | c->page->inuse = c->page->objects; | 1635 | c->page->inuse = c->page->objects; |
| 1649 | c->page->freelist = NULL; | 1636 | c->page->freelist = NULL; |
| 1650 | c->node = page_to_nid(c->page); | 1637 | c->node = page_to_nid(c->page); |
| 1651 | unlock_out: | 1638 | unlock_out: |
| 1652 | slab_unlock(c->page); | 1639 | slab_unlock(c->page); |
| 1653 | stat(c, ALLOC_SLOWPATH); | 1640 | stat(s, ALLOC_SLOWPATH); |
| 1654 | return object; | 1641 | return object; |
| 1655 | 1642 | ||
| 1656 | another_slab: | 1643 | another_slab: |
| @@ -1660,7 +1647,7 @@ new_slab: | |||
| 1660 | new = get_partial(s, gfpflags, node); | 1647 | new = get_partial(s, gfpflags, node); |
| 1661 | if (new) { | 1648 | if (new) { |
| 1662 | c->page = new; | 1649 | c->page = new; |
| 1663 | stat(c, ALLOC_FROM_PARTIAL); | 1650 | stat(s, ALLOC_FROM_PARTIAL); |
| 1664 | goto load_freelist; | 1651 | goto load_freelist; |
| 1665 | } | 1652 | } |
| 1666 | 1653 | ||
| @@ -1673,8 +1660,8 @@ new_slab: | |||
| 1673 | local_irq_disable(); | 1660 | local_irq_disable(); |
| 1674 | 1661 | ||
| 1675 | if (new) { | 1662 | if (new) { |
| 1676 | c = get_cpu_slab(s, smp_processor_id()); | 1663 | c = __this_cpu_ptr(s->cpu_slab); |
| 1677 | stat(c, ALLOC_SLAB); | 1664 | stat(s, ALLOC_SLAB); |
| 1678 | if (c->page) | 1665 | if (c->page) |
| 1679 | flush_slab(s, c); | 1666 | flush_slab(s, c); |
| 1680 | slab_lock(new); | 1667 | slab_lock(new); |
| @@ -1690,7 +1677,7 @@ debug: | |||
| 1690 | goto another_slab; | 1677 | goto another_slab; |
| 1691 | 1678 | ||
| 1692 | c->page->inuse++; | 1679 | c->page->inuse++; |
| 1693 | c->page->freelist = object[c->offset]; | 1680 | c->page->freelist = get_freepointer(s, object); |
| 1694 | c->node = -1; | 1681 | c->node = -1; |
| 1695 | goto unlock_out; | 1682 | goto unlock_out; |
| 1696 | } | 1683 | } |
| @@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
| 1711 | void **object; | 1698 | void **object; |
| 1712 | struct kmem_cache_cpu *c; | 1699 | struct kmem_cache_cpu *c; |
| 1713 | unsigned long flags; | 1700 | unsigned long flags; |
| 1714 | unsigned int objsize; | ||
| 1715 | 1701 | ||
| 1716 | gfpflags &= gfp_allowed_mask; | 1702 | gfpflags &= gfp_allowed_mask; |
| 1717 | 1703 | ||
| 1718 | lockdep_trace_alloc(gfpflags); | 1704 | lockdep_trace_alloc(gfpflags); |
| 1719 | might_sleep_if(gfpflags & __GFP_WAIT); | 1705 | might_sleep_if(gfpflags & __GFP_WAIT); |
| 1720 | 1706 | ||
| 1721 | if (should_failslab(s->objsize, gfpflags)) | 1707 | if (should_failslab(s->objsize, gfpflags, s->flags)) |
| 1722 | return NULL; | 1708 | return NULL; |
| 1723 | 1709 | ||
| 1724 | local_irq_save(flags); | 1710 | local_irq_save(flags); |
| 1725 | c = get_cpu_slab(s, smp_processor_id()); | 1711 | c = __this_cpu_ptr(s->cpu_slab); |
| 1726 | objsize = c->objsize; | 1712 | object = c->freelist; |
| 1727 | if (unlikely(!c->freelist || !node_match(c, node))) | 1713 | if (unlikely(!object || !node_match(c, node))) |
| 1728 | 1714 | ||
| 1729 | object = __slab_alloc(s, gfpflags, node, addr, c); | 1715 | object = __slab_alloc(s, gfpflags, node, addr, c); |
| 1730 | 1716 | ||
| 1731 | else { | 1717 | else { |
| 1732 | object = c->freelist; | 1718 | c->freelist = get_freepointer(s, object); |
| 1733 | c->freelist = object[c->offset]; | 1719 | stat(s, ALLOC_FASTPATH); |
| 1734 | stat(c, ALLOC_FASTPATH); | ||
| 1735 | } | 1720 | } |
| 1736 | local_irq_restore(flags); | 1721 | local_irq_restore(flags); |
| 1737 | 1722 | ||
| 1738 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1723 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
| 1739 | memset(object, 0, objsize); | 1724 | memset(object, 0, s->objsize); |
| 1740 | 1725 | ||
| 1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | 1726 | kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); |
| 1742 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | 1727 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); |
| 1743 | 1728 | ||
| 1744 | return object; | 1729 | return object; |
| 1745 | } | 1730 | } |
| @@ -1754,7 +1739,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
| 1754 | } | 1739 | } |
| 1755 | EXPORT_SYMBOL(kmem_cache_alloc); | 1740 | EXPORT_SYMBOL(kmem_cache_alloc); |
| 1756 | 1741 | ||
| 1757 | #ifdef CONFIG_KMEMTRACE | 1742 | #ifdef CONFIG_TRACING |
| 1758 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | 1743 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) |
| 1759 | { | 1744 | { |
| 1760 | return slab_alloc(s, gfpflags, -1, _RET_IP_); | 1745 | return slab_alloc(s, gfpflags, -1, _RET_IP_); |
| @@ -1775,7 +1760,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
| 1775 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1760 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
| 1776 | #endif | 1761 | #endif |
| 1777 | 1762 | ||
| 1778 | #ifdef CONFIG_KMEMTRACE | 1763 | #ifdef CONFIG_TRACING |
| 1779 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | 1764 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, |
| 1780 | gfp_t gfpflags, | 1765 | gfp_t gfpflags, |
| 1781 | int node) | 1766 | int node) |
| @@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | |||
| 1794 | * handling required then we can return immediately. | 1779 | * handling required then we can return immediately. |
| 1795 | */ | 1780 | */ |
| 1796 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1781 | static void __slab_free(struct kmem_cache *s, struct page *page, |
| 1797 | void *x, unsigned long addr, unsigned int offset) | 1782 | void *x, unsigned long addr) |
| 1798 | { | 1783 | { |
| 1799 | void *prior; | 1784 | void *prior; |
| 1800 | void **object = (void *)x; | 1785 | void **object = (void *)x; |
| 1801 | struct kmem_cache_cpu *c; | ||
| 1802 | 1786 | ||
| 1803 | c = get_cpu_slab(s, raw_smp_processor_id()); | 1787 | stat(s, FREE_SLOWPATH); |
| 1804 | stat(c, FREE_SLOWPATH); | ||
| 1805 | slab_lock(page); | 1788 | slab_lock(page); |
| 1806 | 1789 | ||
| 1807 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) | 1790 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) |
| 1808 | goto debug; | 1791 | goto debug; |
| 1809 | 1792 | ||
| 1810 | checks_ok: | 1793 | checks_ok: |
| 1811 | prior = object[offset] = page->freelist; | 1794 | prior = page->freelist; |
| 1795 | set_freepointer(s, object, prior); | ||
| 1812 | page->freelist = object; | 1796 | page->freelist = object; |
| 1813 | page->inuse--; | 1797 | page->inuse--; |
| 1814 | 1798 | ||
| 1815 | if (unlikely(PageSlubFrozen(page))) { | 1799 | if (unlikely(PageSlubFrozen(page))) { |
| 1816 | stat(c, FREE_FROZEN); | 1800 | stat(s, FREE_FROZEN); |
| 1817 | goto out_unlock; | 1801 | goto out_unlock; |
| 1818 | } | 1802 | } |
| 1819 | 1803 | ||
| @@ -1826,7 +1810,7 @@ checks_ok: | |||
| 1826 | */ | 1810 | */ |
| 1827 | if (unlikely(!prior)) { | 1811 | if (unlikely(!prior)) { |
| 1828 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 1812 | add_partial(get_node(s, page_to_nid(page)), page, 1); |
| 1829 | stat(c, FREE_ADD_PARTIAL); | 1813 | stat(s, FREE_ADD_PARTIAL); |
| 1830 | } | 1814 | } |
| 1831 | 1815 | ||
| 1832 | out_unlock: | 1816 | out_unlock: |
| @@ -1839,10 +1823,10 @@ slab_empty: | |||
| 1839 | * Slab still on the partial list. | 1823 | * Slab still on the partial list. |
| 1840 | */ | 1824 | */ |
| 1841 | remove_partial(s, page); | 1825 | remove_partial(s, page); |
| 1842 | stat(c, FREE_REMOVE_PARTIAL); | 1826 | stat(s, FREE_REMOVE_PARTIAL); |
| 1843 | } | 1827 | } |
| 1844 | slab_unlock(page); | 1828 | slab_unlock(page); |
| 1845 | stat(c, FREE_SLAB); | 1829 | stat(s, FREE_SLAB); |
| 1846 | discard_slab(s, page); | 1830 | discard_slab(s, page); |
| 1847 | return; | 1831 | return; |
| 1848 | 1832 | ||
| @@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
| 1872 | 1856 | ||
| 1873 | kmemleak_free_recursive(x, s->flags); | 1857 | kmemleak_free_recursive(x, s->flags); |
| 1874 | local_irq_save(flags); | 1858 | local_irq_save(flags); |
| 1875 | c = get_cpu_slab(s, smp_processor_id()); | 1859 | c = __this_cpu_ptr(s->cpu_slab); |
| 1876 | kmemcheck_slab_free(s, object, c->objsize); | 1860 | kmemcheck_slab_free(s, object, s->objsize); |
| 1877 | debug_check_no_locks_freed(object, c->objsize); | 1861 | debug_check_no_locks_freed(object, s->objsize); |
| 1878 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1862 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
| 1879 | debug_check_no_obj_freed(object, c->objsize); | 1863 | debug_check_no_obj_freed(object, s->objsize); |
| 1880 | if (likely(page == c->page && c->node >= 0)) { | 1864 | if (likely(page == c->page && c->node >= 0)) { |
| 1881 | object[c->offset] = c->freelist; | 1865 | set_freepointer(s, object, c->freelist); |
| 1882 | c->freelist = object; | 1866 | c->freelist = object; |
| 1883 | stat(c, FREE_FASTPATH); | 1867 | stat(s, FREE_FASTPATH); |
| 1884 | } else | 1868 | } else |
| 1885 | __slab_free(s, page, x, addr, c->offset); | 1869 | __slab_free(s, page, x, addr); |
| 1886 | 1870 | ||
| 1887 | local_irq_restore(flags); | 1871 | local_irq_restore(flags); |
| 1888 | } | 1872 | } |
| @@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
| 2069 | return ALIGN(align, sizeof(void *)); | 2053 | return ALIGN(align, sizeof(void *)); |
| 2070 | } | 2054 | } |
| 2071 | 2055 | ||
| 2072 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
| 2073 | struct kmem_cache_cpu *c) | ||
| 2074 | { | ||
| 2075 | c->page = NULL; | ||
| 2076 | c->freelist = NULL; | ||
| 2077 | c->node = 0; | ||
| 2078 | c->offset = s->offset / sizeof(void *); | ||
| 2079 | c->objsize = s->objsize; | ||
| 2080 | #ifdef CONFIG_SLUB_STATS | ||
| 2081 | memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); | ||
| 2082 | #endif | ||
| 2083 | } | ||
| 2084 | |||
| 2085 | static void | 2056 | static void |
| 2086 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | 2057 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) |
| 2087 | { | 2058 | { |
| @@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | |||
| 2095 | #endif | 2066 | #endif |
| 2096 | } | 2067 | } |
| 2097 | 2068 | ||
| 2098 | #ifdef CONFIG_SMP | 2069 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); |
| 2099 | /* | ||
| 2100 | * Per cpu array for per cpu structures. | ||
| 2101 | * | ||
| 2102 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
| 2103 | * close together meaning that it becomes possible that multiple per cpu | ||
| 2104 | * structures are contained in one cacheline. This may be particularly | ||
| 2105 | * beneficial for the kmalloc caches. | ||
| 2106 | * | ||
| 2107 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
| 2108 | * likely able to get per cpu structures for all caches from the array defined | ||
| 2109 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
| 2110 | * | ||
| 2111 | * If the per cpu array is exhausted then fall back to kmalloc | ||
| 2112 | * of individual cachelines. No sharing is possible then. | ||
| 2113 | */ | ||
| 2114 | #define NR_KMEM_CACHE_CPU 100 | ||
| 2115 | |||
| 2116 | static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], | ||
| 2117 | kmem_cache_cpu); | ||
| 2118 | |||
| 2119 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
| 2120 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); | ||
| 2121 | |||
| 2122 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
| 2123 | int cpu, gfp_t flags) | ||
| 2124 | { | ||
| 2125 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
| 2126 | |||
| 2127 | if (c) | ||
| 2128 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
| 2129 | (void *)c->freelist; | ||
| 2130 | else { | ||
| 2131 | /* Table overflow: So allocate ourselves */ | ||
| 2132 | c = kmalloc_node( | ||
| 2133 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
| 2134 | flags, cpu_to_node(cpu)); | ||
| 2135 | if (!c) | ||
| 2136 | return NULL; | ||
| 2137 | } | ||
| 2138 | |||
| 2139 | init_kmem_cache_cpu(s, c); | ||
| 2140 | return c; | ||
| 2141 | } | ||
| 2142 | |||
| 2143 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
| 2144 | { | ||
| 2145 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
| 2146 | c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
| 2147 | kfree(c); | ||
| 2148 | return; | ||
| 2149 | } | ||
| 2150 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
| 2151 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
| 2152 | } | ||
| 2153 | |||
| 2154 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
| 2155 | { | ||
| 2156 | int cpu; | ||
| 2157 | |||
| 2158 | for_each_online_cpu(cpu) { | ||
| 2159 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 2160 | |||
| 2161 | if (c) { | ||
| 2162 | s->cpu_slab[cpu] = NULL; | ||
| 2163 | free_kmem_cache_cpu(c, cpu); | ||
| 2164 | } | ||
| 2165 | } | ||
| 2166 | } | ||
| 2167 | |||
| 2168 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 2169 | { | ||
| 2170 | int cpu; | ||
| 2171 | |||
| 2172 | for_each_online_cpu(cpu) { | ||
| 2173 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 2174 | |||
| 2175 | if (c) | ||
| 2176 | continue; | ||
| 2177 | |||
| 2178 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
| 2179 | if (!c) { | ||
| 2180 | free_kmem_cache_cpus(s); | ||
| 2181 | return 0; | ||
| 2182 | } | ||
| 2183 | s->cpu_slab[cpu] = c; | ||
| 2184 | } | ||
| 2185 | return 1; | ||
| 2186 | } | ||
| 2187 | |||
| 2188 | /* | ||
| 2189 | * Initialize the per cpu array. | ||
| 2190 | */ | ||
| 2191 | static void init_alloc_cpu_cpu(int cpu) | ||
| 2192 | { | ||
| 2193 | int i; | ||
| 2194 | |||
| 2195 | if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) | ||
| 2196 | return; | ||
| 2197 | |||
| 2198 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
| 2199 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
| 2200 | |||
| 2201 | cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); | ||
| 2202 | } | ||
| 2203 | 2070 | ||
| 2204 | static void __init init_alloc_cpu(void) | 2071 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) |
| 2205 | { | 2072 | { |
| 2206 | int cpu; | 2073 | if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) |
| 2207 | 2074 | /* | |
| 2208 | for_each_online_cpu(cpu) | 2075 | * Boot time creation of the kmalloc array. Use static per cpu data |
| 2209 | init_alloc_cpu_cpu(cpu); | 2076 | * since the per cpu allocator is not available yet. |
| 2210 | } | 2077 | */ |
| 2078 | s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); | ||
| 2079 | else | ||
| 2080 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | ||
| 2211 | 2081 | ||
| 2212 | #else | 2082 | if (!s->cpu_slab) |
| 2213 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | 2083 | return 0; |
| 2214 | static inline void init_alloc_cpu(void) {} | ||
| 2215 | 2084 | ||
| 2216 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 2217 | { | ||
| 2218 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
| 2219 | return 1; | 2085 | return 1; |
| 2220 | } | 2086 | } |
| 2221 | #endif | ||
| 2222 | 2087 | ||
| 2223 | #ifdef CONFIG_NUMA | 2088 | #ifdef CONFIG_NUMA |
| 2224 | /* | 2089 | /* |
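
alloc_kmem_cache_cpus() now relies on alloc_percpu() and keeps only a static area (kmalloc_percpu) for the boot-time kmalloc caches, which are created before the percpu allocator is usable. A simplified sketch of that bootstrap pattern follows; the pointer-range test and every name in it are stand-ins, not kernel interfaces.

#include <stdlib.h>

#define KMALLOC_CACHES	13		/* illustrative count */

struct cpu_data { void *freelist; };
struct cache	{ const char *name; struct cpu_data *cpu; };

/*
 * Static storage for the early caches, usable before the dynamic
 * allocator exists -- the role played by kmalloc_percpu[] above.
 */
static struct cache	boot_caches[KMALLOC_CACHES];
static struct cpu_data	boot_cpu_data[KMALLOC_CACHES];

static int alloc_cache_cpus(struct cache *s)
{
	if (s >= boot_caches && s < boot_caches + KMALLOC_CACHES)
		/* boot-time cache: use the matching static slot */
		s->cpu = &boot_cpu_data[s - boot_caches];
	else
		s->cpu = calloc(1, sizeof(*s->cpu));

	return s->cpu != NULL;
}

int main(void)
{
	struct cache *early = &boot_caches[3];
	struct cache late = { .name = "late" };

	alloc_cache_cpus(early);	/* served from static storage */
	alloc_cache_cpus(&late);	/* served from the dynamic allocator */
	free(late.cpu);
	return 0;
}
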
| @@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
| 2287 | int node; | 2152 | int node; |
| 2288 | int local_node; | 2153 | int local_node; |
| 2289 | 2154 | ||
| 2290 | if (slab_state >= UP) | 2155 | if (slab_state >= UP && (s < kmalloc_caches || |
| 2156 | s > kmalloc_caches + KMALLOC_CACHES)) | ||
| 2291 | local_node = page_to_nid(virt_to_page(s)); | 2157 | local_node = page_to_nid(virt_to_page(s)); |
| 2292 | else | 2158 | else |
| 2293 | local_node = 0; | 2159 | local_node = 0; |
| @@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
| 2502 | 2368 | ||
| 2503 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) | 2369 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
| 2504 | return 1; | 2370 | return 1; |
| 2371 | |||
| 2505 | free_kmem_cache_nodes(s); | 2372 | free_kmem_cache_nodes(s); |
| 2506 | error: | 2373 | error: |
| 2507 | if (flags & SLAB_PANIC) | 2374 | if (flags & SLAB_PANIC) |
| @@ -2609,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
| 2609 | int node; | 2476 | int node; |
| 2610 | 2477 | ||
| 2611 | flush_all(s); | 2478 | flush_all(s); |
| 2612 | 2479 | free_percpu(s->cpu_slab); | |
| 2613 | /* Attempt to free all objects */ | 2480 | /* Attempt to free all objects */ |
| 2614 | free_kmem_cache_cpus(s); | ||
| 2615 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2481 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2616 | struct kmem_cache_node *n = get_node(s, node); | 2482 | struct kmem_cache_node *n = get_node(s, node); |
| 2617 | 2483 | ||
| @@ -2651,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
| 2651 | * Kmalloc subsystem | 2517 | * Kmalloc subsystem |
| 2652 | *******************************************************************/ | 2518 | *******************************************************************/ |
| 2653 | 2519 | ||
| 2654 | struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; | 2520 | struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; |
| 2655 | EXPORT_SYMBOL(kmalloc_caches); | 2521 | EXPORT_SYMBOL(kmalloc_caches); |
| 2656 | 2522 | ||
| 2657 | static int __init setup_slub_min_order(char *str) | 2523 | static int __init setup_slub_min_order(char *str) |
| @@ -2741,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2741 | char *text; | 2607 | char *text; |
| 2742 | size_t realsize; | 2608 | size_t realsize; |
| 2743 | unsigned long slabflags; | 2609 | unsigned long slabflags; |
| 2610 | int i; | ||
| 2744 | 2611 | ||
| 2745 | s = kmalloc_caches_dma[index]; | 2612 | s = kmalloc_caches_dma[index]; |
| 2746 | if (s) | 2613 | if (s) |
| @@ -2760,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2760 | realsize = kmalloc_caches[index].objsize; | 2627 | realsize = kmalloc_caches[index].objsize; |
| 2761 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", | 2628 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", |
| 2762 | (unsigned int)realsize); | 2629 | (unsigned int)realsize); |
| 2763 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); | 2630 | |
| 2631 | s = NULL; | ||
| 2632 | for (i = 0; i < KMALLOC_CACHES; i++) | ||
| 2633 | if (!kmalloc_caches[i].size) | ||
| 2634 | break; | ||
| 2635 | |||
| 2636 | BUG_ON(i >= KMALLOC_CACHES); | ||
| 2637 | s = kmalloc_caches + i; | ||
| 2764 | 2638 | ||
| 2765 | /* | 2639 | /* |
| 2766 | * Must defer sysfs creation to a workqueue because we don't know | 2640 | * Must defer sysfs creation to a workqueue because we don't know |
| @@ -2772,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2772 | if (slab_state >= SYSFS) | 2646 | if (slab_state >= SYSFS) |
| 2773 | slabflags |= __SYSFS_ADD_DEFERRED; | 2647 | slabflags |= __SYSFS_ADD_DEFERRED; |
| 2774 | 2648 | ||
| 2775 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2649 | if (!text || !kmem_cache_open(s, flags, text, |
| 2776 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { | 2650 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { |
| 2777 | kfree(s); | 2651 | s->size = 0; |
| 2778 | kfree(text); | 2652 | kfree(text); |
| 2779 | goto unlock_out; | 2653 | goto unlock_out; |
| 2780 | } | 2654 | } |
| @@ -3176,8 +3050,6 @@ void __init kmem_cache_init(void) | |||
| 3176 | int i; | 3050 | int i; |
| 3177 | int caches = 0; | 3051 | int caches = 0; |
| 3178 | 3052 | ||
| 3179 | init_alloc_cpu(); | ||
| 3180 | |||
| 3181 | #ifdef CONFIG_NUMA | 3053 | #ifdef CONFIG_NUMA |
| 3182 | /* | 3054 | /* |
| 3183 | * Must first have the slab cache available for the allocations of the | 3055 | * Must first have the slab cache available for the allocations of the |
| @@ -3261,8 +3133,10 @@ void __init kmem_cache_init(void) | |||
| 3261 | 3133 | ||
| 3262 | #ifdef CONFIG_SMP | 3134 | #ifdef CONFIG_SMP |
| 3263 | register_cpu_notifier(&slab_notifier); | 3135 | register_cpu_notifier(&slab_notifier); |
| 3264 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | 3136 | #endif |
| 3265 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | 3137 | #ifdef CONFIG_NUMA |
| 3138 | kmem_size = offsetof(struct kmem_cache, node) + | ||
| 3139 | nr_node_ids * sizeof(struct kmem_cache_node *); | ||
| 3266 | #else | 3140 | #else |
| 3267 | kmem_size = sizeof(struct kmem_cache); | 3141 | kmem_size = sizeof(struct kmem_cache); |
| 3268 | #endif | 3142 | #endif |
| @@ -3351,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 3351 | down_write(&slub_lock); | 3225 | down_write(&slub_lock); |
| 3352 | s = find_mergeable(size, align, flags, name, ctor); | 3226 | s = find_mergeable(size, align, flags, name, ctor); |
| 3353 | if (s) { | 3227 | if (s) { |
| 3354 | int cpu; | ||
| 3355 | |||
| 3356 | s->refcount++; | 3228 | s->refcount++; |
| 3357 | /* | 3229 | /* |
| 3358 | * Adjust the object sizes so that we clear | 3230 | * Adjust the object sizes so that we clear |
| 3359 | * the complete object on kzalloc. | 3231 | * the complete object on kzalloc. |
| 3360 | */ | 3232 | */ |
| 3361 | s->objsize = max(s->objsize, (int)size); | 3233 | s->objsize = max(s->objsize, (int)size); |
| 3362 | |||
| 3363 | /* | ||
| 3364 | * And then we need to update the object size in the | ||
| 3365 | * per cpu structures | ||
| 3366 | */ | ||
| 3367 | for_each_online_cpu(cpu) | ||
| 3368 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
| 3369 | |||
| 3370 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3234 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
| 3371 | up_write(&slub_lock); | 3235 | up_write(&slub_lock); |
| 3372 | 3236 | ||
| @@ -3420,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
| 3420 | unsigned long flags; | 3284 | unsigned long flags; |
| 3421 | 3285 | ||
| 3422 | switch (action) { | 3286 | switch (action) { |
| 3423 | case CPU_UP_PREPARE: | ||
| 3424 | case CPU_UP_PREPARE_FROZEN: | ||
| 3425 | init_alloc_cpu_cpu(cpu); | ||
| 3426 | down_read(&slub_lock); | ||
| 3427 | list_for_each_entry(s, &slab_caches, list) | ||
| 3428 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
| 3429 | GFP_KERNEL); | ||
| 3430 | up_read(&slub_lock); | ||
| 3431 | break; | ||
| 3432 | |||
| 3433 | case CPU_UP_CANCELED: | 3287 | case CPU_UP_CANCELED: |
| 3434 | case CPU_UP_CANCELED_FROZEN: | 3288 | case CPU_UP_CANCELED_FROZEN: |
| 3435 | case CPU_DEAD: | 3289 | case CPU_DEAD: |
| 3436 | case CPU_DEAD_FROZEN: | 3290 | case CPU_DEAD_FROZEN: |
| 3437 | down_read(&slub_lock); | 3291 | down_read(&slub_lock); |
| 3438 | list_for_each_entry(s, &slab_caches, list) { | 3292 | list_for_each_entry(s, &slab_caches, list) { |
| 3439 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 3440 | |||
| 3441 | local_irq_save(flags); | 3293 | local_irq_save(flags); |
| 3442 | __flush_cpu_slab(s, cpu); | 3294 | __flush_cpu_slab(s, cpu); |
| 3443 | local_irq_restore(flags); | 3295 | local_irq_restore(flags); |
| 3444 | free_kmem_cache_cpu(c, cpu); | ||
| 3445 | s->cpu_slab[cpu] = NULL; | ||
| 3446 | } | 3296 | } |
| 3447 | up_read(&slub_lock); | 3297 | up_read(&slub_lock); |
| 3448 | break; | 3298 | break; |
| @@ -3928,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
| 3928 | int cpu; | 3778 | int cpu; |
| 3929 | 3779 | ||
| 3930 | for_each_possible_cpu(cpu) { | 3780 | for_each_possible_cpu(cpu) { |
| 3931 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 3781 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
| 3932 | 3782 | ||
| 3933 | if (!c || c->node < 0) | 3783 | if (!c || c->node < 0) |
| 3934 | continue; | 3784 | continue; |
| @@ -4171,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
| 4171 | } | 4021 | } |
| 4172 | SLAB_ATTR(trace); | 4022 | SLAB_ATTR(trace); |
| 4173 | 4023 | ||
| 4024 | #ifdef CONFIG_FAILSLAB | ||
| 4025 | static ssize_t failslab_show(struct kmem_cache *s, char *buf) | ||
| 4026 | { | ||
| 4027 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); | ||
| 4028 | } | ||
| 4029 | |||
| 4030 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | ||
| 4031 | size_t length) | ||
| 4032 | { | ||
| 4033 | s->flags &= ~SLAB_FAILSLAB; | ||
| 4034 | if (buf[0] == '1') | ||
| 4035 | s->flags |= SLAB_FAILSLAB; | ||
| 4036 | return length; | ||
| 4037 | } | ||
| 4038 | SLAB_ATTR(failslab); | ||
| 4039 | #endif | ||
| 4040 | |||
| 4174 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | 4041 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) |
| 4175 | { | 4042 | { |
| 4176 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | 4043 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); |
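
The new failslab attribute follows the usual SLUB sysfs idiom: show prints the flag as 0/1, store clears the bit and sets it again only when the buffer begins with '1'. A standalone sketch of that parse-and-toggle pattern follows; the flag value and all names are illustrative, not the kernel's.

#include <stdio.h>

#define FLAG_FAILSLAB	0x1UL		/* illustrative bit, not the kernel value */

struct cache { unsigned long flags; };

/* show: print the flag as 0 or 1 */
static int flag_show(const struct cache *s, char *buf, size_t len)
{
	return snprintf(buf, len, "%d\n", !!(s->flags & FLAG_FAILSLAB));
}

/* store: clear first, then set only on an explicit leading '1' */
static size_t flag_store(struct cache *s, const char *buf, size_t length)
{
	s->flags &= ~FLAG_FAILSLAB;
	if (buf[0] == '1')
		s->flags |= FLAG_FAILSLAB;
	return length;
}

int main(void)
{
	struct cache s = { 0 };
	char out[8];

	flag_store(&s, "1", 1);
	flag_show(&s, out, sizeof(out));
	fputs(out, stdout);		/* prints "1" */
	flag_store(&s, "0", 1);
	flag_show(&s, out, sizeof(out));
	fputs(out, stdout);		/* prints "0" */
	return 0;
}
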
| @@ -4353,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
| 4353 | return -ENOMEM; | 4220 | return -ENOMEM; |
| 4354 | 4221 | ||
| 4355 | for_each_online_cpu(cpu) { | 4222 | for_each_online_cpu(cpu) { |
| 4356 | unsigned x = get_cpu_slab(s, cpu)->stat[si]; | 4223 | unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; |
| 4357 | 4224 | ||
| 4358 | data[cpu] = x; | 4225 | data[cpu] = x; |
| 4359 | sum += x; | 4226 | sum += x; |
| @@ -4371,12 +4238,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
| 4371 | return len + sprintf(buf + len, "\n"); | 4238 | return len + sprintf(buf + len, "\n"); |
| 4372 | } | 4239 | } |
| 4373 | 4240 | ||
| 4241 | static void clear_stat(struct kmem_cache *s, enum stat_item si) | ||
| 4242 | { | ||
| 4243 | int cpu; | ||
| 4244 | |||
| 4245 | for_each_online_cpu(cpu) | ||
| 4246 | per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; | ||
| 4247 | } | ||
| 4248 | |||
| 4374 | #define STAT_ATTR(si, text) \ | 4249 | #define STAT_ATTR(si, text) \ |
| 4375 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ | 4250 | static ssize_t text##_show(struct kmem_cache *s, char *buf) \ |
| 4376 | { \ | 4251 | { \ |
| 4377 | return show_stat(s, buf, si); \ | 4252 | return show_stat(s, buf, si); \ |
| 4378 | } \ | 4253 | } \ |
| 4379 | SLAB_ATTR_RO(text); \ | 4254 | static ssize_t text##_store(struct kmem_cache *s, \ |
| 4255 | const char *buf, size_t length) \ | ||
| 4256 | { \ | ||
| 4257 | if (buf[0] != '0') \ | ||
| 4258 | return -EINVAL; \ | ||
| 4259 | clear_stat(s, si); \ | ||
| 4260 | return length; \ | ||
| 4261 | } \ | ||
| 4262 | SLAB_ATTR(text); \ | ||
| 4380 | 4263 | ||
| 4381 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); | 4264 | STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); |
| 4382 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); | 4265 | STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); |
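
STAT_ATTR() now expands to a show/store pair per statistic, and writing "0" resets the per-CPU counters through clear_stat(). The token-pasting trick reduces to the compact userspace sketch below; all names in it are hypothetical.

#include <stdio.h>

enum stat_item { ALLOC_FASTPATH, FREE_FASTPATH, NR_ITEMS };

static unsigned long stats[NR_ITEMS];

/*
 * Generate a <text>_show()/<text>_store() pair per statistic by token
 * pasting; store only accepts "0" and resets the counter.
 */
#define STAT_ATTR(si, text)					\
static int text##_show(char *buf, size_t len)			\
{								\
	return snprintf(buf, len, "%lu\n", stats[si]);		\
}								\
static int text##_store(const char *buf, size_t length)	\
{								\
	if (buf[0] != '0')					\
		return -1;					\
	stats[si] = 0;						\
	return (int)length;					\
}

STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath)
STAT_ATTR(FREE_FASTPATH, free_fastpath)

int main(void)
{
	char buf[32];

	stats[ALLOC_FASTPATH] = 42;
	alloc_fastpath_show(buf, sizeof(buf));
	fputs(buf, stdout);			/* "42" */
	alloc_fastpath_store("0", 1);		/* reset */
	alloc_fastpath_show(buf, sizeof(buf));
	fputs(buf, stdout);			/* "0" */
	(void)free_fastpath_show;		/* silence unused warnings */
	(void)free_fastpath_store;
	return 0;
}
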
| @@ -4451,6 +4334,10 @@ static struct attribute *slab_attrs[] = { | |||
| 4451 | &deactivate_remote_frees_attr.attr, | 4334 | &deactivate_remote_frees_attr.attr, |
| 4452 | &order_fallback_attr.attr, | 4335 | &order_fallback_attr.attr, |
| 4453 | #endif | 4336 | #endif |
| 4337 | #ifdef CONFIG_FAILSLAB | ||
| 4338 | &failslab_attr.attr, | ||
| 4339 | #endif | ||
| 4340 | |||
| 4454 | NULL | 4341 | NULL |
| 4455 | }; | 4342 | }; |
| 4456 | 4343 | ||
| @@ -4503,7 +4390,7 @@ static void kmem_cache_release(struct kobject *kobj) | |||
| 4503 | kfree(s); | 4390 | kfree(s); |
| 4504 | } | 4391 | } |
| 4505 | 4392 | ||
| 4506 | static struct sysfs_ops slab_sysfs_ops = { | 4393 | static const struct sysfs_ops slab_sysfs_ops = { |
| 4507 | .show = slab_attr_show, | 4394 | .show = slab_attr_show, |
| 4508 | .store = slab_attr_store, | 4395 | .store = slab_attr_store, |
| 4509 | }; | 4396 | }; |
| @@ -4522,7 +4409,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
| 4522 | return 0; | 4409 | return 0; |
| 4523 | } | 4410 | } |
| 4524 | 4411 | ||
| 4525 | static struct kset_uevent_ops slab_uevent_ops = { | 4412 | static const struct kset_uevent_ops slab_uevent_ops = { |
| 4526 | .filter = uevent_filter, | 4413 | .filter = uevent_filter, |
| 4527 | }; | 4414 | }; |
| 4528 | 4415 | ||
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a3..392b9bb5bc01 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
| @@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
| 40 | unsigned long align, | 40 | unsigned long align, |
| 41 | unsigned long goal) | 41 | unsigned long goal) |
| 42 | { | 42 | { |
| 43 | return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); | 43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); |
| 44 | } | 44 | } |
| 45 | 45 | ||
| 46 | static void *vmemmap_buf; | ||
| 47 | static void *vmemmap_buf_end; | ||
| 46 | 48 | ||
| 47 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | 49 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) |
| 48 | { | 50 | { |
| @@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
| 64 | __pa(MAX_DMA_ADDRESS)); | 66 | __pa(MAX_DMA_ADDRESS)); |
| 65 | } | 67 | } |
| 66 | 68 | ||
| 69 | /* need to make sure the size is the same across calls during the early stage */ | ||
| 70 | void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) | ||
| 71 | { | ||
| 72 | void *ptr; | ||
| 73 | |||
| 74 | if (!vmemmap_buf) | ||
| 75 | return vmemmap_alloc_block(size, node); | ||
| 76 | |||
| 77 | /* take the allocation from the buffer */ | ||
| 78 | ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); | ||
| 79 | if (ptr + size > vmemmap_buf_end) | ||
| 80 | return vmemmap_alloc_block(size, node); | ||
| 81 | |||
| 82 | vmemmap_buf = ptr + size; | ||
| 83 | |||
| 84 | return ptr; | ||
| 85 | } | ||
| 86 | |||
| 67 | void __meminit vmemmap_verify(pte_t *pte, int node, | 87 | void __meminit vmemmap_verify(pte_t *pte, int node, |
| 68 | unsigned long start, unsigned long end) | 88 | unsigned long start, unsigned long end) |
| 69 | { | 89 | { |
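
vmemmap_alloc_block_buf() above carves size-aligned blocks out of one large, PMD-aligned bootmem buffer and falls back to vmemmap_alloc_block() when the buffer is missing or exhausted. That aligned bump-allocation-with-fallback idea is sketched below in plain C, with malloc standing in for the fallback path; this is an analogy, not the kernel code.

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

static char *buf;	/* current position in the pre-allocated buffer */
static char *buf_end;	/* one past its last byte */

/*
 * Hand out size-aligned chunks from the buffer; fall back to the
 * general allocator (malloc here) when the buffer is missing or the
 * request would overflow it.  'size' must be a power of two.
 */
static void *alloc_block_buf(size_t size)
{
	char *ptr;

	if (!buf)
		return malloc(size);

	ptr = (char *)ALIGN((uintptr_t)buf, size);
	if (ptr + size > buf_end)
		return malloc(size);

	buf = ptr + size;
	return ptr;
}

int main(void)
{
	size_t total = 16 * 4096;
	char *start = malloc(total);

	buf = start;
	buf_end = start + total;

	void *a = alloc_block_buf(4096);	/* from the buffer */
	void *b = alloc_block_buf(4096);	/* from the buffer */
	printf("a=%p b=%p (b - a = %td)\n", a, b, (char *)b - (char *)a);

	buf = buf_end = NULL;			/* buffer released */
	void *c = alloc_block_buf(4096);	/* falls back to malloc */
	free(c);
	free(start);
	return 0;
}
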
| @@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | |||
| 80 | pte_t *pte = pte_offset_kernel(pmd, addr); | 100 | pte_t *pte = pte_offset_kernel(pmd, addr); |
| 81 | if (pte_none(*pte)) { | 101 | if (pte_none(*pte)) { |
| 82 | pte_t entry; | 102 | pte_t entry; |
| 83 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 103 | void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); |
| 84 | if (!p) | 104 | if (!p) |
| 85 | return NULL; | 105 | return NULL; |
| 86 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | 106 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
| @@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | |||
| 163 | 183 | ||
| 164 | return map; | 184 | return map; |
| 165 | } | 185 | } |
| 186 | |||
| 187 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
| 188 | unsigned long pnum_begin, | ||
| 189 | unsigned long pnum_end, | ||
| 190 | unsigned long map_count, int nodeid) | ||
| 191 | { | ||
| 192 | unsigned long pnum; | ||
| 193 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
| 194 | void *vmemmap_buf_start; | ||
| 195 | |||
| 196 | size = ALIGN(size, PMD_SIZE); | ||
| 197 | vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, | ||
| 198 | PMD_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
| 199 | |||
| 200 | if (vmemmap_buf_start) { | ||
| 201 | vmemmap_buf = vmemmap_buf_start; | ||
| 202 | vmemmap_buf_end = vmemmap_buf_start + size * map_count; | ||
| 203 | } | ||
| 204 | |||
| 205 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 206 | struct mem_section *ms; | ||
| 207 | |||
| 208 | if (!present_section_nr(pnum)) | ||
| 209 | continue; | ||
| 210 | |||
| 211 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
| 212 | if (map_map[pnum]) | ||
| 213 | continue; | ||
| 214 | ms = __nr_to_section(pnum); | ||
| 215 | printk(KERN_ERR "%s: sparsemem memory map backing failed " | ||
| 216 | "some memory will not be available.\n", __func__); | ||
| 217 | ms->section_mem_map = 0; | ||
| 218 | } | ||
| 219 | |||
| 220 | if (vmemmap_buf_start) { | ||
| 221 | /* need to free the leftover buffer */ | ||
| 222 | #ifdef CONFIG_NO_BOOTMEM | ||
| 223 | free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); | ||
| 224 | if (vmemmap_buf_start < vmemmap_buf) { | ||
| 225 | char name[15]; | ||
| 226 | |||
| 227 | snprintf(name, sizeof(name), "MEMMAP %d", nodeid); | ||
| 228 | reserve_early_without_check(__pa(vmemmap_buf_start), | ||
| 229 | __pa(vmemmap_buf), name); | ||
| 230 | } | ||
| 231 | #else | ||
| 232 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | ||
| 233 | #endif | ||
| 234 | vmemmap_buf = NULL; | ||
| 235 | vmemmap_buf_end = NULL; | ||
| 236 | } | ||
| 237 | } | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab69e99..22896d589133 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
| 271 | 271 | ||
| 272 | #ifdef CONFIG_MEMORY_HOTREMOVE | 272 | #ifdef CONFIG_MEMORY_HOTREMOVE |
| 273 | static unsigned long * __init | 273 | static unsigned long * __init |
| 274 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 274 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
| 275 | unsigned long count) | ||
| 275 | { | 276 | { |
| 276 | unsigned long section_nr; | 277 | unsigned long section_nr; |
| 277 | 278 | ||
| @@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | |||
| 286 | * this problem. | 287 | * this problem. |
| 287 | */ | 288 | */ |
| 288 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 289 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); |
| 289 | return alloc_bootmem_section(usemap_size(), section_nr); | 290 | return alloc_bootmem_section(usemap_size() * count, section_nr); |
| 290 | } | 291 | } |
| 291 | 292 | ||
| 292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 293 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
| @@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
| 329 | } | 330 | } |
| 330 | #else | 331 | #else |
| 331 | static unsigned long * __init | 332 | static unsigned long * __init |
| 332 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 333 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
| 334 | unsigned long count) | ||
| 333 | { | 335 | { |
| 334 | return NULL; | 336 | return NULL; |
| 335 | } | 337 | } |
| @@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
| 339 | } | 341 | } |
| 340 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 342 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
| 341 | 343 | ||
| 342 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 344 | static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, |
| 345 | unsigned long pnum_begin, | ||
| 346 | unsigned long pnum_end, | ||
| 347 | unsigned long usemap_count, int nodeid) | ||
| 343 | { | 348 | { |
| 344 | unsigned long *usemap; | 349 | void *usemap; |
| 345 | struct mem_section *ms = __nr_to_section(pnum); | 350 | unsigned long pnum; |
| 346 | int nid = sparse_early_nid(ms); | 351 | int size = usemap_size(); |
| 347 | |||
| 348 | usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); | ||
| 349 | if (usemap) | ||
| 350 | return usemap; | ||
| 351 | 352 | ||
| 352 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 353 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
| 354 | usemap_count); | ||
| 353 | if (usemap) { | 355 | if (usemap) { |
| 354 | check_usemap_section_nr(nid, usemap); | 356 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
| 355 | return usemap; | 357 | if (!present_section_nr(pnum)) |
| 358 | continue; | ||
| 359 | usemap_map[pnum] = usemap; | ||
| 360 | usemap += size; | ||
| 361 | } | ||
| 362 | return; | ||
| 356 | } | 363 | } |
| 357 | 364 | ||
| 358 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | 365 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
| 359 | nid = 0; | 366 | if (usemap) { |
| 367 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 368 | if (!present_section_nr(pnum)) | ||
| 369 | continue; | ||
| 370 | usemap_map[pnum] = usemap; | ||
| 371 | usemap += size; | ||
| 372 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
| 373 | } | ||
| 374 | return; | ||
| 375 | } | ||
| 360 | 376 | ||
| 361 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | 377 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
| 362 | return NULL; | ||
| 363 | } | 378 | } |
| 364 | 379 | ||
| 365 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 380 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
| @@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
| 375 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); | 390 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); |
| 376 | return map; | 391 | return map; |
| 377 | } | 392 | } |
| 393 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
| 394 | unsigned long pnum_begin, | ||
| 395 | unsigned long pnum_end, | ||
| 396 | unsigned long map_count, int nodeid) | ||
| 397 | { | ||
| 398 | void *map; | ||
| 399 | unsigned long pnum; | ||
| 400 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
| 401 | |||
| 402 | map = alloc_remap(nodeid, size * map_count); | ||
| 403 | if (map) { | ||
| 404 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 405 | if (!present_section_nr(pnum)) | ||
| 406 | continue; | ||
| 407 | map_map[pnum] = map; | ||
| 408 | map += size; | ||
| 409 | } | ||
| 410 | return; | ||
| 411 | } | ||
| 412 | |||
| 413 | size = PAGE_ALIGN(size); | ||
| 414 | map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); | ||
| 415 | if (map) { | ||
| 416 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 417 | if (!present_section_nr(pnum)) | ||
| 418 | continue; | ||
| 419 | map_map[pnum] = map; | ||
| 420 | map += size; | ||
| 421 | } | ||
| 422 | return; | ||
| 423 | } | ||
| 424 | |||
| 425 | /* fallback */ | ||
| 426 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
| 427 | struct mem_section *ms; | ||
| 428 | |||
| 429 | if (!present_section_nr(pnum)) | ||
| 430 | continue; | ||
| 431 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
| 432 | if (map_map[pnum]) | ||
| 433 | continue; | ||
| 434 | ms = __nr_to_section(pnum); | ||
| 435 | printk(KERN_ERR "%s: sparsemem memory map backing failed " | ||
| 436 | "some memory will not be available.\n", __func__); | ||
| 437 | ms->section_mem_map = 0; | ||
| 438 | } | ||
| 439 | } | ||
| 378 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 440 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
| 379 | 441 | ||
| 442 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 443 | static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, | ||
| 444 | unsigned long pnum_begin, | ||
| 445 | unsigned long pnum_end, | ||
| 446 | unsigned long map_count, int nodeid) | ||
| 447 | { | ||
| 448 | sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, | ||
| 449 | map_count, nodeid); | ||
| 450 | } | ||
| 451 | #else | ||
| 380 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 452 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
| 381 | { | 453 | { |
| 382 | struct page *map; | 454 | struct page *map; |
| @@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
| 392 | ms->section_mem_map = 0; | 464 | ms->section_mem_map = 0; |
| 393 | return NULL; | 465 | return NULL; |
| 394 | } | 466 | } |
| 467 | #endif | ||
| 395 | 468 | ||
| 396 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | 469 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) |
| 397 | { | 470 | { |
| 398 | } | 471 | } |
| 472 | |||
| 399 | /* | 473 | /* |
| 400 | * Allocate the accumulated non-linear sections, allocate a mem_map | 474 | * Allocate the accumulated non-linear sections, allocate a mem_map |
| 401 | * for each and record the physical to section mapping. | 475 | * for each and record the physical to section mapping. |
| @@ -407,6 +481,14 @@ void __init sparse_init(void) | |||
| 407 | unsigned long *usemap; | 481 | unsigned long *usemap; |
| 408 | unsigned long **usemap_map; | 482 | unsigned long **usemap_map; |
| 409 | int size; | 483 | int size; |
| 484 | int nodeid_begin = 0; | ||
| 485 | unsigned long pnum_begin = 0; | ||
| 486 | unsigned long usemap_count; | ||
| 487 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 488 | unsigned long map_count; | ||
| 489 | int size2; | ||
| 490 | struct page **map_map; | ||
| 491 | #endif | ||
| 410 | 492 | ||
| 411 | /* | 493 | /* |
| 412 | * map is using big page (aka 2M in x86 64 bit) | 494 | * map is using big page (aka 2M in x86 64 bit) |
| @@ -425,10 +507,81 @@ void __init sparse_init(void) | |||
| 425 | panic("can not allocate usemap_map\n"); | 507 | panic("can not allocate usemap_map\n"); |
| 426 | 508 | ||
| 427 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 509 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
| 510 | struct mem_section *ms; | ||
| 511 | |||
| 428 | if (!present_section_nr(pnum)) | 512 | if (!present_section_nr(pnum)) |
| 429 | continue; | 513 | continue; |
| 430 | usemap_map[pnum] = sparse_early_usemap_alloc(pnum); | 514 | ms = __nr_to_section(pnum); |
| 515 | nodeid_begin = sparse_early_nid(ms); | ||
| 516 | pnum_begin = pnum; | ||
| 517 | break; | ||
| 431 | } | 518 | } |
| 519 | usemap_count = 1; | ||
| 520 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
| 521 | struct mem_section *ms; | ||
| 522 | int nodeid; | ||
| 523 | |||
| 524 | if (!present_section_nr(pnum)) | ||
| 525 | continue; | ||
| 526 | ms = __nr_to_section(pnum); | ||
| 527 | nodeid = sparse_early_nid(ms); | ||
| 528 | if (nodeid == nodeid_begin) { | ||
| 529 | usemap_count++; | ||
| 530 | continue; | ||
| 531 | } | ||
| 532 | /* ok, we need to take care of from pnum_begin to pnum - 1 */ | ||
| 533 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, | ||
| 534 | usemap_count, nodeid_begin); | ||
| 535 | /* new start, update count etc. */ | ||
| 536 | nodeid_begin = nodeid; | ||
| 537 | pnum_begin = pnum; | ||
| 538 | usemap_count = 1; | ||
| 539 | } | ||
| 540 | /* ok, last chunk */ | ||
| 541 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, | ||
| 542 | usemap_count, nodeid_begin); | ||
| 543 | |||
| 544 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 545 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | ||
| 546 | map_map = alloc_bootmem(size2); | ||
| 547 | if (!map_map) | ||
| 548 | panic("can not allocate map_map\n"); | ||
| 549 | |||
| 550 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
| 551 | struct mem_section *ms; | ||
| 552 | |||
| 553 | if (!present_section_nr(pnum)) | ||
| 554 | continue; | ||
| 555 | ms = __nr_to_section(pnum); | ||
| 556 | nodeid_begin = sparse_early_nid(ms); | ||
| 557 | pnum_begin = pnum; | ||
| 558 | break; | ||
| 559 | } | ||
| 560 | map_count = 1; | ||
| 561 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
| 562 | struct mem_section *ms; | ||
| 563 | int nodeid; | ||
| 564 | |||
| 565 | if (!present_section_nr(pnum)) | ||
| 566 | continue; | ||
| 567 | ms = __nr_to_section(pnum); | ||
| 568 | nodeid = sparse_early_nid(ms); | ||
| 569 | if (nodeid == nodeid_begin) { | ||
| 570 | map_count++; | ||
| 571 | continue; | ||
| 572 | } | ||
| 573 | /* ok, we need to take care of from pnum_begin to pnum - 1 */ | ||
| 574 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, | ||
| 575 | map_count, nodeid_begin); | ||
| 577 | /* new start, update count etc. */ | ||
| 577 | nodeid_begin = nodeid; | ||
| 578 | pnum_begin = pnum; | ||
| 579 | map_count = 1; | ||
| 580 | } | ||
| 581 | /* ok, last chunk */ | ||
| 582 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, | ||
| 583 | map_count, nodeid_begin); | ||
| 584 | #endif | ||
| 432 | 585 | ||
| 433 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 586 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
| 434 | if (!present_section_nr(pnum)) | 587 | if (!present_section_nr(pnum)) |
| @@ -438,7 +591,11 @@ void __init sparse_init(void) | |||
| 438 | if (!usemap) | 591 | if (!usemap) |
| 439 | continue; | 592 | continue; |
| 440 | 593 | ||
| 594 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 595 | map = map_map[pnum]; | ||
| 596 | #else | ||
| 441 | map = sparse_early_mem_map_alloc(pnum); | 597 | map = sparse_early_mem_map_alloc(pnum); |
| 598 | #endif | ||
| 442 | if (!map) | 599 | if (!map) |
| 443 | continue; | 600 | continue; |
| 444 | 601 | ||
| @@ -448,6 +605,9 @@ void __init sparse_init(void) | |||
| 448 | 605 | ||
| 449 | vmemmap_populate_print_last(); | 606 | vmemmap_populate_print_last(); |
| 450 | 607 | ||
| 608 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
| 609 | free_bootmem(__pa(map_map), size2); | ||
| 610 | #endif | ||
| 451 | free_bootmem(__pa(usemap_map), size); | 611 | free_bootmem(__pa(usemap_map), size); |
| 452 | } | 612 | } |
| 453 | 613 | ||
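The two grouping passes added to sparse_init() above share one shape: find the first present section, count how many consecutive present sections belong to the same node, and hand each per-node run to the batched allocator in a single call. A minimal userspace sketch of that pattern (the section/node layout and helper names are invented for illustration; this is a model, not the kernel code):

```c
#include <stdio.h>

#define NR_SECTIONS 16

/* toy stand-ins: -1 means "section not present", otherwise its node id */
static int section_nid[NR_SECTIONS] = {
	 0,  0, -1,  0,  1,  1, -1, -1,
	 1,  2,  2,  2, -1,  2,  2, -1,
};

static int present(int pnum) { return section_nid[pnum] >= 0; }
static int nid_of(int pnum)  { return section_nid[pnum]; }

/* stands in for sparse_early_usemaps_alloc_node() / ..._mem_maps_alloc_node() */
static void alloc_node_chunk(int begin, int end, int count, int nid)
{
	printf("node %d: one allocation covering %d sections [%d..%d)\n",
	       nid, count, begin, end);
}

int main(void)
{
	int pnum, nid_begin = -1, pnum_begin = 0, count;

	/* find the first present section: it starts the first run */
	for (pnum = 0; pnum < NR_SECTIONS; pnum++) {
		if (!present(pnum))
			continue;
		nid_begin = nid_of(pnum);
		pnum_begin = pnum;
		break;
	}
	if (nid_begin < 0)
		return 0;	/* no present sections at all */

	count = 1;
	for (pnum = pnum_begin + 1; pnum < NR_SECTIONS; pnum++) {
		if (!present(pnum))
			continue;
		if (nid_of(pnum) == nid_begin) {
			count++;
			continue;
		}
		/* node changed: flush the run [pnum_begin, pnum) */
		alloc_node_chunk(pnum_begin, pnum, count, nid_begin);
		nid_begin = nid_of(pnum);
		pnum_begin = pnum;
		count = 1;
	}
	/* last run */
	alloc_node_chunk(pnum_begin, NR_SECTIONS, count, nid_begin);
	return 0;
}
```

The same loop runs twice in the patch: once to fill usemap_map[] and once, under CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER, to fill map_map[].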
| @@ -55,7 +55,7 @@ static void __page_cache_release(struct page *page) | |||
| 55 | del_page_from_lru(zone, page); | 55 | del_page_from_lru(zone, page); |
| 56 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 56 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 57 | } | 57 | } |
| 58 | free_hot_page(page); | 58 | free_hot_cold_page(page, 0); |
| 59 | } | 59 | } |
| 60 | 60 | ||
| 61 | static void put_compound_page(struct page *page) | 61 | static void put_compound_page(struct page *page) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 9c590eef7912..84374d8cf814 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/seq_file.h> | 22 | #include <linux/seq_file.h> |
| 23 | #include <linux/init.h> | 23 | #include <linux/init.h> |
| 24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 25 | #include <linux/ksm.h> | ||
| 25 | #include <linux/rmap.h> | 26 | #include <linux/rmap.h> |
| 26 | #include <linux/security.h> | 27 | #include <linux/security.h> |
| 27 | #include <linux/backing-dev.h> | 28 | #include <linux/backing-dev.h> |
| @@ -35,11 +36,15 @@ | |||
| 35 | #include <linux/swapops.h> | 36 | #include <linux/swapops.h> |
| 36 | #include <linux/page_cgroup.h> | 37 | #include <linux/page_cgroup.h> |
| 37 | 38 | ||
| 39 | static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | ||
| 40 | unsigned char); | ||
| 41 | static void free_swap_count_continuations(struct swap_info_struct *); | ||
| 42 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | ||
| 43 | |||
| 38 | static DEFINE_SPINLOCK(swap_lock); | 44 | static DEFINE_SPINLOCK(swap_lock); |
| 39 | static unsigned int nr_swapfiles; | 45 | static unsigned int nr_swapfiles; |
| 40 | long nr_swap_pages; | 46 | long nr_swap_pages; |
| 41 | long total_swap_pages; | 47 | long total_swap_pages; |
| 42 | static int swap_overflow; | ||
| 43 | static int least_priority; | 48 | static int least_priority; |
| 44 | 49 | ||
| 45 | static const char Bad_file[] = "Bad swap file entry "; | 50 | static const char Bad_file[] = "Bad swap file entry "; |
| @@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry "; | |||
| 49 | 54 | ||
| 50 | static struct swap_list_t swap_list = {-1, -1}; | 55 | static struct swap_list_t swap_list = {-1, -1}; |
| 51 | 56 | ||
| 52 | static struct swap_info_struct swap_info[MAX_SWAPFILES]; | 57 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
| 53 | 58 | ||
| 54 | static DEFINE_MUTEX(swapon_mutex); | 59 | static DEFINE_MUTEX(swapon_mutex); |
| 55 | 60 | ||
| 56 | /* For reference count accounting in swap_map */ | 61 | static inline unsigned char swap_count(unsigned char ent) |
| 57 | /* enum for swap_map[] handling. internal use only */ | ||
| 58 | enum { | ||
| 59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
| 60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
| 61 | }; | ||
| 62 | |||
| 63 | static inline int swap_count(unsigned short ent) | ||
| 64 | { | ||
| 65 | return ent & SWAP_COUNT_MASK; | ||
| 66 | } | ||
| 67 | |||
| 68 | static inline bool swap_has_cache(unsigned short ent) | ||
| 69 | { | ||
| 70 | return !!(ent & SWAP_HAS_CACHE); | ||
| 71 | } | ||
| 72 | |||
| 73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
| 74 | { | 62 | { |
| 75 | unsigned short ret = count; | 63 | return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ |
| 76 | |||
| 77 | if (has_cache) | ||
| 78 | return SWAP_HAS_CACHE | ret; | ||
| 79 | return ret; | ||
| 80 | } | 64 | } |
| 81 | 65 | ||
| 82 | /* returnes 1 if swap entry is freed */ | 66 | /* returns 1 if swap entry is freed */ |
| 83 | static int | 67 | static int |
| 84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | 68 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) |
| 85 | { | 69 | { |
| 86 | int type = si - swap_info; | 70 | swp_entry_t entry = swp_entry(si->type, offset); |
| 87 | swp_entry_t entry = swp_entry(type, offset); | ||
| 88 | struct page *page; | 71 | struct page *page; |
| 89 | int ret = 0; | 72 | int ret = 0; |
| 90 | 73 | ||
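The new swap_count() works because swap_map[] entries are now single bytes that pack a small reference count together with flag bits. A standalone model of the encoding, using the constant values this series appears to define in include/linux/swap.h (copied here on that assumption; check the header rather than trusting the sketch):

```c
#include <assert.h>
#include <stdio.h>

/* assumed values, mirroring include/linux/swap.h after this series */
#define SWAP_HAS_CACHE	0x40	/* page is in swap cache */
#define COUNT_CONTINUED	0x80	/* rest of the count lives in a continuation page */
#define SWAP_MAP_MAX	0x3e	/* largest count held in the map byte itself */
#define SWAP_MAP_BAD	0x3f	/* bad page slot */
#define SWAP_MAP_SHMEM	0xbf	/* slot owned by shmem/tmpfs */

/* mirror of the new swap_count(): strip the cache flag, keep COUNT_CONTINUED */
static unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;
}

int main(void)
{
	unsigned char ent;

	ent = 1;				/* one pte reference, no swap cache */
	assert(swap_count(ent) == 1);

	ent = 3 | SWAP_HAS_CACHE;		/* three ptes plus the swap cache */
	assert(swap_count(ent) == 3);
	assert(ent & SWAP_HAS_CACHE);

	ent = SWAP_MAP_MAX | COUNT_CONTINUED;	/* overflowed: remainder is elsewhere */
	assert(swap_count(ent) & COUNT_CONTINUED);

	printf("encoding checks passed\n");
	return 0;
}
```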
| @@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
| 120 | down_read(&swap_unplug_sem); | 103 | down_read(&swap_unplug_sem); |
| 121 | entry.val = page_private(page); | 104 | entry.val = page_private(page); |
| 122 | if (PageSwapCache(page)) { | 105 | if (PageSwapCache(page)) { |
| 123 | struct block_device *bdev = swap_info[swp_type(entry)].bdev; | 106 | struct block_device *bdev = swap_info[swp_type(entry)]->bdev; |
| 124 | struct backing_dev_info *bdi; | 107 | struct backing_dev_info *bdi; |
| 125 | 108 | ||
| 126 | /* | 109 | /* |
| @@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) | |||
| 146 | static int discard_swap(struct swap_info_struct *si) | 129 | static int discard_swap(struct swap_info_struct *si) |
| 147 | { | 130 | { |
| 148 | struct swap_extent *se; | 131 | struct swap_extent *se; |
| 132 | sector_t start_block; | ||
| 133 | sector_t nr_blocks; | ||
| 149 | int err = 0; | 134 | int err = 0; |
| 150 | 135 | ||
| 151 | list_for_each_entry(se, &si->extent_list, list) { | 136 | /* Do not discard the swap header page! */ |
| 152 | sector_t start_block = se->start_block << (PAGE_SHIFT - 9); | 137 | se = &si->first_swap_extent; |
| 153 | sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 138 | start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); |
| 139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | ||
| 140 | if (nr_blocks) { | ||
| 141 | err = blkdev_issue_discard(si->bdev, start_block, | ||
| 142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | ||
| 143 | if (err) | ||
| 144 | return err; | ||
| 145 | cond_resched(); | ||
| 146 | } | ||
| 154 | 147 | ||
| 155 | if (se->start_page == 0) { | 148 | list_for_each_entry(se, &si->first_swap_extent.list, list) { |
| 156 | /* Do not discard the swap header page! */ | 149 | start_block = se->start_block << (PAGE_SHIFT - 9); |
| 157 | start_block += 1 << (PAGE_SHIFT - 9); | 150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
| 158 | nr_blocks -= 1 << (PAGE_SHIFT - 9); | ||
| 159 | if (!nr_blocks) | ||
| 160 | continue; | ||
| 161 | } | ||
| 162 | 151 | ||
| 163 | err = blkdev_issue_discard(si->bdev, start_block, | 152 | err = blkdev_issue_discard(si->bdev, start_block, |
| 164 | nr_blocks, GFP_KERNEL, | 153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); |
| 165 | DISCARD_FL_BARRIER); | ||
| 166 | if (err) | 154 | if (err) |
| 167 | break; | 155 | break; |
| 168 | 156 | ||
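The rewritten discard_swap() peels the swap header page off the first extent before issuing the first discard, then walks the remaining extents unmodified. The sector arithmetic for that first extent, assuming 4K pages (so PAGE_SHIFT - 9 == 3) and an invented extent:

```c
#include <stdio.h>

#define PAGE_SHIFT	12	/* assume 4K pages for the sketch */
#define SECTOR_SHIFT	9

int main(void)
{
	/* first extent of an imagined swap area: 1024 pages starting at block 100 */
	unsigned long long start_block = 100;
	unsigned long long nr_pages = 1024;

	/* skip page 0, which holds the swap header */
	unsigned long long first_sector = (start_block + 1) << (PAGE_SHIFT - SECTOR_SHIFT);
	unsigned long long nr_sectors   = (nr_pages - 1)    << (PAGE_SHIFT - SECTOR_SHIFT);

	printf("discard sectors [%llu, %llu)\n",
	       first_sector, first_sector + nr_sectors);
	return 0;
}
```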
| @@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
| 201 | start_block <<= PAGE_SHIFT - 9; | 189 | start_block <<= PAGE_SHIFT - 9; |
| 202 | nr_blocks <<= PAGE_SHIFT - 9; | 190 | nr_blocks <<= PAGE_SHIFT - 9; |
| 203 | if (blkdev_issue_discard(si->bdev, start_block, | 191 | if (blkdev_issue_discard(si->bdev, start_block, |
| 204 | nr_blocks, GFP_NOIO, | 192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) |
| 205 | DISCARD_FL_BARRIER)) | ||
| 206 | break; | 193 | break; |
| 207 | } | 194 | } |
| 208 | 195 | ||
| 209 | lh = se->list.next; | 196 | lh = se->list.next; |
| 210 | if (lh == &si->extent_list) | ||
| 211 | lh = lh->next; | ||
| 212 | se = list_entry(lh, struct swap_extent, list); | 197 | se = list_entry(lh, struct swap_extent, list); |
| 213 | } | 198 | } |
| 214 | } | 199 | } |
| @@ -223,7 +208,7 @@ static int wait_for_discard(void *word) | |||
| 223 | #define LATENCY_LIMIT 256 | 208 | #define LATENCY_LIMIT 256 |
| 224 | 209 | ||
| 225 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, | 210 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
| 226 | int cache) | 211 | unsigned char usage) |
| 227 | { | 212 | { |
| 228 | unsigned long offset; | 213 | unsigned long offset; |
| 229 | unsigned long scan_base; | 214 | unsigned long scan_base; |
| @@ -354,10 +339,7 @@ checks: | |||
| 354 | si->lowest_bit = si->max; | 339 | si->lowest_bit = si->max; |
| 355 | si->highest_bit = 0; | 340 | si->highest_bit = 0; |
| 356 | } | 341 | } |
| 357 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ | 342 | si->swap_map[offset] = usage; |
| 358 | si->swap_map[offset] = encode_swapmap(0, true); | ||
| 359 | else /* at suspend */ | ||
| 360 | si->swap_map[offset] = encode_swapmap(1, false); | ||
| 361 | si->cluster_next = offset + 1; | 343 | si->cluster_next = offset + 1; |
| 362 | si->flags -= SWP_SCANNING; | 344 | si->flags -= SWP_SCANNING; |
| 363 | 345 | ||
| @@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void) | |||
| 467 | nr_swap_pages--; | 449 | nr_swap_pages--; |
| 468 | 450 | ||
| 469 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { | 451 | for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { |
| 470 | si = swap_info + type; | 452 | si = swap_info[type]; |
| 471 | next = si->next; | 453 | next = si->next; |
| 472 | if (next < 0 || | 454 | if (next < 0 || |
| 473 | (!wrapped && si->prio != swap_info[next].prio)) { | 455 | (!wrapped && si->prio != swap_info[next]->prio)) { |
| 474 | next = swap_list.head; | 456 | next = swap_list.head; |
| 475 | wrapped++; | 457 | wrapped++; |
| 476 | } | 458 | } |
| @@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void) | |||
| 482 | 464 | ||
| 483 | swap_list.next = next; | 465 | swap_list.next = next; |
| 484 | /* This is called for allocating swap entry for cache */ | 466 | /* This is called for allocating swap entry for cache */ |
| 485 | offset = scan_swap_map(si, SWAP_CACHE); | 467 | offset = scan_swap_map(si, SWAP_HAS_CACHE); |
| 486 | if (offset) { | 468 | if (offset) { |
| 487 | spin_unlock(&swap_lock); | 469 | spin_unlock(&swap_lock); |
| 488 | return swp_entry(type, offset); | 470 | return swp_entry(type, offset); |
| @@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type) | |||
| 503 | pgoff_t offset; | 485 | pgoff_t offset; |
| 504 | 486 | ||
| 505 | spin_lock(&swap_lock); | 487 | spin_lock(&swap_lock); |
| 506 | si = swap_info + type; | 488 | si = swap_info[type]; |
| 507 | if (si->flags & SWP_WRITEOK) { | 489 | if (si && (si->flags & SWP_WRITEOK)) { |
| 508 | nr_swap_pages--; | 490 | nr_swap_pages--; |
| 509 | /* This is called for allocating swap entry, not cache */ | 491 | /* This is called for allocating swap entry, not cache */ |
| 510 | offset = scan_swap_map(si, SWAP_MAP); | 492 | offset = scan_swap_map(si, 1); |
| 511 | if (offset) { | 493 | if (offset) { |
| 512 | spin_unlock(&swap_lock); | 494 | spin_unlock(&swap_lock); |
| 513 | return swp_entry(type, offset); | 495 | return swp_entry(type, offset); |
| @@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type) | |||
| 518 | return (swp_entry_t) {0}; | 500 | return (swp_entry_t) {0}; |
| 519 | } | 501 | } |
| 520 | 502 | ||
| 521 | static struct swap_info_struct * swap_info_get(swp_entry_t entry) | 503 | static struct swap_info_struct *swap_info_get(swp_entry_t entry) |
| 522 | { | 504 | { |
| 523 | struct swap_info_struct * p; | 505 | struct swap_info_struct *p; |
| 524 | unsigned long offset, type; | 506 | unsigned long offset, type; |
| 525 | 507 | ||
| 526 | if (!entry.val) | 508 | if (!entry.val) |
| @@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) | |||
| 528 | type = swp_type(entry); | 510 | type = swp_type(entry); |
| 529 | if (type >= nr_swapfiles) | 511 | if (type >= nr_swapfiles) |
| 530 | goto bad_nofile; | 512 | goto bad_nofile; |
| 531 | p = & swap_info[type]; | 513 | p = swap_info[type]; |
| 532 | if (!(p->flags & SWP_USED)) | 514 | if (!(p->flags & SWP_USED)) |
| 533 | goto bad_device; | 515 | goto bad_device; |
| 534 | offset = swp_offset(entry); | 516 | offset = swp_offset(entry); |
| @@ -554,41 +536,56 @@ out: | |||
| 554 | return NULL; | 536 | return NULL; |
| 555 | } | 537 | } |
| 556 | 538 | ||
| 557 | static int swap_entry_free(struct swap_info_struct *p, | 539 | static unsigned char swap_entry_free(struct swap_info_struct *p, |
| 558 | swp_entry_t ent, int cache) | 540 | swp_entry_t entry, unsigned char usage) |
| 559 | { | 541 | { |
| 560 | unsigned long offset = swp_offset(ent); | 542 | unsigned long offset = swp_offset(entry); |
| 561 | int count = swap_count(p->swap_map[offset]); | 543 | unsigned char count; |
| 562 | bool has_cache; | 544 | unsigned char has_cache; |
| 563 | 545 | ||
| 564 | has_cache = swap_has_cache(p->swap_map[offset]); | 546 | count = p->swap_map[offset]; |
| 547 | has_cache = count & SWAP_HAS_CACHE; | ||
| 548 | count &= ~SWAP_HAS_CACHE; | ||
| 565 | 549 | ||
| 566 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ | 550 | if (usage == SWAP_HAS_CACHE) { |
| 567 | if (count < SWAP_MAP_MAX) { | ||
| 568 | count--; | ||
| 569 | p->swap_map[offset] = encode_swapmap(count, has_cache); | ||
| 570 | } | ||
| 571 | } else { /* dropping swap cache flag */ | ||
| 572 | VM_BUG_ON(!has_cache); | 551 | VM_BUG_ON(!has_cache); |
| 573 | p->swap_map[offset] = encode_swapmap(count, false); | 552 | has_cache = 0; |
| 574 | 553 | } else if (count == SWAP_MAP_SHMEM) { | |
| 554 | /* | ||
| 555 | * Or we could insist on shmem.c using a special | ||
| 556 | * swap_shmem_free() and free_shmem_swap_and_cache()... | ||
| 557 | */ | ||
| 558 | count = 0; | ||
| 559 | } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { | ||
| 560 | if (count == COUNT_CONTINUED) { | ||
| 561 | if (swap_count_continued(p, offset, count)) | ||
| 562 | count = SWAP_MAP_MAX | COUNT_CONTINUED; | ||
| 563 | else | ||
| 564 | count = SWAP_MAP_MAX; | ||
| 565 | } else | ||
| 566 | count--; | ||
| 575 | } | 567 | } |
| 576 | /* return code. */ | 568 | |
| 577 | count = p->swap_map[offset]; | 569 | if (!count) |
| 570 | mem_cgroup_uncharge_swap(entry); | ||
| 571 | |||
| 572 | usage = count | has_cache; | ||
| 573 | p->swap_map[offset] = usage; | ||
| 574 | |||
| 578 | /* free if no reference */ | 575 | /* free if no reference */ |
| 579 | if (!count) { | 576 | if (!usage) { |
| 580 | if (offset < p->lowest_bit) | 577 | if (offset < p->lowest_bit) |
| 581 | p->lowest_bit = offset; | 578 | p->lowest_bit = offset; |
| 582 | if (offset > p->highest_bit) | 579 | if (offset > p->highest_bit) |
| 583 | p->highest_bit = offset; | 580 | p->highest_bit = offset; |
| 584 | if (p->prio > swap_info[swap_list.next].prio) | 581 | if (swap_list.next >= 0 && |
| 585 | swap_list.next = p - swap_info; | 582 | p->prio > swap_info[swap_list.next]->prio) |
| 583 | swap_list.next = p->type; | ||
| 586 | nr_swap_pages++; | 584 | nr_swap_pages++; |
| 587 | p->inuse_pages--; | 585 | p->inuse_pages--; |
| 588 | } | 586 | } |
| 589 | if (!swap_count(count)) | 587 | |
| 590 | mem_cgroup_uncharge_swap(ent); | 588 | return usage; |
| 591 | return count; | ||
| 592 | } | 589 | } |
| 593 | 590 | ||
| 594 | /* | 591 | /* |
| @@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p, | |||
| 597 | */ | 594 | */ |
| 598 | void swap_free(swp_entry_t entry) | 595 | void swap_free(swp_entry_t entry) |
| 599 | { | 596 | { |
| 600 | struct swap_info_struct * p; | 597 | struct swap_info_struct *p; |
| 601 | 598 | ||
| 602 | p = swap_info_get(entry); | 599 | p = swap_info_get(entry); |
| 603 | if (p) { | 600 | if (p) { |
| 604 | swap_entry_free(p, entry, SWAP_MAP); | 601 | swap_entry_free(p, entry, 1); |
| 605 | spin_unlock(&swap_lock); | 602 | spin_unlock(&swap_lock); |
| 606 | } | 603 | } |
| 607 | } | 604 | } |
| @@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry) | |||
| 612 | void swapcache_free(swp_entry_t entry, struct page *page) | 609 | void swapcache_free(swp_entry_t entry, struct page *page) |
| 613 | { | 610 | { |
| 614 | struct swap_info_struct *p; | 611 | struct swap_info_struct *p; |
| 615 | int ret; | 612 | unsigned char count; |
| 616 | 613 | ||
| 617 | p = swap_info_get(entry); | 614 | p = swap_info_get(entry); |
| 618 | if (p) { | 615 | if (p) { |
| 619 | ret = swap_entry_free(p, entry, SWAP_CACHE); | 616 | count = swap_entry_free(p, entry, SWAP_HAS_CACHE); |
| 620 | if (page) { | 617 | if (page) |
| 621 | bool swapout; | 618 | mem_cgroup_uncharge_swapcache(page, entry, count != 0); |
| 622 | if (ret) | ||
| 623 | swapout = true; /* the end of swap out */ | ||
| 624 | else | ||
| 625 | swapout = false; /* no more swap users! */ | ||
| 626 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | ||
| 627 | } | ||
| 628 | spin_unlock(&swap_lock); | 619 | spin_unlock(&swap_lock); |
| 629 | } | 620 | } |
| 630 | return; | ||
| 631 | } | 621 | } |
| 632 | 622 | ||
| 633 | /* | 623 | /* |
| 634 | * How many references to page are currently swapped out? | 624 | * How many references to page are currently swapped out? |
| 625 | * This does not give an exact answer when swap count is continued, | ||
| 626 | * but does include the high COUNT_CONTINUED flag to allow for that. | ||
| 635 | */ | 627 | */ |
| 636 | static inline int page_swapcount(struct page *page) | 628 | static inline int page_swapcount(struct page *page) |
| 637 | { | 629 | { |
| @@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page) | |||
| 659 | int count; | 651 | int count; |
| 660 | 652 | ||
| 661 | VM_BUG_ON(!PageLocked(page)); | 653 | VM_BUG_ON(!PageLocked(page)); |
| 654 | if (unlikely(PageKsm(page))) | ||
| 655 | return 0; | ||
| 662 | count = page_mapcount(page); | 656 | count = page_mapcount(page); |
| 663 | if (count <= 1 && PageSwapCache(page)) { | 657 | if (count <= 1 && PageSwapCache(page)) { |
| 664 | count += page_swapcount(page); | 658 | count += page_swapcount(page); |
| @@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page) | |||
| 667 | SetPageDirty(page); | 661 | SetPageDirty(page); |
| 668 | } | 662 | } |
| 669 | } | 663 | } |
| 670 | return count == 1; | 664 | return count <= 1; |
| 671 | } | 665 | } |
| 672 | 666 | ||
| 673 | /* | 667 | /* |
| @@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry) | |||
| 704 | 698 | ||
| 705 | p = swap_info_get(entry); | 699 | p = swap_info_get(entry); |
| 706 | if (p) { | 700 | if (p) { |
| 707 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { | 701 | if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { |
| 708 | page = find_get_page(&swapper_space, entry.val); | 702 | page = find_get_page(&swapper_space, entry.val); |
| 709 | if (page && !trylock_page(page)) { | 703 | if (page && !trylock_page(page)) { |
| 710 | page_cache_release(page); | 704 | page_cache_release(page); |
| @@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry) | |||
| 741 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | 735 | int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) |
| 742 | { | 736 | { |
| 743 | struct block_device *bdev = NULL; | 737 | struct block_device *bdev = NULL; |
| 744 | int i; | 738 | int type; |
| 745 | 739 | ||
| 746 | if (device) | 740 | if (device) |
| 747 | bdev = bdget(device); | 741 | bdev = bdget(device); |
| 748 | 742 | ||
| 749 | spin_lock(&swap_lock); | 743 | spin_lock(&swap_lock); |
| 750 | for (i = 0; i < nr_swapfiles; i++) { | 744 | for (type = 0; type < nr_swapfiles; type++) { |
| 751 | struct swap_info_struct *sis = swap_info + i; | 745 | struct swap_info_struct *sis = swap_info[type]; |
| 752 | 746 | ||
| 753 | if (!(sis->flags & SWP_WRITEOK)) | 747 | if (!(sis->flags & SWP_WRITEOK)) |
| 754 | continue; | 748 | continue; |
| @@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
| 758 | *bdev_p = bdgrab(sis->bdev); | 752 | *bdev_p = bdgrab(sis->bdev); |
| 759 | 753 | ||
| 760 | spin_unlock(&swap_lock); | 754 | spin_unlock(&swap_lock); |
| 761 | return i; | 755 | return type; |
| 762 | } | 756 | } |
| 763 | if (bdev == sis->bdev) { | 757 | if (bdev == sis->bdev) { |
| 764 | struct swap_extent *se; | 758 | struct swap_extent *se = &sis->first_swap_extent; |
| 765 | 759 | ||
| 766 | se = list_entry(sis->extent_list.next, | ||
| 767 | struct swap_extent, list); | ||
| 768 | if (se->start_block == offset) { | 760 | if (se->start_block == offset) { |
| 769 | if (bdev_p) | 761 | if (bdev_p) |
| 770 | *bdev_p = bdgrab(sis->bdev); | 762 | *bdev_p = bdgrab(sis->bdev); |
| 771 | 763 | ||
| 772 | spin_unlock(&swap_lock); | 764 | spin_unlock(&swap_lock); |
| 773 | bdput(bdev); | 765 | bdput(bdev); |
| 774 | return i; | 766 | return type; |
| 775 | } | 767 | } |
| 776 | } | 768 | } |
| 777 | } | 769 | } |
| @@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
| 783 | } | 775 | } |
| 784 | 776 | ||
| 785 | /* | 777 | /* |
| 778 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | ||
| 779 | * corresponding to given index in swap_info (swap type). | ||
| 780 | */ | ||
| 781 | sector_t swapdev_block(int type, pgoff_t offset) | ||
| 782 | { | ||
| 783 | struct block_device *bdev; | ||
| 784 | |||
| 785 | if ((unsigned int)type >= nr_swapfiles) | ||
| 786 | return 0; | ||
| 787 | if (!(swap_info[type]->flags & SWP_WRITEOK)) | ||
| 788 | return 0; | ||
| 789 | return map_swap_entry(swp_entry(type, offset), &bdev); | ||
| 790 | } | ||
| 791 | |||
| 792 | /* | ||
| 786 | * Return either the total number of swap pages of given type, or the number | 793 | * Return either the total number of swap pages of given type, or the number |
| 787 | * of free pages of that type (depending on @free) | 794 | * of free pages of that type (depending on @free) |
| 788 | * | 795 | * |
| @@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free) | |||
| 792 | { | 799 | { |
| 793 | unsigned int n = 0; | 800 | unsigned int n = 0; |
| 794 | 801 | ||
| 795 | if (type < nr_swapfiles) { | 802 | spin_lock(&swap_lock); |
| 796 | spin_lock(&swap_lock); | 803 | if ((unsigned int)type < nr_swapfiles) { |
| 797 | if (swap_info[type].flags & SWP_WRITEOK) { | 804 | struct swap_info_struct *sis = swap_info[type]; |
| 798 | n = swap_info[type].pages; | 805 | |
| 806 | if (sis->flags & SWP_WRITEOK) { | ||
| 807 | n = sis->pages; | ||
| 799 | if (free) | 808 | if (free) |
| 800 | n -= swap_info[type].inuse_pages; | 809 | n -= sis->inuse_pages; |
| 801 | } | 810 | } |
| 802 | spin_unlock(&swap_lock); | ||
| 803 | } | 811 | } |
| 812 | spin_unlock(&swap_lock); | ||
| 804 | return n; | 813 | return n; |
| 805 | } | 814 | } |
| 806 | #endif | 815 | #endif /* CONFIG_HIBERNATION */ |
| 807 | 816 | ||
| 808 | /* | 817 | /* |
| 809 | * No need to decide whether this PTE shares the swap entry with others, | 818 | * No need to decide whether this PTE shares the swap entry with others, |
| @@ -831,7 +840,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 831 | goto out; | 840 | goto out; |
| 832 | } | 841 | } |
| 833 | 842 | ||
| 834 | inc_mm_counter(vma->vm_mm, anon_rss); | 843 | dec_mm_counter(vma->vm_mm, MM_SWAPENTS); |
| 844 | inc_mm_counter(vma->vm_mm, MM_ANONPAGES); | ||
| 835 | get_page(page); | 845 | get_page(page); |
| 836 | set_pte_at(vma->vm_mm, addr, pte, | 846 | set_pte_at(vma->vm_mm, addr, pte, |
| 837 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 847 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
| @@ -932,7 +942,7 @@ static int unuse_vma(struct vm_area_struct *vma, | |||
| 932 | unsigned long addr, end, next; | 942 | unsigned long addr, end, next; |
| 933 | int ret; | 943 | int ret; |
| 934 | 944 | ||
| 935 | if (page->mapping) { | 945 | if (page_anon_vma(page)) { |
| 936 | addr = page_address_in_vma(page, vma); | 946 | addr = page_address_in_vma(page, vma); |
| 937 | if (addr == -EFAULT) | 947 | if (addr == -EFAULT) |
| 938 | return 0; | 948 | return 0; |
| @@ -988,7 +998,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 988 | { | 998 | { |
| 989 | unsigned int max = si->max; | 999 | unsigned int max = si->max; |
| 990 | unsigned int i = prev; | 1000 | unsigned int i = prev; |
| 991 | int count; | 1001 | unsigned char count; |
| 992 | 1002 | ||
| 993 | /* | 1003 | /* |
| 994 | * No need for swap_lock here: we're just looking | 1004 | * No need for swap_lock here: we're just looking |
| @@ -1024,16 +1034,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 1024 | */ | 1034 | */ |
| 1025 | static int try_to_unuse(unsigned int type) | 1035 | static int try_to_unuse(unsigned int type) |
| 1026 | { | 1036 | { |
| 1027 | struct swap_info_struct * si = &swap_info[type]; | 1037 | struct swap_info_struct *si = swap_info[type]; |
| 1028 | struct mm_struct *start_mm; | 1038 | struct mm_struct *start_mm; |
| 1029 | unsigned short *swap_map; | 1039 | unsigned char *swap_map; |
| 1030 | unsigned short swcount; | 1040 | unsigned char swcount; |
| 1031 | struct page *page; | 1041 | struct page *page; |
| 1032 | swp_entry_t entry; | 1042 | swp_entry_t entry; |
| 1033 | unsigned int i = 0; | 1043 | unsigned int i = 0; |
| 1034 | int retval = 0; | 1044 | int retval = 0; |
| 1035 | int reset_overflow = 0; | ||
| 1036 | int shmem; | ||
| 1037 | 1045 | ||
| 1038 | /* | 1046 | /* |
| 1039 | * When searching mms for an entry, a good strategy is to | 1047 | * When searching mms for an entry, a good strategy is to |
| @@ -1047,8 +1055,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1047 | * together, child after parent. If we race with dup_mmap(), we | 1055 | * together, child after parent. If we race with dup_mmap(), we |
| 1048 | * prefer to resolve parent before child, lest we miss entries | 1056 | * prefer to resolve parent before child, lest we miss entries |
| 1049 | * duplicated after we scanned child: using last mm would invert | 1057 | * duplicated after we scanned child: using last mm would invert |
| 1050 | * that. Though it's only a serious concern when an overflowed | 1058 | * that. |
| 1051 | * swap count is reset from SWAP_MAP_MAX, preventing a rescan. | ||
| 1052 | */ | 1059 | */ |
| 1053 | start_mm = &init_mm; | 1060 | start_mm = &init_mm; |
| 1054 | atomic_inc(&init_mm.mm_users); | 1061 | atomic_inc(&init_mm.mm_users); |
| @@ -1110,17 +1117,18 @@ static int try_to_unuse(unsigned int type) | |||
| 1110 | 1117 | ||
| 1111 | /* | 1118 | /* |
| 1112 | * Remove all references to entry. | 1119 | * Remove all references to entry. |
| 1113 | * Whenever we reach init_mm, there's no address space | ||
| 1114 | * to search, but use it as a reminder to search shmem. | ||
| 1115 | */ | 1120 | */ |
| 1116 | shmem = 0; | ||
| 1117 | swcount = *swap_map; | 1121 | swcount = *swap_map; |
| 1118 | if (swap_count(swcount)) { | 1122 | if (swap_count(swcount) == SWAP_MAP_SHMEM) { |
| 1119 | if (start_mm == &init_mm) | 1123 | retval = shmem_unuse(entry, page); |
| 1120 | shmem = shmem_unuse(entry, page); | 1124 | /* page has already been unlocked and released */ |
| 1121 | else | 1125 | if (retval < 0) |
| 1122 | retval = unuse_mm(start_mm, entry, page); | 1126 | break; |
| 1127 | continue; | ||
| 1123 | } | 1128 | } |
| 1129 | if (swap_count(swcount) && start_mm != &init_mm) | ||
| 1130 | retval = unuse_mm(start_mm, entry, page); | ||
| 1131 | |||
| 1124 | if (swap_count(*swap_map)) { | 1132 | if (swap_count(*swap_map)) { |
| 1125 | int set_start_mm = (*swap_map >= swcount); | 1133 | int set_start_mm = (*swap_map >= swcount); |
| 1126 | struct list_head *p = &start_mm->mmlist; | 1134 | struct list_head *p = &start_mm->mmlist; |
| @@ -1131,7 +1139,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1131 | atomic_inc(&new_start_mm->mm_users); | 1139 | atomic_inc(&new_start_mm->mm_users); |
| 1132 | atomic_inc(&prev_mm->mm_users); | 1140 | atomic_inc(&prev_mm->mm_users); |
| 1133 | spin_lock(&mmlist_lock); | 1141 | spin_lock(&mmlist_lock); |
| 1134 | while (swap_count(*swap_map) && !retval && !shmem && | 1142 | while (swap_count(*swap_map) && !retval && |
| 1135 | (p = p->next) != &start_mm->mmlist) { | 1143 | (p = p->next) != &start_mm->mmlist) { |
| 1136 | mm = list_entry(p, struct mm_struct, mmlist); | 1144 | mm = list_entry(p, struct mm_struct, mmlist); |
| 1137 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1145 | if (!atomic_inc_not_zero(&mm->mm_users)) |
| @@ -1145,10 +1153,9 @@ static int try_to_unuse(unsigned int type) | |||
| 1145 | swcount = *swap_map; | 1153 | swcount = *swap_map; |
| 1146 | if (!swap_count(swcount)) /* any usage ? */ | 1154 | if (!swap_count(swcount)) /* any usage ? */ |
| 1147 | ; | 1155 | ; |
| 1148 | else if (mm == &init_mm) { | 1156 | else if (mm == &init_mm) |
| 1149 | set_start_mm = 1; | 1157 | set_start_mm = 1; |
| 1150 | shmem = shmem_unuse(entry, page); | 1158 | else |
| 1151 | } else | ||
| 1152 | retval = unuse_mm(mm, entry, page); | 1159 | retval = unuse_mm(mm, entry, page); |
| 1153 | 1160 | ||
| 1154 | if (set_start_mm && *swap_map < swcount) { | 1161 | if (set_start_mm && *swap_map < swcount) { |
| @@ -1164,13 +1171,6 @@ static int try_to_unuse(unsigned int type) | |||
| 1164 | mmput(start_mm); | 1171 | mmput(start_mm); |
| 1165 | start_mm = new_start_mm; | 1172 | start_mm = new_start_mm; |
| 1166 | } | 1173 | } |
| 1167 | if (shmem) { | ||
| 1168 | /* page has already been unlocked and released */ | ||
| 1169 | if (shmem > 0) | ||
| 1170 | continue; | ||
| 1171 | retval = shmem; | ||
| 1172 | break; | ||
| 1173 | } | ||
| 1174 | if (retval) { | 1174 | if (retval) { |
| 1175 | unlock_page(page); | 1175 | unlock_page(page); |
| 1176 | page_cache_release(page); | 1176 | page_cache_release(page); |
| @@ -1178,30 +1178,6 @@ static int try_to_unuse(unsigned int type) | |||
| 1178 | } | 1178 | } |
| 1179 | 1179 | ||
| 1180 | /* | 1180 | /* |
| 1181 | * How could swap count reach 0x7ffe ? | ||
| 1182 | * There's no way to repeat a swap page within an mm | ||
| 1183 | * (except in shmem, where it's the shared object which takes | ||
| 1184 | * the reference count)? | ||
| 1185 | * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned | ||
| 1186 | * short is too small....) | ||
| 1187 | * If that's wrong, then we should worry more about | ||
| 1188 | * exit_mmap() and do_munmap() cases described above: | ||
| 1189 | * we might be resetting SWAP_MAP_MAX too early here. | ||
| 1190 | * We know "Undead"s can happen, they're okay, so don't | ||
| 1191 | * report them; but do report if we reset SWAP_MAP_MAX. | ||
| 1192 | */ | ||
| 1193 | /* We might release the lock_page() in unuse_mm(). */ | ||
| 1194 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
| 1195 | goto retry; | ||
| 1196 | |||
| 1197 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
| 1198 | spin_lock(&swap_lock); | ||
| 1199 | *swap_map = encode_swapmap(0, true); | ||
| 1200 | spin_unlock(&swap_lock); | ||
| 1201 | reset_overflow = 1; | ||
| 1202 | } | ||
| 1203 | |||
| 1204 | /* | ||
| 1205 | * If a reference remains (rare), we would like to leave | 1181 | * If a reference remains (rare), we would like to leave |
| 1206 | * the page in the swap cache; but try_to_unmap could | 1182 | * the page in the swap cache; but try_to_unmap could |
| 1207 | * then re-duplicate the entry once we drop page lock, | 1183 | * then re-duplicate the entry once we drop page lock, |
| @@ -1213,6 +1189,12 @@ static int try_to_unuse(unsigned int type) | |||
| 1213 | * read from disk into another page. Splitting into two | 1189 | * read from disk into another page. Splitting into two |
| 1214 | * pages would be incorrect if swap supported "shared | 1190 | * pages would be incorrect if swap supported "shared |
| 1215 | * private" pages, but they are handled by tmpfs files. | 1191 | * private" pages, but they are handled by tmpfs files. |
| 1192 | * | ||
| 1193 | * Given how unuse_vma() targets one particular offset | ||
| 1194 | * in an anon_vma, once the anon_vma has been determined, | ||
| 1195 | * this splitting happens to be just what is needed to | ||
| 1196 | * handle where KSM pages have been swapped out: re-reading | ||
| 1197 | * is unnecessarily slow, but we can fix that later on. | ||
| 1216 | */ | 1198 | */ |
| 1217 | if (swap_count(*swap_map) && | 1199 | if (swap_count(*swap_map) && |
| 1218 | PageDirty(page) && PageSwapCache(page)) { | 1200 | PageDirty(page) && PageSwapCache(page)) { |
| @@ -1242,7 +1224,6 @@ static int try_to_unuse(unsigned int type) | |||
| 1242 | * mark page dirty so shrink_page_list will preserve it. | 1224 | * mark page dirty so shrink_page_list will preserve it. |
| 1243 | */ | 1225 | */ |
| 1244 | SetPageDirty(page); | 1226 | SetPageDirty(page); |
| 1245 | retry: | ||
| 1246 | unlock_page(page); | 1227 | unlock_page(page); |
| 1247 | page_cache_release(page); | 1228 | page_cache_release(page); |
| 1248 | 1229 | ||
| @@ -1254,10 +1235,6 @@ retry: | |||
| 1254 | } | 1235 | } |
| 1255 | 1236 | ||
| 1256 | mmput(start_mm); | 1237 | mmput(start_mm); |
| 1257 | if (reset_overflow) { | ||
| 1258 | printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); | ||
| 1259 | swap_overflow = 0; | ||
| 1260 | } | ||
| 1261 | return retval; | 1238 | return retval; |
| 1262 | } | 1239 | } |
| 1263 | 1240 | ||
| @@ -1270,10 +1247,10 @@ retry: | |||
| 1270 | static void drain_mmlist(void) | 1247 | static void drain_mmlist(void) |
| 1271 | { | 1248 | { |
| 1272 | struct list_head *p, *next; | 1249 | struct list_head *p, *next; |
| 1273 | unsigned int i; | 1250 | unsigned int type; |
| 1274 | 1251 | ||
| 1275 | for (i = 0; i < nr_swapfiles; i++) | 1252 | for (type = 0; type < nr_swapfiles; type++) |
| 1276 | if (swap_info[i].inuse_pages) | 1253 | if (swap_info[type]->inuse_pages) |
| 1277 | return; | 1254 | return; |
| 1278 | spin_lock(&mmlist_lock); | 1255 | spin_lock(&mmlist_lock); |
| 1279 | list_for_each_safe(p, next, &init_mm.mmlist) | 1256 | list_for_each_safe(p, next, &init_mm.mmlist) |
| @@ -1283,12 +1260,23 @@ static void drain_mmlist(void) | |||
| 1283 | 1260 | ||
| 1284 | /* | 1261 | /* |
| 1285 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which | 1262 | * Use this swapdev's extent info to locate the (PAGE_SIZE) block which |
| 1286 | * corresponds to page offset `offset'. | 1263 | * corresponds to page offset for the specified swap entry. |
| 1264 | * Note that the type of this function is sector_t, but it returns page offset | ||
| 1265 | * into the bdev, not sector offset. | ||
| 1287 | */ | 1266 | */ |
| 1288 | sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | 1267 | static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) |
| 1289 | { | 1268 | { |
| 1290 | struct swap_extent *se = sis->curr_swap_extent; | 1269 | struct swap_info_struct *sis; |
| 1291 | struct swap_extent *start_se = se; | 1270 | struct swap_extent *start_se; |
| 1271 | struct swap_extent *se; | ||
| 1272 | pgoff_t offset; | ||
| 1273 | |||
| 1274 | sis = swap_info[swp_type(entry)]; | ||
| 1275 | *bdev = sis->bdev; | ||
| 1276 | |||
| 1277 | offset = swp_offset(entry); | ||
| 1278 | start_se = sis->curr_swap_extent; | ||
| 1279 | se = start_se; | ||
| 1292 | 1280 | ||
| 1293 | for ( ; ; ) { | 1281 | for ( ; ; ) { |
| 1294 | struct list_head *lh; | 1282 | struct list_head *lh; |
| @@ -1298,40 +1286,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) | |||
| 1298 | return se->start_block + (offset - se->start_page); | 1286 | return se->start_block + (offset - se->start_page); |
| 1299 | } | 1287 | } |
| 1300 | lh = se->list.next; | 1288 | lh = se->list.next; |
| 1301 | if (lh == &sis->extent_list) | ||
| 1302 | lh = lh->next; | ||
| 1303 | se = list_entry(lh, struct swap_extent, list); | 1289 | se = list_entry(lh, struct swap_extent, list); |
| 1304 | sis->curr_swap_extent = se; | 1290 | sis->curr_swap_extent = se; |
| 1305 | BUG_ON(se == start_se); /* It *must* be present */ | 1291 | BUG_ON(se == start_se); /* It *must* be present */ |
| 1306 | } | 1292 | } |
| 1307 | } | 1293 | } |
| 1308 | 1294 | ||
| 1309 | #ifdef CONFIG_HIBERNATION | ||
| 1310 | /* | 1295 | /* |
| 1311 | * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev | 1296 | * Returns the page offset into bdev for the specified page's swap entry. |
| 1312 | * corresponding to given index in swap_info (swap type). | ||
| 1313 | */ | 1297 | */ |
| 1314 | sector_t swapdev_block(int swap_type, pgoff_t offset) | 1298 | sector_t map_swap_page(struct page *page, struct block_device **bdev) |
| 1315 | { | 1299 | { |
| 1316 | struct swap_info_struct *sis; | 1300 | swp_entry_t entry; |
| 1317 | 1301 | entry.val = page_private(page); | |
| 1318 | if (swap_type >= nr_swapfiles) | 1302 | return map_swap_entry(entry, bdev); |
| 1319 | return 0; | ||
| 1320 | |||
| 1321 | sis = swap_info + swap_type; | ||
| 1322 | return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; | ||
| 1323 | } | 1303 | } |
| 1324 | #endif /* CONFIG_HIBERNATION */ | ||
| 1325 | 1304 | ||
| 1326 | /* | 1305 | /* |
| 1327 | * Free all of a swapdev's extent information | 1306 | * Free all of a swapdev's extent information |
| 1328 | */ | 1307 | */ |
| 1329 | static void destroy_swap_extents(struct swap_info_struct *sis) | 1308 | static void destroy_swap_extents(struct swap_info_struct *sis) |
| 1330 | { | 1309 | { |
| 1331 | while (!list_empty(&sis->extent_list)) { | 1310 | while (!list_empty(&sis->first_swap_extent.list)) { |
| 1332 | struct swap_extent *se; | 1311 | struct swap_extent *se; |
| 1333 | 1312 | ||
| 1334 | se = list_entry(sis->extent_list.next, | 1313 | se = list_entry(sis->first_swap_extent.list.next, |
| 1335 | struct swap_extent, list); | 1314 | struct swap_extent, list); |
| 1336 | list_del(&se->list); | 1315 | list_del(&se->list); |
| 1337 | kfree(se); | 1316 | kfree(se); |
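map_swap_entry() performs the same extent walk the old map_swap_page() did, only keyed by a swp_entry_t and returning the block device through a pointer. The lookup itself is a plain "find the extent containing this page offset" search; here it is modeled over a flat array instead of the kernel's list of extents (layout numbers are invented):

```c
#include <stdio.h>

struct extent {
	unsigned long start_page;	/* first swap page offset covered */
	unsigned long nr_pages;		/* number of pages in the extent */
	unsigned long long start_block;	/* backing block of start_page */
};

/* invented layout: three extents of a fragmented swap file */
static const struct extent extents[] = {
	{ .start_page = 0,   .nr_pages = 64,  .start_block = 1000 },
	{ .start_page = 64,  .nr_pages = 32,  .start_block = 5000 },
	{ .start_page = 96,  .nr_pages = 128, .start_block = 2000 },
};

/* same computation as map_swap_entry(): block = start_block + delta */
static unsigned long long map_offset(unsigned long offset)
{
	for (size_t i = 0; i < sizeof(extents) / sizeof(extents[0]); i++) {
		const struct extent *se = &extents[i];

		if (se->start_page <= offset &&
		    offset < se->start_page + se->nr_pages)
			return se->start_block + (offset - se->start_page);
	}
	return 0;	/* the kernel BUG()s instead: the extent must exist */
}

int main(void)
{
	printf("offset 10  -> block %llu\n", map_offset(10));	/* 1010 */
	printf("offset 70  -> block %llu\n", map_offset(70));	/* 5006 */
	printf("offset 100 -> block %llu\n", map_offset(100));	/* 2004 */
	return 0;
}
```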
| @@ -1352,8 +1331,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
| 1352 | struct swap_extent *new_se; | 1331 | struct swap_extent *new_se; |
| 1353 | struct list_head *lh; | 1332 | struct list_head *lh; |
| 1354 | 1333 | ||
| 1355 | lh = sis->extent_list.prev; /* The highest page extent */ | 1334 | if (start_page == 0) { |
| 1356 | if (lh != &sis->extent_list) { | 1335 | se = &sis->first_swap_extent; |
| 1336 | sis->curr_swap_extent = se; | ||
| 1337 | se->start_page = 0; | ||
| 1338 | se->nr_pages = nr_pages; | ||
| 1339 | se->start_block = start_block; | ||
| 1340 | return 1; | ||
| 1341 | } else { | ||
| 1342 | lh = sis->first_swap_extent.list.prev; /* Highest extent */ | ||
| 1357 | se = list_entry(lh, struct swap_extent, list); | 1343 | se = list_entry(lh, struct swap_extent, list); |
| 1358 | BUG_ON(se->start_page + se->nr_pages != start_page); | 1344 | BUG_ON(se->start_page + se->nr_pages != start_page); |
| 1359 | if (se->start_block + se->nr_pages == start_block) { | 1345 | if (se->start_block + se->nr_pages == start_block) { |
| @@ -1373,7 +1359,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
| 1373 | new_se->nr_pages = nr_pages; | 1359 | new_se->nr_pages = nr_pages; |
| 1374 | new_se->start_block = start_block; | 1360 | new_se->start_block = start_block; |
| 1375 | 1361 | ||
| 1376 | list_add_tail(&new_se->list, &sis->extent_list); | 1362 | list_add_tail(&new_se->list, &sis->first_swap_extent.list); |
| 1377 | return 1; | 1363 | return 1; |
| 1378 | } | 1364 | } |
| 1379 | 1365 | ||
| @@ -1425,7 +1411,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | |||
| 1425 | if (S_ISBLK(inode->i_mode)) { | 1411 | if (S_ISBLK(inode->i_mode)) { |
| 1426 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1412 | ret = add_swap_extent(sis, 0, sis->max, 0); |
| 1427 | *span = sis->pages; | 1413 | *span = sis->pages; |
| 1428 | goto done; | 1414 | goto out; |
| 1429 | } | 1415 | } |
| 1430 | 1416 | ||
| 1431 | blkbits = inode->i_blkbits; | 1417 | blkbits = inode->i_blkbits; |
| @@ -1496,25 +1482,22 @@ reprobe: | |||
| 1496 | sis->max = page_no; | 1482 | sis->max = page_no; |
| 1497 | sis->pages = page_no - 1; | 1483 | sis->pages = page_no - 1; |
| 1498 | sis->highest_bit = page_no - 1; | 1484 | sis->highest_bit = page_no - 1; |
| 1499 | done: | 1485 | out: |
| 1500 | sis->curr_swap_extent = list_entry(sis->extent_list.prev, | 1486 | return ret; |
| 1501 | struct swap_extent, list); | ||
| 1502 | goto out; | ||
| 1503 | bad_bmap: | 1487 | bad_bmap: |
| 1504 | printk(KERN_ERR "swapon: swapfile has holes\n"); | 1488 | printk(KERN_ERR "swapon: swapfile has holes\n"); |
| 1505 | ret = -EINVAL; | 1489 | ret = -EINVAL; |
| 1506 | out: | 1490 | goto out; |
| 1507 | return ret; | ||
| 1508 | } | 1491 | } |
| 1509 | 1492 | ||
| 1510 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | 1493 | SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) |
| 1511 | { | 1494 | { |
| 1512 | struct swap_info_struct * p = NULL; | 1495 | struct swap_info_struct *p = NULL; |
| 1513 | unsigned short *swap_map; | 1496 | unsigned char *swap_map; |
| 1514 | struct file *swap_file, *victim; | 1497 | struct file *swap_file, *victim; |
| 1515 | struct address_space *mapping; | 1498 | struct address_space *mapping; |
| 1516 | struct inode *inode; | 1499 | struct inode *inode; |
| 1517 | char * pathname; | 1500 | char *pathname; |
| 1518 | int i, type, prev; | 1501 | int i, type, prev; |
| 1519 | int err; | 1502 | int err; |
| 1520 | 1503 | ||
| @@ -1535,8 +1518,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1535 | mapping = victim->f_mapping; | 1518 | mapping = victim->f_mapping; |
| 1536 | prev = -1; | 1519 | prev = -1; |
| 1537 | spin_lock(&swap_lock); | 1520 | spin_lock(&swap_lock); |
| 1538 | for (type = swap_list.head; type >= 0; type = swap_info[type].next) { | 1521 | for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { |
| 1539 | p = swap_info + type; | 1522 | p = swap_info[type]; |
| 1540 | if (p->flags & SWP_WRITEOK) { | 1523 | if (p->flags & SWP_WRITEOK) { |
| 1541 | if (p->swap_file->f_mapping == mapping) | 1524 | if (p->swap_file->f_mapping == mapping) |
| 1542 | break; | 1525 | break; |
| @@ -1555,18 +1538,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1555 | spin_unlock(&swap_lock); | 1538 | spin_unlock(&swap_lock); |
| 1556 | goto out_dput; | 1539 | goto out_dput; |
| 1557 | } | 1540 | } |
| 1558 | if (prev < 0) { | 1541 | if (prev < 0) |
| 1559 | swap_list.head = p->next; | 1542 | swap_list.head = p->next; |
| 1560 | } else { | 1543 | else |
| 1561 | swap_info[prev].next = p->next; | 1544 | swap_info[prev]->next = p->next; |
| 1562 | } | ||
| 1563 | if (type == swap_list.next) { | 1545 | if (type == swap_list.next) { |
| 1564 | /* just pick something that's safe... */ | 1546 | /* just pick something that's safe... */ |
| 1565 | swap_list.next = swap_list.head; | 1547 | swap_list.next = swap_list.head; |
| 1566 | } | 1548 | } |
| 1567 | if (p->prio < 0) { | 1549 | if (p->prio < 0) { |
| 1568 | for (i = p->next; i >= 0; i = swap_info[i].next) | 1550 | for (i = p->next; i >= 0; i = swap_info[i]->next) |
| 1569 | swap_info[i].prio = p->prio--; | 1551 | swap_info[i]->prio = p->prio--; |
| 1570 | least_priority++; | 1552 | least_priority++; |
| 1571 | } | 1553 | } |
| 1572 | nr_swap_pages -= p->pages; | 1554 | nr_swap_pages -= p->pages; |
| @@ -1584,16 +1566,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1584 | if (p->prio < 0) | 1566 | if (p->prio < 0) |
| 1585 | p->prio = --least_priority; | 1567 | p->prio = --least_priority; |
| 1586 | prev = -1; | 1568 | prev = -1; |
| 1587 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 1569 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
| 1588 | if (p->prio >= swap_info[i].prio) | 1570 | if (p->prio >= swap_info[i]->prio) |
| 1589 | break; | 1571 | break; |
| 1590 | prev = i; | 1572 | prev = i; |
| 1591 | } | 1573 | } |
| 1592 | p->next = i; | 1574 | p->next = i; |
| 1593 | if (prev < 0) | 1575 | if (prev < 0) |
| 1594 | swap_list.head = swap_list.next = p - swap_info; | 1576 | swap_list.head = swap_list.next = type; |
| 1595 | else | 1577 | else |
| 1596 | swap_info[prev].next = p - swap_info; | 1578 | swap_info[prev]->next = type; |
| 1597 | nr_swap_pages += p->pages; | 1579 | nr_swap_pages += p->pages; |
| 1598 | total_swap_pages += p->pages; | 1580 | total_swap_pages += p->pages; |
| 1599 | p->flags |= SWP_WRITEOK; | 1581 | p->flags |= SWP_WRITEOK; |
| @@ -1606,6 +1588,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1606 | up_write(&swap_unplug_sem); | 1588 | up_write(&swap_unplug_sem); |
| 1607 | 1589 | ||
| 1608 | destroy_swap_extents(p); | 1590 | destroy_swap_extents(p); |
| 1591 | if (p->flags & SWP_CONTINUED) | ||
| 1592 | free_swap_count_continuations(p); | ||
| 1593 | |||
| 1609 | mutex_lock(&swapon_mutex); | 1594 | mutex_lock(&swapon_mutex); |
| 1610 | spin_lock(&swap_lock); | 1595 | spin_lock(&swap_lock); |
| 1611 | drain_mmlist(); | 1596 | drain_mmlist(); |
| @@ -1653,8 +1638,8 @@ out: | |||
| 1653 | /* iterator */ | 1638 | /* iterator */ |
| 1654 | static void *swap_start(struct seq_file *swap, loff_t *pos) | 1639 | static void *swap_start(struct seq_file *swap, loff_t *pos) |
| 1655 | { | 1640 | { |
| 1656 | struct swap_info_struct *ptr = swap_info; | 1641 | struct swap_info_struct *si; |
| 1657 | int i; | 1642 | int type; |
| 1658 | loff_t l = *pos; | 1643 | loff_t l = *pos; |
| 1659 | 1644 | ||
| 1660 | mutex_lock(&swapon_mutex); | 1645 | mutex_lock(&swapon_mutex); |
| @@ -1662,11 +1647,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
| 1662 | if (!l) | 1647 | if (!l) |
| 1663 | return SEQ_START_TOKEN; | 1648 | return SEQ_START_TOKEN; |
| 1664 | 1649 | ||
| 1665 | for (i = 0; i < nr_swapfiles; i++, ptr++) { | 1650 | for (type = 0; type < nr_swapfiles; type++) { |
| 1666 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1651 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
| 1652 | si = swap_info[type]; | ||
| 1653 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
| 1667 | continue; | 1654 | continue; |
| 1668 | if (!--l) | 1655 | if (!--l) |
| 1669 | return ptr; | 1656 | return si; |
| 1670 | } | 1657 | } |
| 1671 | 1658 | ||
| 1672 | return NULL; | 1659 | return NULL; |
| @@ -1674,21 +1661,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) | |||
| 1674 | 1661 | ||
| 1675 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) | 1662 | static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) |
| 1676 | { | 1663 | { |
| 1677 | struct swap_info_struct *ptr; | 1664 | struct swap_info_struct *si = v; |
| 1678 | struct swap_info_struct *endptr = swap_info + nr_swapfiles; | 1665 | int type; |
| 1679 | 1666 | ||
| 1680 | if (v == SEQ_START_TOKEN) | 1667 | if (v == SEQ_START_TOKEN) |
| 1681 | ptr = swap_info; | 1668 | type = 0; |
| 1682 | else { | 1669 | else |
| 1683 | ptr = v; | 1670 | type = si->type + 1; |
| 1684 | ptr++; | ||
| 1685 | } | ||
| 1686 | 1671 | ||
| 1687 | for (; ptr < endptr; ptr++) { | 1672 | for (; type < nr_swapfiles; type++) { |
| 1688 | if (!(ptr->flags & SWP_USED) || !ptr->swap_map) | 1673 | smp_rmb(); /* read nr_swapfiles before swap_info[type] */ |
| 1674 | si = swap_info[type]; | ||
| 1675 | if (!(si->flags & SWP_USED) || !si->swap_map) | ||
| 1689 | continue; | 1676 | continue; |
| 1690 | ++*pos; | 1677 | ++*pos; |
| 1691 | return ptr; | 1678 | return si; |
| 1692 | } | 1679 | } |
| 1693 | 1680 | ||
| 1694 | return NULL; | 1681 | return NULL; |
| @@ -1701,24 +1688,24 @@ static void swap_stop(struct seq_file *swap, void *v) | |||
| 1701 | 1688 | ||
| 1702 | static int swap_show(struct seq_file *swap, void *v) | 1689 | static int swap_show(struct seq_file *swap, void *v) |
| 1703 | { | 1690 | { |
| 1704 | struct swap_info_struct *ptr = v; | 1691 | struct swap_info_struct *si = v; |
| 1705 | struct file *file; | 1692 | struct file *file; |
| 1706 | int len; | 1693 | int len; |
| 1707 | 1694 | ||
| 1708 | if (ptr == SEQ_START_TOKEN) { | 1695 | if (si == SEQ_START_TOKEN) { |
| 1709 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); | 1696 | seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); |
| 1710 | return 0; | 1697 | return 0; |
| 1711 | } | 1698 | } |
| 1712 | 1699 | ||
| 1713 | file = ptr->swap_file; | 1700 | file = si->swap_file; |
| 1714 | len = seq_path(swap, &file->f_path, " \t\n\\"); | 1701 | len = seq_path(swap, &file->f_path, " \t\n\\"); |
| 1715 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", | 1702 | seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", |
| 1716 | len < 40 ? 40 - len : 1, " ", | 1703 | len < 40 ? 40 - len : 1, " ", |
| 1717 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? | 1704 | S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? |
| 1718 | "partition" : "file\t", | 1705 | "partition" : "file\t", |
| 1719 | ptr->pages << (PAGE_SHIFT - 10), | 1706 | si->pages << (PAGE_SHIFT - 10), |
| 1720 | ptr->inuse_pages << (PAGE_SHIFT - 10), | 1707 | si->inuse_pages << (PAGE_SHIFT - 10), |
| 1721 | ptr->prio); | 1708 | si->prio); |
| 1722 | return 0; | 1709 | return 0; |
| 1723 | } | 1710 | } |
| 1724 | 1711 | ||
| @@ -1765,7 +1752,7 @@ late_initcall(max_swapfiles_check); | |||
| 1765 | */ | 1752 | */ |
| 1766 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | 1753 | SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) |
| 1767 | { | 1754 | { |
| 1768 | struct swap_info_struct * p; | 1755 | struct swap_info_struct *p; |
| 1769 | char *name = NULL; | 1756 | char *name = NULL; |
| 1770 | struct block_device *bdev = NULL; | 1757 | struct block_device *bdev = NULL; |
| 1771 | struct file *swap_file = NULL; | 1758 | struct file *swap_file = NULL; |
| @@ -1773,36 +1760,58 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1773 | unsigned int type; | 1760 | unsigned int type; |
| 1774 | int i, prev; | 1761 | int i, prev; |
| 1775 | int error; | 1762 | int error; |
| 1776 | union swap_header *swap_header = NULL; | 1763 | union swap_header *swap_header; |
| 1777 | unsigned int nr_good_pages = 0; | 1764 | unsigned int nr_good_pages; |
| 1778 | int nr_extents = 0; | 1765 | int nr_extents = 0; |
| 1779 | sector_t span; | 1766 | sector_t span; |
| 1780 | unsigned long maxpages = 1; | 1767 | unsigned long maxpages; |
| 1781 | unsigned long swapfilepages; | 1768 | unsigned long swapfilepages; |
| 1782 | unsigned short *swap_map = NULL; | 1769 | unsigned char *swap_map = NULL; |
| 1783 | struct page *page = NULL; | 1770 | struct page *page = NULL; |
| 1784 | struct inode *inode = NULL; | 1771 | struct inode *inode = NULL; |
| 1785 | int did_down = 0; | 1772 | int did_down = 0; |
| 1786 | 1773 | ||
| 1787 | if (!capable(CAP_SYS_ADMIN)) | 1774 | if (!capable(CAP_SYS_ADMIN)) |
| 1788 | return -EPERM; | 1775 | return -EPERM; |
| 1776 | |||
| 1777 | p = kzalloc(sizeof(*p), GFP_KERNEL); | ||
| 1778 | if (!p) | ||
| 1779 | return -ENOMEM; | ||
| 1780 | |||
| 1789 | spin_lock(&swap_lock); | 1781 | spin_lock(&swap_lock); |
| 1790 | p = swap_info; | 1782 | for (type = 0; type < nr_swapfiles; type++) { |
| 1791 | for (type = 0 ; type < nr_swapfiles ; type++,p++) | 1783 | if (!(swap_info[type]->flags & SWP_USED)) |
| 1792 | if (!(p->flags & SWP_USED)) | ||
| 1793 | break; | 1784 | break; |
| 1785 | } | ||
| 1794 | error = -EPERM; | 1786 | error = -EPERM; |
| 1795 | if (type >= MAX_SWAPFILES) { | 1787 | if (type >= MAX_SWAPFILES) { |
| 1796 | spin_unlock(&swap_lock); | 1788 | spin_unlock(&swap_lock); |
| 1789 | kfree(p); | ||
| 1797 | goto out; | 1790 | goto out; |
| 1798 | } | 1791 | } |
| 1799 | if (type >= nr_swapfiles) | 1792 | if (type >= nr_swapfiles) { |
| 1800 | nr_swapfiles = type+1; | 1793 | p->type = type; |
| 1801 | memset(p, 0, sizeof(*p)); | 1794 | swap_info[type] = p; |
| 1802 | INIT_LIST_HEAD(&p->extent_list); | 1795 | /* |
| 1796 | * Write swap_info[type] before nr_swapfiles, in case a | ||
| 1797 | * racing procfs swap_start() or swap_next() is reading them. | ||
| 1798 | * (We never shrink nr_swapfiles, we never free this entry.) | ||
| 1799 | */ | ||
| 1800 | smp_wmb(); | ||
| 1801 | nr_swapfiles++; | ||
| 1802 | } else { | ||
| 1803 | kfree(p); | ||
| 1804 | p = swap_info[type]; | ||
| 1805 | /* | ||
| 1806 | * Do not memset this entry: a racing procfs swap_next() | ||
| 1807 | * would be relying on p->type to remain valid. | ||
| 1808 | */ | ||
| 1809 | } | ||
| 1810 | INIT_LIST_HEAD(&p->first_swap_extent.list); | ||
| 1803 | p->flags = SWP_USED; | 1811 | p->flags = SWP_USED; |
| 1804 | p->next = -1; | 1812 | p->next = -1; |
| 1805 | spin_unlock(&swap_lock); | 1813 | spin_unlock(&swap_lock); |
| 1814 | |||
| 1806 | name = getname(specialfile); | 1815 | name = getname(specialfile); |
| 1807 | error = PTR_ERR(name); | 1816 | error = PTR_ERR(name); |
| 1808 | if (IS_ERR(name)) { | 1817 | if (IS_ERR(name)) { |
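The allocation-and-publish sequence in the hunk above depends on ordering: the new swap_info[type] pointer must become visible to other CPUs before the raised nr_swapfiles, so that a reader which checks the count and then dereferences the slot never sees NULL. The comment in the hunk points at the procfs swap_start()/swap_next() readers as the other side of that pairing. Below is a minimal user-space sketch of the same publish pattern, with C11 release/acquire standing in for the kernel's smp_wmb() and its reader-side counterpart; every name in it is illustrative, not the kernel's.

/* Publish-then-count, as in the hunk above: store the slot pointer, then
 * advance the count, so a reader that sees the new count also sees the
 * pointer.  C11 release/acquire stands in for the kernel barriers. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_SLOTS 32                       /* plays the role of MAX_SWAPFILES */

struct slot { int type; };

static struct slot *slots[MAX_SLOTS];      /* plays the role of swap_info[]   */
static atomic_uint nr_slots;               /* plays the role of nr_swapfiles  */

static void publish_slot(unsigned int type)
{
        struct slot *s = calloc(1, sizeof(*s));

        if (!s)
                return;
        s->type = type;
        slots[type] = s;                   /* write the slot first ...        */
        /* ... then the count: release ordering, like the smp_wmb() above */
        atomic_store_explicit(&nr_slots, type + 1, memory_order_release);
}

static struct slot *read_slot(unsigned int type)
{
        /* acquire the count before touching the array, pairing with above */
        if (type >= atomic_load_explicit(&nr_slots, memory_order_acquire))
                return NULL;
        return slots[type];                /* never NULL if the count covers it */
}

int main(void)
{
        publish_slot(0);
        publish_slot(1);
        printf("slot 1 has type %d\n", read_slot(1)->type);
        return 0;
}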
| @@ -1822,7 +1831,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1822 | 1831 | ||
| 1823 | error = -EBUSY; | 1832 | error = -EBUSY; |
| 1824 | for (i = 0; i < nr_swapfiles; i++) { | 1833 | for (i = 0; i < nr_swapfiles; i++) { |
| 1825 | struct swap_info_struct *q = &swap_info[i]; | 1834 | struct swap_info_struct *q = swap_info[i]; |
| 1826 | 1835 | ||
| 1827 | if (i == type || !q->swap_file) | 1836 | if (i == type || !q->swap_file) |
| 1828 | continue; | 1837 | continue; |
| @@ -1897,6 +1906,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1897 | 1906 | ||
| 1898 | p->lowest_bit = 1; | 1907 | p->lowest_bit = 1; |
| 1899 | p->cluster_next = 1; | 1908 | p->cluster_next = 1; |
| 1909 | p->cluster_nr = 0; | ||
| 1900 | 1910 | ||
| 1901 | /* | 1911 | /* |
| 1902 | * Find out how many pages are allowed for a single swap | 1912 | * Find out how many pages are allowed for a single swap |
| @@ -1913,9 +1923,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1913 | * swap pte. | 1923 | * swap pte. |
| 1914 | */ | 1924 | */ |
| 1915 | maxpages = swp_offset(pte_to_swp_entry( | 1925 | maxpages = swp_offset(pte_to_swp_entry( |
| 1916 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1926 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
| 1917 | if (maxpages > swap_header->info.last_page) | 1927 | if (maxpages > swap_header->info.last_page) { |
| 1918 | maxpages = swap_header->info.last_page; | 1928 | maxpages = swap_header->info.last_page + 1; |
| 1929 | /* p->max is an unsigned int: don't overflow it */ | ||
| 1930 | if ((unsigned int)maxpages == 0) | ||
| 1931 | maxpages = UINT_MAX; | ||
| 1932 | } | ||
| 1919 | p->highest_bit = maxpages - 1; | 1933 | p->highest_bit = maxpages - 1; |
| 1920 | 1934 | ||
| 1921 | error = -EINVAL; | 1935 | error = -EINVAL; |
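The change from "- 1" to "+ 1" above is an off-by-one fix: round-tripping offset ~0UL through swp_entry_to_pte()/pte_to_swp_entry() yields the largest offset the architecture's swap pte can encode, and since offsets start at 0 the number of usable slots is that offset plus one. The result is then clamped to last_page + 1 (last_page in the swap header is an index, so its count is again one more), with a guard so the clamp cannot wrap p->max, which is an unsigned int. A worked example with invented numbers:

/* Worked example of the maxpages arithmetic above; the 22-bit offset
 * width and the last_page value are invented for illustration. */
#include <limits.h>
#include <stdio.h>

int main(void)
{
        unsigned long arch_max_offset = (1UL << 22) - 1; /* largest encodable offset */
        unsigned long last_page = 1048575;               /* highest page index in the header */
        unsigned long maxpages = arch_max_offset + 1;    /* offsets 0..max => max+1 slots */

        if (maxpages > last_page) {
                maxpages = last_page + 1;                /* index -> count, again +1 */
                if ((unsigned int)maxpages == 0)         /* don't wrap the unsigned int p->max */
                        maxpages = UINT_MAX;
        }
        printf("maxpages = %lu\n", maxpages);            /* 1048576 with these numbers */
        return 0;
}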
| @@ -1932,30 +1946,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1932 | goto bad_swap; | 1946 | goto bad_swap; |
| 1933 | 1947 | ||
| 1934 | /* OK, set up the swap map and apply the bad block list */ | 1948 | /* OK, set up the swap map and apply the bad block list */ |
| 1935 | swap_map = vmalloc(maxpages * sizeof(short)); | 1949 | swap_map = vmalloc(maxpages); |
| 1936 | if (!swap_map) { | 1950 | if (!swap_map) { |
| 1937 | error = -ENOMEM; | 1951 | error = -ENOMEM; |
| 1938 | goto bad_swap; | 1952 | goto bad_swap; |
| 1939 | } | 1953 | } |
| 1940 | 1954 | ||
| 1941 | memset(swap_map, 0, maxpages * sizeof(short)); | 1955 | memset(swap_map, 0, maxpages); |
| 1956 | nr_good_pages = maxpages - 1; /* omit header page */ | ||
| 1957 | |||
| 1942 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1958 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
| 1943 | int page_nr = swap_header->info.badpages[i]; | 1959 | unsigned int page_nr = swap_header->info.badpages[i]; |
| 1944 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1960 | if (page_nr == 0 || page_nr > swap_header->info.last_page) { |
| 1945 | error = -EINVAL; | 1961 | error = -EINVAL; |
| 1946 | goto bad_swap; | 1962 | goto bad_swap; |
| 1947 | } | 1963 | } |
| 1948 | swap_map[page_nr] = SWAP_MAP_BAD; | 1964 | if (page_nr < maxpages) { |
| 1965 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
| 1966 | nr_good_pages--; | ||
| 1967 | } | ||
| 1949 | } | 1968 | } |
| 1950 | 1969 | ||
| 1951 | error = swap_cgroup_swapon(type, maxpages); | 1970 | error = swap_cgroup_swapon(type, maxpages); |
| 1952 | if (error) | 1971 | if (error) |
| 1953 | goto bad_swap; | 1972 | goto bad_swap; |
| 1954 | 1973 | ||
| 1955 | nr_good_pages = swap_header->info.last_page - | ||
| 1956 | swap_header->info.nr_badpages - | ||
| 1957 | 1 /* header page */; | ||
| 1958 | |||
| 1959 | if (nr_good_pages) { | 1974 | if (nr_good_pages) { |
| 1960 | swap_map[0] = SWAP_MAP_BAD; | 1975 | swap_map[0] = SWAP_MAP_BAD; |
| 1961 | p->max = maxpages; | 1976 | p->max = maxpages; |
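With swap_map now an array of single bytes, the setup above marks the header slot and each in-range bad page as SWAP_MAP_BAD and counts everything else as good, silently ignoring bad pages that lie beyond the clamped maxpages. A standalone sketch of that accounting follows; the SWAP_MAP_BAD value and the sample numbers are illustrative, not quoted from the kernel headers.

/* Sketch of the swap_map setup above: one byte per slot, with slot 0 (the
 * header) and in-range bad pages marked unusable.  All values illustrative. */
#include <stdio.h>
#include <stdlib.h>

#define SWAP_MAP_BAD 0x3f       /* assumed sentinel value for an unusable slot */

int main(void)
{
        unsigned long maxpages = 1024;                /* arch limit, below the header's last_page */
        unsigned int badpages[] = { 7, 100, 5000 };   /* 5000 lies beyond the clamped maxpages */
        unsigned char *swap_map = calloc(maxpages, 1);
        unsigned long nr_good_pages = maxpages - 1;   /* omit the header page */
        size_t i;

        if (!swap_map)
                return 1;
        for (i = 0; i < sizeof(badpages) / sizeof(badpages[0]); i++) {
                unsigned int page_nr = badpages[i];
                if (page_nr < maxpages) {             /* bad pages past the clamp are ignored */
                        swap_map[page_nr] = SWAP_MAP_BAD;
                        nr_good_pages--;
                }
        }
        swap_map[0] = SWAP_MAP_BAD;                   /* header page is never handed out */
        printf("%lu good pages out of %lu\n", nr_good_pages, maxpages);
        free(swap_map);
        return 0;
}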
| @@ -2003,18 +2018,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 2003 | 2018 | ||
| 2004 | /* insert swap space into swap_list: */ | 2019 | /* insert swap space into swap_list: */ |
| 2005 | prev = -1; | 2020 | prev = -1; |
| 2006 | for (i = swap_list.head; i >= 0; i = swap_info[i].next) { | 2021 | for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { |
| 2007 | if (p->prio >= swap_info[i].prio) { | 2022 | if (p->prio >= swap_info[i]->prio) |
| 2008 | break; | 2023 | break; |
| 2009 | } | ||
| 2010 | prev = i; | 2024 | prev = i; |
| 2011 | } | 2025 | } |
| 2012 | p->next = i; | 2026 | p->next = i; |
| 2013 | if (prev < 0) { | 2027 | if (prev < 0) |
| 2014 | swap_list.head = swap_list.next = p - swap_info; | 2028 | swap_list.head = swap_list.next = type; |
| 2015 | } else { | 2029 | else |
| 2016 | swap_info[prev].next = p - swap_info; | 2030 | swap_info[prev]->next = type; |
| 2017 | } | ||
| 2018 | spin_unlock(&swap_lock); | 2031 | spin_unlock(&swap_lock); |
| 2019 | mutex_unlock(&swapon_mutex); | 2032 | mutex_unlock(&swapon_mutex); |
| 2020 | error = 0; | 2033 | error = 0; |
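The insertion loop above threads the new swap area into swap_list by array index rather than by pointer arithmetic: entries are chained through swap_info[i]->next and kept in descending priority order, so the walk stops at the first entry whose priority is not higher than the new one. Below is a trimmed, self-contained version of the same index-linked insertion, keeping only the fields the walk needs and only the head of swap_list.

/* Index-linked, priority-ordered insertion, as in the swapon path above. */
#include <stdio.h>

#define MAX_SWAPFILES 8

struct entry {
        int prio;
        int next;               /* index of the next entry, -1 ends the list */
};

static struct entry tbl[MAX_SWAPFILES];
static int head = -1;           /* stands in for swap_list.head */

static void insert(int type, int prio)
{
        int i, prev = -1;

        tbl[type].prio = prio;
        for (i = head; i >= 0; i = tbl[i].next) {
                if (prio >= tbl[i].prio)        /* stop at the first lower priority */
                        break;
                prev = i;
        }
        tbl[type].next = i;
        if (prev < 0)
                head = type;
        else
                tbl[prev].next = type;
}

int main(void)
{
        int i;

        insert(0, -1);          /* default priorities count downwards */
        insert(1, 5);           /* an explicitly preferred device */
        insert(2, -2);
        for (i = head; i >= 0; i = tbl[i].next)
                printf("type %d, prio %d\n", i, tbl[i].prio);
        return 0;               /* prints types 1, 0, 2: descending priority */
}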
| @@ -2051,15 +2064,15 @@ out: | |||
| 2051 | 2064 | ||
| 2052 | void si_swapinfo(struct sysinfo *val) | 2065 | void si_swapinfo(struct sysinfo *val) |
| 2053 | { | 2066 | { |
| 2054 | unsigned int i; | 2067 | unsigned int type; |
| 2055 | unsigned long nr_to_be_unused = 0; | 2068 | unsigned long nr_to_be_unused = 0; |
| 2056 | 2069 | ||
| 2057 | spin_lock(&swap_lock); | 2070 | spin_lock(&swap_lock); |
| 2058 | for (i = 0; i < nr_swapfiles; i++) { | 2071 | for (type = 0; type < nr_swapfiles; type++) { |
| 2059 | if (!(swap_info[i].flags & SWP_USED) || | 2072 | struct swap_info_struct *si = swap_info[type]; |
| 2060 | (swap_info[i].flags & SWP_WRITEOK)) | 2073 | |
| 2061 | continue; | 2074 | if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) |
| 2062 | nr_to_be_unused += swap_info[i].inuse_pages; | 2075 | nr_to_be_unused += si->inuse_pages; |
| 2063 | } | 2076 | } |
| 2064 | val->freeswap = nr_swap_pages + nr_to_be_unused; | 2077 | val->freeswap = nr_swap_pages + nr_to_be_unused; |
| 2065 | val->totalswap = total_swap_pages + nr_to_be_unused; | 2078 | val->totalswap = total_swap_pages + nr_to_be_unused; |
| @@ -2069,101 +2082,111 @@ void si_swapinfo(struct sysinfo *val) | |||
| 2069 | /* | 2082 | /* |
| 2070 | * Verify that a swap entry is valid and increment its swap map count. | 2083 | * Verify that a swap entry is valid and increment its swap map count. |
| 2071 | * | 2084 | * |
| 2072 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | ||
| 2073 | * "permanent", but will be reclaimed by the next swapoff. | ||
| 2074 | * Returns error code in following case. | 2085 | * Returns error code in following case. |
| 2075 | * - success -> 0 | 2086 | * - success -> 0 |
| 2076 | * - swp_entry is invalid -> EINVAL | 2087 | * - swp_entry is invalid -> EINVAL |
| 2077 | * - swp_entry is migration entry -> EINVAL | 2088 | * - swp_entry is migration entry -> EINVAL |
| 2078 | * - swap-cache reference is requested but there is already one. -> EEXIST | 2089 | * - swap-cache reference is requested but there is already one. -> EEXIST |
| 2079 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | 2090 | * - swap-cache reference is requested but the entry is not used. -> ENOENT |
| 2091 | * - swap-mapped reference requested but needs continued swap count. -> ENOMEM | ||
| 2080 | */ | 2092 | */ |
| 2081 | static int __swap_duplicate(swp_entry_t entry, bool cache) | 2093 | static int __swap_duplicate(swp_entry_t entry, unsigned char usage) |
| 2082 | { | 2094 | { |
| 2083 | struct swap_info_struct * p; | 2095 | struct swap_info_struct *p; |
| 2084 | unsigned long offset, type; | 2096 | unsigned long offset, type; |
| 2085 | int result = -EINVAL; | 2097 | unsigned char count; |
| 2086 | int count; | 2098 | unsigned char has_cache; |
| 2087 | bool has_cache; | 2099 | int err = -EINVAL; |
| 2088 | 2100 | ||
| 2089 | if (non_swap_entry(entry)) | 2101 | if (non_swap_entry(entry)) |
| 2090 | return -EINVAL; | 2102 | goto out; |
| 2091 | 2103 | ||
| 2092 | type = swp_type(entry); | 2104 | type = swp_type(entry); |
| 2093 | if (type >= nr_swapfiles) | 2105 | if (type >= nr_swapfiles) |
| 2094 | goto bad_file; | 2106 | goto bad_file; |
| 2095 | p = type + swap_info; | 2107 | p = swap_info[type]; |
| 2096 | offset = swp_offset(entry); | 2108 | offset = swp_offset(entry); |
| 2097 | 2109 | ||
| 2098 | spin_lock(&swap_lock); | 2110 | spin_lock(&swap_lock); |
| 2099 | |||
| 2100 | if (unlikely(offset >= p->max)) | 2111 | if (unlikely(offset >= p->max)) |
| 2101 | goto unlock_out; | 2112 | goto unlock_out; |
| 2102 | 2113 | ||
| 2103 | count = swap_count(p->swap_map[offset]); | 2114 | count = p->swap_map[offset]; |
| 2104 | has_cache = swap_has_cache(p->swap_map[offset]); | 2115 | has_cache = count & SWAP_HAS_CACHE; |
| 2116 | count &= ~SWAP_HAS_CACHE; | ||
| 2117 | err = 0; | ||
| 2105 | 2118 | ||
| 2106 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | 2119 | if (usage == SWAP_HAS_CACHE) { |
| 2107 | 2120 | ||
| 2108 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | 2121 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ |
| 2109 | if (!has_cache && count) { | 2122 | if (!has_cache && count) |
| 2110 | p->swap_map[offset] = encode_swapmap(count, true); | 2123 | has_cache = SWAP_HAS_CACHE; |
| 2111 | result = 0; | 2124 | else if (has_cache) /* someone else added cache */ |
| 2112 | } else if (has_cache) /* someone added cache */ | 2125 | err = -EEXIST; |
| 2113 | result = -EEXIST; | 2126 | else /* no users remaining */ |
| 2114 | else if (!count) /* no users */ | 2127 | err = -ENOENT; |
| 2115 | result = -ENOENT; | ||
| 2116 | 2128 | ||
| 2117 | } else if (count || has_cache) { | 2129 | } else if (count || has_cache) { |
| 2118 | if (count < SWAP_MAP_MAX - 1) { | 2130 | |
| 2119 | p->swap_map[offset] = encode_swapmap(count + 1, | 2131 | if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) |
| 2120 | has_cache); | 2132 | count += usage; |
| 2121 | result = 0; | 2133 | else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) |
| 2122 | } else if (count <= SWAP_MAP_MAX) { | 2134 | err = -EINVAL; |
| 2123 | if (swap_overflow++ < 5) | 2135 | else if (swap_count_continued(p, offset, count)) |
| 2124 | printk(KERN_WARNING | 2136 | count = COUNT_CONTINUED; |
| 2125 | "swap_dup: swap entry overflow\n"); | 2137 | else |
| 2126 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, | 2138 | err = -ENOMEM; |
| 2127 | has_cache); | ||
| 2128 | result = 0; | ||
| 2129 | } | ||
| 2130 | } else | 2139 | } else |
| 2131 | result = -ENOENT; /* unused swap entry */ | 2140 | err = -ENOENT; /* unused swap entry */ |
| 2141 | |||
| 2142 | p->swap_map[offset] = count | has_cache; | ||
| 2143 | |||
| 2132 | unlock_out: | 2144 | unlock_out: |
| 2133 | spin_unlock(&swap_lock); | 2145 | spin_unlock(&swap_lock); |
| 2134 | out: | 2146 | out: |
| 2135 | return result; | 2147 | return err; |
| 2136 | 2148 | ||
| 2137 | bad_file: | 2149 | bad_file: |
| 2138 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2150 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
| 2139 | goto out; | 2151 | goto out; |
| 2140 | } | 2152 | } |
| 2153 | |||
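__swap_duplicate() above now works on a single byte per entry: the map count sits in the low bits, SWAP_HAS_CACHE records a swap-cache reference, and COUNT_CONTINUED says the count carries on into a continuation page. The little program below walks a few transitions of such a byte to make the encoding concrete; the two flag bit values are assumptions chosen for illustration, not quoted from swap.h.

/* A few transitions of the one-byte swap_map encoding handled above.
 * The flag bit positions are assumptions made for this illustration. */
#include <stdio.h>

#define SWAP_HAS_CACHE  0x40    /* entry also has a swap-cache reference */
#define COUNT_CONTINUED 0x80    /* count continues in a continuation page */

static void show(const char *what, unsigned char entry)
{
        printf("%-24s count=%u cache=%d continued=%d\n", what,
               (unsigned)(entry & ~(SWAP_HAS_CACHE | COUNT_CONTINUED)),
               !!(entry & SWAP_HAS_CACHE), !!(entry & COUNT_CONTINUED));
}

int main(void)
{
        unsigned char entry = 1;        /* one pte references this swap slot */

        show("mapped once", entry);
        entry |= SWAP_HAS_CACHE;        /* swapcache_prepare() succeeded */
        show("plus swap cache", entry);
        entry += 1;                     /* swap_duplicate(): a second pte */
        show("mapped twice, cached", entry);
        entry &= ~SWAP_HAS_CACHE;       /* the swap cache reference went away */
        show("cache reference gone", entry);
        return 0;
}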
| 2154 | /* | ||
| 2155 | * Help swapoff by noting that swap entry belongs to shmem/tmpfs | ||
| 2156 | * (in which case its reference count is never incremented). | ||
| 2157 | */ | ||
| 2158 | void swap_shmem_alloc(swp_entry_t entry) | ||
| 2159 | { | ||
| 2160 | __swap_duplicate(entry, SWAP_MAP_SHMEM); | ||
| 2161 | } | ||
| 2162 | |||
| 2141 | /* | 2163 | /* |
| 2142 | * increase reference count of swap entry by 1. | 2164 | * Increase reference count of swap entry by 1. |
| 2165 | * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required | ||
| 2166 | * but could not be atomically allocated. Returns 0, just as if it succeeded, | ||
| 2167 | * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which | ||
| 2168 | * might occur if a page table entry has got corrupted. | ||
| 2143 | */ | 2169 | */ |
| 2144 | void swap_duplicate(swp_entry_t entry) | 2170 | int swap_duplicate(swp_entry_t entry) |
| 2145 | { | 2171 | { |
| 2146 | __swap_duplicate(entry, SWAP_MAP); | 2172 | int err = 0; |
| 2173 | |||
| 2174 | while (!err && __swap_duplicate(entry, 1) == -ENOMEM) | ||
| 2175 | err = add_swap_count_continuation(entry, GFP_ATOMIC); | ||
| 2176 | return err; | ||
| 2147 | } | 2177 | } |
| 2148 | 2178 | ||
| 2149 | /* | 2179 | /* |
| 2150 | * @entry: swap entry for which we allocate swap cache. | 2180 | * @entry: swap entry for which we allocate swap cache. |
| 2151 | * | 2181 | * |
| 2152 | * Called when allocating swap cache for exising swap entry, | 2182 | * Called when allocating swap cache for existing swap entry, |
| 2153 | * This can return error codes. Returns 0 at success. | 2183 | * This can return error codes. Returns 0 at success. |
| 2154 | * -EBUSY means there is a swap cache. | 2184 | * -EBUSY means there is a swap cache. |
| 2155 | * Note: return code is different from swap_duplicate(). | 2185 | * Note: return code is different from swap_duplicate(). |
| 2156 | */ | 2186 | */ |
| 2157 | int swapcache_prepare(swp_entry_t entry) | 2187 | int swapcache_prepare(swp_entry_t entry) |
| 2158 | { | 2188 | { |
| 2159 | return __swap_duplicate(entry, SWAP_CACHE); | 2189 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
| 2160 | } | ||
| 2161 | |||
| 2162 | |||
| 2163 | struct swap_info_struct * | ||
| 2164 | get_swap_info_struct(unsigned type) | ||
| 2165 | { | ||
| 2166 | return &swap_info[type]; | ||
| 2167 | } | 2190 | } |
| 2168 | 2191 | ||
| 2169 | /* | 2192 | /* |
| @@ -2181,7 +2204,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
| 2181 | if (!our_page_cluster) /* no readahead */ | 2204 | if (!our_page_cluster) /* no readahead */ |
| 2182 | return 0; | 2205 | return 0; |
| 2183 | 2206 | ||
| 2184 | si = &swap_info[swp_type(entry)]; | 2207 | si = swap_info[swp_type(entry)]; |
| 2185 | target = swp_offset(entry); | 2208 | target = swp_offset(entry); |
| 2186 | base = (target >> our_page_cluster) << our_page_cluster; | 2209 | base = (target >> our_page_cluster) << our_page_cluster; |
| 2187 | end = base + (1 << our_page_cluster); | 2210 | end = base + (1 << our_page_cluster); |
| @@ -2217,3 +2240,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
| 2217 | *offset = ++toff; | 2240 | *offset = ++toff; |
| 2218 | return nr_pages? ++nr_pages: 0; | 2241 | return nr_pages? ++nr_pages: 0; |
| 2219 | } | 2242 | } |
| 2243 | |||
| 2244 | /* | ||
| 2245 | * add_swap_count_continuation - called when a swap count is duplicated | ||
| 2246 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | ||
| 2247 | * page of the original vmalloc'ed swap_map, to hold the continuation count | ||
| 2248 | * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called | ||
| 2249 | * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. | ||
| 2250 | * | ||
| 2251 | * These continuation pages are seldom referenced: the common paths all work | ||
| 2252 | * on the original swap_map, only referring to a continuation page when the | ||
| 2253 | * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. | ||
| 2254 | * | ||
| 2255 | * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding | ||
| 2256 | * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) | ||
| 2257 | * can be called after dropping locks. | ||
| 2258 | */ | ||
| 2259 | int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) | ||
| 2260 | { | ||
| 2261 | struct swap_info_struct *si; | ||
| 2262 | struct page *head; | ||
| 2263 | struct page *page; | ||
| 2264 | struct page *list_page; | ||
| 2265 | pgoff_t offset; | ||
| 2266 | unsigned char count; | ||
| 2267 | |||
| 2268 | /* | ||
| 2269 | * When debugging, it's easier to use __GFP_ZERO here; but it's better | ||
| 2270 | * for latency not to zero a page while GFP_ATOMIC and holding locks. | ||
| 2271 | */ | ||
| 2272 | page = alloc_page(gfp_mask | __GFP_HIGHMEM); | ||
| 2273 | |||
| 2274 | si = swap_info_get(entry); | ||
| 2275 | if (!si) { | ||
| 2276 | /* | ||
| 2277 | * An acceptable race has occurred since the failing | ||
| 2278 | * __swap_duplicate(): the swap entry has been freed, | ||
| 2279 | * perhaps even the whole swap_map cleared for swapoff. | ||
| 2280 | */ | ||
| 2281 | goto outer; | ||
| 2282 | } | ||
| 2283 | |||
| 2284 | offset = swp_offset(entry); | ||
| 2285 | count = si->swap_map[offset] & ~SWAP_HAS_CACHE; | ||
| 2286 | |||
| 2287 | if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { | ||
| 2288 | /* | ||
| 2289 | * The higher the swap count, the more likely it is that tasks | ||
| 2290 | * will race to add swap count continuation: we need to avoid | ||
| 2291 | * over-provisioning. | ||
| 2292 | */ | ||
| 2293 | goto out; | ||
| 2294 | } | ||
| 2295 | |||
| 2296 | if (!page) { | ||
| 2297 | spin_unlock(&swap_lock); | ||
| 2298 | return -ENOMEM; | ||
| 2299 | } | ||
| 2300 | |||
| 2301 | /* | ||
| 2302 | * We are fortunate that although vmalloc_to_page uses pte_offset_map, | ||
| 2303 | * no architecture is using highmem pages for kernel pagetables: so it | ||
| 2304 | * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. | ||
| 2305 | */ | ||
| 2306 | head = vmalloc_to_page(si->swap_map + offset); | ||
| 2307 | offset &= ~PAGE_MASK; | ||
| 2308 | |||
| 2309 | /* | ||
| 2310 | * Page allocation does not initialize the page's lru field, | ||
| 2311 | * but it does always reset its private field. | ||
| 2312 | */ | ||
| 2313 | if (!page_private(head)) { | ||
| 2314 | BUG_ON(count & COUNT_CONTINUED); | ||
| 2315 | INIT_LIST_HEAD(&head->lru); | ||
| 2316 | set_page_private(head, SWP_CONTINUED); | ||
| 2317 | si->flags |= SWP_CONTINUED; | ||
| 2318 | } | ||
| 2319 | |||
| 2320 | list_for_each_entry(list_page, &head->lru, lru) { | ||
| 2321 | unsigned char *map; | ||
| 2322 | |||
| 2323 | /* | ||
| 2324 | * If the previous map said no continuation, but we've found | ||
| 2325 | * a continuation page, free our allocation and use this one. | ||
| 2326 | */ | ||
| 2327 | if (!(count & COUNT_CONTINUED)) | ||
| 2328 | goto out; | ||
| 2329 | |||
| 2330 | map = kmap_atomic(list_page, KM_USER0) + offset; | ||
| 2331 | count = *map; | ||
| 2332 | kunmap_atomic(map, KM_USER0); | ||
| 2333 | |||
| 2334 | /* | ||
| 2335 | * If this continuation count now has some space in it, | ||
| 2336 | * free our allocation and use this one. | ||
| 2337 | */ | ||
| 2338 | if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) | ||
| 2339 | goto out; | ||
| 2340 | } | ||
| 2341 | |||
| 2342 | list_add_tail(&page->lru, &head->lru); | ||
| 2343 | page = NULL; /* now it's attached, don't free it */ | ||
| 2344 | out: | ||
| 2345 | spin_unlock(&swap_lock); | ||
| 2346 | outer: | ||
| 2347 | if (page) | ||
| 2348 | __free_page(page); | ||
| 2349 | return 0; | ||
| 2350 | } | ||
| 2351 | |||
| 2352 | /* | ||
| 2353 | * swap_count_continued - when the original swap_map count is incremented | ||
| 2354 | * from SWAP_MAP_MAX, check if there is already a continuation page to carry | ||
| 2355 | * into, carry if so, or else fail until a new continuation page is allocated; | ||
| 2356 | * when the original swap_map count is decremented from 0 with continuation, | ||
| 2357 | * borrow from the continuation and report whether it still holds more. | ||
| 2358 | * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. | ||
| 2359 | */ | ||
| 2360 | static bool swap_count_continued(struct swap_info_struct *si, | ||
| 2361 | pgoff_t offset, unsigned char count) | ||
| 2362 | { | ||
| 2363 | struct page *head; | ||
| 2364 | struct page *page; | ||
| 2365 | unsigned char *map; | ||
| 2366 | |||
| 2367 | head = vmalloc_to_page(si->swap_map + offset); | ||
| 2368 | if (page_private(head) != SWP_CONTINUED) { | ||
| 2369 | BUG_ON(count & COUNT_CONTINUED); | ||
| 2370 | return false; /* need to add count continuation */ | ||
| 2371 | } | ||
| 2372 | |||
| 2373 | offset &= ~PAGE_MASK; | ||
| 2374 | page = list_entry(head->lru.next, struct page, lru); | ||
| 2375 | map = kmap_atomic(page, KM_USER0) + offset; | ||
| 2376 | |||
| 2377 | if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ | ||
| 2378 | goto init_map; /* jump over SWAP_CONT_MAX checks */ | ||
| 2379 | |||
| 2380 | if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ | ||
| 2381 | /* | ||
| 2382 | * Think of how you add 1 to 999 | ||
| 2383 | */ | ||
| 2384 | while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { | ||
| 2385 | kunmap_atomic(map, KM_USER0); | ||
| 2386 | page = list_entry(page->lru.next, struct page, lru); | ||
| 2387 | BUG_ON(page == head); | ||
| 2388 | map = kmap_atomic(page, KM_USER0) + offset; | ||
| 2389 | } | ||
| 2390 | if (*map == SWAP_CONT_MAX) { | ||
| 2391 | kunmap_atomic(map, KM_USER0); | ||
| 2392 | page = list_entry(page->lru.next, struct page, lru); | ||
| 2393 | if (page == head) | ||
| 2394 | return false; /* add count continuation */ | ||
| 2395 | map = kmap_atomic(page, KM_USER0) + offset; | ||
| 2396 | init_map: *map = 0; /* we didn't zero the page */ | ||
| 2397 | } | ||
| 2398 | *map += 1; | ||
| 2399 | kunmap_atomic(map, KM_USER0); | ||
| 2400 | page = list_entry(page->lru.prev, struct page, lru); | ||
| 2401 | while (page != head) { | ||
| 2402 | map = kmap_atomic(page, KM_USER0) + offset; | ||
| 2403 | *map = COUNT_CONTINUED; | ||
| 2404 | kunmap_atomic(map, KM_USER0); | ||
| 2405 | page = list_entry(page->lru.prev, struct page, lru); | ||
| 2406 | } | ||
| 2407 | return true; /* incremented */ | ||
| 2408 | |||
| 2409 | } else { /* decrementing */ | ||
| 2410 | /* | ||
| 2411 | * Think of how you subtract 1 from 1000 | ||
| 2412 | */ | ||
| 2413 | BUG_ON(count != COUNT_CONTINUED); | ||
| 2414 | while (*map == COUNT_CONTINUED) { | ||
| 2415 | kunmap_atomic(map, KM_USER0); | ||
| 2416 | page = list_entry(page->lru.next, struct page, lru); | ||
| 2417 | BUG_ON(page == head); | ||
| 2418 | map = kmap_atomic(page, KM_USER0) + offset; | ||
| 2419 | } | ||
| 2420 | BUG_ON(*map == 0); | ||
| 2421 | *map -= 1; | ||
| 2422 | if (*map == 0) | ||
| 2423 | count = 0; | ||
| 2424 | kunmap_atomic(map, KM_USER0); | ||
| 2425 | page = list_entry(page->lru.prev, struct page, lru); | ||
| 2426 | while (page != head) { | ||
| 2427 | map = kmap_atomic(page, KM_USER0) + offset; | ||
| 2428 | *map = SWAP_CONT_MAX | count; | ||
| 2429 | count = COUNT_CONTINUED; | ||
| 2430 | kunmap_atomic(map, KM_USER0); | ||
| 2431 | page = list_entry(page->lru.prev, struct page, lru); | ||
| 2432 | } | ||
| 2433 | return count == COUNT_CONTINUED; | ||
| 2434 | } | ||
| 2435 | } | ||
| 2436 | |||
| 2437 | /* | ||
| 2438 | * free_swap_count_continuations - swapoff free all the continuation pages | ||
| 2439 | * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. | ||
| 2440 | */ | ||
| 2441 | static void free_swap_count_continuations(struct swap_info_struct *si) | ||
| 2442 | { | ||
| 2443 | pgoff_t offset; | ||
| 2444 | |||
| 2445 | for (offset = 0; offset < si->max; offset += PAGE_SIZE) { | ||
| 2446 | struct page *head; | ||
| 2447 | head = vmalloc_to_page(si->swap_map + offset); | ||
| 2448 | if (page_private(head)) { | ||
| 2449 | struct list_head *this, *next; | ||
| 2450 | list_for_each_safe(this, next, &head->lru) { | ||
| 2451 | struct page *page; | ||
| 2452 | page = list_entry(this, struct page, lru); | ||
| 2453 | list_del(this); | ||
| 2454 | __free_page(page); | ||
| 2455 | } | ||
| 2456 | } | ||
| 2457 | } | ||
| 2458 | } | ||
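The carry and borrow walks in swap_count_continued() above ("add 1 to 999", "subtract 1 from 1000") are ordinary mixed-radix arithmetic: the swap_map byte is the lowest digit (0..SWAP_MAP_MAX), each continuation page contributes one higher digit (0..SWAP_CONT_MAX), and COUNT_CONTINUED flags that a higher digit exists. The user-space model below flattens the digits into an array to show the same arithmetic; the radices are shrunk to 10 for readability, and the -1 returns stand in for the points where the kernel would need add_swap_count_continuation().

/* Flat model of the mixed-radix count: digit[0] is the swap_map byte,
 * digit[1..] are continuation pages.  Radices shrunk to 10 for readability;
 * the kernel's SWAP_MAP_MAX and SWAP_CONT_MAX are larger. */
#include <stdio.h>

#define SWAP_MAP_MAX    9       /* illustrative low-digit maximum */
#define SWAP_CONT_MAX   9       /* illustrative continuation-digit maximum */
#define NDIGITS         4

static unsigned int digit[NDIGITS];

static int increment(void)
{
        int i, j;

        if (digit[0] < SWAP_MAP_MAX) {
                digit[0]++;
                return 0;
        }
        /* "add 1 to 999": find the first digit that can absorb the carry */
        for (i = 1; i < NDIGITS && digit[i] == SWAP_CONT_MAX; i++)
                ;
        if (i == NDIGITS)
                return -1;      /* kernel: add_swap_count_continuation() needed */
        digit[i]++;
        for (j = 0; j < i; j++)
                digit[j] = 0;
        return 0;
}

static int decrement(void)
{
        int i, j;

        if (digit[0] > 0) {
                digit[0]--;
                return 0;
        }
        /* "subtract 1 from 1000": borrow from the first non-zero digit */
        for (i = 1; i < NDIGITS && digit[i] == 0; i++)
                ;
        if (i == NDIGITS)
                return -1;      /* the count was already zero */
        digit[i]--;
        digit[0] = SWAP_MAP_MAX;
        for (j = 1; j < i; j++)
                digit[j] = SWAP_CONT_MAX;
        return 0;
}

static unsigned long value(void)
{
        unsigned long v = digit[0], scale = SWAP_MAP_MAX + 1;
        int i;

        for (i = 1; i < NDIGITS; i++) {
                v += digit[i] * scale;
                scale *= SWAP_CONT_MAX + 1;
        }
        return v;
}

int main(void)
{
        int i;

        for (i = 0; i < 1000; i++)
                increment();
        printf("after 1000 increments: %lu\n", value());
        for (i = 0; i < 1000; i++)
                decrement();
        printf("after 1000 decrements: %lu\n", value());
        return 0;
}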
diff --git a/mm/truncate.c b/mm/truncate.c index 450cebdabfc0..e87e37244829 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
| 272 | pagevec_release(&pvec); | 272 | pagevec_release(&pvec); |
| 273 | break; | 273 | break; |
| 274 | } | 274 | } |
| 275 | mem_cgroup_uncharge_start(); | ||
| 275 | for (i = 0; i < pagevec_count(&pvec); i++) { | 276 | for (i = 0; i < pagevec_count(&pvec); i++) { |
| 276 | struct page *page = pvec.pages[i]; | 277 | struct page *page = pvec.pages[i]; |
| 277 | 278 | ||
| @@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
| 286 | unlock_page(page); | 287 | unlock_page(page); |
| 287 | } | 288 | } |
| 288 | pagevec_release(&pvec); | 289 | pagevec_release(&pvec); |
| 290 | mem_cgroup_uncharge_end(); | ||
| 289 | } | 291 | } |
| 290 | } | 292 | } |
| 291 | EXPORT_SYMBOL(truncate_inode_pages_range); | 293 | EXPORT_SYMBOL(truncate_inode_pages_range); |
| @@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
| 327 | pagevec_init(&pvec, 0); | 329 | pagevec_init(&pvec, 0); |
| 328 | while (next <= end && | 330 | while (next <= end && |
| 329 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 331 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
| 332 | mem_cgroup_uncharge_start(); | ||
| 330 | for (i = 0; i < pagevec_count(&pvec); i++) { | 333 | for (i = 0; i < pagevec_count(&pvec); i++) { |
| 331 | struct page *page = pvec.pages[i]; | 334 | struct page *page = pvec.pages[i]; |
| 332 | pgoff_t index; | 335 | pgoff_t index; |
| @@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
| 354 | break; | 357 | break; |
| 355 | } | 358 | } |
| 356 | pagevec_release(&pvec); | 359 | pagevec_release(&pvec); |
| 360 | mem_cgroup_uncharge_end(); | ||
| 357 | cond_resched(); | 361 | cond_resched(); |
| 358 | } | 362 | } |
| 359 | return ret; | 363 | return ret; |
| @@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
| 428 | while (next <= end && !wrapped && | 432 | while (next <= end && !wrapped && |
| 429 | pagevec_lookup(&pvec, mapping, next, | 433 | pagevec_lookup(&pvec, mapping, next, |
| 430 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 434 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
| 435 | mem_cgroup_uncharge_start(); | ||
| 431 | for (i = 0; i < pagevec_count(&pvec); i++) { | 436 | for (i = 0; i < pagevec_count(&pvec); i++) { |
| 432 | struct page *page = pvec.pages[i]; | 437 | struct page *page = pvec.pages[i]; |
| 433 | pgoff_t page_index; | 438 | pgoff_t page_index; |
| @@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
| 477 | unlock_page(page); | 482 | unlock_page(page); |
| 478 | } | 483 | } |
| 479 | pagevec_release(&pvec); | 484 | pagevec_release(&pvec); |
| 485 | mem_cgroup_uncharge_end(); | ||
| 480 | cond_resched(); | 486 | cond_resched(); |
| 481 | } | 487 | } |
| 482 | return ret; | 488 | return ret; |
| @@ -490,7 +496,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); | |||
| 490 | * Any pages which are found to be mapped into pagetables are unmapped prior to | 496 | * Any pages which are found to be mapped into pagetables are unmapped prior to |
| 491 | * invalidation. | 497 | * invalidation. |
| 492 | * | 498 | * |
| 493 | * Returns -EIO if any pages could not be invalidated. | 499 | * Returns -EBUSY if any pages could not be invalidated. |
| 494 | */ | 500 | */ |
| 495 | int invalidate_inode_pages2(struct address_space *mapping) | 501 | int invalidate_inode_pages2(struct address_space *mapping) |
| 496 | { | 502 | { |
| @@ -516,22 +522,20 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | |||
| 516 | */ | 522 | */ |
| 517 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | 523 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) |
| 518 | { | 524 | { |
| 519 | if (new < old) { | 525 | struct address_space *mapping = inode->i_mapping; |
| 520 | struct address_space *mapping = inode->i_mapping; | 526 | |
| 521 | 527 | /* | |
| 522 | /* | 528 | * unmap_mapping_range is called twice, first simply for |
| 523 | * unmap_mapping_range is called twice, first simply for | 529 | * efficiency so that truncate_inode_pages does fewer |
| 524 | * efficiency so that truncate_inode_pages does fewer | 530 | * single-page unmaps. However after this first call, and |
| 525 | * single-page unmaps. However after this first call, and | 531 | * before truncate_inode_pages finishes, it is possible for |
| 526 | * before truncate_inode_pages finishes, it is possible for | 532 | * private pages to be COWed, which remain after |
| 527 | * private pages to be COWed, which remain after | 533 | * truncate_inode_pages finishes, hence the second |
| 528 | * truncate_inode_pages finishes, hence the second | 534 | * unmap_mapping_range call must be made for correctness. |
| 529 | * unmap_mapping_range call must be made for correctness. | 535 | */ |
| 530 | */ | 536 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); |
| 531 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | 537 | truncate_inode_pages(mapping, new); |
| 532 | truncate_inode_pages(mapping, new); | 538 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); |
| 533 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | ||
| 534 | } | ||
| 535 | } | 539 | } |
| 536 | EXPORT_SYMBOL(truncate_pagecache); | 540 | EXPORT_SYMBOL(truncate_pagecache); |
| 537 | 541 | ||
| @@ -220,7 +220,7 @@ char *strndup_user(const char __user *s, long n) | |||
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| 220 | } | 220 | } |
| 221 | EXPORT_SYMBOL(strndup_user); | 221 | EXPORT_SYMBOL(strndup_user); |
| 222 | 222 | ||
| 223 | #ifndef HAVE_ARCH_PICK_MMAP_LAYOUT | 223 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
| 224 | void arch_pick_mmap_layout(struct mm_struct *mm) | 224 | void arch_pick_mmap_layout(struct mm_struct *mm) |
| 225 | { | 225 | { |
| 226 | mm->mmap_base = TASK_UNMAPPED_BASE; | 226 | mm->mmap_base = TASK_UNMAPPED_BASE; |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0f551a4a44cd..ae007462b7f6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void) | |||
| 509 | 509 | ||
| 510 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | 510 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); |
| 511 | 511 | ||
| 512 | /* for per-CPU blocks */ | ||
| 513 | static void purge_fragmented_blocks_allcpus(void); | ||
| 514 | |||
| 512 | /* | 515 | /* |
| 513 | * Purges all lazily-freed vmap areas. | 516 | * Purges all lazily-freed vmap areas. |
| 514 | * | 517 | * |
| @@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
| 539 | } else | 542 | } else |
| 540 | spin_lock(&purge_lock); | 543 | spin_lock(&purge_lock); |
| 541 | 544 | ||
| 545 | if (sync) | ||
| 546 | purge_fragmented_blocks_allcpus(); | ||
| 547 | |||
| 542 | rcu_read_lock(); | 548 | rcu_read_lock(); |
| 543 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | 549 | list_for_each_entry_rcu(va, &vmap_area_list, list) { |
| 544 | if (va->flags & VM_LAZY_FREE) { | 550 | if (va->flags & VM_LAZY_FREE) { |
| @@ -555,10 +561,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | |||
| 555 | } | 561 | } |
| 556 | rcu_read_unlock(); | 562 | rcu_read_unlock(); |
| 557 | 563 | ||
| 558 | if (nr) { | 564 | if (nr) |
| 559 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
| 560 | atomic_sub(nr, &vmap_lazy_nr); | 565 | atomic_sub(nr, &vmap_lazy_nr); |
| 561 | } | ||
| 562 | 566 | ||
| 563 | if (nr || force_flush) | 567 | if (nr || force_flush) |
| 564 | flush_tlb_kernel_range(*start, *end); | 568 | flush_tlb_kernel_range(*start, *end); |
| @@ -669,8 +673,6 @@ static bool vmap_initialized __read_mostly = false; | |||
| 669 | struct vmap_block_queue { | 673 | struct vmap_block_queue { |
| 670 | spinlock_t lock; | 674 | spinlock_t lock; |
| 671 | struct list_head free; | 675 | struct list_head free; |
| 672 | struct list_head dirty; | ||
| 673 | unsigned int nr_dirty; | ||
| 674 | }; | 676 | }; |
| 675 | 677 | ||
| 676 | struct vmap_block { | 678 | struct vmap_block { |
| @@ -680,10 +682,9 @@ struct vmap_block { | |||
| 680 | unsigned long free, dirty; | 682 | unsigned long free, dirty; |
| 681 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | 683 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); |
| 682 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 684 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); |
| 683 | union { | 685 | struct list_head free_list; |
| 684 | struct list_head free_list; | 686 | struct rcu_head rcu_head; |
| 685 | struct rcu_head rcu_head; | 687 | struct list_head purge; |
| 686 | }; | ||
| 687 | }; | 688 | }; |
| 688 | 689 | ||
| 689 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | 690 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ |
| @@ -759,9 +760,9 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
| 759 | vbq = &get_cpu_var(vmap_block_queue); | 760 | vbq = &get_cpu_var(vmap_block_queue); |
| 760 | vb->vbq = vbq; | 761 | vb->vbq = vbq; |
| 761 | spin_lock(&vbq->lock); | 762 | spin_lock(&vbq->lock); |
| 762 | list_add(&vb->free_list, &vbq->free); | 763 | list_add_rcu(&vb->free_list, &vbq->free); |
| 763 | spin_unlock(&vbq->lock); | 764 | spin_unlock(&vbq->lock); |
| 764 | put_cpu_var(vmap_cpu_blocks); | 765 | put_cpu_var(vmap_block_queue); |
| 765 | 766 | ||
| 766 | return vb; | 767 | return vb; |
| 767 | } | 768 | } |
| @@ -778,8 +779,6 @@ static void free_vmap_block(struct vmap_block *vb) | |||
| 778 | struct vmap_block *tmp; | 779 | struct vmap_block *tmp; |
| 779 | unsigned long vb_idx; | 780 | unsigned long vb_idx; |
| 780 | 781 | ||
| 781 | BUG_ON(!list_empty(&vb->free_list)); | ||
| 782 | |||
| 783 | vb_idx = addr_to_vb_idx(vb->va->va_start); | 782 | vb_idx = addr_to_vb_idx(vb->va->va_start); |
| 784 | spin_lock(&vmap_block_tree_lock); | 783 | spin_lock(&vmap_block_tree_lock); |
| 785 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | 784 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); |
| @@ -790,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb) | |||
| 790 | call_rcu(&vb->rcu_head, rcu_free_vb); | 789 | call_rcu(&vb->rcu_head, rcu_free_vb); |
| 791 | } | 790 | } |
| 792 | 791 | ||
| 792 | static void purge_fragmented_blocks(int cpu) | ||
| 793 | { | ||
| 794 | LIST_HEAD(purge); | ||
| 795 | struct vmap_block *vb; | ||
| 796 | struct vmap_block *n_vb; | ||
| 797 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
| 798 | |||
| 799 | rcu_read_lock(); | ||
| 800 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
| 801 | |||
| 802 | if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) | ||
| 803 | continue; | ||
| 804 | |||
| 805 | spin_lock(&vb->lock); | ||
| 806 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { | ||
| 807 | vb->free = 0; /* prevent further allocs after releasing lock */ | ||
| 808 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ | ||
| 809 | bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); | ||
| 810 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); | ||
| 811 | spin_lock(&vbq->lock); | ||
| 812 | list_del_rcu(&vb->free_list); | ||
| 813 | spin_unlock(&vbq->lock); | ||
| 814 | spin_unlock(&vb->lock); | ||
| 815 | list_add_tail(&vb->purge, &purge); | ||
| 816 | } else | ||
| 817 | spin_unlock(&vb->lock); | ||
| 818 | } | ||
| 819 | rcu_read_unlock(); | ||
| 820 | |||
| 821 | list_for_each_entry_safe(vb, n_vb, &purge, purge) { | ||
| 822 | list_del(&vb->purge); | ||
| 823 | free_vmap_block(vb); | ||
| 824 | } | ||
| 825 | } | ||
| 826 | |||
| 827 | static void purge_fragmented_blocks_thiscpu(void) | ||
| 828 | { | ||
| 829 | purge_fragmented_blocks(smp_processor_id()); | ||
| 830 | } | ||
| 831 | |||
| 832 | static void purge_fragmented_blocks_allcpus(void) | ||
| 833 | { | ||
| 834 | int cpu; | ||
| 835 | |||
| 836 | for_each_possible_cpu(cpu) | ||
| 837 | purge_fragmented_blocks(cpu); | ||
| 838 | } | ||
| 839 | |||
| 793 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | 840 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) |
| 794 | { | 841 | { |
| 795 | struct vmap_block_queue *vbq; | 842 | struct vmap_block_queue *vbq; |
| 796 | struct vmap_block *vb; | 843 | struct vmap_block *vb; |
| 797 | unsigned long addr = 0; | 844 | unsigned long addr = 0; |
| 798 | unsigned int order; | 845 | unsigned int order; |
| 846 | int purge = 0; | ||
| 799 | 847 | ||
| 800 | BUG_ON(size & ~PAGE_MASK); | 848 | BUG_ON(size & ~PAGE_MASK); |
| 801 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 849 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
| @@ -808,25 +856,39 @@ again: | |||
| 808 | int i; | 856 | int i; |
| 809 | 857 | ||
| 810 | spin_lock(&vb->lock); | 858 | spin_lock(&vb->lock); |
| 859 | if (vb->free < 1UL << order) | ||
| 860 | goto next; | ||
| 861 | |||
| 811 | i = bitmap_find_free_region(vb->alloc_map, | 862 | i = bitmap_find_free_region(vb->alloc_map, |
| 812 | VMAP_BBMAP_BITS, order); | 863 | VMAP_BBMAP_BITS, order); |
| 813 | 864 | ||
| 814 | if (i >= 0) { | 865 | if (i < 0) { |
| 815 | addr = vb->va->va_start + (i << PAGE_SHIFT); | 866 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { |
| 816 | BUG_ON(addr_to_vb_idx(addr) != | 867 | /* fragmented and no outstanding allocations */ |
| 817 | addr_to_vb_idx(vb->va->va_start)); | 868 | BUG_ON(vb->dirty != VMAP_BBMAP_BITS); |
| 818 | vb->free -= 1UL << order; | 869 | purge = 1; |
| 819 | if (vb->free == 0) { | ||
| 820 | spin_lock(&vbq->lock); | ||
| 821 | list_del_init(&vb->free_list); | ||
| 822 | spin_unlock(&vbq->lock); | ||
| 823 | } | 870 | } |
| 824 | spin_unlock(&vb->lock); | 871 | goto next; |
| 825 | break; | 872 | } |
| 873 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
| 874 | BUG_ON(addr_to_vb_idx(addr) != | ||
| 875 | addr_to_vb_idx(vb->va->va_start)); | ||
| 876 | vb->free -= 1UL << order; | ||
| 877 | if (vb->free == 0) { | ||
| 878 | spin_lock(&vbq->lock); | ||
| 879 | list_del_rcu(&vb->free_list); | ||
| 880 | spin_unlock(&vbq->lock); | ||
| 826 | } | 881 | } |
| 827 | spin_unlock(&vb->lock); | 882 | spin_unlock(&vb->lock); |
| 883 | break; | ||
| 884 | next: | ||
| 885 | spin_unlock(&vb->lock); | ||
| 828 | } | 886 | } |
| 829 | put_cpu_var(vmap_cpu_blocks); | 887 | |
| 888 | if (purge) | ||
| 889 | purge_fragmented_blocks_thiscpu(); | ||
| 890 | |||
| 891 | put_cpu_var(vmap_block_queue); | ||
| 830 | rcu_read_unlock(); | 892 | rcu_read_unlock(); |
| 831 | 893 | ||
| 832 | if (!addr) { | 894 | if (!addr) { |
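purge_fragmented_blocks() above reclaims a vmap block only when all of its space is either still free or already dirty, but not when it is entirely dirty, because a fully dirty block is freed by vb_free() itself; vb_alloc() requests a purge when it runs into such a block while searching for space. The tiny program below just evaluates that predicate for a few states (the block size is illustrative); note the kernel checks it twice, optimistically under RCU and again under vb->lock.

/* The purge test used above: a block is purgeable when none of its space
 * is live (everything free or dirty) but it is not entirely dirty, since
 * a fully dirty block is already freed by vb_free(). */
#include <stdio.h>

#define VMAP_BBMAP_BITS 1024    /* illustrative block size, in pages */

static int purgeable(unsigned long free, unsigned long dirty)
{
        return free + dirty == VMAP_BBMAP_BITS && dirty != VMAP_BBMAP_BITS;
}

int main(void)
{
        printf("half free, half dirty : %d\n", purgeable(512, 512));  /* 1: fragmented */
        printf("entirely dirty        : %d\n", purgeable(0, 1024));   /* 0: vb_free() case */
        printf("live allocations left : %d\n", purgeable(200, 200));  /* 0: still in use */
        printf("never used at all     : %d\n", purgeable(1024, 0));   /* 1: nothing live */
        return 0;
}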
| @@ -862,11 +924,11 @@ static void vb_free(const void *addr, unsigned long size) | |||
| 862 | BUG_ON(!vb); | 924 | BUG_ON(!vb); |
| 863 | 925 | ||
| 864 | spin_lock(&vb->lock); | 926 | spin_lock(&vb->lock); |
| 865 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | 927 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); |
| 866 | 928 | ||
| 867 | vb->dirty += 1UL << order; | 929 | vb->dirty += 1UL << order; |
| 868 | if (vb->dirty == VMAP_BBMAP_BITS) { | 930 | if (vb->dirty == VMAP_BBMAP_BITS) { |
| 869 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | 931 | BUG_ON(vb->free); |
| 870 | spin_unlock(&vb->lock); | 932 | spin_unlock(&vb->lock); |
| 871 | free_vmap_block(vb); | 933 | free_vmap_block(vb); |
| 872 | } else | 934 | } else |
| @@ -1035,8 +1097,6 @@ void __init vmalloc_init(void) | |||
| 1035 | vbq = &per_cpu(vmap_block_queue, i); | 1097 | vbq = &per_cpu(vmap_block_queue, i); |
| 1036 | spin_lock_init(&vbq->lock); | 1098 | spin_lock_init(&vbq->lock); |
| 1037 | INIT_LIST_HEAD(&vbq->free); | 1099 | INIT_LIST_HEAD(&vbq->free); |
| 1038 | INIT_LIST_HEAD(&vbq->dirty); | ||
| 1039 | vbq->nr_dirty = 0; | ||
| 1040 | } | 1100 | } |
| 1041 | 1101 | ||
| 1042 | /* Import existing vmlist entries. */ | 1102 | /* Import existing vmlist entries. */ |
| @@ -1411,6 +1471,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
| 1411 | { | 1471 | { |
| 1412 | struct page **pages; | 1472 | struct page **pages; |
| 1413 | unsigned int nr_pages, array_size, i; | 1473 | unsigned int nr_pages, array_size, i; |
| 1474 | gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; | ||
| 1414 | 1475 | ||
| 1415 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; | 1476 | nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; |
| 1416 | array_size = (nr_pages * sizeof(struct page *)); | 1477 | array_size = (nr_pages * sizeof(struct page *)); |
| @@ -1418,13 +1479,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
| 1418 | area->nr_pages = nr_pages; | 1479 | area->nr_pages = nr_pages; |
| 1419 | /* Please note that the recursion is strictly bounded. */ | 1480 | /* Please note that the recursion is strictly bounded. */ |
| 1420 | if (array_size > PAGE_SIZE) { | 1481 | if (array_size > PAGE_SIZE) { |
| 1421 | pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, | 1482 | pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, |
| 1422 | PAGE_KERNEL, node, caller); | 1483 | PAGE_KERNEL, node, caller); |
| 1423 | area->flags |= VM_VPAGES; | 1484 | area->flags |= VM_VPAGES; |
| 1424 | } else { | 1485 | } else { |
| 1425 | pages = kmalloc_node(array_size, | 1486 | pages = kmalloc_node(array_size, nested_gfp, node); |
| 1426 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, | ||
| 1427 | node); | ||
| 1428 | } | 1487 | } |
| 1429 | area->pages = pages; | 1488 | area->pages = pages; |
| 1430 | area->caller = caller; | 1489 | area->caller = caller; |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 777af57fd8c8..79c809895fba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -55,6 +55,11 @@ struct scan_control { | |||
| 55 | /* Number of pages freed so far during a call to shrink_zones() */ | 55 | /* Number of pages freed so far during a call to shrink_zones() */ |
| 56 | unsigned long nr_reclaimed; | 56 | unsigned long nr_reclaimed; |
| 57 | 57 | ||
| 58 | /* How many pages shrink_list() should reclaim */ | ||
| 59 | unsigned long nr_to_reclaim; | ||
| 60 | |||
| 61 | unsigned long hibernation_mode; | ||
| 62 | |||
| 58 | /* This context's GFP mask */ | 63 | /* This context's GFP mask */ |
| 59 | gfp_t gfp_mask; | 64 | gfp_t gfp_mask; |
| 60 | 65 | ||
| @@ -66,12 +71,6 @@ struct scan_control { | |||
| 66 | /* Can pages be swapped as part of reclaim? */ | 71 | /* Can pages be swapped as part of reclaim? */ |
| 67 | int may_swap; | 72 | int may_swap; |
| 68 | 73 | ||
| 69 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | ||
| 70 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | ||
| 71 | * In this context, it doesn't matter that we scan the | ||
| 72 | * whole list at once. */ | ||
| 73 | int swap_cluster_max; | ||
| 74 | |||
| 75 | int swappiness; | 74 | int swappiness; |
| 76 | 75 | ||
| 77 | int all_unreclaimable; | 76 | int all_unreclaimable; |
| @@ -263,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
| 263 | return ret; | 262 | return ret; |
| 264 | } | 263 | } |
| 265 | 264 | ||
| 266 | /* Called without lock on whether page is mapped, so answer is unstable */ | ||
| 267 | static inline int page_mapping_inuse(struct page *page) | ||
| 268 | { | ||
| 269 | struct address_space *mapping; | ||
| 270 | |||
| 271 | /* Page is in somebody's page tables. */ | ||
| 272 | if (page_mapped(page)) | ||
| 273 | return 1; | ||
| 274 | |||
| 275 | /* Be more reluctant to reclaim swapcache than pagecache */ | ||
| 276 | if (PageSwapCache(page)) | ||
| 277 | return 1; | ||
| 278 | |||
| 279 | mapping = page_mapping(page); | ||
| 280 | if (!mapping) | ||
| 281 | return 0; | ||
| 282 | |||
| 283 | /* File is mmap'd by somebody? */ | ||
| 284 | return mapping_mapped(mapping); | ||
| 285 | } | ||
| 286 | |||
| 287 | static inline int is_page_cache_freeable(struct page *page) | 265 | static inline int is_page_cache_freeable(struct page *page) |
| 288 | { | 266 | { |
| 289 | /* | 267 | /* |
| @@ -358,7 +336,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 358 | * stalls if we need to run get_block(). We could test | 336 | * stalls if we need to run get_block(). We could test |
| 359 | * PagePrivate for that. | 337 | * PagePrivate for that. |
| 360 | * | 338 | * |
| 361 | * If this process is currently in generic_file_write() against | 339 | * If this process is currently in __generic_file_aio_write() against |
| 362 | * this page's queue, we can perform writeback even if that | 340 | * this page's queue, we can perform writeback even if that |
| 363 | * will block. | 341 | * will block. |
| 364 | * | 342 | * |
| @@ -580,6 +558,65 @@ redo: | |||
| 580 | put_page(page); /* drop ref from isolate */ | 558 | put_page(page); /* drop ref from isolate */ |
| 581 | } | 559 | } |
| 582 | 560 | ||
| 561 | enum page_references { | ||
| 562 | PAGEREF_RECLAIM, | ||
| 563 | PAGEREF_RECLAIM_CLEAN, | ||
| 564 | PAGEREF_KEEP, | ||
| 565 | PAGEREF_ACTIVATE, | ||
| 566 | }; | ||
| 567 | |||
| 568 | static enum page_references page_check_references(struct page *page, | ||
| 569 | struct scan_control *sc) | ||
| 570 | { | ||
| 571 | int referenced_ptes, referenced_page; | ||
| 572 | unsigned long vm_flags; | ||
| 573 | |||
| 574 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); | ||
| 575 | referenced_page = TestClearPageReferenced(page); | ||
| 576 | |||
| 577 | /* Lumpy reclaim - ignore references */ | ||
| 578 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 579 | return PAGEREF_RECLAIM; | ||
| 580 | |||
| 581 | /* | ||
| 582 | * Mlock lost the isolation race with us. Let try_to_unmap() | ||
| 583 | * move the page to the unevictable list. | ||
| 584 | */ | ||
| 585 | if (vm_flags & VM_LOCKED) | ||
| 586 | return PAGEREF_RECLAIM; | ||
| 587 | |||
| 588 | if (referenced_ptes) { | ||
| 589 | if (PageAnon(page)) | ||
| 590 | return PAGEREF_ACTIVATE; | ||
| 591 | /* | ||
| 592 | * All mapped pages start out with page table | ||
| 593 | * references from the instantiating fault, so we need | ||
| 594 | * to look twice if a mapped file page is used more | ||
| 595 | * than once. | ||
| 596 | * | ||
| 597 | * Mark it and spare it for another trip around the | ||
| 598 | * inactive list. Another page table reference will | ||
| 599 | * lead to its activation. | ||
| 600 | * | ||
| 601 | * Note: the mark is set for activated pages as well | ||
| 602 | * so that recently deactivated but used pages are | ||
| 603 | * quickly recovered. | ||
| 604 | */ | ||
| 605 | SetPageReferenced(page); | ||
| 606 | |||
| 607 | if (referenced_page) | ||
| 608 | return PAGEREF_ACTIVATE; | ||
| 609 | |||
| 610 | return PAGEREF_KEEP; | ||
| 611 | } | ||
| 612 | |||
| 613 | /* Reclaim if clean, defer dirty pages to writeback */ | ||
| 614 | if (referenced_page) | ||
| 615 | return PAGEREF_RECLAIM_CLEAN; | ||
| 616 | |||
| 617 | return PAGEREF_RECLAIM; | ||
| 618 | } | ||
| 619 | |||
| 583 | /* | 620 | /* |
| 584 | * shrink_page_list() returns the number of reclaimed pages | 621 | * shrink_page_list() returns the number of reclaimed pages |
| 585 | */ | 622 | */ |
| @@ -591,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 591 | struct pagevec freed_pvec; | 628 | struct pagevec freed_pvec; |
| 592 | int pgactivate = 0; | 629 | int pgactivate = 0; |
| 593 | unsigned long nr_reclaimed = 0; | 630 | unsigned long nr_reclaimed = 0; |
| 594 | unsigned long vm_flags; | ||
| 595 | 631 | ||
| 596 | cond_resched(); | 632 | cond_resched(); |
| 597 | 633 | ||
| 598 | pagevec_init(&freed_pvec, 1); | 634 | pagevec_init(&freed_pvec, 1); |
| 599 | while (!list_empty(page_list)) { | 635 | while (!list_empty(page_list)) { |
| 636 | enum page_references references; | ||
| 600 | struct address_space *mapping; | 637 | struct address_space *mapping; |
| 601 | struct page *page; | 638 | struct page *page; |
| 602 | int may_enter_fs; | 639 | int may_enter_fs; |
| 603 | int referenced; | ||
| 604 | 640 | ||
| 605 | cond_resched(); | 641 | cond_resched(); |
| 606 | 642 | ||
| @@ -642,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 642 | goto keep_locked; | 678 | goto keep_locked; |
| 643 | } | 679 | } |
| 644 | 680 | ||
| 645 | referenced = page_referenced(page, 1, | 681 | references = page_check_references(page, sc); |
| 646 | sc->mem_cgroup, &vm_flags); | 682 | switch (references) { |
| 647 | /* | 683 | case PAGEREF_ACTIVATE: |
| 648 | * In active use or really unfreeable? Activate it. | ||
| 649 | * If page which have PG_mlocked lost isoltation race, | ||
| 650 | * try_to_unmap moves it to unevictable list | ||
| 651 | */ | ||
| 652 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | ||
| 653 | referenced && page_mapping_inuse(page) | ||
| 654 | && !(vm_flags & VM_LOCKED)) | ||
| 655 | goto activate_locked; | 684 | goto activate_locked; |
| 685 | case PAGEREF_KEEP: | ||
| 686 | goto keep_locked; | ||
| 687 | case PAGEREF_RECLAIM: | ||
| 688 | case PAGEREF_RECLAIM_CLEAN: | ||
| 689 | ; /* try to reclaim the page below */ | ||
| 690 | } | ||
| 656 | 691 | ||
| 657 | /* | 692 | /* |
| 658 | * Anonymous process memory has backing store? | 693 | * Anonymous process memory has backing store? |
| @@ -686,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 686 | } | 721 | } |
| 687 | 722 | ||
| 688 | if (PageDirty(page)) { | 723 | if (PageDirty(page)) { |
| 689 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) | 724 | if (references == PAGEREF_RECLAIM_CLEAN) |
| 690 | goto keep_locked; | 725 | goto keep_locked; |
| 691 | if (!may_enter_fs) | 726 | if (!may_enter_fs) |
| 692 | goto keep_locked; | 727 | goto keep_locked; |
| @@ -1132,7 +1167,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1132 | unsigned long nr_anon; | 1167 | unsigned long nr_anon; |
| 1133 | unsigned long nr_file; | 1168 | unsigned long nr_file; |
| 1134 | 1169 | ||
| 1135 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1170 | nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, |
| 1136 | &page_list, &nr_scan, sc->order, mode, | 1171 | &page_list, &nr_scan, sc->order, mode, |
| 1137 | zone, sc->mem_cgroup, 0, file); | 1172 | zone, sc->mem_cgroup, 0, file); |
| 1138 | 1173 | ||
| @@ -1166,10 +1201,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1166 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | 1201 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); |
| 1167 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | 1202 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); |
| 1168 | 1203 | ||
| 1169 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1204 | reclaim_stat->recent_scanned[0] += nr_anon; |
| 1170 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1205 | reclaim_stat->recent_scanned[1] += nr_file; |
| 1171 | reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; | ||
| 1172 | reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; | ||
| 1173 | 1206 | ||
| 1174 | spin_unlock_irq(&zone->lru_lock); | 1207 | spin_unlock_irq(&zone->lru_lock); |
| 1175 | 1208 | ||
| @@ -1353,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1353 | continue; | 1386 | continue; |
| 1354 | } | 1387 | } |
| 1355 | 1388 | ||
| 1356 | /* page_referenced clears PageReferenced */ | 1389 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
| 1357 | if (page_mapping_inuse(page) && | ||
| 1358 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | ||
| 1359 | nr_rotated++; | 1390 | nr_rotated++; |
| 1360 | /* | 1391 | /* |
| 1361 | * Identify referenced, file-backed active pages and | 1392 | * Identify referenced, file-backed active pages and |
| @@ -1464,20 +1495,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | |||
| 1464 | return low; | 1495 | return low; |
| 1465 | } | 1496 | } |
| 1466 | 1497 | ||
| 1498 | static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, | ||
| 1499 | int file) | ||
| 1500 | { | ||
| 1501 | if (file) | ||
| 1502 | return inactive_file_is_low(zone, sc); | ||
| 1503 | else | ||
| 1504 | return inactive_anon_is_low(zone, sc); | ||
| 1505 | } | ||
| 1506 | |||
| 1467 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1507 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
| 1468 | struct zone *zone, struct scan_control *sc, int priority) | 1508 | struct zone *zone, struct scan_control *sc, int priority) |
| 1469 | { | 1509 | { |
| 1470 | int file = is_file_lru(lru); | 1510 | int file = is_file_lru(lru); |
| 1471 | 1511 | ||
| 1472 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { | 1512 | if (is_active_lru(lru)) { |
| 1473 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1513 | if (inactive_list_is_low(zone, sc, file)) |
| 1514 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
| 1474 | return 0; | 1515 | return 0; |
| 1475 | } | 1516 | } |
| 1476 | 1517 | ||
| 1477 | if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { | ||
| 1478 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | ||
| 1479 | return 0; | ||
| 1480 | } | ||
| 1481 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); | 1518 | return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); |
| 1482 | } | 1519 | } |
| 1483 | 1520 | ||
| @@ -1498,6 +1535,13 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1498 | unsigned long ap, fp; | 1535 | unsigned long ap, fp; |
| 1499 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1536 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1500 | 1537 | ||
| 1538 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
| 1539 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
| 1540 | percent[0] = 0; | ||
| 1541 | percent[1] = 100; | ||
| 1542 | return; | ||
| 1543 | } | ||
| 1544 | |||
| 1501 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1545 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + |
| 1502 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1546 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); |
| 1503 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1547 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + |
| @@ -1567,15 +1611,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1567 | * until we collected @swap_cluster_max pages to scan. | 1611 | * until we collected @swap_cluster_max pages to scan. |
| 1568 | */ | 1612 | */ |
| 1569 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | 1613 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, |
| 1570 | unsigned long *nr_saved_scan, | 1614 | unsigned long *nr_saved_scan) |
| 1571 | unsigned long swap_cluster_max) | ||
| 1572 | { | 1615 | { |
| 1573 | unsigned long nr; | 1616 | unsigned long nr; |
| 1574 | 1617 | ||
| 1575 | *nr_saved_scan += nr_to_scan; | 1618 | *nr_saved_scan += nr_to_scan; |
| 1576 | nr = *nr_saved_scan; | 1619 | nr = *nr_saved_scan; |
| 1577 | 1620 | ||
| 1578 | if (nr >= swap_cluster_max) | 1621 | if (nr >= SWAP_CLUSTER_MAX) |
| 1579 | *nr_saved_scan = 0; | 1622 | *nr_saved_scan = 0; |
| 1580 | else | 1623 | else |
| 1581 | nr = 0; | 1624 | nr = 0; |
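With the per-context swap_cluster_max gone, nr_scan_try_batch() batches against the fixed SWAP_CLUSTER_MAX: sub-cluster scan targets from high priorities accumulate in *nr_saved_scan and are only handed to the shrinker once a full cluster's worth has built up. A user-space run of the same function shows the accumulation; SWAP_CLUSTER_MAX is set to the kernel's usual 32, quoted from memory.

/* nr_scan_try_batch() as above, driven from user space: small scan targets
 * accumulate until a full SWAP_CLUSTER_MAX batch can be released. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL   /* the kernel's usual value, quoted from memory */

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
                                       unsigned long *nr_saved_scan)
{
        unsigned long nr;

        *nr_saved_scan += nr_to_scan;
        nr = *nr_saved_scan;

        if (nr >= SWAP_CLUSTER_MAX)
                *nr_saved_scan = 0;     /* release the whole accumulated batch */
        else
                nr = 0;                 /* keep saving until a batch is full */

        return nr;
}

int main(void)
{
        unsigned long saved = 0;
        int round;

        for (round = 1; round <= 5; round++) {
                unsigned long got = nr_scan_try_batch(10, &saved);
                printf("round %d: scan %lu now (saved %lu)\n", round, got, saved);
        }
        return 0;
}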
| @@ -1594,37 +1637,35 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1594 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ | 1637 | unsigned long percent[2]; /* anon @ 0; file @ 1 */ |
| 1595 | enum lru_list l; | 1638 | enum lru_list l; |
| 1596 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1639 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
| 1597 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1640 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
| 1598 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1641 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1599 | int noswap = 0; | ||
| 1600 | 1642 | ||
| 1601 | /* If we have no swap space, do not bother scanning anon pages. */ | 1643 | get_scan_ratio(zone, sc, percent); |
| 1602 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
| 1603 | noswap = 1; | ||
| 1604 | percent[0] = 0; | ||
| 1605 | percent[1] = 100; | ||
| 1606 | } else | ||
| 1607 | get_scan_ratio(zone, sc, percent); | ||
| 1608 | 1644 | ||
| 1609 | for_each_evictable_lru(l) { | 1645 | for_each_evictable_lru(l) { |
| 1610 | int file = is_file_lru(l); | 1646 | int file = is_file_lru(l); |
| 1611 | unsigned long scan; | 1647 | unsigned long scan; |
| 1612 | 1648 | ||
| 1649 | if (percent[file] == 0) { | ||
| 1650 | nr[l] = 0; | ||
| 1651 | continue; | ||
| 1652 | } | ||
| 1653 | |||
| 1613 | scan = zone_nr_lru_pages(zone, sc, l); | 1654 | scan = zone_nr_lru_pages(zone, sc, l); |
| 1614 | if (priority || noswap) { | 1655 | if (priority) { |
| 1615 | scan >>= priority; | 1656 | scan >>= priority; |
| 1616 | scan = (scan * percent[file]) / 100; | 1657 | scan = (scan * percent[file]) / 100; |
| 1617 | } | 1658 | } |
| 1618 | nr[l] = nr_scan_try_batch(scan, | 1659 | nr[l] = nr_scan_try_batch(scan, |
| 1619 | &reclaim_stat->nr_saved_scan[l], | 1660 | &reclaim_stat->nr_saved_scan[l]); |
| 1620 | swap_cluster_max); | ||
| 1621 | } | 1661 | } |
| 1622 | 1662 | ||
| 1623 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1663 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
| 1624 | nr[LRU_INACTIVE_FILE]) { | 1664 | nr[LRU_INACTIVE_FILE]) { |
| 1625 | for_each_evictable_lru(l) { | 1665 | for_each_evictable_lru(l) { |
| 1626 | if (nr[l]) { | 1666 | if (nr[l]) { |
| 1627 | nr_to_scan = min(nr[l], swap_cluster_max); | 1667 | nr_to_scan = min_t(unsigned long, |
| 1668 | nr[l], SWAP_CLUSTER_MAX); | ||
| 1628 | nr[l] -= nr_to_scan; | 1669 | nr[l] -= nr_to_scan; |
| 1629 | 1670 | ||
| 1630 | nr_reclaimed += shrink_list(l, nr_to_scan, | 1671 | nr_reclaimed += shrink_list(l, nr_to_scan, |
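To make the scan-target arithmetic concrete: at DEF_PRIORITY (12), a zone with roughly 4,000,000 inactive file pages and percent[1] == 100 yields scan = (4000000 >> 12) * 100 / 100 ≈ 976 pages per shrink_zone() pass, which nr_scan_try_batch() then hands out in SWAP_CLUSTER_MAX-sized chunks. With percent[0] == 0 (the no-swap case now short-circuited in get_scan_ratio() above), the anon lists are skipped outright rather than being fed a zero target through the old noswap special case.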
| @@ -1639,8 +1680,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1639 | * with multiple processes reclaiming pages, the total | 1680 | * with multiple processes reclaiming pages, the total |
| 1640 | * freeing target can get unreasonably large. | 1681 | * freeing target can get unreasonably large. |
| 1641 | */ | 1682 | */ |
| 1642 | if (nr_reclaimed > swap_cluster_max && | 1683 | if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) |
| 1643 | priority < DEF_PRIORITY && !current_is_kswapd()) | ||
| 1644 | break; | 1684 | break; |
| 1645 | } | 1685 | } |
| 1646 | 1686 | ||
| @@ -1693,8 +1733,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
| 1693 | continue; | 1733 | continue; |
| 1694 | note_zone_scanning_priority(zone, priority); | 1734 | note_zone_scanning_priority(zone, priority); |
| 1695 | 1735 | ||
| 1696 | if (zone_is_all_unreclaimable(zone) && | 1736 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 1697 | priority != DEF_PRIORITY) | ||
| 1698 | continue; /* Let kswapd poll it */ | 1737 | continue; /* Let kswapd poll it */ |
| 1699 | sc->all_unreclaimable = 0; | 1738 | sc->all_unreclaimable = 0; |
| 1700 | } else { | 1739 | } else { |
| @@ -1738,6 +1777,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1738 | struct zoneref *z; | 1777 | struct zoneref *z; |
| 1739 | struct zone *zone; | 1778 | struct zone *zone; |
| 1740 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); | 1779 | enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); |
| 1780 | unsigned long writeback_threshold; | ||
| 1741 | 1781 | ||
| 1742 | delayacct_freepages_start(); | 1782 | delayacct_freepages_start(); |
| 1743 | 1783 | ||
| @@ -1773,7 +1813,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1773 | } | 1813 | } |
| 1774 | } | 1814 | } |
| 1775 | total_scanned += sc->nr_scanned; | 1815 | total_scanned += sc->nr_scanned; |
| 1776 | if (sc->nr_reclaimed >= sc->swap_cluster_max) { | 1816 | if (sc->nr_reclaimed >= sc->nr_to_reclaim) { |
| 1777 | ret = sc->nr_reclaimed; | 1817 | ret = sc->nr_reclaimed; |
| 1778 | goto out; | 1818 | goto out; |
| 1779 | } | 1819 | } |
| @@ -1785,14 +1825,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1785 | * that's undesirable in laptop mode, where we *want* lumpy | 1825 | * that's undesirable in laptop mode, where we *want* lumpy |
| 1786 | * writeout. So in laptop mode, write out the whole world. | 1826 | * writeout. So in laptop mode, write out the whole world. |
| 1787 | */ | 1827 | */ |
| 1788 | if (total_scanned > sc->swap_cluster_max + | 1828 | writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; |
| 1789 | sc->swap_cluster_max / 2) { | 1829 | if (total_scanned > writeback_threshold) { |
| 1790 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); | 1830 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
| 1791 | sc->may_writepage = 1; | 1831 | sc->may_writepage = 1; |
| 1792 | } | 1832 | } |
| 1793 | 1833 | ||
| 1794 | /* Take a nap, wait for some writeback to complete */ | 1834 | /* Take a nap, wait for some writeback to complete */ |
| 1795 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1835 | if (!sc->hibernation_mode && sc->nr_scanned && |
| 1836 | priority < DEF_PRIORITY - 2) | ||
| 1796 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1837 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 1797 | } | 1838 | } |
| 1798 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1839 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
| @@ -1831,7 +1872,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
| 1831 | struct scan_control sc = { | 1872 | struct scan_control sc = { |
| 1832 | .gfp_mask = gfp_mask, | 1873 | .gfp_mask = gfp_mask, |
| 1833 | .may_writepage = !laptop_mode, | 1874 | .may_writepage = !laptop_mode, |
| 1834 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1875 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
| 1835 | .may_unmap = 1, | 1876 | .may_unmap = 1, |
| 1836 | .may_swap = 1, | 1877 | .may_swap = 1, |
| 1837 | .swappiness = vm_swappiness, | 1878 | .swappiness = vm_swappiness, |
| @@ -1855,7 +1896,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | |||
| 1855 | .may_writepage = !laptop_mode, | 1896 | .may_writepage = !laptop_mode, |
| 1856 | .may_unmap = 1, | 1897 | .may_unmap = 1, |
| 1857 | .may_swap = !noswap, | 1898 | .may_swap = !noswap, |
| 1858 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
| 1859 | .swappiness = swappiness, | 1899 | .swappiness = swappiness, |
| 1860 | .order = 0, | 1900 | .order = 0, |
| 1861 | .mem_cgroup = mem, | 1901 | .mem_cgroup = mem, |
| @@ -1889,7 +1929,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1889 | .may_writepage = !laptop_mode, | 1929 | .may_writepage = !laptop_mode, |
| 1890 | .may_unmap = 1, | 1930 | .may_unmap = 1, |
| 1891 | .may_swap = !noswap, | 1931 | .may_swap = !noswap, |
| 1892 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1932 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
| 1893 | .swappiness = swappiness, | 1933 | .swappiness = swappiness, |
| 1894 | .order = 0, | 1934 | .order = 0, |
| 1895 | .mem_cgroup = mem_cont, | 1935 | .mem_cgroup = mem_cont, |
| @@ -1904,6 +1944,33 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1904 | } | 1944 | } |
| 1905 | #endif | 1945 | #endif |
| 1906 | 1946 | ||
| 1947 | /* is kswapd sleeping prematurely? */ | ||
| 1948 | static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | ||
| 1949 | { | ||
| 1950 | int i; | ||
| 1951 | |||
| 1952 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | ||
| 1953 | if (remaining) | ||
| 1954 | return 1; | ||
| 1955 | |||
| 1956 | /* If after HZ/10, a zone is below the high mark, it's premature */ | ||
| 1957 | for (i = 0; i < pgdat->nr_zones; i++) { | ||
| 1958 | struct zone *zone = pgdat->node_zones + i; | ||
| 1959 | |||
| 1960 | if (!populated_zone(zone)) | ||
| 1961 | continue; | ||
| 1962 | |||
| 1963 | if (zone->all_unreclaimable) | ||
| 1964 | continue; | ||
| 1965 | |||
| 1966 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | ||
| 1967 | 0, 0)) | ||
| 1968 | return 1; | ||
| 1969 | } | ||
| 1970 | |||
| 1971 | return 0; | ||
| 1972 | } | ||
| 1973 | |||
| 1907 | /* | 1974 | /* |
| 1908 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1975 | * For kswapd, balance_pgdat() will work across all this node's zones until |
| 1909 | * they are all at high_wmark_pages(zone). | 1976 | * they are all at high_wmark_pages(zone). |
| @@ -1936,7 +2003,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
| 1936 | .gfp_mask = GFP_KERNEL, | 2003 | .gfp_mask = GFP_KERNEL, |
| 1937 | .may_unmap = 1, | 2004 | .may_unmap = 1, |
| 1938 | .may_swap = 1, | 2005 | .may_swap = 1, |
| 1939 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 2006 | /* |
| 2007 | * kswapd doesn't want to be bailed out while reclaim. because | ||
| 2008 | * we want to put equal scanning pressure on each zone. | ||
| 2009 | */ | ||
| 2010 | .nr_to_reclaim = ULONG_MAX, | ||
| 1940 | .swappiness = vm_swappiness, | 2011 | .swappiness = vm_swappiness, |
| 1941 | .order = order, | 2012 | .order = order, |
| 1942 | .mem_cgroup = NULL, | 2013 | .mem_cgroup = NULL, |
| @@ -1961,6 +2032,7 @@ loop_again: | |||
| 1961 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 2032 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
| 1962 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ | 2033 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
| 1963 | unsigned long lru_pages = 0; | 2034 | unsigned long lru_pages = 0; |
| 2035 | int has_under_min_watermark_zone = 0; | ||
| 1964 | 2036 | ||
| 1965 | /* The swap token gets in the way of swapout... */ | 2037 | /* The swap token gets in the way of swapout... */ |
| 1966 | if (!priority) | 2038 | if (!priority) |
| @@ -1978,8 +2050,7 @@ loop_again: | |||
| 1978 | if (!populated_zone(zone)) | 2050 | if (!populated_zone(zone)) |
| 1979 | continue; | 2051 | continue; |
| 1980 | 2052 | ||
| 1981 | if (zone_is_all_unreclaimable(zone) && | 2053 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 1982 | priority != DEF_PRIORITY) | ||
| 1983 | continue; | 2054 | continue; |
| 1984 | 2055 | ||
| 1985 | /* | 2056 | /* |
| @@ -2022,13 +2093,9 @@ loop_again: | |||
| 2022 | if (!populated_zone(zone)) | 2093 | if (!populated_zone(zone)) |
| 2023 | continue; | 2094 | continue; |
| 2024 | 2095 | ||
| 2025 | if (zone_is_all_unreclaimable(zone) && | 2096 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 2026 | priority != DEF_PRIORITY) | ||
| 2027 | continue; | 2097 | continue; |
| 2028 | 2098 | ||
| 2029 | if (!zone_watermark_ok(zone, order, | ||
| 2030 | high_wmark_pages(zone), end_zone, 0)) | ||
| 2031 | all_zones_ok = 0; | ||
| 2032 | temp_priority[i] = priority; | 2099 | temp_priority[i] = priority; |
| 2033 | sc.nr_scanned = 0; | 2100 | sc.nr_scanned = 0; |
| 2034 | note_zone_scanning_priority(zone, priority); | 2101 | note_zone_scanning_priority(zone, priority); |
| @@ -2053,12 +2120,11 @@ loop_again: | |||
| 2053 | lru_pages); | 2120 | lru_pages); |
| 2054 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2121 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
| 2055 | total_scanned += sc.nr_scanned; | 2122 | total_scanned += sc.nr_scanned; |
| 2056 | if (zone_is_all_unreclaimable(zone)) | 2123 | if (zone->all_unreclaimable) |
| 2057 | continue; | 2124 | continue; |
| 2058 | if (nr_slab == 0 && zone->pages_scanned >= | 2125 | if (nr_slab == 0 && |
| 2059 | (zone_reclaimable_pages(zone) * 6)) | 2126 | zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) |
| 2060 | zone_set_flag(zone, | 2127 | zone->all_unreclaimable = 1; |
| 2061 | ZONE_ALL_UNRECLAIMABLE); | ||
| 2062 | /* | 2128 | /* |
| 2063 | * If we've done a decent amount of scanning and | 2129 | * If we've done a decent amount of scanning and |
| 2064 | * the reclaim ratio is low, start doing writepage | 2130 | * the reclaim ratio is low, start doing writepage |
| @@ -2067,6 +2133,20 @@ loop_again: | |||
| 2067 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && | 2133 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
| 2068 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2134 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
| 2069 | sc.may_writepage = 1; | 2135 | sc.may_writepage = 1; |
| 2136 | |||
| 2137 | if (!zone_watermark_ok(zone, order, | ||
| 2138 | high_wmark_pages(zone), end_zone, 0)) { | ||
| 2139 | all_zones_ok = 0; | ||
| 2140 | /* | ||
| 2141 | * We are still under the min watermark. This | ||
| 2142 | * means that we have a GFP_ATOMIC allocation | ||
| 2143 | * failure risk. Hurry up! | ||
| 2144 | */ | ||
| 2145 | if (!zone_watermark_ok(zone, order, | ||
| 2146 | min_wmark_pages(zone), end_zone, 0)) | ||
| 2147 | has_under_min_watermark_zone = 1; | ||
| 2148 | } | ||
| 2149 | |||
| 2070 | } | 2150 | } |
| 2071 | if (all_zones_ok) | 2151 | if (all_zones_ok) |
| 2072 | break; /* kswapd: all done */ | 2152 | break; /* kswapd: all done */ |
| @@ -2074,8 +2154,12 @@ loop_again: | |||
| 2074 | * OK, kswapd is getting into trouble. Take a nap, then take | 2154 | * OK, kswapd is getting into trouble. Take a nap, then take |
| 2075 | * another pass across the zones. | 2155 | * another pass across the zones. |
| 2076 | */ | 2156 | */ |
| 2077 | if (total_scanned && priority < DEF_PRIORITY - 2) | 2157 | if (total_scanned && (priority < DEF_PRIORITY - 2)) { |
| 2078 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 2158 | if (has_under_min_watermark_zone) |
| 2159 | count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); | ||
| 2160 | else | ||
| 2161 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
| 2162 | } | ||
| 2079 | 2163 | ||
| 2080 | /* | 2164 | /* |
| 2081 | * We do this so kswapd doesn't build up large priorities for | 2165 | * We do this so kswapd doesn't build up large priorities for |
| @@ -2173,6 +2257,7 @@ static int kswapd(void *p) | |||
| 2173 | order = 0; | 2257 | order = 0; |
| 2174 | for ( ; ; ) { | 2258 | for ( ; ; ) { |
| 2175 | unsigned long new_order; | 2259 | unsigned long new_order; |
| 2260 | int ret; | ||
| 2176 | 2261 | ||
| 2177 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2262 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
| 2178 | new_order = pgdat->kswapd_max_order; | 2263 | new_order = pgdat->kswapd_max_order; |
| @@ -2184,19 +2269,45 @@ static int kswapd(void *p) | |||
| 2184 | */ | 2269 | */ |
| 2185 | order = new_order; | 2270 | order = new_order; |
| 2186 | } else { | 2271 | } else { |
| 2187 | if (!freezing(current)) | 2272 | if (!freezing(current) && !kthread_should_stop()) { |
| 2188 | schedule(); | 2273 | long remaining = 0; |
| 2274 | |||
| 2275 | /* Try to sleep for a short interval */ | ||
| 2276 | if (!sleeping_prematurely(pgdat, order, remaining)) { | ||
| 2277 | remaining = schedule_timeout(HZ/10); | ||
| 2278 | finish_wait(&pgdat->kswapd_wait, &wait); | ||
| 2279 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | ||
| 2280 | } | ||
| 2281 | |||
| 2282 | /* | ||
| 2283 | * After a short sleep, check if it was a | ||
| 2284 | * premature sleep. If not, then go fully | ||
| 2285 | * to sleep until explicitly woken up | ||
| 2286 | */ | ||
| 2287 | if (!sleeping_prematurely(pgdat, order, remaining)) | ||
| 2288 | schedule(); | ||
| 2289 | else { | ||
| 2290 | if (remaining) | ||
| 2291 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | ||
| 2292 | else | ||
| 2293 | count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); | ||
| 2294 | } | ||
| 2295 | } | ||
| 2189 | 2296 | ||
| 2190 | order = pgdat->kswapd_max_order; | 2297 | order = pgdat->kswapd_max_order; |
| 2191 | } | 2298 | } |
| 2192 | finish_wait(&pgdat->kswapd_wait, &wait); | 2299 | finish_wait(&pgdat->kswapd_wait, &wait); |
| 2193 | 2300 | ||
| 2194 | if (!try_to_freeze()) { | 2301 | ret = try_to_freeze(); |
| 2195 | /* We can speed up thawing tasks if we don't call | 2302 | if (kthread_should_stop()) |
| 2196 | * balance_pgdat after returning from the refrigerator | 2303 | break; |
| 2197 | */ | 2304 | |
| 2305 | /* | ||
| 2306 | * We can speed up thawing tasks if we don't call balance_pgdat | ||
| 2307 | * after returning from the refrigerator | ||
| 2308 | */ | ||
| 2309 | if (!ret) | ||
| 2198 | balance_pgdat(pgdat, order); | 2310 | balance_pgdat(pgdat, order); |
| 2199 | } | ||
| 2200 | } | 2311 | } |
| 2201 | return 0; | 2312 | return 0; |
| 2202 | } | 2313 | } |
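The new kthread_should_stop() checks pair with the kswapd_stop() helper added further below: kthread_stop() wakes the thread and makes kthread_should_stop() return true, so kswapd can now exit its loop cleanly instead of only ever being woken for more work. A minimal sketch of the generic pattern, with do_work() standing in as a placeholder:

    static int worker(void *data)
    {
            while (!kthread_should_stop()) {
                    do_work();                      /* placeholder */
                    schedule_timeout_interruptible(HZ);
            }
            return 0;
    }

    /* elsewhere: wakes the worker and waits until it returns */
    kthread_stop(task);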
| @@ -2260,148 +2371,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
| 2260 | 2371 | ||
| 2261 | #ifdef CONFIG_HIBERNATION | 2372 | #ifdef CONFIG_HIBERNATION |
| 2262 | /* | 2373 | /* |
| 2263 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2374 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
| 2264 | * from LRU lists system-wide, for given pass and priority. | ||
| 2265 | * | ||
| 2266 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
| 2267 | */ | ||
| 2268 | static void shrink_all_zones(unsigned long nr_pages, int prio, | ||
| 2269 | int pass, struct scan_control *sc) | ||
| 2270 | { | ||
| 2271 | struct zone *zone; | ||
| 2272 | unsigned long nr_reclaimed = 0; | ||
| 2273 | struct zone_reclaim_stat *reclaim_stat; | ||
| 2274 | |||
| 2275 | for_each_populated_zone(zone) { | ||
| 2276 | enum lru_list l; | ||
| 2277 | |||
| 2278 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | ||
| 2279 | continue; | ||
| 2280 | |||
| 2281 | for_each_evictable_lru(l) { | ||
| 2282 | enum zone_stat_item ls = NR_LRU_BASE + l; | ||
| 2283 | unsigned long lru_pages = zone_page_state(zone, ls); | ||
| 2284 | |||
| 2285 | /* For pass = 0, we don't shrink the active list */ | ||
| 2286 | if (pass == 0 && (l == LRU_ACTIVE_ANON || | ||
| 2287 | l == LRU_ACTIVE_FILE)) | ||
| 2288 | continue; | ||
| 2289 | |||
| 2290 | reclaim_stat = get_reclaim_stat(zone, sc); | ||
| 2291 | reclaim_stat->nr_saved_scan[l] += | ||
| 2292 | (lru_pages >> prio) + 1; | ||
| 2293 | if (reclaim_stat->nr_saved_scan[l] | ||
| 2294 | >= nr_pages || pass > 3) { | ||
| 2295 | unsigned long nr_to_scan; | ||
| 2296 | |||
| 2297 | reclaim_stat->nr_saved_scan[l] = 0; | ||
| 2298 | nr_to_scan = min(nr_pages, lru_pages); | ||
| 2299 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | ||
| 2300 | sc, prio); | ||
| 2301 | if (nr_reclaimed >= nr_pages) { | ||
| 2302 | sc->nr_reclaimed += nr_reclaimed; | ||
| 2303 | return; | ||
| 2304 | } | ||
| 2305 | } | ||
| 2306 | } | ||
| 2307 | } | ||
| 2308 | sc->nr_reclaimed += nr_reclaimed; | ||
| 2309 | } | ||
| 2310 | |||
| 2311 | /* | ||
| 2312 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
| 2313 | * freed pages. | 2375 | * freed pages. |
| 2314 | * | 2376 | * |
| 2315 | * Rather than trying to age LRUs the aim is to preserve the overall | 2377 | * Rather than trying to age LRUs the aim is to preserve the overall |
| 2316 | * LRU order by reclaiming preferentially | 2378 | * LRU order by reclaiming preferentially |
| 2317 | * inactive > active > active referenced > active mapped | 2379 | * inactive > active > active referenced > active mapped |
| 2318 | */ | 2380 | */ |
| 2319 | unsigned long shrink_all_memory(unsigned long nr_pages) | 2381 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) |
| 2320 | { | 2382 | { |
| 2321 | unsigned long lru_pages, nr_slab; | ||
| 2322 | int pass; | ||
| 2323 | struct reclaim_state reclaim_state; | 2383 | struct reclaim_state reclaim_state; |
| 2324 | struct scan_control sc = { | 2384 | struct scan_control sc = { |
| 2325 | .gfp_mask = GFP_KERNEL, | 2385 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
| 2326 | .may_unmap = 0, | 2386 | .may_swap = 1, |
| 2387 | .may_unmap = 1, | ||
| 2327 | .may_writepage = 1, | 2388 | .may_writepage = 1, |
| 2389 | .nr_to_reclaim = nr_to_reclaim, | ||
| 2390 | .hibernation_mode = 1, | ||
| 2391 | .swappiness = vm_swappiness, | ||
| 2392 | .order = 0, | ||
| 2328 | .isolate_pages = isolate_pages_global, | 2393 | .isolate_pages = isolate_pages_global, |
| 2329 | .nr_reclaimed = 0, | ||
| 2330 | }; | 2394 | }; |
| 2395 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
| 2396 | struct task_struct *p = current; | ||
| 2397 | unsigned long nr_reclaimed; | ||
| 2331 | 2398 | ||
| 2332 | current->reclaim_state = &reclaim_state; | 2399 | p->flags |= PF_MEMALLOC; |
| 2333 | 2400 | lockdep_set_current_reclaim_state(sc.gfp_mask); | |
| 2334 | lru_pages = global_reclaimable_pages(); | 2401 | reclaim_state.reclaimed_slab = 0; |
| 2335 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2402 | p->reclaim_state = &reclaim_state; |
| 2336 | /* If slab caches are huge, it's better to hit them first */ | ||
| 2337 | while (nr_slab >= lru_pages) { | ||
| 2338 | reclaim_state.reclaimed_slab = 0; | ||
| 2339 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
| 2340 | if (!reclaim_state.reclaimed_slab) | ||
| 2341 | break; | ||
| 2342 | |||
| 2343 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
| 2344 | if (sc.nr_reclaimed >= nr_pages) | ||
| 2345 | goto out; | ||
| 2346 | |||
| 2347 | nr_slab -= reclaim_state.reclaimed_slab; | ||
| 2348 | } | ||
| 2349 | |||
| 2350 | /* | ||
| 2351 | * We try to shrink LRUs in 5 passes: | ||
| 2352 | * 0 = Reclaim from inactive_list only | ||
| 2353 | * 1 = Reclaim from active list but don't reclaim mapped | ||
| 2354 | * 2 = 2nd pass of type 1 | ||
| 2355 | * 3 = Reclaim mapped (normal reclaim) | ||
| 2356 | * 4 = 2nd pass of type 3 | ||
| 2357 | */ | ||
| 2358 | for (pass = 0; pass < 5; pass++) { | ||
| 2359 | int prio; | ||
| 2360 | |||
| 2361 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
| 2362 | if (pass > 2) | ||
| 2363 | sc.may_unmap = 1; | ||
| 2364 | |||
| 2365 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
| 2366 | unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; | ||
| 2367 | |||
| 2368 | sc.nr_scanned = 0; | ||
| 2369 | sc.swap_cluster_max = nr_to_scan; | ||
| 2370 | shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
| 2371 | if (sc.nr_reclaimed >= nr_pages) | ||
| 2372 | goto out; | ||
| 2373 | |||
| 2374 | reclaim_state.reclaimed_slab = 0; | ||
| 2375 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | ||
| 2376 | global_reclaimable_pages()); | ||
| 2377 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
| 2378 | if (sc.nr_reclaimed >= nr_pages) | ||
| 2379 | goto out; | ||
| 2380 | |||
| 2381 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
| 2382 | congestion_wait(BLK_RW_ASYNC, HZ / 10); | ||
| 2383 | } | ||
| 2384 | } | ||
| 2385 | |||
| 2386 | /* | ||
| 2387 | * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be | ||
| 2388 | * something in slab caches | ||
| 2389 | */ | ||
| 2390 | if (!sc.nr_reclaimed) { | ||
| 2391 | do { | ||
| 2392 | reclaim_state.reclaimed_slab = 0; | ||
| 2393 | shrink_slab(nr_pages, sc.gfp_mask, | ||
| 2394 | global_reclaimable_pages()); | ||
| 2395 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
| 2396 | } while (sc.nr_reclaimed < nr_pages && | ||
| 2397 | reclaim_state.reclaimed_slab > 0); | ||
| 2398 | } | ||
| 2399 | 2403 | ||
| 2404 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | ||
| 2400 | 2405 | ||
| 2401 | out: | 2406 | p->reclaim_state = NULL; |
| 2402 | current->reclaim_state = NULL; | 2407 | lockdep_clear_current_reclaim_state(); |
| 2408 | p->flags &= ~PF_MEMALLOC; | ||
| 2403 | 2409 | ||
| 2404 | return sc.nr_reclaimed; | 2410 | return nr_reclaimed; |
| 2405 | } | 2411 | } |
| 2406 | #endif /* CONFIG_HIBERNATION */ | 2412 | #endif /* CONFIG_HIBERNATION */ |
| 2407 | 2413 | ||
| @@ -2451,6 +2457,17 @@ int kswapd_run(int nid) | |||
| 2451 | return ret; | 2457 | return ret; |
| 2452 | } | 2458 | } |
| 2453 | 2459 | ||
| 2460 | /* | ||
| 2461 | * Called by memory hotplug when all memory in a node is offlined. | ||
| 2462 | */ | ||
| 2463 | void kswapd_stop(int nid) | ||
| 2464 | { | ||
| 2465 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | ||
| 2466 | |||
| 2467 | if (kswapd) | ||
| 2468 | kthread_stop(kswapd); | ||
| 2469 | } | ||
| 2470 | |||
| 2454 | static int __init kswapd_init(void) | 2471 | static int __init kswapd_init(void) |
| 2455 | { | 2472 | { |
| 2456 | int nid; | 2473 | int nid; |
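kswapd_stop() is the counterpart to kswapd_run(): per the comment above, memory hotplug is expected to call it once a node has gone completely memoryless, which is also what exercises the new kthread_should_stop() handling in kswapd(). A hypothetical hotplug-side sketch; the surrounding condition is illustrative:

    /* after the last section of node `nid' has been offlined */
    if (!node_state(nid, N_HIGH_MEMORY))
            kswapd_stop(nid);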
| @@ -2553,8 +2570,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2553 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2570 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
| 2554 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2571 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
| 2555 | .may_swap = 1, | 2572 | .may_swap = 1, |
| 2556 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 2573 | .nr_to_reclaim = max_t(unsigned long, nr_pages, |
| 2557 | SWAP_CLUSTER_MAX), | 2574 | SWAP_CLUSTER_MAX), |
| 2558 | .gfp_mask = gfp_mask, | 2575 | .gfp_mask = gfp_mask, |
| 2559 | .swappiness = vm_swappiness, | 2576 | .swappiness = vm_swappiness, |
| 2560 | .order = order, | 2577 | .order = order, |
| @@ -2570,6 +2587,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2570 | * and RECLAIM_SWAP. | 2587 | * and RECLAIM_SWAP. |
| 2571 | */ | 2588 | */ |
| 2572 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 2589 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
| 2590 | lockdep_set_current_reclaim_state(gfp_mask); | ||
| 2573 | reclaim_state.reclaimed_slab = 0; | 2591 | reclaim_state.reclaimed_slab = 0; |
| 2574 | p->reclaim_state = &reclaim_state; | 2592 | p->reclaim_state = &reclaim_state; |
| 2575 | 2593 | ||
| @@ -2613,6 +2631,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2613 | 2631 | ||
| 2614 | p->reclaim_state = NULL; | 2632 | p->reclaim_state = NULL; |
| 2615 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2633 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
| 2634 | lockdep_clear_current_reclaim_state(); | ||
| 2616 | return sc.nr_reclaimed >= nr_pages; | 2635 | return sc.nr_reclaimed >= nr_pages; |
| 2617 | } | 2636 | } |
| 2618 | 2637 | ||
| @@ -2635,7 +2654,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2635 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 2654 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
| 2636 | return ZONE_RECLAIM_FULL; | 2655 | return ZONE_RECLAIM_FULL; |
| 2637 | 2656 | ||
| 2638 | if (zone_is_all_unreclaimable(zone)) | 2657 | if (zone->all_unreclaimable) |
| 2639 | return ZONE_RECLAIM_FULL; | 2658 | return ZONE_RECLAIM_FULL; |
| 2640 | 2659 | ||
| 2641 | /* | 2660 | /* |
diff --git a/mm/vmstat.c b/mm/vmstat.c index c81321f9feec..7f760cbc73f3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void) | |||
| 139 | threshold = calculate_threshold(zone); | 139 | threshold = calculate_threshold(zone); |
| 140 | 140 | ||
| 141 | for_each_online_cpu(cpu) | 141 | for_each_online_cpu(cpu) |
| 142 | zone_pcp(zone, cpu)->stat_threshold = threshold; | 142 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
| 143 | = threshold; | ||
| 143 | } | 144 | } |
| 144 | } | 145 | } |
| 145 | 146 | ||
| @@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void) | |||
| 149 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 150 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
| 150 | int delta) | 151 | int delta) |
| 151 | { | 152 | { |
| 152 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 153 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
| 154 | |||
| 153 | s8 *p = pcp->vm_stat_diff + item; | 155 | s8 *p = pcp->vm_stat_diff + item; |
| 154 | long x; | 156 | long x; |
| 155 | 157 | ||
| @@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
| 202 | */ | 204 | */ |
| 203 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 205 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
| 204 | { | 206 | { |
| 205 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 207 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
| 206 | s8 *p = pcp->vm_stat_diff + item; | 208 | s8 *p = pcp->vm_stat_diff + item; |
| 207 | 209 | ||
| 208 | (*p)++; | 210 | (*p)++; |
| @@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
| 223 | 225 | ||
| 224 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 226 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
| 225 | { | 227 | { |
| 226 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 228 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
| 227 | s8 *p = pcp->vm_stat_diff + item; | 229 | s8 *p = pcp->vm_stat_diff + item; |
| 228 | 230 | ||
| 229 | (*p)--; | 231 | (*p)--; |
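These vmstat hunks track the conversion of the zone pagesets from the old zone_pcp() indexing to the dynamic per-cpu allocator, assuming zone->pageset is now a __percpu pointer set up with alloc_percpu(). The two accessors used above, sketched:

    struct per_cpu_pageset *p;

    p = per_cpu_ptr(zone->pageset, cpu);    /* a given CPU's pageset */
    p = this_cpu_ptr(zone->pageset);        /* the running CPU's pageset */

this_cpu_ptr() assumes the task cannot migrate, which the underscore-prefixed stat helpers already require of their callers (preemption or interrupts disabled).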
| @@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
| 300 | for_each_populated_zone(zone) { | 302 | for_each_populated_zone(zone) { |
| 301 | struct per_cpu_pageset *p; | 303 | struct per_cpu_pageset *p; |
| 302 | 304 | ||
| 303 | p = zone_pcp(zone, cpu); | 305 | p = per_cpu_ptr(zone->pageset, cpu); |
| 304 | 306 | ||
| 305 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 307 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
| 306 | if (p->vm_stat_diff[i]) { | 308 | if (p->vm_stat_diff[i]) { |
| @@ -683,6 +685,9 @@ static const char * const vmstat_text[] = { | |||
| 683 | "slabs_scanned", | 685 | "slabs_scanned", |
| 684 | "kswapd_steal", | 686 | "kswapd_steal", |
| 685 | "kswapd_inodesteal", | 687 | "kswapd_inodesteal", |
| 688 | "kswapd_low_wmark_hit_quickly", | ||
| 689 | "kswapd_high_wmark_hit_quickly", | ||
| 690 | "kswapd_skip_congestion_wait", | ||
| 686 | "pageoutrun", | 691 | "pageoutrun", |
| 687 | "allocstall", | 692 | "allocstall", |
| 688 | 693 | ||
| @@ -738,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 738 | for_each_online_cpu(i) { | 743 | for_each_online_cpu(i) { |
| 739 | struct per_cpu_pageset *pageset; | 744 | struct per_cpu_pageset *pageset; |
| 740 | 745 | ||
| 741 | pageset = zone_pcp(zone, i); | 746 | pageset = per_cpu_ptr(zone->pageset, i); |
| 742 | seq_printf(m, | 747 | seq_printf(m, |
| 743 | "\n cpu: %i" | 748 | "\n cpu: %i" |
| 744 | "\n count: %i" | 749 | "\n count: %i" |
| @@ -758,7 +763,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 758 | "\n prev_priority: %i" | 763 | "\n prev_priority: %i" |
| 759 | "\n start_pfn: %lu" | 764 | "\n start_pfn: %lu" |
| 760 | "\n inactive_ratio: %u", | 765 | "\n inactive_ratio: %u", |
| 761 | zone_is_all_unreclaimable(zone), | 766 | zone->all_unreclaimable, |
| 762 | zone->prev_priority, | 767 | zone->prev_priority, |
| 763 | zone->zone_start_pfn, | 768 | zone->zone_start_pfn, |
| 764 | zone->inactive_ratio); | 769 | zone->inactive_ratio); |
| @@ -883,11 +888,10 @@ static void vmstat_update(struct work_struct *w) | |||
| 883 | 888 | ||
| 884 | static void __cpuinit start_cpu_timer(int cpu) | 889 | static void __cpuinit start_cpu_timer(int cpu) |
| 885 | { | 890 | { |
| 886 | struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); | 891 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); |
| 887 | 892 | ||
| 888 | INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); | 893 | INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); |
| 889 | schedule_delayed_work_on(cpu, vmstat_work, | 894 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); |
| 890 | __round_jiffies_relative(HZ, cpu)); | ||
| 891 | } | 895 | } |
| 892 | 896 | ||
| 893 | /* | 897 | /* |
| @@ -904,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
| 904 | case CPU_ONLINE: | 908 | case CPU_ONLINE: |
| 905 | case CPU_ONLINE_FROZEN: | 909 | case CPU_ONLINE_FROZEN: |
| 906 | start_cpu_timer(cpu); | 910 | start_cpu_timer(cpu); |
| 911 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
| 907 | break; | 912 | break; |
| 908 | case CPU_DOWN_PREPARE: | 913 | case CPU_DOWN_PREPARE: |
| 909 | case CPU_DOWN_PREPARE_FROZEN: | 914 | case CPU_DOWN_PREPARE_FROZEN: |
