Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |  100
-rw-r--r--  mm/Kconfig.debug     |   20
-rw-r--r--  mm/Makefile          |   17
-rw-r--r--  mm/allocpercpu.c     |   30
-rw-r--r--  mm/backing-dev.c     |  465
-rw-r--r--  mm/bootmem.c         |   60
-rw-r--r--  mm/bounce.c          |   10
-rw-r--r--  mm/dmapool.c         |    2
-rw-r--r--  mm/fadvise.c         |    2
-rw-r--r--  mm/failslab.c        |    1
-rw-r--r--  mm/filemap.c         |  382
-rw-r--r--  mm/filemap_xip.c     |    6
-rw-r--r--  mm/highmem.c         |   18
-rw-r--r--  mm/hugetlb.c         |  385
-rw-r--r--  mm/hwpoison-inject.c |   41
-rw-r--r--  mm/init-mm.c         |   20
-rw-r--r--  mm/internal.h        |   43
-rw-r--r--  mm/kmemcheck.c       |  122
-rw-r--r--  mm/kmemleak-test.c   |  111
-rw-r--r--  mm/kmemleak.c        | 1689
-rw-r--r--  mm/ksm.c             | 1710
-rw-r--r--  mm/maccess.c         |    2
-rw-r--r--  mm/madvise.c         |   93
-rw-r--r--  mm/memcontrol.c      | 1563
-rw-r--r--  mm/memory-failure.c  |  835
-rw-r--r--  mm/memory.c          |  630
-rw-r--r--  mm/memory_hotplug.c  |   43
-rw-r--r--  mm/mempolicy.c       |  192
-rw-r--r--  mm/mempool.c         |    9
-rw-r--r--  mm/migrate.c         |   42
-rw-r--r--  mm/mlock.c           |  201
-rw-r--r--  mm/mmap.c            |   81
-rw-r--r--  mm/mmu_context.c     |   58
-rw-r--r--  mm/mmu_notifier.c    |   20
-rw-r--r--  mm/mmzone.c          |   15
-rw-r--r--  mm/mprotect.c        |    2
-rw-r--r--  mm/mremap.c          |   18
-rw-r--r--  mm/nommu.c           |  239
-rw-r--r--  mm/oom_kill.c        |  153
-rw-r--r--  mm/page-writeback.c  |  287
-rw-r--r--  mm/page_alloc.c      | 1211
-rw-r--r--  mm/page_cgroup.c     |   70
-rw-r--r--  mm/page_io.c         |    2
-rw-r--r--  mm/pdflush.c         |  251
-rw-r--r--  mm/percpu.c          | 1728
-rw-r--r--  mm/quicklist.c       |    5
-rw-r--r--  mm/readahead.c       |  185
-rw-r--r--  mm/rmap.c            |  188
-rw-r--r--  mm/shmem.c           |   91
-rw-r--r--  mm/shmem_acl.c       |   40
-rw-r--r--  mm/slab.c            |  349
-rw-r--r--  mm/slob.c            |   60
-rw-r--r--  mm/slub.c            |  357
-rw-r--r--  mm/sparse-vmemmap.c  |    8
-rw-r--r--  mm/sparse.c          |    9
-rw-r--r--  mm/swap.c            |   58
-rw-r--r--  mm/swap_state.c      |  153
-rw-r--r--  mm/swapfile.c        |  311
-rw-r--r--  mm/thrash.c          |   32
-rw-r--r--  mm/truncate.c        |  186
-rw-r--r--  mm/util.c            |   39
-rw-r--r--  mm/vmalloc.c         |  643
-rw-r--r--  mm/vmscan.c          |  680
-rw-r--r--  mm/vmstat.c          |   48
64 files changed, 12516 insertions(+), 3905 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index b53427ad30a3..44cf6f0a3a6d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -67,7 +67,7 @@ config DISCONTIGMEM | |||
| 67 | 67 | ||
| 68 | config SPARSEMEM | 68 | config SPARSEMEM |
| 69 | def_bool y | 69 | def_bool y |
| 70 | depends on SPARSEMEM_MANUAL | 70 | depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL |
| 71 | 71 | ||
| 72 | config FLATMEM | 72 | config FLATMEM |
| 73 | def_bool y | 73 | def_bool y |
| @@ -128,11 +128,8 @@ config SPARSEMEM_VMEMMAP | |||
| 128 | config MEMORY_HOTPLUG | 128 | config MEMORY_HOTPLUG |
| 129 | bool "Allow for memory hot-add" | 129 | bool "Allow for memory hot-add" |
| 130 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 130 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
| 131 | depends on HOTPLUG && !HIBERNATION && ARCH_ENABLE_MEMORY_HOTPLUG | 131 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
| 132 | depends on (IA64 || X86 || PPC64 || SUPERH || S390) | 132 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
| 133 | |||
| 134 | comment "Memory hotplug is currently incompatible with Software Suspend" | ||
| 135 | depends on SPARSEMEM && HOTPLUG && HIBERNATION | ||
| 136 | 133 | ||
| 137 | config MEMORY_HOTPLUG_SPARSE | 134 | config MEMORY_HOTPLUG_SPARSE |
| 138 | def_bool y | 135 | def_bool y |
| @@ -153,7 +150,7 @@ config MEMORY_HOTREMOVE | |||
| 153 | # | 150 | # |
| 154 | config PAGEFLAGS_EXTENDED | 151 | config PAGEFLAGS_EXTENDED |
| 155 | def_bool y | 152 | def_bool y |
| 156 | depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM | 153 | depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM |
| 157 | 154 | ||
| 158 | # Heavily threaded applications may benefit from splitting the mm-wide | 155 | # Heavily threaded applications may benefit from splitting the mm-wide |
| 159 | # page_table_lock, so that faults on different parts of the user address | 156 | # page_table_lock, so that faults on different parts of the user address |
| @@ -203,23 +200,90 @@ config VIRT_TO_BUS | |||
| 203 | def_bool y | 200 | def_bool y |
| 204 | depends on !ARCH_NO_VIRT_TO_BUS | 201 | depends on !ARCH_NO_VIRT_TO_BUS |
| 205 | 202 | ||
| 206 | config UNEVICTABLE_LRU | ||
| 207 | bool "Add LRU list to track non-evictable pages" | ||
| 208 | default y | ||
| 209 | help | ||
| 210 | Keeps unevictable pages off of the active and inactive pageout | ||
| 211 | lists, so kswapd will not waste CPU time or have its balancing | ||
| 212 | algorithms thrown off by scanning these pages. Selecting this | ||
| 213 | will use one page flag and increase the code size a little, | ||
| 214 | say Y unless you know what you are doing. | ||
| 215 | |||
| 216 | config HAVE_MLOCK | 203 | config HAVE_MLOCK |
| 217 | bool | 204 | bool |
| 218 | default y if MMU=y | 205 | default y if MMU=y |
| 219 | 206 | ||
| 220 | config HAVE_MLOCKED_PAGE_BIT | 207 | config HAVE_MLOCKED_PAGE_BIT |
| 221 | bool | 208 | bool |
| 222 | default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y | 209 | default y if HAVE_MLOCK=y |
| 223 | 210 | ||
| 224 | config MMU_NOTIFIER | 211 | config MMU_NOTIFIER |
| 225 | bool | 212 | bool |
| 213 | |||
| 214 | config KSM | ||
| 215 | bool "Enable KSM for page merging" | ||
| 216 | depends on MMU | ||
| 217 | help | ||
| 218 | Enable Kernel Samepage Merging: KSM periodically scans those areas | ||
| 219 | of an application's address space that an app has advised may be | ||
| 220 | mergeable. When it finds pages of identical content, it replaces | ||
| 221 | the many instances by a single resident page with that content, so | ||
| 222 | saving memory until one or another app needs to modify the content. | ||
| 223 | Recommended for use with KVM, or with other duplicative applications. | ||
| 224 | See Documentation/vm/ksm.txt for more information: KSM is inactive | ||
| 225 | until a program has madvised that an area is MADV_MERGEABLE, and | ||
| 226 | root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). | ||
| 227 | |||
| 228 | config DEFAULT_MMAP_MIN_ADDR | ||
| 229 | int "Low address space to protect from user allocation" | ||
| 230 | default 4096 | ||
| 231 | help | ||
| 232 | This is the portion of low virtual memory which should be protected | ||
| 233 | from userspace allocation. Keeping a user from writing to low pages | ||
| 234 | can help reduce the impact of kernel NULL pointer bugs. | ||
| 235 | |||
| 236 | For most ia64, ppc64 and x86 users with lots of address space | ||
| 237 | a value of 65536 is reasonable and should cause no problems. | ||
| 238 | On arm and other archs it should not be higher than 32768. | ||
| 239 | Programs which use vm86 functionality or have some need to map | ||
| 240 | this low address space will need CAP_SYS_RAWIO or disable this | ||
| 241 | protection by setting the value to 0. | ||
| 242 | |||
| 243 | This value can be changed after boot using the | ||
| 244 | /proc/sys/vm/mmap_min_addr tunable. | ||
| 245 | |||
| 246 | config ARCH_SUPPORTS_MEMORY_FAILURE | ||
| 247 | bool | ||
| 248 | |||
| 249 | config MEMORY_FAILURE | ||
| 250 | depends on MMU | ||
| 251 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | ||
| 252 | bool "Enable recovery from hardware memory errors" | ||
| 253 | help | ||
| 254 | Enables code to recover from some memory failures on systems | ||
| 255 | with MCA recovery. This allows a system to continue running | ||
| 256 | even when some of its memory has uncorrected errors. This requires | ||
| 257 | special hardware support and typically ECC memory. | ||
| 258 | |||
| 259 | config HWPOISON_INJECT | ||
| 260 | tristate "Poison pages injector" | ||
| 261 | depends on MEMORY_FAILURE && DEBUG_KERNEL | ||
| 262 | |||
| 263 | config NOMMU_INITIAL_TRIM_EXCESS | ||
| 264 | int "Turn on mmap() excess space trimming before booting" | ||
| 265 | depends on !MMU | ||
| 266 | default 1 | ||
| 267 | help | ||
| 268 | The NOMMU mmap() frequently needs to allocate large contiguous chunks | ||
| 269 | of memory on which to store mappings, but it can only ask the system | ||
| 270 | allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently | ||
| 271 | more than it requires. To deal with this, mmap() is able to trim off | ||
| 272 | the excess and return it to the allocator. | ||
| 273 | |||
| 274 | If trimming is enabled, the excess is trimmed off and returned to the | ||
| 275 | system allocator, which can cause extra fragmentation, particularly | ||
| 276 | if there are a lot of transient processes. | ||
| 277 | |||
| 278 | If trimming is disabled, the excess is kept, but not used, which for | ||
| 279 | long-term mappings means that the space is wasted. | ||
| 280 | |||
| 281 | Trimming can be dynamically controlled through a sysctl option | ||
| 282 | (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of | ||
| 283 | excess pages there must be before trimming should occur, or zero if | ||
| 284 | no trimming is to occur. | ||
| 285 | |||
| 286 | This option specifies the initial value of this option. The default | ||
| 287 | of 1 says that all excess pages should be trimmed. | ||
| 288 | |||
| 289 | See Documentation/nommu-mmap.txt for more information. | ||
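The KSM help text above describes a strictly opt-in interface: nothing is merged until an application marks an area with madvise(MADV_MERGEABLE) and root starts the scanner through sysfs. A minimal userspace sketch of that opt-in follows; the buffer size and fill pattern are invented for illustration, and the fallback define is only needed where libc headers predate MADV_MERGEABLE.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #ifndef MADV_MERGEABLE
    #define MADV_MERGEABLE 12       /* value from asm-generic/mman-common.h */
    #endif

    int main(void)
    {
        size_t len = 64 << 20;      /* hypothetical: 64 MB of duplicate pages */
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        memset(buf, 0x5a, len);     /* identical page contents, so KSM can merge them */

        if (madvise(buf, len, MADV_MERGEABLE))
            perror("madvise(MADV_MERGEABLE)");

        /*
         * Nothing is merged until root enables scanning:
         *     echo 1 > /sys/kernel/mm/ksm/run
         * Progress is then visible in /sys/kernel/mm/ksm/pages_sharing.
         */
        pause();
        return 0;
    }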
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index c8d62d49a44e..af7cfb43d2f0 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
| @@ -1,3 +1,13 @@ | |||
| 1 | config DEBUG_PAGEALLOC | ||
| 2 | bool "Debug page memory allocations" | ||
| 3 | depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC | ||
| 4 | depends on !HIBERNATION || !PPC && !SPARC | ||
| 5 | depends on !KMEMCHECK | ||
| 6 | ---help--- | ||
| 7 | Unmap pages from the kernel linear mapping after free_pages(). | ||
| 8 | This results in a large slowdown, but helps to find certain types | ||
| 9 | of memory corruption. | ||
| 10 | |||
| 1 | config WANT_PAGE_DEBUG_FLAGS | 11 | config WANT_PAGE_DEBUG_FLAGS |
| 2 | bool | 12 | bool |
| 3 | 13 | ||
| @@ -7,11 +17,11 @@ config PAGE_POISONING | |||
| 7 | depends on !HIBERNATION | 17 | depends on !HIBERNATION |
| 8 | select DEBUG_PAGEALLOC | 18 | select DEBUG_PAGEALLOC |
| 9 | select WANT_PAGE_DEBUG_FLAGS | 19 | select WANT_PAGE_DEBUG_FLAGS |
| 10 | help | 20 | ---help--- |
| 11 | Fill the pages with poison patterns after free_pages() and verify | 21 | Fill the pages with poison patterns after free_pages() and verify |
| 12 | the patterns before alloc_pages(). This results in a large slowdown, | 22 | the patterns before alloc_pages(). This results in a large slowdown, |
| 13 | but helps to find certain types of memory corruptions. | 23 | but helps to find certain types of memory corruption. |
| 14 | 24 | ||
| 15 | This option cannot enalbe with hibernation. Otherwise, it will get | 25 | This option cannot be enabled in combination with hibernation as |
| 16 | wrong messages for memory corruption because the free pages are not | 26 | that would result in incorrect warnings of memory corruption after |
| 17 | saved to the suspend image. | 27 | a resume because free pages are not saved to the suspend image. |
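The poisoning scheme these options describe is deliberately simple: fill a page with a known byte pattern when it is freed, then verify the pattern when the page is handed out again, so a stray write to a free page is caught at the next allocation. The standalone sketch below only illustrates that idea; it is not the mm/debug-pagealloc.c code, and the 0xaa pattern is assumed to mirror the kernel's PAGE_POISON value.

    #include <stdbool.h>
    #include <stddef.h>
    #include <string.h>

    #define POISON_BYTE 0xaa    /* assumed to mirror the kernel's PAGE_POISON */

    /* On free: overwrite the page with the poison pattern. */
    void poison_page(unsigned char *page, size_t size)
    {
        memset(page, POISON_BYTE, size);
    }

    /* On the next allocation: any byte that lost the pattern means
     * something wrote to the page while it was supposedly free. */
    bool page_poison_intact(const unsigned char *page, size_t size)
    {
        size_t i;

        for (i = 0; i < size; i++)
            if (page[i] != POISON_BYTE)
                return false;
        return true;
    }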
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b6015..ebf849042ed3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -5,15 +5,16 @@ | |||
| 5 | mmu-y := nommu.o | 5 | mmu-y := nommu.o |
| 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | 6 | mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ |
| 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ | 7 | mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ |
| 8 | vmalloc.o | 8 | vmalloc.o pagewalk.o |
| 9 | 9 | ||
| 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
| 11 | maccess.o page_alloc.o page-writeback.o pdflush.o \ | 11 | maccess.o page_alloc.o page-writeback.o \ |
| 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 12 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
| 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
| 14 | page_isolation.o mm_init.o $(mmu-y) | 14 | page_isolation.o mm_init.o mmu_context.o \ |
| 15 | $(mmu-y) | ||
| 16 | obj-y += init-mm.o | ||
| 15 | 17 | ||
| 16 | obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o | ||
| 17 | obj-$(CONFIG_BOUNCE) += bounce.o | 18 | obj-$(CONFIG_BOUNCE) += bounce.o |
| 18 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 19 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
| 19 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 20 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
| @@ -24,17 +25,23 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | |||
| 24 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 25 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
| 25 | obj-$(CONFIG_SLOB) += slob.o | 26 | obj-$(CONFIG_SLOB) += slob.o |
| 26 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o | 27 | obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o |
| 28 | obj-$(CONFIG_KSM) += ksm.o | ||
| 27 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o | 29 | obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o |
| 28 | obj-$(CONFIG_SLAB) += slab.o | 30 | obj-$(CONFIG_SLAB) += slab.o |
| 29 | obj-$(CONFIG_SLUB) += slub.o | 31 | obj-$(CONFIG_SLUB) += slub.o |
| 32 | obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | ||
| 30 | obj-$(CONFIG_FAILSLAB) += failslab.o | 33 | obj-$(CONFIG_FAILSLAB) += failslab.o |
| 31 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 34 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 32 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 35 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
| 33 | obj-$(CONFIG_MIGRATION) += migrate.o | 36 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 34 | ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA | 37 | ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA |
| 35 | obj-$(CONFIG_SMP) += percpu.o | 38 | obj-$(CONFIG_SMP) += percpu.o |
| 36 | else | 39 | else |
| 37 | obj-$(CONFIG_SMP) += allocpercpu.o | 40 | obj-$(CONFIG_SMP) += allocpercpu.o |
| 38 | endif | 41 | endif |
| 39 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 42 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 40 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 43 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o |
| 44 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | ||
| 45 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | ||
| 46 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | ||
| 47 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | ||
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 139d5b7b6621..df34ceae0c67 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
| @@ -5,6 +5,8 @@ | |||
| 5 | */ | 5 | */ |
| 6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
| 7 | #include <linux/module.h> | 7 | #include <linux/module.h> |
| 8 | #include <linux/bootmem.h> | ||
| 9 | #include <asm/sections.h> | ||
| 8 | 10 | ||
| 9 | #ifndef cache_line_size | 11 | #ifndef cache_line_size |
| 10 | #define cache_line_size() L1_CACHE_BYTES | 12 | #define cache_line_size() L1_CACHE_BYTES |
| @@ -31,7 +33,7 @@ static void percpu_depopulate(void *__pdata, int cpu) | |||
| 31 | * @__pdata: per-cpu data to depopulate | 33 | * @__pdata: per-cpu data to depopulate |
| 32 | * @mask: depopulate per-cpu data for cpu's selected through mask bits | 34 | * @mask: depopulate per-cpu data for cpu's selected through mask bits |
| 33 | */ | 35 | */ |
| 34 | static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) | 36 | static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) |
| 35 | { | 37 | { |
| 36 | int cpu; | 38 | int cpu; |
| 37 | for_each_cpu_mask_nr(cpu, *mask) | 39 | for_each_cpu_mask_nr(cpu, *mask) |
| @@ -147,3 +149,29 @@ void free_percpu(void *__pdata) | |||
| 147 | kfree(__percpu_disguise(__pdata)); | 149 | kfree(__percpu_disguise(__pdata)); |
| 148 | } | 150 | } |
| 149 | EXPORT_SYMBOL_GPL(free_percpu); | 151 | EXPORT_SYMBOL_GPL(free_percpu); |
| 152 | |||
| 153 | /* | ||
| 154 | * Generic percpu area setup. | ||
| 155 | */ | ||
| 156 | #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA | ||
| 157 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | ||
| 158 | |||
| 159 | EXPORT_SYMBOL(__per_cpu_offset); | ||
| 160 | |||
| 161 | void __init setup_per_cpu_areas(void) | ||
| 162 | { | ||
| 163 | unsigned long size, i; | ||
| 164 | char *ptr; | ||
| 165 | unsigned long nr_possible_cpus = num_possible_cpus(); | ||
| 166 | |||
| 167 | /* Copy section for each CPU (we discard the original) */ | ||
| 168 | size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); | ||
| 169 | ptr = alloc_bootmem_pages(size * nr_possible_cpus); | ||
| 170 | |||
| 171 | for_each_possible_cpu(i) { | ||
| 172 | __per_cpu_offset[i] = ptr - __per_cpu_start; | ||
| 173 | memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); | ||
| 174 | ptr += size; | ||
| 175 | } | ||
| 176 | } | ||
| 177 | #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ | ||
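The generic setup_per_cpu_areas() added above lays out one copy of the static per-cpu section per possible CPU and records each copy's distance from the original in __per_cpu_offset[], so a per-CPU access reduces to pointer arithmetic against that table. The standalone model below illustrates the idea; the demo_* names, the sizes, and the fake "counter" variable are invented, and the real kernel hides the same arithmetic behind per_cpu()/per_cpu_ptr().

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NR_DEMO_CPUS 4

    static char percpu_template[64];          /* stands in for the .data.percpu section */
    static long percpu_offset[NR_DEMO_CPUS];  /* stands in for __per_cpu_offset[] */

    /* Pretend an int called "counter" sits at offset 0 of the per-cpu section. */
    #define counter (*(int *)percpu_template)

    /* Rough equivalent of per_cpu(): shift the template address by the CPU's offset. */
    #define demo_per_cpu(var, cpu) \
        (*(typeof(var) *)((char *)&(var) + percpu_offset[(cpu)]))

    static void demo_setup_per_cpu_areas(void)
    {
        /* One copy of the section per CPU, exactly as the code above does. */
        char *area = malloc(sizeof(percpu_template) * NR_DEMO_CPUS);
        int cpu;

        for (cpu = 0; cpu < NR_DEMO_CPUS; cpu++) {
            percpu_offset[cpu] = area - percpu_template;
            memcpy(area, percpu_template, sizeof(percpu_template));
            area += sizeof(percpu_template);
        }
    }

    int main(void)
    {
        demo_setup_per_cpu_areas();
        demo_per_cpu(counter, 2) = 42;
        printf("counter on cpu 2 = %d\n", demo_per_cpu(counter, 2));
        return 0;
    }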
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index be68c956a660..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
| @@ -1,8 +1,11 @@ | |||
| 1 | 1 | ||
| 2 | #include <linux/wait.h> | 2 | #include <linux/wait.h> |
| 3 | #include <linux/backing-dev.h> | 3 | #include <linux/backing-dev.h> |
| 4 | #include <linux/kthread.h> | ||
| 5 | #include <linux/freezer.h> | ||
| 4 | #include <linux/fs.h> | 6 | #include <linux/fs.h> |
| 5 | #include <linux/pagemap.h> | 7 | #include <linux/pagemap.h> |
| 8 | #include <linux/mm.h> | ||
| 6 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
| 7 | #include <linux/module.h> | 10 | #include <linux/module.h> |
| 8 | #include <linux/writeback.h> | 11 | #include <linux/writeback.h> |
| @@ -14,6 +17,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | |||
| 14 | EXPORT_SYMBOL(default_unplug_io_fn); | 17 | EXPORT_SYMBOL(default_unplug_io_fn); |
| 15 | 18 | ||
| 16 | struct backing_dev_info default_backing_dev_info = { | 19 | struct backing_dev_info default_backing_dev_info = { |
| 20 | .name = "default", | ||
| 17 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, | 21 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
| 18 | .state = 0, | 22 | .state = 0, |
| 19 | .capabilities = BDI_CAP_MAP_COPY, | 23 | .capabilities = BDI_CAP_MAP_COPY, |
| @@ -23,6 +27,24 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info); | |||
| 23 | 27 | ||
| 24 | static struct class *bdi_class; | 28 | static struct class *bdi_class; |
| 25 | 29 | ||
| 30 | /* | ||
| 31 | * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as | ||
| 32 | * reader side protection for bdi_pending_list. bdi_list has RCU reader side | ||
| 33 | * locking. | ||
| 34 | */ | ||
| 35 | DEFINE_SPINLOCK(bdi_lock); | ||
| 36 | LIST_HEAD(bdi_list); | ||
| 37 | LIST_HEAD(bdi_pending_list); | ||
| 38 | |||
| 39 | static struct task_struct *sync_supers_tsk; | ||
| 40 | static struct timer_list sync_supers_timer; | ||
| 41 | |||
| 42 | static int bdi_sync_supers(void *); | ||
| 43 | static void sync_supers_timer_fn(unsigned long); | ||
| 44 | static void arm_supers_timer(void); | ||
| 45 | |||
| 46 | static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); | ||
| 47 | |||
| 26 | #ifdef CONFIG_DEBUG_FS | 48 | #ifdef CONFIG_DEBUG_FS |
| 27 | #include <linux/debugfs.h> | 49 | #include <linux/debugfs.h> |
| 28 | #include <linux/seq_file.h> | 50 | #include <linux/seq_file.h> |
| @@ -37,9 +59,29 @@ static void bdi_debug_init(void) | |||
| 37 | static int bdi_debug_stats_show(struct seq_file *m, void *v) | 59 | static int bdi_debug_stats_show(struct seq_file *m, void *v) |
| 38 | { | 60 | { |
| 39 | struct backing_dev_info *bdi = m->private; | 61 | struct backing_dev_info *bdi = m->private; |
| 62 | struct bdi_writeback *wb; | ||
| 40 | unsigned long background_thresh; | 63 | unsigned long background_thresh; |
| 41 | unsigned long dirty_thresh; | 64 | unsigned long dirty_thresh; |
| 42 | unsigned long bdi_thresh; | 65 | unsigned long bdi_thresh; |
| 66 | unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; | ||
| 67 | struct inode *inode; | ||
| 68 | |||
| 69 | /* | ||
| 70 | * inode lock is enough here, the bdi->wb_list is protected by | ||
| 71 | * RCU on the reader side | ||
| 72 | */ | ||
| 73 | nr_wb = nr_dirty = nr_io = nr_more_io = 0; | ||
| 74 | spin_lock(&inode_lock); | ||
| 75 | list_for_each_entry(wb, &bdi->wb_list, list) { | ||
| 76 | nr_wb++; | ||
| 77 | list_for_each_entry(inode, &wb->b_dirty, i_list) | ||
| 78 | nr_dirty++; | ||
| 79 | list_for_each_entry(inode, &wb->b_io, i_list) | ||
| 80 | nr_io++; | ||
| 81 | list_for_each_entry(inode, &wb->b_more_io, i_list) | ||
| 82 | nr_more_io++; | ||
| 83 | } | ||
| 84 | spin_unlock(&inode_lock); | ||
| 43 | 85 | ||
| 44 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); | 86 | get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); |
| 45 | 87 | ||
| @@ -49,12 +91,22 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) | |||
| 49 | "BdiReclaimable: %8lu kB\n" | 91 | "BdiReclaimable: %8lu kB\n" |
| 50 | "BdiDirtyThresh: %8lu kB\n" | 92 | "BdiDirtyThresh: %8lu kB\n" |
| 51 | "DirtyThresh: %8lu kB\n" | 93 | "DirtyThresh: %8lu kB\n" |
| 52 | "BackgroundThresh: %8lu kB\n", | 94 | "BackgroundThresh: %8lu kB\n" |
| 95 | "WritebackThreads: %8lu\n" | ||
| 96 | "b_dirty: %8lu\n" | ||
| 97 | "b_io: %8lu\n" | ||
| 98 | "b_more_io: %8lu\n" | ||
| 99 | "bdi_list: %8u\n" | ||
| 100 | "state: %8lx\n" | ||
| 101 | "wb_mask: %8lx\n" | ||
| 102 | "wb_list: %8u\n" | ||
| 103 | "wb_cnt: %8u\n", | ||
| 53 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), | 104 | (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), |
| 54 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), | 105 | (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), |
| 55 | K(bdi_thresh), | 106 | K(bdi_thresh), K(dirty_thresh), |
| 56 | K(dirty_thresh), | 107 | K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, |
| 57 | K(background_thresh)); | 108 | !list_empty(&bdi->bdi_list), bdi->state, bdi->wb_mask, |
| 109 | !list_empty(&bdi->wb_list), bdi->wb_cnt); | ||
| 58 | #undef K | 110 | #undef K |
| 59 | 111 | ||
| 60 | return 0; | 112 | return 0; |
| @@ -185,6 +237,13 @@ static int __init default_bdi_init(void) | |||
| 185 | { | 237 | { |
| 186 | int err; | 238 | int err; |
| 187 | 239 | ||
| 240 | sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); | ||
| 241 | BUG_ON(IS_ERR(sync_supers_tsk)); | ||
| 242 | |||
| 243 | init_timer(&sync_supers_timer); | ||
| 244 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); | ||
| 245 | arm_supers_timer(); | ||
| 246 | |||
| 188 | err = bdi_init(&default_backing_dev_info); | 247 | err = bdi_init(&default_backing_dev_info); |
| 189 | if (!err) | 248 | if (!err) |
| 190 | bdi_register(&default_backing_dev_info, NULL, "default"); | 249 | bdi_register(&default_backing_dev_info, NULL, "default"); |
| @@ -193,6 +252,279 @@ static int __init default_bdi_init(void) | |||
| 193 | } | 252 | } |
| 194 | subsys_initcall(default_bdi_init); | 253 | subsys_initcall(default_bdi_init); |
| 195 | 254 | ||
| 255 | static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) | ||
| 256 | { | ||
| 257 | memset(wb, 0, sizeof(*wb)); | ||
| 258 | |||
| 259 | wb->bdi = bdi; | ||
| 260 | wb->last_old_flush = jiffies; | ||
| 261 | INIT_LIST_HEAD(&wb->b_dirty); | ||
| 262 | INIT_LIST_HEAD(&wb->b_io); | ||
| 263 | INIT_LIST_HEAD(&wb->b_more_io); | ||
| 264 | } | ||
| 265 | |||
| 266 | static void bdi_task_init(struct backing_dev_info *bdi, | ||
| 267 | struct bdi_writeback *wb) | ||
| 268 | { | ||
| 269 | struct task_struct *tsk = current; | ||
| 270 | |||
| 271 | spin_lock(&bdi->wb_lock); | ||
| 272 | list_add_tail_rcu(&wb->list, &bdi->wb_list); | ||
| 273 | spin_unlock(&bdi->wb_lock); | ||
| 274 | |||
| 275 | tsk->flags |= PF_FLUSHER | PF_SWAPWRITE; | ||
| 276 | set_freezable(); | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Our parent may run at a different priority, just set us to normal | ||
| 280 | */ | ||
| 281 | set_user_nice(tsk, 0); | ||
| 282 | } | ||
| 283 | |||
| 284 | static int bdi_start_fn(void *ptr) | ||
| 285 | { | ||
| 286 | struct bdi_writeback *wb = ptr; | ||
| 287 | struct backing_dev_info *bdi = wb->bdi; | ||
| 288 | int ret; | ||
| 289 | |||
| 290 | /* | ||
| 291 | * Add us to the active bdi_list | ||
| 292 | */ | ||
| 293 | spin_lock_bh(&bdi_lock); | ||
| 294 | list_add_rcu(&bdi->bdi_list, &bdi_list); | ||
| 295 | spin_unlock_bh(&bdi_lock); | ||
| 296 | |||
| 297 | bdi_task_init(bdi, wb); | ||
| 298 | |||
| 299 | /* | ||
| 300 | * Clear pending bit and wakeup anybody waiting to tear us down | ||
| 301 | */ | ||
| 302 | clear_bit(BDI_pending, &bdi->state); | ||
| 303 | smp_mb__after_clear_bit(); | ||
| 304 | wake_up_bit(&bdi->state, BDI_pending); | ||
| 305 | |||
| 306 | ret = bdi_writeback_task(wb); | ||
| 307 | |||
| 308 | /* | ||
| 309 | * Remove us from the list | ||
| 310 | */ | ||
| 311 | spin_lock(&bdi->wb_lock); | ||
| 312 | list_del_rcu(&wb->list); | ||
| 313 | spin_unlock(&bdi->wb_lock); | ||
| 314 | |||
| 315 | /* | ||
| 316 | * Flush any work that raced with us exiting. No new work | ||
| 317 | * will be added, since this bdi isn't discoverable anymore. | ||
| 318 | */ | ||
| 319 | if (!list_empty(&bdi->work_list)) | ||
| 320 | wb_do_writeback(wb, 1); | ||
| 321 | |||
| 322 | wb->task = NULL; | ||
| 323 | return ret; | ||
| 324 | } | ||
| 325 | |||
| 326 | int bdi_has_dirty_io(struct backing_dev_info *bdi) | ||
| 327 | { | ||
| 328 | return wb_has_dirty_io(&bdi->wb); | ||
| 329 | } | ||
| 330 | |||
| 331 | static void bdi_flush_io(struct backing_dev_info *bdi) | ||
| 332 | { | ||
| 333 | struct writeback_control wbc = { | ||
| 334 | .bdi = bdi, | ||
| 335 | .sync_mode = WB_SYNC_NONE, | ||
| 336 | .older_than_this = NULL, | ||
| 337 | .range_cyclic = 1, | ||
| 338 | .nr_to_write = 1024, | ||
| 339 | }; | ||
| 340 | |||
| 341 | writeback_inodes_wbc(&wbc); | ||
| 342 | } | ||
| 343 | |||
| 344 | /* | ||
| 345 | * kupdated() used to do this. We cannot do it from the bdi_forker_task() | ||
| 346 | * or we risk deadlocking on ->s_umount. The longer term solution would be | ||
| 347 | * to implement sync_supers_bdi() or similar and simply do it from the | ||
| 348 | * bdi writeback tasks individually. | ||
| 349 | */ | ||
| 350 | static int bdi_sync_supers(void *unused) | ||
| 351 | { | ||
| 352 | set_user_nice(current, 0); | ||
| 353 | |||
| 354 | while (!kthread_should_stop()) { | ||
| 355 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 356 | schedule(); | ||
| 357 | |||
| 358 | /* | ||
| 359 | * Do this periodically, like kupdated() did before. | ||
| 360 | */ | ||
| 361 | sync_supers(); | ||
| 362 | } | ||
| 363 | |||
| 364 | return 0; | ||
| 365 | } | ||
| 366 | |||
| 367 | static void arm_supers_timer(void) | ||
| 368 | { | ||
| 369 | unsigned long next; | ||
| 370 | |||
| 371 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; | ||
| 372 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); | ||
| 373 | } | ||
| 374 | |||
| 375 | static void sync_supers_timer_fn(unsigned long unused) | ||
| 376 | { | ||
| 377 | wake_up_process(sync_supers_tsk); | ||
| 378 | arm_supers_timer(); | ||
| 379 | } | ||
| 380 | |||
| 381 | static int bdi_forker_task(void *ptr) | ||
| 382 | { | ||
| 383 | struct bdi_writeback *me = ptr; | ||
| 384 | |||
| 385 | bdi_task_init(me->bdi, me); | ||
| 386 | |||
| 387 | for (;;) { | ||
| 388 | struct backing_dev_info *bdi, *tmp; | ||
| 389 | struct bdi_writeback *wb; | ||
| 390 | |||
| 391 | /* | ||
| 392 | * Temporary measure, we want to make sure we don't see | ||
| 393 | * dirty data on the default backing_dev_info | ||
| 394 | */ | ||
| 395 | if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) | ||
| 396 | wb_do_writeback(me, 0); | ||
| 397 | |||
| 398 | spin_lock_bh(&bdi_lock); | ||
| 399 | |||
| 400 | /* | ||
| 401 | * Check if any existing bdi's have dirty data without | ||
| 402 | * a thread registered. If so, set that up. | ||
| 403 | */ | ||
| 404 | list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { | ||
| 405 | if (bdi->wb.task) | ||
| 406 | continue; | ||
| 407 | if (list_empty(&bdi->work_list) && | ||
| 408 | !bdi_has_dirty_io(bdi)) | ||
| 409 | continue; | ||
| 410 | |||
| 411 | bdi_add_default_flusher_task(bdi); | ||
| 412 | } | ||
| 413 | |||
| 414 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 415 | |||
| 416 | if (list_empty(&bdi_pending_list)) { | ||
| 417 | unsigned long wait; | ||
| 418 | |||
| 419 | spin_unlock_bh(&bdi_lock); | ||
| 420 | wait = msecs_to_jiffies(dirty_writeback_interval * 10); | ||
| 421 | schedule_timeout(wait); | ||
| 422 | try_to_freeze(); | ||
| 423 | continue; | ||
| 424 | } | ||
| 425 | |||
| 426 | __set_current_state(TASK_RUNNING); | ||
| 427 | |||
| 428 | /* | ||
| 429 | * This is our real job - check for pending entries in | ||
| 430 | * bdi_pending_list, and create the tasks that got added | ||
| 431 | */ | ||
| 432 | bdi = list_entry(bdi_pending_list.next, struct backing_dev_info, | ||
| 433 | bdi_list); | ||
| 434 | list_del_init(&bdi->bdi_list); | ||
| 435 | spin_unlock_bh(&bdi_lock); | ||
| 436 | |||
| 437 | wb = &bdi->wb; | ||
| 438 | wb->task = kthread_run(bdi_start_fn, wb, "flush-%s", | ||
| 439 | dev_name(bdi->dev)); | ||
| 440 | /* | ||
| 441 | * If task creation fails, then readd the bdi to | ||
| 442 | * the pending list and force writeout of the bdi | ||
| 443 | * from this forker thread. That will free some memory | ||
| 444 | * and we can try again. | ||
| 445 | */ | ||
| 446 | if (IS_ERR(wb->task)) { | ||
| 447 | wb->task = NULL; | ||
| 448 | |||
| 449 | /* | ||
| 450 | * Add this 'bdi' to the back, so we get | ||
| 451 | * a chance to flush other bdi's to free | ||
| 452 | * memory. | ||
| 453 | */ | ||
| 454 | spin_lock_bh(&bdi_lock); | ||
| 455 | list_add_tail(&bdi->bdi_list, &bdi_pending_list); | ||
| 456 | spin_unlock_bh(&bdi_lock); | ||
| 457 | |||
| 458 | bdi_flush_io(bdi); | ||
| 459 | } | ||
| 460 | } | ||
| 461 | |||
| 462 | return 0; | ||
| 463 | } | ||
| 464 | |||
| 465 | static void bdi_add_to_pending(struct rcu_head *head) | ||
| 466 | { | ||
| 467 | struct backing_dev_info *bdi; | ||
| 468 | |||
| 469 | bdi = container_of(head, struct backing_dev_info, rcu_head); | ||
| 470 | INIT_LIST_HEAD(&bdi->bdi_list); | ||
| 471 | |||
| 472 | spin_lock(&bdi_lock); | ||
| 473 | list_add_tail(&bdi->bdi_list, &bdi_pending_list); | ||
| 474 | spin_unlock(&bdi_lock); | ||
| 475 | |||
| 476 | /* | ||
| 477 | * We are now on the pending list, wake up bdi_forker_task() | ||
| 478 | * to finish the job and add us back to the active bdi_list | ||
| 479 | */ | ||
| 480 | wake_up_process(default_backing_dev_info.wb.task); | ||
| 481 | } | ||
| 482 | |||
| 483 | /* | ||
| 484 | * Add the default flusher task that gets created for any bdi | ||
| 485 | * that has dirty data pending writeout | ||
| 486 | */ | ||
| 487 | void static bdi_add_default_flusher_task(struct backing_dev_info *bdi) | ||
| 488 | { | ||
| 489 | if (!bdi_cap_writeback_dirty(bdi)) | ||
| 490 | return; | ||
| 491 | |||
| 492 | if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) { | ||
| 493 | printk(KERN_ERR "bdi %p/%s is not registered!\n", | ||
| 494 | bdi, bdi->name); | ||
| 495 | return; | ||
| 496 | } | ||
| 497 | |||
| 498 | /* | ||
| 499 | * Check with the helper whether to proceed adding a task. Will only | ||
| 500 | * abort if we two or more simultanous calls to | ||
| 501 | * bdi_add_default_flusher_task() occured, further additions will block | ||
| 502 | * waiting for previous additions to finish. | ||
| 503 | */ | ||
| 504 | if (!test_and_set_bit(BDI_pending, &bdi->state)) { | ||
| 505 | list_del_rcu(&bdi->bdi_list); | ||
| 506 | |||
| 507 | /* | ||
| 508 | * We must wait for the current RCU period to end before | ||
| 509 | * moving to the pending list. So schedule that operation | ||
| 510 | * from an RCU callback. | ||
| 511 | */ | ||
| 512 | call_rcu(&bdi->rcu_head, bdi_add_to_pending); | ||
| 513 | } | ||
| 514 | } | ||
| 515 | |||
| 516 | /* | ||
| 517 | * Remove bdi from bdi_list, and ensure that it is no longer visible | ||
| 518 | */ | ||
| 519 | static void bdi_remove_from_list(struct backing_dev_info *bdi) | ||
| 520 | { | ||
| 521 | spin_lock_bh(&bdi_lock); | ||
| 522 | list_del_rcu(&bdi->bdi_list); | ||
| 523 | spin_unlock_bh(&bdi_lock); | ||
| 524 | |||
| 525 | synchronize_rcu(); | ||
| 526 | } | ||
| 527 | |||
| 196 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, | 528 | int bdi_register(struct backing_dev_info *bdi, struct device *parent, |
| 197 | const char *fmt, ...) | 529 | const char *fmt, ...) |
| 198 | { | 530 | { |
| @@ -211,9 +543,33 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, | |||
| 211 | goto exit; | 543 | goto exit; |
| 212 | } | 544 | } |
| 213 | 545 | ||
| 546 | spin_lock_bh(&bdi_lock); | ||
| 547 | list_add_tail_rcu(&bdi->bdi_list, &bdi_list); | ||
| 548 | spin_unlock_bh(&bdi_lock); | ||
| 549 | |||
| 214 | bdi->dev = dev; | 550 | bdi->dev = dev; |
| 215 | bdi_debug_register(bdi, dev_name(dev)); | ||
| 216 | 551 | ||
| 552 | /* | ||
| 553 | * Just start the forker thread for our default backing_dev_info, | ||
| 554 | * and add other bdi's to the list. They will get a thread created | ||
| 555 | * on-demand when they need it. | ||
| 556 | */ | ||
| 557 | if (bdi_cap_flush_forker(bdi)) { | ||
| 558 | struct bdi_writeback *wb = &bdi->wb; | ||
| 559 | |||
| 560 | wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", | ||
| 561 | dev_name(dev)); | ||
| 562 | if (IS_ERR(wb->task)) { | ||
| 563 | wb->task = NULL; | ||
| 564 | ret = -ENOMEM; | ||
| 565 | |||
| 566 | bdi_remove_from_list(bdi); | ||
| 567 | goto exit; | ||
| 568 | } | ||
| 569 | } | ||
| 570 | |||
| 571 | bdi_debug_register(bdi, dev_name(dev)); | ||
| 572 | set_bit(BDI_registered, &bdi->state); | ||
| 217 | exit: | 573 | exit: |
| 218 | return ret; | 574 | return ret; |
| 219 | } | 575 | } |
| @@ -225,9 +581,61 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) | |||
| 225 | } | 581 | } |
| 226 | EXPORT_SYMBOL(bdi_register_dev); | 582 | EXPORT_SYMBOL(bdi_register_dev); |
| 227 | 583 | ||
| 584 | /* | ||
| 585 | * Remove bdi from the global list and shutdown any threads we have running | ||
| 586 | */ | ||
| 587 | static void bdi_wb_shutdown(struct backing_dev_info *bdi) | ||
| 588 | { | ||
| 589 | struct bdi_writeback *wb; | ||
| 590 | |||
| 591 | if (!bdi_cap_writeback_dirty(bdi)) | ||
| 592 | return; | ||
| 593 | |||
| 594 | /* | ||
| 595 | * If setup is pending, wait for that to complete first | ||
| 596 | */ | ||
| 597 | wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, | ||
| 598 | TASK_UNINTERRUPTIBLE); | ||
| 599 | |||
| 600 | /* | ||
| 601 | * Make sure nobody finds us on the bdi_list anymore | ||
| 602 | */ | ||
| 603 | bdi_remove_from_list(bdi); | ||
| 604 | |||
| 605 | /* | ||
| 606 | * Finally, kill the kernel threads. We don't need to be RCU | ||
| 607 | * safe anymore, since the bdi is gone from visibility. Force | ||
| 608 | * unfreeze of the thread before calling kthread_stop(), otherwise | ||
| 609 | * it would never exet if it is currently stuck in the refrigerator. | ||
| 610 | */ | ||
| 611 | list_for_each_entry(wb, &bdi->wb_list, list) { | ||
| 612 | thaw_process(wb->task); | ||
| 613 | kthread_stop(wb->task); | ||
| 614 | } | ||
| 615 | } | ||
| 616 | |||
| 617 | /* | ||
| 618 | * This bdi is going away now, make sure that no super_blocks point to it | ||
| 619 | */ | ||
| 620 | static void bdi_prune_sb(struct backing_dev_info *bdi) | ||
| 621 | { | ||
| 622 | struct super_block *sb; | ||
| 623 | |||
| 624 | spin_lock(&sb_lock); | ||
| 625 | list_for_each_entry(sb, &super_blocks, s_list) { | ||
| 626 | if (sb->s_bdi == bdi) | ||
| 627 | sb->s_bdi = NULL; | ||
| 628 | } | ||
| 629 | spin_unlock(&sb_lock); | ||
| 630 | } | ||
| 631 | |||
| 228 | void bdi_unregister(struct backing_dev_info *bdi) | 632 | void bdi_unregister(struct backing_dev_info *bdi) |
| 229 | { | 633 | { |
| 230 | if (bdi->dev) { | 634 | if (bdi->dev) { |
| 635 | bdi_prune_sb(bdi); | ||
| 636 | |||
| 637 | if (!bdi_cap_flush_forker(bdi)) | ||
| 638 | bdi_wb_shutdown(bdi); | ||
| 231 | bdi_debug_unregister(bdi); | 639 | bdi_debug_unregister(bdi); |
| 232 | device_unregister(bdi->dev); | 640 | device_unregister(bdi->dev); |
| 233 | bdi->dev = NULL; | 641 | bdi->dev = NULL; |
| @@ -237,14 +645,26 @@ EXPORT_SYMBOL(bdi_unregister); | |||
| 237 | 645 | ||
| 238 | int bdi_init(struct backing_dev_info *bdi) | 646 | int bdi_init(struct backing_dev_info *bdi) |
| 239 | { | 647 | { |
| 240 | int i; | 648 | int i, err; |
| 241 | int err; | ||
| 242 | 649 | ||
| 243 | bdi->dev = NULL; | 650 | bdi->dev = NULL; |
| 244 | 651 | ||
| 245 | bdi->min_ratio = 0; | 652 | bdi->min_ratio = 0; |
| 246 | bdi->max_ratio = 100; | 653 | bdi->max_ratio = 100; |
| 247 | bdi->max_prop_frac = PROP_FRAC_BASE; | 654 | bdi->max_prop_frac = PROP_FRAC_BASE; |
| 655 | spin_lock_init(&bdi->wb_lock); | ||
| 656 | INIT_RCU_HEAD(&bdi->rcu_head); | ||
| 657 | INIT_LIST_HEAD(&bdi->bdi_list); | ||
| 658 | INIT_LIST_HEAD(&bdi->wb_list); | ||
| 659 | INIT_LIST_HEAD(&bdi->work_list); | ||
| 660 | |||
| 661 | bdi_wb_init(&bdi->wb, bdi); | ||
| 662 | |||
| 663 | /* | ||
| 664 | * Just one thread support for now, hard code mask and count | ||
| 665 | */ | ||
| 666 | bdi->wb_mask = 1; | ||
| 667 | bdi->wb_cnt = 1; | ||
| 248 | 668 | ||
| 249 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | 669 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { |
| 250 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); | 670 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); |
| @@ -269,6 +689,20 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
| 269 | { | 689 | { |
| 270 | int i; | 690 | int i; |
| 271 | 691 | ||
| 692 | /* | ||
| 693 | * Splice our entries to the default_backing_dev_info, if this | ||
| 694 | * bdi disappears | ||
| 695 | */ | ||
| 696 | if (bdi_has_dirty_io(bdi)) { | ||
| 697 | struct bdi_writeback *dst = &default_backing_dev_info.wb; | ||
| 698 | |||
| 699 | spin_lock(&inode_lock); | ||
| 700 | list_splice(&bdi->wb.b_dirty, &dst->b_dirty); | ||
| 701 | list_splice(&bdi->wb.b_io, &dst->b_io); | ||
| 702 | list_splice(&bdi->wb.b_more_io, &dst->b_more_io); | ||
| 703 | spin_unlock(&inode_lock); | ||
| 704 | } | ||
| 705 | |||
| 272 | bdi_unregister(bdi); | 706 | bdi_unregister(bdi); |
| 273 | 707 | ||
| 274 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 708 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
| @@ -283,13 +717,12 @@ static wait_queue_head_t congestion_wqh[2] = { | |||
| 283 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) | 717 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) |
| 284 | }; | 718 | }; |
| 285 | 719 | ||
| 286 | 720 | void clear_bdi_congested(struct backing_dev_info *bdi, int sync) | |
| 287 | void clear_bdi_congested(struct backing_dev_info *bdi, int rw) | ||
| 288 | { | 721 | { |
| 289 | enum bdi_state bit; | 722 | enum bdi_state bit; |
| 290 | wait_queue_head_t *wqh = &congestion_wqh[rw]; | 723 | wait_queue_head_t *wqh = &congestion_wqh[sync]; |
| 291 | 724 | ||
| 292 | bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; | 725 | bit = sync ? BDI_sync_congested : BDI_async_congested; |
| 293 | clear_bit(bit, &bdi->state); | 726 | clear_bit(bit, &bdi->state); |
| 294 | smp_mb__after_clear_bit(); | 727 | smp_mb__after_clear_bit(); |
| 295 | if (waitqueue_active(wqh)) | 728 | if (waitqueue_active(wqh)) |
| @@ -297,29 +730,29 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw) | |||
| 297 | } | 730 | } |
| 298 | EXPORT_SYMBOL(clear_bdi_congested); | 731 | EXPORT_SYMBOL(clear_bdi_congested); |
| 299 | 732 | ||
| 300 | void set_bdi_congested(struct backing_dev_info *bdi, int rw) | 733 | void set_bdi_congested(struct backing_dev_info *bdi, int sync) |
| 301 | { | 734 | { |
| 302 | enum bdi_state bit; | 735 | enum bdi_state bit; |
| 303 | 736 | ||
| 304 | bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; | 737 | bit = sync ? BDI_sync_congested : BDI_async_congested; |
| 305 | set_bit(bit, &bdi->state); | 738 | set_bit(bit, &bdi->state); |
| 306 | } | 739 | } |
| 307 | EXPORT_SYMBOL(set_bdi_congested); | 740 | EXPORT_SYMBOL(set_bdi_congested); |
| 308 | 741 | ||
| 309 | /** | 742 | /** |
| 310 | * congestion_wait - wait for a backing_dev to become uncongested | 743 | * congestion_wait - wait for a backing_dev to become uncongested |
| 311 | * @rw: READ or WRITE | 744 | * @sync: SYNC or ASYNC IO |
| 312 | * @timeout: timeout in jiffies | 745 | * @timeout: timeout in jiffies |
| 313 | * | 746 | * |
| 314 | * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit | 747 | * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit |
| 315 | * write congestion. If no backing_devs are congested then just wait for the | 748 | * write congestion. If no backing_devs are congested then just wait for the |
| 316 | * next write to be completed. | 749 | * next write to be completed. |
| 317 | */ | 750 | */ |
| 318 | long congestion_wait(int rw, long timeout) | 751 | long congestion_wait(int sync, long timeout) |
| 319 | { | 752 | { |
| 320 | long ret; | 753 | long ret; |
| 321 | DEFINE_WAIT(wait); | 754 | DEFINE_WAIT(wait); |
| 322 | wait_queue_head_t *wqh = &congestion_wqh[rw]; | 755 | wait_queue_head_t *wqh = &congestion_wqh[sync]; |
| 323 | 756 | ||
| 324 | prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); | 757 | prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); |
| 325 | ret = io_schedule_timeout(timeout); | 758 | ret = io_schedule_timeout(timeout); |
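Two user-visible changes land in this file: each backing device can now get its own flusher thread, created on demand by the forker task above instead of relying on pdflush, and the congestion helpers are indexed by sync/async rather than READ/WRITE. For the latter, a call site that previously passed WRITE would now look roughly like the fragment below; this is a sketch that assumes the BLK_RW_SYNC/BLK_RW_ASYNC constants used to index congestion_wqh[], and the retry loop around it is invented.

    /* Throttle a writer while async writeback on this bdi is congested. */
    while (bdi_write_congested(bdi)) {
        /* was: congestion_wait(WRITE, HZ / 20); */
        congestion_wait(BLK_RW_ASYNC, HZ / 20);
    }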
diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7de..d1dc23cc7f10 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/pfn.h> | 12 | #include <linux/pfn.h> |
| 13 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> |
| 14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
| 15 | #include <linux/kmemleak.h> | ||
| 15 | 16 | ||
| 16 | #include <asm/bug.h> | 17 | #include <asm/bug.h> |
| 17 | #include <asm/io.h> | 18 | #include <asm/io.h> |
| @@ -142,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
| 142 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | 143 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); |
| 143 | } | 144 | } |
| 144 | 145 | ||
| 146 | /* | ||
| 147 | * free_bootmem_late - free bootmem pages directly to page allocator | ||
| 148 | * @addr: starting address of the range | ||
| 149 | * @size: size of the range in bytes | ||
| 150 | * | ||
| 151 | * This is only useful when the bootmem allocator has already been torn | ||
| 152 | * down, but we are still initializing the system. Pages are given directly | ||
| 153 | * to the page allocator, no bootmem metadata is updated because it is gone. | ||
| 154 | */ | ||
| 155 | void __init free_bootmem_late(unsigned long addr, unsigned long size) | ||
| 156 | { | ||
| 157 | unsigned long cursor, end; | ||
| 158 | |||
| 159 | kmemleak_free_part(__va(addr), size); | ||
| 160 | |||
| 161 | cursor = PFN_UP(addr); | ||
| 162 | end = PFN_DOWN(addr + size); | ||
| 163 | |||
| 164 | for (; cursor < end; cursor++) { | ||
| 165 | __free_pages_bootmem(pfn_to_page(cursor), 0); | ||
| 166 | totalram_pages++; | ||
| 167 | } | ||
| 168 | } | ||
| 169 | |||
| 145 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 170 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
| 146 | { | 171 | { |
| 147 | int aligned; | 172 | int aligned; |
| @@ -335,6 +360,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
| 335 | { | 360 | { |
| 336 | unsigned long start, end; | 361 | unsigned long start, end; |
| 337 | 362 | ||
| 363 | kmemleak_free_part(__va(physaddr), size); | ||
| 364 | |||
| 338 | start = PFN_UP(physaddr); | 365 | start = PFN_UP(physaddr); |
| 339 | end = PFN_DOWN(physaddr + size); | 366 | end = PFN_DOWN(physaddr + size); |
| 340 | 367 | ||
| @@ -354,6 +381,8 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
| 354 | { | 381 | { |
| 355 | unsigned long start, end; | 382 | unsigned long start, end; |
| 356 | 383 | ||
| 384 | kmemleak_free_part(__va(addr), size); | ||
| 385 | |||
| 357 | start = PFN_UP(addr); | 386 | start = PFN_UP(addr); |
| 358 | end = PFN_DOWN(addr + size); | 387 | end = PFN_DOWN(addr + size); |
| 359 | 388 | ||
| @@ -516,6 +545,11 @@ find_block: | |||
| 516 | region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + | 545 | region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + |
| 517 | start_off); | 546 | start_off); |
| 518 | memset(region, 0, size); | 547 | memset(region, 0, size); |
| 548 | /* | ||
| 549 | * The min_count is set to 0 so that bootmem allocated blocks | ||
| 550 | * are never reported as leaks. | ||
| 551 | */ | ||
| 552 | kmemleak_alloc(region, size, 0, 0); | ||
| 519 | return region; | 553 | return region; |
| 520 | } | 554 | } |
| 521 | 555 | ||
| @@ -532,12 +566,19 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
| 532 | unsigned long size, unsigned long align, | 566 | unsigned long size, unsigned long align, |
| 533 | unsigned long goal, unsigned long limit) | 567 | unsigned long goal, unsigned long limit) |
| 534 | { | 568 | { |
| 535 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM | 569 | if (WARN_ON_ONCE(slab_is_available())) |
| 536 | bootmem_data_t *p_bdata; | 570 | return kzalloc(size, GFP_NOWAIT); |
| 537 | 571 | ||
| 538 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, goal, limit); | 572 | #ifdef CONFIG_HAVE_ARCH_BOOTMEM |
| 539 | if (p_bdata) | 573 | { |
| 540 | return alloc_bootmem_core(p_bdata, size, align, goal, limit); | 574 | bootmem_data_t *p_bdata; |
| 575 | |||
| 576 | p_bdata = bootmem_arch_preferred_node(bdata, size, align, | ||
| 577 | goal, limit); | ||
| 578 | if (p_bdata) | ||
| 579 | return alloc_bootmem_core(p_bdata, size, align, | ||
| 580 | goal, limit); | ||
| 581 | } | ||
| 541 | #endif | 582 | #endif |
| 542 | return NULL; | 583 | return NULL; |
| 543 | } | 584 | } |
| @@ -662,6 +703,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
| 662 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | 703 | void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, |
| 663 | unsigned long align, unsigned long goal) | 704 | unsigned long align, unsigned long goal) |
| 664 | { | 705 | { |
| 706 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 707 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
| 708 | |||
| 665 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 709 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
| 666 | } | 710 | } |
| 667 | 711 | ||
| @@ -693,6 +737,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | |||
| 693 | { | 737 | { |
| 694 | void *ptr; | 738 | void *ptr; |
| 695 | 739 | ||
| 740 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 741 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
| 742 | |||
| 696 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 743 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
| 697 | if (ptr) | 744 | if (ptr) |
| 698 | return ptr; | 745 | return ptr; |
| @@ -745,6 +792,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, | |||
| 745 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | 792 | void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, |
| 746 | unsigned long align, unsigned long goal) | 793 | unsigned long align, unsigned long goal) |
| 747 | { | 794 | { |
| 795 | if (WARN_ON_ONCE(slab_is_available())) | ||
| 796 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
| 797 | |||
| 748 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 798 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
| 749 | goal, ARCH_LOW_ADDRESS_LIMIT); | 799 | goal, ARCH_LOW_ADDRESS_LIMIT); |
| 750 | } | 800 | } |
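Two behavioural additions in this file: the node allocators now fall back to kzalloc()/kzalloc_node() once slab_is_available(), and free_bootmem_late() lets an early reservation be handed straight to the page allocator after bootmem itself has been torn down. A hypothetical use of the latter is sketched below; early_buf_phys and early_buf_size are made-up names for some architecture-specific early reservation.

    /* Early in boot, while bootmem is still the allocator of record: */
    reserve_bootmem(early_buf_phys, early_buf_size, BOOTMEM_DEFAULT);

    /*
     * Much later, after free_all_bootmem() has retired the bootmem
     * allocator, the area can still be released; the pages go straight
     * into the buddy allocator:
     */
    free_bootmem_late(early_buf_phys, early_buf_size);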
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
| @@ -13,17 +13,15 @@ | |||
| 13 | #include <linux/init.h> | 13 | #include <linux/init.h> |
| 14 | #include <linux/hash.h> | 14 | #include <linux/hash.h> |
| 15 | #include <linux/highmem.h> | 15 | #include <linux/highmem.h> |
| 16 | #include <linux/blktrace_api.h> | ||
| 17 | #include <trace/block.h> | ||
| 18 | #include <asm/tlbflush.h> | 16 | #include <asm/tlbflush.h> |
| 19 | 17 | ||
| 18 | #include <trace/events/block.h> | ||
| 19 | |||
| 20 | #define POOL_SIZE 64 | 20 | #define POOL_SIZE 64 |
| 21 | #define ISA_POOL_SIZE 16 | 21 | #define ISA_POOL_SIZE 16 |
| 22 | 22 | ||
| 23 | static mempool_t *page_pool, *isa_page_pool; | 23 | static mempool_t *page_pool, *isa_page_pool; |
| 24 | 24 | ||
| 25 | DEFINE_TRACE(block_bio_bounce); | ||
| 26 | |||
| 27 | #ifdef CONFIG_HIGHMEM | 25 | #ifdef CONFIG_HIGHMEM |
| 28 | static __init int init_emergency_pool(void) | 26 | static __init int init_emergency_pool(void) |
| 29 | { | 27 | { |
| @@ -192,7 +190,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, | |||
| 192 | /* | 190 | /* |
| 193 | * is destination page below bounce pfn? | 191 | * is destination page below bounce pfn? |
| 194 | */ | 192 | */ |
| 195 | if (page_to_pfn(page) <= q->bounce_pfn) | 193 | if (page_to_pfn(page) <= queue_bounce_pfn(q)) |
| 196 | continue; | 194 | continue; |
| 197 | 195 | ||
| 198 | /* | 196 | /* |
| @@ -284,7 +282,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
| 284 | * don't waste time iterating over bio segments | 282 | * don't waste time iterating over bio segments |
| 285 | */ | 283 | */ |
| 286 | if (!(q->bounce_gfp & GFP_DMA)) { | 284 | if (!(q->bounce_gfp & GFP_DMA)) { |
| 287 | if (q->bounce_pfn >= blk_max_pfn) | 285 | if (queue_bounce_pfn(q) >= blk_max_pfn) |
| 288 | return; | 286 | return; |
| 289 | pool = page_pool; | 287 | pool = page_pool; |
| 290 | } else { | 288 | } else { |
diff --git a/mm/dmapool.c b/mm/dmapool.c
index b1f0885dda22..3df063706f53 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
| @@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) | |||
| 86 | unsigned pages = 0; | 86 | unsigned pages = 0; |
| 87 | unsigned blocks = 0; | 87 | unsigned blocks = 0; |
| 88 | 88 | ||
| 89 | spin_lock_irq(&pool->lock); | ||
| 89 | list_for_each_entry(page, &pool->page_list, page_list) { | 90 | list_for_each_entry(page, &pool->page_list, page_list) { |
| 90 | pages++; | 91 | pages++; |
| 91 | blocks += page->in_use; | 92 | blocks += page->in_use; |
| 92 | } | 93 | } |
| 94 | spin_unlock_irq(&pool->lock); | ||
| 93 | 95 | ||
| 94 | /* per-pool info, no real statistics yet */ | 96 | /* per-pool info, no real statistics yet */ |
| 95 | temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", | 97 | temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 54a0f8040afa..e43359214f6f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
| @@ -101,7 +101,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
| 101 | 101 | ||
| 102 | ret = force_page_cache_readahead(mapping, file, | 102 | ret = force_page_cache_readahead(mapping, file, |
| 103 | start_index, | 103 | start_index, |
| 104 | max_sane_readahead(nrpages)); | 104 | nrpages); |
| 105 | if (ret > 0) | 105 | if (ret > 0) |
| 106 | ret = 0; | 106 | ret = 0; |
| 107 | break; | 107 | break; |
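This hunk is in the POSIX_FADV_WILLNEED path, which now hands the raw page count straight to force_page_cache_readahead(), presumably because the sane-readahead clamp is applied inside the readahead code from this series on. The userspace entry point is unchanged; a minimal fragment, assuming fd is an already-open descriptor and the usual <stdio.h>/<string.h>/<fcntl.h> includes:

    /* Ask the kernel to start readahead for the whole file. */
    int err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
    if (err)
        fprintf(stderr, "posix_fadvise: %s\n", strerror(err));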
diff --git a/mm/failslab.c b/mm/failslab.c
index 7c6ea6493f80..9339de5f0a91 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
| @@ -1,4 +1,5 @@ | |||
| 1 | #include <linux/fault-inject.h> | 1 | #include <linux/fault-inject.h> |
| 2 | #include <linux/gfp.h> | ||
| 2 | 3 | ||
| 3 | static struct { | 4 | static struct { |
| 4 | struct fault_attr attr; | 5 | struct fault_attr attr; |
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d3973b3d1..ef169f37156d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -39,11 +39,10 @@ | |||
| 39 | /* | 39 | /* |
| 40 | * FIXME: remove all knowledge of the buffer layer from the core VM | 40 | * FIXME: remove all knowledge of the buffer layer from the core VM |
| 41 | */ | 41 | */ |
| 42 | #include <linux/buffer_head.h> /* for generic_osync_inode */ | 42 | #include <linux/buffer_head.h> /* for try_to_free_buffers */ |
| 43 | 43 | ||
| 44 | #include <asm/mman.h> | 44 | #include <asm/mman.h> |
| 45 | 45 | ||
| 46 | |||
| 47 | /* | 46 | /* |
| 48 | * Shared mappings implemented 30.11.1994. It's not fully working yet, | 47 | * Shared mappings implemented 30.11.1994. It's not fully working yet, |
| 49 | * though. | 48 | * though. |
| @@ -59,7 +58,7 @@ | |||
| 59 | /* | 58 | /* |
| 60 | * Lock ordering: | 59 | * Lock ordering: |
| 61 | * | 60 | * |
| 62 | * ->i_mmap_lock (vmtruncate) | 61 | * ->i_mmap_lock (truncate_pagecache) |
| 63 | * ->private_lock (__free_pte->__set_page_dirty_buffers) | 62 | * ->private_lock (__free_pte->__set_page_dirty_buffers) |
| 64 | * ->swap_lock (exclusive_swap_page, others) | 63 | * ->swap_lock (exclusive_swap_page, others) |
| 65 | * ->mapping->tree_lock | 64 | * ->mapping->tree_lock |
| @@ -105,6 +104,10 @@ | |||
| 105 | * | 104 | * |
| 106 | * ->task->proc_lock | 105 | * ->task->proc_lock |
| 107 | * ->dcache_lock (proc_pid_lookup) | 106 | * ->dcache_lock (proc_pid_lookup) |
| 107 | * | ||
| 108 | * (code doesn't rely on that order, so you could switch it around) | ||
| 109 | * ->tasklist_lock (memory_failure, collect_procs_ao) | ||
| 110 | * ->i_mmap_lock | ||
| 108 | */ | 111 | */ |
| 109 | 112 | ||
| 110 | /* | 113 | /* |
| @@ -120,8 +123,9 @@ void __remove_from_page_cache(struct page *page) | |||
| 120 | page->mapping = NULL; | 123 | page->mapping = NULL; |
| 121 | mapping->nrpages--; | 124 | mapping->nrpages--; |
| 122 | __dec_zone_page_state(page, NR_FILE_PAGES); | 125 | __dec_zone_page_state(page, NR_FILE_PAGES); |
| 126 | if (PageSwapBacked(page)) | ||
| 127 | __dec_zone_page_state(page, NR_SHMEM); | ||
| 123 | BUG_ON(page_mapped(page)); | 128 | BUG_ON(page_mapped(page)); |
| 124 | mem_cgroup_uncharge_cache_page(page); | ||
| 125 | 129 | ||
| 126 | /* | 130 | /* |
| 127 | * Some filesystems seem to re-dirty the page even after | 131 | * Some filesystems seem to re-dirty the page even after |
| @@ -145,6 +149,7 @@ void remove_from_page_cache(struct page *page) | |||
| 145 | spin_lock_irq(&mapping->tree_lock); | 149 | spin_lock_irq(&mapping->tree_lock); |
| 146 | __remove_from_page_cache(page); | 150 | __remove_from_page_cache(page); |
| 147 | spin_unlock_irq(&mapping->tree_lock); | 151 | spin_unlock_irq(&mapping->tree_lock); |
| 152 | mem_cgroup_uncharge_cache_page(page); | ||
| 148 | } | 153 | } |
| 149 | 154 | ||
| 150 | static int sync_page(void *word) | 155 | static int sync_page(void *word) |
| @@ -307,68 +312,24 @@ int wait_on_page_writeback_range(struct address_space *mapping, | |||
| 307 | } | 312 | } |
| 308 | 313 | ||
| 309 | /** | 314 | /** |
| 310 | * sync_page_range - write and wait on all pages in the passed range | 315 | * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range |
| 311 | * @inode: target inode | 316 | * @mapping: address space structure to wait for |
| 312 | * @mapping: target address_space | 317 | * @start: offset in bytes where the range starts |
| 313 | * @pos: beginning offset in pages to write | 318 | * @end: offset in bytes where the range ends (inclusive) |
| 314 | * @count: number of bytes to write | ||
| 315 | * | ||
| 316 | * Write and wait upon all the pages in the passed range. This is a "data | ||
| 317 | * integrity" operation. It waits upon in-flight writeout before starting and | ||
| 318 | * waiting upon new writeout. If there was an IO error, return it. | ||
| 319 | * | 319 | * |
| 320 | * We need to re-take i_mutex during the generic_osync_inode list walk because | 320 | * Walk the list of under-writeback pages of the given address space |
| 321 | * it is otherwise livelockable. | 321 | * in the given range and wait for all of them. |
| 322 | */ | ||
| 323 | int sync_page_range(struct inode *inode, struct address_space *mapping, | ||
| 324 | loff_t pos, loff_t count) | ||
| 325 | { | ||
| 326 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | ||
| 327 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | ||
| 328 | int ret; | ||
| 329 | |||
| 330 | if (!mapping_cap_writeback_dirty(mapping) || !count) | ||
| 331 | return 0; | ||
| 332 | ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); | ||
| 333 | if (ret == 0) { | ||
| 334 | mutex_lock(&inode->i_mutex); | ||
| 335 | ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); | ||
| 336 | mutex_unlock(&inode->i_mutex); | ||
| 337 | } | ||
| 338 | if (ret == 0) | ||
| 339 | ret = wait_on_page_writeback_range(mapping, start, end); | ||
| 340 | return ret; | ||
| 341 | } | ||
| 342 | EXPORT_SYMBOL(sync_page_range); | ||
| 343 | |||
| 344 | /** | ||
| 345 | * sync_page_range_nolock - write & wait on all pages in the passed range without locking | ||
| 346 | * @inode: target inode | ||
| 347 | * @mapping: target address_space | ||
| 348 | * @pos: beginning offset in pages to write | ||
| 349 | * @count: number of bytes to write | ||
| 350 | * | 322 | * |
| 351 | * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea | 323 | * This is just a simple wrapper so that callers don't have to convert offsets |
| 352 | * as it forces O_SYNC writers to different parts of the same file | 324 | * to page indexes themselves |
| 353 | * to be serialised right until io completion. | ||
| 354 | */ | 325 | */ |
| 355 | int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, | 326 | int filemap_fdatawait_range(struct address_space *mapping, loff_t start, |
| 356 | loff_t pos, loff_t count) | 327 | loff_t end) |
| 357 | { | 328 | { |
| 358 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | 329 | return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, |
| 359 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | 330 | end >> PAGE_CACHE_SHIFT); |
| 360 | int ret; | ||
| 361 | |||
| 362 | if (!mapping_cap_writeback_dirty(mapping) || !count) | ||
| 363 | return 0; | ||
| 364 | ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); | ||
| 365 | if (ret == 0) | ||
| 366 | ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); | ||
| 367 | if (ret == 0) | ||
| 368 | ret = wait_on_page_writeback_range(mapping, start, end); | ||
| 369 | return ret; | ||
| 370 | } | 331 | } |
| 371 | EXPORT_SYMBOL(sync_page_range_nolock); | 332 | EXPORT_SYMBOL(filemap_fdatawait_range); |
| 372 | 333 | ||
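A minimal usage sketch for the new byte-based wait helper, pairing it with filemap_fdatawrite_range() the way the removed sync_page_range() did. The wrapper name example_sync_byte_range is hypothetical; both filemap_* calls take an inclusive end offset in bytes, as in the hunk above.

#include <linux/fs.h>
#include <linux/backing-dev.h>

/* Hypothetical data-integrity helper: write back and wait on [pos, pos + count). */
static int example_sync_byte_range(struct address_space *mapping,
				   loff_t pos, loff_t count)
{
	int err;

	if (!count || !mapping_cap_writeback_dirty(mapping))
		return 0;

	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
	if (err)
		return err;

	/* Both arguments are byte offsets; the end offset is inclusive. */
	return filemap_fdatawait_range(mapping, pos, pos + count - 1);
}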
| 373 | /** | 334 | /** |
| 374 | * filemap_fdatawait - wait for all under-writeback pages to complete | 335 | * filemap_fdatawait - wait for all under-writeback pages to complete |
| @@ -441,6 +402,7 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
| 441 | } | 402 | } |
| 442 | return err; | 403 | return err; |
| 443 | } | 404 | } |
| 405 | EXPORT_SYMBOL(filemap_write_and_wait_range); | ||
| 444 | 406 | ||
| 445 | /** | 407 | /** |
| 446 | * add_to_page_cache_locked - add a locked page to the pagecache | 408 | * add_to_page_cache_locked - add a locked page to the pagecache |
| @@ -475,13 +437,15 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, | |||
| 475 | if (likely(!error)) { | 437 | if (likely(!error)) { |
| 476 | mapping->nrpages++; | 438 | mapping->nrpages++; |
| 477 | __inc_zone_page_state(page, NR_FILE_PAGES); | 439 | __inc_zone_page_state(page, NR_FILE_PAGES); |
| 440 | if (PageSwapBacked(page)) | ||
| 441 | __inc_zone_page_state(page, NR_SHMEM); | ||
| 442 | spin_unlock_irq(&mapping->tree_lock); | ||
| 478 | } else { | 443 | } else { |
| 479 | page->mapping = NULL; | 444 | page->mapping = NULL; |
| 445 | spin_unlock_irq(&mapping->tree_lock); | ||
| 480 | mem_cgroup_uncharge_cache_page(page); | 446 | mem_cgroup_uncharge_cache_page(page); |
| 481 | page_cache_release(page); | 447 | page_cache_release(page); |
| 482 | } | 448 | } |
| 483 | |||
| 484 | spin_unlock_irq(&mapping->tree_lock); | ||
| 485 | radix_tree_preload_end(); | 449 | radix_tree_preload_end(); |
| 486 | } else | 450 | } else |
| 487 | mem_cgroup_uncharge_cache_page(page); | 451 | mem_cgroup_uncharge_cache_page(page); |
| @@ -513,13 +477,14 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
| 513 | } | 477 | } |
| 514 | return ret; | 478 | return ret; |
| 515 | } | 479 | } |
| 480 | EXPORT_SYMBOL_GPL(add_to_page_cache_lru); | ||
| 516 | 481 | ||
| 517 | #ifdef CONFIG_NUMA | 482 | #ifdef CONFIG_NUMA |
| 518 | struct page *__page_cache_alloc(gfp_t gfp) | 483 | struct page *__page_cache_alloc(gfp_t gfp) |
| 519 | { | 484 | { |
| 520 | if (cpuset_do_page_mem_spread()) { | 485 | if (cpuset_do_page_mem_spread()) { |
| 521 | int n = cpuset_mem_spread_node(); | 486 | int n = cpuset_mem_spread_node(); |
| 522 | return alloc_pages_node(n, gfp, 0); | 487 | return alloc_pages_exact_node(n, gfp, 0); |
| 523 | } | 488 | } |
| 524 | return alloc_pages(gfp, 0); | 489 | return alloc_pages(gfp, 0); |
| 525 | } | 490 | } |
| @@ -565,6 +530,24 @@ void wait_on_page_bit(struct page *page, int bit_nr) | |||
| 565 | EXPORT_SYMBOL(wait_on_page_bit); | 530 | EXPORT_SYMBOL(wait_on_page_bit); |
| 566 | 531 | ||
| 567 | /** | 532 | /** |
| 533 | * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue | ||
| 534 | * @page: Page defining the wait queue of interest | ||
| 535 | * @waiter: Waiter to add to the queue | ||
| 536 | * | ||
| 537 | * Add an arbitrary @waiter to the wait queue for the nominated @page. | ||
| 538 | */ | ||
| 539 | void add_page_wait_queue(struct page *page, wait_queue_t *waiter) | ||
| 540 | { | ||
| 541 | wait_queue_head_t *q = page_waitqueue(page); | ||
| 542 | unsigned long flags; | ||
| 543 | |||
| 544 | spin_lock_irqsave(&q->lock, flags); | ||
| 545 | __add_wait_queue(q, waiter); | ||
| 546 | spin_unlock_irqrestore(&q->lock, flags); | ||
| 547 | } | ||
| 548 | EXPORT_SYMBOL_GPL(add_page_wait_queue); | ||
| 549 | |||
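A sketch of attaching a custom waiter via add_page_wait_queue(); the waiter's callback runs when the page's wait queue is woken (for instance from unlock_page() or end_page_writeback()). The monitor structure and both example_* functions are hypothetical.

#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/pagemap.h>

/* Hypothetical monitor embedding a wait queue entry for one page. */
struct example_page_monitor {
	wait_queue_t	wait;
	struct page	*page;
};

/* Wake callback: invoked under the wait queue spinlock when the page is woken. */
static int example_page_woken(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct example_page_monitor *mon =
		container_of(wait, struct example_page_monitor, wait);

	pr_debug("page %lu woken\n", mon->page->index);
	return 0;
}

static void example_monitor_page(struct example_page_monitor *mon, struct page *page)
{
	mon->page = page;
	init_waitqueue_func_entry(&mon->wait, example_page_woken);
	add_page_wait_queue(page, &mon->wait);
}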
| 550 | /** | ||
| 568 | * unlock_page - unlock a locked page | 551 | * unlock_page - unlock a locked page |
| 569 | * @page: the page | 552 | * @page: the page |
| 570 | * | 553 | * |
| @@ -627,6 +610,7 @@ int __lock_page_killable(struct page *page) | |||
| 627 | return __wait_on_bit_lock(page_waitqueue(page), &wait, | 610 | return __wait_on_bit_lock(page_waitqueue(page), &wait, |
| 628 | sync_page_killable, TASK_KILLABLE); | 611 | sync_page_killable, TASK_KILLABLE); |
| 629 | } | 612 | } |
| 613 | EXPORT_SYMBOL_GPL(__lock_page_killable); | ||
| 630 | 614 | ||
| 631 | /** | 615 | /** |
| 632 | * __lock_page_nosync - get a lock on the page, without calling sync_page() | 616 | * __lock_page_nosync - get a lock on the page, without calling sync_page() |
| @@ -983,9 +967,6 @@ EXPORT_SYMBOL(grab_cache_page_nowait); | |||
| 983 | static void shrink_readahead_size_eio(struct file *filp, | 967 | static void shrink_readahead_size_eio(struct file *filp, |
| 984 | struct file_ra_state *ra) | 968 | struct file_ra_state *ra) |
| 985 | { | 969 | { |
| 986 | if (!ra->ra_pages) | ||
| 987 | return; | ||
| 988 | |||
| 989 | ra->ra_pages /= 4; | 970 | ra->ra_pages /= 4; |
| 990 | } | 971 | } |
| 991 | 972 | ||
| @@ -1369,8 +1350,7 @@ do_readahead(struct address_space *mapping, struct file *filp, | |||
| 1369 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1350 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
| 1370 | return -EINVAL; | 1351 | return -EINVAL; |
| 1371 | 1352 | ||
| 1372 | force_page_cache_readahead(mapping, filp, index, | 1353 | force_page_cache_readahead(mapping, filp, index, nr); |
| 1373 | max_sane_readahead(nr)); | ||
| 1374 | return 0; | 1354 | return 0; |
| 1375 | } | 1355 | } |
| 1376 | 1356 | ||
| @@ -1436,6 +1416,73 @@ static int page_cache_read(struct file *file, pgoff_t offset) | |||
| 1436 | 1416 | ||
| 1437 | #define MMAP_LOTSAMISS (100) | 1417 | #define MMAP_LOTSAMISS (100) |
| 1438 | 1418 | ||
| 1419 | /* | ||
| 1420 | * Synchronous readahead happens when we don't even find | ||
| 1421 | * a page in the page cache at all. | ||
| 1422 | */ | ||
| 1423 | static void do_sync_mmap_readahead(struct vm_area_struct *vma, | ||
| 1424 | struct file_ra_state *ra, | ||
| 1425 | struct file *file, | ||
| 1426 | pgoff_t offset) | ||
| 1427 | { | ||
| 1428 | unsigned long ra_pages; | ||
| 1429 | struct address_space *mapping = file->f_mapping; | ||
| 1430 | |||
| 1431 | /* If we don't want any read-ahead, don't bother */ | ||
| 1432 | if (VM_RandomReadHint(vma)) | ||
| 1433 | return; | ||
| 1434 | |||
| 1435 | if (VM_SequentialReadHint(vma) || | ||
| 1436 | offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) { | ||
| 1437 | page_cache_sync_readahead(mapping, ra, file, offset, | ||
| 1438 | ra->ra_pages); | ||
| 1439 | return; | ||
| 1440 | } | ||
| 1441 | |||
| 1442 | if (ra->mmap_miss < INT_MAX) | ||
| 1443 | ra->mmap_miss++; | ||
| 1444 | |||
| 1445 | /* | ||
| 1446 | * Do we miss much more than hit in this file? If so, | ||
| 1447 | * stop bothering with read-ahead. It will only hurt. | ||
| 1448 | */ | ||
| 1449 | if (ra->mmap_miss > MMAP_LOTSAMISS) | ||
| 1450 | return; | ||
| 1451 | |||
| 1452 | /* | ||
| 1453 | * mmap read-around | ||
| 1454 | */ | ||
| 1455 | ra_pages = max_sane_readahead(ra->ra_pages); | ||
| 1456 | if (ra_pages) { | ||
| 1457 | ra->start = max_t(long, 0, offset - ra_pages/2); | ||
| 1458 | ra->size = ra_pages; | ||
| 1459 | ra->async_size = 0; | ||
| 1460 | ra_submit(ra, mapping, file); | ||
| 1461 | } | ||
| 1462 | } | ||
| 1463 | |||
| 1464 | /* | ||
| 1465 | * Asynchronous readahead happens when we find the page and PG_readahead, | ||
| 1466 | * so we want to possibly extend the readahead further.. | ||
| 1467 | */ | ||
| 1468 | static void do_async_mmap_readahead(struct vm_area_struct *vma, | ||
| 1469 | struct file_ra_state *ra, | ||
| 1470 | struct file *file, | ||
| 1471 | struct page *page, | ||
| 1472 | pgoff_t offset) | ||
| 1473 | { | ||
| 1474 | struct address_space *mapping = file->f_mapping; | ||
| 1475 | |||
| 1476 | /* If we don't want any read-ahead, don't bother */ | ||
| 1477 | if (VM_RandomReadHint(vma)) | ||
| 1478 | return; | ||
| 1479 | if (ra->mmap_miss > 0) | ||
| 1480 | ra->mmap_miss--; | ||
| 1481 | if (PageReadahead(page)) | ||
| 1482 | page_cache_async_readahead(mapping, ra, file, | ||
| 1483 | page, offset, ra->ra_pages); | ||
| 1484 | } | ||
| 1485 | |||
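For illustration of the read-around window above (assuming 4 KB pages, ra->ra_pages of 32, i.e. 128 KB, and max_sane_readahead() not shrinking it): a fault at page offset 100 sets ra->start = max(0, 100 - 32/2) = 84 and ra->size = 32, so pages 84..115 are submitted with no async trigger; a fault at page 5 clamps the start to 0 and reads pages 0..31.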
| 1439 | /** | 1486 | /** |
| 1440 | * filemap_fault - read in file data for page fault handling | 1487 | * filemap_fault - read in file data for page fault handling |
| 1441 | * @vma: vma in which the fault was taken | 1488 | * @vma: vma in which the fault was taken |
| @@ -1455,78 +1502,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1455 | struct address_space *mapping = file->f_mapping; | 1502 | struct address_space *mapping = file->f_mapping; |
| 1456 | struct file_ra_state *ra = &file->f_ra; | 1503 | struct file_ra_state *ra = &file->f_ra; |
| 1457 | struct inode *inode = mapping->host; | 1504 | struct inode *inode = mapping->host; |
| 1505 | pgoff_t offset = vmf->pgoff; | ||
| 1458 | struct page *page; | 1506 | struct page *page; |
| 1459 | pgoff_t size; | 1507 | pgoff_t size; |
| 1460 | int did_readaround = 0; | ||
| 1461 | int ret = 0; | 1508 | int ret = 0; |
| 1462 | 1509 | ||
| 1463 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1510 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 1464 | if (vmf->pgoff >= size) | 1511 | if (offset >= size) |
| 1465 | return VM_FAULT_SIGBUS; | 1512 | return VM_FAULT_SIGBUS; |
| 1466 | 1513 | ||
| 1467 | /* If we don't want any read-ahead, don't bother */ | ||
| 1468 | if (VM_RandomReadHint(vma)) | ||
| 1469 | goto no_cached_page; | ||
| 1470 | |||
| 1471 | /* | 1514 | /* |
| 1472 | * Do we have something in the page cache already? | 1515 | * Do we have something in the page cache already? |
| 1473 | */ | 1516 | */ |
| 1474 | retry_find: | 1517 | page = find_get_page(mapping, offset); |
| 1475 | page = find_lock_page(mapping, vmf->pgoff); | 1518 | if (likely(page)) { |
| 1476 | /* | ||
| 1477 | * For sequential accesses, we use the generic readahead logic. | ||
| 1478 | */ | ||
| 1479 | if (VM_SequentialReadHint(vma)) { | ||
| 1480 | if (!page) { | ||
| 1481 | page_cache_sync_readahead(mapping, ra, file, | ||
| 1482 | vmf->pgoff, 1); | ||
| 1483 | page = find_lock_page(mapping, vmf->pgoff); | ||
| 1484 | if (!page) | ||
| 1485 | goto no_cached_page; | ||
| 1486 | } | ||
| 1487 | if (PageReadahead(page)) { | ||
| 1488 | page_cache_async_readahead(mapping, ra, file, page, | ||
| 1489 | vmf->pgoff, 1); | ||
| 1490 | } | ||
| 1491 | } | ||
| 1492 | |||
| 1493 | if (!page) { | ||
| 1494 | unsigned long ra_pages; | ||
| 1495 | |||
| 1496 | ra->mmap_miss++; | ||
| 1497 | |||
| 1498 | /* | 1519 | /* |
| 1499 | * Do we miss much more than hit in this file? If so, | 1520 | * We found the page, so try async readahead before |
| 1500 | * stop bothering with read-ahead. It will only hurt. | 1521 | * waiting for the lock. |
| 1501 | */ | 1522 | */ |
| 1502 | if (ra->mmap_miss > MMAP_LOTSAMISS) | 1523 | do_async_mmap_readahead(vma, ra, file, page, offset); |
| 1503 | goto no_cached_page; | 1524 | lock_page(page); |
| 1504 | 1525 | ||
| 1505 | /* | 1526 | /* Did it get truncated? */ |
| 1506 | * To keep the pgmajfault counter straight, we need to | 1527 | if (unlikely(page->mapping != mapping)) { |
| 1507 | * check did_readaround, as this is an inner loop. | 1528 | unlock_page(page); |
| 1508 | */ | 1529 | put_page(page); |
| 1509 | if (!did_readaround) { | 1530 | goto no_cached_page; |
| 1510 | ret = VM_FAULT_MAJOR; | ||
| 1511 | count_vm_event(PGMAJFAULT); | ||
| 1512 | } | ||
| 1513 | did_readaround = 1; | ||
| 1514 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | ||
| 1515 | if (ra_pages) { | ||
| 1516 | pgoff_t start = 0; | ||
| 1517 | |||
| 1518 | if (vmf->pgoff > ra_pages / 2) | ||
| 1519 | start = vmf->pgoff - ra_pages / 2; | ||
| 1520 | do_page_cache_readahead(mapping, file, start, ra_pages); | ||
| 1521 | } | 1531 | } |
| 1522 | page = find_lock_page(mapping, vmf->pgoff); | 1532 | } else { |
| 1533 | /* No page in the page cache at all */ | ||
| 1534 | do_sync_mmap_readahead(vma, ra, file, offset); | ||
| 1535 | count_vm_event(PGMAJFAULT); | ||
| 1536 | ret = VM_FAULT_MAJOR; | ||
| 1537 | retry_find: | ||
| 1538 | page = find_lock_page(mapping, offset); | ||
| 1523 | if (!page) | 1539 | if (!page) |
| 1524 | goto no_cached_page; | 1540 | goto no_cached_page; |
| 1525 | } | 1541 | } |
| 1526 | 1542 | ||
| 1527 | if (!did_readaround) | ||
| 1528 | ra->mmap_miss--; | ||
| 1529 | |||
| 1530 | /* | 1543 | /* |
| 1531 | * We have a locked page in the page cache, now we need to check | 1544 | * We have a locked page in the page cache, now we need to check |
| 1532 | * that it's up-to-date. If not, it is going to be due to an error. | 1545 | * that it's up-to-date. If not, it is going to be due to an error. |
| @@ -1534,18 +1547,18 @@ retry_find: | |||
| 1534 | if (unlikely(!PageUptodate(page))) | 1547 | if (unlikely(!PageUptodate(page))) |
| 1535 | goto page_not_uptodate; | 1548 | goto page_not_uptodate; |
| 1536 | 1549 | ||
| 1537 | /* Must recheck i_size under page lock */ | 1550 | /* |
| 1551 | * Found the page and have a reference on it. | ||
| 1552 | * We must recheck i_size under page lock. | ||
| 1553 | */ | ||
| 1538 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1554 | size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
| 1539 | if (unlikely(vmf->pgoff >= size)) { | 1555 | if (unlikely(offset >= size)) { |
| 1540 | unlock_page(page); | 1556 | unlock_page(page); |
| 1541 | page_cache_release(page); | 1557 | page_cache_release(page); |
| 1542 | return VM_FAULT_SIGBUS; | 1558 | return VM_FAULT_SIGBUS; |
| 1543 | } | 1559 | } |
| 1544 | 1560 | ||
| 1545 | /* | 1561 | ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT; |
| 1546 | * Found the page and have a reference on it. | ||
| 1547 | */ | ||
| 1548 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; | ||
| 1549 | vmf->page = page; | 1562 | vmf->page = page; |
| 1550 | return ret | VM_FAULT_LOCKED; | 1563 | return ret | VM_FAULT_LOCKED; |
| 1551 | 1564 | ||
| @@ -1554,7 +1567,7 @@ no_cached_page: | |||
| 1554 | * We're only likely to ever get here if MADV_RANDOM is in | 1567 | * We're only likely to ever get here if MADV_RANDOM is in |
| 1555 | * effect. | 1568 | * effect. |
| 1556 | */ | 1569 | */ |
| 1557 | error = page_cache_read(file, vmf->pgoff); | 1570 | error = page_cache_read(file, offset); |
| 1558 | 1571 | ||
| 1559 | /* | 1572 | /* |
| 1560 | * The page we want has now been added to the page cache. | 1573 | * The page we want has now been added to the page cache. |
| @@ -1574,12 +1587,6 @@ no_cached_page: | |||
| 1574 | return VM_FAULT_SIGBUS; | 1587 | return VM_FAULT_SIGBUS; |
| 1575 | 1588 | ||
| 1576 | page_not_uptodate: | 1589 | page_not_uptodate: |
| 1577 | /* IO error path */ | ||
| 1578 | if (!did_readaround) { | ||
| 1579 | ret = VM_FAULT_MAJOR; | ||
| 1580 | count_vm_event(PGMAJFAULT); | ||
| 1581 | } | ||
| 1582 | |||
| 1583 | /* | 1590 | /* |
| 1584 | * Umm, take care of errors if the page isn't up-to-date. | 1591 | * Umm, take care of errors if the page isn't up-to-date. |
| 1585 | * Try to re-read it _once_. We do this synchronously, | 1592 | * Try to re-read it _once_. We do this synchronously, |
| @@ -1604,7 +1611,7 @@ page_not_uptodate: | |||
| 1604 | } | 1611 | } |
| 1605 | EXPORT_SYMBOL(filemap_fault); | 1612 | EXPORT_SYMBOL(filemap_fault); |
| 1606 | 1613 | ||
| 1607 | struct vm_operations_struct generic_file_vm_ops = { | 1614 | const struct vm_operations_struct generic_file_vm_ops = { |
| 1608 | .fault = filemap_fault, | 1615 | .fault = filemap_fault, |
| 1609 | }; | 1616 | }; |
| 1610 | 1617 | ||
| @@ -2123,20 +2130,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2123 | } | 2130 | } |
| 2124 | *ppos = end; | 2131 | *ppos = end; |
| 2125 | } | 2132 | } |
| 2126 | |||
| 2127 | /* | ||
| 2128 | * Sync the fs metadata but not the minor inode changes and | ||
| 2129 | * of course not the data as we did direct DMA for the IO. | ||
| 2130 | * i_mutex is held, which protects generic_osync_inode() from | ||
| 2131 | * livelocking. AIO O_DIRECT ops attempt to sync metadata here. | ||
| 2132 | */ | ||
| 2133 | out: | 2133 | out: |
| 2134 | if ((written >= 0 || written == -EIOCBQUEUED) && | ||
| 2135 | ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2136 | int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); | ||
| 2137 | if (err < 0) | ||
| 2138 | written = err; | ||
| 2139 | } | ||
| 2140 | return written; | 2134 | return written; |
| 2141 | } | 2135 | } |
| 2142 | EXPORT_SYMBOL(generic_file_direct_write); | 2136 | EXPORT_SYMBOL(generic_file_direct_write); |
| @@ -2228,6 +2222,7 @@ again: | |||
| 2228 | pagefault_enable(); | 2222 | pagefault_enable(); |
| 2229 | flush_dcache_page(page); | 2223 | flush_dcache_page(page); |
| 2230 | 2224 | ||
| 2225 | mark_page_accessed(page); | ||
| 2231 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | 2226 | status = a_ops->write_end(file, mapping, pos, bytes, copied, |
| 2232 | page, fsdata); | 2227 | page, fsdata); |
| 2233 | if (unlikely(status < 0)) | 2228 | if (unlikely(status < 0)) |
| @@ -2267,8 +2262,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2267 | { | 2262 | { |
| 2268 | struct file *file = iocb->ki_filp; | 2263 | struct file *file = iocb->ki_filp; |
| 2269 | struct address_space *mapping = file->f_mapping; | 2264 | struct address_space *mapping = file->f_mapping; |
| 2270 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2271 | struct inode *inode = mapping->host; | ||
| 2272 | ssize_t status; | 2265 | ssize_t status; |
| 2273 | struct iov_iter i; | 2266 | struct iov_iter i; |
| 2274 | 2267 | ||
| @@ -2278,16 +2271,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2278 | if (likely(status >= 0)) { | 2271 | if (likely(status >= 0)) { |
| 2279 | written += status; | 2272 | written += status; |
| 2280 | *ppos = pos + status; | 2273 | *ppos = pos + status; |
| 2281 | |||
| 2282 | /* | ||
| 2283 | * For now, when the user asks for O_SYNC, we'll actually give | ||
| 2284 | * O_DSYNC | ||
| 2285 | */ | ||
| 2286 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2287 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | ||
| 2288 | status = generic_osync_inode(inode, mapping, | ||
| 2289 | OSYNC_METADATA|OSYNC_DATA); | ||
| 2290 | } | ||
| 2291 | } | 2274 | } |
| 2292 | 2275 | ||
| 2293 | /* | 2276 | /* |
| @@ -2303,9 +2286,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2303 | } | 2286 | } |
| 2304 | EXPORT_SYMBOL(generic_file_buffered_write); | 2287 | EXPORT_SYMBOL(generic_file_buffered_write); |
| 2305 | 2288 | ||
| 2306 | static ssize_t | 2289 | /** |
| 2307 | __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | 2290 | * __generic_file_aio_write - write data to a file |
| 2308 | unsigned long nr_segs, loff_t *ppos) | 2291 | * @iocb: IO state structure (file, offset, etc.) |
| 2292 | * @iov: vector with data to write | ||
| 2293 | * @nr_segs: number of segments in the vector | ||
| 2294 | * @ppos: position where to write | ||
| 2295 | * | ||
| 2296 | * This function does all the work needed for actually writing data to a | ||
| 2297 | * file. It does all basic checks, removes SUID from the file, updates | ||
| 2298 | * modification times and calls proper subroutines depending on whether we | ||
| 2299 | * do direct IO or a standard buffered write. | ||
| 2300 | * | ||
| 2301 | * It expects i_mutex to be grabbed unless we work on a block device or similar | ||
| 2302 | * object which does not need locking at all. | ||
| 2303 | * | ||
| 2304 | * This function does *not* take care of syncing data in case of O_SYNC write. | ||
| 2305 | * A caller has to handle it. This is mainly due to the fact that we want to | ||
| 2306 | * avoid syncing under i_mutex. | ||
| 2307 | */ | ||
| 2308 | ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 2309 | unsigned long nr_segs, loff_t *ppos) | ||
| 2309 | { | 2310 | { |
| 2310 | struct file *file = iocb->ki_filp; | 2311 | struct file *file = iocb->ki_filp; |
| 2311 | struct address_space * mapping = file->f_mapping; | 2312 | struct address_space * mapping = file->f_mapping; |
| @@ -2402,51 +2403,37 @@ out: | |||
| 2402 | current->backing_dev_info = NULL; | 2403 | current->backing_dev_info = NULL; |
| 2403 | return written ? written : err; | 2404 | return written ? written : err; |
| 2404 | } | 2405 | } |
| 2406 | EXPORT_SYMBOL(__generic_file_aio_write); | ||
| 2405 | 2407 | ||
| 2406 | ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, | 2408 | /** |
| 2407 | const struct iovec *iov, unsigned long nr_segs, loff_t pos) | 2409 | * generic_file_aio_write - write data to a file |
| 2408 | { | 2410 | * @iocb: IO state structure |
| 2409 | struct file *file = iocb->ki_filp; | 2411 | * @iov: vector with data to write |
| 2410 | struct address_space *mapping = file->f_mapping; | 2412 | * @nr_segs: number of segments in the vector |
| 2411 | struct inode *inode = mapping->host; | 2413 | * @pos: position in file where to write |
| 2412 | ssize_t ret; | 2414 | * |
| 2413 | 2415 | * This is a wrapper around __generic_file_aio_write() to be used by most | |
| 2414 | BUG_ON(iocb->ki_pos != pos); | 2416 | * filesystems. It takes care of syncing the file in case of O_SYNC file |
| 2415 | 2417 | * and acquires i_mutex as needed. | |
| 2416 | ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, | 2418 | */ |
| 2417 | &iocb->ki_pos); | ||
| 2418 | |||
| 2419 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
| 2420 | ssize_t err; | ||
| 2421 | |||
| 2422 | err = sync_page_range_nolock(inode, mapping, pos, ret); | ||
| 2423 | if (err < 0) | ||
| 2424 | ret = err; | ||
| 2425 | } | ||
| 2426 | return ret; | ||
| 2427 | } | ||
| 2428 | EXPORT_SYMBOL(generic_file_aio_write_nolock); | ||
| 2429 | |||
| 2430 | ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | 2419 | ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, |
| 2431 | unsigned long nr_segs, loff_t pos) | 2420 | unsigned long nr_segs, loff_t pos) |
| 2432 | { | 2421 | { |
| 2433 | struct file *file = iocb->ki_filp; | 2422 | struct file *file = iocb->ki_filp; |
| 2434 | struct address_space *mapping = file->f_mapping; | 2423 | struct inode *inode = file->f_mapping->host; |
| 2435 | struct inode *inode = mapping->host; | ||
| 2436 | ssize_t ret; | 2424 | ssize_t ret; |
| 2437 | 2425 | ||
| 2438 | BUG_ON(iocb->ki_pos != pos); | 2426 | BUG_ON(iocb->ki_pos != pos); |
| 2439 | 2427 | ||
| 2440 | mutex_lock(&inode->i_mutex); | 2428 | mutex_lock(&inode->i_mutex); |
| 2441 | ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, | 2429 | ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); |
| 2442 | &iocb->ki_pos); | ||
| 2443 | mutex_unlock(&inode->i_mutex); | 2430 | mutex_unlock(&inode->i_mutex); |
| 2444 | 2431 | ||
| 2445 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2432 | if (ret > 0 || ret == -EIOCBQUEUED) { |
| 2446 | ssize_t err; | 2433 | ssize_t err; |
| 2447 | 2434 | ||
| 2448 | err = sync_page_range(inode, mapping, pos, ret); | 2435 | err = generic_write_sync(file, pos, ret); |
| 2449 | if (err < 0) | 2436 | if (err < 0 && ret > 0) |
| 2450 | ret = err; | 2437 | ret = err; |
| 2451 | } | 2438 | } |
| 2452 | return ret; | 2439 | return ret; |
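A filesystem supplying its own ->aio_write can mirror the same split: perform the write with __generic_file_aio_write() under i_mutex and leave the O_SYNC flush to generic_write_sync() after the lock is dropped. A minimal sketch follows; the example_* wrapper is hypothetical, while the calls and error handling follow generic_file_aio_write() above.

#include <linux/fs.h>
#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/mutex.h>

static ssize_t example_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				      unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	BUG_ON(iocb->ki_pos != pos);

	mutex_lock(&inode->i_mutex);
	/* filesystem-specific preparation could go here, still under i_mutex */
	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
	mutex_unlock(&inode->i_mutex);

	/* Sync outside i_mutex; report the sync error only if the write succeeded. */
	if (ret > 0 || ret == -EIOCBQUEUED) {
		ssize_t err = generic_write_sync(file, pos, ret);
		if (err < 0 && ret > 0)
			ret = err;
	}
	return ret;
}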
| @@ -2463,6 +2450,9 @@ EXPORT_SYMBOL(generic_file_aio_write); | |||
| 2463 | * (presumably at page->private). If the release was successful, return `1'. | 2450 | * (presumably at page->private). If the release was successful, return `1'. |
| 2464 | * Otherwise return zero. | 2451 | * Otherwise return zero. |
| 2465 | * | 2452 | * |
| 2453 | * This may also be called if PG_fscache is set on a page, indicating that the | ||
| 2454 | * page is known to the local caching routines. | ||
| 2455 | * | ||
| 2466 | * The @gfp_mask argument specifies whether I/O may be performed to release | 2456 | * The @gfp_mask argument specifies whether I/O may be performed to release |
| 2467 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). | 2457 | * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). |
| 2468 | * | 2458 | * |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 0c04615651b7..1888b2d71bb8 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
| @@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping, | |||
| 89 | } | 89 | } |
| 90 | } | 90 | } |
| 91 | nr = nr - offset; | 91 | nr = nr - offset; |
| 92 | if (nr > len) | 92 | if (nr > len - copied) |
| 93 | nr = len; | 93 | nr = len - copied; |
| 94 | 94 | ||
| 95 | error = mapping->a_ops->get_xip_mem(mapping, index, 0, | 95 | error = mapping->a_ops->get_xip_mem(mapping, index, 0, |
| 96 | &xip_mem, &xip_pfn); | 96 | &xip_mem, &xip_pfn); |
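A concrete illustration of the clamp fix above (numbers chosen for illustration): with len = 6000 bytes requested and copied = 4096 bytes already transferred, an iteration that finds nr = 4096 bytes available passes the old test (nr > len is false) and copies 4096 bytes even though only 1904 remain, overrunning the request by 2192 bytes; clamping against len - copied limits the copy to the remaining 1904 bytes.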
| @@ -296,7 +296,7 @@ out: | |||
| 296 | } | 296 | } |
| 297 | } | 297 | } |
| 298 | 298 | ||
| 299 | static struct vm_operations_struct xip_file_vm_ops = { | 299 | static const struct vm_operations_struct xip_file_vm_ops = { |
| 300 | .fault = xip_file_fault, | 300 | .fault = xip_file_fault, |
| 301 | }; | 301 | }; |
| 302 | 302 | ||
diff --git a/mm/highmem.c b/mm/highmem.c index 68eb1d9b63fa..9c1e627f282e 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
| @@ -26,7 +26,6 @@ | |||
| 26 | #include <linux/init.h> | 26 | #include <linux/init.h> |
| 27 | #include <linux/hash.h> | 27 | #include <linux/hash.h> |
| 28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
| 29 | #include <linux/blktrace_api.h> | ||
| 30 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
| 31 | 30 | ||
| 32 | /* | 31 | /* |
| @@ -427,16 +426,21 @@ void __init page_address_init(void) | |||
| 427 | 426 | ||
| 428 | void debug_kmap_atomic(enum km_type type) | 427 | void debug_kmap_atomic(enum km_type type) |
| 429 | { | 428 | { |
| 430 | static unsigned warn_count = 10; | 429 | static int warn_count = 10; |
| 431 | 430 | ||
| 432 | if (unlikely(warn_count == 0)) | 431 | if (unlikely(warn_count < 0)) |
| 433 | return; | 432 | return; |
| 434 | 433 | ||
| 435 | if (unlikely(in_interrupt())) { | 434 | if (unlikely(in_interrupt())) { |
| 436 | if (in_irq()) { | 435 | if (in_nmi()) { |
| 436 | if (type != KM_NMI && type != KM_NMI_PTE) { | ||
| 437 | WARN_ON(1); | ||
| 438 | warn_count--; | ||
| 439 | } | ||
| 440 | } else if (in_irq()) { | ||
| 437 | if (type != KM_IRQ0 && type != KM_IRQ1 && | 441 | if (type != KM_IRQ0 && type != KM_IRQ1 && |
| 438 | type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && | 442 | type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && |
| 439 | type != KM_BOUNCE_READ) { | 443 | type != KM_BOUNCE_READ && type != KM_IRQ_PTE) { |
| 440 | WARN_ON(1); | 444 | WARN_ON(1); |
| 441 | warn_count--; | 445 | warn_count--; |
| 442 | } | 446 | } |
| @@ -453,7 +457,9 @@ void debug_kmap_atomic(enum km_type type) | |||
| 453 | } | 457 | } |
| 454 | 458 | ||
| 455 | if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || | 459 | if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || |
| 456 | type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { | 460 | type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ || |
| 461 | type == KM_IRQ_PTE || type == KM_NMI || | ||
| 462 | type == KM_NMI_PTE ) { | ||
| 457 | if (!irqs_disabled()) { | 463 | if (!irqs_disabled()) { |
| 458 | WARN_ON(1); | 464 | WARN_ON(1); |
| 459 | warn_count--; | 465 | warn_count--; |
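A sketch of what the extended check distinguishes; kmap_atomic() still takes an explicit km_type slot in this kernel, and the KM_NMI/KM_NMI_PTE slots are the ones accepted while in_nmi() is true. The helper below is hypothetical.

#include <linux/types.h>
#include <linux/highmem.h>

/* Hypothetical NMI-context peek at a word inside a page. */
static u32 example_peek_word_in_nmi(struct page *page, unsigned int offset)
{
	/* Using KM_USER0 here instead would now trigger the WARN_ON above. */
	void *kaddr = kmap_atomic(page, KM_NMI);
	u32 val = *(u32 *)(kaddr + offset);

	kunmap_atomic(kaddr, KM_NMI);
	return val;
}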
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 28c655ba9353..5d7601b02874 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | |||
| 234 | 234 | ||
| 235 | return 1UL << (hstate->order + PAGE_SHIFT); | 235 | return 1UL << (hstate->order + PAGE_SHIFT); |
| 236 | } | 236 | } |
| 237 | EXPORT_SYMBOL_GPL(vma_kernel_pagesize); | ||
| 237 | 238 | ||
| 238 | /* | 239 | /* |
| 239 | * Return the page size being used by the MMU to back a VMA. In the majority | 240 | * Return the page size being used by the MMU to back a VMA. In the majority |
| @@ -316,7 +317,7 @@ static void resv_map_release(struct kref *ref) | |||
| 316 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | 317 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) |
| 317 | { | 318 | { |
| 318 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 319 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
| 319 | if (!(vma->vm_flags & VM_SHARED)) | 320 | if (!(vma->vm_flags & VM_MAYSHARE)) |
| 320 | return (struct resv_map *)(get_vma_private_data(vma) & | 321 | return (struct resv_map *)(get_vma_private_data(vma) & |
| 321 | ~HPAGE_RESV_MASK); | 322 | ~HPAGE_RESV_MASK); |
| 322 | return NULL; | 323 | return NULL; |
| @@ -325,7 +326,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | |||
| 325 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | 326 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) |
| 326 | { | 327 | { |
| 327 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 328 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
| 328 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | 329 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); |
| 329 | 330 | ||
| 330 | set_vma_private_data(vma, (get_vma_private_data(vma) & | 331 | set_vma_private_data(vma, (get_vma_private_data(vma) & |
| 331 | HPAGE_RESV_MASK) | (unsigned long)map); | 332 | HPAGE_RESV_MASK) | (unsigned long)map); |
| @@ -334,7 +335,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | |||
| 334 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) | 335 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) |
| 335 | { | 336 | { |
| 336 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 337 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
| 337 | VM_BUG_ON(vma->vm_flags & VM_SHARED); | 338 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); |
| 338 | 339 | ||
| 339 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); | 340 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); |
| 340 | } | 341 | } |
| @@ -353,7 +354,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h, | |||
| 353 | if (vma->vm_flags & VM_NORESERVE) | 354 | if (vma->vm_flags & VM_NORESERVE) |
| 354 | return; | 355 | return; |
| 355 | 356 | ||
| 356 | if (vma->vm_flags & VM_SHARED) { | 357 | if (vma->vm_flags & VM_MAYSHARE) { |
| 357 | /* Shared mappings always use reserves */ | 358 | /* Shared mappings always use reserves */ |
| 358 | h->resv_huge_pages--; | 359 | h->resv_huge_pages--; |
| 359 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { | 360 | } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { |
| @@ -369,14 +370,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h, | |||
| 369 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | 370 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
| 370 | { | 371 | { |
| 371 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 372 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); |
| 372 | if (!(vma->vm_flags & VM_SHARED)) | 373 | if (!(vma->vm_flags & VM_MAYSHARE)) |
| 373 | vma->vm_private_data = (void *)0; | 374 | vma->vm_private_data = (void *)0; |
| 374 | } | 375 | } |
| 375 | 376 | ||
| 376 | /* Returns true if the VMA has associated reserve pages */ | 377 | /* Returns true if the VMA has associated reserve pages */ |
| 377 | static int vma_has_reserves(struct vm_area_struct *vma) | 378 | static int vma_has_reserves(struct vm_area_struct *vma) |
| 378 | { | 379 | { |
| 379 | if (vma->vm_flags & VM_SHARED) | 380 | if (vma->vm_flags & VM_MAYSHARE) |
| 380 | return 1; | 381 | return 1; |
| 381 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 382 | if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
| 382 | return 1; | 383 | return 1; |
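A note on the VM_SHARED -> VM_MAYSHARE switches in this file, offered as context: both flags are set for a MAP_SHARED mapping, but VM_SHARED is cleared again when the backing file was opened without write permission, so a hugetlbfs file mapped MAP_SHARED through an O_RDONLY descriptor carries VM_MAYSHARE but not VM_SHARED. Testing VM_MAYSHARE therefore lets the reservation and page-cache paths treat such read-only shared mappings as the shared mappings they are.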
| @@ -455,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) | |||
| 455 | h->free_huge_pages_node[nid]++; | 456 | h->free_huge_pages_node[nid]++; |
| 456 | } | 457 | } |
| 457 | 458 | ||
| 458 | static struct page *dequeue_huge_page(struct hstate *h) | ||
| 459 | { | ||
| 460 | int nid; | ||
| 461 | struct page *page = NULL; | ||
| 462 | |||
| 463 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { | ||
| 464 | if (!list_empty(&h->hugepage_freelists[nid])) { | ||
| 465 | page = list_entry(h->hugepage_freelists[nid].next, | ||
| 466 | struct page, lru); | ||
| 467 | list_del(&page->lru); | ||
| 468 | h->free_huge_pages--; | ||
| 469 | h->free_huge_pages_node[nid]--; | ||
| 470 | break; | ||
| 471 | } | ||
| 472 | } | ||
| 473 | return page; | ||
| 474 | } | ||
| 475 | |||
| 476 | static struct page *dequeue_huge_page_vma(struct hstate *h, | 459 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
| 477 | struct vm_area_struct *vma, | 460 | struct vm_area_struct *vma, |
| 478 | unsigned long address, int avoid_reserve) | 461 | unsigned long address, int avoid_reserve) |
| @@ -578,41 +561,6 @@ static void free_huge_page(struct page *page) | |||
| 578 | hugetlb_put_quota(mapping, 1); | 561 | hugetlb_put_quota(mapping, 1); |
| 579 | } | 562 | } |
| 580 | 563 | ||
| 581 | /* | ||
| 582 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
| 583 | * balanced by operating on them in a round-robin fashion. | ||
| 584 | * Returns 1 if an adjustment was made. | ||
| 585 | */ | ||
| 586 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
| 587 | { | ||
| 588 | static int prev_nid; | ||
| 589 | int nid = prev_nid; | ||
| 590 | int ret = 0; | ||
| 591 | |||
| 592 | VM_BUG_ON(delta != -1 && delta != 1); | ||
| 593 | do { | ||
| 594 | nid = next_node(nid, node_online_map); | ||
| 595 | if (nid == MAX_NUMNODES) | ||
| 596 | nid = first_node(node_online_map); | ||
| 597 | |||
| 598 | /* To shrink on this node, there must be a surplus page */ | ||
| 599 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | ||
| 600 | continue; | ||
| 601 | /* Surplus cannot exceed the total number of pages */ | ||
| 602 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | ||
| 603 | h->nr_huge_pages_node[nid]) | ||
| 604 | continue; | ||
| 605 | |||
| 606 | h->surplus_huge_pages += delta; | ||
| 607 | h->surplus_huge_pages_node[nid] += delta; | ||
| 608 | ret = 1; | ||
| 609 | break; | ||
| 610 | } while (nid != prev_nid); | ||
| 611 | |||
| 612 | prev_nid = nid; | ||
| 613 | return ret; | ||
| 614 | } | ||
| 615 | |||
| 616 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 564 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
| 617 | { | 565 | { |
| 618 | set_compound_page_dtor(page, free_huge_page); | 566 | set_compound_page_dtor(page, free_huge_page); |
| @@ -623,6 +571,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | |||
| 623 | put_page(page); /* free it into the hugepage allocator */ | 571 | put_page(page); /* free it into the hugepage allocator */ |
| 624 | } | 572 | } |
| 625 | 573 | ||
| 574 | static void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
| 575 | { | ||
| 576 | int i; | ||
| 577 | int nr_pages = 1 << order; | ||
| 578 | struct page *p = page + 1; | ||
| 579 | |||
| 580 | /* we rely on prep_new_huge_page to set the destructor */ | ||
| 581 | set_compound_order(page, order); | ||
| 582 | __SetPageHead(page); | ||
| 583 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
| 584 | __SetPageTail(p); | ||
| 585 | p->first_page = page; | ||
| 586 | } | ||
| 587 | } | ||
| 588 | |||
| 589 | int PageHuge(struct page *page) | ||
| 590 | { | ||
| 591 | compound_page_dtor *dtor; | ||
| 592 | |||
| 593 | if (!PageCompound(page)) | ||
| 594 | return 0; | ||
| 595 | |||
| 596 | page = compound_head(page); | ||
| 597 | dtor = get_compound_page_dtor(page); | ||
| 598 | |||
| 599 | return dtor == free_huge_page; | ||
| 600 | } | ||
| 601 | |||
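A small sketch of using the new PageHuge() test to tell hugetlbfs pages apart from other compound pages; the classifying helper is hypothetical, and tail pages work because PageHuge() resolves compound_head() internally.

#include <linux/mm.h>
#include <linux/hugetlb.h>

/* Hypothetical classifier for a page met during a scan. */
static const char *example_page_kind(struct page *page)
{
	if (PageHuge(page))		/* compound dtor is free_huge_page */
		return "hugetlb";
	if (PageCompound(page))		/* some other high-order allocation */
		return "compound";
	return "base";
}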
| 626 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | 602 | static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) |
| 627 | { | 603 | { |
| 628 | struct page *page; | 604 | struct page *page; |
| @@ -630,7 +606,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
| 630 | if (h->order >= MAX_ORDER) | 606 | if (h->order >= MAX_ORDER) |
| 631 | return NULL; | 607 | return NULL; |
| 632 | 608 | ||
| 633 | page = alloc_pages_node(nid, | 609 | page = alloc_pages_exact_node(nid, |
| 634 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| | 610 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| |
| 635 | __GFP_REPEAT|__GFP_NOWARN, | 611 | __GFP_REPEAT|__GFP_NOWARN, |
| 636 | huge_page_order(h)); | 612 | huge_page_order(h)); |
| @@ -647,22 +623,22 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
| 647 | 623 | ||
| 648 | /* | 624 | /* |
| 649 | * Use a helper variable to find the next node and then | 625 | * Use a helper variable to find the next node and then |
| 650 | * copy it back to hugetlb_next_nid afterwards: | 626 | * copy it back to next_nid_to_alloc afterwards: |
| 651 | * otherwise there's a window in which a racer might | 627 | * otherwise there's a window in which a racer might |
| 652 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | 628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. |
| 653 | * But we don't need to use a spin_lock here: it really | 629 | * But we don't need to use a spin_lock here: it really |
| 654 | * doesn't matter if occasionally a racer chooses the | 630 | * doesn't matter if occasionally a racer chooses the |
| 655 | * same nid as we do. Move nid forward in the mask even | 631 | * same nid as we do. Move nid forward in the mask even |
| 656 | * if we just successfully allocated a hugepage so that | 632 | * if we just successfully allocated a hugepage so that |
| 657 | * the next caller gets hugepages on the next node. | 633 | * the next caller gets hugepages on the next node. |
| 658 | */ | 634 | */ |
| 659 | static int hstate_next_node(struct hstate *h) | 635 | static int hstate_next_node_to_alloc(struct hstate *h) |
| 660 | { | 636 | { |
| 661 | int next_nid; | 637 | int next_nid; |
| 662 | next_nid = next_node(h->hugetlb_next_nid, node_online_map); | 638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); |
| 663 | if (next_nid == MAX_NUMNODES) | 639 | if (next_nid == MAX_NUMNODES) |
| 664 | next_nid = first_node(node_online_map); | 640 | next_nid = first_node(node_online_map); |
| 665 | h->hugetlb_next_nid = next_nid; | 641 | h->next_nid_to_alloc = next_nid; |
| 666 | return next_nid; | 642 | return next_nid; |
| 667 | } | 643 | } |
| 668 | 644 | ||
| @@ -673,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
| 673 | int next_nid; | 649 | int next_nid; |
| 674 | int ret = 0; | 650 | int ret = 0; |
| 675 | 651 | ||
| 676 | start_nid = h->hugetlb_next_nid; | 652 | start_nid = h->next_nid_to_alloc; |
| 653 | next_nid = start_nid; | ||
| 677 | 654 | ||
| 678 | do { | 655 | do { |
| 679 | page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); | 656 | page = alloc_fresh_huge_page_node(h, next_nid); |
| 680 | if (page) | 657 | if (page) |
| 681 | ret = 1; | 658 | ret = 1; |
| 682 | next_nid = hstate_next_node(h); | 659 | next_nid = hstate_next_node_to_alloc(h); |
| 683 | } while (!page && h->hugetlb_next_nid != start_nid); | 660 | } while (!page && next_nid != start_nid); |
| 684 | 661 | ||
| 685 | if (ret) | 662 | if (ret) |
| 686 | count_vm_event(HTLB_BUDDY_PGALLOC); | 663 | count_vm_event(HTLB_BUDDY_PGALLOC); |
| @@ -690,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
| 690 | return ret; | 667 | return ret; |
| 691 | } | 668 | } |
| 692 | 669 | ||
| 670 | /* | ||
| 671 | * helper for free_pool_huge_page() - find next node | ||
| 672 | * from which to free a huge page | ||
| 673 | */ | ||
| 674 | static int hstate_next_node_to_free(struct hstate *h) | ||
| 675 | { | ||
| 676 | int next_nid; | ||
| 677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | ||
| 678 | if (next_nid == MAX_NUMNODES) | ||
| 679 | next_nid = first_node(node_online_map); | ||
| 680 | h->next_nid_to_free = next_nid; | ||
| 681 | return next_nid; | ||
| 682 | } | ||
| 683 | |||
| 684 | /* | ||
| 685 | * Free huge page from pool from next node to free. | ||
| 686 | * Attempt to keep persistent huge pages more or less | ||
| 687 | * balanced over allowed nodes. | ||
| 688 | * Called with hugetlb_lock locked. | ||
| 689 | */ | ||
| 690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | ||
| 691 | { | ||
| 692 | int start_nid; | ||
| 693 | int next_nid; | ||
| 694 | int ret = 0; | ||
| 695 | |||
| 696 | start_nid = h->next_nid_to_free; | ||
| 697 | next_nid = start_nid; | ||
| 698 | |||
| 699 | do { | ||
| 700 | /* | ||
| 701 | * If we're returning unused surplus pages, only examine | ||
| 702 | * nodes with surplus pages. | ||
| 703 | */ | ||
| 704 | if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && | ||
| 705 | !list_empty(&h->hugepage_freelists[next_nid])) { | ||
| 706 | struct page *page = | ||
| 707 | list_entry(h->hugepage_freelists[next_nid].next, | ||
| 708 | struct page, lru); | ||
| 709 | list_del(&page->lru); | ||
| 710 | h->free_huge_pages--; | ||
| 711 | h->free_huge_pages_node[next_nid]--; | ||
| 712 | if (acct_surplus) { | ||
| 713 | h->surplus_huge_pages--; | ||
| 714 | h->surplus_huge_pages_node[next_nid]--; | ||
| 715 | } | ||
| 716 | update_and_free_page(h, page); | ||
| 717 | ret = 1; | ||
| 718 | } | ||
| 719 | next_nid = hstate_next_node_to_free(h); | ||
| 720 | } while (!ret && next_nid != start_nid); | ||
| 721 | |||
| 722 | return ret; | ||
| 723 | } | ||
| 724 | |||
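A sketch of the intended calling pattern inside mm/hugetlb.c, mirroring set_max_huge_pages() and return_unused_surplus_pages() elsewhere in this diff; the wrapper itself is hypothetical and hugetlb_lock must already be held.

/* Hypothetical: free up to 'nr' free huge pages, round-robin across nodes. */
static void example_shrink_free_pool(struct hstate *h, unsigned long nr)
{
	while (nr--) {
		/* acct_surplus == 0: plain pool shrink, no surplus accounting */
		if (!free_pool_huge_page(h, 0))
			break;		/* no node has a free huge page left */
	}
}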
| 693 | static struct page *alloc_buddy_huge_page(struct hstate *h, | 725 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
| 694 | struct vm_area_struct *vma, unsigned long address) | 726 | struct vm_area_struct *vma, unsigned long address) |
| 695 | { | 727 | { |
| @@ -861,22 +893,13 @@ free: | |||
| 861 | * When releasing a hugetlb pool reservation, any surplus pages that were | 893 | * When releasing a hugetlb pool reservation, any surplus pages that were |
| 862 | * allocated to satisfy the reservation must be explicitly freed if they were | 894 | * allocated to satisfy the reservation must be explicitly freed if they were |
| 863 | * never used. | 895 | * never used. |
| 896 | * Called with hugetlb_lock held. | ||
| 864 | */ | 897 | */ |
| 865 | static void return_unused_surplus_pages(struct hstate *h, | 898 | static void return_unused_surplus_pages(struct hstate *h, |
| 866 | unsigned long unused_resv_pages) | 899 | unsigned long unused_resv_pages) |
| 867 | { | 900 | { |
| 868 | static int nid = -1; | ||
| 869 | struct page *page; | ||
| 870 | unsigned long nr_pages; | 901 | unsigned long nr_pages; |
| 871 | 902 | ||
| 872 | /* | ||
| 873 | * We want to release as many surplus pages as possible, spread | ||
| 874 | * evenly across all nodes. Iterate across all nodes until we | ||
| 875 | * can no longer free unreserved surplus pages. This occurs when | ||
| 876 | * the nodes with surplus pages have no free pages. | ||
| 877 | */ | ||
| 878 | unsigned long remaining_iterations = num_online_nodes(); | ||
| 879 | |||
| 880 | /* Uncommit the reservation */ | 903 | /* Uncommit the reservation */ |
| 881 | h->resv_huge_pages -= unused_resv_pages; | 904 | h->resv_huge_pages -= unused_resv_pages; |
| 882 | 905 | ||
| @@ -886,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
| 886 | 909 | ||
| 887 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | 910 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
| 888 | 911 | ||
| 889 | while (remaining_iterations-- && nr_pages) { | 912 | /* |
| 890 | nid = next_node(nid, node_online_map); | 913 | * We want to release as many surplus pages as possible, spread |
| 891 | if (nid == MAX_NUMNODES) | 914 | * evenly across all nodes. Iterate across all nodes until we |
| 892 | nid = first_node(node_online_map); | 915 | * can no longer free unreserved surplus pages. This occurs when |
| 893 | 916 | * the nodes with surplus pages have no free pages. | |
| 894 | if (!h->surplus_huge_pages_node[nid]) | 917 | * free_pool_huge_page() will balance the the frees across the |
| 895 | continue; | 918 | * on-line nodes for us and will handle the hstate accounting. |
| 896 | 919 | */ | |
| 897 | if (!list_empty(&h->hugepage_freelists[nid])) { | 920 | while (nr_pages--) { |
| 898 | page = list_entry(h->hugepage_freelists[nid].next, | 921 | if (!free_pool_huge_page(h, 1)) |
| 899 | struct page, lru); | 922 | break; |
| 900 | list_del(&page->lru); | ||
| 901 | update_and_free_page(h, page); | ||
| 902 | h->free_huge_pages--; | ||
| 903 | h->free_huge_pages_node[nid]--; | ||
| 904 | h->surplus_huge_pages--; | ||
| 905 | h->surplus_huge_pages_node[nid]--; | ||
| 906 | nr_pages--; | ||
| 907 | remaining_iterations = num_online_nodes(); | ||
| 908 | } | ||
| 909 | } | 923 | } |
| 910 | } | 924 | } |
| 911 | 925 | ||
| @@ -924,7 +938,7 @@ static long vma_needs_reservation(struct hstate *h, | |||
| 924 | struct address_space *mapping = vma->vm_file->f_mapping; | 938 | struct address_space *mapping = vma->vm_file->f_mapping; |
| 925 | struct inode *inode = mapping->host; | 939 | struct inode *inode = mapping->host; |
| 926 | 940 | ||
| 927 | if (vma->vm_flags & VM_SHARED) { | 941 | if (vma->vm_flags & VM_MAYSHARE) { |
| 928 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | 942 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
| 929 | return region_chg(&inode->i_mapping->private_list, | 943 | return region_chg(&inode->i_mapping->private_list, |
| 930 | idx, idx + 1); | 944 | idx, idx + 1); |
| @@ -949,7 +963,7 @@ static void vma_commit_reservation(struct hstate *h, | |||
| 949 | struct address_space *mapping = vma->vm_file->f_mapping; | 963 | struct address_space *mapping = vma->vm_file->f_mapping; |
| 950 | struct inode *inode = mapping->host; | 964 | struct inode *inode = mapping->host; |
| 951 | 965 | ||
| 952 | if (vma->vm_flags & VM_SHARED) { | 966 | if (vma->vm_flags & VM_MAYSHARE) { |
| 953 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); | 967 | pgoff_t idx = vma_hugecache_offset(h, vma, addr); |
| 954 | region_add(&inode->i_mapping->private_list, idx, idx + 1); | 968 | region_add(&inode->i_mapping->private_list, idx, idx + 1); |
| 955 | 969 | ||
| @@ -1014,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
| 1014 | void *addr; | 1028 | void *addr; |
| 1015 | 1029 | ||
| 1016 | addr = __alloc_bootmem_node_nopanic( | 1030 | addr = __alloc_bootmem_node_nopanic( |
| 1017 | NODE_DATA(h->hugetlb_next_nid), | 1031 | NODE_DATA(h->next_nid_to_alloc), |
| 1018 | huge_page_size(h), huge_page_size(h), 0); | 1032 | huge_page_size(h), huge_page_size(h), 0); |
| 1019 | 1033 | ||
| 1034 | hstate_next_node_to_alloc(h); | ||
| 1020 | if (addr) { | 1035 | if (addr) { |
| 1021 | /* | 1036 | /* |
| 1022 | * Use the beginning of the huge page to store the | 1037 | * Use the beginning of the huge page to store the |
| @@ -1026,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
| 1026 | m = addr; | 1041 | m = addr; |
| 1027 | goto found; | 1042 | goto found; |
| 1028 | } | 1043 | } |
| 1029 | hstate_next_node(h); | ||
| 1030 | nr_nodes--; | 1044 | nr_nodes--; |
| 1031 | } | 1045 | } |
| 1032 | return 0; | 1046 | return 0; |
| @@ -1140,6 +1154,53 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
| 1140 | } | 1154 | } |
| 1141 | #endif | 1155 | #endif |
| 1142 | 1156 | ||
| 1157 | /* | ||
| 1158 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
| 1159 | * balanced by operating on them in a round-robin fashion. | ||
| 1160 | * Returns 1 if an adjustment was made. | ||
| 1161 | */ | ||
| 1162 | static int adjust_pool_surplus(struct hstate *h, int delta) | ||
| 1163 | { | ||
| 1164 | int start_nid, next_nid; | ||
| 1165 | int ret = 0; | ||
| 1166 | |||
| 1167 | VM_BUG_ON(delta != -1 && delta != 1); | ||
| 1168 | |||
| 1169 | if (delta < 0) | ||
| 1170 | start_nid = h->next_nid_to_alloc; | ||
| 1171 | else | ||
| 1172 | start_nid = h->next_nid_to_free; | ||
| 1173 | next_nid = start_nid; | ||
| 1174 | |||
| 1175 | do { | ||
| 1176 | int nid = next_nid; | ||
| 1177 | if (delta < 0) { | ||
| 1178 | next_nid = hstate_next_node_to_alloc(h); | ||
| 1179 | /* | ||
| 1180 | * To shrink on this node, there must be a surplus page | ||
| 1181 | */ | ||
| 1182 | if (!h->surplus_huge_pages_node[nid]) | ||
| 1183 | continue; | ||
| 1184 | } | ||
| 1185 | if (delta > 0) { | ||
| 1186 | next_nid = hstate_next_node_to_free(h); | ||
| 1187 | /* | ||
| 1188 | * Surplus cannot exceed the total number of pages | ||
| 1189 | */ | ||
| 1190 | if (h->surplus_huge_pages_node[nid] >= | ||
| 1191 | h->nr_huge_pages_node[nid]) | ||
| 1192 | continue; | ||
| 1193 | } | ||
| 1194 | |||
| 1195 | h->surplus_huge_pages += delta; | ||
| 1196 | h->surplus_huge_pages_node[nid] += delta; | ||
| 1197 | ret = 1; | ||
| 1198 | break; | ||
| 1199 | } while (next_nid != start_nid); | ||
| 1200 | |||
| 1201 | return ret; | ||
| 1202 | } | ||
| 1203 | |||
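A worked example of the two cursors (illustrative node layout): on a system with nodes 0 and 1 where only node 1 holds surplus pages and next_nid_to_alloc is 0, adjust_pool_surplus(h, -1) examines node 0 first, advances the allocation cursor, skips node 0 for lack of surplus, then decrements node 1's surplus count and succeeds; a later adjust_pool_surplus(h, 1) walks next_nid_to_free instead, so pool growth and shrinkage each keep their own round-robin position.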
| 1143 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) | 1204 | #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) |
| 1144 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | 1205 | static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) |
| 1145 | { | 1206 | { |
| @@ -1198,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
| 1198 | min_count = max(count, min_count); | 1259 | min_count = max(count, min_count); |
| 1199 | try_to_free_low(h, min_count); | 1260 | try_to_free_low(h, min_count); |
| 1200 | while (min_count < persistent_huge_pages(h)) { | 1261 | while (min_count < persistent_huge_pages(h)) { |
| 1201 | struct page *page = dequeue_huge_page(h); | 1262 | if (!free_pool_huge_page(h, 0)) |
| 1202 | if (!page) | ||
| 1203 | break; | 1263 | break; |
| 1204 | update_and_free_page(h, page); | ||
| 1205 | } | 1264 | } |
| 1206 | while (count < persistent_huge_pages(h)) { | 1265 | while (count < persistent_huge_pages(h)) { |
| 1207 | if (!adjust_pool_surplus(h, 1)) | 1266 | if (!adjust_pool_surplus(h, 1)) |
| @@ -1413,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
| 1413 | h->free_huge_pages = 0; | 1472 | h->free_huge_pages = 0; |
| 1414 | for (i = 0; i < MAX_NUMNODES; ++i) | 1473 | for (i = 0; i < MAX_NUMNODES; ++i) |
| 1415 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
| 1416 | h->hugetlb_next_nid = first_node(node_online_map); | 1475 | h->next_nid_to_alloc = first_node(node_online_map); |
| 1476 | h->next_nid_to_free = first_node(node_online_map); | ||
| 1417 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
| 1418 | huge_page_size(h)/1024); | 1478 | huge_page_size(h)/1024); |
| 1419 | 1479 | ||
| @@ -1477,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
| 1477 | 1537 | ||
| 1478 | #ifdef CONFIG_SYSCTL | 1538 | #ifdef CONFIG_SYSCTL |
| 1479 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
| 1480 | struct file *file, void __user *buffer, | 1540 | void __user *buffer, |
| 1481 | size_t *length, loff_t *ppos) | 1541 | size_t *length, loff_t *ppos) |
| 1482 | { | 1542 | { |
| 1483 | struct hstate *h = &default_hstate; | 1543 | struct hstate *h = &default_hstate; |
| @@ -1488,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
| 1488 | 1548 | ||
| 1489 | table->data = &tmp; | 1549 | table->data = &tmp; |
| 1490 | table->maxlen = sizeof(unsigned long); | 1550 | table->maxlen = sizeof(unsigned long); |
| 1491 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 1492 | 1552 | ||
| 1493 | if (write) | 1553 | if (write) |
| 1494 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); |
| @@ -1497,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
| 1497 | } | 1557 | } |
| 1498 | 1558 | ||
| 1499 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
| 1500 | struct file *file, void __user *buffer, | 1560 | void __user *buffer, |
| 1501 | size_t *length, loff_t *ppos) | 1561 | size_t *length, loff_t *ppos) |
| 1502 | { | 1562 | { |
| 1503 | proc_dointvec(table, write, file, buffer, length, ppos); | 1563 | proc_dointvec(table, write, buffer, length, ppos); |
| 1504 | if (hugepages_treat_as_movable) | 1564 | if (hugepages_treat_as_movable) |
| 1505 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; | 1565 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; |
| 1506 | else | 1566 | else |
| @@ -1509,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | |||
| 1509 | } | 1569 | } |
| 1510 | 1570 | ||
| 1511 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, | 1571 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, |
| 1512 | struct file *file, void __user *buffer, | 1572 | void __user *buffer, |
| 1513 | size_t *length, loff_t *ppos) | 1573 | size_t *length, loff_t *ppos) |
| 1514 | { | 1574 | { |
| 1515 | struct hstate *h = &default_hstate; | 1575 | struct hstate *h = &default_hstate; |
| @@ -1520,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
| 1520 | 1580 | ||
| 1521 | table->data = &tmp; | 1581 | table->data = &tmp; |
| 1522 | table->maxlen = sizeof(unsigned long); | 1582 | table->maxlen = sizeof(unsigned long); |
| 1523 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1583 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 1524 | 1584 | ||
| 1525 | if (write) { | 1585 | if (write) { |
| 1526 | spin_lock(&hugetlb_lock); | 1586 | spin_lock(&hugetlb_lock); |
| @@ -1661,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1661 | return 0; | 1721 | return 0; |
| 1662 | } | 1722 | } |
| 1663 | 1723 | ||
| 1664 | struct vm_operations_struct hugetlb_vm_ops = { | 1724 | const struct vm_operations_struct hugetlb_vm_ops = { |
| 1665 | .fault = hugetlb_vm_op_fault, | 1725 | .fault = hugetlb_vm_op_fault, |
| 1666 | .open = hugetlb_vm_op_open, | 1726 | .open = hugetlb_vm_op_open, |
| 1667 | .close = hugetlb_vm_op_close, | 1727 | .close = hugetlb_vm_op_close, |
| @@ -1893,7 +1953,7 @@ retry_avoidcopy: | |||
| 1893 | * at the time of fork() could consume its reserves on COW instead | 1953 | * at the time of fork() could consume its reserves on COW instead |
| 1894 | * of the full address range. | 1954 | * of the full address range. |
| 1895 | */ | 1955 | */ |
| 1896 | if (!(vma->vm_flags & VM_SHARED) && | 1956 | if (!(vma->vm_flags & VM_MAYSHARE) && |
| 1897 | is_vma_resv_set(vma, HPAGE_RESV_OWNER) && | 1957 | is_vma_resv_set(vma, HPAGE_RESV_OWNER) && |
| 1898 | old_page != pagecache_page) | 1958 | old_page != pagecache_page) |
| 1899 | outside_reserve = 1; | 1959 | outside_reserve = 1; |
| @@ -1956,8 +2016,28 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, | |||
| 1956 | return find_lock_page(mapping, idx); | 2016 | return find_lock_page(mapping, idx); |
| 1957 | } | 2017 | } |
| 1958 | 2018 | ||
| 2019 | /* | ||
| 2020 | * Return whether there is a pagecache page to back the given address within the VMA. | ||
| 2021 | * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. | ||
| 2022 | */ | ||
| 2023 | static bool hugetlbfs_pagecache_present(struct hstate *h, | ||
| 2024 | struct vm_area_struct *vma, unsigned long address) | ||
| 2025 | { | ||
| 2026 | struct address_space *mapping; | ||
| 2027 | pgoff_t idx; | ||
| 2028 | struct page *page; | ||
| 2029 | |||
| 2030 | mapping = vma->vm_file->f_mapping; | ||
| 2031 | idx = vma_hugecache_offset(h, vma, address); | ||
| 2032 | |||
| 2033 | page = find_get_page(mapping, idx); | ||
| 2034 | if (page) | ||
| 2035 | put_page(page); | ||
| 2036 | return page != NULL; | ||
| 2037 | } | ||
| 2038 | |||
| 1959 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2039 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 1960 | unsigned long address, pte_t *ptep, int write_access) | 2040 | unsigned long address, pte_t *ptep, unsigned int flags) |
| 1961 | { | 2041 | { |
| 1962 | struct hstate *h = hstate_vma(vma); | 2042 | struct hstate *h = hstate_vma(vma); |
| 1963 | int ret = VM_FAULT_SIGBUS; | 2043 | int ret = VM_FAULT_SIGBUS; |
| @@ -2000,7 +2080,7 @@ retry: | |||
| 2000 | clear_huge_page(page, address, huge_page_size(h)); | 2080 | clear_huge_page(page, address, huge_page_size(h)); |
| 2001 | __SetPageUptodate(page); | 2081 | __SetPageUptodate(page); |
| 2002 | 2082 | ||
| 2003 | if (vma->vm_flags & VM_SHARED) { | 2083 | if (vma->vm_flags & VM_MAYSHARE) { |
| 2004 | int err; | 2084 | int err; |
| 2005 | struct inode *inode = mapping->host; | 2085 | struct inode *inode = mapping->host; |
| 2006 | 2086 | ||
| @@ -2025,7 +2105,7 @@ retry: | |||
| 2025 | * any allocations necessary to record that reservation occur outside | 2105 | * any allocations necessary to record that reservation occur outside |
| 2026 | * the spinlock. | 2106 | * the spinlock. |
| 2027 | */ | 2107 | */ |
| 2028 | if (write_access && !(vma->vm_flags & VM_SHARED)) | 2108 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) |
| 2029 | if (vma_needs_reservation(h, vma, address) < 0) { | 2109 | if (vma_needs_reservation(h, vma, address) < 0) { |
| 2030 | ret = VM_FAULT_OOM; | 2110 | ret = VM_FAULT_OOM; |
| 2031 | goto backout_unlocked; | 2111 | goto backout_unlocked; |
| @@ -2044,7 +2124,7 @@ retry: | |||
| 2044 | && (vma->vm_flags & VM_SHARED))); | 2124 | && (vma->vm_flags & VM_SHARED))); |
| 2045 | set_huge_pte_at(mm, address, ptep, new_pte); | 2125 | set_huge_pte_at(mm, address, ptep, new_pte); |
| 2046 | 2126 | ||
| 2047 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | 2127 | if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { |
| 2048 | /* Optimization, do the COW without a second fault */ | 2128 | /* Optimization, do the COW without a second fault */ |
| 2049 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); | 2129 | ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); |
| 2050 | } | 2130 | } |
| @@ -2063,7 +2143,7 @@ backout_unlocked: | |||
| 2063 | } | 2143 | } |
| 2064 | 2144 | ||
| 2065 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2145 | int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2066 | unsigned long address, int write_access) | 2146 | unsigned long address, unsigned int flags) |
| 2067 | { | 2147 | { |
| 2068 | pte_t *ptep; | 2148 | pte_t *ptep; |
| 2069 | pte_t entry; | 2149 | pte_t entry; |
| @@ -2084,7 +2164,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2084 | mutex_lock(&hugetlb_instantiation_mutex); | 2164 | mutex_lock(&hugetlb_instantiation_mutex); |
| 2085 | entry = huge_ptep_get(ptep); | 2165 | entry = huge_ptep_get(ptep); |
| 2086 | if (huge_pte_none(entry)) { | 2166 | if (huge_pte_none(entry)) { |
| 2087 | ret = hugetlb_no_page(mm, vma, address, ptep, write_access); | 2167 | ret = hugetlb_no_page(mm, vma, address, ptep, flags); |
| 2088 | goto out_mutex; | 2168 | goto out_mutex; |
| 2089 | } | 2169 | } |
| 2090 | 2170 | ||
| @@ -2098,13 +2178,13 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2098 | * page now as it is used to determine if a reservation has been | 2178 | * page now as it is used to determine if a reservation has been |
| 2099 | * consumed. | 2179 | * consumed. |
| 2100 | */ | 2180 | */ |
| 2101 | if (write_access && !pte_write(entry)) { | 2181 | if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { |
| 2102 | if (vma_needs_reservation(h, vma, address) < 0) { | 2182 | if (vma_needs_reservation(h, vma, address) < 0) { |
| 2103 | ret = VM_FAULT_OOM; | 2183 | ret = VM_FAULT_OOM; |
| 2104 | goto out_mutex; | 2184 | goto out_mutex; |
| 2105 | } | 2185 | } |
| 2106 | 2186 | ||
| 2107 | if (!(vma->vm_flags & VM_SHARED)) | 2187 | if (!(vma->vm_flags & VM_MAYSHARE)) |
| 2108 | pagecache_page = hugetlbfs_pagecache_page(h, | 2188 | pagecache_page = hugetlbfs_pagecache_page(h, |
| 2109 | vma, address); | 2189 | vma, address); |
| 2110 | } | 2190 | } |
| @@ -2115,7 +2195,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2115 | goto out_page_table_lock; | 2195 | goto out_page_table_lock; |
| 2116 | 2196 | ||
| 2117 | 2197 | ||
| 2118 | if (write_access) { | 2198 | if (flags & FAULT_FLAG_WRITE) { |
| 2119 | if (!pte_write(entry)) { | 2199 | if (!pte_write(entry)) { |
| 2120 | ret = hugetlb_cow(mm, vma, address, ptep, entry, | 2200 | ret = hugetlb_cow(mm, vma, address, ptep, entry, |
| 2121 | pagecache_page); | 2201 | pagecache_page); |
| @@ -2124,7 +2204,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2124 | entry = pte_mkdirty(entry); | 2204 | entry = pte_mkdirty(entry); |
| 2125 | } | 2205 | } |
| 2126 | entry = pte_mkyoung(entry); | 2206 | entry = pte_mkyoung(entry); |
| 2127 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) | 2207 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
| 2208 | flags & FAULT_FLAG_WRITE)) | ||
| 2128 | update_mmu_cache(vma, address, entry); | 2209 | update_mmu_cache(vma, address, entry); |
| 2129 | 2210 | ||
| 2130 | out_page_table_lock: | 2211 | out_page_table_lock: |
| @@ -2150,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
| 2150 | return NULL; | 2231 | return NULL; |
| 2151 | } | 2232 | } |
| 2152 | 2233 | ||
| 2153 | static int huge_zeropage_ok(pte_t *ptep, int write, int shared) | ||
| 2154 | { | ||
| 2155 | if (!ptep || write || shared) | ||
| 2156 | return 0; | ||
| 2157 | else | ||
| 2158 | return huge_pte_none(huge_ptep_get(ptep)); | ||
| 2159 | } | ||
| 2160 | |||
| 2161 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2234 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2162 | struct page **pages, struct vm_area_struct **vmas, | 2235 | struct page **pages, struct vm_area_struct **vmas, |
| 2163 | unsigned long *position, int *length, int i, | 2236 | unsigned long *position, int *length, int i, |
| 2164 | int write) | 2237 | unsigned int flags) |
| 2165 | { | 2238 | { |
| 2166 | unsigned long pfn_offset; | 2239 | unsigned long pfn_offset; |
| 2167 | unsigned long vaddr = *position; | 2240 | unsigned long vaddr = *position; |
| 2168 | int remainder = *length; | 2241 | int remainder = *length; |
| 2169 | struct hstate *h = hstate_vma(vma); | 2242 | struct hstate *h = hstate_vma(vma); |
| 2170 | int zeropage_ok = 0; | ||
| 2171 | int shared = vma->vm_flags & VM_SHARED; | ||
| 2172 | 2243 | ||
| 2173 | spin_lock(&mm->page_table_lock); | 2244 | spin_lock(&mm->page_table_lock); |
| 2174 | while (vaddr < vma->vm_end && remainder) { | 2245 | while (vaddr < vma->vm_end && remainder) { |
| 2175 | pte_t *pte; | 2246 | pte_t *pte; |
| 2247 | int absent; | ||
| 2176 | struct page *page; | 2248 | struct page *page; |
| 2177 | 2249 | ||
| 2178 | /* | 2250 | /* |
| 2179 | * Some archs (sparc64, sh*) have multiple pte_ts to | 2251 | * Some archs (sparc64, sh*) have multiple pte_ts to |
| 2180 | * each hugepage. We have to make * sure we get the | 2252 | * each hugepage. We have to make sure we get the |
| 2181 | * first, for the page indexing below to work. | 2253 | * first, for the page indexing below to work. |
| 2182 | */ | 2254 | */ |
| 2183 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); | 2255 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
| 2184 | if (huge_zeropage_ok(pte, write, shared)) | 2256 | absent = !pte || huge_pte_none(huge_ptep_get(pte)); |
| 2185 | zeropage_ok = 1; | 2257 | |
| 2258 | /* | ||
| 2259 | * When coredumping, it suits get_dump_page if we just return | ||
| 2260 | * an error where there's an empty slot with no huge pagecache | ||
| 2261 | * to back it. This way, we avoid allocating a hugepage, and | ||
| 2262 | * the sparse dumpfile avoids allocating disk blocks, but its | ||
| 2263 | * huge holes still show up with zeroes where they need to be. | ||
| 2264 | */ | ||
| 2265 | if (absent && (flags & FOLL_DUMP) && | ||
| 2266 | !hugetlbfs_pagecache_present(h, vma, vaddr)) { | ||
| 2267 | remainder = 0; | ||
| 2268 | break; | ||
| 2269 | } | ||
| 2186 | 2270 | ||
| 2187 | if (!pte || | 2271 | if (absent || |
| 2188 | (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || | 2272 | ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { |
| 2189 | (write && !pte_write(huge_ptep_get(pte)))) { | ||
| 2190 | int ret; | 2273 | int ret; |
| 2191 | 2274 | ||
| 2192 | spin_unlock(&mm->page_table_lock); | 2275 | spin_unlock(&mm->page_table_lock); |
| 2193 | ret = hugetlb_fault(mm, vma, vaddr, write); | 2276 | ret = hugetlb_fault(mm, vma, vaddr, |
| 2277 | (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); | ||
| 2194 | spin_lock(&mm->page_table_lock); | 2278 | spin_lock(&mm->page_table_lock); |
| 2195 | if (!(ret & VM_FAULT_ERROR)) | 2279 | if (!(ret & VM_FAULT_ERROR)) |
| 2196 | continue; | 2280 | continue; |
| 2197 | 2281 | ||
| 2198 | remainder = 0; | 2282 | remainder = 0; |
| 2199 | if (!i) | ||
| 2200 | i = -EFAULT; | ||
| 2201 | break; | 2283 | break; |
| 2202 | } | 2284 | } |
| 2203 | 2285 | ||
| @@ -2205,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2205 | page = pte_page(huge_ptep_get(pte)); | 2287 | page = pte_page(huge_ptep_get(pte)); |
| 2206 | same_page: | 2288 | same_page: |
| 2207 | if (pages) { | 2289 | if (pages) { |
| 2208 | if (zeropage_ok) | 2290 | pages[i] = mem_map_offset(page, pfn_offset); |
| 2209 | pages[i] = ZERO_PAGE(0); | ||
| 2210 | else | ||
| 2211 | pages[i] = mem_map_offset(page, pfn_offset); | ||
| 2212 | get_page(pages[i]); | 2291 | get_page(pages[i]); |
| 2213 | } | 2292 | } |
| 2214 | 2293 | ||
| @@ -2232,7 +2311,7 @@ same_page: | |||
| 2232 | *length = remainder; | 2311 | *length = remainder; |
| 2233 | *position = vaddr; | 2312 | *position = vaddr; |
| 2234 | 2313 | ||
| 2235 | return i; | 2314 | return i ? i : -EFAULT; |
| 2236 | } | 2315 | } |
| 2237 | 2316 | ||
| 2238 | void hugetlb_change_protection(struct vm_area_struct *vma, | 2317 | void hugetlb_change_protection(struct vm_area_struct *vma, |
| @@ -2289,7 +2368,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 2289 | * to reserve the full area even if read-only as mprotect() may be | 2368 | * to reserve the full area even if read-only as mprotect() may be |
| 2290 | * called to make the mapping read-write. Assume !vma is a shm mapping | 2369 | * called to make the mapping read-write. Assume !vma is a shm mapping |
| 2291 | */ | 2370 | */ |
| 2292 | if (!vma || vma->vm_flags & VM_SHARED) | 2371 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
| 2293 | chg = region_chg(&inode->i_mapping->private_list, from, to); | 2372 | chg = region_chg(&inode->i_mapping->private_list, from, to); |
| 2294 | else { | 2373 | else { |
| 2295 | struct resv_map *resv_map = resv_map_alloc(); | 2374 | struct resv_map *resv_map = resv_map_alloc(); |
| @@ -2330,7 +2409,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 2330 | * consumed reservations are stored in the map. Hence, nothing | 2409 | * consumed reservations are stored in the map. Hence, nothing |
| 2331 | * else has to be done for private mappings here | 2410 | * else has to be done for private mappings here |
| 2332 | */ | 2411 | */ |
| 2333 | if (!vma || vma->vm_flags & VM_SHARED) | 2412 | if (!vma || vma->vm_flags & VM_MAYSHARE) |
| 2334 | region_add(&inode->i_mapping->private_list, from, to); | 2413 | region_add(&inode->i_mapping->private_list, from, to); |
| 2335 | return 0; | 2414 | return 0; |
| 2336 | } | 2415 | } |
| @@ -2341,7 +2420,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 2341 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | 2420 | long chg = region_truncate(&inode->i_mapping->private_list, offset); |
| 2342 | 2421 | ||
| 2343 | spin_lock(&inode->i_lock); | 2422 | spin_lock(&inode->i_lock); |
| 2344 | inode->i_blocks -= blocks_per_huge_page(h); | 2423 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
| 2345 | spin_unlock(&inode->i_lock); | 2424 | spin_unlock(&inode->i_lock); |
| 2346 | 2425 | ||
| 2347 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); | 2426 | hugetlb_put_quota(inode->i_mapping, (chg - freed)); |
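The hugetlb hunks above replace the old "write_access" integer with FOLL_*/FAULT_FLAG_* bit flags and make follow_hugetlb_page() return -EFAULT itself when nothing was pinned. A hedged sketch, not the kernel's actual call site, of how a get_user_pages()-style caller might drive the reworked interface; the wrapper name and its parameters are illustrative, only the follow_hugetlb_page() prototype comes from the diff:

	/*
	 * Sketch only: pin up to nr_pages hugepages starting at 'start',
	 * requesting writable mappings and, when dumping, letting holes
	 * fail fast instead of faulting in fresh hugepages.
	 */
	static int pin_huge_range(struct mm_struct *mm, struct vm_area_struct *vma,
				  unsigned long start, int nr_pages,
				  struct page **pages, bool dumping)
	{
		unsigned int flags = FOLL_WRITE;	/* replaces the old write_access int */
		int remaining = nr_pages;

		if (dumping)
			flags |= FOLL_DUMP;	/* empty slots return -EFAULT, see comment in the diff */

		/* Returns the number of pages pinned, or -EFAULT if none were. */
		return follow_hugetlb_page(mm, vma, pages, NULL, &start,
					   &remaining, 0, flags);
	}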
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 000000000000..e1d85137f086 --- /dev/null +++ b/mm/hwpoison-inject.c | |||
| @@ -0,0 +1,41 @@ | |||
| 1 | /* Inject a hwpoison memory failure on an arbitrary pfn */ | ||
| 2 | #include <linux/module.h> | ||
| 3 | #include <linux/debugfs.h> | ||
| 4 | #include <linux/kernel.h> | ||
| 5 | #include <linux/mm.h> | ||
| 6 | |||
| 7 | static struct dentry *hwpoison_dir, *corrupt_pfn; | ||
| 8 | |||
| 9 | static int hwpoison_inject(void *data, u64 val) | ||
| 10 | { | ||
| 11 | if (!capable(CAP_SYS_ADMIN)) | ||
| 12 | return -EPERM; | ||
| 13 | printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); | ||
| 14 | return __memory_failure(val, 18, 0); | ||
| 15 | } | ||
| 16 | |||
| 17 | DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); | ||
| 18 | |||
| 19 | static void pfn_inject_exit(void) | ||
| 20 | { | ||
| 21 | if (hwpoison_dir) | ||
| 22 | debugfs_remove_recursive(hwpoison_dir); | ||
| 23 | } | ||
| 24 | |||
| 25 | static int pfn_inject_init(void) | ||
| 26 | { | ||
| 27 | hwpoison_dir = debugfs_create_dir("hwpoison", NULL); | ||
| 28 | if (hwpoison_dir == NULL) | ||
| 29 | return -ENOMEM; | ||
| 30 | corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, | ||
| 31 | NULL, &hwpoison_fops); | ||
| 32 | if (corrupt_pfn == NULL) { | ||
| 33 | pfn_inject_exit(); | ||
| 34 | return -ENOMEM; | ||
| 35 | } | ||
| 36 | return 0; | ||
| 37 | } | ||
| 38 | |||
| 39 | module_init(pfn_inject_init); | ||
| 40 | module_exit(pfn_inject_exit); | ||
| 41 | MODULE_LICENSE("GPL"); | ||
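A hedged userspace sketch of exercising the injector added above: it writes a pfn string to the corrupt-pfn debugfs file. The debugfs mount point and the example pfn value are assumptions for illustration, not part of the module itself.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		/* Path assumes debugfs is mounted at /sys/kernel/debug. */
		const char *path = "/sys/kernel/debug/hwpoison/corrupt-pfn";
		const char *pfn = argc > 1 ? argv[1] : "0x1234"; /* example pfn, illustrative only */
		char buf[64];
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open corrupt-pfn");
			return 1;
		}
		snprintf(buf, sizeof(buf), "%s\n", pfn);
		if (write(fd, buf, strlen(buf)) < 0)
			perror("write corrupt-pfn");
		close(fd);
		return 0;
	}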
diff --git a/mm/init-mm.c b/mm/init-mm.c new file mode 100644 index 000000000000..57aba0da9668 --- /dev/null +++ b/mm/init-mm.c | |||
| @@ -0,0 +1,20 @@ | |||
| 1 | #include <linux/mm_types.h> | ||
| 2 | #include <linux/rbtree.h> | ||
| 3 | #include <linux/rwsem.h> | ||
| 4 | #include <linux/spinlock.h> | ||
| 5 | #include <linux/list.h> | ||
| 6 | #include <linux/cpumask.h> | ||
| 7 | |||
| 8 | #include <asm/atomic.h> | ||
| 9 | #include <asm/pgtable.h> | ||
| 10 | |||
| 11 | struct mm_struct init_mm = { | ||
| 12 | .mm_rb = RB_ROOT, | ||
| 13 | .pgd = swapper_pg_dir, | ||
| 14 | .mm_users = ATOMIC_INIT(2), | ||
| 15 | .mm_count = ATOMIC_INIT(1), | ||
| 16 | .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), | ||
| 17 | .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), | ||
| 18 | .mmlist = LIST_HEAD_INIT(init_mm.mmlist), | ||
| 19 | .cpu_vm_mask = CPU_MASK_ALL, | ||
| 20 | }; | ||
diff --git a/mm/internal.h b/mm/internal.h index 987bb03fbdd8..22ec8d2b0fb8 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -16,9 +16,6 @@ | |||
| 16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, | 16 | void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, |
| 17 | unsigned long floor, unsigned long ceiling); | 17 | unsigned long floor, unsigned long ceiling); |
| 18 | 18 | ||
| 19 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
| 20 | extern void prep_compound_gigantic_page(struct page *page, unsigned long order); | ||
| 21 | |||
| 22 | static inline void set_page_count(struct page *page, int v) | 19 | static inline void set_page_count(struct page *page, int v) |
| 23 | { | 20 | { |
| 24 | atomic_set(&page->_count, v); | 21 | atomic_set(&page->_count, v); |
| @@ -40,6 +37,8 @@ static inline void __put_page(struct page *page) | |||
| 40 | atomic_dec(&page->_count); | 37 | atomic_dec(&page->_count); |
| 41 | } | 38 | } |
| 42 | 39 | ||
| 40 | extern unsigned long highest_memmap_pfn; | ||
| 41 | |||
| 43 | /* | 42 | /* |
| 44 | * in mm/vmscan.c: | 43 | * in mm/vmscan.c: |
| 45 | */ | 44 | */ |
| @@ -49,8 +48,9 @@ extern void putback_lru_page(struct page *page); | |||
| 49 | /* | 48 | /* |
| 50 | * in mm/page_alloc.c | 49 | * in mm/page_alloc.c |
| 51 | */ | 50 | */ |
| 52 | extern unsigned long highest_memmap_pfn; | ||
| 53 | extern void __free_pages_bootmem(struct page *page, unsigned int order); | 51 | extern void __free_pages_bootmem(struct page *page, unsigned int order); |
| 52 | extern void prep_compound_page(struct page *page, unsigned long order); | ||
| 53 | |||
| 54 | 54 | ||
| 55 | /* | 55 | /* |
| 56 | * function for dealing with page's order in buddy system. | 56 | * function for dealing with page's order in buddy system. |
| @@ -74,7 +74,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) | |||
| 74 | } | 74 | } |
| 75 | #endif | 75 | #endif |
| 76 | 76 | ||
| 77 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 78 | /* | 77 | /* |
| 79 | * unevictable_migrate_page() called only from migrate_page_copy() to | 78 | * unevictable_migrate_page() called only from migrate_page_copy() to |
| 80 | * migrate unevictable flag to new page. | 79 | * migrate unevictable flag to new page. |
| @@ -86,11 +85,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) | |||
| 86 | if (TestClearPageUnevictable(old)) | 85 | if (TestClearPageUnevictable(old)) |
| 87 | SetPageUnevictable(new); | 86 | SetPageUnevictable(new); |
| 88 | } | 87 | } |
| 89 | #else | ||
| 90 | static inline void unevictable_migrate_page(struct page *new, struct page *old) | ||
| 91 | { | ||
| 92 | } | ||
| 93 | #endif | ||
| 94 | 88 | ||
| 95 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | 89 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT |
| 96 | /* | 90 | /* |
| @@ -150,23 +144,6 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
| 150 | } | 144 | } |
| 151 | } | 145 | } |
| 152 | 146 | ||
| 153 | /* | ||
| 154 | * free_page_mlock() -- clean up attempts to free and mlocked() page. | ||
| 155 | * Page should not be on lru, so no need to fix that up. | ||
| 156 | * free_pages_check() will verify... | ||
| 157 | */ | ||
| 158 | static inline void free_page_mlock(struct page *page) | ||
| 159 | { | ||
| 160 | if (unlikely(TestClearPageMlocked(page))) { | ||
| 161 | unsigned long flags; | ||
| 162 | |||
| 163 | local_irq_save(flags); | ||
| 164 | __dec_zone_page_state(page, NR_MLOCK); | ||
| 165 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
| 166 | local_irq_restore(flags); | ||
| 167 | } | ||
| 168 | } | ||
| 169 | |||
| 170 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 147 | #else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
| 171 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | 148 | static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) |
| 172 | { | 149 | { |
| @@ -175,7 +152,6 @@ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) | |||
| 175 | static inline void clear_page_mlock(struct page *page) { } | 152 | static inline void clear_page_mlock(struct page *page) { } |
| 176 | static inline void mlock_vma_page(struct page *page) { } | 153 | static inline void mlock_vma_page(struct page *page) { } |
| 177 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } | 154 | static inline void mlock_migrate_page(struct page *new, struct page *old) { } |
| 178 | static inline void free_page_mlock(struct page *page) { } | ||
| 179 | 155 | ||
| 180 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ | 156 | #endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ |
| 181 | 157 | ||
| @@ -275,13 +251,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, | |||
| 275 | } | 251 | } |
| 276 | #endif /* CONFIG_SPARSEMEM */ | 252 | #endif /* CONFIG_SPARSEMEM */ |
| 277 | 253 | ||
| 278 | #define GUP_FLAGS_WRITE 0x1 | ||
| 279 | #define GUP_FLAGS_FORCE 0x2 | ||
| 280 | #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 | ||
| 281 | #define GUP_FLAGS_IGNORE_SIGKILL 0x8 | ||
| 282 | |||
| 283 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 254 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 284 | unsigned long start, int len, int flags, | 255 | unsigned long start, int len, unsigned int foll_flags, |
| 285 | struct page **pages, struct vm_area_struct **vmas); | 256 | struct page **pages, struct vm_area_struct **vmas); |
| 286 | 257 | ||
| 258 | #define ZONE_RECLAIM_NOSCAN -2 | ||
| 259 | #define ZONE_RECLAIM_FULL -1 | ||
| 260 | #define ZONE_RECLAIM_SOME 0 | ||
| 261 | #define ZONE_RECLAIM_SUCCESS 1 | ||
| 287 | #endif | 262 | #endif |
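The ZONE_RECLAIM_* codes added to internal.h above encode why zone reclaim did or did not make progress. A minimal sketch of how a caller might collapse them into a boolean; the helper name is hypothetical and purely for illustration:

	static inline bool zone_reclaim_made_progress(int rc)
	{
		switch (rc) {
		case ZONE_RECLAIM_SUCCESS:	/* enough pages were reclaimed */
			return true;
		case ZONE_RECLAIM_SOME:		/* some reclaim happened, but not enough */
		case ZONE_RECLAIM_FULL:		/* zone scanned recently; nothing left to do */
		case ZONE_RECLAIM_NOSCAN:	/* reclaim was not attempted at all */
		default:
			return false;
		}
	}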
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c new file mode 100644 index 000000000000..fd814fd61319 --- /dev/null +++ b/mm/kmemcheck.c | |||
| @@ -0,0 +1,122 @@ | |||
| 1 | #include <linux/gfp.h> | ||
| 2 | #include <linux/mm_types.h> | ||
| 3 | #include <linux/mm.h> | ||
| 4 | #include <linux/slab.h> | ||
| 5 | #include <linux/kmemcheck.h> | ||
| 6 | |||
| 7 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) | ||
| 8 | { | ||
| 9 | struct page *shadow; | ||
| 10 | int pages; | ||
| 11 | int i; | ||
| 12 | |||
| 13 | pages = 1 << order; | ||
| 14 | |||
| 15 | /* | ||
| 16 | * With kmemcheck enabled, we need to allocate a memory area for the | ||
| 17 | * shadow bits as well. | ||
| 18 | */ | ||
| 19 | shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); | ||
| 20 | if (!shadow) { | ||
| 21 | if (printk_ratelimit()) | ||
| 22 | printk(KERN_ERR "kmemcheck: failed to allocate " | ||
| 23 | "shadow bitmap\n"); | ||
| 24 | return; | ||
| 25 | } | ||
| 26 | |||
| 27 | for(i = 0; i < pages; ++i) | ||
| 28 | page[i].shadow = page_address(&shadow[i]); | ||
| 29 | |||
| 30 | /* | ||
| 31 | * Mark it as non-present for the MMU so that our accesses to | ||
| 32 | * this memory will trigger a page fault and let us analyze | ||
| 33 | * the memory accesses. | ||
| 34 | */ | ||
| 35 | kmemcheck_hide_pages(page, pages); | ||
| 36 | } | ||
| 37 | |||
| 38 | void kmemcheck_free_shadow(struct page *page, int order) | ||
| 39 | { | ||
| 40 | struct page *shadow; | ||
| 41 | int pages; | ||
| 42 | int i; | ||
| 43 | |||
| 44 | if (!kmemcheck_page_is_tracked(page)) | ||
| 45 | return; | ||
| 46 | |||
| 47 | pages = 1 << order; | ||
| 48 | |||
| 49 | kmemcheck_show_pages(page, pages); | ||
| 50 | |||
| 51 | shadow = virt_to_page(page[0].shadow); | ||
| 52 | |||
| 53 | for(i = 0; i < pages; ++i) | ||
| 54 | page[i].shadow = NULL; | ||
| 55 | |||
| 56 | __free_pages(shadow, order); | ||
| 57 | } | ||
| 58 | |||
| 59 | void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, | ||
| 60 | size_t size) | ||
| 61 | { | ||
| 62 | /* | ||
| 63 | * Has already been memset(), which initializes the shadow for us | ||
| 64 | * as well. | ||
| 65 | */ | ||
| 66 | if (gfpflags & __GFP_ZERO) | ||
| 67 | return; | ||
| 68 | |||
| 69 | /* No need to initialize the shadow of a non-tracked slab. */ | ||
| 70 | if (s->flags & SLAB_NOTRACK) | ||
| 71 | return; | ||
| 72 | |||
| 73 | if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { | ||
| 74 | /* | ||
| 75 | * Allow notracked objects to be allocated from | ||
| 76 | * tracked caches. Note however that these objects | ||
| 77 | * will still get page faults on access, they just | ||
| 78 | * won't ever be flagged as uninitialized. If page | ||
| 79 | * faults are not acceptable, the slab cache itself | ||
| 80 | * should be marked NOTRACK. | ||
| 81 | */ | ||
| 82 | kmemcheck_mark_initialized(object, size); | ||
| 83 | } else if (!s->ctor) { | ||
| 84 | /* | ||
| 85 | * New objects should be marked uninitialized before | ||
| 86 | * they're returned to the caller. | ||
| 87 | */ | ||
| 88 | kmemcheck_mark_uninitialized(object, size); | ||
| 89 | } | ||
| 90 | } | ||
| 91 | |||
| 92 | void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) | ||
| 93 | { | ||
| 94 | /* TODO: RCU freeing is unsupported for now; hide false positives. */ | ||
| 95 | if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU)) | ||
| 96 | kmemcheck_mark_freed(object, size); | ||
| 97 | } | ||
| 98 | |||
| 99 | void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, | ||
| 100 | gfp_t gfpflags) | ||
| 101 | { | ||
| 102 | int pages; | ||
| 103 | |||
| 104 | if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) | ||
| 105 | return; | ||
| 106 | |||
| 107 | pages = 1 << order; | ||
| 108 | |||
| 109 | /* | ||
| 110 | * NOTE: We choose to track GFP_ZERO pages too; in fact, they | ||
| 111 | * can become uninitialized by copying uninitialized memory | ||
| 112 | * into them. | ||
| 113 | */ | ||
| 114 | |||
| 115 | /* XXX: Can use zone->node for node? */ | ||
| 116 | kmemcheck_alloc_shadow(page, order, gfpflags, -1); | ||
| 117 | |||
| 118 | if (gfpflags & __GFP_ZERO) | ||
| 119 | kmemcheck_mark_initialized_pages(page, pages); | ||
| 120 | else | ||
| 121 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
| 122 | } | ||
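A hedged sketch of where the page-allocator hook above could sit, assuming a wrapper around alloc_pages(); the wrapper is hypothetical and only illustrates the call contract shown in the new file (highmem and __GFP_NOTRACK allocations are skipped by the hook itself, and zeroed pages start out marked initialized):

	static struct page *alloc_pages_tracked(gfp_t gfp_mask, unsigned int order)
	{
		struct page *page = alloc_pages(gfp_mask, order);

		/*
		 * kmemcheck_pagealloc_alloc() ignores __GFP_HIGHMEM and
		 * __GFP_NOTRACK allocations internally, so it is safe to
		 * call it unconditionally for any successful allocation.
		 */
		if (page)
			kmemcheck_pagealloc_alloc(page, order, gfp_mask);
		return page;
	}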
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c new file mode 100644 index 000000000000..177a5169bbde --- /dev/null +++ b/mm/kmemleak-test.c | |||
| @@ -0,0 +1,111 @@ | |||
| 1 | /* | ||
| 2 | * mm/kmemleak-test.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008 ARM Limited | ||
| 5 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 19 | */ | ||
| 20 | |||
| 21 | #include <linux/init.h> | ||
| 22 | #include <linux/kernel.h> | ||
| 23 | #include <linux/module.h> | ||
| 24 | #include <linux/slab.h> | ||
| 25 | #include <linux/vmalloc.h> | ||
| 26 | #include <linux/list.h> | ||
| 27 | #include <linux/percpu.h> | ||
| 28 | #include <linux/fdtable.h> | ||
| 29 | |||
| 30 | #include <linux/kmemleak.h> | ||
| 31 | |||
| 32 | struct test_node { | ||
| 33 | long header[25]; | ||
| 34 | struct list_head list; | ||
| 35 | long footer[25]; | ||
| 36 | }; | ||
| 37 | |||
| 38 | static LIST_HEAD(test_list); | ||
| 39 | static DEFINE_PER_CPU(void *, kmemleak_test_pointer); | ||
| 40 | |||
| 41 | /* | ||
| 42 | * Some very simple testing. This function needs to be extended for | ||
| 43 | * proper testing. | ||
| 44 | */ | ||
| 45 | static int __init kmemleak_test_init(void) | ||
| 46 | { | ||
| 47 | struct test_node *elem; | ||
| 48 | int i; | ||
| 49 | |||
| 50 | printk(KERN_INFO "Kmemleak testing\n"); | ||
| 51 | |||
| 52 | /* make some orphan objects */ | ||
| 53 | pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
| 54 | pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); | ||
| 55 | pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
| 56 | pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); | ||
| 57 | pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
| 58 | pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); | ||
| 59 | pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
| 60 | pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); | ||
| 61 | #ifndef CONFIG_MODULES | ||
| 62 | pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", | ||
| 63 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
| 64 | pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n", | ||
| 65 | kmem_cache_alloc(files_cachep, GFP_KERNEL)); | ||
| 66 | #endif | ||
| 67 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
| 68 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
| 69 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
| 70 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
| 71 | pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64)); | ||
| 72 | |||
| 73 | /* | ||
| 74 | * Add elements to a list. They should only appear as orphan | ||
| 75 | * after the module is removed. | ||
| 76 | */ | ||
| 77 | for (i = 0; i < 10; i++) { | ||
| 78 | elem = kmalloc(sizeof(*elem), GFP_KERNEL); | ||
| 79 | pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); | ||
| 80 | if (!elem) | ||
| 81 | return -ENOMEM; | ||
| 82 | memset(elem, 0, sizeof(*elem)); | ||
| 83 | INIT_LIST_HEAD(&elem->list); | ||
| 84 | |||
| 85 | list_add_tail(&elem->list, &test_list); | ||
| 86 | } | ||
| 87 | |||
| 88 | for_each_possible_cpu(i) { | ||
| 89 | per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); | ||
| 90 | pr_info("kmemleak: kmalloc(129) = %p\n", | ||
| 91 | per_cpu(kmemleak_test_pointer, i)); | ||
| 92 | } | ||
| 93 | |||
| 94 | return 0; | ||
| 95 | } | ||
| 96 | module_init(kmemleak_test_init); | ||
| 97 | |||
| 98 | static void __exit kmemleak_test_exit(void) | ||
| 99 | { | ||
| 100 | struct test_node *elem, *tmp; | ||
| 101 | |||
| 102 | /* | ||
| 103 | * Remove the list elements without actually freeing the | ||
| 104 | * memory. | ||
| 105 | */ | ||
| 106 | list_for_each_entry_safe(elem, tmp, &test_list, list) | ||
| 107 | list_del(&elem->list); | ||
| 108 | } | ||
| 109 | module_exit(kmemleak_test_exit); | ||
| 110 | |||
| 111 | MODULE_LICENSE("GPL"); | ||
diff --git a/mm/kmemleak.c b/mm/kmemleak.c new file mode 100644 index 000000000000..8bf765c4f58d --- /dev/null +++ b/mm/kmemleak.c | |||
| @@ -0,0 +1,1689 @@ | |||
| 1 | /* | ||
| 2 | * mm/kmemleak.c | ||
| 3 | * | ||
| 4 | * Copyright (C) 2008 ARM Limited | ||
| 5 | * Written by Catalin Marinas <catalin.marinas@arm.com> | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License version 2 as | ||
| 9 | * published by the Free Software Foundation. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 19 | * | ||
| 20 | * | ||
| 21 | * For more information on the algorithm and kmemleak usage, please see | ||
| 22 | * Documentation/kmemleak.txt. | ||
| 23 | * | ||
| 24 | * Notes on locking | ||
| 25 | * ---------------- | ||
| 26 | * | ||
| 27 | * The following locks and mutexes are used by kmemleak: | ||
| 28 | * | ||
| 29 | * - kmemleak_lock (rwlock): protects the object_list modifications and | ||
| 30 | * accesses to the object_tree_root. The object_list is the main list | ||
| 31 | * holding the metadata (struct kmemleak_object) for the allocated memory | ||
| 32 | * blocks. The object_tree_root is a priority search tree used to look-up | ||
| 33 | * metadata based on a pointer to the corresponding memory block. The | ||
| 34 | * kmemleak_object structures are added to the object_list and | ||
| 35 | * object_tree_root in the create_object() function called from the | ||
| 36 | * kmemleak_alloc() callback and removed in delete_object() called from the | ||
| 37 | * kmemleak_free() callback | ||
| 38 | * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to | ||
| 39 | * the metadata (e.g. count) are protected by this lock. Note that some | ||
| 40 | * members of this structure may be protected by other means (atomic or | ||
| 41 | * kmemleak_lock). This lock is also held when scanning the corresponding | ||
| 42 | * memory block to avoid the kernel freeing it via the kmemleak_free() | ||
| 43 | * callback. This is less heavyweight than holding a global lock like | ||
| 44 | * kmemleak_lock during scanning | ||
| 45 | * - scan_mutex (mutex): ensures that only one thread may scan the memory for | ||
| 46 | * unreferenced objects at a time. The gray_list contains the objects which | ||
| 47 | * are already referenced or marked as false positives and need to be | ||
| 48 | * scanned. This list is only modified during a scanning episode when the | ||
| 49 | * scan_mutex is held. At the end of a scan, the gray_list is always empty. | ||
| 50 | * Note that the kmemleak_object.use_count is incremented when an object is | ||
| 51 | * added to the gray_list and therefore cannot be freed. This mutex also | ||
| 52 | * prevents multiple users of the "kmemleak" debugfs file together with | ||
| 53 | * modifications to the memory scanning parameters including the scan_thread | ||
| 54 | * pointer | ||
| 55 | * | ||
| 56 | * The kmemleak_object structures have a use_count incremented or decremented | ||
| 57 | * using the get_object()/put_object() functions. When the use_count becomes | ||
| 58 | * 0, this count can no longer be incremented and put_object() schedules the | ||
| 59 | * kmemleak_object freeing via an RCU callback. All calls to the get_object() | ||
| 60 | * function must be protected by rcu_read_lock() to avoid accessing a freed | ||
| 61 | * structure. | ||
| 62 | */ | ||
| 63 | |||
| 64 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | ||
| 65 | |||
| 66 | #include <linux/init.h> | ||
| 67 | #include <linux/kernel.h> | ||
| 68 | #include <linux/list.h> | ||
| 69 | #include <linux/sched.h> | ||
| 70 | #include <linux/jiffies.h> | ||
| 71 | #include <linux/delay.h> | ||
| 72 | #include <linux/module.h> | ||
| 73 | #include <linux/kthread.h> | ||
| 74 | #include <linux/prio_tree.h> | ||
| 75 | #include <linux/gfp.h> | ||
| 76 | #include <linux/fs.h> | ||
| 77 | #include <linux/debugfs.h> | ||
| 78 | #include <linux/seq_file.h> | ||
| 79 | #include <linux/cpumask.h> | ||
| 80 | #include <linux/spinlock.h> | ||
| 81 | #include <linux/mutex.h> | ||
| 82 | #include <linux/rcupdate.h> | ||
| 83 | #include <linux/stacktrace.h> | ||
| 84 | #include <linux/cache.h> | ||
| 85 | #include <linux/percpu.h> | ||
| 86 | #include <linux/hardirq.h> | ||
| 87 | #include <linux/mmzone.h> | ||
| 88 | #include <linux/slab.h> | ||
| 89 | #include <linux/thread_info.h> | ||
| 90 | #include <linux/err.h> | ||
| 91 | #include <linux/uaccess.h> | ||
| 92 | #include <linux/string.h> | ||
| 93 | #include <linux/nodemask.h> | ||
| 94 | #include <linux/mm.h> | ||
| 95 | #include <linux/workqueue.h> | ||
| 96 | |||
| 97 | #include <asm/sections.h> | ||
| 98 | #include <asm/processor.h> | ||
| 99 | #include <asm/atomic.h> | ||
| 100 | |||
| 101 | #include <linux/kmemcheck.h> | ||
| 102 | #include <linux/kmemleak.h> | ||
| 103 | |||
| 104 | /* | ||
| 105 | * Kmemleak configuration and common defines. | ||
| 106 | */ | ||
| 107 | #define MAX_TRACE 16 /* stack trace length */ | ||
| 108 | #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ | ||
| 109 | #define SECS_FIRST_SCAN 60 /* delay before the first scan */ | ||
| 110 | #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ | ||
| 111 | #define GRAY_LIST_PASSES 25 /* maximum number of gray list scans */ | ||
| 112 | #define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ | ||
| 113 | |||
| 114 | #define BYTES_PER_POINTER sizeof(void *) | ||
| 115 | |||
| 116 | /* GFP bitmask for kmemleak internal allocations */ | ||
| 117 | #define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) | ||
| 118 | |||
| 119 | /* scanning area inside a memory block */ | ||
| 120 | struct kmemleak_scan_area { | ||
| 121 | struct hlist_node node; | ||
| 122 | unsigned long offset; | ||
| 123 | size_t length; | ||
| 124 | }; | ||
| 125 | |||
| 126 | #define KMEMLEAK_GREY 0 | ||
| 127 | #define KMEMLEAK_BLACK -1 | ||
| 128 | |||
| 129 | /* | ||
| 130 | * Structure holding the metadata for each allocated memory block. | ||
| 131 | * Modifications to such objects should be made while holding the | ||
| 132 | * object->lock. Insertions or deletions from object_list, gray_list or | ||
| 133 | * tree_node are already protected by the corresponding locks or mutex (see | ||
| 134 | * the notes on locking above). These objects are reference-counted | ||
| 135 | * (use_count) and freed using the RCU mechanism. | ||
| 136 | */ | ||
| 137 | struct kmemleak_object { | ||
| 138 | spinlock_t lock; | ||
| 139 | unsigned long flags; /* object status flags */ | ||
| 140 | struct list_head object_list; | ||
| 141 | struct list_head gray_list; | ||
| 142 | struct prio_tree_node tree_node; | ||
| 143 | struct rcu_head rcu; /* object_list lockless traversal */ | ||
| 144 | /* object usage count; object freed when use_count == 0 */ | ||
| 145 | atomic_t use_count; | ||
| 146 | unsigned long pointer; | ||
| 147 | size_t size; | ||
| 148 | /* minimum number of a pointers found before it is considered leak */ | ||
| 149 | int min_count; | ||
| 150 | /* the total number of pointers found pointing to this object */ | ||
| 151 | int count; | ||
| 152 | /* memory ranges to be scanned inside an object (empty for all) */ | ||
| 153 | struct hlist_head area_list; | ||
| 154 | unsigned long trace[MAX_TRACE]; | ||
| 155 | unsigned int trace_len; | ||
| 156 | unsigned long jiffies; /* creation timestamp */ | ||
| 157 | pid_t pid; /* pid of the current task */ | ||
| 158 | char comm[TASK_COMM_LEN]; /* executable name */ | ||
| 159 | }; | ||
| 160 | |||
| 161 | /* flag representing the memory block allocation status */ | ||
| 162 | #define OBJECT_ALLOCATED (1 << 0) | ||
| 163 | /* flag set after the first reporting of an unreference object */ | ||
| 164 | #define OBJECT_REPORTED (1 << 1) | ||
| 165 | /* flag set to not scan the object */ | ||
| 166 | #define OBJECT_NO_SCAN (1 << 2) | ||
| 167 | /* flag set on newly allocated objects */ | ||
| 168 | #define OBJECT_NEW (1 << 3) | ||
| 169 | |||
| 170 | /* number of bytes to print per line; must be 16 or 32 */ | ||
| 171 | #define HEX_ROW_SIZE 16 | ||
| 172 | /* number of bytes to print at a time (1, 2, 4, 8) */ | ||
| 173 | #define HEX_GROUP_SIZE 1 | ||
| 174 | /* include ASCII after the hex output */ | ||
| 175 | #define HEX_ASCII 1 | ||
| 176 | /* max number of lines to be printed */ | ||
| 177 | #define HEX_MAX_LINES 2 | ||
| 178 | |||
| 179 | /* the list of all allocated objects */ | ||
| 180 | static LIST_HEAD(object_list); | ||
| 181 | /* the list of gray-colored objects (see color_gray comment below) */ | ||
| 182 | static LIST_HEAD(gray_list); | ||
| 183 | /* prio search tree for object boundaries */ | ||
| 184 | static struct prio_tree_root object_tree_root; | ||
| 185 | /* rw_lock protecting the access to object_list and prio_tree_root */ | ||
| 186 | static DEFINE_RWLOCK(kmemleak_lock); | ||
| 187 | |||
| 188 | /* allocation caches for kmemleak internal data */ | ||
| 189 | static struct kmem_cache *object_cache; | ||
| 190 | static struct kmem_cache *scan_area_cache; | ||
| 191 | |||
| 192 | /* set if tracing memory operations is enabled */ | ||
| 193 | static atomic_t kmemleak_enabled = ATOMIC_INIT(0); | ||
| 194 | /* set in the late_initcall if there were no errors */ | ||
| 195 | static atomic_t kmemleak_initialized = ATOMIC_INIT(0); | ||
| 196 | /* enables or disables early logging of the memory operations */ | ||
| 197 | static atomic_t kmemleak_early_log = ATOMIC_INIT(1); | ||
| 198 | /* set if a fatal kmemleak error has occurred */ | ||
| 199 | static atomic_t kmemleak_error = ATOMIC_INIT(0); | ||
| 200 | |||
| 201 | /* minimum and maximum address that may be valid pointers */ | ||
| 202 | static unsigned long min_addr = ULONG_MAX; | ||
| 203 | static unsigned long max_addr; | ||
| 204 | |||
| 205 | static struct task_struct *scan_thread; | ||
| 206 | /* used to avoid reporting of recently allocated objects */ | ||
| 207 | static unsigned long jiffies_min_age; | ||
| 208 | static unsigned long jiffies_last_scan; | ||
| 209 | /* delay between automatic memory scannings */ | ||
| 210 | static signed long jiffies_scan_wait; | ||
| 211 | /* enables or disables the task stacks scanning */ | ||
| 212 | static int kmemleak_stack_scan = 1; | ||
| 213 | /* protects the memory scanning, parameters and debug/kmemleak file access */ | ||
| 214 | static DEFINE_MUTEX(scan_mutex); | ||
| 215 | |||
| 216 | /* | ||
| 217 | * Early object allocation/freeing logging. Kmemleak is initialized after the | ||
| 218 | * kernel allocator. However, both the kernel allocator and kmemleak may | ||
| 219 | * allocate memory blocks which need to be tracked. Kmemleak defines an | ||
| 220 | * arbitrary buffer to hold the allocation/freeing information before it is | ||
| 221 | * fully initialized. | ||
| 222 | */ | ||
| 223 | |||
| 224 | /* kmemleak operation type for early logging */ | ||
| 225 | enum { | ||
| 226 | KMEMLEAK_ALLOC, | ||
| 227 | KMEMLEAK_FREE, | ||
| 228 | KMEMLEAK_FREE_PART, | ||
| 229 | KMEMLEAK_NOT_LEAK, | ||
| 230 | KMEMLEAK_IGNORE, | ||
| 231 | KMEMLEAK_SCAN_AREA, | ||
| 232 | KMEMLEAK_NO_SCAN | ||
| 233 | }; | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Structure holding the information passed to kmemleak callbacks during the | ||
| 237 | * early logging. | ||
| 238 | */ | ||
| 239 | struct early_log { | ||
| 240 | int op_type; /* kmemleak operation type */ | ||
| 241 | const void *ptr; /* allocated/freed memory block */ | ||
| 242 | size_t size; /* memory block size */ | ||
| 243 | int min_count; /* minimum reference count */ | ||
| 244 | unsigned long offset; /* scan area offset */ | ||
| 245 | size_t length; /* scan area length */ | ||
| 246 | unsigned long trace[MAX_TRACE]; /* stack trace */ | ||
| 247 | unsigned int trace_len; /* stack trace length */ | ||
| 248 | }; | ||
| 249 | |||
| 250 | /* early logging buffer and current position */ | ||
| 251 | static struct early_log | ||
| 252 | early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; | ||
| 253 | static int crt_early_log __initdata; | ||
| 254 | |||
| 255 | static void kmemleak_disable(void); | ||
| 256 | |||
| 257 | /* | ||
| 258 | * Print a warning and dump the stack trace. | ||
| 259 | */ | ||
| 260 | #define kmemleak_warn(x...) do { \ | ||
| 261 | pr_warning(x); \ | ||
| 262 | dump_stack(); \ | ||
| 263 | } while (0) | ||
| 264 | |||
| 265 | /* | ||
| 266 | * Macro invoked when a serious kmemleak condition occurred and cannot be | ||
| 267 | * recovered from. Kmemleak will be disabled and further allocation/freeing | ||
| 268 | * tracing is no longer available. | ||
| 269 | */ | ||
| 270 | #define kmemleak_stop(x...) do { \ | ||
| 271 | kmemleak_warn(x); \ | ||
| 272 | kmemleak_disable(); \ | ||
| 273 | } while (0) | ||
| 274 | |||
| 275 | /* | ||
| 276 | * Printing of the objects hex dump to the seq file. The number of lines to be | ||
| 277 | * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The | ||
| 278 | * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called | ||
| 279 | * with the object->lock held. | ||
| 280 | */ | ||
| 281 | static void hex_dump_object(struct seq_file *seq, | ||
| 282 | struct kmemleak_object *object) | ||
| 283 | { | ||
| 284 | const u8 *ptr = (const u8 *)object->pointer; | ||
| 285 | int i, len, remaining; | ||
| 286 | unsigned char linebuf[HEX_ROW_SIZE * 5]; | ||
| 287 | |||
| 288 | /* limit the number of lines to HEX_MAX_LINES */ | ||
| 289 | remaining = len = | ||
| 290 | min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); | ||
| 291 | |||
| 292 | seq_printf(seq, " hex dump (first %d bytes):\n", len); | ||
| 293 | for (i = 0; i < len; i += HEX_ROW_SIZE) { | ||
| 294 | int linelen = min(remaining, HEX_ROW_SIZE); | ||
| 295 | |||
| 296 | remaining -= HEX_ROW_SIZE; | ||
| 297 | hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, | ||
| 298 | HEX_GROUP_SIZE, linebuf, sizeof(linebuf), | ||
| 299 | HEX_ASCII); | ||
| 300 | seq_printf(seq, " %s\n", linebuf); | ||
| 301 | } | ||
| 302 | } | ||
| 303 | |||
| 304 | /* | ||
| 305 | * Object colors, encoded with count and min_count: | ||
| 306 | * - white - orphan object, not enough references to it (count < min_count) | ||
| 307 | * - gray - not orphan, not marked as false positive (min_count == 0) or | ||
| 308 | * sufficient references to it (count >= min_count) | ||
| 309 | * - black - ignore, it doesn't contain references (e.g. text section) | ||
| 310 | * (min_count == -1). No function defined for this color. | ||
| 311 | * Newly created objects don't have any color assigned (object->count == -1) | ||
| 312 | * before the next memory scan when they become white. | ||
| 313 | */ | ||
| 314 | static bool color_white(const struct kmemleak_object *object) | ||
| 315 | { | ||
| 316 | return object->count != KMEMLEAK_BLACK && | ||
| 317 | object->count < object->min_count; | ||
| 318 | } | ||
| 319 | |||
| 320 | static bool color_gray(const struct kmemleak_object *object) | ||
| 321 | { | ||
| 322 | return object->min_count != KMEMLEAK_BLACK && | ||
| 323 | object->count >= object->min_count; | ||
| 324 | } | ||
| 325 | |||
| 326 | static bool color_black(const struct kmemleak_object *object) | ||
| 327 | { | ||
| 328 | return object->min_count == KMEMLEAK_BLACK; | ||
| 329 | } | ||
| 330 | |||
| 331 | /* | ||
| 332 | * Objects are considered unreferenced only if their color is white, they have | ||
| 333 | * not been deleted and have a minimum age to avoid false positives caused by | ||
| 334 | * pointers temporarily stored in CPU registers. | ||
| 335 | */ | ||
| 336 | static bool unreferenced_object(struct kmemleak_object *object) | ||
| 337 | { | ||
| 338 | return (object->flags & OBJECT_ALLOCATED) && color_white(object) && | ||
| 339 | time_before_eq(object->jiffies + jiffies_min_age, | ||
| 340 | jiffies_last_scan); | ||
| 341 | } | ||
| 342 | |||
| 343 | /* | ||
| 344 | * Printing of the unreferenced objects information to the seq file. The | ||
| 345 | * print_unreferenced function must be called with the object->lock held. | ||
| 346 | */ | ||
| 347 | static void print_unreferenced(struct seq_file *seq, | ||
| 348 | struct kmemleak_object *object) | ||
| 349 | { | ||
| 350 | int i; | ||
| 351 | |||
| 352 | seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", | ||
| 353 | object->pointer, object->size); | ||
| 354 | seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", | ||
| 355 | object->comm, object->pid, object->jiffies); | ||
| 356 | hex_dump_object(seq, object); | ||
| 357 | seq_printf(seq, " backtrace:\n"); | ||
| 358 | |||
| 359 | for (i = 0; i < object->trace_len; i++) { | ||
| 360 | void *ptr = (void *)object->trace[i]; | ||
| 361 | seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); | ||
| 362 | } | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * Print the kmemleak_object information. This function is used mainly for | ||
| 367 | * debugging special cases during kmemleak operations. It must be called with | ||
| 368 | * the object->lock held. | ||
| 369 | */ | ||
| 370 | static void dump_object_info(struct kmemleak_object *object) | ||
| 371 | { | ||
| 372 | struct stack_trace trace; | ||
| 373 | |||
| 374 | trace.nr_entries = object->trace_len; | ||
| 375 | trace.entries = object->trace; | ||
| 376 | |||
| 377 | pr_notice("Object 0x%08lx (size %zu):\n", | ||
| 378 | object->tree_node.start, object->size); | ||
| 379 | pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", | ||
| 380 | object->comm, object->pid, object->jiffies); | ||
| 381 | pr_notice(" min_count = %d\n", object->min_count); | ||
| 382 | pr_notice(" count = %d\n", object->count); | ||
| 383 | pr_notice(" flags = 0x%lx\n", object->flags); | ||
| 384 | pr_notice(" backtrace:\n"); | ||
| 385 | print_stack_trace(&trace, 4); | ||
| 386 | } | ||
| 387 | |||
| 388 | /* | ||
| 389 | * Look-up a memory block metadata (kmemleak_object) in the priority search | ||
| 390 | * tree based on a pointer value. If alias is 0, only values pointing to the | ||
| 391 | * beginning of the memory block are allowed. The kmemleak_lock must be held | ||
| 392 | * when calling this function. | ||
| 393 | */ | ||
| 394 | static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) | ||
| 395 | { | ||
| 396 | struct prio_tree_node *node; | ||
| 397 | struct prio_tree_iter iter; | ||
| 398 | struct kmemleak_object *object; | ||
| 399 | |||
| 400 | prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr); | ||
| 401 | node = prio_tree_next(&iter); | ||
| 402 | if (node) { | ||
| 403 | object = prio_tree_entry(node, struct kmemleak_object, | ||
| 404 | tree_node); | ||
| 405 | if (!alias && object->pointer != ptr) { | ||
| 406 | kmemleak_warn("Found object by alias"); | ||
| 407 | object = NULL; | ||
| 408 | } | ||
| 409 | } else | ||
| 410 | object = NULL; | ||
| 411 | |||
| 412 | return object; | ||
| 413 | } | ||
| 414 | |||
| 415 | /* | ||
| 416 | * Increment the object use_count. Return 1 if successful or 0 otherwise. Note | ||
| 417 | * that once an object's use_count reached 0, the RCU freeing was already | ||
| 418 | * registered and the object should no longer be used. This function must be | ||
| 419 | * called under the protection of rcu_read_lock(). | ||
| 420 | */ | ||
| 421 | static int get_object(struct kmemleak_object *object) | ||
| 422 | { | ||
| 423 | return atomic_inc_not_zero(&object->use_count); | ||
| 424 | } | ||
| 425 | |||
| 426 | /* | ||
| 427 | * RCU callback to free a kmemleak_object. | ||
| 428 | */ | ||
| 429 | static void free_object_rcu(struct rcu_head *rcu) | ||
| 430 | { | ||
| 431 | struct hlist_node *elem, *tmp; | ||
| 432 | struct kmemleak_scan_area *area; | ||
| 433 | struct kmemleak_object *object = | ||
| 434 | container_of(rcu, struct kmemleak_object, rcu); | ||
| 435 | |||
| 436 | /* | ||
| 437 | * Once use_count is 0 (guaranteed by put_object), there is no other | ||
| 438 | * code accessing this object, hence no need for locking. | ||
| 439 | */ | ||
| 440 | hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) { | ||
| 441 | hlist_del(elem); | ||
| 442 | kmem_cache_free(scan_area_cache, area); | ||
| 443 | } | ||
| 444 | kmem_cache_free(object_cache, object); | ||
| 445 | } | ||
| 446 | |||
| 447 | /* | ||
| 448 | * Decrement the object use_count. Once the count is 0, free the object using | ||
| 449 | * an RCU callback. Since put_object() may be called via the kmemleak_free() -> | ||
| 450 | * delete_object() path, the delayed RCU freeing ensures that there is no | ||
| 451 | * recursive call to the kernel allocator. Lock-less RCU object_list traversal | ||
| 452 | * is also possible. | ||
| 453 | */ | ||
| 454 | static void put_object(struct kmemleak_object *object) | ||
| 455 | { | ||
| 456 | if (!atomic_dec_and_test(&object->use_count)) | ||
| 457 | return; | ||
| 458 | |||
| 459 | /* should only get here after delete_object was called */ | ||
| 460 | WARN_ON(object->flags & OBJECT_ALLOCATED); | ||
| 461 | |||
| 462 | call_rcu(&object->rcu, free_object_rcu); | ||
| 463 | } | ||
| 464 | |||
| 465 | /* | ||
| 466 | * Look up an object in the prio search tree and increase its use_count. | ||
| 467 | */ | ||
| 468 | static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) | ||
| 469 | { | ||
| 470 | unsigned long flags; | ||
| 471 | struct kmemleak_object *object = NULL; | ||
| 472 | |||
| 473 | rcu_read_lock(); | ||
| 474 | read_lock_irqsave(&kmemleak_lock, flags); | ||
| 475 | if (ptr >= min_addr && ptr < max_addr) | ||
| 476 | object = lookup_object(ptr, alias); | ||
| 477 | read_unlock_irqrestore(&kmemleak_lock, flags); | ||
| 478 | |||
| 479 | /* check whether the object is still available */ | ||
| 480 | if (object && !get_object(object)) | ||
| 481 | object = NULL; | ||
| 482 | rcu_read_unlock(); | ||
| 483 | |||
| 484 | return object; | ||
| 485 | } | ||
| 486 | |||
| 487 | /* | ||
| 488 | * Save stack trace to the given array of MAX_TRACE size. | ||
| 489 | */ | ||
| 490 | static int __save_stack_trace(unsigned long *trace) | ||
| 491 | { | ||
| 492 | struct stack_trace stack_trace; | ||
| 493 | |||
| 494 | stack_trace.max_entries = MAX_TRACE; | ||
| 495 | stack_trace.nr_entries = 0; | ||
| 496 | stack_trace.entries = trace; | ||
| 497 | stack_trace.skip = 2; | ||
| 498 | save_stack_trace(&stack_trace); | ||
| 499 | |||
| 500 | return stack_trace.nr_entries; | ||
| 501 | } | ||
| 502 | |||
| 503 | /* | ||
| 504 | * Create the metadata (struct kmemleak_object) corresponding to an allocated | ||
| 505 | * memory block and add it to the object_list and object_tree_root. | ||
| 506 | */ | ||
| 507 | static struct kmemleak_object *create_object(unsigned long ptr, size_t size, | ||
| 508 | int min_count, gfp_t gfp) | ||
| 509 | { | ||
| 510 | unsigned long flags; | ||
| 511 | struct kmemleak_object *object; | ||
| 512 | struct prio_tree_node *node; | ||
| 513 | |||
| 514 | object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); | ||
| 515 | if (!object) { | ||
| 516 | kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); | ||
| 517 | return NULL; | ||
| 518 | } | ||
| 519 | |||
| 520 | INIT_LIST_HEAD(&object->object_list); | ||
| 521 | INIT_LIST_HEAD(&object->gray_list); | ||
| 522 | INIT_HLIST_HEAD(&object->area_list); | ||
| 523 | spin_lock_init(&object->lock); | ||
| 524 | atomic_set(&object->use_count, 1); | ||
| 525 | object->flags = OBJECT_ALLOCATED | OBJECT_NEW; | ||
| 526 | object->pointer = ptr; | ||
| 527 | object->size = size; | ||
| 528 | object->min_count = min_count; | ||
| 529 | object->count = -1; /* no color initially */ | ||
| 530 | object->jiffies = jiffies; | ||
| 531 | |||
| 532 | /* task information */ | ||
| 533 | if (in_irq()) { | ||
| 534 | object->pid = 0; | ||
| 535 | strncpy(object->comm, "hardirq", sizeof(object->comm)); | ||
| 536 | } else if (in_softirq()) { | ||
| 537 | object->pid = 0; | ||
| 538 | strncpy(object->comm, "softirq", sizeof(object->comm)); | ||
| 539 | } else { | ||
| 540 | object->pid = current->pid; | ||
| 541 | /* | ||
| 542 | * There is a small chance of a race with set_task_comm(), | ||
| 543 | * however using get_task_comm() here may cause locking | ||
| 544 | * dependency issues with current->alloc_lock. In the worst | ||
| 545 | * case, the command line is not correct. | ||
| 546 | */ | ||
| 547 | strncpy(object->comm, current->comm, sizeof(object->comm)); | ||
| 548 | } | ||
| 549 | |||
| 550 | /* kernel backtrace */ | ||
| 551 | object->trace_len = __save_stack_trace(object->trace); | ||
| 552 | |||
| 553 | INIT_PRIO_TREE_NODE(&object->tree_node); | ||
| 554 | object->tree_node.start = ptr; | ||
| 555 | object->tree_node.last = ptr + size - 1; | ||
| 556 | |||
| 557 | write_lock_irqsave(&kmemleak_lock, flags); | ||
| 558 | |||
| 559 | min_addr = min(min_addr, ptr); | ||
| 560 | max_addr = max(max_addr, ptr + size); | ||
| 561 | node = prio_tree_insert(&object_tree_root, &object->tree_node); | ||
| 562 | /* | ||
| 563 | * The code calling the kernel allocator does not yet have the pointer to the | ||
| 564 | * memory block to be able to free it. However, we still hold the | ||
| 565 | * kmemleak_lock here in case parts of the kernel started freeing | ||
| 566 | * random memory blocks. | ||
| 567 | */ | ||
| 568 | if (node != &object->tree_node) { | ||
| 569 | kmemleak_stop("Cannot insert 0x%lx into the object search tree " | ||
| 570 | "(already existing)\n", ptr); | ||
| 571 | object = lookup_object(ptr, 1); | ||
| 572 | spin_lock(&object->lock); | ||
| 573 | dump_object_info(object); | ||
| 574 | spin_unlock(&object->lock); | ||
| 575 | |||
| 576 | goto out; | ||
| 577 | } | ||
| 578 | list_add_tail_rcu(&object->object_list, &object_list); | ||
| 579 | out: | ||
| 580 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
| 581 | return object; | ||
| 582 | } | ||
| 583 | |||
| 584 | /* | ||
| 585 | * Remove the metadata (struct kmemleak_object) for a memory block from the | ||
| 586 | * object_list and object_tree_root and decrement its use_count. | ||
| 587 | */ | ||
| 588 | static void __delete_object(struct kmemleak_object *object) | ||
| 589 | { | ||
| 590 | unsigned long flags; | ||
| 591 | |||
| 592 | write_lock_irqsave(&kmemleak_lock, flags); | ||
| 593 | prio_tree_remove(&object_tree_root, &object->tree_node); | ||
| 594 | list_del_rcu(&object->object_list); | ||
| 595 | write_unlock_irqrestore(&kmemleak_lock, flags); | ||
| 596 | |||
| 597 | WARN_ON(!(object->flags & OBJECT_ALLOCATED)); | ||
| 598 | WARN_ON(atomic_read(&object->use_count) < 2); | ||
| 599 | |||
| 600 | /* | ||
| 601 | * Locking here also ensures that the corresponding memory block | ||
| 602 | * cannot be freed when it is being scanned. | ||
| 603 | */ | ||
| 604 | spin_lock_irqsave(&object->lock, flags); | ||
| 605 | object->flags &= ~OBJECT_ALLOCATED; | ||
| 606 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 607 | put_object(object); | ||
| 608 | } | ||
| 609 | |||
| 610 | /* | ||
| 611 | * Look up the metadata (struct kmemleak_object) corresponding to ptr and | ||
| 612 | * delete it. | ||
| 613 | */ | ||
| 614 | static void delete_object_full(unsigned long ptr) | ||
| 615 | { | ||
| 616 | struct kmemleak_object *object; | ||
| 617 | |||
| 618 | object = find_and_get_object(ptr, 0); | ||
| 619 | if (!object) { | ||
| 620 | #ifdef DEBUG | ||
| 621 | kmemleak_warn("Freeing unknown object at 0x%08lx\n", | ||
| 622 | ptr); | ||
| 623 | #endif | ||
| 624 | return; | ||
| 625 | } | ||
| 626 | __delete_object(object); | ||
| 627 | put_object(object); | ||
| 628 | } | ||
| 629 | |||
| 630 | /* | ||
| 631 | * Look up the metadata (struct kmemleak_object) corresponding to ptr and | ||
| 632 | * delete it. If the memory block is partially freed, the function may create | ||
| 633 | * additional metadata for the remaining parts of the block. | ||
| 634 | */ | ||
| 635 | static void delete_object_part(unsigned long ptr, size_t size) | ||
| 636 | { | ||
| 637 | struct kmemleak_object *object; | ||
| 638 | unsigned long start, end; | ||
| 639 | |||
| 640 | object = find_and_get_object(ptr, 1); | ||
| 641 | if (!object) { | ||
| 642 | #ifdef DEBUG | ||
| 643 | kmemleak_warn("Partially freeing unknown object at 0x%08lx " | ||
| 644 | "(size %zu)\n", ptr, size); | ||
| 645 | #endif | ||
| 646 | return; | ||
| 647 | } | ||
| 648 | __delete_object(object); | ||
| 649 | |||
| 650 | /* | ||
| 651 | * Create one or two objects that may result from the memory block | ||
| 652 | * split. Note that partial freeing is only done by free_bootmem() and | ||
| 653 | * this happens before kmemleak_init() is called. The path below is | ||
| 654 | * only executed during early log recording in kmemleak_init(), so | ||
| 655 | * GFP_KERNEL is enough. | ||
| 656 | */ | ||
| 657 | start = object->pointer; | ||
| 658 | end = object->pointer + object->size; | ||
| 659 | if (ptr > start) | ||
| 660 | create_object(start, ptr - start, object->min_count, | ||
| 661 | GFP_KERNEL); | ||
| 662 | if (ptr + size < end) | ||
| 663 | create_object(ptr + size, end - ptr - size, object->min_count, | ||
| 664 | GFP_KERNEL); | ||
| 665 | |||
| 666 | put_object(object); | ||
| 667 | } | ||
| 668 | |||
| 669 | static void __paint_it(struct kmemleak_object *object, int color) | ||
| 670 | { | ||
| 671 | object->min_count = color; | ||
| 672 | if (color == KMEMLEAK_BLACK) | ||
| 673 | object->flags |= OBJECT_NO_SCAN; | ||
| 674 | } | ||
| 675 | |||
| 676 | static void paint_it(struct kmemleak_object *object, int color) | ||
| 677 | { | ||
| 678 | unsigned long flags; | ||
| 679 | |||
| 680 | spin_lock_irqsave(&object->lock, flags); | ||
| 681 | __paint_it(object, color); | ||
| 682 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 683 | } | ||
| 684 | |||
| 685 | static void paint_ptr(unsigned long ptr, int color) | ||
| 686 | { | ||
| 687 | struct kmemleak_object *object; | ||
| 688 | |||
| 689 | object = find_and_get_object(ptr, 0); | ||
| 690 | if (!object) { | ||
| 691 | kmemleak_warn("Trying to color unknown object " | ||
| 692 | "at 0x%08lx as %s\n", ptr, | ||
| 693 | (color == KMEMLEAK_GREY) ? "Grey" : | ||
| 694 | (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); | ||
| 695 | return; | ||
| 696 | } | ||
| 697 | paint_it(object, color); | ||
| 698 | put_object(object); | ||
| 699 | } | ||
| 700 | |||
| 701 | /* | ||
| 702 | * Mark an object permanently as gray-colored so that it can no longer be | ||
| 703 | * reported as a leak. This is used in general to mark a false positive. | ||
| 704 | */ | ||
| 705 | static void make_gray_object(unsigned long ptr) | ||
| 706 | { | ||
| 707 | paint_ptr(ptr, KMEMLEAK_GREY); | ||
| 708 | } | ||
| 709 | |||
| 710 | /* | ||
| 711 | * Mark the object as black-colored so that it is ignored from scans and | ||
| 712 | * reporting. | ||
| 713 | */ | ||
| 714 | static void make_black_object(unsigned long ptr) | ||
| 715 | { | ||
| 716 | paint_ptr(ptr, KMEMLEAK_BLACK); | ||
| 717 | } | ||
| 718 | |||
| 719 | /* | ||
| 720 | * Add a scanning area to the object. If at least one such area is added, | ||
| 721 | * kmemleak will only scan these ranges rather than the whole memory block. | ||
| 722 | */ | ||
| 723 | static void add_scan_area(unsigned long ptr, unsigned long offset, | ||
| 724 | size_t length, gfp_t gfp) | ||
| 725 | { | ||
| 726 | unsigned long flags; | ||
| 727 | struct kmemleak_object *object; | ||
| 728 | struct kmemleak_scan_area *area; | ||
| 729 | |||
| 730 | object = find_and_get_object(ptr, 0); | ||
| 731 | if (!object) { | ||
| 732 | kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", | ||
| 733 | ptr); | ||
| 734 | return; | ||
| 735 | } | ||
| 736 | |||
| 737 | area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); | ||
| 738 | if (!area) { | ||
| 739 | kmemleak_warn("Cannot allocate a scan area\n"); | ||
| 740 | goto out; | ||
| 741 | } | ||
| 742 | |||
| 743 | spin_lock_irqsave(&object->lock, flags); | ||
| 744 | if (offset + length > object->size) { | ||
| 745 | kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); | ||
| 746 | dump_object_info(object); | ||
| 747 | kmem_cache_free(scan_area_cache, area); | ||
| 748 | goto out_unlock; | ||
| 749 | } | ||
| 750 | |||
| 751 | INIT_HLIST_NODE(&area->node); | ||
| 752 | area->offset = offset; | ||
| 753 | area->length = length; | ||
| 754 | |||
| 755 | hlist_add_head(&area->node, &object->area_list); | ||
| 756 | out_unlock: | ||
| 757 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 758 | out: | ||
| 759 | put_object(object); | ||
| 760 | } | ||
| 761 | |||
| 762 | /* | ||
| 763 | * Set the OBJECT_NO_SCAN flag for the object corresponding to the given | ||
| 764 | * pointer. Such an object will not be scanned by kmemleak but references to it | ||
| 765 | * are searched. | ||
| 766 | */ | ||
| 767 | static void object_no_scan(unsigned long ptr) | ||
| 768 | { | ||
| 769 | unsigned long flags; | ||
| 770 | struct kmemleak_object *object; | ||
| 771 | |||
| 772 | object = find_and_get_object(ptr, 0); | ||
| 773 | if (!object) { | ||
| 774 | kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr); | ||
| 775 | return; | ||
| 776 | } | ||
| 777 | |||
| 778 | spin_lock_irqsave(&object->lock, flags); | ||
| 779 | object->flags |= OBJECT_NO_SCAN; | ||
| 780 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 781 | put_object(object); | ||
| 782 | } | ||
| 783 | |||
| 784 | /* | ||
| 785 | * Log an early kmemleak_* call to the early_log buffer. These calls will be | ||
| 786 | * processed later once kmemleak is fully initialized. | ||
| 787 | */ | ||
| 788 | static void __init log_early(int op_type, const void *ptr, size_t size, | ||
| 789 | int min_count, unsigned long offset, size_t length) | ||
| 790 | { | ||
| 791 | unsigned long flags; | ||
| 792 | struct early_log *log; | ||
| 793 | |||
| 794 | if (crt_early_log >= ARRAY_SIZE(early_log)) { | ||
| 795 | pr_warning("Early log buffer exceeded, " | ||
| 796 | "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); | ||
| 797 | kmemleak_disable(); | ||
| 798 | return; | ||
| 799 | } | ||
| 800 | |||
| 801 | /* | ||
| 802 | * There is no need for locking since the kernel is still in UP mode | ||
| 803 | * at this stage. Disabling the IRQs is enough. | ||
| 804 | */ | ||
| 805 | local_irq_save(flags); | ||
| 806 | log = &early_log[crt_early_log]; | ||
| 807 | log->op_type = op_type; | ||
| 808 | log->ptr = ptr; | ||
| 809 | log->size = size; | ||
| 810 | log->min_count = min_count; | ||
| 811 | log->offset = offset; | ||
| 812 | log->length = length; | ||
| 813 | if (op_type == KMEMLEAK_ALLOC) | ||
| 814 | log->trace_len = __save_stack_trace(log->trace); | ||
| 815 | crt_early_log++; | ||
| 816 | local_irq_restore(flags); | ||
| 817 | } | ||
| 818 | |||
| 819 | /* | ||
| 820 | * Log an early allocated block and populate the stack trace. | ||
| 821 | */ | ||
| 822 | static void early_alloc(struct early_log *log) | ||
| 823 | { | ||
| 824 | struct kmemleak_object *object; | ||
| 825 | unsigned long flags; | ||
| 826 | int i; | ||
| 827 | |||
| 828 | if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) | ||
| 829 | return; | ||
| 830 | |||
| 831 | /* | ||
| 832 | * RCU locking needed to ensure object is not freed via put_object(). | ||
| 833 | */ | ||
| 834 | rcu_read_lock(); | ||
| 835 | object = create_object((unsigned long)log->ptr, log->size, | ||
| 836 | log->min_count, GFP_ATOMIC); | ||
| 837 | if (!object) | ||
| 838 | goto out; | ||
| 839 | spin_lock_irqsave(&object->lock, flags); | ||
| 840 | for (i = 0; i < log->trace_len; i++) | ||
| 841 | object->trace[i] = log->trace[i]; | ||
| 842 | object->trace_len = log->trace_len; | ||
| 843 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 844 | out: | ||
| 845 | rcu_read_unlock(); | ||
| 846 | } | ||
| 847 | |||
| 848 | /* | ||
| 849 | * Memory allocation function callback. This function is called from the | ||
| 850 | * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, | ||
| 851 | * vmalloc etc.). | ||
| 852 | */ | ||
| 853 | void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, | ||
| 854 | gfp_t gfp) | ||
| 855 | { | ||
| 856 | pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); | ||
| 857 | |||
| 858 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
| 859 | create_object((unsigned long)ptr, size, min_count, gfp); | ||
| 860 | else if (atomic_read(&kmemleak_early_log)) | ||
| 861 | log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); | ||
| 862 | } | ||
| 863 | EXPORT_SYMBOL_GPL(kmemleak_alloc); | ||
| 864 | |||
| 865 | /* | ||
| 866 | * Memory freeing function callback. This function is called from the kernel | ||
| 867 | * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). | ||
| 868 | */ | ||
| 869 | void __ref kmemleak_free(const void *ptr) | ||
| 870 | { | ||
| 871 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
| 872 | |||
| 873 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
| 874 | delete_object_full((unsigned long)ptr); | ||
| 875 | else if (atomic_read(&kmemleak_early_log)) | ||
| 876 | log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); | ||
| 877 | } | ||
| 878 | EXPORT_SYMBOL_GPL(kmemleak_free); | ||
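A minimal sketch of how these two callbacks are typically paired by an allocator outside the slab/vmalloc core; my_pool_grab_block() and my_pool_release_block() are hypothetical backends, declared here only to keep the sketch self-contained, and the pairing shown is illustrative rather than taken from this patch.

#include <linux/types.h>
#include <linux/gfp.h>
#include <linux/kmemleak.h>

/* hypothetical backend helpers, not part of this patch */
void *my_pool_grab_block(size_t size);
void my_pool_release_block(void *ptr);

static void *my_pool_alloc(size_t size, gfp_t gfp)
{
	void *ptr = my_pool_grab_block(size);

	if (ptr)
		/* min_count == 1: report the block if no references are found */
		kmemleak_alloc(ptr, size, 1, gfp);
	return ptr;
}

static void my_pool_free(void *ptr)
{
	kmemleak_free(ptr);		/* remove the tracking metadata first */
	my_pool_release_block(ptr);
}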
| 879 | |||
| 880 | /* | ||
| 881 | * Partial memory freeing function callback. This function is usually called | ||
| 882 | * from bootmem allocator when (part of) a memory block is freed. | ||
| 883 | */ | ||
| 884 | void __ref kmemleak_free_part(const void *ptr, size_t size) | ||
| 885 | { | ||
| 886 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
| 887 | |||
| 888 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
| 889 | delete_object_part((unsigned long)ptr, size); | ||
| 890 | else if (atomic_read(&kmemleak_early_log)) | ||
| 891 | log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); | ||
| 892 | } | ||
| 893 | EXPORT_SYMBOL_GPL(kmemleak_free_part); | ||
| 894 | |||
| 895 | /* | ||
| 896 | * Mark an already allocated memory block as a false positive. This will cause | ||
| 897 | * the block to no longer be reported as leak and always be scanned. | ||
| 898 | */ | ||
| 899 | void __ref kmemleak_not_leak(const void *ptr) | ||
| 900 | { | ||
| 901 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
| 902 | |||
| 903 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
| 904 | make_gray_object((unsigned long)ptr); | ||
| 905 | else if (atomic_read(&kmemleak_early_log)) | ||
| 906 | log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); | ||
| 907 | } | ||
| 908 | EXPORT_SYMBOL(kmemleak_not_leak); | ||
| 909 | |||
| 910 | /* | ||
| 911 | * Ignore a memory block. This is usually done when it is known that the | ||
| 912 | * corresponding block is not a leak and does not contain any references to | ||
| 913 | * other allocated memory blocks. | ||
| 914 | */ | ||
| 915 | void __ref kmemleak_ignore(const void *ptr) | ||
| 916 | { | ||
| 917 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
| 918 | |||
| 919 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
| 920 | make_black_object((unsigned long)ptr); | ||
| 921 | else if (atomic_read(&kmemleak_early_log)) | ||
| 922 | log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); | ||
| 923 | } | ||
| 924 | EXPORT_SYMBOL(kmemleak_ignore); | ||
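A hedged sketch of how the two annotations above are used by callers; the buffers and the surrounding init function below are hypothetical and exist only to illustrate the calls.

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>

/*
 * Hypothetical driver initialization. The first buffer is reachable only
 * through an address handed to hardware (not shown), so kmemleak cannot
 * find a reference to it and would report it: annotate it with
 * kmemleak_not_leak(). The second buffer contains no pointers, so it can
 * be ignored entirely with kmemleak_ignore().
 */
static int example_init(void)
{
	void *dma_ring = kmalloc(4096, GFP_KERNEL);
	u8 *lookup_tbl = kmalloc(256, GFP_KERNEL);

	if (!dma_ring || !lookup_tbl) {
		kfree(dma_ring);
		kfree(lookup_tbl);
		return -ENOMEM;
	}

	kmemleak_not_leak(dma_ring);	/* false positive: referenced by hardware only */
	kmemleak_ignore(lookup_tbl);	/* never reported, never scanned */

	/* ... the device is then programmed with dma_ring's address (not shown) ... */
	return 0;
}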
| 925 | |||
| 926 | /* | ||
| 927 | * Limit the range to be scanned in an allocated memory block. | ||
| 928 | */ | ||
| 929 | void __ref kmemleak_scan_area(const void *ptr, unsigned long offset, | ||
| 930 | size_t length, gfp_t gfp) | ||
| 931 | { | ||
| 932 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
| 933 | |||
| 934 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
| 935 | add_scan_area((unsigned long)ptr, offset, length, gfp); | ||
| 936 | else if (atomic_read(&kmemleak_early_log)) | ||
| 937 | log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); | ||
| 938 | } | ||
| 939 | EXPORT_SYMBOL(kmemleak_scan_area); | ||
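A hedged sketch of the scan-area hook; struct example_buf and its layout are hypothetical, chosen only to show why limiting the scanned range to the pointer-bearing header of an object can be useful.

#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/kmemleak.h>

/*
 * Hypothetical object: the header holds pointers while the payload is
 * raw data. Registering a scan area covering only the header keeps
 * kmemleak from following payload bytes that merely look like pointers.
 */
struct example_buf {
	struct list_head link;		/* pointers live here ... */
	void *priv;
	u8 payload[4096];		/* ... but not in the payload */
};

static struct example_buf *example_buf_alloc(gfp_t gfp)
{
	struct example_buf *buf = kmalloc(sizeof(*buf), gfp);

	if (buf)
		/* only scan the header: offset 0, length up to the payload */
		kmemleak_scan_area(buf, 0,
				   offsetof(struct example_buf, payload), gfp);
	return buf;
}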
| 940 | |||
| 941 | /* | ||
| 942 | * Inform kmemleak not to scan the given memory block. | ||
| 943 | */ | ||
| 944 | void __ref kmemleak_no_scan(const void *ptr) | ||
| 945 | { | ||
| 946 | pr_debug("%s(0x%p)\n", __func__, ptr); | ||
| 947 | |||
| 948 | if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) | ||
| 949 | object_no_scan((unsigned long)ptr); | ||
| 950 | else if (atomic_read(&kmemleak_early_log)) | ||
| 951 | log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); | ||
| 952 | } | ||
| 953 | EXPORT_SYMBOL(kmemleak_no_scan); | ||
| 954 | |||
| 955 | /* | ||
| 956 | * Memory scanning is a long process and it needs to be interruptible. This | ||
| 957 | * function checks whether such an interrupt condition has occurred. | ||
| 958 | */ | ||
| 959 | static int scan_should_stop(void) | ||
| 960 | { | ||
| 961 | if (!atomic_read(&kmemleak_enabled)) | ||
| 962 | return 1; | ||
| 963 | |||
| 964 | /* | ||
| 965 | * This function may be called from either process or kthread context, | ||
| 966 | * hence the need to check for both stop conditions. | ||
| 967 | */ | ||
| 968 | if (current->mm) | ||
| 969 | return signal_pending(current); | ||
| 970 | else | ||
| 971 | return kthread_should_stop(); | ||
| 972 | |||
| 973 | return 0; | ||
| 974 | } | ||
| 975 | |||
| 976 | /* | ||
| 977 | * Scan a memory block (exclusive range) for valid pointers and add those | ||
| 978 | * found to the gray list. | ||
| 979 | */ | ||
| 980 | static void scan_block(void *_start, void *_end, | ||
| 981 | struct kmemleak_object *scanned, int allow_resched) | ||
| 982 | { | ||
| 983 | unsigned long *ptr; | ||
| 984 | unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); | ||
| 985 | unsigned long *end = _end - (BYTES_PER_POINTER - 1); | ||
| 986 | |||
| 987 | for (ptr = start; ptr < end; ptr++) { | ||
| 988 | struct kmemleak_object *object; | ||
| 989 | unsigned long flags; | ||
| 990 | unsigned long pointer; | ||
| 991 | |||
| 992 | if (allow_resched) | ||
| 993 | cond_resched(); | ||
| 994 | if (scan_should_stop()) | ||
| 995 | break; | ||
| 996 | |||
| 997 | /* don't scan uninitialized memory */ | ||
| 998 | if (!kmemcheck_is_obj_initialized((unsigned long)ptr, | ||
| 999 | BYTES_PER_POINTER)) | ||
| 1000 | continue; | ||
| 1001 | |||
| 1002 | pointer = *ptr; | ||
| 1003 | |||
| 1004 | object = find_and_get_object(pointer, 1); | ||
| 1005 | if (!object) | ||
| 1006 | continue; | ||
| 1007 | if (object == scanned) { | ||
| 1008 | /* self referenced, ignore */ | ||
| 1009 | put_object(object); | ||
| 1010 | continue; | ||
| 1011 | } | ||
| 1012 | |||
| 1013 | /* | ||
| 1014 | * Avoid the lockdep recursive warning on object->lock being | ||
| 1015 | * previously acquired in scan_object(). These locks are | ||
| 1016 | * enclosed by scan_mutex. | ||
| 1017 | */ | ||
| 1018 | spin_lock_irqsave_nested(&object->lock, flags, | ||
| 1019 | SINGLE_DEPTH_NESTING); | ||
| 1020 | if (!color_white(object)) { | ||
| 1021 | /* non-orphan, ignored or new */ | ||
| 1022 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1023 | put_object(object); | ||
| 1024 | continue; | ||
| 1025 | } | ||
| 1026 | |||
| 1027 | /* | ||
| 1028 | * Increase the object's reference count (number of pointers | ||
| 1029 | * to the memory block). If this count reaches the required | ||
| 1030 | * minimum, the object's color will become gray and it will be | ||
| 1031 | * added to the gray_list. | ||
| 1032 | */ | ||
| 1033 | object->count++; | ||
| 1034 | if (color_gray(object)) | ||
| 1035 | list_add_tail(&object->gray_list, &gray_list); | ||
| 1036 | else | ||
| 1037 | put_object(object); | ||
| 1038 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1039 | } | ||
| 1040 | } | ||
| 1041 | |||
| 1042 | /* | ||
| 1043 | * Scan a memory block corresponding to a kmemleak_object. The caller must | ||
| 1044 | * ensure that object->use_count >= 1. | ||
| 1045 | */ | ||
| 1046 | static void scan_object(struct kmemleak_object *object) | ||
| 1047 | { | ||
| 1048 | struct kmemleak_scan_area *area; | ||
| 1049 | struct hlist_node *elem; | ||
| 1050 | unsigned long flags; | ||
| 1051 | |||
| 1052 | /* | ||
| 1053 | * Once the object->lock is acquired, the corresponding memory block | ||
| 1054 | * cannot be freed (the same lock is acquired in delete_object). | ||
| 1055 | */ | ||
| 1056 | spin_lock_irqsave(&object->lock, flags); | ||
| 1057 | if (object->flags & OBJECT_NO_SCAN) | ||
| 1058 | goto out; | ||
| 1059 | if (!(object->flags & OBJECT_ALLOCATED)) | ||
| 1060 | /* already freed object */ | ||
| 1061 | goto out; | ||
| 1062 | if (hlist_empty(&object->area_list)) { | ||
| 1063 | void *start = (void *)object->pointer; | ||
| 1064 | void *end = (void *)(object->pointer + object->size); | ||
| 1065 | |||
| 1066 | while (start < end && (object->flags & OBJECT_ALLOCATED) && | ||
| 1067 | !(object->flags & OBJECT_NO_SCAN)) { | ||
| 1068 | scan_block(start, min(start + MAX_SCAN_SIZE, end), | ||
| 1069 | object, 0); | ||
| 1070 | start += MAX_SCAN_SIZE; | ||
| 1071 | |||
| 1072 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1073 | cond_resched(); | ||
| 1074 | spin_lock_irqsave(&object->lock, flags); | ||
| 1075 | } | ||
| 1076 | } else | ||
| 1077 | hlist_for_each_entry(area, elem, &object->area_list, node) | ||
| 1078 | scan_block((void *)(object->pointer + area->offset), | ||
| 1079 | (void *)(object->pointer + area->offset | ||
| 1080 | + area->length), object, 0); | ||
| 1081 | out: | ||
| 1082 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | /* | ||
| 1086 | * Scan data sections and all the referenced memory blocks allocated via the | ||
| 1087 | * kernel's standard allocators. This function must be called with the | ||
| 1088 | * scan_mutex held. | ||
| 1089 | */ | ||
| 1090 | static void kmemleak_scan(void) | ||
| 1091 | { | ||
| 1092 | unsigned long flags; | ||
| 1093 | struct kmemleak_object *object, *tmp; | ||
| 1094 | int i; | ||
| 1095 | int new_leaks = 0; | ||
| 1096 | int gray_list_pass = 0; | ||
| 1097 | |||
| 1098 | jiffies_last_scan = jiffies; | ||
| 1099 | |||
| 1100 | /* prepare the kmemleak_objects */ | ||
| 1101 | rcu_read_lock(); | ||
| 1102 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
| 1103 | spin_lock_irqsave(&object->lock, flags); | ||
| 1104 | #ifdef DEBUG | ||
| 1105 | /* | ||
| 1106 | * With a few exceptions there should be a maximum of | ||
| 1107 | * 1 reference to any object at this point. | ||
| 1108 | */ | ||
| 1109 | if (atomic_read(&object->use_count) > 1) { | ||
| 1110 | pr_debug("object->use_count = %d\n", | ||
| 1111 | atomic_read(&object->use_count)); | ||
| 1112 | dump_object_info(object); | ||
| 1113 | } | ||
| 1114 | #endif | ||
| 1115 | /* reset the reference count (whiten the object) */ | ||
| 1116 | object->count = 0; | ||
| 1117 | object->flags &= ~OBJECT_NEW; | ||
| 1118 | if (color_gray(object) && get_object(object)) | ||
| 1119 | list_add_tail(&object->gray_list, &gray_list); | ||
| 1120 | |||
| 1121 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1122 | } | ||
| 1123 | rcu_read_unlock(); | ||
| 1124 | |||
| 1125 | /* data/bss scanning */ | ||
| 1126 | scan_block(_sdata, _edata, NULL, 1); | ||
| 1127 | scan_block(__bss_start, __bss_stop, NULL, 1); | ||
| 1128 | |||
| 1129 | #ifdef CONFIG_SMP | ||
| 1130 | /* per-cpu sections scanning */ | ||
| 1131 | for_each_possible_cpu(i) | ||
| 1132 | scan_block(__per_cpu_start + per_cpu_offset(i), | ||
| 1133 | __per_cpu_end + per_cpu_offset(i), NULL, 1); | ||
| 1134 | #endif | ||
| 1135 | |||
| 1136 | /* | ||
| 1137 | * Struct page scanning for each node. The code below is not yet safe | ||
| 1138 | * with MEMORY_HOTPLUG. | ||
| 1139 | */ | ||
| 1140 | for_each_online_node(i) { | ||
| 1141 | pg_data_t *pgdat = NODE_DATA(i); | ||
| 1142 | unsigned long start_pfn = pgdat->node_start_pfn; | ||
| 1143 | unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; | ||
| 1144 | unsigned long pfn; | ||
| 1145 | |||
| 1146 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | ||
| 1147 | struct page *page; | ||
| 1148 | |||
| 1149 | if (!pfn_valid(pfn)) | ||
| 1150 | continue; | ||
| 1151 | page = pfn_to_page(pfn); | ||
| 1152 | /* only scan if page is in use */ | ||
| 1153 | if (page_count(page) == 0) | ||
| 1154 | continue; | ||
| 1155 | scan_block(page, page + 1, NULL, 1); | ||
| 1156 | } | ||
| 1157 | } | ||
| 1158 | |||
| 1159 | /* | ||
| 1160 | * Scanning the task stacks (may introduce false negatives). | ||
| 1161 | */ | ||
| 1162 | if (kmemleak_stack_scan) { | ||
| 1163 | struct task_struct *p, *g; | ||
| 1164 | |||
| 1165 | read_lock(&tasklist_lock); | ||
| 1166 | do_each_thread(g, p) { | ||
| 1167 | scan_block(task_stack_page(p), task_stack_page(p) + | ||
| 1168 | THREAD_SIZE, NULL, 0); | ||
| 1169 | } while_each_thread(g, p); | ||
| 1170 | read_unlock(&tasklist_lock); | ||
| 1171 | } | ||
| 1172 | |||
| 1173 | /* | ||
| 1174 | * Scan the objects already referenced from the sections scanned | ||
| 1175 | * above. More objects will be referenced and, if there are no memory | ||
| 1176 | * leaks, all the objects will be scanned. The list traversal is safe | ||
| 1177 | * for both tail additions and removals from inside the loop. The | ||
| 1178 | * kmemleak objects cannot be freed from outside the loop because their | ||
| 1179 | * use_count was increased. | ||
| 1180 | */ | ||
| 1181 | repeat: | ||
| 1182 | object = list_entry(gray_list.next, typeof(*object), gray_list); | ||
| 1183 | while (&object->gray_list != &gray_list) { | ||
| 1184 | cond_resched(); | ||
| 1185 | |||
| 1186 | /* may add new objects to the list */ | ||
| 1187 | if (!scan_should_stop()) | ||
| 1188 | scan_object(object); | ||
| 1189 | |||
| 1190 | tmp = list_entry(object->gray_list.next, typeof(*object), | ||
| 1191 | gray_list); | ||
| 1192 | |||
| 1193 | /* remove the object from the list and release it */ | ||
| 1194 | list_del(&object->gray_list); | ||
| 1195 | put_object(object); | ||
| 1196 | |||
| 1197 | object = tmp; | ||
| 1198 | } | ||
| 1199 | |||
| 1200 | if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES) | ||
| 1201 | goto scan_end; | ||
| 1202 | |||
| 1203 | /* | ||
| 1204 | * Check for new objects allocated during this scanning and add them | ||
| 1205 | * to the gray list. | ||
| 1206 | */ | ||
| 1207 | rcu_read_lock(); | ||
| 1208 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
| 1209 | spin_lock_irqsave(&object->lock, flags); | ||
| 1210 | if ((object->flags & OBJECT_NEW) && !color_black(object) && | ||
| 1211 | get_object(object)) { | ||
| 1212 | object->flags &= ~OBJECT_NEW; | ||
| 1213 | list_add_tail(&object->gray_list, &gray_list); | ||
| 1214 | } | ||
| 1215 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1216 | } | ||
| 1217 | rcu_read_unlock(); | ||
| 1218 | |||
| 1219 | if (!list_empty(&gray_list)) | ||
| 1220 | goto repeat; | ||
| 1221 | |||
| 1222 | scan_end: | ||
| 1223 | WARN_ON(!list_empty(&gray_list)); | ||
| 1224 | |||
| 1225 | /* | ||
| 1226 | * If scanning was stopped or new objects were being allocated at a | ||
| 1227 | * higher rate than gray list scanning, do not report any new | ||
| 1228 | * unreferenced objects. | ||
| 1229 | */ | ||
| 1230 | if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES) | ||
| 1231 | return; | ||
| 1232 | |||
| 1233 | /* | ||
| 1234 | * Scanning result reporting. | ||
| 1235 | */ | ||
| 1236 | rcu_read_lock(); | ||
| 1237 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
| 1238 | spin_lock_irqsave(&object->lock, flags); | ||
| 1239 | if (unreferenced_object(object) && | ||
| 1240 | !(object->flags & OBJECT_REPORTED)) { | ||
| 1241 | object->flags |= OBJECT_REPORTED; | ||
| 1242 | new_leaks++; | ||
| 1243 | } | ||
| 1244 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1245 | } | ||
| 1246 | rcu_read_unlock(); | ||
| 1247 | |||
| 1248 | if (new_leaks) | ||
| 1249 | pr_info("%d new suspected memory leaks (see " | ||
| 1250 | "/sys/kernel/debug/kmemleak)\n", new_leaks); | ||
| 1251 | |||
| 1252 | } | ||
| 1253 | |||
| 1254 | /* | ||
| 1255 | * Thread function performing automatic memory scanning. Unreferenced objects | ||
| 1256 | * at the end of a memory scan are reported but only the first time. | ||
| 1257 | */ | ||
| 1258 | static int kmemleak_scan_thread(void *arg) | ||
| 1259 | { | ||
| 1260 | static int first_run = 1; | ||
| 1261 | |||
| 1262 | pr_info("Automatic memory scanning thread started\n"); | ||
| 1263 | set_user_nice(current, 10); | ||
| 1264 | |||
| 1265 | /* | ||
| 1266 | * Wait before the first scan to allow the system to fully initialize. | ||
| 1267 | */ | ||
| 1268 | if (first_run) { | ||
| 1269 | first_run = 0; | ||
| 1270 | ssleep(SECS_FIRST_SCAN); | ||
| 1271 | } | ||
| 1272 | |||
| 1273 | while (!kthread_should_stop()) { | ||
| 1274 | signed long timeout = jiffies_scan_wait; | ||
| 1275 | |||
| 1276 | mutex_lock(&scan_mutex); | ||
| 1277 | kmemleak_scan(); | ||
| 1278 | mutex_unlock(&scan_mutex); | ||
| 1279 | |||
| 1280 | /* wait before the next scan */ | ||
| 1281 | while (timeout && !kthread_should_stop()) | ||
| 1282 | timeout = schedule_timeout_interruptible(timeout); | ||
| 1283 | } | ||
| 1284 | |||
| 1285 | pr_info("Automatic memory scanning thread ended\n"); | ||
| 1286 | |||
| 1287 | return 0; | ||
| 1288 | } | ||
| 1289 | |||
| 1290 | /* | ||
| 1291 | * Start the automatic memory scanning thread. This function must be called | ||
| 1292 | * with the scan_mutex held. | ||
| 1293 | */ | ||
| 1294 | static void start_scan_thread(void) | ||
| 1295 | { | ||
| 1296 | if (scan_thread) | ||
| 1297 | return; | ||
| 1298 | scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); | ||
| 1299 | if (IS_ERR(scan_thread)) { | ||
| 1300 | pr_warning("Failed to create the scan thread\n"); | ||
| 1301 | scan_thread = NULL; | ||
| 1302 | } | ||
| 1303 | } | ||
| 1304 | |||
| 1305 | /* | ||
| 1306 | * Stop the automatic memory scanning thread. This function must be called | ||
| 1307 | * with the scan_mutex held. | ||
| 1308 | */ | ||
| 1309 | static void stop_scan_thread(void) | ||
| 1310 | { | ||
| 1311 | if (scan_thread) { | ||
| 1312 | kthread_stop(scan_thread); | ||
| 1313 | scan_thread = NULL; | ||
| 1314 | } | ||
| 1315 | } | ||
| 1316 | |||
| 1317 | /* | ||
| 1318 | * Iterate over the object_list and return the first valid object at or after | ||
| 1319 | * the required position with its use_count incremented. Reading the file does | ||
| 1320 | * not trigger a new memory scan; write the "scan" command to start one. | ||
| 1321 | */ | ||
| 1322 | static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) | ||
| 1323 | { | ||
| 1324 | struct kmemleak_object *object; | ||
| 1325 | loff_t n = *pos; | ||
| 1326 | int err; | ||
| 1327 | |||
| 1328 | err = mutex_lock_interruptible(&scan_mutex); | ||
| 1329 | if (err < 0) | ||
| 1330 | return ERR_PTR(err); | ||
| 1331 | |||
| 1332 | rcu_read_lock(); | ||
| 1333 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
| 1334 | if (n-- > 0) | ||
| 1335 | continue; | ||
| 1336 | if (get_object(object)) | ||
| 1337 | goto out; | ||
| 1338 | } | ||
| 1339 | object = NULL; | ||
| 1340 | out: | ||
| 1341 | return object; | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | /* | ||
| 1345 | * Return the next object in the object_list. The function decrements the | ||
| 1346 | * use_count of the previous object and increases that of the next one. | ||
| 1347 | */ | ||
| 1348 | static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
| 1349 | { | ||
| 1350 | struct kmemleak_object *prev_obj = v; | ||
| 1351 | struct kmemleak_object *next_obj = NULL; | ||
| 1352 | struct list_head *n = &prev_obj->object_list; | ||
| 1353 | |||
| 1354 | ++(*pos); | ||
| 1355 | |||
| 1356 | list_for_each_continue_rcu(n, &object_list) { | ||
| 1357 | next_obj = list_entry(n, struct kmemleak_object, object_list); | ||
| 1358 | if (get_object(next_obj)) | ||
| 1359 | break; | ||
| 1360 | } | ||
| 1361 | |||
| 1362 | put_object(prev_obj); | ||
| 1363 | return next_obj; | ||
| 1364 | } | ||
| 1365 | |||
| 1366 | /* | ||
| 1367 | * Decrement the use_count of the last object required, if any. | ||
| 1368 | */ | ||
| 1369 | static void kmemleak_seq_stop(struct seq_file *seq, void *v) | ||
| 1370 | { | ||
| 1371 | if (!IS_ERR(v)) { | ||
| 1372 | /* | ||
| 1373 | * kmemleak_seq_start may return ERR_PTR if the scan_mutex | ||
| 1374 | * waiting was interrupted, so only release it if !IS_ERR. | ||
| 1375 | */ | ||
| 1376 | rcu_read_unlock(); | ||
| 1377 | mutex_unlock(&scan_mutex); | ||
| 1378 | if (v) | ||
| 1379 | put_object(v); | ||
| 1380 | } | ||
| 1381 | } | ||
| 1382 | |||
| 1383 | /* | ||
| 1384 | * Print the information for an unreferenced object to the seq file. | ||
| 1385 | */ | ||
| 1386 | static int kmemleak_seq_show(struct seq_file *seq, void *v) | ||
| 1387 | { | ||
| 1388 | struct kmemleak_object *object = v; | ||
| 1389 | unsigned long flags; | ||
| 1390 | |||
| 1391 | spin_lock_irqsave(&object->lock, flags); | ||
| 1392 | if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) | ||
| 1393 | print_unreferenced(seq, object); | ||
| 1394 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1395 | return 0; | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | static const struct seq_operations kmemleak_seq_ops = { | ||
| 1399 | .start = kmemleak_seq_start, | ||
| 1400 | .next = kmemleak_seq_next, | ||
| 1401 | .stop = kmemleak_seq_stop, | ||
| 1402 | .show = kmemleak_seq_show, | ||
| 1403 | }; | ||
| 1404 | |||
| 1405 | static int kmemleak_open(struct inode *inode, struct file *file) | ||
| 1406 | { | ||
| 1407 | if (!atomic_read(&kmemleak_enabled)) | ||
| 1408 | return -EBUSY; | ||
| 1409 | |||
| 1410 | return seq_open(file, &kmemleak_seq_ops); | ||
| 1411 | } | ||
| 1412 | |||
| 1413 | static int kmemleak_release(struct inode *inode, struct file *file) | ||
| 1414 | { | ||
| 1415 | return seq_release(inode, file); | ||
| 1416 | } | ||
| 1417 | |||
| 1418 | static int dump_str_object_info(const char *str) | ||
| 1419 | { | ||
| 1420 | unsigned long flags; | ||
| 1421 | struct kmemleak_object *object; | ||
| 1422 | unsigned long addr; | ||
| 1423 | |||
| 1424 | addr = simple_strtoul(str, NULL, 0); | ||
| 1425 | object = find_and_get_object(addr, 0); | ||
| 1426 | if (!object) { | ||
| 1427 | pr_info("Unknown object at 0x%08lx\n", addr); | ||
| 1428 | return -EINVAL; | ||
| 1429 | } | ||
| 1430 | |||
| 1431 | spin_lock_irqsave(&object->lock, flags); | ||
| 1432 | dump_object_info(object); | ||
| 1433 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1434 | |||
| 1435 | put_object(object); | ||
| 1436 | return 0; | ||
| 1437 | } | ||
| 1438 | |||
| 1439 | /* | ||
| 1440 | * We use grey instead of black to ensure we can do future scans on the same | ||
| 1441 | * objects. If we did not do future scans these black objects could | ||
| 1442 | * potentially contain references to newly allocated objects in the future and | ||
| 1443 | * we'd end up with false positives. | ||
| 1444 | */ | ||
| 1445 | static void kmemleak_clear(void) | ||
| 1446 | { | ||
| 1447 | struct kmemleak_object *object; | ||
| 1448 | unsigned long flags; | ||
| 1449 | |||
| 1450 | rcu_read_lock(); | ||
| 1451 | list_for_each_entry_rcu(object, &object_list, object_list) { | ||
| 1452 | spin_lock_irqsave(&object->lock, flags); | ||
| 1453 | if ((object->flags & OBJECT_REPORTED) && | ||
| 1454 | unreferenced_object(object)) | ||
| 1455 | __paint_it(object, KMEMLEAK_GREY); | ||
| 1456 | spin_unlock_irqrestore(&object->lock, flags); | ||
| 1457 | } | ||
| 1458 | rcu_read_unlock(); | ||
| 1459 | } | ||
| 1460 | |||
| 1461 | /* | ||
| 1462 | * File write operation to configure kmemleak at run-time. The following | ||
| 1463 | * commands can be written to the /sys/kernel/debug/kmemleak file: | ||
| 1464 | * off - disable kmemleak (irreversible) | ||
| 1465 | * stack=on - enable the task stacks scanning | ||
| 1466 | * stack=off - disable the task stacks scanning | ||
| 1467 | * scan=on - start the automatic memory scanning thread | ||
| 1468 | * scan=off - stop the automatic memory scanning thread | ||
| 1469 | * scan=... - set the automatic memory scanning period in seconds (0 to | ||
| 1470 | * disable it) | ||
| 1471 | * scan - trigger a memory scan | ||
| 1472 | * clear - mark all currently reported unreferenced kmemleak objects as | ||
| 1473 | * grey so that they are no longer printed | ||
| 1474 | * dump=... - dump information about the object found at the given address | ||
| 1475 | */ | ||
| 1476 | static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, | ||
| 1477 | size_t size, loff_t *ppos) | ||
| 1478 | { | ||
| 1479 | char buf[64]; | ||
| 1480 | int buf_size; | ||
| 1481 | int ret; | ||
| 1482 | |||
| 1483 | buf_size = min(size, (sizeof(buf) - 1)); | ||
| 1484 | if (strncpy_from_user(buf, user_buf, buf_size) < 0) | ||
| 1485 | return -EFAULT; | ||
| 1486 | buf[buf_size] = 0; | ||
| 1487 | |||
| 1488 | ret = mutex_lock_interruptible(&scan_mutex); | ||
| 1489 | if (ret < 0) | ||
| 1490 | return ret; | ||
| 1491 | |||
| 1492 | if (strncmp(buf, "off", 3) == 0) | ||
| 1493 | kmemleak_disable(); | ||
| 1494 | else if (strncmp(buf, "stack=on", 8) == 0) | ||
| 1495 | kmemleak_stack_scan = 1; | ||
| 1496 | else if (strncmp(buf, "stack=off", 9) == 0) | ||
| 1497 | kmemleak_stack_scan = 0; | ||
| 1498 | else if (strncmp(buf, "scan=on", 7) == 0) | ||
| 1499 | start_scan_thread(); | ||
| 1500 | else if (strncmp(buf, "scan=off", 8) == 0) | ||
| 1501 | stop_scan_thread(); | ||
| 1502 | else if (strncmp(buf, "scan=", 5) == 0) { | ||
| 1503 | unsigned long secs; | ||
| 1504 | |||
| 1505 | ret = strict_strtoul(buf + 5, 0, &secs); | ||
| 1506 | if (ret < 0) | ||
| 1507 | goto out; | ||
| 1508 | stop_scan_thread(); | ||
| 1509 | if (secs) { | ||
| 1510 | jiffies_scan_wait = msecs_to_jiffies(secs * 1000); | ||
| 1511 | start_scan_thread(); | ||
| 1512 | } | ||
| 1513 | } else if (strncmp(buf, "scan", 4) == 0) | ||
| 1514 | kmemleak_scan(); | ||
| 1515 | else if (strncmp(buf, "clear", 5) == 0) | ||
| 1516 | kmemleak_clear(); | ||
| 1517 | else if (strncmp(buf, "dump=", 5) == 0) | ||
| 1518 | ret = dump_str_object_info(buf + 5); | ||
| 1519 | else | ||
| 1520 | ret = -EINVAL; | ||
| 1521 | |||
| 1522 | out: | ||
| 1523 | mutex_unlock(&scan_mutex); | ||
| 1524 | if (ret < 0) | ||
| 1525 | return ret; | ||
| 1526 | |||
| 1527 | /* ignore the rest of the buffer, only one command at a time */ | ||
| 1528 | *ppos += size; | ||
| 1529 | return size; | ||
| 1530 | } | ||
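A minimal user-space sketch of driving the command interface documented above, assuming debugfs is mounted at /sys/kernel/debug and kmemleak has not been disabled; it simply mirrors the usual "echo scan > /sys/kernel/debug/kmemleak" followed by "cat /sys/kernel/debug/kmemleak".

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd;

	/* trigger an immediate scan; one command per write() */
	fd = open("/sys/kernel/debug/kmemleak", O_WRONLY);
	if (fd < 0) {
		perror("open for write");
		return 1;
	}
	if (write(fd, "scan", 4) != 4)
		perror("write scan");
	close(fd);

	/* read back the objects currently reported as unreferenced */
	fd = open("/sys/kernel/debug/kmemleak", O_RDONLY);
	if (fd < 0) {
		perror("open for read");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}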
| 1531 | |||
| 1532 | static const struct file_operations kmemleak_fops = { | ||
| 1533 | .owner = THIS_MODULE, | ||
| 1534 | .open = kmemleak_open, | ||
| 1535 | .read = seq_read, | ||
| 1536 | .write = kmemleak_write, | ||
| 1537 | .llseek = seq_lseek, | ||
| 1538 | .release = kmemleak_release, | ||
| 1539 | }; | ||
| 1540 | |||
| 1541 | /* | ||
| 1542 | * Perform the freeing of the kmemleak internal objects after waiting for any | ||
| 1543 | * current memory scan to complete. | ||
| 1544 | */ | ||
| 1545 | static void kmemleak_do_cleanup(struct work_struct *work) | ||
| 1546 | { | ||
| 1547 | struct kmemleak_object *object; | ||
| 1548 | |||
| 1549 | mutex_lock(&scan_mutex); | ||
| 1550 | stop_scan_thread(); | ||
| 1551 | |||
| 1552 | rcu_read_lock(); | ||
| 1553 | list_for_each_entry_rcu(object, &object_list, object_list) | ||
| 1554 | delete_object_full(object->pointer); | ||
| 1555 | rcu_read_unlock(); | ||
| 1556 | mutex_unlock(&scan_mutex); | ||
| 1557 | } | ||
| 1558 | |||
| 1559 | static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); | ||
| 1560 | |||
| 1561 | /* | ||
| 1562 | * Disable kmemleak. No memory allocation/freeing will be traced once this | ||
| 1563 | * function is called. Disabling kmemleak is an irreversible operation. | ||
| 1564 | */ | ||
| 1565 | static void kmemleak_disable(void) | ||
| 1566 | { | ||
| 1567 | /* atomically check whether it was already invoked */ | ||
| 1568 | if (atomic_cmpxchg(&kmemleak_error, 0, 1)) | ||
| 1569 | return; | ||
| 1570 | |||
| 1571 | /* stop any memory operation tracing */ | ||
| 1572 | atomic_set(&kmemleak_early_log, 0); | ||
| 1573 | atomic_set(&kmemleak_enabled, 0); | ||
| 1574 | |||
| 1575 | /* check whether it is too early for a kernel thread */ | ||
| 1576 | if (atomic_read(&kmemleak_initialized)) | ||
| 1577 | schedule_work(&cleanup_work); | ||
| 1578 | |||
| 1579 | pr_info("Kernel memory leak detector disabled\n"); | ||
| 1580 | } | ||
| 1581 | |||
| 1582 | /* | ||
| 1583 | * Allow boot-time kmemleak disabling (enabled by default). | ||
| 1584 | */ | ||
| 1585 | static int kmemleak_boot_config(char *str) | ||
| 1586 | { | ||
| 1587 | if (!str) | ||
| 1588 | return -EINVAL; | ||
| 1589 | if (strcmp(str, "off") == 0) | ||
| 1590 | kmemleak_disable(); | ||
| 1591 | else if (strcmp(str, "on") != 0) | ||
| 1592 | return -EINVAL; | ||
| 1593 | return 0; | ||
| 1594 | } | ||
| 1595 | early_param("kmemleak", kmemleak_boot_config); | ||
| 1596 | |||
| 1597 | /* | ||
| 1598 | * Kmemleak initialization. | ||
| 1599 | */ | ||
| 1600 | void __init kmemleak_init(void) | ||
| 1601 | { | ||
| 1602 | int i; | ||
| 1603 | unsigned long flags; | ||
| 1604 | |||
| 1605 | jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); | ||
| 1606 | jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); | ||
| 1607 | |||
| 1608 | object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE); | ||
| 1609 | scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE); | ||
| 1610 | INIT_PRIO_TREE_ROOT(&object_tree_root); | ||
| 1611 | |||
| 1612 | /* the kernel is still in UP mode, so disabling the IRQs is enough */ | ||
| 1613 | local_irq_save(flags); | ||
| 1614 | if (!atomic_read(&kmemleak_error)) { | ||
| 1615 | atomic_set(&kmemleak_enabled, 1); | ||
| 1616 | atomic_set(&kmemleak_early_log, 0); | ||
| 1617 | } | ||
| 1618 | local_irq_restore(flags); | ||
| 1619 | |||
| 1620 | /* | ||
| 1621 | * This is the point where tracking allocations is safe. Automatic | ||
| 1622 | * scanning is started during the late initcall. Add the early logged | ||
| 1623 | * callbacks to the kmemleak infrastructure. | ||
| 1624 | */ | ||
| 1625 | for (i = 0; i < crt_early_log; i++) { | ||
| 1626 | struct early_log *log = &early_log[i]; | ||
| 1627 | |||
| 1628 | switch (log->op_type) { | ||
| 1629 | case KMEMLEAK_ALLOC: | ||
| 1630 | early_alloc(log); | ||
| 1631 | break; | ||
| 1632 | case KMEMLEAK_FREE: | ||
| 1633 | kmemleak_free(log->ptr); | ||
| 1634 | break; | ||
| 1635 | case KMEMLEAK_FREE_PART: | ||
| 1636 | kmemleak_free_part(log->ptr, log->size); | ||
| 1637 | break; | ||
| 1638 | case KMEMLEAK_NOT_LEAK: | ||
| 1639 | kmemleak_not_leak(log->ptr); | ||
| 1640 | break; | ||
| 1641 | case KMEMLEAK_IGNORE: | ||
| 1642 | kmemleak_ignore(log->ptr); | ||
| 1643 | break; | ||
| 1644 | case KMEMLEAK_SCAN_AREA: | ||
| 1645 | kmemleak_scan_area(log->ptr, log->offset, log->length, | ||
| 1646 | GFP_KERNEL); | ||
| 1647 | break; | ||
| 1648 | case KMEMLEAK_NO_SCAN: | ||
| 1649 | kmemleak_no_scan(log->ptr); | ||
| 1650 | break; | ||
| 1651 | default: | ||
| 1652 | WARN_ON(1); | ||
| 1653 | } | ||
| 1654 | } | ||
| 1655 | } | ||
| 1656 | |||
| 1657 | /* | ||
| 1658 | * Late initialization function. | ||
| 1659 | */ | ||
| 1660 | static int __init kmemleak_late_init(void) | ||
| 1661 | { | ||
| 1662 | struct dentry *dentry; | ||
| 1663 | |||
| 1664 | atomic_set(&kmemleak_initialized, 1); | ||
| 1665 | |||
| 1666 | if (atomic_read(&kmemleak_error)) { | ||
| 1667 | /* | ||
| 1668 | * Some error occurred and kmemleak was disabled. There is a | ||
| 1669 | * small chance that kmemleak_disable() was called immediately | ||
| 1670 | * after kmemleak_initialized was set, and we may end up with | ||
| 1671 | * two clean-up threads, but they are serialized by scan_mutex. | ||
| 1672 | */ | ||
| 1673 | schedule_work(&cleanup_work); | ||
| 1674 | return -ENOMEM; | ||
| 1675 | } | ||
| 1676 | |||
| 1677 | dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, | ||
| 1678 | &kmemleak_fops); | ||
| 1679 | if (!dentry) | ||
| 1680 | pr_warning("Failed to create the debugfs kmemleak file\n"); | ||
| 1681 | mutex_lock(&scan_mutex); | ||
| 1682 | start_scan_thread(); | ||
| 1683 | mutex_unlock(&scan_mutex); | ||
| 1684 | |||
| 1685 | pr_info("Kernel memory leak detector initialized\n"); | ||
| 1686 | |||
| 1687 | return 0; | ||
| 1688 | } | ||
| 1689 | late_initcall(kmemleak_late_init); | ||
diff --git a/mm/ksm.c b/mm/ksm.c new file mode 100644 index 000000000000..5575f8628fef --- /dev/null +++ b/mm/ksm.c | |||
| @@ -0,0 +1,1710 @@ | |||
| 1 | /* | ||
| 2 | * Memory merging support. | ||
| 3 | * | ||
| 4 | * This code enables dynamic sharing of identical pages found in different | ||
| 5 | * memory areas, even if they are not shared by fork() | ||
| 6 | * | ||
| 7 | * Copyright (C) 2008-2009 Red Hat, Inc. | ||
| 8 | * Authors: | ||
| 9 | * Izik Eidus | ||
| 10 | * Andrea Arcangeli | ||
| 11 | * Chris Wright | ||
| 12 | * Hugh Dickins | ||
| 13 | * | ||
| 14 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
| 15 | */ | ||
| 16 | |||
| 17 | #include <linux/errno.h> | ||
| 18 | #include <linux/mm.h> | ||
| 19 | #include <linux/fs.h> | ||
| 20 | #include <linux/mman.h> | ||
| 21 | #include <linux/sched.h> | ||
| 22 | #include <linux/rwsem.h> | ||
| 23 | #include <linux/pagemap.h> | ||
| 24 | #include <linux/rmap.h> | ||
| 25 | #include <linux/spinlock.h> | ||
| 26 | #include <linux/jhash.h> | ||
| 27 | #include <linux/delay.h> | ||
| 28 | #include <linux/kthread.h> | ||
| 29 | #include <linux/wait.h> | ||
| 30 | #include <linux/slab.h> | ||
| 31 | #include <linux/rbtree.h> | ||
| 32 | #include <linux/mmu_notifier.h> | ||
| 33 | #include <linux/swap.h> | ||
| 34 | #include <linux/ksm.h> | ||
| 35 | |||
| 36 | #include <asm/tlbflush.h> | ||
| 37 | |||
| 38 | /* | ||
| 39 | * A few notes about the KSM scanning process, | ||
| 40 | * to make it easier to understand the data structures below: | ||
| 41 | * | ||
| 42 | * In order to reduce excessive scanning, KSM sorts the memory pages by their | ||
| 43 | * contents into a data structure that holds pointers to the pages' locations. | ||
| 44 | * | ||
| 45 | * Since the contents of the pages may change at any moment, KSM cannot just | ||
| 46 | * insert the pages into a normal sorted tree and expect it to find anything. | ||
| 47 | * Therefore KSM uses two data structures - the stable and the unstable tree. | ||
| 48 | * | ||
| 49 | * The stable tree holds pointers to all the merged pages (ksm pages), sorted | ||
| 50 | * by their contents. Because each such page is write-protected, searching on | ||
| 51 | * this tree is fully assured to be working (except when pages are unmapped), | ||
| 52 | * and therefore this tree is called the stable tree. | ||
| 53 | * | ||
| 54 | * In addition to the stable tree, KSM uses a second data structure called the | ||
| 55 | * unstable tree: this tree holds pointers to pages which have been found to | ||
| 56 | * be "unchanged for a period of time". The unstable tree sorts these pages | ||
| 57 | * by their contents, but since they are not write-protected, KSM cannot rely | ||
| 58 | * upon the unstable tree to work correctly - the unstable tree is liable to | ||
| 59 | * be corrupted as its contents are modified, and so it is called unstable. | ||
| 60 | * | ||
| 61 | * KSM solves this problem by several techniques: | ||
| 62 | * | ||
| 63 | * 1) The unstable tree is flushed every time KSM completes scanning all | ||
| 64 | * memory areas, and then the tree is rebuilt again from the beginning. | ||
| 65 | * 2) KSM will only insert into the unstable tree pages whose hash value | ||
| 66 | * has not changed since the previous scan of all memory areas. | ||
| 67 | * 3) The unstable tree is a red-black tree, so its balancing is based on the | ||
| 68 | * colors of the nodes and not on their contents, assuring that even when | ||
| 69 | * the tree gets "corrupted" it won't get out of balance, so scanning time | ||
| 70 | * remains the same (also, searching and inserting nodes in an rbtree uses | ||
| 71 | * the same algorithm, so we have no overhead when we flush and rebuild). | ||
| 72 | * 4) KSM never flushes the stable tree, which means that even if it were to | ||
| 73 | * take 10 attempts to find a page in the unstable tree, once it is found, | ||
| 74 | * it is secured in the stable tree. (When we scan a new page, we first | ||
| 75 | * compare it against the stable tree, and then against the unstable tree.) | ||
| 76 | */ | ||
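Before the data structures below, a short user-space sketch of how an application opts an anonymous region into this scanning; it assumes the MADV_MERGEABLE madvise advice that accompanies KSM and that ksmd has been started through KSM's sysfs run control, neither of which is shown in this excerpt.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

#define REGION_SIZE (64UL * 1024 * 1024)

int main(void)
{
	char *region = mmap(NULL, REGION_SIZE, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (region == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* fill the region with identical pages so they are mergeable */
	memset(region, 0x5a, REGION_SIZE);

	if (madvise(region, REGION_SIZE, MADV_MERGEABLE) != 0)
		perror("madvise(MADV_MERGEABLE)");

	/* keep the mapping alive so ksmd has time to scan and merge it */
	pause();
	return 0;
}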
| 77 | |||
| 78 | /** | ||
| 79 | * struct mm_slot - ksm information per mm that is being scanned | ||
| 80 | * @link: link to the mm_slots hash list | ||
| 81 | * @mm_list: link into the mm_slots list, rooted in ksm_mm_head | ||
| 82 | * @rmap_list: head for this mm_slot's list of rmap_items | ||
| 83 | * @mm: the mm that this information is valid for | ||
| 84 | */ | ||
| 85 | struct mm_slot { | ||
| 86 | struct hlist_node link; | ||
| 87 | struct list_head mm_list; | ||
| 88 | struct list_head rmap_list; | ||
| 89 | struct mm_struct *mm; | ||
| 90 | }; | ||
| 91 | |||
| 92 | /** | ||
| 93 | * struct ksm_scan - cursor for scanning | ||
| 94 | * @mm_slot: the current mm_slot we are scanning | ||
| 95 | * @address: the next address inside that mm_slot to be scanned | ||
| 96 | * @rmap_item: the current rmap that we are scanning inside the rmap_list | ||
| 97 | * @seqnr: count of completed full scans (needed when removing unstable node) | ||
| 98 | * | ||
| 99 | * There is only the one ksm_scan instance of this cursor structure. | ||
| 100 | */ | ||
| 101 | struct ksm_scan { | ||
| 102 | struct mm_slot *mm_slot; | ||
| 103 | unsigned long address; | ||
| 104 | struct rmap_item *rmap_item; | ||
| 105 | unsigned long seqnr; | ||
| 106 | }; | ||
| 107 | |||
| 108 | /** | ||
| 109 | * struct rmap_item - reverse mapping item for virtual addresses | ||
| 110 | * @link: link into mm_slot's rmap_list (rmap_list is per mm) | ||
| 111 | * @mm: the memory structure this rmap_item is pointing into | ||
| 112 | * @address: the virtual address this rmap_item tracks (+ flags in low bits) | ||
| 113 | * @oldchecksum: previous checksum of the page at that virtual address | ||
| 114 | * @node: rb_node of this rmap_item in either unstable or stable tree | ||
| 115 | * @next: next rmap_item hanging off the same node of the stable tree | ||
| 116 | * @prev: previous rmap_item hanging off the same node of the stable tree | ||
| 117 | */ | ||
| 118 | struct rmap_item { | ||
| 119 | struct list_head link; | ||
| 120 | struct mm_struct *mm; | ||
| 121 | unsigned long address; /* + low bits used for flags below */ | ||
| 122 | union { | ||
| 123 | unsigned int oldchecksum; /* when unstable */ | ||
| 124 | struct rmap_item *next; /* when stable */ | ||
| 125 | }; | ||
| 126 | union { | ||
| 127 | struct rb_node node; /* when tree node */ | ||
| 128 | struct rmap_item *prev; /* in stable list */ | ||
| 129 | }; | ||
| 130 | }; | ||
| 131 | |||
| 132 | #define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ | ||
| 133 | #define NODE_FLAG 0x100 /* is a node of unstable or stable tree */ | ||
| 134 | #define STABLE_FLAG 0x200 /* is a node or list item of stable tree */ | ||
| 135 | |||
| 136 | /* The stable and unstable tree heads */ | ||
| 137 | static struct rb_root root_stable_tree = RB_ROOT; | ||
| 138 | static struct rb_root root_unstable_tree = RB_ROOT; | ||
| 139 | |||
| 140 | #define MM_SLOTS_HASH_HEADS 1024 | ||
| 141 | static struct hlist_head *mm_slots_hash; | ||
| 142 | |||
| 143 | static struct mm_slot ksm_mm_head = { | ||
| 144 | .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), | ||
| 145 | }; | ||
| 146 | static struct ksm_scan ksm_scan = { | ||
| 147 | .mm_slot = &ksm_mm_head, | ||
| 148 | }; | ||
| 149 | |||
| 150 | static struct kmem_cache *rmap_item_cache; | ||
| 151 | static struct kmem_cache *mm_slot_cache; | ||
| 152 | |||
| 153 | /* The number of nodes in the stable tree */ | ||
| 154 | static unsigned long ksm_pages_shared; | ||
| 155 | |||
| 156 | /* The number of page slots additionally sharing those nodes */ | ||
| 157 | static unsigned long ksm_pages_sharing; | ||
| 158 | |||
| 159 | /* The number of nodes in the unstable tree */ | ||
| 160 | static unsigned long ksm_pages_unshared; | ||
| 161 | |||
| 162 | /* The number of rmap_items in use: to calculate pages_volatile */ | ||
| 163 | static unsigned long ksm_rmap_items; | ||
| 164 | |||
| 165 | /* Limit on the number of unswappable pages used */ | ||
| 166 | static unsigned long ksm_max_kernel_pages; | ||
| 167 | |||
| 168 | /* Number of pages ksmd should scan in one batch */ | ||
| 169 | static unsigned int ksm_thread_pages_to_scan = 100; | ||
| 170 | |||
| 171 | /* Milliseconds ksmd should sleep between batches */ | ||
| 172 | static unsigned int ksm_thread_sleep_millisecs = 20; | ||
| 173 | |||
| 174 | #define KSM_RUN_STOP 0 | ||
| 175 | #define KSM_RUN_MERGE 1 | ||
| 176 | #define KSM_RUN_UNMERGE 2 | ||
| 177 | static unsigned int ksm_run = KSM_RUN_STOP; | ||
| 178 | |||
| 179 | static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); | ||
| 180 | static DEFINE_MUTEX(ksm_thread_mutex); | ||
| 181 | static DEFINE_SPINLOCK(ksm_mmlist_lock); | ||
| 182 | |||
| 183 | #define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ | ||
| 184 | sizeof(struct __struct), __alignof__(struct __struct),\ | ||
| 185 | (__flags), NULL) | ||
| 186 | |||
| 187 | static int __init ksm_slab_init(void) | ||
| 188 | { | ||
| 189 | rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); | ||
| 190 | if (!rmap_item_cache) | ||
| 191 | goto out; | ||
| 192 | |||
| 193 | mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); | ||
| 194 | if (!mm_slot_cache) | ||
| 195 | goto out_free; | ||
| 196 | |||
| 197 | return 0; | ||
| 198 | |||
| 199 | out_free: | ||
| 200 | kmem_cache_destroy(rmap_item_cache); | ||
| 201 | out: | ||
| 202 | return -ENOMEM; | ||
| 203 | } | ||
| 204 | |||
| 205 | static void __init ksm_slab_free(void) | ||
| 206 | { | ||
| 207 | kmem_cache_destroy(mm_slot_cache); | ||
| 208 | kmem_cache_destroy(rmap_item_cache); | ||
| 209 | mm_slot_cache = NULL; | ||
| 210 | } | ||
| 211 | |||
| 212 | static inline struct rmap_item *alloc_rmap_item(void) | ||
| 213 | { | ||
| 214 | struct rmap_item *rmap_item; | ||
| 215 | |||
| 216 | rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); | ||
| 217 | if (rmap_item) | ||
| 218 | ksm_rmap_items++; | ||
| 219 | return rmap_item; | ||
| 220 | } | ||
| 221 | |||
| 222 | static inline void free_rmap_item(struct rmap_item *rmap_item) | ||
| 223 | { | ||
| 224 | ksm_rmap_items--; | ||
| 225 | rmap_item->mm = NULL; /* debug safety */ | ||
| 226 | kmem_cache_free(rmap_item_cache, rmap_item); | ||
| 227 | } | ||
| 228 | |||
| 229 | static inline struct mm_slot *alloc_mm_slot(void) | ||
| 230 | { | ||
| 231 | if (!mm_slot_cache) /* initialization failed */ | ||
| 232 | return NULL; | ||
| 233 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | ||
| 234 | } | ||
| 235 | |||
| 236 | static inline void free_mm_slot(struct mm_slot *mm_slot) | ||
| 237 | { | ||
| 238 | kmem_cache_free(mm_slot_cache, mm_slot); | ||
| 239 | } | ||
| 240 | |||
| 241 | static int __init mm_slots_hash_init(void) | ||
| 242 | { | ||
| 243 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
| 244 | GFP_KERNEL); | ||
| 245 | if (!mm_slots_hash) | ||
| 246 | return -ENOMEM; | ||
| 247 | return 0; | ||
| 248 | } | ||
| 249 | |||
| 250 | static void __init mm_slots_hash_free(void) | ||
| 251 | { | ||
| 252 | kfree(mm_slots_hash); | ||
| 253 | } | ||
| 254 | |||
| 255 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | ||
| 256 | { | ||
| 257 | struct mm_slot *mm_slot; | ||
| 258 | struct hlist_head *bucket; | ||
| 259 | struct hlist_node *node; | ||
| 260 | |||
| 261 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
| 262 | % MM_SLOTS_HASH_HEADS]; | ||
| 263 | hlist_for_each_entry(mm_slot, node, bucket, link) { | ||
| 264 | if (mm == mm_slot->mm) | ||
| 265 | return mm_slot; | ||
| 266 | } | ||
| 267 | return NULL; | ||
| 268 | } | ||
| 269 | |||
| 270 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | ||
| 271 | struct mm_slot *mm_slot) | ||
| 272 | { | ||
| 273 | struct hlist_head *bucket; | ||
| 274 | |||
| 275 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
| 276 | % MM_SLOTS_HASH_HEADS]; | ||
| 277 | mm_slot->mm = mm; | ||
| 278 | INIT_LIST_HEAD(&mm_slot->rmap_list); | ||
| 279 | hlist_add_head(&mm_slot->link, bucket); | ||
| 280 | } | ||
| 281 | |||
| 282 | static inline int in_stable_tree(struct rmap_item *rmap_item) | ||
| 283 | { | ||
| 284 | return rmap_item->address & STABLE_FLAG; | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's | ||
| 289 | * page tables after it has passed through ksm_exit() - which, if necessary, | ||
| 290 | * takes mmap_sem briefly to serialize against them. ksm_exit() does not set | ||
| 291 | * a special flag: they can just back out as soon as mm_users goes to zero. | ||
| 292 | * ksm_test_exit() is used throughout to make this test for exit: in some | ||
| 293 | * places for correctness, in some places just to avoid unnecessary work. | ||
| 294 | */ | ||
| 295 | static inline bool ksm_test_exit(struct mm_struct *mm) | ||
| 296 | { | ||
| 297 | return atomic_read(&mm->mm_users) == 0; | ||
| 298 | } | ||
| 299 | |||
| 300 | /* | ||
| 301 | * We use break_ksm to break COW on a ksm page: it's a stripped down | ||
| 302 | * | ||
| 303 | * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) | ||
| 304 | * put_page(page); | ||
| 305 | * | ||
| 306 | * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, | ||
| 307 | * in case the application has unmapped and remapped mm,addr meanwhile. | ||
| 308 | * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP | ||
| 309 | * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. | ||
| 310 | */ | ||
| 311 | static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | ||
| 312 | { | ||
| 313 | struct page *page; | ||
| 314 | int ret = 0; | ||
| 315 | |||
| 316 | do { | ||
| 317 | cond_resched(); | ||
| 318 | page = follow_page(vma, addr, FOLL_GET); | ||
| 319 | if (!page) | ||
| 320 | break; | ||
| 321 | if (PageKsm(page)) | ||
| 322 | ret = handle_mm_fault(vma->vm_mm, vma, addr, | ||
| 323 | FAULT_FLAG_WRITE); | ||
| 324 | else | ||
| 325 | ret = VM_FAULT_WRITE; | ||
| 326 | put_page(page); | ||
| 327 | } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); | ||
| 328 | /* | ||
| 329 | * We must loop because handle_mm_fault() may back out if there's | ||
| 330 | * any difficulty e.g. if pte accessed bit gets updated concurrently. | ||
| 331 | * | ||
| 332 | * VM_FAULT_WRITE is what we have been hoping for: it indicates that | ||
| 333 | * COW has been broken, even if the vma does not permit VM_WRITE; | ||
| 334 | * but note that a concurrent fault might break PageKsm for us. | ||
| 335 | * | ||
| 336 | * VM_FAULT_SIGBUS could occur if we race with truncation of the | ||
| 337 | * backing file, which also invalidates anonymous pages: that's | ||
| 338 | * okay, that truncation will have unmapped the PageKsm for us. | ||
| 339 | * | ||
| 340 | * VM_FAULT_OOM: at the time of writing (late July 2009), setting | ||
| 341 | * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the | ||
| 342 | * current task has TIF_MEMDIE set, and will be OOM killed on return | ||
| 343 | * to user; and ksmd, having no mm, would never be chosen for that. | ||
| 344 | * | ||
| 345 | * But if the mm is in a limited mem_cgroup, then the fault may fail | ||
| 346 | * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and | ||
| 347 | * even ksmd can fail in this way - though it's usually breaking ksm | ||
| 348 | * just to undo a merge it made a moment before, so unlikely to oom. | ||
| 349 | * | ||
| 350 | * That's a pity: we might therefore have more kernel pages allocated | ||
| 351 | * than we're counting as nodes in the stable tree; but ksm_do_scan | ||
| 352 | * will retry to break_cow on each pass, so should recover the page | ||
| 353 | * in due course. The important thing is to not let VM_MERGEABLE | ||
| 354 | * be cleared while any such pages might remain in the area. | ||
| 355 | */ | ||
| 356 | return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; | ||
| 357 | } | ||
| 358 | |||
| 359 | static void break_cow(struct mm_struct *mm, unsigned long addr) | ||
| 360 | { | ||
| 361 | struct vm_area_struct *vma; | ||
| 362 | |||
| 363 | down_read(&mm->mmap_sem); | ||
| 364 | if (ksm_test_exit(mm)) | ||
| 365 | goto out; | ||
| 366 | vma = find_vma(mm, addr); | ||
| 367 | if (!vma || vma->vm_start > addr) | ||
| 368 | goto out; | ||
| 369 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
| 370 | goto out; | ||
| 371 | break_ksm(vma, addr); | ||
| 372 | out: | ||
| 373 | up_read(&mm->mmap_sem); | ||
| 374 | } | ||
| 375 | |||
| 376 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) | ||
| 377 | { | ||
| 378 | struct mm_struct *mm = rmap_item->mm; | ||
| 379 | unsigned long addr = rmap_item->address; | ||
| 380 | struct vm_area_struct *vma; | ||
| 381 | struct page *page; | ||
| 382 | |||
| 383 | down_read(&mm->mmap_sem); | ||
| 384 | if (ksm_test_exit(mm)) | ||
| 385 | goto out; | ||
| 386 | vma = find_vma(mm, addr); | ||
| 387 | if (!vma || vma->vm_start > addr) | ||
| 388 | goto out; | ||
| 389 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
| 390 | goto out; | ||
| 391 | |||
| 392 | page = follow_page(vma, addr, FOLL_GET); | ||
| 393 | if (!page) | ||
| 394 | goto out; | ||
| 395 | if (PageAnon(page)) { | ||
| 396 | flush_anon_page(vma, page, addr); | ||
| 397 | flush_dcache_page(page); | ||
| 398 | } else { | ||
| 399 | put_page(page); | ||
| 400 | out: page = NULL; | ||
| 401 | } | ||
| 402 | up_read(&mm->mmap_sem); | ||
| 403 | return page; | ||
| 404 | } | ||
| 405 | |||
| 406 | /* | ||
| 407 | * get_ksm_page: checks if the page at the virtual address in rmap_item | ||
| 408 | * is still PageKsm, in which case we can trust the content of the page, | ||
| 409 | * and it returns the page it got; or NULL if the page has been zapped. | ||
| 410 | */ | ||
| 411 | static struct page *get_ksm_page(struct rmap_item *rmap_item) | ||
| 412 | { | ||
| 413 | struct page *page; | ||
| 414 | |||
| 415 | page = get_mergeable_page(rmap_item); | ||
| 416 | if (page && !PageKsm(page)) { | ||
| 417 | put_page(page); | ||
| 418 | page = NULL; | ||
| 419 | } | ||
| 420 | return page; | ||
| 421 | } | ||
| 422 | |||
| 423 | /* | ||
| 424 | * Remove rmap_item from the stable or unstable tree: clean up its | ||
| 425 | * tree node and adjust the shared/sharing/unshared counts accordingly. | ||
| 426 | */ | ||
| 427 | static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) | ||
| 428 | { | ||
| 429 | if (in_stable_tree(rmap_item)) { | ||
| 430 | struct rmap_item *next_item = rmap_item->next; | ||
| 431 | |||
| 432 | if (rmap_item->address & NODE_FLAG) { | ||
| 433 | if (next_item) { | ||
| 434 | rb_replace_node(&rmap_item->node, | ||
| 435 | &next_item->node, | ||
| 436 | &root_stable_tree); | ||
| 437 | next_item->address |= NODE_FLAG; | ||
| 438 | ksm_pages_sharing--; | ||
| 439 | } else { | ||
| 440 | rb_erase(&rmap_item->node, &root_stable_tree); | ||
| 441 | ksm_pages_shared--; | ||
| 442 | } | ||
| 443 | } else { | ||
| 444 | struct rmap_item *prev_item = rmap_item->prev; | ||
| 445 | |||
| 446 | BUG_ON(prev_item->next != rmap_item); | ||
| 447 | prev_item->next = next_item; | ||
| 448 | if (next_item) { | ||
| 449 | BUG_ON(next_item->prev != rmap_item); | ||
| 450 | next_item->prev = rmap_item->prev; | ||
| 451 | } | ||
| 452 | ksm_pages_sharing--; | ||
| 453 | } | ||
| 454 | |||
| 455 | rmap_item->next = NULL; | ||
| 456 | |||
| 457 | } else if (rmap_item->address & NODE_FLAG) { | ||
| 458 | unsigned char age; | ||
| 459 | /* | ||
| 460 | * Usually ksmd can and must skip the rb_erase, because | ||
| 461 | * root_unstable_tree was already reset to RB_ROOT. | ||
| 462 | * But be careful when an mm is exiting: do the rb_erase | ||
| 463 | * if this rmap_item was inserted by this scan, rather | ||
| 464 | * than left over from before. | ||
| 465 | */ | ||
| 466 | age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); | ||
| 467 | BUG_ON(age > 1); | ||
| 468 | if (!age) | ||
| 469 | rb_erase(&rmap_item->node, &root_unstable_tree); | ||
| 470 | ksm_pages_unshared--; | ||
| 471 | } | ||
| 472 | |||
| 473 | rmap_item->address &= PAGE_MASK; | ||
| 474 | |||
| 475 | cond_resched(); /* we're called from many long loops */ | ||
| 476 | } | ||
| 477 | |||
| 478 | static void remove_trailing_rmap_items(struct mm_slot *mm_slot, | ||
| 479 | struct list_head *cur) | ||
| 480 | { | ||
| 481 | struct rmap_item *rmap_item; | ||
| 482 | |||
| 483 | while (cur != &mm_slot->rmap_list) { | ||
| 484 | rmap_item = list_entry(cur, struct rmap_item, link); | ||
| 485 | cur = cur->next; | ||
| 486 | remove_rmap_item_from_tree(rmap_item); | ||
| 487 | list_del(&rmap_item->link); | ||
| 488 | free_rmap_item(rmap_item); | ||
| 489 | } | ||
| 490 | } | ||
| 491 | |||
| 492 | /* | ||
| 493 | * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather | ||
| 494 | * than check every pte of a given vma, the locking doesn't quite work for | ||
| 495 | * that - an rmap_item is assigned to the stable tree after inserting ksm | ||
| 496 | * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing | ||
| 497 | * rmap_items from parent to child at fork time (so as not to waste time | ||
| 498 | * if exit comes before the next scan reaches it). | ||
| 499 | * | ||
| 500 | * Similarly, although we'd like to remove rmap_items (so updating counts | ||
| 501 | * and freeing memory) when unmerging an area, it's easier to leave that | ||
| 502 | * to the next pass of ksmd - consider, for example, how ksmd might be | ||
| 503 | * in cmp_and_merge_page on one of the rmap_items we would be removing. | ||
| 504 | */ | ||
| 505 | static int unmerge_ksm_pages(struct vm_area_struct *vma, | ||
| 506 | unsigned long start, unsigned long end) | ||
| 507 | { | ||
| 508 | unsigned long addr; | ||
| 509 | int err = 0; | ||
| 510 | |||
| 511 | for (addr = start; addr < end && !err; addr += PAGE_SIZE) { | ||
| 512 | if (ksm_test_exit(vma->vm_mm)) | ||
| 513 | break; | ||
| 514 | if (signal_pending(current)) | ||
| 515 | err = -ERESTARTSYS; | ||
| 516 | else | ||
| 517 | err = break_ksm(vma, addr); | ||
| 518 | } | ||
| 519 | return err; | ||
| 520 | } | ||
| 521 | |||
| 522 | #ifdef CONFIG_SYSFS | ||
| 523 | /* | ||
| 524 | * Only called through the sysfs control interface: | ||
| 525 | */ | ||
| 526 | static int unmerge_and_remove_all_rmap_items(void) | ||
| 527 | { | ||
| 528 | struct mm_slot *mm_slot; | ||
| 529 | struct mm_struct *mm; | ||
| 530 | struct vm_area_struct *vma; | ||
| 531 | int err = 0; | ||
| 532 | |||
| 533 | spin_lock(&ksm_mmlist_lock); | ||
| 534 | ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, | ||
| 535 | struct mm_slot, mm_list); | ||
| 536 | spin_unlock(&ksm_mmlist_lock); | ||
| 537 | |||
| 538 | for (mm_slot = ksm_scan.mm_slot; | ||
| 539 | mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { | ||
| 540 | mm = mm_slot->mm; | ||
| 541 | down_read(&mm->mmap_sem); | ||
| 542 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 543 | if (ksm_test_exit(mm)) | ||
| 544 | break; | ||
| 545 | if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) | ||
| 546 | continue; | ||
| 547 | err = unmerge_ksm_pages(vma, | ||
| 548 | vma->vm_start, vma->vm_end); | ||
| 549 | if (err) | ||
| 550 | goto error; | ||
| 551 | } | ||
| 552 | |||
| 553 | remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next); | ||
| 554 | |||
| 555 | spin_lock(&ksm_mmlist_lock); | ||
| 556 | ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, | ||
| 557 | struct mm_slot, mm_list); | ||
| 558 | if (ksm_test_exit(mm)) { | ||
| 559 | hlist_del(&mm_slot->link); | ||
| 560 | list_del(&mm_slot->mm_list); | ||
| 561 | spin_unlock(&ksm_mmlist_lock); | ||
| 562 | |||
| 563 | free_mm_slot(mm_slot); | ||
| 564 | clear_bit(MMF_VM_MERGEABLE, &mm->flags); | ||
| 565 | up_read(&mm->mmap_sem); | ||
| 566 | mmdrop(mm); | ||
| 567 | } else { | ||
| 568 | spin_unlock(&ksm_mmlist_lock); | ||
| 569 | up_read(&mm->mmap_sem); | ||
| 570 | } | ||
| 571 | } | ||
| 572 | |||
| 573 | ksm_scan.seqnr = 0; | ||
| 574 | return 0; | ||
| 575 | |||
| 576 | error: | ||
| 577 | up_read(&mm->mmap_sem); | ||
| 578 | spin_lock(&ksm_mmlist_lock); | ||
| 579 | ksm_scan.mm_slot = &ksm_mm_head; | ||
| 580 | spin_unlock(&ksm_mmlist_lock); | ||
| 581 | return err; | ||
| 582 | } | ||
| 583 | #endif /* CONFIG_SYSFS */ | ||
| 584 | |||
| 585 | static u32 calc_checksum(struct page *page) | ||
| 586 | { | ||
| 587 | u32 checksum; | ||
| 588 | void *addr = kmap_atomic(page, KM_USER0); | ||
| 589 | checksum = jhash2(addr, PAGE_SIZE / 4, 17); | ||
| 590 | kunmap_atomic(addr, KM_USER0); | ||
| 591 | return checksum; | ||
| 592 | } | ||
| 593 | |||
| 594 | static int memcmp_pages(struct page *page1, struct page *page2) | ||
| 595 | { | ||
| 596 | char *addr1, *addr2; | ||
| 597 | int ret; | ||
| 598 | |||
| 599 | addr1 = kmap_atomic(page1, KM_USER0); | ||
| 600 | addr2 = kmap_atomic(page2, KM_USER1); | ||
| 601 | ret = memcmp(addr1, addr2, PAGE_SIZE); | ||
| 602 | kunmap_atomic(addr2, KM_USER1); | ||
| 603 | kunmap_atomic(addr1, KM_USER0); | ||
| 604 | return ret; | ||
| 605 | } | ||
| 606 | |||
| 607 | static inline int pages_identical(struct page *page1, struct page *page2) | ||
| 608 | { | ||
| 609 | return !memcmp_pages(page1, page2); | ||
| 610 | } | ||
| 611 | |||
| 612 | static int write_protect_page(struct vm_area_struct *vma, struct page *page, | ||
| 613 | pte_t *orig_pte) | ||
| 614 | { | ||
| 615 | struct mm_struct *mm = vma->vm_mm; | ||
| 616 | unsigned long addr; | ||
| 617 | pte_t *ptep; | ||
| 618 | spinlock_t *ptl; | ||
| 619 | int swapped; | ||
| 620 | int err = -EFAULT; | ||
| 621 | |||
| 622 | addr = page_address_in_vma(page, vma); | ||
| 623 | if (addr == -EFAULT) | ||
| 624 | goto out; | ||
| 625 | |||
| 626 | ptep = page_check_address(page, mm, addr, &ptl, 0); | ||
| 627 | if (!ptep) | ||
| 628 | goto out; | ||
| 629 | |||
| 630 | if (pte_write(*ptep)) { | ||
| 631 | pte_t entry; | ||
| 632 | |||
| 633 | swapped = PageSwapCache(page); | ||
| 634 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
| 635 | /* | ||
| 636 | * Ok, this is tricky: when get_user_pages_fast() runs it doesn't | ||
| 637 | * take any lock, therefore the check that we are going to make | ||
| 638 | * with the pagecount against the mapcount is racy, and | ||
| 639 | * O_DIRECT can happen right after the check. | ||
| 640 | * So we clear the pte and flush the TLB before the check: | ||
| 641 | * this assures us that no O_DIRECT can happen after the check | ||
| 642 | * or in the middle of the check. | ||
| 643 | */ | ||
| 644 | entry = ptep_clear_flush(vma, addr, ptep); | ||
| 645 | /* | ||
| 646 | * Check that no O_DIRECT or similar I/O is in progress on the | ||
| 647 | * page | ||
| 648 | */ | ||
| 649 | if ((page_mapcount(page) + 2 + swapped) != page_count(page)) { | ||
| 650 | set_pte_at_notify(mm, addr, ptep, entry); | ||
| 651 | goto out_unlock; | ||
| 652 | } | ||
| 653 | entry = pte_wrprotect(entry); | ||
| 654 | set_pte_at_notify(mm, addr, ptep, entry); | ||
| 655 | } | ||
| 656 | *orig_pte = *ptep; | ||
| 657 | err = 0; | ||
| 658 | |||
| 659 | out_unlock: | ||
| 660 | pte_unmap_unlock(ptep, ptl); | ||
| 661 | out: | ||
| 662 | return err; | ||
| 663 | } | ||
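For reference, the arithmetic behind the reference-count check above: an anonymous page's page_count() is expected to be its page_mapcount() (one per pte that maps it), plus one if it sits in the swap cache, plus the two references KSM itself is holding at this point - one taken by the scanner's follow_page(..., FOLL_GET) and one by try_to_merge_one_page()'s get_page(). Anything beyond mapcount + swapped + 2 therefore means some other path (O_DIRECT via get_user_pages, for instance) has pinned the page, and the merge is abandoned rather than risk losing a concurrent write.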
| 664 | |||
| 665 | /** | ||
| 666 | * replace_page - replace page in vma by new ksm page | ||
| 667 | * @vma: vma that holds the pte pointing to oldpage | ||
| 668 | * @oldpage: the page we are replacing by newpage | ||
| 669 | * @newpage: the ksm page we replace oldpage by | ||
| 670 | * @orig_pte: the original value of the pte | ||
| 671 | * | ||
| 672 | * Returns 0 on success, -EFAULT on failure. | ||
| 673 | */ | ||
| 674 | static int replace_page(struct vm_area_struct *vma, struct page *oldpage, | ||
| 675 | struct page *newpage, pte_t orig_pte) | ||
| 676 | { | ||
| 677 | struct mm_struct *mm = vma->vm_mm; | ||
| 678 | pgd_t *pgd; | ||
| 679 | pud_t *pud; | ||
| 680 | pmd_t *pmd; | ||
| 681 | pte_t *ptep; | ||
| 682 | spinlock_t *ptl; | ||
| 683 | unsigned long addr; | ||
| 684 | pgprot_t prot; | ||
| 685 | int err = -EFAULT; | ||
| 686 | |||
| 687 | prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE); | ||
| 688 | |||
| 689 | addr = page_address_in_vma(oldpage, vma); | ||
| 690 | if (addr == -EFAULT) | ||
| 691 | goto out; | ||
| 692 | |||
| 693 | pgd = pgd_offset(mm, addr); | ||
| 694 | if (!pgd_present(*pgd)) | ||
| 695 | goto out; | ||
| 696 | |||
| 697 | pud = pud_offset(pgd, addr); | ||
| 698 | if (!pud_present(*pud)) | ||
| 699 | goto out; | ||
| 700 | |||
| 701 | pmd = pmd_offset(pud, addr); | ||
| 702 | if (!pmd_present(*pmd)) | ||
| 703 | goto out; | ||
| 704 | |||
| 705 | ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); | ||
| 706 | if (!pte_same(*ptep, orig_pte)) { | ||
| 707 | pte_unmap_unlock(ptep, ptl); | ||
| 708 | goto out; | ||
| 709 | } | ||
| 710 | |||
| 711 | get_page(newpage); | ||
| 712 | page_add_ksm_rmap(newpage); | ||
| 713 | |||
| 714 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | ||
| 715 | ptep_clear_flush(vma, addr, ptep); | ||
| 716 | set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot)); | ||
| 717 | |||
| 718 | page_remove_rmap(oldpage); | ||
| 719 | put_page(oldpage); | ||
| 720 | |||
| 721 | pte_unmap_unlock(ptep, ptl); | ||
| 722 | err = 0; | ||
| 723 | out: | ||
| 724 | return err; | ||
| 725 | } | ||
| 726 | |||
| 727 | /* | ||
| 728 | * try_to_merge_one_page - take two pages and merge them into one | ||
| 729 | * @vma: the vma that holds the pte pointing into oldpage | ||
| 730 | * @oldpage: the page that we want to replace with newpage | ||
| 731 | * @newpage: the page that we want to map instead of oldpage | ||
| 732 | * | ||
| 733 | * Note: | ||
| 734 | * oldpage should be a PageAnon page, while newpage should be a PageKsm page, | ||
| 735 | * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm. | ||
| 736 | * | ||
| 737 | * This function returns 0 if the pages were merged, -EFAULT otherwise. | ||
| 738 | */ | ||
| 739 | static int try_to_merge_one_page(struct vm_area_struct *vma, | ||
| 740 | struct page *oldpage, | ||
| 741 | struct page *newpage) | ||
| 742 | { | ||
| 743 | pte_t orig_pte = __pte(0); | ||
| 744 | int err = -EFAULT; | ||
| 745 | |||
| 746 | if (!(vma->vm_flags & VM_MERGEABLE)) | ||
| 747 | goto out; | ||
| 748 | |||
| 749 | if (!PageAnon(oldpage)) | ||
| 750 | goto out; | ||
| 751 | |||
| 752 | get_page(newpage); | ||
| 753 | get_page(oldpage); | ||
| 754 | |||
| 755 | /* | ||
| 756 | * We need the page lock to read a stable PageSwapCache in | ||
| 757 | * write_protect_page(). We use trylock_page() instead of | ||
| 758 | * lock_page() because we don't want to wait here - we | ||
| 759 | * prefer to continue scanning and merging different pages, | ||
| 760 | * then come back to this page when it is unlocked. | ||
| 761 | */ | ||
| 762 | if (!trylock_page(oldpage)) | ||
| 763 | goto out_putpage; | ||
| 764 | /* | ||
| 765 | * If this anonymous page is mapped only here, its pte may need | ||
| 766 | * to be write-protected. If it's mapped elsewhere, all of its | ||
| 767 | * ptes are necessarily already write-protected. But in either | ||
| 768 | * case, we need to lock and check page_count is not raised. | ||
| 769 | */ | ||
| 770 | if (write_protect_page(vma, oldpage, &orig_pte)) { | ||
| 771 | unlock_page(oldpage); | ||
| 772 | goto out_putpage; | ||
| 773 | } | ||
| 774 | unlock_page(oldpage); | ||
| 775 | |||
| 776 | if (pages_identical(oldpage, newpage)) | ||
| 777 | err = replace_page(vma, oldpage, newpage, orig_pte); | ||
| 778 | |||
| 779 | out_putpage: | ||
| 780 | put_page(oldpage); | ||
| 781 | put_page(newpage); | ||
| 782 | out: | ||
| 783 | return err; | ||
| 784 | } | ||
| 785 | |||
| 786 | /* | ||
| 787 | * try_to_merge_with_ksm_page - like try_to_merge_two_pages, | ||
| 788 | * but no new kernel page is allocated: kpage must already be a ksm page. | ||
| 789 | */ | ||
| 790 | static int try_to_merge_with_ksm_page(struct mm_struct *mm1, | ||
| 791 | unsigned long addr1, | ||
| 792 | struct page *page1, | ||
| 793 | struct page *kpage) | ||
| 794 | { | ||
| 795 | struct vm_area_struct *vma; | ||
| 796 | int err = -EFAULT; | ||
| 797 | |||
| 798 | down_read(&mm1->mmap_sem); | ||
| 799 | if (ksm_test_exit(mm1)) | ||
| 800 | goto out; | ||
| 801 | |||
| 802 | vma = find_vma(mm1, addr1); | ||
| 803 | if (!vma || vma->vm_start > addr1) | ||
| 804 | goto out; | ||
| 805 | |||
| 806 | err = try_to_merge_one_page(vma, page1, kpage); | ||
| 807 | out: | ||
| 808 | up_read(&mm1->mmap_sem); | ||
| 809 | return err; | ||
| 810 | } | ||
| 811 | |||
| 812 | /* | ||
| 813 | * try_to_merge_two_pages - take two identical pages and prepare them | ||
| 814 | * to be merged into one page. | ||
| 815 | * | ||
| 816 | * This function returns 0 if we successfully mapped two identical pages | ||
| 817 | * into one page, -EFAULT otherwise. | ||
| 818 | * | ||
| 819 | * Note that this function allocates a new kernel page: if one of the pages | ||
| 820 | * is already a ksm page, try_to_merge_with_ksm_page should be used. | ||
| 821 | */ | ||
| 822 | static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1, | ||
| 823 | struct page *page1, struct mm_struct *mm2, | ||
| 824 | unsigned long addr2, struct page *page2) | ||
| 825 | { | ||
| 826 | struct vm_area_struct *vma; | ||
| 827 | struct page *kpage; | ||
| 828 | int err = -EFAULT; | ||
| 829 | |||
| 830 | /* | ||
| 831 | * The number of nodes in the stable tree | ||
| 832 | * is the number of kernel pages that we hold. | ||
| 833 | */ | ||
| 834 | if (ksm_max_kernel_pages && | ||
| 835 | ksm_max_kernel_pages <= ksm_pages_shared) | ||
| 836 | return err; | ||
| 837 | |||
| 838 | kpage = alloc_page(GFP_HIGHUSER); | ||
| 839 | if (!kpage) | ||
| 840 | return err; | ||
| 841 | |||
| 842 | down_read(&mm1->mmap_sem); | ||
| 843 | if (ksm_test_exit(mm1)) { | ||
| 844 | up_read(&mm1->mmap_sem); | ||
| 845 | goto out; | ||
| 846 | } | ||
| 847 | vma = find_vma(mm1, addr1); | ||
| 848 | if (!vma || vma->vm_start > addr1) { | ||
| 849 | up_read(&mm1->mmap_sem); | ||
| 850 | goto out; | ||
| 851 | } | ||
| 852 | |||
| 853 | copy_user_highpage(kpage, page1, addr1, vma); | ||
| 854 | err = try_to_merge_one_page(vma, page1, kpage); | ||
| 855 | up_read(&mm1->mmap_sem); | ||
| 856 | |||
| 857 | if (!err) { | ||
| 858 | err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage); | ||
| 859 | /* | ||
| 860 | * If that fails, we have a ksm page with only one pte | ||
| 861 | * pointing to it: so break it. | ||
| 862 | */ | ||
| 863 | if (err) | ||
| 864 | break_cow(mm1, addr1); | ||
| 865 | } | ||
| 866 | out: | ||
| 867 | put_page(kpage); | ||
| 868 | return err; | ||
| 869 | } | ||
| 870 | |||
| 871 | /* | ||
| 872 | * stable_tree_search - search page inside the stable tree | ||
| 873 | * @page: the page that we are searching for identical pages to. | ||
| 874 | * @page2: used to return the identical page that we found and are | ||
| 875 | * holding inside the stable tree. | ||
| 876 | * @rmap_item: the reverse mapping item | ||
| 877 | * | ||
| 878 | * This function checks if there is a page inside the stable tree | ||
| 879 | * with identical content to the page that we are scanning right now. | ||
| 880 | * | ||
| 881 | * This function returns the rmap_item of the identical page if found, | ||
| 882 | * NULL otherwise. | ||
| 883 | */ | ||
| 884 | static struct rmap_item *stable_tree_search(struct page *page, | ||
| 885 | struct page **page2, | ||
| 886 | struct rmap_item *rmap_item) | ||
| 887 | { | ||
| 888 | struct rb_node *node = root_stable_tree.rb_node; | ||
| 889 | |||
| 890 | while (node) { | ||
| 891 | struct rmap_item *tree_rmap_item, *next_rmap_item; | ||
| 892 | int ret; | ||
| 893 | |||
| 894 | tree_rmap_item = rb_entry(node, struct rmap_item, node); | ||
| 895 | while (tree_rmap_item) { | ||
| 896 | BUG_ON(!in_stable_tree(tree_rmap_item)); | ||
| 897 | cond_resched(); | ||
| 898 | page2[0] = get_ksm_page(tree_rmap_item); | ||
| 899 | if (page2[0]) | ||
| 900 | break; | ||
| 901 | next_rmap_item = tree_rmap_item->next; | ||
| 902 | remove_rmap_item_from_tree(tree_rmap_item); | ||
| 903 | tree_rmap_item = next_rmap_item; | ||
| 904 | } | ||
| 905 | if (!tree_rmap_item) | ||
| 906 | return NULL; | ||
| 907 | |||
| 908 | ret = memcmp_pages(page, page2[0]); | ||
| 909 | |||
| 910 | if (ret < 0) { | ||
| 911 | put_page(page2[0]); | ||
| 912 | node = node->rb_left; | ||
| 913 | } else if (ret > 0) { | ||
| 914 | put_page(page2[0]); | ||
| 915 | node = node->rb_right; | ||
| 916 | } else { | ||
| 917 | return tree_rmap_item; | ||
| 918 | } | ||
| 919 | } | ||
| 920 | |||
| 921 | return NULL; | ||
| 922 | } | ||
| 923 | |||
| 924 | /* | ||
| 925 | * stable_tree_insert - insert rmap_item pointing to new ksm page | ||
| 926 | * into the stable tree. | ||
| 927 | * | ||
| 928 | * @page: the page that we are searching for an identical page to inside | ||
| 929 | * the stable tree. | ||
| 930 | * @rmap_item: pointer to the reverse mapping item. | ||
| 931 | * | ||
| 932 | * This function returns rmap_item on success, NULL otherwise. | ||
| 933 | */ | ||
| 934 | static struct rmap_item *stable_tree_insert(struct page *page, | ||
| 935 | struct rmap_item *rmap_item) | ||
| 936 | { | ||
| 937 | struct rb_node **new = &root_stable_tree.rb_node; | ||
| 938 | struct rb_node *parent = NULL; | ||
| 939 | |||
| 940 | while (*new) { | ||
| 941 | struct rmap_item *tree_rmap_item, *next_rmap_item; | ||
| 942 | struct page *tree_page; | ||
| 943 | int ret; | ||
| 944 | |||
| 945 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | ||
| 946 | while (tree_rmap_item) { | ||
| 947 | BUG_ON(!in_stable_tree(tree_rmap_item)); | ||
| 948 | cond_resched(); | ||
| 949 | tree_page = get_ksm_page(tree_rmap_item); | ||
| 950 | if (tree_page) | ||
| 951 | break; | ||
| 952 | next_rmap_item = tree_rmap_item->next; | ||
| 953 | remove_rmap_item_from_tree(tree_rmap_item); | ||
| 954 | tree_rmap_item = next_rmap_item; | ||
| 955 | } | ||
| 956 | if (!tree_rmap_item) | ||
| 957 | return NULL; | ||
| 958 | |||
| 959 | ret = memcmp_pages(page, tree_page); | ||
| 960 | put_page(tree_page); | ||
| 961 | |||
| 962 | parent = *new; | ||
| 963 | if (ret < 0) | ||
| 964 | new = &parent->rb_left; | ||
| 965 | else if (ret > 0) | ||
| 966 | new = &parent->rb_right; | ||
| 967 | else { | ||
| 968 | /* | ||
| 969 | * It is not a bug that stable_tree_search() didn't | ||
| 970 | * find this node: because at that time our page was | ||
| 971 | * not yet write-protected, so may have changed since. | ||
| 972 | */ | ||
| 973 | return NULL; | ||
| 974 | } | ||
| 975 | } | ||
| 976 | |||
| 977 | rmap_item->address |= NODE_FLAG | STABLE_FLAG; | ||
| 978 | rmap_item->next = NULL; | ||
| 979 | rb_link_node(&rmap_item->node, parent, new); | ||
| 980 | rb_insert_color(&rmap_item->node, &root_stable_tree); | ||
| 981 | |||
| 982 | ksm_pages_shared++; | ||
| 983 | return rmap_item; | ||
| 984 | } | ||
| 985 | |||
| 986 | /* | ||
| 987 | * unstable_tree_search_insert - search and insert items into the unstable tree. | ||
| 988 | * | ||
| 989 | * @page: the page that we are going to search for an identical page to, | ||
| 990 | * or to insert into the unstable tree | ||
| 991 | * @page2: pointer into identical page that was found inside the unstable tree | ||
| 992 | * @rmap_item: the reverse mapping item of page | ||
| 993 | * | ||
| 994 | * This function searches for a page in the unstable tree identical to the | ||
| 995 | * page currently being scanned; and if no identical page is found in the | ||
| 996 | * tree, we insert rmap_item as a new object into the unstable tree. | ||
| 997 | * | ||
| 998 | * This function returns a pointer to the rmap_item found to be identical | ||
| 999 | * to the currently scanned page, NULL otherwise. | ||
| 1000 | * | ||
| 1001 | * This function does both searching and inserting, because they share | ||
| 1002 | * the same walking algorithm in an rbtree. | ||
| 1003 | */ | ||
| 1004 | static struct rmap_item *unstable_tree_search_insert(struct page *page, | ||
| 1005 | struct page **page2, | ||
| 1006 | struct rmap_item *rmap_item) | ||
| 1007 | { | ||
| 1008 | struct rb_node **new = &root_unstable_tree.rb_node; | ||
| 1009 | struct rb_node *parent = NULL; | ||
| 1010 | |||
| 1011 | while (*new) { | ||
| 1012 | struct rmap_item *tree_rmap_item; | ||
| 1013 | int ret; | ||
| 1014 | |||
| 1015 | cond_resched(); | ||
| 1016 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | ||
| 1017 | page2[0] = get_mergeable_page(tree_rmap_item); | ||
| 1018 | if (!page2[0]) | ||
| 1019 | return NULL; | ||
| 1020 | |||
| 1021 | /* | ||
| 1022 | * Don't substitute an unswappable ksm page | ||
| 1023 | * just for one good swappable forked page. | ||
| 1024 | */ | ||
| 1025 | if (page == page2[0]) { | ||
| 1026 | put_page(page2[0]); | ||
| 1027 | return NULL; | ||
| 1028 | } | ||
| 1029 | |||
| 1030 | ret = memcmp_pages(page, page2[0]); | ||
| 1031 | |||
| 1032 | parent = *new; | ||
| 1033 | if (ret < 0) { | ||
| 1034 | put_page(page2[0]); | ||
| 1035 | new = &parent->rb_left; | ||
| 1036 | } else if (ret > 0) { | ||
| 1037 | put_page(page2[0]); | ||
| 1038 | new = &parent->rb_right; | ||
| 1039 | } else { | ||
| 1040 | return tree_rmap_item; | ||
| 1041 | } | ||
| 1042 | } | ||
| 1043 | |||
| 1044 | rmap_item->address |= NODE_FLAG; | ||
| 1045 | rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); | ||
| 1046 | rb_link_node(&rmap_item->node, parent, new); | ||
| 1047 | rb_insert_color(&rmap_item->node, &root_unstable_tree); | ||
| 1048 | |||
| 1049 | ksm_pages_unshared++; | ||
| 1050 | return NULL; | ||
| 1051 | } | ||
| 1052 | |||
| 1053 | /* | ||
| 1054 | * stable_tree_append - add another rmap_item to the linked list of | ||
| 1055 | * rmap_items hanging off a given node of the stable tree, all sharing | ||
| 1056 | * the same ksm page. | ||
| 1057 | */ | ||
| 1058 | static void stable_tree_append(struct rmap_item *rmap_item, | ||
| 1059 | struct rmap_item *tree_rmap_item) | ||
| 1060 | { | ||
| 1061 | rmap_item->next = tree_rmap_item->next; | ||
| 1062 | rmap_item->prev = tree_rmap_item; | ||
| 1063 | |||
| 1064 | if (tree_rmap_item->next) | ||
| 1065 | tree_rmap_item->next->prev = rmap_item; | ||
| 1066 | |||
| 1067 | tree_rmap_item->next = rmap_item; | ||
| 1068 | rmap_item->address |= STABLE_FLAG; | ||
| 1069 | |||
| 1070 | ksm_pages_sharing++; | ||
| 1071 | } | ||
| 1072 | |||
| 1073 | /* | ||
| 1074 | * cmp_and_merge_page - first see if page can be merged into the stable tree; | ||
| 1075 | * if not, compare checksum to previous and if it's the same, see if page can | ||
| 1076 | * be inserted into the unstable tree, or merged with a page already there and | ||
| 1077 | * both transferred to the stable tree. | ||
| 1078 | * | ||
| 1079 | * @page: the page that we are searching for an identical page to. | ||
| 1080 | * @rmap_item: the reverse mapping into the virtual address of this page | ||
| 1081 | */ | ||
| 1082 | static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) | ||
| 1083 | { | ||
| 1084 | struct page *page2[1]; | ||
| 1085 | struct rmap_item *tree_rmap_item; | ||
| 1086 | unsigned int checksum; | ||
| 1087 | int err; | ||
| 1088 | |||
| 1089 | if (in_stable_tree(rmap_item)) | ||
| 1090 | remove_rmap_item_from_tree(rmap_item); | ||
| 1091 | |||
| 1092 | /* We first start with searching the page inside the stable tree */ | ||
| 1093 | tree_rmap_item = stable_tree_search(page, page2, rmap_item); | ||
| 1094 | if (tree_rmap_item) { | ||
| 1095 | if (page == page2[0]) /* forked */ | ||
| 1096 | err = 0; | ||
| 1097 | else | ||
| 1098 | err = try_to_merge_with_ksm_page(rmap_item->mm, | ||
| 1099 | rmap_item->address, | ||
| 1100 | page, page2[0]); | ||
| 1101 | put_page(page2[0]); | ||
| 1102 | |||
| 1103 | if (!err) { | ||
| 1104 | /* | ||
| 1105 | * The page was successfully merged: | ||
| 1106 | * add its rmap_item to the stable tree. | ||
| 1107 | */ | ||
| 1108 | stable_tree_append(rmap_item, tree_rmap_item); | ||
| 1109 | } | ||
| 1110 | return; | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | /* | ||
| 1114 | * A ksm page might have got here by fork, but its other | ||
| 1115 | * references have already been removed from the stable tree. | ||
| 1116 | * Or it might be left over from a break_ksm which failed | ||
| 1117 | * when the mem_cgroup had reached its limit: try again now. | ||
| 1118 | */ | ||
| 1119 | if (PageKsm(page)) | ||
| 1120 | break_cow(rmap_item->mm, rmap_item->address); | ||
| 1121 | |||
| 1122 | /* | ||
| 1123 | * If the hash value of the page has changed since the last time we | ||
| 1124 | * calculated it, this page is likely to change frequently: therefore we | ||
| 1125 | * don't want to insert it into the unstable tree, and we don't want to | ||
| 1126 | * waste our time searching for something identical to it there. | ||
| 1127 | */ | ||
| 1128 | checksum = calc_checksum(page); | ||
| 1129 | if (rmap_item->oldchecksum != checksum) { | ||
| 1130 | rmap_item->oldchecksum = checksum; | ||
| 1131 | return; | ||
| 1132 | } | ||
| 1133 | |||
| 1134 | tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item); | ||
| 1135 | if (tree_rmap_item) { | ||
| 1136 | err = try_to_merge_two_pages(rmap_item->mm, | ||
| 1137 | rmap_item->address, page, | ||
| 1138 | tree_rmap_item->mm, | ||
| 1139 | tree_rmap_item->address, page2[0]); | ||
| 1140 | /* | ||
| 1141 | * As soon as we merge this page, we want to remove the | ||
| 1142 | * rmap_item of the page we have merged with from the unstable | ||
| 1143 | * tree, and insert it instead as new node in the stable tree. | ||
| 1144 | */ | ||
| 1145 | if (!err) { | ||
| 1146 | rb_erase(&tree_rmap_item->node, &root_unstable_tree); | ||
| 1147 | tree_rmap_item->address &= ~NODE_FLAG; | ||
| 1148 | ksm_pages_unshared--; | ||
| 1149 | |||
| 1150 | /* | ||
| 1151 | * If we fail to insert the page into the stable tree, | ||
| 1152 | * we will have 2 virtual addresses that are pointing | ||
| 1153 | * to a ksm page left outside the stable tree, | ||
| 1154 | * in which case we need to break_cow on both. | ||
| 1155 | */ | ||
| 1156 | if (stable_tree_insert(page2[0], tree_rmap_item)) | ||
| 1157 | stable_tree_append(rmap_item, tree_rmap_item); | ||
| 1158 | else { | ||
| 1159 | break_cow(tree_rmap_item->mm, | ||
| 1160 | tree_rmap_item->address); | ||
| 1161 | break_cow(rmap_item->mm, rmap_item->address); | ||
| 1162 | } | ||
| 1163 | } | ||
| 1164 | |||
| 1165 | put_page(page2[0]); | ||
| 1166 | } | ||
| 1167 | } | ||
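Taken together, the routines above are KSM's core algorithm: a stable tree of already-merged, write-protected pages ordered by raw memcmp(), an unstable tree of merge candidates rebuilt from scratch on every scan, and a per-page checksum that keeps volatile pages out of both trees. The following userspace sketch compresses that flow for illustration only - the names are invented, linear arrays stand in for the rbtrees, and there is no write-protection or COW:

/*
 * Userspace sketch of KSM's two-tree idea; not kernel code.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PG        4096
#define MAX_PAGES 64

struct tracked {
        unsigned char page[PG];
        uint32_t oldsum;
};

static uint32_t checksum(const unsigned char *p)    /* stand-in for jhash2() */
{
        uint32_t h = 17;
        for (int i = 0; i < PG; i++)
                h = h * 31 + p[i];
        return h;
}

/* One pass in the spirit of cmp_and_merge_page(): stable contents survive
 * across passes, the unstable set is rebuilt from scratch each pass. */
static void scan_pass(struct tracked *t, int n,
                      const unsigned char **stable, int *nstable)
{
        const unsigned char *unstable[MAX_PAGES];
        int nunstable = 0;

        for (int i = 0; i < n; i++) {
                const unsigned char *p = t[i].page;
                uint32_t sum = checksum(p);
                int j;

                /* 1. Already identical to a stable ("ksm") page? */
                for (j = 0; j < *nstable; j++)
                        if (!memcmp(p, stable[j], PG))
                                break;
                if (j < *nstable) {
                        printf("page %d shares stable entry %d\n", i, j);
                        continue;
                }

                /* 2. Checksum changed since the last pass: too volatile,
                 *    don't even search the unstable set. */
                if (sum != t[i].oldsum) {
                        t[i].oldsum = sum;
                        continue;
                }

                /* 3. Identical to another candidate seen this pass?
                 *    Promote that content to the stable set. */
                for (j = 0; j < nunstable; j++)
                        if (!memcmp(p, unstable[j], PG))
                                break;
                if (j < nunstable) {
                        stable[(*nstable)++] = unstable[j];
                        printf("pages merged into stable entry %d\n",
                               *nstable - 1);
                } else if (nunstable < MAX_PAGES) {
                        unstable[nunstable++] = p;
                }
        }
}

int main(void)
{
        static struct tracked t[4];
        const unsigned char *stable[MAX_PAGES];
        int nstable = 0;

        memset(t[0].page, 0xaa, PG);            /* three identical pages */
        memset(t[1].page, 0xaa, PG);
        memset(t[2].page, 0xaa, PG);
        memset(t[3].page, 0x55, PG);            /* one different page */

        scan_pass(t, 4, stable, &nstable);      /* first pass only records checksums */
        scan_pass(t, 4, stable, &nstable);      /* second pass can merge */
        return 0;
}

The step the sketch cannot show is what makes the sharing safe in the kernel: try_to_merge_two_pages() copies the content into a freshly allocated kernel page that replace_page() maps write-protected, so a later write by either process simply breaks COW again.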
| 1168 | |||
| 1169 | static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, | ||
| 1170 | struct list_head *cur, | ||
| 1171 | unsigned long addr) | ||
| 1172 | { | ||
| 1173 | struct rmap_item *rmap_item; | ||
| 1174 | |||
| 1175 | while (cur != &mm_slot->rmap_list) { | ||
| 1176 | rmap_item = list_entry(cur, struct rmap_item, link); | ||
| 1177 | if ((rmap_item->address & PAGE_MASK) == addr) { | ||
| 1178 | if (!in_stable_tree(rmap_item)) | ||
| 1179 | remove_rmap_item_from_tree(rmap_item); | ||
| 1180 | return rmap_item; | ||
| 1181 | } | ||
| 1182 | if (rmap_item->address > addr) | ||
| 1183 | break; | ||
| 1184 | cur = cur->next; | ||
| 1185 | remove_rmap_item_from_tree(rmap_item); | ||
| 1186 | list_del(&rmap_item->link); | ||
| 1187 | free_rmap_item(rmap_item); | ||
| 1188 | } | ||
| 1189 | |||
| 1190 | rmap_item = alloc_rmap_item(); | ||
| 1191 | if (rmap_item) { | ||
| 1192 | /* It has already been zeroed */ | ||
| 1193 | rmap_item->mm = mm_slot->mm; | ||
| 1194 | rmap_item->address = addr; | ||
| 1195 | list_add_tail(&rmap_item->link, cur); | ||
| 1196 | } | ||
| 1197 | return rmap_item; | ||
| 1198 | } | ||
| 1199 | |||
| 1200 | static struct rmap_item *scan_get_next_rmap_item(struct page **page) | ||
| 1201 | { | ||
| 1202 | struct mm_struct *mm; | ||
| 1203 | struct mm_slot *slot; | ||
| 1204 | struct vm_area_struct *vma; | ||
| 1205 | struct rmap_item *rmap_item; | ||
| 1206 | |||
| 1207 | if (list_empty(&ksm_mm_head.mm_list)) | ||
| 1208 | return NULL; | ||
| 1209 | |||
| 1210 | slot = ksm_scan.mm_slot; | ||
| 1211 | if (slot == &ksm_mm_head) { | ||
| 1212 | root_unstable_tree = RB_ROOT; | ||
| 1213 | |||
| 1214 | spin_lock(&ksm_mmlist_lock); | ||
| 1215 | slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); | ||
| 1216 | ksm_scan.mm_slot = slot; | ||
| 1217 | spin_unlock(&ksm_mmlist_lock); | ||
| 1218 | next_mm: | ||
| 1219 | ksm_scan.address = 0; | ||
| 1220 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | ||
| 1221 | struct rmap_item, link); | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | mm = slot->mm; | ||
| 1225 | down_read(&mm->mmap_sem); | ||
| 1226 | if (ksm_test_exit(mm)) | ||
| 1227 | vma = NULL; | ||
| 1228 | else | ||
| 1229 | vma = find_vma(mm, ksm_scan.address); | ||
| 1230 | |||
| 1231 | for (; vma; vma = vma->vm_next) { | ||
| 1232 | if (!(vma->vm_flags & VM_MERGEABLE)) | ||
| 1233 | continue; | ||
| 1234 | if (ksm_scan.address < vma->vm_start) | ||
| 1235 | ksm_scan.address = vma->vm_start; | ||
| 1236 | if (!vma->anon_vma) | ||
| 1237 | ksm_scan.address = vma->vm_end; | ||
| 1238 | |||
| 1239 | while (ksm_scan.address < vma->vm_end) { | ||
| 1240 | if (ksm_test_exit(mm)) | ||
| 1241 | break; | ||
| 1242 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); | ||
| 1243 | if (*page && PageAnon(*page)) { | ||
| 1244 | flush_anon_page(vma, *page, ksm_scan.address); | ||
| 1245 | flush_dcache_page(*page); | ||
| 1246 | rmap_item = get_next_rmap_item(slot, | ||
| 1247 | ksm_scan.rmap_item->link.next, | ||
| 1248 | ksm_scan.address); | ||
| 1249 | if (rmap_item) { | ||
| 1250 | ksm_scan.rmap_item = rmap_item; | ||
| 1251 | ksm_scan.address += PAGE_SIZE; | ||
| 1252 | } else | ||
| 1253 | put_page(*page); | ||
| 1254 | up_read(&mm->mmap_sem); | ||
| 1255 | return rmap_item; | ||
| 1256 | } | ||
| 1257 | if (*page) | ||
| 1258 | put_page(*page); | ||
| 1259 | ksm_scan.address += PAGE_SIZE; | ||
| 1260 | cond_resched(); | ||
| 1261 | } | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | if (ksm_test_exit(mm)) { | ||
| 1265 | ksm_scan.address = 0; | ||
| 1266 | ksm_scan.rmap_item = list_entry(&slot->rmap_list, | ||
| 1267 | struct rmap_item, link); | ||
| 1268 | } | ||
| 1269 | /* | ||
| 1270 | * Nuke all the rmap_items that are above this current rmap: | ||
| 1271 | * because there were no VM_MERGEABLE vmas with such addresses. | ||
| 1272 | */ | ||
| 1273 | remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next); | ||
| 1274 | |||
| 1275 | spin_lock(&ksm_mmlist_lock); | ||
| 1276 | ksm_scan.mm_slot = list_entry(slot->mm_list.next, | ||
| 1277 | struct mm_slot, mm_list); | ||
| 1278 | if (ksm_scan.address == 0) { | ||
| 1279 | /* | ||
| 1280 | * We've completed a full scan of all vmas, holding mmap_sem | ||
| 1281 | * throughout, and found no VM_MERGEABLE: so do the same as | ||
| 1282 | * __ksm_exit does to remove this mm from all our lists now. | ||
| 1283 | * This applies either when cleaning up after __ksm_exit | ||
| 1284 | * (but beware: we can reach here even before __ksm_exit), | ||
| 1285 | * or when all VM_MERGEABLE areas have been unmapped (and | ||
| 1286 | * mmap_sem then protects against race with MADV_MERGEABLE). | ||
| 1287 | */ | ||
| 1288 | hlist_del(&slot->link); | ||
| 1289 | list_del(&slot->mm_list); | ||
| 1290 | spin_unlock(&ksm_mmlist_lock); | ||
| 1291 | |||
| 1292 | free_mm_slot(slot); | ||
| 1293 | clear_bit(MMF_VM_MERGEABLE, &mm->flags); | ||
| 1294 | up_read(&mm->mmap_sem); | ||
| 1295 | mmdrop(mm); | ||
| 1296 | } else { | ||
| 1297 | spin_unlock(&ksm_mmlist_lock); | ||
| 1298 | up_read(&mm->mmap_sem); | ||
| 1299 | } | ||
| 1300 | |||
| 1301 | /* Repeat until we've completed scanning the whole list */ | ||
| 1302 | slot = ksm_scan.mm_slot; | ||
| 1303 | if (slot != &ksm_mm_head) | ||
| 1304 | goto next_mm; | ||
| 1305 | |||
| 1306 | ksm_scan.seqnr++; | ||
| 1307 | return NULL; | ||
| 1308 | } | ||
| 1309 | |||
| 1310 | /** | ||
| 1311 | * ksm_do_scan - the ksm scanner main worker function. | ||
| 1312 | * @scan_npages: number of pages we want to scan before we return. | ||
| 1313 | */ | ||
| 1314 | static void ksm_do_scan(unsigned int scan_npages) | ||
| 1315 | { | ||
| 1316 | struct rmap_item *rmap_item; | ||
| 1317 | struct page *page; | ||
| 1318 | |||
| 1319 | while (scan_npages--) { | ||
| 1320 | cond_resched(); | ||
| 1321 | rmap_item = scan_get_next_rmap_item(&page); | ||
| 1322 | if (!rmap_item) | ||
| 1323 | return; | ||
| 1324 | if (!PageKsm(page) || !in_stable_tree(rmap_item)) | ||
| 1325 | cmp_and_merge_page(page, rmap_item); | ||
| 1326 | else if (page_mapcount(page) == 1) { | ||
| 1327 | /* | ||
| 1328 | * Replace now-unshared ksm page by ordinary page. | ||
| 1329 | */ | ||
| 1330 | break_cow(rmap_item->mm, rmap_item->address); | ||
| 1331 | remove_rmap_item_from_tree(rmap_item); | ||
| 1332 | rmap_item->oldchecksum = calc_checksum(page); | ||
| 1333 | } | ||
| 1334 | put_page(page); | ||
| 1335 | } | ||
| 1336 | } | ||
| 1337 | |||
| 1338 | static int ksmd_should_run(void) | ||
| 1339 | { | ||
| 1340 | return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); | ||
| 1341 | } | ||
| 1342 | |||
| 1343 | static int ksm_scan_thread(void *nothing) | ||
| 1344 | { | ||
| 1345 | set_user_nice(current, 5); | ||
| 1346 | |||
| 1347 | while (!kthread_should_stop()) { | ||
| 1348 | mutex_lock(&ksm_thread_mutex); | ||
| 1349 | if (ksmd_should_run()) | ||
| 1350 | ksm_do_scan(ksm_thread_pages_to_scan); | ||
| 1351 | mutex_unlock(&ksm_thread_mutex); | ||
| 1352 | |||
| 1353 | if (ksmd_should_run()) { | ||
| 1354 | schedule_timeout_interruptible( | ||
| 1355 | msecs_to_jiffies(ksm_thread_sleep_millisecs)); | ||
| 1356 | } else { | ||
| 1357 | wait_event_interruptible(ksm_thread_wait, | ||
| 1358 | ksmd_should_run() || kthread_should_stop()); | ||
| 1359 | } | ||
| 1360 | } | ||
| 1361 | return 0; | ||
| 1362 | } | ||
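Because each wake-up scans at most ksm_thread_pages_to_scan pages and is followed by a sleep of ksm_thread_sleep_millisecs, ksmd's throughput is bounded by roughly pages_to_scan * 1000 / sleep_millisecs pages per second (ignoring the time spent scanning). With illustrative settings of pages_to_scan = 100 and sleep_millisecs = 20, for example, that is about 5000 pages/s, i.e. around 20 MB/s of 4 KiB pages. Both knobs are exposed through the sysfs attributes further below.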
| 1363 | |||
| 1364 | int ksm_madvise(struct vm_area_struct *vma, unsigned long start, | ||
| 1365 | unsigned long end, int advice, unsigned long *vm_flags) | ||
| 1366 | { | ||
| 1367 | struct mm_struct *mm = vma->vm_mm; | ||
| 1368 | int err; | ||
| 1369 | |||
| 1370 | switch (advice) { | ||
| 1371 | case MADV_MERGEABLE: | ||
| 1372 | /* | ||
| 1373 | * Be somewhat over-protective for now! | ||
| 1374 | */ | ||
| 1375 | if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | | ||
| 1376 | VM_PFNMAP | VM_IO | VM_DONTEXPAND | | ||
| 1377 | VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | | ||
| 1378 | VM_MIXEDMAP | VM_SAO)) | ||
| 1379 | return 0; /* just ignore the advice */ | ||
| 1380 | |||
| 1381 | if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { | ||
| 1382 | err = __ksm_enter(mm); | ||
| 1383 | if (err) | ||
| 1384 | return err; | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | *vm_flags |= VM_MERGEABLE; | ||
| 1388 | break; | ||
| 1389 | |||
| 1390 | case MADV_UNMERGEABLE: | ||
| 1391 | if (!(*vm_flags & VM_MERGEABLE)) | ||
| 1392 | return 0; /* just ignore the advice */ | ||
| 1393 | |||
| 1394 | if (vma->anon_vma) { | ||
| 1395 | err = unmerge_ksm_pages(vma, start, end); | ||
| 1396 | if (err) | ||
| 1397 | return err; | ||
| 1398 | } | ||
| 1399 | |||
| 1400 | *vm_flags &= ~VM_MERGEABLE; | ||
| 1401 | break; | ||
| 1402 | } | ||
| 1403 | |||
| 1404 | return 0; | ||
| 1405 | } | ||
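ksm_madvise() is reached from userspace via madvise(), as the mm/madvise.c hunk further below wires up. A minimal sketch of how an application opts an anonymous region in and out of merging (the fallback #defines are an assumption for older headers that lack the new advice values):

#include <sys/mman.h>
#include <string.h>
#include <stdio.h>

#ifndef MADV_MERGEABLE          /* fallbacks for older headers; values assumed */
#define MADV_MERGEABLE   12
#define MADV_UNMERGEABLE 13
#endif

int main(void)
{
        size_t len = 64 * 4096;
        char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED)
                return 1;

        memset(buf, 0x5a, len);         /* identical content in every page */

        /* Ask KSM to consider this range for merging; the pages stay
         * logically private - a later write simply breaks COW again. */
        if (madvise(buf, len, MADV_MERGEABLE))
                perror("madvise(MADV_MERGEABLE)");

        /* ... later, opt out and force any merged pages to be unshared: */
        if (madvise(buf, len, MADV_UNMERGEABLE))
                perror("madvise(MADV_UNMERGEABLE)");

        munmap(buf, len);
        return 0;
}

Nothing is merged at madvise() time: the call only sets VM_MERGEABLE and registers the mm with ksmd, which merges pages on later scans, and only while it is running.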
| 1406 | |||
| 1407 | int __ksm_enter(struct mm_struct *mm) | ||
| 1408 | { | ||
| 1409 | struct mm_slot *mm_slot; | ||
| 1410 | int needs_wakeup; | ||
| 1411 | |||
| 1412 | mm_slot = alloc_mm_slot(); | ||
| 1413 | if (!mm_slot) | ||
| 1414 | return -ENOMEM; | ||
| 1415 | |||
| 1416 | /* Check ksm_run too? Would need tighter locking */ | ||
| 1417 | needs_wakeup = list_empty(&ksm_mm_head.mm_list); | ||
| 1418 | |||
| 1419 | spin_lock(&ksm_mmlist_lock); | ||
| 1420 | insert_to_mm_slots_hash(mm, mm_slot); | ||
| 1421 | /* | ||
| 1422 | * Insert just behind the scanning cursor, to let the area settle | ||
| 1423 | * down a little; when fork is followed by immediate exec, we don't | ||
| 1424 | * want ksmd to waste time setting up and tearing down an rmap_list. | ||
| 1425 | */ | ||
| 1426 | list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); | ||
| 1427 | spin_unlock(&ksm_mmlist_lock); | ||
| 1428 | |||
| 1429 | set_bit(MMF_VM_MERGEABLE, &mm->flags); | ||
| 1430 | atomic_inc(&mm->mm_count); | ||
| 1431 | |||
| 1432 | if (needs_wakeup) | ||
| 1433 | wake_up_interruptible(&ksm_thread_wait); | ||
| 1434 | |||
| 1435 | return 0; | ||
| 1436 | } | ||
| 1437 | |||
| 1438 | void __ksm_exit(struct mm_struct *mm) | ||
| 1439 | { | ||
| 1440 | struct mm_slot *mm_slot; | ||
| 1441 | int easy_to_free = 0; | ||
| 1442 | |||
| 1443 | /* | ||
| 1444 | * This process is exiting: if it's straightforward (as is the | ||
| 1445 | * case when ksmd was never running), free mm_slot immediately. | ||
| 1446 | * But if it's at the cursor or has rmap_items linked to it, use | ||
| 1447 | * mmap_sem to synchronize with any break_cows before pagetables | ||
| 1448 | * are freed, and leave the mm_slot on the list for ksmd to free. | ||
| 1449 | * Beware: ksm may already have noticed it exiting and freed the slot. | ||
| 1450 | */ | ||
| 1451 | |||
| 1452 | spin_lock(&ksm_mmlist_lock); | ||
| 1453 | mm_slot = get_mm_slot(mm); | ||
| 1454 | if (mm_slot && ksm_scan.mm_slot != mm_slot) { | ||
| 1455 | if (list_empty(&mm_slot->rmap_list)) { | ||
| 1456 | hlist_del(&mm_slot->link); | ||
| 1457 | list_del(&mm_slot->mm_list); | ||
| 1458 | easy_to_free = 1; | ||
| 1459 | } else { | ||
| 1460 | list_move(&mm_slot->mm_list, | ||
| 1461 | &ksm_scan.mm_slot->mm_list); | ||
| 1462 | } | ||
| 1463 | } | ||
| 1464 | spin_unlock(&ksm_mmlist_lock); | ||
| 1465 | |||
| 1466 | if (easy_to_free) { | ||
| 1467 | free_mm_slot(mm_slot); | ||
| 1468 | clear_bit(MMF_VM_MERGEABLE, &mm->flags); | ||
| 1469 | mmdrop(mm); | ||
| 1470 | } else if (mm_slot) { | ||
| 1471 | down_write(&mm->mmap_sem); | ||
| 1472 | up_write(&mm->mmap_sem); | ||
| 1473 | } | ||
| 1474 | } | ||
| 1475 | |||
| 1476 | #ifdef CONFIG_SYSFS | ||
| 1477 | /* | ||
| 1478 | * This all compiles without CONFIG_SYSFS, but is a waste of space. | ||
| 1479 | */ | ||
| 1480 | |||
| 1481 | #define KSM_ATTR_RO(_name) \ | ||
| 1482 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) | ||
| 1483 | #define KSM_ATTR(_name) \ | ||
| 1484 | static struct kobj_attribute _name##_attr = \ | ||
| 1485 | __ATTR(_name, 0644, _name##_show, _name##_store) | ||
| 1486 | |||
| 1487 | static ssize_t sleep_millisecs_show(struct kobject *kobj, | ||
| 1488 | struct kobj_attribute *attr, char *buf) | ||
| 1489 | { | ||
| 1490 | return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); | ||
| 1491 | } | ||
| 1492 | |||
| 1493 | static ssize_t sleep_millisecs_store(struct kobject *kobj, | ||
| 1494 | struct kobj_attribute *attr, | ||
| 1495 | const char *buf, size_t count) | ||
| 1496 | { | ||
| 1497 | unsigned long msecs; | ||
| 1498 | int err; | ||
| 1499 | |||
| 1500 | err = strict_strtoul(buf, 10, &msecs); | ||
| 1501 | if (err || msecs > UINT_MAX) | ||
| 1502 | return -EINVAL; | ||
| 1503 | |||
| 1504 | ksm_thread_sleep_millisecs = msecs; | ||
| 1505 | |||
| 1506 | return count; | ||
| 1507 | } | ||
| 1508 | KSM_ATTR(sleep_millisecs); | ||
| 1509 | |||
| 1510 | static ssize_t pages_to_scan_show(struct kobject *kobj, | ||
| 1511 | struct kobj_attribute *attr, char *buf) | ||
| 1512 | { | ||
| 1513 | return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); | ||
| 1514 | } | ||
| 1515 | |||
| 1516 | static ssize_t pages_to_scan_store(struct kobject *kobj, | ||
| 1517 | struct kobj_attribute *attr, | ||
| 1518 | const char *buf, size_t count) | ||
| 1519 | { | ||
| 1520 | int err; | ||
| 1521 | unsigned long nr_pages; | ||
| 1522 | |||
| 1523 | err = strict_strtoul(buf, 10, &nr_pages); | ||
| 1524 | if (err || nr_pages > UINT_MAX) | ||
| 1525 | return -EINVAL; | ||
| 1526 | |||
| 1527 | ksm_thread_pages_to_scan = nr_pages; | ||
| 1528 | |||
| 1529 | return count; | ||
| 1530 | } | ||
| 1531 | KSM_ATTR(pages_to_scan); | ||
| 1532 | |||
| 1533 | static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
| 1534 | char *buf) | ||
| 1535 | { | ||
| 1536 | return sprintf(buf, "%u\n", ksm_run); | ||
| 1537 | } | ||
| 1538 | |||
| 1539 | static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
| 1540 | const char *buf, size_t count) | ||
| 1541 | { | ||
| 1542 | int err; | ||
| 1543 | unsigned long flags; | ||
| 1544 | |||
| 1545 | err = strict_strtoul(buf, 10, &flags); | ||
| 1546 | if (err || flags > UINT_MAX) | ||
| 1547 | return -EINVAL; | ||
| 1548 | if (flags > KSM_RUN_UNMERGE) | ||
| 1549 | return -EINVAL; | ||
| 1550 | |||
| 1551 | /* | ||
| 1552 | * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. | ||
| 1553 | * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, | ||
| 1554 | * breaking COW to free the unswappable pages_shared (but leaves | ||
| 1555 | * mm_slots on the list for when ksmd may be set running again). | ||
| 1556 | */ | ||
| 1557 | |||
| 1558 | mutex_lock(&ksm_thread_mutex); | ||
| 1559 | if (ksm_run != flags) { | ||
| 1560 | ksm_run = flags; | ||
| 1561 | if (flags & KSM_RUN_UNMERGE) { | ||
| 1562 | current->flags |= PF_OOM_ORIGIN; | ||
| 1563 | err = unmerge_and_remove_all_rmap_items(); | ||
| 1564 | current->flags &= ~PF_OOM_ORIGIN; | ||
| 1565 | if (err) { | ||
| 1566 | ksm_run = KSM_RUN_STOP; | ||
| 1567 | count = err; | ||
| 1568 | } | ||
| 1569 | } | ||
| 1570 | } | ||
| 1571 | mutex_unlock(&ksm_thread_mutex); | ||
| 1572 | |||
| 1573 | if (flags & KSM_RUN_MERGE) | ||
| 1574 | wake_up_interruptible(&ksm_thread_wait); | ||
| 1575 | |||
| 1576 | return count; | ||
| 1577 | } | ||
| 1578 | KSM_ATTR(run); | ||
| 1579 | |||
| 1580 | static ssize_t max_kernel_pages_store(struct kobject *kobj, | ||
| 1581 | struct kobj_attribute *attr, | ||
| 1582 | const char *buf, size_t count) | ||
| 1583 | { | ||
| 1584 | int err; | ||
| 1585 | unsigned long nr_pages; | ||
| 1586 | |||
| 1587 | err = strict_strtoul(buf, 10, &nr_pages); | ||
| 1588 | if (err) | ||
| 1589 | return -EINVAL; | ||
| 1590 | |||
| 1591 | ksm_max_kernel_pages = nr_pages; | ||
| 1592 | |||
| 1593 | return count; | ||
| 1594 | } | ||
| 1595 | |||
| 1596 | static ssize_t max_kernel_pages_show(struct kobject *kobj, | ||
| 1597 | struct kobj_attribute *attr, char *buf) | ||
| 1598 | { | ||
| 1599 | return sprintf(buf, "%lu\n", ksm_max_kernel_pages); | ||
| 1600 | } | ||
| 1601 | KSM_ATTR(max_kernel_pages); | ||
| 1602 | |||
| 1603 | static ssize_t pages_shared_show(struct kobject *kobj, | ||
| 1604 | struct kobj_attribute *attr, char *buf) | ||
| 1605 | { | ||
| 1606 | return sprintf(buf, "%lu\n", ksm_pages_shared); | ||
| 1607 | } | ||
| 1608 | KSM_ATTR_RO(pages_shared); | ||
| 1609 | |||
| 1610 | static ssize_t pages_sharing_show(struct kobject *kobj, | ||
| 1611 | struct kobj_attribute *attr, char *buf) | ||
| 1612 | { | ||
| 1613 | return sprintf(buf, "%lu\n", ksm_pages_sharing); | ||
| 1614 | } | ||
| 1615 | KSM_ATTR_RO(pages_sharing); | ||
| 1616 | |||
| 1617 | static ssize_t pages_unshared_show(struct kobject *kobj, | ||
| 1618 | struct kobj_attribute *attr, char *buf) | ||
| 1619 | { | ||
| 1620 | return sprintf(buf, "%lu\n", ksm_pages_unshared); | ||
| 1621 | } | ||
| 1622 | KSM_ATTR_RO(pages_unshared); | ||
| 1623 | |||
| 1624 | static ssize_t pages_volatile_show(struct kobject *kobj, | ||
| 1625 | struct kobj_attribute *attr, char *buf) | ||
| 1626 | { | ||
| 1627 | long ksm_pages_volatile; | ||
| 1628 | |||
| 1629 | ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared | ||
| 1630 | - ksm_pages_sharing - ksm_pages_unshared; | ||
| 1631 | /* | ||
| 1632 | * It was not worth any locking to calculate that statistic, | ||
| 1633 | * but it might therefore sometimes be negative: conceal that. | ||
| 1634 | */ | ||
| 1635 | if (ksm_pages_volatile < 0) | ||
| 1636 | ksm_pages_volatile = 0; | ||
| 1637 | return sprintf(buf, "%ld\n", ksm_pages_volatile); | ||
| 1638 | } | ||
| 1639 | KSM_ATTR_RO(pages_volatile); | ||
| 1640 | |||
| 1641 | static ssize_t full_scans_show(struct kobject *kobj, | ||
| 1642 | struct kobj_attribute *attr, char *buf) | ||
| 1643 | { | ||
| 1644 | return sprintf(buf, "%lu\n", ksm_scan.seqnr); | ||
| 1645 | } | ||
| 1646 | KSM_ATTR_RO(full_scans); | ||
| 1647 | |||
| 1648 | static struct attribute *ksm_attrs[] = { | ||
| 1649 | &sleep_millisecs_attr.attr, | ||
| 1650 | &pages_to_scan_attr.attr, | ||
| 1651 | &run_attr.attr, | ||
| 1652 | &max_kernel_pages_attr.attr, | ||
| 1653 | &pages_shared_attr.attr, | ||
| 1654 | &pages_sharing_attr.attr, | ||
| 1655 | &pages_unshared_attr.attr, | ||
| 1656 | &pages_volatile_attr.attr, | ||
| 1657 | &full_scans_attr.attr, | ||
| 1658 | NULL, | ||
| 1659 | }; | ||
| 1660 | |||
| 1661 | static struct attribute_group ksm_attr_group = { | ||
| 1662 | .attrs = ksm_attrs, | ||
| 1663 | .name = "ksm", | ||
| 1664 | }; | ||
| 1665 | #endif /* CONFIG_SYSFS */ | ||
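Since the attribute group is registered on mm_kobj with the name "ksm", these files appear under /sys/kernel/mm/ksm/. A small sketch of driving them from C (a shell would do just as well; the values written here are illustrative, not recommendations):

#include <stdio.h>

/* Write a value to one of the KSM sysfs knobs; returns 0 on success. */
static int ksm_write(const char *name, const char *val)
{
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
        f = fopen(path, "w");
        if (!f)
                return -1;
        fprintf(f, "%s\n", val);
        return fclose(f) ? -1 : 0;
}

/* Read back one of the counters (pages_shared, pages_sharing, ...). */
static long ksm_read(const char *name)
{
        char path[128];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        ksm_write("pages_to_scan", "100");      /* illustrative values */
        ksm_write("sleep_millisecs", "20");
        ksm_write("run", "1");                  /* KSM_RUN_MERGE: start ksmd */

        printf("pages_shared=%ld pages_sharing=%ld\n",
               ksm_read("pages_shared"), ksm_read("pages_sharing"));
        return 0;
}

Writing 2 (KSM_RUN_UNMERGE) to the same run file stops ksmd and triggers unmerge_and_remove_all_rmap_items(), as run_store() above shows.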
| 1666 | |||
| 1667 | static int __init ksm_init(void) | ||
| 1668 | { | ||
| 1669 | struct task_struct *ksm_thread; | ||
| 1670 | int err; | ||
| 1671 | |||
| 1672 | ksm_max_kernel_pages = totalram_pages / 4; | ||
| 1673 | |||
| 1674 | err = ksm_slab_init(); | ||
| 1675 | if (err) | ||
| 1676 | goto out; | ||
| 1677 | |||
| 1678 | err = mm_slots_hash_init(); | ||
| 1679 | if (err) | ||
| 1680 | goto out_free1; | ||
| 1681 | |||
| 1682 | ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); | ||
| 1683 | if (IS_ERR(ksm_thread)) { | ||
| 1684 | printk(KERN_ERR "ksm: creating kthread failed\n"); | ||
| 1685 | err = PTR_ERR(ksm_thread); | ||
| 1686 | goto out_free2; | ||
| 1687 | } | ||
| 1688 | |||
| 1689 | #ifdef CONFIG_SYSFS | ||
| 1690 | err = sysfs_create_group(mm_kobj, &ksm_attr_group); | ||
| 1691 | if (err) { | ||
| 1692 | printk(KERN_ERR "ksm: register sysfs failed\n"); | ||
| 1693 | kthread_stop(ksm_thread); | ||
| 1694 | goto out_free2; | ||
| 1695 | } | ||
| 1696 | #else | ||
| 1697 | ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ | ||
| 1698 | |||
| 1699 | #endif /* CONFIG_SYSFS */ | ||
| 1700 | |||
| 1701 | return 0; | ||
| 1702 | |||
| 1703 | out_free2: | ||
| 1704 | mm_slots_hash_free(); | ||
| 1705 | out_free1: | ||
| 1706 | ksm_slab_free(); | ||
| 1707 | out: | ||
| 1708 | return err; | ||
| 1709 | } | ||
| 1710 | module_init(ksm_init) | ||
diff --git a/mm/maccess.c b/mm/maccess.c index ac40796cfb15..9073695ff25f 100644 --- a/mm/maccess.c +++ b/mm/maccess.c | |||
| @@ -39,7 +39,7 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); | |||
| 39 | * Safely write to address @dst from the buffer at @src. If a kernel fault | 39 | * Safely write to address @dst from the buffer at @src. If a kernel fault |
| 40 | * happens, handle that and return -EFAULT. | 40 | * happens, handle that and return -EFAULT. |
| 41 | */ | 41 | */ |
| 42 | long probe_kernel_write(void *dst, void *src, size_t size) | 42 | long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) |
| 43 | { | 43 | { |
| 44 | long ret; | 44 | long ret; |
| 45 | mm_segment_t old_fs = get_fs(); | 45 | mm_segment_t old_fs = get_fs(); |
diff --git a/mm/madvise.c b/mm/madvise.c index b9ce574827c8..35b1479b7c9d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
| 12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
| 13 | #include <linux/sched.h> | 13 | #include <linux/sched.h> |
| 14 | #include <linux/ksm.h> | ||
| 14 | 15 | ||
| 15 | /* | 16 | /* |
| 16 | * Any behaviour which results in changes to the vma->vm_flags needs to | 17 | * Any behaviour which results in changes to the vma->vm_flags needs to |
| @@ -41,7 +42,7 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
| 41 | struct mm_struct * mm = vma->vm_mm; | 42 | struct mm_struct * mm = vma->vm_mm; |
| 42 | int error = 0; | 43 | int error = 0; |
| 43 | pgoff_t pgoff; | 44 | pgoff_t pgoff; |
| 44 | int new_flags = vma->vm_flags; | 45 | unsigned long new_flags = vma->vm_flags; |
| 45 | 46 | ||
| 46 | switch (behavior) { | 47 | switch (behavior) { |
| 47 | case MADV_NORMAL: | 48 | case MADV_NORMAL: |
| @@ -57,8 +58,18 @@ static long madvise_behavior(struct vm_area_struct * vma, | |||
| 57 | new_flags |= VM_DONTCOPY; | 58 | new_flags |= VM_DONTCOPY; |
| 58 | break; | 59 | break; |
| 59 | case MADV_DOFORK: | 60 | case MADV_DOFORK: |
| 61 | if (vma->vm_flags & VM_IO) { | ||
| 62 | error = -EINVAL; | ||
| 63 | goto out; | ||
| 64 | } | ||
| 60 | new_flags &= ~VM_DONTCOPY; | 65 | new_flags &= ~VM_DONTCOPY; |
| 61 | break; | 66 | break; |
| 67 | case MADV_MERGEABLE: | ||
| 68 | case MADV_UNMERGEABLE: | ||
| 69 | error = ksm_madvise(vma, start, end, behavior, &new_flags); | ||
| 70 | if (error) | ||
| 71 | goto out; | ||
| 72 | break; | ||
| 62 | } | 73 | } |
| 63 | 74 | ||
| 64 | if (new_flags == vma->vm_flags) { | 75 | if (new_flags == vma->vm_flags) { |
| @@ -123,8 +134,7 @@ static long madvise_willneed(struct vm_area_struct * vma, | |||
| 123 | end = vma->vm_end; | 134 | end = vma->vm_end; |
| 124 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 135 | end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
| 125 | 136 | ||
| 126 | force_page_cache_readahead(file->f_mapping, | 137 | force_page_cache_readahead(file->f_mapping, file, start, end - start); |
| 127 | file, start, max_sane_readahead(end - start)); | ||
| 128 | return 0; | 138 | return 0; |
| 129 | } | 139 | } |
| 130 | 140 | ||
| @@ -208,41 +218,69 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 208 | return error; | 218 | return error; |
| 209 | } | 219 | } |
| 210 | 220 | ||
| 221 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 222 | /* | ||
| 223 | * Error injection support for memory error handling. | ||
| 224 | */ | ||
| 225 | static int madvise_hwpoison(unsigned long start, unsigned long end) | ||
| 226 | { | ||
| 227 | int ret = 0; | ||
| 228 | |||
| 229 | if (!capable(CAP_SYS_ADMIN)) | ||
| 230 | return -EPERM; | ||
| 231 | for (; start < end; start += PAGE_SIZE) { | ||
| 232 | struct page *p; | ||
| 233 | int ret = get_user_pages(current, current->mm, start, 1, | ||
| 234 | 0, 0, &p, NULL); | ||
| 235 | if (ret != 1) | ||
| 236 | return ret; | ||
| 237 | printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", | ||
| 238 | page_to_pfn(p), start); | ||
| 239 | /* Ignore return value for now */ | ||
| 240 | __memory_failure(page_to_pfn(p), 0, 1); | ||
| 241 | put_page(p); | ||
| 242 | } | ||
| 243 | return ret; | ||
| 244 | } | ||
| 245 | #endif | ||
| 246 | |||
| 211 | static long | 247 | static long |
| 212 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | 248 | madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, |
| 213 | unsigned long start, unsigned long end, int behavior) | 249 | unsigned long start, unsigned long end, int behavior) |
| 214 | { | 250 | { |
| 215 | long error; | 251 | switch (behavior) { |
| 252 | case MADV_REMOVE: | ||
| 253 | return madvise_remove(vma, prev, start, end); | ||
| 254 | case MADV_WILLNEED: | ||
| 255 | return madvise_willneed(vma, prev, start, end); | ||
| 256 | case MADV_DONTNEED: | ||
| 257 | return madvise_dontneed(vma, prev, start, end); | ||
| 258 | default: | ||
| 259 | return madvise_behavior(vma, prev, start, end, behavior); | ||
| 260 | } | ||
| 261 | } | ||
| 216 | 262 | ||
| 263 | static int | ||
| 264 | madvise_behavior_valid(int behavior) | ||
| 265 | { | ||
| 217 | switch (behavior) { | 266 | switch (behavior) { |
| 218 | case MADV_DOFORK: | 267 | case MADV_DOFORK: |
| 219 | if (vma->vm_flags & VM_IO) { | ||
| 220 | error = -EINVAL; | ||
| 221 | break; | ||
| 222 | } | ||
| 223 | case MADV_DONTFORK: | 268 | case MADV_DONTFORK: |
| 224 | case MADV_NORMAL: | 269 | case MADV_NORMAL: |
| 225 | case MADV_SEQUENTIAL: | 270 | case MADV_SEQUENTIAL: |
| 226 | case MADV_RANDOM: | 271 | case MADV_RANDOM: |
| 227 | error = madvise_behavior(vma, prev, start, end, behavior); | ||
| 228 | break; | ||
| 229 | case MADV_REMOVE: | 272 | case MADV_REMOVE: |
| 230 | error = madvise_remove(vma, prev, start, end); | ||
| 231 | break; | ||
| 232 | |||
| 233 | case MADV_WILLNEED: | 273 | case MADV_WILLNEED: |
| 234 | error = madvise_willneed(vma, prev, start, end); | ||
| 235 | break; | ||
| 236 | |||
| 237 | case MADV_DONTNEED: | 274 | case MADV_DONTNEED: |
| 238 | error = madvise_dontneed(vma, prev, start, end); | 275 | #ifdef CONFIG_KSM |
| 239 | break; | 276 | case MADV_MERGEABLE: |
| 277 | case MADV_UNMERGEABLE: | ||
| 278 | #endif | ||
| 279 | return 1; | ||
| 240 | 280 | ||
| 241 | default: | 281 | default: |
| 242 | error = -EINVAL; | 282 | return 0; |
| 243 | break; | ||
| 244 | } | 283 | } |
| 245 | return error; | ||
| 246 | } | 284 | } |
| 247 | 285 | ||
| 248 | /* | 286 | /* |
| @@ -269,6 +307,12 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
| 269 | * so the kernel can free resources associated with it. | 307 | * so the kernel can free resources associated with it. |
| 270 | * MADV_REMOVE - the application wants to free up the given range of | 308 | * MADV_REMOVE - the application wants to free up the given range of |
| 271 | * pages and associated backing store. | 309 | * pages and associated backing store. |
| 310 | * MADV_DONTFORK - omit this area from child's address space when forking: | ||
| 311 | * typically, to avoid COWing pages pinned by get_user_pages(). | ||
| 312 | * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. | ||
| 313 | * MADV_MERGEABLE - the application recommends that KSM try to merge pages in | ||
| 314 | * this area with pages of identical content from other such areas. | ||
| 315 | * MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others. | ||
| 272 | * | 316 | * |
| 273 | * return values: | 317 | * return values: |
| 274 | * zero - success | 318 | * zero - success |
| @@ -290,6 +334,13 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) | |||
| 290 | int write; | 334 | int write; |
| 291 | size_t len; | 335 | size_t len; |
| 292 | 336 | ||
| 337 | #ifdef CONFIG_MEMORY_FAILURE | ||
| 338 | if (behavior == MADV_HWPOISON) | ||
| 339 | return madvise_hwpoison(start, start+len_in); | ||
| 340 | #endif | ||
| 341 | if (!madvise_behavior_valid(behavior)) | ||
| 342 | return error; | ||
| 343 | |||
| 293 | write = madvise_need_mmap_write(behavior); | 344 | write = madvise_need_mmap_write(behavior); |
| 294 | if (write) | 345 | if (write) |
| 295 | down_write(¤t->mm->mmap_sem); | 346 | down_write(¤t->mm->mmap_sem); |
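
The madvise() changes are easiest to read from the calling side. The following is a hypothetical user-space caller, not code from the patch: it opts an anonymous mapping into KSM merging with MADV_MERGEABLE and later out again with MADV_UNMERGEABLE. With the new madvise_behavior_valid() check, a kernel built without CONFIG_KSM rejects these two hints (returning -EINVAL) instead of silently accepting them. The macros need a libc that already exports them; otherwise they come from the kernel headers.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 16 * 1024 * 1024;
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED)
                    return 1;

            memset(buf, 0, len);    /* identical pages: good KSM candidates */

            if (madvise(buf, len, MADV_MERGEABLE) != 0)
                    perror("madvise(MADV_MERGEABLE)");      /* EINVAL if CONFIG_KSM=n */

            /* ... use the buffer; ksmd merges duplicates in the background ... */

            madvise(buf, len, MADV_UNMERGEABLE);    /* break any merges and opt out */
            munmap(buf, len);
            return 0;
    }

Merging only actually happens once ksmd is running (see the run knob registered by ksm_init() earlier in this series).
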
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e4be9cb2a6a..f99f5991d6bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -27,7 +27,9 @@ | |||
| 27 | #include <linux/backing-dev.h> | 27 | #include <linux/backing-dev.h> |
| 28 | #include <linux/bit_spinlock.h> | 28 | #include <linux/bit_spinlock.h> |
| 29 | #include <linux/rcupdate.h> | 29 | #include <linux/rcupdate.h> |
| 30 | #include <linux/limits.h> | ||
| 30 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
| 32 | #include <linux/rbtree.h> | ||
| 31 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
| 32 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
| 33 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
| @@ -42,9 +44,10 @@ | |||
| 42 | 44 | ||
| 43 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 45 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
| 44 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 46 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
| 47 | struct mem_cgroup *root_mem_cgroup __read_mostly; | ||
| 45 | 48 | ||
| 46 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 49 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
| 47 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */ | 50 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
| 48 | int do_swap_account __read_mostly; | 51 | int do_swap_account __read_mostly; |
| 49 | static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | 52 | static int really_do_swap_account __initdata = 1; /* for remember boot option*/ |
| 50 | #else | 53 | #else |
| @@ -52,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
| 52 | #endif | 55 | #endif |
| 53 | 56 | ||
| 54 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ | 57 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ |
| 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | ||
| 55 | 59 | ||
| 56 | /* | 60 | /* |
| 57 | * Statistics for memory cgroup. | 61 | * Statistics for memory cgroup. |
| @@ -61,9 +65,12 @@ enum mem_cgroup_stat_index { | |||
| 61 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. | 65 | * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. |
| 62 | */ | 66 | */ |
| 63 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
| 64 | MEM_CGROUP_STAT_RSS, /* # of pages charged as rss */ | 68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
| 69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | ||
| 65 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
| 66 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
| 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
| 73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | ||
| 67 | 74 | ||
| 68 | MEM_CGROUP_STAT_NSTATS, | 75 | MEM_CGROUP_STAT_NSTATS, |
| 69 | }; | 76 | }; |
| @@ -76,6 +83,20 @@ struct mem_cgroup_stat { | |||
| 76 | struct mem_cgroup_stat_cpu cpustat[0]; | 83 | struct mem_cgroup_stat_cpu cpustat[0]; |
| 77 | }; | 84 | }; |
| 78 | 85 | ||
| 86 | static inline void | ||
| 87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
| 88 | enum mem_cgroup_stat_index idx) | ||
| 89 | { | ||
| 90 | stat->count[idx] = 0; | ||
| 91 | } | ||
| 92 | |||
| 93 | static inline s64 | ||
| 94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
| 95 | enum mem_cgroup_stat_index idx) | ||
| 96 | { | ||
| 97 | return stat->count[idx]; | ||
| 98 | } | ||
| 99 | |||
| 79 | /* | 100 | /* |
| 80 | * For accounting under irq disable, no need for increment preempt count. | 101 | * For accounting under irq disable, no need for increment preempt count. |
| 81 | */ | 102 | */ |
| @@ -95,6 +116,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | |||
| 95 | return ret; | 116 | return ret; |
| 96 | } | 117 | } |
| 97 | 118 | ||
| 119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
| 120 | { | ||
| 121 | s64 ret; | ||
| 122 | |||
| 123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
| 124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
| 125 | return ret; | ||
| 126 | } | ||
| 127 | |||
| 98 | /* | 128 | /* |
| 99 | * per-zone information in memory controller. | 129 | * per-zone information in memory controller. |
| 100 | */ | 130 | */ |
| @@ -106,6 +136,12 @@ struct mem_cgroup_per_zone { | |||
| 106 | unsigned long count[NR_LRU_LISTS]; | 136 | unsigned long count[NR_LRU_LISTS]; |
| 107 | 137 | ||
| 108 | struct zone_reclaim_stat reclaim_stat; | 138 | struct zone_reclaim_stat reclaim_stat; |
| 139 | struct rb_node tree_node; /* RB tree node */ | ||
| 140 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
| 141 | /* the soft limit is exceeded*/ | ||
| 142 | bool on_tree; | ||
| 143 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | ||
| 144 | /* use container_of */ | ||
| 109 | }; | 145 | }; |
| 110 | /* Macro for accessing counter */ | 146 | /* Macro for accessing counter */ |
| 111 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 147 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
| @@ -119,6 +155,26 @@ struct mem_cgroup_lru_info { | |||
| 119 | }; | 155 | }; |
| 120 | 156 | ||
| 121 | /* | 157 | /* |
| 158 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
| 159 | * their hierarchy representation | ||
| 160 | */ | ||
| 161 | |||
| 162 | struct mem_cgroup_tree_per_zone { | ||
| 163 | struct rb_root rb_root; | ||
| 164 | spinlock_t lock; | ||
| 165 | }; | ||
| 166 | |||
| 167 | struct mem_cgroup_tree_per_node { | ||
| 168 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
| 169 | }; | ||
| 170 | |||
| 171 | struct mem_cgroup_tree { | ||
| 172 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
| 173 | }; | ||
| 174 | |||
| 175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
| 176 | |||
| 177 | /* | ||
| 122 | * The memory controller data structure. The memory controller controls both | 178 | * The memory controller data structure. The memory controller controls both |
| 123 | * page cache and RSS per cgroup. We would eventually like to provide | 179 | * page cache and RSS per cgroup. We would eventually like to provide |
| 124 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 180 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
| @@ -154,9 +210,9 @@ struct mem_cgroup { | |||
| 154 | 210 | ||
| 155 | /* | 211 | /* |
| 156 | * While reclaiming in a hierarchy, we cache the last child we | 212 | * While reclaiming in a hierarchy, we cache the last child we |
| 157 | * reclaimed from. Protected by hierarchy_mutex | 213 | * reclaimed from. |
| 158 | */ | 214 | */ |
| 159 | struct mem_cgroup *last_scanned_child; | 215 | int last_scanned_child; |
| 160 | /* | 216 | /* |
| 161 | * Should the accounting and control be hierarchical, per subtree? | 217 | * Should the accounting and control be hierarchical, per subtree? |
| 162 | */ | 218 | */ |
| @@ -166,18 +222,29 @@ struct mem_cgroup { | |||
| 166 | 222 | ||
| 167 | unsigned int swappiness; | 223 | unsigned int swappiness; |
| 168 | 224 | ||
| 225 | /* set when res.limit == memsw.limit */ | ||
| 226 | bool memsw_is_minimum; | ||
| 227 | |||
| 169 | /* | 228 | /* |
| 170 | * statistics. This must be placed at the end of memcg. | 229 | * statistics. This must be placed at the end of memcg. |
| 171 | */ | 230 | */ |
| 172 | struct mem_cgroup_stat stat; | 231 | struct mem_cgroup_stat stat; |
| 173 | }; | 232 | }; |
| 174 | 233 | ||
| 234 | /* | ||
| 235 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | ||
| 236 | * limit reclaim to prevent infinite loops, if they ever occur. | ||
| 237 | */ | ||
| 238 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | ||
| 239 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | ||
| 240 | |||
| 175 | enum charge_type { | 241 | enum charge_type { |
| 176 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 242 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
| 177 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 243 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
| 178 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | 244 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ |
| 179 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | 245 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ |
| 180 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 246 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
| 247 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | ||
| 181 | NR_CHARGE_TYPE, | 248 | NR_CHARGE_TYPE, |
| 182 | }; | 249 | }; |
| 183 | 250 | ||
| @@ -185,13 +252,8 @@ enum charge_type { | |||
| 185 | #define PCGF_CACHE (1UL << PCG_CACHE) | 252 | #define PCGF_CACHE (1UL << PCG_CACHE) |
| 186 | #define PCGF_USED (1UL << PCG_USED) | 253 | #define PCGF_USED (1UL << PCG_USED) |
| 187 | #define PCGF_LOCK (1UL << PCG_LOCK) | 254 | #define PCGF_LOCK (1UL << PCG_LOCK) |
| 188 | static const unsigned long | 255 | /* Not used, but added here for completeness */ |
| 189 | pcg_default_flags[NR_CHARGE_TYPE] = { | 256 | #define PCGF_ACCT (1UL << PCG_ACCT) |
| 190 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
| 191 | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
| 192 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
| 193 | 0, /* FORCE */ | ||
| 194 | }; | ||
| 195 | 257 | ||
| 196 | /* for encoding cft->private value on file */ | 258 | /* for encoding cft->private value on file */ |
| 197 | #define _MEM (0) | 259 | #define _MEM (0) |
| @@ -200,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = { | |||
| 200 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 262 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
| 201 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 263 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
| 202 | 264 | ||
| 265 | /* | ||
| 266 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | ||
| 267 | */ | ||
| 268 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | ||
| 269 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | ||
| 270 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | ||
| 271 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | ||
| 272 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | ||
| 273 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | ||
| 274 | |||
| 203 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
| 204 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
| 205 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
| 206 | 278 | ||
| 279 | static struct mem_cgroup_per_zone * | ||
| 280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
| 281 | { | ||
| 282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
| 283 | } | ||
| 284 | |||
| 285 | static struct mem_cgroup_per_zone * | ||
| 286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
| 287 | { | ||
| 288 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
| 289 | int nid = page_cgroup_nid(pc); | ||
| 290 | int zid = page_cgroup_zid(pc); | ||
| 291 | |||
| 292 | if (!mem) | ||
| 293 | return NULL; | ||
| 294 | |||
| 295 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
| 296 | } | ||
| 297 | |||
| 298 | static struct mem_cgroup_tree_per_zone * | ||
| 299 | soft_limit_tree_node_zone(int nid, int zid) | ||
| 300 | { | ||
| 301 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 302 | } | ||
| 303 | |||
| 304 | static struct mem_cgroup_tree_per_zone * | ||
| 305 | soft_limit_tree_from_page(struct page *page) | ||
| 306 | { | ||
| 307 | int nid = page_to_nid(page); | ||
| 308 | int zid = page_zonenum(page); | ||
| 309 | |||
| 310 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 311 | } | ||
| 312 | |||
| 313 | static void | ||
| 314 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
| 315 | struct mem_cgroup_per_zone *mz, | ||
| 316 | struct mem_cgroup_tree_per_zone *mctz, | ||
| 317 | unsigned long long new_usage_in_excess) | ||
| 318 | { | ||
| 319 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
| 320 | struct rb_node *parent = NULL; | ||
| 321 | struct mem_cgroup_per_zone *mz_node; | ||
| 322 | |||
| 323 | if (mz->on_tree) | ||
| 324 | return; | ||
| 325 | |||
| 326 | mz->usage_in_excess = new_usage_in_excess; | ||
| 327 | if (!mz->usage_in_excess) | ||
| 328 | return; | ||
| 329 | while (*p) { | ||
| 330 | parent = *p; | ||
| 331 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
| 332 | tree_node); | ||
| 333 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
| 334 | p = &(*p)->rb_left; | ||
| 335 | /* | ||
| 336 | * We can't avoid mem cgroups that are over their soft | ||
| 337 | * limit by the same amount | ||
| 338 | */ | ||
| 339 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
| 340 | p = &(*p)->rb_right; | ||
| 341 | } | ||
| 342 | rb_link_node(&mz->tree_node, parent, p); | ||
| 343 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
| 344 | mz->on_tree = true; | ||
| 345 | } | ||
| 346 | |||
| 347 | static void | ||
| 348 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
| 349 | struct mem_cgroup_per_zone *mz, | ||
| 350 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 351 | { | ||
| 352 | if (!mz->on_tree) | ||
| 353 | return; | ||
| 354 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
| 355 | mz->on_tree = false; | ||
| 356 | } | ||
| 357 | |||
| 358 | static void | ||
| 359 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
| 360 | struct mem_cgroup_per_zone *mz, | ||
| 361 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 362 | { | ||
| 363 | spin_lock(&mctz->lock); | ||
| 364 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
| 365 | spin_unlock(&mctz->lock); | ||
| 366 | } | ||
| 367 | |||
| 368 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
| 369 | { | ||
| 370 | bool ret = false; | ||
| 371 | int cpu; | ||
| 372 | s64 val; | ||
| 373 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 374 | |||
| 375 | cpu = get_cpu(); | ||
| 376 | cpustat = &mem->stat.cpustat[cpu]; | ||
| 377 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
| 378 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
| 379 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
| 380 | ret = true; | ||
| 381 | } | ||
| 382 | put_cpu(); | ||
| 383 | return ret; | ||
| 384 | } | ||
| 385 | |||
| 386 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | ||
| 387 | { | ||
| 388 | unsigned long long excess; | ||
| 389 | struct mem_cgroup_per_zone *mz; | ||
| 390 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 391 | int nid = page_to_nid(page); | ||
| 392 | int zid = page_zonenum(page); | ||
| 393 | mctz = soft_limit_tree_from_page(page); | ||
| 394 | |||
| 395 | /* | ||
| 396 | * Necessary to update all ancestors when hierarchy is used, | ||
| 397 | * because their event counter is not touched. | ||
| 398 | */ | ||
| 399 | for (; mem; mem = parent_mem_cgroup(mem)) { | ||
| 400 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
| 401 | excess = res_counter_soft_limit_excess(&mem->res); | ||
| 402 | /* | ||
| 403 | * We have to update the tree if mz is on RB-tree or | ||
| 404 | * mem is over its softlimit. | ||
| 405 | */ | ||
| 406 | if (excess || mz->on_tree) { | ||
| 407 | spin_lock(&mctz->lock); | ||
| 408 | /* if on-tree, remove it */ | ||
| 409 | if (mz->on_tree) | ||
| 410 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
| 411 | /* | ||
| 412 | * Insert again. mz->usage_in_excess will be updated. | ||
| 413 | * If excess is 0, no tree ops. | ||
| 414 | */ | ||
| 415 | __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); | ||
| 416 | spin_unlock(&mctz->lock); | ||
| 417 | } | ||
| 418 | } | ||
| 419 | } | ||
| 420 | |||
| 421 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | ||
| 422 | { | ||
| 423 | int node, zone; | ||
| 424 | struct mem_cgroup_per_zone *mz; | ||
| 425 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 426 | |||
| 427 | for_each_node_state(node, N_POSSIBLE) { | ||
| 428 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 429 | mz = mem_cgroup_zoneinfo(mem, node, zone); | ||
| 430 | mctz = soft_limit_tree_node_zone(node, zone); | ||
| 431 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
| 432 | } | ||
| 433 | } | ||
| 434 | } | ||
| 435 | |||
| 436 | static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) | ||
| 437 | { | ||
| 438 | return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; | ||
| 439 | } | ||
| 440 | |||
| 441 | static struct mem_cgroup_per_zone * | ||
| 442 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 443 | { | ||
| 444 | struct rb_node *rightmost = NULL; | ||
| 445 | struct mem_cgroup_per_zone *mz; | ||
| 446 | |||
| 447 | retry: | ||
| 448 | mz = NULL; | ||
| 449 | rightmost = rb_last(&mctz->rb_root); | ||
| 450 | if (!rightmost) | ||
| 451 | goto done; /* Nothing to reclaim from */ | ||
| 452 | |||
| 453 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
| 454 | /* | ||
| 455 | * Remove the node now but someone else can add it back, | ||
| 456 | * we will add it back at the end of reclaim to its correct | ||
| 457 | * position in the tree. | ||
| 458 | */ | ||
| 459 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
| 460 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | ||
| 461 | !css_tryget(&mz->mem->css)) | ||
| 462 | goto retry; | ||
| 463 | done: | ||
| 464 | return mz; | ||
| 465 | } | ||
| 466 | |||
| 467 | static struct mem_cgroup_per_zone * | ||
| 468 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 469 | { | ||
| 470 | struct mem_cgroup_per_zone *mz; | ||
| 471 | |||
| 472 | spin_lock(&mctz->lock); | ||
| 473 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 474 | spin_unlock(&mctz->lock); | ||
| 475 | return mz; | ||
| 476 | } | ||
| 477 | |||
| 478 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | ||
| 479 | bool charge) | ||
| 480 | { | ||
| 481 | int val = (charge) ? 1 : -1; | ||
| 482 | struct mem_cgroup_stat *stat = &mem->stat; | ||
| 483 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 484 | int cpu = get_cpu(); | ||
| 485 | |||
| 486 | cpustat = &stat->cpustat[cpu]; | ||
| 487 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
| 488 | put_cpu(); | ||
| 489 | } | ||
| 490 | |||
| 207 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 491 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
| 208 | struct page_cgroup *pc, | 492 | struct page_cgroup *pc, |
| 209 | bool charge) | 493 | bool charge) |
| 210 | { | 494 | { |
| 211 | int val = (charge)? 1 : -1; | 495 | int val = (charge) ? 1 : -1; |
| 212 | struct mem_cgroup_stat *stat = &mem->stat; | 496 | struct mem_cgroup_stat *stat = &mem->stat; |
| 213 | struct mem_cgroup_stat_cpu *cpustat; | 497 | struct mem_cgroup_stat_cpu *cpustat; |
| 214 | int cpu = get_cpu(); | 498 | int cpu = get_cpu(); |
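
The soft-limit tracking added in this hunk is deliberately rate-limited: every charge/uncharge bumps the per-cpu MEM_CGROUP_STAT_EVENTS counter, and mem_cgroup_soft_limit_check() only reports true, triggering the RB-tree update, once that counter exceeds SOFTLIMIT_EVENTS_THRESH, resetting it at the same time. A stripped-down, self-contained sketch of that pattern, purely illustrative and not kernel code:

    #include <stdio.h>

    #define EVENTS_THRESH 1000      /* stands in for SOFTLIMIT_EVENTS_THRESH */

    static long events;             /* the kernel keeps one counter per cpu */

    static int threshold_hit(void)
    {
            if (events > EVENTS_THRESH) {
                    events = 0;     /* reset, like __mem_cgroup_stat_reset_safe() */
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            long charges, tree_updates = 0;

            for (charges = 0; charges < 100000; charges++) {
                    events++;                       /* one per charge/uncharge */
                    if (threshold_hit())
                            tree_updates++;         /* stands in for mem_cgroup_update_tree() */
            }
            printf("%ld events -> %ld tree updates\n", charges, tree_updates);
            return 0;
    }
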
| @@ -225,29 +509,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
| 225 | else | 509 | else |
| 226 | __mem_cgroup_stat_add_safe(cpustat, | 510 | __mem_cgroup_stat_add_safe(cpustat, |
| 227 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 511 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
| 512 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | ||
| 228 | put_cpu(); | 513 | put_cpu(); |
| 229 | } | 514 | } |
| 230 | 515 | ||
| 231 | static struct mem_cgroup_per_zone * | 516 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
| 232 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
| 233 | { | ||
| 234 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
| 235 | } | ||
| 236 | |||
| 237 | static struct mem_cgroup_per_zone * | ||
| 238 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
| 239 | { | ||
| 240 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
| 241 | int nid = page_cgroup_nid(pc); | ||
| 242 | int zid = page_cgroup_zid(pc); | ||
| 243 | |||
| 244 | if (!mem) | ||
| 245 | return NULL; | ||
| 246 | |||
| 247 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
| 248 | } | ||
| 249 | |||
| 250 | static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, | ||
| 251 | enum lru_list idx) | 517 | enum lru_list idx) |
| 252 | { | 518 | { |
| 253 | int nid, zid; | 519 | int nid, zid; |
| @@ -286,6 +552,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
| 286 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 552 | static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
| 287 | { | 553 | { |
| 288 | struct mem_cgroup *mem = NULL; | 554 | struct mem_cgroup *mem = NULL; |
| 555 | |||
| 556 | if (!mm) | ||
| 557 | return NULL; | ||
| 289 | /* | 558 | /* |
| 290 | * Because we have no locks, mm->owner's may be being moved to other | 559 | * Because we have no locks, mm->owner's may be being moved to other |
| 291 | * cgroup. We use css_tryget() here even if this looks | 560 | * cgroup. We use css_tryget() here even if this looks |
| @@ -301,11 +570,44 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
| 301 | return mem; | 570 | return mem; |
| 302 | } | 571 | } |
| 303 | 572 | ||
| 304 | static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) | 573 | /* |
| 574 | * Call callback function against all cgroup under hierarchy tree. | ||
| 575 | */ | ||
| 576 | static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | ||
| 577 | int (*func)(struct mem_cgroup *, void *)) | ||
| 305 | { | 578 | { |
| 306 | if (!mem) | 579 | int found, ret, nextid; |
| 307 | return true; | 580 | struct cgroup_subsys_state *css; |
| 308 | return css_is_removed(&mem->css); | 581 | struct mem_cgroup *mem; |
| 582 | |||
| 583 | if (!root->use_hierarchy) | ||
| 584 | return (*func)(root, data); | ||
| 585 | |||
| 586 | nextid = 1; | ||
| 587 | do { | ||
| 588 | ret = 0; | ||
| 589 | mem = NULL; | ||
| 590 | |||
| 591 | rcu_read_lock(); | ||
| 592 | css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, | ||
| 593 | &found); | ||
| 594 | if (css && css_tryget(css)) | ||
| 595 | mem = container_of(css, struct mem_cgroup, css); | ||
| 596 | rcu_read_unlock(); | ||
| 597 | |||
| 598 | if (mem) { | ||
| 599 | ret = (*func)(mem, data); | ||
| 600 | css_put(&mem->css); | ||
| 601 | } | ||
| 602 | nextid = found + 1; | ||
| 603 | } while (!ret && css); | ||
| 604 | |||
| 605 | return ret; | ||
| 606 | } | ||
| 607 | |||
| 608 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | ||
| 609 | { | ||
| 610 | return (mem == root_mem_cgroup); | ||
| 309 | } | 611 | } |
| 310 | 612 | ||
| 311 | /* | 613 | /* |
| @@ -325,22 +627,24 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem) | |||
| 325 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 627 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
| 326 | { | 628 | { |
| 327 | struct page_cgroup *pc; | 629 | struct page_cgroup *pc; |
| 328 | struct mem_cgroup *mem; | ||
| 329 | struct mem_cgroup_per_zone *mz; | 630 | struct mem_cgroup_per_zone *mz; |
| 330 | 631 | ||
| 331 | if (mem_cgroup_disabled()) | 632 | if (mem_cgroup_disabled()) |
| 332 | return; | 633 | return; |
| 333 | pc = lookup_page_cgroup(page); | 634 | pc = lookup_page_cgroup(page); |
| 334 | /* can happen while we handle swapcache. */ | 635 | /* can happen while we handle swapcache. */ |
| 335 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | 636 | if (!TestClearPageCgroupAcctLRU(pc)) |
| 336 | return; | 637 | return; |
| 638 | VM_BUG_ON(!pc->mem_cgroup); | ||
| 337 | /* | 639 | /* |
| 338 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 640 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
| 339 | * removed from global LRU. | 641 | * removed from global LRU. |
| 340 | */ | 642 | */ |
| 341 | mz = page_cgroup_zoneinfo(pc); | 643 | mz = page_cgroup_zoneinfo(pc); |
| 342 | mem = pc->mem_cgroup; | ||
| 343 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 644 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
| 645 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
| 646 | return; | ||
| 647 | VM_BUG_ON(list_empty(&pc->lru)); | ||
| 344 | list_del_init(&pc->lru); | 648 | list_del_init(&pc->lru); |
| 345 | return; | 649 | return; |
| 346 | } | 650 | } |
| @@ -364,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
| 364 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 668 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
| 365 | */ | 669 | */ |
| 366 | smp_rmb(); | 670 | smp_rmb(); |
| 367 | /* unused page is not rotated. */ | 671 | /* unused or root page is not rotated. */ |
| 368 | if (!PageCgroupUsed(pc)) | 672 | if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) |
| 369 | return; | 673 | return; |
| 370 | mz = page_cgroup_zoneinfo(pc); | 674 | mz = page_cgroup_zoneinfo(pc); |
| 371 | list_move(&pc->lru, &mz->lists[lru]); | 675 | list_move(&pc->lru, &mz->lists[lru]); |
| @@ -379,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
| 379 | if (mem_cgroup_disabled()) | 683 | if (mem_cgroup_disabled()) |
| 380 | return; | 684 | return; |
| 381 | pc = lookup_page_cgroup(page); | 685 | pc = lookup_page_cgroup(page); |
| 686 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
| 382 | /* | 687 | /* |
| 383 | * Used bit is set without atomic ops but after smp_wmb(). | 688 | * Used bit is set without atomic ops but after smp_wmb(). |
| 384 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 689 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
| @@ -389,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
| 389 | 694 | ||
| 390 | mz = page_cgroup_zoneinfo(pc); | 695 | mz = page_cgroup_zoneinfo(pc); |
| 391 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 696 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
| 697 | SetPageCgroupAcctLRU(pc); | ||
| 698 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
| 699 | return; | ||
| 392 | list_add(&pc->lru, &mz->lists[lru]); | 700 | list_add(&pc->lru, &mz->lists[lru]); |
| 393 | } | 701 | } |
| 394 | 702 | ||
| @@ -423,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | |||
| 423 | 731 | ||
| 424 | spin_lock_irqsave(&zone->lru_lock, flags); | 732 | spin_lock_irqsave(&zone->lru_lock, flags); |
| 425 | /* link when the page is linked to LRU but page_cgroup isn't */ | 733 | /* link when the page is linked to LRU but page_cgroup isn't */ |
| 426 | if (PageLRU(page) && list_empty(&pc->lru)) | 734 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
| 427 | mem_cgroup_add_lru_list(page, page_lru(page)); | 735 | mem_cgroup_add_lru_list(page, page_lru(page)); |
| 428 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 736 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 429 | } | 737 | } |
| @@ -441,31 +749,24 @@ void mem_cgroup_move_lists(struct page *page, | |||
| 441 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | 749 | int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) |
| 442 | { | 750 | { |
| 443 | int ret; | 751 | int ret; |
| 752 | struct mem_cgroup *curr = NULL; | ||
| 444 | 753 | ||
| 445 | task_lock(task); | 754 | task_lock(task); |
| 446 | ret = task->mm && mm_match_cgroup(task->mm, mem); | 755 | rcu_read_lock(); |
| 756 | curr = try_get_mem_cgroup_from_mm(task->mm); | ||
| 757 | rcu_read_unlock(); | ||
| 447 | task_unlock(task); | 758 | task_unlock(task); |
| 759 | if (!curr) | ||
| 760 | return 0; | ||
| 761 | if (curr->use_hierarchy) | ||
| 762 | ret = css_is_ancestor(&curr->css, &mem->css); | ||
| 763 | else | ||
| 764 | ret = (curr == mem); | ||
| 765 | css_put(&curr->css); | ||
| 448 | return ret; | 766 | return ret; |
| 449 | } | 767 | } |
| 450 | 768 | ||
| 451 | /* | 769 | /* |
| 452 | * Calculate mapped_ratio under memory controller. This will be used in | ||
| 453 | * vmscan.c for deteremining we have to reclaim mapped pages. | ||
| 454 | */ | ||
| 455 | int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) | ||
| 456 | { | ||
| 457 | long total, rss; | ||
| 458 | |||
| 459 | /* | ||
| 460 | * usage is recorded in bytes. But, here, we assume the number of | ||
| 461 | * physical pages can be represented by "long" on any arch. | ||
| 462 | */ | ||
| 463 | total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L; | ||
| 464 | rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | ||
| 465 | return (int)((rss * 100L) / total); | ||
| 466 | } | ||
| 467 | |||
| 468 | /* | ||
| 469 | * prev_priority control...this will be used in memory reclaim path. | 770 | * prev_priority control...this will be used in memory reclaim path. |
| 470 | */ | 771 | */ |
| 471 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) | 772 | int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) |
| @@ -501,8 +802,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ | |||
| 501 | unsigned long gb; | 802 | unsigned long gb; |
| 502 | unsigned long inactive_ratio; | 803 | unsigned long inactive_ratio; |
| 503 | 804 | ||
| 504 | inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); | 805 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON); |
| 505 | active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); | 806 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON); |
| 506 | 807 | ||
| 507 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 808 | gb = (inactive + active) >> (30 - PAGE_SHIFT); |
| 508 | if (gb) | 809 | if (gb) |
| @@ -536,6 +837,17 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) | |||
| 536 | return 0; | 837 | return 0; |
| 537 | } | 838 | } |
| 538 | 839 | ||
| 840 | int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) | ||
| 841 | { | ||
| 842 | unsigned long active; | ||
| 843 | unsigned long inactive; | ||
| 844 | |||
| 845 | inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_FILE); | ||
| 846 | active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_FILE); | ||
| 847 | |||
| 848 | return (active > inactive); | ||
| 849 | } | ||
| 850 | |||
| 539 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, | 851 | unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, |
| 540 | struct zone *zone, | 852 | struct zone *zone, |
| 541 | enum lru_list lru) | 853 | enum lru_list lru) |
| @@ -598,7 +910,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 598 | int nid = z->zone_pgdat->node_id; | 910 | int nid = z->zone_pgdat->node_id; |
| 599 | int zid = zone_idx(z); | 911 | int zid = zone_idx(z); |
| 600 | struct mem_cgroup_per_zone *mz; | 912 | struct mem_cgroup_per_zone *mz; |
| 601 | int lru = LRU_FILE * !!file + !!active; | 913 | int lru = LRU_FILE * file + active; |
| 914 | int ret; | ||
| 602 | 915 | ||
| 603 | BUG_ON(!mem_cont); | 916 | BUG_ON(!mem_cont); |
| 604 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); | 917 | mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); |
| @@ -616,9 +929,19 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 616 | continue; | 929 | continue; |
| 617 | 930 | ||
| 618 | scan++; | 931 | scan++; |
| 619 | if (__isolate_lru_page(page, mode, file) == 0) { | 932 | ret = __isolate_lru_page(page, mode, file); |
| 933 | switch (ret) { | ||
| 934 | case 0: | ||
| 620 | list_move(&page->lru, dst); | 935 | list_move(&page->lru, dst); |
| 936 | mem_cgroup_del_lru(page); | ||
| 621 | nr_taken++; | 937 | nr_taken++; |
| 938 | break; | ||
| 939 | case -EBUSY: | ||
| 940 | /* we don't affect global LRU but rotate in our LRU */ | ||
| 941 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | ||
| 942 | break; | ||
| 943 | default: | ||
| 944 | break; | ||
| 622 | } | 945 | } |
| 623 | } | 946 | } |
| 624 | 947 | ||
| @@ -629,172 +952,243 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 629 | #define mem_cgroup_from_res_counter(counter, member) \ | 952 | #define mem_cgroup_from_res_counter(counter, member) \ |
| 630 | container_of(counter, struct mem_cgroup, member) | 953 | container_of(counter, struct mem_cgroup, member) |
| 631 | 954 | ||
| 632 | /* | 955 | static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) |
| 633 | * This routine finds the DFS walk successor. This routine should be | ||
| 634 | * called with hierarchy_mutex held | ||
| 635 | */ | ||
| 636 | static struct mem_cgroup * | ||
| 637 | __mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) | ||
| 638 | { | 956 | { |
| 639 | struct cgroup *cgroup, *curr_cgroup, *root_cgroup; | 957 | if (do_swap_account) { |
| 640 | 958 | if (res_counter_check_under_limit(&mem->res) && | |
| 641 | curr_cgroup = curr->css.cgroup; | 959 | res_counter_check_under_limit(&mem->memsw)) |
| 642 | root_cgroup = root_mem->css.cgroup; | 960 | return true; |
| 961 | } else | ||
| 962 | if (res_counter_check_under_limit(&mem->res)) | ||
| 963 | return true; | ||
| 964 | return false; | ||
| 965 | } | ||
| 643 | 966 | ||
| 644 | if (!list_empty(&curr_cgroup->children)) { | 967 | static unsigned int get_swappiness(struct mem_cgroup *memcg) |
| 645 | /* | 968 | { |
| 646 | * Walk down to children | 969 | struct cgroup *cgrp = memcg->css.cgroup; |
| 647 | */ | 970 | unsigned int swappiness; |
| 648 | cgroup = list_entry(curr_cgroup->children.next, | ||
| 649 | struct cgroup, sibling); | ||
| 650 | curr = mem_cgroup_from_cont(cgroup); | ||
| 651 | goto done; | ||
| 652 | } | ||
| 653 | 971 | ||
| 654 | visit_parent: | 972 | /* root ? */ |
| 655 | if (curr_cgroup == root_cgroup) { | 973 | if (cgrp->parent == NULL) |
| 656 | /* caller handles NULL case */ | 974 | return vm_swappiness; |
| 657 | curr = NULL; | ||
| 658 | goto done; | ||
| 659 | } | ||
| 660 | 975 | ||
| 661 | /* | 976 | spin_lock(&memcg->reclaim_param_lock); |
| 662 | * Goto next sibling | 977 | swappiness = memcg->swappiness; |
| 663 | */ | 978 | spin_unlock(&memcg->reclaim_param_lock); |
| 664 | if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { | ||
| 665 | cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, | ||
| 666 | sibling); | ||
| 667 | curr = mem_cgroup_from_cont(cgroup); | ||
| 668 | goto done; | ||
| 669 | } | ||
| 670 | 979 | ||
| 671 | /* | 980 | return swappiness; |
| 672 | * Go up to next parent and next parent's sibling if need be | 981 | } |
| 673 | */ | ||
| 674 | curr_cgroup = curr_cgroup->parent; | ||
| 675 | goto visit_parent; | ||
| 676 | 982 | ||
| 677 | done: | 983 | static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) |
| 678 | return curr; | 984 | { |
| 985 | int *val = data; | ||
| 986 | (*val)++; | ||
| 987 | return 0; | ||
| 679 | } | 988 | } |
| 680 | 989 | ||
| 681 | /* | 990 | /** |
| 682 | * Visit the first child (need not be the first child as per the ordering | 991 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
| 683 | * of the cgroup list, since we track last_scanned_child) of @mem and use | 992 | * @memcg: The memory cgroup that went over limit |
| 684 | * that to reclaim free pages from. | 993 | * @p: Task that is going to be killed |
| 994 | * | ||
| 995 | * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is | ||
| 996 | * enabled | ||
| 685 | */ | 997 | */ |
| 686 | static struct mem_cgroup * | 998 | void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) |
| 687 | mem_cgroup_get_next_node(struct mem_cgroup *root_mem) | ||
| 688 | { | 999 | { |
| 689 | struct cgroup *cgroup; | 1000 | struct cgroup *task_cgrp; |
| 690 | struct mem_cgroup *orig, *next; | 1001 | struct cgroup *mem_cgrp; |
| 691 | bool obsolete; | ||
| 692 | |||
| 693 | /* | 1002 | /* |
| 694 | * Scan all children under the mem_cgroup mem | 1003 | * Need a buffer in BSS, can't rely on allocations. The code relies |
| 1004 | * on the assumption that OOM is serialized for memory controller. | ||
| 1005 | * If this assumption is broken, revisit this code. | ||
| 695 | */ | 1006 | */ |
| 696 | mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); | 1007 | static char memcg_name[PATH_MAX]; |
| 1008 | int ret; | ||
| 1009 | |||
| 1010 | if (!memcg) | ||
| 1011 | return; | ||
| 697 | 1012 | ||
| 698 | orig = root_mem->last_scanned_child; | ||
| 699 | obsolete = mem_cgroup_is_obsolete(orig); | ||
| 700 | 1013 | ||
| 701 | if (list_empty(&root_mem->css.cgroup->children)) { | 1014 | rcu_read_lock(); |
| 1015 | |||
| 1016 | mem_cgrp = memcg->css.cgroup; | ||
| 1017 | task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); | ||
| 1018 | |||
| 1019 | ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); | ||
| 1020 | if (ret < 0) { | ||
| 702 | /* | 1021 | /* |
| 703 | * root_mem might have children before and last_scanned_child | 1022 | * Unfortunately, we are unable to convert to a useful name |
| 704 | * may point to one of them. We put it later. | 1023 | * But we'll still print out the usage information |
| 705 | */ | 1024 | */ |
| 706 | if (orig) | 1025 | rcu_read_unlock(); |
| 707 | VM_BUG_ON(!obsolete); | ||
| 708 | next = NULL; | ||
| 709 | goto done; | 1026 | goto done; |
| 710 | } | 1027 | } |
| 1028 | rcu_read_unlock(); | ||
| 711 | 1029 | ||
| 712 | if (!orig || obsolete) { | 1030 | printk(KERN_INFO "Task in %s killed", memcg_name); |
| 713 | cgroup = list_first_entry(&root_mem->css.cgroup->children, | 1031 | |
| 714 | struct cgroup, sibling); | 1032 | rcu_read_lock(); |
| 715 | next = mem_cgroup_from_cont(cgroup); | 1033 | ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); |
| 716 | } else | 1034 | if (ret < 0) { |
| 717 | next = __mem_cgroup_get_next_node(orig, root_mem); | 1035 | rcu_read_unlock(); |
| 1036 | goto done; | ||
| 1037 | } | ||
| 1038 | rcu_read_unlock(); | ||
| 718 | 1039 | ||
| 1040 | /* | ||
| 1041 | * Continues from above, so we don't need a KERN_ level | ||
| 1042 | */ | ||
| 1043 | printk(KERN_CONT " as a result of limit of %s\n", memcg_name); | ||
| 719 | done: | 1044 | done: |
| 720 | if (next) | 1045 | |
| 721 | mem_cgroup_get(next); | 1046 | printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n", |
| 722 | root_mem->last_scanned_child = next; | 1047 | res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, |
| 723 | if (orig) | 1048 | res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, |
| 724 | mem_cgroup_put(orig); | 1049 | res_counter_read_u64(&memcg->res, RES_FAILCNT)); |
| 725 | mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); | 1050 | printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, " |
| 726 | return (next) ? next : root_mem; | 1051 | "failcnt %llu\n", |
| 1052 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | ||
| 1053 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | ||
| 1054 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | ||
| 727 | } | 1055 | } |
| 728 | 1056 | ||
| 729 | static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) | 1057 | /* |
| 1058 | * This function returns the number of memcg under hierarchy tree. Returns | ||
| 1059 | * 1(self count) if no children. | ||
| 1060 | */ | ||
| 1061 | static int mem_cgroup_count_children(struct mem_cgroup *mem) | ||
| 730 | { | 1062 | { |
| 731 | if (do_swap_account) { | 1063 | int num = 0; |
| 732 | if (res_counter_check_under_limit(&mem->res) && | 1064 | mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); |
| 733 | res_counter_check_under_limit(&mem->memsw)) | 1065 | return num; |
| 734 | return true; | ||
| 735 | } else | ||
| 736 | if (res_counter_check_under_limit(&mem->res)) | ||
| 737 | return true; | ||
| 738 | return false; | ||
| 739 | } | 1066 | } |
| 740 | 1067 | ||
| 741 | static unsigned int get_swappiness(struct mem_cgroup *memcg) | 1068 | /* |
| 1069 | * Visit the first child (need not be the first child as per the ordering | ||
| 1070 | * of the cgroup list, since we track last_scanned_child) of @mem and use | ||
| 1071 | * that to reclaim free pages from. | ||
| 1072 | */ | ||
| 1073 | static struct mem_cgroup * | ||
| 1074 | mem_cgroup_select_victim(struct mem_cgroup *root_mem) | ||
| 742 | { | 1075 | { |
| 743 | struct cgroup *cgrp = memcg->css.cgroup; | 1076 | struct mem_cgroup *ret = NULL; |
| 744 | unsigned int swappiness; | 1077 | struct cgroup_subsys_state *css; |
| 1078 | int nextid, found; | ||
| 745 | 1079 | ||
| 746 | /* root ? */ | 1080 | if (!root_mem->use_hierarchy) { |
| 747 | if (cgrp->parent == NULL) | 1081 | css_get(&root_mem->css); |
| 748 | return vm_swappiness; | 1082 | ret = root_mem; |
| 1083 | } | ||
| 749 | 1084 | ||
| 750 | spin_lock(&memcg->reclaim_param_lock); | 1085 | while (!ret) { |
| 751 | swappiness = memcg->swappiness; | 1086 | rcu_read_lock(); |
| 752 | spin_unlock(&memcg->reclaim_param_lock); | 1087 | nextid = root_mem->last_scanned_child + 1; |
| 1088 | css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, | ||
| 1089 | &found); | ||
| 1090 | if (css && css_tryget(css)) | ||
| 1091 | ret = container_of(css, struct mem_cgroup, css); | ||
| 1092 | |||
| 1093 | rcu_read_unlock(); | ||
| 1094 | /* Updates scanning parameter */ | ||
| 1095 | spin_lock(&root_mem->reclaim_param_lock); | ||
| 1096 | if (!css) { | ||
| 1097 | /* this means start scan from ID:1 */ | ||
| 1098 | root_mem->last_scanned_child = 0; | ||
| 1099 | } else | ||
| 1100 | root_mem->last_scanned_child = found; | ||
| 1101 | spin_unlock(&root_mem->reclaim_param_lock); | ||
| 1102 | } | ||
| 753 | 1103 | ||
| 754 | return swappiness; | 1104 | return ret; |
| 755 | } | 1105 | } |
| 756 | 1106 | ||
| 757 | /* | 1107 | /* |
| 758 | * Dance down the hierarchy if needed to reclaim memory. We remember the | 1108 | * Scan the hierarchy if needed to reclaim memory. We remember the last child |
| 759 | * last child we reclaimed from, so that we don't end up penalizing | 1109 | * we reclaimed from, so that we don't end up penalizing one child extensively |
| 760 | * one child extensively based on its position in the children list. | 1110 | * based on its position in the children list. |
| 761 | * | 1111 | * |
| 762 | * root_mem is the original ancestor that we've been reclaiming from. | 1112 | * root_mem is the original ancestor that we've been reclaiming from. |
| 1113 | * | ||
| 1114 | * We give up and return to the caller when we visit root_mem twice. | ||
| 1115 | * (other groups can be removed while we're walking....) | ||
| 1116 | * | ||
| 1117 | * If shrink==true, to avoid freeing too much, this returns immediately. | ||
| 763 | */ | 1118 | */ |
| 764 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1119 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
| 765 | gfp_t gfp_mask, bool noswap) | 1120 | struct zone *zone, |
| 766 | { | 1121 | gfp_t gfp_mask, |
| 767 | struct mem_cgroup *next_mem; | 1122 | unsigned long reclaim_options) |
| 768 | int ret = 0; | 1123 | { |
| 769 | 1124 | struct mem_cgroup *victim; | |
| 770 | /* | 1125 | int ret, total = 0; |
| 771 | * Reclaim unconditionally and don't check for return value. | 1126 | int loop = 0; |
| 772 | * We need to reclaim in the current group and down the tree. | 1127 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; |
| 773 | * One might think about checking for children before reclaiming, | 1128 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; |
| 774 | * but there might be left over accounting, even after children | 1129 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; |
| 775 | * have left. | 1130 | unsigned long excess = mem_cgroup_get_excess(root_mem); |
| 776 | */ | 1131 | |
| 777 | ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, | 1132 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
| 778 | get_swappiness(root_mem)); | 1133 | if (root_mem->memsw_is_minimum) |
| 779 | if (mem_cgroup_check_under_limit(root_mem)) | 1134 | noswap = true; |
| 780 | return 1; /* indicate reclaim has succeeded */ | ||
| 781 | if (!root_mem->use_hierarchy) | ||
| 782 | return ret; | ||
| 783 | 1135 | ||
| 784 | next_mem = mem_cgroup_get_next_node(root_mem); | 1136 | while (1) { |
| 785 | 1137 | victim = mem_cgroup_select_victim(root_mem); | |
| 786 | while (next_mem != root_mem) { | 1138 | if (victim == root_mem) { |
| 787 | if (mem_cgroup_is_obsolete(next_mem)) { | 1139 | loop++; |
| 788 | next_mem = mem_cgroup_get_next_node(root_mem); | 1140 | if (loop >= 2) { |
| 1141 | /* | ||
| 1142 | * If we have not been able to reclaim | ||
| 1143 | * anything, it might because there are | ||
| 1144 | * no reclaimable pages under this hierarchy | ||
| 1145 | */ | ||
| 1146 | if (!check_soft || !total) { | ||
| 1147 | css_put(&victim->css); | ||
| 1148 | break; | ||
| 1149 | } | ||
| 1150 | /* | ||
| 1151 | * We want to do more targeted reclaim. | ||
| 1152 | * excess >> 2 is not so excessive that we | ||
| 1153 | * reclaim too much, nor so little that we keep | ||
| 1154 | * coming back to reclaim from this cgroup | ||
| 1155 | */ | ||
| 1156 | if (total >= (excess >> 2) || | ||
| 1157 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | ||
| 1158 | css_put(&victim->css); | ||
| 1159 | break; | ||
| 1160 | } | ||
| 1161 | } | ||
| 1162 | } | ||
| 1163 | if (!mem_cgroup_local_usage(&victim->stat)) { | ||
| 1164 | /* this cgroup's local usage == 0 */ | ||
| 1165 | css_put(&victim->css); | ||
| 789 | continue; | 1166 | continue; |
| 790 | } | 1167 | } |
| 791 | ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, | 1168 | /* we use swappiness of local cgroup */ |
| 792 | get_swappiness(next_mem)); | 1169 | if (check_soft) |
| 793 | if (mem_cgroup_check_under_limit(root_mem)) | 1170 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
| 794 | return 1; /* indicate reclaim has succeeded */ | 1171 | noswap, get_swappiness(victim), zone, |
| 795 | next_mem = mem_cgroup_get_next_node(root_mem); | 1172 | zone->zone_pgdat->node_id); |
| 1173 | else | ||
| 1174 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | ||
| 1175 | noswap, get_swappiness(victim)); | ||
| 1176 | css_put(&victim->css); | ||
| 1177 | /* | ||
| 1178 | * When shrinking usage, we can't check whether we should stop here or | ||
| 1179 | * reclaim more. That depends on the caller. last_scanned_child | ||
| 1180 | * is enough to keep fairness under the tree. | ||
| 1181 | */ | ||
| 1182 | if (shrink) | ||
| 1183 | return ret; | ||
| 1184 | total += ret; | ||
| 1185 | if (check_soft) { | ||
| 1186 | if (res_counter_check_under_soft_limit(&root_mem->res)) | ||
| 1187 | return total; | ||
| 1188 | } else if (mem_cgroup_check_under_limit(root_mem)) | ||
| 1189 | return 1 + total; | ||
| 796 | } | 1190 | } |
| 797 | return ret; | 1191 | return total; |
| 798 | } | 1192 | } |
| 799 | 1193 | ||
| 800 | bool mem_cgroup_oom_called(struct task_struct *task) | 1194 | bool mem_cgroup_oom_called(struct task_struct *task) |
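
mem_cgroup_select_victim() above implements round-robin fairness with nothing more than a cursor: root_mem->last_scanned_child remembers the css ID it stopped at, the next call resumes from ID+1, and a failed lookup wraps back to the beginning. A self-contained toy version of that idea follows (plain C, not kernel code; the real function additionally returns the root directly when hierarchy is off and skips removed groups via css_tryget()):

    #include <stdio.h>

    #define NR_CHILDREN 4

    static int last_scanned_child;          /* like root_mem->last_scanned_child */

    static int select_victim(void)
    {
            int next = last_scanned_child + 1;

            if (next > NR_CHILDREN)
                    next = 1;               /* lookup failed: wrap around to ID 1 */
            last_scanned_child = next;
            return next;
    }

    int main(void)
    {
            int picks[NR_CHILDREN + 1] = { 0 };
            int i;

            for (i = 0; i < 1000; i++)
                    picks[select_victim()]++;

            /* every child ends up reclaimed from equally often */
            for (i = 1; i <= NR_CHILDREN; i++)
                    printf("child %d: %d passes\n", i, picks[i]);
            return 0;
    }
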
| @@ -813,13 +1207,64 @@ bool mem_cgroup_oom_called(struct task_struct *task) | |||
| 813 | rcu_read_unlock(); | 1207 | rcu_read_unlock(); |
| 814 | return ret; | 1208 | return ret; |
| 815 | } | 1209 | } |
| 1210 | |||
| 1211 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | ||
| 1212 | { | ||
| 1213 | mem->last_oom_jiffies = jiffies; | ||
| 1214 | return 0; | ||
| 1215 | } | ||
| 1216 | |||
| 1217 | static void record_last_oom(struct mem_cgroup *mem) | ||
| 1218 | { | ||
| 1219 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | ||
| 1220 | } | ||
| 1221 | |||
| 1222 | /* | ||
| 1223 | * Currently used to update mapped file statistics, but the routine can be | ||
| 1224 | * generalized to update other statistics as well. | ||
| 1225 | */ | ||
| 1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | ||
| 1227 | { | ||
| 1228 | struct mem_cgroup *mem; | ||
| 1229 | struct mem_cgroup_stat *stat; | ||
| 1230 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 1231 | int cpu; | ||
| 1232 | struct page_cgroup *pc; | ||
| 1233 | |||
| 1234 | if (!page_is_file_cache(page)) | ||
| 1235 | return; | ||
| 1236 | |||
| 1237 | pc = lookup_page_cgroup(page); | ||
| 1238 | if (unlikely(!pc)) | ||
| 1239 | return; | ||
| 1240 | |||
| 1241 | lock_page_cgroup(pc); | ||
| 1242 | mem = pc->mem_cgroup; | ||
| 1243 | if (!mem) | ||
| 1244 | goto done; | ||
| 1245 | |||
| 1246 | if (!PageCgroupUsed(pc)) | ||
| 1247 | goto done; | ||
| 1248 | |||
| 1249 | /* | ||
| 1250 | * Preemption is already disabled, we don't need get_cpu() | ||
| 1251 | */ | ||
| 1252 | cpu = smp_processor_id(); | ||
| 1253 | stat = &mem->stat; | ||
| 1254 | cpustat = &stat->cpustat[cpu]; | ||
| 1255 | |||
| 1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | ||
| 1257 | done: | ||
| 1258 | unlock_page_cgroup(pc); | ||
| 1259 | } | ||
| 1260 | |||
| 816 | /* | 1261 | /* |
| 817 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
| 818 | * oom-killer can be invoked. | 1263 | * oom-killer can be invoked. |
| 819 | */ | 1264 | */ |
| 820 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1265 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
| 821 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1266 | gfp_t gfp_mask, struct mem_cgroup **memcg, |
| 822 | bool oom) | 1267 | bool oom, struct page *page) |
| 823 | { | 1268 | { |
| 824 | struct mem_cgroup *mem, *mem_over_limit; | 1269 | struct mem_cgroup *mem, *mem_over_limit; |
| 825 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
| @@ -847,12 +1292,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 847 | if (unlikely(!mem)) | 1292 | if (unlikely(!mem)) |
| 848 | return 0; | 1293 | return 0; |
| 849 | 1294 | ||
| 850 | VM_BUG_ON(mem_cgroup_is_obsolete(mem)); | 1295 | VM_BUG_ON(css_is_removed(&mem->css)); |
| 851 | 1296 | ||
| 852 | while (1) { | 1297 | while (1) { |
| 853 | int ret; | 1298 | int ret = 0; |
| 854 | bool noswap = false; | 1299 | unsigned long flags = 0; |
| 855 | 1300 | ||
| 1301 | if (mem_cgroup_is_root(mem)) | ||
| 1302 | goto done; | ||
| 856 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); |
| 857 | if (likely(!ret)) { | 1304 | if (likely(!ret)) { |
| 858 | if (!do_swap_account) | 1305 | if (!do_swap_account) |
| @@ -863,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 863 | break; | 1310 | break; |
| 864 | /* mem+swap counter fails */ | 1311 | /* mem+swap counter fails */ |
| 865 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 866 | noswap = true; | 1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
| 867 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
| 868 | memsw); | 1315 | memsw); |
| 869 | } else | 1316 | } else |
| @@ -874,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 874 | if (!(gfp_mask & __GFP_WAIT)) | 1321 | if (!(gfp_mask & __GFP_WAIT)) |
| 875 | goto nomem; | 1322 | goto nomem; |
| 876 | 1323 | ||
| 877 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, | 1324 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
| 878 | noswap); | 1325 | gfp_mask, flags); |
| 879 | if (ret) | 1326 | if (ret) |
| 880 | continue; | 1327 | continue; |
| 881 | 1328 | ||
| @@ -895,31 +1342,71 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 895 | mutex_lock(&memcg_tasklist); | 1342 | mutex_lock(&memcg_tasklist); |
| 896 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); |
| 897 | mutex_unlock(&memcg_tasklist); | 1344 | mutex_unlock(&memcg_tasklist); |
| 898 | mem_over_limit->last_oom_jiffies = jiffies; | 1345 | record_last_oom(mem_over_limit); |
| 899 | } | 1346 | } |
| 900 | goto nomem; | 1347 | goto nomem; |
| 901 | } | 1348 | } |
| 902 | } | 1349 | } |
| 1350 | /* | ||
| 1351 | * Insert the ancestor (and the ancestor's ancestors) into the softlimit | ||
| 1352 | * RB-tree if they exceed the softlimit. | ||
| 1353 | */ | ||
| 1354 | if (mem_cgroup_soft_limit_check(mem)) | ||
| 1355 | mem_cgroup_update_tree(mem, page); | ||
| 1356 | done: | ||
| 903 | return 0; | 1357 | return 0; |
| 904 | nomem: | 1358 | nomem: |
| 905 | css_put(&mem->css); | 1359 | css_put(&mem->css); |
| 906 | return -ENOMEM; | 1360 | return -ENOMEM; |
| 907 | } | 1361 | } |
| 908 | 1362 | ||
| 1363 | /* | ||
| 1364 | * A helper function to get mem_cgroup from ID. must be called under | ||
| 1365 | * rcu_read_lock(). The caller must check css_is_removed() or some if | ||
| 1366 | * it's concern. (dropping refcnt from swap can be called against removed | ||
| 1367 | * memcg.) | ||
| 1368 | */ | ||
| 1369 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | ||
| 1370 | { | ||
| 1371 | struct cgroup_subsys_state *css; | ||
| 1372 | |||
| 1373 | /* ID 0 is unused ID */ | ||
| 1374 | if (!id) | ||
| 1375 | return NULL; | ||
| 1376 | css = css_lookup(&mem_cgroup_subsys, id); | ||
| 1377 | if (!css) | ||
| 1378 | return NULL; | ||
| 1379 | return container_of(css, struct mem_cgroup, css); | ||
| 1380 | } | ||
| 1381 | |||
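A condensed sketch of the intended lookup pattern (an assumption drawn from the swapcache path just below; the helper name is hypothetical): resolve the ID under RCU and pin the result with css_tryget() before using it outside the RCU section.

        static struct mem_cgroup *example_get_memcg_by_id(unsigned short id)
        {
                struct mem_cgroup *mem;

                rcu_read_lock();
                mem = mem_cgroup_lookup(id);
                if (mem && !css_tryget(&mem->css))
                        mem = NULL;             /* css is being destroyed */
                rcu_read_unlock();
                return mem;                     /* caller releases with css_put() */
        }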
| 909 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1382 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) |
| 910 | { | 1383 | { |
| 911 | struct mem_cgroup *mem; | 1384 | struct mem_cgroup *mem; |
| 1385 | struct page_cgroup *pc; | ||
| 1386 | unsigned short id; | ||
| 912 | swp_entry_t ent; | 1387 | swp_entry_t ent; |
| 913 | 1388 | ||
| 1389 | VM_BUG_ON(!PageLocked(page)); | ||
| 1390 | |||
| 914 | if (!PageSwapCache(page)) | 1391 | if (!PageSwapCache(page)) |
| 915 | return NULL; | 1392 | return NULL; |
| 916 | 1393 | ||
| 917 | ent.val = page_private(page); | 1394 | pc = lookup_page_cgroup(page); |
| 918 | mem = lookup_swap_cgroup(ent); | 1395 | lock_page_cgroup(pc); |
| 919 | if (!mem) | 1396 | if (PageCgroupUsed(pc)) { |
| 920 | return NULL; | 1397 | mem = pc->mem_cgroup; |
| 921 | if (!css_tryget(&mem->css)) | 1398 | if (mem && !css_tryget(&mem->css)) |
| 922 | return NULL; | 1399 | mem = NULL; |
| 1400 | } else { | ||
| 1401 | ent.val = page_private(page); | ||
| 1402 | id = lookup_swap_cgroup(ent); | ||
| 1403 | rcu_read_lock(); | ||
| 1404 | mem = mem_cgroup_lookup(id); | ||
| 1405 | if (mem && !css_tryget(&mem->css)) | ||
| 1406 | mem = NULL; | ||
| 1407 | rcu_read_unlock(); | ||
| 1408 | } | ||
| 1409 | unlock_page_cgroup(pc); | ||
| 923 | return mem; | 1410 | return mem; |
| 924 | } | 1411 | } |
| 925 | 1412 | ||
| @@ -939,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
| 939 | lock_page_cgroup(pc); | 1426 | lock_page_cgroup(pc); |
| 940 | if (unlikely(PageCgroupUsed(pc))) { | 1427 | if (unlikely(PageCgroupUsed(pc))) { |
| 941 | unlock_page_cgroup(pc); | 1428 | unlock_page_cgroup(pc); |
| 942 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1429 | if (!mem_cgroup_is_root(mem)) { |
| 943 | if (do_swap_account) | 1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 944 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1431 | if (do_swap_account) |
| 1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1433 | } | ||
| 945 | css_put(&mem->css); | 1434 | css_put(&mem->css); |
| 946 | return; | 1435 | return; |
| 947 | } | 1436 | } |
| 1437 | |||
| 948 | pc->mem_cgroup = mem; | 1438 | pc->mem_cgroup = mem; |
| 1439 | /* | ||
| 1440 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | ||
| 1441 | * In particular, when a page_cgroup is taken from a page, pc->mem_cgroup | ||
| 1442 | * is accessed after testing the USED bit. To make pc->mem_cgroup visible | ||
| 1443 | * before the USED bit, we need a memory barrier here. | ||
| 1444 | * See mem_cgroup_add_lru_list(), etc. | ||
| 1445 | */ | ||
| 949 | smp_wmb(); | 1446 | smp_wmb(); |
| 950 | pc->flags = pcg_default_flags[ctype]; | 1447 | switch (ctype) { |
| 1448 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
| 1449 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
| 1450 | SetPageCgroupCache(pc); | ||
| 1451 | SetPageCgroupUsed(pc); | ||
| 1452 | break; | ||
| 1453 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
| 1454 | ClearPageCgroupCache(pc); | ||
| 1455 | SetPageCgroupUsed(pc); | ||
| 1456 | break; | ||
| 1457 | default: | ||
| 1458 | break; | ||
| 1459 | } | ||
| 951 | 1460 | ||
| 952 | mem_cgroup_charge_statistics(mem, pc, true); | 1461 | mem_cgroup_charge_statistics(mem, pc, true); |
| 953 | 1462 | ||
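For the smp_wmb() above, the matching reader side (a sketch only, modelled on the lockless LRU paths the comment refers to; the function name is hypothetical) tests the USED bit first and then issues a read barrier before dereferencing pc->mem_cgroup:

        static struct mem_cgroup *example_read_pc_memcg(struct page_cgroup *pc)
        {
                if (!PageCgroupUsed(pc))
                        return NULL;
                smp_rmb();      /* pairs with smp_wmb() in __mem_cgroup_commit_charge() */
                return pc->mem_cgroup;
        }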
| @@ -976,6 +1485,10 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 976 | struct mem_cgroup_per_zone *from_mz, *to_mz; | 1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; |
| 977 | int nid, zid; | 1486 | int nid, zid; |
| 978 | int ret = -EBUSY; | 1487 | int ret = -EBUSY; |
| 1488 | struct page *page; | ||
| 1489 | int cpu; | ||
| 1490 | struct mem_cgroup_stat *stat; | ||
| 1491 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 979 | 1492 | ||
| 980 | VM_BUG_ON(from == to); | 1493 | VM_BUG_ON(from == to); |
| 981 | VM_BUG_ON(PageLRU(pc->page)); | 1494 | VM_BUG_ON(PageLRU(pc->page)); |
| @@ -994,9 +1507,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 994 | if (pc->mem_cgroup != from) | 1507 | if (pc->mem_cgroup != from) |
| 995 | goto out; | 1508 | goto out; |
| 996 | 1509 | ||
| 997 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1510 | if (!mem_cgroup_is_root(from)) |
| 1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
| 998 | mem_cgroup_charge_statistics(from, pc, false); | 1512 | mem_cgroup_charge_statistics(from, pc, false); |
| 999 | if (do_swap_account) | 1513 | |
| 1514 | page = pc->page; | ||
| 1515 | if (page_is_file_cache(page) && page_mapped(page)) { | ||
| 1516 | cpu = smp_processor_id(); | ||
| 1517 | /* Update mapped_file data for mem_cgroup "from" */ | ||
| 1518 | stat = &from->stat; | ||
| 1519 | cpustat = &stat->cpustat[cpu]; | ||
| 1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
| 1521 | -1); | ||
| 1522 | |||
| 1523 | /* Update mapped_file data for mem_cgroup "to" */ | ||
| 1524 | stat = &to->stat; | ||
| 1525 | cpustat = &stat->cpustat[cpu]; | ||
| 1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
| 1527 | 1); | ||
| 1528 | } | ||
| 1529 | |||
| 1530 | if (do_swap_account && !mem_cgroup_is_root(from)) | ||
| 1000 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 1531 | res_counter_uncharge(&from->memsw, PAGE_SIZE); |
| 1001 | css_put(&from->css); | 1532 | css_put(&from->css); |
| 1002 | 1533 | ||
| @@ -1006,6 +1537,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 1006 | ret = 0; | 1537 | ret = 0; |
| 1007 | out: | 1538 | out: |
| 1008 | unlock_page_cgroup(pc); | 1539 | unlock_page_cgroup(pc); |
| 1540 | /* | ||
| 1541 | * We charges against "to" which may not have any tasks. Then, "to" | ||
| 1542 | * can be under rmdir(). But in current implementation, caller of | ||
| 1543 | * this function is just force_empty() and it's garanteed that | ||
| 1544 | * "to" is never removed. So, we don't check rmdir status here. | ||
| 1545 | */ | ||
| 1009 | return ret; | 1546 | return ret; |
| 1010 | } | 1547 | } |
| 1011 | 1548 | ||
| @@ -1031,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
| 1031 | parent = mem_cgroup_from_cont(pcg); | 1568 | parent = mem_cgroup_from_cont(pcg); |
| 1032 | 1569 | ||
| 1033 | 1570 | ||
| 1034 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
| 1035 | if (ret || !parent) | 1572 | if (ret || !parent) |
| 1036 | return ret; | 1573 | return ret; |
| 1037 | 1574 | ||
| @@ -1061,9 +1598,11 @@ uncharge: | |||
| 1061 | /* drop extra refcnt by try_charge() */ | 1598 | /* drop extra refcnt by try_charge() */ |
| 1062 | css_put(&parent->css); | 1599 | css_put(&parent->css); |
| 1063 | /* uncharge if move fails */ | 1600 | /* uncharge if move fails */ |
| 1064 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 1601 | if (!mem_cgroup_is_root(parent)) { |
| 1065 | if (do_swap_account) | 1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); |
| 1066 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | 1603 | if (do_swap_account) |
| 1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
| 1605 | } | ||
| 1067 | return ret; | 1606 | return ret; |
| 1068 | } | 1607 | } |
| 1069 | 1608 | ||
| @@ -1088,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 1088 | prefetchw(pc); | 1627 | prefetchw(pc); |
| 1089 | 1628 | ||
| 1090 | mem = memcg; | 1629 | mem = memcg; |
| 1091 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 1630 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); |
| 1092 | if (ret || !mem) | 1631 | if (ret || !mem) |
| 1093 | return ret; | 1632 | return ret; |
| 1094 | 1633 | ||
| @@ -1118,6 +1657,10 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
| 1118 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); | 1657 | MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); |
| 1119 | } | 1658 | } |
| 1120 | 1659 | ||
| 1660 | static void | ||
| 1661 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | ||
| 1662 | enum charge_type ctype); | ||
| 1663 | |||
| 1121 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | 1664 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
| 1122 | gfp_t gfp_mask) | 1665 | gfp_t gfp_mask) |
| 1123 | { | 1666 | { |
| @@ -1154,16 +1697,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
| 1154 | unlock_page_cgroup(pc); | 1697 | unlock_page_cgroup(pc); |
| 1155 | } | 1698 | } |
| 1156 | 1699 | ||
| 1157 | if (do_swap_account && PageSwapCache(page)) { | ||
| 1158 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
| 1159 | if (mem) | ||
| 1160 | mm = NULL; | ||
| 1161 | else | ||
| 1162 | mem = NULL; | ||
| 1163 | /* SwapCache may be still linked to LRU now. */ | ||
| 1164 | mem_cgroup_lru_del_before_commit_swapcache(page); | ||
| 1165 | } | ||
| 1166 | |||
| 1167 | if (unlikely(!mm && !mem)) | 1700 | if (unlikely(!mm && !mem)) |
| 1168 | mm = &init_mm; | 1701 | mm = &init_mm; |
| 1169 | 1702 | ||
| @@ -1171,22 +1704,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
| 1171 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 1704 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
| 1172 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); | 1705 | MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); |
| 1173 | 1706 | ||
| 1174 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | 1707 | /* shmem */ |
| 1175 | MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); | 1708 | if (PageSwapCache(page)) { |
| 1176 | if (mem) | 1709 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
| 1177 | css_put(&mem->css); | 1710 | if (!ret) |
| 1178 | if (PageSwapCache(page)) | 1711 | __mem_cgroup_commit_charge_swapin(page, mem, |
| 1179 | mem_cgroup_lru_add_after_commit_swapcache(page); | 1712 | MEM_CGROUP_CHARGE_TYPE_SHMEM); |
| 1713 | } else | ||
| 1714 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, | ||
| 1715 | MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); | ||
| 1180 | 1716 | ||
| 1181 | if (do_swap_account && !ret && PageSwapCache(page)) { | ||
| 1182 | swp_entry_t ent = {.val = page_private(page)}; | ||
| 1183 | /* avoid double counting */ | ||
| 1184 | mem = swap_cgroup_record(ent, NULL); | ||
| 1185 | if (mem) { | ||
| 1186 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1187 | mem_cgroup_put(mem); | ||
| 1188 | } | ||
| 1189 | } | ||
| 1190 | return ret; | 1717 | return ret; |
| 1191 | } | 1718 | } |
| 1192 | 1719 | ||
| @@ -1219,17 +1746,19 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
| 1219 | if (!mem) | 1746 | if (!mem) |
| 1220 | goto charge_cur_mm; | 1747 | goto charge_cur_mm; |
| 1221 | *ptr = mem; | 1748 | *ptr = mem; |
| 1222 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 1749 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); |
| 1223 | /* drop extra refcnt from tryget */ | 1750 | /* drop extra refcnt from tryget */ |
| 1224 | css_put(&mem->css); | 1751 | css_put(&mem->css); |
| 1225 | return ret; | 1752 | return ret; |
| 1226 | charge_cur_mm: | 1753 | charge_cur_mm: |
| 1227 | if (unlikely(!mm)) | 1754 | if (unlikely(!mm)) |
| 1228 | mm = &init_mm; | 1755 | mm = &init_mm; |
| 1229 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 1756 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); |
| 1230 | } | 1757 | } |
| 1231 | 1758 | ||
| 1232 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | 1759 | static void |
| 1760 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | ||
| 1761 | enum charge_type ctype) | ||
| 1233 | { | 1762 | { |
| 1234 | struct page_cgroup *pc; | 1763 | struct page_cgroup *pc; |
| 1235 | 1764 | ||
| @@ -1237,9 +1766,10 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | |||
| 1237 | return; | 1766 | return; |
| 1238 | if (!ptr) | 1767 | if (!ptr) |
| 1239 | return; | 1768 | return; |
| 1769 | cgroup_exclude_rmdir(&ptr->css); | ||
| 1240 | pc = lookup_page_cgroup(page); | 1770 | pc = lookup_page_cgroup(page); |
| 1241 | mem_cgroup_lru_del_before_commit_swapcache(page); | 1771 | mem_cgroup_lru_del_before_commit_swapcache(page); |
| 1242 | __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 1772 | __mem_cgroup_commit_charge(ptr, pc, ctype); |
| 1243 | mem_cgroup_lru_add_after_commit_swapcache(page); | 1773 | mem_cgroup_lru_add_after_commit_swapcache(page); |
| 1244 | /* | 1774 | /* |
| 1245 | * Now swap is on-memory. This means this page may be | 1775 | * Now swap is on-memory. This means this page may be |
| @@ -1250,16 +1780,36 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | |||
| 1250 | */ | 1780 | */ |
| 1251 | if (do_swap_account && PageSwapCache(page)) { | 1781 | if (do_swap_account && PageSwapCache(page)) { |
| 1252 | swp_entry_t ent = {.val = page_private(page)}; | 1782 | swp_entry_t ent = {.val = page_private(page)}; |
| 1783 | unsigned short id; | ||
| 1253 | struct mem_cgroup *memcg; | 1784 | struct mem_cgroup *memcg; |
| 1254 | memcg = swap_cgroup_record(ent, NULL); | 1785 | |
| 1786 | id = swap_cgroup_record(ent, 0); | ||
| 1787 | rcu_read_lock(); | ||
| 1788 | memcg = mem_cgroup_lookup(id); | ||
| 1255 | if (memcg) { | 1789 | if (memcg) { |
| 1256 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1790 | /* |
| 1791 | * This recorded memcg can be obsolete one. So, avoid | ||
| 1792 | * calling css_tryget | ||
| 1793 | */ | ||
| 1794 | if (!mem_cgroup_is_root(memcg)) | ||
| 1795 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
| 1796 | mem_cgroup_swap_statistics(memcg, false); | ||
| 1257 | mem_cgroup_put(memcg); | 1797 | mem_cgroup_put(memcg); |
| 1258 | } | 1798 | } |
| 1259 | 1799 | rcu_read_unlock(); | |
| 1260 | } | 1800 | } |
| 1261 | /* add this page(page_cgroup) to the LRU we want. */ | 1801 | /* |
| 1802 | * At swapin, we may charge against a cgroup which has no tasks. | ||
| 1803 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
| 1804 | * In that case, we need to call pre_destroy() again. Check it here. | ||
| 1805 | */ | ||
| 1806 | cgroup_release_and_wakeup_rmdir(&ptr->css); | ||
| 1807 | } | ||
| 1262 | 1808 | ||
| 1809 | void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) | ||
| 1810 | { | ||
| 1811 | __mem_cgroup_commit_charge_swapin(page, ptr, | ||
| 1812 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | ||
| 1263 | } | 1813 | } |
| 1264 | 1814 | ||
| 1265 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | 1815 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) |
| @@ -1268,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
| 1268 | return; | 1818 | return; |
| 1269 | if (!mem) | 1819 | if (!mem) |
| 1270 | return; | 1820 | return; |
| 1271 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1821 | if (!mem_cgroup_is_root(mem)) { |
| 1272 | if (do_swap_account) | 1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 1273 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1823 | if (do_swap_account) |
| 1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1825 | } | ||
| 1274 | css_put(&mem->css); | 1826 | css_put(&mem->css); |
| 1275 | } | 1827 | } |
| 1276 | 1828 | ||
| @@ -1307,6 +1859,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 1307 | 1859 | ||
| 1308 | switch (ctype) { | 1860 | switch (ctype) { |
| 1309 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 1861 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
| 1862 | case MEM_CGROUP_CHARGE_TYPE_DROP: | ||
| 1310 | if (page_mapped(page)) | 1863 | if (page_mapped(page)) |
| 1311 | goto unlock_out; | 1864 | goto unlock_out; |
| 1312 | break; | 1865 | break; |
| @@ -1321,11 +1874,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 1321 | break; | 1874 | break; |
| 1322 | } | 1875 | } |
| 1323 | 1876 | ||
| 1324 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1877 | if (!mem_cgroup_is_root(mem)) { |
| 1325 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | 1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 1326 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1879 | if (do_swap_account && |
| 1327 | 1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | |
| 1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
| 1882 | } | ||
| 1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
| 1884 | mem_cgroup_swap_statistics(mem, true); | ||
| 1328 | mem_cgroup_charge_statistics(mem, pc, false); | 1885 | mem_cgroup_charge_statistics(mem, pc, false); |
| 1886 | |||
| 1329 | ClearPageCgroupUsed(pc); | 1887 | ClearPageCgroupUsed(pc); |
| 1330 | /* | 1888 | /* |
| 1331 | * pc->mem_cgroup is not cleared here. It will be accessed when it's | 1889 | * pc->mem_cgroup is not cleared here. It will be accessed when it's |
| @@ -1337,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 1337 | mz = page_cgroup_zoneinfo(pc); | 1895 | mz = page_cgroup_zoneinfo(pc); |
| 1338 | unlock_page_cgroup(pc); | 1896 | unlock_page_cgroup(pc); |
| 1339 | 1897 | ||
| 1898 | if (mem_cgroup_soft_limit_check(mem)) | ||
| 1899 | mem_cgroup_update_tree(mem, page); | ||
| 1340 | /* at swapout, this memcg will be accessed to record to swap */ | 1900 | /* at swapout, this memcg will be accessed to record to swap */ |
| 1341 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 1901 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
| 1342 | css_put(&mem->css); | 1902 | css_put(&mem->css); |
| @@ -1365,24 +1925,31 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
| 1365 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 1925 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
| 1366 | } | 1926 | } |
| 1367 | 1927 | ||
| 1928 | #ifdef CONFIG_SWAP | ||
| 1368 | /* | 1929 | /* |
| 1369 | * called from __delete_from_swap_cache() and drop "page" account. | 1930 | * called after __delete_from_swap_cache() and drop "page" account. |
| 1370 | * memcg information is recorded to swap_cgroup of "ent" | 1931 | * memcg information is recorded to swap_cgroup of "ent" |
| 1371 | */ | 1932 | */ |
| 1372 | void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) | 1933 | void |
| 1934 | mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | ||
| 1373 | { | 1935 | { |
| 1374 | struct mem_cgroup *memcg; | 1936 | struct mem_cgroup *memcg; |
| 1937 | int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; | ||
| 1938 | |||
| 1939 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | ||
| 1940 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | ||
| 1941 | |||
| 1942 | memcg = __mem_cgroup_uncharge_common(page, ctype); | ||
| 1375 | 1943 | ||
| 1376 | memcg = __mem_cgroup_uncharge_common(page, | ||
| 1377 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT); | ||
| 1378 | /* record memcg information */ | 1944 | /* record memcg information */ |
| 1379 | if (do_swap_account && memcg) { | 1945 | if (do_swap_account && swapout && memcg) { |
| 1380 | swap_cgroup_record(ent, memcg); | 1946 | swap_cgroup_record(ent, css_id(&memcg->css)); |
| 1381 | mem_cgroup_get(memcg); | 1947 | mem_cgroup_get(memcg); |
| 1382 | } | 1948 | } |
| 1383 | if (memcg) | 1949 | if (swapout && memcg) |
| 1384 | css_put(&memcg->css); | 1950 | css_put(&memcg->css); |
| 1385 | } | 1951 | } |
| 1952 | #endif | ||
| 1386 | 1953 | ||
| 1387 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 1954 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
| 1388 | /* | 1955 | /* |
| @@ -1392,15 +1959,25 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent) | |||
| 1392 | void mem_cgroup_uncharge_swap(swp_entry_t ent) | 1959 | void mem_cgroup_uncharge_swap(swp_entry_t ent) |
| 1393 | { | 1960 | { |
| 1394 | struct mem_cgroup *memcg; | 1961 | struct mem_cgroup *memcg; |
| 1962 | unsigned short id; | ||
| 1395 | 1963 | ||
| 1396 | if (!do_swap_account) | 1964 | if (!do_swap_account) |
| 1397 | return; | 1965 | return; |
| 1398 | 1966 | ||
| 1399 | memcg = swap_cgroup_record(ent, NULL); | 1967 | id = swap_cgroup_record(ent, 0); |
| 1968 | rcu_read_lock(); | ||
| 1969 | memcg = mem_cgroup_lookup(id); | ||
| 1400 | if (memcg) { | 1970 | if (memcg) { |
| 1401 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1971 | /* |
| 1972 | * We uncharge this because swap is freed. | ||
| 1973 | * This memcg can be obsolete one. We avoid calling css_tryget | ||
| 1974 | */ | ||
| 1975 | if (!mem_cgroup_is_root(memcg)) | ||
| 1976 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
| 1977 | mem_cgroup_swap_statistics(memcg, false); | ||
| 1402 | mem_cgroup_put(memcg); | 1978 | mem_cgroup_put(memcg); |
| 1403 | } | 1979 | } |
| 1980 | rcu_read_unlock(); | ||
| 1404 | } | 1981 | } |
| 1405 | #endif | 1982 | #endif |
| 1406 | 1983 | ||
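Putting the two paths above together (a sketch under the assumption that swap_cgroup_record() now stores and returns a css ID, as the diff shows; the wrapper name is hypothetical): swapout records the owner's ID in the swap_cgroup slot, and the swap-free side fetches it back, clears it, and resolves it under RCU.

        static void example_swap_account_roundtrip(swp_entry_t ent,
                                                   struct mem_cgroup *memcg)
        {
                unsigned short id;
                struct mem_cgroup *owner;

                /* swapout side: remember which memcg owned the page */
                swap_cgroup_record(ent, css_id(&memcg->css));

                /* swap-free side: fetch and clear the recorded ID, then resolve it */
                id = swap_cgroup_record(ent, 0);
                rcu_read_lock();
                owner = mem_cgroup_lookup(id);  /* may be NULL or an obsolete memcg */
                if (owner)
                        mem_cgroup_swap_statistics(owner, false);       /* as the real paths above do */
                rcu_read_unlock();
        }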
| @@ -1426,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
| 1426 | unlock_page_cgroup(pc); | 2003 | unlock_page_cgroup(pc); |
| 1427 | 2004 | ||
| 1428 | if (mem) { | 2005 | if (mem) { |
| 1429 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 2006 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
| 2007 | page); | ||
| 1430 | css_put(&mem->css); | 2008 | css_put(&mem->css); |
| 1431 | } | 2009 | } |
| 1432 | *ptr = mem; | 2010 | *ptr = mem; |
| @@ -1443,7 +2021,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
| 1443 | 2021 | ||
| 1444 | if (!mem) | 2022 | if (!mem) |
| 1445 | return; | 2023 | return; |
| 1446 | 2024 | cgroup_exclude_rmdir(&mem->css); | |
| 1447 | /* at migration success, oldpage->mapping is NULL. */ | 2025 | /* at migration success, oldpage->mapping is NULL. */ |
| 1448 | if (oldpage->mapping) { | 2026 | if (oldpage->mapping) { |
| 1449 | target = oldpage; | 2027 | target = oldpage; |
| @@ -1483,39 +2061,37 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, | |||
| 1483 | */ | 2061 | */ |
| 1484 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2062 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) |
| 1485 | mem_cgroup_uncharge_page(target); | 2063 | mem_cgroup_uncharge_page(target); |
| 2064 | /* | ||
| 2065 | * At migration, we may charge against a cgroup which has no tasks. | ||
| 2066 | * So, rmdir()->pre_destroy() can be called while we do this charge. | ||
| 2067 | * In that case, we need to call pre_destroy() again. Check it here. | ||
| 2068 | */ | ||
| 2069 | cgroup_release_and_wakeup_rmdir(&mem->css); | ||
| 1486 | } | 2070 | } |
| 1487 | 2071 | ||
| 1488 | /* | 2072 | /* |
| 1489 | * A call to try to shrink memory usage under specified resource controller. | 2073 | * A call to try to shrink memory usage on charge failure at shmem's swapin. |
| 1490 | * This is typically used for page reclaiming for shmem for reducing side | 2074 | * Calling hierarchical_reclaim is not enough because we should update |
| 1491 | * effect of page allocation from shmem, which is used by some mem_cgroup. | 2075 | * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM. |
| 2076 | * Moreover, considering the hierarchy, we should reclaim from the | ||
| 2077 | * mem_over_limit, not from the memcg this page would be charged to. | ||
| 2078 | * try_charge_swapin does all of this work properly. | ||
| 1492 | */ | 2079 | */ |
| 1493 | int mem_cgroup_shrink_usage(struct page *page, | 2080 | int mem_cgroup_shmem_charge_fallback(struct page *page, |
| 1494 | struct mm_struct *mm, | 2081 | struct mm_struct *mm, |
| 1495 | gfp_t gfp_mask) | 2082 | gfp_t gfp_mask) |
| 1496 | { | 2083 | { |
| 1497 | struct mem_cgroup *mem = NULL; | 2084 | struct mem_cgroup *mem = NULL; |
| 1498 | int progress = 0; | 2085 | int ret; |
| 1499 | int retry = MEM_CGROUP_RECLAIM_RETRIES; | ||
| 1500 | 2086 | ||
| 1501 | if (mem_cgroup_disabled()) | 2087 | if (mem_cgroup_disabled()) |
| 1502 | return 0; | 2088 | return 0; |
| 1503 | if (page) | ||
| 1504 | mem = try_get_mem_cgroup_from_swapcache(page); | ||
| 1505 | if (!mem && mm) | ||
| 1506 | mem = try_get_mem_cgroup_from_mm(mm); | ||
| 1507 | if (unlikely(!mem)) | ||
| 1508 | return 0; | ||
| 1509 | 2089 | ||
| 1510 | do { | 2090 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); |
| 1511 | progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); | 2091 | if (!ret) |
| 1512 | progress += mem_cgroup_check_under_limit(mem); | 2092 | mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */ |
| 1513 | } while (!progress && --retry); | ||
| 1514 | 2093 | ||
| 1515 | css_put(&mem->css); | 2094 | return ret; |
| 1516 | if (!retry) | ||
| 1517 | return -ENOMEM; | ||
| 1518 | return 0; | ||
| 1519 | } | 2095 | } |
| 1520 | 2096 | ||
| 1521 | static DEFINE_MUTEX(set_limit_mutex); | 2097 | static DEFINE_MUTEX(set_limit_mutex); |
| @@ -1523,11 +2099,21 @@ static DEFINE_MUTEX(set_limit_mutex); | |||
| 1523 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 2099 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
| 1524 | unsigned long long val) | 2100 | unsigned long long val) |
| 1525 | { | 2101 | { |
| 1526 | 2102 | int retry_count; | |
| 1527 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | ||
| 1528 | int progress; | 2103 | int progress; |
| 1529 | u64 memswlimit; | 2104 | u64 memswlimit; |
| 1530 | int ret = 0; | 2105 | int ret = 0; |
| 2106 | int children = mem_cgroup_count_children(memcg); | ||
| 2107 | u64 curusage, oldusage; | ||
| 2108 | |||
| 2109 | /* | ||
| 2110 | * For keeping hierarchical_reclaim simple, how long we should retry | ||
| 2111 | * is depends on callers. We set our retry-count to be function | ||
| 2112 | * of # of children which we should visit in this loop. | ||
| 2113 | */ | ||
| 2114 | retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; | ||
| 2115 | |||
| 2116 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | ||
| 1531 | 2117 | ||
| 1532 | while (retry_count) { | 2118 | while (retry_count) { |
| 1533 | if (signal_pending(current)) { | 2119 | if (signal_pending(current)) { |
| @@ -1547,29 +2133,42 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 1547 | break; | 2133 | break; |
| 1548 | } | 2134 | } |
| 1549 | ret = res_counter_set_limit(&memcg->res, val); | 2135 | ret = res_counter_set_limit(&memcg->res, val); |
| 2136 | if (!ret) { | ||
| 2137 | if (memswlimit == val) | ||
| 2138 | memcg->memsw_is_minimum = true; | ||
| 2139 | else | ||
| 2140 | memcg->memsw_is_minimum = false; | ||
| 2141 | } | ||
| 1550 | mutex_unlock(&set_limit_mutex); | 2142 | mutex_unlock(&set_limit_mutex); |
| 1551 | 2143 | ||
| 1552 | if (!ret) | 2144 | if (!ret) |
| 1553 | break; | 2145 | break; |
| 1554 | 2146 | ||
| 1555 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, | 2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, |
| 1556 | false); | 2148 | GFP_KERNEL, |
| 1557 | if (!progress) retry_count--; | 2149 | MEM_CGROUP_RECLAIM_SHRINK); |
| 2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | ||
| 2151 | /* Usage is reduced ? */ | ||
| 2152 | if (curusage >= oldusage) | ||
| 2153 | retry_count--; | ||
| 2154 | else | ||
| 2155 | oldusage = curusage; | ||
| 1558 | } | 2156 | } |
| 1559 | 2157 | ||
| 1560 | return ret; | 2158 | return ret; |
| 1561 | } | 2159 | } |
| 1562 | 2160 | ||
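As a concrete illustration of the retry budget above (assuming MEM_CGROUP_RECLAIM_RETRIES keeps its historical value of 5): resizing the limit of a memcg whose sub-hierarchy contains four cgroups starts with retry_count = 5 * 4 = 20, each reclaim pass that fails to reduce usage consumes one retry, and when the budget runs out the last res_counter_set_limit() result is returned to the caller.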
| 1563 | int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | 2161 | static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, |
| 1564 | unsigned long long val) | 2162 | unsigned long long val) |
| 1565 | { | 2163 | { |
| 1566 | int retry_count = MEM_CGROUP_RECLAIM_RETRIES; | 2164 | int retry_count; |
| 1567 | u64 memlimit, oldusage, curusage; | 2165 | u64 memlimit, oldusage, curusage; |
| 1568 | int ret; | 2166 | int children = mem_cgroup_count_children(memcg); |
| 1569 | 2167 | int ret = -EBUSY; | |
| 1570 | if (!do_swap_account) | ||
| 1571 | return -EINVAL; | ||
| 1572 | 2168 | ||
| 2169 | /* see mem_cgroup_resize_res_limit */ | ||
| 2170 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | ||
| 2171 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | ||
| 1573 | while (retry_count) { | 2172 | while (retry_count) { |
| 1574 | if (signal_pending(current)) { | 2173 | if (signal_pending(current)) { |
| 1575 | ret = -EINTR; | 2174 | ret = -EINTR; |
| @@ -1588,20 +2187,121 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 1588 | break; | 2187 | break; |
| 1589 | } | 2188 | } |
| 1590 | ret = res_counter_set_limit(&memcg->memsw, val); | 2189 | ret = res_counter_set_limit(&memcg->memsw, val); |
| 2190 | if (!ret) { | ||
| 2191 | if (memlimit == val) | ||
| 2192 | memcg->memsw_is_minimum = true; | ||
| 2193 | else | ||
| 2194 | memcg->memsw_is_minimum = false; | ||
| 2195 | } | ||
| 1591 | mutex_unlock(&set_limit_mutex); | 2196 | mutex_unlock(&set_limit_mutex); |
| 1592 | 2197 | ||
| 1593 | if (!ret) | 2198 | if (!ret) |
| 1594 | break; | 2199 | break; |
| 1595 | 2200 | ||
| 1596 | oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 2201 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
| 1597 | mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true); | 2202 | MEM_CGROUP_RECLAIM_NOSWAP | |
| 2203 | MEM_CGROUP_RECLAIM_SHRINK); | ||
| 1598 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 2204 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
| 2205 | /* Usage is reduced ? */ | ||
| 1599 | if (curusage >= oldusage) | 2206 | if (curusage >= oldusage) |
| 1600 | retry_count--; | 2207 | retry_count--; |
| 2208 | else | ||
| 2209 | oldusage = curusage; | ||
| 1601 | } | 2210 | } |
| 1602 | return ret; | 2211 | return ret; |
| 1603 | } | 2212 | } |
| 1604 | 2213 | ||
| 2214 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
| 2215 | gfp_t gfp_mask, int nid, | ||
| 2216 | int zid) | ||
| 2217 | { | ||
| 2218 | unsigned long nr_reclaimed = 0; | ||
| 2219 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
| 2220 | unsigned long reclaimed; | ||
| 2221 | int loop = 0; | ||
| 2222 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 2223 | unsigned long long excess; | ||
| 2224 | |||
| 2225 | if (order > 0) | ||
| 2226 | return 0; | ||
| 2227 | |||
| 2228 | mctz = soft_limit_tree_node_zone(nid, zid); | ||
| 2229 | /* | ||
| 2230 | * This loop can run for a while, especially if mem_cgroups continuously | ||
| 2231 | * keep exceeding their soft limit and putting the system under | ||
| 2232 | * pressure. | ||
| 2233 | */ | ||
| 2234 | do { | ||
| 2235 | if (next_mz) | ||
| 2236 | mz = next_mz; | ||
| 2237 | else | ||
| 2238 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
| 2239 | if (!mz) | ||
| 2240 | break; | ||
| 2241 | |||
| 2242 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | ||
| 2243 | gfp_mask, | ||
| 2244 | MEM_CGROUP_RECLAIM_SOFT); | ||
| 2245 | nr_reclaimed += reclaimed; | ||
| 2246 | spin_lock(&mctz->lock); | ||
| 2247 | |||
| 2248 | /* | ||
| 2249 | * If we failed to reclaim anything from this memory cgroup | ||
| 2250 | * it is time to move on to the next cgroup | ||
| 2251 | */ | ||
| 2252 | next_mz = NULL; | ||
| 2253 | if (!reclaimed) { | ||
| 2254 | do { | ||
| 2255 | /* | ||
| 2256 | * Loop until we find yet another one. | ||
| 2257 | * | ||
| 2258 | * By the time we get the soft_limit lock | ||
| 2259 | * again, someone might have added the | ||
| 2260 | * group back on the RB tree. Iterate to | ||
| 2261 | * make sure we get a different mem. | ||
| 2262 | * mem_cgroup_largest_soft_limit_node returns | ||
| 2263 | * NULL if no other cgroup is present on | ||
| 2264 | * the tree | ||
| 2265 | */ | ||
| 2266 | next_mz = | ||
| 2267 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 2268 | if (next_mz == mz) { | ||
| 2269 | css_put(&next_mz->mem->css); | ||
| 2270 | next_mz = NULL; | ||
| 2271 | } else /* next_mz == NULL or other memcg */ | ||
| 2272 | break; | ||
| 2273 | } while (1); | ||
| 2274 | } | ||
| 2275 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
| 2276 | excess = res_counter_soft_limit_excess(&mz->mem->res); | ||
| 2277 | /* | ||
| 2278 | * One school of thought says that we should not add | ||
| 2279 | * back the node to the tree if reclaim returns 0. | ||
| 2280 | * But our reclaim could return 0 simply because, due | ||
| 2281 | * to priority, we are exposing a smaller subset of | ||
| 2282 | * memory to reclaim from. Consider this as a longer | ||
| 2283 | * term TODO. | ||
| 2284 | */ | ||
| 2285 | /* If excess == 0, no tree ops */ | ||
| 2286 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); | ||
| 2287 | spin_unlock(&mctz->lock); | ||
| 2288 | css_put(&mz->mem->css); | ||
| 2289 | loop++; | ||
| 2290 | /* | ||
| 2291 | * Could not reclaim anything and there are no more | ||
| 2292 | * mem cgroups to try or we seem to be looping without | ||
| 2293 | * reclaiming anything. | ||
| 2294 | */ | ||
| 2295 | if (!nr_reclaimed && | ||
| 2296 | (next_mz == NULL || | ||
| 2297 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
| 2298 | break; | ||
| 2299 | } while (!nr_reclaimed); | ||
| 2300 | if (next_mz) | ||
| 2301 | css_put(&next_mz->mem->css); | ||
| 2302 | return nr_reclaimed; | ||
| 2303 | } | ||
| 2304 | |||
| 1605 | /* | 2305 | /* |
| 1606 | * This routine traverse page_cgroup in given list and drop them all. | 2306 | * This routine traverse page_cgroup in given list and drop them all. |
| 1607 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 2307 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
| @@ -1730,7 +2430,7 @@ try_to_free: | |||
| 1730 | if (!progress) { | 2430 | if (!progress) { |
| 1731 | nr_retries--; | 2431 | nr_retries--; |
| 1732 | /* maybe some writeback is necessary */ | 2432 | /* maybe some writeback is necessary */ |
| 1733 | congestion_wait(WRITE, HZ/10); | 2433 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 1734 | } | 2434 | } |
| 1735 | 2435 | ||
| 1736 | } | 2436 | } |
| @@ -1786,20 +2486,63 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
| 1786 | return retval; | 2486 | return retval; |
| 1787 | } | 2487 | } |
| 1788 | 2488 | ||
| 2489 | struct mem_cgroup_idx_data { | ||
| 2490 | s64 val; | ||
| 2491 | enum mem_cgroup_stat_index idx; | ||
| 2492 | }; | ||
| 2493 | |||
| 2494 | static int | ||
| 2495 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | ||
| 2496 | { | ||
| 2497 | struct mem_cgroup_idx_data *d = data; | ||
| 2498 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | ||
| 2499 | return 0; | ||
| 2500 | } | ||
| 2501 | |||
| 2502 | static void | ||
| 2503 | mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | ||
| 2504 | enum mem_cgroup_stat_index idx, s64 *val) | ||
| 2505 | { | ||
| 2506 | struct mem_cgroup_idx_data d; | ||
| 2507 | d.idx = idx; | ||
| 2508 | d.val = 0; | ||
| 2509 | mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); | ||
| 2510 | *val = d.val; | ||
| 2511 | } | ||
| 2512 | |||
| 1789 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2513 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
| 1790 | { | 2514 | { |
| 1791 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2515 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
| 1792 | u64 val = 0; | 2516 | u64 idx_val, val; |
| 1793 | int type, name; | 2517 | int type, name; |
| 1794 | 2518 | ||
| 1795 | type = MEMFILE_TYPE(cft->private); | 2519 | type = MEMFILE_TYPE(cft->private); |
| 1796 | name = MEMFILE_ATTR(cft->private); | 2520 | name = MEMFILE_ATTR(cft->private); |
| 1797 | switch (type) { | 2521 | switch (type) { |
| 1798 | case _MEM: | 2522 | case _MEM: |
| 1799 | val = res_counter_read_u64(&mem->res, name); | 2523 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
| 2524 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2525 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
| 2526 | val = idx_val; | ||
| 2527 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2528 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 2529 | val += idx_val; | ||
| 2530 | val <<= PAGE_SHIFT; | ||
| 2531 | } else | ||
| 2532 | val = res_counter_read_u64(&mem->res, name); | ||
| 1800 | break; | 2533 | break; |
| 1801 | case _MEMSWAP: | 2534 | case _MEMSWAP: |
| 1802 | if (do_swap_account) | 2535 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
| 2536 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2537 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
| 2538 | val = idx_val; | ||
| 2539 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2540 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 2541 | val += idx_val; | ||
| 2542 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
| 2544 | val <<= PAGE_SHIFT; | ||
| 2545 | } else | ||
| 1803 | val = res_counter_read_u64(&mem->memsw, name); | 2546 | val = res_counter_read_u64(&mem->memsw, name); |
| 1804 | break; | 2547 | break; |
| 1805 | default: | 2548 | default: |
| @@ -1824,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
| 1824 | name = MEMFILE_ATTR(cft->private); | 2567 | name = MEMFILE_ATTR(cft->private); |
| 1825 | switch (name) { | 2568 | switch (name) { |
| 1826 | case RES_LIMIT: | 2569 | case RES_LIMIT: |
| 2570 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | ||
| 2571 | ret = -EINVAL; | ||
| 2572 | break; | ||
| 2573 | } | ||
| 1827 | /* This function does all necessary parse...reuse it */ | 2574 | /* This function does all necessary parse...reuse it */ |
| 1828 | ret = res_counter_memparse_write_strategy(buffer, &val); | 2575 | ret = res_counter_memparse_write_strategy(buffer, &val); |
| 1829 | if (ret) | 2576 | if (ret) |
| @@ -1833,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
| 1833 | else | 2580 | else |
| 1834 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 2581 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
| 1835 | break; | 2582 | break; |
| 2583 | case RES_SOFT_LIMIT: | ||
| 2584 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
| 2585 | if (ret) | ||
| 2586 | break; | ||
| 2587 | /* | ||
| 2588 | * For memsw, soft limits are hard to implement in terms | ||
| 2589 | * of semantics. For now, we support soft limits only for | ||
| 2590 | * memory control without swap. | ||
| 2591 | */ | ||
| 2592 | if (type == _MEM) | ||
| 2593 | ret = res_counter_set_soft_limit(&memcg->res, val); | ||
| 2594 | else | ||
| 2595 | ret = -EINVAL; | ||
| 2596 | break; | ||
| 1836 | default: | 2597 | default: |
| 1837 | ret = -EINVAL; /* should be BUG() ? */ | 2598 | ret = -EINVAL; /* should be BUG() ? */ |
| 1838 | break; | 2599 | break; |
| @@ -1890,57 +2651,107 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
| 1890 | res_counter_reset_failcnt(&mem->memsw); | 2651 | res_counter_reset_failcnt(&mem->memsw); |
| 1891 | break; | 2652 | break; |
| 1892 | } | 2653 | } |
| 2654 | |||
| 1893 | return 0; | 2655 | return 0; |
| 1894 | } | 2656 | } |
| 1895 | 2657 | ||
| 1896 | static const struct mem_cgroup_stat_desc { | 2658 | |
| 1897 | const char *msg; | 2659 | /* For read statistics */ |
| 1898 | u64 unit; | 2660 | enum { |
| 1899 | } mem_cgroup_stat_desc[] = { | 2661 | MCS_CACHE, |
| 1900 | [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, | 2662 | MCS_RSS, |
| 1901 | [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, | 2663 | MCS_MAPPED_FILE, |
| 1902 | [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, | 2664 | MCS_PGPGIN, |
| 1903 | [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, | 2665 | MCS_PGPGOUT, |
| 2666 | MCS_SWAP, | ||
| 2667 | MCS_INACTIVE_ANON, | ||
| 2668 | MCS_ACTIVE_ANON, | ||
| 2669 | MCS_INACTIVE_FILE, | ||
| 2670 | MCS_ACTIVE_FILE, | ||
| 2671 | MCS_UNEVICTABLE, | ||
| 2672 | NR_MCS_STAT, | ||
| 2673 | }; | ||
| 2674 | |||
| 2675 | struct mcs_total_stat { | ||
| 2676 | s64 stat[NR_MCS_STAT]; | ||
| 2677 | }; | ||
| 2678 | |||
| 2679 | struct { | ||
| 2680 | char *local_name; | ||
| 2681 | char *total_name; | ||
| 2682 | } memcg_stat_strings[NR_MCS_STAT] = { | ||
| 2683 | {"cache", "total_cache"}, | ||
| 2684 | {"rss", "total_rss"}, | ||
| 2685 | {"mapped_file", "total_mapped_file"}, | ||
| 2686 | {"pgpgin", "total_pgpgin"}, | ||
| 2687 | {"pgpgout", "total_pgpgout"}, | ||
| 2688 | {"swap", "total_swap"}, | ||
| 2689 | {"inactive_anon", "total_inactive_anon"}, | ||
| 2690 | {"active_anon", "total_active_anon"}, | ||
| 2691 | {"inactive_file", "total_inactive_file"}, | ||
| 2692 | {"active_file", "total_active_file"}, | ||
| 2693 | {"unevictable", "total_unevictable"} | ||
| 1904 | }; | 2694 | }; |
| 1905 | 2695 | ||
| 2696 | |||
| 2697 | static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | ||
| 2698 | { | ||
| 2699 | struct mcs_total_stat *s = data; | ||
| 2700 | s64 val; | ||
| 2701 | |||
| 2702 | /* per cpu stat */ | ||
| 2703 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | ||
| 2704 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | ||
| 2705 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | ||
| 2706 | s->stat[MCS_RSS] += val * PAGE_SIZE; | ||
| 2707 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | ||
| 2708 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | ||
| 2709 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | ||
| 2710 | s->stat[MCS_PGPGIN] += val; | ||
| 2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | ||
| 2712 | s->stat[MCS_PGPGOUT] += val; | ||
| 2713 | if (do_swap_account) { | ||
| 2714 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | ||
| 2715 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | ||
| 2716 | } | ||
| 2717 | |||
| 2718 | /* per zone stat */ | ||
| 2719 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | ||
| 2720 | s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; | ||
| 2721 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON); | ||
| 2722 | s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; | ||
| 2723 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE); | ||
| 2724 | s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; | ||
| 2725 | val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE); | ||
| 2726 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | ||
| 2727 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); | ||
| 2728 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | ||
| 2729 | return 0; | ||
| 2730 | } | ||
| 2731 | |||
| 2732 | static void | ||
| 2733 | mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | ||
| 2734 | { | ||
| 2735 | mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); | ||
| 2736 | } | ||
| 2737 | |||
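For instance (hypothetical numbers): if a parent memcg locally holds 4 MB of page cache and each of its two children holds 2 MB, mem_cgroup_get_local_stat() yields 4 MB for the parent's "cache" line, while mem_cgroup_get_total_stat(), which walks the whole subtree, yields 8 MB for "total_cache".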
| 1906 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 2738 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
| 1907 | struct cgroup_map_cb *cb) | 2739 | struct cgroup_map_cb *cb) |
| 1908 | { | 2740 | { |
| 1909 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); | 2741 | struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); |
| 1910 | struct mem_cgroup_stat *stat = &mem_cont->stat; | 2742 | struct mcs_total_stat mystat; |
| 1911 | int i; | 2743 | int i; |
| 1912 | 2744 | ||
| 1913 | for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { | 2745 | memset(&mystat, 0, sizeof(mystat)); |
| 1914 | s64 val; | 2746 | mem_cgroup_get_local_stat(mem_cont, &mystat); |
| 1915 | 2747 | ||
| 1916 | val = mem_cgroup_read_stat(stat, i); | 2748 | for (i = 0; i < NR_MCS_STAT; i++) { |
| 1917 | val *= mem_cgroup_stat_desc[i].unit; | 2749 | if (i == MCS_SWAP && !do_swap_account) |
| 1918 | cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); | 2750 | continue; |
| 2751 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | ||
| 1919 | } | 2752 | } |
| 1920 | /* showing # of active pages */ | ||
| 1921 | { | ||
| 1922 | unsigned long active_anon, inactive_anon; | ||
| 1923 | unsigned long active_file, inactive_file; | ||
| 1924 | unsigned long unevictable; | ||
| 1925 | |||
| 1926 | inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 1927 | LRU_INACTIVE_ANON); | ||
| 1928 | active_anon = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 1929 | LRU_ACTIVE_ANON); | ||
| 1930 | inactive_file = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 1931 | LRU_INACTIVE_FILE); | ||
| 1932 | active_file = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 1933 | LRU_ACTIVE_FILE); | ||
| 1934 | unevictable = mem_cgroup_get_all_zonestat(mem_cont, | ||
| 1935 | LRU_UNEVICTABLE); | ||
| 1936 | |||
| 1937 | cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); | ||
| 1938 | cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); | ||
| 1939 | cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); | ||
| 1940 | cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); | ||
| 1941 | cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); | ||
| 1942 | 2753 | ||
| 1943 | } | 2754 | /* Hierarchical information */ |
| 1944 | { | 2755 | { |
| 1945 | unsigned long long limit, memsw_limit; | 2756 | unsigned long long limit, memsw_limit; |
| 1946 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); | 2757 | memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); |
| @@ -1949,6 +2760,14 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 1949 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); | 2760 | cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); |
| 1950 | } | 2761 | } |
| 1951 | 2762 | ||
| 2763 | memset(&mystat, 0, sizeof(mystat)); | ||
| 2764 | mem_cgroup_get_total_stat(mem_cont, &mystat); | ||
| 2765 | for (i = 0; i < NR_MCS_STAT; i++) { | ||
| 2766 | if (i == MCS_SWAP && !do_swap_account) | ||
| 2767 | continue; | ||
| 2768 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | ||
| 2769 | } | ||
| 2770 | |||
| 1952 | #ifdef CONFIG_DEBUG_VM | 2771 | #ifdef CONFIG_DEBUG_VM |
| 1953 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | 2772 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); |
| 1954 | 2773 | ||
| @@ -2040,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = { | |||
| 2040 | .read_u64 = mem_cgroup_read, | 2859 | .read_u64 = mem_cgroup_read, |
| 2041 | }, | 2860 | }, |
| 2042 | { | 2861 | { |
| 2862 | .name = "soft_limit_in_bytes", | ||
| 2863 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | ||
| 2864 | .write_string = mem_cgroup_write, | ||
| 2865 | .read_u64 = mem_cgroup_read, | ||
| 2866 | }, | ||
| 2867 | { | ||
| 2043 | .name = "failcnt", | 2868 | .name = "failcnt", |
| 2044 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 2869 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
| 2045 | .trigger = mem_cgroup_reset, | 2870 | .trigger = mem_cgroup_reset, |
| @@ -2133,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
| 2133 | mz = &pn->zoneinfo[zone]; | 2958 | mz = &pn->zoneinfo[zone]; |
| 2134 | for_each_lru(l) | 2959 | for_each_lru(l) |
| 2135 | INIT_LIST_HEAD(&mz->lists[l]); | 2960 | INIT_LIST_HEAD(&mz->lists[l]); |
| 2961 | mz->usage_in_excess = 0; | ||
| 2962 | mz->on_tree = false; | ||
| 2963 | mz->mem = mem; | ||
| 2136 | } | 2964 | } |
| 2137 | return 0; | 2965 | return 0; |
| 2138 | } | 2966 | } |
| @@ -2178,6 +3006,9 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
| 2178 | { | 3006 | { |
| 2179 | int node; | 3007 | int node; |
| 2180 | 3008 | ||
| 3009 | mem_cgroup_remove_from_trees(mem); | ||
| 3010 | free_css_id(&mem_cgroup_subsys, &mem->css); | ||
| 3011 | |||
| 2181 | for_each_node_state(node, N_POSSIBLE) | 3012 | for_each_node_state(node, N_POSSIBLE) |
| 2182 | free_mem_cgroup_per_zone_info(mem, node); | 3013 | free_mem_cgroup_per_zone_info(mem, node); |
| 2183 | 3014 | ||
| @@ -2224,23 +3055,54 @@ static void __init enable_swap_cgroup(void) | |||
| 2224 | } | 3055 | } |
| 2225 | #endif | 3056 | #endif |
| 2226 | 3057 | ||
| 3058 | static int mem_cgroup_soft_limit_tree_init(void) | ||
| 3059 | { | ||
| 3060 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 3061 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 3062 | int tmp, node, zone; | ||
| 3063 | |||
| 3064 | for_each_node_state(node, N_POSSIBLE) { | ||
| 3065 | tmp = node; | ||
| 3066 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
| 3067 | tmp = -1; | ||
| 3068 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
| 3069 | if (!rtpn) | ||
| 3070 | return 1; | ||
| 3071 | |||
| 3072 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 3073 | |||
| 3074 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 3075 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 3076 | rtpz->rb_root = RB_ROOT; | ||
| 3077 | spin_lock_init(&rtpz->lock); | ||
| 3078 | } | ||
| 3079 | } | ||
| 3080 | return 0; | ||
| 3081 | } | ||
| 3082 | |||
| 2227 | static struct cgroup_subsys_state * __ref | 3083 | static struct cgroup_subsys_state * __ref |
| 2228 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 3084 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
| 2229 | { | 3085 | { |
| 2230 | struct mem_cgroup *mem, *parent; | 3086 | struct mem_cgroup *mem, *parent; |
| 3087 | long error = -ENOMEM; | ||
| 2231 | int node; | 3088 | int node; |
| 2232 | 3089 | ||
| 2233 | mem = mem_cgroup_alloc(); | 3090 | mem = mem_cgroup_alloc(); |
| 2234 | if (!mem) | 3091 | if (!mem) |
| 2235 | return ERR_PTR(-ENOMEM); | 3092 | return ERR_PTR(error); |
| 2236 | 3093 | ||
| 2237 | for_each_node_state(node, N_POSSIBLE) | 3094 | for_each_node_state(node, N_POSSIBLE) |
| 2238 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 3095 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
| 2239 | goto free_out; | 3096 | goto free_out; |
| 3097 | |||
| 2240 | /* root ? */ | 3098 | /* root ? */ |
| 2241 | if (cont->parent == NULL) { | 3099 | if (cont->parent == NULL) { |
| 2242 | enable_swap_cgroup(); | 3100 | enable_swap_cgroup(); |
| 2243 | parent = NULL; | 3101 | parent = NULL; |
| 3102 | root_mem_cgroup = mem; | ||
| 3103 | if (mem_cgroup_soft_limit_tree_init()) | ||
| 3104 | goto free_out; | ||
| 3105 | |||
| 2244 | } else { | 3106 | } else { |
| 2245 | parent = mem_cgroup_from_cont(cont->parent); | 3107 | parent = mem_cgroup_from_cont(cont->parent); |
| 2246 | mem->use_hierarchy = parent->use_hierarchy; | 3108 | mem->use_hierarchy = parent->use_hierarchy; |
| @@ -2260,7 +3122,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 2260 | res_counter_init(&mem->res, NULL); | 3122 | res_counter_init(&mem->res, NULL); |
| 2261 | res_counter_init(&mem->memsw, NULL); | 3123 | res_counter_init(&mem->memsw, NULL); |
| 2262 | } | 3124 | } |
| 2263 | mem->last_scanned_child = NULL; | 3125 | mem->last_scanned_child = 0; |
| 2264 | spin_lock_init(&mem->reclaim_param_lock); | 3126 | spin_lock_init(&mem->reclaim_param_lock); |
| 2265 | 3127 | ||
| 2266 | if (parent) | 3128 | if (parent) |
| @@ -2269,26 +3131,23 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 2269 | return &mem->css; | 3131 | return &mem->css; |
| 2270 | free_out: | 3132 | free_out: |
| 2271 | __mem_cgroup_free(mem); | 3133 | __mem_cgroup_free(mem); |
| 2272 | return ERR_PTR(-ENOMEM); | 3134 | root_mem_cgroup = NULL; |
| 3135 | return ERR_PTR(error); | ||
| 2273 | } | 3136 | } |
| 2274 | 3137 | ||
| 2275 | static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, | 3138 | static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, |
| 2276 | struct cgroup *cont) | 3139 | struct cgroup *cont) |
| 2277 | { | 3140 | { |
| 2278 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3141 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
| 2279 | mem_cgroup_force_empty(mem, false); | 3142 | |
| 3143 | return mem_cgroup_force_empty(mem, false); | ||
| 2280 | } | 3144 | } |
| 2281 | 3145 | ||
| 2282 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, | 3146 | static void mem_cgroup_destroy(struct cgroup_subsys *ss, |
| 2283 | struct cgroup *cont) | 3147 | struct cgroup *cont) |
| 2284 | { | 3148 | { |
| 2285 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 3149 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
| 2286 | struct mem_cgroup *last_scanned_child = mem->last_scanned_child; | ||
| 2287 | 3150 | ||
| 2288 | if (last_scanned_child) { | ||
| 2289 | VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); | ||
| 2290 | mem_cgroup_put(last_scanned_child); | ||
| 2291 | } | ||
| 2292 | mem_cgroup_put(mem); | 3151 | mem_cgroup_put(mem); |
| 2293 | } | 3152 | } |
| 2294 | 3153 | ||
| @@ -2308,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
| 2308 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3167 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
| 2309 | struct cgroup *cont, | 3168 | struct cgroup *cont, |
| 2310 | struct cgroup *old_cont, | 3169 | struct cgroup *old_cont, |
| 2311 | struct task_struct *p) | 3170 | struct task_struct *p, |
| 3171 | bool threadgroup) | ||
| 2312 | { | 3172 | { |
| 2313 | mutex_lock(&memcg_tasklist); | 3173 | mutex_lock(&memcg_tasklist); |
| 2314 | /* | 3174 | /* |
| @@ -2327,6 +3187,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
| 2327 | .populate = mem_cgroup_populate, | 3187 | .populate = mem_cgroup_populate, |
| 2328 | .attach = mem_cgroup_move_task, | 3188 | .attach = mem_cgroup_move_task, |
| 2329 | .early_init = 0, | 3189 | .early_init = 0, |
| 3190 | .use_id = 1, | ||
| 2330 | }; | 3191 | }; |
| 2331 | 3192 | ||
| 2332 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3193 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 000000000000..dacc64183874 --- /dev/null +++ b/mm/memory-failure.c | |||
| @@ -0,0 +1,835 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2008, 2009 Intel Corporation | ||
| 3 | * Authors: Andi Kleen, Fengguang Wu | ||
| 4 | * | ||
| 5 | * This software may be redistributed and/or modified under the terms of | ||
| 6 | * the GNU General Public License ("GPL") version 2 only as published by the | ||
| 7 | * Free Software Foundation. | ||
| 8 | * | ||
| 9 | * High level machine check handler. Handles pages reported by the | ||
| 10 | * hardware as being corrupted usually due to a 2bit ECC memory or cache | ||
| 11 | * failure. | ||
| 12 | * | ||
| 13 | * Handles page cache pages in various states. The tricky part | ||
| 14 | * here is that we can access any page asynchronously with respect to other VM | ||
| 15 | * users, because memory failures could happen anytime and anywhere, | ||
| 16 | * possibly violating some of their assumptions. This is why this code | ||
| 17 | * has to be extremely careful. Generally it tries to use normal locking | ||
| 18 | * rules, as in get the standard locks, even if that means the | ||
| 19 | * error handling takes potentially a long time. | ||
| 20 | * | ||
| 21 | * The operation to map back from RMAP chains to processes has to walk | ||
| 22 | * the complete process list and has non-linear complexity in the number of | ||
| 23 | * mappings. In short, it can be quite slow. But since memory corruptions | ||
| 24 | * are rare we hope to get away with this. | ||
| 25 | */ | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Notebook: | ||
| 29 | * - hugetlb needs more code | ||
| 30 | * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages | ||
| 31 | * - pass bad pages to kdump next kernel | ||
| 32 | */ | ||
| 33 | #define DEBUG 1 /* remove me in 2.6.34 */ | ||
| 34 | #include <linux/kernel.h> | ||
| 35 | #include <linux/mm.h> | ||
| 36 | #include <linux/page-flags.h> | ||
| 37 | #include <linux/sched.h> | ||
| 38 | #include <linux/ksm.h> | ||
| 39 | #include <linux/rmap.h> | ||
| 40 | #include <linux/pagemap.h> | ||
| 41 | #include <linux/swap.h> | ||
| 42 | #include <linux/backing-dev.h> | ||
| 43 | #include "internal.h" | ||
| 44 | |||
| 45 | int sysctl_memory_failure_early_kill __read_mostly = 0; | ||
| 46 | |||
| 47 | int sysctl_memory_failure_recovery __read_mostly = 1; | ||
| 48 | |||
| 49 | atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); | ||
| 50 | |||
| 51 | /* | ||
| 52 | * Send all the processes that have the page mapped an ``action optional'' | ||
| 53 | * signal. | ||
| 54 | */ | ||
| 55 | static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, | ||
| 56 | unsigned long pfn) | ||
| 57 | { | ||
| 58 | struct siginfo si; | ||
| 59 | int ret; | ||
| 60 | |||
| 61 | printk(KERN_ERR | ||
| 62 | "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", | ||
| 63 | pfn, t->comm, t->pid); | ||
| 64 | si.si_signo = SIGBUS; | ||
| 65 | si.si_errno = 0; | ||
| 66 | si.si_code = BUS_MCEERR_AO; | ||
| 67 | si.si_addr = (void *)addr; | ||
| 68 | #ifdef __ARCH_SI_TRAPNO | ||
| 69 | si.si_trapno = trapno; | ||
| 70 | #endif | ||
| 71 | si.si_addr_lsb = PAGE_SHIFT; | ||
| 72 | /* | ||
| 73 | * Don't use force here, it's convenient if the signal | ||
| 74 | * can be temporarily blocked. | ||
| 75 | * This could cause a loop when the user sets SIGBUS | ||
| 76 | * to SIG_IGN, but hopefully no one will do that? | ||
| 77 | */ | ||
| 78 | ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ | ||
| 79 | if (ret < 0) | ||
| 80 | printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", | ||
| 81 | t->comm, t->pid, ret); | ||
| 82 | return ret; | ||
| 83 | } | ||
| 84 | |||
| 85 | /* | ||
| 86 | * Kill all processes that have a poisoned page mapped and then isolate | ||
| 87 | * the page. | ||
| 88 | * | ||
| 89 | * General strategy: | ||
| 90 | * Find all processes having the page mapped and kill them. | ||
| 91 | * But we keep a page reference around so that the page is not | ||
| 92 | * actually freed yet. | ||
| 93 | * Then stash the page away | ||
| 94 | * | ||
| 95 | * There's no convenient way to get back to mapped processes | ||
| 96 | * from the VMAs. So do a brute-force search over all | ||
| 97 | * running processes. | ||
| 98 | * | ||
| 99 | * Remember that machine checks are not common (or rather | ||
| 100 | * if they are common you have other problems), so this shouldn't | ||
| 101 | * be a performance issue. | ||
| 102 | * | ||
| 103 | * Also there are some races possible while we get from the | ||
| 104 | * error detection to actually handle it. | ||
| 105 | */ | ||
| 106 | |||
| 107 | struct to_kill { | ||
| 108 | struct list_head nd; | ||
| 109 | struct task_struct *tsk; | ||
| 110 | unsigned long addr; | ||
| 111 | unsigned addr_valid:1; | ||
| 112 | }; | ||
| 113 | |||
| 114 | /* | ||
| 115 | * Failure handling: if we can't find or can't kill a process there's | ||
| 116 | * not much we can do. We just print a message and ignore otherwise. | ||
| 117 | */ | ||
| 118 | |||
| 119 | /* | ||
| 120 | * Schedule a process for later kill. | ||
| 121 | * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. | ||
| 122 | * TBD would GFP_NOIO be enough? | ||
| 123 | */ | ||
| 124 | static void add_to_kill(struct task_struct *tsk, struct page *p, | ||
| 125 | struct vm_area_struct *vma, | ||
| 126 | struct list_head *to_kill, | ||
| 127 | struct to_kill **tkc) | ||
| 128 | { | ||
| 129 | struct to_kill *tk; | ||
| 130 | |||
| 131 | if (*tkc) { | ||
| 132 | tk = *tkc; | ||
| 133 | *tkc = NULL; | ||
| 134 | } else { | ||
| 135 | tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); | ||
| 136 | if (!tk) { | ||
| 137 | printk(KERN_ERR | ||
| 138 | "MCE: Out of memory while machine check handling\n"); | ||
| 139 | return; | ||
| 140 | } | ||
| 141 | } | ||
| 142 | tk->addr = page_address_in_vma(p, vma); | ||
| 143 | tk->addr_valid = 1; | ||
| 144 | |||
| 145 | /* | ||
| 146 | * In theory we don't have to kill when the page was | ||
| 147 | * munmapped. But it could also be an mremap. Since that's | ||
| 148 | * likely very rare, kill anyway just out of paranoia, but use | ||
| 149 | * a SIGKILL because the error is not contained anymore. | ||
| 150 | */ | ||
| 151 | if (tk->addr == -EFAULT) { | ||
| 152 | pr_debug("MCE: Unable to find user space address %lx in %s\n", | ||
| 153 | page_to_pfn(p), tsk->comm); | ||
| 154 | tk->addr_valid = 0; | ||
| 155 | } | ||
| 156 | get_task_struct(tsk); | ||
| 157 | tk->tsk = tsk; | ||
| 158 | list_add_tail(&tk->nd, to_kill); | ||
| 159 | } | ||
| 160 | |||
| 161 | /* | ||
| 162 | * Kill the processes that have been collected earlier. | ||
| 163 | * | ||
| 164 | * Only do anything when DOIT is set, otherwise just free the list | ||
| 165 | * (this is used for clean pages which do not need killing) | ||
| 166 | * Also when FAIL is set do a force kill because something went | ||
| 167 | * wrong earlier. | ||
| 168 | */ | ||
| 169 | static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, | ||
| 170 | int fail, unsigned long pfn) | ||
| 171 | { | ||
| 172 | struct to_kill *tk, *next; | ||
| 173 | |||
| 174 | list_for_each_entry_safe (tk, next, to_kill, nd) { | ||
| 175 | if (doit) { | ||
| 176 | /* | ||
| 177 | * In case something went wrong with munmapping, | ||
| 178 | * make sure the process doesn't catch the | ||
| 179 | * signal and then access the memory. Just kill it | ||
| 180 | * instead of relying on the signal handlers. | ||
| 181 | */ | ||
| 182 | if (fail || tk->addr_valid == 0) { | ||
| 183 | printk(KERN_ERR | ||
| 184 | "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", | ||
| 185 | pfn, tk->tsk->comm, tk->tsk->pid); | ||
| 186 | force_sig(SIGKILL, tk->tsk); | ||
| 187 | } | ||
| 188 | |||
| 189 | /* | ||
| 190 | * In theory the process could have mapped | ||
| 191 | * something else on the address in-between. We could | ||
| 192 | * check for that, but we need to tell the | ||
| 193 | * process anyways. | ||
| 194 | */ | ||
| 195 | else if (kill_proc_ao(tk->tsk, tk->addr, trapno, | ||
| 196 | pfn) < 0) | ||
| 197 | printk(KERN_ERR | ||
| 198 | "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", | ||
| 199 | pfn, tk->tsk->comm, tk->tsk->pid); | ||
| 200 | } | ||
| 201 | put_task_struct(tk->tsk); | ||
| 202 | kfree(tk); | ||
| 203 | } | ||
| 204 | } | ||
| 205 | |||
| 206 | static int task_early_kill(struct task_struct *tsk) | ||
| 207 | { | ||
| 208 | if (!tsk->mm) | ||
| 209 | return 0; | ||
| 210 | if (tsk->flags & PF_MCE_PROCESS) | ||
| 211 | return !!(tsk->flags & PF_MCE_EARLY); | ||
| 212 | return sysctl_memory_failure_early_kill; | ||
| 213 | } | ||
| 214 | |||
| 215 | /* | ||
| 216 | * Collect processes when the error hit an anonymous page. | ||
| 217 | */ | ||
| 218 | static void collect_procs_anon(struct page *page, struct list_head *to_kill, | ||
| 219 | struct to_kill **tkc) | ||
| 220 | { | ||
| 221 | struct vm_area_struct *vma; | ||
| 222 | struct task_struct *tsk; | ||
| 223 | struct anon_vma *av; | ||
| 224 | |||
| 225 | read_lock(&tasklist_lock); | ||
| 226 | av = page_lock_anon_vma(page); | ||
| 227 | if (av == NULL) /* Not actually mapped anymore */ | ||
| 228 | goto out; | ||
| 229 | for_each_process (tsk) { | ||
| 230 | if (!task_early_kill(tsk)) | ||
| 231 | continue; | ||
| 232 | list_for_each_entry (vma, &av->head, anon_vma_node) { | ||
| 233 | if (!page_mapped_in_vma(page, vma)) | ||
| 234 | continue; | ||
| 235 | if (vma->vm_mm == tsk->mm) | ||
| 236 | add_to_kill(tsk, page, vma, to_kill, tkc); | ||
| 237 | } | ||
| 238 | } | ||
| 239 | page_unlock_anon_vma(av); | ||
| 240 | out: | ||
| 241 | read_unlock(&tasklist_lock); | ||
| 242 | } | ||
| 243 | |||
| 244 | /* | ||
| 245 | * Collect processes when the error hit a file mapped page. | ||
| 246 | */ | ||
| 247 | static void collect_procs_file(struct page *page, struct list_head *to_kill, | ||
| 248 | struct to_kill **tkc) | ||
| 249 | { | ||
| 250 | struct vm_area_struct *vma; | ||
| 251 | struct task_struct *tsk; | ||
| 252 | struct prio_tree_iter iter; | ||
| 253 | struct address_space *mapping = page->mapping; | ||
| 254 | |||
| 255 | /* | ||
| 256 | * A note on the locking order between the two locks. | ||
| 257 | * We don't rely on this particular order. | ||
| 258 | * If you have some other code that needs a different order | ||
| 259 | * feel free to switch them around. Or add a reverse link | ||
| 260 | * from mm_struct to task_struct, then this could be all | ||
| 261 | * done without taking tasklist_lock and looping over all tasks. | ||
| 262 | */ | ||
| 263 | |||
| 264 | read_lock(&tasklist_lock); | ||
| 265 | spin_lock(&mapping->i_mmap_lock); | ||
| 266 | for_each_process(tsk) { | ||
| 267 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
| 268 | |||
| 269 | if (!task_early_kill(tsk)) | ||
| 270 | continue; | ||
| 271 | |||
| 272 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, | ||
| 273 | pgoff) { | ||
| 274 | /* | ||
| 275 | * Send early kill signal to tasks where a vma covers | ||
| 276 | * the page but the corrupted page is not necessarily | ||
| 277 | * mapped in its pte. | ||
| 278 | * Assume applications that requested early kill want | ||
| 279 | * to be informed of all such data corruptions. | ||
| 280 | */ | ||
| 281 | if (vma->vm_mm == tsk->mm) | ||
| 282 | add_to_kill(tsk, page, vma, to_kill, tkc); | ||
| 283 | } | ||
| 284 | } | ||
| 285 | spin_unlock(&mapping->i_mmap_lock); | ||
| 286 | read_unlock(&tasklist_lock); | ||
| 287 | } | ||
| 288 | |||
| 289 | /* | ||
| 290 | * Collect the processes who have the corrupted page mapped to kill. | ||
| 291 | * This is done in two steps for locking reasons. | ||
| 292 | * First preallocate one tokill structure outside the spin locks, | ||
| 293 | * so that we can kill at least one process reasonably reliably. | ||
| 294 | */ | ||
| 295 | static void collect_procs(struct page *page, struct list_head *tokill) | ||
| 296 | { | ||
| 297 | struct to_kill *tk; | ||
| 298 | |||
| 299 | if (!page->mapping) | ||
| 300 | return; | ||
| 301 | |||
| 302 | tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); | ||
| 303 | if (!tk) | ||
| 304 | return; | ||
| 305 | if (PageAnon(page)) | ||
| 306 | collect_procs_anon(page, tokill, &tk); | ||
| 307 | else | ||
| 308 | collect_procs_file(page, tokill, &tk); | ||
| 309 | kfree(tk); | ||
| 310 | } | ||
| 311 | |||
| 312 | /* | ||
| 313 | * Error handlers for various types of pages. | ||
| 314 | */ | ||
| 315 | |||
| 316 | enum outcome { | ||
| 317 | FAILED, /* Error handling failed */ | ||
| 318 | DELAYED, /* Will be handled later */ | ||
| 319 | IGNORED, /* Error safely ignored */ | ||
| 320 | RECOVERED, /* Successfully recovered */ | ||
| 321 | }; | ||
| 322 | |||
| 323 | static const char *action_name[] = { | ||
| 324 | [FAILED] = "Failed", | ||
| 325 | [DELAYED] = "Delayed", | ||
| 326 | [IGNORED] = "Ignored", | ||
| 327 | [RECOVERED] = "Recovered", | ||
| 328 | }; | ||
| 329 | |||
| 330 | /* | ||
| 331 | * Error hit kernel page. | ||
| 332 | * Do nothing; try to be lucky and not touch it. For a few cases we | ||
| 333 | * could be more sophisticated. | ||
| 334 | */ | ||
| 335 | static int me_kernel(struct page *p, unsigned long pfn) | ||
| 336 | { | ||
| 337 | return DELAYED; | ||
| 338 | } | ||
| 339 | |||
| 340 | /* | ||
| 341 | * Already poisoned page. | ||
| 342 | */ | ||
| 343 | static int me_ignore(struct page *p, unsigned long pfn) | ||
| 344 | { | ||
| 345 | return IGNORED; | ||
| 346 | } | ||
| 347 | |||
| 348 | /* | ||
| 349 | * Page in unknown state. Do nothing. | ||
| 350 | */ | ||
| 351 | static int me_unknown(struct page *p, unsigned long pfn) | ||
| 352 | { | ||
| 353 | printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); | ||
| 354 | return FAILED; | ||
| 355 | } | ||
| 356 | |||
| 357 | /* | ||
| 358 | * Free memory | ||
| 359 | */ | ||
| 360 | static int me_free(struct page *p, unsigned long pfn) | ||
| 361 | { | ||
| 362 | return DELAYED; | ||
| 363 | } | ||
| 364 | |||
| 365 | /* | ||
| 366 | * Clean (or cleaned) page cache page. | ||
| 367 | */ | ||
| 368 | static int me_pagecache_clean(struct page *p, unsigned long pfn) | ||
| 369 | { | ||
| 370 | int err; | ||
| 371 | int ret = FAILED; | ||
| 372 | struct address_space *mapping; | ||
| 373 | |||
| 374 | /* | ||
| 375 | * For anonymous pages we're done; the only reference left | ||
| 376 | * should be the one m_f() holds. | ||
| 377 | */ | ||
| 378 | if (PageAnon(p)) | ||
| 379 | return RECOVERED; | ||
| 380 | |||
| 381 | /* | ||
| 382 | * Now truncate the page in the page cache. This is really | ||
| 383 | * more like a "temporary hole punch" | ||
| 384 | * Don't do this for block devices when someone else | ||
| 385 | * has a reference, because it could be file system metadata | ||
| 386 | * and that's not safe to truncate. | ||
| 387 | */ | ||
| 388 | mapping = page_mapping(p); | ||
| 389 | if (!mapping) { | ||
| 390 | /* | ||
| 391 | * Page has been torn down in the meantime | ||
| 392 | */ | ||
| 393 | return FAILED; | ||
| 394 | } | ||
| 395 | |||
| 396 | /* | ||
| 397 | * Truncation is a bit tricky. Enable it per file system for now. | ||
| 398 | * | ||
| 399 | * Open: to take i_mutex or not for this? Right now we don't. | ||
| 400 | */ | ||
| 401 | if (mapping->a_ops->error_remove_page) { | ||
| 402 | err = mapping->a_ops->error_remove_page(mapping, p); | ||
| 403 | if (err != 0) { | ||
| 404 | printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", | ||
| 405 | pfn, err); | ||
| 406 | } else if (page_has_private(p) && | ||
| 407 | !try_to_release_page(p, GFP_NOIO)) { | ||
| 408 | pr_debug("MCE %#lx: failed to release buffers\n", pfn); | ||
| 409 | } else { | ||
| 410 | ret = RECOVERED; | ||
| 411 | } | ||
| 412 | } else { | ||
| 413 | /* | ||
| 414 | * If the file system doesn't support it, just invalidate. | ||
| 415 | * This fails on dirty pages or anything with private buffers. | ||
| 416 | */ | ||
| 417 | if (invalidate_inode_page(p)) | ||
| 418 | ret = RECOVERED; | ||
| 419 | else | ||
| 420 | printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", | ||
| 421 | pfn); | ||
| 422 | } | ||
| 423 | return ret; | ||
| 424 | } | ||
| 425 | |||
| 426 | /* | ||
| 427 | * Dirty pagecache page. | ||
| 428 | * Issues: when the error hits a hole page, the error is not properly | ||
| 429 | * propagated. | ||
| 430 | */ | ||
| 431 | static int me_pagecache_dirty(struct page *p, unsigned long pfn) | ||
| 432 | { | ||
| 433 | struct address_space *mapping = page_mapping(p); | ||
| 434 | |||
| 435 | SetPageError(p); | ||
| 436 | /* TBD: print more information about the file. */ | ||
| 437 | if (mapping) { | ||
| 438 | /* | ||
| 439 | * IO error will be reported by write(), fsync(), etc. | ||
| 440 | * who check the mapping. | ||
| 441 | * This way the application knows that something went | ||
| 442 | * wrong with its dirty file data. | ||
| 443 | * | ||
| 444 | * There's one open issue: | ||
| 445 | * | ||
| 446 | * The EIO will be only reported on the next IO | ||
| 447 | * operation and then cleared through the IO map. | ||
| 448 | * Normally Linux has two mechanisms to pass IO error | ||
| 449 | * first through the AS_EIO flag in the address space | ||
| 450 | * and then through the PageError flag in the page. | ||
| 451 | * Since we drop pages on memory failure handling the | ||
| 452 | * only mechanism open to us is through AS_EIO. | ||
| 453 | * | ||
| 454 | * This has the disadvantage that it gets cleared on | ||
| 455 | * the first operation that returns an error, while | ||
| 456 | * the PageError bit is more sticky and only cleared | ||
| 457 | * when the page is reread or dropped. If an | ||
| 458 | * application assumes it will always get error on | ||
| 459 | * fsync, but does other operations on the fd before | ||
| 460 | * and the page is dropped in between, then the error | ||
| 461 | * will not be properly reported. | ||
| 462 | * | ||
| 463 | * This can already happen even without hwpoisoned | ||
| 464 | * pages: first on metadata IO errors (which only | ||
| 465 | * report through AS_EIO) or when the page is dropped | ||
| 466 | * at the wrong time. | ||
| 467 | * | ||
| 468 | * So right now we assume that the application DTRT on | ||
| 469 | * the first EIO, but we're not worse than other parts | ||
| 470 | * of the kernel. | ||
| 471 | */ | ||
| 472 | mapping_set_error(mapping, EIO); | ||
| 473 | } | ||
| 474 | |||
| 475 | return me_pagecache_clean(p, pfn); | ||
| 476 | } | ||
| 477 | |||
| 478 | /* | ||
| 479 | * Clean and dirty swap cache. | ||
| 480 | * | ||
| 481 | * Dirty swap cache page is tricky to handle. The page could live both in page | ||
| 482 | * cache and swap cache (i.e. the page is freshly swapped in). So it could be | ||
| 483 | * referenced concurrently by 2 types of PTEs: | ||
| 484 | * normal PTEs and swap PTEs. We try to handle them consistently by calling | ||
| 485 | * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, | ||
| 486 | * and then | ||
| 487 | * - clear dirty bit to prevent IO | ||
| 488 | * - remove from LRU | ||
| 489 | * - but keep in the swap cache, so that when we return to it on | ||
| 490 | * a later page fault, we know the application is accessing | ||
| 491 | * corrupted data and shall be killed (we installed simple | ||
| 492 | * interception code in do_swap_page to catch it). | ||
| 493 | * | ||
| 494 | * Clean swap cache pages can be directly isolated. A later page fault will | ||
| 495 | * bring in the known good data from disk. | ||
| 496 | */ | ||
| 497 | static int me_swapcache_dirty(struct page *p, unsigned long pfn) | ||
| 498 | { | ||
| 499 | ClearPageDirty(p); | ||
| 500 | /* Trigger EIO in shmem: */ | ||
| 501 | ClearPageUptodate(p); | ||
| 502 | |||
| 503 | return DELAYED; | ||
| 504 | } | ||
| 505 | |||
| 506 | static int me_swapcache_clean(struct page *p, unsigned long pfn) | ||
| 507 | { | ||
| 508 | delete_from_swap_cache(p); | ||
| 509 | |||
| 510 | return RECOVERED; | ||
| 511 | } | ||
| 512 | |||
| 513 | /* | ||
| 514 | * Huge pages. Needs work. | ||
| 515 | * Issues: | ||
| 516 | * No rmap support so we cannot find the original mapper. In theory we could walk | ||
| 517 | * all MMs and look for the mappings, but that would be non-atomic and racy. | ||
| 518 | * Need rmap for hugepages for this. Alternatively we could employ a heuristic, | ||
| 519 | * like just walking the current process and hoping it has it mapped (that | ||
| 520 | * should usually be true for the common "shared database cache" case) | ||
| 521 | * Should handle free huge pages and dequeue them too, but this needs to | ||
| 522 | * handle huge page accounting correctly. | ||
| 523 | */ | ||
| 524 | static int me_huge_page(struct page *p, unsigned long pfn) | ||
| 525 | { | ||
| 526 | return FAILED; | ||
| 527 | } | ||
| 528 | |||
| 529 | /* | ||
| 530 | * Various page states we can handle. | ||
| 531 | * | ||
| 532 | * A page state is defined by its current page->flags bits. | ||
| 533 | * The table matches them in order and calls the right handler. | ||
| 534 | * | ||
| 535 | * This is quite tricky because we can access a page at any time | ||
| 536 | * in its life cycle, so all accesses have to be extremely careful. | ||
| 537 | * | ||
| 538 | * This is not complete. More states could be added. | ||
| 539 | * For any missing state don't attempt recovery. | ||
| 540 | */ | ||
| 541 | |||
| 542 | #define dirty (1UL << PG_dirty) | ||
| 543 | #define sc (1UL << PG_swapcache) | ||
| 544 | #define unevict (1UL << PG_unevictable) | ||
| 545 | #define mlock (1UL << PG_mlocked) | ||
| 546 | #define writeback (1UL << PG_writeback) | ||
| 547 | #define lru (1UL << PG_lru) | ||
| 548 | #define swapbacked (1UL << PG_swapbacked) | ||
| 549 | #define head (1UL << PG_head) | ||
| 550 | #define tail (1UL << PG_tail) | ||
| 551 | #define compound (1UL << PG_compound) | ||
| 552 | #define slab (1UL << PG_slab) | ||
| 553 | #define buddy (1UL << PG_buddy) | ||
| 554 | #define reserved (1UL << PG_reserved) | ||
| 555 | |||
| 556 | static struct page_state { | ||
| 557 | unsigned long mask; | ||
| 558 | unsigned long res; | ||
| 559 | char *msg; | ||
| 560 | int (*action)(struct page *p, unsigned long pfn); | ||
| 561 | } error_states[] = { | ||
| 562 | { reserved, reserved, "reserved kernel", me_ignore }, | ||
| 563 | { buddy, buddy, "free kernel", me_free }, | ||
| 564 | |||
| 565 | /* | ||
| 566 | * Could in theory check if slab page is free or if we can drop | ||
| 567 | * currently unused objects without touching them. But just | ||
| 568 | * treat it as standard kernel for now. | ||
| 569 | */ | ||
| 570 | { slab, slab, "kernel slab", me_kernel }, | ||
| 571 | |||
| 572 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
| 573 | { head, head, "huge", me_huge_page }, | ||
| 574 | { tail, tail, "huge", me_huge_page }, | ||
| 575 | #else | ||
| 576 | { compound, compound, "huge", me_huge_page }, | ||
| 577 | #endif | ||
| 578 | |||
| 579 | { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, | ||
| 580 | { sc|dirty, sc, "swapcache", me_swapcache_clean }, | ||
| 581 | |||
| 582 | { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, | ||
| 583 | { unevict, unevict, "unevictable LRU", me_pagecache_clean}, | ||
| 584 | |||
| 585 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
| 586 | { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, | ||
| 587 | { mlock, mlock, "mlocked LRU", me_pagecache_clean }, | ||
| 588 | #endif | ||
| 589 | |||
| 590 | { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, | ||
| 591 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | ||
| 592 | { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, | ||
| 593 | |||
| 594 | /* | ||
| 595 | * Catchall entry: must be at end. | ||
| 596 | */ | ||
| 597 | { 0, 0, "unknown page state", me_unknown }, | ||
| 598 | }; | ||
| 599 | |||
| 600 | static void action_result(unsigned long pfn, char *msg, int result) | ||
| 601 | { | ||
| 602 | struct page *page = NULL; | ||
| 603 | if (pfn_valid(pfn)) | ||
| 604 | page = pfn_to_page(pfn); | ||
| 605 | |||
| 606 | printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", | ||
| 607 | pfn, | ||
| 608 | page && PageDirty(page) ? "dirty " : "", | ||
| 609 | msg, action_name[result]); | ||
| 610 | } | ||
| 611 | |||
| 612 | static int page_action(struct page_state *ps, struct page *p, | ||
| 613 | unsigned long pfn, int ref) | ||
| 614 | { | ||
| 615 | int result; | ||
| 616 | int count; | ||
| 617 | |||
| 618 | result = ps->action(p, pfn); | ||
| 619 | action_result(pfn, ps->msg, result); | ||
| 620 | |||
| 621 | count = page_count(p) - 1 - ref; | ||
| 622 | if (count != 0) | ||
| 623 | printk(KERN_ERR | ||
| 624 | "MCE %#lx: %s page still referenced by %d users\n", | ||
| 625 | pfn, ps->msg, count); | ||
| 626 | |||
| 627 | /* Could do more checks here if page looks ok */ | ||
| 628 | /* | ||
| 629 | * Could adjust zone counters here to correct for the missing page. | ||
| 630 | */ | ||
| 631 | |||
| 632 | return result == RECOVERED ? 0 : -EBUSY; | ||
| 633 | } | ||
| 634 | |||
| 635 | #define N_UNMAP_TRIES 5 | ||
| 636 | |||
| 637 | /* | ||
| 638 | * Do all that is necessary to remove user space mappings. Unmap | ||
| 639 | * the pages and send SIGBUS to the processes if the data was dirty. | ||
| 640 | */ | ||
| 641 | static void hwpoison_user_mappings(struct page *p, unsigned long pfn, | ||
| 642 | int trapno) | ||
| 643 | { | ||
| 644 | enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; | ||
| 645 | struct address_space *mapping; | ||
| 646 | LIST_HEAD(tokill); | ||
| 647 | int ret; | ||
| 648 | int i; | ||
| 649 | int kill = 1; | ||
| 650 | |||
| 651 | if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p)) | ||
| 652 | return; | ||
| 653 | |||
| 654 | /* | ||
| 655 | * This check implies we don't kill processes if their pages | ||
| 656 | * are in the swap cache early. Those are always late kills. | ||
| 657 | */ | ||
| 658 | if (!page_mapped(p)) | ||
| 659 | return; | ||
| 660 | |||
| 661 | if (PageSwapCache(p)) { | ||
| 662 | printk(KERN_ERR | ||
| 663 | "MCE %#lx: keeping poisoned page in swap cache\n", pfn); | ||
| 664 | ttu |= TTU_IGNORE_HWPOISON; | ||
| 665 | } | ||
| 666 | |||
| 667 | /* | ||
| 668 | * Propagate the dirty bit from PTEs to struct page first, because we | ||
| 669 | * need this to decide if we should kill or just drop the page. | ||
| 670 | */ | ||
| 671 | mapping = page_mapping(p); | ||
| 672 | if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { | ||
| 673 | if (page_mkclean(p)) { | ||
| 674 | SetPageDirty(p); | ||
| 675 | } else { | ||
| 676 | kill = 0; | ||
| 677 | ttu |= TTU_IGNORE_HWPOISON; | ||
| 678 | printk(KERN_INFO | ||
| 679 | "MCE %#lx: corrupted page was clean: dropped without side effects\n", | ||
| 680 | pfn); | ||
| 681 | } | ||
| 682 | } | ||
| 683 | |||
| 684 | /* | ||
| 685 | * First collect all the processes that have the page | ||
| 686 | * mapped in dirty form. This has to be done before try_to_unmap, | ||
| 687 | * because ttu takes the rmap data structures down. | ||
| 688 | * | ||
| 689 | * Error handling: We ignore errors here because | ||
| 690 | * there's nothing that can be done. | ||
| 691 | */ | ||
| 692 | if (kill) | ||
| 693 | collect_procs(p, &tokill); | ||
| 694 | |||
| 695 | /* | ||
| 696 | * try_to_unmap can fail temporarily due to races. | ||
| 697 | * Try a few times (RED-PEN better strategy?) | ||
| 698 | */ | ||
| 699 | for (i = 0; i < N_UNMAP_TRIES; i++) { | ||
| 700 | ret = try_to_unmap(p, ttu); | ||
| 701 | if (ret == SWAP_SUCCESS) | ||
| 702 | break; | ||
| 703 | pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); | ||
| 704 | } | ||
| 705 | |||
| 706 | if (ret != SWAP_SUCCESS) | ||
| 707 | printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", | ||
| 708 | pfn, page_mapcount(p)); | ||
| 709 | |||
| 710 | /* | ||
| 711 | * Now that the dirty bit has been propagated to the | ||
| 712 | * struct page and all unmaps done we can decide if | ||
| 713 | * killing is needed or not. Only kill when the page | ||
| 714 | * was dirty, otherwise the tokill list is merely | ||
| 715 | * freed. When there was a problem unmapping earlier | ||
| 716 | * use a more forceful, uncatchable kill to prevent | ||
| 717 | * any accesses to the poisoned memory. | ||
| 718 | */ | ||
| 719 | kill_procs_ao(&tokill, !!PageDirty(p), trapno, | ||
| 720 | ret != SWAP_SUCCESS, pfn); | ||
| 721 | } | ||
| 722 | |||
| 723 | int __memory_failure(unsigned long pfn, int trapno, int ref) | ||
| 724 | { | ||
| 725 | unsigned long lru_flag; | ||
| 726 | struct page_state *ps; | ||
| 727 | struct page *p; | ||
| 728 | int res; | ||
| 729 | |||
| 730 | if (!sysctl_memory_failure_recovery) | ||
| 731 | panic("Memory failure from trap %d on page %lx", trapno, pfn); | ||
| 732 | |||
| 733 | if (!pfn_valid(pfn)) { | ||
| 734 | action_result(pfn, "memory outside kernel control", IGNORED); | ||
| 735 | return -EIO; | ||
| 736 | } | ||
| 737 | |||
| 738 | p = pfn_to_page(pfn); | ||
| 739 | if (TestSetPageHWPoison(p)) { | ||
| 740 | action_result(pfn, "already hardware poisoned", IGNORED); | ||
| 741 | return 0; | ||
| 742 | } | ||
| 743 | |||
| 744 | atomic_long_add(1, &mce_bad_pages); | ||
| 745 | |||
| 746 | /* | ||
| 747 | * We need/can do nothing about count=0 pages. | ||
| 748 | * 1) it's a free page, and therefore in safe hand: | ||
| 749 | * prep_new_page() will be the gate keeper. | ||
| 750 | * 2) it's part of a non-compound high order page. | ||
| 751 | * Implies some kernel user: cannot stop them from | ||
| 752 | * R/W the page; let's pray that the page has been | ||
| 753 | * used and will be freed some time later. | ||
| 754 | * In fact it's dangerous to directly bump up page count from 0, | ||
| 755 | * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. | ||
| 756 | */ | ||
| 757 | if (!get_page_unless_zero(compound_head(p))) { | ||
| 758 | action_result(pfn, "free or high order kernel", IGNORED); | ||
| 759 | return PageBuddy(compound_head(p)) ? 0 : -EBUSY; | ||
| 760 | } | ||
| 761 | |||
| 762 | /* | ||
| 763 | * We ignore non-LRU pages for good reasons. | ||
| 764 | * - PG_locked is only well defined for LRU pages and a few others | ||
| 765 | * - to avoid races with __set_page_locked() | ||
| 766 | * - to avoid races with __SetPageSlab*() (and more non-atomic ops) | ||
| 767 | * The check (unnecessarily) ignores LRU pages being isolated and | ||
| 768 | * walked by the page reclaim code, but that's not a big loss. | ||
| 769 | */ | ||
| 770 | if (!PageLRU(p)) | ||
| 771 | lru_add_drain_all(); | ||
| 772 | lru_flag = p->flags & lru; | ||
| 773 | if (isolate_lru_page(p)) { | ||
| 774 | action_result(pfn, "non LRU", IGNORED); | ||
| 775 | put_page(p); | ||
| 776 | return -EBUSY; | ||
| 777 | } | ||
| 778 | page_cache_release(p); | ||
| 779 | |||
| 780 | /* | ||
| 781 | * Lock the page and wait for writeback to finish. | ||
| 782 | * It's very difficult to mess with pages currently under IO | ||
| 783 | * and in many cases impossible, so we just avoid it here. | ||
| 784 | */ | ||
| 785 | lock_page_nosync(p); | ||
| 786 | wait_on_page_writeback(p); | ||
| 787 | |||
| 788 | /* | ||
| 789 | * Now take care of user space mappings. | ||
| 790 | */ | ||
| 791 | hwpoison_user_mappings(p, pfn, trapno); | ||
| 792 | |||
| 793 | /* | ||
| 794 | * Torn down by someone else? | ||
| 795 | */ | ||
| 796 | if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) { | ||
| 797 | action_result(pfn, "already truncated LRU", IGNORED); | ||
| 798 | res = 0; | ||
| 799 | goto out; | ||
| 800 | } | ||
| 801 | |||
| 802 | res = -EBUSY; | ||
| 803 | for (ps = error_states;; ps++) { | ||
| 804 | if (((p->flags | lru_flag)& ps->mask) == ps->res) { | ||
| 805 | res = page_action(ps, p, pfn, ref); | ||
| 806 | break; | ||
| 807 | } | ||
| 808 | } | ||
| 809 | out: | ||
| 810 | unlock_page(p); | ||
| 811 | return res; | ||
| 812 | } | ||
| 813 | EXPORT_SYMBOL_GPL(__memory_failure); | ||
| 814 | |||
| 815 | /** | ||
| 816 | * memory_failure - Handle memory failure of a page. | ||
| 817 | * @pfn: Page Number of the corrupted page | ||
| 818 | * @trapno: Trap number reported in the signal to user space. | ||
| 819 | * | ||
| 820 | * This function is called by the low level machine check code | ||
| 821 | * of an architecture when it detects hardware memory corruption | ||
| 822 | * of a page. It tries its best to recover, which includes | ||
| 823 | * dropping pages, killing processes etc. | ||
| 824 | * | ||
| 825 | * The function is primarily of use for corruptions that | ||
| 826 | * happen outside the current execution context (e.g. when | ||
| 827 | * detected by a background scrubber) | ||
| 828 | * | ||
| 829 | * Must run in process context (e.g. a work queue) with interrupts | ||
| 830 | * enabled and no spinlocks held. | ||
| 831 | */ | ||
| 832 | void memory_failure(unsigned long pfn, int trapno) | ||
| 833 | { | ||
| 834 | __memory_failure(pfn, trapno, 0); | ||
| 835 | } | ||
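The BUS_MCEERR_AO signalling performed by kill_proc_ao() above is visible to user space as an ordinary SIGBUS with extended siginfo. A minimal, hypothetical sketch of the receiving side follows; it is not part of this patch, the handler name and the log-and-continue policy are illustrative assumptions, and whether si_addr_lsb is exposed depends on the C library headers.

#define _GNU_SOURCE		/* may be needed for BUS_MCEERR_* in some libcs */
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Illustrative handler for the "action optional" SIGBUS sent above.
 * stdio is used only for brevity; a real handler should restrict
 * itself to async-signal-safe calls. */
static void hwpoison_handler(int sig, siginfo_t *si, void *uctx)
{
	if (si->si_code == BUS_MCEERR_AO) {
		/* si_addr and si_addr_lsb are filled in by kill_proc_ao(). */
		fprintf(stderr, "hwpoison: lost page at %p (lsb %d)\n",
			si->si_addr, (int)si->si_addr_lsb);
		return;		/* AO is advisory data loss: keep running */
	}
	abort();		/* anything else (e.g. BUS_MCEERR_AR) is fatal */
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = hwpoison_handler;
	sa.sa_flags = SA_SIGINFO;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGBUS, &sa, NULL);

	/* ... application work; early kill must be enabled either per
	 * process or via the memory_failure_early_kill sysctl above ... */
	pause();
	return 0;
}

Whether a given task receives this early signal is decided by task_early_kill() above: a per-task PF_MCE_EARLY setting wins, otherwise the sysctl default applies.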
diff --git a/mm/memory.c b/mm/memory.c index cf6873e91c6a..6ab19dd4a199 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | #include <linux/swap.h> | 45 | #include <linux/swap.h> |
| 46 | #include <linux/highmem.h> | 46 | #include <linux/highmem.h> |
| 47 | #include <linux/pagemap.h> | 47 | #include <linux/pagemap.h> |
| 48 | #include <linux/ksm.h> | ||
| 48 | #include <linux/rmap.h> | 49 | #include <linux/rmap.h> |
| 49 | #include <linux/module.h> | 50 | #include <linux/module.h> |
| 50 | #include <linux/delayacct.h> | 51 | #include <linux/delayacct.h> |
| @@ -56,6 +57,7 @@ | |||
| 56 | #include <linux/swapops.h> | 57 | #include <linux/swapops.h> |
| 57 | #include <linux/elf.h> | 58 | #include <linux/elf.h> |
| 58 | 59 | ||
| 60 | #include <asm/io.h> | ||
| 59 | #include <asm/pgalloc.h> | 61 | #include <asm/pgalloc.h> |
| 60 | #include <asm/uaccess.h> | 62 | #include <asm/uaccess.h> |
| 61 | #include <asm/tlb.h> | 63 | #include <asm/tlb.h> |
| @@ -106,6 +108,18 @@ static int __init disable_randmaps(char *s) | |||
| 106 | } | 108 | } |
| 107 | __setup("norandmaps", disable_randmaps); | 109 | __setup("norandmaps", disable_randmaps); |
| 108 | 110 | ||
| 111 | unsigned long zero_pfn __read_mostly; | ||
| 112 | unsigned long highest_memmap_pfn __read_mostly; | ||
| 113 | |||
| 114 | /* | ||
| 115 | * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() | ||
| 116 | */ | ||
| 117 | static int __init init_zero_pfn(void) | ||
| 118 | { | ||
| 119 | zero_pfn = page_to_pfn(ZERO_PAGE(0)); | ||
| 120 | return 0; | ||
| 121 | } | ||
| 122 | core_initcall(init_zero_pfn); | ||
| 109 | 123 | ||
| 110 | /* | 124 | /* |
| 111 | * If a p?d_bad entry is found while walking page tables, report | 125 | * If a p?d_bad entry is found while walking page tables, report |
| @@ -135,11 +149,12 @@ void pmd_clear_bad(pmd_t *pmd) | |||
| 135 | * Note: this doesn't free the actual pages themselves. That | 149 | * Note: this doesn't free the actual pages themselves. That |
| 136 | * has been handled earlier when unmapping all the memory regions. | 150 | * has been handled earlier when unmapping all the memory regions. |
| 137 | */ | 151 | */ |
| 138 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | 152 | static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, |
| 153 | unsigned long addr) | ||
| 139 | { | 154 | { |
| 140 | pgtable_t token = pmd_pgtable(*pmd); | 155 | pgtable_t token = pmd_pgtable(*pmd); |
| 141 | pmd_clear(pmd); | 156 | pmd_clear(pmd); |
| 142 | pte_free_tlb(tlb, token); | 157 | pte_free_tlb(tlb, token, addr); |
| 143 | tlb->mm->nr_ptes--; | 158 | tlb->mm->nr_ptes--; |
| 144 | } | 159 | } |
| 145 | 160 | ||
| @@ -157,7 +172,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
| 157 | next = pmd_addr_end(addr, end); | 172 | next = pmd_addr_end(addr, end); |
| 158 | if (pmd_none_or_clear_bad(pmd)) | 173 | if (pmd_none_or_clear_bad(pmd)) |
| 159 | continue; | 174 | continue; |
| 160 | free_pte_range(tlb, pmd); | 175 | free_pte_range(tlb, pmd, addr); |
| 161 | } while (pmd++, addr = next, addr != end); | 176 | } while (pmd++, addr = next, addr != end); |
| 162 | 177 | ||
| 163 | start &= PUD_MASK; | 178 | start &= PUD_MASK; |
| @@ -173,7 +188,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, | |||
| 173 | 188 | ||
| 174 | pmd = pmd_offset(pud, start); | 189 | pmd = pmd_offset(pud, start); |
| 175 | pud_clear(pud); | 190 | pud_clear(pud); |
| 176 | pmd_free_tlb(tlb, pmd); | 191 | pmd_free_tlb(tlb, pmd, start); |
| 177 | } | 192 | } |
| 178 | 193 | ||
| 179 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | 194 | static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, |
| @@ -206,7 +221,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, | |||
| 206 | 221 | ||
| 207 | pud = pud_offset(pgd, start); | 222 | pud = pud_offset(pgd, start); |
| 208 | pgd_clear(pgd); | 223 | pgd_clear(pgd); |
| 209 | pud_free_tlb(tlb, pud); | 224 | pud_free_tlb(tlb, pud, start); |
| 210 | } | 225 | } |
| 211 | 226 | ||
| 212 | /* | 227 | /* |
| @@ -282,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 282 | unsigned long addr = vma->vm_start; | 297 | unsigned long addr = vma->vm_start; |
| 283 | 298 | ||
| 284 | /* | 299 | /* |
| 285 | * Hide vma from rmap and vmtruncate before freeing pgtables | 300 | * Hide vma from rmap and truncate_pagecache before freeing |
| 301 | * pgtables | ||
| 286 | */ | 302 | */ |
| 287 | anon_vma_unlink(vma); | 303 | anon_vma_unlink(vma); |
| 288 | unlink_file_vma(vma); | 304 | unlink_file_vma(vma); |
| @@ -441,6 +457,20 @@ static inline int is_cow_mapping(unsigned int flags) | |||
| 441 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 457 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
| 442 | } | 458 | } |
| 443 | 459 | ||
| 460 | #ifndef is_zero_pfn | ||
| 461 | static inline int is_zero_pfn(unsigned long pfn) | ||
| 462 | { | ||
| 463 | return pfn == zero_pfn; | ||
| 464 | } | ||
| 465 | #endif | ||
| 466 | |||
| 467 | #ifndef my_zero_pfn | ||
| 468 | static inline unsigned long my_zero_pfn(unsigned long addr) | ||
| 469 | { | ||
| 470 | return zero_pfn; | ||
| 471 | } | ||
| 472 | #endif | ||
| 473 | |||
| 444 | /* | 474 | /* |
| 445 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 475 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
| 446 | * | 476 | * |
| @@ -496,7 +526,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 496 | if (HAVE_PTE_SPECIAL) { | 526 | if (HAVE_PTE_SPECIAL) { |
| 497 | if (likely(!pte_special(pte))) | 527 | if (likely(!pte_special(pte))) |
| 498 | goto check_pfn; | 528 | goto check_pfn; |
| 499 | if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) | 529 | if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) |
| 530 | return NULL; | ||
| 531 | if (!is_zero_pfn(pfn)) | ||
| 500 | print_bad_pte(vma, addr, pte, NULL); | 532 | print_bad_pte(vma, addr, pte, NULL); |
| 501 | return NULL; | 533 | return NULL; |
| 502 | } | 534 | } |
| @@ -518,6 +550,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 518 | } | 550 | } |
| 519 | } | 551 | } |
| 520 | 552 | ||
| 553 | if (is_zero_pfn(pfn)) | ||
| 554 | return NULL; | ||
| 521 | check_pfn: | 555 | check_pfn: |
| 522 | if (unlikely(pfn > highest_memmap_pfn)) { | 556 | if (unlikely(pfn > highest_memmap_pfn)) { |
| 523 | print_bad_pte(vma, addr, pte, NULL); | 557 | print_bad_pte(vma, addr, pte, NULL); |
| @@ -595,8 +629,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 595 | page = vm_normal_page(vma, addr, pte); | 629 | page = vm_normal_page(vma, addr, pte); |
| 596 | if (page) { | 630 | if (page) { |
| 597 | get_page(page); | 631 | get_page(page); |
| 598 | page_dup_rmap(page, vma, addr); | 632 | page_dup_rmap(page); |
| 599 | rss[!!PageAnon(page)]++; | 633 | rss[PageAnon(page)]++; |
| 600 | } | 634 | } |
| 601 | 635 | ||
| 602 | out_set_pte: | 636 | out_set_pte: |
| @@ -607,6 +641,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 607 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, | 641 | pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, |
| 608 | unsigned long addr, unsigned long end) | 642 | unsigned long addr, unsigned long end) |
| 609 | { | 643 | { |
| 644 | pte_t *orig_src_pte, *orig_dst_pte; | ||
| 610 | pte_t *src_pte, *dst_pte; | 645 | pte_t *src_pte, *dst_pte; |
| 611 | spinlock_t *src_ptl, *dst_ptl; | 646 | spinlock_t *src_ptl, *dst_ptl; |
| 612 | int progress = 0; | 647 | int progress = 0; |
| @@ -620,6 +655,8 @@ again: | |||
| 620 | src_pte = pte_offset_map_nested(src_pmd, addr); | 655 | src_pte = pte_offset_map_nested(src_pmd, addr); |
| 621 | src_ptl = pte_lockptr(src_mm, src_pmd); | 656 | src_ptl = pte_lockptr(src_mm, src_pmd); |
| 622 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); | 657 | spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); |
| 658 | orig_src_pte = src_pte; | ||
| 659 | orig_dst_pte = dst_pte; | ||
| 623 | arch_enter_lazy_mmu_mode(); | 660 | arch_enter_lazy_mmu_mode(); |
| 624 | 661 | ||
| 625 | do { | 662 | do { |
| @@ -643,9 +680,9 @@ again: | |||
| 643 | 680 | ||
| 644 | arch_leave_lazy_mmu_mode(); | 681 | arch_leave_lazy_mmu_mode(); |
| 645 | spin_unlock(src_ptl); | 682 | spin_unlock(src_ptl); |
| 646 | pte_unmap_nested(src_pte - 1); | 683 | pte_unmap_nested(orig_src_pte); |
| 647 | add_mm_rss(dst_mm, rss[0], rss[1]); | 684 | add_mm_rss(dst_mm, rss[0], rss[1]); |
| 648 | pte_unmap_unlock(dst_pte - 1, dst_ptl); | 685 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
| 649 | cond_resched(); | 686 | cond_resched(); |
| 650 | if (addr != end) | 687 | if (addr != end) |
| 651 | goto again; | 688 | goto again; |
| @@ -1141,9 +1178,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 1141 | goto no_page; | 1178 | goto no_page; |
| 1142 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 1179 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
| 1143 | goto unlock; | 1180 | goto unlock; |
| 1181 | |||
| 1144 | page = vm_normal_page(vma, address, pte); | 1182 | page = vm_normal_page(vma, address, pte); |
| 1145 | if (unlikely(!page)) | 1183 | if (unlikely(!page)) { |
| 1146 | goto bad_page; | 1184 | if ((flags & FOLL_DUMP) || |
| 1185 | !is_zero_pfn(pte_pfn(pte))) | ||
| 1186 | goto bad_page; | ||
| 1187 | page = pte_page(pte); | ||
| 1188 | } | ||
| 1147 | 1189 | ||
| 1148 | if (flags & FOLL_GET) | 1190 | if (flags & FOLL_GET) |
| 1149 | get_page(page); | 1191 | get_page(page); |
| @@ -1171,65 +1213,46 @@ no_page: | |||
| 1171 | pte_unmap_unlock(ptep, ptl); | 1213 | pte_unmap_unlock(ptep, ptl); |
| 1172 | if (!pte_none(pte)) | 1214 | if (!pte_none(pte)) |
| 1173 | return page; | 1215 | return page; |
| 1174 | /* Fall through to ZERO_PAGE handling */ | 1216 | |
| 1175 | no_page_table: | 1217 | no_page_table: |
| 1176 | /* | 1218 | /* |
| 1177 | * When core dumping an enormous anonymous area that nobody | 1219 | * When core dumping an enormous anonymous area that nobody |
| 1178 | * has touched so far, we don't want to allocate page tables. | 1220 | * has touched so far, we don't want to allocate unnecessary pages or |
| 1221 | * page tables. Return error instead of NULL to skip handle_mm_fault, | ||
| 1222 | * then get_dump_page() will return NULL to leave a hole in the dump. | ||
| 1223 | * But we can only make this optimization where a hole would surely | ||
| 1224 | * be zero-filled if handle_mm_fault() actually did handle it. | ||
| 1179 | */ | 1225 | */ |
| 1180 | if (flags & FOLL_ANON) { | 1226 | if ((flags & FOLL_DUMP) && |
| 1181 | page = ZERO_PAGE(0); | 1227 | (!vma->vm_ops || !vma->vm_ops->fault)) |
| 1182 | if (flags & FOLL_GET) | 1228 | return ERR_PTR(-EFAULT); |
| 1183 | get_page(page); | ||
| 1184 | BUG_ON(flags & FOLL_WRITE); | ||
| 1185 | } | ||
| 1186 | return page; | 1229 | return page; |
| 1187 | } | 1230 | } |
| 1188 | 1231 | ||
| 1189 | /* Can we do the FOLL_ANON optimization? */ | ||
| 1190 | static inline int use_zero_page(struct vm_area_struct *vma) | ||
| 1191 | { | ||
| 1192 | /* | ||
| 1193 | * We don't want to optimize FOLL_ANON for make_pages_present() | ||
| 1194 | * when it tries to page in a VM_LOCKED region. As to VM_SHARED, | ||
| 1195 | * we want to get the page from the page tables to make sure | ||
| 1196 | * that we serialize and update with any other user of that | ||
| 1197 | * mapping. | ||
| 1198 | */ | ||
| 1199 | if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) | ||
| 1200 | return 0; | ||
| 1201 | /* | ||
| 1202 | * And if we have a fault routine, it's not an anonymous region. | ||
| 1203 | */ | ||
| 1204 | return !vma->vm_ops || !vma->vm_ops->fault; | ||
| 1205 | } | ||
| 1206 | |||
| 1207 | |||
| 1208 | |||
| 1209 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1232 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 1210 | unsigned long start, int len, int flags, | 1233 | unsigned long start, int nr_pages, unsigned int gup_flags, |
| 1211 | struct page **pages, struct vm_area_struct **vmas) | 1234 | struct page **pages, struct vm_area_struct **vmas) |
| 1212 | { | 1235 | { |
| 1213 | int i; | 1236 | int i; |
| 1214 | unsigned int vm_flags = 0; | 1237 | unsigned long vm_flags; |
| 1215 | int write = !!(flags & GUP_FLAGS_WRITE); | ||
| 1216 | int force = !!(flags & GUP_FLAGS_FORCE); | ||
| 1217 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | ||
| 1218 | int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); | ||
| 1219 | 1238 | ||
| 1220 | if (len <= 0) | 1239 | if (nr_pages <= 0) |
| 1221 | return 0; | 1240 | return 0; |
| 1241 | |||
| 1242 | VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); | ||
| 1243 | |||
| 1222 | /* | 1244 | /* |
| 1223 | * Require read or write permissions. | 1245 | * Require read or write permissions. |
| 1224 | * If 'force' is set, we only require the "MAY" flags. | 1246 | * If FOLL_FORCE is set, we only require the "MAY" flags. |
| 1225 | */ | 1247 | */ |
| 1226 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 1248 | vm_flags = (gup_flags & FOLL_WRITE) ? |
| 1227 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 1249 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
| 1250 | vm_flags &= (gup_flags & FOLL_FORCE) ? | ||
| 1251 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | ||
| 1228 | i = 0; | 1252 | i = 0; |
| 1229 | 1253 | ||
| 1230 | do { | 1254 | do { |
| 1231 | struct vm_area_struct *vma; | 1255 | struct vm_area_struct *vma; |
| 1232 | unsigned int foll_flags; | ||
| 1233 | 1256 | ||
| 1234 | vma = find_extend_vma(mm, start); | 1257 | vma = find_extend_vma(mm, start); |
| 1235 | if (!vma && in_gate_area(tsk, start)) { | 1258 | if (!vma && in_gate_area(tsk, start)) { |
| @@ -1241,7 +1264,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1241 | pte_t *pte; | 1264 | pte_t *pte; |
| 1242 | 1265 | ||
| 1243 | /* user gate pages are read-only */ | 1266 | /* user gate pages are read-only */ |
| 1244 | if (!ignore && write) | 1267 | if (gup_flags & FOLL_WRITE) |
| 1245 | return i ? : -EFAULT; | 1268 | return i ? : -EFAULT; |
| 1246 | if (pg > TASK_SIZE) | 1269 | if (pg > TASK_SIZE) |
| 1247 | pgd = pgd_offset_k(pg); | 1270 | pgd = pgd_offset_k(pg); |
| @@ -1269,53 +1292,45 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1269 | vmas[i] = gate_vma; | 1292 | vmas[i] = gate_vma; |
| 1270 | i++; | 1293 | i++; |
| 1271 | start += PAGE_SIZE; | 1294 | start += PAGE_SIZE; |
| 1272 | len--; | 1295 | nr_pages--; |
| 1273 | continue; | 1296 | continue; |
| 1274 | } | 1297 | } |
| 1275 | 1298 | ||
| 1276 | if (!vma || | 1299 | if (!vma || |
| 1277 | (vma->vm_flags & (VM_IO | VM_PFNMAP)) || | 1300 | (vma->vm_flags & (VM_IO | VM_PFNMAP)) || |
| 1278 | (!ignore && !(vm_flags & vma->vm_flags))) | 1301 | !(vm_flags & vma->vm_flags)) |
| 1279 | return i ? : -EFAULT; | 1302 | return i ? : -EFAULT; |
| 1280 | 1303 | ||
| 1281 | if (is_vm_hugetlb_page(vma)) { | 1304 | if (is_vm_hugetlb_page(vma)) { |
| 1282 | i = follow_hugetlb_page(mm, vma, pages, vmas, | 1305 | i = follow_hugetlb_page(mm, vma, pages, vmas, |
| 1283 | &start, &len, i, write); | 1306 | &start, &nr_pages, i, gup_flags); |
| 1284 | continue; | 1307 | continue; |
| 1285 | } | 1308 | } |
| 1286 | 1309 | ||
| 1287 | foll_flags = FOLL_TOUCH; | ||
| 1288 | if (pages) | ||
| 1289 | foll_flags |= FOLL_GET; | ||
| 1290 | if (!write && use_zero_page(vma)) | ||
| 1291 | foll_flags |= FOLL_ANON; | ||
| 1292 | |||
| 1293 | do { | 1310 | do { |
| 1294 | struct page *page; | 1311 | struct page *page; |
| 1312 | unsigned int foll_flags = gup_flags; | ||
| 1295 | 1313 | ||
| 1296 | /* | 1314 | /* |
| 1297 | * If we have a pending SIGKILL, don't keep faulting | 1315 | * If we have a pending SIGKILL, don't keep faulting |
| 1298 | * pages and potentially allocating memory, unless | 1316 | * pages and potentially allocating memory. |
| 1299 | * current is handling munlock--e.g., on exit. In | ||
| 1300 | * that case, we are not allocating memory. Rather, | ||
| 1301 | * we're only unlocking already resident/mapped pages. | ||
| 1302 | */ | 1317 | */ |
| 1303 | if (unlikely(!ignore_sigkill && | 1318 | if (unlikely(fatal_signal_pending(current))) |
| 1304 | fatal_signal_pending(current))) | ||
| 1305 | return i ? i : -ERESTARTSYS; | 1319 | return i ? i : -ERESTARTSYS; |
| 1306 | 1320 | ||
| 1307 | if (write) | ||
| 1308 | foll_flags |= FOLL_WRITE; | ||
| 1309 | |||
| 1310 | cond_resched(); | 1321 | cond_resched(); |
| 1311 | while (!(page = follow_page(vma, start, foll_flags))) { | 1322 | while (!(page = follow_page(vma, start, foll_flags))) { |
| 1312 | int ret; | 1323 | int ret; |
| 1324 | |||
| 1313 | ret = handle_mm_fault(mm, vma, start, | 1325 | ret = handle_mm_fault(mm, vma, start, |
| 1314 | foll_flags & FOLL_WRITE); | 1326 | (foll_flags & FOLL_WRITE) ? |
| 1327 | FAULT_FLAG_WRITE : 0); | ||
| 1328 | |||
| 1315 | if (ret & VM_FAULT_ERROR) { | 1329 | if (ret & VM_FAULT_ERROR) { |
| 1316 | if (ret & VM_FAULT_OOM) | 1330 | if (ret & VM_FAULT_OOM) |
| 1317 | return i ? i : -ENOMEM; | 1331 | return i ? i : -ENOMEM; |
| 1318 | else if (ret & VM_FAULT_SIGBUS) | 1332 | if (ret & |
| 1333 | (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) | ||
| 1319 | return i ? i : -EFAULT; | 1334 | return i ? i : -EFAULT; |
| 1320 | BUG(); | 1335 | BUG(); |
| 1321 | } | 1336 | } |
| @@ -1354,30 +1369,107 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1354 | vmas[i] = vma; | 1369 | vmas[i] = vma; |
| 1355 | i++; | 1370 | i++; |
| 1356 | start += PAGE_SIZE; | 1371 | start += PAGE_SIZE; |
| 1357 | len--; | 1372 | nr_pages--; |
| 1358 | } while (len && start < vma->vm_end); | 1373 | } while (nr_pages && start < vma->vm_end); |
| 1359 | } while (len); | 1374 | } while (nr_pages); |
| 1360 | return i; | 1375 | return i; |
| 1361 | } | 1376 | } |
| 1362 | 1377 | ||
| 1378 | /** | ||
| 1379 | * get_user_pages() - pin user pages in memory | ||
| 1380 | * @tsk: task_struct of target task | ||
| 1381 | * @mm: mm_struct of target mm | ||
| 1382 | * @start: starting user address | ||
| 1383 | * @nr_pages: number of pages from start to pin | ||
| 1384 | * @write: whether pages will be written to by the caller | ||
| 1385 | * @force: whether to force write access even if user mapping is | ||
| 1386 | * readonly. This will result in the page being COWed even | ||
| 1387 | * in MAP_SHARED mappings. You do not want this. | ||
| 1388 | * @pages: array that receives pointers to the pages pinned. | ||
| 1389 | * Should be at least nr_pages long. Or NULL, if caller | ||
| 1390 | * only intends to ensure the pages are faulted in. | ||
| 1391 | * @vmas: array of pointers to vmas corresponding to each page. | ||
| 1392 | * Or NULL if the caller does not require them. | ||
| 1393 | * | ||
| 1394 | * Returns number of pages pinned. This may be fewer than the number | ||
| 1395 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
| 1396 | * were pinned, returns -errno. Each page returned must be released | ||
| 1397 | * with a put_page() call when it is finished with. vmas will only | ||
| 1398 | * remain valid while mmap_sem is held. | ||
| 1399 | * | ||
| 1400 | * Must be called with mmap_sem held for read or write. | ||
| 1401 | * | ||
| 1402 | * get_user_pages walks a process's page tables and takes a reference to | ||
| 1403 | * each struct page that each user address corresponds to at a given | ||
| 1404 | * instant. That is, it takes the page that would be accessed if a user | ||
| 1405 | * thread accesses the given user virtual address at that instant. | ||
| 1406 | * | ||
| 1407 | * This does not guarantee that the page exists in the user mappings when | ||
| 1408 | * get_user_pages returns, and there may even be a completely different | ||
| 1409 | * page there in some cases (eg. if mmapped pagecache has been invalidated | ||
| 1410 | * and subsequently re-faulted). However, it does guarantee that the page | ||
| 1411 | * won't be freed completely. And mostly callers simply care that the page | ||
| 1412 | * contains data that was valid *at some point in time*. Typically, an IO | ||
| 1413 | * or similar operation cannot guarantee anything stronger anyway because | ||
| 1414 | * locks can't be held over the syscall boundary. | ||
| 1415 | * | ||
| 1416 | * If write=0, the page must not be written to. If the page is written to, | ||
| 1417 | * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called | ||
| 1418 | * after the page is finished with, and before put_page is called. | ||
| 1419 | * | ||
| 1420 | * get_user_pages is typically used for fewer-copy IO operations, to get a | ||
| 1421 | * handle on the memory by some means other than accesses via the user virtual | ||
| 1422 | * addresses. The pages may be submitted for DMA to devices or accessed via | ||
| 1423 | * their kernel linear mapping (via the kmap APIs). Care should be taken to | ||
| 1424 | * use the correct cache flushing APIs. | ||
| 1425 | * | ||
| 1426 | * See also get_user_pages_fast, for performance critical applications. | ||
| 1427 | */ | ||
| 1363 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 1428 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 1364 | unsigned long start, int len, int write, int force, | 1429 | unsigned long start, int nr_pages, int write, int force, |
| 1365 | struct page **pages, struct vm_area_struct **vmas) | 1430 | struct page **pages, struct vm_area_struct **vmas) |
| 1366 | { | 1431 | { |
| 1367 | int flags = 0; | 1432 | int flags = FOLL_TOUCH; |
| 1368 | 1433 | ||
| 1434 | if (pages) | ||
| 1435 | flags |= FOLL_GET; | ||
| 1369 | if (write) | 1436 | if (write) |
| 1370 | flags |= GUP_FLAGS_WRITE; | 1437 | flags |= FOLL_WRITE; |
| 1371 | if (force) | 1438 | if (force) |
| 1372 | flags |= GUP_FLAGS_FORCE; | 1439 | flags |= FOLL_FORCE; |
| 1373 | 1440 | ||
| 1374 | return __get_user_pages(tsk, mm, | 1441 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); |
| 1375 | start, len, flags, | ||
| 1376 | pages, vmas); | ||
| 1377 | } | 1442 | } |
| 1378 | |||
| 1379 | EXPORT_SYMBOL(get_user_pages); | 1443 | EXPORT_SYMBOL(get_user_pages); |
| 1380 | 1444 | ||
| 1445 | /** | ||
| 1446 | * get_dump_page() - pin user page in memory while writing it to core dump | ||
| 1447 | * @addr: user address | ||
| 1448 | * | ||
| 1449 | * Returns struct page pointer of user page pinned for dump, | ||
| 1450 | * to be freed afterwards by page_cache_release() or put_page(). | ||
| 1451 | * | ||
| 1452 | * Returns NULL on any kind of failure - a hole must then be inserted into | ||
| 1453 | * the corefile, to preserve alignment with its headers; and also returns | ||
| 1454 | * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - | ||
| 1455 | * allowing a hole to be left in the corefile to save disk space. | ||
| 1456 | * | ||
| 1457 | * Called without mmap_sem, but after all other threads have been killed. | ||
| 1458 | */ | ||
| 1459 | #ifdef CONFIG_ELF_CORE | ||
| 1460 | struct page *get_dump_page(unsigned long addr) | ||
| 1461 | { | ||
| 1462 | struct vm_area_struct *vma; | ||
| 1463 | struct page *page; | ||
| 1464 | |||
| 1465 | if (__get_user_pages(current, current->mm, addr, 1, | ||
| 1466 | FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) | ||
| 1467 | return NULL; | ||
| 1468 | flush_cache_page(vma, addr, page_to_pfn(page)); | ||
| 1469 | return page; | ||
| 1470 | } | ||
| 1471 | #endif /* CONFIG_ELF_CORE */ | ||
| 1472 | |||
| 1381 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1473 | pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, |
| 1382 | spinlock_t **ptl) | 1474 | spinlock_t **ptl) |
| 1383 | { | 1475 | { |
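The kerneldoc added for get_user_pages() in the hunk above spells out the calling convention: mmap_sem held, one put_page() per pinned page, and set_page_dirty_lock() before release if the page was written. Below is a minimal caller sketch against the signature shown in this hunk; the helper name and the one-page limit are illustrative assumptions, not part of the patch.

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/string.h>

/* Hypothetical helper: copy up to one page of user memory by pinning the
 * backing page with the (tsk, mm, start, nr_pages, write, force, pages,
 * vmas) interface documented above. */
static int copy_from_pinned_page(unsigned long uaddr, void *buf, size_t len)
{
	unsigned long offset = uaddr & ~PAGE_MASK;
	struct page *page;
	void *kaddr;
	int ret;

	if (len > PAGE_SIZE - offset)
		return -EINVAL;			/* keep the sketch to one page */

	down_read(&current->mm->mmap_sem);
	ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
			     1, 0 /* write */, 0 /* force */, &page, NULL);
	up_read(&current->mm->mmap_sem);
	if (ret < 1)
		return ret < 0 ? ret : -EFAULT;

	kaddr = kmap(page);			/* temporary kernel mapping */
	memcpy(buf, kaddr + offset, len);
	kunmap(page);

	/* Read-only use, so no set_page_dirty_lock() before dropping the pin. */
	put_page(page);
	return 0;
}

For write access a caller would pass write=1, mark the page with set_page_dirty_lock() after modifying it, and only then call put_page(), exactly as the comment block above requires.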
| @@ -1555,7 +1647,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | |||
| 1555 | * If we don't have pte special, then we have to use the pfn_valid() | 1647 | * If we don't have pte special, then we have to use the pfn_valid() |
| 1556 | * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* | 1648 | * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* |
| 1557 | * refcount the page if pfn_valid is true (hence insert_page rather | 1649 | * refcount the page if pfn_valid is true (hence insert_page rather |
| 1558 | * than insert_pfn). | 1650 | * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP |
| 1651 | * without pte special, it would then be refcounted as a normal page. | ||
| 1559 | */ | 1652 | */ |
| 1560 | if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { | 1653 | if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { |
| 1561 | struct page *page; | 1654 | struct page *page; |
| @@ -1730,10 +1823,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 1730 | token = pmd_pgtable(*pmd); | 1823 | token = pmd_pgtable(*pmd); |
| 1731 | 1824 | ||
| 1732 | do { | 1825 | do { |
| 1733 | err = fn(pte, token, addr, data); | 1826 | err = fn(pte++, token, addr, data); |
| 1734 | if (err) | 1827 | if (err) |
| 1735 | break; | 1828 | break; |
| 1736 | } while (pte++, addr += PAGE_SIZE, addr != end); | 1829 | } while (addr += PAGE_SIZE, addr != end); |
| 1737 | 1830 | ||
| 1738 | arch_leave_lazy_mmu_mode(); | 1831 | arch_leave_lazy_mmu_mode(); |
| 1739 | 1832 | ||
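For context, the loop above hands every pte to a pte_fn_t callback driven by apply_to_page_range(); a minimal sketch of such a callback follows, with the counting purpose purely illustrative:

    #include <linux/mm.h>

    /* pte_fn_t callback: count how many of the walked ptes are present. */
    static int count_present_pte(pte_t *pte, pgtable_t token,
                                 unsigned long addr, void *data)
    {
            unsigned long *count = data;

            if (pte_present(*pte))
                    (*count)++;
            return 0;       /* a non-zero return stops the walk, as in the loop above */
    }

    /* Usage: apply_to_page_range(mm, start, size, count_present_pte, &count); */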
| @@ -1921,7 +2014,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1921 | * Take out anonymous pages first, anonymous shared vmas are | 2014 | * Take out anonymous pages first, anonymous shared vmas are |
| 1922 | * not dirty accountable. | 2015 | * not dirty accountable. |
| 1923 | */ | 2016 | */ |
| 1924 | if (PageAnon(old_page)) { | 2017 | if (PageAnon(old_page) && !PageKsm(old_page)) { |
| 1925 | if (!trylock_page(old_page)) { | 2018 | if (!trylock_page(old_page)) { |
| 1926 | page_cache_get(old_page); | 2019 | page_cache_get(old_page); |
| 1927 | pte_unmap_unlock(page_table, ptl); | 2020 | pte_unmap_unlock(page_table, ptl); |
| @@ -1971,6 +2064,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1971 | ret = tmp; | 2064 | ret = tmp; |
| 1972 | goto unwritable_page; | 2065 | goto unwritable_page; |
| 1973 | } | 2066 | } |
| 2067 | if (unlikely(!(tmp & VM_FAULT_LOCKED))) { | ||
| 2068 | lock_page(old_page); | ||
| 2069 | if (!old_page->mapping) { | ||
| 2070 | ret = 0; /* retry the fault */ | ||
| 2071 | unlock_page(old_page); | ||
| 2072 | goto unwritable_page; | ||
| 2073 | } | ||
| 2074 | } else | ||
| 2075 | VM_BUG_ON(!PageLocked(old_page)); | ||
| 1974 | 2076 | ||
| 1975 | /* | 2077 | /* |
| 1976 | * Since we dropped the lock we need to revalidate | 2078 | * Since we dropped the lock we need to revalidate |
| @@ -1980,9 +2082,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1980 | */ | 2082 | */ |
| 1981 | page_table = pte_offset_map_lock(mm, pmd, address, | 2083 | page_table = pte_offset_map_lock(mm, pmd, address, |
| 1982 | &ptl); | 2084 | &ptl); |
| 1983 | page_cache_release(old_page); | 2085 | if (!pte_same(*page_table, orig_pte)) { |
| 1984 | if (!pte_same(*page_table, orig_pte)) | 2086 | unlock_page(old_page); |
| 2087 | page_cache_release(old_page); | ||
| 1985 | goto unlock; | 2088 | goto unlock; |
| 2089 | } | ||
| 1986 | 2090 | ||
| 1987 | page_mkwrite = 1; | 2091 | page_mkwrite = 1; |
| 1988 | } | 2092 | } |
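The retry logic above relies on ->page_mkwrite() either returning with the page locked (VM_FAULT_LOCKED) or leaving do_wp_page() to lock it and recheck page->mapping. A driver-side sketch of the locked variant might look like this; my_prepare_page_for_write() is a hypothetical helper for whatever the driver must do before permitting the write:

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    extern int my_prepare_page_for_write(struct page *page);   /* hypothetical */

    static int my_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
            struct page *page = vmf->page;

            lock_page(page);
            if (!page->mapping) {           /* truncated under us: let the caller retry */
                    unlock_page(page);
                    return 0;
            }
            if (my_prepare_page_for_write(page)) {
                    unlock_page(page);
                    return VM_FAULT_SIGBUS;
            }
            return VM_FAULT_LOCKED;         /* page handed back locked, as checked above */
    }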
| @@ -2011,10 +2115,19 @@ gotten: | |||
| 2011 | 2115 | ||
| 2012 | if (unlikely(anon_vma_prepare(vma))) | 2116 | if (unlikely(anon_vma_prepare(vma))) |
| 2013 | goto oom; | 2117 | goto oom; |
| 2014 | VM_BUG_ON(old_page == ZERO_PAGE(0)); | 2118 | |
| 2015 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | 2119 | if (is_zero_pfn(pte_pfn(orig_pte))) { |
| 2016 | if (!new_page) | 2120 | new_page = alloc_zeroed_user_highpage_movable(vma, address); |
| 2017 | goto oom; | 2121 | if (!new_page) |
| 2122 | goto oom; | ||
| 2123 | } else { | ||
| 2124 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
| 2125 | if (!new_page) | ||
| 2126 | goto oom; | ||
| 2127 | cow_user_page(new_page, old_page, address, vma); | ||
| 2128 | } | ||
| 2129 | __SetPageUptodate(new_page); | ||
| 2130 | |||
| 2018 | /* | 2131 | /* |
| 2019 | * Don't let another task, with possibly unlocked vma, | 2132 | * Don't let another task, with possibly unlocked vma, |
| 2020 | * keep the mlocked page. | 2133 | * keep the mlocked page. |
| @@ -2024,8 +2137,6 @@ gotten: | |||
| 2024 | clear_page_mlock(old_page); | 2137 | clear_page_mlock(old_page); |
| 2025 | unlock_page(old_page); | 2138 | unlock_page(old_page); |
| 2026 | } | 2139 | } |
| 2027 | cow_user_page(new_page, old_page, address, vma); | ||
| 2028 | __SetPageUptodate(new_page); | ||
| 2029 | 2140 | ||
| 2030 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) | 2141 | if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) |
| 2031 | goto oom_free_new; | 2142 | goto oom_free_new; |
| @@ -2051,9 +2162,14 @@ gotten: | |||
| 2051 | * seen in the presence of one thread doing SMC and another | 2162 | * seen in the presence of one thread doing SMC and another |
| 2052 | * thread doing COW. | 2163 | * thread doing COW. |
| 2053 | */ | 2164 | */ |
| 2054 | ptep_clear_flush_notify(vma, address, page_table); | 2165 | ptep_clear_flush(vma, address, page_table); |
| 2055 | page_add_new_anon_rmap(new_page, vma, address); | 2166 | page_add_new_anon_rmap(new_page, vma, address); |
| 2056 | set_pte_at(mm, address, page_table, entry); | 2167 | /* |
| 2168 | * We call the notify macro here because, when using secondary | ||
| 2169 | * mmu page tables (such as kvm shadow page tables), we want the | ||
| 2170 | * new page to be mapped directly into the secondary page table. | ||
| 2171 | */ | ||
| 2172 | set_pte_at_notify(mm, address, page_table, entry); | ||
| 2057 | update_mmu_cache(vma, address, entry); | 2173 | update_mmu_cache(vma, address, entry); |
| 2058 | if (old_page) { | 2174 | if (old_page) { |
| 2059 | /* | 2175 | /* |
| @@ -2094,9 +2210,6 @@ gotten: | |||
| 2094 | unlock: | 2210 | unlock: |
| 2095 | pte_unmap_unlock(page_table, ptl); | 2211 | pte_unmap_unlock(page_table, ptl); |
| 2096 | if (dirty_page) { | 2212 | if (dirty_page) { |
| 2097 | if (vma->vm_file) | ||
| 2098 | file_update_time(vma->vm_file); | ||
| 2099 | |||
| 2100 | /* | 2213 | /* |
| 2101 | * Yes, Virginia, this is actually required to prevent a race | 2214 | * Yes, Virginia, this is actually required to prevent a race |
| 2102 | * with clear_page_dirty_for_io() from clearing the page dirty | 2215 | * with clear_page_dirty_for_io() from clearing the page dirty |
| @@ -2105,16 +2218,41 @@ unlock: | |||
| 2105 | * | 2218 | * |
| 2106 | * do_no_page is protected similarly. | 2219 | * do_no_page is protected similarly. |
| 2107 | */ | 2220 | */ |
| 2108 | wait_on_page_locked(dirty_page); | 2221 | if (!page_mkwrite) { |
| 2109 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2222 | wait_on_page_locked(dirty_page); |
| 2223 | set_page_dirty_balance(dirty_page, page_mkwrite); | ||
| 2224 | } | ||
| 2110 | put_page(dirty_page); | 2225 | put_page(dirty_page); |
| 2226 | if (page_mkwrite) { | ||
| 2227 | struct address_space *mapping = dirty_page->mapping; | ||
| 2228 | |||
| 2229 | set_page_dirty(dirty_page); | ||
| 2230 | unlock_page(dirty_page); | ||
| 2231 | page_cache_release(dirty_page); | ||
| 2232 | if (mapping) { | ||
| 2233 | /* | ||
| 2234 | * Some device drivers do not set page.mapping | ||
| 2235 | * but still dirty their pages | ||
| 2236 | */ | ||
| 2237 | balance_dirty_pages_ratelimited(mapping); | ||
| 2238 | } | ||
| 2239 | } | ||
| 2240 | |||
| 2241 | /* file_update_time outside page_lock */ | ||
| 2242 | if (vma->vm_file) | ||
| 2243 | file_update_time(vma->vm_file); | ||
| 2111 | } | 2244 | } |
| 2112 | return ret; | 2245 | return ret; |
| 2113 | oom_free_new: | 2246 | oom_free_new: |
| 2114 | page_cache_release(new_page); | 2247 | page_cache_release(new_page); |
| 2115 | oom: | 2248 | oom: |
| 2116 | if (old_page) | 2249 | if (old_page) { |
| 2250 | if (page_mkwrite) { | ||
| 2251 | unlock_page(old_page); | ||
| 2252 | page_cache_release(old_page); | ||
| 2253 | } | ||
| 2117 | page_cache_release(old_page); | 2254 | page_cache_release(old_page); |
| 2255 | } | ||
| 2118 | return VM_FAULT_OOM; | 2256 | return VM_FAULT_OOM; |
| 2119 | 2257 | ||
| 2120 | unwritable_page: | 2258 | unwritable_page: |
| @@ -2274,7 +2412,7 @@ restart: | |||
| 2274 | * @mapping: the address space containing mmaps to be unmapped. | 2412 | * @mapping: the address space containing mmaps to be unmapped. |
| 2275 | * @holebegin: byte in first page to unmap, relative to the start of | 2413 | * @holebegin: byte in first page to unmap, relative to the start of |
| 2276 | * the underlying file. This will be rounded down to a PAGE_SIZE | 2414 | * the underlying file. This will be rounded down to a PAGE_SIZE |
| 2277 | * boundary. Note that this is different from vmtruncate(), which | 2415 | * boundary. Note that this is different from truncate_pagecache(), which |
| 2278 | * must keep the partial page. In contrast, we must get rid of | 2416 | * must keep the partial page. In contrast, we must get rid of |
| 2279 | * partial pages. | 2417 | * partial pages. |
| 2280 | * @holelen: size of prospective hole in bytes. This will be rounded | 2418 | * @holelen: size of prospective hole in bytes. This will be rounded |
| @@ -2325,63 +2463,6 @@ void unmap_mapping_range(struct address_space *mapping, | |||
| 2325 | } | 2463 | } |
| 2326 | EXPORT_SYMBOL(unmap_mapping_range); | 2464 | EXPORT_SYMBOL(unmap_mapping_range); |
| 2327 | 2465 | ||
| 2328 | /** | ||
| 2329 | * vmtruncate - unmap mappings "freed" by truncate() syscall | ||
| 2330 | * @inode: inode of the file used | ||
| 2331 | * @offset: file offset to start truncating | ||
| 2332 | * | ||
| 2333 | * NOTE! We have to be ready to update the memory sharing | ||
| 2334 | * between the file and the memory map for a potential last | ||
| 2335 | * incomplete page. Ugly, but necessary. | ||
| 2336 | */ | ||
| 2337 | int vmtruncate(struct inode * inode, loff_t offset) | ||
| 2338 | { | ||
| 2339 | if (inode->i_size < offset) { | ||
| 2340 | unsigned long limit; | ||
| 2341 | |||
| 2342 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
| 2343 | if (limit != RLIM_INFINITY && offset > limit) | ||
| 2344 | goto out_sig; | ||
| 2345 | if (offset > inode->i_sb->s_maxbytes) | ||
| 2346 | goto out_big; | ||
| 2347 | i_size_write(inode, offset); | ||
| 2348 | } else { | ||
| 2349 | struct address_space *mapping = inode->i_mapping; | ||
| 2350 | |||
| 2351 | /* | ||
| 2352 | * truncation of in-use swapfiles is disallowed - it would | ||
| 2353 | * cause subsequent swapout to scribble on the now-freed | ||
| 2354 | * blocks. | ||
| 2355 | */ | ||
| 2356 | if (IS_SWAPFILE(inode)) | ||
| 2357 | return -ETXTBSY; | ||
| 2358 | i_size_write(inode, offset); | ||
| 2359 | |||
| 2360 | /* | ||
| 2361 | * unmap_mapping_range is called twice, first simply for | ||
| 2362 | * efficiency so that truncate_inode_pages does fewer | ||
| 2363 | * single-page unmaps. However after this first call, and | ||
| 2364 | * before truncate_inode_pages finishes, it is possible for | ||
| 2365 | * private pages to be COWed, which remain after | ||
| 2366 | * truncate_inode_pages finishes, hence the second | ||
| 2367 | * unmap_mapping_range call must be made for correctness. | ||
| 2368 | */ | ||
| 2369 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
| 2370 | truncate_inode_pages(mapping, offset); | ||
| 2371 | unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); | ||
| 2372 | } | ||
| 2373 | |||
| 2374 | if (inode->i_op->truncate) | ||
| 2375 | inode->i_op->truncate(inode); | ||
| 2376 | return 0; | ||
| 2377 | |||
| 2378 | out_sig: | ||
| 2379 | send_sig(SIGXFSZ, current, 0); | ||
| 2380 | out_big: | ||
| 2381 | return -EFBIG; | ||
| 2382 | } | ||
| 2383 | EXPORT_SYMBOL(vmtruncate); | ||
| 2384 | |||
| 2385 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | 2466 | int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) |
| 2386 | { | 2467 | { |
| 2387 | struct address_space *mapping = inode->i_mapping; | 2468 | struct address_space *mapping = inode->i_mapping; |
| @@ -2413,7 +2494,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) | |||
| 2413 | */ | 2494 | */ |
| 2414 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2495 | static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2415 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2496 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 2416 | int write_access, pte_t orig_pte) | 2497 | unsigned int flags, pte_t orig_pte) |
| 2417 | { | 2498 | { |
| 2418 | spinlock_t *ptl; | 2499 | spinlock_t *ptl; |
| 2419 | struct page *page; | 2500 | struct page *page; |
| @@ -2426,14 +2507,21 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2426 | goto out; | 2507 | goto out; |
| 2427 | 2508 | ||
| 2428 | entry = pte_to_swp_entry(orig_pte); | 2509 | entry = pte_to_swp_entry(orig_pte); |
| 2429 | if (is_migration_entry(entry)) { | 2510 | if (unlikely(non_swap_entry(entry))) { |
| 2430 | migration_entry_wait(mm, pmd, address); | 2511 | if (is_migration_entry(entry)) { |
| 2512 | migration_entry_wait(mm, pmd, address); | ||
| 2513 | } else if (is_hwpoison_entry(entry)) { | ||
| 2514 | ret = VM_FAULT_HWPOISON; | ||
| 2515 | } else { | ||
| 2516 | print_bad_pte(vma, address, orig_pte, NULL); | ||
| 2517 | ret = VM_FAULT_OOM; | ||
| 2518 | } | ||
| 2431 | goto out; | 2519 | goto out; |
| 2432 | } | 2520 | } |
| 2433 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); | 2521 | delayacct_set_flag(DELAYACCT_PF_SWAPIN); |
| 2434 | page = lookup_swap_cache(entry); | 2522 | page = lookup_swap_cache(entry); |
| 2435 | if (!page) { | 2523 | if (!page) { |
| 2436 | grab_swap_token(); /* Contend for token _before_ read-in */ | 2524 | grab_swap_token(mm); /* Contend for token _before_ read-in */ |
| 2437 | page = swapin_readahead(entry, | 2525 | page = swapin_readahead(entry, |
| 2438 | GFP_HIGHUSER_MOVABLE, vma, address); | 2526 | GFP_HIGHUSER_MOVABLE, vma, address); |
| 2439 | if (!page) { | 2527 | if (!page) { |
| @@ -2451,6 +2539,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2451 | /* Had to read the page from swap area: Major fault */ | 2539 | /* Had to read the page from swap area: Major fault */ |
| 2452 | ret = VM_FAULT_MAJOR; | 2540 | ret = VM_FAULT_MAJOR; |
| 2453 | count_vm_event(PGMAJFAULT); | 2541 | count_vm_event(PGMAJFAULT); |
| 2542 | } else if (PageHWPoison(page)) { | ||
| 2543 | ret = VM_FAULT_HWPOISON; | ||
| 2544 | delayacct_clear_flag(DELAYACCT_PF_SWAPIN); | ||
| 2545 | goto out_release; | ||
| 2454 | } | 2546 | } |
| 2455 | 2547 | ||
| 2456 | lock_page(page); | 2548 | lock_page(page); |
| @@ -2458,8 +2550,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2458 | 2550 | ||
| 2459 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { | 2551 | if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { |
| 2460 | ret = VM_FAULT_OOM; | 2552 | ret = VM_FAULT_OOM; |
| 2461 | unlock_page(page); | 2553 | goto out_page; |
| 2462 | goto out; | ||
| 2463 | } | 2554 | } |
| 2464 | 2555 | ||
| 2465 | /* | 2556 | /* |
| @@ -2490,9 +2581,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2490 | 2581 | ||
| 2491 | inc_mm_counter(mm, anon_rss); | 2582 | inc_mm_counter(mm, anon_rss); |
| 2492 | pte = mk_pte(page, vma->vm_page_prot); | 2583 | pte = mk_pte(page, vma->vm_page_prot); |
| 2493 | if (write_access && reuse_swap_page(page)) { | 2584 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
| 2494 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2585 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
| 2495 | write_access = 0; | 2586 | flags &= ~FAULT_FLAG_WRITE; |
| 2496 | } | 2587 | } |
| 2497 | flush_icache_page(vma, page); | 2588 | flush_icache_page(vma, page); |
| 2498 | set_pte_at(mm, address, page_table, pte); | 2589 | set_pte_at(mm, address, page_table, pte); |
| @@ -2505,7 +2596,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2505 | try_to_free_swap(page); | 2596 | try_to_free_swap(page); |
| 2506 | unlock_page(page); | 2597 | unlock_page(page); |
| 2507 | 2598 | ||
| 2508 | if (write_access) { | 2599 | if (flags & FAULT_FLAG_WRITE) { |
| 2509 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); | 2600 | ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); |
| 2510 | if (ret & VM_FAULT_ERROR) | 2601 | if (ret & VM_FAULT_ERROR) |
| 2511 | ret &= VM_FAULT_ERROR; | 2602 | ret &= VM_FAULT_ERROR; |
| @@ -2521,7 +2612,9 @@ out: | |||
| 2521 | out_nomap: | 2612 | out_nomap: |
| 2522 | mem_cgroup_cancel_charge_swapin(ptr); | 2613 | mem_cgroup_cancel_charge_swapin(ptr); |
| 2523 | pte_unmap_unlock(page_table, ptl); | 2614 | pte_unmap_unlock(page_table, ptl); |
| 2615 | out_page: | ||
| 2524 | unlock_page(page); | 2616 | unlock_page(page); |
| 2617 | out_release: | ||
| 2525 | page_cache_release(page); | 2618 | page_cache_release(page); |
| 2526 | return ret; | 2619 | return ret; |
| 2527 | } | 2620 | } |
| @@ -2533,12 +2626,22 @@ out_nomap: | |||
| 2533 | */ | 2626 | */ |
| 2534 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2627 | static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2535 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2628 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 2536 | int write_access) | 2629 | unsigned int flags) |
| 2537 | { | 2630 | { |
| 2538 | struct page *page; | 2631 | struct page *page; |
| 2539 | spinlock_t *ptl; | 2632 | spinlock_t *ptl; |
| 2540 | pte_t entry; | 2633 | pte_t entry; |
| 2541 | 2634 | ||
| 2635 | if (!(flags & FAULT_FLAG_WRITE)) { | ||
| 2636 | entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), | ||
| 2637 | vma->vm_page_prot)); | ||
| 2638 | ptl = pte_lockptr(mm, pmd); | ||
| 2639 | spin_lock(ptl); | ||
| 2640 | if (!pte_none(*page_table)) | ||
| 2641 | goto unlock; | ||
| 2642 | goto setpte; | ||
| 2643 | } | ||
| 2644 | |||
| 2542 | /* Allocate our own private page. */ | 2645 | /* Allocate our own private page. */ |
| 2543 | pte_unmap(page_table); | 2646 | pte_unmap(page_table); |
| 2544 | 2647 | ||
| @@ -2553,13 +2656,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2553 | goto oom_free_page; | 2656 | goto oom_free_page; |
| 2554 | 2657 | ||
| 2555 | entry = mk_pte(page, vma->vm_page_prot); | 2658 | entry = mk_pte(page, vma->vm_page_prot); |
| 2556 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2659 | if (vma->vm_flags & VM_WRITE) |
| 2660 | entry = pte_mkwrite(pte_mkdirty(entry)); | ||
| 2557 | 2661 | ||
| 2558 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2662 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
| 2559 | if (!pte_none(*page_table)) | 2663 | if (!pte_none(*page_table)) |
| 2560 | goto release; | 2664 | goto release; |
| 2665 | |||
| 2561 | inc_mm_counter(mm, anon_rss); | 2666 | inc_mm_counter(mm, anon_rss); |
| 2562 | page_add_new_anon_rmap(page, vma, address); | 2667 | page_add_new_anon_rmap(page, vma, address); |
| 2668 | setpte: | ||
| 2563 | set_pte_at(mm, address, page_table, entry); | 2669 | set_pte_at(mm, address, page_table, entry); |
| 2564 | 2670 | ||
| 2565 | /* No need to invalidate - it was non-present before */ | 2671 | /* No need to invalidate - it was non-present before */ |
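The effect of the read-fault path above is visible from userspace: reading a private anonymous mapping before any write maps every page to the shared zero page, and the first write triggers COW into a private page. A small illustrative program, not part of the kernel change:

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
            size_t len = 16UL << 20;        /* 16 MB */
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            long sum = 0;
            size_t i;

            if (p == MAP_FAILED)
                    return 1;
            for (i = 0; i < len; i += 4096)
                    sum += p[i];            /* read faults: zero page, no new memory */
            p[0] = 1;                       /* write fault: COW into a private page */
            printf("sum=%ld\n", sum);
            munmap(p, len);
            return 0;
    }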
| @@ -2614,6 +2720,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2614 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) | 2720 | if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) |
| 2615 | return ret; | 2721 | return ret; |
| 2616 | 2722 | ||
| 2723 | if (unlikely(PageHWPoison(vmf.page))) { | ||
| 2724 | if (ret & VM_FAULT_LOCKED) | ||
| 2725 | unlock_page(vmf.page); | ||
| 2726 | return VM_FAULT_HWPOISON; | ||
| 2727 | } | ||
| 2728 | |||
| 2617 | /* | 2729 | /* |
| 2618 | * For consistency in subsequent calls, make the faulted page always | 2730 | * For consistency in subsequent calls, make the faulted page always |
| 2619 | * locked. | 2731 | * locked. |
| @@ -2664,27 +2776,22 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2664 | int tmp; | 2776 | int tmp; |
| 2665 | 2777 | ||
| 2666 | unlock_page(page); | 2778 | unlock_page(page); |
| 2667 | vmf.flags |= FAULT_FLAG_MKWRITE; | 2779 | vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; |
| 2668 | tmp = vma->vm_ops->page_mkwrite(vma, &vmf); | 2780 | tmp = vma->vm_ops->page_mkwrite(vma, &vmf); |
| 2669 | if (unlikely(tmp & | 2781 | if (unlikely(tmp & |
| 2670 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { | 2782 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { |
| 2671 | ret = tmp; | 2783 | ret = tmp; |
| 2672 | anon = 1; /* no anon but release vmf.page */ | 2784 | goto unwritable_page; |
| 2673 | goto out_unlocked; | ||
| 2674 | } | ||
| 2675 | lock_page(page); | ||
| 2676 | /* | ||
| 2677 | * XXX: this is not quite right (racy vs | ||
| 2678 | * invalidate) to unlock and relock the page | ||
| 2679 | * like this, however a better fix requires | ||
| 2680 | * reworking page_mkwrite locking API, which | ||
| 2681 | * is better done later. | ||
| 2682 | */ | ||
| 2683 | if (!page->mapping) { | ||
| 2684 | ret = 0; | ||
| 2685 | anon = 1; /* no anon but release vmf.page */ | ||
| 2686 | goto out; | ||
| 2687 | } | 2785 | } |
| 2786 | if (unlikely(!(tmp & VM_FAULT_LOCKED))) { | ||
| 2787 | lock_page(page); | ||
| 2788 | if (!page->mapping) { | ||
| 2789 | ret = 0; /* retry the fault */ | ||
| 2790 | unlock_page(page); | ||
| 2791 | goto unwritable_page; | ||
| 2792 | } | ||
| 2793 | } else | ||
| 2794 | VM_BUG_ON(!PageLocked(page)); | ||
| 2688 | page_mkwrite = 1; | 2795 | page_mkwrite = 1; |
| 2689 | } | 2796 | } |
| 2690 | } | 2797 | } |
| @@ -2698,7 +2805,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2698 | * due to the bad i386 page protection. But it's valid | 2805 | * due to the bad i386 page protection. But it's valid |
| 2699 | * for other architectures too. | 2806 | * for other architectures too. |
| 2700 | * | 2807 | * |
| 2701 | * Note that if write_access is true, we either now have | 2808 | * Note that if FAULT_FLAG_WRITE is set, we either now have |
| 2702 | * an exclusive copy of the page, or this is a shared mapping, | 2809 | * an exclusive copy of the page, or this is a shared mapping, |
| 2703 | * so we can make it writable and dirty to avoid having to | 2810 | * so we can make it writable and dirty to avoid having to |
| 2704 | * handle that later. | 2811 | * handle that later. |
| @@ -2736,28 +2843,43 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2736 | pte_unmap_unlock(page_table, ptl); | 2843 | pte_unmap_unlock(page_table, ptl); |
| 2737 | 2844 | ||
| 2738 | out: | 2845 | out: |
| 2739 | unlock_page(vmf.page); | 2846 | if (dirty_page) { |
| 2740 | out_unlocked: | 2847 | struct address_space *mapping = page->mapping; |
| 2741 | if (anon) | ||
| 2742 | page_cache_release(vmf.page); | ||
| 2743 | else if (dirty_page) { | ||
| 2744 | if (vma->vm_file) | ||
| 2745 | file_update_time(vma->vm_file); | ||
| 2746 | 2848 | ||
| 2747 | set_page_dirty_balance(dirty_page, page_mkwrite); | 2849 | if (set_page_dirty(dirty_page)) |
| 2850 | page_mkwrite = 1; | ||
| 2851 | unlock_page(dirty_page); | ||
| 2748 | put_page(dirty_page); | 2852 | put_page(dirty_page); |
| 2853 | if (page_mkwrite && mapping) { | ||
| 2854 | /* | ||
| 2855 | * Some device drivers do not set page.mapping but still | ||
| 2856 | * dirty their pages | ||
| 2857 | */ | ||
| 2858 | balance_dirty_pages_ratelimited(mapping); | ||
| 2859 | } | ||
| 2860 | |||
| 2861 | /* file_update_time outside page_lock */ | ||
| 2862 | if (vma->vm_file) | ||
| 2863 | file_update_time(vma->vm_file); | ||
| 2864 | } else { | ||
| 2865 | unlock_page(vmf.page); | ||
| 2866 | if (anon) | ||
| 2867 | page_cache_release(vmf.page); | ||
| 2749 | } | 2868 | } |
| 2750 | 2869 | ||
| 2751 | return ret; | 2870 | return ret; |
| 2871 | |||
| 2872 | unwritable_page: | ||
| 2873 | page_cache_release(page); | ||
| 2874 | return ret; | ||
| 2752 | } | 2875 | } |
| 2753 | 2876 | ||
| 2754 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2877 | static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2755 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2878 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 2756 | int write_access, pte_t orig_pte) | 2879 | unsigned int flags, pte_t orig_pte) |
| 2757 | { | 2880 | { |
| 2758 | pgoff_t pgoff = (((address & PAGE_MASK) | 2881 | pgoff_t pgoff = (((address & PAGE_MASK) |
| 2759 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; | 2882 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
| 2760 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); | ||
| 2761 | 2883 | ||
| 2762 | pte_unmap(page_table); | 2884 | pte_unmap(page_table); |
| 2763 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); | 2885 | return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); |
| @@ -2774,12 +2896,12 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2774 | */ | 2896 | */ |
| 2775 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2897 | static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2776 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 2898 | unsigned long address, pte_t *page_table, pmd_t *pmd, |
| 2777 | int write_access, pte_t orig_pte) | 2899 | unsigned int flags, pte_t orig_pte) |
| 2778 | { | 2900 | { |
| 2779 | unsigned int flags = FAULT_FLAG_NONLINEAR | | ||
| 2780 | (write_access ? FAULT_FLAG_WRITE : 0); | ||
| 2781 | pgoff_t pgoff; | 2901 | pgoff_t pgoff; |
| 2782 | 2902 | ||
| 2903 | flags |= FAULT_FLAG_NONLINEAR; | ||
| 2904 | |||
| 2783 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) | 2905 | if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) |
| 2784 | return 0; | 2906 | return 0; |
| 2785 | 2907 | ||
| @@ -2810,7 +2932,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2810 | */ | 2932 | */ |
| 2811 | static inline int handle_pte_fault(struct mm_struct *mm, | 2933 | static inline int handle_pte_fault(struct mm_struct *mm, |
| 2812 | struct vm_area_struct *vma, unsigned long address, | 2934 | struct vm_area_struct *vma, unsigned long address, |
| 2813 | pte_t *pte, pmd_t *pmd, int write_access) | 2935 | pte_t *pte, pmd_t *pmd, unsigned int flags) |
| 2814 | { | 2936 | { |
| 2815 | pte_t entry; | 2937 | pte_t entry; |
| 2816 | spinlock_t *ptl; | 2938 | spinlock_t *ptl; |
| @@ -2821,30 +2943,30 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2821 | if (vma->vm_ops) { | 2943 | if (vma->vm_ops) { |
| 2822 | if (likely(vma->vm_ops->fault)) | 2944 | if (likely(vma->vm_ops->fault)) |
| 2823 | return do_linear_fault(mm, vma, address, | 2945 | return do_linear_fault(mm, vma, address, |
| 2824 | pte, pmd, write_access, entry); | 2946 | pte, pmd, flags, entry); |
| 2825 | } | 2947 | } |
| 2826 | return do_anonymous_page(mm, vma, address, | 2948 | return do_anonymous_page(mm, vma, address, |
| 2827 | pte, pmd, write_access); | 2949 | pte, pmd, flags); |
| 2828 | } | 2950 | } |
| 2829 | if (pte_file(entry)) | 2951 | if (pte_file(entry)) |
| 2830 | return do_nonlinear_fault(mm, vma, address, | 2952 | return do_nonlinear_fault(mm, vma, address, |
| 2831 | pte, pmd, write_access, entry); | 2953 | pte, pmd, flags, entry); |
| 2832 | return do_swap_page(mm, vma, address, | 2954 | return do_swap_page(mm, vma, address, |
| 2833 | pte, pmd, write_access, entry); | 2955 | pte, pmd, flags, entry); |
| 2834 | } | 2956 | } |
| 2835 | 2957 | ||
| 2836 | ptl = pte_lockptr(mm, pmd); | 2958 | ptl = pte_lockptr(mm, pmd); |
| 2837 | spin_lock(ptl); | 2959 | spin_lock(ptl); |
| 2838 | if (unlikely(!pte_same(*pte, entry))) | 2960 | if (unlikely(!pte_same(*pte, entry))) |
| 2839 | goto unlock; | 2961 | goto unlock; |
| 2840 | if (write_access) { | 2962 | if (flags & FAULT_FLAG_WRITE) { |
| 2841 | if (!pte_write(entry)) | 2963 | if (!pte_write(entry)) |
| 2842 | return do_wp_page(mm, vma, address, | 2964 | return do_wp_page(mm, vma, address, |
| 2843 | pte, pmd, ptl, entry); | 2965 | pte, pmd, ptl, entry); |
| 2844 | entry = pte_mkdirty(entry); | 2966 | entry = pte_mkdirty(entry); |
| 2845 | } | 2967 | } |
| 2846 | entry = pte_mkyoung(entry); | 2968 | entry = pte_mkyoung(entry); |
| 2847 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { | 2969 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
| 2848 | update_mmu_cache(vma, address, entry); | 2970 | update_mmu_cache(vma, address, entry); |
| 2849 | } else { | 2971 | } else { |
| 2850 | /* | 2972 | /* |
| @@ -2853,7 +2975,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2853 | * This still avoids useless tlb flushes for .text page faults | 2975 | * This still avoids useless tlb flushes for .text page faults |
| 2854 | * with threads. | 2976 | * with threads. |
| 2855 | */ | 2977 | */ |
| 2856 | if (write_access) | 2978 | if (flags & FAULT_FLAG_WRITE) |
| 2857 | flush_tlb_page(vma, address); | 2979 | flush_tlb_page(vma, address); |
| 2858 | } | 2980 | } |
| 2859 | unlock: | 2981 | unlock: |
| @@ -2865,7 +2987,7 @@ unlock: | |||
| 2865 | * By the time we get here, we already hold the mm semaphore | 2987 | * By the time we get here, we already hold the mm semaphore |
| 2866 | */ | 2988 | */ |
| 2867 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 2989 | int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
| 2868 | unsigned long address, int write_access) | 2990 | unsigned long address, unsigned int flags) |
| 2869 | { | 2991 | { |
| 2870 | pgd_t *pgd; | 2992 | pgd_t *pgd; |
| 2871 | pud_t *pud; | 2993 | pud_t *pud; |
| @@ -2877,7 +2999,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2877 | count_vm_event(PGFAULT); | 2999 | count_vm_event(PGFAULT); |
| 2878 | 3000 | ||
| 2879 | if (unlikely(is_vm_hugetlb_page(vma))) | 3001 | if (unlikely(is_vm_hugetlb_page(vma))) |
| 2880 | return hugetlb_fault(mm, vma, address, write_access); | 3002 | return hugetlb_fault(mm, vma, address, flags); |
| 2881 | 3003 | ||
| 2882 | pgd = pgd_offset(mm, address); | 3004 | pgd = pgd_offset(mm, address); |
| 2883 | pud = pud_alloc(mm, pgd, address); | 3005 | pud = pud_alloc(mm, pgd, address); |
| @@ -2890,7 +3012,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2890 | if (!pte) | 3012 | if (!pte) |
| 2891 | return VM_FAULT_OOM; | 3013 | return VM_FAULT_OOM; |
| 2892 | 3014 | ||
| 2893 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); | 3015 | return handle_pte_fault(mm, vma, address, pte, pmd, flags); |
| 2894 | } | 3016 | } |
| 2895 | 3017 | ||
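With this change the architecture fault handlers pass a flags word rather than a write boolean; a hedged sketch of the calling convention, with the wrapper name and accounting trimmed down to the essentials:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Called with mm->mmap_sem held, after find_vma() and access checks. */
    static int fault_in(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write)
    {
            unsigned int flags = write ? FAULT_FLAG_WRITE : 0;
            int fault = handle_mm_fault(mm, vma, address, flags);

            if (fault & VM_FAULT_ERROR)
                    return fault;           /* OOM, SIGBUS, HWPOISON, ... */
            if (fault & VM_FAULT_MAJOR)
                    current->maj_flt++;
            else
                    current->min_flt++;
            return 0;
    }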
| 2896 | #ifndef __PAGETABLE_PUD_FOLDED | 3018 | #ifndef __PAGETABLE_PUD_FOLDED |
| @@ -3009,22 +3131,13 @@ int in_gate_area_no_task(unsigned long addr) | |||
| 3009 | 3131 | ||
| 3010 | #endif /* __HAVE_ARCH_GATE_AREA */ | 3132 | #endif /* __HAVE_ARCH_GATE_AREA */ |
| 3011 | 3133 | ||
| 3012 | #ifdef CONFIG_HAVE_IOREMAP_PROT | 3134 | static int follow_pte(struct mm_struct *mm, unsigned long address, |
| 3013 | int follow_phys(struct vm_area_struct *vma, | 3135 | pte_t **ptepp, spinlock_t **ptlp) |
| 3014 | unsigned long address, unsigned int flags, | ||
| 3015 | unsigned long *prot, resource_size_t *phys) | ||
| 3016 | { | 3136 | { |
| 3017 | pgd_t *pgd; | 3137 | pgd_t *pgd; |
| 3018 | pud_t *pud; | 3138 | pud_t *pud; |
| 3019 | pmd_t *pmd; | 3139 | pmd_t *pmd; |
| 3020 | pte_t *ptep, pte; | 3140 | pte_t *ptep; |
| 3021 | spinlock_t *ptl; | ||
| 3022 | resource_size_t phys_addr = 0; | ||
| 3023 | struct mm_struct *mm = vma->vm_mm; | ||
| 3024 | int ret = -EINVAL; | ||
| 3025 | |||
| 3026 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
| 3027 | goto out; | ||
| 3028 | 3141 | ||
| 3029 | pgd = pgd_offset(mm, address); | 3142 | pgd = pgd_offset(mm, address); |
| 3030 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) | 3143 | if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) |
| @@ -3042,22 +3155,71 @@ int follow_phys(struct vm_area_struct *vma, | |||
| 3042 | if (pmd_huge(*pmd)) | 3155 | if (pmd_huge(*pmd)) |
| 3043 | goto out; | 3156 | goto out; |
| 3044 | 3157 | ||
| 3045 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | 3158 | ptep = pte_offset_map_lock(mm, pmd, address, ptlp); |
| 3046 | if (!ptep) | 3159 | if (!ptep) |
| 3047 | goto out; | 3160 | goto out; |
| 3161 | if (!pte_present(*ptep)) | ||
| 3162 | goto unlock; | ||
| 3163 | *ptepp = ptep; | ||
| 3164 | return 0; | ||
| 3165 | unlock: | ||
| 3166 | pte_unmap_unlock(ptep, *ptlp); | ||
| 3167 | out: | ||
| 3168 | return -EINVAL; | ||
| 3169 | } | ||
| 3170 | |||
| 3171 | /** | ||
| 3172 | * follow_pfn - look up PFN at a user virtual address | ||
| 3173 | * @vma: memory mapping | ||
| 3174 | * @address: user virtual address | ||
| 3175 | * @pfn: location to store found PFN | ||
| 3176 | * | ||
| 3177 | * Only IO mappings and raw PFN mappings are allowed. | ||
| 3178 | * | ||
| 3179 | * Returns zero and the pfn at @pfn on success, a negative errno otherwise. | ||
| 3180 | */ | ||
| 3181 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | ||
| 3182 | unsigned long *pfn) | ||
| 3183 | { | ||
| 3184 | int ret = -EINVAL; | ||
| 3185 | spinlock_t *ptl; | ||
| 3186 | pte_t *ptep; | ||
| 3187 | |||
| 3188 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
| 3189 | return ret; | ||
| 3048 | 3190 | ||
| 3191 | ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); | ||
| 3192 | if (ret) | ||
| 3193 | return ret; | ||
| 3194 | *pfn = pte_pfn(*ptep); | ||
| 3195 | pte_unmap_unlock(ptep, ptl); | ||
| 3196 | return 0; | ||
| 3197 | } | ||
| 3198 | EXPORT_SYMBOL(follow_pfn); | ||
| 3199 | |||
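Per the kerneldoc above, a typical user is a driver that needs the physical frame behind a VM_IO/VM_PFNMAP mapping; a minimal sketch, with the wrapper name and error policy illustrative:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Resolve the PFN backing a user virtual address in an IO/PFN mapping. */
    static int vaddr_to_pfn(unsigned long vaddr, unsigned long *pfn)
    {
            struct mm_struct *mm = current->mm;
            struct vm_area_struct *vma;
            int ret = -EINVAL;

            down_read(&mm->mmap_sem);
            vma = find_vma(mm, vaddr);
            if (vma && vma->vm_start <= vaddr)
                    ret = follow_pfn(vma, vaddr, pfn);  /* fails unless VM_IO|VM_PFNMAP */
            up_read(&mm->mmap_sem);
            return ret;
    }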
| 3200 | #ifdef CONFIG_HAVE_IOREMAP_PROT | ||
| 3201 | int follow_phys(struct vm_area_struct *vma, | ||
| 3202 | unsigned long address, unsigned int flags, | ||
| 3203 | unsigned long *prot, resource_size_t *phys) | ||
| 3204 | { | ||
| 3205 | int ret = -EINVAL; | ||
| 3206 | pte_t *ptep, pte; | ||
| 3207 | spinlock_t *ptl; | ||
| 3208 | |||
| 3209 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
| 3210 | goto out; | ||
| 3211 | |||
| 3212 | if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) | ||
| 3213 | goto out; | ||
| 3049 | pte = *ptep; | 3214 | pte = *ptep; |
| 3050 | if (!pte_present(pte)) | 3215 | |
| 3051 | goto unlock; | ||
| 3052 | if ((flags & FOLL_WRITE) && !pte_write(pte)) | 3216 | if ((flags & FOLL_WRITE) && !pte_write(pte)) |
| 3053 | goto unlock; | 3217 | goto unlock; |
| 3054 | phys_addr = pte_pfn(pte); | ||
| 3055 | phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */ | ||
| 3056 | 3218 | ||
| 3057 | *prot = pgprot_val(pte_pgprot(pte)); | 3219 | *prot = pgprot_val(pte_pgprot(pte)); |
| 3058 | *phys = phys_addr; | 3220 | *phys = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT; |
| 3059 | ret = 0; | ||
| 3060 | 3221 | ||
| 3222 | ret = 0; | ||
| 3061 | unlock: | 3223 | unlock: |
| 3062 | pte_unmap_unlock(ptep, ptl); | 3224 | pte_unmap_unlock(ptep, ptl); |
| 3063 | out: | 3225 | out: |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6df..2047465cd27c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | #include <linux/migrate.h> | 26 | #include <linux/migrate.h> |
| 27 | #include <linux/page-isolation.h> | 27 | #include <linux/page-isolation.h> |
| 28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
| 29 | #include <linux/suspend.h> | ||
| 29 | 30 | ||
| 30 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
| 31 | 32 | ||
| @@ -339,8 +340,11 @@ EXPORT_SYMBOL_GPL(__remove_pages); | |||
| 339 | 340 | ||
| 340 | void online_page(struct page *page) | 341 | void online_page(struct page *page) |
| 341 | { | 342 | { |
| 343 | unsigned long pfn = page_to_pfn(page); | ||
| 344 | |||
| 342 | totalram_pages++; | 345 | totalram_pages++; |
| 343 | num_physpages++; | 346 | if (pfn >= num_physpages) |
| 347 | num_physpages = pfn + 1; | ||
| 344 | 348 | ||
| 345 | #ifdef CONFIG_HIGHMEM | 349 | #ifdef CONFIG_HIGHMEM |
| 346 | if (PageHighMem(page)) | 350 | if (PageHighMem(page)) |
| @@ -410,7 +414,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 410 | if (!populated_zone(zone)) | 414 | if (!populated_zone(zone)) |
| 411 | need_zonelists_rebuild = 1; | 415 | need_zonelists_rebuild = 1; |
| 412 | 416 | ||
| 413 | ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, | 417 | ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, |
| 414 | online_pages_range); | 418 | online_pages_range); |
| 415 | if (ret) { | 419 | if (ret) { |
| 416 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", | 420 | printk(KERN_DEBUG "online_pages %lx at %lx failed\n", |
| @@ -422,7 +426,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 422 | zone->present_pages += onlined_pages; | 426 | zone->present_pages += onlined_pages; |
| 423 | zone->zone_pgdat->node_present_pages += onlined_pages; | 427 | zone->zone_pgdat->node_present_pages += onlined_pages; |
| 424 | 428 | ||
| 425 | setup_per_zone_pages_min(); | 429 | zone_pcp_update(zone); |
| 430 | setup_per_zone_wmarks(); | ||
| 431 | calculate_zone_inactive_ratio(zone); | ||
| 426 | if (onlined_pages) { | 432 | if (onlined_pages) { |
| 427 | kswapd_run(zone_to_nid(zone)); | 433 | kswapd_run(zone_to_nid(zone)); |
| 428 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | 434 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
| @@ -442,7 +448,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 442 | } | 448 | } |
| 443 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 449 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
| 444 | 450 | ||
| 445 | static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | 451 | /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ |
| 452 | static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | ||
| 446 | { | 453 | { |
| 447 | struct pglist_data *pgdat; | 454 | struct pglist_data *pgdat; |
| 448 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | 455 | unsigned long zones_size[MAX_NR_ZONES] = {0}; |
| @@ -479,14 +486,18 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 479 | struct resource *res; | 486 | struct resource *res; |
| 480 | int ret; | 487 | int ret; |
| 481 | 488 | ||
| 489 | lock_system_sleep(); | ||
| 490 | |||
| 482 | res = register_memory_resource(start, size); | 491 | res = register_memory_resource(start, size); |
| 492 | ret = -EEXIST; | ||
| 483 | if (!res) | 493 | if (!res) |
| 484 | return -EEXIST; | 494 | goto out; |
| 485 | 495 | ||
| 486 | if (!node_online(nid)) { | 496 | if (!node_online(nid)) { |
| 487 | pgdat = hotadd_new_pgdat(nid, start); | 497 | pgdat = hotadd_new_pgdat(nid, start); |
| 498 | ret = -ENOMEM; | ||
| 488 | if (!pgdat) | 499 | if (!pgdat) |
| 489 | return -ENOMEM; | 500 | goto out; |
| 490 | new_pgdat = 1; | 501 | new_pgdat = 1; |
| 491 | } | 502 | } |
| 492 | 503 | ||
| @@ -509,7 +520,8 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 509 | BUG_ON(ret); | 520 | BUG_ON(ret); |
| 510 | } | 521 | } |
| 511 | 522 | ||
| 512 | return ret; | 523 | goto out; |
| 524 | |||
| 513 | error: | 525 | error: |
| 514 | /* rollback pgdat allocation and others */ | 526 | /* rollback pgdat allocation and others */ |
| 515 | if (new_pgdat) | 527 | if (new_pgdat) |
| @@ -517,6 +529,8 @@ error: | |||
| 517 | if (res) | 529 | if (res) |
| 518 | release_memory_resource(res); | 530 | release_memory_resource(res); |
| 519 | 531 | ||
| 532 | out: | ||
| 533 | unlock_system_sleep(); | ||
| 520 | return ret; | 534 | return ret; |
| 521 | } | 535 | } |
| 522 | EXPORT_SYMBOL_GPL(add_memory); | 536 | EXPORT_SYMBOL_GPL(add_memory); |
| @@ -700,7 +714,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, | |||
| 700 | static void | 714 | static void |
| 701 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | 715 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) |
| 702 | { | 716 | { |
| 703 | walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, | 717 | walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, |
| 704 | offline_isolated_pages_cb); | 718 | offline_isolated_pages_cb); |
| 705 | } | 719 | } |
| 706 | 720 | ||
| @@ -726,7 +740,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
| 726 | long offlined = 0; | 740 | long offlined = 0; |
| 727 | int ret; | 741 | int ret; |
| 728 | 742 | ||
| 729 | ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, | 743 | ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, |
| 730 | check_pages_isolated_cb); | 744 | check_pages_isolated_cb); |
| 731 | if (ret < 0) | 745 | if (ret < 0) |
| 732 | offlined = (long)ret; | 746 | offlined = (long)ret; |
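The walk_system_ram_range() calls above all pass a callback of the same shape as online_pages_range() and check_pages_isolated_cb(); a minimal sketch of such a callback, assuming the declaration lives in <linux/ioport.h> and with the counting purpose illustrative:

    #include <linux/ioport.h>

    /* Callback for walk_system_ram_range(): count RAM pages in the walked span. */
    static int count_ram_pages_cb(unsigned long start_pfn, unsigned long nr_pages,
                                  void *arg)
    {
            unsigned long *total = arg;

            *total += nr_pages;
            return 0;       /* a non-zero return aborts the walk */
    }

    /* Usage: walk_system_ram_range(base_pfn, span, &total, count_ram_pages_cb); */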
| @@ -753,6 +767,8 @@ int offline_pages(unsigned long start_pfn, | |||
| 753 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 767 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
| 754 | return -EINVAL; | 768 | return -EINVAL; |
| 755 | 769 | ||
| 770 | lock_system_sleep(); | ||
| 771 | |||
| 756 | zone = page_zone(pfn_to_page(start_pfn)); | 772 | zone = page_zone(pfn_to_page(start_pfn)); |
| 757 | node = zone_to_nid(zone); | 773 | node = zone_to_nid(zone); |
| 758 | nr_pages = end_pfn - start_pfn; | 774 | nr_pages = end_pfn - start_pfn; |
| @@ -760,7 +776,7 @@ int offline_pages(unsigned long start_pfn, | |||
| 760 | /* set above range as isolated */ | 776 | /* set above range as isolated */ |
| 761 | ret = start_isolate_page_range(start_pfn, end_pfn); | 777 | ret = start_isolate_page_range(start_pfn, end_pfn); |
| 762 | if (ret) | 778 | if (ret) |
| 763 | return ret; | 779 | goto out; |
| 764 | 780 | ||
| 765 | arg.start_pfn = start_pfn; | 781 | arg.start_pfn = start_pfn; |
| 766 | arg.nr_pages = nr_pages; | 782 | arg.nr_pages = nr_pages; |
| @@ -830,12 +846,15 @@ repeat: | |||
| 830 | zone->present_pages -= offlined_pages; | 846 | zone->present_pages -= offlined_pages; |
| 831 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 847 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
| 832 | totalram_pages -= offlined_pages; | 848 | totalram_pages -= offlined_pages; |
| 833 | num_physpages -= offlined_pages; | 849 | |
| 850 | setup_per_zone_wmarks(); | ||
| 851 | calculate_zone_inactive_ratio(zone); | ||
| 834 | 852 | ||
| 835 | vm_total_pages = nr_free_pagecache_pages(); | 853 | vm_total_pages = nr_free_pagecache_pages(); |
| 836 | writeback_set_ratelimit(); | 854 | writeback_set_ratelimit(); |
| 837 | 855 | ||
| 838 | memory_notify(MEM_OFFLINE, &arg); | 856 | memory_notify(MEM_OFFLINE, &arg); |
| 857 | unlock_system_sleep(); | ||
| 839 | return 0; | 858 | return 0; |
| 840 | 859 | ||
| 841 | failed_removal: | 860 | failed_removal: |
| @@ -845,6 +864,8 @@ failed_removal: | |||
| 845 | /* pushback to free area */ | 864 | /* pushback to free area */ |
| 846 | undo_isolate_page_range(start_pfn, end_pfn); | 865 | undo_isolate_page_range(start_pfn, end_pfn); |
| 847 | 866 | ||
| 867 | out: | ||
| 868 | unlock_system_sleep(); | ||
| 848 | return ret; | 869 | return ret; |
| 849 | } | 870 | } |
| 850 | 871 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3eb4a6fdc043..4545d5944243 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -182,13 +182,58 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) | |||
| 182 | return 0; | 182 | return 0; |
| 183 | } | 183 | } |
| 184 | 184 | ||
| 185 | /* Create a new policy */ | 185 | /* |
| 186 | * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if | ||
| 187 | * any, for the new policy. mpol_new() has already validated the nodes | ||
| 188 | * parameter with respect to the policy mode and flags. But, we need to | ||
| 189 | * handle an empty nodemask with MPOL_PREFERRED here. | ||
| 190 | * | ||
| 191 | * Must be called holding task's alloc_lock to protect task's mems_allowed | ||
| 192 | * and mempolicy. May also be called holding the mmap_semaphore for write. | ||
| 193 | */ | ||
| 194 | static int mpol_set_nodemask(struct mempolicy *pol, | ||
| 195 | const nodemask_t *nodes, struct nodemask_scratch *nsc) | ||
| 196 | { | ||
| 197 | int ret; | ||
| 198 | |||
| 199 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | ||
| 200 | if (pol == NULL) | ||
| 201 | return 0; | ||
| 202 | /* Check N_HIGH_MEMORY */ | ||
| 203 | nodes_and(nsc->mask1, | ||
| 204 | cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); | ||
| 205 | |||
| 206 | VM_BUG_ON(!nodes); | ||
| 207 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | ||
| 208 | nodes = NULL; /* explicit local allocation */ | ||
| 209 | else { | ||
| 210 | if (pol->flags & MPOL_F_RELATIVE_NODES) | ||
| 211 | mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); | ||
| 212 | else | ||
| 213 | nodes_and(nsc->mask2, *nodes, nsc->mask1); | ||
| 214 | |||
| 215 | if (mpol_store_user_nodemask(pol)) | ||
| 216 | pol->w.user_nodemask = *nodes; | ||
| 217 | else | ||
| 218 | pol->w.cpuset_mems_allowed = | ||
| 219 | cpuset_current_mems_allowed; | ||
| 220 | } | ||
| 221 | |||
| 222 | if (nodes) | ||
| 223 | ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); | ||
| 224 | else | ||
| 225 | ret = mpol_ops[pol->mode].create(pol, NULL); | ||
| 226 | return ret; | ||
| 227 | } | ||
| 228 | |||
| 229 | /* | ||
| 230 | * This function just creates a new policy, does some checks and simple | ||
| 231 | * initialization. You must invoke mpol_set_nodemask() to set nodes. | ||
| 232 | */ | ||
| 186 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | 233 | static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, |
| 187 | nodemask_t *nodes) | 234 | nodemask_t *nodes) |
| 188 | { | 235 | { |
| 189 | struct mempolicy *policy; | 236 | struct mempolicy *policy; |
| 190 | nodemask_t cpuset_context_nmask; | ||
| 191 | int ret; | ||
| 192 | 237 | ||
| 193 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", | 238 | pr_debug("setting mode %d flags %d nodes[0] %lx\n", |
| 194 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); | 239 | mode, flags, nodes ? nodes_addr(*nodes)[0] : -1); |
| @@ -210,7 +255,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
| 210 | if (((flags & MPOL_F_STATIC_NODES) || | 255 | if (((flags & MPOL_F_STATIC_NODES) || |
| 211 | (flags & MPOL_F_RELATIVE_NODES))) | 256 | (flags & MPOL_F_RELATIVE_NODES))) |
| 212 | return ERR_PTR(-EINVAL); | 257 | return ERR_PTR(-EINVAL); |
| 213 | nodes = NULL; /* flag local alloc */ | ||
| 214 | } | 258 | } |
| 215 | } else if (nodes_empty(*nodes)) | 259 | } else if (nodes_empty(*nodes)) |
| 216 | return ERR_PTR(-EINVAL); | 260 | return ERR_PTR(-EINVAL); |
| @@ -221,30 +265,6 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, | |||
| 221 | policy->mode = mode; | 265 | policy->mode = mode; |
| 222 | policy->flags = flags; | 266 | policy->flags = flags; |
| 223 | 267 | ||
| 224 | if (nodes) { | ||
| 225 | /* | ||
| 226 | * cpuset related setup doesn't apply to local allocation | ||
| 227 | */ | ||
| 228 | cpuset_update_task_memory_state(); | ||
| 229 | if (flags & MPOL_F_RELATIVE_NODES) | ||
| 230 | mpol_relative_nodemask(&cpuset_context_nmask, nodes, | ||
| 231 | &cpuset_current_mems_allowed); | ||
| 232 | else | ||
| 233 | nodes_and(cpuset_context_nmask, *nodes, | ||
| 234 | cpuset_current_mems_allowed); | ||
| 235 | if (mpol_store_user_nodemask(policy)) | ||
| 236 | policy->w.user_nodemask = *nodes; | ||
| 237 | else | ||
| 238 | policy->w.cpuset_mems_allowed = | ||
| 239 | cpuset_mems_allowed(current); | ||
| 240 | } | ||
| 241 | |||
| 242 | ret = mpol_ops[mode].create(policy, | ||
| 243 | nodes ? &cpuset_context_nmask : NULL); | ||
| 244 | if (ret < 0) { | ||
| 245 | kmem_cache_free(policy_cache, policy); | ||
| 246 | return ERR_PTR(ret); | ||
| 247 | } | ||
| 248 | return policy; | 268 | return policy; |
| 249 | } | 269 | } |
| 250 | 270 | ||
| @@ -324,6 +344,8 @@ static void mpol_rebind_policy(struct mempolicy *pol, | |||
| 324 | /* | 344 | /* |
| 325 | * Wrapper for mpol_rebind_policy() that just requires task | 345 | * Wrapper for mpol_rebind_policy() that just requires task |
| 326 | * pointer, and updates task mempolicy. | 346 | * pointer, and updates task mempolicy. |
| 347 | * | ||
| 348 | * Called with task's alloc_lock held. | ||
| 327 | */ | 349 | */ |
| 328 | 350 | ||
| 329 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | 351 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) |
| @@ -600,13 +622,19 @@ static void mpol_set_task_struct_flag(void) | |||
| 600 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, | 622 | static long do_set_mempolicy(unsigned short mode, unsigned short flags, |
| 601 | nodemask_t *nodes) | 623 | nodemask_t *nodes) |
| 602 | { | 624 | { |
| 603 | struct mempolicy *new; | 625 | struct mempolicy *new, *old; |
| 604 | struct mm_struct *mm = current->mm; | 626 | struct mm_struct *mm = current->mm; |
| 627 | NODEMASK_SCRATCH(scratch); | ||
| 628 | int ret; | ||
| 605 | 629 | ||
| 606 | new = mpol_new(mode, flags, nodes); | 630 | if (!scratch) |
| 607 | if (IS_ERR(new)) | 631 | return -ENOMEM; |
| 608 | return PTR_ERR(new); | ||
| 609 | 632 | ||
| 633 | new = mpol_new(mode, flags, nodes); | ||
| 634 | if (IS_ERR(new)) { | ||
| 635 | ret = PTR_ERR(new); | ||
| 636 | goto out; | ||
| 637 | } | ||
| 610 | /* | 638 | /* |
| 611 | * prevent changing our mempolicy while show_numa_maps() | 639 | * prevent changing our mempolicy while show_numa_maps() |
| 612 | * is using it. | 640 | * is using it. |
| @@ -615,20 +643,36 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
| 615 | */ | 643 | */ |
| 616 | if (mm) | 644 | if (mm) |
| 617 | down_write(&mm->mmap_sem); | 645 | down_write(&mm->mmap_sem); |
| 618 | mpol_put(current->mempolicy); | 646 | task_lock(current); |
| 647 | ret = mpol_set_nodemask(new, nodes, scratch); | ||
| 648 | if (ret) { | ||
| 649 | task_unlock(current); | ||
| 650 | if (mm) | ||
| 651 | up_write(&mm->mmap_sem); | ||
| 652 | mpol_put(new); | ||
| 653 | goto out; | ||
| 654 | } | ||
| 655 | old = current->mempolicy; | ||
| 619 | current->mempolicy = new; | 656 | current->mempolicy = new; |
| 620 | mpol_set_task_struct_flag(); | 657 | mpol_set_task_struct_flag(); |
| 621 | if (new && new->mode == MPOL_INTERLEAVE && | 658 | if (new && new->mode == MPOL_INTERLEAVE && |
| 622 | nodes_weight(new->v.nodes)) | 659 | nodes_weight(new->v.nodes)) |
| 623 | current->il_next = first_node(new->v.nodes); | 660 | current->il_next = first_node(new->v.nodes); |
| 661 | task_unlock(current); | ||
| 624 | if (mm) | 662 | if (mm) |
| 625 | up_write(&mm->mmap_sem); | 663 | up_write(&mm->mmap_sem); |
| 626 | 664 | ||
| 627 | return 0; | 665 | mpol_put(old); |
| 666 | ret = 0; | ||
| 667 | out: | ||
| 668 | NODEMASK_SCRATCH_FREE(scratch); | ||
| 669 | return ret; | ||
| 628 | } | 670 | } |
| 629 | 671 | ||
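For reference, the task policy being installed here arrives through the set_mempolicy() syscall; a minimal userspace sketch, assuming libnuma's <numaif.h> wrapper (link with -lnuma) and with the node choice illustrative:

    #include <numaif.h>
    #include <stdio.h>

    int main(void)
    {
            /* Interleave this task's future allocations across nodes 0 and 1. */
            unsigned long nodemask = (1UL << 0) | (1UL << 1);

            if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask)))
                    perror("set_mempolicy");
            return 0;
    }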
| 630 | /* | 672 | /* |
| 631 | * Return nodemask for policy for get_mempolicy() query | 673 | * Return nodemask for policy for get_mempolicy() query |
| 674 | * | ||
| 675 | * Called with task's alloc_lock held | ||
| 632 | */ | 676 | */ |
| 633 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) | 677 | static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) |
| 634 | { | 678 | { |
| @@ -674,7 +718,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 674 | struct vm_area_struct *vma = NULL; | 718 | struct vm_area_struct *vma = NULL; |
| 675 | struct mempolicy *pol = current->mempolicy; | 719 | struct mempolicy *pol = current->mempolicy; |
| 676 | 720 | ||
| 677 | cpuset_update_task_memory_state(); | ||
| 678 | if (flags & | 721 | if (flags & |
| 679 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | 722 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) |
| 680 | return -EINVAL; | 723 | return -EINVAL; |
| @@ -683,7 +726,9 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 683 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | 726 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) |
| 684 | return -EINVAL; | 727 | return -EINVAL; |
| 685 | *policy = 0; /* just so it's initialized */ | 728 | *policy = 0; /* just so it's initialized */ |
| 729 | task_lock(current); | ||
| 686 | *nmask = cpuset_current_mems_allowed; | 730 | *nmask = cpuset_current_mems_allowed; |
| 731 | task_unlock(current); | ||
| 687 | return 0; | 732 | return 0; |
| 688 | } | 733 | } |
| 689 | 734 | ||
| @@ -738,8 +783,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 738 | } | 783 | } |
| 739 | 784 | ||
| 740 | err = 0; | 785 | err = 0; |
| 741 | if (nmask) | 786 | if (nmask) { |
| 787 | task_lock(current); | ||
| 742 | get_policy_nodemask(pol, nmask); | 788 | get_policy_nodemask(pol, nmask); |
| 789 | task_unlock(current); | ||
| 790 | } | ||
| 743 | 791 | ||
| 744 | out: | 792 | out: |
| 745 | mpol_cond_put(pol); | 793 | mpol_cond_put(pol); |
| @@ -767,7 +815,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
| 767 | 815 | ||
| 768 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | 816 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) |
| 769 | { | 817 | { |
| 770 | return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); | 818 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); |
| 771 | } | 819 | } |
| 772 | 820 | ||
| 773 | /* | 821 | /* |
| @@ -976,9 +1024,24 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 976 | 1024 | ||
| 977 | err = migrate_prep(); | 1025 | err = migrate_prep(); |
| 978 | if (err) | 1026 | if (err) |
| 979 | return err; | 1027 | goto mpol_out; |
| 980 | } | 1028 | } |
| 981 | down_write(&mm->mmap_sem); | 1029 | { |
| 1030 | NODEMASK_SCRATCH(scratch); | ||
| 1031 | if (scratch) { | ||
| 1032 | down_write(&mm->mmap_sem); | ||
| 1033 | task_lock(current); | ||
| 1034 | err = mpol_set_nodemask(new, nmask, scratch); | ||
| 1035 | task_unlock(current); | ||
| 1036 | if (err) | ||
| 1037 | up_write(&mm->mmap_sem); | ||
| 1038 | } else | ||
| 1039 | err = -ENOMEM; | ||
| 1040 | NODEMASK_SCRATCH_FREE(scratch); | ||
| 1041 | } | ||
| 1042 | if (err) | ||
| 1043 | goto mpol_out; | ||
| 1044 | |||
| 982 | vma = check_range(mm, start, end, nmask, | 1045 | vma = check_range(mm, start, end, nmask, |
| 983 | flags | MPOL_MF_INVERT, &pagelist); | 1046 | flags | MPOL_MF_INVERT, &pagelist); |
| 984 | 1047 | ||
| @@ -994,9 +1057,11 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 994 | 1057 | ||
| 995 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 1058 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
| 996 | err = -EIO; | 1059 | err = -EIO; |
| 997 | } | 1060 | } else |
| 1061 | putback_lru_pages(&pagelist); | ||
| 998 | 1062 | ||
| 999 | up_write(&mm->mmap_sem); | 1063 | up_write(&mm->mmap_sem); |
| 1064 | mpol_out: | ||
| 1000 | mpol_put(new); | 1065 | mpol_put(new); |
| 1001 | return err; | 1066 | return err; |
| 1002 | } | 1067 | } |
| @@ -1545,8 +1610,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
| 1545 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1610 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
| 1546 | struct zonelist *zl; | 1611 | struct zonelist *zl; |
| 1547 | 1612 | ||
| 1548 | cpuset_update_task_memory_state(); | ||
| 1549 | |||
| 1550 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 1613 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
| 1551 | unsigned nid; | 1614 | unsigned nid; |
| 1552 | 1615 | ||
| @@ -1593,8 +1656,6 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
| 1593 | { | 1656 | { |
| 1594 | struct mempolicy *pol = current->mempolicy; | 1657 | struct mempolicy *pol = current->mempolicy; |
| 1595 | 1658 | ||
| 1596 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | ||
| 1597 | cpuset_update_task_memory_state(); | ||
| 1598 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 1659 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) |
| 1599 | pol = &default_policy; | 1660 | pol = &default_policy; |
| 1600 | 1661 | ||
| @@ -1851,27 +1912,46 @@ restart: | |||
| 1851 | * Install non-NULL @mpol in inode's shared policy rb-tree. | 1912 | * Install non-NULL @mpol in inode's shared policy rb-tree. |
| 1852 | * On entry, the current task has a reference on a non-NULL @mpol. | 1913 | * On entry, the current task has a reference on a non-NULL @mpol. |
| 1853 | * This must be released on exit. | 1914 | * This must be released on exit. |
| 1915 | * This is called at get_inode() calls and we can use GFP_KERNEL. | ||
| 1854 | */ | 1916 | */ |
| 1855 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) | 1917 | void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) |
| 1856 | { | 1918 | { |
| 1919 | int ret; | ||
| 1920 | |||
| 1857 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ | 1921 | sp->root = RB_ROOT; /* empty tree == default mempolicy */ |
| 1858 | spin_lock_init(&sp->lock); | 1922 | spin_lock_init(&sp->lock); |
| 1859 | 1923 | ||
| 1860 | if (mpol) { | 1924 | if (mpol) { |
| 1861 | struct vm_area_struct pvma; | 1925 | struct vm_area_struct pvma; |
| 1862 | struct mempolicy *new; | 1926 | struct mempolicy *new; |
| 1927 | NODEMASK_SCRATCH(scratch); | ||
| 1863 | 1928 | ||
| 1929 | if (!scratch) | ||
| 1930 | return; | ||
| 1864 | /* contextualize the tmpfs mount point mempolicy */ | 1931 | /* contextualize the tmpfs mount point mempolicy */ |
| 1865 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); | 1932 | new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); |
| 1866 | mpol_put(mpol); /* drop our ref on sb mpol */ | 1933 | if (IS_ERR(new)) { |
| 1867 | if (IS_ERR(new)) | 1934 | mpol_put(mpol); /* drop our ref on sb mpol */ |
| 1935 | NODEMASK_SCRATCH_FREE(scratch); | ||
| 1868 | return; /* no valid nodemask intersection */ | 1936 | return; /* no valid nodemask intersection */ |
| 1937 | } | ||
| 1938 | |||
| 1939 | task_lock(current); | ||
| 1940 | ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); | ||
| 1941 | task_unlock(current); | ||
| 1942 | mpol_put(mpol); /* drop our ref on sb mpol */ | ||
| 1943 | if (ret) { | ||
| 1944 | NODEMASK_SCRATCH_FREE(scratch); | ||
| 1945 | mpol_put(new); | ||
| 1946 | return; | ||
| 1947 | } | ||
| 1869 | 1948 | ||
| 1870 | /* Create pseudo-vma that contains just the policy */ | 1949 | /* Create pseudo-vma that contains just the policy */ |
| 1871 | memset(&pvma, 0, sizeof(struct vm_area_struct)); | 1950 | memset(&pvma, 0, sizeof(struct vm_area_struct)); |
| 1872 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ | 1951 | pvma.vm_end = TASK_SIZE; /* policy covers entire file */ |
| 1873 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ | 1952 | mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ |
| 1874 | mpol_put(new); /* drop initial ref */ | 1953 | mpol_put(new); /* drop initial ref */ |
| 1954 | NODEMASK_SCRATCH_FREE(scratch); | ||
| 1875 | } | 1955 | } |
| 1876 | } | 1956 | } |
| 1877 | 1957 | ||
| @@ -2086,8 +2166,24 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2086 | new = mpol_new(mode, mode_flags, &nodes); | 2166 | new = mpol_new(mode, mode_flags, &nodes); |
| 2087 | if (IS_ERR(new)) | 2167 | if (IS_ERR(new)) |
| 2088 | err = 1; | 2168 | err = 1; |
| 2089 | else if (no_context) | 2169 | else { |
| 2090 | new->w.user_nodemask = nodes; /* save for contextualization */ | 2170 | int ret; |
| 2171 | NODEMASK_SCRATCH(scratch); | ||
| 2172 | if (scratch) { | ||
| 2173 | task_lock(current); | ||
| 2174 | ret = mpol_set_nodemask(new, &nodes, scratch); | ||
| 2175 | task_unlock(current); | ||
| 2176 | } else | ||
| 2177 | ret = -ENOMEM; | ||
| 2178 | NODEMASK_SCRATCH_FREE(scratch); | ||
| 2179 | if (ret) { | ||
| 2180 | err = 1; | ||
| 2181 | mpol_put(new); | ||
| 2182 | } else if (no_context) { | ||
| 2183 | /* save for contextualization */ | ||
| 2184 | new->w.user_nodemask = nodes; | ||
| 2185 | } | ||
| 2186 | } | ||
| 2091 | 2187 | ||
| 2092 | out: | 2188 | out: |
| 2093 | /* Restore string for error message */ | 2189 | /* Restore string for error message */ |
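Taken together, the mempolicy.c hunks above route every site that instantiates a policy (do_mbind(), the tmpfs mount path in mpol_shared_policy_init(), and mpol_parse_str()) through mpol_set_nodemask() with a NODEMASK_SCRATCH buffer taken under task_lock(current), now that the cpuset_update_task_memory_state() calls on the allocation paths are gone. For reference, a minimal userspace sketch of the mbind() call that drives the do_mbind() path; the node number, sizes and flag choice are illustrative assumptions, and it links against libnuma:

    #define _GNU_SOURCE
    #include <numaif.h>         /* mbind(), MPOL_BIND, MPOL_MF_* */
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
        size_t len = 16 * 4096;
        unsigned long nodemask = 1UL << 0;   /* bind to node 0 (assumption) */
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
            return 1;
        /* MPOL_MF_MOVE asks do_mbind() to migrate pages already faulted
         * into the range, i.e. the migrate_prep()/check_range() path */
        if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
                  MPOL_MF_MOVE | MPOL_MF_STRICT))
            perror("mbind");
        munmap(buf, len);
        return 0;
    }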
diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b4bb66..1a3bc3d4d554 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
| @@ -303,18 +303,11 @@ EXPORT_SYMBOL(mempool_free_slab); | |||
| 303 | */ | 303 | */ |
| 304 | void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) | 304 | void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) |
| 305 | { | 305 | { |
| 306 | size_t size = (size_t)(long)pool_data; | 306 | size_t size = (size_t)pool_data; |
| 307 | return kmalloc(size, gfp_mask); | 307 | return kmalloc(size, gfp_mask); |
| 308 | } | 308 | } |
| 309 | EXPORT_SYMBOL(mempool_kmalloc); | 309 | EXPORT_SYMBOL(mempool_kmalloc); |
| 310 | 310 | ||
| 311 | void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) | ||
| 312 | { | ||
| 313 | size_t size = (size_t) pool_data; | ||
| 314 | return kzalloc(size, gfp_mask); | ||
| 315 | } | ||
| 316 | EXPORT_SYMBOL(mempool_kzalloc); | ||
| 317 | |||
| 318 | void mempool_kfree(void *element, void *pool_data) | 311 | void mempool_kfree(void *element, void *pool_data) |
| 319 | { | 312 | { |
| 320 | kfree(element); | 313 | kfree(element); |
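With mempool_kzalloc() removed, kmalloc-backed pools are left with the mempool_kmalloc()/mempool_kfree() pair, the element size being carried through pool_data and recovered by the (size_t) cast above. A hedged kernel-style sketch of that calling pattern; the structure, pool depth and module boilerplate are made up for illustration, and a caller that previously relied on mempool_kzalloc() can simply memset() the element after mempool_alloc():

    #include <linux/module.h>
    #include <linux/mempool.h>
    #include <linux/slab.h>

    struct demo_req { int id; char payload[60]; };

    static mempool_t *demo_pool;

    static int __init demo_init(void)
    {
        /* pool_data carries the element size straight into mempool_kmalloc() */
        demo_pool = mempool_create(16, mempool_kmalloc, mempool_kfree,
                                   (void *)(unsigned long)sizeof(struct demo_req));
        return demo_pool ? 0 : -ENOMEM;
    }

    static void __exit demo_exit(void)
    {
        mempool_destroy(demo_pool);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");

Elements would then be taken with mempool_alloc(demo_pool, GFP_NOIO) and returned with mempool_free(), exactly as with any other mempool.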
diff --git a/mm/migrate.c b/mm/migrate.c index a9eff3f092f6..7dbcb22316d2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -67,6 +67,8 @@ int putback_lru_pages(struct list_head *l) | |||
| 67 | 67 | ||
| 68 | list_for_each_entry_safe(page, page2, l, lru) { | 68 | list_for_each_entry_safe(page, page2, l, lru) { |
| 69 | list_del(&page->lru); | 69 | list_del(&page->lru); |
| 70 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 71 | page_is_file_cache(page)); | ||
| 70 | putback_lru_page(page); | 72 | putback_lru_page(page); |
| 71 | count++; | 73 | count++; |
| 72 | } | 74 | } |
| @@ -147,7 +149,7 @@ out: | |||
| 147 | static void remove_file_migration_ptes(struct page *old, struct page *new) | 149 | static void remove_file_migration_ptes(struct page *old, struct page *new) |
| 148 | { | 150 | { |
| 149 | struct vm_area_struct *vma; | 151 | struct vm_area_struct *vma; |
| 150 | struct address_space *mapping = page_mapping(new); | 152 | struct address_space *mapping = new->mapping; |
| 151 | struct prio_tree_iter iter; | 153 | struct prio_tree_iter iter; |
| 152 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 154 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
| 153 | 155 | ||
| @@ -250,7 +252,7 @@ out: | |||
| 250 | * The number of remaining references must be: | 252 | * The number of remaining references must be: |
| 251 | * 1 for anonymous pages without a mapping | 253 | * 1 for anonymous pages without a mapping |
| 252 | * 2 for pages with a mapping | 254 | * 2 for pages with a mapping |
| 253 | * 3 for pages with a mapping and PagePrivate set. | 255 | * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. |
| 254 | */ | 256 | */ |
| 255 | static int migrate_page_move_mapping(struct address_space *mapping, | 257 | static int migrate_page_move_mapping(struct address_space *mapping, |
| 256 | struct page *newpage, struct page *page) | 258 | struct page *newpage, struct page *page) |
| @@ -270,7 +272,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 270 | pslot = radix_tree_lookup_slot(&mapping->page_tree, | 272 | pslot = radix_tree_lookup_slot(&mapping->page_tree, |
| 271 | page_index(page)); | 273 | page_index(page)); |
| 272 | 274 | ||
| 273 | expected_count = 2 + !!PagePrivate(page); | 275 | expected_count = 2 + page_has_private(page); |
| 274 | if (page_count(page) != expected_count || | 276 | if (page_count(page) != expected_count || |
| 275 | (struct page *)radix_tree_deref_slot(pslot) != page) { | 277 | (struct page *)radix_tree_deref_slot(pslot) != page) { |
| 276 | spin_unlock_irq(&mapping->tree_lock); | 278 | spin_unlock_irq(&mapping->tree_lock); |
| @@ -312,7 +314,10 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
| 312 | */ | 314 | */ |
| 313 | __dec_zone_page_state(page, NR_FILE_PAGES); | 315 | __dec_zone_page_state(page, NR_FILE_PAGES); |
| 314 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 316 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
| 315 | 317 | if (PageSwapBacked(page)) { | |
| 318 | __dec_zone_page_state(page, NR_SHMEM); | ||
| 319 | __inc_zone_page_state(newpage, NR_SHMEM); | ||
| 320 | } | ||
| 316 | spin_unlock_irq(&mapping->tree_lock); | 321 | spin_unlock_irq(&mapping->tree_lock); |
| 317 | 322 | ||
| 318 | return 0; | 323 | return 0; |
| @@ -386,7 +391,7 @@ EXPORT_SYMBOL(fail_migrate_page); | |||
| 386 | 391 | ||
| 387 | /* | 392 | /* |
| 388 | * Common logic to directly migrate a single page suitable for | 393 | * Common logic to directly migrate a single page suitable for |
| 389 | * pages that do not use PagePrivate. | 394 | * pages that do not use PagePrivate/PagePrivate2. |
| 390 | * | 395 | * |
| 391 | * Pages are locked upon entry and exit. | 396 | * Pages are locked upon entry and exit. |
| 392 | */ | 397 | */ |
| @@ -522,7 +527,7 @@ static int fallback_migrate_page(struct address_space *mapping, | |||
| 522 | * Buffers may be managed in a filesystem specific way. | 527 | * Buffers may be managed in a filesystem specific way. |
| 523 | * We must have no buffers or drop them. | 528 | * We must have no buffers or drop them. |
| 524 | */ | 529 | */ |
| 525 | if (PagePrivate(page) && | 530 | if (page_has_private(page) && |
| 526 | !try_to_release_page(page, GFP_KERNEL)) | 531 | !try_to_release_page(page, GFP_KERNEL)) |
| 527 | return -EAGAIN; | 532 | return -EAGAIN; |
| 528 | 533 | ||
| @@ -597,7 +602,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 597 | struct page *newpage = get_new_page(page, private, &result); | 602 | struct page *newpage = get_new_page(page, private, &result); |
| 598 | int rcu_locked = 0; | 603 | int rcu_locked = 0; |
| 599 | int charge = 0; | 604 | int charge = 0; |
| 600 | struct mem_cgroup *mem; | 605 | struct mem_cgroup *mem = NULL; |
| 601 | 606 | ||
| 602 | if (!newpage) | 607 | if (!newpage) |
| 603 | return -ENOMEM; | 608 | return -ENOMEM; |
| @@ -655,7 +660,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 655 | * free the metadata, so the page can be freed. | 660 | * free the metadata, so the page can be freed. |
| 656 | */ | 661 | */ |
| 657 | if (!page->mapping) { | 662 | if (!page->mapping) { |
| 658 | if (!PageAnon(page) && PagePrivate(page)) { | 663 | if (!PageAnon(page) && page_has_private(page)) { |
| 659 | /* | 664 | /* |
| 660 | * Go direct to try_to_free_buffers() here because | 665 | * Go direct to try_to_free_buffers() here because |
| 661 | * a) that's what try_to_release_page() would do anyway | 666 | * a) that's what try_to_release_page() would do anyway |
| @@ -664,13 +669,15 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
| 664 | * needs to be effective. | 669 | * needs to be effective. |
| 665 | */ | 670 | */ |
| 666 | try_to_free_buffers(page); | 671 | try_to_free_buffers(page); |
| 672 | goto rcu_unlock; | ||
| 667 | } | 673 | } |
| 668 | goto rcu_unlock; | 674 | goto skip_unmap; |
| 669 | } | 675 | } |
| 670 | 676 | ||
| 671 | /* Establish migration ptes or remove ptes */ | 677 | /* Establish migration ptes or remove ptes */ |
| 672 | try_to_unmap(page, 1); | 678 | try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); |
| 673 | 679 | ||
| 680 | skip_unmap: | ||
| 674 | if (!page_mapped(page)) | 681 | if (!page_mapped(page)) |
| 675 | rc = move_to_new_page(newpage, page); | 682 | rc = move_to_new_page(newpage, page); |
| 676 | 683 | ||
| @@ -693,6 +700,8 @@ unlock: | |||
| 693 | * restored. | 700 | * restored. |
| 694 | */ | 701 | */ |
| 695 | list_del(&page->lru); | 702 | list_del(&page->lru); |
| 703 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 704 | page_is_file_cache(page)); | ||
| 696 | putback_lru_page(page); | 705 | putback_lru_page(page); |
| 697 | } | 706 | } |
| 698 | 707 | ||
| @@ -737,6 +746,13 @@ int migrate_pages(struct list_head *from, | |||
| 737 | struct page *page2; | 746 | struct page *page2; |
| 738 | int swapwrite = current->flags & PF_SWAPWRITE; | 747 | int swapwrite = current->flags & PF_SWAPWRITE; |
| 739 | int rc; | 748 | int rc; |
| 749 | unsigned long flags; | ||
| 750 | |||
| 751 | local_irq_save(flags); | ||
| 752 | list_for_each_entry(page, from, lru) | ||
| 753 | __inc_zone_page_state(page, NR_ISOLATED_ANON + | ||
| 754 | page_is_file_cache(page)); | ||
| 755 | local_irq_restore(flags); | ||
| 740 | 756 | ||
| 741 | if (!swapwrite) | 757 | if (!swapwrite) |
| 742 | current->flags |= PF_SWAPWRITE; | 758 | current->flags |= PF_SWAPWRITE; |
| @@ -802,7 +818,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, | |||
| 802 | 818 | ||
| 803 | *result = &pm->status; | 819 | *result = &pm->status; |
| 804 | 820 | ||
| 805 | return alloc_pages_node(pm->node, | 821 | return alloc_pages_exact_node(pm->node, |
| 806 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); | 822 | GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); |
| 807 | } | 823 | } |
| 808 | 824 | ||
| @@ -820,7 +836,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm, | |||
| 820 | struct page_to_node *pp; | 836 | struct page_to_node *pp; |
| 821 | LIST_HEAD(pagelist); | 837 | LIST_HEAD(pagelist); |
| 822 | 838 | ||
| 823 | migrate_prep(); | ||
| 824 | down_read(&mm->mmap_sem); | 839 | down_read(&mm->mmap_sem); |
| 825 | 840 | ||
| 826 | /* | 841 | /* |
| @@ -907,6 +922,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, | |||
| 907 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); | 922 | pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); |
| 908 | if (!pm) | 923 | if (!pm) |
| 909 | goto out; | 924 | goto out; |
| 925 | |||
| 926 | migrate_prep(); | ||
| 927 | |||
| 910 | /* | 928 | /* |
| 911 | * Store a chunk of page_to_node array in a page, | 929 | * Store a chunk of page_to_node array in a page, |
| 912 | * but keep the last one as a marker | 930 | * but keep the last one as a marker |
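The migrate.c changes account isolated pages in NR_ISOLATED_ANON/NR_ISOLATED_FILE for the whole list in migrate_pages() and drop those counts as pages are put back, switch the PagePrivate tests to page_has_private(), pass explicit TTU_MIGRATION flags to try_to_unmap(), and run migrate_prep() per chunk in do_pages_move(). A small userspace illustration of the move_pages() call that ends up in do_pages_move(); the single page and target node 0 are assumptions, and it links against libnuma:

    #include <numaif.h>         /* move_pages(), MPOL_MF_MOVE */
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
        long pagesize = sysconf(_SC_PAGESIZE);
        void *page;
        int node = 0, status = -1;

        if (posix_memalign(&page, pagesize, pagesize))
            return 1;
        *(volatile char *)page = 1;   /* fault it in so it can be migrated */

        /* pid 0 means the calling process */
        if (move_pages(0, 1, &page, &node, &status, MPOL_MF_MOVE))
            perror("move_pages");
        else
            printf("page now on node %d\n", status);
        free(page);
        return 0;
    }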
diff --git a/mm/mlock.c b/mm/mlock.c index cbe9e0581b75..bd6f0e466f6c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -31,7 +31,6 @@ int can_do_mlock(void) | |||
| 31 | } | 31 | } |
| 32 | EXPORT_SYMBOL(can_do_mlock); | 32 | EXPORT_SYMBOL(can_do_mlock); |
| 33 | 33 | ||
| 34 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 35 | /* | 34 | /* |
| 36 | * Mlocked pages are marked with PageMlocked() flag for efficient testing | 35 | * Mlocked pages are marked with PageMlocked() flag for efficient testing |
| 37 | * in vmscan and, possibly, the fault path; and to support semi-accurate | 36 | * in vmscan and, possibly, the fault path; and to support semi-accurate |
| @@ -140,49 +139,36 @@ static void munlock_vma_page(struct page *page) | |||
| 140 | } | 139 | } |
| 141 | 140 | ||
| 142 | /** | 141 | /** |
| 143 | * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. | 142 | * __mlock_vma_pages_range() - mlock a range of pages in the vma. |
| 144 | * @vma: target vma | 143 | * @vma: target vma |
| 145 | * @start: start address | 144 | * @start: start address |
| 146 | * @end: end address | 145 | * @end: end address |
| 147 | * @mlock: 0 indicate munlock, otherwise mlock. | ||
| 148 | * | 146 | * |
| 149 | * If @mlock == 0, unlock an mlocked range; | 147 | * This takes care of making the pages present too. |
| 150 | * else mlock the range of pages. This takes care of making the pages present , | ||
| 151 | * too. | ||
| 152 | * | 148 | * |
| 153 | * return 0 on success, negative error code on error. | 149 | * return 0 on success, negative error code on error. |
| 154 | * | 150 | * |
| 155 | * vma->vm_mm->mmap_sem must be held for at least read. | 151 | * vma->vm_mm->mmap_sem must be held for at least read. |
| 156 | */ | 152 | */ |
| 157 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | 153 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, |
| 158 | unsigned long start, unsigned long end, | 154 | unsigned long start, unsigned long end) |
| 159 | int mlock) | ||
| 160 | { | 155 | { |
| 161 | struct mm_struct *mm = vma->vm_mm; | 156 | struct mm_struct *mm = vma->vm_mm; |
| 162 | unsigned long addr = start; | 157 | unsigned long addr = start; |
| 163 | struct page *pages[16]; /* 16 gives a reasonable batch */ | 158 | struct page *pages[16]; /* 16 gives a reasonable batch */ |
| 164 | int nr_pages = (end - start) / PAGE_SIZE; | 159 | int nr_pages = (end - start) / PAGE_SIZE; |
| 165 | int ret = 0; | 160 | int ret = 0; |
| 166 | int gup_flags = 0; | 161 | int gup_flags; |
| 167 | 162 | ||
| 168 | VM_BUG_ON(start & ~PAGE_MASK); | 163 | VM_BUG_ON(start & ~PAGE_MASK); |
| 169 | VM_BUG_ON(end & ~PAGE_MASK); | 164 | VM_BUG_ON(end & ~PAGE_MASK); |
| 170 | VM_BUG_ON(start < vma->vm_start); | 165 | VM_BUG_ON(start < vma->vm_start); |
| 171 | VM_BUG_ON(end > vma->vm_end); | 166 | VM_BUG_ON(end > vma->vm_end); |
| 172 | VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && | 167 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); |
| 173 | (atomic_read(&mm->mm_users) != 0)); | ||
| 174 | |||
| 175 | /* | ||
| 176 | * mlock: don't page populate if vma has PROT_NONE permission. | ||
| 177 | * munlock: always do munlock although the vma has PROT_NONE | ||
| 178 | * permission, or SIGKILL is pending. | ||
| 179 | */ | ||
| 180 | if (!mlock) | ||
| 181 | gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | | ||
| 182 | GUP_FLAGS_IGNORE_SIGKILL; | ||
| 183 | 168 | ||
| 169 | gup_flags = FOLL_TOUCH | FOLL_GET; | ||
| 184 | if (vma->vm_flags & VM_WRITE) | 170 | if (vma->vm_flags & VM_WRITE) |
| 185 | gup_flags |= GUP_FLAGS_WRITE; | 171 | gup_flags |= FOLL_WRITE; |
| 186 | 172 | ||
| 187 | while (nr_pages > 0) { | 173 | while (nr_pages > 0) { |
| 188 | int i; | 174 | int i; |
| @@ -202,51 +188,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
| 202 | * This can happen for, e.g., VM_NONLINEAR regions before | 188 | * This can happen for, e.g., VM_NONLINEAR regions before |
| 203 | * a page has been allocated and mapped at a given offset, | 189 | * a page has been allocated and mapped at a given offset, |
| 204 | * or for addresses that map beyond end of a file. | 190 | * or for addresses that map beyond end of a file. |
| 205 | * We'll mlock the the pages if/when they get faulted in. | 191 | * We'll mlock the pages if/when they get faulted in. |
| 206 | */ | 192 | */ |
| 207 | if (ret < 0) | 193 | if (ret < 0) |
| 208 | break; | 194 | break; |
| 209 | if (ret == 0) { | ||
| 210 | /* | ||
| 211 | * We know the vma is there, so the only time | ||
| 212 | * we cannot get a single page should be an | ||
| 213 | * error (ret < 0) case. | ||
| 214 | */ | ||
| 215 | WARN_ON(1); | ||
| 216 | break; | ||
| 217 | } | ||
| 218 | 195 | ||
| 219 | lru_add_drain(); /* push cached pages to LRU */ | 196 | lru_add_drain(); /* push cached pages to LRU */ |
| 220 | 197 | ||
| 221 | for (i = 0; i < ret; i++) { | 198 | for (i = 0; i < ret; i++) { |
| 222 | struct page *page = pages[i]; | 199 | struct page *page = pages[i]; |
| 223 | 200 | ||
| 224 | lock_page(page); | ||
| 225 | /* | ||
| 226 | * Because we lock page here and migration is blocked | ||
| 227 | * by the elevated reference, we need only check for | ||
| 228 | * page truncation (file-cache only). | ||
| 229 | */ | ||
| 230 | if (page->mapping) { | 201 | if (page->mapping) { |
| 231 | if (mlock) | 202 | /* |
| 203 | * That preliminary check is mainly to avoid | ||
| 204 | * the pointless overhead of lock_page on the | ||
| 205 | * ZERO_PAGE: which might bounce very badly if | ||
| 206 | * there is contention. However, we're still | ||
| 207 | * dirtying its cacheline with get/put_page: | ||
| 208 | * we'll add another __get_user_pages flag to | ||
| 209 | * avoid it if that case turns out to matter. | ||
| 210 | */ | ||
| 211 | lock_page(page); | ||
| 212 | /* | ||
| 213 | * Because we lock page here and migration is | ||
| 214 | * blocked by the elevated reference, we need | ||
| 215 | * only check for file-cache page truncation. | ||
| 216 | */ | ||
| 217 | if (page->mapping) | ||
| 232 | mlock_vma_page(page); | 218 | mlock_vma_page(page); |
| 233 | else | 219 | unlock_page(page); |
| 234 | munlock_vma_page(page); | ||
| 235 | } | 220 | } |
| 236 | unlock_page(page); | 221 | put_page(page); /* ref from get_user_pages() */ |
| 237 | put_page(page); /* ref from get_user_pages() */ | ||
| 238 | |||
| 239 | /* | ||
| 240 | * here we assume that get_user_pages() has given us | ||
| 241 | * a list of virtually contiguous pages. | ||
| 242 | */ | ||
| 243 | addr += PAGE_SIZE; /* for next get_user_pages() */ | ||
| 244 | nr_pages--; | ||
| 245 | } | 222 | } |
| 223 | |||
| 224 | addr += ret * PAGE_SIZE; | ||
| 225 | nr_pages -= ret; | ||
| 246 | ret = 0; | 226 | ret = 0; |
| 247 | } | 227 | } |
| 248 | 228 | ||
| 249 | return ret; /* count entire vma as locked_vm */ | 229 | return ret; /* 0 or negative error code */ |
| 250 | } | 230 | } |
| 251 | 231 | ||
| 252 | /* | 232 | /* |
| @@ -261,27 +241,6 @@ static int __mlock_posix_error_return(long retval) | |||
| 261 | return retval; | 241 | return retval; |
| 262 | } | 242 | } |
| 263 | 243 | ||
| 264 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Just make pages present if VM_LOCKED. No-op if unlocking. | ||
| 268 | */ | ||
| 269 | static long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 270 | unsigned long start, unsigned long end, | ||
| 271 | int mlock) | ||
| 272 | { | ||
| 273 | if (mlock && (vma->vm_flags & VM_LOCKED)) | ||
| 274 | return make_pages_present(start, end); | ||
| 275 | return 0; | ||
| 276 | } | ||
| 277 | |||
| 278 | static inline int __mlock_posix_error_return(long retval) | ||
| 279 | { | ||
| 280 | return 0; | ||
| 281 | } | ||
| 282 | |||
| 283 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
| 284 | |||
| 285 | /** | 244 | /** |
| 286 | * mlock_vma_pages_range() - mlock pages in specified vma range. | 245 | * mlock_vma_pages_range() - mlock pages in specified vma range. |
| 287 | * @vma - the vma containing the specfied address range | 246 | * @vma - the vma containing the specfied address range |
| @@ -311,7 +270,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, | |||
| 311 | is_vm_hugetlb_page(vma) || | 270 | is_vm_hugetlb_page(vma) || |
| 312 | vma == get_gate_vma(current))) { | 271 | vma == get_gate_vma(current))) { |
| 313 | 272 | ||
| 314 | __mlock_vma_pages_range(vma, start, end, 1); | 273 | __mlock_vma_pages_range(vma, start, end); |
| 315 | 274 | ||
| 316 | /* Hide errors from mmap() and other callers */ | 275 | /* Hide errors from mmap() and other callers */ |
| 317 | return 0; | 276 | return 0; |
| @@ -332,7 +291,6 @@ no_mlock: | |||
| 332 | return nr_pages; /* error or pages NOT mlocked */ | 291 | return nr_pages; /* error or pages NOT mlocked */ |
| 333 | } | 292 | } |
| 334 | 293 | ||
| 335 | |||
| 336 | /* | 294 | /* |
| 337 | * munlock_vma_pages_range() - munlock all pages in the vma range. | 295 | * munlock_vma_pages_range() - munlock all pages in the vma range. |
| 338 | * @vma - vma containing range to be munlock()ed. | 296 | * @vma - vma containing range to be munlock()ed. |
| @@ -352,10 +310,38 @@ no_mlock: | |||
| 352 | * free them. This will result in freeing mlocked pages. | 310 | * free them. This will result in freeing mlocked pages. |
| 353 | */ | 311 | */ |
| 354 | void munlock_vma_pages_range(struct vm_area_struct *vma, | 312 | void munlock_vma_pages_range(struct vm_area_struct *vma, |
| 355 | unsigned long start, unsigned long end) | 313 | unsigned long start, unsigned long end) |
| 356 | { | 314 | { |
| 315 | unsigned long addr; | ||
| 316 | |||
| 317 | lru_add_drain(); | ||
| 357 | vma->vm_flags &= ~VM_LOCKED; | 318 | vma->vm_flags &= ~VM_LOCKED; |
| 358 | __mlock_vma_pages_range(vma, start, end, 0); | 319 | |
| 320 | for (addr = start; addr < end; addr += PAGE_SIZE) { | ||
| 321 | struct page *page; | ||
| 322 | /* | ||
| 323 | * Although FOLL_DUMP is intended for get_dump_page(), | ||
| 324 | * it just so happens that its special treatment of the | ||
| 325 | * ZERO_PAGE (returning an error instead of doing get_page) | ||
| 326 | * suits munlock very well (and if somehow an abnormal page | ||
| 327 | * has sneaked into the range, we won't oops here: great). | ||
| 328 | */ | ||
| 329 | page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); | ||
| 330 | if (page && !IS_ERR(page)) { | ||
| 331 | lock_page(page); | ||
| 332 | /* | ||
| 333 | * Like in __mlock_vma_pages_range(), | ||
| 334 | * because we lock page here and migration is | ||
| 335 | * blocked by the elevated reference, we need | ||
| 336 | * only check for file-cache page truncation. | ||
| 337 | */ | ||
| 338 | if (page->mapping) | ||
| 339 | munlock_vma_page(page); | ||
| 340 | unlock_page(page); | ||
| 341 | put_page(page); | ||
| 342 | } | ||
| 343 | cond_resched(); | ||
| 344 | } | ||
| 359 | } | 345 | } |
| 360 | 346 | ||
| 361 | /* | 347 | /* |
| @@ -422,18 +408,14 @@ success: | |||
| 422 | * It's okay if try_to_unmap_one unmaps a page just after we | 408 | * It's okay if try_to_unmap_one unmaps a page just after we |
| 423 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. | 409 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. |
| 424 | */ | 410 | */ |
| 425 | vma->vm_flags = newflags; | ||
| 426 | 411 | ||
| 427 | if (lock) { | 412 | if (lock) { |
| 428 | ret = __mlock_vma_pages_range(vma, start, end, 1); | 413 | vma->vm_flags = newflags; |
| 429 | 414 | ret = __mlock_vma_pages_range(vma, start, end); | |
| 430 | if (ret > 0) { | 415 | if (ret < 0) |
| 431 | mm->locked_vm -= ret; | 416 | ret = __mlock_posix_error_return(ret); |
| 432 | ret = 0; | ||
| 433 | } else | ||
| 434 | ret = __mlock_posix_error_return(ret); /* translate if needed */ | ||
| 435 | } else { | 417 | } else { |
| 436 | __mlock_vma_pages_range(vma, start, end, 0); | 418 | munlock_vma_pages_range(vma, start, end); |
| 437 | } | 419 | } |
| 438 | 420 | ||
| 439 | out: | 421 | out: |
| @@ -629,52 +611,43 @@ void user_shm_unlock(size_t size, struct user_struct *user) | |||
| 629 | free_uid(user); | 611 | free_uid(user); |
| 630 | } | 612 | } |
| 631 | 613 | ||
| 632 | void *alloc_locked_buffer(size_t size) | 614 | int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, |
| 615 | size_t size) | ||
| 633 | { | 616 | { |
| 634 | unsigned long rlim, vm, pgsz; | 617 | unsigned long lim, vm, pgsz; |
| 635 | void *buffer = NULL; | 618 | int error = -ENOMEM; |
| 636 | 619 | ||
| 637 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | 620 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; |
| 638 | 621 | ||
| 639 | down_write(¤t->mm->mmap_sem); | 622 | down_write(&mm->mmap_sem); |
| 640 | 623 | ||
| 641 | rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 624 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; |
| 642 | vm = current->mm->total_vm + pgsz; | 625 | vm = mm->total_vm + pgsz; |
| 643 | if (rlim < vm) | 626 | if (lim < vm) |
| 644 | goto out; | 627 | goto out; |
| 645 | 628 | ||
| 646 | rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 629 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; |
| 647 | vm = current->mm->locked_vm + pgsz; | 630 | vm = mm->locked_vm + pgsz; |
| 648 | if (rlim < vm) | 631 | if (lim < vm) |
| 649 | goto out; | 632 | goto out; |
| 650 | 633 | ||
| 651 | buffer = kzalloc(size, GFP_KERNEL); | 634 | mm->total_vm += pgsz; |
| 652 | if (!buffer) | 635 | mm->locked_vm += pgsz; |
| 653 | goto out; | ||
| 654 | |||
| 655 | current->mm->total_vm += pgsz; | ||
| 656 | current->mm->locked_vm += pgsz; | ||
| 657 | 636 | ||
| 637 | error = 0; | ||
| 658 | out: | 638 | out: |
| 659 | up_write(¤t->mm->mmap_sem); | 639 | up_write(&mm->mmap_sem); |
| 660 | return buffer; | 640 | return error; |
| 661 | } | 641 | } |
| 662 | 642 | ||
| 663 | void release_locked_buffer(void *buffer, size_t size) | 643 | void refund_locked_memory(struct mm_struct *mm, size_t size) |
| 664 | { | 644 | { |
| 665 | unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | 645 | unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; |
| 666 | 646 | ||
| 667 | down_write(¤t->mm->mmap_sem); | 647 | down_write(&mm->mmap_sem); |
| 668 | |||
| 669 | current->mm->total_vm -= pgsz; | ||
| 670 | current->mm->locked_vm -= pgsz; | ||
| 671 | |||
| 672 | up_write(¤t->mm->mmap_sem); | ||
| 673 | } | ||
| 674 | 648 | ||
| 675 | void free_locked_buffer(void *buffer, size_t size) | 649 | mm->total_vm -= pgsz; |
| 676 | { | 650 | mm->locked_vm -= pgsz; |
| 677 | release_locked_buffer(buffer, size); | ||
| 678 | 651 | ||
| 679 | kfree(buffer); | 652 | up_write(&mm->mmap_sem); |
| 680 | } | 653 | } |
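After this rework, __mlock_vma_pages_range() only ever mlocks (driving __get_user_pages() with FOLL_TOUCH | FOLL_GET, plus FOLL_WRITE where allowed), while munlock_vma_pages_range() walks the range itself with follow_page(FOLL_GET | FOLL_DUMP), and the locked-buffer helpers become pure accounting in account_locked_memory()/refund_locked_memory(). A trivial userspace pair that exercises both kernel paths; the buffer size is arbitrary and subject to RLIMIT_MEMLOCK:

    #include <sys/mman.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        size_t len = 1 << 20;        /* 1 MiB, counts against RLIMIT_MEMLOCK */
        char *buf = malloc(len);

        if (!buf)
            return 1;
        if (mlock(buf, len)) {       /* faults the pages in and mlocks them */
            perror("mlock");
            free(buf);
            return 1;
        }
        /* ... use the pinned buffer ... */
        munlock(buf, len);           /* kernel side walks the range page by page */
        free(buf);
        return 0;
    }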
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -20,7 +20,6 @@ | |||
| 20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
| 21 | #include <linux/personality.h> | 21 | #include <linux/personality.h> |
| 22 | #include <linux/security.h> | 22 | #include <linux/security.h> |
| 23 | #include <linux/ima.h> | ||
| 24 | #include <linux/hugetlb.h> | 23 | #include <linux/hugetlb.h> |
| 25 | #include <linux/profile.h> | 24 | #include <linux/profile.h> |
| 26 | #include <linux/module.h> | 25 | #include <linux/module.h> |
| @@ -28,6 +27,7 @@ | |||
| 28 | #include <linux/mempolicy.h> | 27 | #include <linux/mempolicy.h> |
| 29 | #include <linux/rmap.h> | 28 | #include <linux/rmap.h> |
| 30 | #include <linux/mmu_notifier.h> | 29 | #include <linux/mmu_notifier.h> |
| 30 | #include <linux/perf_event.h> | ||
| 31 | 31 | ||
| 32 | #include <asm/uaccess.h> | 32 | #include <asm/uaccess.h> |
| 33 | #include <asm/cacheflush.h> | 33 | #include <asm/cacheflush.h> |
| @@ -85,7 +85,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |||
| 85 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 85 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
| 86 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 86 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
| 87 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | 87 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; |
| 88 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | 88 | struct percpu_counter vm_committed_as; |
| 89 | 89 | ||
| 90 | /* | 90 | /* |
| 91 | * Check that a process has enough memory to allocate a new virtual | 91 | * Check that a process has enough memory to allocate a new virtual |
| @@ -179,11 +179,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 179 | if (mm) | 179 | if (mm) |
| 180 | allowed -= mm->total_vm / 32; | 180 | allowed -= mm->total_vm / 32; |
| 181 | 181 | ||
| 182 | /* | 182 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| 183 | * cast `allowed' as a signed long because vm_committed_space | ||
| 184 | * sometimes has a negative value | ||
| 185 | */ | ||
| 186 | if (atomic_long_read(&vm_committed_space) < (long)allowed) | ||
| 187 | return 0; | 183 | return 0; |
| 188 | error: | 184 | error: |
| 189 | vm_unacct_memory(pages); | 185 | vm_unacct_memory(pages); |
| @@ -573,9 +569,9 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 573 | 569 | ||
| 574 | /* | 570 | /* |
| 575 | * When changing only vma->vm_end, we don't really need | 571 | * When changing only vma->vm_end, we don't really need |
| 576 | * anon_vma lock: but is that case worth optimizing out? | 572 | * anon_vma lock. |
| 577 | */ | 573 | */ |
| 578 | if (vma->anon_vma) | 574 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) |
| 579 | anon_vma = vma->anon_vma; | 575 | anon_vma = vma->anon_vma; |
| 580 | if (anon_vma) { | 576 | if (anon_vma) { |
| 581 | spin_lock(&anon_vma->lock); | 577 | spin_lock(&anon_vma->lock); |
| @@ -659,9 +655,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 659 | validate_mm(mm); | 655 | validate_mm(mm); |
| 660 | } | 656 | } |
| 661 | 657 | ||
| 662 | /* Flags that can be inherited from an existing mapping when merging */ | ||
| 663 | #define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) | ||
| 664 | |||
| 665 | /* | 658 | /* |
| 666 | * If the vma has a ->close operation then the driver probably needs to release | 659 | * If the vma has a ->close operation then the driver probably needs to release |
| 667 | * per-vma resources, so we don't attempt to merge those. | 660 | * per-vma resources, so we don't attempt to merge those. |
| @@ -669,7 +662,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 669 | static inline int is_mergeable_vma(struct vm_area_struct *vma, | 662 | static inline int is_mergeable_vma(struct vm_area_struct *vma, |
| 670 | struct file *file, unsigned long vm_flags) | 663 | struct file *file, unsigned long vm_flags) |
| 671 | { | 664 | { |
| 672 | if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) | 665 | /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ |
| 666 | if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) | ||
| 673 | return 0; | 667 | return 0; |
| 674 | if (vma->vm_file != file) | 668 | if (vma->vm_file != file) |
| 675 | return 0; | 669 | return 0; |
| @@ -908,7 +902,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
| 908 | #endif /* CONFIG_PROC_FS */ | 902 | #endif /* CONFIG_PROC_FS */ |
| 909 | 903 | ||
| 910 | /* | 904 | /* |
| 911 | * The caller must hold down_write(current->mm->mmap_sem). | 905 | * The caller must hold down_write(¤t->mm->mmap_sem). |
| 912 | */ | 906 | */ |
| 913 | 907 | ||
| 914 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 908 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
| @@ -954,6 +948,24 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 954 | if (mm->map_count > sysctl_max_map_count) | 948 | if (mm->map_count > sysctl_max_map_count) |
| 955 | return -ENOMEM; | 949 | return -ENOMEM; |
| 956 | 950 | ||
| 951 | if (flags & MAP_HUGETLB) { | ||
| 952 | struct user_struct *user = NULL; | ||
| 953 | if (file) | ||
| 954 | return -EINVAL; | ||
| 955 | |||
| 956 | /* | ||
| 957 | * VM_NORESERVE is used because the reservations will be | ||
| 958 | * taken when vm_ops->mmap() is called | ||
| 959 | * A dummy user value is used because we are not locking | ||
| 960 | * memory so no accounting is necessary | ||
| 961 | */ | ||
| 962 | len = ALIGN(len, huge_page_size(&default_hstate)); | ||
| 963 | file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, | ||
| 964 | &user, HUGETLB_ANONHUGE_INODE); | ||
| 965 | if (IS_ERR(file)) | ||
| 966 | return PTR_ERR(file); | ||
| 967 | } | ||
| 968 | |||
| 957 | /* Obtain the address to map to. we verify (or select) it and ensure | 969 | /* Obtain the address to map to. we verify (or select) it and ensure |
| 958 | * that it represents a valid section of the address space. | 970 | * that it represents a valid section of the address space. |
| 959 | */ | 971 | */ |
| @@ -968,11 +980,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 968 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | | 980 | vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | |
| 969 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; | 981 | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; |
| 970 | 982 | ||
| 971 | if (flags & MAP_LOCKED) { | 983 | if (flags & MAP_LOCKED) |
| 972 | if (!can_do_mlock()) | 984 | if (!can_do_mlock()) |
| 973 | return -EPERM; | 985 | return -EPERM; |
| 974 | vm_flags |= VM_LOCKED; | ||
| 975 | } | ||
| 976 | 986 | ||
| 977 | /* mlock MCL_FUTURE? */ | 987 | /* mlock MCL_FUTURE? */ |
| 978 | if (vm_flags & VM_LOCKED) { | 988 | if (vm_flags & VM_LOCKED) { |
| @@ -1050,9 +1060,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
| 1050 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); | 1060 | error = security_file_mmap(file, reqprot, prot, flags, addr, 0); |
| 1051 | if (error) | 1061 | if (error) |
| 1052 | return error; | 1062 | return error; |
| 1053 | error = ima_file_mmap(file, prot); | ||
| 1054 | if (error) | ||
| 1055 | return error; | ||
| 1056 | 1063 | ||
| 1057 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1064 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); |
| 1058 | } | 1065 | } |
| @@ -1198,21 +1205,21 @@ munmap_back: | |||
| 1198 | goto unmap_and_free_vma; | 1205 | goto unmap_and_free_vma; |
| 1199 | if (vm_flags & VM_EXECUTABLE) | 1206 | if (vm_flags & VM_EXECUTABLE) |
| 1200 | added_exe_file_vma(mm); | 1207 | added_exe_file_vma(mm); |
| 1208 | |||
| 1209 | /* Can addr have changed?? | ||
| 1210 | * | ||
| 1211 | * Answer: Yes, several device drivers can do it in their | ||
| 1212 | * f_op->mmap method. -DaveM | ||
| 1213 | */ | ||
| 1214 | addr = vma->vm_start; | ||
| 1215 | pgoff = vma->vm_pgoff; | ||
| 1216 | vm_flags = vma->vm_flags; | ||
| 1201 | } else if (vm_flags & VM_SHARED) { | 1217 | } else if (vm_flags & VM_SHARED) { |
| 1202 | error = shmem_zero_setup(vma); | 1218 | error = shmem_zero_setup(vma); |
| 1203 | if (error) | 1219 | if (error) |
| 1204 | goto free_vma; | 1220 | goto free_vma; |
| 1205 | } | 1221 | } |
| 1206 | 1222 | ||
| 1207 | /* Can addr have changed?? | ||
| 1208 | * | ||
| 1209 | * Answer: Yes, several device drivers can do it in their | ||
| 1210 | * f_op->mmap method. -DaveM | ||
| 1211 | */ | ||
| 1212 | addr = vma->vm_start; | ||
| 1213 | pgoff = vma->vm_pgoff; | ||
| 1214 | vm_flags = vma->vm_flags; | ||
| 1215 | |||
| 1216 | if (vma_wants_writenotify(vma)) | 1223 | if (vma_wants_writenotify(vma)) |
| 1217 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); | 1224 | vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); |
| 1218 | 1225 | ||
| @@ -1223,6 +1230,8 @@ munmap_back: | |||
| 1223 | if (correct_wcount) | 1230 | if (correct_wcount) |
| 1224 | atomic_inc(&inode->i_writecount); | 1231 | atomic_inc(&inode->i_writecount); |
| 1225 | out: | 1232 | out: |
| 1233 | perf_event_mmap(vma); | ||
| 1234 | |||
| 1226 | mm->total_vm += len >> PAGE_SHIFT; | 1235 | mm->total_vm += len >> PAGE_SHIFT; |
| 1227 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1236 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
| 1228 | if (vm_flags & VM_LOCKED) { | 1237 | if (vm_flags & VM_LOCKED) { |
| @@ -1575,7 +1584,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 1575 | * Overcommit.. This must be the final test, as it will | 1584 | * Overcommit.. This must be the final test, as it will |
| 1576 | * update security statistics. | 1585 | * update security statistics. |
| 1577 | */ | 1586 | */ |
| 1578 | if (security_vm_enough_memory(grow)) | 1587 | if (security_vm_enough_memory_mm(mm, grow)) |
| 1579 | return -ENOMEM; | 1588 | return -ENOMEM; |
| 1580 | 1589 | ||
| 1581 | /* Ok, everything looks good - let it rip */ | 1590 | /* Ok, everything looks good - let it rip */ |
| @@ -2112,6 +2121,7 @@ void exit_mmap(struct mm_struct *mm) | |||
| 2112 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ | 2121 | /* Use -1 here to ensure all VMAs in the mm are unmapped */ |
| 2113 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); | 2122 | end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); |
| 2114 | vm_unacct_memory(nr_accounted); | 2123 | vm_unacct_memory(nr_accounted); |
| 2124 | |||
| 2115 | free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); | 2125 | free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); |
| 2116 | tlb_finish_mmu(tlb, 0, end); | 2126 | tlb_finish_mmu(tlb, 0, end); |
| 2117 | 2127 | ||
| @@ -2268,7 +2278,7 @@ static void special_mapping_close(struct vm_area_struct *vma) | |||
| 2268 | { | 2278 | { |
| 2269 | } | 2279 | } |
| 2270 | 2280 | ||
| 2271 | static struct vm_operations_struct special_mapping_vmops = { | 2281 | static const struct vm_operations_struct special_mapping_vmops = { |
| 2272 | .close = special_mapping_close, | 2282 | .close = special_mapping_close, |
| 2273 | .fault = special_mapping_fault, | 2283 | .fault = special_mapping_fault, |
| 2274 | }; | 2284 | }; |
| @@ -2309,6 +2319,8 @@ int install_special_mapping(struct mm_struct *mm, | |||
| 2309 | 2319 | ||
| 2310 | mm->total_vm += len >> PAGE_SHIFT; | 2320 | mm->total_vm += len >> PAGE_SHIFT; |
| 2311 | 2321 | ||
| 2322 | perf_event_mmap(vma); | ||
| 2323 | |||
| 2312 | return 0; | 2324 | return 0; |
| 2313 | } | 2325 | } |
| 2314 | 2326 | ||
| @@ -2481,7 +2493,8 @@ void mm_drop_all_locks(struct mm_struct *mm) | |||
| 2481 | */ | 2493 | */ |
| 2482 | void __init mmap_init(void) | 2494 | void __init mmap_init(void) |
| 2483 | { | 2495 | { |
| 2484 | vm_area_cachep = kmem_cache_create("vm_area_struct", | 2496 | int ret; |
| 2485 | sizeof(struct vm_area_struct), 0, | 2497 | |
| 2486 | SLAB_PANIC, NULL); | 2498 | ret = percpu_counter_init(&vm_committed_as, 0); |
| 2499 | VM_BUG_ON(ret); | ||
| 2487 | } | 2500 | } |
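Besides converting the vm_committed_space atomic into the vm_committed_as percpu_counter (now initialized in mmap_init()) and adding the perf_event_mmap() hooks, do_mmap_pgoff() now accepts MAP_HUGETLB for anonymous mappings by setting up an internal hugetlbfs file. A hedged userspace sketch of that new flag; the fallback #define and the 2 MB page size are assumptions, and huge pages must have been reserved beforehand (e.g. via /proc/sys/vm/nr_hugepages):

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000      /* not yet in all libc headers */
    #endif

    int main(void)
    {
        size_t len = 2UL << 20;      /* one 2 MB huge page (arch assumption) */
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap(MAP_HUGETLB)");
            return 1;
        }
        ((char *)p)[0] = 1;          /* touch the huge page */
        munmap(p, len);
        return 0;
    }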
diff --git a/mm/mmu_context.c b/mm/mmu_context.c new file mode 100644 index 000000000000..ded9081f4021 --- /dev/null +++ b/mm/mmu_context.c | |||
| @@ -0,0 +1,58 @@ | |||
| 1 | /* Copyright (C) 2009 Red Hat, Inc. | ||
| 2 | * | ||
| 3 | * See ../COPYING for licensing terms. | ||
| 4 | */ | ||
| 5 | |||
| 6 | #include <linux/mm.h> | ||
| 7 | #include <linux/mmu_context.h> | ||
| 8 | #include <linux/sched.h> | ||
| 9 | |||
| 10 | #include <asm/mmu_context.h> | ||
| 11 | |||
| 12 | /* | ||
| 13 | * use_mm | ||
| 14 | * Makes the calling kernel thread take on the specified | ||
| 15 | * mm context. | ||
| 16 | * Called by the retry thread to execute retries within the | ||
| 17 | * iocb issuer's mm context, so that copy_from/to_user | ||
| 18 | * operations work seamlessly for aio. | ||
| 19 | * (Note: this routine is intended to be called only | ||
| 20 | * from a kernel thread context) | ||
| 21 | */ | ||
| 22 | void use_mm(struct mm_struct *mm) | ||
| 23 | { | ||
| 24 | struct mm_struct *active_mm; | ||
| 25 | struct task_struct *tsk = current; | ||
| 26 | |||
| 27 | task_lock(tsk); | ||
| 28 | active_mm = tsk->active_mm; | ||
| 29 | if (active_mm != mm) { | ||
| 30 | atomic_inc(&mm->mm_count); | ||
| 31 | tsk->active_mm = mm; | ||
| 32 | } | ||
| 33 | tsk->mm = mm; | ||
| 34 | switch_mm(active_mm, mm, tsk); | ||
| 35 | task_unlock(tsk); | ||
| 36 | |||
| 37 | if (active_mm != mm) | ||
| 38 | mmdrop(active_mm); | ||
| 39 | } | ||
| 40 | |||
| 41 | /* | ||
| 42 | * unuse_mm | ||
| 43 | * Reverses the effect of use_mm, i.e. releases the | ||
| 44 | * specified mm context which was earlier taken on | ||
| 45 | * by the calling kernel thread | ||
| 46 | * (Note: this routine is intended to be called only | ||
| 47 | * from a kernel thread context) | ||
| 48 | */ | ||
| 49 | void unuse_mm(struct mm_struct *mm) | ||
| 50 | { | ||
| 51 | struct task_struct *tsk = current; | ||
| 52 | |||
| 53 | task_lock(tsk); | ||
| 54 | tsk->mm = NULL; | ||
| 55 | /* active_mm is still 'mm' */ | ||
| 56 | enter_lazy_tlb(mm, tsk); | ||
| 57 | task_unlock(tsk); | ||
| 58 | } | ||
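The new mm/mmu_context.c simply gives use_mm()/unuse_mm() a home under mm/. A sketch of the intended calling pattern, with the worker function and the user pointer being hypothetical: a kernel thread temporarily adopts the issuing task's mm so that copy_to_user() resolves against that process's address space, then drops it again:

    #include <linux/mmu_context.h>
    #include <linux/mm.h>
    #include <linux/uaccess.h>

    static int demo_complete_in_user_mm(struct mm_struct *mm,
                                        int __user *uptr, int value)
    {
        int ret;

        use_mm(mm);                  /* adopt the issuer's mm */
        ret = copy_to_user(uptr, &value, sizeof(value)) ? -EFAULT : 0;
        unuse_mm(mm);                /* drop it again */
        return ret;
    }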
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5f4ef0250bee..7e33f2cb3c77 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
| @@ -99,6 +99,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, | |||
| 99 | return young; | 99 | return young; |
| 100 | } | 100 | } |
| 101 | 101 | ||
| 102 | void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, | ||
| 103 | pte_t pte) | ||
| 104 | { | ||
| 105 | struct mmu_notifier *mn; | ||
| 106 | struct hlist_node *n; | ||
| 107 | |||
| 108 | rcu_read_lock(); | ||
| 109 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { | ||
| 110 | if (mn->ops->change_pte) | ||
| 111 | mn->ops->change_pte(mn, mm, address, pte); | ||
| 112 | /* | ||
| 113 | * Some drivers don't have change_pte, | ||
| 114 | * so we must call invalidate_page in that case. | ||
| 115 | */ | ||
| 116 | else if (mn->ops->invalidate_page) | ||
| 117 | mn->ops->invalidate_page(mn, mm, address); | ||
| 118 | } | ||
| 119 | rcu_read_unlock(); | ||
| 120 | } | ||
| 121 | |||
| 102 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, | 122 | void __mmu_notifier_invalidate_page(struct mm_struct *mm, |
| 103 | unsigned long address) | 123 | unsigned long address) |
| 104 | { | 124 | { |
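__mmu_notifier_change_pte() lets a secondary-MMU user update its own mapping in place when a primary PTE changes, falling back to invalidate_page() for notifiers that do not implement the hook. A hedged sketch of what such a registration could look like; the demo_* names are hypothetical, and a real consumer (KVM is the intended one) would update its shadow PTE where the pr_info() stands:

    #include <linux/kernel.h>
    #include <linux/mmu_notifier.h>

    static void demo_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
                                unsigned long address, pte_t pte)
    {
        /* update the secondary mapping in place instead of invalidating
         * and refaulting, which is what invalidate_page would force */
        pr_info("change_pte at %#lx\n", address);
    }

    static const struct mmu_notifier_ops demo_mmu_notifier_ops = {
        .change_pte = demo_change_pte,
    };

    static struct mmu_notifier demo_mn = {
        .ops = &demo_mmu_notifier_ops,
    };

    static int demo_register(struct mm_struct *mm)
    {
        return mmu_notifier_register(&demo_mn, mm);
    }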
diff --git a/mm/mmzone.c b/mm/mmzone.c index 16ce8b955dcf..f5b7d1760213 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | 6 | ||
| 7 | 7 | ||
| 8 | #include <linux/stddef.h> | 8 | #include <linux/stddef.h> |
| 9 | #include <linux/mm.h> | ||
| 9 | #include <linux/mmzone.h> | 10 | #include <linux/mmzone.h> |
| 10 | #include <linux/module.h> | 11 | #include <linux/module.h> |
| 11 | 12 | ||
| @@ -72,3 +73,17 @@ struct zoneref *next_zones_zonelist(struct zoneref *z, | |||
| 72 | *zone = zonelist_zone(z); | 73 | *zone = zonelist_zone(z); |
| 73 | return z; | 74 | return z; |
| 74 | } | 75 | } |
| 76 | |||
| 77 | #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL | ||
| 78 | int memmap_valid_within(unsigned long pfn, | ||
| 79 | struct page *page, struct zone *zone) | ||
| 80 | { | ||
| 81 | if (page_to_pfn(page) != pfn) | ||
| 82 | return 0; | ||
| 83 | |||
| 84 | if (page_zone(page) != zone) | ||
| 85 | return 0; | ||
| 86 | |||
| 87 | return 1; | ||
| 88 | } | ||
| 89 | #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ | ||
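memmap_valid_within() is meant for PFN walkers on configurations where the memmap itself has holes (CONFIG_ARCH_HAS_HOLES_MEMORYMODEL): pfn_valid() can succeed while the struct page actually describes a different pfn or zone. A hedged sketch of the intended use in a zone walker; the counting function is hypothetical:

    #include <linux/mm.h>
    #include <linux/mmzone.h>

    static unsigned long demo_count_zone_pages(struct zone *zone)
    {
        unsigned long pfn, count = 0;
        unsigned long start = zone->zone_start_pfn;
        unsigned long end = start + zone->spanned_pages;

        for (pfn = start; pfn < end; pfn++) {
            struct page *page;

            if (!pfn_valid(pfn))
                continue;
            page = pfn_to_page(pfn);
            /* guard against struct pages that sit in a memmap hole and
             * really belong to another pfn or zone */
            if (!memmap_valid_within(pfn, page, zone))
                continue;
            count++;
        }
        return count;
    }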
diff --git a/mm/mprotect.c b/mm/mprotect.c index 258197b76fb4..8bc969d8112d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/swapops.h> | 23 | #include <linux/swapops.h> |
| 24 | #include <linux/mmu_notifier.h> | 24 | #include <linux/mmu_notifier.h> |
| 25 | #include <linux/migrate.h> | 25 | #include <linux/migrate.h> |
| 26 | #include <linux/perf_event.h> | ||
| 26 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
| 27 | #include <asm/pgtable.h> | 28 | #include <asm/pgtable.h> |
| 28 | #include <asm/cacheflush.h> | 29 | #include <asm/cacheflush.h> |
| @@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
| 299 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); | 300 | error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); |
| 300 | if (error) | 301 | if (error) |
| 301 | goto out; | 302 | goto out; |
| 303 | perf_event_mmap(vma); | ||
| 302 | nstart = tmp; | 304 | nstart = tmp; |
| 303 | 305 | ||
| 304 | if (nstart < prev->vm_end) | 306 | if (nstart < prev->vm_end) |
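Each successful mprotect_fixup() is now reported through perf_event_mmap(), so profilers can keep their maps current when a region's protections change. A trivial userspace trigger of that path; the sizes and protections are arbitrary:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
        size_t len = 2 * 4096;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
            return 1;
        /* a successful fixup on this range may now also emit a perf
         * mmap event if anyone is listening */
        if (mprotect(p, len, PROT_READ | PROT_EXEC))
            perror("mprotect");
        munmap(p, len);
        return 0;
    }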
diff --git a/mm/mremap.c b/mm/mremap.c index a39b7b91be46..97bff2547719 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/hugetlb.h> | 11 | #include <linux/hugetlb.h> |
| 12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
| 13 | #include <linux/shm.h> | 13 | #include <linux/shm.h> |
| 14 | #include <linux/ksm.h> | ||
| 14 | #include <linux/mman.h> | 15 | #include <linux/mman.h> |
| 15 | #include <linux/swap.h> | 16 | #include <linux/swap.h> |
| 16 | #include <linux/capability.h> | 17 | #include <linux/capability.h> |
| @@ -85,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, | |||
| 85 | if (vma->vm_file) { | 86 | if (vma->vm_file) { |
| 86 | /* | 87 | /* |
| 87 | * Subtle point from Rajesh Venkatasubramanian: before | 88 | * Subtle point from Rajesh Venkatasubramanian: before |
| 88 | * moving file-based ptes, we must lock vmtruncate out, | 89 | * moving file-based ptes, we must lock truncate_pagecache |
| 89 | * since it might clean the dst vma before the src vma, | 90 | * out, since it might clean the dst vma before the src vma, |
| 90 | * and we propagate stale pages into the dst afterward. | 91 | * and we propagate stale pages into the dst afterward. |
| 91 | */ | 92 | */ |
| 92 | mapping = vma->vm_file->f_mapping; | 93 | mapping = vma->vm_file->f_mapping; |
| @@ -174,6 +175,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 174 | unsigned long excess = 0; | 175 | unsigned long excess = 0; |
| 175 | unsigned long hiwater_vm; | 176 | unsigned long hiwater_vm; |
| 176 | int split = 0; | 177 | int split = 0; |
| 178 | int err; | ||
| 177 | 179 | ||
| 178 | /* | 180 | /* |
| 179 | * We'd prefer to avoid failure later on in do_munmap: | 181 | * We'd prefer to avoid failure later on in do_munmap: |
| @@ -182,6 +184,18 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 182 | if (mm->map_count >= sysctl_max_map_count - 3) | 184 | if (mm->map_count >= sysctl_max_map_count - 3) |
| 183 | return -ENOMEM; | 185 | return -ENOMEM; |
| 184 | 186 | ||
| 187 | /* | ||
| 188 | * Advise KSM to break any KSM pages in the area to be moved: | ||
| 189 | * it would be confusing if they were to turn up at the new | ||
| 190 | * location, where they happen to coincide with different KSM | ||
| 191 | * pages recently unmapped. But leave vma->vm_flags as it was, | ||
| 192 | * so KSM can come around to merge on vma and new_vma afterwards. | ||
| 193 | */ | ||
| 194 | err = ksm_madvise(vma, old_addr, old_addr + old_len, | ||
| 195 | MADV_UNMERGEABLE, &vm_flags); | ||
| 196 | if (err) | ||
| 197 | return err; | ||
| 198 | |||
| 185 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); | 199 | new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); |
| 186 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); | 200 | new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); |
| 187 | if (!new_vma) | 201 | if (!new_vma) |
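move_vma() now calls ksm_madvise(..., MADV_UNMERGEABLE, ...) so any KSM-merged pages in the old range are broken up before the ptes are moved, while leaving vma->vm_flags untouched so merging can resume afterwards. A minimal userspace mremap() illustration of the path that reaches move_vma(); the sizes are arbitrary:

    #define _GNU_SOURCE
    #include <sys/mman.h>
    #include <stdio.h>

    int main(void)
    {
        size_t old_len = 4 * 4096, new_len = 16 * 4096;
        void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
            return 1;
        /* growing with MREMAP_MAYMOVE may relocate the mapping, which is
         * when the kernel first un-merges any KSM pages in the range */
        p = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
        if (p == MAP_FAILED) {
            perror("mremap");
            return 1;
        }
        munmap(p, new_len);
        return 0;
    }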
diff --git a/mm/nommu.c b/mm/nommu.c index 2fcf47d449b4..9876fa0c3ad3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -33,6 +33,7 @@ | |||
| 33 | #include <asm/uaccess.h> | 33 | #include <asm/uaccess.h> |
| 34 | #include <asm/tlb.h> | 34 | #include <asm/tlb.h> |
| 35 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
| 36 | #include <asm/mmu_context.h> | ||
| 36 | #include "internal.h" | 37 | #include "internal.h" |
| 37 | 38 | ||
| 38 | static inline __attribute__((format(printf, 1, 2))) | 39 | static inline __attribute__((format(printf, 1, 2))) |
| @@ -56,20 +57,19 @@ void no_printk(const char *fmt, ...) | |||
| 56 | no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) | 57 | no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) |
| 57 | #endif | 58 | #endif |
| 58 | 59 | ||
| 59 | #include "internal.h" | ||
| 60 | |||
| 61 | void *high_memory; | 60 | void *high_memory; |
| 62 | struct page *mem_map; | 61 | struct page *mem_map; |
| 63 | unsigned long max_mapnr; | 62 | unsigned long max_mapnr; |
| 64 | unsigned long num_physpages; | 63 | unsigned long num_physpages; |
| 65 | atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | 64 | unsigned long highest_memmap_pfn; |
| 65 | struct percpu_counter vm_committed_as; | ||
| 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 66 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
| 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | 67 | int sysctl_overcommit_ratio = 50; /* default is 50% */ |
| 68 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | 68 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; |
| 69 | int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ | 69 | int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; |
| 70 | int heap_stack_gap = 0; | 70 | int heap_stack_gap = 0; |
| 71 | 71 | ||
| 72 | atomic_t mmap_pages_allocated; | 72 | atomic_long_t mmap_pages_allocated; |
| 73 | 73 | ||
| 74 | EXPORT_SYMBOL(mem_map); | 74 | EXPORT_SYMBOL(mem_map); |
| 75 | EXPORT_SYMBOL(num_physpages); | 75 | EXPORT_SYMBOL(num_physpages); |
| @@ -79,50 +79,10 @@ static struct kmem_cache *vm_region_jar; | |||
| 79 | struct rb_root nommu_region_tree = RB_ROOT; | 79 | struct rb_root nommu_region_tree = RB_ROOT; |
| 80 | DECLARE_RWSEM(nommu_region_sem); | 80 | DECLARE_RWSEM(nommu_region_sem); |
| 81 | 81 | ||
| 82 | struct vm_operations_struct generic_file_vm_ops = { | 82 | const struct vm_operations_struct generic_file_vm_ops = { |
| 83 | }; | 83 | }; |
| 84 | 84 | ||
| 85 | /* | 85 | /* |
| 86 | * Handle all mappings that got truncated by a "truncate()" | ||
| 87 | * system call. | ||
| 88 | * | ||
| 89 | * NOTE! We have to be ready to update the memory sharing | ||
| 90 | * between the file and the memory map for a potential last | ||
| 91 | * incomplete page. Ugly, but necessary. | ||
| 92 | */ | ||
| 93 | int vmtruncate(struct inode *inode, loff_t offset) | ||
| 94 | { | ||
| 95 | struct address_space *mapping = inode->i_mapping; | ||
| 96 | unsigned long limit; | ||
| 97 | |||
| 98 | if (inode->i_size < offset) | ||
| 99 | goto do_expand; | ||
| 100 | i_size_write(inode, offset); | ||
| 101 | |||
| 102 | truncate_inode_pages(mapping, offset); | ||
| 103 | goto out_truncate; | ||
| 104 | |||
| 105 | do_expand: | ||
| 106 | limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | ||
| 107 | if (limit != RLIM_INFINITY && offset > limit) | ||
| 108 | goto out_sig; | ||
| 109 | if (offset > inode->i_sb->s_maxbytes) | ||
| 110 | goto out; | ||
| 111 | i_size_write(inode, offset); | ||
| 112 | |||
| 113 | out_truncate: | ||
| 114 | if (inode->i_op->truncate) | ||
| 115 | inode->i_op->truncate(inode); | ||
| 116 | return 0; | ||
| 117 | out_sig: | ||
| 118 | send_sig(SIGXFSZ, current, 0); | ||
| 119 | out: | ||
| 120 | return -EFBIG; | ||
| 121 | } | ||
| 122 | |||
| 123 | EXPORT_SYMBOL(vmtruncate); | ||
| 124 | |||
| 125 | /* | ||
| 126 | * Return the total memory allocated for this pointer, not | 86 | * Return the total memory allocated for this pointer, not |
| 127 | * just what the caller asked for. | 87 | * just what the caller asked for. |
| 128 | * | 88 | * |
| @@ -170,30 +130,29 @@ unsigned int kobjsize(const void *objp) | |||
| 170 | } | 130 | } |
| 171 | 131 | ||
| 172 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 132 | int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 173 | unsigned long start, int len, int flags, | 133 | unsigned long start, int nr_pages, unsigned int foll_flags, |
| 174 | struct page **pages, struct vm_area_struct **vmas) | 134 | struct page **pages, struct vm_area_struct **vmas) |
| 175 | { | 135 | { |
| 176 | struct vm_area_struct *vma; | 136 | struct vm_area_struct *vma; |
| 177 | unsigned long vm_flags; | 137 | unsigned long vm_flags; |
| 178 | int i; | 138 | int i; |
| 179 | int write = !!(flags & GUP_FLAGS_WRITE); | ||
| 180 | int force = !!(flags & GUP_FLAGS_FORCE); | ||
| 181 | int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); | ||
| 182 | 139 | ||
| 183 | /* calculate required read or write permissions. | 140 | /* calculate required read or write permissions. |
| 184 | * - if 'force' is set, we only require the "MAY" flags. | 141 | * If FOLL_FORCE is set, we only require the "MAY" flags. |
| 185 | */ | 142 | */ |
| 186 | vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); | 143 | vm_flags = (foll_flags & FOLL_WRITE) ? |
| 187 | vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 144 | (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); |
| 145 | vm_flags &= (foll_flags & FOLL_FORCE) ? | ||
| 146 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | ||
| 188 | 147 | ||
| 189 | for (i = 0; i < len; i++) { | 148 | for (i = 0; i < nr_pages; i++) { |
| 190 | vma = find_vma(mm, start); | 149 | vma = find_vma(mm, start); |
| 191 | if (!vma) | 150 | if (!vma) |
| 192 | goto finish_or_fault; | 151 | goto finish_or_fault; |
| 193 | 152 | ||
| 194 | /* protect what we can, including chardevs */ | 153 | /* protect what we can, including chardevs */ |
| 195 | if (vma->vm_flags & (VM_IO | VM_PFNMAP) || | 154 | if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || |
| 196 | (!ignore && !(vm_flags & vma->vm_flags))) | 155 | !(vm_flags & vma->vm_flags)) |
| 197 | goto finish_or_fault; | 156 | goto finish_or_fault; |
| 198 | 157 | ||
| 199 | if (pages) { | 158 | if (pages) { |
| @@ -212,7 +171,6 @@ finish_or_fault: | |||
| 212 | return i ? : -EFAULT; | 171 | return i ? : -EFAULT; |
| 213 | } | 172 | } |
| 214 | 173 | ||
| 215 | |||
| 216 | /* | 174 | /* |
| 217 | * get a list of pages in an address range belonging to the specified process | 175 | * get a list of pages in an address range belonging to the specified process |
| 218 | * and indicate the VMA that covers each page | 176 | * and indicate the VMA that covers each page |
| @@ -221,22 +179,41 @@ finish_or_fault: | |||
| 221 | * - don't permit access to VMAs that don't support it, such as I/O mappings | 179 | * - don't permit access to VMAs that don't support it, such as I/O mappings |
| 222 | */ | 180 | */ |
| 223 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | 181 | int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, |
| 224 | unsigned long start, int len, int write, int force, | 182 | unsigned long start, int nr_pages, int write, int force, |
| 225 | struct page **pages, struct vm_area_struct **vmas) | 183 | struct page **pages, struct vm_area_struct **vmas) |
| 226 | { | 184 | { |
| 227 | int flags = 0; | 185 | int flags = 0; |
| 228 | 186 | ||
| 229 | if (write) | 187 | if (write) |
| 230 | flags |= GUP_FLAGS_WRITE; | 188 | flags |= FOLL_WRITE; |
| 231 | if (force) | 189 | if (force) |
| 232 | flags |= GUP_FLAGS_FORCE; | 190 | flags |= FOLL_FORCE; |
| 233 | 191 | ||
| 234 | return __get_user_pages(tsk, mm, | 192 | return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); |
| 235 | start, len, flags, | ||
| 236 | pages, vmas); | ||
| 237 | } | 193 | } |
| 238 | EXPORT_SYMBOL(get_user_pages); | 194 | EXPORT_SYMBOL(get_user_pages); |
| 239 | 195 | ||
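The hunk above retires the private GUP_FLAGS_* bits in favour of the generic FOLL_* flags, so get_user_pages() now only translates its write/force arguments before delegating to __get_user_pages(). A minimal sketch of that translation on its own (illustrative helper, not part of the patch):

    /* Sketch: map the legacy write/force arguments onto FOLL_* flags. */
    static int gup_flags_from_args(int write, int force)
    {
            int flags = 0;

            if (write)
                    flags |= FOLL_WRITE;    /* caller intends to write to the pages   */
            if (force)
                    flags |= FOLL_FORCE;    /* only the VM_MAY* permissions are needed */
            return flags;
    }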
| 196 | /** | ||
| 197 | * follow_pfn - look up PFN at a user virtual address | ||
| 198 | * @vma: memory mapping | ||
| 199 | * @address: user virtual address | ||
| 200 | * @pfn: location to store found PFN | ||
| 201 | * | ||
| 202 | * Only IO mappings and raw PFN mappings are allowed. | ||
| 203 | * | ||
| 204 | * Returns zero and the pfn at @pfn on success, -ve otherwise. | ||
| 205 | */ | ||
| 206 | int follow_pfn(struct vm_area_struct *vma, unsigned long address, | ||
| 207 | unsigned long *pfn) | ||
| 208 | { | ||
| 209 | if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) | ||
| 210 | return -EINVAL; | ||
| 211 | |||
| 212 | *pfn = address >> PAGE_SHIFT; | ||
| 213 | return 0; | ||
| 214 | } | ||
| 215 | EXPORT_SYMBOL(follow_pfn); | ||
| 216 | |||
| 240 | DEFINE_RWLOCK(vmlist_lock); | 217 | DEFINE_RWLOCK(vmlist_lock); |
| 241 | struct vm_struct *vmlist; | 218 | struct vm_struct *vmlist; |
| 242 | 219 | ||
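On nommu, follow_pfn() only has to check for an I/O or raw-PFN mapping and shift the address, since user virtual addresses are physical addresses. A hedged usage sketch (the caller is assumed to hold mmap_sem; the helper name is illustrative):

    /* Sketch: resolve a user address inside an I/O mapping to a page frame number. */
    static int addr_to_pfn(struct mm_struct *mm, unsigned long addr,
                           unsigned long *pfn)
    {
            struct vm_area_struct *vma = find_vma(mm, addr);

            if (!vma || addr < vma->vm_start)
                    return -EFAULT;
            /* fails with -EINVAL unless the vma is VM_IO or VM_PFNMAP */
            return follow_pfn(vma, addr, pfn);
    }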
| @@ -463,12 +440,11 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
| 463 | */ | 440 | */ |
| 464 | void __init mmap_init(void) | 441 | void __init mmap_init(void) |
| 465 | { | 442 | { |
| 466 | vm_region_jar = kmem_cache_create("vm_region_jar", | 443 | int ret; |
| 467 | sizeof(struct vm_region), 0, | 444 | |
| 468 | SLAB_PANIC, NULL); | 445 | ret = percpu_counter_init(&vm_committed_as, 0); |
| 469 | vm_area_cachep = kmem_cache_create("vm_area_struct", | 446 | VM_BUG_ON(ret); |
| 470 | sizeof(struct vm_area_struct), 0, | 447 | vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); |
| 471 | SLAB_PANIC, NULL); | ||
| 472 | } | 448 | } |
| 473 | 449 | ||
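mmap_init() now also brings up the vm_committed_as percpu counter (consumed later in __vm_enough_memory()) and creates the region cache with the KMEM_CACHE() convenience macro. The macro expands to roughly the open-coded call it replaces; the expansion below is quoted from memory, so treat it as an approximation:

    /* KMEM_CACHE(vm_region, SLAB_PANIC) is roughly equivalent to: */
    kmem_cache_create("vm_region",                     /* name derived from the struct tag */
                      sizeof(struct vm_region),        /* object size                      */
                      __alignof__(struct vm_region),   /* natural alignment                */
                      SLAB_PANIC,                      /* panic if the cache can't be made */
                      NULL);                           /* no constructor                   */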
| 474 | /* | 450 | /* |
| @@ -486,27 +462,24 @@ static noinline void validate_nommu_regions(void) | |||
| 486 | return; | 462 | return; |
| 487 | 463 | ||
| 488 | last = rb_entry(lastp, struct vm_region, vm_rb); | 464 | last = rb_entry(lastp, struct vm_region, vm_rb); |
| 489 | if (unlikely(last->vm_end <= last->vm_start)) | 465 | BUG_ON(unlikely(last->vm_end <= last->vm_start)); |
| 490 | BUG(); | 466 | BUG_ON(unlikely(last->vm_top < last->vm_end)); |
| 491 | if (unlikely(last->vm_top < last->vm_end)) | ||
| 492 | BUG(); | ||
| 493 | 467 | ||
| 494 | while ((p = rb_next(lastp))) { | 468 | while ((p = rb_next(lastp))) { |
| 495 | region = rb_entry(p, struct vm_region, vm_rb); | 469 | region = rb_entry(p, struct vm_region, vm_rb); |
| 496 | last = rb_entry(lastp, struct vm_region, vm_rb); | 470 | last = rb_entry(lastp, struct vm_region, vm_rb); |
| 497 | 471 | ||
| 498 | if (unlikely(region->vm_end <= region->vm_start)) | 472 | BUG_ON(unlikely(region->vm_end <= region->vm_start)); |
| 499 | BUG(); | 473 | BUG_ON(unlikely(region->vm_top < region->vm_end)); |
| 500 | if (unlikely(region->vm_top < region->vm_end)) | 474 | BUG_ON(unlikely(region->vm_start < last->vm_top)); |
| 501 | BUG(); | ||
| 502 | if (unlikely(region->vm_start < last->vm_top)) | ||
| 503 | BUG(); | ||
| 504 | 475 | ||
| 505 | lastp = p; | 476 | lastp = p; |
| 506 | } | 477 | } |
| 507 | } | 478 | } |
| 508 | #else | 479 | #else |
| 509 | #define validate_nommu_regions() do {} while(0) | 480 | static void validate_nommu_regions(void) |
| 481 | { | ||
| 482 | } | ||
| 510 | #endif | 483 | #endif |
| 511 | 484 | ||
| 512 | /* | 485 | /* |
| @@ -519,8 +492,6 @@ static void add_nommu_region(struct vm_region *region) | |||
| 519 | 492 | ||
| 520 | validate_nommu_regions(); | 493 | validate_nommu_regions(); |
| 521 | 494 | ||
| 522 | BUG_ON(region->vm_start & ~PAGE_MASK); | ||
| 523 | |||
| 524 | parent = NULL; | 495 | parent = NULL; |
| 525 | p = &nommu_region_tree.rb_node; | 496 | p = &nommu_region_tree.rb_node; |
| 526 | while (*p) { | 497 | while (*p) { |
| @@ -563,16 +534,17 @@ static void free_page_series(unsigned long from, unsigned long to) | |||
| 563 | struct page *page = virt_to_page(from); | 534 | struct page *page = virt_to_page(from); |
| 564 | 535 | ||
| 565 | kdebug("- free %lx", from); | 536 | kdebug("- free %lx", from); |
| 566 | atomic_dec(&mmap_pages_allocated); | 537 | atomic_long_dec(&mmap_pages_allocated); |
| 567 | if (page_count(page) != 1) | 538 | if (page_count(page) != 1) |
| 568 | kdebug("free page %p [%d]", page, page_count(page)); | 539 | kdebug("free page %p: refcount not one: %d", |
| 540 | page, page_count(page)); | ||
| 569 | put_page(page); | 541 | put_page(page); |
| 570 | } | 542 | } |
| 571 | } | 543 | } |
| 572 | 544 | ||
| 573 | /* | 545 | /* |
| 574 | * release a reference to a region | 546 | * release a reference to a region |
| 575 | * - the caller must hold the region semaphore, which this releases | 547 | * - the caller must hold the region semaphore for writing, which this releases |
| 576 | * - the region may not have been added to the tree yet, in which case vm_top | 548 | * - the region may not have been added to the tree yet, in which case vm_top |
| 577 | * will equal vm_start | 549 | * will equal vm_start |
| 578 | */ | 550 | */ |
| @@ -613,6 +585,22 @@ static void put_nommu_region(struct vm_region *region) | |||
| 613 | } | 585 | } |
| 614 | 586 | ||
| 615 | /* | 587 | /* |
| 588 | * update protection on a vma | ||
| 589 | */ | ||
| 590 | static void protect_vma(struct vm_area_struct *vma, unsigned long flags) | ||
| 591 | { | ||
| 592 | #ifdef CONFIG_MPU | ||
| 593 | struct mm_struct *mm = vma->vm_mm; | ||
| 594 | long start = vma->vm_start & PAGE_MASK; | ||
| 595 | while (start < vma->vm_end) { | ||
| 596 | protect_page(mm, start, flags); | ||
| 597 | start += PAGE_SIZE; | ||
| 598 | } | ||
| 599 | update_protections(mm); | ||
| 600 | #endif | ||
| 601 | } | ||
| 602 | |||
| 603 | /* | ||
| 616 | * add a VMA into a process's mm_struct in the appropriate place in the list | 604 | * add a VMA into a process's mm_struct in the appropriate place in the list |
| 617 | * and tree and add to the address space's page tree also if not an anonymous | 605 | * and tree and add to the address space's page tree also if not an anonymous |
| 618 | * page | 606 | * page |
| @@ -631,6 +619,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) | |||
| 631 | mm->map_count++; | 619 | mm->map_count++; |
| 632 | vma->vm_mm = mm; | 620 | vma->vm_mm = mm; |
| 633 | 621 | ||
| 622 | protect_vma(vma, vma->vm_flags); | ||
| 623 | |||
| 634 | /* add the VMA to the mapping */ | 624 | /* add the VMA to the mapping */ |
| 635 | if (vma->vm_file) { | 625 | if (vma->vm_file) { |
| 636 | mapping = vma->vm_file->f_mapping; | 626 | mapping = vma->vm_file->f_mapping; |
| @@ -693,6 +683,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) | |||
| 693 | 683 | ||
| 694 | kenter("%p", vma); | 684 | kenter("%p", vma); |
| 695 | 685 | ||
| 686 | protect_vma(vma, 0); | ||
| 687 | |||
| 696 | mm->map_count--; | 688 | mm->map_count--; |
| 697 | if (mm->mmap_cache == vma) | 689 | if (mm->mmap_cache == vma) |
| 698 | mm->mmap_cache = NULL; | 690 | mm->mmap_cache = NULL; |
| @@ -834,7 +826,7 @@ static int validate_mmap_request(struct file *file, | |||
| 834 | int ret; | 826 | int ret; |
| 835 | 827 | ||
| 836 | /* do the simple checks first */ | 828 | /* do the simple checks first */ |
| 837 | if (flags & MAP_FIXED || addr) { | 829 | if (flags & MAP_FIXED) { |
| 838 | printk(KERN_DEBUG | 830 | printk(KERN_DEBUG |
| 839 | "%d: Can't do fixed-address/overlay mmap of RAM\n", | 831 | "%d: Can't do fixed-address/overlay mmap of RAM\n", |
| 840 | current->pid); | 832 | current->pid); |
| @@ -905,6 +897,10 @@ static int validate_mmap_request(struct file *file, | |||
| 905 | if (!file->f_op->read) | 897 | if (!file->f_op->read) |
| 906 | capabilities &= ~BDI_CAP_MAP_COPY; | 898 | capabilities &= ~BDI_CAP_MAP_COPY; |
| 907 | 899 | ||
| 900 | /* The file shall have been opened with read permission. */ | ||
| 901 | if (!(file->f_mode & FMODE_READ)) | ||
| 902 | return -EACCES; | ||
| 903 | |||
| 908 | if (flags & MAP_SHARED) { | 904 | if (flags & MAP_SHARED) { |
| 909 | /* do checks for writing, appending and locking */ | 905 | /* do checks for writing, appending and locking */ |
| 910 | if ((prot & PROT_WRITE) && | 906 | if ((prot & PROT_WRITE) && |
| @@ -1038,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) | |||
| 1038 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1034 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
| 1039 | if (ret == 0) { | 1035 | if (ret == 0) { |
| 1040 | vma->vm_region->vm_top = vma->vm_region->vm_end; | 1036 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
| 1041 | return ret; | 1037 | return 0; |
| 1042 | } | 1038 | } |
| 1043 | if (ret != -ENOSYS) | 1039 | if (ret != -ENOSYS) |
| 1044 | return ret; | 1040 | return ret; |
| @@ -1055,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) | |||
| 1055 | */ | 1051 | */ |
| 1056 | static int do_mmap_private(struct vm_area_struct *vma, | 1052 | static int do_mmap_private(struct vm_area_struct *vma, |
| 1057 | struct vm_region *region, | 1053 | struct vm_region *region, |
| 1058 | unsigned long len) | 1054 | unsigned long len, |
| 1055 | unsigned long capabilities) | ||
| 1059 | { | 1056 | { |
| 1060 | struct page *pages; | 1057 | struct page *pages; |
| 1061 | unsigned long total, point, n, rlen; | 1058 | unsigned long total, point, n, rlen; |
| @@ -1066,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1066 | * shared mappings on devices or memory | 1063 | * shared mappings on devices or memory |
| 1067 | * - VM_MAYSHARE will be set if it may attempt to share | 1064 | * - VM_MAYSHARE will be set if it may attempt to share |
| 1068 | */ | 1065 | */ |
| 1069 | if (vma->vm_file) { | 1066 | if (capabilities & BDI_CAP_MAP_DIRECT) { |
| 1070 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); | 1067 | ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); |
| 1071 | if (ret == 0) { | 1068 | if (ret == 0) { |
| 1072 | /* shouldn't return success if we're not sharing */ | 1069 | /* shouldn't return success if we're not sharing */ |
| 1073 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); | 1070 | BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); |
| 1074 | vma->vm_region->vm_top = vma->vm_region->vm_end; | 1071 | vma->vm_region->vm_top = vma->vm_region->vm_end; |
| 1075 | return ret; | 1072 | return 0; |
| 1076 | } | 1073 | } |
| 1077 | if (ret != -ENOSYS) | 1074 | if (ret != -ENOSYS) |
| 1078 | return ret; | 1075 | return ret; |
| @@ -1096,7 +1093,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1096 | goto enomem; | 1093 | goto enomem; |
| 1097 | 1094 | ||
| 1098 | total = 1 << order; | 1095 | total = 1 << order; |
| 1099 | atomic_add(total, &mmap_pages_allocated); | 1096 | atomic_long_add(total, &mmap_pages_allocated); |
| 1100 | 1097 | ||
| 1101 | point = rlen >> PAGE_SHIFT; | 1098 | point = rlen >> PAGE_SHIFT; |
| 1102 | 1099 | ||
| @@ -1107,7 +1104,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1107 | order = ilog2(total - point); | 1104 | order = ilog2(total - point); |
| 1108 | n = 1 << order; | 1105 | n = 1 << order; |
| 1109 | kdebug("shave %lu/%lu @%lu", n, total - point, total); | 1106 | kdebug("shave %lu/%lu @%lu", n, total - point, total); |
| 1110 | atomic_sub(n, &mmap_pages_allocated); | 1107 | atomic_long_sub(n, &mmap_pages_allocated); |
| 1111 | total -= n; | 1108 | total -= n; |
| 1112 | set_page_refcounted(pages + total); | 1109 | set_page_refcounted(pages + total); |
| 1113 | __free_pages(pages + total, order); | 1110 | __free_pages(pages + total, order); |
| @@ -1185,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1185 | 1182 | ||
| 1186 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | 1183 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); |
| 1187 | 1184 | ||
| 1188 | if (!(flags & MAP_FIXED)) | ||
| 1189 | addr = round_hint_to_min(addr); | ||
| 1190 | |||
| 1191 | /* decide whether we should attempt the mapping, and if so what sort of | 1185 | /* decide whether we should attempt the mapping, and if so what sort of |
| 1192 | * mapping */ | 1186 | * mapping */ |
| 1193 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1187 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
| @@ -1197,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1197 | return ret; | 1191 | return ret; |
| 1198 | } | 1192 | } |
| 1199 | 1193 | ||
| 1194 | /* we ignore the address hint */ | ||
| 1195 | addr = 0; | ||
| 1196 | |||
| 1200 | /* we've determined that we can make the mapping, now translate what we | 1197 | /* we've determined that we can make the mapping, now translate what we |
| 1201 | * now know into VMA flags */ | 1198 | * now know into VMA flags */ |
| 1202 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); | 1199 | vm_flags = determine_vm_flags(file, prot, flags, capabilities); |
| @@ -1310,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1310 | * - this is the hook for quasi-memory character devices to | 1307 | * - this is the hook for quasi-memory character devices to |
| 1311 | * tell us the location of a shared mapping | 1308 | * tell us the location of a shared mapping |
| 1312 | */ | 1309 | */ |
| 1313 | if (file && file->f_op->get_unmapped_area) { | 1310 | if (capabilities & BDI_CAP_MAP_DIRECT) { |
| 1314 | addr = file->f_op->get_unmapped_area(file, addr, len, | 1311 | addr = file->f_op->get_unmapped_area(file, addr, len, |
| 1315 | pgoff, flags); | 1312 | pgoff, flags); |
| 1316 | if (IS_ERR((void *) addr)) { | 1313 | if (IS_ERR((void *) addr)) { |
| @@ -1335,14 +1332,15 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
| 1335 | 1332 | ||
| 1336 | vma->vm_region = region; | 1333 | vma->vm_region = region; |
| 1337 | 1334 | ||
| 1338 | /* set up the mapping */ | 1335 | /* set up the mapping |
| 1336 | * - the region is filled in if BDI_CAP_MAP_DIRECT is still set | ||
| 1337 | */ | ||
| 1339 | if (file && vma->vm_flags & VM_SHARED) | 1338 | if (file && vma->vm_flags & VM_SHARED) |
| 1340 | ret = do_mmap_shared_file(vma); | 1339 | ret = do_mmap_shared_file(vma); |
| 1341 | else | 1340 | else |
| 1342 | ret = do_mmap_private(vma, region, len); | 1341 | ret = do_mmap_private(vma, region, len, capabilities); |
| 1343 | if (ret < 0) | 1342 | if (ret < 0) |
| 1344 | goto error_put_region; | 1343 | goto error_just_free; |
| 1345 | |||
| 1346 | add_nommu_region(region); | 1344 | add_nommu_region(region); |
| 1347 | 1345 | ||
| 1348 | /* okay... we have a mapping; now we have to register it */ | 1346 | /* okay... we have a mapping; now we have to register it */ |
| @@ -1361,25 +1359,14 @@ share: | |||
| 1361 | kleave(" = %lx", result); | 1359 | kleave(" = %lx", result); |
| 1362 | return result; | 1360 | return result; |
| 1363 | 1361 | ||
| 1364 | error_put_region: | ||
| 1365 | __put_nommu_region(region); | ||
| 1366 | if (vma) { | ||
| 1367 | if (vma->vm_file) { | ||
| 1368 | fput(vma->vm_file); | ||
| 1369 | if (vma->vm_flags & VM_EXECUTABLE) | ||
| 1370 | removed_exe_file_vma(vma->vm_mm); | ||
| 1371 | } | ||
| 1372 | kmem_cache_free(vm_area_cachep, vma); | ||
| 1373 | } | ||
| 1374 | kleave(" = %d [pr]", ret); | ||
| 1375 | return ret; | ||
| 1376 | |||
| 1377 | error_just_free: | 1362 | error_just_free: |
| 1378 | up_write(&nommu_region_sem); | 1363 | up_write(&nommu_region_sem); |
| 1379 | error: | 1364 | error: |
| 1380 | fput(region->vm_file); | 1365 | if (region->vm_file) |
| 1366 | fput(region->vm_file); | ||
| 1381 | kmem_cache_free(vm_region_jar, region); | 1367 | kmem_cache_free(vm_region_jar, region); |
| 1382 | fput(vma->vm_file); | 1368 | if (vma->vm_file) |
| 1369 | fput(vma->vm_file); | ||
| 1383 | if (vma->vm_flags & VM_EXECUTABLE) | 1370 | if (vma->vm_flags & VM_EXECUTABLE) |
| 1384 | removed_exe_file_vma(vma->vm_mm); | 1371 | removed_exe_file_vma(vma->vm_mm); |
| 1385 | kmem_cache_free(vm_area_cachep, vma); | 1372 | kmem_cache_free(vm_area_cachep, vma); |
| @@ -1536,10 +1523,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
| 1536 | /* find the first potentially overlapping VMA */ | 1523 | /* find the first potentially overlapping VMA */ |
| 1537 | vma = find_vma(mm, start); | 1524 | vma = find_vma(mm, start); |
| 1538 | if (!vma) { | 1525 | if (!vma) { |
| 1539 | printk(KERN_WARNING | 1526 | static int limit = 0; |
| 1540 | "munmap of memory not mmapped by process %d (%s):" | 1527 | if (limit < 5) { |
| 1541 | " 0x%lx-0x%lx\n", | 1528 | printk(KERN_WARNING |
| 1542 | current->pid, current->comm, start, start + len - 1); | 1529 | "munmap of memory not mmapped by process %d" |
| 1530 | " (%s): 0x%lx-0x%lx\n", | ||
| 1531 | current->pid, current->comm, | ||
| 1532 | start, start + len - 1); | ||
| 1533 | limit++; | ||
| 1534 | } | ||
| 1543 | return -EINVAL; | 1535 | return -EINVAL; |
| 1544 | } | 1536 | } |
| 1545 | 1537 | ||
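do_munmap() now caps its "munmap of memory not mmapped" warning at five messages per boot with a local static counter. A comparable effect could be had with the kernel's printk_ratelimit() helper; a sketch of that alternative, dropped into the same spot and reusing the surrounding function's variables (not what this patch chose):

    if (printk_ratelimit())
            printk(KERN_WARNING
                   "munmap of memory not mmapped by process %d (%s): 0x%lx-0x%lx\n",
                   current->pid, current->comm, start, start + len - 1);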
| @@ -1849,12 +1841,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) | |||
| 1849 | if (mm) | 1841 | if (mm) |
| 1850 | allowed -= mm->total_vm / 32; | 1842 | allowed -= mm->total_vm / 32; |
| 1851 | 1843 | ||
| 1852 | /* | 1844 | if (percpu_counter_read_positive(&vm_committed_as) < allowed) |
| 1853 | * cast `allowed' as a signed long because vm_committed_space | ||
| 1854 | * sometimes has a negative value | ||
| 1855 | */ | ||
| 1856 | if (atomic_long_read(&vm_committed_space) < (long)allowed) | ||
| 1857 | return 0; | 1845 | return 0; |
| 1846 | |||
| 1858 | error: | 1847 | error: |
| 1859 | vm_unacct_memory(pages); | 1848 | vm_unacct_memory(pages); |
| 1860 | 1849 | ||
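The overcommit check now reads vm_committed_as, the percpu_counter initialised in mmap_init() above, instead of the old atomic_long_t vm_committed_space; percpu_counter_read_positive() clamps the approximate (possibly momentarily negative) sum at zero, so the explicit signed cast disappears. A minimal sketch of the counter pattern, with illustrative names:

    #include <linux/percpu_counter.h>

    static struct percpu_counter demo_committed;    /* stand-in for vm_committed_as,
                                                       set up with percpu_counter_init() */

    static void demo_acct(long pages)
    {
            percpu_counter_add(&demo_committed, pages);   /* charge (or uncharge) pages */
    }

    static int demo_fits(unsigned long allowed)
    {
            /* cheap, never-negative, approximate read of the distributed sum */
            return percpu_counter_read_positive(&demo_committed) < allowed;
    }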
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d3b9bac085b5..ea2147dabba6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks; | |||
| 34 | static DEFINE_SPINLOCK(zone_scan_lock); | 34 | static DEFINE_SPINLOCK(zone_scan_lock); |
| 35 | /* #define DEBUG */ | 35 | /* #define DEBUG */ |
| 36 | 36 | ||
| 37 | /* | ||
| 38 | * Do the mems_allowed nodes of any thread in the target process overlap ours? | ||
| 39 | */ | ||
| 40 | static int has_intersects_mems_allowed(struct task_struct *tsk) | ||
| 41 | { | ||
| 42 | struct task_struct *t; | ||
| 43 | |||
| 44 | t = tsk; | ||
| 45 | do { | ||
| 46 | if (cpuset_mems_allowed_intersects(current, t)) | ||
| 47 | return 1; | ||
| 48 | t = next_thread(t); | ||
| 49 | } while (t != tsk); | ||
| 50 | |||
| 51 | return 0; | ||
| 52 | } | ||
| 53 | |||
| 37 | /** | 54 | /** |
| 38 | * badness - calculate a numeric value for how bad this task has been | 55 | * badness - calculate a numeric value for how bad this task has been |
| 39 | * @p: task struct of which task we should calculate | 56 | * @p: task struct of which task we should calculate |
| @@ -58,6 +75,13 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 58 | unsigned long points, cpu_time, run_time; | 75 | unsigned long points, cpu_time, run_time; |
| 59 | struct mm_struct *mm; | 76 | struct mm_struct *mm; |
| 60 | struct task_struct *child; | 77 | struct task_struct *child; |
| 78 | int oom_adj = p->signal->oom_adj; | ||
| 79 | struct task_cputime task_time; | ||
| 80 | unsigned long utime; | ||
| 81 | unsigned long stime; | ||
| 82 | |||
| 83 | if (oom_adj == OOM_DISABLE) | ||
| 84 | return 0; | ||
| 61 | 85 | ||
| 62 | task_lock(p); | 86 | task_lock(p); |
| 63 | mm = p->mm; | 87 | mm = p->mm; |
| @@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 79 | /* | 103 | /* |
| 80 | * swapoff can easily use up all memory, so kill those first. | 104 | * swapoff can easily use up all memory, so kill those first. |
| 81 | */ | 105 | */ |
| 82 | if (p->flags & PF_SWAPOFF) | 106 | if (p->flags & PF_OOM_ORIGIN) |
| 83 | return ULONG_MAX; | 107 | return ULONG_MAX; |
| 84 | 108 | ||
| 85 | /* | 109 | /* |
| @@ -102,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 102 | * of seconds. There is no particular reason for this other than | 126 | * of seconds. There is no particular reason for this other than |
| 103 | * that it turned out to work very well in practice. | 127 | * that it turned out to work very well in practice. |
| 104 | */ | 128 | */ |
| 105 | cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) | 129 | thread_group_cputime(p, &task_time); |
| 106 | >> (SHIFT_HZ + 3); | 130 | utime = cputime_to_jiffies(task_time.utime); |
| 131 | stime = cputime_to_jiffies(task_time.stime); | ||
| 132 | cpu_time = (utime + stime) >> (SHIFT_HZ + 3); | ||
| 133 | |||
| 107 | 134 | ||
| 108 | if (uptime >= p->start_time.tv_sec) | 135 | if (uptime >= p->start_time.tv_sec) |
| 109 | run_time = (uptime - p->start_time.tv_sec) >> 10; | 136 | run_time = (uptime - p->start_time.tv_sec) >> 10; |
| @@ -144,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 144 | * because p may have allocated or otherwise mapped memory on | 171 | * because p may have allocated or otherwise mapped memory on |
| 145 | * this node before. However it will be less likely. | 172 | * this node before. However it will be less likely. |
| 146 | */ | 173 | */ |
| 147 | if (!cpuset_mems_allowed_intersects(current, p)) | 174 | if (!has_intersects_mems_allowed(p)) |
| 148 | points /= 8; | 175 | points /= 8; |
| 149 | 176 | ||
| 150 | /* | 177 | /* |
| 151 | * Adjust the score by oomkilladj. | 178 | * Adjust the score by oom_adj. |
| 152 | */ | 179 | */ |
| 153 | if (p->oomkilladj) { | 180 | if (oom_adj) { |
| 154 | if (p->oomkilladj > 0) { | 181 | if (oom_adj > 0) { |
| 155 | if (!points) | 182 | if (!points) |
| 156 | points = 1; | 183 | points = 1; |
| 157 | points <<= p->oomkilladj; | 184 | points <<= oom_adj; |
| 158 | } else | 185 | } else |
| 159 | points >>= -(p->oomkilladj); | 186 | points >>= -(oom_adj); |
| 160 | } | 187 | } |
| 161 | 188 | ||
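The score is now adjusted by the shared per-signal oom_adj using the same bit-shift scheme as the old per-task oomkilladj: a positive value multiplies the score by 2^oom_adj (bumping a zero score to 1 first so the shift is visible), a negative value divides it. The adjustment in isolation:

    /* Sketch: apply oom_adj (OOM_DISABLE already filtered out) to a raw badness score. */
    static unsigned long apply_oom_adj(unsigned long points, int oom_adj)
    {
            if (oom_adj > 0) {
                    if (!points)
                            points = 1;          /* make sure the shift has an effect */
                    points <<= oom_adj;          /* scale up: points * 2^oom_adj      */
            } else if (oom_adj < 0) {
                    points >>= -oom_adj;         /* scale down: points / 2^(-oom_adj) */
            }
            return points;
    }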
| 162 | #ifdef DEBUG | 189 | #ifdef DEBUG |
| @@ -200,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
| 200 | static struct task_struct *select_bad_process(unsigned long *ppoints, | 227 | static struct task_struct *select_bad_process(unsigned long *ppoints, |
| 201 | struct mem_cgroup *mem) | 228 | struct mem_cgroup *mem) |
| 202 | { | 229 | { |
| 203 | struct task_struct *g, *p; | 230 | struct task_struct *p; |
| 204 | struct task_struct *chosen = NULL; | 231 | struct task_struct *chosen = NULL; |
| 205 | struct timespec uptime; | 232 | struct timespec uptime; |
| 206 | *ppoints = 0; | 233 | *ppoints = 0; |
| 207 | 234 | ||
| 208 | do_posix_clock_monotonic_gettime(&uptime); | 235 | do_posix_clock_monotonic_gettime(&uptime); |
| 209 | do_each_thread(g, p) { | 236 | for_each_process(p) { |
| 210 | unsigned long points; | 237 | unsigned long points; |
| 211 | 238 | ||
| 212 | /* | 239 | /* |
| @@ -251,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
| 251 | *ppoints = ULONG_MAX; | 278 | *ppoints = ULONG_MAX; |
| 252 | } | 279 | } |
| 253 | 280 | ||
| 254 | if (p->oomkilladj == OOM_DISABLE) | 281 | if (p->signal->oom_adj == OOM_DISABLE) |
| 255 | continue; | 282 | continue; |
| 256 | 283 | ||
| 257 | points = badness(p, uptime.tv_sec); | 284 | points = badness(p, uptime.tv_sec); |
| @@ -259,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
| 259 | chosen = p; | 286 | chosen = p; |
| 260 | *ppoints = points; | 287 | *ppoints = points; |
| 261 | } | 288 | } |
| 262 | } while_each_thread(g, p); | 289 | } |
| 263 | 290 | ||
| 264 | return chosen; | 291 | return chosen; |
| 265 | } | 292 | } |
| @@ -284,22 +311,28 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
| 284 | printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " | 311 | printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " |
| 285 | "name\n"); | 312 | "name\n"); |
| 286 | do_each_thread(g, p) { | 313 | do_each_thread(g, p) { |
| 287 | /* | 314 | struct mm_struct *mm; |
| 288 | * total_vm and rss sizes do not exist for tasks with a | 315 | |
| 289 | * detached mm so there's no need to report them. | ||
| 290 | */ | ||
| 291 | if (!p->mm) | ||
| 292 | continue; | ||
| 293 | if (mem && !task_in_mem_cgroup(p, mem)) | 316 | if (mem && !task_in_mem_cgroup(p, mem)) |
| 294 | continue; | 317 | continue; |
| 295 | if (!thread_group_leader(p)) | 318 | if (!thread_group_leader(p)) |
| 296 | continue; | 319 | continue; |
| 297 | 320 | ||
| 298 | task_lock(p); | 321 | task_lock(p); |
| 322 | mm = p->mm; | ||
| 323 | if (!mm) { | ||
| 324 | /* | ||
| 325 | * total_vm and rss sizes do not exist for tasks with no | ||
| 326 | * mm so there's no need to report them; they can't be | ||
| 327 | * oom killed anyway. | ||
| 328 | */ | ||
| 329 | task_unlock(p); | ||
| 330 | continue; | ||
| 331 | } | ||
| 299 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 332 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
| 300 | p->pid, __task_cred(p)->uid, p->tgid, | 333 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
| 301 | p->mm->total_vm, get_mm_rss(p->mm), (int)task_cpu(p), | 334 | get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, |
| 302 | p->oomkilladj, p->comm); | 335 | p->comm); |
| 303 | task_unlock(p); | 336 | task_unlock(p); |
| 304 | } while_each_thread(g, p); | 337 | } while_each_thread(g, p); |
| 305 | } | 338 | } |
| @@ -340,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
| 340 | 373 | ||
| 341 | static int oom_kill_task(struct task_struct *p) | 374 | static int oom_kill_task(struct task_struct *p) |
| 342 | { | 375 | { |
| 343 | struct mm_struct *mm; | ||
| 344 | struct task_struct *g, *q; | ||
| 345 | |||
| 346 | mm = p->mm; | ||
| 347 | |||
| 348 | /* WARNING: mm may not be dereferenced since we did not obtain its | 376 | /* WARNING: mm may not be dereferenced since we did not obtain its |
| 349 | * value from get_task_mm(p). This is OK since all we need to do is | 377 | * value from get_task_mm(p). This is OK since all we need to do is |
| 350 | * compare mm to q->mm below. | 378 | * compare mm to q->mm below. |
| @@ -353,30 +381,11 @@ static int oom_kill_task(struct task_struct *p) | |||
| 353 | * change to NULL at any time since we do not hold task_lock(p). | 381 | * change to NULL at any time since we do not hold task_lock(p). |
| 354 | * However, this is of no concern to us. | 382 | * However, this is of no concern to us. |
| 355 | */ | 383 | */ |
| 356 | 384 | if (!p->mm || p->signal->oom_adj == OOM_DISABLE) | |
| 357 | if (mm == NULL) | ||
| 358 | return 1; | 385 | return 1; |
| 359 | 386 | ||
| 360 | /* | ||
| 361 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
| 362 | */ | ||
| 363 | do_each_thread(g, q) { | ||
| 364 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
| 365 | return 1; | ||
| 366 | } while_each_thread(g, q); | ||
| 367 | |||
| 368 | __oom_kill_task(p, 1); | 387 | __oom_kill_task(p, 1); |
| 369 | 388 | ||
| 370 | /* | ||
| 371 | * kill all processes that share the ->mm (i.e. all threads), | ||
| 372 | * but are in a different thread group. Don't let them have access | ||
| 373 | * to memory reserves though, otherwise we might deplete all memory. | ||
| 374 | */ | ||
| 375 | do_each_thread(g, q) { | ||
| 376 | if (q->mm == mm && !same_thread_group(q, p)) | ||
| 377 | force_sig(SIGKILL, q); | ||
| 378 | } while_each_thread(g, q); | ||
| 379 | |||
| 380 | return 0; | 389 | return 0; |
| 381 | } | 390 | } |
| 382 | 391 | ||
| @@ -388,12 +397,14 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 388 | 397 | ||
| 389 | if (printk_ratelimit()) { | 398 | if (printk_ratelimit()) { |
| 390 | printk(KERN_WARNING "%s invoked oom-killer: " | 399 | printk(KERN_WARNING "%s invoked oom-killer: " |
| 391 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | 400 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", |
| 392 | current->comm, gfp_mask, order, current->oomkilladj); | 401 | current->comm, gfp_mask, order, |
| 402 | current->signal->oom_adj); | ||
| 393 | task_lock(current); | 403 | task_lock(current); |
| 394 | cpuset_print_task_mems_allowed(current); | 404 | cpuset_print_task_mems_allowed(current); |
| 395 | task_unlock(current); | 405 | task_unlock(current); |
| 396 | dump_stack(); | 406 | dump_stack(); |
| 407 | mem_cgroup_print_oom_info(mem, current); | ||
| 397 | show_mem(); | 408 | show_mem(); |
| 398 | if (sysctl_oom_dump_tasks) | 409 | if (sysctl_oom_dump_tasks) |
| 399 | dump_tasks(mem); | 410 | dump_tasks(mem); |
| @@ -513,34 +524,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 513 | */ | 524 | */ |
| 514 | static void __out_of_memory(gfp_t gfp_mask, int order) | 525 | static void __out_of_memory(gfp_t gfp_mask, int order) |
| 515 | { | 526 | { |
| 516 | if (sysctl_oom_kill_allocating_task) { | 527 | struct task_struct *p; |
| 517 | oom_kill_process(current, gfp_mask, order, 0, NULL, | 528 | unsigned long points; |
| 518 | "Out of memory (oom_kill_allocating_task)"); | ||
| 519 | |||
| 520 | } else { | ||
| 521 | unsigned long points; | ||
| 522 | struct task_struct *p; | ||
| 523 | |||
| 524 | retry: | ||
| 525 | /* | ||
| 526 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
| 527 | * issues we may have. | ||
| 528 | */ | ||
| 529 | p = select_bad_process(&points, NULL); | ||
| 530 | 529 | ||
| 531 | if (PTR_ERR(p) == -1UL) | 530 | if (sysctl_oom_kill_allocating_task) |
| 531 | if (!oom_kill_process(current, gfp_mask, order, 0, NULL, | ||
| 532 | "Out of memory (oom_kill_allocating_task)")) | ||
| 532 | return; | 533 | return; |
| 534 | retry: | ||
| 535 | /* | ||
| 536 | * Rambo mode: Shoot down a process and hope it solves whatever | ||
| 537 | * issues we may have. | ||
| 538 | */ | ||
| 539 | p = select_bad_process(&points, NULL); | ||
| 533 | 540 | ||
| 534 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 541 | if (PTR_ERR(p) == -1UL) |
| 535 | if (!p) { | 542 | return; |
| 536 | read_unlock(&tasklist_lock); | ||
| 537 | panic("Out of memory and no killable processes...\n"); | ||
| 538 | } | ||
| 539 | 543 | ||
| 540 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | 544 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
| 541 | "Out of memory")) | 545 | if (!p) { |
| 542 | goto retry; | 546 | read_unlock(&tasklist_lock); |
| 547 | panic("Out of memory and no killable processes...\n"); | ||
| 543 | } | 548 | } |
| 549 | |||
| 550 | if (oom_kill_process(p, gfp_mask, order, points, NULL, | ||
| 551 | "Out of memory")) | ||
| 552 | goto retry; | ||
| 544 | } | 553 | } |
| 545 | 554 | ||
| 546 | /* | 555 | /* |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 30351f0063ac..0b19943ecf8b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -36,15 +36,6 @@ | |||
| 36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
| 37 | 37 | ||
| 38 | /* | 38 | /* |
| 39 | * The maximum number of pages to writeout in a single bdflush/kupdate | ||
| 40 | * operation. We do this so we don't hold I_SYNC against an inode for | ||
| 41 | * enormous amounts of time, which would block a userspace task which has | ||
| 42 | * been forced to throttle against that inode. Also, the code reevaluates | ||
| 43 | * the dirty each time it has written this many pages. | ||
| 44 | */ | ||
| 45 | #define MAX_WRITEBACK_PAGES 1024 | ||
| 46 | |||
| 47 | /* | ||
| 48 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited | 39 | * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited |
| 49 | * will look to see if it needs to force writeback or throttling. | 40 | * will look to see if it needs to force writeback or throttling. |
| 50 | */ | 41 | */ |
| @@ -53,18 +44,21 @@ static long ratelimit_pages = 32; | |||
| 53 | /* | 44 | /* |
| 54 | * When balance_dirty_pages decides that the caller needs to perform some | 45 | * When balance_dirty_pages decides that the caller needs to perform some |
| 55 | * non-background writeback, this is how many pages it will attempt to write. | 46 | * non-background writeback, this is how many pages it will attempt to write. |
| 56 | * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably | 47 | * It should be somewhat larger than dirtied pages to ensure that reasonably |
| 57 | * large amounts of I/O are submitted. | 48 | * large amounts of I/O are submitted. |
| 58 | */ | 49 | */ |
| 59 | static inline long sync_writeback_pages(void) | 50 | static inline long sync_writeback_pages(unsigned long dirtied) |
| 60 | { | 51 | { |
| 61 | return ratelimit_pages + ratelimit_pages / 2; | 52 | if (dirtied < ratelimit_pages) |
| 53 | dirtied = ratelimit_pages; | ||
| 54 | |||
| 55 | return dirtied + dirtied / 2; | ||
| 62 | } | 56 | } |
| 63 | 57 | ||
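sync_writeback_pages() now scales with the number of pages the task actually dirtied, with ratelimit_pages as a floor, rather than always returning 1.5 * ratelimit_pages. Restated as a standalone helper with a worked example:

    /* Sketch: the write_chunk later handed to balance_dirty_pages(). */
    static long write_chunk_for(unsigned long dirtied, unsigned long floor)
    {
            if (dirtied < floor)
                    dirtied = floor;             /* never ask for less than the floor */
            return dirtied + dirtied / 2;        /* 1.5x what was actually dirtied    */
    }
    /* e.g. floor = 32: dirtying 100 pages requests 150, dirtying 8 still requests 48 */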
| 64 | /* The following parameters are exported via /proc/sys/vm */ | 58 | /* The following parameters are exported via /proc/sys/vm */ |
| 65 | 59 | ||
| 66 | /* | 60 | /* |
| 67 | * Start background writeback (via pdflush) at this percentage | 61 | * Start background writeback (via writeback threads) at this percentage |
| 68 | */ | 62 | */ |
| 69 | int dirty_background_ratio = 10; | 63 | int dirty_background_ratio = 10; |
| 70 | 64 | ||
| @@ -94,12 +88,12 @@ unsigned long vm_dirty_bytes; | |||
| 94 | /* | 88 | /* |
| 95 | * The interval between `kupdate'-style writebacks | 89 | * The interval between `kupdate'-style writebacks |
| 96 | */ | 90 | */ |
| 97 | unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */ | 91 | unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ |
| 98 | 92 | ||
| 99 | /* | 93 | /* |
| 100 | * The longest time for which data is allowed to remain dirty | 94 | * The longest time for which data is allowed to remain dirty |
| 101 | */ | 95 | */ |
| 102 | unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */ | 96 | unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ |
| 103 | 97 | ||
| 104 | /* | 98 | /* |
| 105 | * Flag that makes the machine dump writes/reads and block dirtyings. | 99 | * Flag that makes the machine dump writes/reads and block dirtyings. |
| @@ -117,8 +111,6 @@ EXPORT_SYMBOL(laptop_mode); | |||
| 117 | /* End of sysctl-exported parameters */ | 111 | /* End of sysctl-exported parameters */ |
| 118 | 112 | ||
| 119 | 113 | ||
| 120 | static void background_writeout(unsigned long _min_pages); | ||
| 121 | |||
| 122 | /* | 114 | /* |
| 123 | * Scale the writeback cache size proportional to the relative writeout speeds. | 115 | * Scale the writeback cache size proportional to the relative writeout speeds. |
| 124 | * | 116 | * |
| @@ -166,37 +158,37 @@ static void update_completion_period(void) | |||
| 166 | } | 158 | } |
| 167 | 159 | ||
| 168 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 160 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
| 169 | struct file *filp, void __user *buffer, size_t *lenp, | 161 | void __user *buffer, size_t *lenp, |
| 170 | loff_t *ppos) | 162 | loff_t *ppos) |
| 171 | { | 163 | { |
| 172 | int ret; | 164 | int ret; |
| 173 | 165 | ||
| 174 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 166 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 175 | if (ret == 0 && write) | 167 | if (ret == 0 && write) |
| 176 | dirty_background_bytes = 0; | 168 | dirty_background_bytes = 0; |
| 177 | return ret; | 169 | return ret; |
| 178 | } | 170 | } |
| 179 | 171 | ||
| 180 | int dirty_background_bytes_handler(struct ctl_table *table, int write, | 172 | int dirty_background_bytes_handler(struct ctl_table *table, int write, |
| 181 | struct file *filp, void __user *buffer, size_t *lenp, | 173 | void __user *buffer, size_t *lenp, |
| 182 | loff_t *ppos) | 174 | loff_t *ppos) |
| 183 | { | 175 | { |
| 184 | int ret; | 176 | int ret; |
| 185 | 177 | ||
| 186 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | 178 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
| 187 | if (ret == 0 && write) | 179 | if (ret == 0 && write) |
| 188 | dirty_background_ratio = 0; | 180 | dirty_background_ratio = 0; |
| 189 | return ret; | 181 | return ret; |
| 190 | } | 182 | } |
| 191 | 183 | ||
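Throughout this file the sysctl handlers (and the proc_do* helpers they call) lose the unused struct file * argument. A minimal handler written against the new prototype would look like the following; the name and the reaction to a write are purely illustrative:

    static int demo_ratio_handler(struct ctl_table *table, int write,
                                  void __user *buffer, size_t *lenp, loff_t *ppos)
    {
            int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);

            if (ret == 0 && write) {
                    /* the value was updated: recompute anything derived from it */
            }
            return ret;
    }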
| 192 | int dirty_ratio_handler(struct ctl_table *table, int write, | 184 | int dirty_ratio_handler(struct ctl_table *table, int write, |
| 193 | struct file *filp, void __user *buffer, size_t *lenp, | 185 | void __user *buffer, size_t *lenp, |
| 194 | loff_t *ppos) | 186 | loff_t *ppos) |
| 195 | { | 187 | { |
| 196 | int old_ratio = vm_dirty_ratio; | 188 | int old_ratio = vm_dirty_ratio; |
| 197 | int ret; | 189 | int ret; |
| 198 | 190 | ||
| 199 | ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | 191 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 200 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 192 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
| 201 | update_completion_period(); | 193 | update_completion_period(); |
| 202 | vm_dirty_bytes = 0; | 194 | vm_dirty_bytes = 0; |
| @@ -206,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
| 206 | 198 | ||
| 207 | 199 | ||
| 208 | int dirty_bytes_handler(struct ctl_table *table, int write, | 200 | int dirty_bytes_handler(struct ctl_table *table, int write, |
| 209 | struct file *filp, void __user *buffer, size_t *lenp, | 201 | void __user *buffer, size_t *lenp, |
| 210 | loff_t *ppos) | 202 | loff_t *ppos) |
| 211 | { | 203 | { |
| 212 | unsigned long old_bytes = vm_dirty_bytes; | 204 | unsigned long old_bytes = vm_dirty_bytes; |
| 213 | int ret; | 205 | int ret; |
| 214 | 206 | ||
| 215 | ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); | 207 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
| 216 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | 208 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
| 217 | update_completion_period(); | 209 | update_completion_period(); |
| 218 | vm_dirty_ratio = 0; | 210 | vm_dirty_ratio = 0; |
| @@ -265,18 +257,19 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, | |||
| 265 | * This avoids exceeding the total dirty_limit when the floating averages | 257 | * This avoids exceeding the total dirty_limit when the floating averages |
| 266 | * fluctuate too quickly. | 258 | * fluctuate too quickly. |
| 267 | */ | 259 | */ |
| 268 | static void | 260 | static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, |
| 269 | clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty) | 261 | unsigned long dirty, unsigned long *pbdi_dirty) |
| 270 | { | 262 | { |
| 271 | long avail_dirty; | 263 | unsigned long avail_dirty; |
| 272 | 264 | ||
| 273 | avail_dirty = dirty - | 265 | avail_dirty = global_page_state(NR_FILE_DIRTY) + |
| 274 | (global_page_state(NR_FILE_DIRTY) + | ||
| 275 | global_page_state(NR_WRITEBACK) + | 266 | global_page_state(NR_WRITEBACK) + |
| 276 | global_page_state(NR_UNSTABLE_NFS) + | 267 | global_page_state(NR_UNSTABLE_NFS) + |
| 277 | global_page_state(NR_WRITEBACK_TEMP)); | 268 | global_page_state(NR_WRITEBACK_TEMP); |
| 278 | 269 | ||
| 279 | if (avail_dirty < 0) | 270 | if (avail_dirty < dirty) |
| 271 | avail_dirty = dirty - avail_dirty; | ||
| 272 | else | ||
| 280 | avail_dirty = 0; | 273 | avail_dirty = 0; |
| 281 | 274 | ||
| 282 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + | 275 | avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + |
| @@ -299,10 +292,10 @@ static inline void task_dirties_fraction(struct task_struct *tsk, | |||
| 299 | * | 292 | * |
| 300 | * dirty -= (dirty/8) * p_{t} | 293 | * dirty -= (dirty/8) * p_{t} |
| 301 | */ | 294 | */ |
| 302 | static void task_dirty_limit(struct task_struct *tsk, long *pdirty) | 295 | static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) |
| 303 | { | 296 | { |
| 304 | long numerator, denominator; | 297 | long numerator, denominator; |
| 305 | long dirty = *pdirty; | 298 | unsigned long dirty = *pdirty; |
| 306 | u64 inv = dirty >> 3; | 299 | u64 inv = dirty >> 3; |
| 307 | 300 | ||
| 308 | task_dirties_fraction(tsk, &numerator, &denominator); | 301 | task_dirties_fraction(tsk, &numerator, &denominator); |
| @@ -319,15 +312,13 @@ static void task_dirty_limit(struct task_struct *tsk, long *pdirty) | |||
| 319 | /* | 312 | /* |
| 320 | * | 313 | * |
| 321 | */ | 314 | */ |
| 322 | static DEFINE_SPINLOCK(bdi_lock); | ||
| 323 | static unsigned int bdi_min_ratio; | 315 | static unsigned int bdi_min_ratio; |
| 324 | 316 | ||
| 325 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) | 317 | int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) |
| 326 | { | 318 | { |
| 327 | int ret = 0; | 319 | int ret = 0; |
| 328 | unsigned long flags; | ||
| 329 | 320 | ||
| 330 | spin_lock_irqsave(&bdi_lock, flags); | 321 | spin_lock_bh(&bdi_lock); |
| 331 | if (min_ratio > bdi->max_ratio) { | 322 | if (min_ratio > bdi->max_ratio) { |
| 332 | ret = -EINVAL; | 323 | ret = -EINVAL; |
| 333 | } else { | 324 | } else { |
| @@ -339,27 +330,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) | |||
| 339 | ret = -EINVAL; | 330 | ret = -EINVAL; |
| 340 | } | 331 | } |
| 341 | } | 332 | } |
| 342 | spin_unlock_irqrestore(&bdi_lock, flags); | 333 | spin_unlock_bh(&bdi_lock); |
| 343 | 334 | ||
| 344 | return ret; | 335 | return ret; |
| 345 | } | 336 | } |
| 346 | 337 | ||
| 347 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | 338 | int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) |
| 348 | { | 339 | { |
| 349 | unsigned long flags; | ||
| 350 | int ret = 0; | 340 | int ret = 0; |
| 351 | 341 | ||
| 352 | if (max_ratio > 100) | 342 | if (max_ratio > 100) |
| 353 | return -EINVAL; | 343 | return -EINVAL; |
| 354 | 344 | ||
| 355 | spin_lock_irqsave(&bdi_lock, flags); | 345 | spin_lock_bh(&bdi_lock); |
| 356 | if (bdi->min_ratio > max_ratio) { | 346 | if (bdi->min_ratio > max_ratio) { |
| 357 | ret = -EINVAL; | 347 | ret = -EINVAL; |
| 358 | } else { | 348 | } else { |
| 359 | bdi->max_ratio = max_ratio; | 349 | bdi->max_ratio = max_ratio; |
| 360 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; | 350 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; |
| 361 | } | 351 | } |
| 362 | spin_unlock_irqrestore(&bdi_lock, flags); | 352 | spin_unlock_bh(&bdi_lock); |
| 363 | 353 | ||
| 364 | return ret; | 354 | return ret; |
| 365 | } | 355 | } |
| @@ -393,7 +383,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
| 393 | struct zone *z = | 383 | struct zone *z = |
| 394 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 384 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
| 395 | 385 | ||
| 396 | x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); | 386 | x += zone_page_state(z, NR_FREE_PAGES) + |
| 387 | zone_reclaimable_pages(z); | ||
| 397 | } | 388 | } |
| 398 | /* | 389 | /* |
| 399 | * Make sure that the number of highmem pages is never larger | 390 | * Make sure that the number of highmem pages is never larger |
| @@ -417,7 +408,7 @@ unsigned long determine_dirtyable_memory(void) | |||
| 417 | { | 408 | { |
| 418 | unsigned long x; | 409 | unsigned long x; |
| 419 | 410 | ||
| 420 | x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); | 411 | x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); |
| 421 | 412 | ||
| 422 | if (!vm_highmem_is_dirtyable) | 413 | if (!vm_highmem_is_dirtyable) |
| 423 | x -= highmem_dirtyable_memory(x); | 414 | x -= highmem_dirtyable_memory(x); |
| @@ -486,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, | |||
| 486 | * balance_dirty_pages() must be called by processes which are generating dirty | 477 | * balance_dirty_pages() must be called by processes which are generating dirty |
| 487 | * data. It looks at the number of dirty pages in the machine and will force | 478 | * data. It looks at the number of dirty pages in the machine and will force |
| 488 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. | 479 | * the caller to perform writeback if the system is over `vm_dirty_ratio'. |
| 489 | * If we're over `background_thresh' then pdflush is woken to perform some | 480 | * If we're over `background_thresh' then the writeback threads are woken to |
| 490 | * writeout. | 481 | * perform some writeout. |
| 491 | */ | 482 | */ |
| 492 | static void balance_dirty_pages(struct address_space *mapping) | 483 | static void balance_dirty_pages(struct address_space *mapping, |
| 484 | unsigned long write_chunk) | ||
| 493 | { | 485 | { |
| 494 | long nr_reclaimable, bdi_nr_reclaimable; | 486 | long nr_reclaimable, bdi_nr_reclaimable; |
| 495 | long nr_writeback, bdi_nr_writeback; | 487 | long nr_writeback, bdi_nr_writeback; |
| @@ -497,7 +489,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 497 | unsigned long dirty_thresh; | 489 | unsigned long dirty_thresh; |
| 498 | unsigned long bdi_thresh; | 490 | unsigned long bdi_thresh; |
| 499 | unsigned long pages_written = 0; | 491 | unsigned long pages_written = 0; |
| 500 | unsigned long write_chunk = sync_writeback_pages(); | 492 | unsigned long pause = 1; |
| 501 | 493 | ||
| 502 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 494 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
| 503 | 495 | ||
| @@ -540,9 +532,12 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 540 | * filesystems (i.e. NFS) in which data may have been | 532 | * filesystems (i.e. NFS) in which data may have been |
| 541 | * written to the server's write cache, but has not yet | 533 | * written to the server's write cache, but has not yet |
| 542 | * been flushed to permanent storage. | 534 | * been flushed to permanent storage. |
| 535 | * Only move pages to writeback if this bdi is over its | ||
| 536 | * threshold; otherwise wait until the disk writes catch | ||
| 537 | * up. | ||
| 543 | */ | 538 | */ |
| 544 | if (bdi_nr_reclaimable) { | 539 | if (bdi_nr_reclaimable > bdi_thresh) { |
| 545 | writeback_inodes(&wbc); | 540 | writeback_inodes_wbc(&wbc); |
| 546 | pages_written += write_chunk - wbc.nr_to_write; | 541 | pages_written += write_chunk - wbc.nr_to_write; |
| 547 | get_dirty_limits(&background_thresh, &dirty_thresh, | 542 | get_dirty_limits(&background_thresh, &dirty_thresh, |
| 548 | &bdi_thresh, bdi); | 543 | &bdi_thresh, bdi); |
| @@ -571,7 +566,16 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 571 | if (pages_written >= write_chunk) | 566 | if (pages_written >= write_chunk) |
| 572 | break; /* We've done our duty */ | 567 | break; /* We've done our duty */ |
| 573 | 568 | ||
| 574 | congestion_wait(WRITE, HZ/10); | 569 | __set_current_state(TASK_INTERRUPTIBLE); |
| 570 | io_schedule_timeout(pause); | ||
| 571 | |||
| 572 | /* | ||
| 573 | * Increase the delay for each loop, up to our previous | ||
| 574 | * default of taking a 100ms nap. | ||
| 575 | */ | ||
| 576 | pause <<= 1; | ||
| 577 | if (pause > HZ / 10) | ||
| 578 | pause = HZ / 10; | ||
| 575 | } | 579 | } |
| 576 | 580 | ||
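Rather than a flat congestion_wait(WRITE, HZ/10) on every pass, the throttled task now sleeps with a doubling pause capped at the old 100ms value, so light dirtiers back off gently while heavy dirtiers still end up napping for HZ/10. The backoff shape on its own, with the loop condition left hypothetical:

    unsigned long pause = 1;                          /* start at a single jiffy    */

    while (still_over_dirty_thresholds()) {           /* hypothetical condition     */
            __set_current_state(TASK_INTERRUPTIBLE);
            io_schedule_timeout(pause);               /* sleep, accounted as iowait */

            pause <<= 1;                              /* exponential backoff        */
            if (pause > HZ / 10)
                    pause = HZ / 10;                  /* cap at the old 100ms nap   */
    }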
| 577 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && | 581 | if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && |
| @@ -579,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 579 | bdi->dirty_exceeded = 0; | 583 | bdi->dirty_exceeded = 0; |
| 580 | 584 | ||
| 581 | if (writeback_in_progress(bdi)) | 585 | if (writeback_in_progress(bdi)) |
| 582 | return; /* pdflush is already working this queue */ | 586 | return; |
| 583 | 587 | ||
| 584 | /* | 588 | /* |
| 585 | * In laptop mode, we wait until hitting the higher threshold before | 589 | * In laptop mode, we wait until hitting the higher threshold before |
| @@ -590,10 +594,10 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
| 590 | * background_thresh, to keep the amount of dirty memory low. | 594 | * background_thresh, to keep the amount of dirty memory low. |
| 591 | */ | 595 | */ |
| 592 | if ((laptop_mode && pages_written) || | 596 | if ((laptop_mode && pages_written) || |
| 593 | (!laptop_mode && (global_page_state(NR_FILE_DIRTY) | 597 | (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) |
| 594 | + global_page_state(NR_UNSTABLE_NFS) | 598 | + global_page_state(NR_UNSTABLE_NFS)) |
| 595 | > background_thresh))) | 599 | > background_thresh))) |
| 596 | pdflush_operation(background_writeout, 0); | 600 | bdi_start_writeback(bdi, NULL, 0); |
| 597 | } | 601 | } |
| 598 | 602 | ||
| 599 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 603 | void set_page_dirty_balance(struct page *page, int page_mkwrite) |
| @@ -606,6 +610,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) | |||
| 606 | } | 610 | } |
| 607 | } | 611 | } |
| 608 | 612 | ||
| 613 | static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; | ||
| 614 | |||
| 609 | /** | 615 | /** |
| 610 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state | 616 | * balance_dirty_pages_ratelimited_nr - balance dirty memory state |
| 611 | * @mapping: address_space which was dirtied | 617 | * @mapping: address_space which was dirtied |
| @@ -623,7 +629,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) | |||
| 623 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | 629 | void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, |
| 624 | unsigned long nr_pages_dirtied) | 630 | unsigned long nr_pages_dirtied) |
| 625 | { | 631 | { |
| 626 | static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; | ||
| 627 | unsigned long ratelimit; | 632 | unsigned long ratelimit; |
| 628 | unsigned long *p; | 633 | unsigned long *p; |
| 629 | 634 | ||
| @@ -636,12 +641,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, | |||
| 636 | * tasks in balance_dirty_pages(). Period. | 641 | * tasks in balance_dirty_pages(). Period. |
| 637 | */ | 642 | */ |
| 638 | preempt_disable(); | 643 | preempt_disable(); |
| 639 | p = &__get_cpu_var(ratelimits); | 644 | p = &__get_cpu_var(bdp_ratelimits); |
| 640 | *p += nr_pages_dirtied; | 645 | *p += nr_pages_dirtied; |
| 641 | if (unlikely(*p >= ratelimit)) { | 646 | if (unlikely(*p >= ratelimit)) { |
| 647 | ratelimit = sync_writeback_pages(*p); | ||
| 642 | *p = 0; | 648 | *p = 0; |
| 643 | preempt_enable(); | 649 | preempt_enable(); |
| 644 | balance_dirty_pages(mapping); | 650 | balance_dirty_pages(mapping, ratelimit); |
| 645 | return; | 651 | return; |
| 646 | } | 652 | } |
| 647 | preempt_enable(); | 653 | preempt_enable(); |
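Moving the per-CPU dirtied-page counter out of the function lets the ratelimit path pass the real amount this CPU dirtied into sync_writeback_pages(). The general batch-behind-a-per-CPU-counter pattern looks roughly like this (names and threshold are illustrative):

    static DEFINE_PER_CPU(unsigned long, demo_pending);   /* pages dirtied on this CPU */

    static void demo_ratelimited(unsigned long nr_dirtied)
    {
            unsigned long *p;

            preempt_disable();                        /* stay on this CPU's counter */
            p = &__get_cpu_var(demo_pending);
            *p += nr_dirtied;
            if (*p >= 64) {                           /* demo threshold             */
                    unsigned long batch = *p;
                    *p = 0;
                    preempt_enable();
                    demo_do_balancing(batch);         /* hypothetical slow path     */
                    return;
            }
            preempt_enable();
    }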
| @@ -665,7 +671,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
| 665 | if (global_page_state(NR_UNSTABLE_NFS) + | 671 | if (global_page_state(NR_UNSTABLE_NFS) + |
| 666 | global_page_state(NR_WRITEBACK) <= dirty_thresh) | 672 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
| 667 | break; | 673 | break; |
| 668 | congestion_wait(WRITE, HZ/10); | 674 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 669 | 675 | ||
| 670 | /* | 676 | /* |
| 671 | * The caller might hold locks which can prevent IO completion | 677 | * The caller might hold locks which can prevent IO completion |
| @@ -677,153 +683,35 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
| 677 | } | 683 | } |
| 678 | } | 684 | } |
| 679 | 685 | ||
| 680 | /* | ||
| 681 | * writeback at least _min_pages, and keep writing until the amount of dirty | ||
| 682 | * memory is less than the background threshold, or until we're all clean. | ||
| 683 | */ | ||
| 684 | static void background_writeout(unsigned long _min_pages) | ||
| 685 | { | ||
| 686 | long min_pages = _min_pages; | ||
| 687 | struct writeback_control wbc = { | ||
| 688 | .bdi = NULL, | ||
| 689 | .sync_mode = WB_SYNC_NONE, | ||
| 690 | .older_than_this = NULL, | ||
| 691 | .nr_to_write = 0, | ||
| 692 | .nonblocking = 1, | ||
| 693 | .range_cyclic = 1, | ||
| 694 | }; | ||
| 695 | |||
| 696 | for ( ; ; ) { | ||
| 697 | unsigned long background_thresh; | ||
| 698 | unsigned long dirty_thresh; | ||
| 699 | |||
| 700 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | ||
| 701 | if (global_page_state(NR_FILE_DIRTY) + | ||
| 702 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | ||
| 703 | && min_pages <= 0) | ||
| 704 | break; | ||
| 705 | wbc.more_io = 0; | ||
| 706 | wbc.encountered_congestion = 0; | ||
| 707 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
| 708 | wbc.pages_skipped = 0; | ||
| 709 | writeback_inodes(&wbc); | ||
| 710 | min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
| 711 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | ||
| 712 | /* Wrote less than expected */ | ||
| 713 | if (wbc.encountered_congestion || wbc.more_io) | ||
| 714 | congestion_wait(WRITE, HZ/10); | ||
| 715 | else | ||
| 716 | break; | ||
| 717 | } | ||
| 718 | } | ||
| 719 | } | ||
| 720 | |||
| 721 | /* | ||
| 722 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back | ||
| 723 | * the whole world. Returns 0 if a pdflush thread was dispatched. Returns | ||
| 724 | * -1 if all pdflush threads were busy. | ||
| 725 | */ | ||
| 726 | int wakeup_pdflush(long nr_pages) | ||
| 727 | { | ||
| 728 | if (nr_pages == 0) | ||
| 729 | nr_pages = global_page_state(NR_FILE_DIRTY) + | ||
| 730 | global_page_state(NR_UNSTABLE_NFS); | ||
| 731 | return pdflush_operation(background_writeout, nr_pages); | ||
| 732 | } | ||
| 733 | |||
| 734 | static void wb_timer_fn(unsigned long unused); | ||
| 735 | static void laptop_timer_fn(unsigned long unused); | 686 | static void laptop_timer_fn(unsigned long unused); |
| 736 | 687 | ||
| 737 | static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); | ||
| 738 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | 688 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); |
| 739 | 689 | ||
| 740 | /* | 690 | /* |
| 741 | * Periodic writeback of "old" data. | ||
| 742 | * | ||
| 743 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | ||
| 744 | * dirtying-time in the inode's address_space. So this periodic writeback code | ||
| 745 | * just walks the superblock inode list, writing back any inodes which are | ||
| 746 | * older than a specific point in time. | ||
| 747 | * | ||
| 748 | * Try to run once per dirty_writeback_interval. But if a writeback event | ||
| 749 | * takes longer than a dirty_writeback_interval interval, then leave a | ||
| 750 | * one-second gap. | ||
| 751 | * | ||
| 752 | * older_than_this takes precedence over nr_to_write. So we'll only write back | ||
| 753 | * all dirty pages if they are all attached to "old" mappings. | ||
| 754 | */ | ||
| 755 | static void wb_kupdate(unsigned long arg) | ||
| 756 | { | ||
| 757 | unsigned long oldest_jif; | ||
| 758 | unsigned long start_jif; | ||
| 759 | unsigned long next_jif; | ||
| 760 | long nr_to_write; | ||
| 761 | struct writeback_control wbc = { | ||
| 762 | .bdi = NULL, | ||
| 763 | .sync_mode = WB_SYNC_NONE, | ||
| 764 | .older_than_this = &oldest_jif, | ||
| 765 | .nr_to_write = 0, | ||
| 766 | .nonblocking = 1, | ||
| 767 | .for_kupdate = 1, | ||
| 768 | .range_cyclic = 1, | ||
| 769 | }; | ||
| 770 | |||
| 771 | sync_supers(); | ||
| 772 | |||
| 773 | oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval); | ||
| 774 | start_jif = jiffies; | ||
| 775 | next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); | ||
| 776 | nr_to_write = global_page_state(NR_FILE_DIRTY) + | ||
| 777 | global_page_state(NR_UNSTABLE_NFS) + | ||
| 778 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | ||
| 779 | while (nr_to_write > 0) { | ||
| 780 | wbc.more_io = 0; | ||
| 781 | wbc.encountered_congestion = 0; | ||
| 782 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
| 783 | writeback_inodes(&wbc); | ||
| 784 | if (wbc.nr_to_write > 0) { | ||
| 785 | if (wbc.encountered_congestion || wbc.more_io) | ||
| 786 | congestion_wait(WRITE, HZ/10); | ||
| 787 | else | ||
| 788 | break; /* All the old data is written */ | ||
| 789 | } | ||
| 790 | nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
| 791 | } | ||
| 792 | if (time_before(next_jif, jiffies + HZ)) | ||
| 793 | next_jif = jiffies + HZ; | ||
| 794 | if (dirty_writeback_interval) | ||
| 795 | mod_timer(&wb_timer, next_jif); | ||
| 796 | } | ||
| 797 | |||
| 798 | /* | ||
| 799 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 691 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
| 800 | */ | 692 | */ |
| 801 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, | 693 | int dirty_writeback_centisecs_handler(ctl_table *table, int write, |
| 802 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 694 | void __user *buffer, size_t *length, loff_t *ppos) |
| 803 | { | 695 | { |
| 804 | proc_dointvec(table, write, file, buffer, length, ppos); | 696 | proc_dointvec(table, write, buffer, length, ppos); |
| 805 | if (dirty_writeback_interval) | ||
| 806 | mod_timer(&wb_timer, jiffies + | ||
| 807 | msecs_to_jiffies(dirty_writeback_interval * 10)); | ||
| 808 | else | ||
| 809 | del_timer(&wb_timer); | ||
| 810 | return 0; | 697 | return 0; |
| 811 | } | 698 | } |
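
The sysctl handler above keeps its proc_dointvec() call but, with pdflush gone, no longer re-arms a writeback timer; the struct file * argument also drops out of the handler signature. For context, a handler like this is wired up through a ctl_table entry along these lines (a sketch from memory, not the exact kernel/sysctl.c entry; dirty_writeback_interval and the handler are declared in <linux/writeback.h>):

#include <linux/sysctl.h>
#include <linux/writeback.h>

/* Sketch only: the real table entry lives in kernel/sysctl.c and may
 * differ in detail. */
static ctl_table writeback_sysctl_sketch[] = {
	{
		.procname	= "dirty_writeback_centisecs",
		.data		= &dirty_writeback_interval,
		.maxlen		= sizeof(dirty_writeback_interval),
		.mode		= 0644,
		.proc_handler	= dirty_writeback_centisecs_handler,
	},
	{ }	/* sentinel */
};
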
| 812 | 699 | ||
| 813 | static void wb_timer_fn(unsigned long unused) | 700 | static void do_laptop_sync(struct work_struct *work) |
| 814 | { | 701 | { |
| 815 | if (pdflush_operation(wb_kupdate, 0) < 0) | 702 | wakeup_flusher_threads(0); |
| 816 | mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ | 703 | kfree(work); |
| 817 | } | ||
| 818 | |||
| 819 | static void laptop_flush(unsigned long unused) | ||
| 820 | { | ||
| 821 | sys_sync(); | ||
| 822 | } | 704 | } |
| 823 | 705 | ||
| 824 | static void laptop_timer_fn(unsigned long unused) | 706 | static void laptop_timer_fn(unsigned long unused) |
| 825 | { | 707 | { |
| 826 | pdflush_operation(laptop_flush, 0); | 708 | struct work_struct *work; |
| 709 | |||
| 710 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | ||
| 711 | if (work) { | ||
| 712 | INIT_WORK(work, do_laptop_sync); | ||
| 713 | schedule_work(work); | ||
| 714 | } | ||
| 827 | } | 715 | } |
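
The old pdflush-based laptop_flush() is replaced by a one-shot work item: the timer callback runs in atomic context, so it allocates the work_struct with GFP_ATOMIC and the handler frees it after doing the blocking work. A minimal sketch of the same pattern, with hypothetical names:

#include <linux/slab.h>
#include <linux/workqueue.h>

/* Runs later in process context and frees the work item it was handed. */
static void my_deferred_work(struct work_struct *work)
{
	/* ...do the sleeping work here (e.g. kick writeback)... */
	kfree(work);
}

/* Timer callback: atomic context, so GFP_ATOMIC and no sleeping allowed. */
static void my_timer_fn(unsigned long unused)
{
	struct work_struct *work = kmalloc(sizeof(*work), GFP_ATOMIC);

	if (work) {
		INIT_WORK(work, my_deferred_work);
		schedule_work(work);
	}
	/* If the allocation fails, this round is simply skipped. */
}
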
| 828 | 716 | ||
| 829 | /* | 717 | /* |
| @@ -906,8 +794,6 @@ void __init page_writeback_init(void) | |||
| 906 | { | 794 | { |
| 907 | int shift; | 795 | int shift; |
| 908 | 796 | ||
| 909 | mod_timer(&wb_timer, | ||
| 910 | jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); | ||
| 911 | writeback_set_ratelimit(); | 797 | writeback_set_ratelimit(); |
| 912 | register_cpu_notifier(&ratelimit_nb); | 798 | register_cpu_notifier(&ratelimit_nb); |
| 913 | 799 | ||
| @@ -935,7 +821,6 @@ int write_cache_pages(struct address_space *mapping, | |||
| 935 | struct writeback_control *wbc, writepage_t writepage, | 821 | struct writeback_control *wbc, writepage_t writepage, |
| 936 | void *data) | 822 | void *data) |
| 937 | { | 823 | { |
| 938 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
| 939 | int ret = 0; | 824 | int ret = 0; |
| 940 | int done = 0; | 825 | int done = 0; |
| 941 | struct pagevec pvec; | 826 | struct pagevec pvec; |
| @@ -948,11 +833,6 @@ int write_cache_pages(struct address_space *mapping, | |||
| 948 | int range_whole = 0; | 833 | int range_whole = 0; |
| 949 | long nr_to_write = wbc->nr_to_write; | 834 | long nr_to_write = wbc->nr_to_write; |
| 950 | 835 | ||
| 951 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
| 952 | wbc->encountered_congestion = 1; | ||
| 953 | return 0; | ||
| 954 | } | ||
| 955 | |||
| 956 | pagevec_init(&pvec, 0); | 836 | pagevec_init(&pvec, 0); |
| 957 | if (wbc->range_cyclic) { | 837 | if (wbc->range_cyclic) { |
| 958 | writeback_index = mapping->writeback_index; /* prev offset */ | 838 | writeback_index = mapping->writeback_index; /* prev offset */ |
| @@ -1071,12 +951,6 @@ continue_unlock: | |||
| 1071 | break; | 951 | break; |
| 1072 | } | 952 | } |
| 1073 | } | 953 | } |
| 1074 | |||
| 1075 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | ||
| 1076 | wbc->encountered_congestion = 1; | ||
| 1077 | done = 1; | ||
| 1078 | break; | ||
| 1079 | } | ||
| 1080 | } | 954 | } |
| 1081 | pagevec_release(&pvec); | 955 | pagevec_release(&pvec); |
| 1082 | cond_resched(); | 956 | cond_resched(); |
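
The hunks above drop the bdi_write_congested() checks from write_cache_pages(); throttling is now left to the per-bdi flusher threads. Callers are unaffected: a filesystem's ->writepages still just passes a writepage callback through, roughly as generic_writepages() does. A hedged sketch, where my_writepage and myfs_writepages are hypothetical names:

#include <linux/pagemap.h>
#include <linux/writeback.h>

/* Hand each dirty page to the mapping's ->writepage; this mirrors what
 * generic_writepages() does internally. */
static int my_writepage(struct page *page, struct writeback_control *wbc,
			void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);

	mapping_set_error(mapping, ret);
	return ret;
}

/* A filesystem's ->writepages implementation (hypothetical). */
static int myfs_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	return write_cache_pages(mapping, wbc, my_writepage, mapping);
}
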
| @@ -1141,12 +1015,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) | |||
| 1141 | 1015 | ||
| 1142 | if (wbc->nr_to_write <= 0) | 1016 | if (wbc->nr_to_write <= 0) |
| 1143 | return 0; | 1017 | return 0; |
| 1144 | wbc->for_writepages = 1; | ||
| 1145 | if (mapping->a_ops->writepages) | 1018 | if (mapping->a_ops->writepages) |
| 1146 | ret = mapping->a_ops->writepages(mapping, wbc); | 1019 | ret = mapping->a_ops->writepages(mapping, wbc); |
| 1147 | else | 1020 | else |
| 1148 | ret = generic_writepages(mapping, wbc); | 1021 | ret = generic_writepages(mapping, wbc); |
| 1149 | wbc->for_writepages = 0; | ||
| 1150 | return ret; | 1022 | return ret; |
| 1151 | } | 1023 | } |
| 1152 | 1024 | ||
| @@ -1270,6 +1142,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) | |||
| 1270 | EXPORT_SYMBOL(redirty_page_for_writepage); | 1142 | EXPORT_SYMBOL(redirty_page_for_writepage); |
| 1271 | 1143 | ||
| 1272 | /* | 1144 | /* |
| 1145 | * Dirty a page. | ||
| 1146 | * | ||
| 1147 | * For pages with a mapping this should be done under the page lock | ||
| 1148 | * for the benefit of asynchronous memory errors, which prefer a consistent | ||
| 1149 | * dirty state. This rule can be broken in some special cases, | ||
| 1150 | * but it is better not to. | ||
| 1151 | * | ||
| 1273 | * If the mapping doesn't provide a set_page_dirty a_op, then | 1152 | * If the mapping doesn't provide a set_page_dirty a_op, then |
| 1274 | * just fall through and assume that it wants buffer_heads. | 1153 | * just fall through and assume that it wants buffer_heads. |
| 1275 | */ | 1154 | */ |
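
In caller terms, the new comment asks for roughly this pattern when dirtying a page-cache page (a minimal sketch, assuming the page lock is not already held):

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Dirty a page-cache page while holding the page lock, so asynchronous
 * memory-error handling sees a consistent dirty state. */
static void dirty_page_locked(struct page *page)
{
	lock_page(page);
	set_page_dirty(page);
	unlock_page(page);
}
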
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0284e528748d..2bc2ac63f41e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/bootmem.h> | 23 | #include <linux/bootmem.h> |
| 24 | #include <linux/compiler.h> | 24 | #include <linux/compiler.h> |
| 25 | #include <linux/kernel.h> | 25 | #include <linux/kernel.h> |
| 26 | #include <linux/kmemcheck.h> | ||
| 26 | #include <linux/module.h> | 27 | #include <linux/module.h> |
| 27 | #include <linux/suspend.h> | 28 | #include <linux/suspend.h> |
| 28 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
| @@ -46,6 +47,8 @@ | |||
| 46 | #include <linux/page-isolation.h> | 47 | #include <linux/page-isolation.h> |
| 47 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
| 48 | #include <linux/debugobjects.h> | 49 | #include <linux/debugobjects.h> |
| 50 | #include <linux/kmemleak.h> | ||
| 51 | #include <trace/events/kmem.h> | ||
| 49 | 52 | ||
| 50 | #include <asm/tlbflush.h> | 53 | #include <asm/tlbflush.h> |
| 51 | #include <asm/div64.h> | 54 | #include <asm/div64.h> |
| @@ -69,8 +72,8 @@ EXPORT_SYMBOL(node_states); | |||
| 69 | 72 | ||
| 70 | unsigned long totalram_pages __read_mostly; | 73 | unsigned long totalram_pages __read_mostly; |
| 71 | unsigned long totalreserve_pages __read_mostly; | 74 | unsigned long totalreserve_pages __read_mostly; |
| 72 | unsigned long highest_memmap_pfn __read_mostly; | ||
| 73 | int percpu_pagelist_fraction; | 75 | int percpu_pagelist_fraction; |
| 76 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | ||
| 74 | 77 | ||
| 75 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 78 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
| 76 | int pageblock_order __read_mostly; | 79 | int pageblock_order __read_mostly; |
| @@ -120,8 +123,8 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
| 120 | 123 | ||
| 121 | int min_free_kbytes = 1024; | 124 | int min_free_kbytes = 1024; |
| 122 | 125 | ||
| 123 | unsigned long __meminitdata nr_kernel_pages; | 126 | static unsigned long __meminitdata nr_kernel_pages; |
| 124 | unsigned long __meminitdata nr_all_pages; | 127 | static unsigned long __meminitdata nr_all_pages; |
| 125 | static unsigned long __meminitdata dma_reserve; | 128 | static unsigned long __meminitdata dma_reserve; |
| 126 | 129 | ||
| 127 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 130 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
| @@ -149,10 +152,6 @@ static unsigned long __meminitdata dma_reserve; | |||
| 149 | static int __meminitdata nr_nodemap_entries; | 152 | static int __meminitdata nr_nodemap_entries; |
| 150 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 153 | static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
| 151 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 154 | static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
| 152 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 153 | static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; | ||
| 154 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | ||
| 155 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
| 156 | static unsigned long __initdata required_kernelcore; | 155 | static unsigned long __initdata required_kernelcore; |
| 157 | static unsigned long __initdata required_movablecore; | 156 | static unsigned long __initdata required_movablecore; |
| 158 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 157 | static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
| @@ -164,17 +163,25 @@ static unsigned long __meminitdata dma_reserve; | |||
| 164 | 163 | ||
| 165 | #if MAX_NUMNODES > 1 | 164 | #if MAX_NUMNODES > 1 |
| 166 | int nr_node_ids __read_mostly = MAX_NUMNODES; | 165 | int nr_node_ids __read_mostly = MAX_NUMNODES; |
| 166 | int nr_online_nodes __read_mostly = 1; | ||
| 167 | EXPORT_SYMBOL(nr_node_ids); | 167 | EXPORT_SYMBOL(nr_node_ids); |
| 168 | EXPORT_SYMBOL(nr_online_nodes); | ||
| 168 | #endif | 169 | #endif |
| 169 | 170 | ||
| 170 | int page_group_by_mobility_disabled __read_mostly; | 171 | int page_group_by_mobility_disabled __read_mostly; |
| 171 | 172 | ||
| 172 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 173 | static void set_pageblock_migratetype(struct page *page, int migratetype) |
| 173 | { | 174 | { |
| 175 | |||
| 176 | if (unlikely(page_group_by_mobility_disabled)) | ||
| 177 | migratetype = MIGRATE_UNMOVABLE; | ||
| 178 | |||
| 174 | set_pageblock_flags_group(page, (unsigned long)migratetype, | 179 | set_pageblock_flags_group(page, (unsigned long)migratetype, |
| 175 | PB_migrate, PB_migrate_end); | 180 | PB_migrate, PB_migrate_end); |
| 176 | } | 181 | } |
| 177 | 182 | ||
| 183 | bool oom_killer_disabled __read_mostly; | ||
| 184 | |||
| 178 | #ifdef CONFIG_DEBUG_VM | 185 | #ifdef CONFIG_DEBUG_VM |
| 179 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 186 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| 180 | { | 187 | { |
| @@ -227,6 +234,12 @@ static void bad_page(struct page *page) | |||
| 227 | static unsigned long nr_shown; | 234 | static unsigned long nr_shown; |
| 228 | static unsigned long nr_unshown; | 235 | static unsigned long nr_unshown; |
| 229 | 236 | ||
| 237 | /* Don't complain about poisoned pages */ | ||
| 238 | if (PageHWPoison(page)) { | ||
| 239 | __ClearPageBuddy(page); | ||
| 240 | return; | ||
| 241 | } | ||
| 242 | |||
| 230 | /* | 243 | /* |
| 231 | * Allow a burst of 60 reports, then keep quiet for that minute; | 244 | * Allow a burst of 60 reports, then keep quiet for that minute; |
| 232 | * or allow a steady drip of one report per second. | 245 | * or allow a steady drip of one report per second. |
| @@ -297,23 +310,6 @@ void prep_compound_page(struct page *page, unsigned long order) | |||
| 297 | } | 310 | } |
| 298 | } | 311 | } |
| 299 | 312 | ||
| 300 | #ifdef CONFIG_HUGETLBFS | ||
| 301 | void prep_compound_gigantic_page(struct page *page, unsigned long order) | ||
| 302 | { | ||
| 303 | int i; | ||
| 304 | int nr_pages = 1 << order; | ||
| 305 | struct page *p = page + 1; | ||
| 306 | |||
| 307 | set_compound_page_dtor(page, free_compound_page); | ||
| 308 | set_compound_order(page, order); | ||
| 309 | __SetPageHead(page); | ||
| 310 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | ||
| 311 | __SetPageTail(p); | ||
| 312 | p->first_page = page; | ||
| 313 | } | ||
| 314 | } | ||
| 315 | #endif | ||
| 316 | |||
| 317 | static int destroy_compound_page(struct page *page, unsigned long order) | 313 | static int destroy_compound_page(struct page *page, unsigned long order) |
| 318 | { | 314 | { |
| 319 | int i; | 315 | int i; |
| @@ -331,7 +327,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
| 331 | for (i = 1; i < nr_pages; i++) { | 327 | for (i = 1; i < nr_pages; i++) { |
| 332 | struct page *p = page + i; | 328 | struct page *p = page + i; |
| 333 | 329 | ||
| 334 | if (unlikely(!PageTail(p) | (p->first_page != page))) { | 330 | if (unlikely(!PageTail(p) || (p->first_page != page))) { |
| 335 | bad_page(page); | 331 | bad_page(page); |
| 336 | bad++; | 332 | bad++; |
| 337 | } | 333 | } |
| @@ -420,7 +416,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 420 | return 0; | 416 | return 0; |
| 421 | 417 | ||
| 422 | if (PageBuddy(buddy) && page_order(buddy) == order) { | 418 | if (PageBuddy(buddy) && page_order(buddy) == order) { |
| 423 | BUG_ON(page_count(buddy) != 0); | 419 | VM_BUG_ON(page_count(buddy) != 0); |
| 424 | return 1; | 420 | return 1; |
| 425 | } | 421 | } |
| 426 | return 0; | 422 | return 0; |
| @@ -451,22 +447,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, | |||
| 451 | */ | 447 | */ |
| 452 | 448 | ||
| 453 | static inline void __free_one_page(struct page *page, | 449 | static inline void __free_one_page(struct page *page, |
| 454 | struct zone *zone, unsigned int order) | 450 | struct zone *zone, unsigned int order, |
| 451 | int migratetype) | ||
| 455 | { | 452 | { |
| 456 | unsigned long page_idx; | 453 | unsigned long page_idx; |
| 457 | int order_size = 1 << order; | ||
| 458 | int migratetype = get_pageblock_migratetype(page); | ||
| 459 | 454 | ||
| 460 | if (unlikely(PageCompound(page))) | 455 | if (unlikely(PageCompound(page))) |
| 461 | if (unlikely(destroy_compound_page(page, order))) | 456 | if (unlikely(destroy_compound_page(page, order))) |
| 462 | return; | 457 | return; |
| 463 | 458 | ||
| 459 | VM_BUG_ON(migratetype == -1); | ||
| 460 | |||
| 464 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); | 461 | page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); |
| 465 | 462 | ||
| 466 | VM_BUG_ON(page_idx & (order_size - 1)); | 463 | VM_BUG_ON(page_idx & ((1 << order) - 1)); |
| 467 | VM_BUG_ON(bad_range(zone, page)); | 464 | VM_BUG_ON(bad_range(zone, page)); |
| 468 | 465 | ||
| 469 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | ||
| 470 | while (order < MAX_ORDER-1) { | 466 | while (order < MAX_ORDER-1) { |
| 471 | unsigned long combined_idx; | 467 | unsigned long combined_idx; |
| 472 | struct page *buddy; | 468 | struct page *buddy; |
| @@ -490,12 +486,26 @@ static inline void __free_one_page(struct page *page, | |||
| 490 | zone->free_area[order].nr_free++; | 486 | zone->free_area[order].nr_free++; |
| 491 | } | 487 | } |
| 492 | 488 | ||
| 489 | #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT | ||
| 490 | /* | ||
| 491 | * free_page_mlock() -- clean up attempts to free an mlocked() page. | ||
| 492 | * Page should not be on lru, so no need to fix that up. | ||
| 493 | * free_pages_check() will verify... | ||
| 494 | */ | ||
| 495 | static inline void free_page_mlock(struct page *page) | ||
| 496 | { | ||
| 497 | __dec_zone_page_state(page, NR_MLOCK); | ||
| 498 | __count_vm_event(UNEVICTABLE_MLOCKFREED); | ||
| 499 | } | ||
| 500 | #else | ||
| 501 | static void free_page_mlock(struct page *page) { } | ||
| 502 | #endif | ||
| 503 | |||
| 493 | static inline int free_pages_check(struct page *page) | 504 | static inline int free_pages_check(struct page *page) |
| 494 | { | 505 | { |
| 495 | free_page_mlock(page); | ||
| 496 | if (unlikely(page_mapcount(page) | | 506 | if (unlikely(page_mapcount(page) | |
| 497 | (page->mapping != NULL) | | 507 | (page->mapping != NULL) | |
| 498 | (page_count(page) != 0) | | 508 | (atomic_read(&page->_count) != 0) | |
| 499 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { | 509 | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { |
| 500 | bad_page(page); | 510 | bad_page(page); |
| 501 | return 1; | 511 | return 1; |
| @@ -506,7 +516,7 @@ static inline int free_pages_check(struct page *page) | |||
| 506 | } | 516 | } |
| 507 | 517 | ||
| 508 | /* | 518 | /* |
| 509 | * Frees a list of pages. | 519 | * Frees a number of pages from the PCP lists |
| 510 | * Assumes all pages on list are in same zone, and of same order. | 520 | * Assumes all pages on list are in same zone, and of same order. |
| 511 | * count is the number of pages to free. | 521 | * count is the number of pages to free. |
| 512 | * | 522 | * |
| @@ -516,30 +526,55 @@ static inline int free_pages_check(struct page *page) | |||
| 516 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 526 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
| 517 | * pinned" detection logic. | 527 | * pinned" detection logic. |
| 518 | */ | 528 | */ |
| 519 | static void free_pages_bulk(struct zone *zone, int count, | 529 | static void free_pcppages_bulk(struct zone *zone, int count, |
| 520 | struct list_head *list, int order) | 530 | struct per_cpu_pages *pcp) |
| 521 | { | 531 | { |
| 532 | int migratetype = 0; | ||
| 533 | int batch_free = 0; | ||
| 534 | |||
| 522 | spin_lock(&zone->lock); | 535 | spin_lock(&zone->lock); |
| 523 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 536 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
| 524 | zone->pages_scanned = 0; | 537 | zone->pages_scanned = 0; |
| 525 | while (count--) { | 538 | |
| 539 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | ||
| 540 | while (count) { | ||
| 526 | struct page *page; | 541 | struct page *page; |
| 542 | struct list_head *list; | ||
| 527 | 543 | ||
| 528 | VM_BUG_ON(list_empty(list)); | 544 | /* |
| 529 | page = list_entry(list->prev, struct page, lru); | 545 | * Remove pages from lists in a round-robin fashion. A |
| 530 | /* have to delete it as __free_one_page list manipulates */ | 546 | * batch_free count is maintained that is incremented when an |
| 531 | list_del(&page->lru); | 547 | * empty list is encountered. This is so more pages are freed |
| 532 | __free_one_page(page, zone, order); | 548 | * off fuller lists instead of spinning excessively around empty |
| 549 | * lists | ||
| 550 | */ | ||
| 551 | do { | ||
| 552 | batch_free++; | ||
| 553 | if (++migratetype == MIGRATE_PCPTYPES) | ||
| 554 | migratetype = 0; | ||
| 555 | list = &pcp->lists[migratetype]; | ||
| 556 | } while (list_empty(list)); | ||
| 557 | |||
| 558 | do { | ||
| 559 | page = list_entry(list->prev, struct page, lru); | ||
| 560 | /* must delete as __free_one_page list manipulates */ | ||
| 561 | list_del(&page->lru); | ||
| 562 | __free_one_page(page, zone, 0, migratetype); | ||
| 563 | trace_mm_page_pcpu_drain(page, 0, migratetype); | ||
| 564 | } while (--count && --batch_free && !list_empty(list)); | ||
| 533 | } | 565 | } |
| 534 | spin_unlock(&zone->lock); | 566 | spin_unlock(&zone->lock); |
| 535 | } | 567 | } |
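
The selection loop in free_pcppages_bulk() cycles through the per-cpu lists and bumps batch_free for every list it steps onto, so fuller lists contribute proportionally more pages per pass. A standalone illustration of just that selection logic (plain C with made-up list lengths, not kernel code):

#include <stdio.h>

#define NR_TYPES 3	/* stands in for MIGRATE_PCPTYPES */

/* Free 'count' pages from per-type lists whose lengths are in len[];
 * the caller must ensure the lists hold at least 'count' pages. */
static void drain(int count, int len[NR_TYPES])
{
	int migratetype = 0;
	int batch_free = 0;

	while (count > 0) {
		/* Pick the next non-empty list, counting every list stepped onto. */
		do {
			batch_free++;
			if (++migratetype == NR_TYPES)
				migratetype = 0;
		} while (len[migratetype] == 0);

		/* Take up to batch_free pages from the chosen list. */
		do {
			len[migratetype]--;
			printf("freed one page from list %d\n", migratetype);
		} while (--count && --batch_free && len[migratetype]);
	}
}

int main(void)
{
	int len[NR_TYPES] = { 1, 0, 5 };	/* one fairly full list */

	drain(4, len);
	return 0;
}
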
| 536 | 568 | ||
| 537 | static void free_one_page(struct zone *zone, struct page *page, int order) | 569 | static void free_one_page(struct zone *zone, struct page *page, int order, |
| 570 | int migratetype) | ||
| 538 | { | 571 | { |
| 539 | spin_lock(&zone->lock); | 572 | spin_lock(&zone->lock); |
| 540 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 573 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); |
| 541 | zone->pages_scanned = 0; | 574 | zone->pages_scanned = 0; |
| 542 | __free_one_page(page, zone, order); | 575 | |
| 576 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | ||
| 577 | __free_one_page(page, zone, order, migratetype); | ||
| 543 | spin_unlock(&zone->lock); | 578 | spin_unlock(&zone->lock); |
| 544 | } | 579 | } |
| 545 | 580 | ||
| @@ -548,6 +583,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 548 | unsigned long flags; | 583 | unsigned long flags; |
| 549 | int i; | 584 | int i; |
| 550 | int bad = 0; | 585 | int bad = 0; |
| 586 | int wasMlocked = __TestClearPageMlocked(page); | ||
| 587 | |||
| 588 | kmemcheck_free_shadow(page, order); | ||
| 551 | 589 | ||
| 552 | for (i = 0 ; i < (1 << order) ; ++i) | 590 | for (i = 0 ; i < (1 << order) ; ++i) |
| 553 | bad += free_pages_check(page + i); | 591 | bad += free_pages_check(page + i); |
| @@ -563,8 +601,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 563 | kernel_map_pages(page, 1 << order, 0); | 601 | kernel_map_pages(page, 1 << order, 0); |
| 564 | 602 | ||
| 565 | local_irq_save(flags); | 603 | local_irq_save(flags); |
| 604 | if (unlikely(wasMlocked)) | ||
| 605 | free_page_mlock(page); | ||
| 566 | __count_vm_events(PGFREE, 1 << order); | 606 | __count_vm_events(PGFREE, 1 << order); |
| 567 | free_one_page(page_zone(page), page, order); | 607 | free_one_page(page_zone(page), page, order, |
| 608 | get_pageblock_migratetype(page)); | ||
| 568 | local_irq_restore(flags); | 609 | local_irq_restore(flags); |
| 569 | } | 610 | } |
| 570 | 611 | ||
| @@ -631,15 +672,27 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 631 | /* | 672 | /* |
| 632 | * This page is about to be returned from the page allocator | 673 | * This page is about to be returned from the page allocator |
| 633 | */ | 674 | */ |
| 634 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | 675 | static inline int check_new_page(struct page *page) |
| 635 | { | 676 | { |
| 636 | if (unlikely(page_mapcount(page) | | 677 | if (unlikely(page_mapcount(page) | |
| 637 | (page->mapping != NULL) | | 678 | (page->mapping != NULL) | |
| 638 | (page_count(page) != 0) | | 679 | (atomic_read(&page->_count) != 0) | |
| 639 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { | 680 | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { |
| 640 | bad_page(page); | 681 | bad_page(page); |
| 641 | return 1; | 682 | return 1; |
| 642 | } | 683 | } |
| 684 | return 0; | ||
| 685 | } | ||
| 686 | |||
| 687 | static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | ||
| 688 | { | ||
| 689 | int i; | ||
| 690 | |||
| 691 | for (i = 0; i < (1 << order); i++) { | ||
| 692 | struct page *p = page + i; | ||
| 693 | if (unlikely(check_new_page(p))) | ||
| 694 | return 1; | ||
| 695 | } | ||
| 643 | 696 | ||
| 644 | set_page_private(page, 0); | 697 | set_page_private(page, 0); |
| 645 | set_page_refcounted(page); | 698 | set_page_refcounted(page); |
| @@ -660,7 +713,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 660 | * Go through the free lists for the given migratetype and remove | 713 | * Go through the free lists for the given migratetype and remove |
| 661 | * the smallest available page from the freelists | 714 | * the smallest available page from the freelists |
| 662 | */ | 715 | */ |
| 663 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | 716 | static inline |
| 717 | struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | ||
| 664 | int migratetype) | 718 | int migratetype) |
| 665 | { | 719 | { |
| 666 | unsigned int current_order; | 720 | unsigned int current_order; |
| @@ -678,7 +732,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
| 678 | list_del(&page->lru); | 732 | list_del(&page->lru); |
| 679 | rmv_page_order(page); | 733 | rmv_page_order(page); |
| 680 | area->nr_free--; | 734 | area->nr_free--; |
| 681 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | ||
| 682 | expand(zone, page, order, current_order, area, migratetype); | 735 | expand(zone, page, order, current_order, area, migratetype); |
| 683 | return page; | 736 | return page; |
| 684 | } | 737 | } |
| @@ -768,9 +821,20 @@ static int move_freepages_block(struct zone *zone, struct page *page, | |||
| 768 | return move_freepages(zone, start_page, end_page, migratetype); | 821 | return move_freepages(zone, start_page, end_page, migratetype); |
| 769 | } | 822 | } |
| 770 | 823 | ||
| 824 | static void change_pageblock_range(struct page *pageblock_page, | ||
| 825 | int start_order, int migratetype) | ||
| 826 | { | ||
| 827 | int nr_pageblocks = 1 << (start_order - pageblock_order); | ||
| 828 | |||
| 829 | while (nr_pageblocks--) { | ||
| 830 | set_pageblock_migratetype(pageblock_page, migratetype); | ||
| 831 | pageblock_page += pageblock_nr_pages; | ||
| 832 | } | ||
| 833 | } | ||
| 834 | |||
| 771 | /* Remove an element from the buddy allocator from the fallback list */ | 835 | /* Remove an element from the buddy allocator from the fallback list */ |
| 772 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | 836 | static inline struct page * |
| 773 | int start_migratetype) | 837 | __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) |
| 774 | { | 838 | { |
| 775 | struct free_area * area; | 839 | struct free_area * area; |
| 776 | int current_order; | 840 | int current_order; |
| @@ -802,13 +866,15 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
| 802 | * aggressive about taking ownership of free pages | 866 | * aggressive about taking ownership of free pages |
| 803 | */ | 867 | */ |
| 804 | if (unlikely(current_order >= (pageblock_order >> 1)) || | 868 | if (unlikely(current_order >= (pageblock_order >> 1)) || |
| 805 | start_migratetype == MIGRATE_RECLAIMABLE) { | 869 | start_migratetype == MIGRATE_RECLAIMABLE || |
| 870 | page_group_by_mobility_disabled) { | ||
| 806 | unsigned long pages; | 871 | unsigned long pages; |
| 807 | pages = move_freepages_block(zone, page, | 872 | pages = move_freepages_block(zone, page, |
| 808 | start_migratetype); | 873 | start_migratetype); |
| 809 | 874 | ||
| 810 | /* Claim the whole block if over half of it is free */ | 875 | /* Claim the whole block if over half of it is free */ |
| 811 | if (pages >= (1 << (pageblock_order-1))) | 876 | if (pages >= (1 << (pageblock_order-1)) || |
| 877 | page_group_by_mobility_disabled) | ||
| 812 | set_pageblock_migratetype(page, | 878 | set_pageblock_migratetype(page, |
| 813 | start_migratetype); | 879 | start_migratetype); |
| 814 | 880 | ||
| @@ -818,20 +884,22 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order, | |||
| 818 | /* Remove the page from the freelists */ | 884 | /* Remove the page from the freelists */ |
| 819 | list_del(&page->lru); | 885 | list_del(&page->lru); |
| 820 | rmv_page_order(page); | 886 | rmv_page_order(page); |
| 821 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
| 822 | -(1UL << order)); | ||
| 823 | 887 | ||
| 824 | if (current_order == pageblock_order) | 888 | /* Take ownership for orders >= pageblock_order */ |
| 825 | set_pageblock_migratetype(page, | 889 | if (current_order >= pageblock_order) |
| 890 | change_pageblock_range(page, current_order, | ||
| 826 | start_migratetype); | 891 | start_migratetype); |
| 827 | 892 | ||
| 828 | expand(zone, page, order, current_order, area, migratetype); | 893 | expand(zone, page, order, current_order, area, migratetype); |
| 894 | |||
| 895 | trace_mm_page_alloc_extfrag(page, order, current_order, | ||
| 896 | start_migratetype, migratetype); | ||
| 897 | |||
| 829 | return page; | 898 | return page; |
| 830 | } | 899 | } |
| 831 | } | 900 | } |
| 832 | 901 | ||
| 833 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | 902 | return NULL; |
| 834 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
| 835 | } | 903 | } |
| 836 | 904 | ||
| 837 | /* | 905 | /* |
| @@ -843,11 +911,24 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |||
| 843 | { | 911 | { |
| 844 | struct page *page; | 912 | struct page *page; |
| 845 | 913 | ||
| 914 | retry_reserve: | ||
| 846 | page = __rmqueue_smallest(zone, order, migratetype); | 915 | page = __rmqueue_smallest(zone, order, migratetype); |
| 847 | 916 | ||
| 848 | if (unlikely(!page)) | 917 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
| 849 | page = __rmqueue_fallback(zone, order, migratetype); | 918 | page = __rmqueue_fallback(zone, order, migratetype); |
| 850 | 919 | ||
| 920 | /* | ||
| 921 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | ||
| 922 | * is used because __rmqueue_smallest is an inline function | ||
| 923 | * and we want just one call site | ||
| 924 | */ | ||
| 925 | if (!page) { | ||
| 926 | migratetype = MIGRATE_RESERVE; | ||
| 927 | goto retry_reserve; | ||
| 928 | } | ||
| 929 | } | ||
| 930 | |||
| 931 | trace_mm_page_alloc_zone_locked(page, order, migratetype); | ||
| 851 | return page; | 932 | return page; |
| 852 | } | 933 | } |
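
__rmqueue_fallback() no longer falls back to MIGRATE_RESERVE itself; __rmqueue() retries once with MIGRATE_RESERVE via the goto so the inlined __rmqueue_smallest() keeps a single call site. The control flow, reduced to a standalone sketch with stubbed-out helpers (take_smallest/take_fallback and the fake_page type are hypothetical):

#include <stdio.h>

#define RESERVE_TYPE 3			/* stands in for MIGRATE_RESERVE */

struct fake_page { int migratetype; };	/* stand-in for struct page */

static struct fake_page reserve_page = { RESERVE_TYPE };

/* Stubs: pretend every freelist is empty except the reserve blocks. */
static struct fake_page *take_smallest(int order, int migratetype)
{
	(void)order;
	return migratetype == RESERVE_TYPE ? &reserve_page : NULL;
}

static struct fake_page *take_fallback(int order, int migratetype)
{
	(void)order;
	(void)migratetype;
	return NULL;
}

/* Mirrors the retry_reserve flow in __rmqueue() above. */
static struct fake_page *rmqueue_sketch(int order, int migratetype)
{
	struct fake_page *page;

retry_reserve:
	page = take_smallest(order, migratetype);
	if (!page && migratetype != RESERVE_TYPE) {
		page = take_fallback(order, migratetype);
		if (!page) {
			migratetype = RESERVE_TYPE;	/* last resort */
			goto retry_reserve;
		}
	}
	return page;
}

int main(void)
{
	printf("%s\n", rmqueue_sketch(0, 0) ? "got a reserve page" : "failed");
	return 0;
}
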
| 853 | 934 | ||
| @@ -858,7 +939,7 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |||
| 858 | */ | 939 | */ |
| 859 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 940 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| 860 | unsigned long count, struct list_head *list, | 941 | unsigned long count, struct list_head *list, |
| 861 | int migratetype) | 942 | int migratetype, int cold) |
| 862 | { | 943 | { |
| 863 | int i; | 944 | int i; |
| 864 | 945 | ||
| @@ -877,10 +958,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
| 877 | * merge IO requests if the physical pages are ordered | 958 | * merge IO requests if the physical pages are ordered |
| 878 | * properly. | 959 | * properly. |
| 879 | */ | 960 | */ |
| 880 | list_add(&page->lru, list); | 961 | if (likely(cold == 0)) |
| 962 | list_add(&page->lru, list); | ||
| 963 | else | ||
| 964 | list_add_tail(&page->lru, list); | ||
| 881 | set_page_private(page, migratetype); | 965 | set_page_private(page, migratetype); |
| 882 | list = &page->lru; | 966 | list = &page->lru; |
| 883 | } | 967 | } |
| 968 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); | ||
| 884 | spin_unlock(&zone->lock); | 969 | spin_unlock(&zone->lock); |
| 885 | return i; | 970 | return i; |
| 886 | } | 971 | } |
| @@ -904,7 +989,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
| 904 | to_drain = pcp->batch; | 989 | to_drain = pcp->batch; |
| 905 | else | 990 | else |
| 906 | to_drain = pcp->count; | 991 | to_drain = pcp->count; |
| 907 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | 992 | free_pcppages_bulk(zone, to_drain, pcp); |
| 908 | pcp->count -= to_drain; | 993 | pcp->count -= to_drain; |
| 909 | local_irq_restore(flags); | 994 | local_irq_restore(flags); |
| 910 | } | 995 | } |
| @@ -930,7 +1015,7 @@ static void drain_pages(unsigned int cpu) | |||
| 930 | 1015 | ||
| 931 | pcp = &pset->pcp; | 1016 | pcp = &pset->pcp; |
| 932 | local_irq_save(flags); | 1017 | local_irq_save(flags); |
| 933 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); | 1018 | free_pcppages_bulk(zone, pcp->count, pcp); |
| 934 | pcp->count = 0; | 1019 | pcp->count = 0; |
| 935 | local_irq_restore(flags); | 1020 | local_irq_restore(flags); |
| 936 | } | 1021 | } |
| @@ -996,6 +1081,10 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 996 | struct zone *zone = page_zone(page); | 1081 | struct zone *zone = page_zone(page); |
| 997 | struct per_cpu_pages *pcp; | 1082 | struct per_cpu_pages *pcp; |
| 998 | unsigned long flags; | 1083 | unsigned long flags; |
| 1084 | int migratetype; | ||
| 1085 | int wasMlocked = __TestClearPageMlocked(page); | ||
| 1086 | |||
| 1087 | kmemcheck_free_shadow(page, 0); | ||
| 999 | 1088 | ||
| 1000 | if (PageAnon(page)) | 1089 | if (PageAnon(page)) |
| 1001 | page->mapping = NULL; | 1090 | page->mapping = NULL; |
| @@ -1010,32 +1099,49 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
| 1010 | kernel_map_pages(page, 1, 0); | 1099 | kernel_map_pages(page, 1, 0); |
| 1011 | 1100 | ||
| 1012 | pcp = &zone_pcp(zone, get_cpu())->pcp; | 1101 | pcp = &zone_pcp(zone, get_cpu())->pcp; |
| 1102 | migratetype = get_pageblock_migratetype(page); | ||
| 1103 | set_page_private(page, migratetype); | ||
| 1013 | local_irq_save(flags); | 1104 | local_irq_save(flags); |
| 1105 | if (unlikely(wasMlocked)) | ||
| 1106 | free_page_mlock(page); | ||
| 1014 | __count_vm_event(PGFREE); | 1107 | __count_vm_event(PGFREE); |
| 1108 | |||
| 1109 | /* | ||
| 1110 | * We only track unmovable, reclaimable and movable on pcp lists. | ||
| 1111 | * Free ISOLATE pages back to the allocator because they are being | ||
| 1112 | * offlined but treat RESERVE as movable pages so we can get those | ||
| 1113 | * areas back if necessary. Otherwise, we may have to free | ||
| 1114 | * excessively into the page allocator | ||
| 1115 | */ | ||
| 1116 | if (migratetype >= MIGRATE_PCPTYPES) { | ||
| 1117 | if (unlikely(migratetype == MIGRATE_ISOLATE)) { | ||
| 1118 | free_one_page(zone, page, 0, migratetype); | ||
| 1119 | goto out; | ||
| 1120 | } | ||
| 1121 | migratetype = MIGRATE_MOVABLE; | ||
| 1122 | } | ||
| 1123 | |||
| 1015 | if (cold) | 1124 | if (cold) |
| 1016 | list_add_tail(&page->lru, &pcp->list); | 1125 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
| 1017 | else | 1126 | else |
| 1018 | list_add(&page->lru, &pcp->list); | 1127 | list_add(&page->lru, &pcp->lists[migratetype]); |
| 1019 | set_page_private(page, get_pageblock_migratetype(page)); | ||
| 1020 | pcp->count++; | 1128 | pcp->count++; |
| 1021 | if (pcp->count >= pcp->high) { | 1129 | if (pcp->count >= pcp->high) { |
| 1022 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1130 | free_pcppages_bulk(zone, pcp->batch, pcp); |
| 1023 | pcp->count -= pcp->batch; | 1131 | pcp->count -= pcp->batch; |
| 1024 | } | 1132 | } |
| 1133 | |||
| 1134 | out: | ||
| 1025 | local_irq_restore(flags); | 1135 | local_irq_restore(flags); |
| 1026 | put_cpu(); | 1136 | put_cpu(); |
| 1027 | } | 1137 | } |
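
The comment above describes the new routing of freed pages: only the three pcp migratetypes stay on the per-cpu lists, MIGRATE_ISOLATE pages go straight back to the buddy allocator, and MIGRATE_RESERVE is folded into movable. That decision, written out as a standalone sketch (the constant values are illustrative and only mirror the ordering the patch relies on):

#include <stdio.h>

enum { UNMOVABLE, RECLAIMABLE, MOVABLE, PCPTYPES, RESERVE = PCPTYPES, ISOLATE };

/* Which pcp list a freed page goes to, or -1 to free straight to the buddy. */
static int pcp_list_for(int migratetype)
{
	if (migratetype >= PCPTYPES) {
		if (migratetype == ISOLATE)
			return -1;	/* being offlined: bypass the pcp lists */
		return MOVABLE;		/* RESERVE is treated as movable */
	}
	return migratetype;
}

int main(void)
{
	printf("RESERVE -> %d, ISOLATE -> %d, RECLAIMABLE -> %d\n",
	       pcp_list_for(RESERVE), pcp_list_for(ISOLATE),
	       pcp_list_for(RECLAIMABLE));
	return 0;
}
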
| 1028 | 1138 | ||
| 1029 | void free_hot_page(struct page *page) | 1139 | void free_hot_page(struct page *page) |
| 1030 | { | 1140 | { |
| 1141 | trace_mm_page_free_direct(page, 0); | ||
| 1031 | free_hot_cold_page(page, 0); | 1142 | free_hot_cold_page(page, 0); |
| 1032 | } | 1143 | } |
| 1033 | 1144 | ||
| 1034 | void free_cold_page(struct page *page) | ||
| 1035 | { | ||
| 1036 | free_hot_cold_page(page, 1); | ||
| 1037 | } | ||
| 1038 | |||
| 1039 | /* | 1145 | /* |
| 1040 | * split_page takes a non-compound higher-order page, and splits it into | 1146 | * split_page takes a non-compound higher-order page, and splits it into |
| 1041 | * n (1<<order) sub-pages: page[0..n] | 1147 | * n (1<<order) sub-pages: page[0..n] |
| @@ -1050,6 +1156,16 @@ void split_page(struct page *page, unsigned int order) | |||
| 1050 | 1156 | ||
| 1051 | VM_BUG_ON(PageCompound(page)); | 1157 | VM_BUG_ON(PageCompound(page)); |
| 1052 | VM_BUG_ON(!page_count(page)); | 1158 | VM_BUG_ON(!page_count(page)); |
| 1159 | |||
| 1160 | #ifdef CONFIG_KMEMCHECK | ||
| 1161 | /* | ||
| 1162 | * Split shadow pages too, because free(page[0]) would | ||
| 1163 | * otherwise free the whole shadow. | ||
| 1164 | */ | ||
| 1165 | if (kmemcheck_page_is_tracked(page)) | ||
| 1166 | split_page(virt_to_page(page[0].shadow), order); | ||
| 1167 | #endif | ||
| 1168 | |||
| 1053 | for (i = 1; i < (1 << order); i++) | 1169 | for (i = 1; i < (1 << order); i++) |
| 1054 | set_page_refcounted(page + i); | 1170 | set_page_refcounted(page + i); |
| 1055 | } | 1171 | } |
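
split_page() turns one order-N allocation into 1 << N independently refcounted order-0 pages; the new kmemcheck hunk splits the shadow pages alongside so that freeing page[0] does not free the whole shadow. Typical usage, as a kernel-style sketch (split_example is a hypothetical caller):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/* Allocate an order-2 block, split it into four independently refcounted
 * order-0 pages, then free them one by one. */
static int split_example(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);
	int i;

	if (!page)
		return -ENOMEM;

	split_page(page, 2);
	for (i = 0; i < 4; i++)
		__free_page(page + i);
	return 0;
}
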
| @@ -1059,52 +1175,57 @@ void split_page(struct page *page, unsigned int order) | |||
| 1059 | * we cheat by calling it from here, in the order > 0 path. Saves a branch | 1175 | * we cheat by calling it from here, in the order > 0 path. Saves a branch |
| 1060 | * or two. | 1176 | * or two. |
| 1061 | */ | 1177 | */ |
| 1062 | static struct page *buffered_rmqueue(struct zone *preferred_zone, | 1178 | static inline |
| 1063 | struct zone *zone, int order, gfp_t gfp_flags) | 1179 | struct page *buffered_rmqueue(struct zone *preferred_zone, |
| 1180 | struct zone *zone, int order, gfp_t gfp_flags, | ||
| 1181 | int migratetype) | ||
| 1064 | { | 1182 | { |
| 1065 | unsigned long flags; | 1183 | unsigned long flags; |
| 1066 | struct page *page; | 1184 | struct page *page; |
| 1067 | int cold = !!(gfp_flags & __GFP_COLD); | 1185 | int cold = !!(gfp_flags & __GFP_COLD); |
| 1068 | int cpu; | 1186 | int cpu; |
| 1069 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
| 1070 | 1187 | ||
| 1071 | again: | 1188 | again: |
| 1072 | cpu = get_cpu(); | 1189 | cpu = get_cpu(); |
| 1073 | if (likely(order == 0)) { | 1190 | if (likely(order == 0)) { |
| 1074 | struct per_cpu_pages *pcp; | 1191 | struct per_cpu_pages *pcp; |
| 1192 | struct list_head *list; | ||
| 1075 | 1193 | ||
| 1076 | pcp = &zone_pcp(zone, cpu)->pcp; | 1194 | pcp = &zone_pcp(zone, cpu)->pcp; |
| 1195 | list = &pcp->lists[migratetype]; | ||
| 1077 | local_irq_save(flags); | 1196 | local_irq_save(flags); |
| 1078 | if (!pcp->count) { | 1197 | if (list_empty(list)) { |
| 1079 | pcp->count = rmqueue_bulk(zone, 0, | 1198 | pcp->count += rmqueue_bulk(zone, 0, |
| 1080 | pcp->batch, &pcp->list, migratetype); | 1199 | pcp->batch, list, |
| 1081 | if (unlikely(!pcp->count)) | 1200 | migratetype, cold); |
| 1201 | if (unlikely(list_empty(list))) | ||
| 1082 | goto failed; | 1202 | goto failed; |
| 1083 | } | 1203 | } |
| 1084 | 1204 | ||
| 1085 | /* Find a page of the appropriate migrate type */ | 1205 | if (cold) |
| 1086 | if (cold) { | 1206 | page = list_entry(list->prev, struct page, lru); |
| 1087 | list_for_each_entry_reverse(page, &pcp->list, lru) | 1207 | else |
| 1088 | if (page_private(page) == migratetype) | 1208 | page = list_entry(list->next, struct page, lru); |
| 1089 | break; | ||
| 1090 | } else { | ||
| 1091 | list_for_each_entry(page, &pcp->list, lru) | ||
| 1092 | if (page_private(page) == migratetype) | ||
| 1093 | break; | ||
| 1094 | } | ||
| 1095 | |||
| 1096 | /* Allocate more to the pcp list if necessary */ | ||
| 1097 | if (unlikely(&page->lru == &pcp->list)) { | ||
| 1098 | pcp->count += rmqueue_bulk(zone, 0, | ||
| 1099 | pcp->batch, &pcp->list, migratetype); | ||
| 1100 | page = list_entry(pcp->list.next, struct page, lru); | ||
| 1101 | } | ||
| 1102 | 1209 | ||
| 1103 | list_del(&page->lru); | 1210 | list_del(&page->lru); |
| 1104 | pcp->count--; | 1211 | pcp->count--; |
| 1105 | } else { | 1212 | } else { |
| 1213 | if (unlikely(gfp_flags & __GFP_NOFAIL)) { | ||
| 1214 | /* | ||
| 1215 | * __GFP_NOFAIL is not to be used in new code. | ||
| 1216 | * | ||
| 1217 | * All __GFP_NOFAIL callers should be fixed so that they | ||
| 1218 | * properly detect and handle allocation failures. | ||
| 1219 | * | ||
| 1220 | * We most definitely don't want callers attempting to | ||
| 1221 | * allocate greater than order-1 page units with | ||
| 1222 | * __GFP_NOFAIL. | ||
| 1223 | */ | ||
| 1224 | WARN_ON_ONCE(order > 1); | ||
| 1225 | } | ||
| 1106 | spin_lock_irqsave(&zone->lock, flags); | 1226 | spin_lock_irqsave(&zone->lock, flags); |
| 1107 | page = __rmqueue(zone, order, migratetype); | 1227 | page = __rmqueue(zone, order, migratetype); |
| 1228 | __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); | ||
| 1108 | spin_unlock(&zone->lock); | 1229 | spin_unlock(&zone->lock); |
| 1109 | if (!page) | 1230 | if (!page) |
| 1110 | goto failed; | 1231 | goto failed; |
| @@ -1126,10 +1247,15 @@ failed: | |||
| 1126 | return NULL; | 1247 | return NULL; |
| 1127 | } | 1248 | } |
| 1128 | 1249 | ||
| 1129 | #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ | 1250 | /* The ALLOC_WMARK bits are used as an index to zone->watermark */ |
| 1130 | #define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ | 1251 | #define ALLOC_WMARK_MIN WMARK_MIN |
| 1131 | #define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ | 1252 | #define ALLOC_WMARK_LOW WMARK_LOW |
| 1132 | #define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ | 1253 | #define ALLOC_WMARK_HIGH WMARK_HIGH |
| 1254 | #define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */ | ||
| 1255 | |||
| 1256 | /* Mask to get the watermark bits */ | ||
| 1257 | #define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1) | ||
| 1258 | |||
| 1133 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ | 1259 | #define ALLOC_HARDER 0x10 /* try to alloc harder */ |
| 1134 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ | 1260 | #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ |
| 1135 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ | 1261 | #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ |
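
The watermark flags become an index into the new zone->watermark[] array: ALLOC_WMARK_MIN/LOW/HIGH equal WMARK_MIN/LOW/HIGH, and ALLOC_WMARK_MASK extracts them, so the old if/else chain over pages_min/pages_low/pages_high collapses into one array lookup. A standalone illustration with made-up watermark values:

#include <stdio.h>

enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

#define ALLOC_WMARK_MIN		WMARK_MIN
#define ALLOC_WMARK_LOW		WMARK_LOW
#define ALLOC_WMARK_HIGH	WMARK_HIGH
#define ALLOC_NO_WATERMARKS	0x04
#define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS - 1)
#define ALLOC_HARDER		0x10

int main(void)
{
	unsigned long watermark[NR_WMARK] = { 128, 160, 192 };	/* made-up values */
	int alloc_flags = ALLOC_WMARK_LOW | ALLOC_HARDER;

	/* One array lookup replaces the old if/else chain. */
	printf("mark = %lu\n", watermark[alloc_flags & ALLOC_WMARK_MASK]);
	return 0;
}
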
| @@ -1387,23 +1513,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) | |||
| 1387 | */ | 1513 | */ |
| 1388 | static struct page * | 1514 | static struct page * |
| 1389 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, | 1515 | get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, |
| 1390 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) | 1516 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags, |
| 1517 | struct zone *preferred_zone, int migratetype) | ||
| 1391 | { | 1518 | { |
| 1392 | struct zoneref *z; | 1519 | struct zoneref *z; |
| 1393 | struct page *page = NULL; | 1520 | struct page *page = NULL; |
| 1394 | int classzone_idx; | 1521 | int classzone_idx; |
| 1395 | struct zone *zone, *preferred_zone; | 1522 | struct zone *zone; |
| 1396 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1523 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
| 1397 | int zlc_active = 0; /* set if using zonelist_cache */ | 1524 | int zlc_active = 0; /* set if using zonelist_cache */ |
| 1398 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1525 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
| 1399 | 1526 | ||
| 1400 | (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask, | ||
| 1401 | &preferred_zone); | ||
| 1402 | if (!preferred_zone) | ||
| 1403 | return NULL; | ||
| 1404 | |||
| 1405 | classzone_idx = zone_idx(preferred_zone); | 1527 | classzone_idx = zone_idx(preferred_zone); |
| 1406 | |||
| 1407 | zonelist_scan: | 1528 | zonelist_scan: |
| 1408 | /* | 1529 | /* |
| 1409 | * Scan zonelist, looking for a zone with enough free. | 1530 | * Scan zonelist, looking for a zone with enough free. |
| @@ -1418,31 +1539,49 @@ zonelist_scan: | |||
| 1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1539 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
| 1419 | goto try_next_zone; | 1540 | goto try_next_zone; |
| 1420 | 1541 | ||
| 1542 | BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); | ||
| 1421 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { | 1543 | if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { |
| 1422 | unsigned long mark; | 1544 | unsigned long mark; |
| 1423 | if (alloc_flags & ALLOC_WMARK_MIN) | 1545 | int ret; |
| 1424 | mark = zone->pages_min; | 1546 | |
| 1425 | else if (alloc_flags & ALLOC_WMARK_LOW) | 1547 | mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; |
| 1426 | mark = zone->pages_low; | 1548 | if (zone_watermark_ok(zone, order, mark, |
| 1427 | else | 1549 | classzone_idx, alloc_flags)) |
| 1428 | mark = zone->pages_high; | 1550 | goto try_this_zone; |
| 1429 | if (!zone_watermark_ok(zone, order, mark, | 1551 | |
| 1430 | classzone_idx, alloc_flags)) { | 1552 | if (zone_reclaim_mode == 0) |
| 1431 | if (!zone_reclaim_mode || | 1553 | goto this_zone_full; |
| 1432 | !zone_reclaim(zone, gfp_mask, order)) | 1554 | |
| 1555 | ret = zone_reclaim(zone, gfp_mask, order); | ||
| 1556 | switch (ret) { | ||
| 1557 | case ZONE_RECLAIM_NOSCAN: | ||
| 1558 | /* did not scan */ | ||
| 1559 | goto try_next_zone; | ||
| 1560 | case ZONE_RECLAIM_FULL: | ||
| 1561 | /* scanned but unreclaimable */ | ||
| 1562 | goto this_zone_full; | ||
| 1563 | default: | ||
| 1564 | /* did we reclaim enough */ | ||
| 1565 | if (!zone_watermark_ok(zone, order, mark, | ||
| 1566 | classzone_idx, alloc_flags)) | ||
| 1433 | goto this_zone_full; | 1567 | goto this_zone_full; |
| 1434 | } | 1568 | } |
| 1435 | } | 1569 | } |
| 1436 | 1570 | ||
| 1437 | page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); | 1571 | try_this_zone: |
| 1572 | page = buffered_rmqueue(preferred_zone, zone, order, | ||
| 1573 | gfp_mask, migratetype); | ||
| 1438 | if (page) | 1574 | if (page) |
| 1439 | break; | 1575 | break; |
| 1440 | this_zone_full: | 1576 | this_zone_full: |
| 1441 | if (NUMA_BUILD) | 1577 | if (NUMA_BUILD) |
| 1442 | zlc_mark_zone_full(zonelist, z); | 1578 | zlc_mark_zone_full(zonelist, z); |
| 1443 | try_next_zone: | 1579 | try_next_zone: |
| 1444 | if (NUMA_BUILD && !did_zlc_setup) { | 1580 | if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { |
| 1445 | /* we do zlc_setup after the first zone is tried */ | 1581 | /* |
| 1582 | * we do zlc_setup after the first zone is tried but only | ||
| 1583 | * if there are multiple nodes to make it worthwhile | ||
| 1584 | */ | ||
| 1446 | allowednodes = zlc_setup(zonelist, alloc_flags); | 1585 | allowednodes = zlc_setup(zonelist, alloc_flags); |
| 1447 | zlc_active = 1; | 1586 | zlc_active = 1; |
| 1448 | did_zlc_setup = 1; | 1587 | did_zlc_setup = 1; |
| @@ -1457,47 +1596,215 @@ try_next_zone: | |||
| 1457 | return page; | 1596 | return page; |
| 1458 | } | 1597 | } |
| 1459 | 1598 | ||
| 1599 | static inline int | ||
| 1600 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, | ||
| 1601 | unsigned long pages_reclaimed) | ||
| 1602 | { | ||
| 1603 | /* Do not loop if specifically requested */ | ||
| 1604 | if (gfp_mask & __GFP_NORETRY) | ||
| 1605 | return 0; | ||
| 1606 | |||
| 1607 | /* | ||
| 1608 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
| 1609 | * means __GFP_NOFAIL, but that may not be true in other | ||
| 1610 | * implementations. | ||
| 1611 | */ | ||
| 1612 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | ||
| 1613 | return 1; | ||
| 1614 | |||
| 1615 | /* | ||
| 1616 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
| 1617 | * specified, then we retry until we no longer reclaim any pages | ||
| 1618 | * (above), or we've reclaimed an order of pages at least as | ||
| 1619 | * large as the allocation's order. In both cases, if the | ||
| 1620 | * allocation still fails, we stop retrying. | ||
| 1621 | */ | ||
| 1622 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | ||
| 1623 | return 1; | ||
| 1624 | |||
| 1625 | /* | ||
| 1626 | * Don't let big-order allocations loop unless the caller | ||
| 1627 | * explicitly requests that. | ||
| 1628 | */ | ||
| 1629 | if (gfp_mask & __GFP_NOFAIL) | ||
| 1630 | return 1; | ||
| 1631 | |||
| 1632 | return 0; | ||
| 1633 | } | ||
| 1634 | |||
| 1635 | static inline struct page * | ||
| 1636 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | ||
| 1637 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1638 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 1639 | int migratetype) | ||
| 1640 | { | ||
| 1641 | struct page *page; | ||
| 1642 | |||
| 1643 | /* Acquire the OOM killer lock for the zones in zonelist */ | ||
| 1644 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
| 1645 | schedule_timeout_uninterruptible(1); | ||
| 1646 | return NULL; | ||
| 1647 | } | ||
| 1648 | |||
| 1649 | /* | ||
| 1650 | * Go through the zonelist yet one more time, keep very high watermark | ||
| 1651 | * here, this is only to catch a parallel oom killing, we must fail if | ||
| 1652 | * we're still under heavy pressure. | ||
| 1653 | */ | ||
| 1654 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
| 1655 | order, zonelist, high_zoneidx, | ||
| 1656 | ALLOC_WMARK_HIGH|ALLOC_CPUSET, | ||
| 1657 | preferred_zone, migratetype); | ||
| 1658 | if (page) | ||
| 1659 | goto out; | ||
| 1660 | |||
| 1661 | /* The OOM killer will not help higher order allocs */ | ||
| 1662 | if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) | ||
| 1663 | goto out; | ||
| 1664 | |||
| 1665 | /* Exhausted what can be done so it's blamo time */ | ||
| 1666 | out_of_memory(zonelist, gfp_mask, order); | ||
| 1667 | |||
| 1668 | out: | ||
| 1669 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1670 | return page; | ||
| 1671 | } | ||
| 1672 | |||
| 1673 | /* The really slow allocator path where we enter direct reclaim */ | ||
| 1674 | static inline struct page * | ||
| 1675 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
| 1676 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1677 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | ||
| 1678 | int migratetype, unsigned long *did_some_progress) | ||
| 1679 | { | ||
| 1680 | struct page *page = NULL; | ||
| 1681 | struct reclaim_state reclaim_state; | ||
| 1682 | struct task_struct *p = current; | ||
| 1683 | |||
| 1684 | cond_resched(); | ||
| 1685 | |||
| 1686 | /* We now go into synchronous reclaim */ | ||
| 1687 | cpuset_memory_pressure_bump(); | ||
| 1688 | p->flags |= PF_MEMALLOC; | ||
| 1689 | lockdep_set_current_reclaim_state(gfp_mask); | ||
| 1690 | reclaim_state.reclaimed_slab = 0; | ||
| 1691 | p->reclaim_state = &reclaim_state; | ||
| 1692 | |||
| 1693 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | ||
| 1694 | |||
| 1695 | p->reclaim_state = NULL; | ||
| 1696 | lockdep_clear_current_reclaim_state(); | ||
| 1697 | p->flags &= ~PF_MEMALLOC; | ||
| 1698 | |||
| 1699 | cond_resched(); | ||
| 1700 | |||
| 1701 | if (order != 0) | ||
| 1702 | drain_all_pages(); | ||
| 1703 | |||
| 1704 | if (likely(*did_some_progress)) | ||
| 1705 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
| 1706 | zonelist, high_zoneidx, | ||
| 1707 | alloc_flags, preferred_zone, | ||
| 1708 | migratetype); | ||
| 1709 | return page; | ||
| 1710 | } | ||
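
__alloc_pages_direct_reclaim() brackets try_to_free_pages() with PF_MEMALLOC so the reclaim path may dip into reserves and is never re-entered from itself (the slowpath bails out if the flag is already set). The guard on its own, as a reduced kernel-style sketch (reclaim_guard_example is hypothetical; the reclaim_state and lockdep bookkeeping shown above is omitted):

#include <linux/sched.h>

static void reclaim_guard_example(void)
{
	struct task_struct *p = current;

	p->flags |= PF_MEMALLOC;
	/* ...synchronous reclaim runs here, e.g. try_to_free_pages()... */
	p->flags &= ~PF_MEMALLOC;
}
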
| 1711 | |||
| 1460 | /* | 1712 | /* |
| 1461 | * This is the 'heart' of the zoned buddy allocator. | 1713 | * This is called in the allocator slow-path if the allocation request is of |
| 1714 | * sufficient urgency to ignore watermarks and take other desperate measures | ||
| 1462 | */ | 1715 | */ |
| 1463 | struct page * | 1716 | static inline struct page * |
| 1464 | __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, | 1717 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, |
| 1465 | struct zonelist *zonelist, nodemask_t *nodemask) | 1718 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
| 1719 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 1720 | int migratetype) | ||
| 1721 | { | ||
| 1722 | struct page *page; | ||
| 1723 | |||
| 1724 | do { | ||
| 1725 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
| 1726 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, | ||
| 1727 | preferred_zone, migratetype); | ||
| 1728 | |||
| 1729 | if (!page && gfp_mask & __GFP_NOFAIL) | ||
| 1730 | congestion_wait(BLK_RW_ASYNC, HZ/50); | ||
| 1731 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | ||
| 1732 | |||
| 1733 | return page; | ||
| 1734 | } | ||
| 1735 | |||
| 1736 | static inline | ||
| 1737 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | ||
| 1738 | enum zone_type high_zoneidx) | ||
| 1466 | { | 1739 | { |
| 1467 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
| 1468 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 1469 | struct zoneref *z; | 1740 | struct zoneref *z; |
| 1470 | struct zone *zone; | 1741 | struct zone *zone; |
| 1471 | struct page *page; | ||
| 1472 | struct reclaim_state reclaim_state; | ||
| 1473 | struct task_struct *p = current; | ||
| 1474 | int do_retry; | ||
| 1475 | int alloc_flags; | ||
| 1476 | unsigned long did_some_progress; | ||
| 1477 | unsigned long pages_reclaimed = 0; | ||
| 1478 | 1742 | ||
| 1479 | lockdep_trace_alloc(gfp_mask); | 1743 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) |
| 1744 | wakeup_kswapd(zone, order); | ||
| 1745 | } | ||
| 1480 | 1746 | ||
| 1481 | might_sleep_if(wait); | 1747 | static inline int |
| 1748 | gfp_to_alloc_flags(gfp_t gfp_mask) | ||
| 1749 | { | ||
| 1750 | struct task_struct *p = current; | ||
| 1751 | int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; | ||
| 1752 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
| 1482 | 1753 | ||
| 1483 | if (should_fail_alloc_page(gfp_mask, order)) | 1754 | /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ |
| 1484 | return NULL; | 1755 | BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); |
| 1485 | 1756 | ||
| 1486 | restart: | 1757 | /* |
| 1487 | z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ | 1758 | * The caller may dip into page reserves a bit more if the caller |
| 1759 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
| 1760 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
| 1761 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
| 1762 | */ | ||
| 1763 | alloc_flags |= (gfp_mask & __GFP_HIGH); | ||
| 1488 | 1764 | ||
| 1489 | if (unlikely(!z->zone)) { | 1765 | if (!wait) { |
| 1766 | alloc_flags |= ALLOC_HARDER; | ||
| 1490 | /* | 1767 | /* |
| 1491 | * Happens if we have an empty zonelist as a result of | 1768 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
| 1492 | * GFP_THISNODE being used on a memoryless node | 1769 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
| 1493 | */ | 1770 | */ |
| 1494 | return NULL; | 1771 | alloc_flags &= ~ALLOC_CPUSET; |
| 1772 | } else if (unlikely(rt_task(p)) && !in_interrupt()) | ||
| 1773 | alloc_flags |= ALLOC_HARDER; | ||
| 1774 | |||
| 1775 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | ||
| 1776 | if (!in_interrupt() && | ||
| 1777 | ((p->flags & PF_MEMALLOC) || | ||
| 1778 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
| 1779 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
| 1495 | } | 1780 | } |
| 1496 | 1781 | ||
| 1497 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 1782 | return alloc_flags; |
| 1498 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1783 | } |
| 1499 | if (page) | 1784 | |
| 1500 | goto got_pg; | 1785 | static inline struct page * |
| 1786 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | ||
| 1787 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
| 1788 | nodemask_t *nodemask, struct zone *preferred_zone, | ||
| 1789 | int migratetype) | ||
| 1790 | { | ||
| 1791 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
| 1792 | struct page *page = NULL; | ||
| 1793 | int alloc_flags; | ||
| 1794 | unsigned long pages_reclaimed = 0; | ||
| 1795 | unsigned long did_some_progress; | ||
| 1796 | struct task_struct *p = current; | ||
| 1797 | |||
| 1798 | /* | ||
| 1799 | * In the slowpath, we sanity check order to avoid ever trying to | ||
| 1800 | * reclaim >= MAX_ORDER areas which will never succeed. Callers may | ||
| 1801 | * be using allocators in order of preference for an area that is | ||
| 1802 | * too large. | ||
| 1803 | */ | ||
| 1804 | if (order >= MAX_ORDER) { | ||
| 1805 | WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); | ||
| 1806 | return NULL; | ||
| 1807 | } | ||
| 1501 | 1808 | ||
| 1502 | /* | 1809 | /* |
| 1503 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1810 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
| @@ -1510,155 +1817,88 @@ restart: | |||
| 1510 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1817 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
| 1511 | goto nopage; | 1818 | goto nopage; |
| 1512 | 1819 | ||
| 1513 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1820 | restart: |
| 1514 | wakeup_kswapd(zone, order); | 1821 | wake_all_kswapd(order, zonelist, high_zoneidx); |
| 1515 | 1822 | ||
| 1516 | /* | 1823 | /* |
| 1517 | * OK, we're below the kswapd watermark and have kicked background | 1824 | * OK, we're below the kswapd watermark and have kicked background |
| 1518 | * reclaim. Now things get more complex, so set up alloc_flags according | 1825 | * reclaim. Now things get more complex, so set up alloc_flags according |
| 1519 | * to how we want to proceed. | 1826 | * to how we want to proceed. |
| 1520 | * | ||
| 1521 | * The caller may dip into page reserves a bit more if the caller | ||
| 1522 | * cannot run direct reclaim, or if the caller has realtime scheduling | ||
| 1523 | * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will | ||
| 1524 | * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). | ||
| 1525 | */ | 1827 | */ |
| 1526 | alloc_flags = ALLOC_WMARK_MIN; | 1828 | alloc_flags = gfp_to_alloc_flags(gfp_mask); |
| 1527 | if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) | ||
| 1528 | alloc_flags |= ALLOC_HARDER; | ||
| 1529 | if (gfp_mask & __GFP_HIGH) | ||
| 1530 | alloc_flags |= ALLOC_HIGH; | ||
| 1531 | if (wait) | ||
| 1532 | alloc_flags |= ALLOC_CPUSET; | ||
| 1533 | 1829 | ||
| 1534 | /* | 1830 | /* This is the last chance, in general, before the goto nopage. */ |
| 1535 | * Go through the zonelist again. Let __GFP_HIGH and allocations | ||
| 1536 | * coming from realtime tasks go deeper into reserves. | ||
| 1537 | * | ||
| 1538 | * This is the last chance, in general, before the goto nopage. | ||
| 1539 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | ||
| 1540 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | ||
| 1541 | */ | ||
| 1542 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, | 1831 | page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, |
| 1543 | high_zoneidx, alloc_flags); | 1832 | high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, |
| 1833 | preferred_zone, migratetype); | ||
| 1544 | if (page) | 1834 | if (page) |
| 1545 | goto got_pg; | 1835 | goto got_pg; |
| 1546 | 1836 | ||
| 1547 | /* This allocation should allow future memory freeing. */ | ||
| 1548 | |||
| 1549 | rebalance: | 1837 | rebalance: |
| 1550 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1838 | /* Allocate without watermarks if the context allows */ |
| 1551 | && !in_interrupt()) { | 1839 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
| 1552 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1840 | page = __alloc_pages_high_priority(gfp_mask, order, |
| 1553 | nofail_alloc: | 1841 | zonelist, high_zoneidx, nodemask, |
| 1554 | /* go through the zonelist yet again, ignoring mins */ | 1842 | preferred_zone, migratetype); |
| 1555 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 1843 | if (page) |
| 1556 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | 1844 | goto got_pg; |
| 1557 | if (page) | ||
| 1558 | goto got_pg; | ||
| 1559 | if (gfp_mask & __GFP_NOFAIL) { | ||
| 1560 | congestion_wait(WRITE, HZ/50); | ||
| 1561 | goto nofail_alloc; | ||
| 1562 | } | ||
| 1563 | } | ||
| 1564 | goto nopage; | ||
| 1565 | } | 1845 | } |
| 1566 | 1846 | ||
| 1567 | /* Atomic allocations - we can't balance anything */ | 1847 | /* Atomic allocations - we can't balance anything */ |
| 1568 | if (!wait) | 1848 | if (!wait) |
| 1569 | goto nopage; | 1849 | goto nopage; |
| 1570 | 1850 | ||
| 1571 | cond_resched(); | 1851 | /* Avoid recursion of direct reclaim */ |
| 1572 | 1852 | if (p->flags & PF_MEMALLOC) | |
| 1573 | /* We now go into synchronous reclaim */ | 1853 | goto nopage; |
| 1574 | cpuset_memory_pressure_bump(); | ||
| 1575 | /* | ||
| 1576 | * The task's cpuset might have expanded its set of allowable nodes | ||
| 1577 | */ | ||
| 1578 | cpuset_update_task_memory_state(); | ||
| 1579 | p->flags |= PF_MEMALLOC; | ||
| 1580 | |||
| 1581 | lockdep_set_current_reclaim_state(gfp_mask); | ||
| 1582 | reclaim_state.reclaimed_slab = 0; | ||
| 1583 | p->reclaim_state = &reclaim_state; | ||
| 1584 | 1854 | ||
| 1585 | did_some_progress = try_to_free_pages(zonelist, order, | 1855 | /* Avoid allocations with no watermarks from looping endlessly */ |
| 1586 | gfp_mask, nodemask); | 1856 | if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) |
| 1857 | goto nopage; | ||
| 1587 | 1858 | ||
| 1588 | p->reclaim_state = NULL; | 1859 | /* Try direct reclaim and then allocating */ |
| 1589 | lockdep_clear_current_reclaim_state(); | 1860 | page = __alloc_pages_direct_reclaim(gfp_mask, order, |
| 1590 | p->flags &= ~PF_MEMALLOC; | 1861 | zonelist, high_zoneidx, |
| 1862 | nodemask, | ||
| 1863 | alloc_flags, preferred_zone, | ||
| 1864 | migratetype, &did_some_progress); | ||
| 1865 | if (page) | ||
| 1866 | goto got_pg; | ||
| 1591 | 1867 | ||
| 1592 | cond_resched(); | 1868 | /* |
| 1869 | * If we failed to make any progress reclaiming, then we are | ||
| 1870 | * running out of options and have to consider going OOM | ||
| 1871 | */ | ||
| 1872 | if (!did_some_progress) { | ||
| 1873 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
| 1874 | if (oom_killer_disabled) | ||
| 1875 | goto nopage; | ||
| 1876 | page = __alloc_pages_may_oom(gfp_mask, order, | ||
| 1877 | zonelist, high_zoneidx, | ||
| 1878 | nodemask, preferred_zone, | ||
| 1879 | migratetype); | ||
| 1880 | if (page) | ||
| 1881 | goto got_pg; | ||
| 1593 | 1882 | ||
| 1594 | if (order != 0) | 1883 | /* |
| 1595 | drain_all_pages(); | 1884 | * The OOM killer does not trigger for high-order |
| 1885 | * ~__GFP_NOFAIL allocations so if no progress is being | ||
| 1886 | * made, there are no other options and retrying is | ||
| 1887 | * unlikely to help. | ||
| 1888 | */ | ||
| 1889 | if (order > PAGE_ALLOC_COSTLY_ORDER && | ||
| 1890 | !(gfp_mask & __GFP_NOFAIL)) | ||
| 1891 | goto nopage; | ||
| 1596 | 1892 | ||
| 1597 | if (likely(did_some_progress)) { | ||
| 1598 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
| 1599 | zonelist, high_zoneidx, alloc_flags); | ||
| 1600 | if (page) | ||
| 1601 | goto got_pg; | ||
| 1602 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
| 1603 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
| 1604 | schedule_timeout_uninterruptible(1); | ||
| 1605 | goto restart; | 1893 | goto restart; |
| 1606 | } | 1894 | } |
| 1607 | |||
| 1608 | /* | ||
| 1609 | * Go through the zonelist yet one more time, keep | ||
| 1610 | * very high watermark here, this is only to catch | ||
| 1611 | * a parallel oom killing, we must fail if we're still | ||
| 1612 | * under heavy pressure. | ||
| 1613 | */ | ||
| 1614 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
| 1615 | order, zonelist, high_zoneidx, | ||
| 1616 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
| 1617 | if (page) { | ||
| 1618 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1619 | goto got_pg; | ||
| 1620 | } | ||
| 1621 | |||
| 1622 | /* The OOM killer will not help higher order allocs so fail */ | ||
| 1623 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | ||
| 1624 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1625 | goto nopage; | ||
| 1626 | } | ||
| 1627 | |||
| 1628 | out_of_memory(zonelist, gfp_mask, order); | ||
| 1629 | clear_zonelist_oom(zonelist, gfp_mask); | ||
| 1630 | goto restart; | ||
| 1631 | } | 1895 | } |
| 1632 | 1896 | ||
| 1633 | /* | 1897 | /* Check if we should retry the allocation */ |
| 1634 | * Don't let big-order allocations loop unless the caller explicitly | ||
| 1635 | * requests that. Wait for some write requests to complete then retry. | ||
| 1636 | * | ||
| 1637 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
| 1638 | * means __GFP_NOFAIL, but that may not be true in other | ||
| 1639 | * implementations. | ||
| 1640 | * | ||
| 1641 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
| 1642 | * specified, then we retry until we no longer reclaim any pages | ||
| 1643 | * (above), or we've reclaimed an order of pages at least as | ||
| 1644 | * large as the allocation's order. In both cases, if the | ||
| 1645 | * allocation still fails, we stop retrying. | ||
| 1646 | */ | ||
| 1647 | pages_reclaimed += did_some_progress; | 1898 | pages_reclaimed += did_some_progress; |
| 1648 | do_retry = 0; | 1899 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
| 1649 | if (!(gfp_mask & __GFP_NORETRY)) { | 1900 | /* Wait for some write requests to complete then retry */ |
| 1650 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { | 1901 | congestion_wait(BLK_RW_ASYNC, HZ/50); |
| 1651 | do_retry = 1; | ||
| 1652 | } else { | ||
| 1653 | if (gfp_mask & __GFP_REPEAT && | ||
| 1654 | pages_reclaimed < (1 << order)) | ||
| 1655 | do_retry = 1; | ||
| 1656 | } | ||
| 1657 | if (gfp_mask & __GFP_NOFAIL) | ||
| 1658 | do_retry = 1; | ||
| 1659 | } | ||
| 1660 | if (do_retry) { | ||
| 1661 | congestion_wait(WRITE, HZ/50); | ||
| 1662 | goto rebalance; | 1902 | goto rebalance; |
| 1663 | } | 1903 | } |
| 1664 | 1904 | ||
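
The hunk above folds the old open-coded flag juggling into gfp_to_alloc_flags() and a should_alloc_retry()/congestion_wait() retry step. The following userspace sketch mimics only the flag-derivation part; the flag names and values are simplified stand-ins, and the PF_MEMALLOC/TIF_MEMDIE "no watermarks" handling is omitted. It is a sketch of the idea, not the kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's gfp and alloc flags. */
#define GFP_WAIT   0x1u   /* caller may sleep / direct-reclaim */
#define GFP_HIGH   0x2u   /* high-priority request */
enum { ALLOC_WMARK_MIN = 0x1, ALLOC_HIGH = 0x2,
       ALLOC_HARDER = 0x4, ALLOC_CPUSET = 0x8 };

/* Atomic (!wait) callers may dig deeper into reserves and ignore
 * cpuset limits; sleeping callers stay confined to their cpuset. */
static unsigned gfp_to_alloc_flags_sketch(unsigned gfp, bool rt_task)
{
    unsigned flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;

    if (gfp & GFP_HIGH)
        flags |= ALLOC_HIGH;
    if (!(gfp & GFP_WAIT))
        flags = (flags | ALLOC_HARDER) & ~ALLOC_CPUSET;
    else if (rt_task)
        flags |= ALLOC_HARDER;
    return flags;
}

int main(void)
{
    printf("atomic high:  %#x\n", gfp_to_alloc_flags_sketch(GFP_HIGH, false));
    printf("sleeping rt:  %#x\n", gfp_to_alloc_flags_sketch(GFP_WAIT, true));
    return 0;
}
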
| @@ -1670,54 +1910,102 @@ nopage: | |||
| 1670 | dump_stack(); | 1910 | dump_stack(); |
| 1671 | show_mem(); | 1911 | show_mem(); |
| 1672 | } | 1912 | } |
| 1913 | return page; | ||
| 1673 | got_pg: | 1914 | got_pg: |
| 1915 | if (kmemcheck_enabled) | ||
| 1916 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | ||
| 1674 | return page; | 1917 | return page; |
| 1918 | |||
| 1675 | } | 1919 | } |
| 1676 | EXPORT_SYMBOL(__alloc_pages_internal); | 1920 | |
| 1921 | /* | ||
| 1922 | * This is the 'heart' of the zoned buddy allocator. | ||
| 1923 | */ | ||
| 1924 | struct page * | ||
| 1925 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
| 1926 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
| 1927 | { | ||
| 1928 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
| 1929 | struct zone *preferred_zone; | ||
| 1930 | struct page *page; | ||
| 1931 | int migratetype = allocflags_to_migratetype(gfp_mask); | ||
| 1932 | |||
| 1933 | gfp_mask &= gfp_allowed_mask; | ||
| 1934 | |||
| 1935 | lockdep_trace_alloc(gfp_mask); | ||
| 1936 | |||
| 1937 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
| 1938 | |||
| 1939 | if (should_fail_alloc_page(gfp_mask, order)) | ||
| 1940 | return NULL; | ||
| 1941 | |||
| 1942 | /* | ||
| 1943 | * Check the zones suitable for the gfp_mask contain at least one | ||
| 1944 | * valid zone. It's possible to have an empty zonelist as a result | ||
| 1945 | * of GFP_THISNODE and a memoryless node | ||
| 1946 | */ | ||
| 1947 | if (unlikely(!zonelist->_zonerefs->zone)) | ||
| 1948 | return NULL; | ||
| 1949 | |||
| 1950 | /* The preferred zone is used for statistics later */ | ||
| 1951 | first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); | ||
| 1952 | if (!preferred_zone) | ||
| 1953 | return NULL; | ||
| 1954 | |||
| 1955 | /* First allocation attempt */ | ||
| 1956 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | ||
| 1957 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET, | ||
| 1958 | preferred_zone, migratetype); | ||
| 1959 | if (unlikely(!page)) | ||
| 1960 | page = __alloc_pages_slowpath(gfp_mask, order, | ||
| 1961 | zonelist, high_zoneidx, nodemask, | ||
| 1962 | preferred_zone, migratetype); | ||
| 1963 | |||
| 1964 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | ||
| 1965 | return page; | ||
| 1966 | } | ||
| 1967 | EXPORT_SYMBOL(__alloc_pages_nodemask); | ||
| 1677 | 1968 | ||
| 1678 | /* | 1969 | /* |
| 1679 | * Common helper functions. | 1970 | * Common helper functions. |
| 1680 | */ | 1971 | */ |
| 1681 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) | 1972 | unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) |
| 1682 | { | 1973 | { |
| 1683 | struct page * page; | 1974 | struct page *page; |
| 1975 | |||
| 1976 | /* | ||
| 1977 | * __get_free_pages() returns a 32-bit address, which cannot represent | ||
| 1978 | * a highmem page | ||
| 1979 | */ | ||
| 1980 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | ||
| 1981 | |||
| 1684 | page = alloc_pages(gfp_mask, order); | 1982 | page = alloc_pages(gfp_mask, order); |
| 1685 | if (!page) | 1983 | if (!page) |
| 1686 | return 0; | 1984 | return 0; |
| 1687 | return (unsigned long) page_address(page); | 1985 | return (unsigned long) page_address(page); |
| 1688 | } | 1986 | } |
| 1689 | |||
| 1690 | EXPORT_SYMBOL(__get_free_pages); | 1987 | EXPORT_SYMBOL(__get_free_pages); |
| 1691 | 1988 | ||
| 1692 | unsigned long get_zeroed_page(gfp_t gfp_mask) | 1989 | unsigned long get_zeroed_page(gfp_t gfp_mask) |
| 1693 | { | 1990 | { |
| 1694 | struct page * page; | 1991 | return __get_free_pages(gfp_mask | __GFP_ZERO, 0); |
| 1695 | |||
| 1696 | /* | ||
| 1697 | * get_zeroed_page() returns a 32-bit address, which cannot represent | ||
| 1698 | * a highmem page | ||
| 1699 | */ | ||
| 1700 | VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); | ||
| 1701 | |||
| 1702 | page = alloc_pages(gfp_mask | __GFP_ZERO, 0); | ||
| 1703 | if (page) | ||
| 1704 | return (unsigned long) page_address(page); | ||
| 1705 | return 0; | ||
| 1706 | } | 1992 | } |
| 1707 | |||
| 1708 | EXPORT_SYMBOL(get_zeroed_page); | 1993 | EXPORT_SYMBOL(get_zeroed_page); |
| 1709 | 1994 | ||
| 1710 | void __pagevec_free(struct pagevec *pvec) | 1995 | void __pagevec_free(struct pagevec *pvec) |
| 1711 | { | 1996 | { |
| 1712 | int i = pagevec_count(pvec); | 1997 | int i = pagevec_count(pvec); |
| 1713 | 1998 | ||
| 1714 | while (--i >= 0) | 1999 | while (--i >= 0) { |
| 2000 | trace_mm_pagevec_free(pvec->pages[i], pvec->cold); | ||
| 1715 | free_hot_cold_page(pvec->pages[i], pvec->cold); | 2001 | free_hot_cold_page(pvec->pages[i], pvec->cold); |
| 2002 | } | ||
| 1716 | } | 2003 | } |
| 1717 | 2004 | ||
| 1718 | void __free_pages(struct page *page, unsigned int order) | 2005 | void __free_pages(struct page *page, unsigned int order) |
| 1719 | { | 2006 | { |
| 1720 | if (put_page_testzero(page)) { | 2007 | if (put_page_testzero(page)) { |
| 2008 | trace_mm_page_free_direct(page, order); | ||
| 1721 | if (order == 0) | 2009 | if (order == 0) |
| 1722 | free_hot_page(page); | 2010 | free_hot_page(page); |
| 1723 | else | 2011 | else |
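
With this hunk, __alloc_pages_nodemask() becomes the single entry point: one cheap attempt at the low watermark, then __alloc_pages_slowpath() only if that fails. A toy version of that two-step structure is below; the helper names are invented stand-ins (the "fast path" always fails here so the slow path is exercised), not the kernel API.

#include <stdio.h>

typedef struct { int order; } page_t;   /* toy page descriptor */

/* Stand-in fast path: pretend the free lists are exhausted. */
static page_t *try_freelist_low_watermark(unsigned order)
{
    (void)order;
    return NULL;
}

/* Stand-in slow path: pretend reclaim eventually frees something. */
static page_t *slowpath_reclaim_and_retry(unsigned order)
{
    static page_t p;
    p.order = (int)order;
    return &p;
}

static page_t *alloc_pages_sketch(unsigned order)
{
    page_t *page = try_freelist_low_watermark(order);  /* fast path */
    if (!page)
        page = slowpath_reclaim_and_retry(order);      /* slow path */
    return page;
}

int main(void)
{
    page_t *p = alloc_pages_sketch(3);
    printf("got an order-%d block from the slow path\n", p->order);
    return 0;
}
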
| @@ -1760,7 +2048,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) | |||
| 1760 | unsigned long alloc_end = addr + (PAGE_SIZE << order); | 2048 | unsigned long alloc_end = addr + (PAGE_SIZE << order); |
| 1761 | unsigned long used = addr + PAGE_ALIGN(size); | 2049 | unsigned long used = addr + PAGE_ALIGN(size); |
| 1762 | 2050 | ||
| 1763 | split_page(virt_to_page(addr), order); | 2051 | split_page(virt_to_page((void *)addr), order); |
| 1764 | while (used < alloc_end) { | 2052 | while (used < alloc_end) { |
| 1765 | free_page(used); | 2053 | free_page(used); |
| 1766 | used += PAGE_SIZE; | 2054 | used += PAGE_SIZE; |
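
alloc_pages_exact() rounds the request up to a power-of-two order, splits the block, and hands back the unused tail page by page. The size bookkeeping can be checked in userspace; the sketch below reproduces only the arithmetic, with an arbitrary example request, not the page handling.

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Smallest order such that (PAGE_SIZE << order) >= size. */
static unsigned get_order_sketch(unsigned long size)
{
    unsigned order = 0;
    while ((PAGE_SIZE << order) < size)
        order++;
    return order;
}

int main(void)
{
    unsigned long size = 5 * PAGE_SIZE + 123;   /* example request */
    unsigned order = get_order_sketch(size);
    unsigned long alloced = PAGE_SIZE << order;                  /* 8 pages */
    unsigned long used = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);

    /* The tail (alloced - used) is freed back, here 2 pages. */
    printf("order %u, allocated %lu bytes, trimmed %lu bytes\n",
           order, alloced, alloced - used);
    return 0;
}
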
| @@ -1802,7 +2090,7 @@ static unsigned int nr_free_zone_pages(int offset) | |||
| 1802 | 2090 | ||
| 1803 | for_each_zone_zonelist(zone, z, zonelist, offset) { | 2091 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
| 1804 | unsigned long size = zone->present_pages; | 2092 | unsigned long size = zone->present_pages; |
| 1805 | unsigned long high = zone->pages_high; | 2093 | unsigned long high = high_wmark_pages(zone); |
| 1806 | if (size > high) | 2094 | if (size > high) |
| 1807 | sum += size - high; | 2095 | sum += size - high; |
| 1808 | } | 2096 | } |
| @@ -1892,28 +2180,27 @@ void show_free_areas(void) | |||
| 1892 | } | 2180 | } |
| 1893 | } | 2181 | } |
| 1894 | 2182 | ||
| 1895 | printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" | 2183 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
| 1896 | " inactive_file:%lu" | 2184 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
| 1897 | //TODO: check/adjust line lengths | ||
| 1898 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1899 | " unevictable:%lu" | 2185 | " unevictable:%lu" |
| 1900 | #endif | ||
| 1901 | " dirty:%lu writeback:%lu unstable:%lu\n" | 2186 | " dirty:%lu writeback:%lu unstable:%lu\n" |
| 1902 | " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", | 2187 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
| 2188 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", | ||
| 1903 | global_page_state(NR_ACTIVE_ANON), | 2189 | global_page_state(NR_ACTIVE_ANON), |
| 1904 | global_page_state(NR_ACTIVE_FILE), | ||
| 1905 | global_page_state(NR_INACTIVE_ANON), | 2190 | global_page_state(NR_INACTIVE_ANON), |
| 2191 | global_page_state(NR_ISOLATED_ANON), | ||
| 2192 | global_page_state(NR_ACTIVE_FILE), | ||
| 1906 | global_page_state(NR_INACTIVE_FILE), | 2193 | global_page_state(NR_INACTIVE_FILE), |
| 1907 | #ifdef CONFIG_UNEVICTABLE_LRU | 2194 | global_page_state(NR_ISOLATED_FILE), |
| 1908 | global_page_state(NR_UNEVICTABLE), | 2195 | global_page_state(NR_UNEVICTABLE), |
| 1909 | #endif | ||
| 1910 | global_page_state(NR_FILE_DIRTY), | 2196 | global_page_state(NR_FILE_DIRTY), |
| 1911 | global_page_state(NR_WRITEBACK), | 2197 | global_page_state(NR_WRITEBACK), |
| 1912 | global_page_state(NR_UNSTABLE_NFS), | 2198 | global_page_state(NR_UNSTABLE_NFS), |
| 1913 | global_page_state(NR_FREE_PAGES), | 2199 | global_page_state(NR_FREE_PAGES), |
| 1914 | global_page_state(NR_SLAB_RECLAIMABLE) + | 2200 | global_page_state(NR_SLAB_RECLAIMABLE), |
| 1915 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 2201 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
| 1916 | global_page_state(NR_FILE_MAPPED), | 2202 | global_page_state(NR_FILE_MAPPED), |
| 2203 | global_page_state(NR_SHMEM), | ||
| 1917 | global_page_state(NR_PAGETABLE), | 2204 | global_page_state(NR_PAGETABLE), |
| 1918 | global_page_state(NR_BOUNCE)); | 2205 | global_page_state(NR_BOUNCE)); |
| 1919 | 2206 | ||
| @@ -1930,26 +2217,51 @@ void show_free_areas(void) | |||
| 1930 | " inactive_anon:%lukB" | 2217 | " inactive_anon:%lukB" |
| 1931 | " active_file:%lukB" | 2218 | " active_file:%lukB" |
| 1932 | " inactive_file:%lukB" | 2219 | " inactive_file:%lukB" |
| 1933 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1934 | " unevictable:%lukB" | 2220 | " unevictable:%lukB" |
| 1935 | #endif | 2221 | " isolated(anon):%lukB" |
| 2222 | " isolated(file):%lukB" | ||
| 1936 | " present:%lukB" | 2223 | " present:%lukB" |
| 2224 | " mlocked:%lukB" | ||
| 2225 | " dirty:%lukB" | ||
| 2226 | " writeback:%lukB" | ||
| 2227 | " mapped:%lukB" | ||
| 2228 | " shmem:%lukB" | ||
| 2229 | " slab_reclaimable:%lukB" | ||
| 2230 | " slab_unreclaimable:%lukB" | ||
| 2231 | " kernel_stack:%lukB" | ||
| 2232 | " pagetables:%lukB" | ||
| 2233 | " unstable:%lukB" | ||
| 2234 | " bounce:%lukB" | ||
| 2235 | " writeback_tmp:%lukB" | ||
| 1937 | " pages_scanned:%lu" | 2236 | " pages_scanned:%lu" |
| 1938 | " all_unreclaimable? %s" | 2237 | " all_unreclaimable? %s" |
| 1939 | "\n", | 2238 | "\n", |
| 1940 | zone->name, | 2239 | zone->name, |
| 1941 | K(zone_page_state(zone, NR_FREE_PAGES)), | 2240 | K(zone_page_state(zone, NR_FREE_PAGES)), |
| 1942 | K(zone->pages_min), | 2241 | K(min_wmark_pages(zone)), |
| 1943 | K(zone->pages_low), | 2242 | K(low_wmark_pages(zone)), |
| 1944 | K(zone->pages_high), | 2243 | K(high_wmark_pages(zone)), |
| 1945 | K(zone_page_state(zone, NR_ACTIVE_ANON)), | 2244 | K(zone_page_state(zone, NR_ACTIVE_ANON)), |
| 1946 | K(zone_page_state(zone, NR_INACTIVE_ANON)), | 2245 | K(zone_page_state(zone, NR_INACTIVE_ANON)), |
| 1947 | K(zone_page_state(zone, NR_ACTIVE_FILE)), | 2246 | K(zone_page_state(zone, NR_ACTIVE_FILE)), |
| 1948 | K(zone_page_state(zone, NR_INACTIVE_FILE)), | 2247 | K(zone_page_state(zone, NR_INACTIVE_FILE)), |
| 1949 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1950 | K(zone_page_state(zone, NR_UNEVICTABLE)), | 2248 | K(zone_page_state(zone, NR_UNEVICTABLE)), |
| 1951 | #endif | 2249 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
| 2250 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | ||
| 1952 | K(zone->present_pages), | 2251 | K(zone->present_pages), |
| 2252 | K(zone_page_state(zone, NR_MLOCK)), | ||
| 2253 | K(zone_page_state(zone, NR_FILE_DIRTY)), | ||
| 2254 | K(zone_page_state(zone, NR_WRITEBACK)), | ||
| 2255 | K(zone_page_state(zone, NR_FILE_MAPPED)), | ||
| 2256 | K(zone_page_state(zone, NR_SHMEM)), | ||
| 2257 | K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), | ||
| 2258 | K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), | ||
| 2259 | zone_page_state(zone, NR_KERNEL_STACK) * | ||
| 2260 | THREAD_SIZE / 1024, | ||
| 2261 | K(zone_page_state(zone, NR_PAGETABLE)), | ||
| 2262 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | ||
| 2263 | K(zone_page_state(zone, NR_BOUNCE)), | ||
| 2264 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | ||
| 1953 | zone->pages_scanned, | 2265 | zone->pages_scanned, |
| 1954 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2266 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") |
| 1955 | ); | 2267 | ); |
| @@ -2078,7 +2390,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); | |||
| 2078 | * sysctl handler for numa_zonelist_order | 2390 | * sysctl handler for numa_zonelist_order |
| 2079 | */ | 2391 | */ |
| 2080 | int numa_zonelist_order_handler(ctl_table *table, int write, | 2392 | int numa_zonelist_order_handler(ctl_table *table, int write, |
| 2081 | struct file *file, void __user *buffer, size_t *length, | 2393 | void __user *buffer, size_t *length, |
| 2082 | loff_t *ppos) | 2394 | loff_t *ppos) |
| 2083 | { | 2395 | { |
| 2084 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | 2396 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; |
| @@ -2087,7 +2399,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
| 2087 | if (write) | 2399 | if (write) |
| 2088 | strncpy(saved_string, (char*)table->data, | 2400 | strncpy(saved_string, (char*)table->data, |
| 2089 | NUMA_ZONELIST_ORDER_LEN); | 2401 | NUMA_ZONELIST_ORDER_LEN); |
| 2090 | ret = proc_dostring(table, write, file, buffer, length, ppos); | 2402 | ret = proc_dostring(table, write, buffer, length, ppos); |
| 2091 | if (ret) | 2403 | if (ret) |
| 2092 | return ret; | 2404 | return ret; |
| 2093 | if (write) { | 2405 | if (write) { |
| @@ -2106,7 +2418,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
| 2106 | } | 2418 | } |
| 2107 | 2419 | ||
| 2108 | 2420 | ||
| 2109 | #define MAX_NODE_LOAD (num_online_nodes()) | 2421 | #define MAX_NODE_LOAD (nr_online_nodes) |
| 2110 | static int node_load[MAX_NUMNODES]; | 2422 | static int node_load[MAX_NUMNODES]; |
| 2111 | 2423 | ||
| 2112 | /** | 2424 | /** |
| @@ -2128,7 +2440,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
| 2128 | int n, val; | 2440 | int n, val; |
| 2129 | int min_val = INT_MAX; | 2441 | int min_val = INT_MAX; |
| 2130 | int best_node = -1; | 2442 | int best_node = -1; |
| 2131 | node_to_cpumask_ptr(tmp, 0); | 2443 | const struct cpumask *tmp = cpumask_of_node(0); |
| 2132 | 2444 | ||
| 2133 | /* Use the local node if we haven't already */ | 2445 | /* Use the local node if we haven't already */ |
| 2134 | if (!node_isset(node, *used_node_mask)) { | 2446 | if (!node_isset(node, *used_node_mask)) { |
| @@ -2149,8 +2461,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
| 2149 | val += (n < node); | 2461 | val += (n < node); |
| 2150 | 2462 | ||
| 2151 | /* Give preference to headless and unused nodes */ | 2463 | /* Give preference to headless and unused nodes */ |
| 2152 | node_to_cpumask_ptr_next(tmp, n); | 2464 | tmp = cpumask_of_node(n); |
| 2153 | if (!cpus_empty(*tmp)) | 2465 | if (!cpumask_empty(tmp)) |
| 2154 | val += PENALTY_FOR_NODE_WITH_CPUS; | 2466 | val += PENALTY_FOR_NODE_WITH_CPUS; |
| 2155 | 2467 | ||
| 2156 | /* Slight preference for less loaded node */ | 2468 | /* Slight preference for less loaded node */ |
| @@ -2315,11 +2627,10 @@ static void build_zonelists(pg_data_t *pgdat) | |||
| 2315 | 2627 | ||
| 2316 | /* NUMA-aware ordering of nodes */ | 2628 | /* NUMA-aware ordering of nodes */ |
| 2317 | local_node = pgdat->node_id; | 2629 | local_node = pgdat->node_id; |
| 2318 | load = num_online_nodes(); | 2630 | load = nr_online_nodes; |
| 2319 | prev_node = local_node; | 2631 | prev_node = local_node; |
| 2320 | nodes_clear(used_mask); | 2632 | nodes_clear(used_mask); |
| 2321 | 2633 | ||
| 2322 | memset(node_load, 0, sizeof(node_load)); | ||
| 2323 | memset(node_order, 0, sizeof(node_order)); | 2634 | memset(node_order, 0, sizeof(node_order)); |
| 2324 | j = 0; | 2635 | j = 0; |
| 2325 | 2636 | ||
| @@ -2428,6 +2739,9 @@ static int __build_all_zonelists(void *dummy) | |||
| 2428 | { | 2739 | { |
| 2429 | int nid; | 2740 | int nid; |
| 2430 | 2741 | ||
| 2742 | #ifdef CONFIG_NUMA | ||
| 2743 | memset(node_load, 0, sizeof(node_load)); | ||
| 2744 | #endif | ||
| 2431 | for_each_online_node(nid) { | 2745 | for_each_online_node(nid) { |
| 2432 | pg_data_t *pgdat = NODE_DATA(nid); | 2746 | pg_data_t *pgdat = NODE_DATA(nid); |
| 2433 | 2747 | ||
| @@ -2466,7 +2780,7 @@ void build_all_zonelists(void) | |||
| 2466 | 2780 | ||
| 2467 | printk("Built %i zonelists in %s order, mobility grouping %s. " | 2781 | printk("Built %i zonelists in %s order, mobility grouping %s. " |
| 2468 | "Total pages: %ld\n", | 2782 | "Total pages: %ld\n", |
| 2469 | num_online_nodes(), | 2783 | nr_online_nodes, |
| 2470 | zonelist_order_name[current_zonelist_order], | 2784 | zonelist_order_name[current_zonelist_order], |
| 2471 | page_group_by_mobility_disabled ? "off" : "on", | 2785 | page_group_by_mobility_disabled ? "off" : "on", |
| 2472 | vm_total_pages); | 2786 | vm_total_pages); |
| @@ -2545,8 +2859,8 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
| 2545 | 2859 | ||
| 2546 | /* | 2860 | /* |
| 2547 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | 2861 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number |
| 2548 | * of blocks reserved is based on zone->pages_min. The memory within the | 2862 | * of blocks reserved is based on min_wmark_pages(zone). The memory within |
| 2549 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | 2863 | * the reserve will tend to store contiguous free pages. Setting min_free_kbytes |
| 2550 | * higher will lead to a bigger reserve which will get freed as contiguous | 2864 | * higher will lead to a bigger reserve which will get freed as contiguous |
| 2551 | * blocks as reclaim kicks in | 2865 | * blocks as reclaim kicks in |
| 2552 | */ | 2866 | */ |
| @@ -2554,14 +2868,24 @@ static void setup_zone_migrate_reserve(struct zone *zone) | |||
| 2554 | { | 2868 | { |
| 2555 | unsigned long start_pfn, pfn, end_pfn; | 2869 | unsigned long start_pfn, pfn, end_pfn; |
| 2556 | struct page *page; | 2870 | struct page *page; |
| 2557 | unsigned long reserve, block_migratetype; | 2871 | unsigned long block_migratetype; |
| 2872 | int reserve; | ||
| 2558 | 2873 | ||
| 2559 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | 2874 | /* Get the start pfn, end pfn and the number of blocks to reserve */ |
| 2560 | start_pfn = zone->zone_start_pfn; | 2875 | start_pfn = zone->zone_start_pfn; |
| 2561 | end_pfn = start_pfn + zone->spanned_pages; | 2876 | end_pfn = start_pfn + zone->spanned_pages; |
| 2562 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | 2877 | reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> |
| 2563 | pageblock_order; | 2878 | pageblock_order; |
| 2564 | 2879 | ||
| 2880 | /* | ||
| 2881 | * Reserve blocks are generally in place to help high-order atomic | ||
| 2882 | * allocations that are short-lived. A min_free_kbytes value that | ||
| 2883 | * would result in more than 2 reserve blocks for atomic allocations | ||
| 2884 | * is assumed to be in place to help anti-fragmentation for the | ||
| 2885 | * future allocation of hugepages at runtime. | ||
| 2886 | */ | ||
| 2887 | reserve = min(2, reserve); | ||
| 2888 | |||
| 2565 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | 2889 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { |
| 2566 | if (!pfn_valid(pfn)) | 2890 | if (!pfn_valid(pfn)) |
| 2567 | continue; | 2891 | continue; |
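
The new code caps the number of MIGRATE_RESERVE pageblocks at two, on the reasoning that anything larger was configured for anti-fragmentation rather than for short-lived atomic allocations. A quick check of the arithmetic; the pageblock size and watermark below are made-up example values, and division stands in for the kernel's shift by pageblock_order.

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

static unsigned long roundup_to(unsigned long x, unsigned long to)
{
    return ((x + to - 1) / to) * to;
}

int main(void)
{
    unsigned long pageblock_nr_pages = 1024;   /* e.g. 4MB blocks of 4KB pages */
    unsigned long min_wmark = 5000;            /* example zone min watermark */

    long reserve = (long)(roundup_to(min_wmark, pageblock_nr_pages)
                          / pageblock_nr_pages);   /* 5 blocks */
    reserve = MIN(2, reserve);                     /* capped as in the patch */

    printf("reserve blocks: %ld\n", reserve);
    return 0;
}
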
| @@ -2681,6 +3005,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
| 2681 | 3005 | ||
| 2682 | static int zone_batchsize(struct zone *zone) | 3006 | static int zone_batchsize(struct zone *zone) |
| 2683 | { | 3007 | { |
| 3008 | #ifdef CONFIG_MMU | ||
| 2684 | int batch; | 3009 | int batch; |
| 2685 | 3010 | ||
| 2686 | /* | 3011 | /* |
| @@ -2706,14 +3031,32 @@ static int zone_batchsize(struct zone *zone) | |||
| 2706 | * of pages of one half of the possible page colors | 3031 | * of pages of one half of the possible page colors |
| 2707 | * and the other with pages of the other colors. | 3032 | * and the other with pages of the other colors. |
| 2708 | */ | 3033 | */ |
| 2709 | batch = (1 << (fls(batch + batch/2)-1)) - 1; | 3034 | batch = rounddown_pow_of_two(batch + batch/2) - 1; |
| 2710 | 3035 | ||
| 2711 | return batch; | 3036 | return batch; |
| 3037 | |||
| 3038 | #else | ||
| 3039 | /* The deferral and batching of frees should be suppressed under NOMMU | ||
| 3040 | * conditions. | ||
| 3041 | * | ||
| 3042 | * The problem is that NOMMU needs to be able to allocate large chunks | ||
| 3043 | * of contiguous memory as there's no hardware page translation to | ||
| 3044 | * assemble apparent contiguous memory from discontiguous pages. | ||
| 3045 | * | ||
| 3046 | * Queueing large contiguous runs of pages for batching, however, | ||
| 3047 | * causes the pages to actually be freed in smaller chunks. As there | ||
| 3048 | * can be a significant delay between the individual batches being | ||
| 3049 | * recycled, this leads to the once large chunks of space being | ||
| 3050 | * fragmented and becoming unavailable for high-order allocations. | ||
| 3051 | */ | ||
| 3052 | return 0; | ||
| 3053 | #endif | ||
| 2712 | } | 3054 | } |
| 2713 | 3055 | ||
| 2714 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | 3056 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) |
| 2715 | { | 3057 | { |
| 2716 | struct per_cpu_pages *pcp; | 3058 | struct per_cpu_pages *pcp; |
| 3059 | int migratetype; | ||
| 2717 | 3060 | ||
| 2718 | memset(p, 0, sizeof(*p)); | 3061 | memset(p, 0, sizeof(*p)); |
| 2719 | 3062 | ||
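
In the zone_batchsize() hunk above, the per-cpu batch is still rounded to just under a power of two, now written as rounddown_pow_of_two(batch + batch/2) - 1 instead of the old fls() expression; for positive values the two forms should be equivalent. A small table of the results, using a plain-C stand-in for the kernel helper:

#include <stdio.h>

/* Userspace stand-in for rounddown_pow_of_two(): largest power of two <= n. */
static unsigned long rounddown_pow_of_two(unsigned long n)
{
    unsigned long p = 1;
    while (p * 2 <= n)
        p *= 2;
    return p;
}

int main(void)
{
    for (unsigned long batch = 1; batch <= 64; batch *= 2) {
        unsigned long rounded = rounddown_pow_of_two(batch + batch / 2) - 1;
        printf("batch %3lu -> %3lu\n", batch, rounded);
    }
    return 0;
}
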
| @@ -2721,7 +3064,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
| 2721 | pcp->count = 0; | 3064 | pcp->count = 0; |
| 2722 | pcp->high = 6 * batch; | 3065 | pcp->high = 6 * batch; |
| 2723 | pcp->batch = max(1UL, 1 * batch); | 3066 | pcp->batch = max(1UL, 1 * batch); |
| 2724 | INIT_LIST_HEAD(&pcp->list); | 3067 | for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) |
| 3068 | INIT_LIST_HEAD(&pcp->lists[migratetype]); | ||
| 2725 | } | 3069 | } |
| 2726 | 3070 | ||
| 2727 | /* | 3071 | /* |
| @@ -2794,7 +3138,7 @@ bad: | |||
| 2794 | if (dzone == zone) | 3138 | if (dzone == zone) |
| 2795 | break; | 3139 | break; |
| 2796 | kfree(zone_pcp(dzone, cpu)); | 3140 | kfree(zone_pcp(dzone, cpu)); |
| 2797 | zone_pcp(dzone, cpu) = NULL; | 3141 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; |
| 2798 | } | 3142 | } |
| 2799 | return -ENOMEM; | 3143 | return -ENOMEM; |
| 2800 | } | 3144 | } |
| @@ -2809,7 +3153,7 @@ static inline void free_zone_pagesets(int cpu) | |||
| 2809 | /* Free per_cpu_pageset if it is slab allocated */ | 3153 | /* Free per_cpu_pageset if it is slab allocated */ |
| 2810 | if (pset != &boot_pageset[cpu]) | 3154 | if (pset != &boot_pageset[cpu]) |
| 2811 | kfree(pset); | 3155 | kfree(pset); |
| 2812 | zone_pcp(zone, cpu) = NULL; | 3156 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; |
| 2813 | } | 3157 | } |
| 2814 | } | 3158 | } |
| 2815 | 3159 | ||
| @@ -2899,6 +3243,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
| 2899 | return 0; | 3243 | return 0; |
| 2900 | } | 3244 | } |
| 2901 | 3245 | ||
| 3246 | static int __zone_pcp_update(void *data) | ||
| 3247 | { | ||
| 3248 | struct zone *zone = data; | ||
| 3249 | int cpu; | ||
| 3250 | unsigned long batch = zone_batchsize(zone), flags; | ||
| 3251 | |||
| 3252 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
| 3253 | struct per_cpu_pageset *pset; | ||
| 3254 | struct per_cpu_pages *pcp; | ||
| 3255 | |||
| 3256 | pset = zone_pcp(zone, cpu); | ||
| 3257 | pcp = &pset->pcp; | ||
| 3258 | |||
| 3259 | local_irq_save(flags); | ||
| 3260 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
| 3261 | setup_pageset(pset, batch); | ||
| 3262 | local_irq_restore(flags); | ||
| 3263 | } | ||
| 3264 | return 0; | ||
| 3265 | } | ||
| 3266 | |||
| 3267 | void zone_pcp_update(struct zone *zone) | ||
| 3268 | { | ||
| 3269 | stop_machine(__zone_pcp_update, zone, NULL); | ||
| 3270 | } | ||
| 3271 | |||
| 2902 | static __meminit void zone_pcp_init(struct zone *zone) | 3272 | static __meminit void zone_pcp_init(struct zone *zone) |
| 2903 | { | 3273 | { |
| 2904 | int cpu; | 3274 | int cpu; |
| @@ -3085,64 +3455,6 @@ void __init sparse_memory_present_with_active_regions(int nid) | |||
| 3085 | } | 3455 | } |
| 3086 | 3456 | ||
| 3087 | /** | 3457 | /** |
| 3088 | * push_node_boundaries - Push node boundaries to at least the requested boundary | ||
| 3089 | * @nid: The nid of the node to push the boundary for | ||
| 3090 | * @start_pfn: The start pfn of the node | ||
| 3091 | * @end_pfn: The end pfn of the node | ||
| 3092 | * | ||
| 3093 | * In reserve-based hot-add, mem_map is allocated that is unused until hotadd | ||
| 3094 | * time. Specifically, on x86_64, SRAT will report ranges that can potentially | ||
| 3095 | * be hotplugged even though no physical memory exists. This function allows | ||
| 3096 | * an arch to push out the node boundaries so mem_map is allocated that can | ||
| 3097 | * be used later. | ||
| 3098 | */ | ||
| 3099 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 3100 | void __init push_node_boundaries(unsigned int nid, | ||
| 3101 | unsigned long start_pfn, unsigned long end_pfn) | ||
| 3102 | { | ||
| 3103 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | ||
| 3104 | "Entering push_node_boundaries(%u, %lu, %lu)\n", | ||
| 3105 | nid, start_pfn, end_pfn); | ||
| 3106 | |||
| 3107 | /* Initialise the boundary for this node if necessary */ | ||
| 3108 | if (node_boundary_end_pfn[nid] == 0) | ||
| 3109 | node_boundary_start_pfn[nid] = -1UL; | ||
| 3110 | |||
| 3111 | /* Update the boundaries */ | ||
| 3112 | if (node_boundary_start_pfn[nid] > start_pfn) | ||
| 3113 | node_boundary_start_pfn[nid] = start_pfn; | ||
| 3114 | if (node_boundary_end_pfn[nid] < end_pfn) | ||
| 3115 | node_boundary_end_pfn[nid] = end_pfn; | ||
| 3116 | } | ||
| 3117 | |||
| 3118 | /* If necessary, push the node boundary out for reserve hotadd */ | ||
| 3119 | static void __meminit account_node_boundary(unsigned int nid, | ||
| 3120 | unsigned long *start_pfn, unsigned long *end_pfn) | ||
| 3121 | { | ||
| 3122 | mminit_dprintk(MMINIT_TRACE, "zoneboundary", | ||
| 3123 | "Entering account_node_boundary(%u, %lu, %lu)\n", | ||
| 3124 | nid, *start_pfn, *end_pfn); | ||
| 3125 | |||
| 3126 | /* Return if boundary information has not been provided */ | ||
| 3127 | if (node_boundary_end_pfn[nid] == 0) | ||
| 3128 | return; | ||
| 3129 | |||
| 3130 | /* Check the boundaries and update if necessary */ | ||
| 3131 | if (node_boundary_start_pfn[nid] < *start_pfn) | ||
| 3132 | *start_pfn = node_boundary_start_pfn[nid]; | ||
| 3133 | if (node_boundary_end_pfn[nid] > *end_pfn) | ||
| 3134 | *end_pfn = node_boundary_end_pfn[nid]; | ||
| 3135 | } | ||
| 3136 | #else | ||
| 3137 | void __init push_node_boundaries(unsigned int nid, | ||
| 3138 | unsigned long start_pfn, unsigned long end_pfn) {} | ||
| 3139 | |||
| 3140 | static void __meminit account_node_boundary(unsigned int nid, | ||
| 3141 | unsigned long *start_pfn, unsigned long *end_pfn) {} | ||
| 3142 | #endif | ||
| 3143 | |||
| 3144 | |||
| 3145 | /** | ||
| 3146 | * get_pfn_range_for_nid - Return the start and end page frames for a node | 3458 | * get_pfn_range_for_nid - Return the start and end page frames for a node |
| 3147 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. | 3459 | * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. |
| 3148 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. | 3460 | * @start_pfn: Passed by reference. On return, it will have the node start_pfn. |
| @@ -3167,9 +3479,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
| 3167 | 3479 | ||
| 3168 | if (*start_pfn == -1UL) | 3480 | if (*start_pfn == -1UL) |
| 3169 | *start_pfn = 0; | 3481 | *start_pfn = 0; |
| 3170 | |||
| 3171 | /* Push the node boundaries out if requested */ | ||
| 3172 | account_node_boundary(nid, start_pfn, end_pfn); | ||
| 3173 | } | 3482 | } |
| 3174 | 3483 | ||
| 3175 | /* | 3484 | /* |
| @@ -3534,7 +3843,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 3534 | zone_pcp_init(zone); | 3843 | zone_pcp_init(zone); |
| 3535 | for_each_lru(l) { | 3844 | for_each_lru(l) { |
| 3536 | INIT_LIST_HEAD(&zone->lru[l].list); | 3845 | INIT_LIST_HEAD(&zone->lru[l].list); |
| 3537 | zone->lru[l].nr_scan = 0; | 3846 | zone->reclaim_stat.nr_saved_scan[l] = 0; |
| 3538 | } | 3847 | } |
| 3539 | zone->reclaim_stat.recent_rotated[0] = 0; | 3848 | zone->reclaim_stat.recent_rotated[0] = 0; |
| 3540 | zone->reclaim_stat.recent_rotated[1] = 0; | 3849 | zone->reclaim_stat.recent_rotated[1] = 0; |
| @@ -3775,10 +4084,6 @@ void __init remove_all_active_ranges(void) | |||
| 3775 | { | 4084 | { |
| 3776 | memset(early_node_map, 0, sizeof(early_node_map)); | 4085 | memset(early_node_map, 0, sizeof(early_node_map)); |
| 3777 | nr_nodemap_entries = 0; | 4086 | nr_nodemap_entries = 0; |
| 3778 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | ||
| 3779 | memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); | ||
| 3780 | memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); | ||
| 3781 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | ||
| 3782 | } | 4087 | } |
| 3783 | 4088 | ||
| 3784 | /* Compare two active node_active_regions */ | 4089 | /* Compare two active node_active_regions */ |
| @@ -3865,6 +4170,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3865 | int i, nid; | 4170 | int i, nid; |
| 3866 | unsigned long usable_startpfn; | 4171 | unsigned long usable_startpfn; |
| 3867 | unsigned long kernelcore_node, kernelcore_remaining; | 4172 | unsigned long kernelcore_node, kernelcore_remaining; |
| 4173 | /* save the state before borrow the nodemask */ | ||
| 4174 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | ||
| 3868 | unsigned long totalpages = early_calculate_totalpages(); | 4175 | unsigned long totalpages = early_calculate_totalpages(); |
| 3869 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4176 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); |
| 3870 | 4177 | ||
| @@ -3892,7 +4199,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3892 | 4199 | ||
| 3893 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ | 4200 | /* If kernelcore was not specified, there is no ZONE_MOVABLE */ |
| 3894 | if (!required_kernelcore) | 4201 | if (!required_kernelcore) |
| 3895 | return; | 4202 | goto out; |
| 3896 | 4203 | ||
| 3897 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ | 4204 | /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ |
| 3898 | find_usable_zone_for_movable(); | 4205 | find_usable_zone_for_movable(); |
| @@ -3991,6 +4298,10 @@ restart: | |||
| 3991 | for (nid = 0; nid < MAX_NUMNODES; nid++) | 4298 | for (nid = 0; nid < MAX_NUMNODES; nid++) |
| 3992 | zone_movable_pfn[nid] = | 4299 | zone_movable_pfn[nid] = |
| 3993 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 4300 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
| 4301 | |||
| 4302 | out: | ||
| 4303 | /* restore the node_state */ | ||
| 4304 | node_states[N_HIGH_MEMORY] = saved_node_state; | ||
| 3994 | } | 4305 | } |
| 3995 | 4306 | ||
| 3996 | /* Any regular memory on that node ? */ | 4307 | /* Any regular memory on that node ? */ |
| @@ -4209,8 +4520,8 @@ static void calculate_totalreserve_pages(void) | |||
| 4209 | max = zone->lowmem_reserve[j]; | 4520 | max = zone->lowmem_reserve[j]; |
| 4210 | } | 4521 | } |
| 4211 | 4522 | ||
| 4212 | /* we treat pages_high as reserved pages. */ | 4523 | /* we treat the high watermark as reserved pages. */ |
| 4213 | max += zone->pages_high; | 4524 | max += high_wmark_pages(zone); |
| 4214 | 4525 | ||
| 4215 | if (max > zone->present_pages) | 4526 | if (max > zone->present_pages) |
| 4216 | max = zone->present_pages; | 4527 | max = zone->present_pages; |
| @@ -4260,12 +4571,13 @@ static void setup_per_zone_lowmem_reserve(void) | |||
| 4260 | } | 4571 | } |
| 4261 | 4572 | ||
| 4262 | /** | 4573 | /** |
| 4263 | * setup_per_zone_pages_min - called when min_free_kbytes changes. | 4574 | * setup_per_zone_wmarks - called when min_free_kbytes changes |
| 4575 | * or when memory is hot-{added|removed} | ||
| 4264 | * | 4576 | * |
| 4265 | * Ensures that the pages_{min,low,high} values for each zone are set correctly | 4577 | * Ensures that the watermark[min,low,high] values for each zone are set |
| 4266 | * with respect to min_free_kbytes. | 4578 | * correctly with respect to min_free_kbytes. |
| 4267 | */ | 4579 | */ |
| 4268 | void setup_per_zone_pages_min(void) | 4580 | void setup_per_zone_wmarks(void) |
| 4269 | { | 4581 | { |
| 4270 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | 4582 | unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
| 4271 | unsigned long lowmem_pages = 0; | 4583 | unsigned long lowmem_pages = 0; |
| @@ -4290,7 +4602,7 @@ void setup_per_zone_pages_min(void) | |||
| 4290 | * need highmem pages, so cap pages_min to a small | 4602 | * need highmem pages, so cap pages_min to a small |
| 4291 | * value here. | 4603 | * value here. |
| 4292 | * | 4604 | * |
| 4293 | * The (pages_high-pages_low) and (pages_low-pages_min) | 4605 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
| 4294 | * deltas controls asynch page reclaim, and so should | 4606 | * deltas controls asynch page reclaim, and so should |
| 4295 | * not be capped for highmem. | 4607 | * not be capped for highmem. |
| 4296 | */ | 4608 | */ |
| @@ -4301,17 +4613,17 @@ void setup_per_zone_pages_min(void) | |||
| 4301 | min_pages = SWAP_CLUSTER_MAX; | 4613 | min_pages = SWAP_CLUSTER_MAX; |
| 4302 | if (min_pages > 128) | 4614 | if (min_pages > 128) |
| 4303 | min_pages = 128; | 4615 | min_pages = 128; |
| 4304 | zone->pages_min = min_pages; | 4616 | zone->watermark[WMARK_MIN] = min_pages; |
| 4305 | } else { | 4617 | } else { |
| 4306 | /* | 4618 | /* |
| 4307 | * If it's a lowmem zone, reserve a number of pages | 4619 | * If it's a lowmem zone, reserve a number of pages |
| 4308 | * proportionate to the zone's size. | 4620 | * proportionate to the zone's size. |
| 4309 | */ | 4621 | */ |
| 4310 | zone->pages_min = tmp; | 4622 | zone->watermark[WMARK_MIN] = tmp; |
| 4311 | } | 4623 | } |
| 4312 | 4624 | ||
| 4313 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4625 | zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); |
| 4314 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4626 | zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); |
| 4315 | setup_zone_migrate_reserve(zone); | 4627 | setup_zone_migrate_reserve(zone); |
| 4316 | spin_unlock_irqrestore(&zone->lock, flags); | 4628 | spin_unlock_irqrestore(&zone->lock, flags); |
| 4317 | } | 4629 | } |
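
setup_per_zone_wmarks() (the renamed setup_per_zone_pages_min) derives all three watermarks from each zone's proportional share of min_free_kbytes: WMARK_MIN gets the share itself, WMARK_LOW adds a quarter of it, WMARK_HIGH adds a half. A sketch of that split with example zone sizes; the highmem special case shown in the hunk is skipped.

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
    unsigned long min_free_kbytes = 5740;                  /* example value */
    unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
    unsigned long lowmem_pages = 1UL << 20;                /* total lowmem, example */
    unsigned long zone_pages  = 1UL << 18;                 /* this zone, example   */

    /* Zone's proportional share of the global minimum. */
    unsigned long tmp = pages_min * zone_pages / lowmem_pages;

    unsigned long wmark_min  = tmp;
    unsigned long wmark_low  = wmark_min + (tmp >> 2);
    unsigned long wmark_high = wmark_min + (tmp >> 1);

    printf("min %lu, low %lu, high %lu pages\n",
           wmark_min, wmark_low, wmark_high);
    return 0;
}
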
| @@ -4320,9 +4632,7 @@ void setup_per_zone_pages_min(void) | |||
| 4320 | calculate_totalreserve_pages(); | 4632 | calculate_totalreserve_pages(); |
| 4321 | } | 4633 | } |
| 4322 | 4634 | ||
| 4323 | /** | 4635 | /* |
| 4324 | * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. | ||
| 4325 | * | ||
| 4326 | * The inactive anon list should be small enough that the VM never has to | 4636 | * The inactive anon list should be small enough that the VM never has to |
| 4327 | * do too much work, but large enough that each inactive page has a chance | 4637 | * do too much work, but large enough that each inactive page has a chance |
| 4328 | * to be referenced again before it is swapped out. | 4638 | * to be referenced again before it is swapped out. |
| @@ -4343,21 +4653,26 @@ void setup_per_zone_pages_min(void) | |||
| 4343 | * 1TB 101 10GB | 4653 | * 1TB 101 10GB |
| 4344 | * 10TB 320 32GB | 4654 | * 10TB 320 32GB |
| 4345 | */ | 4655 | */ |
| 4346 | static void setup_per_zone_inactive_ratio(void) | 4656 | void calculate_zone_inactive_ratio(struct zone *zone) |
| 4347 | { | 4657 | { |
| 4348 | struct zone *zone; | 4658 | unsigned int gb, ratio; |
| 4349 | 4659 | ||
| 4350 | for_each_zone(zone) { | 4660 | /* Zone size in gigabytes */ |
| 4351 | unsigned int gb, ratio; | 4661 | gb = zone->present_pages >> (30 - PAGE_SHIFT); |
| 4352 | 4662 | if (gb) | |
| 4353 | /* Zone size in gigabytes */ | ||
| 4354 | gb = zone->present_pages >> (30 - PAGE_SHIFT); | ||
| 4355 | ratio = int_sqrt(10 * gb); | 4663 | ratio = int_sqrt(10 * gb); |
| 4356 | if (!ratio) | 4664 | else |
| 4357 | ratio = 1; | 4665 | ratio = 1; |
| 4358 | 4666 | ||
| 4359 | zone->inactive_ratio = ratio; | 4667 | zone->inactive_ratio = ratio; |
| 4360 | } | 4668 | } |
| 4669 | |||
| 4670 | static void __init setup_per_zone_inactive_ratio(void) | ||
| 4671 | { | ||
| 4672 | struct zone *zone; | ||
| 4673 | |||
| 4674 | for_each_zone(zone) | ||
| 4675 | calculate_zone_inactive_ratio(zone); | ||
| 4361 | } | 4676 | } |
| 4362 | 4677 | ||
| 4363 | /* | 4678 | /* |
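
calculate_zone_inactive_ratio() implements the table in the comment above: the ratio is the integer square root of ten times the zone size in gigabytes, with a floor of one. Reproducing a few rows (sqrt() from libm stands in for the kernel's int_sqrt(); build with -lm):

#include <math.h>
#include <stdio.h>

static unsigned int int_sqrt_sketch(unsigned long x)
{
    return (unsigned int)sqrt((double)x);
}

int main(void)
{
    unsigned long sizes_gb[] = { 1, 10, 100, 1024, 10240 };  /* 1GB .. 10TB */
    size_t n = sizeof(sizes_gb) / sizeof(sizes_gb[0]);

    for (size_t i = 0; i < n; i++) {
        unsigned long gb = sizes_gb[i];
        unsigned int ratio = gb ? int_sqrt_sketch(10 * gb) : 1;
        printf("%6luGB -> inactive_ratio %u\n", gb, ratio);
    }
    return 0;
}
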
| @@ -4384,7 +4699,7 @@ static void setup_per_zone_inactive_ratio(void) | |||
| 4384 | * 8192MB: 11584k | 4699 | * 8192MB: 11584k |
| 4385 | * 16384MB: 16384k | 4700 | * 16384MB: 16384k |
| 4386 | */ | 4701 | */ |
| 4387 | static int __init init_per_zone_pages_min(void) | 4702 | static int __init init_per_zone_wmark_min(void) |
| 4388 | { | 4703 | { |
| 4389 | unsigned long lowmem_kbytes; | 4704 | unsigned long lowmem_kbytes; |
| 4390 | 4705 | ||
| @@ -4395,12 +4710,12 @@ static int __init init_per_zone_pages_min(void) | |||
| 4395 | min_free_kbytes = 128; | 4710 | min_free_kbytes = 128; |
| 4396 | if (min_free_kbytes > 65536) | 4711 | if (min_free_kbytes > 65536) |
| 4397 | min_free_kbytes = 65536; | 4712 | min_free_kbytes = 65536; |
| 4398 | setup_per_zone_pages_min(); | 4713 | setup_per_zone_wmarks(); |
| 4399 | setup_per_zone_lowmem_reserve(); | 4714 | setup_per_zone_lowmem_reserve(); |
| 4400 | setup_per_zone_inactive_ratio(); | 4715 | setup_per_zone_inactive_ratio(); |
| 4401 | return 0; | 4716 | return 0; |
| 4402 | } | 4717 | } |
| 4403 | module_init(init_per_zone_pages_min) | 4718 | module_init(init_per_zone_wmark_min) |
| 4404 | 4719 | ||
| 4405 | /* | 4720 | /* |
| 4406 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so | 4721 | * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
| @@ -4408,22 +4723,22 @@ module_init(init_per_zone_pages_min) | |||
| 4408 | * changes. | 4723 | * changes. |
| 4409 | */ | 4724 | */ |
| 4410 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, | 4725 | int min_free_kbytes_sysctl_handler(ctl_table *table, int write, |
| 4411 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4726 | void __user *buffer, size_t *length, loff_t *ppos) |
| 4412 | { | 4727 | { |
| 4413 | proc_dointvec(table, write, file, buffer, length, ppos); | 4728 | proc_dointvec(table, write, buffer, length, ppos); |
| 4414 | if (write) | 4729 | if (write) |
| 4415 | setup_per_zone_pages_min(); | 4730 | setup_per_zone_wmarks(); |
| 4416 | return 0; | 4731 | return 0; |
| 4417 | } | 4732 | } |
| 4418 | 4733 | ||
| 4419 | #ifdef CONFIG_NUMA | 4734 | #ifdef CONFIG_NUMA |
| 4420 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | 4735 | int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, |
| 4421 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4736 | void __user *buffer, size_t *length, loff_t *ppos) |
| 4422 | { | 4737 | { |
| 4423 | struct zone *zone; | 4738 | struct zone *zone; |
| 4424 | int rc; | 4739 | int rc; |
| 4425 | 4740 | ||
| 4426 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4741 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 4427 | if (rc) | 4742 | if (rc) |
| 4428 | return rc; | 4743 | return rc; |
| 4429 | 4744 | ||
| @@ -4434,12 +4749,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 4434 | } | 4749 | } |
| 4435 | 4750 | ||
| 4436 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | 4751 | int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, |
| 4437 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4752 | void __user *buffer, size_t *length, loff_t *ppos) |
| 4438 | { | 4753 | { |
| 4439 | struct zone *zone; | 4754 | struct zone *zone; |
| 4440 | int rc; | 4755 | int rc; |
| 4441 | 4756 | ||
| 4442 | rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4757 | rc = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 4443 | if (rc) | 4758 | if (rc) |
| 4444 | return rc; | 4759 | return rc; |
| 4445 | 4760 | ||
| @@ -4456,13 +4771,13 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 4456 | * whenever sysctl_lowmem_reserve_ratio changes. | 4771 | * whenever sysctl_lowmem_reserve_ratio changes. |
| 4457 | * | 4772 | * |
| 4458 | * The reserve ratio obviously has absolutely no relation with the | 4773 | * The reserve ratio obviously has absolutely no relation with the |
| 4459 | * pages_min watermarks. The lowmem reserve ratio can only make sense | 4774 | * minimum watermarks. The lowmem reserve ratio can only make sense |
| 4460 | * if in function of the boot time zone sizes. | 4775 | * if in function of the boot time zone sizes. |
| 4461 | */ | 4776 | */ |
| 4462 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | 4777 | int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, |
| 4463 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4778 | void __user *buffer, size_t *length, loff_t *ppos) |
| 4464 | { | 4779 | { |
| 4465 | proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4780 | proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 4466 | setup_per_zone_lowmem_reserve(); | 4781 | setup_per_zone_lowmem_reserve(); |
| 4467 | return 0; | 4782 | return 0; |
| 4468 | } | 4783 | } |
| @@ -4474,16 +4789,16 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
| 4474 | */ | 4789 | */ |
| 4475 | 4790 | ||
| 4476 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | 4791 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, |
| 4477 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | 4792 | void __user *buffer, size_t *length, loff_t *ppos) |
| 4478 | { | 4793 | { |
| 4479 | struct zone *zone; | 4794 | struct zone *zone; |
| 4480 | unsigned int cpu; | 4795 | unsigned int cpu; |
| 4481 | int ret; | 4796 | int ret; |
| 4482 | 4797 | ||
| 4483 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | 4798 | ret = proc_dointvec_minmax(table, write, buffer, length, ppos); |
| 4484 | if (!write || (ret == -EINVAL)) | 4799 | if (!write || (ret == -EINVAL)) |
| 4485 | return ret; | 4800 | return ret; |
| 4486 | for_each_zone(zone) { | 4801 | for_each_populated_zone(zone) { |
| 4487 | for_each_online_cpu(cpu) { | 4802 | for_each_online_cpu(cpu) { |
| 4488 | unsigned long high; | 4803 | unsigned long high; |
| 4489 | high = zone->present_pages / percpu_pagelist_fraction; | 4804 | high = zone->present_pages / percpu_pagelist_fraction; |
| @@ -4540,7 +4855,14 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 4540 | numentries <<= (PAGE_SHIFT - scale); | 4855 | numentries <<= (PAGE_SHIFT - scale); |
| 4541 | 4856 | ||
| 4542 | /* Make sure we've got at least a 0-order allocation.. */ | 4857 | /* Make sure we've got at least a 0-order allocation.. */ |
| 4543 | if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | 4858 | if (unlikely(flags & HASH_SMALL)) { |
| 4859 | /* Makes no sense without HASH_EARLY */ | ||
| 4860 | WARN_ON(!(flags & HASH_EARLY)); | ||
| 4861 | if (!(numentries >> *_hash_shift)) { | ||
| 4862 | numentries = 1UL << *_hash_shift; | ||
| 4863 | BUG_ON(!numentries); | ||
| 4864 | } | ||
| 4865 | } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) | ||
| 4544 | numentries = PAGE_SIZE / bucketsize; | 4866 | numentries = PAGE_SIZE / bucketsize; |
| 4545 | } | 4867 | } |
| 4546 | numentries = roundup_pow_of_two(numentries); | 4868 | numentries = roundup_pow_of_two(numentries); |
| @@ -4563,22 +4885,14 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
| 4563 | else if (hashdist) | 4885 | else if (hashdist) |
| 4564 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); | 4886 | table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); |
| 4565 | else { | 4887 | else { |
| 4566 | unsigned long order = get_order(size); | ||
| 4567 | table = (void*) __get_free_pages(GFP_ATOMIC, order); | ||
| 4568 | /* | 4888 | /* |
| 4569 | * If bucketsize is not a power-of-two, we may free | 4889 | * If bucketsize is not a power-of-two, we may free |
| 4570 | * some pages at the end of hash table. | 4890 | * some pages at the end of hash table which |
| 4891 | * alloc_pages_exact() automatically does | ||
| 4571 | */ | 4892 | */ |
| 4572 | if (table) { | 4893 | if (get_order(size) < MAX_ORDER) { |
| 4573 | unsigned long alloc_end = (unsigned long)table + | 4894 | table = alloc_pages_exact(size, GFP_ATOMIC); |
| 4574 | (PAGE_SIZE << order); | 4895 | kmemleak_alloc(table, size, 1, GFP_ATOMIC); |
| 4575 | unsigned long used = (unsigned long)table + | ||
| 4576 | PAGE_ALIGN(size); | ||
| 4577 | split_page(virt_to_page(table), order); | ||
| 4578 | while (used < alloc_end) { | ||
| 4579 | free_page(used); | ||
| 4580 | used += PAGE_SIZE; | ||
| 4581 | } | ||
| 4582 | } | 4896 | } |
| 4583 | } | 4897 | } |
| 4584 | } while (!table && size > PAGE_SIZE && --log2qty); | 4898 | } while (!table && size > PAGE_SIZE && --log2qty); |
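
The alloc_large_system_hash() hunk above replaces the hand-rolled split_page()/free loop with alloc_pages_exact() and clamps HASH_SMALL tables to 1 << *_hash_shift entries. The bucket-count sizing itself, rounding the entry count up to a power of two and deriving the shift, can be sketched as below with example numbers:

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long n)
{
    unsigned long p = 1;
    while (p < n)
        p <<= 1;
    return p;
}

int main(void)
{
    unsigned long bucketsize = 16;                /* example bucket size */
    unsigned long numentries = 3000;              /* example entry count */

    numentries = roundup_pow_of_two(numentries);  /* -> 4096 */

    unsigned log2qty = 0;
    while ((1UL << log2qty) < numentries)
        log2qty++;

    printf("%lu buckets (shift %u), table %lu bytes\n",
           numentries, log2qty, numentries * bucketsize);
    return 0;
}
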
| @@ -4690,13 +5004,16 @@ int set_migratetype_isolate(struct page *page) | |||
| 4690 | struct zone *zone; | 5004 | struct zone *zone; |
| 4691 | unsigned long flags; | 5005 | unsigned long flags; |
| 4692 | int ret = -EBUSY; | 5006 | int ret = -EBUSY; |
| 5007 | int zone_idx; | ||
| 4693 | 5008 | ||
| 4694 | zone = page_zone(page); | 5009 | zone = page_zone(page); |
| 5010 | zone_idx = zone_idx(zone); | ||
| 4695 | spin_lock_irqsave(&zone->lock, flags); | 5011 | spin_lock_irqsave(&zone->lock, flags); |
| 4696 | /* | 5012 | /* |
| 4697 | * In future, more migrate types will be able to be isolation target. | 5013 | * In future, more migrate types will be able to be isolation target. |
| 4698 | */ | 5014 | */ |
| 4699 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | 5015 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && |
| 5016 | zone_idx != ZONE_MOVABLE) | ||
| 4700 | goto out; | 5017 | goto out; |
| 4701 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | 5018 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); |
| 4702 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | 5019 | move_freepages_block(zone, page, MIGRATE_ISOLATE); |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index ceecfbb143fa..3d535d594826 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
| @@ -69,7 +69,7 @@ static int __init alloc_node_page_cgroup(int nid) | |||
| 69 | return 0; | 69 | return 0; |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | void __init page_cgroup_init(void) | 72 | void __init page_cgroup_init_flatmem(void) |
| 73 | { | 73 | { |
| 74 | 74 | ||
| 75 | int nid, fail; | 75 | int nid, fail; |
| @@ -83,12 +83,12 @@ void __init page_cgroup_init(void) | |||
| 83 | goto fail; | 83 | goto fail; |
| 84 | } | 84 | } |
| 85 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | 85 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); |
| 86 | printk(KERN_INFO "please try cgroup_disable=memory option if you" | 86 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you" |
| 87 | " don't want\n"); | 87 | " don't want memory cgroups\n"); |
| 88 | return; | 88 | return; |
| 89 | fail: | 89 | fail: |
| 90 | printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); | 90 | printk(KERN_CRIT "allocation of page_cgroup failed.\n"); |
| 91 | printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); | 91 | printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n"); |
| 92 | panic("Out of memory"); | 92 | panic("Out of memory"); |
| 93 | } | 93 | } |
| 94 | 94 | ||
| @@ -99,6 +99,8 @@ struct page_cgroup *lookup_page_cgroup(struct page *page) | |||
| 99 | unsigned long pfn = page_to_pfn(page); | 99 | unsigned long pfn = page_to_pfn(page); |
| 100 | struct mem_section *section = __pfn_to_section(pfn); | 100 | struct mem_section *section = __pfn_to_section(pfn); |
| 101 | 101 | ||
| 102 | if (!section->page_cgroup) | ||
| 103 | return NULL; | ||
| 102 | return section->page_cgroup + pfn; | 104 | return section->page_cgroup + pfn; |
| 103 | } | 105 | } |
| 104 | 106 | ||
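With the NULL check added above, lookup_page_cgroup() can return NULL when a section's page_cgroup table was never set up, so callers are expected to test the result. A minimal caller sketch (the surrounding function is hypothetical, not part of this patch):

	struct page_cgroup *pc = lookup_page_cgroup(page);

	if (unlikely(!pc))	/* no page_cgroup table for this section */
		return NULL;	/* illustrative bail-out; real callers choose their own policy */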
| @@ -113,15 +115,16 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) | |||
| 113 | if (!section->page_cgroup) { | 115 | if (!section->page_cgroup) { |
| 114 | nid = page_to_nid(pfn_to_page(pfn)); | 116 | nid = page_to_nid(pfn_to_page(pfn)); |
| 115 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; | 117 | table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; |
| 116 | if (slab_is_available()) { | 118 | VM_BUG_ON(!slab_is_available()); |
| 119 | if (node_state(nid, N_HIGH_MEMORY)) { | ||
| 117 | base = kmalloc_node(table_size, | 120 | base = kmalloc_node(table_size, |
| 118 | GFP_KERNEL | __GFP_NOWARN, nid); | 121 | GFP_KERNEL | __GFP_NOWARN, nid); |
| 119 | if (!base) | 122 | if (!base) |
| 120 | base = vmalloc_node(table_size, nid); | 123 | base = vmalloc_node(table_size, nid); |
| 121 | } else { | 124 | } else { |
| 122 | base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), | 125 | base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); |
| 123 | table_size, | 126 | if (!base) |
| 124 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 127 | base = vmalloc(table_size); |
| 125 | } | 128 | } |
| 126 | } else { | 129 | } else { |
| 127 | /* | 130 | /* |
| @@ -257,14 +260,14 @@ void __init page_cgroup_init(void) | |||
| 257 | fail = init_section_page_cgroup(pfn); | 260 | fail = init_section_page_cgroup(pfn); |
| 258 | } | 261 | } |
| 259 | if (fail) { | 262 | if (fail) { |
| 260 | printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); | 263 | printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); |
| 261 | panic("Out of memory"); | 264 | panic("Out of memory"); |
| 262 | } else { | 265 | } else { |
| 263 | hotplug_memory_notifier(page_cgroup_callback, 0); | 266 | hotplug_memory_notifier(page_cgroup_callback, 0); |
| 264 | } | 267 | } |
| 265 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); | 268 | printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); |
| 266 | printk(KERN_INFO "please try cgroup_disable=memory option if you don't" | 269 | printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" |
| 267 | " want\n"); | 270 | " want memory cgroups\n"); |
| 268 | } | 271 | } |
| 269 | 272 | ||
| 270 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | 273 | void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) |
| @@ -285,12 +288,8 @@ struct swap_cgroup_ctrl { | |||
| 285 | 288 | ||
| 286 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; | 289 | struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; |
| 287 | 290 | ||
| 288 | /* | ||
| 289 | * This 8bytes seems big..maybe we can reduce this when we can use "id" for | ||
| 290 | * cgroup rather than pointer. | ||
| 291 | */ | ||
| 292 | struct swap_cgroup { | 291 | struct swap_cgroup { |
| 293 | struct mem_cgroup *val; | 292 | unsigned short id; |
| 294 | }; | 293 | }; |
| 295 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) | 294 | #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) |
| 296 | #define SC_POS_MASK (SC_PER_PAGE - 1) | 295 | #define SC_POS_MASK (SC_PER_PAGE - 1) |
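Storing an unsigned short CSS ID instead of a mem_cgroup pointer shrinks each entry from 8 bytes (on 64-bit) to 2, so SC_PER_PAGE grows accordingly. A sketch of how an entry is located from a swap offset, mirroring the idx/pos arithmetic used by swap_cgroup_record() and lookup_swap_cgroup() below (the helper name is hypothetical):

	static struct swap_cgroup *sc_for_offset(struct swap_cgroup_ctrl *ctrl,
						 unsigned long offset)
	{
		unsigned long idx = offset / SC_PER_PAGE;	/* which map page */
		unsigned long pos = offset & SC_POS_MASK;	/* slot within that page */

		return (struct swap_cgroup *)page_address(ctrl->map[idx]) + pos;
	}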
| @@ -318,8 +317,6 @@ static int swap_cgroup_prepare(int type) | |||
| 318 | struct swap_cgroup_ctrl *ctrl; | 317 | struct swap_cgroup_ctrl *ctrl; |
| 319 | unsigned long idx, max; | 318 | unsigned long idx, max; |
| 320 | 319 | ||
| 321 | if (!do_swap_account) | ||
| 322 | return 0; | ||
| 323 | ctrl = &swap_cgroup_ctrl[type]; | 320 | ctrl = &swap_cgroup_ctrl[type]; |
| 324 | 321 | ||
| 325 | for (idx = 0; idx < ctrl->length; idx++) { | 322 | for (idx = 0; idx < ctrl->length; idx++) { |
| @@ -342,10 +339,10 @@ not_enough_page: | |||
| 342 | * @ent: swap entry to be recorded into | 339 | * @ent: swap entry to be recorded into |
| 343 | * @mem: mem_cgroup to be recorded | 340 | * @mem: mem_cgroup to be recorded |
| 344 | * | 341 | * |
| 345 | * Returns old value at success, NULL at failure. | 342 | * Returns old value at success, 0 at failure. |
| 346 | * (Of course, old value can be NULL.) | 343 | * (Of course, old value can be 0.) |
| 347 | */ | 344 | */ |
| 348 | struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) | 345 | unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) |
| 349 | { | 346 | { |
| 350 | int type = swp_type(ent); | 347 | int type = swp_type(ent); |
| 351 | unsigned long offset = swp_offset(ent); | 348 | unsigned long offset = swp_offset(ent); |
| @@ -354,18 +351,15 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) | |||
| 354 | struct swap_cgroup_ctrl *ctrl; | 351 | struct swap_cgroup_ctrl *ctrl; |
| 355 | struct page *mappage; | 352 | struct page *mappage; |
| 356 | struct swap_cgroup *sc; | 353 | struct swap_cgroup *sc; |
| 357 | struct mem_cgroup *old; | 354 | unsigned short old; |
| 358 | |||
| 359 | if (!do_swap_account) | ||
| 360 | return NULL; | ||
| 361 | 355 | ||
| 362 | ctrl = &swap_cgroup_ctrl[type]; | 356 | ctrl = &swap_cgroup_ctrl[type]; |
| 363 | 357 | ||
| 364 | mappage = ctrl->map[idx]; | 358 | mappage = ctrl->map[idx]; |
| 365 | sc = page_address(mappage); | 359 | sc = page_address(mappage); |
| 366 | sc += pos; | 360 | sc += pos; |
| 367 | old = sc->val; | 361 | old = sc->id; |
| 368 | sc->val = mem; | 362 | sc->id = id; |
| 369 | 363 | ||
| 370 | return old; | 364 | return old; |
| 371 | } | 365 | } |
| @@ -374,9 +368,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) | |||
| 374 | * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry | 368 | * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry |
| 375 | * @ent: swap entry to be looked up. | 369 | * @ent: swap entry to be looked up. |
| 376 | * | 370 | * |
| 377 | * Returns pointer to mem_cgroup at success. NULL at failure. | 371 | * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID) |
| 378 | */ | 372 | */ |
| 379 | struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) | 373 | unsigned short lookup_swap_cgroup(swp_entry_t ent) |
| 380 | { | 374 | { |
| 381 | int type = swp_type(ent); | 375 | int type = swp_type(ent); |
| 382 | unsigned long offset = swp_offset(ent); | 376 | unsigned long offset = swp_offset(ent); |
| @@ -385,16 +379,13 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) | |||
| 385 | struct swap_cgroup_ctrl *ctrl; | 379 | struct swap_cgroup_ctrl *ctrl; |
| 386 | struct page *mappage; | 380 | struct page *mappage; |
| 387 | struct swap_cgroup *sc; | 381 | struct swap_cgroup *sc; |
| 388 | struct mem_cgroup *ret; | 382 | unsigned short ret; |
| 389 | |||
| 390 | if (!do_swap_account) | ||
| 391 | return NULL; | ||
| 392 | 383 | ||
| 393 | ctrl = &swap_cgroup_ctrl[type]; | 384 | ctrl = &swap_cgroup_ctrl[type]; |
| 394 | mappage = ctrl->map[idx]; | 385 | mappage = ctrl->map[idx]; |
| 395 | sc = page_address(mappage); | 386 | sc = page_address(mappage); |
| 396 | sc += pos; | 387 | sc += pos; |
| 397 | ret = sc->val; | 388 | ret = sc->id; |
| 398 | return ret; | 389 | return ret; |
| 399 | } | 390 | } |
| 400 | 391 | ||
| @@ -430,13 +421,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) | |||
| 430 | } | 421 | } |
| 431 | mutex_unlock(&swap_cgroup_mutex); | 422 | mutex_unlock(&swap_cgroup_mutex); |
| 432 | 423 | ||
| 433 | printk(KERN_INFO | ||
| 434 | "swap_cgroup: uses %ld bytes of vmalloc for pointer array space" | ||
| 435 | " and %ld bytes to hold mem_cgroup pointers on swap\n", | ||
| 436 | array_size, length * PAGE_SIZE); | ||
| 437 | printk(KERN_INFO | ||
| 438 | "swap_cgroup can be disabled by noswapaccount boot option.\n"); | ||
| 439 | |||
| 440 | return 0; | 424 | return 0; |
| 441 | nomem: | 425 | nomem: |
| 442 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); | 426 | printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); |
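Both record and lookup now traffic in unsigned short CSS IDs, with 0 meaning "nothing recorded". A usage sketch; the swap-out/swap-in framing and the memcg variable are hypothetical, and css_id() is assumed to be the helper used to obtain the ID:

	unsigned short old, id;

	/* swap-out side: remember which mem_cgroup owned the page */
	id = css_id(&memcg->css);
	old = swap_cgroup_record(ent, id);	/* previous ID, 0 if none was set */

	/* swap-in side: 0 means no owner was recorded for this entry */
	id = lookup_swap_cgroup(ent);
	if (!id)
		return;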
diff --git a/mm/page_io.c b/mm/page_io.c index 3023c475e041..c6f3e5071de3 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -120,7 +120,7 @@ out: | |||
| 120 | return ret; | 120 | return ret; |
| 121 | } | 121 | } |
| 122 | 122 | ||
| 123 | int swap_readpage(struct file *file, struct page *page) | 123 | int swap_readpage(struct page *page) |
| 124 | { | 124 | { |
| 125 | struct bio *bio; | 125 | struct bio *bio; |
| 126 | int ret = 0; | 126 | int ret = 0; |
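swap_readpage() drops its unused file argument, so callers now pass only the page. A call-site sketch; the locking expectation is an assumption based on the usual swap I/O rules, not something spelled out in this hunk:

	int err;

	/* page is assumed to be locked and already in the swap cache */
	err = swap_readpage(page);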
diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 118905e3d788..000000000000 --- a/mm/pdflush.c +++ /dev/null | |||
| @@ -1,251 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * mm/pdflush.c - worker threads for writing back filesystem data | ||
| 3 | * | ||
| 4 | * Copyright (C) 2002, Linus Torvalds. | ||
| 5 | * | ||
| 6 | * 09Apr2002 Andrew Morton | ||
| 7 | * Initial version | ||
| 8 | * 29Feb2004 kaos@sgi.com | ||
| 9 | * Move worker thread creation to kthread to avoid chewing | ||
| 10 | * up stack space with nested calls to kernel_thread. | ||
| 11 | */ | ||
| 12 | |||
| 13 | #include <linux/sched.h> | ||
| 14 | #include <linux/list.h> | ||
| 15 | #include <linux/signal.h> | ||
| 16 | #include <linux/spinlock.h> | ||
| 17 | #include <linux/gfp.h> | ||
| 18 | #include <linux/init.h> | ||
| 19 | #include <linux/module.h> | ||
| 20 | #include <linux/fs.h> /* Needed by writeback.h */ | ||
| 21 | #include <linux/writeback.h> /* Prototypes pdflush_operation() */ | ||
| 22 | #include <linux/kthread.h> | ||
| 23 | #include <linux/cpuset.h> | ||
| 24 | #include <linux/freezer.h> | ||
| 25 | |||
| 26 | |||
| 27 | /* | ||
| 28 | * Minimum and maximum number of pdflush instances | ||
| 29 | */ | ||
| 30 | #define MIN_PDFLUSH_THREADS 2 | ||
| 31 | #define MAX_PDFLUSH_THREADS 8 | ||
| 32 | |||
| 33 | static void start_one_pdflush_thread(void); | ||
| 34 | |||
| 35 | |||
| 36 | /* | ||
| 37 | * The pdflush threads are worker threads for writing back dirty data. | ||
| 38 | * Ideally, we'd like one thread per active disk spindle. But the disk | ||
| 39 | * topology is very hard to divine at this level. Instead, we take | ||
| 40 | * care in various places to prevent more than one pdflush thread from | ||
| 41 | * performing writeback against a single filesystem. pdflush threads | ||
| 42 | * have the PF_FLUSHER flag set in current->flags to aid in this. | ||
| 43 | */ | ||
| 44 | |||
| 45 | /* | ||
| 46 | * All the pdflush threads. Protected by pdflush_lock | ||
| 47 | */ | ||
| 48 | static LIST_HEAD(pdflush_list); | ||
| 49 | static DEFINE_SPINLOCK(pdflush_lock); | ||
| 50 | |||
| 51 | /* | ||
| 52 | * The count of currently-running pdflush threads. Protected | ||
| 53 | * by pdflush_lock. | ||
| 54 | * | ||
| 55 | * Readable by sysctl, but not writable. Published to userspace at | ||
| 56 | * /proc/sys/vm/nr_pdflush_threads. | ||
| 57 | */ | ||
| 58 | int nr_pdflush_threads = 0; | ||
| 59 | |||
| 60 | /* | ||
| 61 | * The time at which the pdflush thread pool last went empty | ||
| 62 | */ | ||
| 63 | static unsigned long last_empty_jifs; | ||
| 64 | |||
| 65 | /* | ||
| 66 | * The pdflush thread. | ||
| 67 | * | ||
| 68 | * Thread pool management algorithm: | ||
| 69 | * | ||
| 70 | * - The minimum and maximum number of pdflush instances are bound | ||
| 71 | * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. | ||
| 72 | * | ||
| 73 | * - If there have been no idle pdflush instances for 1 second, create | ||
| 74 | * a new one. | ||
| 75 | * | ||
| 76 | * - If the least-recently-went-to-sleep pdflush thread has been asleep | ||
| 77 | * for more than one second, terminate a thread. | ||
| 78 | */ | ||
| 79 | |||
| 80 | /* | ||
| 81 | * A structure for passing work to a pdflush thread. Also for passing | ||
| 82 | * state information between pdflush threads. Protected by pdflush_lock. | ||
| 83 | */ | ||
| 84 | struct pdflush_work { | ||
| 85 | struct task_struct *who; /* The thread */ | ||
| 86 | void (*fn)(unsigned long); /* A callback function */ | ||
| 87 | unsigned long arg0; /* An argument to the callback */ | ||
| 88 | struct list_head list; /* On pdflush_list, when idle */ | ||
| 89 | unsigned long when_i_went_to_sleep; | ||
| 90 | }; | ||
| 91 | |||
| 92 | static int __pdflush(struct pdflush_work *my_work) | ||
| 93 | { | ||
| 94 | current->flags |= PF_FLUSHER | PF_SWAPWRITE; | ||
| 95 | set_freezable(); | ||
| 96 | my_work->fn = NULL; | ||
| 97 | my_work->who = current; | ||
| 98 | INIT_LIST_HEAD(&my_work->list); | ||
| 99 | |||
| 100 | spin_lock_irq(&pdflush_lock); | ||
| 101 | nr_pdflush_threads++; | ||
| 102 | for ( ; ; ) { | ||
| 103 | struct pdflush_work *pdf; | ||
| 104 | |||
| 105 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 106 | list_move(&my_work->list, &pdflush_list); | ||
| 107 | my_work->when_i_went_to_sleep = jiffies; | ||
| 108 | spin_unlock_irq(&pdflush_lock); | ||
| 109 | schedule(); | ||
| 110 | try_to_freeze(); | ||
| 111 | spin_lock_irq(&pdflush_lock); | ||
| 112 | if (!list_empty(&my_work->list)) { | ||
| 113 | /* | ||
| 114 | * Someone woke us up, but without removing our control | ||
| 115 | * structure from the global list. swsusp will do this | ||
| 116 | * in try_to_freeze()->refrigerator(). Handle it. | ||
| 117 | */ | ||
| 118 | my_work->fn = NULL; | ||
| 119 | continue; | ||
| 120 | } | ||
| 121 | if (my_work->fn == NULL) { | ||
| 122 | printk("pdflush: bogus wakeup\n"); | ||
| 123 | continue; | ||
| 124 | } | ||
| 125 | spin_unlock_irq(&pdflush_lock); | ||
| 126 | |||
| 127 | (*my_work->fn)(my_work->arg0); | ||
| 128 | |||
| 129 | /* | ||
| 130 | * Thread creation: For how long have there been zero | ||
| 131 | * available threads? | ||
| 132 | */ | ||
| 133 | if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { | ||
| 134 | /* unlocked list_empty() test is OK here */ | ||
| 135 | if (list_empty(&pdflush_list)) { | ||
| 136 | /* unlocked test is OK here */ | ||
| 137 | if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) | ||
| 138 | start_one_pdflush_thread(); | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | spin_lock_irq(&pdflush_lock); | ||
| 143 | my_work->fn = NULL; | ||
| 144 | |||
| 145 | /* | ||
| 146 | * Thread destruction: For how long has the sleepiest | ||
| 147 | * thread slept? | ||
| 148 | */ | ||
| 149 | if (list_empty(&pdflush_list)) | ||
| 150 | continue; | ||
| 151 | if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) | ||
| 152 | continue; | ||
| 153 | pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); | ||
| 154 | if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { | ||
| 155 | /* Limit exit rate */ | ||
| 156 | pdf->when_i_went_to_sleep = jiffies; | ||
| 157 | break; /* exeunt */ | ||
| 158 | } | ||
| 159 | } | ||
| 160 | nr_pdflush_threads--; | ||
| 161 | spin_unlock_irq(&pdflush_lock); | ||
| 162 | return 0; | ||
| 163 | } | ||
| 164 | |||
| 165 | /* | ||
| 166 | * Of course, my_work wants to be just a local in __pdflush(). It is | ||
| 167 | * separated out in this manner to hopefully prevent the compiler from | ||
| 168 | * performing unfortunate optimisations against the auto variables. Because | ||
| 169 | * these are visible to other tasks and CPUs. (No problem has actually | ||
| 170 | * been observed. This is just paranoia). | ||
| 171 | */ | ||
| 172 | static int pdflush(void *dummy) | ||
| 173 | { | ||
| 174 | struct pdflush_work my_work; | ||
| 175 | cpumask_var_t cpus_allowed; | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Since the caller doesn't even check kthread_run() worked, let's not | ||
| 179 | * freak out too much if this fails. | ||
| 180 | */ | ||
| 181 | if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { | ||
| 182 | printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); | ||
| 183 | return 0; | ||
| 184 | } | ||
| 185 | |||
| 186 | /* | ||
| 187 | * pdflush can spend a lot of time doing encryption via dm-crypt. We | ||
| 188 | * don't want to do that at keventd's priority. | ||
| 189 | */ | ||
| 190 | set_user_nice(current, 0); | ||
| 191 | |||
| 192 | /* | ||
| 193 | * Some configs put our parent kthread in a limited cpuset, | ||
| 194 | * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. | ||
| 195 | * Our needs are more modest - cut back to our cpusets cpus_allowed. | ||
| 196 | * This is needed as pdflush's are dynamically created and destroyed. | ||
| 197 | * The boottime pdflush's are easily placed w/o these 2 lines. | ||
| 198 | */ | ||
| 199 | cpuset_cpus_allowed(current, cpus_allowed); | ||
| 200 | set_cpus_allowed_ptr(current, cpus_allowed); | ||
| 201 | free_cpumask_var(cpus_allowed); | ||
| 202 | |||
| 203 | return __pdflush(&my_work); | ||
| 204 | } | ||
| 205 | |||
| 206 | /* | ||
| 207 | * Attempt to wake up a pdflush thread, and get it to do some work for you. | ||
| 208 | * Returns zero if it indeed managed to find a worker thread, and passed your | ||
| 209 | * payload to it. | ||
| 210 | */ | ||
| 211 | int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) | ||
| 212 | { | ||
| 213 | unsigned long flags; | ||
| 214 | int ret = 0; | ||
| 215 | |||
| 216 | BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ | ||
| 217 | |||
| 218 | spin_lock_irqsave(&pdflush_lock, flags); | ||
| 219 | if (list_empty(&pdflush_list)) { | ||
| 220 | ret = -1; | ||
| 221 | } else { | ||
| 222 | struct pdflush_work *pdf; | ||
| 223 | |||
| 224 | pdf = list_entry(pdflush_list.next, struct pdflush_work, list); | ||
| 225 | list_del_init(&pdf->list); | ||
| 226 | if (list_empty(&pdflush_list)) | ||
| 227 | last_empty_jifs = jiffies; | ||
| 228 | pdf->fn = fn; | ||
| 229 | pdf->arg0 = arg0; | ||
| 230 | wake_up_process(pdf->who); | ||
| 231 | } | ||
| 232 | spin_unlock_irqrestore(&pdflush_lock, flags); | ||
| 233 | |||
| 234 | return ret; | ||
| 235 | } | ||
| 236 | |||
| 237 | static void start_one_pdflush_thread(void) | ||
| 238 | { | ||
| 239 | kthread_run(pdflush, NULL, "pdflush"); | ||
| 240 | } | ||
| 241 | |||
| 242 | static int __init pdflush_init(void) | ||
| 243 | { | ||
| 244 | int i; | ||
| 245 | |||
| 246 | for (i = 0; i < MIN_PDFLUSH_THREADS; i++) | ||
| 247 | start_one_pdflush_thread(); | ||
| 248 | return 0; | ||
| 249 | } | ||
| 250 | |||
| 251 | module_init(pdflush_init); | ||
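For reference, the deleted pdflush_operation() handed a callback to an idle worker thread and returned 0 on success or -1 when no idle thread was available. A caller looked roughly like the sketch below; the callback and the synchronous fallback are hypothetical:

	static void writeback_some(unsigned long nr_pages)
	{
		/* hypothetical callback; ran in pdflush context with PF_FLUSHER set */
	}

	if (pdflush_operation(writeback_some, 1024) < 0)
		writeback_some(1024);	/* no idle pdflush thread, do the work directly */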
diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8fbca12..5adfc268b408 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -8,12 +8,13 @@ | |||
| 8 | * | 8 | * |
| 9 | * This is percpu allocator which can handle both static and dynamic | 9 | * This is percpu allocator which can handle both static and dynamic |
| 10 | * areas. Percpu areas are allocated in chunks in vmalloc area. Each | 10 | * areas. Percpu areas are allocated in chunks in vmalloc area. Each |
| 11 | * chunk is consisted of num_possible_cpus() units and the first chunk | 11 | * chunk consists of a boot-time determined number of units and the |
| 12 | * is used for static percpu variables in the kernel image (special | 12 | * first chunk is used for static percpu variables in the kernel image |
| 13 | * boot time alloc/init handling necessary as these areas need to be | 13 | * (special boot time alloc/init handling necessary as these areas |
| 14 | * brought up before allocation services are running). Unit grows as | 14 | * need to be brought up before allocation services are running). |
| 15 | * necessary and all units grow or shrink in unison. When a chunk is | 15 | * Unit grows as necessary and all units grow or shrink in unison. |
| 16 | * filled up, another chunk is allocated. ie. in vmalloc area | 16 | * When a chunk is filled up, another chunk is allocated. ie. in |
| 17 | * vmalloc area | ||
| 17 | * | 18 | * |
| 18 | * c0 c1 c2 | 19 | * c0 c1 c2 |
| 19 | * ------------------- ------------------- ------------ | 20 | * ------------------- ------------------- ------------ |
| @@ -22,11 +23,13 @@ | |||
| 22 | * | 23 | * |
| 23 | * Allocation is done in offset-size areas of single unit space. Ie, | 24 | * Allocation is done in offset-size areas of single unit space. Ie, |
| 24 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, | 25 | * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, |
| 25 | * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring | 26 | * c1:u1, c1:u2 and c1:u3. On UMA, units correspond directly to |
| 26 | * percpu base registers UNIT_SIZE apart. | 27 | * cpus. On NUMA, the mapping can be non-linear and even sparse. |
| 28 | * Percpu access can be done by configuring percpu base registers | ||
| 29 | * according to cpu to unit mapping and pcpu_unit_size. | ||
| 27 | * | 30 | * |
| 28 | * There are usually many small percpu allocations many of them as | 31 | * There are usually many small percpu allocations many of them being |
| 29 | * small as 4 bytes. The allocator organizes chunks into lists | 32 | * as small as 4 bytes. The allocator organizes chunks into lists |
| 30 | * according to free size and tries to allocate from the fullest one. | 33 | * according to free size and tries to allocate from the fullest one. |
| 31 | * Each chunk keeps the maximum contiguous area size hint which is | 34 | * Each chunk keeps the maximum contiguous area size hint which is |
| 32 | * guaranteed to be equal to or larger than the maximum contiguous | 35 | * guaranteed to be equal to or larger than the maximum contiguous |
| @@ -38,12 +41,12 @@ | |||
| 38 | * region and negative allocated. Allocation inside a chunk is done | 41 | * region and negative allocated. Allocation inside a chunk is done |
| 39 | * by scanning this map sequentially and serving the first matching | 42 | * by scanning this map sequentially and serving the first matching |
| 40 | * entry. This is mostly copied from the percpu_modalloc() allocator. | 43 | * entry. This is mostly copied from the percpu_modalloc() allocator. |
| 41 | * Chunks are also linked into a rb tree to ease address to chunk | 44 | * Chunks can be determined from the address using the index field |
| 42 | * mapping during free. | 45 | * in the page struct. The index field contains a pointer to the chunk. |
| 43 | * | 46 | * |
| 44 | * To use this allocator, arch code should do the following. | 47 | * To use this allocator, arch code should do the following. |
| 45 | * | 48 | * |
| 46 | * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA | 49 | * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA |
| 47 | * | 50 | * |
| 48 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate | 51 | * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate |
| 49 | * regular address to percpu pointer and back if they need to be | 52 | * regular address to percpu pointer and back if they need to be |
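The rewritten header comment describes percpu addressing in terms of a cpu-to-unit mapping rather than a fixed UNIT_SIZE stride. A sketch of the resulting rule, using the base_addr and pcpu_unit_offsets[] fields introduced later in this patch (the helper name is hypothetical):

	/* address at which @cpu sees the allocation at offset @off inside @chunk */
	static void *pcpu_addr_for(struct pcpu_chunk *chunk, unsigned int cpu, int off)
	{
		return chunk->base_addr + pcpu_unit_offsets[cpu] + off;
	}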
| @@ -55,13 +58,14 @@ | |||
| 55 | 58 | ||
| 56 | #include <linux/bitmap.h> | 59 | #include <linux/bitmap.h> |
| 57 | #include <linux/bootmem.h> | 60 | #include <linux/bootmem.h> |
| 61 | #include <linux/err.h> | ||
| 58 | #include <linux/list.h> | 62 | #include <linux/list.h> |
| 63 | #include <linux/log2.h> | ||
| 59 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
| 60 | #include <linux/module.h> | 65 | #include <linux/module.h> |
| 61 | #include <linux/mutex.h> | 66 | #include <linux/mutex.h> |
| 62 | #include <linux/percpu.h> | 67 | #include <linux/percpu.h> |
| 63 | #include <linux/pfn.h> | 68 | #include <linux/pfn.h> |
| 64 | #include <linux/rbtree.h> | ||
| 65 | #include <linux/slab.h> | 69 | #include <linux/slab.h> |
| 66 | #include <linux/spinlock.h> | 70 | #include <linux/spinlock.h> |
| 67 | #include <linux/vmalloc.h> | 71 | #include <linux/vmalloc.h> |
| @@ -88,44 +92,71 @@ | |||
| 88 | 92 | ||
| 89 | struct pcpu_chunk { | 93 | struct pcpu_chunk { |
| 90 | struct list_head list; /* linked to pcpu_slot lists */ | 94 | struct list_head list; /* linked to pcpu_slot lists */ |
| 91 | struct rb_node rb_node; /* key is chunk->vm->addr */ | ||
| 92 | int free_size; /* free bytes in the chunk */ | 95 | int free_size; /* free bytes in the chunk */ |
| 93 | int contig_hint; /* max contiguous size hint */ | 96 | int contig_hint; /* max contiguous size hint */ |
| 94 | struct vm_struct *vm; /* mapped vmalloc region */ | 97 | void *base_addr; /* base address of this chunk */ |
| 95 | int map_used; /* # of map entries used */ | 98 | int map_used; /* # of map entries used */ |
| 96 | int map_alloc; /* # of map entries allocated */ | 99 | int map_alloc; /* # of map entries allocated */ |
| 97 | int *map; /* allocation map */ | 100 | int *map; /* allocation map */ |
| 101 | struct vm_struct **vms; /* mapped vmalloc regions */ | ||
| 98 | bool immutable; /* no [de]population allowed */ | 102 | bool immutable; /* no [de]population allowed */ |
| 99 | struct page **page; /* points to page array */ | 103 | unsigned long populated[]; /* populated bitmap */ |
| 100 | struct page *page_ar[]; /* #cpus * UNIT_PAGES */ | ||
| 101 | }; | 104 | }; |
| 102 | 105 | ||
| 103 | static int pcpu_unit_pages __read_mostly; | 106 | static int pcpu_unit_pages __read_mostly; |
| 104 | static int pcpu_unit_size __read_mostly; | 107 | static int pcpu_unit_size __read_mostly; |
| 105 | static int pcpu_chunk_size __read_mostly; | 108 | static int pcpu_nr_units __read_mostly; |
| 109 | static int pcpu_atom_size __read_mostly; | ||
| 106 | static int pcpu_nr_slots __read_mostly; | 110 | static int pcpu_nr_slots __read_mostly; |
| 107 | static size_t pcpu_chunk_struct_size __read_mostly; | 111 | static size_t pcpu_chunk_struct_size __read_mostly; |
| 108 | 112 | ||
| 113 | /* cpus with the lowest and highest unit numbers */ | ||
| 114 | static unsigned int pcpu_first_unit_cpu __read_mostly; | ||
| 115 | static unsigned int pcpu_last_unit_cpu __read_mostly; | ||
| 116 | |||
| 109 | /* the address of the first chunk which starts with the kernel static area */ | 117 | /* the address of the first chunk which starts with the kernel static area */ |
| 110 | void *pcpu_base_addr __read_mostly; | 118 | void *pcpu_base_addr __read_mostly; |
| 111 | EXPORT_SYMBOL_GPL(pcpu_base_addr); | 119 | EXPORT_SYMBOL_GPL(pcpu_base_addr); |
| 112 | 120 | ||
| 113 | /* optional reserved chunk, only accessible for reserved allocations */ | 121 | static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ |
| 122 | const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ | ||
| 123 | |||
| 124 | /* group information, used for vm allocation */ | ||
| 125 | static int pcpu_nr_groups __read_mostly; | ||
| 126 | static const unsigned long *pcpu_group_offsets __read_mostly; | ||
| 127 | static const size_t *pcpu_group_sizes __read_mostly; | ||
| 128 | |||
| 129 | /* | ||
| 130 | * The first chunk which always exists. Note that unlike other | ||
| 131 | * chunks, this one can be allocated and mapped in several different | ||
| 132 | * ways and thus often doesn't live in the vmalloc area. | ||
| 133 | */ | ||
| 134 | static struct pcpu_chunk *pcpu_first_chunk; | ||
| 135 | |||
| 136 | /* | ||
| 137 | * Optional reserved chunk. This chunk reserves part of the first | ||
| 138 | * chunk and serves it for reserved allocations. The amount of | ||
| 139 | * reserved offset is in pcpu_reserved_chunk_limit. When reserved | ||
| 140 | * area doesn't exist, the following variables contain NULL and 0 | ||
| 141 | * respectively. | ||
| 142 | */ | ||
| 114 | static struct pcpu_chunk *pcpu_reserved_chunk; | 143 | static struct pcpu_chunk *pcpu_reserved_chunk; |
| 115 | /* offset limit of the reserved chunk */ | ||
| 116 | static int pcpu_reserved_chunk_limit; | 144 | static int pcpu_reserved_chunk_limit; |
| 117 | 145 | ||
| 118 | /* | 146 | /* |
| 119 | * Synchronization rules. | 147 | * Synchronization rules. |
| 120 | * | 148 | * |
| 121 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former | 149 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former |
| 122 | * protects allocation/reclaim paths, chunks and chunk->page arrays. | 150 | * protects allocation/reclaim paths, chunks, populated bitmap and |
| 123 | * The latter is a spinlock and protects the index data structures - | 151 | * vmalloc mapping. The latter is a spinlock and protects the index |
| 124 | * chunk slots, rbtree, chunks and area maps in chunks. | 152 | * data structures - chunk slots, chunks and area maps in chunks. |
| 125 | * | 153 | * |
| 126 | * During allocation, pcpu_alloc_mutex is kept locked all the time and | 154 | * During allocation, pcpu_alloc_mutex is kept locked all the time and |
| 127 | * pcpu_lock is grabbed and released as necessary. All actual memory | 155 | * pcpu_lock is grabbed and released as necessary. All actual memory |
| 128 | * allocations are done using GFP_KERNEL with pcpu_lock released. | 156 | * allocations are done using GFP_KERNEL with pcpu_lock released. In |
| 157 | * general, percpu memory can't be allocated with irq off but | ||
| 158 | * irqsave/restore are still used in alloc path so that it can be used | ||
| 159 | * from early init path - sched_init() specifically. | ||
| 129 | * | 160 | * |
| 130 | * Free path accesses and alters only the index data structures, so it | 161 | * Free path accesses and alters only the index data structures, so it |
| 131 | * can be safely called from atomic context. When memory needs to be | 162 | * can be safely called from atomic context. When memory needs to be |
| @@ -140,7 +171,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ | |||
| 140 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ | 171 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ |
| 141 | 172 | ||
| 142 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | 173 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ |
| 143 | static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ | ||
| 144 | 174 | ||
| 145 | /* reclaim work to release fully free chunks, scheduled from free path */ | 175 | /* reclaim work to release fully free chunks, scheduled from free path */ |
| 146 | static void pcpu_reclaim(struct work_struct *work); | 176 | static void pcpu_reclaim(struct work_struct *work); |
| @@ -169,28 +199,65 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) | |||
| 169 | 199 | ||
| 170 | static int pcpu_page_idx(unsigned int cpu, int page_idx) | 200 | static int pcpu_page_idx(unsigned int cpu, int page_idx) |
| 171 | { | 201 | { |
| 172 | return cpu * pcpu_unit_pages + page_idx; | 202 | return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; |
| 173 | } | 203 | } |
| 174 | 204 | ||
| 175 | static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, | 205 | static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, |
| 176 | unsigned int cpu, int page_idx) | 206 | unsigned int cpu, int page_idx) |
| 177 | { | 207 | { |
| 178 | return &chunk->page[pcpu_page_idx(cpu, page_idx)]; | 208 | return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + |
| 209 | (page_idx << PAGE_SHIFT); | ||
| 179 | } | 210 | } |
| 180 | 211 | ||
| 181 | static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, | 212 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, |
| 182 | unsigned int cpu, int page_idx) | 213 | unsigned int cpu, int page_idx) |
| 214 | { | ||
| 215 | /* must not be used on pre-mapped chunk */ | ||
| 216 | WARN_ON(chunk->immutable); | ||
| 217 | |||
| 218 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); | ||
| 219 | } | ||
| 220 | |||
| 221 | /* set the pointer to a chunk in a page struct */ | ||
| 222 | static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) | ||
| 223 | { | ||
| 224 | page->index = (unsigned long)pcpu; | ||
| 225 | } | ||
| 226 | |||
| 227 | /* obtain pointer to a chunk from a page struct */ | ||
| 228 | static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) | ||
| 229 | { | ||
| 230 | return (struct pcpu_chunk *)page->index; | ||
| 231 | } | ||
| 232 | |||
| 233 | static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) | ||
| 183 | { | 234 | { |
| 184 | return (unsigned long)chunk->vm->addr + | 235 | *rs = find_next_zero_bit(chunk->populated, end, *rs); |
| 185 | (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); | 236 | *re = find_next_bit(chunk->populated, end, *rs + 1); |
| 186 | } | 237 | } |
| 187 | 238 | ||
| 188 | static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, | 239 | static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) |
| 189 | int page_idx) | ||
| 190 | { | 240 | { |
| 191 | return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; | 241 | *rs = find_next_bit(chunk->populated, end, *rs); |
| 242 | *re = find_next_zero_bit(chunk->populated, end, *rs + 1); | ||
| 192 | } | 243 | } |
| 193 | 244 | ||
| 245 | /* | ||
| 246 | * (Un)populated page region iterators. Iterate over (un)populated | ||
| 247 | * page regions between @start and @end in @chunk. @rs and @re should | ||
| 248 | * be integer variables and will be set to start and end page index of | ||
| 249 | * the current region. | ||
| 250 | */ | ||
| 251 | #define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ | ||
| 252 | for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ | ||
| 253 | (rs) < (re); \ | ||
| 254 | (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) | ||
| 255 | |||
| 256 | #define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ | ||
| 257 | for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ | ||
| 258 | (rs) < (re); \ | ||
| 259 | (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) | ||
| 260 | |||
| 194 | /** | 261 | /** |
| 195 | * pcpu_mem_alloc - allocate memory | 262 | * pcpu_mem_alloc - allocate memory |
| 196 | * @size: bytes to allocate | 263 | * @size: bytes to allocate |
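The populated bitmap and the pcpu_for_each_unpop_region()/pcpu_for_each_pop_region() iterators added in this hunk replace the old per-page occupancy checks. A usage sketch that walks every unpopulated run of a chunk (the pr_debug() reporting is illustrative only):

	int rs, re;

	/* [rs, re) is set to each run of unpopulated pages in turn */
	pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages)
		pr_debug("chunk %p: pages [%d, %d) not populated\n",
			 chunk->base_addr, rs, re);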
| @@ -257,152 +324,117 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
| 257 | } | 324 | } |
| 258 | } | 325 | } |
| 259 | 326 | ||
| 260 | static struct rb_node **pcpu_chunk_rb_search(void *addr, | ||
| 261 | struct rb_node **parentp) | ||
| 262 | { | ||
| 263 | struct rb_node **p = &pcpu_addr_root.rb_node; | ||
| 264 | struct rb_node *parent = NULL; | ||
| 265 | struct pcpu_chunk *chunk; | ||
| 266 | |||
| 267 | while (*p) { | ||
| 268 | parent = *p; | ||
| 269 | chunk = rb_entry(parent, struct pcpu_chunk, rb_node); | ||
| 270 | |||
| 271 | if (addr < chunk->vm->addr) | ||
| 272 | p = &(*p)->rb_left; | ||
| 273 | else if (addr > chunk->vm->addr) | ||
| 274 | p = &(*p)->rb_right; | ||
| 275 | else | ||
| 276 | break; | ||
| 277 | } | ||
| 278 | |||
| 279 | if (parentp) | ||
| 280 | *parentp = parent; | ||
| 281 | return p; | ||
| 282 | } | ||
| 283 | |||
| 284 | /** | 327 | /** |
| 285 | * pcpu_chunk_addr_search - search for chunk containing specified address | 328 | * pcpu_chunk_addr_search - determine chunk containing specified address |
| 286 | * @addr: address to search for | 329 | * @addr: address for which the chunk needs to be determined. |
| 287 | * | ||
| 288 | * Look for chunk which might contain @addr. More specifically, it | ||
| 289 | * searchs for the chunk with the highest start address which isn't | ||
| 290 | * beyond @addr. | ||
| 291 | * | ||
| 292 | * CONTEXT: | ||
| 293 | * pcpu_lock. | ||
| 294 | * | 330 | * |
| 295 | * RETURNS: | 331 | * RETURNS: |
| 296 | * The address of the found chunk. | 332 | * The address of the found chunk. |
| 297 | */ | 333 | */ |
| 298 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | 334 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) |
| 299 | { | 335 | { |
| 300 | struct rb_node *n, *parent; | 336 | void *first_start = pcpu_first_chunk->base_addr; |
| 301 | struct pcpu_chunk *chunk; | ||
| 302 | 337 | ||
| 303 | /* is it in the reserved chunk? */ | 338 | /* is it in the first chunk? */ |
| 304 | if (pcpu_reserved_chunk) { | 339 | if (addr >= first_start && addr < first_start + pcpu_unit_size) { |
| 305 | void *start = pcpu_reserved_chunk->vm->addr; | 340 | /* is it in the reserved area? */ |
| 306 | 341 | if (addr < first_start + pcpu_reserved_chunk_limit) | |
| 307 | if (addr >= start && addr < start + pcpu_reserved_chunk_limit) | ||
| 308 | return pcpu_reserved_chunk; | 342 | return pcpu_reserved_chunk; |
| 343 | return pcpu_first_chunk; | ||
| 309 | } | 344 | } |
| 310 | 345 | ||
| 311 | /* nah... search the regular ones */ | 346 | /* |
| 312 | n = *pcpu_chunk_rb_search(addr, &parent); | 347 | * The address is relative to unit0 which might be unused and |
| 313 | if (!n) { | 348 | * thus unmapped. Offset the address to the unit space of the |
| 314 | /* no exactly matching chunk, the parent is the closest */ | 349 | * current processor before looking it up in the vmalloc |
| 315 | n = parent; | 350 | * space. Note that any possible cpu id can be used here, so |
| 316 | BUG_ON(!n); | 351 | * there's no need to worry about preemption or cpu hotplug. |
| 317 | } | 352 | */ |
| 318 | chunk = rb_entry(n, struct pcpu_chunk, rb_node); | 353 | addr += pcpu_unit_offsets[raw_smp_processor_id()]; |
| 319 | 354 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); | |
| 320 | if (addr < chunk->vm->addr) { | ||
| 321 | /* the parent was the next one, look for the previous one */ | ||
| 322 | n = rb_prev(n); | ||
| 323 | BUG_ON(!n); | ||
| 324 | chunk = rb_entry(n, struct pcpu_chunk, rb_node); | ||
| 325 | } | ||
| 326 | |||
| 327 | return chunk; | ||
| 328 | } | 355 | } |
| 329 | 356 | ||
| 330 | /** | 357 | /** |
| 331 | * pcpu_chunk_addr_insert - insert chunk into address rb tree | 358 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended |
| 332 | * @new: chunk to insert | 359 | * @chunk: chunk of interest |
| 333 | * | 360 | * |
| 334 | * Insert @new into address rb tree. | 361 | * Determine whether area map of @chunk needs to be extended to |
| 362 | * accommodate a new allocation. | ||
| 335 | * | 363 | * |
| 336 | * CONTEXT: | 364 | * CONTEXT: |
| 337 | * pcpu_lock. | 365 | * pcpu_lock. |
| 366 | * | ||
| 367 | * RETURNS: | ||
| 368 | * New target map allocation length if extension is necessary, 0 | ||
| 369 | * otherwise. | ||
| 338 | */ | 370 | */ |
| 339 | static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) | 371 | static int pcpu_need_to_extend(struct pcpu_chunk *chunk) |
| 340 | { | 372 | { |
| 341 | struct rb_node **p, *parent; | 373 | int new_alloc; |
| 342 | 374 | ||
| 343 | p = pcpu_chunk_rb_search(new->vm->addr, &parent); | 375 | if (chunk->map_alloc >= chunk->map_used + 2) |
| 344 | BUG_ON(*p); | 376 | return 0; |
| 345 | rb_link_node(&new->rb_node, parent, p); | 377 | |
| 346 | rb_insert_color(&new->rb_node, &pcpu_addr_root); | 378 | new_alloc = PCPU_DFL_MAP_ALLOC; |
| 379 | while (new_alloc < chunk->map_used + 2) | ||
| 380 | new_alloc *= 2; | ||
| 381 | |||
| 382 | return new_alloc; | ||
| 347 | } | 383 | } |
| 348 | 384 | ||
| 349 | /** | 385 | /** |
| 350 | * pcpu_extend_area_map - extend area map for allocation | 386 | * pcpu_extend_area_map - extend area map of a chunk |
| 351 | * @chunk: target chunk | 387 | * @chunk: chunk of interest |
| 388 | * @new_alloc: new target allocation length of the area map | ||
| 352 | * | 389 | * |
| 353 | * Extend area map of @chunk so that it can accomodate an allocation. | 390 | * Extend area map of @chunk to have @new_alloc entries. |
| 354 | * A single allocation can split an area into three areas, so this | ||
| 355 | * function makes sure that @chunk->map has at least two extra slots. | ||
| 356 | * | 391 | * |
| 357 | * CONTEXT: | 392 | * CONTEXT: |
| 358 | * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired | 393 | * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. |
| 359 | * if area map is extended. | ||
| 360 | * | 394 | * |
| 361 | * RETURNS: | 395 | * RETURNS: |
| 362 | * 0 if noop, 1 if successfully extended, -errno on failure. | 396 | * 0 on success, -errno on failure. |
| 363 | */ | 397 | */ |
| 364 | static int pcpu_extend_area_map(struct pcpu_chunk *chunk) | 398 | static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) |
| 365 | { | 399 | { |
| 366 | int new_alloc; | 400 | int *old = NULL, *new = NULL; |
| 367 | int *new; | 401 | size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); |
| 368 | size_t size; | 402 | unsigned long flags; |
| 369 | |||
| 370 | /* has enough? */ | ||
| 371 | if (chunk->map_alloc >= chunk->map_used + 2) | ||
| 372 | return 0; | ||
| 373 | |||
| 374 | spin_unlock_irq(&pcpu_lock); | ||
| 375 | |||
| 376 | new_alloc = PCPU_DFL_MAP_ALLOC; | ||
| 377 | while (new_alloc < chunk->map_used + 2) | ||
| 378 | new_alloc *= 2; | ||
| 379 | 403 | ||
| 380 | new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); | 404 | new = pcpu_mem_alloc(new_size); |
| 381 | if (!new) { | 405 | if (!new) |
| 382 | spin_lock_irq(&pcpu_lock); | ||
| 383 | return -ENOMEM; | 406 | return -ENOMEM; |
| 384 | } | ||
| 385 | 407 | ||
| 386 | /* | 408 | /* acquire pcpu_lock and switch to new area map */ |
| 387 | * Acquire pcpu_lock and switch to new area map. Only free | 409 | spin_lock_irqsave(&pcpu_lock, flags); |
| 388 | * could have happened inbetween, so map_used couldn't have | ||
| 389 | * grown. | ||
| 390 | */ | ||
| 391 | spin_lock_irq(&pcpu_lock); | ||
| 392 | BUG_ON(new_alloc < chunk->map_used + 2); | ||
| 393 | 410 | ||
| 394 | size = chunk->map_alloc * sizeof(chunk->map[0]); | 411 | if (new_alloc <= chunk->map_alloc) |
| 395 | memcpy(new, chunk->map, size); | 412 | goto out_unlock; |
| 413 | |||
| 414 | old_size = chunk->map_alloc * sizeof(chunk->map[0]); | ||
| 415 | memcpy(new, chunk->map, old_size); | ||
| 396 | 416 | ||
| 397 | /* | 417 | /* |
| 398 | * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is | 418 | * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is |
| 399 | * one of the first chunks and still using static map. | 419 | * one of the first chunks and still using static map. |
| 400 | */ | 420 | */ |
| 401 | if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) | 421 | if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) |
| 402 | pcpu_mem_free(chunk->map, size); | 422 | old = chunk->map; |
| 403 | 423 | ||
| 404 | chunk->map_alloc = new_alloc; | 424 | chunk->map_alloc = new_alloc; |
| 405 | chunk->map = new; | 425 | chunk->map = new; |
| 426 | new = NULL; | ||
| 427 | |||
| 428 | out_unlock: | ||
| 429 | spin_unlock_irqrestore(&pcpu_lock, flags); | ||
| 430 | |||
| 431 | /* | ||
| 432 | * pcpu_mem_free() might end up calling vfree() which uses | ||
| 433 | * IRQ-unsafe lock and thus can't be called under pcpu_lock. | ||
| 434 | */ | ||
| 435 | pcpu_mem_free(old, old_size); | ||
| 436 | pcpu_mem_free(new, new_size); | ||
| 437 | |||
| 406 | return 0; | 438 | return 0; |
| 407 | } | 439 | } |
| 408 | 440 | ||
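pcpu_extend_area_map() now takes the target length computed by pcpu_need_to_extend() and performs its GFP_KERNEL allocation with pcpu_lock dropped, taking the lock itself only to swap the maps. The expected pairing is sketched below, assuming the caller holds pcpu_lock with interrupts saved in flags:

	int new_alloc;

	while ((new_alloc = pcpu_need_to_extend(chunk))) {
		spin_unlock_irqrestore(&pcpu_lock, flags);
		if (pcpu_extend_area_map(chunk, new_alloc) < 0)
			goto fail;	/* illustrative -ENOMEM path */
		spin_lock_irqsave(&pcpu_lock, flags);
		/* re-check: another task may have extended the map in the meantime */
	}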
| @@ -591,126 +623,327 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) | |||
| 591 | } | 623 | } |
| 592 | 624 | ||
| 593 | /** | 625 | /** |
| 594 | * pcpu_unmap - unmap pages out of a pcpu_chunk | 626 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap |
| 595 | * @chunk: chunk of interest | 627 | * @chunk: chunk of interest |
| 596 | * @page_start: page index of the first page to unmap | 628 | * @bitmapp: output parameter for bitmap |
| 597 | * @page_end: page index of the last page to unmap + 1 | 629 | * @may_alloc: may allocate the array |
| 598 | * @flush: whether to flush cache and tlb or not | ||
| 599 | * | 630 | * |
| 600 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. | 631 | * Returns pointer to array of pointers to struct page and bitmap, |
| 601 | * If @flush is true, vcache is flushed before unmapping and tlb | 632 | * both of which can be indexed with pcpu_page_idx(). The returned |
| 602 | * after. | 633 | * array is cleared to zero and *@bitmapp is copied from |
| 634 | * @chunk->populated. Note that there is only one array and bitmap | ||
| 635 | * and access exclusion is the caller's responsibility. | ||
| 636 | * | ||
| 637 | * CONTEXT: | ||
| 638 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. | ||
| 639 | * Otherwise, don't care. | ||
| 640 | * | ||
| 641 | * RETURNS: | ||
| 642 | * Pointer to temp pages array on success, NULL on failure. | ||
| 603 | */ | 643 | */ |
| 604 | static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, | 644 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, |
| 605 | bool flush) | 645 | unsigned long **bitmapp, |
| 646 | bool may_alloc) | ||
| 606 | { | 647 | { |
| 607 | unsigned int last = num_possible_cpus() - 1; | 648 | static struct page **pages; |
| 608 | unsigned int cpu; | 649 | static unsigned long *bitmap; |
| 650 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); | ||
| 651 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * | ||
| 652 | sizeof(unsigned long); | ||
| 653 | |||
| 654 | if (!pages || !bitmap) { | ||
| 655 | if (may_alloc && !pages) | ||
| 656 | pages = pcpu_mem_alloc(pages_size); | ||
| 657 | if (may_alloc && !bitmap) | ||
| 658 | bitmap = pcpu_mem_alloc(bitmap_size); | ||
| 659 | if (!pages || !bitmap) | ||
| 660 | return NULL; | ||
| 661 | } | ||
| 609 | 662 | ||
| 610 | /* unmap must not be done on immutable chunk */ | 663 | memset(pages, 0, pages_size); |
| 611 | WARN_ON(chunk->immutable); | 664 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); |
| 612 | 665 | ||
| 613 | /* | 666 | *bitmapp = bitmap; |
| 614 | * Each flushing trial can be very expensive, issue flush on | 667 | return pages; |
| 615 | * the whole region at once rather than doing it for each cpu. | 668 | } |
| 616 | * This could be an overkill but is more scalable. | ||
| 617 | */ | ||
| 618 | if (flush) | ||
| 619 | flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), | ||
| 620 | pcpu_chunk_addr(chunk, last, page_end)); | ||
| 621 | 669 | ||
| 622 | for_each_possible_cpu(cpu) | 670 | /** |
| 623 | unmap_kernel_range_noflush( | 671 | * pcpu_free_pages - free pages which were allocated for @chunk |
| 624 | pcpu_chunk_addr(chunk, cpu, page_start), | 672 | * @chunk: chunk pages were allocated for |
| 625 | (page_end - page_start) << PAGE_SHIFT); | 673 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() |
| 626 | 674 | * @populated: populated bitmap | |
| 627 | /* ditto as flush_cache_vunmap() */ | 675 | * @page_start: page index of the first page to be freed |
| 628 | if (flush) | 676 | * @page_end: page index of the last page to be freed + 1 |
| 629 | flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), | 677 | * |
| 630 | pcpu_chunk_addr(chunk, last, page_end)); | 678 | * Free pages [@page_start and @page_end) in @pages for all units. |
| 679 | * The pages were allocated for @chunk. | ||
| 680 | */ | ||
| 681 | static void pcpu_free_pages(struct pcpu_chunk *chunk, | ||
| 682 | struct page **pages, unsigned long *populated, | ||
| 683 | int page_start, int page_end) | ||
| 684 | { | ||
| 685 | unsigned int cpu; | ||
| 686 | int i; | ||
| 687 | |||
| 688 | for_each_possible_cpu(cpu) { | ||
| 689 | for (i = page_start; i < page_end; i++) { | ||
| 690 | struct page *page = pages[pcpu_page_idx(cpu, i)]; | ||
| 691 | |||
| 692 | if (page) | ||
| 693 | __free_page(page); | ||
| 694 | } | ||
| 695 | } | ||
| 631 | } | 696 | } |
| 632 | 697 | ||
| 633 | /** | 698 | /** |
| 634 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk | 699 | * pcpu_alloc_pages - allocates pages for @chunk |
| 635 | * @chunk: chunk to depopulate | 700 | * @chunk: target chunk |
| 636 | * @off: offset to the area to depopulate | 701 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() |
| 637 | * @size: size of the area to depopulate in bytes | 702 | * @populated: populated bitmap |
| 638 | * @flush: whether to flush cache and tlb or not | 703 | * @page_start: page index of the first page to be allocated |
| 639 | * | 704 | * @page_end: page index of the last page to be allocated + 1 |
| 640 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 705 | * |
| 641 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 706 | * Allocate pages [@page_start,@page_end) into @pages for all units. |
| 642 | * and tlb after. | 707 | * The allocation is for @chunk. Percpu core doesn't care about the |
| 643 | * | 708 | * content of @pages and will pass it verbatim to pcpu_map_pages(). |
| 644 | * CONTEXT: | ||
| 645 | * pcpu_alloc_mutex. | ||
| 646 | */ | 709 | */ |
| 647 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, | 710 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, |
| 648 | bool flush) | 711 | struct page **pages, unsigned long *populated, |
| 712 | int page_start, int page_end) | ||
| 649 | { | 713 | { |
| 650 | int page_start = PFN_DOWN(off); | 714 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; |
| 651 | int page_end = PFN_UP(off + size); | ||
| 652 | int unmap_start = -1; | ||
| 653 | int uninitialized_var(unmap_end); | ||
| 654 | unsigned int cpu; | 715 | unsigned int cpu; |
| 655 | int i; | 716 | int i; |
| 656 | 717 | ||
| 657 | for (i = page_start; i < page_end; i++) { | 718 | for_each_possible_cpu(cpu) { |
| 658 | for_each_possible_cpu(cpu) { | 719 | for (i = page_start; i < page_end; i++) { |
| 659 | struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); | 720 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; |
| 721 | |||
| 722 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); | ||
| 723 | if (!*pagep) { | ||
| 724 | pcpu_free_pages(chunk, pages, populated, | ||
| 725 | page_start, page_end); | ||
| 726 | return -ENOMEM; | ||
| 727 | } | ||
| 728 | } | ||
| 729 | } | ||
| 730 | return 0; | ||
| 731 | } | ||
| 660 | 732 | ||
| 661 | if (!*pagep) | 733 | /** |
| 662 | continue; | 734 | * pcpu_pre_unmap_flush - flush cache prior to unmapping |
| 735 | * @chunk: chunk the regions to be flushed belongs to | ||
| 736 | * @page_start: page index of the first page to be flushed | ||
| 737 | * @page_end: page index of the last page to be flushed + 1 | ||
| 738 | * | ||
| 739 | * Pages in [@page_start,@page_end) of @chunk are about to be | ||
| 740 | * unmapped. Flush cache. As each flushing trial can be very | ||
| 741 | * expensive, issue flush on the whole region at once rather than | ||
| 742 | * doing it for each cpu. This could be an overkill but is more | ||
| 743 | * scalable. | ||
| 744 | */ | ||
| 745 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | ||
| 746 | int page_start, int page_end) | ||
| 747 | { | ||
| 748 | flush_cache_vunmap( | ||
| 749 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
| 750 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
| 751 | } | ||
| 663 | 752 | ||
| 664 | __free_page(*pagep); | 753 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) |
| 754 | { | ||
| 755 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); | ||
| 756 | } | ||
| 665 | 757 | ||
| 666 | /* | 758 | /** |
| 667 | * If it's partial depopulation, it might get | 759 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk |
| 668 | * populated or depopulated again. Mark the | 760 | * @chunk: chunk of interest |
| 669 | * page gone. | 761 | * @pages: pages array which can be used to pass information to free |
| 670 | */ | 762 | * @populated: populated bitmap |
| 671 | *pagep = NULL; | 763 | * @page_start: page index of the first page to unmap |
| 764 | * @page_end: page index of the last page to unmap + 1 | ||
| 765 | * | ||
| 766 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. | ||
| 767 | * Corresponding elements in @pages were cleared by the caller and can | ||
| 768 | * be used to carry information to pcpu_free_pages() which will be | ||
| 769 | * called after all unmaps are finished. The caller should call | ||
| 770 | * proper pre/post flush functions. | ||
| 771 | */ | ||
| 772 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | ||
| 773 | struct page **pages, unsigned long *populated, | ||
| 774 | int page_start, int page_end) | ||
| 775 | { | ||
| 776 | unsigned int cpu; | ||
| 777 | int i; | ||
| 672 | 778 | ||
| 673 | unmap_start = unmap_start < 0 ? i : unmap_start; | 779 | for_each_possible_cpu(cpu) { |
| 674 | unmap_end = i + 1; | 780 | for (i = page_start; i < page_end; i++) { |
| 781 | struct page *page; | ||
| 782 | |||
| 783 | page = pcpu_chunk_page(chunk, cpu, i); | ||
| 784 | WARN_ON(!page); | ||
| 785 | pages[pcpu_page_idx(cpu, i)] = page; | ||
| 675 | } | 786 | } |
| 787 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), | ||
| 788 | page_end - page_start); | ||
| 676 | } | 789 | } |
| 677 | 790 | ||
| 678 | if (unmap_start >= 0) | 791 | for (i = page_start; i < page_end; i++) |
| 679 | pcpu_unmap(chunk, unmap_start, unmap_end, flush); | 792 | __clear_bit(i, populated); |
| 680 | } | 793 | } |
| 681 | 794 | ||
| 682 | /** | 795 | /** |
| 683 | * pcpu_map - map pages into a pcpu_chunk | 796 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping |
| 797 | * @chunk: pcpu_chunk the regions to be flushed belong to | ||
| 798 | * @page_start: page index of the first page to be flushed | ||
| 799 | * @page_end: page index of the last page to be flushed + 1 | ||
| 800 | * | ||
| 801 | * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush | ||
| 802 | * TLB for the regions. This can be skipped if the area is to be | ||
| 803 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. | ||
| 804 | * | ||
| 805 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | ||
| 806 | * for the whole region. | ||
| 807 | */ | ||
| 808 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | ||
| 809 | int page_start, int page_end) | ||
| 810 | { | ||
| 811 | flush_tlb_kernel_range( | ||
| 812 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
| 813 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
| 814 | } | ||
| 815 | |||
| 816 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | ||
| 817 | int nr_pages) | ||
| 818 | { | ||
| 819 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, | ||
| 820 | PAGE_KERNEL, pages); | ||
| 821 | } | ||
| 822 | |||
| 823 | /** | ||
| 824 | * pcpu_map_pages - map pages into a pcpu_chunk | ||
| 684 | * @chunk: chunk of interest | 825 | * @chunk: chunk of interest |
| 826 | * @pages: pages array containing pages to be mapped | ||
| 827 | * @populated: populated bitmap | ||
| 685 | * @page_start: page index of the first page to map | 828 | * @page_start: page index of the first page to map |
| 686 | * @page_end: page index of the last page to map + 1 | 829 | * @page_end: page index of the last page to map + 1 |
| 687 | * | 830 | * |
| 688 | * For each cpu, map pages [@page_start,@page_end) into @chunk. | 831 | * For each cpu, map pages [@page_start,@page_end) into @chunk. The |
| 689 | * vcache is flushed afterwards. | 832 | * caller is responsible for calling pcpu_post_map_flush() after all |
| 833 | * mappings are complete. | ||
| 834 | * | ||
| 835 | * This function is responsible for setting corresponding bits in | ||
| 836 | * @chunk->populated bitmap and whatever is necessary for reverse | ||
| 837 | * lookup (addr -> chunk). | ||
| 690 | */ | 838 | */ |
| 691 | static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) | 839 | static int pcpu_map_pages(struct pcpu_chunk *chunk, |
| 840 | struct page **pages, unsigned long *populated, | ||
| 841 | int page_start, int page_end) | ||
| 692 | { | 842 | { |
| 693 | unsigned int last = num_possible_cpus() - 1; | 843 | unsigned int cpu, tcpu; |
| 694 | unsigned int cpu; | 844 | int i, err; |
| 695 | int err; | ||
| 696 | |||
| 697 | /* map must not be done on immutable chunk */ | ||
| 698 | WARN_ON(chunk->immutable); | ||
| 699 | 845 | ||
| 700 | for_each_possible_cpu(cpu) { | 846 | for_each_possible_cpu(cpu) { |
| 701 | err = map_kernel_range_noflush( | 847 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
| 702 | pcpu_chunk_addr(chunk, cpu, page_start), | 848 | &pages[pcpu_page_idx(cpu, page_start)], |
| 703 | (page_end - page_start) << PAGE_SHIFT, | 849 | page_end - page_start); |
| 704 | PAGE_KERNEL, | ||
| 705 | pcpu_chunk_pagep(chunk, cpu, page_start)); | ||
| 706 | if (err < 0) | 850 | if (err < 0) |
| 707 | return err; | 851 | goto err; |
| 852 | } | ||
| 853 | |||
| 854 | /* mapping successful, link chunk and mark populated */ | ||
| 855 | for (i = page_start; i < page_end; i++) { | ||
| 856 | for_each_possible_cpu(cpu) | ||
| 857 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], | ||
| 858 | chunk); | ||
| 859 | __set_bit(i, populated); | ||
| 708 | } | 860 | } |
| 709 | 861 | ||
| 710 | /* flush at once, please read comments in pcpu_unmap() */ | ||
| 711 | flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), | ||
| 712 | pcpu_chunk_addr(chunk, last, page_end)); | ||
| 713 | return 0; | 862 | return 0; |
| 863 | |||
| 864 | err: | ||
| 865 | for_each_possible_cpu(tcpu) { | ||
| 866 | if (tcpu == cpu) | ||
| 867 | break; | ||
| 868 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), | ||
| 869 | page_end - page_start); | ||
| 870 | } | ||
| 871 | return err; | ||
| 872 | } | ||
| 873 | |||
| 874 | /** | ||
| 875 | * pcpu_post_map_flush - flush cache after mapping | ||
| 876 | * @chunk: pcpu_chunk the regions to be flushed belong to | ||
| 877 | * @page_start: page index of the first page to be flushed | ||
| 878 | * @page_end: page index of the last page to be flushed + 1 | ||
| 879 | * | ||
| 880 | * Pages [@page_start,@page_end) of @chunk have been mapped. Flush | ||
| 881 | * cache. | ||
| 882 | * | ||
| 883 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | ||
| 884 | * for the whole region. | ||
| 885 | */ | ||
| 886 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | ||
| 887 | int page_start, int page_end) | ||
| 888 | { | ||
| 889 | flush_cache_vmap( | ||
| 890 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
| 891 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
| 892 | } | ||
| 893 | |||
| 894 | /** | ||
| 895 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk | ||
| 896 | * @chunk: chunk to depopulate | ||
| 897 | * @off: offset to the area to depopulate | ||
| 898 | * @size: size of the area to depopulate in bytes | ||
| 899 | * | ||
| 900 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | ||
| 901 | * from @chunk.  TLB flushing is left to the caller; areas returned | ||
| 902 | * to vmalloc have their TLB entries flushed lazily. | ||
| 904 | * | ||
| 905 | * CONTEXT: | ||
| 906 | * pcpu_alloc_mutex. | ||
| 907 | */ | ||
| 908 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | ||
| 909 | { | ||
| 910 | int page_start = PFN_DOWN(off); | ||
| 911 | int page_end = PFN_UP(off + size); | ||
| 912 | struct page **pages; | ||
| 913 | unsigned long *populated; | ||
| 914 | int rs, re; | ||
| 915 | |||
| 916 | /* quick path, check whether it's empty already */ | ||
| 917 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | ||
| 918 | if (rs == page_start && re == page_end) | ||
| 919 | return; | ||
| 920 | break; | ||
| 921 | } | ||
| 922 | |||
| 923 | /* immutable chunks can't be depopulated */ | ||
| 924 | WARN_ON(chunk->immutable); | ||
| 925 | |||
| 926 | /* | ||
| 927 | * If control reaches here, there must have been at least one | ||
| 928 | * successful population attempt so the temp pages array must | ||
| 929 | * be available now. | ||
| 930 | */ | ||
| 931 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); | ||
| 932 | BUG_ON(!pages); | ||
| 933 | |||
| 934 | /* unmap and free */ | ||
| 935 | pcpu_pre_unmap_flush(chunk, page_start, page_end); | ||
| 936 | |||
| 937 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | ||
| 938 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
| 939 | |||
| 940 | /* no need to flush tlb, vmalloc will handle it lazily */ | ||
| 941 | |||
| 942 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | ||
| 943 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
| 944 | |||
| 945 | /* commit new bitmap */ | ||
| 946 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
| 714 | } | 947 | } |
| 715 | 948 | ||
| 716 | /** | 949 | /** |
| @@ -727,57 +960,68 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) | |||
| 727 | */ | 960 | */ |
| 728 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | 961 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) |
| 729 | { | 962 | { |
| 730 | const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; | ||
| 731 | int page_start = PFN_DOWN(off); | 963 | int page_start = PFN_DOWN(off); |
| 732 | int page_end = PFN_UP(off + size); | 964 | int page_end = PFN_UP(off + size); |
| 733 | int map_start = -1; | 965 | int free_end = page_start, unmap_end = page_start; |
| 734 | int uninitialized_var(map_end); | 966 | struct page **pages; |
| 967 | unsigned long *populated; | ||
| 735 | unsigned int cpu; | 968 | unsigned int cpu; |
| 736 | int i; | 969 | int rs, re, rc; |
| 737 | 970 | ||
| 738 | for (i = page_start; i < page_end; i++) { | 971 | /* quick path, check whether all pages are already there */ |
| 739 | if (pcpu_chunk_page_occupied(chunk, i)) { | 972 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { |
| 740 | if (map_start >= 0) { | 973 | if (rs == page_start && re == page_end) |
| 741 | if (pcpu_map(chunk, map_start, map_end)) | 974 | goto clear; |
| 742 | goto err; | 975 | break; |
| 743 | map_start = -1; | 976 | } |
| 744 | } | ||
| 745 | continue; | ||
| 746 | } | ||
| 747 | 977 | ||
| 748 | map_start = map_start < 0 ? i : map_start; | 978 | /* need to allocate and map pages, this chunk can't be immutable */ |
| 749 | map_end = i + 1; | 979 | WARN_ON(chunk->immutable); |
| 750 | 980 | ||
| 751 | for_each_possible_cpu(cpu) { | 981 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); |
| 752 | struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); | 982 | if (!pages) |
| 983 | return -ENOMEM; | ||
| 753 | 984 | ||
| 754 | *pagep = alloc_pages_node(cpu_to_node(cpu), | 985 | /* alloc and map */ |
| 755 | alloc_mask, 0); | 986 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 756 | if (!*pagep) | 987 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); |
| 757 | goto err; | 988 | if (rc) |
| 758 | } | 989 | goto err_free; |
| 990 | free_end = re; | ||
| 759 | } | 991 | } |
| 760 | 992 | ||
| 761 | if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) | 993 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 762 | goto err; | 994 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); |
| 995 | if (rc) | ||
| 996 | goto err_unmap; | ||
| 997 | unmap_end = re; | ||
| 998 | } | ||
| 999 | pcpu_post_map_flush(chunk, page_start, page_end); | ||
| 763 | 1000 | ||
| 1001 | /* commit new bitmap */ | ||
| 1002 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
| 1003 | clear: | ||
| 764 | for_each_possible_cpu(cpu) | 1004 | for_each_possible_cpu(cpu) |
| 765 | memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, | 1005 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); |
| 766 | size); | ||
| 767 | |||
| 768 | return 0; | 1006 | return 0; |
| 769 | err: | 1007 | |
| 770 | /* likely under heavy memory pressure, give memory back */ | 1008 | err_unmap: |
| 771 | pcpu_depopulate_chunk(chunk, off, size, true); | 1009 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); |
| 772 | return -ENOMEM; | 1010 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) |
| 1011 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
| 1012 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); | ||
| 1013 | err_free: | ||
| 1014 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) | ||
| 1015 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
| 1016 | return rc; | ||
| 773 | } | 1017 | } |
| 774 | 1018 | ||
| 775 | static void free_pcpu_chunk(struct pcpu_chunk *chunk) | 1019 | static void free_pcpu_chunk(struct pcpu_chunk *chunk) |
| 776 | { | 1020 | { |
| 777 | if (!chunk) | 1021 | if (!chunk) |
| 778 | return; | 1022 | return; |
| 779 | if (chunk->vm) | 1023 | if (chunk->vms) |
| 780 | free_vm_area(chunk->vm); | 1024 | pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups); |
| 781 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); | 1025 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); |
| 782 | kfree(chunk); | 1026 | kfree(chunk); |
| 783 | } | 1027 | } |
| @@ -793,10 +1037,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
| 793 | chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); | 1037 | chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); |
| 794 | chunk->map_alloc = PCPU_DFL_MAP_ALLOC; | 1038 | chunk->map_alloc = PCPU_DFL_MAP_ALLOC; |
| 795 | chunk->map[chunk->map_used++] = pcpu_unit_size; | 1039 | chunk->map[chunk->map_used++] = pcpu_unit_size; |
| 796 | chunk->page = chunk->page_ar; | ||
| 797 | 1040 | ||
| 798 | chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); | 1041 | chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, |
| 799 | if (!chunk->vm) { | 1042 | pcpu_nr_groups, pcpu_atom_size, |
| 1043 | GFP_KERNEL); | ||
| 1044 | if (!chunk->vms) { | ||
| 800 | free_pcpu_chunk(chunk); | 1045 | free_pcpu_chunk(chunk); |
| 801 | return NULL; | 1046 | return NULL; |
| 802 | } | 1047 | } |
| @@ -804,6 +1049,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
| 804 | INIT_LIST_HEAD(&chunk->list); | 1049 | INIT_LIST_HEAD(&chunk->list); |
| 805 | chunk->free_size = pcpu_unit_size; | 1050 | chunk->free_size = pcpu_unit_size; |
| 806 | chunk->contig_hint = pcpu_unit_size; | 1051 | chunk->contig_hint = pcpu_unit_size; |
| 1052 | chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0]; | ||
| 807 | 1053 | ||
| 808 | return chunk; | 1054 | return chunk; |
| 809 | } | 1055 | } |
| @@ -824,8 +1070,11 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
| 824 | */ | 1070 | */ |
| 825 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) | 1071 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) |
| 826 | { | 1072 | { |
| 1073 | static int warn_limit = 10; | ||
| 827 | struct pcpu_chunk *chunk; | 1074 | struct pcpu_chunk *chunk; |
| 828 | int slot, off; | 1075 | const char *err; |
| 1076 | int slot, off, new_alloc; | ||
| 1077 | unsigned long flags; | ||
| 829 | 1078 | ||
| 830 | if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { | 1079 | if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { |
| 831 | WARN(true, "illegal size (%zu) or align (%zu) for " | 1080 | WARN(true, "illegal size (%zu) or align (%zu) for " |
| @@ -834,17 +1083,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
| 834 | } | 1083 | } |
| 835 | 1084 | ||
| 836 | mutex_lock(&pcpu_alloc_mutex); | 1085 | mutex_lock(&pcpu_alloc_mutex); |
| 837 | spin_lock_irq(&pcpu_lock); | 1086 | spin_lock_irqsave(&pcpu_lock, flags); |
| 838 | 1087 | ||
| 839 | /* serve reserved allocations from the reserved chunk if available */ | 1088 | /* serve reserved allocations from the reserved chunk if available */ |
| 840 | if (reserved && pcpu_reserved_chunk) { | 1089 | if (reserved && pcpu_reserved_chunk) { |
| 841 | chunk = pcpu_reserved_chunk; | 1090 | chunk = pcpu_reserved_chunk; |
| 842 | if (size > chunk->contig_hint || | 1091 | |
| 843 | pcpu_extend_area_map(chunk) < 0) | 1092 | if (size > chunk->contig_hint) { |
| 1093 | err = "alloc from reserved chunk failed"; | ||
| 844 | goto fail_unlock; | 1094 | goto fail_unlock; |
| 1095 | } | ||
| 1096 | |||
| 1097 | while ((new_alloc = pcpu_need_to_extend(chunk))) { | ||
| 1098 | spin_unlock_irqrestore(&pcpu_lock, flags); | ||
| 1099 | if (pcpu_extend_area_map(chunk, new_alloc) < 0) { | ||
| 1100 | err = "failed to extend area map of reserved chunk"; | ||
| 1101 | goto fail_unlock_mutex; | ||
| 1102 | } | ||
| 1103 | spin_lock_irqsave(&pcpu_lock, flags); | ||
| 1104 | } | ||
| 1105 | |||
| 845 | off = pcpu_alloc_area(chunk, size, align); | 1106 | off = pcpu_alloc_area(chunk, size, align); |
| 846 | if (off >= 0) | 1107 | if (off >= 0) |
| 847 | goto area_found; | 1108 | goto area_found; |
| 1109 | |||
| 1110 | err = "alloc from reserved chunk failed"; | ||
| 848 | goto fail_unlock; | 1111 | goto fail_unlock; |
| 849 | } | 1112 | } |
| 850 | 1113 | ||
| @@ -855,13 +1118,20 @@ restart: | |||
| 855 | if (size > chunk->contig_hint) | 1118 | if (size > chunk->contig_hint) |
| 856 | continue; | 1119 | continue; |
| 857 | 1120 | ||
| 858 | switch (pcpu_extend_area_map(chunk)) { | 1121 | new_alloc = pcpu_need_to_extend(chunk); |
| 859 | case 0: | 1122 | if (new_alloc) { |
| 860 | break; | 1123 | spin_unlock_irqrestore(&pcpu_lock, flags); |
| 861 | case 1: | 1124 | if (pcpu_extend_area_map(chunk, |
| 862 | goto restart; /* pcpu_lock dropped, restart */ | 1125 | new_alloc) < 0) { |
| 863 | default: | 1126 | err = "failed to extend area map"; |
| 864 | goto fail_unlock; | 1127 | goto fail_unlock_mutex; |
| 1128 | } | ||
| 1129 | spin_lock_irqsave(&pcpu_lock, flags); | ||
| 1130 | /* | ||
| 1131 | * pcpu_lock has been dropped, need to | ||
| 1132 | * restart cpu_slot list walking. | ||
| 1133 | */ | ||
| 1134 | goto restart; | ||
| 865 | } | 1135 | } |
| 866 | 1136 | ||
| 867 | off = pcpu_alloc_area(chunk, size, align); | 1137 | off = pcpu_alloc_area(chunk, size, align); |
| @@ -871,35 +1141,45 @@ restart: | |||
| 871 | } | 1141 | } |
| 872 | 1142 | ||
| 873 | /* hmmm... no space left, create a new chunk */ | 1143 | /* hmmm... no space left, create a new chunk */ |
| 874 | spin_unlock_irq(&pcpu_lock); | 1144 | spin_unlock_irqrestore(&pcpu_lock, flags); |
| 875 | 1145 | ||
| 876 | chunk = alloc_pcpu_chunk(); | 1146 | chunk = alloc_pcpu_chunk(); |
| 877 | if (!chunk) | 1147 | if (!chunk) { |
| 1148 | err = "failed to allocate new chunk"; | ||
| 878 | goto fail_unlock_mutex; | 1149 | goto fail_unlock_mutex; |
| 1150 | } | ||
| 879 | 1151 | ||
| 880 | spin_lock_irq(&pcpu_lock); | 1152 | spin_lock_irqsave(&pcpu_lock, flags); |
| 881 | pcpu_chunk_relocate(chunk, -1); | 1153 | pcpu_chunk_relocate(chunk, -1); |
| 882 | pcpu_chunk_addr_insert(chunk); | ||
| 883 | goto restart; | 1154 | goto restart; |
| 884 | 1155 | ||
| 885 | area_found: | 1156 | area_found: |
| 886 | spin_unlock_irq(&pcpu_lock); | 1157 | spin_unlock_irqrestore(&pcpu_lock, flags); |
| 887 | 1158 | ||
| 888 | /* populate, map and clear the area */ | 1159 | /* populate, map and clear the area */ |
| 889 | if (pcpu_populate_chunk(chunk, off, size)) { | 1160 | if (pcpu_populate_chunk(chunk, off, size)) { |
| 890 | spin_lock_irq(&pcpu_lock); | 1161 | spin_lock_irqsave(&pcpu_lock, flags); |
| 891 | pcpu_free_area(chunk, off); | 1162 | pcpu_free_area(chunk, off); |
| 1163 | err = "failed to populate"; | ||
| 892 | goto fail_unlock; | 1164 | goto fail_unlock; |
| 893 | } | 1165 | } |
| 894 | 1166 | ||
| 895 | mutex_unlock(&pcpu_alloc_mutex); | 1167 | mutex_unlock(&pcpu_alloc_mutex); |
| 896 | 1168 | ||
| 897 | return __addr_to_pcpu_ptr(chunk->vm->addr + off); | 1169 | /* return address relative to base address */ |
| 1170 | return __addr_to_pcpu_ptr(chunk->base_addr + off); | ||
| 898 | 1171 | ||
| 899 | fail_unlock: | 1172 | fail_unlock: |
| 900 | spin_unlock_irq(&pcpu_lock); | 1173 | spin_unlock_irqrestore(&pcpu_lock, flags); |
| 901 | fail_unlock_mutex: | 1174 | fail_unlock_mutex: |
| 902 | mutex_unlock(&pcpu_alloc_mutex); | 1175 | mutex_unlock(&pcpu_alloc_mutex); |
| 1176 | if (warn_limit) { | ||
| 1177 | pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " | ||
| 1178 | "%s\n", size, align, err); | ||
| 1179 | dump_stack(); | ||
| 1180 | if (!--warn_limit) | ||
| 1181 | pr_info("PERCPU: limit reached, disable warning\n"); | ||
| 1182 | } | ||
| 903 | return NULL; | 1183 | return NULL; |
| 904 | } | 1184 | } |
| 905 | 1185 | ||
| @@ -968,17 +1248,17 @@ static void pcpu_reclaim(struct work_struct *work) | |||
| 968 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) | 1248 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) |
| 969 | continue; | 1249 | continue; |
| 970 | 1250 | ||
| 971 | rb_erase(&chunk->rb_node, &pcpu_addr_root); | ||
| 972 | list_move(&chunk->list, &todo); | 1251 | list_move(&chunk->list, &todo); |
| 973 | } | 1252 | } |
| 974 | 1253 | ||
| 975 | spin_unlock_irq(&pcpu_lock); | 1254 | spin_unlock_irq(&pcpu_lock); |
| 976 | mutex_unlock(&pcpu_alloc_mutex); | ||
| 977 | 1255 | ||
| 978 | list_for_each_entry_safe(chunk, next, &todo, list) { | 1256 | list_for_each_entry_safe(chunk, next, &todo, list) { |
| 979 | pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); | 1257 | pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); |
| 980 | free_pcpu_chunk(chunk); | 1258 | free_pcpu_chunk(chunk); |
| 981 | } | 1259 | } |
| 1260 | |||
| 1261 | mutex_unlock(&pcpu_alloc_mutex); | ||
| 982 | } | 1262 | } |
| 983 | 1263 | ||
| 984 | /** | 1264 | /** |
| @@ -1003,7 +1283,7 @@ void free_percpu(void *ptr) | |||
| 1003 | spin_lock_irqsave(&pcpu_lock, flags); | 1283 | spin_lock_irqsave(&pcpu_lock, flags); |
| 1004 | 1284 | ||
| 1005 | chunk = pcpu_chunk_addr_search(addr); | 1285 | chunk = pcpu_chunk_addr_search(addr); |
| 1006 | off = addr - chunk->vm->addr; | 1286 | off = addr - chunk->base_addr; |
| 1007 | 1287 | ||
| 1008 | pcpu_free_area(chunk, off); | 1288 | pcpu_free_area(chunk, off); |
| 1009 | 1289 | ||
| @@ -1022,30 +1302,299 @@ void free_percpu(void *ptr) | |||
| 1022 | } | 1302 | } |
| 1023 | EXPORT_SYMBOL_GPL(free_percpu); | 1303 | EXPORT_SYMBOL_GPL(free_percpu); |
| 1024 | 1304 | ||
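To put the allocation and free paths above in context, here is a minimal sketch of a kernel-side user of the dynamic percpu allocator. It relies only on the long-standing alloc_percpu()/free_percpu()/per_cpu_ptr() wrappers from include/linux/percpu.h; struct my_stats and the stats_* functions are invented for the example.

#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/smp.h>

/* invented example type; each possible CPU gets its own copy inside a chunk */
struct my_stats {
	unsigned long hits;
	unsigned long misses;
};

static struct my_stats *counters;

static int __init stats_init(void)
{
	/* ends up in pcpu_alloc() above; the area is returned zeroed */
	counters = alloc_percpu(struct my_stats);
	if (!counters)
		return -ENOMEM;
	return 0;
}

static void stats_hit(void)
{
	int cpu = get_cpu();			/* pin to the local CPU */

	per_cpu_ptr(counters, cpu)->hits++;	/* base_addr + unit offset + off */
	put_cpu();
}

static void __exit stats_exit(void)
{
	free_percpu(counters);		/* pcpu_free_area() + lazy chunk reclaim */
}

module_init(stats_init);
module_exit(stats_exit);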
| 1305 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, | ||
| 1306 | size_t reserved_size, | ||
| 1307 | ssize_t *dyn_sizep) | ||
| 1308 | { | ||
| 1309 | size_t size_sum; | ||
| 1310 | |||
| 1311 | size_sum = PFN_ALIGN(static_size + reserved_size + | ||
| 1312 | (*dyn_sizep >= 0 ? *dyn_sizep : 0)); | ||
| 1313 | if (*dyn_sizep != 0) | ||
| 1314 | *dyn_sizep = size_sum - static_size - reserved_size; | ||
| 1315 | |||
| 1316 | return size_sum; | ||
| 1317 | } | ||
| 1318 | |||
| 1025 | /** | 1319 | /** |
| 1026 | * pcpu_setup_first_chunk - initialize the first percpu chunk | 1320 | * pcpu_alloc_alloc_info - allocate percpu allocation info |
| 1027 | * @get_page_fn: callback to fetch page pointer | 1321 | * @nr_groups: the number of groups |
| 1028 | * @static_size: the size of static percpu area in bytes | 1322 | * @nr_units: the number of units |
| 1323 | * | ||
| 1324 | * Allocate ai which is large enough for @nr_groups groups containing | ||
| 1325 | * @nr_units units. The returned ai's groups[0].cpu_map points to the | ||
| 1326 | * cpu_map array which is long enough for @nr_units and filled with | ||
| 1327 | * NR_CPUS. It's the caller's responsibility to initialize cpu_map | ||
| 1328 | * pointer of other groups. | ||
| 1329 | * | ||
| 1330 | * RETURNS: | ||
| 1331 | * Pointer to the allocated pcpu_alloc_info on success, NULL on | ||
| 1332 | * failure. | ||
| 1333 | */ | ||
| 1334 | struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, | ||
| 1335 | int nr_units) | ||
| 1336 | { | ||
| 1337 | struct pcpu_alloc_info *ai; | ||
| 1338 | size_t base_size, ai_size; | ||
| 1339 | void *ptr; | ||
| 1340 | int unit; | ||
| 1341 | |||
| 1342 | base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), | ||
| 1343 | __alignof__(ai->groups[0].cpu_map[0])); | ||
| 1344 | ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); | ||
| 1345 | |||
| 1346 | ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); | ||
| 1347 | if (!ptr) | ||
| 1348 | return NULL; | ||
| 1349 | ai = ptr; | ||
| 1350 | ptr += base_size; | ||
| 1351 | |||
| 1352 | ai->groups[0].cpu_map = ptr; | ||
| 1353 | |||
| 1354 | for (unit = 0; unit < nr_units; unit++) | ||
| 1355 | ai->groups[0].cpu_map[unit] = NR_CPUS; | ||
| 1356 | |||
| 1357 | ai->nr_groups = nr_groups; | ||
| 1358 | ai->__ai_size = PFN_ALIGN(ai_size); | ||
| 1359 | |||
| 1360 | return ai; | ||
| 1361 | } | ||
| 1362 | |||
| 1363 | /** | ||
| 1364 | * pcpu_free_alloc_info - free percpu allocation info | ||
| 1365 | * @ai: pcpu_alloc_info to free | ||
| 1366 | * | ||
| 1367 | * Free @ai which was allocated by pcpu_alloc_alloc_info(). | ||
| 1368 | */ | ||
| 1369 | void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) | ||
| 1370 | { | ||
| 1371 | free_bootmem(__pa(ai), ai->__ai_size); | ||
| 1372 | } | ||
| 1373 | |||
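For illustration of what the returned structure looks like once filled in, here is a sketch of a hand-built single-group configuration. It mirrors the degenerate case of pcpu_build_alloc_info() below; build_flat_ai() and its unit_size parameter are hypothetical, and in-tree callers go through pcpu_build_alloc_info() instead.

/*
 * Hypothetical helper: one group, one unit per possible CPU, no NUMA
 * awareness.  unit_size must be page aligned, >= PCPU_MIN_UNIT_SIZE and
 * large enough for the static area plus the desired dynamic area.
 */
static struct pcpu_alloc_info * __init build_flat_ai(size_t unit_size)
{
	const size_t static_size = __per_cpu_end - __per_cpu_start;
	struct pcpu_alloc_info *ai;
	unsigned int cpu;
	int unit = 0;

	ai = pcpu_alloc_alloc_info(1, num_possible_cpus());
	if (!ai)
		return NULL;

	/* groups[0].cpu_map comes back sized for nr_units, filled with NR_CPUS */
	for_each_possible_cpu(cpu)
		ai->groups[0].cpu_map[unit++] = cpu;
	ai->groups[0].nr_units = unit;
	ai->groups[0].base_offset = 0;

	ai->static_size = static_size;
	ai->reserved_size = 0;
	ai->dyn_size = unit_size - static_size;
	ai->unit_size = unit_size;
	ai->atom_size = PAGE_SIZE;
	ai->alloc_size = unit_size;	/* one unit per allocation */

	return ai;
}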
| 1374 | /** | ||
| 1375 | * pcpu_build_alloc_info - build alloc_info considering distances between CPUs | ||
| 1029 | * @reserved_size: the size of reserved percpu area in bytes | 1376 | * @reserved_size: the size of reserved percpu area in bytes |
| 1030 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto | 1377 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto |
| 1031 | * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto | 1378 | * @atom_size: allocation atom size |
| 1032 | * @base_addr: mapped address, NULL for auto | 1379 | * @cpu_distance_fn: callback to determine distance between cpus, optional |
| 1033 | * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary | 1380 | * |
| 1381 | * This function determines grouping of units, their mappings to cpus | ||
| 1382 | * and other parameters considering needed percpu size, allocation | ||
| 1383 | * atom size and distances between CPUs. | ||
| 1384 | * | ||
| 1385 | * Groups are always multiples of atom size and CPUs which are of | ||
| 1386 | * LOCAL_DISTANCE both ways are grouped together and share space for | ||
| 1387 | * units in the same group. The returned configuration is guaranteed | ||
| 1388 | * to have CPUs on different nodes on different groups and >=75% usage | ||
| 1389 | * of allocated virtual address space. | ||
| 1390 | * | ||
| 1391 | * RETURNS: | ||
| 1392 | * On success, pointer to the new allocation_info is returned. On | ||
| 1393 | * failure, ERR_PTR value is returned. | ||
| 1394 | */ | ||
| 1395 | struct pcpu_alloc_info * __init pcpu_build_alloc_info( | ||
| 1396 | size_t reserved_size, ssize_t dyn_size, | ||
| 1397 | size_t atom_size, | ||
| 1398 | pcpu_fc_cpu_distance_fn_t cpu_distance_fn) | ||
| 1399 | { | ||
| 1400 | static int group_map[NR_CPUS] __initdata; | ||
| 1401 | static int group_cnt[NR_CPUS] __initdata; | ||
| 1402 | const size_t static_size = __per_cpu_end - __per_cpu_start; | ||
| 1403 | int group_cnt_max = 0, nr_groups = 1, nr_units = 0; | ||
| 1404 | size_t size_sum, min_unit_size, alloc_size; | ||
| 1405 | int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ | ||
| 1406 | int last_allocs, group, unit; | ||
| 1407 | unsigned int cpu, tcpu; | ||
| 1408 | struct pcpu_alloc_info *ai; | ||
| 1409 | unsigned int *cpu_map; | ||
| 1410 | |||
| 1411 | /* this function may be called multiple times */ | ||
| 1412 | memset(group_map, 0, sizeof(group_map)); | ||
| 1413 | memset(group_cnt, 0, sizeof(group_map)); | ||
| 1414 | |||
| 1415 | /* | ||
| 1416 | * Determine min_unit_size, alloc_size and max_upa such that | ||
| 1417 | * alloc_size is a multiple of atom_size and is the smallest | ||
| 1418 | * which can accommodate 4k aligned segments which are equal to | ||
| 1419 | * or larger than min_unit_size. | ||
| 1420 | */ | ||
| 1421 | size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); | ||
| 1422 | min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); | ||
| 1423 | |||
| 1424 | alloc_size = roundup(min_unit_size, atom_size); | ||
| 1425 | upa = alloc_size / min_unit_size; | ||
| 1426 | while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) | ||
| 1427 | upa--; | ||
| 1428 | max_upa = upa; | ||
| 1429 | |||
| 1430 | /* group cpus according to their proximity */ | ||
| 1431 | for_each_possible_cpu(cpu) { | ||
| 1432 | group = 0; | ||
| 1433 | next_group: | ||
| 1434 | for_each_possible_cpu(tcpu) { | ||
| 1435 | if (cpu == tcpu) | ||
| 1436 | break; | ||
| 1437 | if (group_map[tcpu] == group && cpu_distance_fn && | ||
| 1438 | (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || | ||
| 1439 | cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { | ||
| 1440 | group++; | ||
| 1441 | nr_groups = max(nr_groups, group + 1); | ||
| 1442 | goto next_group; | ||
| 1443 | } | ||
| 1444 | } | ||
| 1445 | group_map[cpu] = group; | ||
| 1446 | group_cnt[group]++; | ||
| 1447 | group_cnt_max = max(group_cnt_max, group_cnt[group]); | ||
| 1448 | } | ||
| 1449 | |||
| 1450 | /* | ||
| 1451 | * Expand unit size until address space usage goes over 75% | ||
| 1452 | * and then as much as possible without using more address | ||
| 1453 | * space. | ||
| 1454 | */ | ||
| 1455 | last_allocs = INT_MAX; | ||
| 1456 | for (upa = max_upa; upa; upa--) { | ||
| 1457 | int allocs = 0, wasted = 0; | ||
| 1458 | |||
| 1459 | if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) | ||
| 1460 | continue; | ||
| 1461 | |||
| 1462 | for (group = 0; group < nr_groups; group++) { | ||
| 1463 | int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); | ||
| 1464 | allocs += this_allocs; | ||
| 1465 | wasted += this_allocs * upa - group_cnt[group]; | ||
| 1466 | } | ||
| 1467 | |||
| 1468 | /* | ||
| 1469 | * Don't accept if wastage is over 25%. The | ||
| 1470 | * greater-than comparison ensures upa==1 always | ||
| 1471 | * passes the following check. | ||
| 1472 | */ | ||
| 1473 | if (wasted > num_possible_cpus() / 3) | ||
| 1474 | continue; | ||
| 1475 | |||
| 1476 | /* and then don't consume more memory */ | ||
| 1477 | if (allocs > last_allocs) | ||
| 1478 | break; | ||
| 1479 | last_allocs = allocs; | ||
| 1480 | best_upa = upa; | ||
| 1481 | } | ||
| 1482 | upa = best_upa; | ||
| 1483 | |||
| 1484 | /* allocate and fill alloc_info */ | ||
| 1485 | for (group = 0; group < nr_groups; group++) | ||
| 1486 | nr_units += roundup(group_cnt[group], upa); | ||
| 1487 | |||
| 1488 | ai = pcpu_alloc_alloc_info(nr_groups, nr_units); | ||
| 1489 | if (!ai) | ||
| 1490 | return ERR_PTR(-ENOMEM); | ||
| 1491 | cpu_map = ai->groups[0].cpu_map; | ||
| 1492 | |||
| 1493 | for (group = 0; group < nr_groups; group++) { | ||
| 1494 | ai->groups[group].cpu_map = cpu_map; | ||
| 1495 | cpu_map += roundup(group_cnt[group], upa); | ||
| 1496 | } | ||
| 1497 | |||
| 1498 | ai->static_size = static_size; | ||
| 1499 | ai->reserved_size = reserved_size; | ||
| 1500 | ai->dyn_size = dyn_size; | ||
| 1501 | ai->unit_size = alloc_size / upa; | ||
| 1502 | ai->atom_size = atom_size; | ||
| 1503 | ai->alloc_size = alloc_size; | ||
| 1504 | |||
| 1505 | for (group = 0, unit = 0; group_cnt[group]; group++) { | ||
| 1506 | struct pcpu_group_info *gi = &ai->groups[group]; | ||
| 1507 | |||
| 1508 | /* | ||
| 1509 | * Initialize base_offset as if all groups are located | ||
| 1510 | * back-to-back. The caller should update this to | ||
| 1511 | * reflect actual allocation. | ||
| 1512 | */ | ||
| 1513 | gi->base_offset = unit * ai->unit_size; | ||
| 1514 | |||
| 1515 | for_each_possible_cpu(cpu) | ||
| 1516 | if (group_map[cpu] == group) | ||
| 1517 | gi->cpu_map[gi->nr_units++] = cpu; | ||
| 1518 | gi->nr_units = roundup(gi->nr_units, upa); | ||
| 1519 | unit += gi->nr_units; | ||
| 1520 | } | ||
| 1521 | BUG_ON(unit != nr_units); | ||
| 1522 | |||
| 1523 | return ai; | ||
| 1524 | } | ||
| 1525 | |||
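A worked example (all numbers invented) may help. Assume 4 possible CPUs on 2 NUMA nodes with cross-node distances above LOCAL_DISTANCE, atom_size = 2M, and pcpu_calc_fc_sizes() yielding size_sum = 192K (comfortably above PCPU_MIN_UNIT_SIZE). Then min_unit_size = 192K and alloc_size = roundup(192K, 2M) = 2M; the initial upa of 2M/192K = 10 is walked down to max_upa = 8, the largest value that divides 2M evenly into page-aligned 256K units. The proximity scan puts two CPUs into each of two groups. Walking upa downward: upa = 8 and upa = 4 would waste 12 and 4 units respectively and fail the 25% check (the limit here is num_possible_cpus()/3 = 1), upa = 7, 6, 5 and 3 don't divide 2M evenly, and upa = 2 wastes nothing, so best_upa = 2. The result is two groups, each a single 2M allocation carved into two 1M units, i.e. unit_size = 1M and nr_units = 4.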
| 1526 | /** | ||
| 1527 | * pcpu_dump_alloc_info - print out information about pcpu_alloc_info | ||
| 1528 | * @lvl: loglevel | ||
| 1529 | * @ai: allocation info to dump | ||
| 1530 | * | ||
| 1531 | * Print out information about @ai using loglevel @lvl. | ||
| 1532 | */ | ||
| 1533 | static void pcpu_dump_alloc_info(const char *lvl, | ||
| 1534 | const struct pcpu_alloc_info *ai) | ||
| 1535 | { | ||
| 1536 | int group_width = 1, cpu_width = 1, width; | ||
| 1537 | char empty_str[] = "--------"; | ||
| 1538 | int alloc = 0, alloc_end = 0; | ||
| 1539 | int group, v; | ||
| 1540 | int upa, apl; /* units per alloc, allocs per line */ | ||
| 1541 | |||
| 1542 | v = ai->nr_groups; | ||
| 1543 | while (v /= 10) | ||
| 1544 | group_width++; | ||
| 1545 | |||
| 1546 | v = num_possible_cpus(); | ||
| 1547 | while (v /= 10) | ||
| 1548 | cpu_width++; | ||
| 1549 | empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; | ||
| 1550 | |||
| 1551 | upa = ai->alloc_size / ai->unit_size; | ||
| 1552 | width = upa * (cpu_width + 1) + group_width + 3; | ||
| 1553 | apl = rounddown_pow_of_two(max(60 / width, 1)); | ||
| 1554 | |||
| 1555 | printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", | ||
| 1556 | lvl, ai->static_size, ai->reserved_size, ai->dyn_size, | ||
| 1557 | ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); | ||
| 1558 | |||
| 1559 | for (group = 0; group < ai->nr_groups; group++) { | ||
| 1560 | const struct pcpu_group_info *gi = &ai->groups[group]; | ||
| 1561 | int unit = 0, unit_end = 0; | ||
| 1562 | |||
| 1563 | BUG_ON(gi->nr_units % upa); | ||
| 1564 | for (alloc_end += gi->nr_units / upa; | ||
| 1565 | alloc < alloc_end; alloc++) { | ||
| 1566 | if (!(alloc % apl)) { | ||
| 1567 | printk("\n"); | ||
| 1568 | printk("%spcpu-alloc: ", lvl); | ||
| 1569 | } | ||
| 1570 | printk("[%0*d] ", group_width, group); | ||
| 1571 | |||
| 1572 | for (unit_end += upa; unit < unit_end; unit++) | ||
| 1573 | if (gi->cpu_map[unit] != NR_CPUS) | ||
| 1574 | printk("%0*d ", cpu_width, | ||
| 1575 | gi->cpu_map[unit]); | ||
| 1576 | else | ||
| 1577 | printk("%s ", empty_str); | ||
| 1578 | } | ||
| 1579 | } | ||
| 1580 | printk("\n"); | ||
| 1581 | } | ||
| 1582 | |||
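Continuing the invented example above (say static_size = 151552, reserved_size = 12288 and dyn_size = 32768, which sum to the 192K used there), the dump would come out roughly as:

	pcpu-alloc: s151552 r12288 d32768 u1048576 alloc=1*2097152
	pcpu-alloc: [0] 0 1 [1] 2 3

i.e. one 2M allocation per group, with units 0-1 (CPUs 0 and 1) in group 0 and units 2-3 in group 1.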
| 1583 | /** | ||
| 1584 | * pcpu_setup_first_chunk - initialize the first percpu chunk | ||
| 1585 | * @ai: pcpu_alloc_info describing how the percpu area is shaped | ||
| 1586 | * @base_addr: mapped address | ||
| 1034 | * | 1587 | * |
| 1035 | * Initialize the first percpu chunk which contains the kernel static | 1588 | * Initialize the first percpu chunk which contains the kernel static |
| 1036 | * percpu area. This function is to be called from arch percpu area | 1589 | * percpu area. This function is to be called from arch percpu area |
| 1037 | * setup path. The first two parameters are mandatory. The rest are | 1590 | * setup path. |
| 1038 | * optional. | 1591 | * |
| 1039 | * | 1592 | * @ai contains all information necessary to initialize the first |
| 1040 | * @get_page_fn() should return pointer to percpu page given cpu | 1593 | * chunk and prime the dynamic percpu allocator. |
| 1041 | * number and page number. It should at least return enough pages to | 1594 | * |
| 1042 | * cover the static area. The returned pages for static area should | 1595 | * @ai->static_size is the size of static percpu area. |
| 1043 | * have been initialized with valid data. If @unit_size is specified, | 1596 | * |
| 1044 | * it can also return pages after the static area. NULL return | 1597 | * @ai->reserved_size, if non-zero, specifies the amount of bytes to |
| 1045 | * indicates end of pages for the cpu. Note that @get_page_fn() must | ||
| 1046 | * return the same number of pages for all cpus. | ||
| 1047 | * | ||
| 1048 | * @reserved_size, if non-zero, specifies the amount of bytes to | ||
| 1049 | * reserve after the static area in the first chunk. This reserves | 1598 | * reserve after the static area in the first chunk. This reserves |
| 1050 | * the first chunk such that it's available only through reserved | 1599 | * the first chunk such that it's available only through reserved |
| 1051 | * percpu allocation. This is primarily used to serve module percpu | 1600 | * percpu allocation. This is primarily used to serve module percpu |
| @@ -1053,22 +1602,29 @@ EXPORT_SYMBOL_GPL(free_percpu); | |||
| 1053 | * limited offset range for symbol relocations to guarantee module | 1602 | * limited offset range for symbol relocations to guarantee module |
| 1054 | * percpu symbols fall inside the relocatable range. | 1603 | * percpu symbols fall inside the relocatable range. |
| 1055 | * | 1604 | * |
| 1056 | * @dyn_size, if non-negative, determines the number of bytes | 1605 | * @ai->dyn_size determines the number of bytes available for dynamic |
| 1057 | * available for dynamic allocation in the first chunk. Specifying | 1606 | * allocation in the first chunk. The area between @ai->static_size + |
| 1058 | * non-negative value makes percpu leave alone the area beyond | 1607 | * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. |
| 1059 | * @static_size + @reserved_size + @dyn_size. | 1608 | * |
| 1609 | * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE | ||
| 1610 | * and equal to or larger than @ai->static_size + @ai->reserved_size + | ||
| 1611 | * @ai->dyn_size. | ||
| 1612 | * | ||
| 1613 | * @ai->atom_size is the allocation atom size and used as alignment | ||
| 1614 | * for vm areas. | ||
| 1060 | * | 1615 | * |
| 1061 | * @unit_size, if non-negative, specifies unit size and must be | 1616 | * @ai->alloc_size is the allocation size and always multiple of |
| 1062 | * aligned to PAGE_SIZE and equal to or larger than @static_size + | 1617 | * @ai->atom_size. This is larger than @ai->atom_size if |
| 1063 | * @reserved_size + if non-negative, @dyn_size. | 1618 | * @ai->unit_size is larger than @ai->atom_size. |
| 1064 | * | 1619 | * |
| 1065 | * Non-null @base_addr means that the caller already allocated virtual | 1620 | * @ai->nr_groups and @ai->groups describe virtual memory layout of |
| 1066 | * region for the first chunk and mapped it. percpu must not mess | 1621 | * percpu areas. Units which should be colocated are put into the |
| 1067 | * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL | 1622 | * same group. Dynamic VM areas will be allocated according to these |
| 1068 | * @populate_pte_fn doesn't make any sense. | 1623 | * groupings. If @ai->nr_groups is zero, a single group containing |
| 1624 | * all units is assumed. | ||
| 1069 | * | 1625 | * |
| 1070 | * @populate_pte_fn is used to populate the pagetable. NULL means the | 1626 | * The caller should have mapped the first chunk at @base_addr and |
| 1071 | * caller already populated the pagetable. | 1627 | * copied static data to each unit. |
| 1072 | * | 1628 | * |
| 1073 | * If the first chunk ends up with both reserved and dynamic areas, it | 1629 | * If the first chunk ends up with both reserved and dynamic areas, it |
| 1074 | * is served by two chunks - one to serve the core static and reserved | 1630 | * is served by two chunks - one to serve the core static and reserved |
| @@ -1078,49 +1634,98 @@ EXPORT_SYMBOL_GPL(free_percpu); | |||
| 1078 | * and available for dynamic allocation like any other chunks. | 1634 | * and available for dynamic allocation like any other chunks. |
| 1079 | * | 1635 | * |
| 1080 | * RETURNS: | 1636 | * RETURNS: |
| 1081 | * The determined pcpu_unit_size which can be used to initialize | 1637 | * 0 on success, -errno on failure. |
| 1082 | * percpu access. | ||
| 1083 | */ | 1638 | */ |
| 1084 | size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | 1639 | int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, |
| 1085 | size_t static_size, size_t reserved_size, | 1640 | void *base_addr) |
| 1086 | ssize_t dyn_size, ssize_t unit_size, | ||
| 1087 | void *base_addr, | ||
| 1088 | pcpu_populate_pte_fn_t populate_pte_fn) | ||
| 1089 | { | 1641 | { |
| 1090 | static struct vm_struct first_vm; | 1642 | static char cpus_buf[4096] __initdata; |
| 1091 | static int smap[2], dmap[2]; | 1643 | static int smap[2], dmap[2]; |
| 1092 | size_t size_sum = static_size + reserved_size + | 1644 | size_t dyn_size = ai->dyn_size; |
| 1093 | (dyn_size >= 0 ? dyn_size : 0); | 1645 | size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; |
| 1094 | struct pcpu_chunk *schunk, *dchunk = NULL; | 1646 | struct pcpu_chunk *schunk, *dchunk = NULL; |
| 1647 | unsigned long *group_offsets; | ||
| 1648 | size_t *group_sizes; | ||
| 1649 | unsigned long *unit_off; | ||
| 1095 | unsigned int cpu; | 1650 | unsigned int cpu; |
| 1096 | int nr_pages; | 1651 | int *unit_map; |
| 1097 | int err, i; | 1652 | int group, unit, i; |
| 1098 | 1653 | ||
| 1099 | /* santiy checks */ | 1654 | cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); |
| 1655 | |||
| 1656 | #define PCPU_SETUP_BUG_ON(cond) do { \ | ||
| 1657 | if (unlikely(cond)) { \ | ||
| 1658 | pr_emerg("PERCPU: failed to initialize, %s", #cond); \ | ||
| 1659 | pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ | ||
| 1660 | pcpu_dump_alloc_info(KERN_EMERG, ai); \ | ||
| 1661 | BUG(); \ | ||
| 1662 | } \ | ||
| 1663 | } while (0) | ||
| 1664 | |||
| 1665 | /* sanity checks */ | ||
| 1100 | BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || | 1666 | BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || |
| 1101 | ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); | 1667 | ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); |
| 1102 | BUG_ON(!static_size); | 1668 | PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); |
| 1103 | if (unit_size >= 0) { | 1669 | PCPU_SETUP_BUG_ON(!ai->static_size); |
| 1104 | BUG_ON(unit_size < size_sum); | 1670 | PCPU_SETUP_BUG_ON(!base_addr); |
| 1105 | BUG_ON(unit_size & ~PAGE_MASK); | 1671 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); |
| 1106 | BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); | 1672 | PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); |
| 1107 | } else | 1673 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); |
| 1108 | BUG_ON(base_addr); | 1674 | |
| 1109 | BUG_ON(base_addr && populate_pte_fn); | 1675 | /* process group information and build config tables accordingly */ |
| 1110 | 1676 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | |
| 1111 | if (unit_size >= 0) | 1677 | group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); |
| 1112 | pcpu_unit_pages = unit_size >> PAGE_SHIFT; | 1678 | unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); |
| 1113 | else | 1679 | unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); |
| 1114 | pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, | 1680 | |
| 1115 | PFN_UP(size_sum)); | 1681 | for (cpu = 0; cpu < nr_cpu_ids; cpu++) |
| 1682 | unit_map[cpu] = UINT_MAX; | ||
| 1683 | pcpu_first_unit_cpu = NR_CPUS; | ||
| 1684 | |||
| 1685 | for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { | ||
| 1686 | const struct pcpu_group_info *gi = &ai->groups[group]; | ||
| 1687 | |||
| 1688 | group_offsets[group] = gi->base_offset; | ||
| 1689 | group_sizes[group] = gi->nr_units * ai->unit_size; | ||
| 1690 | |||
| 1691 | for (i = 0; i < gi->nr_units; i++) { | ||
| 1692 | cpu = gi->cpu_map[i]; | ||
| 1693 | if (cpu == NR_CPUS) | ||
| 1694 | continue; | ||
| 1116 | 1695 | ||
| 1117 | pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; | 1696 | PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); |
| 1118 | pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; | 1697 | PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); |
| 1119 | pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) | 1698 | PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); |
| 1120 | + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); | 1699 | |
| 1700 | unit_map[cpu] = unit + i; | ||
| 1701 | unit_off[cpu] = gi->base_offset + i * ai->unit_size; | ||
| 1702 | |||
| 1703 | if (pcpu_first_unit_cpu == NR_CPUS) | ||
| 1704 | pcpu_first_unit_cpu = cpu; | ||
| 1705 | } | ||
| 1706 | } | ||
| 1707 | pcpu_last_unit_cpu = cpu; | ||
| 1708 | pcpu_nr_units = unit; | ||
| 1709 | |||
| 1710 | for_each_possible_cpu(cpu) | ||
| 1711 | PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); | ||
| 1712 | |||
| 1713 | /* we're done parsing the input, undefine BUG macro and dump config */ | ||
| 1714 | #undef PCPU_SETUP_BUG_ON | ||
| 1715 | pcpu_dump_alloc_info(KERN_INFO, ai); | ||
| 1121 | 1716 | ||
| 1122 | if (dyn_size < 0) | 1717 | pcpu_nr_groups = ai->nr_groups; |
| 1123 | dyn_size = pcpu_unit_size - static_size - reserved_size; | 1718 | pcpu_group_offsets = group_offsets; |
| 1719 | pcpu_group_sizes = group_sizes; | ||
| 1720 | pcpu_unit_map = unit_map; | ||
| 1721 | pcpu_unit_offsets = unit_off; | ||
| 1722 | |||
| 1723 | /* determine basic parameters */ | ||
| 1724 | pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; | ||
| 1725 | pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; | ||
| 1726 | pcpu_atom_size = ai->atom_size; | ||
| 1727 | pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + | ||
| 1728 | BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); | ||
| 1124 | 1729 | ||
| 1125 | /* | 1730 | /* |
| 1126 | * Allocate chunk slots. The additional last slot is for | 1731 | * Allocate chunk slots. The additional last slot is for |
| @@ -1140,187 +1745,368 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, | |||
| 1140 | */ | 1745 | */ |
| 1141 | schunk = alloc_bootmem(pcpu_chunk_struct_size); | 1746 | schunk = alloc_bootmem(pcpu_chunk_struct_size); |
| 1142 | INIT_LIST_HEAD(&schunk->list); | 1747 | INIT_LIST_HEAD(&schunk->list); |
| 1143 | schunk->vm = &first_vm; | 1748 | schunk->base_addr = base_addr; |
| 1144 | schunk->map = smap; | 1749 | schunk->map = smap; |
| 1145 | schunk->map_alloc = ARRAY_SIZE(smap); | 1750 | schunk->map_alloc = ARRAY_SIZE(smap); |
| 1146 | schunk->page = schunk->page_ar; | 1751 | schunk->immutable = true; |
| 1752 | bitmap_fill(schunk->populated, pcpu_unit_pages); | ||
| 1147 | 1753 | ||
| 1148 | if (reserved_size) { | 1754 | if (ai->reserved_size) { |
| 1149 | schunk->free_size = reserved_size; | 1755 | schunk->free_size = ai->reserved_size; |
| 1150 | pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ | 1756 | pcpu_reserved_chunk = schunk; |
| 1757 | pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; | ||
| 1151 | } else { | 1758 | } else { |
| 1152 | schunk->free_size = dyn_size; | 1759 | schunk->free_size = dyn_size; |
| 1153 | dyn_size = 0; /* dynamic area covered */ | 1760 | dyn_size = 0; /* dynamic area covered */ |
| 1154 | } | 1761 | } |
| 1155 | schunk->contig_hint = schunk->free_size; | 1762 | schunk->contig_hint = schunk->free_size; |
| 1156 | 1763 | ||
| 1157 | schunk->map[schunk->map_used++] = -static_size; | 1764 | schunk->map[schunk->map_used++] = -ai->static_size; |
| 1158 | if (schunk->free_size) | 1765 | if (schunk->free_size) |
| 1159 | schunk->map[schunk->map_used++] = schunk->free_size; | 1766 | schunk->map[schunk->map_used++] = schunk->free_size; |
| 1160 | 1767 | ||
| 1161 | pcpu_reserved_chunk_limit = static_size + schunk->free_size; | ||
| 1162 | |||
| 1163 | /* init dynamic chunk if necessary */ | 1768 | /* init dynamic chunk if necessary */ |
| 1164 | if (dyn_size) { | 1769 | if (dyn_size) { |
| 1165 | dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); | 1770 | dchunk = alloc_bootmem(pcpu_chunk_struct_size); |
| 1166 | INIT_LIST_HEAD(&dchunk->list); | 1771 | INIT_LIST_HEAD(&dchunk->list); |
| 1167 | dchunk->vm = &first_vm; | 1772 | dchunk->base_addr = base_addr; |
| 1168 | dchunk->map = dmap; | 1773 | dchunk->map = dmap; |
| 1169 | dchunk->map_alloc = ARRAY_SIZE(dmap); | 1774 | dchunk->map_alloc = ARRAY_SIZE(dmap); |
| 1170 | dchunk->page = schunk->page_ar; /* share page map with schunk */ | 1775 | dchunk->immutable = true; |
| 1776 | bitmap_fill(dchunk->populated, pcpu_unit_pages); | ||
| 1171 | 1777 | ||
| 1172 | dchunk->contig_hint = dchunk->free_size = dyn_size; | 1778 | dchunk->contig_hint = dchunk->free_size = dyn_size; |
| 1173 | dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; | 1779 | dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; |
| 1174 | dchunk->map[dchunk->map_used++] = dchunk->free_size; | 1780 | dchunk->map[dchunk->map_used++] = dchunk->free_size; |
| 1175 | } | 1781 | } |
| 1176 | 1782 | ||
| 1177 | /* allocate vm address */ | ||
| 1178 | first_vm.flags = VM_ALLOC; | ||
| 1179 | first_vm.size = pcpu_chunk_size; | ||
| 1180 | |||
| 1181 | if (!base_addr) | ||
| 1182 | vm_area_register_early(&first_vm, PAGE_SIZE); | ||
| 1183 | else { | ||
| 1184 | /* | ||
| 1185 | * Pages already mapped. No need to remap into | ||
| 1186 | * vmalloc area. In this case the first chunks can't | ||
| 1187 | * be mapped or unmapped by percpu and are marked | ||
| 1188 | * immutable. | ||
| 1189 | */ | ||
| 1190 | first_vm.addr = base_addr; | ||
| 1191 | schunk->immutable = true; | ||
| 1192 | if (dchunk) | ||
| 1193 | dchunk->immutable = true; | ||
| 1194 | } | ||
| 1195 | |||
| 1196 | /* assign pages */ | ||
| 1197 | nr_pages = -1; | ||
| 1198 | for_each_possible_cpu(cpu) { | ||
| 1199 | for (i = 0; i < pcpu_unit_pages; i++) { | ||
| 1200 | struct page *page = get_page_fn(cpu, i); | ||
| 1201 | |||
| 1202 | if (!page) | ||
| 1203 | break; | ||
| 1204 | *pcpu_chunk_pagep(schunk, cpu, i) = page; | ||
| 1205 | } | ||
| 1206 | |||
| 1207 | BUG_ON(i < PFN_UP(static_size)); | ||
| 1208 | |||
| 1209 | if (nr_pages < 0) | ||
| 1210 | nr_pages = i; | ||
| 1211 | else | ||
| 1212 | BUG_ON(nr_pages != i); | ||
| 1213 | } | ||
| 1214 | |||
| 1215 | /* map them */ | ||
| 1216 | if (populate_pte_fn) { | ||
| 1217 | for_each_possible_cpu(cpu) | ||
| 1218 | for (i = 0; i < nr_pages; i++) | ||
| 1219 | populate_pte_fn(pcpu_chunk_addr(schunk, | ||
| 1220 | cpu, i)); | ||
| 1221 | |||
| 1222 | err = pcpu_map(schunk, 0, nr_pages); | ||
| 1223 | if (err) | ||
| 1224 | panic("failed to setup static percpu area, err=%d\n", | ||
| 1225 | err); | ||
| 1226 | } | ||
| 1227 | |||
| 1228 | /* link the first chunk in */ | 1783 | /* link the first chunk in */ |
| 1229 | if (!dchunk) { | 1784 | pcpu_first_chunk = dchunk ?: schunk; |
| 1230 | pcpu_chunk_relocate(schunk, -1); | 1785 | pcpu_chunk_relocate(pcpu_first_chunk, -1); |
| 1231 | pcpu_chunk_addr_insert(schunk); | ||
| 1232 | } else { | ||
| 1233 | pcpu_chunk_relocate(dchunk, -1); | ||
| 1234 | pcpu_chunk_addr_insert(dchunk); | ||
| 1235 | } | ||
| 1236 | 1786 | ||
| 1237 | /* we're done */ | 1787 | /* we're done */ |
| 1238 | pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); | 1788 | pcpu_base_addr = base_addr; |
| 1239 | return pcpu_unit_size; | 1789 | return 0; |
| 1240 | } | 1790 | } |
| 1241 | 1791 | ||
| 1242 | /* | 1792 | const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { |
| 1243 | * Embedding first chunk setup helper. | 1793 | [PCPU_FC_AUTO] = "auto", |
| 1244 | */ | 1794 | [PCPU_FC_EMBED] = "embed", |
| 1245 | static void *pcpue_ptr __initdata; | 1795 | [PCPU_FC_PAGE] = "page", |
| 1246 | static size_t pcpue_size __initdata; | 1796 | }; |
| 1247 | static size_t pcpue_unit_size __initdata; | ||
| 1248 | 1797 | ||
| 1249 | static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) | 1798 | enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; |
| 1250 | { | ||
| 1251 | size_t off = (size_t)pageno << PAGE_SHIFT; | ||
| 1252 | 1799 | ||
| 1253 | if (off >= pcpue_size) | 1800 | static int __init percpu_alloc_setup(char *str) |
| 1254 | return NULL; | 1801 | { |
| 1802 | if (0) | ||
| 1803 | /* nada */; | ||
| 1804 | #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK | ||
| 1805 | else if (!strcmp(str, "embed")) | ||
| 1806 | pcpu_chosen_fc = PCPU_FC_EMBED; | ||
| 1807 | #endif | ||
| 1808 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | ||
| 1809 | else if (!strcmp(str, "page")) | ||
| 1810 | pcpu_chosen_fc = PCPU_FC_PAGE; | ||
| 1811 | #endif | ||
| 1812 | else | ||
| 1813 | pr_warning("PERCPU: unknown allocator %s specified\n", str); | ||
| 1255 | 1814 | ||
| 1256 | return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); | 1815 | return 0; |
| 1257 | } | 1816 | } |
| 1817 | early_param("percpu_alloc", percpu_alloc_setup); | ||
| 1258 | 1818 | ||
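On architectures that honor pcpu_chosen_fc and build more than one first-chunk allocator, this gives a kernel command line override; for example, booting with

	percpu_alloc=page

forces the page-by-page first chunk (pcpu_page_first_chunk() below) even where the embedding allocator would otherwise be preferred.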
| 1819 | #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ | ||
| 1820 | !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) | ||
| 1259 | /** | 1821 | /** |
| 1260 | * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem | 1822 | * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem |
| 1261 | * @static_size: the size of static percpu area in bytes | ||
| 1262 | * @reserved_size: the size of reserved percpu area in bytes | 1823 | * @reserved_size: the size of reserved percpu area in bytes |
| 1263 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto | 1824 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto |
| 1264 | * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto | 1825 | * @atom_size: allocation atom size |
| 1826 | * @cpu_distance_fn: callback to determine distance between cpus, optional | ||
| 1827 | * @alloc_fn: function to allocate percpu page | ||
| 1828 | * @free_fn: function to free percpu page | ||
| 1265 | * | 1829 | * |
| 1266 | * This is a helper to ease setting up embedded first percpu chunk and | 1830 | * This is a helper to ease setting up embedded first percpu chunk and |
| 1267 | * can be called where pcpu_setup_first_chunk() is expected. | 1831 | * can be called where pcpu_setup_first_chunk() is expected. |
| 1268 | * | 1832 | * |
| 1269 | * If this function is used to setup the first chunk, it is allocated | 1833 | * If this function is used to setup the first chunk, it is allocated |
| 1270 | * as a contiguous area using bootmem allocator and used as-is without | 1834 | * by calling @alloc_fn and used as-is without being mapped into |
| 1271 | * being mapped into vmalloc area. This enables the first chunk to | 1835 | * vmalloc area. Allocations are always whole multiples of @atom_size |
| 1272 | * piggy back on the linear physical mapping which often uses larger | 1836 | * aligned to @atom_size. |
| 1273 | * page size. | 1837 | * |
| 1838 | * This enables the first chunk to piggy back on the linear physical | ||
| 1839 | * mapping which often uses larger page size. Please note that this | ||
| 1840 | * can result in very sparse cpu->unit mapping on NUMA machines thus | ||
| 1841 | * requiring large vmalloc address space. Don't use this allocator if | ||
| 1842 | * vmalloc space is not orders of magnitude larger than distances | ||
| 1843 | * between node memory addresses (ie. 32bit NUMA machines). | ||
| 1274 | * | 1844 | * |
| 1275 | * When @dyn_size is positive, dynamic area might be larger than | 1845 | * When @dyn_size is positive, dynamic area might be larger than |
| 1276 | * specified to fill page alignment. Also, when @dyn_size is auto, | 1846 | * specified to fill page alignment. When @dyn_size is auto, |
| 1277 | * @dyn_size does not fill the whole first chunk but only what's | 1847 | * @dyn_size is just big enough to fill page alignment after static |
| 1278 | * necessary for page alignment after static and reserved areas. | 1848 | * and reserved areas. |
| 1279 | * | 1849 | * |
| 1280 | * If the needed size is smaller than the minimum or specified unit | 1850 | * If the needed size is smaller than the minimum or specified unit |
| 1281 | * size, the leftover is returned to the bootmem allocator. | 1851 | * size, the leftover is returned using @free_fn. |
| 1282 | * | 1852 | * |
| 1283 | * RETURNS: | 1853 | * RETURNS: |
| 1284 | * The determined pcpu_unit_size which can be used to initialize | 1854 | * 0 on success, -errno on failure. |
| 1285 | * percpu access on success, -errno on failure. | ||
| 1286 | */ | 1855 | */ |
| 1287 | ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, | 1856 | int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, |
| 1288 | ssize_t dyn_size, ssize_t unit_size) | 1857 | size_t atom_size, |
| 1858 | pcpu_fc_cpu_distance_fn_t cpu_distance_fn, | ||
| 1859 | pcpu_fc_alloc_fn_t alloc_fn, | ||
| 1860 | pcpu_fc_free_fn_t free_fn) | ||
| 1289 | { | 1861 | { |
| 1290 | unsigned int cpu; | 1862 | void *base = (void *)ULONG_MAX; |
| 1863 | void **areas = NULL; | ||
| 1864 | struct pcpu_alloc_info *ai; | ||
| 1865 | size_t size_sum, areas_size, max_distance; | ||
| 1866 | int group, i, rc; | ||
| 1867 | |||
| 1868 | ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, | ||
| 1869 | cpu_distance_fn); | ||
| 1870 | if (IS_ERR(ai)) | ||
| 1871 | return PTR_ERR(ai); | ||
| 1872 | |||
| 1873 | size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; | ||
| 1874 | areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); | ||
| 1875 | |||
| 1876 | areas = alloc_bootmem_nopanic(areas_size); | ||
| 1877 | if (!areas) { | ||
| 1878 | rc = -ENOMEM; | ||
| 1879 | goto out_free; | ||
| 1880 | } | ||
| 1291 | 1881 | ||
| 1292 | /* determine parameters and allocate */ | 1882 | /* allocate, copy and determine base address */ |
| 1293 | pcpue_size = PFN_ALIGN(static_size + reserved_size + | 1883 | for (group = 0; group < ai->nr_groups; group++) { |
| 1294 | (dyn_size >= 0 ? dyn_size : 0)); | 1884 | struct pcpu_group_info *gi = &ai->groups[group]; |
| 1295 | if (dyn_size != 0) | 1885 | unsigned int cpu = NR_CPUS; |
| 1296 | dyn_size = pcpue_size - static_size - reserved_size; | 1886 | void *ptr; |
| 1297 | 1887 | ||
| 1298 | if (unit_size >= 0) { | 1888 | for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) |
| 1299 | BUG_ON(unit_size < pcpue_size); | 1889 | cpu = gi->cpu_map[i]; |
| 1300 | pcpue_unit_size = unit_size; | 1890 | BUG_ON(cpu == NR_CPUS); |
| 1301 | } else | 1891 | |
| 1302 | pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); | 1892 | /* allocate space for the whole group */ |
| 1303 | 1893 | ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); | |
| 1304 | pcpue_ptr = __alloc_bootmem_nopanic( | 1894 | if (!ptr) { |
| 1305 | num_possible_cpus() * pcpue_unit_size, | 1895 | rc = -ENOMEM; |
| 1306 | PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); | 1896 | goto out_free_areas; |
| 1307 | if (!pcpue_ptr) | 1897 | } |
| 1308 | return -ENOMEM; | 1898 | areas[group] = ptr; |
| 1309 | 1899 | ||
| 1310 | /* return the leftover and copy */ | 1900 | base = min(ptr, base); |
| 1311 | for_each_possible_cpu(cpu) { | 1901 | |
| 1312 | void *ptr = pcpue_ptr + cpu * pcpue_unit_size; | 1902 | for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { |
| 1903 | if (gi->cpu_map[i] == NR_CPUS) { | ||
| 1904 | /* unused unit, free whole */ | ||
| 1905 | free_fn(ptr, ai->unit_size); | ||
| 1906 | continue; | ||
| 1907 | } | ||
| 1908 | /* copy and return the unused part */ | ||
| 1909 | memcpy(ptr, __per_cpu_load, ai->static_size); | ||
| 1910 | free_fn(ptr + size_sum, ai->unit_size - size_sum); | ||
| 1911 | } | ||
| 1912 | } | ||
| 1913 | |||
| 1914 | /* base address is now known, determine group base offsets */ | ||
| 1915 | max_distance = 0; | ||
| 1916 | for (group = 0; group < ai->nr_groups; group++) { | ||
| 1917 | ai->groups[group].base_offset = areas[group] - base; | ||
| 1918 | max_distance = max_t(size_t, max_distance, | ||
| 1919 | ai->groups[group].base_offset); | ||
| 1920 | } | ||
| 1921 | max_distance += ai->unit_size; | ||
| 1922 | |||
| 1923 | /* warn if maximum distance is further than 75% of vmalloc space */ | ||
| 1924 | if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { | ||
| 1925 | pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " | ||
| 1926 | "space 0x%lx\n", | ||
| 1927 | max_distance, VMALLOC_END - VMALLOC_START); | ||
| 1928 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | ||
| 1929 | /* and fail if we have fallback */ | ||
| 1930 | rc = -EINVAL; | ||
| 1931 | goto out_free; | ||
| 1932 | #endif | ||
| 1933 | } | ||
| 1934 | |||
| 1935 | pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", | ||
| 1936 | PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, | ||
| 1937 | ai->dyn_size, ai->unit_size); | ||
| 1938 | |||
| 1939 | rc = pcpu_setup_first_chunk(ai, base); | ||
| 1940 | goto out_free; | ||
| 1941 | |||
| 1942 | out_free_areas: | ||
| 1943 | for (group = 0; group < ai->nr_groups; group++) | ||
| 1944 | free_fn(areas[group], | ||
| 1945 | ai->groups[group].nr_units * ai->unit_size); | ||
| 1946 | out_free: | ||
| 1947 | pcpu_free_alloc_info(ai); | ||
| 1948 | if (areas) | ||
| 1949 | free_bootmem(__pa(areas), areas_size); | ||
| 1950 | return rc; | ||
| 1951 | } | ||
| 1952 | #endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || | ||
| 1953 | !CONFIG_HAVE_SETUP_PER_CPU_AREA */ | ||
| 1954 | |||
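As a rough sketch of how an architecture (or the generic !CONFIG_HAVE_SETUP_PER_CPU_AREA fallback added elsewhere in this series) drives the embed helper: the bootmem-backed callbacks and the exact reserve sizes below are illustrative, not part of this hunk.

#include <linux/bootmem.h>
#include <linux/percpu.h>

static void * __init my_fc_alloc(unsigned int cpu, size_t size, size_t align)
{
	/* node-blind bootmem allocation; a NUMA arch would honor cpu_to_node(cpu) */
	return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init my_fc_free(void *ptr, size_t size)
{
	free_bootmem(__pa(ptr), size);
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
				    PAGE_SIZE, NULL, my_fc_alloc, my_fc_free);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

	/* point the per-cpu offsets at the units inside the first chunk */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}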
| 1955 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | ||
| 1956 | /** | ||
| 1957 | * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages | ||
| 1958 | * @reserved_size: the size of reserved percpu area in bytes | ||
| 1959 | * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE | ||
| 1961 | * @free_fn: function to free percpu page, always called with PAGE_SIZE | ||
| 1961 | * @populate_pte_fn: function to populate pte | ||
| 1962 | * | ||
| 1963 | * This is a helper to ease setting up page-remapped first percpu | ||
| 1964 | * chunk and can be called where pcpu_setup_first_chunk() is expected. | ||
| 1965 | * | ||
| 1966 | * This is the basic allocator. Static percpu area is allocated | ||
| 1967 | * page-by-page into vmalloc area. | ||
| 1968 | * | ||
| 1969 | * RETURNS: | ||
| 1970 | * 0 on success, -errno on failure. | ||
| 1971 | */ | ||
| 1972 | int __init pcpu_page_first_chunk(size_t reserved_size, | ||
| 1973 | pcpu_fc_alloc_fn_t alloc_fn, | ||
| 1974 | pcpu_fc_free_fn_t free_fn, | ||
| 1975 | pcpu_fc_populate_pte_fn_t populate_pte_fn) | ||
| 1976 | { | ||
| 1977 | static struct vm_struct vm; | ||
| 1978 | struct pcpu_alloc_info *ai; | ||
| 1979 | char psize_str[16]; | ||
| 1980 | int unit_pages; | ||
| 1981 | size_t pages_size; | ||
| 1982 | struct page **pages; | ||
| 1983 | int unit, i, j, rc; | ||
| 1984 | |||
| 1985 | snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); | ||
| 1986 | |||
| 1987 | ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL); | ||
| 1988 | if (IS_ERR(ai)) | ||
| 1989 | return PTR_ERR(ai); | ||
| 1990 | BUG_ON(ai->nr_groups != 1); | ||
| 1991 | BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); | ||
| 1992 | |||
| 1993 | unit_pages = ai->unit_size >> PAGE_SHIFT; | ||
| 1994 | |||
| 1995 | /* unaligned allocations can't be freed, round up to page size */ | ||
| 1996 | pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * | ||
| 1997 | sizeof(pages[0])); | ||
| 1998 | pages = alloc_bootmem(pages_size); | ||
| 1999 | |||
| 2000 | /* allocate pages */ | ||
| 2001 | j = 0; | ||
| 2002 | for (unit = 0; unit < num_possible_cpus(); unit++) | ||
| 2003 | for (i = 0; i < unit_pages; i++) { | ||
| 2004 | unsigned int cpu = ai->groups[0].cpu_map[unit]; | ||
| 2005 | void *ptr; | ||
| 2006 | |||
| 2007 | ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); | ||
| 2008 | if (!ptr) { | ||
| 2009 | pr_warning("PERCPU: failed to allocate %s page " | ||
| 2010 | "for cpu%u\n", psize_str, cpu); | ||
| 2011 | goto enomem; | ||
| 2012 | } | ||
| 2013 | pages[j++] = virt_to_page(ptr); | ||
| 2014 | } | ||
| 2015 | |||
| 2016 | /* allocate vm area, map the pages and copy static data */ | ||
| 2017 | vm.flags = VM_ALLOC; | ||
| 2018 | vm.size = num_possible_cpus() * ai->unit_size; | ||
| 2019 | vm_area_register_early(&vm, PAGE_SIZE); | ||
| 2020 | |||
| 2021 | for (unit = 0; unit < num_possible_cpus(); unit++) { | ||
| 2022 | unsigned long unit_addr = | ||
| 2023 | (unsigned long)vm.addr + unit * ai->unit_size; | ||
| 2024 | |||
| 2025 | for (i = 0; i < unit_pages; i++) | ||
| 2026 | populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); | ||
| 2027 | |||
| 2028 | /* pte already populated, the following shouldn't fail */ | ||
| 2029 | rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], | ||
| 2030 | unit_pages); | ||
| 2031 | if (rc < 0) | ||
| 2032 | panic("failed to map percpu area, err=%d\n", rc); | ||
| 1313 | 2033 | ||
| 1314 | free_bootmem(__pa(ptr + pcpue_size), | 2034 | /* |
| 1315 | pcpue_unit_size - pcpue_size); | 2035 | * FIXME: Archs with virtual cache should flush local |
| 1316 | memcpy(ptr, __per_cpu_load, static_size); | 2036 | * cache for the linear mapping here - something |
| 2037 | * equivalent to flush_cache_vmap() on the local cpu. | ||
| 2038 | * flush_cache_vmap() can't be used as most supporting | ||
| 2039 | * data structures are not set up yet. | ||
| 2040 | */ | ||
| 2041 | |||
| 2042 | /* copy static data */ | ||
| 2043 | memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); | ||
| 1317 | } | 2044 | } |
| 1318 | 2045 | ||
| 1319 | /* we're ready, commit */ | 2046 | /* we're ready, commit */ |
| 1320 | pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", | 2047 | pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", |
| 1321 | pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); | 2048 | unit_pages, psize_str, vm.addr, ai->static_size, |
| 2049 | ai->reserved_size, ai->dyn_size); | ||
| 2050 | |||
| 2051 | rc = pcpu_setup_first_chunk(ai, vm.addr); | ||
| 2052 | goto out_free_ar; | ||
| 2053 | |||
| 2054 | enomem: | ||
| 2055 | while (--j >= 0) | ||
| 2056 | free_fn(page_address(pages[j]), PAGE_SIZE); | ||
| 2057 | rc = -ENOMEM; | ||
| 2058 | out_free_ar: | ||
| 2059 | free_bootmem(__pa(pages), pages_size); | ||
| 2060 | pcpu_free_alloc_info(ai); | ||
| 2061 | return rc; | ||
| 2062 | } | ||
| 2063 | #endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ | ||
| 2064 | |||
| 2065 | /* | ||
| 2066 | * Generic percpu area setup. | ||
| 2067 | * | ||
| 2068 | * The embedding helper is used because its behavior closely resembles | ||
| 2069 | * the original non-dynamic generic percpu area setup. This is | ||
| 2070 | * important because many archs have addressing restrictions and might | ||
| 2071 | * fail if the percpu area is located far away from the previous | ||
| 2072 | * location. As an added bonus, in non-NUMA cases, embedding is | ||
| 2073 | * generally a good idea TLB-wise because the percpu area can piggyback | ||
| 2074 | * on the physical linear memory mapping which uses large page | ||
| 2075 | * mappings on applicable archs. | ||
| 2076 | */ | ||
| 2077 | #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA | ||
| 2078 | unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; | ||
| 2079 | EXPORT_SYMBOL(__per_cpu_offset); | ||
| 1322 | 2080 | ||
| 1323 | return pcpu_setup_first_chunk(pcpue_get_page, static_size, | 2081 | static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, |
| 1324 | reserved_size, dyn_size, | 2082 | size_t align) |
| 1325 | pcpue_unit_size, pcpue_ptr, NULL); | 2083 | { |
| 2084 | return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); | ||
| 2085 | } | ||
| 2086 | |||
| 2087 | static void __init pcpu_dfl_fc_free(void *ptr, size_t size) | ||
| 2088 | { | ||
| 2089 | free_bootmem(__pa(ptr), size); | ||
| 2090 | } | ||
| 2091 | |||
| 2092 | void __init setup_per_cpu_areas(void) | ||
| 2093 | { | ||
| 2094 | unsigned long delta; | ||
| 2095 | unsigned int cpu; | ||
| 2096 | int rc; | ||
| 2097 | |||
| 2098 | /* | ||
| 2099 | * Always reserve area for module percpu variables. That's | ||
| 2100 | * what the legacy allocator did. | ||
| 2101 | */ | ||
| 2102 | rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, | ||
| 2103 | PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, | ||
| 2104 | pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); | ||
| 2105 | if (rc < 0) | ||
| 2106 | panic("Failed to initialized percpu areas."); | ||
| 2107 | |||
| 2108 | delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; | ||
| 2109 | for_each_possible_cpu(cpu) | ||
| 2110 | __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; | ||
| 1326 | } | 2111 | } |
| 2112 | #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ | ||
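As a rough illustration of the generic setup path above (ordinary userspace C, not kernel code): the sketch below fakes the first chunk with a malloc'd buffer and reproduces the "delta + unit offset" arithmetic that setup_per_cpu_areas() uses to fill __per_cpu_offset[]. Names such as fake_per_cpu_section, NR_CPUS and UNIT_SIZE are inventions of the sketch.

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NR_CPUS   4
    #define UNIT_SIZE 4096                  /* stand-in for ai->unit_size */

    /* stand-in for the __per_cpu_start..__per_cpu_load static section */
    static char fake_per_cpu_section[128];

    int main(void)
    {
        /* stand-in for the chunk handed back by the embed helper */
        char *base = malloc(NR_CPUS * UNIT_SIZE);
        intptr_t per_cpu_offset[NR_CPUS];
        intptr_t delta;
        int cpu;

        if (!base)
            return 1;

        /* each unit begins with its own copy of the static percpu data */
        for (cpu = 0; cpu < NR_CPUS; cpu++)
            memcpy(base + cpu * UNIT_SIZE, fake_per_cpu_section,
                   sizeof(fake_per_cpu_section));

        /*
         * Adding per_cpu_offset[cpu] to the address of a static percpu
         * variable relocates it into that cpu's unit inside the chunk.
         */
        delta = (intptr_t)base - (intptr_t)fake_per_cpu_section;
        for (cpu = 0; cpu < NR_CPUS; cpu++) {
            per_cpu_offset[cpu] = delta + (intptr_t)cpu * UNIT_SIZE;
            printf("cpu%d: offset=%ld copy at %p\n", cpu,
                   (long)per_cpu_offset[cpu],
                   (void *)(fake_per_cpu_section + per_cpu_offset[cpu]));
        }
        free(base);
        return 0;
    }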
diff --git a/mm/quicklist.c b/mm/quicklist.c index 8dbb6805ef35..6633965bb27b 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c | |||
| @@ -19,7 +19,7 @@ | |||
| 19 | #include <linux/module.h> | 19 | #include <linux/module.h> |
| 20 | #include <linux/quicklist.h> | 20 | #include <linux/quicklist.h> |
| 21 | 21 | ||
| 22 | DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; | 22 | DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); |
| 23 | 23 | ||
| 24 | #define FRACTION_OF_NODE_MEM 16 | 24 | #define FRACTION_OF_NODE_MEM 16 |
| 25 | 25 | ||
| @@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages) | |||
| 29 | int node = numa_node_id(); | 29 | int node = numa_node_id(); |
| 30 | struct zone *zones = NODE_DATA(node)->node_zones; | 30 | struct zone *zones = NODE_DATA(node)->node_zones; |
| 31 | int num_cpus_on_node; | 31 | int num_cpus_on_node; |
| 32 | node_to_cpumask_ptr(cpumask_on_node, node); | ||
| 33 | 32 | ||
| 34 | node_free_pages = | 33 | node_free_pages = |
| 35 | #ifdef CONFIG_ZONE_DMA | 34 | #ifdef CONFIG_ZONE_DMA |
| @@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages) | |||
| 42 | 41 | ||
| 43 | max = node_free_pages / FRACTION_OF_NODE_MEM; | 42 | max = node_free_pages / FRACTION_OF_NODE_MEM; |
| 44 | 43 | ||
| 45 | num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); | 44 | num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); |
| 46 | max /= num_cpus_on_node; | 45 | max /= num_cpus_on_node; |
| 47 | 46 | ||
| 48 | return max(max, min_pages); | 47 | return max(max, min_pages); |
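The quota that max_pages() computes (free node memory divided by FRACTION_OF_NODE_MEM, split across the node's CPUs, floored at min_pages) can be modelled standalone; the inputs below are made up for illustration.

    #include <stdio.h>

    #define FRACTION_OF_NODE_MEM 16

    /* standalone model of max_pages(); not the kernel function */
    static unsigned long quicklist_max_pages(unsigned long node_free_pages,
                                             unsigned int cpus_on_node,
                                             unsigned long min_pages)
    {
        unsigned long max = node_free_pages / FRACTION_OF_NODE_MEM;

        max /= cpus_on_node;               /* quota is per cpu on the node */
        return max > min_pages ? max : min_pages;
    }

    int main(void)
    {
        /* e.g. 1,000,000 free pages, 8 cpus -> 1000000/16/8 = 7812 pages/cpu */
        printf("%lu\n", quicklist_max_pages(1000000, 8, 25));
        return 0;
    }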
diff --git a/mm/readahead.c b/mm/readahead.c index 9ce303d4b810..aa1aa2345235 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -31,6 +31,42 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); | |||
| 31 | 31 | ||
| 32 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) | 32 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) |
| 33 | 33 | ||
| 34 | /* | ||
| 35 | * see if a page needs releasing upon read_cache_pages() failure | ||
| 36 | * - the caller of read_cache_pages() may have set PG_private or PG_fscache | ||
| 37 | * before calling, such as the NFS fs marking pages that are cached locally | ||
| 38 | * on disk, thus we need to give the fs a chance to clean up in the event of | ||
| 39 | * an error | ||
| 40 | */ | ||
| 41 | static void read_cache_pages_invalidate_page(struct address_space *mapping, | ||
| 42 | struct page *page) | ||
| 43 | { | ||
| 44 | if (page_has_private(page)) { | ||
| 45 | if (!trylock_page(page)) | ||
| 46 | BUG(); | ||
| 47 | page->mapping = mapping; | ||
| 48 | do_invalidatepage(page, 0); | ||
| 49 | page->mapping = NULL; | ||
| 50 | unlock_page(page); | ||
| 51 | } | ||
| 52 | page_cache_release(page); | ||
| 53 | } | ||
| 54 | |||
| 55 | /* | ||
| 56 | * release a list of pages, invalidating them first if need be | ||
| 57 | */ | ||
| 58 | static void read_cache_pages_invalidate_pages(struct address_space *mapping, | ||
| 59 | struct list_head *pages) | ||
| 60 | { | ||
| 61 | struct page *victim; | ||
| 62 | |||
| 63 | while (!list_empty(pages)) { | ||
| 64 | victim = list_to_page(pages); | ||
| 65 | list_del(&victim->lru); | ||
| 66 | read_cache_pages_invalidate_page(mapping, victim); | ||
| 67 | } | ||
| 68 | } | ||
| 69 | |||
| 34 | /** | 70 | /** |
| 35 | * read_cache_pages - populate an address space with some pages & start reads against them | 71 | * read_cache_pages - populate an address space with some pages & start reads against them |
| 36 | * @mapping: the address_space | 72 | * @mapping: the address_space |
| @@ -52,14 +88,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
| 52 | list_del(&page->lru); | 88 | list_del(&page->lru); |
| 53 | if (add_to_page_cache_lru(page, mapping, | 89 | if (add_to_page_cache_lru(page, mapping, |
| 54 | page->index, GFP_KERNEL)) { | 90 | page->index, GFP_KERNEL)) { |
| 55 | page_cache_release(page); | 91 | read_cache_pages_invalidate_page(mapping, page); |
| 56 | continue; | 92 | continue; |
| 57 | } | 93 | } |
| 58 | page_cache_release(page); | 94 | page_cache_release(page); |
| 59 | 95 | ||
| 60 | ret = filler(data, page); | 96 | ret = filler(data, page); |
| 61 | if (unlikely(ret)) { | 97 | if (unlikely(ret)) { |
| 62 | put_pages_list(pages); | 98 | read_cache_pages_invalidate_pages(mapping, pages); |
| 63 | break; | 99 | break; |
| 64 | } | 100 | } |
| 65 | task_io_account_read(PAGE_CACHE_SIZE); | 101 | task_io_account_read(PAGE_CACHE_SIZE); |
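The error path added above exists because pages queued for read_cache_pages() may already carry caller-private state (PG_private/PG_fscache), so on filler failure each remaining page must be invalidated before its reference is dropped. A rough standalone model of that "invalidate, then release" cleanup; all types and names below are inventions of the sketch.

    #include <stdio.h>
    #include <stdlib.h>

    struct fake_page {
        struct fake_page *next;
        void *private;                       /* stands in for PG_private data */
    };

    static int dummy_fs_state;

    static void invalidate_and_release(struct fake_page *p)
    {
        if (p->private) {
            printf("invalidating private state %p\n", p->private);
            p->private = NULL;               /* let the "fs" clean up first */
        }
        free(p);                             /* then drop the reference */
    }

    static void release_remaining(struct fake_page *head)
    {
        while (head) {
            struct fake_page *victim = head;

            head = head->next;
            invalidate_and_release(victim);
        }
    }

    int main(void)
    {
        struct fake_page *a = calloc(1, sizeof(*a));
        struct fake_page *b = calloc(1, sizeof(*b));

        if (!a || !b)
            return 1;
        a->next = b;
        b->private = &dummy_fs_state;        /* pretend the fs marked it */
        release_remaining(a);                /* what the failure path does */
        return 0;
    }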
| @@ -97,15 +133,12 @@ out: | |||
| 97 | } | 133 | } |
| 98 | 134 | ||
| 99 | /* | 135 | /* |
| 100 | * do_page_cache_readahead actually reads a chunk of disk. It allocates all | 136 | * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all |
| 101 | * the pages first, then submits them all for I/O. This avoids the very bad | 137 | * the pages first, then submits them all for I/O. This avoids the very bad |
| 102 | * behaviour which would occur if page allocations are causing VM writeback. | 138 | * behaviour which would occur if page allocations are causing VM writeback. |
| 103 | * We really don't want to intermingle reads and writes like that. | 139 | * We really don't want to intermingle reads and writes like that. |
| 104 | * | 140 | * |
| 105 | * Returns the number of pages requested, or the maximum amount of I/O allowed. | 141 | * Returns the number of pages requested, or the maximum amount of I/O allowed. |
| 106 | * | ||
| 107 | * do_page_cache_readahead() returns -1 if it encountered request queue | ||
| 108 | * congestion. | ||
| 109 | */ | 142 | */ |
| 110 | static int | 143 | static int |
| 111 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | 144 | __do_page_cache_readahead(struct address_space *mapping, struct file *filp, |
| @@ -174,6 +207,7 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 174 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) | 207 | if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) |
| 175 | return -EINVAL; | 208 | return -EINVAL; |
| 176 | 209 | ||
| 210 | nr_to_read = max_sane_readahead(nr_to_read); | ||
| 177 | while (nr_to_read) { | 211 | while (nr_to_read) { |
| 178 | int err; | 212 | int err; |
| 179 | 213 | ||
| @@ -195,22 +229,6 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 195 | } | 229 | } |
| 196 | 230 | ||
| 197 | /* | 231 | /* |
| 198 | * This version skips the IO if the queue is read-congested, and will tell the | ||
| 199 | * block layer to abandon the readahead if request allocation would block. | ||
| 200 | * | ||
| 201 | * force_page_cache_readahead() will ignore queue congestion and will block on | ||
| 202 | * request queues. | ||
| 203 | */ | ||
| 204 | int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | ||
| 205 | pgoff_t offset, unsigned long nr_to_read) | ||
| 206 | { | ||
| 207 | if (bdi_read_congested(mapping->backing_dev_info)) | ||
| 208 | return -1; | ||
| 209 | |||
| 210 | return __do_page_cache_readahead(mapping, filp, offset, nr_to_read, 0); | ||
| 211 | } | ||
| 212 | |||
| 213 | /* | ||
| 214 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a | 232 | * Given a desired number of PAGE_CACHE_SIZE readahead pages, return a |
| 215 | * sensible upper limit. | 233 | * sensible upper limit. |
| 216 | */ | 234 | */ |
| @@ -223,7 +241,7 @@ unsigned long max_sane_readahead(unsigned long nr) | |||
| 223 | /* | 241 | /* |
| 224 | * Submit IO for the read-ahead request in file_ra_state. | 242 | * Submit IO for the read-ahead request in file_ra_state. |
| 225 | */ | 243 | */ |
| 226 | static unsigned long ra_submit(struct file_ra_state *ra, | 244 | unsigned long ra_submit(struct file_ra_state *ra, |
| 227 | struct address_space *mapping, struct file *filp) | 245 | struct address_space *mapping, struct file *filp) |
| 228 | { | 246 | { |
| 229 | int actual; | 247 | int actual; |
| @@ -312,6 +330,59 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
| 312 | */ | 330 | */ |
| 313 | 331 | ||
| 314 | /* | 332 | /* |
| 333 | * Count contiguously cached pages from @offset-1 to @offset-@max; | ||
| 334 | * this count is a conservative estimate of | ||
| 335 | * - the length of the sequential read sequence, or | ||
| 336 | * - the thrashing threshold in memory-tight systems | ||
| 337 | */ | ||
| 338 | static pgoff_t count_history_pages(struct address_space *mapping, | ||
| 339 | struct file_ra_state *ra, | ||
| 340 | pgoff_t offset, unsigned long max) | ||
| 341 | { | ||
| 342 | pgoff_t head; | ||
| 343 | |||
| 344 | rcu_read_lock(); | ||
| 345 | head = radix_tree_prev_hole(&mapping->page_tree, offset - 1, max); | ||
| 346 | rcu_read_unlock(); | ||
| 347 | |||
| 348 | return offset - 1 - head; | ||
| 349 | } | ||
| 350 | |||
| 351 | /* | ||
| 352 | * page cache context based read-ahead | ||
| 353 | */ | ||
| 354 | static int try_context_readahead(struct address_space *mapping, | ||
| 355 | struct file_ra_state *ra, | ||
| 356 | pgoff_t offset, | ||
| 357 | unsigned long req_size, | ||
| 358 | unsigned long max) | ||
| 359 | { | ||
| 360 | pgoff_t size; | ||
| 361 | |||
| 362 | size = count_history_pages(mapping, ra, offset, max); | ||
| 363 | |||
| 364 | /* | ||
| 365 | * no history pages: | ||
| 366 | * it could be a random read | ||
| 367 | */ | ||
| 368 | if (!size) | ||
| 369 | return 0; | ||
| 370 | |||
| 371 | /* | ||
| 372 | * starts from beginning of file: | ||
| 373 | * it is a strong indication of a long-run stream (or whole-file read) | ||
| 374 | */ | ||
| 375 | if (size >= offset) | ||
| 376 | size *= 2; | ||
| 377 | |||
| 378 | ra->start = offset; | ||
| 379 | ra->size = get_init_ra_size(size + req_size, max); | ||
| 380 | ra->async_size = ra->size; | ||
| 381 | |||
| 382 | return 1; | ||
| 383 | } | ||
| 384 | |||
| 385 | /* | ||
| 315 | * A minimal readahead algorithm for trivial sequential/random reads. | 386 | * A minimal readahead algorithm for trivial sequential/random reads. |
| 316 | */ | 387 | */ |
| 317 | static unsigned long | 388 | static unsigned long |
| @@ -320,34 +391,26 @@ ondemand_readahead(struct address_space *mapping, | |||
| 320 | bool hit_readahead_marker, pgoff_t offset, | 391 | bool hit_readahead_marker, pgoff_t offset, |
| 321 | unsigned long req_size) | 392 | unsigned long req_size) |
| 322 | { | 393 | { |
| 323 | int max = ra->ra_pages; /* max readahead pages */ | 394 | unsigned long max = max_sane_readahead(ra->ra_pages); |
| 324 | pgoff_t prev_offset; | 395 | |
| 325 | int sequential; | 396 | /* |
| 397 | * start of file | ||
| 398 | */ | ||
| 399 | if (!offset) | ||
| 400 | goto initial_readahead; | ||
| 326 | 401 | ||
| 327 | /* | 402 | /* |
| 328 | * It's the expected callback offset, assume sequential access. | 403 | * It's the expected callback offset, assume sequential access. |
| 329 | * Ramp up sizes, and push forward the readahead window. | 404 | * Ramp up sizes, and push forward the readahead window. |
| 330 | */ | 405 | */ |
| 331 | if (offset && (offset == (ra->start + ra->size - ra->async_size) || | 406 | if ((offset == (ra->start + ra->size - ra->async_size) || |
| 332 | offset == (ra->start + ra->size))) { | 407 | offset == (ra->start + ra->size))) { |
| 333 | ra->start += ra->size; | 408 | ra->start += ra->size; |
| 334 | ra->size = get_next_ra_size(ra, max); | 409 | ra->size = get_next_ra_size(ra, max); |
| 335 | ra->async_size = ra->size; | 410 | ra->async_size = ra->size; |
| 336 | goto readit; | 411 | goto readit; |
| 337 | } | 412 | } |
| 338 | 413 | ||
| 339 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
| 340 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
| 341 | |||
| 342 | /* | ||
| 343 | * Standalone, small read. | ||
| 344 | * Read as is, and do not pollute the readahead state. | ||
| 345 | */ | ||
| 346 | if (!hit_readahead_marker && !sequential) { | ||
| 347 | return __do_page_cache_readahead(mapping, filp, | ||
| 348 | offset, req_size, 0); | ||
| 349 | } | ||
| 350 | |||
| 351 | /* | 414 | /* |
| 352 | * Hit a marked page without valid readahead state. | 415 | * Hit a marked page without valid readahead state. |
| 353 | * E.g. interleaved reads. | 416 | * E.g. interleaved reads. |
| @@ -358,7 +421,7 @@ ondemand_readahead(struct address_space *mapping, | |||
| 358 | pgoff_t start; | 421 | pgoff_t start; |
| 359 | 422 | ||
| 360 | rcu_read_lock(); | 423 | rcu_read_lock(); |
| 361 | start = radix_tree_next_hole(&mapping->page_tree, offset,max+1); | 424 | start = radix_tree_next_hole(&mapping->page_tree, offset+1,max); |
| 362 | rcu_read_unlock(); | 425 | rcu_read_unlock(); |
| 363 | 426 | ||
| 364 | if (!start || start - offset > max) | 427 | if (!start || start - offset > max) |
| @@ -366,23 +429,53 @@ ondemand_readahead(struct address_space *mapping, | |||
| 366 | 429 | ||
| 367 | ra->start = start; | 430 | ra->start = start; |
| 368 | ra->size = start - offset; /* old async_size */ | 431 | ra->size = start - offset; /* old async_size */ |
| 432 | ra->size += req_size; | ||
| 369 | ra->size = get_next_ra_size(ra, max); | 433 | ra->size = get_next_ra_size(ra, max); |
| 370 | ra->async_size = ra->size; | 434 | ra->async_size = ra->size; |
| 371 | goto readit; | 435 | goto readit; |
| 372 | } | 436 | } |
| 373 | 437 | ||
| 374 | /* | 438 | /* |
| 375 | * It may be one of | 439 | * oversize read |
| 376 | * - first read on start of file | ||
| 377 | * - sequential cache miss | ||
| 378 | * - oversize random read | ||
| 379 | * Start readahead for it. | ||
| 380 | */ | 440 | */ |
| 441 | if (req_size > max) | ||
| 442 | goto initial_readahead; | ||
| 443 | |||
| 444 | /* | ||
| 445 | * sequential cache miss | ||
| 446 | */ | ||
| 447 | if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) | ||
| 448 | goto initial_readahead; | ||
| 449 | |||
| 450 | /* | ||
| 451 | * Query the page cache and look for the traces (cached history pages) | ||
| 452 | * that a sequential stream would leave behind. | ||
| 453 | */ | ||
| 454 | if (try_context_readahead(mapping, ra, offset, req_size, max)) | ||
| 455 | goto readit; | ||
| 456 | |||
| 457 | /* | ||
| 458 | * standalone, small random read | ||
| 459 | * Read as is, and do not pollute the readahead state. | ||
| 460 | */ | ||
| 461 | return __do_page_cache_readahead(mapping, filp, offset, req_size, 0); | ||
| 462 | |||
| 463 | initial_readahead: | ||
| 381 | ra->start = offset; | 464 | ra->start = offset; |
| 382 | ra->size = get_init_ra_size(req_size, max); | 465 | ra->size = get_init_ra_size(req_size, max); |
| 383 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 466 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
| 384 | 467 | ||
| 385 | readit: | 468 | readit: |
| 469 | /* | ||
| 470 | * Will this read hit the readahead marker made by itself? | ||
| 471 | * If so, trigger the readahead marker hit now, and merge | ||
| 472 | * the resulting next readahead window into the current one. | ||
| 473 | */ | ||
| 474 | if (offset == ra->start && ra->size == ra->async_size) { | ||
| 475 | ra->async_size = get_next_ra_size(ra, max); | ||
| 476 | ra->size += ra->async_size; | ||
| 477 | } | ||
| 478 | |||
| 386 | return ra_submit(ra, mapping, filp); | 479 | return ra_submit(ra, mapping, filp); |
| 387 | } | 480 | } |
| 388 | 481 | ||
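The context-readahead heuristic introduced above can be summarised with the standalone sketch below: the radix-tree history lookup is replaced by a caller-supplied history_pages count and get_init_ra_size() by a plain cap, so this models the decision logic only, not the kernel implementation.

    #include <stdio.h>

    struct ra_state { unsigned long start, size, async_size; };

    /* simplified stand-in for get_init_ra_size(): just cap at max */
    static unsigned long init_ra_size_model(unsigned long size, unsigned long max)
    {
        return size < max ? size : max;
    }

    static int try_context_readahead_model(unsigned long history_pages,
                                           unsigned long offset,
                                           unsigned long req_size,
                                           unsigned long max,
                                           struct ra_state *ra)
    {
        unsigned long size = history_pages;

        if (!size)                      /* no history: likely a random read */
            return 0;
        if (size >= offset)             /* cached all the way back to page 0 */
            size *= 2;                  /* strong hint of a long stream */

        ra->start = offset;
        ra->size = init_ra_size_model(size + req_size, max);
        ra->async_size = ra->size;
        return 1;
    }

    int main(void)
    {
        struct ra_state ra = {0};

        /* 64 cached pages behind offset 100, 16-page request, 128-page cap */
        if (try_context_readahead_model(64, 100, 16, 128, &ra))
            printf("readahead %lu pages from %lu\n", ra.size, ra.start);
        return 0;
    }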
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -14,7 +14,7 @@ |||
| 14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 | 14 | * Original design by Rik van Riel <riel@conectiva.com.br> 2001 |
| 15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 | 15 | * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004 |
| 16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 | 16 | * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004 |
| 17 | * Contributions by Hugh Dickins <hugh@veritas.com> 2003, 2004 | 17 | * Contributions by Hugh Dickins 2003, 2004 |
| 18 | */ | 18 | */ |
| 19 | 19 | ||
| 20 | /* | 20 | /* |
| @@ -36,6 +36,11 @@ | |||
| 36 | * mapping->tree_lock (widely used, in set_page_dirty, | 36 | * mapping->tree_lock (widely used, in set_page_dirty, |
| 37 | * in arch-dependent flush_dcache_mmap_lock, | 37 | * in arch-dependent flush_dcache_mmap_lock, |
| 38 | * within inode_lock in __sync_single_inode) | 38 | * within inode_lock in __sync_single_inode) |
| 39 | * | ||
| 40 | * (code doesn't rely on that order so it could be switched around) | ||
| 41 | * ->tasklist_lock | ||
| 42 | * anon_vma->lock (memory_failure, collect_procs_anon) | ||
| 43 | * pte map lock | ||
| 39 | */ | 44 | */ |
| 40 | 45 | ||
| 41 | #include <linux/mm.h> | 46 | #include <linux/mm.h> |
| @@ -191,7 +196,7 @@ void __init anon_vma_init(void) | |||
| 191 | * Getting a lock on a stable anon_vma from a page off the LRU is | 196 | * Getting a lock on a stable anon_vma from a page off the LRU is |
| 192 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. | 197 | * tricky: page_lock_anon_vma rely on RCU to guard against the races. |
| 193 | */ | 198 | */ |
| 194 | static struct anon_vma *page_lock_anon_vma(struct page *page) | 199 | struct anon_vma *page_lock_anon_vma(struct page *page) |
| 195 | { | 200 | { |
| 196 | struct anon_vma *anon_vma; | 201 | struct anon_vma *anon_vma; |
| 197 | unsigned long anon_mapping; | 202 | unsigned long anon_mapping; |
| @@ -211,7 +216,7 @@ out: | |||
| 211 | return NULL; | 216 | return NULL; |
| 212 | } | 217 | } |
| 213 | 218 | ||
| 214 | static void page_unlock_anon_vma(struct anon_vma *anon_vma) | 219 | void page_unlock_anon_vma(struct anon_vma *anon_vma) |
| 215 | { | 220 | { |
| 216 | spin_unlock(&anon_vma->lock); | 221 | spin_unlock(&anon_vma->lock); |
| 217 | rcu_read_unlock(); | 222 | rcu_read_unlock(); |
| @@ -237,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
| 237 | } | 242 | } |
| 238 | 243 | ||
| 239 | /* | 244 | /* |
| 240 | * At what user virtual address is page expected in vma? checking that the | 245 | * At what user virtual address is page expected in vma? |
| 241 | * page matches the vma: currently only used on anon pages, by unuse_vma; | 246 | * checking that the page matches the vma. |
| 242 | */ | 247 | */ |
| 243 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 248 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
| 244 | { | 249 | { |
| @@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
| 311 | * if the page is not mapped into the page tables of this VMA. Only | 316 | * if the page is not mapped into the page tables of this VMA. Only |
| 312 | * valid for normal file or anonymous VMAs. | 317 | * valid for normal file or anonymous VMAs. |
| 313 | */ | 318 | */ |
| 314 | static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | 319 | int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) |
| 315 | { | 320 | { |
| 316 | unsigned long address; | 321 | unsigned long address; |
| 317 | pte_t *pte; | 322 | pte_t *pte; |
| @@ -333,7 +338,9 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
| 333 | * repeatedly from either page_referenced_anon or page_referenced_file. | 338 | * repeatedly from either page_referenced_anon or page_referenced_file. |
| 334 | */ | 339 | */ |
| 335 | static int page_referenced_one(struct page *page, | 340 | static int page_referenced_one(struct page *page, |
| 336 | struct vm_area_struct *vma, unsigned int *mapcount) | 341 | struct vm_area_struct *vma, |
| 342 | unsigned int *mapcount, | ||
| 343 | unsigned long *vm_flags) | ||
| 337 | { | 344 | { |
| 338 | struct mm_struct *mm = vma->vm_mm; | 345 | struct mm_struct *mm = vma->vm_mm; |
| 339 | unsigned long address; | 346 | unsigned long address; |
| @@ -356,6 +363,7 @@ static int page_referenced_one(struct page *page, | |||
| 356 | */ | 363 | */ |
| 357 | if (vma->vm_flags & VM_LOCKED) { | 364 | if (vma->vm_flags & VM_LOCKED) { |
| 358 | *mapcount = 1; /* break early from loop */ | 365 | *mapcount = 1; /* break early from loop */ |
| 366 | *vm_flags |= VM_LOCKED; | ||
| 359 | goto out_unmap; | 367 | goto out_unmap; |
| 360 | } | 368 | } |
| 361 | 369 | ||
| @@ -381,11 +389,14 @@ out_unmap: | |||
| 381 | (*mapcount)--; | 389 | (*mapcount)--; |
| 382 | pte_unmap_unlock(pte, ptl); | 390 | pte_unmap_unlock(pte, ptl); |
| 383 | out: | 391 | out: |
| 392 | if (referenced) | ||
| 393 | *vm_flags |= vma->vm_flags; | ||
| 384 | return referenced; | 394 | return referenced; |
| 385 | } | 395 | } |
| 386 | 396 | ||
| 387 | static int page_referenced_anon(struct page *page, | 397 | static int page_referenced_anon(struct page *page, |
| 388 | struct mem_cgroup *mem_cont) | 398 | struct mem_cgroup *mem_cont, |
| 399 | unsigned long *vm_flags) | ||
| 389 | { | 400 | { |
| 390 | unsigned int mapcount; | 401 | unsigned int mapcount; |
| 391 | struct anon_vma *anon_vma; | 402 | struct anon_vma *anon_vma; |
| @@ -405,7 +416,8 @@ static int page_referenced_anon(struct page *page, | |||
| 405 | */ | 416 | */ |
| 406 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 417 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
| 407 | continue; | 418 | continue; |
| 408 | referenced += page_referenced_one(page, vma, &mapcount); | 419 | referenced += page_referenced_one(page, vma, |
| 420 | &mapcount, vm_flags); | ||
| 409 | if (!mapcount) | 421 | if (!mapcount) |
| 410 | break; | 422 | break; |
| 411 | } | 423 | } |
| @@ -418,6 +430,7 @@ static int page_referenced_anon(struct page *page, | |||
| 418 | * page_referenced_file - referenced check for object-based rmap | 430 | * page_referenced_file - referenced check for object-based rmap |
| 419 | * @page: the page we're checking references on. | 431 | * @page: the page we're checking references on. |
| 420 | * @mem_cont: target memory controller | 432 | * @mem_cont: target memory controller |
| 433 | * @vm_flags: collect the vma->vm_flags of VMAs that actually referenced the page | ||
| 421 | * | 434 | * |
| 422 | * For an object-based mapped page, find all the places it is mapped and | 435 | * For an object-based mapped page, find all the places it is mapped and |
| 423 | * check/clear the referenced flag. This is done by following the page->mapping | 436 | * check/clear the referenced flag. This is done by following the page->mapping |
| @@ -427,7 +440,8 @@ static int page_referenced_anon(struct page *page, | |||
| 427 | * This function is only called from page_referenced for object-based pages. | 440 | * This function is only called from page_referenced for object-based pages. |
| 428 | */ | 441 | */ |
| 429 | static int page_referenced_file(struct page *page, | 442 | static int page_referenced_file(struct page *page, |
| 430 | struct mem_cgroup *mem_cont) | 443 | struct mem_cgroup *mem_cont, |
| 444 | unsigned long *vm_flags) | ||
| 431 | { | 445 | { |
| 432 | unsigned int mapcount; | 446 | unsigned int mapcount; |
| 433 | struct address_space *mapping = page->mapping; | 447 | struct address_space *mapping = page->mapping; |
| @@ -467,7 +481,8 @@ static int page_referenced_file(struct page *page, | |||
| 467 | */ | 481 | */ |
| 468 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) | 482 | if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) |
| 469 | continue; | 483 | continue; |
| 470 | referenced += page_referenced_one(page, vma, &mapcount); | 484 | referenced += page_referenced_one(page, vma, |
| 485 | &mapcount, vm_flags); | ||
| 471 | if (!mapcount) | 486 | if (!mapcount) |
| 472 | break; | 487 | break; |
| 473 | } | 488 | } |
| @@ -481,29 +496,35 @@ static int page_referenced_file(struct page *page, | |||
| 481 | * @page: the page to test | 496 | * @page: the page to test |
| 482 | * @is_locked: caller holds lock on the page | 497 | * @is_locked: caller holds lock on the page |
| 483 | * @mem_cont: target memory controller | 498 | * @mem_cont: target memory controller |
| 499 | * @vm_flags: collect the vma->vm_flags of VMAs that actually referenced the page | ||
| 484 | * | 500 | * |
| 485 | * Quick test_and_clear_referenced for all mappings to a page, | 501 | * Quick test_and_clear_referenced for all mappings to a page, |
| 486 | * returns the number of ptes which referenced the page. | 502 | * returns the number of ptes which referenced the page. |
| 487 | */ | 503 | */ |
| 488 | int page_referenced(struct page *page, int is_locked, | 504 | int page_referenced(struct page *page, |
| 489 | struct mem_cgroup *mem_cont) | 505 | int is_locked, |
| 506 | struct mem_cgroup *mem_cont, | ||
| 507 | unsigned long *vm_flags) | ||
| 490 | { | 508 | { |
| 491 | int referenced = 0; | 509 | int referenced = 0; |
| 492 | 510 | ||
| 493 | if (TestClearPageReferenced(page)) | 511 | if (TestClearPageReferenced(page)) |
| 494 | referenced++; | 512 | referenced++; |
| 495 | 513 | ||
| 514 | *vm_flags = 0; | ||
| 496 | if (page_mapped(page) && page->mapping) { | 515 | if (page_mapped(page) && page->mapping) { |
| 497 | if (PageAnon(page)) | 516 | if (PageAnon(page)) |
| 498 | referenced += page_referenced_anon(page, mem_cont); | 517 | referenced += page_referenced_anon(page, mem_cont, |
| 518 | vm_flags); | ||
| 499 | else if (is_locked) | 519 | else if (is_locked) |
| 500 | referenced += page_referenced_file(page, mem_cont); | 520 | referenced += page_referenced_file(page, mem_cont, |
| 521 | vm_flags); | ||
| 501 | else if (!trylock_page(page)) | 522 | else if (!trylock_page(page)) |
| 502 | referenced++; | 523 | referenced++; |
| 503 | else { | 524 | else { |
| 504 | if (page->mapping) | 525 | if (page->mapping) |
| 505 | referenced += | 526 | referenced += page_referenced_file(page, |
| 506 | page_referenced_file(page, mem_cont); | 527 | mem_cont, vm_flags); |
| 507 | unlock_page(page); | 528 | unlock_page(page); |
| 508 | } | 529 | } |
| 509 | } | 530 | } |
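A standalone model of the new page_referenced() contract above: alongside the reference count, the walk ORs together the vm_flags of the VMAs that referenced the page (with VM_LOCKED reported and the walk cut short), so callers such as vmscan can act on them. The flag values and types below are local stand-ins, not the kernel's definitions.

    #include <stdio.h>

    #define VM_EXEC   0x0004UL            /* illustrative values only */
    #define VM_LOCKED 0x2000UL

    struct fake_vma { unsigned long vm_flags; int referenced; };

    static int page_referenced_model(struct fake_vma *vmas, int nr,
                                     unsigned long *vm_flags)
    {
        int referenced = 0;
        int i;

        *vm_flags = 0;
        for (i = 0; i < nr; i++) {
            if (vmas[i].vm_flags & VM_LOCKED) {
                *vm_flags |= VM_LOCKED;   /* mlocked vma: report and stop */
                break;
            }
            if (vmas[i].referenced) {
                referenced++;
                *vm_flags |= vmas[i].vm_flags;
            }
        }
        return referenced;
    }

    int main(void)
    {
        struct fake_vma vmas[] = {
            { VM_EXEC,   1 },             /* referenced, executable mapping */
            { VM_LOCKED, 0 },             /* mlocked mapping, walk stops here */
        };
        unsigned long flags;
        int ref = page_referenced_model(vmas, 2, &flags);

        printf("referenced=%d vm_flags=%#lx\n", ref, flags);
        return 0;
    }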
| @@ -688,31 +709,12 @@ void page_add_new_anon_rmap(struct page *page, | |||
| 688 | */ | 709 | */ |
| 689 | void page_add_file_rmap(struct page *page) | 710 | void page_add_file_rmap(struct page *page) |
| 690 | { | 711 | { |
| 691 | if (atomic_inc_and_test(&page->_mapcount)) | 712 | if (atomic_inc_and_test(&page->_mapcount)) { |
| 692 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 713 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
| 714 | mem_cgroup_update_mapped_file_stat(page, 1); | ||
| 715 | } | ||
| 693 | } | 716 | } |
| 694 | 717 | ||
| 695 | #ifdef CONFIG_DEBUG_VM | ||
| 696 | /** | ||
| 697 | * page_dup_rmap - duplicate pte mapping to a page | ||
| 698 | * @page: the page to add the mapping to | ||
| 699 | * @vma: the vm area being duplicated | ||
| 700 | * @address: the user virtual address mapped | ||
| 701 | * | ||
| 702 | * For copy_page_range only: minimal extract from page_add_file_rmap / | ||
| 703 | * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's | ||
| 704 | * quicker. | ||
| 705 | * | ||
| 706 | * The caller needs to hold the pte lock. | ||
| 707 | */ | ||
| 708 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) | ||
| 709 | { | ||
| 710 | if (PageAnon(page)) | ||
| 711 | __page_check_anon_rmap(page, vma, address); | ||
| 712 | atomic_inc(&page->_mapcount); | ||
| 713 | } | ||
| 714 | #endif | ||
| 715 | |||
| 716 | /** | 718 | /** |
| 717 | * page_remove_rmap - take down pte mapping from a page | 719 | * page_remove_rmap - take down pte mapping from a page |
| 718 | * @page: page to remove mapping from | 720 | * @page: page to remove mapping from |
| @@ -721,33 +723,37 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long | |||
| 721 | */ | 723 | */ |
| 722 | void page_remove_rmap(struct page *page) | 724 | void page_remove_rmap(struct page *page) |
| 723 | { | 725 | { |
| 724 | if (atomic_add_negative(-1, &page->_mapcount)) { | 726 | /* page still mapped by someone else? */ |
| 725 | /* | 727 | if (!atomic_add_negative(-1, &page->_mapcount)) |
| 726 | * Now that the last pte has gone, s390 must transfer dirty | 728 | return; |
| 727 | * flag from storage key to struct page. We can usually skip | 729 | |
| 728 | * this if the page is anon, so about to be freed; but perhaps | 730 | /* |
| 729 | * not if it's in swapcache - there might be another pte slot | 731 | * Now that the last pte has gone, s390 must transfer dirty |
| 730 | * containing the swap entry, but page not yet written to swap. | 732 | * flag from storage key to struct page. We can usually skip |
| 731 | */ | 733 | * this if the page is anon, so about to be freed; but perhaps |
| 732 | if ((!PageAnon(page) || PageSwapCache(page)) && | 734 | * not if it's in swapcache - there might be another pte slot |
| 733 | page_test_dirty(page)) { | 735 | * containing the swap entry, but page not yet written to swap. |
| 734 | page_clear_dirty(page); | 736 | */ |
| 735 | set_page_dirty(page); | 737 | if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { |
| 736 | } | 738 | page_clear_dirty(page); |
| 737 | if (PageAnon(page)) | 739 | set_page_dirty(page); |
| 738 | mem_cgroup_uncharge_page(page); | 740 | } |
| 739 | __dec_zone_page_state(page, | 741 | if (PageAnon(page)) { |
| 740 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | 742 | mem_cgroup_uncharge_page(page); |
| 741 | /* | 743 | __dec_zone_page_state(page, NR_ANON_PAGES); |
| 742 | * It would be tidy to reset the PageAnon mapping here, | 744 | } else { |
| 743 | * but that might overwrite a racing page_add_anon_rmap | 745 | __dec_zone_page_state(page, NR_FILE_MAPPED); |
| 744 | * which increments mapcount after us but sets mapping | ||
| 745 | * before us: so leave the reset to free_hot_cold_page, | ||
| 746 | * and remember that it's only reliable while mapped. | ||
| 747 | * Leaving it set also helps swapoff to reinstate ptes | ||
| 748 | * faster for those pages still in swapcache. | ||
| 749 | */ | ||
| 750 | } | 746 | } |
| 747 | mem_cgroup_update_mapped_file_stat(page, -1); | ||
| 748 | /* | ||
| 749 | * It would be tidy to reset the PageAnon mapping here, | ||
| 750 | * but that might overwrite a racing page_add_anon_rmap | ||
| 751 | * which increments mapcount after us but sets mapping | ||
| 752 | * before us: so leave the reset to free_hot_cold_page, | ||
| 753 | * and remember that it's only reliable while mapped. | ||
| 754 | * Leaving it set also helps swapoff to reinstate ptes | ||
| 755 | * faster for those pages still in swapcache. | ||
| 756 | */ | ||
| 751 | } | 757 | } |
| 752 | 758 | ||
| 753 | /* | 759 | /* |
| @@ -755,7 +761,7 @@ void page_remove_rmap(struct page *page) | |||
| 755 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 761 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
| 756 | */ | 762 | */ |
| 757 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 763 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
| 758 | int migration) | 764 | enum ttu_flags flags) |
| 759 | { | 765 | { |
| 760 | struct mm_struct *mm = vma->vm_mm; | 766 | struct mm_struct *mm = vma->vm_mm; |
| 761 | unsigned long address; | 767 | unsigned long address; |
| @@ -777,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 777 | * If it's recently referenced (perhaps page_referenced | 783 | * If it's recently referenced (perhaps page_referenced |
| 778 | * skipped over this mm) then we should reactivate it. | 784 | * skipped over this mm) then we should reactivate it. |
| 779 | */ | 785 | */ |
| 780 | if (!migration) { | 786 | if (!(flags & TTU_IGNORE_MLOCK)) { |
| 781 | if (vma->vm_flags & VM_LOCKED) { | 787 | if (vma->vm_flags & VM_LOCKED) { |
| 782 | ret = SWAP_MLOCK; | 788 | ret = SWAP_MLOCK; |
| 783 | goto out_unmap; | 789 | goto out_unmap; |
| 784 | } | 790 | } |
| 791 | } | ||
| 792 | if (!(flags & TTU_IGNORE_ACCESS)) { | ||
| 785 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 793 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
| 786 | ret = SWAP_FAIL; | 794 | ret = SWAP_FAIL; |
| 787 | goto out_unmap; | 795 | goto out_unmap; |
| @@ -799,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 799 | /* Update high watermark before we lower rss */ | 807 | /* Update high watermark before we lower rss */ |
| 800 | update_hiwater_rss(mm); | 808 | update_hiwater_rss(mm); |
| 801 | 809 | ||
| 802 | if (PageAnon(page)) { | 810 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
| 811 | if (PageAnon(page)) | ||
| 812 | dec_mm_counter(mm, anon_rss); | ||
| 813 | else | ||
| 814 | dec_mm_counter(mm, file_rss); | ||
| 815 | set_pte_at(mm, address, pte, | ||
| 816 | swp_entry_to_pte(make_hwpoison_entry(page))); | ||
| 817 | } else if (PageAnon(page)) { | ||
| 803 | swp_entry_t entry = { .val = page_private(page) }; | 818 | swp_entry_t entry = { .val = page_private(page) }; |
| 804 | 819 | ||
| 805 | if (PageSwapCache(page)) { | 820 | if (PageSwapCache(page)) { |
| @@ -821,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 821 | * pte. do_swap_page() will wait until the migration | 836 | * pte. do_swap_page() will wait until the migration |
| 822 | * pte is removed and then restart fault handling. | 837 | * pte is removed and then restart fault handling. |
| 823 | */ | 838 | */ |
| 824 | BUG_ON(!migration); | 839 | BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); |
| 825 | entry = make_migration_entry(page, pte_write(pteval)); | 840 | entry = make_migration_entry(page, pte_write(pteval)); |
| 826 | } | 841 | } |
| 827 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 842 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
| 828 | BUG_ON(pte_file(*pte)); | 843 | BUG_ON(pte_file(*pte)); |
| 829 | } else if (PAGE_MIGRATION && migration) { | 844 | } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { |
| 830 | /* Establish migration entry for a file page */ | 845 | /* Establish migration entry for a file page */ |
| 831 | swp_entry_t entry; | 846 | swp_entry_t entry; |
| 832 | entry = make_migration_entry(page, pte_write(pteval)); | 847 | entry = make_migration_entry(page, pte_write(pteval)); |
| @@ -995,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) | |||
| 995 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | 1010 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
| 996 | * 'LOCKED. | 1011 | * 'LOCKED. |
| 997 | */ | 1012 | */ |
| 998 | static int try_to_unmap_anon(struct page *page, int unlock, int migration) | 1013 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
| 999 | { | 1014 | { |
| 1000 | struct anon_vma *anon_vma; | 1015 | struct anon_vma *anon_vma; |
| 1001 | struct vm_area_struct *vma; | 1016 | struct vm_area_struct *vma; |
| 1002 | unsigned int mlocked = 0; | 1017 | unsigned int mlocked = 0; |
| 1003 | int ret = SWAP_AGAIN; | 1018 | int ret = SWAP_AGAIN; |
| 1019 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
| 1004 | 1020 | ||
| 1005 | if (MLOCK_PAGES && unlikely(unlock)) | 1021 | if (MLOCK_PAGES && unlikely(unlock)) |
| 1006 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | 1022 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ |
| @@ -1016,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) | |||
| 1016 | continue; /* must visit all unlocked vmas */ | 1032 | continue; /* must visit all unlocked vmas */ |
| 1017 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ | 1033 | ret = SWAP_MLOCK; /* saw at least one mlocked vma */ |
| 1018 | } else { | 1034 | } else { |
| 1019 | ret = try_to_unmap_one(page, vma, migration); | 1035 | ret = try_to_unmap_one(page, vma, flags); |
| 1020 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1036 | if (ret == SWAP_FAIL || !page_mapped(page)) |
| 1021 | break; | 1037 | break; |
| 1022 | } | 1038 | } |
| @@ -1040,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) | |||
| 1040 | /** | 1056 | /** |
| 1041 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method | 1057 | * try_to_unmap_file - unmap/unlock file page using the object-based rmap method |
| 1042 | * @page: the page to unmap/unlock | 1058 | * @page: the page to unmap/unlock |
| 1043 | * @unlock: request for unlock rather than unmap [unlikely] | 1059 | * @flags: action and flags |
| 1044 | * @migration: unmapping for migration - ignored if @unlock | ||
| 1045 | * | 1060 | * |
| 1046 | * Find all the mappings of a page using the mapping pointer and the vma chains | 1061 | * Find all the mappings of a page using the mapping pointer and the vma chains |
| 1047 | * contained in the address_space struct it points to. | 1062 | * contained in the address_space struct it points to. |
| @@ -1053,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) | |||
| 1053 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be | 1068 | * vm_flags for that VMA. That should be OK, because that vma shouldn't be |
| 1054 | * 'LOCKED. | 1069 | * 'LOCKED. |
| 1055 | */ | 1070 | */ |
| 1056 | static int try_to_unmap_file(struct page *page, int unlock, int migration) | 1071 | static int try_to_unmap_file(struct page *page, enum ttu_flags flags) |
| 1057 | { | 1072 | { |
| 1058 | struct address_space *mapping = page->mapping; | 1073 | struct address_space *mapping = page->mapping; |
| 1059 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 1074 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
| @@ -1065,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
| 1065 | unsigned long max_nl_size = 0; | 1080 | unsigned long max_nl_size = 0; |
| 1066 | unsigned int mapcount; | 1081 | unsigned int mapcount; |
| 1067 | unsigned int mlocked = 0; | 1082 | unsigned int mlocked = 0; |
| 1083 | int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; | ||
| 1068 | 1084 | ||
| 1069 | if (MLOCK_PAGES && unlikely(unlock)) | 1085 | if (MLOCK_PAGES && unlikely(unlock)) |
| 1070 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ | 1086 | ret = SWAP_SUCCESS; /* default for try_to_munlock() */ |
| @@ -1077,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
| 1077 | continue; /* must visit all vmas */ | 1093 | continue; /* must visit all vmas */ |
| 1078 | ret = SWAP_MLOCK; | 1094 | ret = SWAP_MLOCK; |
| 1079 | } else { | 1095 | } else { |
| 1080 | ret = try_to_unmap_one(page, vma, migration); | 1096 | ret = try_to_unmap_one(page, vma, flags); |
| 1081 | if (ret == SWAP_FAIL || !page_mapped(page)) | 1097 | if (ret == SWAP_FAIL || !page_mapped(page)) |
| 1082 | goto out; | 1098 | goto out; |
| 1083 | } | 1099 | } |
| @@ -1102,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
| 1102 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ | 1118 | ret = SWAP_MLOCK; /* leave mlocked == 0 */ |
| 1103 | goto out; /* no need to look further */ | 1119 | goto out; /* no need to look further */ |
| 1104 | } | 1120 | } |
| 1105 | if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) | 1121 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && |
| 1122 | (vma->vm_flags & VM_LOCKED)) | ||
| 1106 | continue; | 1123 | continue; |
| 1107 | cursor = (unsigned long) vma->vm_private_data; | 1124 | cursor = (unsigned long) vma->vm_private_data; |
| 1108 | if (cursor > max_nl_cursor) | 1125 | if (cursor > max_nl_cursor) |
| @@ -1136,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) | |||
| 1136 | do { | 1153 | do { |
| 1137 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 1154 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
| 1138 | shared.vm_set.list) { | 1155 | shared.vm_set.list) { |
| 1139 | if (!MLOCK_PAGES && !migration && | 1156 | if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && |
| 1140 | (vma->vm_flags & VM_LOCKED)) | 1157 | (vma->vm_flags & VM_LOCKED)) |
| 1141 | continue; | 1158 | continue; |
| 1142 | cursor = (unsigned long) vma->vm_private_data; | 1159 | cursor = (unsigned long) vma->vm_private_data; |
| @@ -1176,7 +1193,7 @@ out: | |||
| 1176 | /** | 1193 | /** |
| 1177 | * try_to_unmap - try to remove all page table mappings to a page | 1194 | * try_to_unmap - try to remove all page table mappings to a page |
| 1178 | * @page: the page to get unmapped | 1195 | * @page: the page to get unmapped |
| 1179 | * @migration: migration flag | 1196 | * @flags: action and flags |
| 1180 | * | 1197 | * |
| 1181 | * Tries to remove all the page table entries which are mapping this | 1198 | * Tries to remove all the page table entries which are mapping this |
| 1182 | * page, used in the pageout path. Caller must hold the page lock. | 1199 | * page, used in the pageout path. Caller must hold the page lock. |
| @@ -1187,22 +1204,21 @@ out: | |||
| 1187 | * SWAP_FAIL - the page is unswappable | 1204 | * SWAP_FAIL - the page is unswappable |
| 1188 | * SWAP_MLOCK - page is mlocked. | 1205 | * SWAP_MLOCK - page is mlocked. |
| 1189 | */ | 1206 | */ |
| 1190 | int try_to_unmap(struct page *page, int migration) | 1207 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
| 1191 | { | 1208 | { |
| 1192 | int ret; | 1209 | int ret; |
| 1193 | 1210 | ||
| 1194 | BUG_ON(!PageLocked(page)); | 1211 | BUG_ON(!PageLocked(page)); |
| 1195 | 1212 | ||
| 1196 | if (PageAnon(page)) | 1213 | if (PageAnon(page)) |
| 1197 | ret = try_to_unmap_anon(page, 0, migration); | 1214 | ret = try_to_unmap_anon(page, flags); |
| 1198 | else | 1215 | else |
| 1199 | ret = try_to_unmap_file(page, 0, migration); | 1216 | ret = try_to_unmap_file(page, flags); |
| 1200 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1217 | if (ret != SWAP_MLOCK && !page_mapped(page)) |
| 1201 | ret = SWAP_SUCCESS; | 1218 | ret = SWAP_SUCCESS; |
| 1202 | return ret; | 1219 | return ret; |
| 1203 | } | 1220 | } |
| 1204 | 1221 | ||
| 1205 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 1206 | /** | 1222 | /** |
| 1207 | * try_to_munlock - try to munlock a page | 1223 | * try_to_munlock - try to munlock a page |
| 1208 | * @page: the page to be munlocked | 1224 | * @page: the page to be munlocked |
| @@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page) | |||
| 1222 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); | 1238 | VM_BUG_ON(!PageLocked(page) || PageLRU(page)); |
| 1223 | 1239 | ||
| 1224 | if (PageAnon(page)) | 1240 | if (PageAnon(page)) |
| 1225 | return try_to_unmap_anon(page, 1, 0); | 1241 | return try_to_unmap_anon(page, TTU_MUNLOCK); |
| 1226 | else | 1242 | else |
| 1227 | return try_to_unmap_file(page, 1, 0); | 1243 | return try_to_unmap_file(page, TTU_MUNLOCK); |
| 1228 | } | 1244 | } |
| 1229 | #endif | 1245 | |
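The switch above from separate (unlock, migration) int arguments to a single enum ttu_flags packs an action code plus independent modifier bits into one word, decoded with TTU_ACTION(). A standalone model of that pattern follows; the real enum lives in the rmap header, which is not part of this hunk, and the names and values below are illustrative only.

    #include <stdio.h>

    enum ttu_flags_model {
        TTU_UNMAP            = 0,
        TTU_MIGRATION        = 1,
        TTU_MUNLOCK          = 2,
        TTU_ACTION_MASK      = 0xff,

        TTU_IGNORE_MLOCK     = (1 << 8),
        TTU_IGNORE_ACCESS    = (1 << 9),
        TTU_IGNORE_HWPOISON  = (1 << 10),
    };
    #define TTU_ACTION(f)   ((f) & TTU_ACTION_MASK)

    static void unmap_one_model(int flags)
    {
        if (!(flags & TTU_IGNORE_MLOCK))
            printf("honouring VM_LOCKED\n");
        if (TTU_ACTION(flags) == TTU_MIGRATION)
            printf("installing migration entries\n");
        if (TTU_ACTION(flags) == TTU_MUNLOCK)
            printf("munlock-only pass\n");
    }

    int main(void)
    {
        /* memory-failure style caller: unmap, ignoring mlock and references */
        unmap_one_model(TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
        /* migration caller */
        unmap_one_model(TTU_MIGRATION);
        /* try_to_munlock() caller */
        unmap_one_model(TTU_MUNLOCK);
        return 0;
    }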
diff --git a/mm/shmem.c b/mm/shmem.c index d94d2e9146bc..356dd99566ec 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -24,6 +24,7 @@ | |||
| 24 | #include <linux/init.h> | 24 | #include <linux/init.h> |
| 25 | #include <linux/vfs.h> | 25 | #include <linux/vfs.h> |
| 26 | #include <linux/mount.h> | 26 | #include <linux/mount.h> |
| 27 | #include <linux/pagemap.h> | ||
| 27 | #include <linux/file.h> | 28 | #include <linux/file.h> |
| 28 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
| 29 | #include <linux/module.h> | 30 | #include <linux/module.h> |
| @@ -43,13 +44,11 @@ static struct vfsmount *shm_mnt; | |||
| 43 | #include <linux/exportfs.h> | 44 | #include <linux/exportfs.h> |
| 44 | #include <linux/generic_acl.h> | 45 | #include <linux/generic_acl.h> |
| 45 | #include <linux/mman.h> | 46 | #include <linux/mman.h> |
| 46 | #include <linux/pagemap.h> | ||
| 47 | #include <linux/string.h> | 47 | #include <linux/string.h> |
| 48 | #include <linux/slab.h> | 48 | #include <linux/slab.h> |
| 49 | #include <linux/backing-dev.h> | 49 | #include <linux/backing-dev.h> |
| 50 | #include <linux/shmem_fs.h> | 50 | #include <linux/shmem_fs.h> |
| 51 | #include <linux/writeback.h> | 51 | #include <linux/writeback.h> |
| 52 | #include <linux/vfs.h> | ||
| 53 | #include <linux/blkdev.h> | 52 | #include <linux/blkdev.h> |
| 54 | #include <linux/security.h> | 53 | #include <linux/security.h> |
| 55 | #include <linux/swapops.h> | 54 | #include <linux/swapops.h> |
| @@ -65,13 +64,28 @@ static struct vfsmount *shm_mnt; | |||
| 65 | #include <asm/div64.h> | 64 | #include <asm/div64.h> |
| 66 | #include <asm/pgtable.h> | 65 | #include <asm/pgtable.h> |
| 67 | 66 | ||
| 67 | /* | ||
| 68 | * The maximum size of a shmem/tmpfs file is limited by the maximum size of | ||
| 69 | * its triple-indirect swap vector - see illustration at shmem_swp_entry(). | ||
| 70 | * | ||
| 71 | * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel, | ||
| 72 | * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum | ||
| 73 | * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel, | ||
| 74 | * MAX_LFS_FILESIZE being then more restrictive than swap vector layout. | ||
| 75 | * | ||
| 76 | * We use / and * instead of shifts in the definitions below, so that the swap | ||
| 77 | * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE. | ||
| 78 | */ | ||
| 68 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) | 79 | #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) |
| 69 | #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) | 80 | #define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) |
| 70 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | 81 | |
| 82 | #define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) | ||
| 83 | #define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT) | ||
| 71 | 84 | ||
| 72 | #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) | 85 | #define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE) |
| 73 | #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) | 86 | #define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT)) |
| 74 | 87 | ||
| 88 | #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) | ||
| 75 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) | 89 | #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) |
| 76 | 90 | ||
| 77 | /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ | 91 | /* info->flags needs VM_flags to handle pagein/truncate races efficiently */ |
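Plugging numbers into the comment above (assuming 4kB pages and ignoring the small SHMEM_NR_DIRECT term), a quick standalone check of the swap-vector limit reproduces the "just over 2TB on 32-bit, about one eighth of that on 64-bit" figures.

    #include <stdio.h>

    static unsigned long long shmswp_max_bytes(unsigned long long page_size,
                                               unsigned long long ulong_size)
    {
        unsigned long long epp = page_size / ulong_size;  /* ENTRIES_PER_PAGE */
        unsigned long long eppp = epp * epp;              /* ENTRIES_PER_PAGEPAGE */
        unsigned long long max_index = (eppp / 2) * (epp + 1);

        return max_index * page_size;
    }

    int main(void)
    {
        /* 32-bit kernel: 1024 entries/page -> ~2.2e12 bytes, just over 2TB */
        printf("32-bit: %llu bytes\n", shmswp_max_bytes(4096, 4));
        /* 64-bit kernel: 512 entries/page -> roughly one eighth of the above */
        printf("64-bit: %llu bytes\n", shmswp_max_bytes(4096, 8));
        return 0;
    }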
| @@ -204,7 +218,7 @@ static const struct file_operations shmem_file_operations; | |||
| 204 | static const struct inode_operations shmem_inode_operations; | 218 | static const struct inode_operations shmem_inode_operations; |
| 205 | static const struct inode_operations shmem_dir_inode_operations; | 219 | static const struct inode_operations shmem_dir_inode_operations; |
| 206 | static const struct inode_operations shmem_special_inode_operations; | 220 | static const struct inode_operations shmem_special_inode_operations; |
| 207 | static struct vm_operations_struct shmem_vm_ops; | 221 | static const struct vm_operations_struct shmem_vm_ops; |
| 208 | 222 | ||
| 209 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { | 223 | static struct backing_dev_info shmem_backing_dev_info __read_mostly = { |
| 210 | .ra_pages = 0, /* No readahead */ | 224 | .ra_pages = 0, /* No readahead */ |
| @@ -1032,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
| 1032 | * sync from ever calling shmem_writepage; but a stacking filesystem | 1046 | * sync from ever calling shmem_writepage; but a stacking filesystem |
| 1033 | * may use the ->writepage of its underlying filesystem, in which case | 1047 | * may use the ->writepage of its underlying filesystem, in which case |
| 1034 | * tmpfs should write out to swap only in response to memory pressure, | 1048 | * tmpfs should write out to swap only in response to memory pressure, |
| 1035 | * and not for pdflush or sync. However, in those cases, we do still | 1049 | * and not for the writeback threads or sync. However, in those cases, |
| 1036 | * want to check if there's a redundant swappage to be discarded. | 1050 | * we do still want to check if there's a redundant swappage to be |
| 1051 | * discarded. | ||
| 1037 | */ | 1052 | */ |
| 1038 | if (wbc->for_reclaim) | 1053 | if (wbc->for_reclaim) |
| 1039 | swap = get_swap_page(); | 1054 | swap = get_swap_page(); |
| @@ -1082,7 +1097,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) | |||
| 1082 | shmem_swp_unmap(entry); | 1097 | shmem_swp_unmap(entry); |
| 1083 | unlock: | 1098 | unlock: |
| 1084 | spin_unlock(&info->lock); | 1099 | spin_unlock(&info->lock); |
| 1085 | swap_free(swap); | 1100 | /* |
| 1101 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
| 1102 | * clear SWAP_HAS_CACHE flag. | ||
| 1103 | */ | ||
| 1104 | swapcache_free(swap, NULL); | ||
| 1086 | redirty: | 1105 | redirty: |
| 1087 | set_page_dirty(page); | 1106 | set_page_dirty(page); |
| 1088 | if (wbc->for_reclaim) | 1107 | if (wbc->for_reclaim) |
| @@ -1325,8 +1344,12 @@ repeat: | |||
| 1325 | shmem_swp_unmap(entry); | 1344 | shmem_swp_unmap(entry); |
| 1326 | spin_unlock(&info->lock); | 1345 | spin_unlock(&info->lock); |
| 1327 | if (error == -ENOMEM) { | 1346 | if (error == -ENOMEM) { |
| 1328 | /* allow reclaim from this memory cgroup */ | 1347 | /* |
| 1329 | error = mem_cgroup_shrink_usage(swappage, | 1348 | * reclaim from proper memory cgroup and |
| 1349 | * call memcg's OOM if needed. | ||
| 1350 | */ | ||
| 1351 | error = mem_cgroup_shmem_charge_fallback( | ||
| 1352 | swappage, | ||
| 1330 | current->mm, | 1353 | current->mm, |
| 1331 | gfp); | 1354 | gfp); |
| 1332 | if (error) { | 1355 | if (error) { |
| @@ -1539,6 +1562,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, | |||
| 1539 | spin_lock_init(&info->lock); | 1562 | spin_lock_init(&info->lock); |
| 1540 | info->flags = flags & VM_NORESERVE; | 1563 | info->flags = flags & VM_NORESERVE; |
| 1541 | INIT_LIST_HEAD(&info->swaplist); | 1564 | INIT_LIST_HEAD(&info->swaplist); |
| 1565 | cache_no_acl(inode); | ||
| 1542 | 1566 | ||
| 1543 | switch (mode & S_IFMT) { | 1567 | switch (mode & S_IFMT) { |
| 1544 | default: | 1568 | default: |
| @@ -1610,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, | |||
| 1610 | if (pos + copied > inode->i_size) | 1634 | if (pos + copied > inode->i_size) |
| 1611 | i_size_write(inode, pos + copied); | 1635 | i_size_write(inode, pos + copied); |
| 1612 | 1636 | ||
| 1613 | unlock_page(page); | ||
| 1614 | set_page_dirty(page); | 1637 | set_page_dirty(page); |
| 1638 | unlock_page(page); | ||
| 1615 | page_cache_release(page); | 1639 | page_cache_release(page); |
| 1616 | 1640 | ||
| 1617 | return copied; | 1641 | return copied; |
| @@ -1948,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
| 1948 | iput(inode); | 1972 | iput(inode); |
| 1949 | return error; | 1973 | return error; |
| 1950 | } | 1974 | } |
| 1951 | unlock_page(page); | ||
| 1952 | inode->i_mapping->a_ops = &shmem_aops; | 1975 | inode->i_mapping->a_ops = &shmem_aops; |
| 1953 | inode->i_op = &shmem_symlink_inode_operations; | 1976 | inode->i_op = &shmem_symlink_inode_operations; |
| 1954 | kaddr = kmap_atomic(page, KM_USER0); | 1977 | kaddr = kmap_atomic(page, KM_USER0); |
| 1955 | memcpy(kaddr, symname, len); | 1978 | memcpy(kaddr, symname, len); |
| 1956 | kunmap_atomic(kaddr, KM_USER0); | 1979 | kunmap_atomic(kaddr, KM_USER0); |
| 1957 | set_page_dirty(page); | 1980 | set_page_dirty(page); |
| 1981 | unlock_page(page); | ||
| 1958 | page_cache_release(page); | 1982 | page_cache_release(page); |
| 1959 | } | 1983 | } |
| 1960 | if (dir->i_mode & S_ISGID) | 1984 | if (dir->i_mode & S_ISGID) |
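Both hunks above move set_page_dirty() in front of unlock_page(): the dirty bit is set while the page is still locked, so once the lock is dropped no other thread can see freshly written contents on a page that still looks clean. A loose userspace analogy of that publish-under-the-lock ordering, sketched with pthreads and invented names (not the kernel locking API):

    #include <pthread.h>
    #include <stdio.h>
    #include <string.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static char buffer[64];
    static int dirty;               /* analogue of the page dirty bit */

    static void writer(const char *data)
    {
            pthread_mutex_lock(&lock);
            strncpy(buffer, data, sizeof(buffer) - 1);
            dirty = 1;              /* mark dirty BEFORE dropping the lock */
            pthread_mutex_unlock(&lock);
    }

    static void *cleaner(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&lock);
            /* sees either the old contents, or the new contents with
             * dirty == 1; never new contents that still look clean */
            if (dirty) {
                    printf("flushing: %s\n", buffer);
                    dirty = 0;
            }
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;
            writer("hello");
            pthread_create(&t, NULL, cleaner, NULL);
            pthread_join(t, NULL);
            return 0;
    }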
| @@ -2278,8 +2302,7 @@ static void shmem_put_super(struct super_block *sb) | |||
| 2278 | sb->s_fs_info = NULL; | 2302 | sb->s_fs_info = NULL; |
| 2279 | } | 2303 | } |
| 2280 | 2304 | ||
| 2281 | static int shmem_fill_super(struct super_block *sb, | 2305 | int shmem_fill_super(struct super_block *sb, void *data, int silent) |
| 2282 | void *data, int silent) | ||
| 2283 | { | 2306 | { |
| 2284 | struct inode *inode; | 2307 | struct inode *inode; |
| 2285 | struct dentry *root; | 2308 | struct dentry *root; |
| @@ -2287,17 +2310,14 @@ static int shmem_fill_super(struct super_block *sb, | |||
| 2287 | int err = -ENOMEM; | 2310 | int err = -ENOMEM; |
| 2288 | 2311 | ||
| 2289 | /* Round up to L1_CACHE_BYTES to resist false sharing */ | 2312 | /* Round up to L1_CACHE_BYTES to resist false sharing */ |
| 2290 | sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), | 2313 | sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), |
| 2291 | L1_CACHE_BYTES), GFP_KERNEL); | 2314 | L1_CACHE_BYTES), GFP_KERNEL); |
| 2292 | if (!sbinfo) | 2315 | if (!sbinfo) |
| 2293 | return -ENOMEM; | 2316 | return -ENOMEM; |
| 2294 | 2317 | ||
| 2295 | sbinfo->max_blocks = 0; | ||
| 2296 | sbinfo->max_inodes = 0; | ||
| 2297 | sbinfo->mode = S_IRWXUGO | S_ISVTX; | 2318 | sbinfo->mode = S_IRWXUGO | S_ISVTX; |
| 2298 | sbinfo->uid = current_fsuid(); | 2319 | sbinfo->uid = current_fsuid(); |
| 2299 | sbinfo->gid = current_fsgid(); | 2320 | sbinfo->gid = current_fsgid(); |
| 2300 | sbinfo->mpol = NULL; | ||
| 2301 | sb->s_fs_info = sbinfo; | 2321 | sb->s_fs_info = sbinfo; |
| 2302 | 2322 | ||
| 2303 | #ifdef CONFIG_TMPFS | 2323 | #ifdef CONFIG_TMPFS |
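The shmem_fill_super() hunk above swaps kmalloc() plus field-by-field clearing for kzalloc(), so every field that is not explicitly set (max_blocks, max_inodes, mpol) starts out zero. A minimal userspace sketch of the same idiom, using calloc() in place of kzalloc() and a made-up struct purely for illustration:

    #include <stdio.h>
    #include <stdlib.h>

    struct sb_info {                        /* stand-in for struct shmem_sb_info */
            unsigned long max_blocks;
            unsigned long max_inodes;
            unsigned int mode;
            void *mpol;
    };

    static struct sb_info *sb_info_alloc(void)
    {
            /* calloc() hands back zeroed memory, so the fields the old code
             * cleared by hand need no explicit initialisation; only the
             * non-zero defaults are assigned */
            struct sb_info *sbinfo = calloc(1, sizeof(*sbinfo));

            if (!sbinfo)
                    return NULL;
            sbinfo->mode = 01777;           /* S_IRWXUGO | S_ISVTX */
            return sbinfo;
    }

    int main(void)
    {
            struct sb_info *sbinfo = sb_info_alloc();

            if (!sbinfo)
                    return 1;
            printf("max_blocks=%lu mode=%o\n", sbinfo->max_blocks, sbinfo->mode);
            free(sbinfo);
            return 0;
    }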
| @@ -2369,7 +2389,6 @@ static void shmem_destroy_inode(struct inode *inode) | |||
| 2369 | /* only struct inode is valid if it's an inline symlink */ | 2389 | /* only struct inode is valid if it's an inline symlink */ |
| 2370 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); | 2390 | mpol_free_shared_policy(&SHMEM_I(inode)->policy); |
| 2371 | } | 2391 | } |
| 2372 | shmem_acl_destroy_inode(inode); | ||
| 2373 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); | 2392 | kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); |
| 2374 | } | 2393 | } |
| 2375 | 2394 | ||
| @@ -2378,10 +2397,6 @@ static void init_once(void *foo) | |||
| 2378 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2397 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
| 2379 | 2398 | ||
| 2380 | inode_init_once(&p->vfs_inode); | 2399 | inode_init_once(&p->vfs_inode); |
| 2381 | #ifdef CONFIG_TMPFS_POSIX_ACL | ||
| 2382 | p->i_acl = NULL; | ||
| 2383 | p->i_default_acl = NULL; | ||
| 2384 | #endif | ||
| 2385 | } | 2400 | } |
| 2386 | 2401 | ||
| 2387 | static int init_inodecache(void) | 2402 | static int init_inodecache(void) |
| @@ -2406,6 +2421,7 @@ static const struct address_space_operations shmem_aops = { | |||
| 2406 | .write_end = shmem_write_end, | 2421 | .write_end = shmem_write_end, |
| 2407 | #endif | 2422 | #endif |
| 2408 | .migratepage = migrate_page, | 2423 | .migratepage = migrate_page, |
| 2424 | .error_remove_page = generic_error_remove_page, | ||
| 2409 | }; | 2425 | }; |
| 2410 | 2426 | ||
| 2411 | static const struct file_operations shmem_file_operations = { | 2427 | static const struct file_operations shmem_file_operations = { |
| @@ -2431,7 +2447,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
| 2431 | .getxattr = generic_getxattr, | 2447 | .getxattr = generic_getxattr, |
| 2432 | .listxattr = generic_listxattr, | 2448 | .listxattr = generic_listxattr, |
| 2433 | .removexattr = generic_removexattr, | 2449 | .removexattr = generic_removexattr, |
| 2434 | .permission = shmem_permission, | 2450 | .check_acl = shmem_check_acl, |
| 2435 | #endif | 2451 | #endif |
| 2436 | 2452 | ||
| 2437 | }; | 2453 | }; |
| @@ -2454,7 +2470,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
| 2454 | .getxattr = generic_getxattr, | 2470 | .getxattr = generic_getxattr, |
| 2455 | .listxattr = generic_listxattr, | 2471 | .listxattr = generic_listxattr, |
| 2456 | .removexattr = generic_removexattr, | 2472 | .removexattr = generic_removexattr, |
| 2457 | .permission = shmem_permission, | 2473 | .check_acl = shmem_check_acl, |
| 2458 | #endif | 2474 | #endif |
| 2459 | }; | 2475 | }; |
| 2460 | 2476 | ||
| @@ -2465,7 +2481,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
| 2465 | .getxattr = generic_getxattr, | 2481 | .getxattr = generic_getxattr, |
| 2466 | .listxattr = generic_listxattr, | 2482 | .listxattr = generic_listxattr, |
| 2467 | .removexattr = generic_removexattr, | 2483 | .removexattr = generic_removexattr, |
| 2468 | .permission = shmem_permission, | 2484 | .check_acl = shmem_check_acl, |
| 2469 | #endif | 2485 | #endif |
| 2470 | }; | 2486 | }; |
| 2471 | 2487 | ||
| @@ -2482,7 +2498,7 @@ static const struct super_operations shmem_ops = { | |||
| 2482 | .put_super = shmem_put_super, | 2498 | .put_super = shmem_put_super, |
| 2483 | }; | 2499 | }; |
| 2484 | 2500 | ||
| 2485 | static struct vm_operations_struct shmem_vm_ops = { | 2501 | static const struct vm_operations_struct shmem_vm_ops = { |
| 2486 | .fault = shmem_fault, | 2502 | .fault = shmem_fault, |
| 2487 | #ifdef CONFIG_NUMA | 2503 | #ifdef CONFIG_NUMA |
| 2488 | .set_policy = shmem_set_policy, | 2504 | .set_policy = shmem_set_policy, |
| @@ -2504,7 +2520,7 @@ static struct file_system_type tmpfs_fs_type = { | |||
| 2504 | .kill_sb = kill_litter_super, | 2520 | .kill_sb = kill_litter_super, |
| 2505 | }; | 2521 | }; |
| 2506 | 2522 | ||
| 2507 | static int __init init_tmpfs(void) | 2523 | int __init init_tmpfs(void) |
| 2508 | { | 2524 | { |
| 2509 | int error; | 2525 | int error; |
| 2510 | 2526 | ||
| @@ -2561,7 +2577,7 @@ static struct file_system_type tmpfs_fs_type = { | |||
| 2561 | .kill_sb = kill_litter_super, | 2577 | .kill_sb = kill_litter_super, |
| 2562 | }; | 2578 | }; |
| 2563 | 2579 | ||
| 2564 | static int __init init_tmpfs(void) | 2580 | int __init init_tmpfs(void) |
| 2565 | { | 2581 | { |
| 2566 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 2582 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); |
| 2567 | 2583 | ||
| @@ -2576,12 +2592,17 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
| 2576 | return 0; | 2592 | return 0; |
| 2577 | } | 2593 | } |
| 2578 | 2594 | ||
| 2595 | int shmem_lock(struct file *file, int lock, struct user_struct *user) | ||
| 2596 | { | ||
| 2597 | return 0; | ||
| 2598 | } | ||
| 2599 | |||
| 2579 | #define shmem_vm_ops generic_file_vm_ops | 2600 | #define shmem_vm_ops generic_file_vm_ops |
| 2580 | #define shmem_file_operations ramfs_file_operations | 2601 | #define shmem_file_operations ramfs_file_operations |
| 2581 | #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) | 2602 | #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) |
| 2582 | #define shmem_acct_size(flags, size) 0 | 2603 | #define shmem_acct_size(flags, size) 0 |
| 2583 | #define shmem_unacct_size(flags, size) do {} while (0) | 2604 | #define shmem_unacct_size(flags, size) do {} while (0) |
| 2584 | #define SHMEM_MAX_BYTES LLONG_MAX | 2605 | #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE |
| 2585 | 2606 | ||
| 2586 | #endif /* CONFIG_SHMEM */ | 2607 | #endif /* CONFIG_SHMEM */ |
| 2587 | 2608 | ||
| @@ -2593,7 +2614,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) | |||
| 2593 | * @size: size to be set for the file | 2614 | * @size: size to be set for the file |
| 2594 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size | 2615 | * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size |
| 2595 | */ | 2616 | */ |
| 2596 | struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | 2617 | struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) |
| 2597 | { | 2618 | { |
| 2598 | int error; | 2619 | int error; |
| 2599 | struct file *file; | 2620 | struct file *file; |
| @@ -2640,6 +2661,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) | |||
| 2640 | if (error) | 2661 | if (error) |
| 2641 | goto close_file; | 2662 | goto close_file; |
| 2642 | #endif | 2663 | #endif |
| 2664 | ima_counts_get(file); | ||
| 2643 | return file; | 2665 | return file; |
| 2644 | 2666 | ||
| 2645 | close_file: | 2667 | close_file: |
| @@ -2665,12 +2687,9 @@ int shmem_zero_setup(struct vm_area_struct *vma) | |||
| 2665 | if (IS_ERR(file)) | 2687 | if (IS_ERR(file)) |
| 2666 | return PTR_ERR(file); | 2688 | return PTR_ERR(file); |
| 2667 | 2689 | ||
| 2668 | ima_shm_check(file); | ||
| 2669 | if (vma->vm_file) | 2690 | if (vma->vm_file) |
| 2670 | fput(vma->vm_file); | 2691 | fput(vma->vm_file); |
| 2671 | vma->vm_file = file; | 2692 | vma->vm_file = file; |
| 2672 | vma->vm_ops = &shmem_vm_ops; | 2693 | vma->vm_ops = &shmem_vm_ops; |
| 2673 | return 0; | 2694 | return 0; |
| 2674 | } | 2695 | } |
| 2675 | |||
| 2676 | module_init(init_tmpfs) | ||
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c index 8e5aadd7dcd6..df2c87fdae50 100644 --- a/mm/shmem_acl.c +++ b/mm/shmem_acl.c | |||
| @@ -22,11 +22,11 @@ shmem_get_acl(struct inode *inode, int type) | |||
| 22 | spin_lock(&inode->i_lock); | 22 | spin_lock(&inode->i_lock); |
| 23 | switch(type) { | 23 | switch(type) { |
| 24 | case ACL_TYPE_ACCESS: | 24 | case ACL_TYPE_ACCESS: |
| 25 | acl = posix_acl_dup(SHMEM_I(inode)->i_acl); | 25 | acl = posix_acl_dup(inode->i_acl); |
| 26 | break; | 26 | break; |
| 27 | 27 | ||
| 28 | case ACL_TYPE_DEFAULT: | 28 | case ACL_TYPE_DEFAULT: |
| 29 | acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); | 29 | acl = posix_acl_dup(inode->i_default_acl); |
| 30 | break; | 30 | break; |
| 31 | } | 31 | } |
| 32 | spin_unlock(&inode->i_lock); | 32 | spin_unlock(&inode->i_lock); |
| @@ -45,13 +45,13 @@ shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
| 45 | spin_lock(&inode->i_lock); | 45 | spin_lock(&inode->i_lock); |
| 46 | switch(type) { | 46 | switch(type) { |
| 47 | case ACL_TYPE_ACCESS: | 47 | case ACL_TYPE_ACCESS: |
| 48 | free = SHMEM_I(inode)->i_acl; | 48 | free = inode->i_acl; |
| 49 | SHMEM_I(inode)->i_acl = posix_acl_dup(acl); | 49 | inode->i_acl = posix_acl_dup(acl); |
| 50 | break; | 50 | break; |
| 51 | 51 | ||
| 52 | case ACL_TYPE_DEFAULT: | 52 | case ACL_TYPE_DEFAULT: |
| 53 | free = SHMEM_I(inode)->i_default_acl; | 53 | free = inode->i_default_acl; |
| 54 | SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); | 54 | inode->i_default_acl = posix_acl_dup(acl); |
| 55 | break; | 55 | break; |
| 56 | } | 56 | } |
| 57 | spin_unlock(&inode->i_lock); | 57 | spin_unlock(&inode->i_lock); |
| @@ -155,26 +155,9 @@ shmem_acl_init(struct inode *inode, struct inode *dir) | |||
| 155 | } | 155 | } |
| 156 | 156 | ||
| 157 | /** | 157 | /** |
| 158 | * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode | ||
| 159 | * | ||
| 160 | * This is done before destroying the actual inode. | ||
| 161 | */ | ||
| 162 | |||
| 163 | void | ||
| 164 | shmem_acl_destroy_inode(struct inode *inode) | ||
| 165 | { | ||
| 166 | if (SHMEM_I(inode)->i_acl) | ||
| 167 | posix_acl_release(SHMEM_I(inode)->i_acl); | ||
| 168 | SHMEM_I(inode)->i_acl = NULL; | ||
| 169 | if (SHMEM_I(inode)->i_default_acl) | ||
| 170 | posix_acl_release(SHMEM_I(inode)->i_default_acl); | ||
| 171 | SHMEM_I(inode)->i_default_acl = NULL; | ||
| 172 | } | ||
| 173 | |||
| 174 | /** | ||
| 175 | * shmem_check_acl - check_acl() callback for generic_permission() | 158 | * shmem_check_acl - check_acl() callback for generic_permission() |
| 176 | */ | 159 | */ |
| 177 | static int | 160 | int |
| 178 | shmem_check_acl(struct inode *inode, int mask) | 161 | shmem_check_acl(struct inode *inode, int mask) |
| 179 | { | 162 | { |
| 180 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | 163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); |
| @@ -186,12 +169,3 @@ shmem_check_acl(struct inode *inode, int mask) | |||
| 186 | } | 169 | } |
| 187 | return -EAGAIN; | 170 | return -EAGAIN; |
| 188 | } | 171 | } |
| 189 | |||
| 190 | /** | ||
| 191 | * shmem_permission - permission() inode operation | ||
| 192 | */ | ||
| 193 | int | ||
| 194 | shmem_permission(struct inode *inode, int mask) | ||
| 195 | { | ||
| 196 | return generic_permission(inode, mask, shmem_check_acl); | ||
| 197 | } | ||
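The shmem_acl.c side of the change drops the filesystem-private ACL pointers and the shmem_permission() wrapper: shmem_check_acl() is now exported and wired up as the inodes' check_acl hook, instead of shmem passing it as the third argument of generic_permission() itself (as the removed wrapper did). A rough userspace model of that callback pattern, with invented names and a plain -1 standing in for -EACCES, not the kernel signatures:

    #include <stdio.h>

    struct inode { int mode; int acl_allows; };

    /* generic permission check that defers to an optional per-fs ACL hook */
    typedef int (*check_acl_t)(const struct inode *inode, int mask);

    static int generic_permission(const struct inode *inode, int mask,
                                  check_acl_t check_acl)
    {
            if ((inode->mode & mask) == mask)
                    return 0;               /* mode bits already allow it */
            if (check_acl && check_acl(inode, mask) == 0)
                    return 0;               /* ACL grants what mode bits denied */
            return -1;                      /* stand-in for -EACCES */
    }

    /* per-filesystem hook, analogous to shmem_check_acl() */
    static int shmem_like_check_acl(const struct inode *inode, int mask)
    {
            (void)mask;
            return inode->acl_allows ? 0 : -1;
    }

    int main(void)
    {
            struct inode ino = { .mode = 0, .acl_allows = 1 };

            printf("%d\n", generic_permission(&ino, 4, shmem_like_check_acl));
            return 0;
    }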
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -102,16 +102,19 @@ | |||
| 102 | #include <linux/cpu.h> | 102 | #include <linux/cpu.h> |
| 103 | #include <linux/sysctl.h> | 103 | #include <linux/sysctl.h> |
| 104 | #include <linux/module.h> | 104 | #include <linux/module.h> |
| 105 | #include <linux/kmemtrace.h> | ||
| 105 | #include <linux/rcupdate.h> | 106 | #include <linux/rcupdate.h> |
| 106 | #include <linux/string.h> | 107 | #include <linux/string.h> |
| 107 | #include <linux/uaccess.h> | 108 | #include <linux/uaccess.h> |
| 108 | #include <linux/nodemask.h> | 109 | #include <linux/nodemask.h> |
| 110 | #include <linux/kmemleak.h> | ||
| 109 | #include <linux/mempolicy.h> | 111 | #include <linux/mempolicy.h> |
| 110 | #include <linux/mutex.h> | 112 | #include <linux/mutex.h> |
| 111 | #include <linux/fault-inject.h> | 113 | #include <linux/fault-inject.h> |
| 112 | #include <linux/rtmutex.h> | 114 | #include <linux/rtmutex.h> |
| 113 | #include <linux/reciprocal_div.h> | 115 | #include <linux/reciprocal_div.h> |
| 114 | #include <linux/debugobjects.h> | 116 | #include <linux/debugobjects.h> |
| 117 | #include <linux/kmemcheck.h> | ||
| 115 | 118 | ||
| 116 | #include <asm/cacheflush.h> | 119 | #include <asm/cacheflush.h> |
| 117 | #include <asm/tlbflush.h> | 120 | #include <asm/tlbflush.h> |
| @@ -177,13 +180,13 @@ | |||
| 177 | SLAB_STORE_USER | \ | 180 | SLAB_STORE_USER | \ |
| 178 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 181 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
| 179 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 182 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
| 180 | SLAB_DEBUG_OBJECTS) | 183 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
| 181 | #else | 184 | #else |
| 182 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ | 185 | # define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ |
| 183 | SLAB_CACHE_DMA | \ | 186 | SLAB_CACHE_DMA | \ |
| 184 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ | 187 | SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ |
| 185 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ | 188 | SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \ |
| 186 | SLAB_DEBUG_OBJECTS) | 189 | SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE | SLAB_NOTRACK) |
| 187 | #endif | 190 | #endif |
| 188 | 191 | ||
| 189 | /* | 192 | /* |
| @@ -314,7 +317,7 @@ static int drain_freelist(struct kmem_cache *cache, | |||
| 314 | struct kmem_list3 *l3, int tofree); | 317 | struct kmem_list3 *l3, int tofree); |
| 315 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | 318 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, |
| 316 | int node); | 319 | int node); |
| 317 | static int enable_cpucache(struct kmem_cache *cachep); | 320 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); |
| 318 | static void cache_reap(struct work_struct *unused); | 321 | static void cache_reap(struct work_struct *unused); |
| 319 | 322 | ||
| 320 | /* | 323 | /* |
| @@ -372,87 +375,6 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
| 372 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ | 375 | MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ |
| 373 | } while (0) | 376 | } while (0) |
| 374 | 377 | ||
| 375 | /* | ||
| 376 | * struct kmem_cache | ||
| 377 | * | ||
| 378 | * manages a cache. | ||
| 379 | */ | ||
| 380 | |||
| 381 | struct kmem_cache { | ||
| 382 | /* 1) per-cpu data, touched during every alloc/free */ | ||
| 383 | struct array_cache *array[NR_CPUS]; | ||
| 384 | /* 2) Cache tunables. Protected by cache_chain_mutex */ | ||
| 385 | unsigned int batchcount; | ||
| 386 | unsigned int limit; | ||
| 387 | unsigned int shared; | ||
| 388 | |||
| 389 | unsigned int buffer_size; | ||
| 390 | u32 reciprocal_buffer_size; | ||
| 391 | /* 3) touched by every alloc & free from the backend */ | ||
| 392 | |||
| 393 | unsigned int flags; /* constant flags */ | ||
| 394 | unsigned int num; /* # of objs per slab */ | ||
| 395 | |||
| 396 | /* 4) cache_grow/shrink */ | ||
| 397 | /* order of pgs per slab (2^n) */ | ||
| 398 | unsigned int gfporder; | ||
| 399 | |||
| 400 | /* force GFP flags, e.g. GFP_DMA */ | ||
| 401 | gfp_t gfpflags; | ||
| 402 | |||
| 403 | size_t colour; /* cache colouring range */ | ||
| 404 | unsigned int colour_off; /* colour offset */ | ||
| 405 | struct kmem_cache *slabp_cache; | ||
| 406 | unsigned int slab_size; | ||
| 407 | unsigned int dflags; /* dynamic flags */ | ||
| 408 | |||
| 409 | /* constructor func */ | ||
| 410 | void (*ctor)(void *obj); | ||
| 411 | |||
| 412 | /* 5) cache creation/removal */ | ||
| 413 | const char *name; | ||
| 414 | struct list_head next; | ||
| 415 | |||
| 416 | /* 6) statistics */ | ||
| 417 | #if STATS | ||
| 418 | unsigned long num_active; | ||
| 419 | unsigned long num_allocations; | ||
| 420 | unsigned long high_mark; | ||
| 421 | unsigned long grown; | ||
| 422 | unsigned long reaped; | ||
| 423 | unsigned long errors; | ||
| 424 | unsigned long max_freeable; | ||
| 425 | unsigned long node_allocs; | ||
| 426 | unsigned long node_frees; | ||
| 427 | unsigned long node_overflow; | ||
| 428 | atomic_t allochit; | ||
| 429 | atomic_t allocmiss; | ||
| 430 | atomic_t freehit; | ||
| 431 | atomic_t freemiss; | ||
| 432 | #endif | ||
| 433 | #if DEBUG | ||
| 434 | /* | ||
| 435 | * If debugging is enabled, then the allocator can add additional | ||
| 436 | * fields and/or padding to every object. buffer_size contains the total | ||
| 437 | * object size including these internal fields, the following two | ||
| 438 | * variables contain the offset to the user object and its size. | ||
| 439 | */ | ||
| 440 | int obj_offset; | ||
| 441 | int obj_size; | ||
| 442 | #endif | ||
| 443 | /* | ||
| 444 | * We put nodelists[] at the end of kmem_cache, because we want to size | ||
| 445 | * this array to nr_node_ids slots instead of MAX_NUMNODES | ||
| 446 | * (see kmem_cache_init()) | ||
| 447 | * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache | ||
| 448 | * is statically defined, so we reserve the max number of nodes. | ||
| 449 | */ | ||
| 450 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | ||
| 451 | /* | ||
| 452 | * Do not add fields after nodelists[] | ||
| 453 | */ | ||
| 454 | }; | ||
| 455 | |||
| 456 | #define CFLGS_OFF_SLAB (0x80000000UL) | 378 | #define CFLGS_OFF_SLAB (0x80000000UL) |
| 457 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) | 379 | #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) |
| 458 | 380 | ||
| @@ -568,6 +490,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
| 568 | 490 | ||
| 569 | #endif | 491 | #endif |
| 570 | 492 | ||
| 493 | #ifdef CONFIG_KMEMTRACE | ||
| 494 | size_t slab_buffer_size(struct kmem_cache *cachep) | ||
| 495 | { | ||
| 496 | return cachep->buffer_size; | ||
| 497 | } | ||
| 498 | EXPORT_SYMBOL(slab_buffer_size); | ||
| 499 | #endif | ||
| 500 | |||
| 571 | /* | 501 | /* |
| 572 | * Do not go above this order unless 0 objects fit into the slab. | 502 | * Do not go above this order unless 0 objects fit into the slab. |
| 573 | */ | 503 | */ |
| @@ -743,6 +673,7 @@ static enum { | |||
| 743 | NONE, | 673 | NONE, |
| 744 | PARTIAL_AC, | 674 | PARTIAL_AC, |
| 745 | PARTIAL_L3, | 675 | PARTIAL_L3, |
| 676 | EARLY, | ||
| 746 | FULL | 677 | FULL |
| 747 | } g_cpucache_up; | 678 | } g_cpucache_up; |
| 748 | 679 | ||
| @@ -751,7 +682,7 @@ static enum { | |||
| 751 | */ | 682 | */ |
| 752 | int slab_is_available(void) | 683 | int slab_is_available(void) |
| 753 | { | 684 | { |
| 754 | return g_cpucache_up == FULL; | 685 | return g_cpucache_up >= EARLY; |
| 755 | } | 686 | } |
| 756 | 687 | ||
| 757 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | 688 | static DEFINE_PER_CPU(struct delayed_work, reap_work); |
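The new EARLY state and the relaxed slab_is_available() test above, together with the kmem_cache_init()/kmem_cache_init_late() split and the slab_is_available() checks added to kmem_cache_create() further down, form a small bootstrap state machine: non-sleeping GFP_NOWAIT allocations are used until the allocator reports itself available, after which plain GFP_KERNEL is used. A simplified userspace sketch of that pattern; the enum values mirror the diff, everything else is invented for illustration:

    #include <stdio.h>

    /* mirrors the g_cpucache_up states in the diff, including the new EARLY */
    enum cpucache_state { NONE, PARTIAL_AC, PARTIAL_L3, EARLY, FULL };

    static enum cpucache_state g_cpucache_up = NONE;

    /* the allocator now counts as available from EARLY on, not only at FULL */
    static int slab_is_available(void)
    {
            return g_cpucache_up >= EARLY;
    }

    /* models the gfp choice added to kmem_cache_create() */
    static const char *cache_create_gfp(void)
    {
            return slab_is_available() ? "GFP_KERNEL" : "GFP_NOWAIT";
    }

    int main(void)
    {
            printf("boot:  %s\n", cache_create_gfp());   /* GFP_NOWAIT */
            g_cpucache_up = EARLY;      /* end of kmem_cache_init() */
            printf("early: %s\n", cache_create_gfp());   /* GFP_KERNEL */
            g_cpucache_up = FULL;       /* end of kmem_cache_init_late() */
            printf("full:  %s\n", cache_create_gfp());
            return 0;
    }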
| @@ -881,7 +812,6 @@ static void __slab_error(const char *function, struct kmem_cache *cachep, | |||
| 881 | */ | 812 | */ |
| 882 | 813 | ||
| 883 | static int use_alien_caches __read_mostly = 1; | 814 | static int use_alien_caches __read_mostly = 1; |
| 884 | static int numa_platform __read_mostly = 1; | ||
| 885 | static int __init noaliencache_setup(char *s) | 815 | static int __init noaliencache_setup(char *s) |
| 886 | { | 816 | { |
| 887 | use_alien_caches = 0; | 817 | use_alien_caches = 0; |
| @@ -949,12 +879,20 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
| 949 | } | 879 | } |
| 950 | 880 | ||
| 951 | static struct array_cache *alloc_arraycache(int node, int entries, | 881 | static struct array_cache *alloc_arraycache(int node, int entries, |
| 952 | int batchcount) | 882 | int batchcount, gfp_t gfp) |
| 953 | { | 883 | { |
| 954 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); | 884 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); |
| 955 | struct array_cache *nc = NULL; | 885 | struct array_cache *nc = NULL; |
| 956 | 886 | ||
| 957 | nc = kmalloc_node(memsize, GFP_KERNEL, node); | 887 | nc = kmalloc_node(memsize, gfp, node); |
| 888 | /* | ||
| 889 | * The array_cache structures contain pointers to free objects. | ||
| 890 | * However, when such objects are allocated or transferred to another | ||
| 891 | * cache the pointers are not cleared and they could be counted as | ||
| 892 | * valid references during a kmemleak scan. Therefore, kmemleak must | ||
| 893 | * not scan such objects. | ||
| 894 | */ | ||
| 895 | kmemleak_no_scan(nc); | ||
| 958 | if (nc) { | 896 | if (nc) { |
| 959 | nc->avail = 0; | 897 | nc->avail = 0; |
| 960 | nc->limit = entries; | 898 | nc->limit = entries; |
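alloc_arraycache() above now takes the caller's gfp flags and, more importantly, marks the array with kmemleak_no_scan(): the array is a per-CPU stack of pointers to objects that are currently free, so treating its slots as live references would hide real leaks. A stripped-down userspace model of such a pointer cache, purely for illustration and much simpler than the kernel's struct array_cache:

    #include <stdlib.h>

    /* crude model: a bounded LIFO of object pointers */
    struct ptr_cache {
            unsigned int avail;     /* number of cached pointers */
            unsigned int limit;     /* capacity of entry[] */
            unsigned int batchcount;
            void *entry[];          /* flexible array, like the kernel's */
    };

    static struct ptr_cache *ptr_cache_alloc(unsigned int entries,
                                             unsigned int batchcount)
    {
            struct ptr_cache *pc = malloc(sizeof(*pc) +
                                          entries * sizeof(void *));
            if (!pc)
                    return NULL;
            /* a leak checker should NOT follow entry[]: the slots hold
             * pointers to free objects, not owning references */
            pc->avail = 0;
            pc->limit = entries;
            pc->batchcount = batchcount;
            return pc;
    }

    /* put/get mirror how the free and alloc fast paths use the array */
    static int ptr_cache_put(struct ptr_cache *pc, void *obj)
    {
            if (pc->avail >= pc->limit)
                    return -1;              /* caller must flush a batch */
            pc->entry[pc->avail++] = obj;
            return 0;
    }

    static void *ptr_cache_get(struct ptr_cache *pc)
    {
            return pc->avail ? pc->entry[--pc->avail] : NULL;
    }

    int main(void)
    {
            struct ptr_cache *pc = ptr_cache_alloc(16, 4);
            int x;

            if (!pc)
                    return 1;
            ptr_cache_put(pc, &x);
            (void)ptr_cache_get(pc);
            free(pc);
            return 0;
    }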
| @@ -994,7 +932,7 @@ static int transfer_objects(struct array_cache *to, | |||
| 994 | #define drain_alien_cache(cachep, alien) do { } while (0) | 932 | #define drain_alien_cache(cachep, alien) do { } while (0) |
| 995 | #define reap_alien(cachep, l3) do { } while (0) | 933 | #define reap_alien(cachep, l3) do { } while (0) |
| 996 | 934 | ||
| 997 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | 935 | static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
| 998 | { | 936 | { |
| 999 | return (struct array_cache **)BAD_ALIEN_MAGIC; | 937 | return (struct array_cache **)BAD_ALIEN_MAGIC; |
| 1000 | } | 938 | } |
| @@ -1025,7 +963,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, | |||
| 1025 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); | 963 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
| 1026 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); | 964 | static void *alternate_node_alloc(struct kmem_cache *, gfp_t); |
| 1027 | 965 | ||
| 1028 | static struct array_cache **alloc_alien_cache(int node, int limit) | 966 | static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) |
| 1029 | { | 967 | { |
| 1030 | struct array_cache **ac_ptr; | 968 | struct array_cache **ac_ptr; |
| 1031 | int memsize = sizeof(void *) * nr_node_ids; | 969 | int memsize = sizeof(void *) * nr_node_ids; |
| @@ -1033,14 +971,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit) | |||
| 1033 | 971 | ||
| 1034 | if (limit > 1) | 972 | if (limit > 1) |
| 1035 | limit = 12; | 973 | limit = 12; |
| 1036 | ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node); | 974 | ac_ptr = kmalloc_node(memsize, gfp, node); |
| 1037 | if (ac_ptr) { | 975 | if (ac_ptr) { |
| 1038 | for_each_node(i) { | 976 | for_each_node(i) { |
| 1039 | if (i == node || !node_online(i)) { | 977 | if (i == node || !node_online(i)) { |
| 1040 | ac_ptr[i] = NULL; | 978 | ac_ptr[i] = NULL; |
| 1041 | continue; | 979 | continue; |
| 1042 | } | 980 | } |
| 1043 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); | 981 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
| 1044 | if (!ac_ptr[i]) { | 982 | if (!ac_ptr[i]) { |
| 1045 | for (i--; i >= 0; i--) | 983 | for (i--; i >= 0; i--) |
| 1046 | kfree(ac_ptr[i]); | 984 | kfree(ac_ptr[i]); |
| @@ -1160,7 +1098,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
| 1160 | struct kmem_cache *cachep; | 1098 | struct kmem_cache *cachep; |
| 1161 | struct kmem_list3 *l3 = NULL; | 1099 | struct kmem_list3 *l3 = NULL; |
| 1162 | int node = cpu_to_node(cpu); | 1100 | int node = cpu_to_node(cpu); |
| 1163 | node_to_cpumask_ptr(mask, node); | 1101 | const struct cpumask *mask = cpumask_of_node(node); |
| 1164 | 1102 | ||
| 1165 | list_for_each_entry(cachep, &cache_chain, next) { | 1103 | list_for_each_entry(cachep, &cache_chain, next) { |
| 1166 | struct array_cache *nc; | 1104 | struct array_cache *nc; |
| @@ -1273,20 +1211,20 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
| 1273 | struct array_cache **alien = NULL; | 1211 | struct array_cache **alien = NULL; |
| 1274 | 1212 | ||
| 1275 | nc = alloc_arraycache(node, cachep->limit, | 1213 | nc = alloc_arraycache(node, cachep->limit, |
| 1276 | cachep->batchcount); | 1214 | cachep->batchcount, GFP_KERNEL); |
| 1277 | if (!nc) | 1215 | if (!nc) |
| 1278 | goto bad; | 1216 | goto bad; |
| 1279 | if (cachep->shared) { | 1217 | if (cachep->shared) { |
| 1280 | shared = alloc_arraycache(node, | 1218 | shared = alloc_arraycache(node, |
| 1281 | cachep->shared * cachep->batchcount, | 1219 | cachep->shared * cachep->batchcount, |
| 1282 | 0xbaadf00d); | 1220 | 0xbaadf00d, GFP_KERNEL); |
| 1283 | if (!shared) { | 1221 | if (!shared) { |
| 1284 | kfree(nc); | 1222 | kfree(nc); |
| 1285 | goto bad; | 1223 | goto bad; |
| 1286 | } | 1224 | } |
| 1287 | } | 1225 | } |
| 1288 | if (use_alien_caches) { | 1226 | if (use_alien_caches) { |
| 1289 | alien = alloc_alien_cache(node, cachep->limit); | 1227 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); |
| 1290 | if (!alien) { | 1228 | if (!alien) { |
| 1291 | kfree(shared); | 1229 | kfree(shared); |
| 1292 | kfree(nc); | 1230 | kfree(nc); |
| @@ -1390,10 +1328,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | |||
| 1390 | { | 1328 | { |
| 1391 | struct kmem_list3 *ptr; | 1329 | struct kmem_list3 *ptr; |
| 1392 | 1330 | ||
| 1393 | ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid); | 1331 | ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); |
| 1394 | BUG_ON(!ptr); | 1332 | BUG_ON(!ptr); |
| 1395 | 1333 | ||
| 1396 | local_irq_disable(); | ||
| 1397 | memcpy(ptr, list, sizeof(struct kmem_list3)); | 1334 | memcpy(ptr, list, sizeof(struct kmem_list3)); |
| 1398 | /* | 1335 | /* |
| 1399 | * Do not assume that spinlocks can be initialized via memcpy: | 1336 | * Do not assume that spinlocks can be initialized via memcpy: |
| @@ -1402,7 +1339,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | |||
| 1402 | 1339 | ||
| 1403 | MAKE_ALL_LISTS(cachep, ptr, nodeid); | 1340 | MAKE_ALL_LISTS(cachep, ptr, nodeid); |
| 1404 | cachep->nodelists[nodeid] = ptr; | 1341 | cachep->nodelists[nodeid] = ptr; |
| 1405 | local_irq_enable(); | ||
| 1406 | } | 1342 | } |
| 1407 | 1343 | ||
| 1408 | /* | 1344 | /* |
| @@ -1434,10 +1370,8 @@ void __init kmem_cache_init(void) | |||
| 1434 | int order; | 1370 | int order; |
| 1435 | int node; | 1371 | int node; |
| 1436 | 1372 | ||
| 1437 | if (num_possible_nodes() == 1) { | 1373 | if (num_possible_nodes() == 1) |
| 1438 | use_alien_caches = 0; | 1374 | use_alien_caches = 0; |
| 1439 | numa_platform = 0; | ||
| 1440 | } | ||
| 1441 | 1375 | ||
| 1442 | for (i = 0; i < NUM_INIT_LISTS; i++) { | 1376 | for (i = 0; i < NUM_INIT_LISTS; i++) { |
| 1443 | kmem_list3_init(&initkmem_list3[i]); | 1377 | kmem_list3_init(&initkmem_list3[i]); |
| @@ -1450,7 +1384,7 @@ void __init kmem_cache_init(void) | |||
| 1450 | * Fragmentation resistance on low memory - only use bigger | 1384 | * Fragmentation resistance on low memory - only use bigger |
| 1451 | * page orders on machines with more than 32MB of memory. | 1385 | * page orders on machines with more than 32MB of memory. |
| 1452 | */ | 1386 | */ |
| 1453 | if (num_physpages > (32 << 20) >> PAGE_SHIFT) | 1387 | if (totalram_pages > (32 << 20) >> PAGE_SHIFT) |
| 1454 | slab_break_gfp_order = BREAK_GFP_ORDER_HI; | 1388 | slab_break_gfp_order = BREAK_GFP_ORDER_HI; |
| 1455 | 1389 | ||
| 1456 | /* Bootstrap is tricky, because several objects are allocated | 1390 | /* Bootstrap is tricky, because several objects are allocated |
| @@ -1566,9 +1500,8 @@ void __init kmem_cache_init(void) | |||
| 1566 | { | 1500 | { |
| 1567 | struct array_cache *ptr; | 1501 | struct array_cache *ptr; |
| 1568 | 1502 | ||
| 1569 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1503 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
| 1570 | 1504 | ||
| 1571 | local_irq_disable(); | ||
| 1572 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); | 1505 | BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache); |
| 1573 | memcpy(ptr, cpu_cache_get(&cache_cache), | 1506 | memcpy(ptr, cpu_cache_get(&cache_cache), |
| 1574 | sizeof(struct arraycache_init)); | 1507 | sizeof(struct arraycache_init)); |
| @@ -1578,11 +1511,9 @@ void __init kmem_cache_init(void) | |||
| 1578 | spin_lock_init(&ptr->lock); | 1511 | spin_lock_init(&ptr->lock); |
| 1579 | 1512 | ||
| 1580 | cache_cache.array[smp_processor_id()] = ptr; | 1513 | cache_cache.array[smp_processor_id()] = ptr; |
| 1581 | local_irq_enable(); | ||
| 1582 | 1514 | ||
| 1583 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1515 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); |
| 1584 | 1516 | ||
| 1585 | local_irq_disable(); | ||
| 1586 | BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) | 1517 | BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) |
| 1587 | != &initarray_generic.cache); | 1518 | != &initarray_generic.cache); |
| 1588 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), | 1519 | memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), |
| @@ -1594,7 +1525,6 @@ void __init kmem_cache_init(void) | |||
| 1594 | 1525 | ||
| 1595 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = | 1526 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = |
| 1596 | ptr; | 1527 | ptr; |
| 1597 | local_irq_enable(); | ||
| 1598 | } | 1528 | } |
| 1599 | /* 5) Replace the bootstrap kmem_list3's */ | 1529 | /* 5) Replace the bootstrap kmem_list3's */ |
| 1600 | { | 1530 | { |
| @@ -1613,23 +1543,26 @@ void __init kmem_cache_init(void) | |||
| 1613 | } | 1543 | } |
| 1614 | } | 1544 | } |
| 1615 | 1545 | ||
| 1616 | /* 6) resize the head arrays to their final sizes */ | 1546 | g_cpucache_up = EARLY; |
| 1617 | { | 1547 | } |
| 1618 | struct kmem_cache *cachep; | ||
| 1619 | mutex_lock(&cache_chain_mutex); | ||
| 1620 | list_for_each_entry(cachep, &cache_chain, next) | ||
| 1621 | if (enable_cpucache(cachep)) | ||
| 1622 | BUG(); | ||
| 1623 | mutex_unlock(&cache_chain_mutex); | ||
| 1624 | } | ||
| 1625 | 1548 | ||
| 1626 | /* Annotate slab for lockdep -- annotate the malloc caches */ | 1549 | void __init kmem_cache_init_late(void) |
| 1627 | init_lock_keys(); | 1550 | { |
| 1551 | struct kmem_cache *cachep; | ||
| 1628 | 1552 | ||
| 1553 | /* 6) resize the head arrays to their final sizes */ | ||
| 1554 | mutex_lock(&cache_chain_mutex); | ||
| 1555 | list_for_each_entry(cachep, &cache_chain, next) | ||
| 1556 | if (enable_cpucache(cachep, GFP_NOWAIT)) | ||
| 1557 | BUG(); | ||
| 1558 | mutex_unlock(&cache_chain_mutex); | ||
| 1629 | 1559 | ||
| 1630 | /* Done! */ | 1560 | /* Done! */ |
| 1631 | g_cpucache_up = FULL; | 1561 | g_cpucache_up = FULL; |
| 1632 | 1562 | ||
| 1563 | /* Annotate slab for lockdep -- annotate the malloc caches */ | ||
| 1564 | init_lock_keys(); | ||
| 1565 | |||
| 1633 | /* | 1566 | /* |
| 1634 | * Register a cpu startup notifier callback that initializes | 1567 | * Register a cpu startup notifier callback that initializes |
| 1635 | * cpu_cache_get for all new cpus | 1568 | * cpu_cache_get for all new cpus |
| @@ -1680,7 +1613,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1680 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1613 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
| 1681 | flags |= __GFP_RECLAIMABLE; | 1614 | flags |= __GFP_RECLAIMABLE; |
| 1682 | 1615 | ||
| 1683 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1616 | page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); |
| 1684 | if (!page) | 1617 | if (!page) |
| 1685 | return NULL; | 1618 | return NULL; |
| 1686 | 1619 | ||
| @@ -1693,6 +1626,16 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1693 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1626 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
| 1694 | for (i = 0; i < nr_pages; i++) | 1627 | for (i = 0; i < nr_pages; i++) |
| 1695 | __SetPageSlab(page + i); | 1628 | __SetPageSlab(page + i); |
| 1629 | |||
| 1630 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | ||
| 1631 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | ||
| 1632 | |||
| 1633 | if (cachep->ctor) | ||
| 1634 | kmemcheck_mark_uninitialized_pages(page, nr_pages); | ||
| 1635 | else | ||
| 1636 | kmemcheck_mark_unallocated_pages(page, nr_pages); | ||
| 1637 | } | ||
| 1638 | |||
| 1696 | return page_address(page); | 1639 | return page_address(page); |
| 1697 | } | 1640 | } |
| 1698 | 1641 | ||
| @@ -1705,6 +1648,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
| 1705 | struct page *page = virt_to_page(addr); | 1648 | struct page *page = virt_to_page(addr); |
| 1706 | const unsigned long nr_freed = i; | 1649 | const unsigned long nr_freed = i; |
| 1707 | 1650 | ||
| 1651 | kmemcheck_free_shadow(page, cachep->gfporder); | ||
| 1652 | |||
| 1708 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1653 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
| 1709 | sub_zone_page_state(page_zone(page), | 1654 | sub_zone_page_state(page_zone(page), |
| 1710 | NR_SLAB_RECLAIMABLE, nr_freed); | 1655 | NR_SLAB_RECLAIMABLE, nr_freed); |
| @@ -2055,10 +2000,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
| 2055 | return left_over; | 2000 | return left_over; |
| 2056 | } | 2001 | } |
| 2057 | 2002 | ||
| 2058 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | 2003 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
| 2059 | { | 2004 | { |
| 2060 | if (g_cpucache_up == FULL) | 2005 | if (g_cpucache_up == FULL) |
| 2061 | return enable_cpucache(cachep); | 2006 | return enable_cpucache(cachep, gfp); |
| 2062 | 2007 | ||
| 2063 | if (g_cpucache_up == NONE) { | 2008 | if (g_cpucache_up == NONE) { |
| 2064 | /* | 2009 | /* |
| @@ -2080,7 +2025,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
| 2080 | g_cpucache_up = PARTIAL_AC; | 2025 | g_cpucache_up = PARTIAL_AC; |
| 2081 | } else { | 2026 | } else { |
| 2082 | cachep->array[smp_processor_id()] = | 2027 | cachep->array[smp_processor_id()] = |
| 2083 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 2028 | kmalloc(sizeof(struct arraycache_init), gfp); |
| 2084 | 2029 | ||
| 2085 | if (g_cpucache_up == PARTIAL_AC) { | 2030 | if (g_cpucache_up == PARTIAL_AC) { |
| 2086 | set_up_list3s(cachep, SIZE_L3); | 2031 | set_up_list3s(cachep, SIZE_L3); |
| @@ -2090,7 +2035,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
| 2090 | for_each_online_node(node) { | 2035 | for_each_online_node(node) { |
| 2091 | cachep->nodelists[node] = | 2036 | cachep->nodelists[node] = |
| 2092 | kmalloc_node(sizeof(struct kmem_list3), | 2037 | kmalloc_node(sizeof(struct kmem_list3), |
| 2093 | GFP_KERNEL, node); | 2038 | gfp, node); |
| 2094 | BUG_ON(!cachep->nodelists[node]); | 2039 | BUG_ON(!cachep->nodelists[node]); |
| 2095 | kmem_list3_init(cachep->nodelists[node]); | 2040 | kmem_list3_init(cachep->nodelists[node]); |
| 2096 | } | 2041 | } |
| @@ -2144,6 +2089,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2144 | { | 2089 | { |
| 2145 | size_t left_over, slab_size, ralign; | 2090 | size_t left_over, slab_size, ralign; |
| 2146 | struct kmem_cache *cachep = NULL, *pc; | 2091 | struct kmem_cache *cachep = NULL, *pc; |
| 2092 | gfp_t gfp; | ||
| 2147 | 2093 | ||
| 2148 | /* | 2094 | /* |
| 2149 | * Sanity checks... these are all serious usage bugs. | 2095 | * Sanity checks... these are all serious usage bugs. |
| @@ -2159,8 +2105,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2159 | * We use cache_chain_mutex to ensure a consistent view of | 2105 | * We use cache_chain_mutex to ensure a consistent view of |
| 2160 | * cpu_online_mask as well. Please see cpuup_callback | 2106 | * cpu_online_mask as well. Please see cpuup_callback |
| 2161 | */ | 2107 | */ |
| 2162 | get_online_cpus(); | 2108 | if (slab_is_available()) { |
| 2163 | mutex_lock(&cache_chain_mutex); | 2109 | get_online_cpus(); |
| 2110 | mutex_lock(&cache_chain_mutex); | ||
| 2111 | } | ||
| 2164 | 2112 | ||
| 2165 | list_for_each_entry(pc, &cache_chain, next) { | 2113 | list_for_each_entry(pc, &cache_chain, next) { |
| 2166 | char tmp; | 2114 | char tmp; |
| @@ -2269,8 +2217,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2269 | */ | 2217 | */ |
| 2270 | align = ralign; | 2218 | align = ralign; |
| 2271 | 2219 | ||
| 2220 | if (slab_is_available()) | ||
| 2221 | gfp = GFP_KERNEL; | ||
| 2222 | else | ||
| 2223 | gfp = GFP_NOWAIT; | ||
| 2224 | |||
| 2272 | /* Get cache's description obj. */ | 2225 | /* Get cache's description obj. */ |
| 2273 | cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL); | 2226 | cachep = kmem_cache_zalloc(&cache_cache, gfp); |
| 2274 | if (!cachep) | 2227 | if (!cachep) |
| 2275 | goto oops; | 2228 | goto oops; |
| 2276 | 2229 | ||
| @@ -2344,6 +2297,15 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2344 | /* really off slab. No need for manual alignment */ | 2297 | /* really off slab. No need for manual alignment */ |
| 2345 | slab_size = | 2298 | slab_size = |
| 2346 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); | 2299 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); |
| 2300 | |||
| 2301 | #ifdef CONFIG_PAGE_POISONING | ||
| 2302 | /* If we're going to use the generic kernel_map_pages() | ||
| 2303 | * poisoning, then it's going to smash the contents of | ||
| 2304 | * the redzone and userword anyhow, so switch them off. | ||
| 2305 | */ | ||
| 2306 | if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) | ||
| 2307 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | ||
| 2308 | #endif | ||
| 2347 | } | 2309 | } |
| 2348 | 2310 | ||
| 2349 | cachep->colour_off = cache_line_size(); | 2311 | cachep->colour_off = cache_line_size(); |
| @@ -2373,7 +2335,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
| 2373 | cachep->ctor = ctor; | 2335 | cachep->ctor = ctor; |
| 2374 | cachep->name = name; | 2336 | cachep->name = name; |
| 2375 | 2337 | ||
| 2376 | if (setup_cpu_cache(cachep)) { | 2338 | if (setup_cpu_cache(cachep, gfp)) { |
| 2377 | __kmem_cache_destroy(cachep); | 2339 | __kmem_cache_destroy(cachep); |
| 2378 | cachep = NULL; | 2340 | cachep = NULL; |
| 2379 | goto oops; | 2341 | goto oops; |
| @@ -2385,8 +2347,10 @@ oops: | |||
| 2385 | if (!cachep && (flags & SLAB_PANIC)) | 2347 | if (!cachep && (flags & SLAB_PANIC)) |
| 2386 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 2348 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
| 2387 | name); | 2349 | name); |
| 2388 | mutex_unlock(&cache_chain_mutex); | 2350 | if (slab_is_available()) { |
| 2389 | put_online_cpus(); | 2351 | mutex_unlock(&cache_chain_mutex); |
| 2352 | put_online_cpus(); | ||
| 2353 | } | ||
| 2390 | return cachep; | 2354 | return cachep; |
| 2391 | } | 2355 | } |
| 2392 | EXPORT_SYMBOL(kmem_cache_create); | 2356 | EXPORT_SYMBOL(kmem_cache_create); |
| @@ -2583,7 +2547,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
| 2583 | } | 2547 | } |
| 2584 | 2548 | ||
| 2585 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) | 2549 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) |
| 2586 | synchronize_rcu(); | 2550 | rcu_barrier(); |
| 2587 | 2551 | ||
| 2588 | __kmem_cache_destroy(cachep); | 2552 | __kmem_cache_destroy(cachep); |
| 2589 | mutex_unlock(&cache_chain_mutex); | 2553 | mutex_unlock(&cache_chain_mutex); |
| @@ -2612,6 +2576,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, | |||
| 2612 | /* Slab management obj is off-slab. */ | 2576 | /* Slab management obj is off-slab. */ |
| 2613 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, | 2577 | slabp = kmem_cache_alloc_node(cachep->slabp_cache, |
| 2614 | local_flags, nodeid); | 2578 | local_flags, nodeid); |
| 2579 | /* | ||
| 2580 | * If the first object in the slab is leaked (it's allocated | ||
| 2581 | * but no one has a reference to it), we want to make sure | ||
| 2582 | * kmemleak does not treat the ->s_mem pointer as a reference | ||
| 2583 | * to the object. Otherwise we will not report the leak. | ||
| 2584 | */ | ||
| 2585 | kmemleak_scan_area(slabp, offsetof(struct slab, list), | ||
| 2586 | sizeof(struct list_head), local_flags); | ||
| 2615 | if (!slabp) | 2587 | if (!slabp) |
| 2616 | return NULL; | 2588 | return NULL; |
| 2617 | } else { | 2589 | } else { |
| @@ -3132,6 +3104,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
| 3132 | STATS_INC_ALLOCMISS(cachep); | 3104 | STATS_INC_ALLOCMISS(cachep); |
| 3133 | objp = cache_alloc_refill(cachep, flags); | 3105 | objp = cache_alloc_refill(cachep, flags); |
| 3134 | } | 3106 | } |
| 3107 | /* | ||
| 3108 | * To avoid a false negative, if an object that is in one of the | ||
| 3109 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | ||
| 3110 | * treat the array pointers as a reference to the object. | ||
| 3111 | */ | ||
| 3112 | kmemleak_erase(&ac->entry[ac->avail]); | ||
| 3135 | return objp; | 3113 | return objp; |
| 3136 | } | 3114 | } |
| 3137 | 3115 | ||
| @@ -3210,7 +3188,7 @@ retry: | |||
| 3210 | if (local_flags & __GFP_WAIT) | 3188 | if (local_flags & __GFP_WAIT) |
| 3211 | local_irq_enable(); | 3189 | local_irq_enable(); |
| 3212 | kmem_flagcheck(cache, flags); | 3190 | kmem_flagcheck(cache, flags); |
| 3213 | obj = kmem_getpages(cache, local_flags, -1); | 3191 | obj = kmem_getpages(cache, local_flags, numa_node_id()); |
| 3214 | if (local_flags & __GFP_WAIT) | 3192 | if (local_flags & __GFP_WAIT) |
| 3215 | local_irq_disable(); | 3193 | local_irq_disable(); |
| 3216 | if (obj) { | 3194 | if (obj) { |
| @@ -3318,6 +3296,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
| 3318 | unsigned long save_flags; | 3296 | unsigned long save_flags; |
| 3319 | void *ptr; | 3297 | void *ptr; |
| 3320 | 3298 | ||
| 3299 | flags &= gfp_allowed_mask; | ||
| 3300 | |||
| 3321 | lockdep_trace_alloc(flags); | 3301 | lockdep_trace_alloc(flags); |
| 3322 | 3302 | ||
| 3323 | if (slab_should_failslab(cachep, flags)) | 3303 | if (slab_should_failslab(cachep, flags)) |
| @@ -3351,6 +3331,11 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
| 3351 | out: | 3331 | out: |
| 3352 | local_irq_restore(save_flags); | 3332 | local_irq_restore(save_flags); |
| 3353 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3333 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
| 3334 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | ||
| 3335 | flags); | ||
| 3336 | |||
| 3337 | if (likely(ptr)) | ||
| 3338 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | ||
| 3354 | 3339 | ||
| 3355 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3340 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
| 3356 | memset(ptr, 0, obj_size(cachep)); | 3341 | memset(ptr, 0, obj_size(cachep)); |
| @@ -3396,6 +3381,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
| 3396 | unsigned long save_flags; | 3381 | unsigned long save_flags; |
| 3397 | void *objp; | 3382 | void *objp; |
| 3398 | 3383 | ||
| 3384 | flags &= gfp_allowed_mask; | ||
| 3385 | |||
| 3399 | lockdep_trace_alloc(flags); | 3386 | lockdep_trace_alloc(flags); |
| 3400 | 3387 | ||
| 3401 | if (slab_should_failslab(cachep, flags)) | 3388 | if (slab_should_failslab(cachep, flags)) |
| @@ -3406,8 +3393,13 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
| 3406 | objp = __do_cache_alloc(cachep, flags); | 3393 | objp = __do_cache_alloc(cachep, flags); |
| 3407 | local_irq_restore(save_flags); | 3394 | local_irq_restore(save_flags); |
| 3408 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3395 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
| 3396 | kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, | ||
| 3397 | flags); | ||
| 3409 | prefetchw(objp); | 3398 | prefetchw(objp); |
| 3410 | 3399 | ||
| 3400 | if (likely(objp)) | ||
| 3401 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | ||
| 3402 | |||
| 3411 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3403 | if (unlikely((flags & __GFP_ZERO) && objp)) |
| 3412 | memset(objp, 0, obj_size(cachep)); | 3404 | memset(objp, 0, obj_size(cachep)); |
| 3413 | 3405 | ||
| @@ -3521,8 +3513,11 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
| 3521 | struct array_cache *ac = cpu_cache_get(cachep); | 3513 | struct array_cache *ac = cpu_cache_get(cachep); |
| 3522 | 3514 | ||
| 3523 | check_irq_off(); | 3515 | check_irq_off(); |
| 3516 | kmemleak_free_recursive(objp, cachep->flags); | ||
| 3524 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3517 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
| 3525 | 3518 | ||
| 3519 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | ||
| 3520 | |||
| 3526 | /* | 3521 | /* |
| 3527 | * Skip calling cache_free_alien() when the platform is not numa. | 3522 | * Skip calling cache_free_alien() when the platform is not numa. |
| 3528 | * This will avoid cache misses that happen while accessing slabp (which | 3523 | * This will avoid cache misses that happen while accessing slabp (which |
| @@ -3530,7 +3525,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
| 3530 | * variable to skip the call, which is most likely to be present in | 3525 | * variable to skip the call, which is most likely to be present in |
| 3531 | * the cache. | 3526 | * the cache. |
| 3532 | */ | 3527 | */ |
| 3533 | if (numa_platform && cache_free_alien(cachep, objp)) | 3528 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
| 3534 | return; | 3529 | return; |
| 3535 | 3530 | ||
| 3536 | if (likely(ac->avail < ac->limit)) { | 3531 | if (likely(ac->avail < ac->limit)) { |
| @@ -3554,10 +3549,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
| 3554 | */ | 3549 | */ |
| 3555 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3550 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
| 3556 | { | 3551 | { |
| 3557 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3552 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); |
| 3553 | |||
| 3554 | trace_kmem_cache_alloc(_RET_IP_, ret, | ||
| 3555 | obj_size(cachep), cachep->buffer_size, flags); | ||
| 3556 | |||
| 3557 | return ret; | ||
| 3558 | } | 3558 | } |
| 3559 | EXPORT_SYMBOL(kmem_cache_alloc); | 3559 | EXPORT_SYMBOL(kmem_cache_alloc); |
| 3560 | 3560 | ||
| 3561 | #ifdef CONFIG_KMEMTRACE | ||
| 3562 | void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) | ||
| 3563 | { | ||
| 3564 | return __cache_alloc(cachep, flags, __builtin_return_address(0)); | ||
| 3565 | } | ||
| 3566 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | ||
| 3567 | #endif | ||
| 3568 | |||
| 3561 | /** | 3569 | /** |
| 3562 | * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. | 3570 | * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. |
| 3563 | * @cachep: the cache we're checking against | 3571 | * @cachep: the cache we're checking against |
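The kmem_cache_alloc() change above (and the kmem_cache_alloc_node()/kmalloc changes that follow) all use one pattern: the exported entry point captures the return value, emits a tracepoint, and returns it, while a _notrace twin is added so the kmalloc wrappers, which emit their own trace event with the requested size, do not report each allocation twice. A hedged userspace sketch of that wrapper/notrace split, with printf standing in for the tracepoints and invented function names:

    #include <stdio.h>
    #include <stdlib.h>

    /* untraced backend, in the role of __cache_alloc() */
    static void *cache_alloc_notrace(size_t size)
    {
            return malloc(size);
    }

    /* public entry point: same allocation, plus a trace event */
    static void *cache_alloc(size_t size)
    {
            void *ret = cache_alloc_notrace(size);

            printf("trace: cache_alloc %zu bytes -> %p\n", size, ret);
            return ret;
    }

    /* a higher-level wrapper calls the notrace variant and emits its own,
     * more precise event, so the allocation is reported exactly once */
    static void *my_kmalloc(size_t size)
    {
            void *ret = cache_alloc_notrace(size);

            printf("trace: kmalloc %zu bytes -> %p\n", size, ret);
            return ret;
    }

    int main(void)
    {
            free(cache_alloc(32));
            free(my_kmalloc(64));
            return 0;
    }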
| @@ -3602,23 +3610,46 @@ out: | |||
| 3602 | #ifdef CONFIG_NUMA | 3610 | #ifdef CONFIG_NUMA |
| 3603 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 3611 | void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
| 3604 | { | 3612 | { |
| 3605 | return __cache_alloc_node(cachep, flags, nodeid, | 3613 | void *ret = __cache_alloc_node(cachep, flags, nodeid, |
| 3606 | __builtin_return_address(0)); | 3614 | __builtin_return_address(0)); |
| 3615 | |||
| 3616 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | ||
| 3617 | obj_size(cachep), cachep->buffer_size, | ||
| 3618 | flags, nodeid); | ||
| 3619 | |||
| 3620 | return ret; | ||
| 3607 | } | 3621 | } |
| 3608 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 3622 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
| 3609 | 3623 | ||
| 3624 | #ifdef CONFIG_KMEMTRACE | ||
| 3625 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, | ||
| 3626 | gfp_t flags, | ||
| 3627 | int nodeid) | ||
| 3628 | { | ||
| 3629 | return __cache_alloc_node(cachep, flags, nodeid, | ||
| 3630 | __builtin_return_address(0)); | ||
| 3631 | } | ||
| 3632 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | ||
| 3633 | #endif | ||
| 3634 | |||
| 3610 | static __always_inline void * | 3635 | static __always_inline void * |
| 3611 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) | 3636 | __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) |
| 3612 | { | 3637 | { |
| 3613 | struct kmem_cache *cachep; | 3638 | struct kmem_cache *cachep; |
| 3639 | void *ret; | ||
| 3614 | 3640 | ||
| 3615 | cachep = kmem_find_general_cachep(size, flags); | 3641 | cachep = kmem_find_general_cachep(size, flags); |
| 3616 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3642 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
| 3617 | return cachep; | 3643 | return cachep; |
| 3618 | return kmem_cache_alloc_node(cachep, flags, node); | 3644 | ret = kmem_cache_alloc_node_notrace(cachep, flags, node); |
| 3645 | |||
| 3646 | trace_kmalloc_node((unsigned long) caller, ret, | ||
| 3647 | size, cachep->buffer_size, flags, node); | ||
| 3648 | |||
| 3649 | return ret; | ||
| 3619 | } | 3650 | } |
| 3620 | 3651 | ||
| 3621 | #ifdef CONFIG_DEBUG_SLAB | 3652 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) |
| 3622 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3653 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
| 3623 | { | 3654 | { |
| 3624 | return __do_kmalloc_node(size, flags, node, | 3655 | return __do_kmalloc_node(size, flags, node, |
| @@ -3651,6 +3682,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
| 3651 | void *caller) | 3682 | void *caller) |
| 3652 | { | 3683 | { |
| 3653 | struct kmem_cache *cachep; | 3684 | struct kmem_cache *cachep; |
| 3685 | void *ret; | ||
| 3654 | 3686 | ||
| 3655 | /* If you want to save a few bytes .text space: replace | 3687 | /* If you want to save a few bytes .text space: replace |
| 3656 | * __ with kmem_. | 3688 | * __ with kmem_. |
| @@ -3660,11 +3692,16 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
| 3660 | cachep = __find_general_cachep(size, flags); | 3692 | cachep = __find_general_cachep(size, flags); |
| 3661 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) | 3693 | if (unlikely(ZERO_OR_NULL_PTR(cachep))) |
| 3662 | return cachep; | 3694 | return cachep; |
| 3663 | return __cache_alloc(cachep, flags, caller); | 3695 | ret = __cache_alloc(cachep, flags, caller); |
| 3696 | |||
| 3697 | trace_kmalloc((unsigned long) caller, ret, | ||
| 3698 | size, cachep->buffer_size, flags); | ||
| 3699 | |||
| 3700 | return ret; | ||
| 3664 | } | 3701 | } |
| 3665 | 3702 | ||
| 3666 | 3703 | ||
| 3667 | #ifdef CONFIG_DEBUG_SLAB | 3704 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) |
| 3668 | void *__kmalloc(size_t size, gfp_t flags) | 3705 | void *__kmalloc(size_t size, gfp_t flags) |
| 3669 | { | 3706 | { |
| 3670 | return __do_kmalloc(size, flags, __builtin_return_address(0)); | 3707 | return __do_kmalloc(size, flags, __builtin_return_address(0)); |
| @@ -3703,6 +3740,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
| 3703 | debug_check_no_obj_freed(objp, obj_size(cachep)); | 3740 | debug_check_no_obj_freed(objp, obj_size(cachep)); |
| 3704 | __cache_free(cachep, objp); | 3741 | __cache_free(cachep, objp); |
| 3705 | local_irq_restore(flags); | 3742 | local_irq_restore(flags); |
| 3743 | |||
| 3744 | trace_kmem_cache_free(_RET_IP_, objp); | ||
| 3706 | } | 3745 | } |
| 3707 | EXPORT_SYMBOL(kmem_cache_free); | 3746 | EXPORT_SYMBOL(kmem_cache_free); |
| 3708 | 3747 | ||
| @@ -3720,6 +3759,8 @@ void kfree(const void *objp) | |||
| 3720 | struct kmem_cache *c; | 3759 | struct kmem_cache *c; |
| 3721 | unsigned long flags; | 3760 | unsigned long flags; |
| 3722 | 3761 | ||
| 3762 | trace_kfree(_RET_IP_, objp); | ||
| 3763 | |||
| 3723 | if (unlikely(ZERO_OR_NULL_PTR(objp))) | 3764 | if (unlikely(ZERO_OR_NULL_PTR(objp))) |
| 3724 | return; | 3765 | return; |
| 3725 | local_irq_save(flags); | 3766 | local_irq_save(flags); |
| @@ -3747,7 +3788,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name); | |||
| 3747 | /* | 3788 | /* |
| 3748 | * This initializes kmem_list3 or resizes various caches for all nodes. | 3789 | * This initializes kmem_list3 or resizes various caches for all nodes. |
| 3749 | */ | 3790 | */ |
| 3750 | static int alloc_kmemlist(struct kmem_cache *cachep) | 3791 | static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) |
| 3751 | { | 3792 | { |
| 3752 | int node; | 3793 | int node; |
| 3753 | struct kmem_list3 *l3; | 3794 | struct kmem_list3 *l3; |
| @@ -3757,7 +3798,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
| 3757 | for_each_online_node(node) { | 3798 | for_each_online_node(node) { |
| 3758 | 3799 | ||
| 3759 | if (use_alien_caches) { | 3800 | if (use_alien_caches) { |
| 3760 | new_alien = alloc_alien_cache(node, cachep->limit); | 3801 | new_alien = alloc_alien_cache(node, cachep->limit, gfp); |
| 3761 | if (!new_alien) | 3802 | if (!new_alien) |
| 3762 | goto fail; | 3803 | goto fail; |
| 3763 | } | 3804 | } |
| @@ -3766,7 +3807,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
| 3766 | if (cachep->shared) { | 3807 | if (cachep->shared) { |
| 3767 | new_shared = alloc_arraycache(node, | 3808 | new_shared = alloc_arraycache(node, |
| 3768 | cachep->shared*cachep->batchcount, | 3809 | cachep->shared*cachep->batchcount, |
| 3769 | 0xbaadf00d); | 3810 | 0xbaadf00d, gfp); |
| 3770 | if (!new_shared) { | 3811 | if (!new_shared) { |
| 3771 | free_alien_cache(new_alien); | 3812 | free_alien_cache(new_alien); |
| 3772 | goto fail; | 3813 | goto fail; |
| @@ -3795,7 +3836,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
| 3795 | free_alien_cache(new_alien); | 3836 | free_alien_cache(new_alien); |
| 3796 | continue; | 3837 | continue; |
| 3797 | } | 3838 | } |
| 3798 | l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); | 3839 | l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); |
| 3799 | if (!l3) { | 3840 | if (!l3) { |
| 3800 | free_alien_cache(new_alien); | 3841 | free_alien_cache(new_alien); |
| 3801 | kfree(new_shared); | 3842 | kfree(new_shared); |
| @@ -3851,18 +3892,18 @@ static void do_ccupdate_local(void *info) | |||
| 3851 | 3892 | ||
| 3852 | /* Always called with the cache_chain_mutex held */ | 3893 | /* Always called with the cache_chain_mutex held */ |
| 3853 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3894 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
| 3854 | int batchcount, int shared) | 3895 | int batchcount, int shared, gfp_t gfp) |
| 3855 | { | 3896 | { |
| 3856 | struct ccupdate_struct *new; | 3897 | struct ccupdate_struct *new; |
| 3857 | int i; | 3898 | int i; |
| 3858 | 3899 | ||
| 3859 | new = kzalloc(sizeof(*new), GFP_KERNEL); | 3900 | new = kzalloc(sizeof(*new), gfp); |
| 3860 | if (!new) | 3901 | if (!new) |
| 3861 | return -ENOMEM; | 3902 | return -ENOMEM; |
| 3862 | 3903 | ||
| 3863 | for_each_online_cpu(i) { | 3904 | for_each_online_cpu(i) { |
| 3864 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, | 3905 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, |
| 3865 | batchcount); | 3906 | batchcount, gfp); |
| 3866 | if (!new->new[i]) { | 3907 | if (!new->new[i]) { |
| 3867 | for (i--; i >= 0; i--) | 3908 | for (i--; i >= 0; i--) |
| 3868 | kfree(new->new[i]); | 3909 | kfree(new->new[i]); |
| @@ -3889,11 +3930,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
| 3889 | kfree(ccold); | 3930 | kfree(ccold); |
| 3890 | } | 3931 | } |
| 3891 | kfree(new); | 3932 | kfree(new); |
| 3892 | return alloc_kmemlist(cachep); | 3933 | return alloc_kmemlist(cachep, gfp); |
| 3893 | } | 3934 | } |
| 3894 | 3935 | ||
| 3895 | /* Called with cache_chain_mutex held always */ | 3936 | /* Called with cache_chain_mutex held always */ |
| 3896 | static int enable_cpucache(struct kmem_cache *cachep) | 3937 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
| 3897 | { | 3938 | { |
| 3898 | int err; | 3939 | int err; |
| 3899 | int limit, shared; | 3940 | int limit, shared; |
| @@ -3939,7 +3980,7 @@ static int enable_cpucache(struct kmem_cache *cachep) | |||
| 3939 | if (limit > 32) | 3980 | if (limit > 32) |
| 3940 | limit = 32; | 3981 | limit = 32; |
| 3941 | #endif | 3982 | #endif |
| 3942 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); | 3983 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); |
| 3943 | if (err) | 3984 | if (err) |
| 3944 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3985 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
| 3945 | cachep->name, -err); | 3986 | cachep->name, -err); |
| @@ -3992,8 +4033,7 @@ static void cache_reap(struct work_struct *w) | |||
| 3992 | struct kmem_cache *searchp; | 4033 | struct kmem_cache *searchp; |
| 3993 | struct kmem_list3 *l3; | 4034 | struct kmem_list3 *l3; |
| 3994 | int node = numa_node_id(); | 4035 | int node = numa_node_id(); |
| 3995 | struct delayed_work *work = | 4036 | struct delayed_work *work = to_delayed_work(w); |
| 3996 | container_of(w, struct delayed_work, work); | ||
| 3997 | 4037 | ||
| 3998 | if (!mutex_trylock(&cache_chain_mutex)) | 4038 | if (!mutex_trylock(&cache_chain_mutex)) |
| 3999 | /* Give up. Setup the next iteration. */ | 4039 | /* Give up. Setup the next iteration. */ |
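Editor's note: to_delayed_work() replaces the open-coded container_of() above; both recover the enclosing structure from a pointer to an embedded member. A self-contained sketch of the container_of pattern, using invented, simplified structures rather than the kernel's:

        #include <stddef.h>
        #include <stdio.h>

        struct work { int pending; };
        struct delayed_work { struct work work; long timeout; };

        /* Subtract the member offset to get back to the enclosing object. */
        #define container_of(ptr, type, member) \
                ((type *)((char *)(ptr) - offsetof(type, member)))

        static struct delayed_work *to_delayed(struct work *w)
        {
                return container_of(w, struct delayed_work, work);
        }

        int main(void)
        {
                struct delayed_work dw = { .work = { .pending = 1 }, .timeout = 42 };
                struct work *w = &dw.work;

                printf("timeout=%ld\n", to_delayed(w)->timeout);        /* prints 42 */
                return 0;
        }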
| @@ -4246,7 +4286,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
| 4246 | res = 0; | 4286 | res = 0; |
| 4247 | } else { | 4287 | } else { |
| 4248 | res = do_tune_cpucache(cachep, limit, | 4288 | res = do_tune_cpucache(cachep, limit, |
| 4249 | batchcount, shared); | 4289 | batchcount, shared, |
| 4290 | GFP_KERNEL); | ||
| 4250 | } | 4291 | } |
| 4251 | break; | 4292 | break; |
| 4252 | } | 4293 | } |
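Editor's note: the slab.c hunks above thread a gfp_t parameter through do_tune_cpucache()/alloc_kmemlist() and their helpers instead of hard-coding GFP_KERNEL, so callers in early boot or atomic context can pass a non-sleeping variant. A userspace sketch of propagating an allocation context through helpers, all names invented:

        #include <stdio.h>
        #include <stdlib.h>

        enum alloc_ctx { CTX_MAY_SLEEP, CTX_ATOMIC };

        static void *ctx_alloc(size_t size, enum alloc_ctx ctx)
        {
                /* A real kernel would pick GFP_KERNEL vs GFP_NOWAIT here. */
                printf("allocating %zu bytes (%s)\n", size,
                       ctx == CTX_ATOMIC ? "atomic" : "may sleep");
                return malloc(size);
        }

        static int resize_cache(int limit, enum alloc_ctx ctx)
        {
                void *new = ctx_alloc(limit * 16, ctx); /* propagated, not hard-coded */

                if (!new)
                        return -1;
                free(new);
                return 0;
        }

        int main(void)
        {
                resize_cache(4, CTX_MAY_SLEEP); /* normal runtime tuning */
                resize_cache(4, CTX_ATOMIC);    /* early boot / IRQs off */
                return 0;
        }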
| @@ -46,7 +46,7 @@ | |||
| 46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real | 46 | * NUMA support in SLOB is fairly simplistic, pushing most of the real |
| 47 | * logic down to the page allocator, and simply doing the node accounting | 47 | * logic down to the page allocator, and simply doing the node accounting |
| 48 | * on the upper levels. In the event that a node id is explicitly | 48 | * on the upper levels. In the event that a node id is explicitly |
| 49 | * provided, alloc_pages_node() with the specified node id is used | 49 | * provided, alloc_pages_exact_node() with the specified node id is used |
| 50 | * instead. The common case (or when the node id isn't explicitly provided) | 50 | * instead. The common case (or when the node id isn't explicitly provided) |
| 51 | * will default to the current node, as per numa_node_id(). | 51 | * will default to the current node, as per numa_node_id(). |
| 52 | * | 52 | * |
| @@ -60,11 +60,14 @@ | |||
| 60 | #include <linux/kernel.h> | 60 | #include <linux/kernel.h> |
| 61 | #include <linux/slab.h> | 61 | #include <linux/slab.h> |
| 62 | #include <linux/mm.h> | 62 | #include <linux/mm.h> |
| 63 | #include <linux/swap.h> /* struct reclaim_state */ | ||
| 63 | #include <linux/cache.h> | 64 | #include <linux/cache.h> |
| 64 | #include <linux/init.h> | 65 | #include <linux/init.h> |
| 65 | #include <linux/module.h> | 66 | #include <linux/module.h> |
| 66 | #include <linux/rcupdate.h> | 67 | #include <linux/rcupdate.h> |
| 67 | #include <linux/list.h> | 68 | #include <linux/list.h> |
| 69 | #include <linux/kmemtrace.h> | ||
| 70 | #include <linux/kmemleak.h> | ||
| 68 | #include <asm/atomic.h> | 71 | #include <asm/atomic.h> |
| 69 | 72 | ||
| 70 | /* | 73 | /* |
| @@ -130,17 +133,17 @@ static LIST_HEAD(free_slob_large); | |||
| 130 | */ | 133 | */ |
| 131 | static inline int is_slob_page(struct slob_page *sp) | 134 | static inline int is_slob_page(struct slob_page *sp) |
| 132 | { | 135 | { |
| 133 | return PageSlobPage((struct page *)sp); | 136 | return PageSlab((struct page *)sp); |
| 134 | } | 137 | } |
| 135 | 138 | ||
| 136 | static inline void set_slob_page(struct slob_page *sp) | 139 | static inline void set_slob_page(struct slob_page *sp) |
| 137 | { | 140 | { |
| 138 | __SetPageSlobPage((struct page *)sp); | 141 | __SetPageSlab((struct page *)sp); |
| 139 | } | 142 | } |
| 140 | 143 | ||
| 141 | static inline void clear_slob_page(struct slob_page *sp) | 144 | static inline void clear_slob_page(struct slob_page *sp) |
| 142 | { | 145 | { |
| 143 | __ClearPageSlobPage((struct page *)sp); | 146 | __ClearPageSlab((struct page *)sp); |
| 144 | } | 147 | } |
| 145 | 148 | ||
| 146 | static inline struct slob_page *slob_page(const void *addr) | 149 | static inline struct slob_page *slob_page(const void *addr) |
| @@ -241,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
| 241 | 244 | ||
| 242 | #ifdef CONFIG_NUMA | 245 | #ifdef CONFIG_NUMA |
| 243 | if (node != -1) | 246 | if (node != -1) |
| 244 | page = alloc_pages_node(node, gfp, order); | 247 | page = alloc_pages_exact_node(node, gfp, order); |
| 245 | else | 248 | else |
| 246 | #endif | 249 | #endif |
| 247 | page = alloc_pages(gfp, order); | 250 | page = alloc_pages(gfp, order); |
| @@ -254,6 +257,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) | |||
| 254 | 257 | ||
| 255 | static void slob_free_pages(void *b, int order) | 258 | static void slob_free_pages(void *b, int order) |
| 256 | { | 259 | { |
| 260 | if (current->reclaim_state) | ||
| 261 | current->reclaim_state->reclaimed_slab += 1 << order; | ||
| 257 | free_pages((unsigned long)b, order); | 262 | free_pages((unsigned long)b, order); |
| 258 | } | 263 | } |
| 259 | 264 | ||
| @@ -406,7 +411,7 @@ static void slob_free(void *block, int size) | |||
| 406 | spin_unlock_irqrestore(&slob_lock, flags); | 411 | spin_unlock_irqrestore(&slob_lock, flags); |
| 407 | clear_slob_page(sp); | 412 | clear_slob_page(sp); |
| 408 | free_slob_page(sp); | 413 | free_slob_page(sp); |
| 409 | free_page((unsigned long)b); | 414 | slob_free_pages(b, 0); |
| 410 | return; | 415 | return; |
| 411 | } | 416 | } |
| 412 | 417 | ||
| @@ -474,6 +479,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
| 474 | { | 479 | { |
| 475 | unsigned int *m; | 480 | unsigned int *m; |
| 476 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 481 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
| 482 | void *ret; | ||
| 477 | 483 | ||
| 478 | lockdep_trace_alloc(gfp); | 484 | lockdep_trace_alloc(gfp); |
| 479 | 485 | ||
| @@ -482,12 +488,16 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
| 482 | return ZERO_SIZE_PTR; | 488 | return ZERO_SIZE_PTR; |
| 483 | 489 | ||
| 484 | m = slob_alloc(size + align, gfp, align, node); | 490 | m = slob_alloc(size + align, gfp, align, node); |
| 491 | |||
| 485 | if (!m) | 492 | if (!m) |
| 486 | return NULL; | 493 | return NULL; |
| 487 | *m = size; | 494 | *m = size; |
| 488 | return (void *)m + align; | 495 | ret = (void *)m + align; |
| 496 | |||
| 497 | trace_kmalloc_node(_RET_IP_, ret, | ||
| 498 | size, size + align, gfp, node); | ||
| 489 | } else { | 499 | } else { |
| 490 | void *ret; | 500 | unsigned int order = get_order(size); |
| 491 | 501 | ||
| 492 | ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); | 502 | ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); |
| 493 | if (ret) { | 503 | if (ret) { |
| @@ -495,8 +505,13 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) | |||
| 495 | page = virt_to_page(ret); | 505 | page = virt_to_page(ret); |
| 496 | page->private = size; | 506 | page->private = size; |
| 497 | } | 507 | } |
| 498 | return ret; | 508 | |
| 509 | trace_kmalloc_node(_RET_IP_, ret, | ||
| 510 | size, PAGE_SIZE << order, gfp, node); | ||
| 499 | } | 511 | } |
| 512 | |||
| 513 | kmemleak_alloc(ret, size, 1, gfp); | ||
| 514 | return ret; | ||
| 500 | } | 515 | } |
| 501 | EXPORT_SYMBOL(__kmalloc_node); | 516 | EXPORT_SYMBOL(__kmalloc_node); |
| 502 | 517 | ||
| @@ -504,8 +519,11 @@ void kfree(const void *block) | |||
| 504 | { | 519 | { |
| 505 | struct slob_page *sp; | 520 | struct slob_page *sp; |
| 506 | 521 | ||
| 522 | trace_kfree(_RET_IP_, block); | ||
| 523 | |||
| 507 | if (unlikely(ZERO_OR_NULL_PTR(block))) | 524 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
| 508 | return; | 525 | return; |
| 526 | kmemleak_free(block); | ||
| 509 | 527 | ||
| 510 | sp = slob_page(block); | 528 | sp = slob_page(block); |
| 511 | if (is_slob_page(sp)) { | 529 | if (is_slob_page(sp)) { |
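Editor's note: SLOB's kfree() now calls kmemleak_free() before releasing the block, matching the kmemleak_alloc() calls on the allocation side. A toy sketch of the register-on-alloc/unregister-on-free bookkeeping such a leak detector builds on; the fixed-size registry and every name below are invented for illustration:

        #include <stdio.h>
        #include <stdlib.h>

        #define MAX_LIVE 128

        static void *live[MAX_LIVE];    /* records of still-allocated objects */

        static void *tracked_alloc(size_t size)
        {
                void *p = malloc(size);

                for (int i = 0; p && i < MAX_LIVE; i++) {
                        if (!live[i]) {
                                live[i] = p;            /* register the object */
                                break;
                        }
                }
                return p;
        }

        static void tracked_free(void *p)
        {
                for (int i = 0; i < MAX_LIVE; i++) {
                        if (live[i] == p) {
                                live[i] = NULL;         /* unregister before freeing */
                                break;
                        }
                }
                free(p);
        }

        static void report_leaks(void)
        {
                for (int i = 0; i < MAX_LIVE; i++)
                        if (live[i])
                                printf("possible leak: %p\n", live[i]);
        }

        int main(void)
        {
                void *a = tracked_alloc(32);
                void *b = tracked_alloc(32);

                tracked_free(a);
                (void)b;                /* b is never freed: reported below */
                report_leaks();
                return 0;
        }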
| @@ -569,12 +587,16 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 569 | } else if (flags & SLAB_PANIC) | 587 | } else if (flags & SLAB_PANIC) |
| 570 | panic("Cannot create slab cache %s\n", name); | 588 | panic("Cannot create slab cache %s\n", name); |
| 571 | 589 | ||
| 590 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | ||
| 572 | return c; | 591 | return c; |
| 573 | } | 592 | } |
| 574 | EXPORT_SYMBOL(kmem_cache_create); | 593 | EXPORT_SYMBOL(kmem_cache_create); |
| 575 | 594 | ||
| 576 | void kmem_cache_destroy(struct kmem_cache *c) | 595 | void kmem_cache_destroy(struct kmem_cache *c) |
| 577 | { | 596 | { |
| 597 | kmemleak_free(c); | ||
| 598 | if (c->flags & SLAB_DESTROY_BY_RCU) | ||
| 599 | rcu_barrier(); | ||
| 578 | slob_free(c, sizeof(struct kmem_cache)); | 600 | slob_free(c, sizeof(struct kmem_cache)); |
| 579 | } | 601 | } |
| 580 | EXPORT_SYMBOL(kmem_cache_destroy); | 602 | EXPORT_SYMBOL(kmem_cache_destroy); |
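Editor's note: kmem_cache_destroy() gains an rcu_barrier() for SLAB_DESTROY_BY_RCU caches, so all deferred frees have completed before the cache itself disappears. A rough userspace analogue of "drain deferred work before tearing down" — no real RCU, single-threaded, and every name invented:

        #include <stdio.h>
        #include <stdlib.h>

        struct deferred { void *ptr; struct deferred *next; };
        static struct deferred *pending;        /* stands in for queued RCU callbacks */

        static void defer_free(void *p)
        {
                struct deferred *d = malloc(sizeof(*d));

                d->ptr = p;
                d->next = pending;
                pending = d;
        }

        static void drain_deferred(void)        /* the rcu_barrier() analogue */
        {
                while (pending) {
                        struct deferred *d = pending;

                        pending = d->next;
                        free(d->ptr);
                        free(d);
                }
        }

        struct cache { const char *name; };

        static void cache_destroy(struct cache *c)
        {
                drain_deferred();       /* pending frees may still reference the cache */
                printf("destroying %s\n", c->name);
                free(c);
        }

        int main(void)
        {
                struct cache *c = malloc(sizeof(*c));

                c->name = "demo";
                defer_free(malloc(32));
                defer_free(malloc(32));
                cache_destroy(c);
                return 0;
        }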
| @@ -583,14 +605,22 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
| 583 | { | 605 | { |
| 584 | void *b; | 606 | void *b; |
| 585 | 607 | ||
| 586 | if (c->size < PAGE_SIZE) | 608 | if (c->size < PAGE_SIZE) { |
| 587 | b = slob_alloc(c->size, flags, c->align, node); | 609 | b = slob_alloc(c->size, flags, c->align, node); |
| 588 | else | 610 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, |
| 611 | SLOB_UNITS(c->size) * SLOB_UNIT, | ||
| 612 | flags, node); | ||
| 613 | } else { | ||
| 589 | b = slob_new_pages(flags, get_order(c->size), node); | 614 | b = slob_new_pages(flags, get_order(c->size), node); |
| 615 | trace_kmem_cache_alloc_node(_RET_IP_, b, c->size, | ||
| 616 | PAGE_SIZE << get_order(c->size), | ||
| 617 | flags, node); | ||
| 618 | } | ||
| 590 | 619 | ||
| 591 | if (c->ctor) | 620 | if (c->ctor) |
| 592 | c->ctor(b); | 621 | c->ctor(b); |
| 593 | 622 | ||
| 623 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); | ||
| 594 | return b; | 624 | return b; |
| 595 | } | 625 | } |
| 596 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 626 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
| @@ -613,6 +643,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
| 613 | 643 | ||
| 614 | void kmem_cache_free(struct kmem_cache *c, void *b) | 644 | void kmem_cache_free(struct kmem_cache *c, void *b) |
| 615 | { | 645 | { |
| 646 | kmemleak_free_recursive(b, c->flags); | ||
| 616 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { | 647 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { |
| 617 | struct slob_rcu *slob_rcu; | 648 | struct slob_rcu *slob_rcu; |
| 618 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); | 649 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); |
| @@ -622,6 +653,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b) | |||
| 622 | } else { | 653 | } else { |
| 623 | __kmem_cache_free(b, c->size); | 654 | __kmem_cache_free(b, c->size); |
| 624 | } | 655 | } |
| 656 | |||
| 657 | trace_kmem_cache_free(_RET_IP_, b); | ||
| 625 | } | 658 | } |
| 626 | EXPORT_SYMBOL(kmem_cache_free); | 659 | EXPORT_SYMBOL(kmem_cache_free); |
| 627 | 660 | ||
| @@ -659,3 +692,8 @@ void __init kmem_cache_init(void) | |||
| 659 | { | 692 | { |
| 660 | slob_ready = 1; | 693 | slob_ready = 1; |
| 661 | } | 694 | } |
| 695 | |||
| 696 | void __init kmem_cache_init_late(void) | ||
| 697 | { | ||
| 698 | /* Nothing to do */ | ||
| 699 | } | ||
| @@ -9,6 +9,7 @@ | |||
| 9 | */ | 9 | */ |
| 10 | 10 | ||
| 11 | #include <linux/mm.h> | 11 | #include <linux/mm.h> |
| 12 | #include <linux/swap.h> /* struct reclaim_state */ | ||
| 12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 13 | #include <linux/bit_spinlock.h> | 14 | #include <linux/bit_spinlock.h> |
| 14 | #include <linux/interrupt.h> | 15 | #include <linux/interrupt.h> |
| @@ -16,6 +17,8 @@ | |||
| 16 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
| 17 | #include <linux/proc_fs.h> | 18 | #include <linux/proc_fs.h> |
| 18 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
| 20 | #include <linux/kmemtrace.h> | ||
| 21 | #include <linux/kmemcheck.h> | ||
| 19 | #include <linux/cpu.h> | 22 | #include <linux/cpu.h> |
| 20 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
| 21 | #include <linux/mempolicy.h> | 24 | #include <linux/mempolicy.h> |
| @@ -138,13 +141,20 @@ | |||
| 138 | SLAB_POISON | SLAB_STORE_USER) | 141 | SLAB_POISON | SLAB_STORE_USER) |
| 139 | 142 | ||
| 140 | /* | 143 | /* |
| 144 | * Debugging flags that require metadata to be stored in the slab. These get | ||
| 145 | * disabled when slub_debug=O is used and a cache's min order increases with | ||
| 146 | * metadata. | ||
| 147 | */ | ||
| 148 | #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | ||
| 149 | |||
| 150 | /* | ||
| 141 | * Set of flags that will prevent slab merging | 151 | * Set of flags that will prevent slab merging |
| 142 | */ | 152 | */ |
| 143 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
| 144 | SLAB_TRACE | SLAB_DESTROY_BY_RCU) | 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) |
| 145 | 155 | ||
| 146 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 156 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
| 147 | SLAB_CACHE_DMA) | 157 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
| 148 | 158 | ||
| 149 | #ifndef ARCH_KMALLOC_MINALIGN | 159 | #ifndef ARCH_KMALLOC_MINALIGN |
| 150 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | 160 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
| @@ -322,6 +332,7 @@ static int slub_debug; | |||
| 322 | #endif | 332 | #endif |
| 323 | 333 | ||
| 324 | static char *slub_debug_slabs; | 334 | static char *slub_debug_slabs; |
| 335 | static int disable_higher_order_debug; | ||
| 325 | 336 | ||
| 326 | /* | 337 | /* |
| 327 | * Object debugging | 338 | * Object debugging |
| @@ -643,7 +654,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
| 643 | slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); | 654 | slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); |
| 644 | print_section("Padding", end - remainder, remainder); | 655 | print_section("Padding", end - remainder, remainder); |
| 645 | 656 | ||
| 646 | restore_bytes(s, "slab padding", POISON_INUSE, start, end); | 657 | restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); |
| 647 | return 0; | 658 | return 0; |
| 648 | } | 659 | } |
| 649 | 660 | ||
| @@ -830,6 +841,11 @@ static inline unsigned long slabs_node(struct kmem_cache *s, int node) | |||
| 830 | return atomic_long_read(&n->nr_slabs); | 841 | return atomic_long_read(&n->nr_slabs); |
| 831 | } | 842 | } |
| 832 | 843 | ||
| 844 | static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) | ||
| 845 | { | ||
| 846 | return atomic_long_read(&n->nr_slabs); | ||
| 847 | } | ||
| 848 | |||
| 833 | static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) | 849 | static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) |
| 834 | { | 850 | { |
| 835 | struct kmem_cache_node *n = get_node(s, node); | 851 | struct kmem_cache_node *n = get_node(s, node); |
| @@ -968,6 +984,15 @@ static int __init setup_slub_debug(char *str) | |||
| 968 | */ | 984 | */ |
| 969 | goto check_slabs; | 985 | goto check_slabs; |
| 970 | 986 | ||
| 987 | if (tolower(*str) == 'o') { | ||
| 988 | /* | ||
| 989 | * Avoid enabling debugging on caches if its minimum order | ||
| 990 | * would increase as a result. | ||
| 991 | */ | ||
| 992 | disable_higher_order_debug = 1; | ||
| 993 | goto out; | ||
| 994 | } | ||
| 995 | |||
| 971 | slub_debug = 0; | 996 | slub_debug = 0; |
| 972 | if (*str == '-') | 997 | if (*str == '-') |
| 973 | /* | 998 | /* |
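Editor's note: setup_slub_debug() now accepts a leading 'O' that only flips a policy flag (disable_higher_order_debug) instead of selecting debug letters. A simplified stand-in parser showing the shape of that option handling; the flag bits and function names are invented, not the kernel's:

        #include <ctype.h>
        #include <stdio.h>

        static int disable_higher_order_debug;
        static unsigned int debug_flags;

        static void parse_debug(const char *str)
        {
                if (!str || !*str) {
                        debug_flags = ~0u;              /* bare option: enable everything */
                        return;
                }
                if (tolower((unsigned char)*str) == 'o') {
                        disable_higher_order_debug = 1; /* don't grow slab order for metadata */
                        return;
                }
                for (; *str; str++) {
                        switch (tolower((unsigned char)*str)) {
                        case 'z': debug_flags |= 0x1; break;    /* red zoning */
                        case 'p': debug_flags |= 0x2; break;    /* poisoning */
                        case 'u': debug_flags |= 0x4; break;    /* user tracking */
                        }
                }
        }

        int main(void)
        {
                parse_debug("O");
                printf("higher-order debug disabled: %d, flags: 0x%x\n",
                       disable_higher_order_debug, debug_flags);
                return 0;
        }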
| @@ -1018,8 +1043,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize, | |||
| 1018 | * Enable debugging if selected on the kernel commandline. | 1043 | * Enable debugging if selected on the kernel commandline. |
| 1019 | */ | 1044 | */ |
| 1020 | if (slub_debug && (!slub_debug_slabs || | 1045 | if (slub_debug && (!slub_debug_slabs || |
| 1021 | strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) | 1046 | !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) |
| 1022 | flags |= slub_debug; | 1047 | flags |= slub_debug; |
| 1023 | 1048 | ||
| 1024 | return flags; | 1049 | return flags; |
| 1025 | } | 1050 | } |
| @@ -1046,8 +1071,12 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, | |||
| 1046 | } | 1071 | } |
| 1047 | #define slub_debug 0 | 1072 | #define slub_debug 0 |
| 1048 | 1073 | ||
| 1074 | #define disable_higher_order_debug 0 | ||
| 1075 | |||
| 1049 | static inline unsigned long slabs_node(struct kmem_cache *s, int node) | 1076 | static inline unsigned long slabs_node(struct kmem_cache *s, int node) |
| 1050 | { return 0; } | 1077 | { return 0; } |
| 1078 | static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) | ||
| 1079 | { return 0; } | ||
| 1051 | static inline void inc_slabs_node(struct kmem_cache *s, int node, | 1080 | static inline void inc_slabs_node(struct kmem_cache *s, int node, |
| 1052 | int objects) {} | 1081 | int objects) {} |
| 1053 | static inline void dec_slabs_node(struct kmem_cache *s, int node, | 1082 | static inline void dec_slabs_node(struct kmem_cache *s, int node, |
| @@ -1062,6 +1091,8 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, | |||
| 1062 | { | 1091 | { |
| 1063 | int order = oo_order(oo); | 1092 | int order = oo_order(oo); |
| 1064 | 1093 | ||
| 1094 | flags |= __GFP_NOTRACK; | ||
| 1095 | |||
| 1065 | if (node == -1) | 1096 | if (node == -1) |
| 1066 | return alloc_pages(flags, order); | 1097 | return alloc_pages(flags, order); |
| 1067 | else | 1098 | else |
| @@ -1072,11 +1103,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1072 | { | 1103 | { |
| 1073 | struct page *page; | 1104 | struct page *page; |
| 1074 | struct kmem_cache_order_objects oo = s->oo; | 1105 | struct kmem_cache_order_objects oo = s->oo; |
| 1106 | gfp_t alloc_gfp; | ||
| 1075 | 1107 | ||
| 1076 | flags |= s->allocflags; | 1108 | flags |= s->allocflags; |
| 1077 | 1109 | ||
| 1078 | page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, | 1110 | /* |
| 1079 | oo); | 1111 | * Let the initial higher-order allocation fail under memory pressure |
| 1112 | * so we fall-back to the minimum order allocation. | ||
| 1113 | */ | ||
| 1114 | alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; | ||
| 1115 | |||
| 1116 | page = alloc_slab_page(alloc_gfp, node, oo); | ||
| 1080 | if (unlikely(!page)) { | 1117 | if (unlikely(!page)) { |
| 1081 | oo = s->min; | 1118 | oo = s->min; |
| 1082 | /* | 1119 | /* |
| @@ -1089,6 +1126,23 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1089 | 1126 | ||
| 1090 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1127 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); |
| 1091 | } | 1128 | } |
| 1129 | |||
| 1130 | if (kmemcheck_enabled | ||
| 1131 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | ||
| 1132 | int pages = 1 << oo_order(oo); | ||
| 1133 | |||
| 1134 | kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); | ||
| 1135 | |||
| 1136 | /* | ||
| 1137 | * Objects from caches that have a constructor don't get | ||
| 1138 | * cleared when they're allocated, so we need to do it here. | ||
| 1139 | */ | ||
| 1140 | if (s->ctor) | ||
| 1141 | kmemcheck_mark_uninitialized_pages(page, pages); | ||
| 1142 | else | ||
| 1143 | kmemcheck_mark_unallocated_pages(page, pages); | ||
| 1144 | } | ||
| 1145 | |||
| 1092 | page->objects = oo_objects(oo); | 1146 | page->objects = oo_objects(oo); |
| 1093 | mod_zone_page_state(page_zone(page), | 1147 | mod_zone_page_state(page_zone(page), |
| 1094 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1148 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
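Editor's note: allocate_slab() now strips __GFP_NOFAIL from the opportunistic high-order attempt and adds __GFP_NOWARN|__GFP_NORETRY, so the first try may fail quietly and the code falls back to the minimum order with the caller's original flags. A sketch of that try-then-fall-back flag juggling, with invented flag values and a fake allocator standing in for the page allocator:

        #include <stdio.h>
        #include <stdlib.h>

        #define F_NOWARN  0x1
        #define F_NORETRY 0x2
        #define F_NOFAIL  0x4

        static void *fake_alloc(size_t size, unsigned int flags)
        {
                /* Pretend large requests fail under memory pressure. */
                if (size > 4096)
                        return NULL;
                (void)flags;
                return malloc(size);
        }

        static void *alloc_with_fallback(size_t preferred, size_t minimum,
                                         unsigned int flags)
        {
                unsigned int first_try = (flags | F_NOWARN | F_NORETRY) & ~F_NOFAIL;
                void *p = fake_alloc(preferred, first_try);

                if (!p)
                        p = fake_alloc(minimum, flags); /* fallback keeps caller's flags */
                return p;
        }

        int main(void)
        {
                void *p = alloc_with_fallback(8192, 4096, F_NOFAIL);

                printf("got %p via fallback\n", p);
                free(p);
                return 0;
        }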
| @@ -1162,6 +1216,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
| 1162 | __ClearPageSlubDebug(page); | 1216 | __ClearPageSlubDebug(page); |
| 1163 | } | 1217 | } |
| 1164 | 1218 | ||
| 1219 | kmemcheck_free_shadow(page, compound_order(page)); | ||
| 1220 | |||
| 1165 | mod_zone_page_state(page_zone(page), | 1221 | mod_zone_page_state(page_zone(page), |
| 1166 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1222 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
| 1167 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1223 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
| @@ -1169,6 +1225,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
| 1169 | 1225 | ||
| 1170 | __ClearPageSlab(page); | 1226 | __ClearPageSlab(page); |
| 1171 | reset_page_mapcount(page); | 1227 | reset_page_mapcount(page); |
| 1228 | if (current->reclaim_state) | ||
| 1229 | current->reclaim_state->reclaimed_slab += pages; | ||
| 1172 | __free_pages(page, order); | 1230 | __free_pages(page, order); |
| 1173 | } | 1231 | } |
| 1174 | 1232 | ||
| @@ -1480,6 +1538,69 @@ static inline int node_match(struct kmem_cache_cpu *c, int node) | |||
| 1480 | return 1; | 1538 | return 1; |
| 1481 | } | 1539 | } |
| 1482 | 1540 | ||
| 1541 | static int count_free(struct page *page) | ||
| 1542 | { | ||
| 1543 | return page->objects - page->inuse; | ||
| 1544 | } | ||
| 1545 | |||
| 1546 | static unsigned long count_partial(struct kmem_cache_node *n, | ||
| 1547 | int (*get_count)(struct page *)) | ||
| 1548 | { | ||
| 1549 | unsigned long flags; | ||
| 1550 | unsigned long x = 0; | ||
| 1551 | struct page *page; | ||
| 1552 | |||
| 1553 | spin_lock_irqsave(&n->list_lock, flags); | ||
| 1554 | list_for_each_entry(page, &n->partial, lru) | ||
| 1555 | x += get_count(page); | ||
| 1556 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
| 1557 | return x; | ||
| 1558 | } | ||
| 1559 | |||
| 1560 | static inline unsigned long node_nr_objs(struct kmem_cache_node *n) | ||
| 1561 | { | ||
| 1562 | #ifdef CONFIG_SLUB_DEBUG | ||
| 1563 | return atomic_long_read(&n->total_objects); | ||
| 1564 | #else | ||
| 1565 | return 0; | ||
| 1566 | #endif | ||
| 1567 | } | ||
| 1568 | |||
| 1569 | static noinline void | ||
| 1570 | slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | ||
| 1571 | { | ||
| 1572 | int node; | ||
| 1573 | |||
| 1574 | printk(KERN_WARNING | ||
| 1575 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | ||
| 1576 | nid, gfpflags); | ||
| 1577 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | ||
| 1578 | "default order: %d, min order: %d\n", s->name, s->objsize, | ||
| 1579 | s->size, oo_order(s->oo), oo_order(s->min)); | ||
| 1580 | |||
| 1581 | if (oo_order(s->min) > get_order(s->objsize)) | ||
| 1582 | printk(KERN_WARNING " %s debugging increased min order, use " | ||
| 1583 | "slub_debug=O to disable.\n", s->name); | ||
| 1584 | |||
| 1585 | for_each_online_node(node) { | ||
| 1586 | struct kmem_cache_node *n = get_node(s, node); | ||
| 1587 | unsigned long nr_slabs; | ||
| 1588 | unsigned long nr_objs; | ||
| 1589 | unsigned long nr_free; | ||
| 1590 | |||
| 1591 | if (!n) | ||
| 1592 | continue; | ||
| 1593 | |||
| 1594 | nr_free = count_partial(n, count_free); | ||
| 1595 | nr_slabs = node_nr_slabs(n); | ||
| 1596 | nr_objs = node_nr_objs(n); | ||
| 1597 | |||
| 1598 | printk(KERN_WARNING | ||
| 1599 | " node %d: slabs: %ld, objs: %ld, free: %ld\n", | ||
| 1600 | node, nr_slabs, nr_objs, nr_free); | ||
| 1601 | } | ||
| 1602 | } | ||
| 1603 | |||
| 1483 | /* | 1604 | /* |
| 1484 | * Slow path. The lockless freelist is empty or we need to perform | 1605 | * Slow path. The lockless freelist is empty or we need to perform |
| 1485 | * debugging duties. | 1606 | * debugging duties. |
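Editor's note: count_partial() walks the partial list and sums whatever the get_count callback reports, which slab_out_of_memory() then uses for its per-node diagnostics. A self-contained sketch of that callback accumulator over invented stand-in structures (no locking, unlike the kernel version):

        #include <stdio.h>

        struct slab {
                int objects;
                int inuse;
                struct slab *next;
        };

        static int count_free(struct slab *s)
        {
                return s->objects - s->inuse;
        }

        static int count_inuse(struct slab *s)
        {
                return s->inuse;
        }

        /* One walker, many metrics: the callback decides what to sum. */
        static long count_partial(struct slab *head, int (*get_count)(struct slab *))
        {
                long x = 0;

                for (struct slab *s = head; s; s = s->next)
                        x += get_count(s);
                return x;
        }

        int main(void)
        {
                struct slab c = { 16, 16, NULL };
                struct slab b = { 16, 9, &c };
                struct slab a = { 16, 4, &b };

                printf("free: %ld, in use: %ld\n",
                       count_partial(&a, count_free), count_partial(&a, count_inuse));
                return 0;
        }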
| @@ -1561,6 +1682,8 @@ new_slab: | |||
| 1561 | c->page = new; | 1682 | c->page = new; |
| 1562 | goto load_freelist; | 1683 | goto load_freelist; |
| 1563 | } | 1684 | } |
| 1685 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | ||
| 1686 | slab_out_of_memory(s, gfpflags, node); | ||
| 1564 | return NULL; | 1687 | return NULL; |
| 1565 | debug: | 1688 | debug: |
| 1566 | if (!alloc_debug_processing(s, c->page, object, addr)) | 1689 | if (!alloc_debug_processing(s, c->page, object, addr)) |
| @@ -1590,6 +1713,8 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
| 1590 | unsigned long flags; | 1713 | unsigned long flags; |
| 1591 | unsigned int objsize; | 1714 | unsigned int objsize; |
| 1592 | 1715 | ||
| 1716 | gfpflags &= gfp_allowed_mask; | ||
| 1717 | |||
| 1593 | lockdep_trace_alloc(gfpflags); | 1718 | lockdep_trace_alloc(gfpflags); |
| 1594 | might_sleep_if(gfpflags & __GFP_WAIT); | 1719 | might_sleep_if(gfpflags & __GFP_WAIT); |
| 1595 | 1720 | ||
| @@ -1613,23 +1738,53 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
| 1613 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1738 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
| 1614 | memset(object, 0, objsize); | 1739 | memset(object, 0, objsize); |
| 1615 | 1740 | ||
| 1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | ||
| 1742 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | ||
| 1743 | |||
| 1616 | return object; | 1744 | return object; |
| 1617 | } | 1745 | } |
| 1618 | 1746 | ||
| 1619 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | 1747 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) |
| 1620 | { | 1748 | { |
| 1621 | return slab_alloc(s, gfpflags, -1, _RET_IP_); | 1749 | void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); |
| 1750 | |||
| 1751 | trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); | ||
| 1752 | |||
| 1753 | return ret; | ||
| 1622 | } | 1754 | } |
| 1623 | EXPORT_SYMBOL(kmem_cache_alloc); | 1755 | EXPORT_SYMBOL(kmem_cache_alloc); |
| 1624 | 1756 | ||
| 1757 | #ifdef CONFIG_KMEMTRACE | ||
| 1758 | void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) | ||
| 1759 | { | ||
| 1760 | return slab_alloc(s, gfpflags, -1, _RET_IP_); | ||
| 1761 | } | ||
| 1762 | EXPORT_SYMBOL(kmem_cache_alloc_notrace); | ||
| 1763 | #endif | ||
| 1764 | |||
| 1625 | #ifdef CONFIG_NUMA | 1765 | #ifdef CONFIG_NUMA |
| 1626 | void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | 1766 | void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) |
| 1627 | { | 1767 | { |
| 1628 | return slab_alloc(s, gfpflags, node, _RET_IP_); | 1768 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); |
| 1769 | |||
| 1770 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | ||
| 1771 | s->objsize, s->size, gfpflags, node); | ||
| 1772 | |||
| 1773 | return ret; | ||
| 1629 | } | 1774 | } |
| 1630 | EXPORT_SYMBOL(kmem_cache_alloc_node); | 1775 | EXPORT_SYMBOL(kmem_cache_alloc_node); |
| 1631 | #endif | 1776 | #endif |
| 1632 | 1777 | ||
| 1778 | #ifdef CONFIG_KMEMTRACE | ||
| 1779 | void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, | ||
| 1780 | gfp_t gfpflags, | ||
| 1781 | int node) | ||
| 1782 | { | ||
| 1783 | return slab_alloc(s, gfpflags, node, _RET_IP_); | ||
| 1784 | } | ||
| 1785 | EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | ||
| 1786 | #endif | ||
| 1787 | |||
| 1633 | /* | 1788 | /* |
| 1634 | * Slow patch handling. This may still be called frequently since objects | 1789 | * Slow patch handling. This may still be called frequently since objects |
| 1635 | * have a longer lifetime than the cpu slabs in most processing loads. | 1790 | * have a longer lifetime than the cpu slabs in most processing loads. |
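Editor's note: kmem_cache_alloc() and kmem_cache_alloc_node() become thin wrappers that emit a trace event around the shared fast path, with _notrace twins for allocations made by the tracer itself. A sketch of that wrapper split, all names invented and printf standing in for the tracepoint:

        #include <stdio.h>
        #include <stdlib.h>

        static void *fast_path_alloc(size_t size)
        {
                return malloc(size);
        }

        static void *obj_alloc(size_t size)
        {
                void *ret = fast_path_alloc(size);

                printf("trace: alloc ptr=%p size=%zu\n", ret, size);    /* trace hook */
                return ret;
        }

        static void *obj_alloc_notrace(size_t size)
        {
                return fast_path_alloc(size);   /* identical, minus the trace event */
        }

        int main(void)
        {
                void *a = obj_alloc(64);
                void *b = obj_alloc_notrace(64);

                free(a);
                free(b);
                return 0;
        }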
| @@ -1715,8 +1870,10 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
| 1715 | struct kmem_cache_cpu *c; | 1870 | struct kmem_cache_cpu *c; |
| 1716 | unsigned long flags; | 1871 | unsigned long flags; |
| 1717 | 1872 | ||
| 1873 | kmemleak_free_recursive(x, s->flags); | ||
| 1718 | local_irq_save(flags); | 1874 | local_irq_save(flags); |
| 1719 | c = get_cpu_slab(s, smp_processor_id()); | 1875 | c = get_cpu_slab(s, smp_processor_id()); |
| 1876 | kmemcheck_slab_free(s, object, c->objsize); | ||
| 1720 | debug_check_no_locks_freed(object, c->objsize); | 1877 | debug_check_no_locks_freed(object, c->objsize); |
| 1721 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1878 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
| 1722 | debug_check_no_obj_freed(object, c->objsize); | 1879 | debug_check_no_obj_freed(object, c->objsize); |
| @@ -1737,6 +1894,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x) | |||
| 1737 | page = virt_to_head_page(x); | 1894 | page = virt_to_head_page(x); |
| 1738 | 1895 | ||
| 1739 | slab_free(s, page, x, _RET_IP_); | 1896 | slab_free(s, page, x, _RET_IP_); |
| 1897 | |||
| 1898 | trace_kmem_cache_free(_RET_IP_, x); | ||
| 1740 | } | 1899 | } |
| 1741 | EXPORT_SYMBOL(kmem_cache_free); | 1900 | EXPORT_SYMBOL(kmem_cache_free); |
| 1742 | 1901 | ||
| @@ -1864,7 +2023,7 @@ static inline int calculate_order(int size) | |||
| 1864 | return order; | 2023 | return order; |
| 1865 | fraction /= 2; | 2024 | fraction /= 2; |
| 1866 | } | 2025 | } |
| 1867 | min_objects --; | 2026 | min_objects--; |
| 1868 | } | 2027 | } |
| 1869 | 2028 | ||
| 1870 | /* | 2029 | /* |
| @@ -1879,7 +2038,7 @@ static inline int calculate_order(int size) | |||
| 1879 | * Doh this slab cannot be placed using slub_max_order. | 2038 | * Doh this slab cannot be placed using slub_max_order. |
| 1880 | */ | 2039 | */ |
| 1881 | order = slab_order(size, 1, MAX_ORDER, 1); | 2040 | order = slab_order(size, 1, MAX_ORDER, 1); |
| 1882 | if (order <= MAX_ORDER) | 2041 | if (order < MAX_ORDER) |
| 1883 | return order; | 2042 | return order; |
| 1884 | return -ENOSYS; | 2043 | return -ENOSYS; |
| 1885 | } | 2044 | } |
| @@ -1954,8 +2113,8 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | |||
| 1954 | */ | 2113 | */ |
| 1955 | #define NR_KMEM_CACHE_CPU 100 | 2114 | #define NR_KMEM_CACHE_CPU 100 |
| 1956 | 2115 | ||
| 1957 | static DEFINE_PER_CPU(struct kmem_cache_cpu, | 2116 | static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], |
| 1958 | kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; | 2117 | kmem_cache_cpu); |
| 1959 | 2118 | ||
| 1960 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | 2119 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); |
| 1961 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); | 2120 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); |
| @@ -2263,6 +2422,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
| 2263 | * on bootup. | 2422 | * on bootup. |
| 2264 | */ | 2423 | */ |
| 2265 | align = calculate_alignment(flags, align, s->objsize); | 2424 | align = calculate_alignment(flags, align, s->objsize); |
| 2425 | s->align = align; | ||
| 2266 | 2426 | ||
| 2267 | /* | 2427 | /* |
| 2268 | * SLUB stores one object immediately after another beginning from | 2428 | * SLUB stores one object immediately after another beginning from |
| @@ -2315,6 +2475,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
| 2315 | 2475 | ||
| 2316 | if (!calculate_sizes(s, -1)) | 2476 | if (!calculate_sizes(s, -1)) |
| 2317 | goto error; | 2477 | goto error; |
| 2478 | if (disable_higher_order_debug) { | ||
| 2479 | /* | ||
| 2480 | * Disable debugging flags that store metadata if the min slab | ||
| 2481 | * order increased. | ||
| 2482 | */ | ||
| 2483 | if (get_order(s->size) > get_order(s->objsize)) { | ||
| 2484 | s->flags &= ~DEBUG_METADATA_FLAGS; | ||
| 2485 | s->offset = 0; | ||
| 2486 | if (!calculate_sizes(s, -1)) | ||
| 2487 | goto error; | ||
| 2488 | } | ||
| 2489 | } | ||
| 2318 | 2490 | ||
| 2319 | /* | 2491 | /* |
| 2320 | * The larger the object size is, the more pages we want on the partial | 2492 | * The larger the object size is, the more pages we want on the partial |
| @@ -2467,6 +2639,8 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
| 2467 | "still has objects.\n", s->name, __func__); | 2639 | "still has objects.\n", s->name, __func__); |
| 2468 | dump_stack(); | 2640 | dump_stack(); |
| 2469 | } | 2641 | } |
| 2642 | if (s->flags & SLAB_DESTROY_BY_RCU) | ||
| 2643 | rcu_barrier(); | ||
| 2470 | sysfs_slab_remove(s); | 2644 | sysfs_slab_remove(s); |
| 2471 | } else | 2645 | } else |
| 2472 | up_write(&slub_lock); | 2646 | up_write(&slub_lock); |
| @@ -2492,6 +2666,7 @@ __setup("slub_min_order=", setup_slub_min_order); | |||
| 2492 | static int __init setup_slub_max_order(char *str) | 2666 | static int __init setup_slub_max_order(char *str) |
| 2493 | { | 2667 | { |
| 2494 | get_option(&str, &slub_max_order); | 2668 | get_option(&str, &slub_max_order); |
| 2669 | slub_max_order = min(slub_max_order, MAX_ORDER - 1); | ||
| 2495 | 2670 | ||
| 2496 | return 1; | 2671 | return 1; |
| 2497 | } | 2672 | } |
| @@ -2523,13 +2698,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | |||
| 2523 | if (gfp_flags & SLUB_DMA) | 2698 | if (gfp_flags & SLUB_DMA) |
| 2524 | flags = SLAB_CACHE_DMA; | 2699 | flags = SLAB_CACHE_DMA; |
| 2525 | 2700 | ||
| 2526 | down_write(&slub_lock); | 2701 | /* |
| 2702 | * This function is called with IRQs disabled during early-boot on | ||
| 2703 | * single CPU so there's no need to take slub_lock here. | ||
| 2704 | */ | ||
| 2527 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, | 2705 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, |
| 2528 | flags, NULL)) | 2706 | flags, NULL)) |
| 2529 | goto panic; | 2707 | goto panic; |
| 2530 | 2708 | ||
| 2531 | list_add(&s->list, &slab_caches); | 2709 | list_add(&s->list, &slab_caches); |
| 2532 | up_write(&slub_lock); | 2710 | |
| 2533 | if (sysfs_slab_add(s)) | 2711 | if (sysfs_slab_add(s)) |
| 2534 | goto panic; | 2712 | goto panic; |
| 2535 | return s; | 2713 | return s; |
| @@ -2562,6 +2740,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2562 | struct kmem_cache *s; | 2740 | struct kmem_cache *s; |
| 2563 | char *text; | 2741 | char *text; |
| 2564 | size_t realsize; | 2742 | size_t realsize; |
| 2743 | unsigned long slabflags; | ||
| 2565 | 2744 | ||
| 2566 | s = kmalloc_caches_dma[index]; | 2745 | s = kmalloc_caches_dma[index]; |
| 2567 | if (s) | 2746 | if (s) |
| @@ -2583,9 +2762,18 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2583 | (unsigned int)realsize); | 2762 | (unsigned int)realsize); |
| 2584 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); | 2763 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); |
| 2585 | 2764 | ||
| 2765 | /* | ||
| 2766 | * Must defer sysfs creation to a workqueue because we don't know | ||
| 2767 | * what context we are called from. Before sysfs comes up, we don't | ||
| 2768 | * need to do anything because our sysfs initcall will start by | ||
| 2769 | * adding all existing slabs to sysfs. | ||
| 2770 | */ | ||
| 2771 | slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; | ||
| 2772 | if (slab_state >= SYSFS) | ||
| 2773 | slabflags |= __SYSFS_ADD_DEFERRED; | ||
| 2774 | |||
| 2586 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2775 | if (!s || !text || !kmem_cache_open(s, flags, text, |
| 2587 | realsize, ARCH_KMALLOC_MINALIGN, | 2776 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { |
| 2588 | SLAB_CACHE_DMA|__SYSFS_ADD_DEFERRED, NULL)) { | ||
| 2589 | kfree(s); | 2777 | kfree(s); |
| 2590 | kfree(text); | 2778 | kfree(text); |
| 2591 | goto unlock_out; | 2779 | goto unlock_out; |
| @@ -2594,7 +2782,8 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
| 2594 | list_add(&s->list, &slab_caches); | 2782 | list_add(&s->list, &slab_caches); |
| 2595 | kmalloc_caches_dma[index] = s; | 2783 | kmalloc_caches_dma[index] = s; |
| 2596 | 2784 | ||
| 2597 | schedule_work(&sysfs_add_work); | 2785 | if (slab_state >= SYSFS) |
| 2786 | schedule_work(&sysfs_add_work); | ||
| 2598 | 2787 | ||
| 2599 | unlock_out: | 2788 | unlock_out: |
| 2600 | up_write(&slub_lock); | 2789 | up_write(&slub_lock); |
| @@ -2636,6 +2825,11 @@ static s8 size_index[24] = { | |||
| 2636 | 2 /* 192 */ | 2825 | 2 /* 192 */ |
| 2637 | }; | 2826 | }; |
| 2638 | 2827 | ||
| 2828 | static inline int size_index_elem(size_t bytes) | ||
| 2829 | { | ||
| 2830 | return (bytes - 1) / 8; | ||
| 2831 | } | ||
| 2832 | |||
| 2639 | static struct kmem_cache *get_slab(size_t size, gfp_t flags) | 2833 | static struct kmem_cache *get_slab(size_t size, gfp_t flags) |
| 2640 | { | 2834 | { |
| 2641 | int index; | 2835 | int index; |
| @@ -2644,7 +2838,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
| 2644 | if (!size) | 2838 | if (!size) |
| 2645 | return ZERO_SIZE_PTR; | 2839 | return ZERO_SIZE_PTR; |
| 2646 | 2840 | ||
| 2647 | index = size_index[(size - 1) / 8]; | 2841 | index = size_index[size_index_elem(size)]; |
| 2648 | } else | 2842 | } else |
| 2649 | index = fls(size - 1); | 2843 | index = fls(size - 1); |
| 2650 | 2844 | ||
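Editor's note: size_index_elem() only maps a byte count to one 8-byte-wide table slot; get_slab() then looks the cache index up in size_index[]. A small sketch of that arithmetic (the real table's cache indexes are deliberately not reproduced here):

        #include <stdio.h>

        /* One table slot per 8-byte step: 1..8 -> slot 0, 9..16 -> slot 1,
         * and so on up to 192 bytes (24 slots). */
        static int size_index_elem(size_t bytes)
        {
                return (bytes - 1) / 8;
        }

        int main(void)
        {
                size_t sizes[] = { 1, 8, 9, 96, 192 };

                for (unsigned i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
                        printf("size %3zu -> slot %d\n", sizes[i],
                               size_index_elem(sizes[i]));
                return 0;
        }

The bounds check added to kmem_cache_init() later in this diff guards exactly this mapping: slots beyond ARRAY_SIZE(size_index) are skipped rather than written past the end of the table.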
| @@ -2659,6 +2853,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
| 2659 | void *__kmalloc(size_t size, gfp_t flags) | 2853 | void *__kmalloc(size_t size, gfp_t flags) |
| 2660 | { | 2854 | { |
| 2661 | struct kmem_cache *s; | 2855 | struct kmem_cache *s; |
| 2856 | void *ret; | ||
| 2662 | 2857 | ||
| 2663 | if (unlikely(size > SLUB_MAX_SIZE)) | 2858 | if (unlikely(size > SLUB_MAX_SIZE)) |
| 2664 | return kmalloc_large(size, flags); | 2859 | return kmalloc_large(size, flags); |
| @@ -2668,35 +2863,54 @@ void *__kmalloc(size_t size, gfp_t flags) | |||
| 2668 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 2863 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 2669 | return s; | 2864 | return s; |
| 2670 | 2865 | ||
| 2671 | return slab_alloc(s, flags, -1, _RET_IP_); | 2866 | ret = slab_alloc(s, flags, -1, _RET_IP_); |
| 2867 | |||
| 2868 | trace_kmalloc(_RET_IP_, ret, size, s->size, flags); | ||
| 2869 | |||
| 2870 | return ret; | ||
| 2672 | } | 2871 | } |
| 2673 | EXPORT_SYMBOL(__kmalloc); | 2872 | EXPORT_SYMBOL(__kmalloc); |
| 2674 | 2873 | ||
| 2675 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | 2874 | static void *kmalloc_large_node(size_t size, gfp_t flags, int node) |
| 2676 | { | 2875 | { |
| 2677 | struct page *page = alloc_pages_node(node, flags | __GFP_COMP, | 2876 | struct page *page; |
| 2678 | get_order(size)); | 2877 | void *ptr = NULL; |
| 2679 | 2878 | ||
| 2879 | flags |= __GFP_COMP | __GFP_NOTRACK; | ||
| 2880 | page = alloc_pages_node(node, flags, get_order(size)); | ||
| 2680 | if (page) | 2881 | if (page) |
| 2681 | return page_address(page); | 2882 | ptr = page_address(page); |
| 2682 | else | 2883 | |
| 2683 | return NULL; | 2884 | kmemleak_alloc(ptr, size, 1, flags); |
| 2885 | return ptr; | ||
| 2684 | } | 2886 | } |
| 2685 | 2887 | ||
| 2686 | #ifdef CONFIG_NUMA | 2888 | #ifdef CONFIG_NUMA |
| 2687 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 2889 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
| 2688 | { | 2890 | { |
| 2689 | struct kmem_cache *s; | 2891 | struct kmem_cache *s; |
| 2892 | void *ret; | ||
| 2690 | 2893 | ||
| 2691 | if (unlikely(size > SLUB_MAX_SIZE)) | 2894 | if (unlikely(size > SLUB_MAX_SIZE)) { |
| 2692 | return kmalloc_large_node(size, flags, node); | 2895 | ret = kmalloc_large_node(size, flags, node); |
| 2896 | |||
| 2897 | trace_kmalloc_node(_RET_IP_, ret, | ||
| 2898 | size, PAGE_SIZE << get_order(size), | ||
| 2899 | flags, node); | ||
| 2900 | |||
| 2901 | return ret; | ||
| 2902 | } | ||
| 2693 | 2903 | ||
| 2694 | s = get_slab(size, flags); | 2904 | s = get_slab(size, flags); |
| 2695 | 2905 | ||
| 2696 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 2906 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 2697 | return s; | 2907 | return s; |
| 2698 | 2908 | ||
| 2699 | return slab_alloc(s, flags, node, _RET_IP_); | 2909 | ret = slab_alloc(s, flags, node, _RET_IP_); |
| 2910 | |||
| 2911 | trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); | ||
| 2912 | |||
| 2913 | return ret; | ||
| 2700 | } | 2914 | } |
| 2701 | EXPORT_SYMBOL(__kmalloc_node); | 2915 | EXPORT_SYMBOL(__kmalloc_node); |
| 2702 | #endif | 2916 | #endif |
| @@ -2745,12 +2959,15 @@ void kfree(const void *x) | |||
| 2745 | struct page *page; | 2959 | struct page *page; |
| 2746 | void *object = (void *)x; | 2960 | void *object = (void *)x; |
| 2747 | 2961 | ||
| 2962 | trace_kfree(_RET_IP_, x); | ||
| 2963 | |||
| 2748 | if (unlikely(ZERO_OR_NULL_PTR(x))) | 2964 | if (unlikely(ZERO_OR_NULL_PTR(x))) |
| 2749 | return; | 2965 | return; |
| 2750 | 2966 | ||
| 2751 | page = virt_to_head_page(x); | 2967 | page = virt_to_head_page(x); |
| 2752 | if (unlikely(!PageSlab(page))) { | 2968 | if (unlikely(!PageSlab(page))) { |
| 2753 | BUG_ON(!PageCompound(page)); | 2969 | BUG_ON(!PageCompound(page)); |
| 2970 | kmemleak_free(x); | ||
| 2754 | put_page(page); | 2971 | put_page(page); |
| 2755 | return; | 2972 | return; |
| 2756 | } | 2973 | } |
| @@ -2968,7 +3185,7 @@ void __init kmem_cache_init(void) | |||
| 2968 | * kmem_cache_open for slab_state == DOWN. | 3185 | * kmem_cache_open for slab_state == DOWN. |
| 2969 | */ | 3186 | */ |
| 2970 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 3187 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
| 2971 | sizeof(struct kmem_cache_node), GFP_KERNEL); | 3188 | sizeof(struct kmem_cache_node), GFP_NOWAIT); |
| 2972 | kmalloc_caches[0].refcount = -1; | 3189 | kmalloc_caches[0].refcount = -1; |
| 2973 | caches++; | 3190 | caches++; |
| 2974 | 3191 | ||
| @@ -2979,18 +3196,20 @@ void __init kmem_cache_init(void) | |||
| 2979 | slab_state = PARTIAL; | 3196 | slab_state = PARTIAL; |
| 2980 | 3197 | ||
| 2981 | /* Caches that are not of the two-to-the-power-of size */ | 3198 | /* Caches that are not of the two-to-the-power-of size */ |
| 2982 | if (KMALLOC_MIN_SIZE <= 64) { | 3199 | if (KMALLOC_MIN_SIZE <= 32) { |
| 2983 | create_kmalloc_cache(&kmalloc_caches[1], | 3200 | create_kmalloc_cache(&kmalloc_caches[1], |
| 2984 | "kmalloc-96", 96, GFP_KERNEL); | 3201 | "kmalloc-96", 96, GFP_NOWAIT); |
| 2985 | caches++; | 3202 | caches++; |
| 3203 | } | ||
| 3204 | if (KMALLOC_MIN_SIZE <= 64) { | ||
| 2986 | create_kmalloc_cache(&kmalloc_caches[2], | 3205 | create_kmalloc_cache(&kmalloc_caches[2], |
| 2987 | "kmalloc-192", 192, GFP_KERNEL); | 3206 | "kmalloc-192", 192, GFP_NOWAIT); |
| 2988 | caches++; | 3207 | caches++; |
| 2989 | } | 3208 | } |
| 2990 | 3209 | ||
| 2991 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { | 3210 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { |
| 2992 | create_kmalloc_cache(&kmalloc_caches[i], | 3211 | create_kmalloc_cache(&kmalloc_caches[i], |
| 2993 | "kmalloc", 1 << i, GFP_KERNEL); | 3212 | "kmalloc", 1 << i, GFP_NOWAIT); |
| 2994 | caches++; | 3213 | caches++; |
| 2995 | } | 3214 | } |
| 2996 | 3215 | ||
| @@ -3009,17 +3228,28 @@ void __init kmem_cache_init(void) | |||
| 3009 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || | 3228 | BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || |
| 3010 | (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); | 3229 | (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); |
| 3011 | 3230 | ||
| 3012 | for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) | 3231 | for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { |
| 3013 | size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; | 3232 | int elem = size_index_elem(i); |
| 3233 | if (elem >= ARRAY_SIZE(size_index)) | ||
| 3234 | break; | ||
| 3235 | size_index[elem] = KMALLOC_SHIFT_LOW; | ||
| 3236 | } | ||
| 3014 | 3237 | ||
| 3015 | if (KMALLOC_MIN_SIZE == 128) { | 3238 | if (KMALLOC_MIN_SIZE == 64) { |
| 3239 | /* | ||
| 3240 | * The 96 byte size cache is not used if the alignment | ||
| 3241 | * is 64 byte. | ||
| 3242 | */ | ||
| 3243 | for (i = 64 + 8; i <= 96; i += 8) | ||
| 3244 | size_index[size_index_elem(i)] = 7; | ||
| 3245 | } else if (KMALLOC_MIN_SIZE == 128) { | ||
| 3016 | /* | 3246 | /* |
| 3017 | * The 192 byte sized cache is not used if the alignment | 3247 | * The 192 byte sized cache is not used if the alignment |
| 3018 | * is 128 byte. Redirect kmalloc to use the 256 byte cache | 3248 | * is 128 byte. Redirect kmalloc to use the 256 byte cache |
| 3019 | * instead. | 3249 | * instead. |
| 3020 | */ | 3250 | */ |
| 3021 | for (i = 128 + 8; i <= 192; i += 8) | 3251 | for (i = 128 + 8; i <= 192; i += 8) |
| 3022 | size_index[(i - 1) / 8] = 8; | 3252 | size_index[size_index_elem(i)] = 8; |
| 3023 | } | 3253 | } |
| 3024 | 3254 | ||
| 3025 | slab_state = UP; | 3255 | slab_state = UP; |
| @@ -3027,7 +3257,7 @@ void __init kmem_cache_init(void) | |||
| 3027 | /* Provide the correct kmalloc names now that the caches are up */ | 3257 | /* Provide the correct kmalloc names now that the caches are up */ |
| 3028 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) | 3258 | for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) |
| 3029 | kmalloc_caches[i]. name = | 3259 | kmalloc_caches[i]. name = |
| 3030 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | 3260 | kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); |
| 3031 | 3261 | ||
| 3032 | #ifdef CONFIG_SMP | 3262 | #ifdef CONFIG_SMP |
| 3033 | register_cpu_notifier(&slab_notifier); | 3263 | register_cpu_notifier(&slab_notifier); |
| @@ -3045,6 +3275,10 @@ void __init kmem_cache_init(void) | |||
| 3045 | nr_cpu_ids, nr_node_ids); | 3275 | nr_cpu_ids, nr_node_ids); |
| 3046 | } | 3276 | } |
| 3047 | 3277 | ||
| 3278 | void __init kmem_cache_init_late(void) | ||
| 3279 | { | ||
| 3280 | } | ||
| 3281 | |||
| 3048 | /* | 3282 | /* |
| 3049 | * Find a mergeable slab cache | 3283 | * Find a mergeable slab cache |
| 3050 | */ | 3284 | */ |
| @@ -3111,6 +3345,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 3111 | { | 3345 | { |
| 3112 | struct kmem_cache *s; | 3346 | struct kmem_cache *s; |
| 3113 | 3347 | ||
| 3348 | if (WARN_ON(!name)) | ||
| 3349 | return NULL; | ||
| 3350 | |||
| 3114 | down_write(&slub_lock); | 3351 | down_write(&slub_lock); |
| 3115 | s = find_mergeable(size, align, flags, name, ctor); | 3352 | s = find_mergeable(size, align, flags, name, ctor); |
| 3116 | if (s) { | 3353 | if (s) { |
| @@ -3224,6 +3461,7 @@ static struct notifier_block __cpuinitdata slab_notifier = { | |||
| 3224 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) | 3461 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) |
| 3225 | { | 3462 | { |
| 3226 | struct kmem_cache *s; | 3463 | struct kmem_cache *s; |
| 3464 | void *ret; | ||
| 3227 | 3465 | ||
| 3228 | if (unlikely(size > SLUB_MAX_SIZE)) | 3466 | if (unlikely(size > SLUB_MAX_SIZE)) |
| 3229 | return kmalloc_large(size, gfpflags); | 3467 | return kmalloc_large(size, gfpflags); |
| @@ -3233,13 +3471,19 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) | |||
| 3233 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 3471 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 3234 | return s; | 3472 | return s; |
| 3235 | 3473 | ||
| 3236 | return slab_alloc(s, gfpflags, -1, caller); | 3474 | ret = slab_alloc(s, gfpflags, -1, caller); |
| 3475 | |||
| 3476 | /* Honor the call site pointer we recieved. */ | ||
| 3477 | trace_kmalloc(caller, ret, size, s->size, gfpflags); | ||
| 3478 | |||
| 3479 | return ret; | ||
| 3237 | } | 3480 | } |
| 3238 | 3481 | ||
| 3239 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | 3482 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, |
| 3240 | int node, unsigned long caller) | 3483 | int node, unsigned long caller) |
| 3241 | { | 3484 | { |
| 3242 | struct kmem_cache *s; | 3485 | struct kmem_cache *s; |
| 3486 | void *ret; | ||
| 3243 | 3487 | ||
| 3244 | if (unlikely(size > SLUB_MAX_SIZE)) | 3488 | if (unlikely(size > SLUB_MAX_SIZE)) |
| 3245 | return kmalloc_large_node(size, gfpflags, node); | 3489 | return kmalloc_large_node(size, gfpflags, node); |
| @@ -3249,24 +3493,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
| 3249 | if (unlikely(ZERO_OR_NULL_PTR(s))) | 3493 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 3250 | return s; | 3494 | return s; |
| 3251 | 3495 | ||
| 3252 | return slab_alloc(s, gfpflags, node, caller); | 3496 | ret = slab_alloc(s, gfpflags, node, caller); |
| 3253 | } | ||
| 3254 | 3497 | ||
| 3255 | #ifdef CONFIG_SLUB_DEBUG | 3498 | /* Honor the call site pointer we recieved. */ |
| 3256 | static unsigned long count_partial(struct kmem_cache_node *n, | 3499 | trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); |
| 3257 | int (*get_count)(struct page *)) | ||
| 3258 | { | ||
| 3259 | unsigned long flags; | ||
| 3260 | unsigned long x = 0; | ||
| 3261 | struct page *page; | ||
| 3262 | 3500 | ||
| 3263 | spin_lock_irqsave(&n->list_lock, flags); | 3501 | return ret; |
| 3264 | list_for_each_entry(page, &n->partial, lru) | ||
| 3265 | x += get_count(page); | ||
| 3266 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
| 3267 | return x; | ||
| 3268 | } | 3502 | } |
| 3269 | 3503 | ||
| 3504 | #ifdef CONFIG_SLUB_DEBUG | ||
| 3270 | static int count_inuse(struct page *page) | 3505 | static int count_inuse(struct page *page) |
| 3271 | { | 3506 | { |
| 3272 | return page->inuse; | 3507 | return page->inuse; |
| @@ -3277,11 +3512,6 @@ static int count_total(struct page *page) | |||
| 3277 | return page->objects; | 3512 | return page->objects; |
| 3278 | } | 3513 | } |
| 3279 | 3514 | ||
| 3280 | static int count_free(struct page *page) | ||
| 3281 | { | ||
| 3282 | return page->objects - page->inuse; | ||
| 3283 | } | ||
| 3284 | |||
| 3285 | static int validate_slab(struct kmem_cache *s, struct page *page, | 3515 | static int validate_slab(struct kmem_cache *s, struct page *page, |
| 3286 | unsigned long *map) | 3516 | unsigned long *map) |
| 3287 | { | 3517 | { |
| @@ -3650,7 +3880,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
| 3650 | to_cpumask(l->cpus)); | 3880 | to_cpumask(l->cpus)); |
| 3651 | } | 3881 | } |
| 3652 | 3882 | ||
| 3653 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && | 3883 | if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && |
| 3654 | len < PAGE_SIZE - 60) { | 3884 | len < PAGE_SIZE - 60) { |
| 3655 | len += sprintf(buf + len, " nodes="); | 3885 | len += sprintf(buf + len, " nodes="); |
| 3656 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, | 3886 | len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, |
| @@ -4325,6 +4555,8 @@ static char *create_unique_id(struct kmem_cache *s) | |||
| 4325 | *p++ = 'a'; | 4555 | *p++ = 'a'; |
| 4326 | if (s->flags & SLAB_DEBUG_FREE) | 4556 | if (s->flags & SLAB_DEBUG_FREE) |
| 4327 | *p++ = 'F'; | 4557 | *p++ = 'F'; |
| 4558 | if (!(s->flags & SLAB_NOTRACK)) | ||
| 4559 | *p++ = 't'; | ||
| 4328 | if (p != name + 1) | 4560 | if (p != name + 1) |
| 4329 | *p++ = '-'; | 4561 | *p++ = '-'; |
| 4330 | p += sprintf(p, "%07d", s->size); | 4562 | p += sprintf(p, "%07d", s->size); |
| @@ -4367,8 +4599,11 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
| 4367 | } | 4599 | } |
| 4368 | 4600 | ||
| 4369 | err = sysfs_create_group(&s->kobj, &slab_attr_group); | 4601 | err = sysfs_create_group(&s->kobj, &slab_attr_group); |
| 4370 | if (err) | 4602 | if (err) { |
| 4603 | kobject_del(&s->kobj); | ||
| 4604 | kobject_put(&s->kobj); | ||
| 4371 | return err; | 4605 | return err; |
| 4606 | } | ||
| 4372 | kobject_uevent(&s->kobj, KOBJ_ADD); | 4607 | kobject_uevent(&s->kobj, KOBJ_ADD); |
| 4373 | if (!unmergeable) { | 4608 | if (!unmergeable) { |
| 4374 | /* Setup first alias */ | 4609 | /* Setup first alias */ |
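Editor's note: sysfs_slab_add() now drops the kobject again (kobject_del()/kobject_put()) when sysfs_create_group() fails, instead of leaving a half-registered object behind. A sketch of that undo-on-later-failure shape, with invented stand-in functions:

        #include <stdio.h>

        static int registered;

        static int add_to_registry(const char *name)
        {
                printf("registered %s\n", name);
                registered = 1;
                return 0;
        }

        static void remove_from_registry(const char *name)
        {
                printf("unregistered %s\n", name);
                registered = 0;
        }

        static int create_attributes(const char *name)
        {
                (void)name;
                return -1;              /* pretend this later step fails */
        }

        static int register_thing(const char *name)
        {
                int err = add_to_registry(name);

                if (err)
                        return err;

                err = create_attributes(name);
                if (err) {
                        remove_from_registry(name);     /* undo step 1, as the fix does */
                        return err;
                }
                return 0;
        }

        int main(void)
        {
                int err = register_thing("demo");

                printf("register_thing: %d, still registered: %d\n", err, registered);
                return 0;
        }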
| @@ -4550,7 +4785,7 @@ static const struct file_operations proc_slabinfo_operations = { | |||
| 4550 | 4785 | ||
| 4551 | static int __init slab_proc_init(void) | 4786 | static int __init slab_proc_init(void) |
| 4552 | { | 4787 | { |
| 4553 | proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); | 4788 | proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); |
| 4554 | return 0; | 4789 | return 0; |
| 4555 | } | 4790 | } |
| 4556 | module_init(slab_proc_init); | 4791 | module_init(slab_proc_init); |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a13ea6401ae7..d9714bdcb4a3 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
| @@ -48,8 +48,14 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
| 48 | { | 48 | { |
| 49 | /* If the main allocator is up use that, fallback to bootmem. */ | 49 | /* If the main allocator is up use that, fallback to bootmem. */ |
| 50 | if (slab_is_available()) { | 50 | if (slab_is_available()) { |
| 51 | struct page *page = alloc_pages_node(node, | 51 | struct page *page; |
| 52 | |||
| 53 | if (node_state(node, N_HIGH_MEMORY)) | ||
| 54 | page = alloc_pages_node(node, | ||
| 52 | GFP_KERNEL | __GFP_ZERO, get_order(size)); | 55 | GFP_KERNEL | __GFP_ZERO, get_order(size)); |
| 56 | else | ||
| 57 | page = alloc_pages(GFP_KERNEL | __GFP_ZERO, | ||
| 58 | get_order(size)); | ||
| 53 | if (page) | 59 | if (page) |
| 54 | return page_address(page); | 60 | return page_address(page); |
| 55 | return NULL; | 61 | return NULL; |
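Editor's note: vmemmap_alloc_block() here (and sparse_index_alloc() below) only request node-local memory when the node is known to have memory, and otherwise fall back to an unconstrained allocation. A sketch of that fallback decision with an invented node bitmap; the real check uses node_state(node, N_HIGH_MEMORY):

        #include <stdio.h>
        #include <stdlib.h>

        static unsigned long nodes_with_memory = 0x1;   /* only node 0 has memory */

        static void *alloc_on_node(size_t size, int node)
        {
                if (nodes_with_memory & (1UL << node)) {
                        printf("node-local allocation on node %d\n", node);
                        return malloc(size);
                }
                printf("node %d is memoryless, falling back to any node\n", node);
                return malloc(size);
        }

        int main(void)
        {
                free(alloc_on_node(4096, 0));
                free(alloc_on_node(4096, 1));
                return 0;
        }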
diff --git a/mm/sparse.c b/mm/sparse.c index da432d9f0ae8..6ce4aab69e99 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -62,9 +62,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
| 62 | unsigned long array_size = SECTIONS_PER_ROOT * | 62 | unsigned long array_size = SECTIONS_PER_ROOT * |
| 63 | sizeof(struct mem_section); | 63 | sizeof(struct mem_section); |
| 64 | 64 | ||
| 65 | if (slab_is_available()) | 65 | if (slab_is_available()) { |
| 66 | section = kmalloc_node(array_size, GFP_KERNEL, nid); | 66 | if (node_state(nid, N_HIGH_MEMORY)) |
| 67 | else | 67 | section = kmalloc_node(array_size, GFP_KERNEL, nid); |
| 68 | else | ||
| 69 | section = kmalloc(array_size, GFP_KERNEL); | ||
| 70 | } else | ||
| 68 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 71 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); |
| 69 | 72 | ||
| 70 | if (section) | 73 | if (section) |
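Both hunks above follow the same pattern: only ask for node-local memory when node_state(nid, N_HIGH_MEMORY) confirms the node actually has memory, and fall back to an unconstrained allocation for memoryless nodes. A minimal userspace sketch of that decision, with node_has_memory() standing in for the node_state() test and plain malloc() standing in for the page and slab allocators:

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define MAX_NODES 4

static bool node_has_memory(int nid)
{
	/* Pretend node 2 is CPU-only (memoryless). */
	return nid >= 0 && nid < MAX_NODES && nid != 2;
}

static void *alloc_on_node(size_t size, int nid)	/* ~ alloc_pages_node()/kmalloc_node() */
{
	printf("allocating %zu bytes on node %d\n", size, nid);
	return malloc(size);
}

static void *alloc_anywhere(size_t size)		/* ~ alloc_pages()/kmalloc() */
{
	printf("allocating %zu bytes on any node\n", size);
	return malloc(size);
}

static void *node_aware_alloc(size_t size, int nid)
{
	if (node_has_memory(nid))
		return alloc_on_node(size, nid);
	return alloc_anywhere(size);
}

int main(void)
{
	void *a = node_aware_alloc(4096, 1);	/* node-local path */
	void *b = node_aware_alloc(4096, 2);	/* memoryless-node fallback */

	free(a);
	free(b);
	return 0;
}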
| @@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -118,7 +118,7 @@ static void pagevec_move_tail(struct pagevec *pvec) | |||
| 118 | spin_lock(&zone->lru_lock); | 118 | spin_lock(&zone->lru_lock); |
| 119 | } | 119 | } |
| 120 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 120 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
| 121 | int lru = page_is_file_cache(page); | 121 | int lru = page_lru_base_type(page); |
| 122 | list_move_tail(&page->lru, &zone->lru[lru].list); | 122 | list_move_tail(&page->lru, &zone->lru[lru].list); |
| 123 | pgmoved++; | 123 | pgmoved++; |
| 124 | } | 124 | } |
| @@ -181,7 +181,7 @@ void activate_page(struct page *page) | |||
| 181 | spin_lock_irq(&zone->lru_lock); | 181 | spin_lock_irq(&zone->lru_lock); |
| 182 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { | 182 | if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { |
| 183 | int file = page_is_file_cache(page); | 183 | int file = page_is_file_cache(page); |
| 184 | int lru = LRU_BASE + file; | 184 | int lru = page_lru_base_type(page); |
| 185 | del_page_from_lru_list(zone, page, lru); | 185 | del_page_from_lru_list(zone, page, lru); |
| 186 | 186 | ||
| 187 | SetPageActive(page); | 187 | SetPageActive(page); |
| @@ -189,7 +189,7 @@ void activate_page(struct page *page) | |||
| 189 | add_page_to_lru_list(zone, page, lru); | 189 | add_page_to_lru_list(zone, page, lru); |
| 190 | __count_vm_event(PGACTIVATE); | 190 | __count_vm_event(PGACTIVATE); |
| 191 | 191 | ||
| 192 | update_page_reclaim_stat(zone, page, !!file, 1); | 192 | update_page_reclaim_stat(zone, page, file, 1); |
| 193 | } | 193 | } |
| 194 | spin_unlock_irq(&zone->lru_lock); | 194 | spin_unlock_irq(&zone->lru_lock); |
| 195 | } | 195 | } |
| @@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec) | |||
| 448 | for (i = 0; i < pagevec_count(pvec); i++) { | 448 | for (i = 0; i < pagevec_count(pvec); i++) { |
| 449 | struct page *page = pvec->pages[i]; | 449 | struct page *page = pvec->pages[i]; |
| 450 | 450 | ||
| 451 | if (PagePrivate(page) && trylock_page(page)) { | 451 | if (page_has_private(page) && trylock_page(page)) { |
| 452 | if (PagePrivate(page)) | 452 | if (page_has_private(page)) |
| 453 | try_to_release_page(page, 0); | 453 | try_to_release_page(page, 0); |
| 454 | unlock_page(page); | 454 | unlock_page(page); |
| 455 | } | 455 | } |
| @@ -491,55 +491,12 @@ unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, | |||
| 491 | 491 | ||
| 492 | EXPORT_SYMBOL(pagevec_lookup_tag); | 492 | EXPORT_SYMBOL(pagevec_lookup_tag); |
| 493 | 493 | ||
| 494 | #ifdef CONFIG_SMP | ||
| 495 | /* | ||
| 496 | * We tolerate a little inaccuracy to avoid ping-ponging the counter between | ||
| 497 | * CPUs | ||
| 498 | */ | ||
| 499 | #define ACCT_THRESHOLD max(16, NR_CPUS * 2) | ||
| 500 | |||
| 501 | static DEFINE_PER_CPU(long, committed_space); | ||
| 502 | |||
| 503 | void vm_acct_memory(long pages) | ||
| 504 | { | ||
| 505 | long *local; | ||
| 506 | |||
| 507 | preempt_disable(); | ||
| 508 | local = &__get_cpu_var(committed_space); | ||
| 509 | *local += pages; | ||
| 510 | if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { | ||
| 511 | atomic_long_add(*local, &vm_committed_space); | ||
| 512 | *local = 0; | ||
| 513 | } | ||
| 514 | preempt_enable(); | ||
| 515 | } | ||
| 516 | |||
| 517 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 518 | |||
| 519 | /* Drop the CPU's cached committed space back into the central pool. */ | ||
| 520 | static int cpu_swap_callback(struct notifier_block *nfb, | ||
| 521 | unsigned long action, | ||
| 522 | void *hcpu) | ||
| 523 | { | ||
| 524 | long *committed; | ||
| 525 | |||
| 526 | committed = &per_cpu(committed_space, (long)hcpu); | ||
| 527 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | ||
| 528 | atomic_long_add(*committed, &vm_committed_space); | ||
| 529 | *committed = 0; | ||
| 530 | drain_cpu_pagevecs((long)hcpu); | ||
| 531 | } | ||
| 532 | return NOTIFY_OK; | ||
| 533 | } | ||
| 534 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 535 | #endif /* CONFIG_SMP */ | ||
| 536 | |||
| 537 | /* | 494 | /* |
| 538 | * Perform any setup for the swap system | 495 | * Perform any setup for the swap system |
| 539 | */ | 496 | */ |
| 540 | void __init swap_setup(void) | 497 | void __init swap_setup(void) |
| 541 | { | 498 | { |
| 542 | unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); | 499 | unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); |
| 543 | 500 | ||
| 544 | #ifdef CONFIG_SWAP | 501 | #ifdef CONFIG_SWAP |
| 545 | bdi_init(swapper_space.backing_dev_info); | 502 | bdi_init(swapper_space.backing_dev_info); |
| @@ -554,7 +511,4 @@ void __init swap_setup(void) | |||
| 554 | * Right now other parts of the system means that we | 511 | * Right now other parts of the system means that we |
| 555 | * _really_ don't want to cluster much more | 512 | * _really_ don't want to cluster much more |
| 556 | */ | 513 | */ |
| 557 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 558 | hotcpu_notifier(cpu_swap_callback, 0); | ||
| 559 | #endif | ||
| 560 | } | 514 | } |
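For reference, the vm_acct_memory() code removed above implemented a batched counter: each CPU accumulates deltas in a local variable and folds them into the shared atomic only when a threshold is crossed, accepting a small bounded inaccuracy in exchange for far fewer contended updates. A standalone pthread sketch of the same idea, with _Thread_local standing in for the per-CPU variable and all names invented:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define ACCT_THRESHOLD 32
#define NTHREADS 4
#define NOPS 100000

static atomic_long global_committed;
static _Thread_local long local_committed;	/* stands in for the per-CPU counter */

static void acct_memory(long pages)
{
	local_committed += pages;
	if (local_committed > ACCT_THRESHOLD || local_committed < -ACCT_THRESHOLD) {
		atomic_fetch_add(&global_committed, local_committed);
		local_committed = 0;
	}
}

static void acct_flush(void)	/* like the removed CPU_DEAD callback */
{
	atomic_fetch_add(&global_committed, local_committed);
	local_committed = 0;
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < NOPS; i++)
		acct_memory(i % 2 ? 1 : -1);	/* +1/-1 nets out to zero */
	acct_flush();
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];

	for (int i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);

	/* Exact here because every thread flushes before exiting. */
	printf("global committed = %ld\n", atomic_load(&global_committed));
	return 0;
}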
diff --git a/mm/swap_state.c b/mm/swap_state.c index 3ecea98ecb45..6d1daeb1cb4a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -34,6 +34,7 @@ static const struct address_space_operations swap_aops = { | |||
| 34 | }; | 34 | }; |
| 35 | 35 | ||
| 36 | static struct backing_dev_info swap_backing_dev_info = { | 36 | static struct backing_dev_info swap_backing_dev_info = { |
| 37 | .name = "swap", | ||
| 37 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, | 38 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, |
| 38 | .unplug_io_fn = swap_unplug_io_fn, | 39 | .unplug_io_fn = swap_unplug_io_fn, |
| 39 | }; | 40 | }; |
| @@ -66,10 +67,10 @@ void show_swap_cache_info(void) | |||
| 66 | } | 67 | } |
| 67 | 68 | ||
| 68 | /* | 69 | /* |
| 69 | * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, | 70 | * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, |
| 70 | * but sets SwapCache flag and private instead of mapping and index. | 71 | * but sets SwapCache flag and private instead of mapping and index. |
| 71 | */ | 72 | */ |
| 72 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | 73 | static int __add_to_swap_cache(struct page *page, swp_entry_t entry) |
| 73 | { | 74 | { |
| 74 | int error; | 75 | int error; |
| 75 | 76 | ||
| @@ -77,28 +78,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
| 77 | VM_BUG_ON(PageSwapCache(page)); | 78 | VM_BUG_ON(PageSwapCache(page)); |
| 78 | VM_BUG_ON(!PageSwapBacked(page)); | 79 | VM_BUG_ON(!PageSwapBacked(page)); |
| 79 | 80 | ||
| 81 | page_cache_get(page); | ||
| 82 | SetPageSwapCache(page); | ||
| 83 | set_page_private(page, entry.val); | ||
| 84 | |||
| 85 | spin_lock_irq(&swapper_space.tree_lock); | ||
| 86 | error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); | ||
| 87 | if (likely(!error)) { | ||
| 88 | total_swapcache_pages++; | ||
| 89 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
| 90 | INC_CACHE_INFO(add_total); | ||
| 91 | } | ||
| 92 | spin_unlock_irq(&swapper_space.tree_lock); | ||
| 93 | |||
| 94 | if (unlikely(error)) { | ||
| 95 | /* | ||
| 96 | * Only the context which has set the SWAP_HAS_CACHE flag | ||
| 97 | * would call add_to_swap_cache(). | ||
| 98 | * So add_to_swap_cache() doesn't return -EEXIST. | ||
| 99 | */ | ||
| 100 | VM_BUG_ON(error == -EEXIST); | ||
| 101 | set_page_private(page, 0UL); | ||
| 102 | ClearPageSwapCache(page); | ||
| 103 | page_cache_release(page); | ||
| 104 | } | ||
| 105 | |||
| 106 | return error; | ||
| 107 | } | ||
| 108 | |||
| 109 | |||
| 110 | int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | ||
| 111 | { | ||
| 112 | int error; | ||
| 113 | |||
| 80 | error = radix_tree_preload(gfp_mask); | 114 | error = radix_tree_preload(gfp_mask); |
| 81 | if (!error) { | 115 | if (!error) { |
| 82 | page_cache_get(page); | 116 | error = __add_to_swap_cache(page, entry); |
| 83 | SetPageSwapCache(page); | ||
| 84 | set_page_private(page, entry.val); | ||
| 85 | |||
| 86 | spin_lock_irq(&swapper_space.tree_lock); | ||
| 87 | error = radix_tree_insert(&swapper_space.page_tree, | ||
| 88 | entry.val, page); | ||
| 89 | if (likely(!error)) { | ||
| 90 | total_swapcache_pages++; | ||
| 91 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
| 92 | INC_CACHE_INFO(add_total); | ||
| 93 | } | ||
| 94 | spin_unlock_irq(&swapper_space.tree_lock); | ||
| 95 | radix_tree_preload_end(); | 117 | radix_tree_preload_end(); |
| 96 | |||
| 97 | if (unlikely(error)) { | ||
| 98 | set_page_private(page, 0UL); | ||
| 99 | ClearPageSwapCache(page); | ||
| 100 | page_cache_release(page); | ||
| 101 | } | ||
| 102 | } | 118 | } |
| 103 | return error; | 119 | return error; |
| 104 | } | 120 | } |
| @@ -109,8 +125,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) | |||
| 109 | */ | 125 | */ |
| 110 | void __delete_from_swap_cache(struct page *page) | 126 | void __delete_from_swap_cache(struct page *page) |
| 111 | { | 127 | { |
| 112 | swp_entry_t ent = {.val = page_private(page)}; | ||
| 113 | |||
| 114 | VM_BUG_ON(!PageLocked(page)); | 128 | VM_BUG_ON(!PageLocked(page)); |
| 115 | VM_BUG_ON(!PageSwapCache(page)); | 129 | VM_BUG_ON(!PageSwapCache(page)); |
| 116 | VM_BUG_ON(PageWriteback(page)); | 130 | VM_BUG_ON(PageWriteback(page)); |
| @@ -121,13 +135,11 @@ void __delete_from_swap_cache(struct page *page) | |||
| 121 | total_swapcache_pages--; | 135 | total_swapcache_pages--; |
| 122 | __dec_zone_page_state(page, NR_FILE_PAGES); | 136 | __dec_zone_page_state(page, NR_FILE_PAGES); |
| 123 | INC_CACHE_INFO(del_total); | 137 | INC_CACHE_INFO(del_total); |
| 124 | mem_cgroup_uncharge_swapcache(page, ent); | ||
| 125 | } | 138 | } |
| 126 | 139 | ||
| 127 | /** | 140 | /** |
| 128 | * add_to_swap - allocate swap space for a page | 141 | * add_to_swap - allocate swap space for a page |
| 129 | * @page: page we want to move to swap | 142 | * @page: page we want to move to swap |
| 130 | * @gfp_mask: memory allocation flags | ||
| 131 | * | 143 | * |
| 132 | * Allocate swap space for the page and add the page to the | 144 | * Allocate swap space for the page and add the page to the |
| 133 | * swap cache. Caller needs to hold the page lock. | 145 | * swap cache. Caller needs to hold the page lock. |
| @@ -140,38 +152,34 @@ int add_to_swap(struct page *page) | |||
| 140 | VM_BUG_ON(!PageLocked(page)); | 152 | VM_BUG_ON(!PageLocked(page)); |
| 141 | VM_BUG_ON(!PageUptodate(page)); | 153 | VM_BUG_ON(!PageUptodate(page)); |
| 142 | 154 | ||
| 143 | for (;;) { | 155 | entry = get_swap_page(); |
| 144 | entry = get_swap_page(); | 156 | if (!entry.val) |
| 145 | if (!entry.val) | 157 | return 0; |
| 146 | return 0; | ||
| 147 | 158 | ||
| 159 | /* | ||
| 160 | * Radix-tree node allocations from PF_MEMALLOC contexts could | ||
| 161 | * completely exhaust the page allocator. __GFP_NOMEMALLOC | ||
| 162 | * stops emergency reserves from being allocated. | ||
| 163 | * | ||
| 164 | * TODO: this could cause a theoretical memory reclaim | ||
| 165 | * deadlock in the swap out path. | ||
| 166 | */ | ||
| 167 | /* | ||
| 168 | * Add it to the swap cache and mark it dirty | ||
| 169 | */ | ||
| 170 | err = add_to_swap_cache(page, entry, | ||
| 171 | __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); | ||
| 172 | |||
| 173 | if (!err) { /* Success */ | ||
| 174 | SetPageDirty(page); | ||
| 175 | return 1; | ||
| 176 | } else { /* -ENOMEM radix-tree allocation failure */ | ||
| 148 | /* | 177 | /* |
| 149 | * Radix-tree node allocations from PF_MEMALLOC contexts could | 178 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely |
| 150 | * completely exhaust the page allocator. __GFP_NOMEMALLOC | 179 | * clear SWAP_HAS_CACHE flag. |
| 151 | * stops emergency reserves from being allocated. | ||
| 152 | * | ||
| 153 | * TODO: this could cause a theoretical memory reclaim | ||
| 154 | * deadlock in the swap out path. | ||
| 155 | */ | ||
| 156 | /* | ||
| 157 | * Add it to the swap cache and mark it dirty | ||
| 158 | */ | 180 | */ |
| 159 | err = add_to_swap_cache(page, entry, | 181 | swapcache_free(entry, NULL); |
| 160 | __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); | 182 | return 0; |
| 161 | |||
| 162 | switch (err) { | ||
| 163 | case 0: /* Success */ | ||
| 164 | SetPageDirty(page); | ||
| 165 | return 1; | ||
| 166 | case -EEXIST: | ||
| 167 | /* Raced with "speculative" read_swap_cache_async */ | ||
| 168 | swap_free(entry); | ||
| 169 | continue; | ||
| 170 | default: | ||
| 171 | /* -ENOMEM radix-tree allocation failure */ | ||
| 172 | swap_free(entry); | ||
| 173 | return 0; | ||
| 174 | } | ||
| 175 | } | 183 | } |
| 176 | } | 184 | } |
| 177 | 185 | ||
| @@ -191,7 +199,7 @@ void delete_from_swap_cache(struct page *page) | |||
| 191 | __delete_from_swap_cache(page); | 199 | __delete_from_swap_cache(page); |
| 192 | spin_unlock_irq(&swapper_space.tree_lock); | 200 | spin_unlock_irq(&swapper_space.tree_lock); |
| 193 | 201 | ||
| 194 | swap_free(entry); | 202 | swapcache_free(entry, page); |
| 195 | page_cache_release(page); | 203 | page_cache_release(page); |
| 196 | } | 204 | } |
| 197 | 205 | ||
| @@ -293,33 +301,46 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
| 293 | } | 301 | } |
| 294 | 302 | ||
| 295 | /* | 303 | /* |
| 296 | * Swap entry may have been freed since our caller observed it. | 304 | * call radix_tree_preload() while we can wait. |
| 297 | */ | 305 | */ |
| 298 | if (!swap_duplicate(entry)) | 306 | err = radix_tree_preload(gfp_mask & GFP_KERNEL); |
| 307 | if (err) | ||
| 299 | break; | 308 | break; |
| 300 | 309 | ||
| 301 | /* | 310 | /* |
| 302 | * Associate the page with swap entry in the swap cache. | 311 | * Swap entry may have been freed since our caller observed it. |
| 303 | * May fail (-EEXIST) if there is already a page associated | ||
| 304 | * with this entry in the swap cache: added by a racing | ||
| 305 | * read_swap_cache_async, or add_to_swap or shmem_writepage | ||
| 306 | * re-using the just freed swap entry for an existing page. | ||
| 307 | * May fail (-ENOMEM) if radix-tree node allocation failed. | ||
| 308 | */ | 312 | */ |
| 313 | err = swapcache_prepare(entry); | ||
| 314 | if (err == -EEXIST) { /* seems racy */ | ||
| 315 | radix_tree_preload_end(); | ||
| 316 | continue; | ||
| 317 | } | ||
| 318 | if (err) { /* swp entry is obsolete ? */ | ||
| 319 | radix_tree_preload_end(); | ||
| 320 | break; | ||
| 321 | } | ||
| 322 | |||
| 323 | /* May fail (-ENOMEM) if radix-tree node allocation failed. */ | ||
| 309 | __set_page_locked(new_page); | 324 | __set_page_locked(new_page); |
| 310 | SetPageSwapBacked(new_page); | 325 | SetPageSwapBacked(new_page); |
| 311 | err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); | 326 | err = __add_to_swap_cache(new_page, entry); |
| 312 | if (likely(!err)) { | 327 | if (likely(!err)) { |
| 328 | radix_tree_preload_end(); | ||
| 313 | /* | 329 | /* |
| 314 | * Initiate read into locked page and return. | 330 | * Initiate read into locked page and return. |
| 315 | */ | 331 | */ |
| 316 | lru_cache_add_anon(new_page); | 332 | lru_cache_add_anon(new_page); |
| 317 | swap_readpage(NULL, new_page); | 333 | swap_readpage(new_page); |
| 318 | return new_page; | 334 | return new_page; |
| 319 | } | 335 | } |
| 336 | radix_tree_preload_end(); | ||
| 320 | ClearPageSwapBacked(new_page); | 337 | ClearPageSwapBacked(new_page); |
| 321 | __clear_page_locked(new_page); | 338 | __clear_page_locked(new_page); |
| 322 | swap_free(entry); | 339 | /* |
| 340 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | ||
| 341 | * clear SWAP_HAS_CACHE flag. | ||
| 342 | */ | ||
| 343 | swapcache_free(entry, NULL); | ||
| 323 | } while (err != -ENOMEM); | 344 | } while (err != -ENOMEM); |
| 324 | 345 | ||
| 325 | if (new_page) | 346 | if (new_page) |
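The reworked loop above splits swap-in into three steps: preallocate radix-tree nodes while sleeping is still allowed, claim the entry's cache slot with swapcache_prepare() (where -EEXIST means another thread is already adding the page and -ENOENT means the entry is gone), and only then insert the page. A simplified single-threaded model of that claim protocol follows; the slot_* and cache_insert() names are invented, and the real loop also re-checks the swap cache before retrying, which is omitted here:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct slot {
	int count;		/* users of the swap entry */
	bool has_cache;		/* a page is (being) cached for it */
};

static int slot_claim_cache(struct slot *s)	/* ~ swapcache_prepare() */
{
	if (s->has_cache)
		return -EEXIST;			/* someone else is adding the page */
	if (!s->count)
		return -ENOENT;			/* entry already freed */
	s->has_cache = true;
	return 0;
}

static void slot_release_cache(struct slot *s)	/* ~ swapcache_free() */
{
	s->has_cache = false;
}

static int cache_insert(struct slot *s)		/* ~ __add_to_swap_cache() */
{
	(void)s;
	return 0;				/* pretend the radix-tree insert succeeded */
}

static int read_into_cache(struct slot *s)
{
	for (;;) {
		int err = slot_claim_cache(s);

		if (err == -EEXIST)
			continue;	/* racy: retry; the real code re-checks the cache first */
		if (err)
			return err;	/* slot is gone */

		err = cache_insert(s);
		if (!err)
			return 0;	/* success: start the read */

		slot_release_cache(s);	/* undo the claim on failure */
		return err;
	}
}

int main(void)
{
	struct slot s = { .count = 1, .has_cache = false };

	printf("read_into_cache -> %d\n", read_into_cache(&s));
	return 0;
}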
diff --git a/mm/swapfile.c b/mm/swapfile.c index 312fafe0ab6e..9c590eef7912 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -53,6 +53,59 @@ static struct swap_info_struct swap_info[MAX_SWAPFILES]; | |||
| 53 | 53 | ||
| 54 | static DEFINE_MUTEX(swapon_mutex); | 54 | static DEFINE_MUTEX(swapon_mutex); |
| 55 | 55 | ||
| 56 | /* For reference count accounting in swap_map */ | ||
| 57 | /* enum for swap_map[] handling. internal use only */ | ||
| 58 | enum { | ||
| 59 | SWAP_MAP = 0, /* ops for reference from swap users */ | ||
| 60 | SWAP_CACHE, /* ops for reference from swap cache */ | ||
| 61 | }; | ||
| 62 | |||
| 63 | static inline int swap_count(unsigned short ent) | ||
| 64 | { | ||
| 65 | return ent & SWAP_COUNT_MASK; | ||
| 66 | } | ||
| 67 | |||
| 68 | static inline bool swap_has_cache(unsigned short ent) | ||
| 69 | { | ||
| 70 | return !!(ent & SWAP_HAS_CACHE); | ||
| 71 | } | ||
| 72 | |||
| 73 | static inline unsigned short encode_swapmap(int count, bool has_cache) | ||
| 74 | { | ||
| 75 | unsigned short ret = count; | ||
| 76 | |||
| 77 | if (has_cache) | ||
| 78 | return SWAP_HAS_CACHE | ret; | ||
| 79 | return ret; | ||
| 80 | } | ||
| 81 | |||
| 82 | /* returns 1 if swap entry is freed */ | ||
| 83 | static int | ||
| 84 | __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) | ||
| 85 | { | ||
| 86 | int type = si - swap_info; | ||
| 87 | swp_entry_t entry = swp_entry(type, offset); | ||
| 88 | struct page *page; | ||
| 89 | int ret = 0; | ||
| 90 | |||
| 91 | page = find_get_page(&swapper_space, entry.val); | ||
| 92 | if (!page) | ||
| 93 | return 0; | ||
| 94 | /* | ||
| 95 | * This function is called from scan_swap_map(), which in turn is | ||
| 96 | * called by vmscan.c while reclaiming pages, so a page lock may | ||
| 97 | * already be held here. We have to use trylock to avoid deadlock. | ||
| 98 | * This is a special case; in usual operations use try_to_free_swap() | ||
| 99 | * with an explicit lock_page(). | ||
| 100 | */ | ||
| 101 | if (trylock_page(page)) { | ||
| 102 | ret = try_to_free_swap(page); | ||
| 103 | unlock_page(page); | ||
| 104 | } | ||
| 105 | page_cache_release(page); | ||
| 106 | return ret; | ||
| 107 | } | ||
| 108 | |||
| 56 | /* | 109 | /* |
| 57 | * We need this because the bdev->unplug_fn can sleep and we cannot | 110 | * We need this because the bdev->unplug_fn can sleep and we cannot |
| 58 | * hold swap_lock while calling the unplug_fn. And swap_lock | 111 | * hold swap_lock while calling the unplug_fn. And swap_lock |
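The helpers added above pack two things into each swap_map slot: a user reference count in the low bits and a flag recording that a swap-cache page exists. A self-contained sketch of that encoding; the MODEL_* constants are illustrative stand-ins for SWAP_HAS_CACHE and SWAP_COUNT_MASK, whose actual values live in the kernel headers:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_HAS_CACHE  0x8000u		/* flag bit (illustrative) */
#define MODEL_COUNT_MASK (MODEL_HAS_CACHE - 1)	/* low bits = reference count */

static unsigned short encode(unsigned count, bool has_cache)	/* ~ encode_swapmap() */
{
	return (unsigned short)((has_cache ? MODEL_HAS_CACHE : 0) |
				(count & MODEL_COUNT_MASK));
}

static unsigned count_of(unsigned short ent)	{ return ent & MODEL_COUNT_MASK; }	/* ~ swap_count() */
static bool has_cache(unsigned short ent)	{ return ent & MODEL_HAS_CACHE; }	/* ~ swap_has_cache() */

int main(void)
{
	/* vmscan swaps the page out: cache reference only, no users yet. */
	unsigned short ent = encode(0, true);
	assert(count_of(ent) == 0 && has_cache(ent));

	/* A pte now references the entry (swap_duplicate-like step). */
	ent = encode(count_of(ent) + 1, has_cache(ent));

	/* The swap-cache page is dropped (swapcache_free-like step). */
	ent = encode(count_of(ent), false);

	/* The pte releases its reference (swap_free-like step). */
	ent = encode(count_of(ent) - 1, has_cache(ent));

	/* The slot is reusable only once both the count and the flag are gone. */
	printf("entry free: %s\n", ent == 0 ? "yes" : "no");
	return 0;
}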
| @@ -108,7 +161,8 @@ static int discard_swap(struct swap_info_struct *si) | |||
| 108 | } | 161 | } |
| 109 | 162 | ||
| 110 | err = blkdev_issue_discard(si->bdev, start_block, | 163 | err = blkdev_issue_discard(si->bdev, start_block, |
| 111 | nr_blocks, GFP_KERNEL); | 164 | nr_blocks, GFP_KERNEL, |
| 165 | DISCARD_FL_BARRIER); | ||
| 112 | if (err) | 166 | if (err) |
| 113 | break; | 167 | break; |
| 114 | 168 | ||
| @@ -147,7 +201,8 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
| 147 | start_block <<= PAGE_SHIFT - 9; | 201 | start_block <<= PAGE_SHIFT - 9; |
| 148 | nr_blocks <<= PAGE_SHIFT - 9; | 202 | nr_blocks <<= PAGE_SHIFT - 9; |
| 149 | if (blkdev_issue_discard(si->bdev, start_block, | 203 | if (blkdev_issue_discard(si->bdev, start_block, |
| 150 | nr_blocks, GFP_NOIO)) | 204 | nr_blocks, GFP_NOIO, |
| 205 | DISCARD_FL_BARRIER)) | ||
| 151 | break; | 206 | break; |
| 152 | } | 207 | } |
| 153 | 208 | ||
| @@ -167,7 +222,8 @@ static int wait_for_discard(void *word) | |||
| 167 | #define SWAPFILE_CLUSTER 256 | 222 | #define SWAPFILE_CLUSTER 256 |
| 168 | #define LATENCY_LIMIT 256 | 223 | #define LATENCY_LIMIT 256 |
| 169 | 224 | ||
| 170 | static inline unsigned long scan_swap_map(struct swap_info_struct *si) | 225 | static inline unsigned long scan_swap_map(struct swap_info_struct *si, |
| 226 | int cache) | ||
| 171 | { | 227 | { |
| 172 | unsigned long offset; | 228 | unsigned long offset; |
| 173 | unsigned long scan_base; | 229 | unsigned long scan_base; |
| @@ -273,6 +329,19 @@ checks: | |||
| 273 | goto no_page; | 329 | goto no_page; |
| 274 | if (offset > si->highest_bit) | 330 | if (offset > si->highest_bit) |
| 275 | scan_base = offset = si->lowest_bit; | 331 | scan_base = offset = si->lowest_bit; |
| 332 | |||
| 333 | /* reuse swap entry of cache-only swap if not busy. */ | ||
| 334 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
| 335 | int swap_was_freed; | ||
| 336 | spin_unlock(&swap_lock); | ||
| 337 | swap_was_freed = __try_to_reclaim_swap(si, offset); | ||
| 338 | spin_lock(&swap_lock); | ||
| 339 | /* entry was freed successfully, try to use this again */ | ||
| 340 | if (swap_was_freed) | ||
| 341 | goto checks; | ||
| 342 | goto scan; /* check next one */ | ||
| 343 | } | ||
| 344 | |||
| 276 | if (si->swap_map[offset]) | 345 | if (si->swap_map[offset]) |
| 277 | goto scan; | 346 | goto scan; |
| 278 | 347 | ||
| @@ -285,7 +354,10 @@ checks: | |||
| 285 | si->lowest_bit = si->max; | 354 | si->lowest_bit = si->max; |
| 286 | si->highest_bit = 0; | 355 | si->highest_bit = 0; |
| 287 | } | 356 | } |
| 288 | si->swap_map[offset] = 1; | 357 | if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ |
| 358 | si->swap_map[offset] = encode_swapmap(0, true); | ||
| 359 | else /* at suspend */ | ||
| 360 | si->swap_map[offset] = encode_swapmap(1, false); | ||
| 289 | si->cluster_next = offset + 1; | 361 | si->cluster_next = offset + 1; |
| 290 | si->flags -= SWP_SCANNING; | 362 | si->flags -= SWP_SCANNING; |
| 291 | 363 | ||
| @@ -351,6 +423,10 @@ scan: | |||
| 351 | spin_lock(&swap_lock); | 423 | spin_lock(&swap_lock); |
| 352 | goto checks; | 424 | goto checks; |
| 353 | } | 425 | } |
| 426 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
| 427 | spin_lock(&swap_lock); | ||
| 428 | goto checks; | ||
| 429 | } | ||
| 354 | if (unlikely(--latency_ration < 0)) { | 430 | if (unlikely(--latency_ration < 0)) { |
| 355 | cond_resched(); | 431 | cond_resched(); |
| 356 | latency_ration = LATENCY_LIMIT; | 432 | latency_ration = LATENCY_LIMIT; |
| @@ -362,6 +438,10 @@ scan: | |||
| 362 | spin_lock(&swap_lock); | 438 | spin_lock(&swap_lock); |
| 363 | goto checks; | 439 | goto checks; |
| 364 | } | 440 | } |
| 441 | if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { | ||
| 442 | spin_lock(&swap_lock); | ||
| 443 | goto checks; | ||
| 444 | } | ||
| 365 | if (unlikely(--latency_ration < 0)) { | 445 | if (unlikely(--latency_ration < 0)) { |
| 366 | cond_resched(); | 446 | cond_resched(); |
| 367 | latency_ration = LATENCY_LIMIT; | 447 | latency_ration = LATENCY_LIMIT; |
| @@ -401,7 +481,8 @@ swp_entry_t get_swap_page(void) | |||
| 401 | continue; | 481 | continue; |
| 402 | 482 | ||
| 403 | swap_list.next = next; | 483 | swap_list.next = next; |
| 404 | offset = scan_swap_map(si); | 484 | /* This is called for allocating swap entry for cache */ |
| 485 | offset = scan_swap_map(si, SWAP_CACHE); | ||
| 405 | if (offset) { | 486 | if (offset) { |
| 406 | spin_unlock(&swap_lock); | 487 | spin_unlock(&swap_lock); |
| 407 | return swp_entry(type, offset); | 488 | return swp_entry(type, offset); |
| @@ -415,6 +496,7 @@ noswap: | |||
| 415 | return (swp_entry_t) {0}; | 496 | return (swp_entry_t) {0}; |
| 416 | } | 497 | } |
| 417 | 498 | ||
| 499 | /* The only caller of this function is now the suspend routine */ | ||
| 418 | swp_entry_t get_swap_page_of_type(int type) | 500 | swp_entry_t get_swap_page_of_type(int type) |
| 419 | { | 501 | { |
| 420 | struct swap_info_struct *si; | 502 | struct swap_info_struct *si; |
| @@ -424,7 +506,8 @@ swp_entry_t get_swap_page_of_type(int type) | |||
| 424 | si = swap_info + type; | 506 | si = swap_info + type; |
| 425 | if (si->flags & SWP_WRITEOK) { | 507 | if (si->flags & SWP_WRITEOK) { |
| 426 | nr_swap_pages--; | 508 | nr_swap_pages--; |
| 427 | offset = scan_swap_map(si); | 509 | /* This is called for allocating swap entry, not cache */ |
| 510 | offset = scan_swap_map(si, SWAP_MAP); | ||
| 428 | if (offset) { | 511 | if (offset) { |
| 429 | spin_unlock(&swap_lock); | 512 | spin_unlock(&swap_lock); |
| 430 | return swp_entry(type, offset); | 513 | return swp_entry(type, offset); |
| @@ -471,26 +554,40 @@ out: | |||
| 471 | return NULL; | 554 | return NULL; |
| 472 | } | 555 | } |
| 473 | 556 | ||
| 474 | static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent) | 557 | static int swap_entry_free(struct swap_info_struct *p, |
| 558 | swp_entry_t ent, int cache) | ||
| 475 | { | 559 | { |
| 476 | unsigned long offset = swp_offset(ent); | 560 | unsigned long offset = swp_offset(ent); |
| 477 | int count = p->swap_map[offset]; | 561 | int count = swap_count(p->swap_map[offset]); |
| 478 | 562 | bool has_cache; | |
| 479 | if (count < SWAP_MAP_MAX) { | 563 | |
| 480 | count--; | 564 | has_cache = swap_has_cache(p->swap_map[offset]); |
| 481 | p->swap_map[offset] = count; | 565 | |
| 482 | if (!count) { | 566 | if (cache == SWAP_MAP) { /* dropping usage count of swap */ |
| 483 | if (offset < p->lowest_bit) | 567 | if (count < SWAP_MAP_MAX) { |
| 484 | p->lowest_bit = offset; | 568 | count--; |
| 485 | if (offset > p->highest_bit) | 569 | p->swap_map[offset] = encode_swapmap(count, has_cache); |
| 486 | p->highest_bit = offset; | ||
| 487 | if (p->prio > swap_info[swap_list.next].prio) | ||
| 488 | swap_list.next = p - swap_info; | ||
| 489 | nr_swap_pages++; | ||
| 490 | p->inuse_pages--; | ||
| 491 | mem_cgroup_uncharge_swap(ent); | ||
| 492 | } | 570 | } |
| 571 | } else { /* dropping swap cache flag */ | ||
| 572 | VM_BUG_ON(!has_cache); | ||
| 573 | p->swap_map[offset] = encode_swapmap(count, false); | ||
| 574 | |||
| 575 | } | ||
| 576 | /* return code. */ | ||
| 577 | count = p->swap_map[offset]; | ||
| 578 | /* free if no reference */ | ||
| 579 | if (!count) { | ||
| 580 | if (offset < p->lowest_bit) | ||
| 581 | p->lowest_bit = offset; | ||
| 582 | if (offset > p->highest_bit) | ||
| 583 | p->highest_bit = offset; | ||
| 584 | if (p->prio > swap_info[swap_list.next].prio) | ||
| 585 | swap_list.next = p - swap_info; | ||
| 586 | nr_swap_pages++; | ||
| 587 | p->inuse_pages--; | ||
| 493 | } | 588 | } |
| 589 | if (!swap_count(count)) | ||
| 590 | mem_cgroup_uncharge_swap(ent); | ||
| 494 | return count; | 591 | return count; |
| 495 | } | 592 | } |
| 496 | 593 | ||
| @@ -504,9 +601,33 @@ void swap_free(swp_entry_t entry) | |||
| 504 | 601 | ||
| 505 | p = swap_info_get(entry); | 602 | p = swap_info_get(entry); |
| 506 | if (p) { | 603 | if (p) { |
| 507 | swap_entry_free(p, entry); | 604 | swap_entry_free(p, entry, SWAP_MAP); |
| 605 | spin_unlock(&swap_lock); | ||
| 606 | } | ||
| 607 | } | ||
| 608 | |||
| 609 | /* | ||
| 610 | * Called after dropping swapcache to decrease the refcnt of swap entries. | ||
| 611 | */ | ||
| 612 | void swapcache_free(swp_entry_t entry, struct page *page) | ||
| 613 | { | ||
| 614 | struct swap_info_struct *p; | ||
| 615 | int ret; | ||
| 616 | |||
| 617 | p = swap_info_get(entry); | ||
| 618 | if (p) { | ||
| 619 | ret = swap_entry_free(p, entry, SWAP_CACHE); | ||
| 620 | if (page) { | ||
| 621 | bool swapout; | ||
| 622 | if (ret) | ||
| 623 | swapout = true; /* the end of swap out */ | ||
| 624 | else | ||
| 625 | swapout = false; /* no more swap users! */ | ||
| 626 | mem_cgroup_uncharge_swapcache(page, entry, swapout); | ||
| 627 | } | ||
| 508 | spin_unlock(&swap_lock); | 628 | spin_unlock(&swap_lock); |
| 509 | } | 629 | } |
| 630 | return; | ||
| 510 | } | 631 | } |
| 511 | 632 | ||
| 512 | /* | 633 | /* |
| @@ -521,8 +642,7 @@ static inline int page_swapcount(struct page *page) | |||
| 521 | entry.val = page_private(page); | 642 | entry.val = page_private(page); |
| 522 | p = swap_info_get(entry); | 643 | p = swap_info_get(entry); |
| 523 | if (p) { | 644 | if (p) { |
| 524 | /* Subtract the 1 for the swap cache itself */ | 645 | count = swap_count(p->swap_map[swp_offset(entry)]); |
| 525 | count = p->swap_map[swp_offset(entry)] - 1; | ||
| 526 | spin_unlock(&swap_lock); | 646 | spin_unlock(&swap_lock); |
| 527 | } | 647 | } |
| 528 | return count; | 648 | return count; |
| @@ -579,12 +699,12 @@ int free_swap_and_cache(swp_entry_t entry) | |||
| 579 | struct swap_info_struct *p; | 699 | struct swap_info_struct *p; |
| 580 | struct page *page = NULL; | 700 | struct page *page = NULL; |
| 581 | 701 | ||
| 582 | if (is_migration_entry(entry)) | 702 | if (non_swap_entry(entry)) |
| 583 | return 1; | 703 | return 1; |
| 584 | 704 | ||
| 585 | p = swap_info_get(entry); | 705 | p = swap_info_get(entry); |
| 586 | if (p) { | 706 | if (p) { |
| 587 | if (swap_entry_free(p, entry) == 1) { | 707 | if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { |
| 588 | page = find_get_page(&swapper_space, entry.val); | 708 | page = find_get_page(&swapper_space, entry.val); |
| 589 | if (page && !trylock_page(page)) { | 709 | if (page && !trylock_page(page)) { |
| 590 | page_cache_release(page); | 710 | page_cache_release(page); |
| @@ -635,7 +755,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
| 635 | 755 | ||
| 636 | if (!bdev) { | 756 | if (!bdev) { |
| 637 | if (bdev_p) | 757 | if (bdev_p) |
| 638 | *bdev_p = bdget(sis->bdev->bd_dev); | 758 | *bdev_p = bdgrab(sis->bdev); |
| 639 | 759 | ||
| 640 | spin_unlock(&swap_lock); | 760 | spin_unlock(&swap_lock); |
| 641 | return i; | 761 | return i; |
| @@ -647,7 +767,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) | |||
| 647 | struct swap_extent, list); | 767 | struct swap_extent, list); |
| 648 | if (se->start_block == offset) { | 768 | if (se->start_block == offset) { |
| 649 | if (bdev_p) | 769 | if (bdev_p) |
| 650 | *bdev_p = bdget(sis->bdev->bd_dev); | 770 | *bdev_p = bdgrab(sis->bdev); |
| 651 | 771 | ||
| 652 | spin_unlock(&swap_lock); | 772 | spin_unlock(&swap_lock); |
| 653 | bdput(bdev); | 773 | bdput(bdev); |
| @@ -891,7 +1011,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 891 | i = 1; | 1011 | i = 1; |
| 892 | } | 1012 | } |
| 893 | count = si->swap_map[i]; | 1013 | count = si->swap_map[i]; |
| 894 | if (count && count != SWAP_MAP_BAD) | 1014 | if (count && swap_count(count) != SWAP_MAP_BAD) |
| 895 | break; | 1015 | break; |
| 896 | } | 1016 | } |
| 897 | return i; | 1017 | return i; |
| @@ -995,13 +1115,13 @@ static int try_to_unuse(unsigned int type) | |||
| 995 | */ | 1115 | */ |
| 996 | shmem = 0; | 1116 | shmem = 0; |
| 997 | swcount = *swap_map; | 1117 | swcount = *swap_map; |
| 998 | if (swcount > 1) { | 1118 | if (swap_count(swcount)) { |
| 999 | if (start_mm == &init_mm) | 1119 | if (start_mm == &init_mm) |
| 1000 | shmem = shmem_unuse(entry, page); | 1120 | shmem = shmem_unuse(entry, page); |
| 1001 | else | 1121 | else |
| 1002 | retval = unuse_mm(start_mm, entry, page); | 1122 | retval = unuse_mm(start_mm, entry, page); |
| 1003 | } | 1123 | } |
| 1004 | if (*swap_map > 1) { | 1124 | if (swap_count(*swap_map)) { |
| 1005 | int set_start_mm = (*swap_map >= swcount); | 1125 | int set_start_mm = (*swap_map >= swcount); |
| 1006 | struct list_head *p = &start_mm->mmlist; | 1126 | struct list_head *p = &start_mm->mmlist; |
| 1007 | struct mm_struct *new_start_mm = start_mm; | 1127 | struct mm_struct *new_start_mm = start_mm; |
| @@ -1011,7 +1131,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1011 | atomic_inc(&new_start_mm->mm_users); | 1131 | atomic_inc(&new_start_mm->mm_users); |
| 1012 | atomic_inc(&prev_mm->mm_users); | 1132 | atomic_inc(&prev_mm->mm_users); |
| 1013 | spin_lock(&mmlist_lock); | 1133 | spin_lock(&mmlist_lock); |
| 1014 | while (*swap_map > 1 && !retval && !shmem && | 1134 | while (swap_count(*swap_map) && !retval && !shmem && |
| 1015 | (p = p->next) != &start_mm->mmlist) { | 1135 | (p = p->next) != &start_mm->mmlist) { |
| 1016 | mm = list_entry(p, struct mm_struct, mmlist); | 1136 | mm = list_entry(p, struct mm_struct, mmlist); |
| 1017 | if (!atomic_inc_not_zero(&mm->mm_users)) | 1137 | if (!atomic_inc_not_zero(&mm->mm_users)) |
| @@ -1023,13 +1143,14 @@ static int try_to_unuse(unsigned int type) | |||
| 1023 | cond_resched(); | 1143 | cond_resched(); |
| 1024 | 1144 | ||
| 1025 | swcount = *swap_map; | 1145 | swcount = *swap_map; |
| 1026 | if (swcount <= 1) | 1146 | if (!swap_count(swcount)) /* any usage ? */ |
| 1027 | ; | 1147 | ; |
| 1028 | else if (mm == &init_mm) { | 1148 | else if (mm == &init_mm) { |
| 1029 | set_start_mm = 1; | 1149 | set_start_mm = 1; |
| 1030 | shmem = shmem_unuse(entry, page); | 1150 | shmem = shmem_unuse(entry, page); |
| 1031 | } else | 1151 | } else |
| 1032 | retval = unuse_mm(mm, entry, page); | 1152 | retval = unuse_mm(mm, entry, page); |
| 1153 | |||
| 1033 | if (set_start_mm && *swap_map < swcount) { | 1154 | if (set_start_mm && *swap_map < swcount) { |
| 1034 | mmput(new_start_mm); | 1155 | mmput(new_start_mm); |
| 1035 | atomic_inc(&mm->mm_users); | 1156 | atomic_inc(&mm->mm_users); |
| @@ -1057,21 +1178,25 @@ static int try_to_unuse(unsigned int type) | |||
| 1057 | } | 1178 | } |
| 1058 | 1179 | ||
| 1059 | /* | 1180 | /* |
| 1060 | * How could swap count reach 0x7fff when the maximum | 1181 | * How could swap count reach 0x7ffe ? |
| 1061 | * pid is 0x7fff, and there's no way to repeat a swap | 1182 | * There's no way to repeat a swap page within an mm |
| 1062 | * page within an mm (except in shmem, where it's the | 1183 | * (except in shmem, where it's the shared object which takes |
| 1063 | * shared object which takes the reference count)? | 1184 | * the reference count)? |
| 1064 | * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. | 1185 | * We believe SWAP_MAP_MAX cannot occur (if it did, unsigned |
| 1065 | * | 1186 | * short would be too small). |
| 1066 | * If that's wrong, then we should worry more about | 1187 | * If that's wrong, then we should worry more about |
| 1067 | * exit_mmap() and do_munmap() cases described above: | 1188 | * exit_mmap() and do_munmap() cases described above: |
| 1068 | * we might be resetting SWAP_MAP_MAX too early here. | 1189 | * we might be resetting SWAP_MAP_MAX too early here. |
| 1069 | * We know "Undead"s can happen, they're okay, so don't | 1190 | * We know "Undead"s can happen, they're okay, so don't |
| 1070 | * report them; but do report if we reset SWAP_MAP_MAX. | 1191 | * report them; but do report if we reset SWAP_MAP_MAX. |
| 1071 | */ | 1192 | */ |
| 1072 | if (*swap_map == SWAP_MAP_MAX) { | 1193 | /* We might release the lock_page() in unuse_mm(). */ |
| 1194 | if (!PageSwapCache(page) || page_private(page) != entry.val) | ||
| 1195 | goto retry; | ||
| 1196 | |||
| 1197 | if (swap_count(*swap_map) == SWAP_MAP_MAX) { | ||
| 1073 | spin_lock(&swap_lock); | 1198 | spin_lock(&swap_lock); |
| 1074 | *swap_map = 1; | 1199 | *swap_map = encode_swapmap(0, true); |
| 1075 | spin_unlock(&swap_lock); | 1200 | spin_unlock(&swap_lock); |
| 1076 | reset_overflow = 1; | 1201 | reset_overflow = 1; |
| 1077 | } | 1202 | } |
| @@ -1089,7 +1214,8 @@ static int try_to_unuse(unsigned int type) | |||
| 1089 | * pages would be incorrect if swap supported "shared | 1214 | * pages would be incorrect if swap supported "shared |
| 1090 | * private" pages, but they are handled by tmpfs files. | 1215 | * private" pages, but they are handled by tmpfs files. |
| 1091 | */ | 1216 | */ |
| 1092 | if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { | 1217 | if (swap_count(*swap_map) && |
| 1218 | PageDirty(page) && PageSwapCache(page)) { | ||
| 1093 | struct writeback_control wbc = { | 1219 | struct writeback_control wbc = { |
| 1094 | .sync_mode = WB_SYNC_NONE, | 1220 | .sync_mode = WB_SYNC_NONE, |
| 1095 | }; | 1221 | }; |
| @@ -1116,6 +1242,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1116 | * mark page dirty so shrink_page_list will preserve it. | 1242 | * mark page dirty so shrink_page_list will preserve it. |
| 1117 | */ | 1243 | */ |
| 1118 | SetPageDirty(page); | 1244 | SetPageDirty(page); |
| 1245 | retry: | ||
| 1119 | unlock_page(page); | 1246 | unlock_page(page); |
| 1120 | page_cache_release(page); | 1247 | page_cache_release(page); |
| 1121 | 1248 | ||
| @@ -1447,9 +1574,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1447 | p->flags &= ~SWP_WRITEOK; | 1574 | p->flags &= ~SWP_WRITEOK; |
| 1448 | spin_unlock(&swap_lock); | 1575 | spin_unlock(&swap_lock); |
| 1449 | 1576 | ||
| 1450 | current->flags |= PF_SWAPOFF; | 1577 | current->flags |= PF_OOM_ORIGIN; |
| 1451 | err = try_to_unuse(type); | 1578 | err = try_to_unuse(type); |
| 1452 | current->flags &= ~PF_SWAPOFF; | 1579 | current->flags &= ~PF_OOM_ORIGIN; |
| 1453 | 1580 | ||
| 1454 | if (err) { | 1581 | if (err) { |
| 1455 | /* re-insert swap space back into swap_list */ | 1582 | /* re-insert swap space back into swap_list */ |
| @@ -1846,12 +1973,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1846 | goto bad_swap; | 1973 | goto bad_swap; |
| 1847 | } | 1974 | } |
| 1848 | 1975 | ||
| 1849 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 1976 | if (p->bdev) { |
| 1850 | p->flags |= SWP_SOLIDSTATE; | 1977 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
| 1851 | p->cluster_next = 1 + (random32() % p->highest_bit); | 1978 | p->flags |= SWP_SOLIDSTATE; |
| 1979 | p->cluster_next = 1 + (random32() % p->highest_bit); | ||
| 1980 | } | ||
| 1981 | if (discard_swap(p) == 0) | ||
| 1982 | p->flags |= SWP_DISCARDABLE; | ||
| 1852 | } | 1983 | } |
| 1853 | if (discard_swap(p) == 0) | ||
| 1854 | p->flags |= SWP_DISCARDABLE; | ||
| 1855 | 1984 | ||
| 1856 | mutex_lock(&swapon_mutex); | 1985 | mutex_lock(&swapon_mutex); |
| 1857 | spin_lock(&swap_lock); | 1986 | spin_lock(&swap_lock); |
| @@ -1942,15 +2071,23 @@ void si_swapinfo(struct sysinfo *val) | |||
| 1942 | * | 2071 | * |
| 1943 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as | 2072 | * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as |
| 1944 | * "permanent", but will be reclaimed by the next swapoff. | 2073 | * "permanent", but will be reclaimed by the next swapoff. |
| 2074 | * Returns error code in the following cases: | ||
| 2075 | * - success -> 0 | ||
| 2076 | * - swp_entry is invalid -> EINVAL | ||
| 2077 | * - swp_entry is migration entry -> EINVAL | ||
| 2078 | * - swap-cache reference is requested but there is already one. -> EEXIST | ||
| 2079 | * - swap-cache reference is requested but the entry is not used. -> ENOENT | ||
| 1945 | */ | 2080 | */ |
| 1946 | int swap_duplicate(swp_entry_t entry) | 2081 | static int __swap_duplicate(swp_entry_t entry, bool cache) |
| 1947 | { | 2082 | { |
| 1948 | struct swap_info_struct * p; | 2083 | struct swap_info_struct * p; |
| 1949 | unsigned long offset, type; | 2084 | unsigned long offset, type; |
| 1950 | int result = 0; | 2085 | int result = -EINVAL; |
| 2086 | int count; | ||
| 2087 | bool has_cache; | ||
| 1951 | 2088 | ||
| 1952 | if (is_migration_entry(entry)) | 2089 | if (non_swap_entry(entry)) |
| 1953 | return 1; | 2090 | return -EINVAL; |
| 1954 | 2091 | ||
| 1955 | type = swp_type(entry); | 2092 | type = swp_type(entry); |
| 1956 | if (type >= nr_swapfiles) | 2093 | if (type >= nr_swapfiles) |
| @@ -1959,17 +2096,40 @@ int swap_duplicate(swp_entry_t entry) | |||
| 1959 | offset = swp_offset(entry); | 2096 | offset = swp_offset(entry); |
| 1960 | 2097 | ||
| 1961 | spin_lock(&swap_lock); | 2098 | spin_lock(&swap_lock); |
| 1962 | if (offset < p->max && p->swap_map[offset]) { | 2099 | |
| 1963 | if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { | 2100 | if (unlikely(offset >= p->max)) |
| 1964 | p->swap_map[offset]++; | 2101 | goto unlock_out; |
| 1965 | result = 1; | 2102 | |
| 1966 | } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { | 2103 | count = swap_count(p->swap_map[offset]); |
| 2104 | has_cache = swap_has_cache(p->swap_map[offset]); | ||
| 2105 | |||
| 2106 | if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ | ||
| 2107 | |||
| 2108 | /* set SWAP_HAS_CACHE if there is no cache and entry is used */ | ||
| 2109 | if (!has_cache && count) { | ||
| 2110 | p->swap_map[offset] = encode_swapmap(count, true); | ||
| 2111 | result = 0; | ||
| 2112 | } else if (has_cache) /* someone added cache */ | ||
| 2113 | result = -EEXIST; | ||
| 2114 | else if (!count) /* no users */ | ||
| 2115 | result = -ENOENT; | ||
| 2116 | |||
| 2117 | } else if (count || has_cache) { | ||
| 2118 | if (count < SWAP_MAP_MAX - 1) { | ||
| 2119 | p->swap_map[offset] = encode_swapmap(count + 1, | ||
| 2120 | has_cache); | ||
| 2121 | result = 0; | ||
| 2122 | } else if (count <= SWAP_MAP_MAX) { | ||
| 1967 | if (swap_overflow++ < 5) | 2123 | if (swap_overflow++ < 5) |
| 1968 | printk(KERN_WARNING "swap_dup: swap entry overflow\n"); | 2124 | printk(KERN_WARNING |
| 1969 | p->swap_map[offset] = SWAP_MAP_MAX; | 2125 | "swap_dup: swap entry overflow\n"); |
| 1970 | result = 1; | 2126 | p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, |
| 2127 | has_cache); | ||
| 2128 | result = 0; | ||
| 1971 | } | 2129 | } |
| 1972 | } | 2130 | } else |
| 2131 | result = -ENOENT; /* unused swap entry */ | ||
| 2132 | unlock_out: | ||
| 1973 | spin_unlock(&swap_lock); | 2133 | spin_unlock(&swap_lock); |
| 1974 | out: | 2134 | out: |
| 1975 | return result; | 2135 | return result; |
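The rewritten __swap_duplicate() above is essentially a small decision table over (count, has_cache): a cache reference may only be taken when the entry is in use and has no cache yet, while a map reference simply bumps the count and saturates near the maximum. A standalone model using an unpacked entry for readability (the packed form is sketched earlier next to encode_swapmap()); MODEL_COUNT_MAX and the take_ref() name are invented here:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MODEL_COUNT_MAX 0x7ffe		/* stands in for SWAP_MAP_MAX */

enum ref_kind { REF_MAP, REF_CACHE };	/* ~ SWAP_MAP / SWAP_CACHE */

struct entry {
	int count;
	bool has_cache;
};

static int take_ref(struct entry *e, enum ref_kind kind)
{
	if (kind == REF_CACHE) {
		if (e->has_cache)
			return -EEXIST;		/* someone already owns the cache */
		if (!e->count)
			return -ENOENT;		/* entry is unused */
		e->has_cache = true;
		return 0;
	}

	/* REF_MAP: another pte/user references the entry. */
	if (!e->count && !e->has_cache)
		return -ENOENT;			/* unused swap entry */
	if (e->count < MODEL_COUNT_MAX)
		e->count++;			/* saturates near the maximum */
	return 0;
}

int main(void)
{
	struct entry e = { .count = 1, .has_cache = false };

	printf("cache ref: %d\n", take_ref(&e, REF_CACHE));	/* 0 */
	printf("cache ref: %d\n", take_ref(&e, REF_CACHE));	/* -EEXIST */
	printf("map ref:   %d\n", take_ref(&e, REF_MAP));	/* 0 */
	return 0;
}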
| @@ -1978,6 +2138,27 @@ bad_file: | |||
| 1978 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); | 2138 | printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); |
| 1979 | goto out; | 2139 | goto out; |
| 1980 | } | 2140 | } |
| 2141 | /* | ||
| 2142 | * Increase the reference count of a swap entry by 1. | ||
| 2143 | */ | ||
| 2144 | void swap_duplicate(swp_entry_t entry) | ||
| 2145 | { | ||
| 2146 | __swap_duplicate(entry, SWAP_MAP); | ||
| 2147 | } | ||
| 2148 | |||
| 2149 | /* | ||
| 2150 | * @entry: swap entry for which we allocate swap cache. | ||
| 2151 | * | ||
| 2152 | * Called when allocating swap cache for an existing swap entry. | ||
| 2153 | * This can return error codes. Returns 0 on success. | ||
| 2154 | * -EEXIST means there is already a swap cache. | ||
| 2155 | * Note: return code is different from swap_duplicate(). | ||
| 2156 | */ | ||
| 2157 | int swapcache_prepare(swp_entry_t entry) | ||
| 2158 | { | ||
| 2159 | return __swap_duplicate(entry, SWAP_CACHE); | ||
| 2160 | } | ||
| 2161 | |||
| 1981 | 2162 | ||
| 1982 | struct swap_info_struct * | 2163 | struct swap_info_struct * |
| 1983 | get_swap_info_struct(unsigned type) | 2164 | get_swap_info_struct(unsigned type) |
| @@ -2016,7 +2197,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
| 2016 | /* Don't read in free or bad pages */ | 2197 | /* Don't read in free or bad pages */ |
| 2017 | if (!si->swap_map[toff]) | 2198 | if (!si->swap_map[toff]) |
| 2018 | break; | 2199 | break; |
| 2019 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2200 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
| 2020 | break; | 2201 | break; |
| 2021 | } | 2202 | } |
| 2022 | /* Count contiguous allocated slots below our target */ | 2203 | /* Count contiguous allocated slots below our target */ |
| @@ -2024,7 +2205,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) | |||
| 2024 | /* Don't read in free or bad pages */ | 2205 | /* Don't read in free or bad pages */ |
| 2025 | if (!si->swap_map[toff]) | 2206 | if (!si->swap_map[toff]) |
| 2026 | break; | 2207 | break; |
| 2027 | if (si->swap_map[toff] == SWAP_MAP_BAD) | 2208 | if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) |
| 2028 | break; | 2209 | break; |
| 2029 | } | 2210 | } |
| 2030 | spin_unlock(&swap_lock); | 2211 | spin_unlock(&swap_lock); |
diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205a9c35..2372d4ed5dd8 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
| @@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); | |||
| 26 | struct mm_struct *swap_token_mm; | 26 | struct mm_struct *swap_token_mm; |
| 27 | static unsigned int global_faults; | 27 | static unsigned int global_faults; |
| 28 | 28 | ||
| 29 | void grab_swap_token(void) | 29 | void grab_swap_token(struct mm_struct *mm) |
| 30 | { | 30 | { |
| 31 | int current_interval; | 31 | int current_interval; |
| 32 | 32 | ||
| 33 | global_faults++; | 33 | global_faults++; |
| 34 | 34 | ||
| 35 | current_interval = global_faults - current->mm->faultstamp; | 35 | current_interval = global_faults - mm->faultstamp; |
| 36 | 36 | ||
| 37 | if (!spin_trylock(&swap_token_lock)) | 37 | if (!spin_trylock(&swap_token_lock)) |
| 38 | return; | 38 | return; |
| 39 | 39 | ||
| 40 | /* First come first served */ | 40 | /* First come first served */ |
| 41 | if (swap_token_mm == NULL) { | 41 | if (swap_token_mm == NULL) { |
| 42 | current->mm->token_priority = current->mm->token_priority + 2; | 42 | mm->token_priority = mm->token_priority + 2; |
| 43 | swap_token_mm = current->mm; | 43 | swap_token_mm = mm; |
| 44 | goto out; | 44 | goto out; |
| 45 | } | 45 | } |
| 46 | 46 | ||
| 47 | if (current->mm != swap_token_mm) { | 47 | if (mm != swap_token_mm) { |
| 48 | if (current_interval < current->mm->last_interval) | 48 | if (current_interval < mm->last_interval) |
| 49 | current->mm->token_priority++; | 49 | mm->token_priority++; |
| 50 | else { | 50 | else { |
| 51 | if (likely(current->mm->token_priority > 0)) | 51 | if (likely(mm->token_priority > 0)) |
| 52 | current->mm->token_priority--; | 52 | mm->token_priority--; |
| 53 | } | 53 | } |
| 54 | /* Check if we deserve the token */ | 54 | /* Check if we deserve the token */ |
| 55 | if (current->mm->token_priority > | 55 | if (mm->token_priority > swap_token_mm->token_priority) { |
| 56 | swap_token_mm->token_priority) { | 56 | mm->token_priority += 2; |
| 57 | current->mm->token_priority += 2; | 57 | swap_token_mm = mm; |
| 58 | swap_token_mm = current->mm; | ||
| 59 | } | 58 | } |
| 60 | } else { | 59 | } else { |
| 61 | /* Token holder came in again! */ | 60 | /* Token holder came in again! */ |
| 62 | current->mm->token_priority += 2; | 61 | mm->token_priority += 2; |
| 63 | } | 62 | } |
| 64 | 63 | ||
| 65 | out: | 64 | out: |
| 66 | current->mm->faultstamp = global_faults; | 65 | mm->faultstamp = global_faults; |
| 67 | current->mm->last_interval = current_interval; | 66 | mm->last_interval = current_interval; |
| 68 | spin_unlock(&swap_token_lock); | 67 | spin_unlock(&swap_token_lock); |
| 69 | return; | ||
| 70 | } | 68 | } |
| 71 | 69 | ||
| 72 | /* Called on process exit. */ | 70 | /* Called on process exit. */ |
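grab_swap_token() above implements a simple competitive heuristic: the single token goes to the mm whose fault intervals are shrinking fastest, an mm's priority rises when it faults again sooner than last time and decays otherwise, and the current holder reinforces itself on every fault. A standalone sketch of that policy (locking and the real mm_struct fields are omitted; all names here are invented):

#include <stdio.h>

struct mm {
	const char *name;
	unsigned faultstamp;	/* global_faults at the last grab attempt */
	unsigned last_interval;	/* faults between the last two attempts */
	int token_priority;
};

static unsigned global_faults;
static struct mm *token_holder;

static void grab_token(struct mm *mm)
{
	unsigned interval;

	global_faults++;
	interval = global_faults - mm->faultstamp;

	if (!token_holder) {				/* first come, first served */
		mm->token_priority += 2;
		token_holder = mm;
	} else if (mm != token_holder) {
		if (interval < mm->last_interval)
			mm->token_priority++;		/* faulting faster: bump */
		else if (mm->token_priority > 0)
			mm->token_priority--;		/* faulting slower: decay */
		if (mm->token_priority > token_holder->token_priority) {
			mm->token_priority += 2;
			token_holder = mm;		/* steal the token */
		}
	} else {
		mm->token_priority += 2;		/* holder keeps reinforcing */
	}

	mm->faultstamp = global_faults;
	mm->last_interval = interval;
}

int main(void)
{
	struct mm a = { .name = "A" }, b = { .name = "B" }, filler = { .name = "F" };
	int i, j;

	grab_token(&a);			/* A takes the free token */

	/* B's fault intervals keep shrinking, so its priority climbs. */
	for (i = 8; i >= 1; i /= 2) {
		for (j = 0; j < i; j++)
			grab_token(&filler);	/* unrelated faults advance time */
		grab_token(&b);
	}
	printf("token holder: %s\n", token_holder->name);
	return 0;
}

Running the sketch prints "token holder: B" once B's intervals have shrunk enough for its priority to overtake A's.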
diff --git a/mm/truncate.c b/mm/truncate.c index 1229211104f8..450cebdabfc0 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) | |||
| 50 | static inline void truncate_partial_page(struct page *page, unsigned partial) | 50 | static inline void truncate_partial_page(struct page *page, unsigned partial) |
| 51 | { | 51 | { |
| 52 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); | 52 | zero_user_segment(page, partial, PAGE_CACHE_SIZE); |
| 53 | if (PagePrivate(page)) | 53 | if (page_has_private(page)) |
| 54 | do_invalidatepage(page, partial); | 54 | do_invalidatepage(page, partial); |
| 55 | } | 55 | } |
| 56 | 56 | ||
| @@ -93,13 +93,13 @@ EXPORT_SYMBOL(cancel_dirty_page); | |||
| 93 | * its lock, b) when a concurrent invalidate_mapping_pages got there first and | 93 | * its lock, b) when a concurrent invalidate_mapping_pages got there first and |
| 94 | * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. | 94 | * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. |
| 95 | */ | 95 | */ |
| 96 | static void | 96 | static int |
| 97 | truncate_complete_page(struct address_space *mapping, struct page *page) | 97 | truncate_complete_page(struct address_space *mapping, struct page *page) |
| 98 | { | 98 | { |
| 99 | if (page->mapping != mapping) | 99 | if (page->mapping != mapping) |
| 100 | return; | 100 | return -EIO; |
| 101 | 101 | ||
| 102 | if (PagePrivate(page)) | 102 | if (page_has_private(page)) |
| 103 | do_invalidatepage(page, 0); | 103 | do_invalidatepage(page, 0); |
| 104 | 104 | ||
| 105 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 105 | cancel_dirty_page(page, PAGE_CACHE_SIZE); |
| @@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 108 | remove_from_page_cache(page); | 108 | remove_from_page_cache(page); |
| 109 | ClearPageMappedToDisk(page); | 109 | ClearPageMappedToDisk(page); |
| 110 | page_cache_release(page); /* pagecache ref */ | 110 | page_cache_release(page); /* pagecache ref */ |
| 111 | return 0; | ||
| 111 | } | 112 | } |
| 112 | 113 | ||
| 113 | /* | 114 | /* |
| @@ -126,7 +127,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
| 126 | if (page->mapping != mapping) | 127 | if (page->mapping != mapping) |
| 127 | return 0; | 128 | return 0; |
| 128 | 129 | ||
| 129 | if (PagePrivate(page) && !try_to_release_page(page, 0)) | 130 | if (page_has_private(page) && !try_to_release_page(page, 0)) |
| 130 | return 0; | 131 | return 0; |
| 131 | 132 | ||
| 132 | clear_page_mlock(page); | 133 | clear_page_mlock(page); |
| @@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) | |||
| 135 | return ret; | 136 | return ret; |
| 136 | } | 137 | } |
| 137 | 138 | ||
| 139 | int truncate_inode_page(struct address_space *mapping, struct page *page) | ||
| 140 | { | ||
| 141 | if (page_mapped(page)) { | ||
| 142 | unmap_mapping_range(mapping, | ||
| 143 | (loff_t)page->index << PAGE_CACHE_SHIFT, | ||
| 144 | PAGE_CACHE_SIZE, 0); | ||
| 145 | } | ||
| 146 | return truncate_complete_page(mapping, page); | ||
| 147 | } | ||
| 148 | |||
| 149 | /* | ||
| 150 | * Used to get rid of pages on hardware memory corruption. | ||
| 151 | */ | ||
| 152 | int generic_error_remove_page(struct address_space *mapping, struct page *page) | ||
| 153 | { | ||
| 154 | if (!mapping) | ||
| 155 | return -EINVAL; | ||
| 156 | /* | ||
| 157 | * Only punch for normal data pages for now. | ||
| 158 | * Handling other types like directories would need more auditing. | ||
| 159 | */ | ||
| 160 | if (!S_ISREG(mapping->host->i_mode)) | ||
| 161 | return -EIO; | ||
| 162 | return truncate_inode_page(mapping, page); | ||
| 163 | } | ||
| 164 | EXPORT_SYMBOL(generic_error_remove_page); | ||
| 165 | |||
| 166 | /* | ||
| 167 | * Safely invalidate one page from its pagecache mapping. | ||
| 168 | * It only drops clean, unused pages. The page must be locked. | ||
| 169 | * | ||
| 170 | * Returns 1 if the page is successfully invalidated, otherwise 0. | ||
| 171 | */ | ||
| 172 | int invalidate_inode_page(struct page *page) | ||
| 173 | { | ||
| 174 | struct address_space *mapping = page_mapping(page); | ||
| 175 | if (!mapping) | ||
| 176 | return 0; | ||
| 177 | if (PageDirty(page) || PageWriteback(page)) | ||
| 178 | return 0; | ||
| 179 | if (page_mapped(page)) | ||
| 180 | return 0; | ||
| 181 | return invalidate_complete_page(mapping, page); | ||
| 182 | } | ||
| 183 | |||
| 138 | /** | 184 | /** |
| 139 | * truncate_inode_pages - truncate range of pages specified by start & end byte offsets | 185 | * truncate_inode_pages - truncate range of pages specified by start & end byte offsets |
| 140 | * @mapping: mapping to truncate | 186 | * @mapping: mapping to truncate |
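The new invalidate_inode_page() above centralizes the safety checks that used to live in its callers: only a page that still belongs to a mapping, is clean, is not under writeback and is not mapped into page tables may be dropped. A compact userspace model of that gating, with struct page reduced to the four flags the checks look at:

#include <stdbool.h>
#include <stdio.h>

struct page {
	bool has_mapping;
	bool dirty;
	bool writeback;
	bool mapped;
};

static bool drop_page(struct page *p)		/* ~ invalidate_complete_page() */
{
	p->has_mapping = false;
	return true;
}

static bool invalidate_page(struct page *p)	/* ~ invalidate_inode_page() */
{
	if (!p->has_mapping)
		return false;
	if (p->dirty || p->writeback)
		return false;			/* would lose data or race with I/O */
	if (p->mapped)
		return false;			/* still visible through page tables */
	return drop_page(p);
}

int main(void)
{
	struct page clean = { .has_mapping = true };
	struct page dirty = { .has_mapping = true, .dirty = true };

	printf("clean page invalidated: %d\n", invalidate_page(&clean));
	printf("dirty page invalidated: %d\n", invalidate_page(&dirty));
	return 0;
}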
| @@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
| 196 | unlock_page(page); | 242 | unlock_page(page); |
| 197 | continue; | 243 | continue; |
| 198 | } | 244 | } |
| 199 | if (page_mapped(page)) { | 245 | truncate_inode_page(mapping, page); |
| 200 | unmap_mapping_range(mapping, | ||
| 201 | (loff_t)page_index<<PAGE_CACHE_SHIFT, | ||
| 202 | PAGE_CACHE_SIZE, 0); | ||
| 203 | } | ||
| 204 | truncate_complete_page(mapping, page); | ||
| 205 | unlock_page(page); | 246 | unlock_page(page); |
| 206 | } | 247 | } |
| 207 | pagevec_release(&pvec); | 248 | pagevec_release(&pvec); |
| @@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
| 238 | break; | 279 | break; |
| 239 | lock_page(page); | 280 | lock_page(page); |
| 240 | wait_on_page_writeback(page); | 281 | wait_on_page_writeback(page); |
| 241 | if (page_mapped(page)) { | 282 | truncate_inode_page(mapping, page); |
| 242 | unmap_mapping_range(mapping, | ||
| 243 | (loff_t)page->index<<PAGE_CACHE_SHIFT, | ||
| 244 | PAGE_CACHE_SIZE, 0); | ||
| 245 | } | ||
| 246 | if (page->index > next) | 283 | if (page->index > next) |
| 247 | next = page->index; | 284 | next = page->index; |
| 248 | next++; | 285 | next++; |
| 249 | truncate_complete_page(mapping, page); | ||
| 250 | unlock_page(page); | 286 | unlock_page(page); |
| 251 | } | 287 | } |
| 252 | pagevec_release(&pvec); | 288 | pagevec_release(&pvec); |
| @@ -267,8 +303,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) | |||
| 267 | } | 303 | } |
| 268 | EXPORT_SYMBOL(truncate_inode_pages); | 304 | EXPORT_SYMBOL(truncate_inode_pages); |
| 269 | 305 | ||
| 270 | unsigned long __invalidate_mapping_pages(struct address_space *mapping, | 306 | /** |
| 271 | pgoff_t start, pgoff_t end, bool be_atomic) | 307 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode |
| 308 | * @mapping: the address_space which holds the pages to invalidate | ||
| 309 | * @start: the offset 'from' which to invalidate | ||
| 310 | * @end: the offset 'to' which to invalidate (inclusive) | ||
| 311 | * | ||
| 312 | * This function only removes the unlocked pages, if you want to | ||
| 313 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
| 314 | * | ||
| 315 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
| 316 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
| 317 | * pagetables. | ||
| 318 | */ | ||
| 319 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
| 320 | pgoff_t start, pgoff_t end) | ||
| 272 | { | 321 | { |
| 273 | struct pagevec pvec; | 322 | struct pagevec pvec; |
| 274 | pgoff_t next = start; | 323 | pgoff_t next = start; |
| @@ -298,41 +347,17 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping, | |||
| 298 | if (lock_failed) | 347 | if (lock_failed) |
| 299 | continue; | 348 | continue; |
| 300 | 349 | ||
| 301 | if (PageDirty(page) || PageWriteback(page)) | 350 | ret += invalidate_inode_page(page); |
| 302 | goto unlock; | 351 | |
| 303 | if (page_mapped(page)) | ||
| 304 | goto unlock; | ||
| 305 | ret += invalidate_complete_page(mapping, page); | ||
| 306 | unlock: | ||
| 307 | unlock_page(page); | 352 | unlock_page(page); |
| 308 | if (next > end) | 353 | if (next > end) |
| 309 | break; | 354 | break; |
| 310 | } | 355 | } |
| 311 | pagevec_release(&pvec); | 356 | pagevec_release(&pvec); |
| 312 | if (likely(!be_atomic)) | 357 | cond_resched(); |
| 313 | cond_resched(); | ||
| 314 | } | 358 | } |
| 315 | return ret; | 359 | return ret; |
| 316 | } | 360 | } |
| 317 | |||
| 318 | /** | ||
| 319 | * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode | ||
| 320 | * @mapping: the address_space which holds the pages to invalidate | ||
| 321 | * @start: the offset 'from' which to invalidate | ||
| 322 | * @end: the offset 'to' which to invalidate (inclusive) | ||
| 323 | * | ||
| 324 | * This function only removes the unlocked pages, if you want to | ||
| 325 | * remove all the pages of one inode, you must call truncate_inode_pages. | ||
| 326 | * | ||
| 327 | * invalidate_mapping_pages() will not block on IO activity. It will not | ||
| 328 | * invalidate pages which are dirty, locked, under writeback or mapped into | ||
| 329 | * pagetables. | ||
| 330 | */ | ||
| 331 | unsigned long invalidate_mapping_pages(struct address_space *mapping, | ||
| 332 | pgoff_t start, pgoff_t end) | ||
| 333 | { | ||
| 334 | return __invalidate_mapping_pages(mapping, start, end, false); | ||
| 335 | } | ||
| 336 | EXPORT_SYMBOL(invalidate_mapping_pages); | 361 | EXPORT_SYMBOL(invalidate_mapping_pages); |
| 337 | 362 | ||
| 338 | /* | 363 | /* |
| @@ -348,7 +373,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
| 348 | if (page->mapping != mapping) | 373 | if (page->mapping != mapping) |
| 349 | return 0; | 374 | return 0; |
| 350 | 375 | ||
| 351 | if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) | 376 | if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL)) |
| 352 | return 0; | 377 | return 0; |
| 353 | 378 | ||
| 354 | spin_lock_irq(&mapping->tree_lock); | 379 | spin_lock_irq(&mapping->tree_lock); |
| @@ -356,9 +381,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) | |||
| 356 | goto failed; | 381 | goto failed; |
| 357 | 382 | ||
| 358 | clear_page_mlock(page); | 383 | clear_page_mlock(page); |
| 359 | BUG_ON(PagePrivate(page)); | 384 | BUG_ON(page_has_private(page)); |
| 360 | __remove_from_page_cache(page); | 385 | __remove_from_page_cache(page); |
| 361 | spin_unlock_irq(&mapping->tree_lock); | 386 | spin_unlock_irq(&mapping->tree_lock); |
| 387 | mem_cgroup_uncharge_cache_page(page); | ||
| 362 | page_cache_release(page); /* pagecache ref */ | 388 | page_cache_release(page); /* pagecache ref */ |
| 363 | return 1; | 389 | return 1; |
| 364 | failed: | 390 | failed: |
| @@ -471,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping) | |||
| 471 | return invalidate_inode_pages2_range(mapping, 0, -1); | 497 | return invalidate_inode_pages2_range(mapping, 0, -1); |
| 472 | } | 498 | } |
| 473 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2); | 499 | EXPORT_SYMBOL_GPL(invalidate_inode_pages2); |
| 500 | |||
| 501 | /** | ||
| 502 | * truncate_pagecache - unmap and remove pagecache that has been truncated | ||
| 503 | * @inode: inode | ||
| 504 | * @old: old file offset | ||
| 505 | * @new: new file offset | ||
| 506 | * | ||
| 507 | * inode's new i_size must already be written before truncate_pagecache | ||
| 508 | * is called. | ||
| 509 | * | ||
| 510 | * This function should typically be called before the filesystem | ||
| 511 | * releases resources associated with the freed range (eg. deallocates | ||
| 512 | * blocks). This way, pagecache will always stay logically coherent | ||
| 513 | * with on-disk format, and the filesystem would not have to deal with | ||
| 514 | * situations such as writepage being called for a page that has already | ||
| 515 | * had its underlying blocks deallocated. | ||
| 516 | */ | ||
| 517 | void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) | ||
| 518 | { | ||
| 519 | if (new < old) { | ||
| 520 | struct address_space *mapping = inode->i_mapping; | ||
| 521 | |||
| 522 | /* | ||
| 523 | * unmap_mapping_range is called twice, first simply for | ||
| 524 | * efficiency so that truncate_inode_pages does fewer | ||
| 525 | * single-page unmaps. However after this first call, and | ||
| 526 | * before truncate_inode_pages finishes, it is possible for | ||
| 527 | * private pages to be COWed, which remain after | ||
| 528 | * truncate_inode_pages finishes, hence the second | ||
| 529 | * unmap_mapping_range call must be made for correctness. | ||
| 530 | */ | ||
| 531 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | ||
| 532 | truncate_inode_pages(mapping, new); | ||
| 533 | unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); | ||
| 534 | } | ||
| 535 | } | ||
| 536 | EXPORT_SYMBOL(truncate_pagecache); | ||
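A hedged sketch of the calling convention the kerneldoc above describes (myfs_setsize() is a made-up filesystem helper; only the ordering is the point): publish the new i_size first, trim the pagecache second, and release on-disk blocks last.

    #include <linux/fs.h>
    #include <linux/mm.h>

    /* Hypothetical truncate path; the caller is assumed to hold i_mutex. */
    static int myfs_setsize(struct inode *inode, loff_t newsize)
    {
            loff_t oldsize = inode->i_size;

            i_size_write(inode, newsize);                /* 1. publish new i_size       */
            truncate_pagecache(inode, oldsize, newsize); /* 2. unmap and drop pagecache */

            /* 3. only now free the on-disk blocks beyond newsize (fs-specific). */
            return 0;
    }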
| 537 | |||
| 538 | /** | ||
| 539 | * vmtruncate - unmap mappings "freed" by truncate() syscall | ||
| 540 | * @inode: inode of the file used | ||
| 541 | * @offset: file offset to start truncating | ||
| 542 | * | ||
| 543 | * NOTE! We have to be ready to update the memory sharing | ||
| 544 | * between the file and the memory map for a potential last | ||
| 545 | * incomplete page. Ugly, but necessary. | ||
| 546 | */ | ||
| 547 | int vmtruncate(struct inode *inode, loff_t offset) | ||
| 548 | { | ||
| 549 | loff_t oldsize; | ||
| 550 | int error; | ||
| 551 | |||
| 552 | error = inode_newsize_ok(inode, offset); | ||
| 553 | if (error) | ||
| 554 | return error; | ||
| 555 | oldsize = inode->i_size; | ||
| 556 | i_size_write(inode, offset); | ||
| 557 | truncate_pagecache(inode, oldsize, offset); | ||
| 558 | if (inode->i_op->truncate) | ||
| 559 | inode->i_op->truncate(inode); | ||
| 560 | |||
| 561 | return error; | ||
| 562 | } | ||
| 563 | EXPORT_SYMBOL(vmtruncate); | ||
| @@ -6,6 +6,9 @@ | |||
| 6 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
| 7 | #include <asm/uaccess.h> | 7 | #include <asm/uaccess.h> |
| 8 | 8 | ||
| 9 | #define CREATE_TRACE_POINTS | ||
| 10 | #include <trace/events/kmem.h> | ||
| 11 | |||
| 9 | /** | 12 | /** |
| 10 | * kstrdup - allocate space for and copy an existing string | 13 | * kstrdup - allocate space for and copy an existing string |
| 11 | * @s: the string to duplicate | 14 | * @s: the string to duplicate |
| @@ -165,6 +168,10 @@ EXPORT_SYMBOL(krealloc); | |||
| 165 | * | 168 | * |
| 166 | * The memory of the object @p points to is zeroed before freed. | 169 | * The memory of the object @p points to is zeroed before freed. |
| 167 | * If @p is %NULL, kzfree() does nothing. | 170 | * If @p is %NULL, kzfree() does nothing. |
| 171 | * | ||
| 172 | * Note: this function zeroes the whole allocated buffer, which can be a good | ||
| 173 | * deal bigger than the requested buffer size passed to kmalloc(). So be | ||
| 174 | * careful when using this function in performance-sensitive code. | ||
| 168 | */ | 175 | */ |
| 169 | void kzfree(const void *p) | 176 | void kzfree(const void *p) |
| 170 | { | 177 | { |
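A hedged example of the note above (names local to the sketch): kzfree() suits key material, but the wipe covers the whole ksize() of the object, not just the bytes originally requested.

    #include <linux/slab.h>
    #include <linux/random.h>

    /* Hypothetical example: wipe key material on free. */
    static void use_and_wipe_key(void)
    {
            u8 *key = kmalloc(32, GFP_KERNEL);

            if (!key)
                    return;
            get_random_bytes(key, 32);
            /* ... use the key ... */
            kzfree(key);    /* zeroes ksize(key) bytes - possibly more than 32 - then frees */
    }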
| @@ -222,6 +229,30 @@ void arch_pick_mmap_layout(struct mm_struct *mm) | |||
| 222 | } | 229 | } |
| 223 | #endif | 230 | #endif |
| 224 | 231 | ||
| 232 | /** | ||
| 233 | * get_user_pages_fast() - pin user pages in memory | ||
| 234 | * @start: starting user address | ||
| 235 | * @nr_pages: number of pages from start to pin | ||
| 236 | * @write: whether pages will be written to | ||
| 237 | * @pages: array that receives pointers to the pages pinned. | ||
| 238 | * Should be at least nr_pages long. | ||
| 239 | * | ||
| 240 | * Returns number of pages pinned. This may be fewer than the number | ||
| 241 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
| 242 | * were pinned, returns -errno. | ||
| 243 | * | ||
| 244 | * get_user_pages_fast provides equivalent functionality to get_user_pages, | ||
| 245 | * operating on current and current->mm, with force=0 and vma=NULL. However | ||
| 246 | * unlike get_user_pages, it must be called without mmap_sem held. | ||
| 247 | * | ||
| 248 | * get_user_pages_fast may take mmap_sem and page table locks, so no | ||
| 249 | * assumptions can be made about lack of locking. get_user_pages_fast is to be | ||
| 250 | * implemented in a way that is advantageous (vs get_user_pages()) when the | ||
| 251 | * user memory area is already faulted in and present in ptes. However if the | ||
| 252 | * pages have to be faulted in, it may turn out to be slightly slower so | ||
| 253 | * callers need to carefully consider what to use. On many architectures, | ||
| 254 | * get_user_pages_fast simply falls back to get_user_pages. | ||
| 255 | */ | ||
| 225 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, | 256 | int __attribute__((weak)) get_user_pages_fast(unsigned long start, |
| 226 | int nr_pages, int write, struct page **pages) | 257 | int nr_pages, int write, struct page **pages) |
| 227 | { | 258 | { |
| @@ -236,3 +267,11 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, | |||
| 236 | return ret; | 267 | return ret; |
| 237 | } | 268 | } |
| 238 | EXPORT_SYMBOL_GPL(get_user_pages_fast); | 269 | EXPORT_SYMBOL_GPL(get_user_pages_fast); |
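A hedged sketch of the pin/use/unpin pattern the kerneldoc above describes (the wrapper is hypothetical); as noted, it must be called without mmap_sem held.

    #include <linux/mm.h>
    #include <linux/errno.h>

    static int with_pinned_user_buffer(unsigned long uaddr, int nr_pages,
                                       struct page **pages)
    {
            int i, pinned;

            pinned = get_user_pages_fast(uaddr, nr_pages, 1 /* write */, pages);
            if (pinned <= 0)
                    return pinned ? pinned : -EFAULT;       /* nothing pinned */

            /* ... access pages[0..pinned-1], e.g. via kmap() ... */

            for (i = 0; i < pinned; i++)
                    put_page(pages[i]);     /* drop the references taken above */
            return pinned;
    }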
| 270 | |||
| 271 | /* Tracepoints definitions. */ | ||
| 272 | EXPORT_TRACEPOINT_SYMBOL(kmalloc); | ||
| 273 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); | ||
| 274 | EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); | ||
| 275 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); | ||
| 276 | EXPORT_TRACEPOINT_SYMBOL(kfree); | ||
| 277 | EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); | ||
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fab19876b4d1..0f551a4a44cd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -12,6 +12,7 @@ | |||
| 12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 14 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
| 15 | #include <linux/sched.h> | ||
| 15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 16 | #include <linux/spinlock.h> | 17 | #include <linux/spinlock.h> |
| 17 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
| @@ -23,12 +24,12 @@ | |||
| 23 | #include <linux/rbtree.h> | 24 | #include <linux/rbtree.h> |
| 24 | #include <linux/radix-tree.h> | 25 | #include <linux/radix-tree.h> |
| 25 | #include <linux/rcupdate.h> | 26 | #include <linux/rcupdate.h> |
| 26 | #include <linux/bootmem.h> | ||
| 27 | #include <linux/pfn.h> | 27 | #include <linux/pfn.h> |
| 28 | 28 | #include <linux/kmemleak.h> | |
| 29 | #include <asm/atomic.h> | 29 | #include <asm/atomic.h> |
| 30 | #include <asm/uaccess.h> | 30 | #include <asm/uaccess.h> |
| 31 | #include <asm/tlbflush.h> | 31 | #include <asm/tlbflush.h> |
| 32 | #include <asm/shmparam.h> | ||
| 32 | 33 | ||
| 33 | 34 | ||
| 34 | /*** Page table manipulation functions ***/ | 35 | /*** Page table manipulation functions ***/ |
| @@ -168,11 +169,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, | |||
| 168 | next = pgd_addr_end(addr, end); | 169 | next = pgd_addr_end(addr, end); |
| 169 | err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); | 170 | err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); |
| 170 | if (err) | 171 | if (err) |
| 171 | break; | 172 | return err; |
| 172 | } while (pgd++, addr = next, addr != end); | 173 | } while (pgd++, addr = next, addr != end); |
| 173 | 174 | ||
| 174 | if (unlikely(err)) | ||
| 175 | return err; | ||
| 176 | return nr; | 175 | return nr; |
| 177 | } | 176 | } |
| 178 | 177 | ||
| @@ -186,7 +185,7 @@ static int vmap_page_range(unsigned long start, unsigned long end, | |||
| 186 | return ret; | 185 | return ret; |
| 187 | } | 186 | } |
| 188 | 187 | ||
| 189 | static inline int is_vmalloc_or_module_addr(const void *x) | 188 | int is_vmalloc_or_module_addr(const void *x) |
| 190 | { | 189 | { |
| 191 | /* | 190 | /* |
| 192 | * ARM, x86-64 and sparc64 put modules in a special place, | 191 | * ARM, x86-64 and sparc64 put modules in a special place, |
| @@ -265,6 +264,7 @@ struct vmap_area { | |||
| 265 | static DEFINE_SPINLOCK(vmap_area_lock); | 264 | static DEFINE_SPINLOCK(vmap_area_lock); |
| 266 | static struct rb_root vmap_area_root = RB_ROOT; | 265 | static struct rb_root vmap_area_root = RB_ROOT; |
| 267 | static LIST_HEAD(vmap_area_list); | 266 | static LIST_HEAD(vmap_area_list); |
| 267 | static unsigned long vmap_area_pcpu_hole; | ||
| 268 | 268 | ||
| 269 | static struct vmap_area *__find_vmap_area(unsigned long addr) | 269 | static struct vmap_area *__find_vmap_area(unsigned long addr) |
| 270 | { | 270 | { |
| @@ -402,6 +402,7 @@ overflow: | |||
| 402 | printk(KERN_WARNING | 402 | printk(KERN_WARNING |
| 403 | "vmap allocation for size %lu failed: " | 403 | "vmap allocation for size %lu failed: " |
| 404 | "use vmalloc=<size> to increase size.\n", size); | 404 | "use vmalloc=<size> to increase size.\n", size); |
| 405 | kfree(va); | ||
| 405 | return ERR_PTR(-EBUSY); | 406 | return ERR_PTR(-EBUSY); |
| 406 | } | 407 | } |
| 407 | 408 | ||
| @@ -430,6 +431,15 @@ static void __free_vmap_area(struct vmap_area *va) | |||
| 430 | RB_CLEAR_NODE(&va->rb_node); | 431 | RB_CLEAR_NODE(&va->rb_node); |
| 431 | list_del_rcu(&va->list); | 432 | list_del_rcu(&va->list); |
| 432 | 433 | ||
| 434 | /* | ||
| 435 | * Track the highest possible candidate for pcpu area | ||
| 436 | * allocation. Areas outside of the vmalloc area can be returned | ||
| 437 | * here too, so consider only end addresses which fall inside | ||
| 438 | * the vmalloc area proper. | ||
| 439 | */ | ||
| 440 | if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) | ||
| 441 | vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); | ||
| 442 | |||
| 433 | call_rcu(&va->rcu_head, rcu_free_va); | 443 | call_rcu(&va->rcu_head, rcu_free_va); |
| 434 | } | 444 | } |
| 435 | 445 | ||
| @@ -1031,12 +1041,15 @@ void __init vmalloc_init(void) | |||
| 1031 | 1041 | ||
| 1032 | /* Import existing vmlist entries. */ | 1042 | /* Import existing vmlist entries. */ |
| 1033 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1043 | for (tmp = vmlist; tmp; tmp = tmp->next) { |
| 1034 | va = alloc_bootmem(sizeof(struct vmap_area)); | 1044 | va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); |
| 1035 | va->flags = tmp->flags | VM_VM_AREA; | 1045 | va->flags = tmp->flags | VM_VM_AREA; |
| 1036 | va->va_start = (unsigned long)tmp->addr; | 1046 | va->va_start = (unsigned long)tmp->addr; |
| 1037 | va->va_end = va->va_start + tmp->size; | 1047 | va->va_end = va->va_start + tmp->size; |
| 1038 | __insert_vmap_area(va); | 1048 | __insert_vmap_area(va); |
| 1039 | } | 1049 | } |
| 1050 | |||
| 1051 | vmap_area_pcpu_hole = VMALLOC_END; | ||
| 1052 | |||
| 1040 | vmap_initialized = true; | 1053 | vmap_initialized = true; |
| 1041 | } | 1054 | } |
| 1042 | 1055 | ||
| @@ -1121,14 +1134,34 @@ EXPORT_SYMBOL_GPL(map_vm_area); | |||
| 1121 | DEFINE_RWLOCK(vmlist_lock); | 1134 | DEFINE_RWLOCK(vmlist_lock); |
| 1122 | struct vm_struct *vmlist; | 1135 | struct vm_struct *vmlist; |
| 1123 | 1136 | ||
| 1137 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | ||
| 1138 | unsigned long flags, void *caller) | ||
| 1139 | { | ||
| 1140 | struct vm_struct *tmp, **p; | ||
| 1141 | |||
| 1142 | vm->flags = flags; | ||
| 1143 | vm->addr = (void *)va->va_start; | ||
| 1144 | vm->size = va->va_end - va->va_start; | ||
| 1145 | vm->caller = caller; | ||
| 1146 | va->private = vm; | ||
| 1147 | va->flags |= VM_VM_AREA; | ||
| 1148 | |||
| 1149 | write_lock(&vmlist_lock); | ||
| 1150 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
| 1151 | if (tmp->addr >= vm->addr) | ||
| 1152 | break; | ||
| 1153 | } | ||
| 1154 | vm->next = *p; | ||
| 1155 | *p = vm; | ||
| 1156 | write_unlock(&vmlist_lock); | ||
| 1157 | } | ||
| 1158 | |||
| 1124 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1159 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
| 1125 | unsigned long flags, unsigned long start, unsigned long end, | 1160 | unsigned long align, unsigned long flags, unsigned long start, |
| 1126 | int node, gfp_t gfp_mask, void *caller) | 1161 | unsigned long end, int node, gfp_t gfp_mask, void *caller) |
| 1127 | { | 1162 | { |
| 1128 | static struct vmap_area *va; | 1163 | static struct vmap_area *va; |
| 1129 | struct vm_struct *area; | 1164 | struct vm_struct *area; |
| 1130 | struct vm_struct *tmp, **p; | ||
| 1131 | unsigned long align = 1; | ||
| 1132 | 1165 | ||
| 1133 | BUG_ON(in_interrupt()); | 1166 | BUG_ON(in_interrupt()); |
| 1134 | if (flags & VM_IOREMAP) { | 1167 | if (flags & VM_IOREMAP) { |
| @@ -1146,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
| 1146 | if (unlikely(!size)) | 1179 | if (unlikely(!size)) |
| 1147 | return NULL; | 1180 | return NULL; |
| 1148 | 1181 | ||
| 1149 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); | 1182 | area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
| 1150 | if (unlikely(!area)) | 1183 | if (unlikely(!area)) |
| 1151 | return NULL; | 1184 | return NULL; |
| 1152 | 1185 | ||
| @@ -1161,32 +1194,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
| 1161 | return NULL; | 1194 | return NULL; |
| 1162 | } | 1195 | } |
| 1163 | 1196 | ||
| 1164 | area->flags = flags; | 1197 | insert_vmalloc_vm(area, va, flags, caller); |
| 1165 | area->addr = (void *)va->va_start; | ||
| 1166 | area->size = size; | ||
| 1167 | area->pages = NULL; | ||
| 1168 | area->nr_pages = 0; | ||
| 1169 | area->phys_addr = 0; | ||
| 1170 | area->caller = caller; | ||
| 1171 | va->private = area; | ||
| 1172 | va->flags |= VM_VM_AREA; | ||
| 1173 | |||
| 1174 | write_lock(&vmlist_lock); | ||
| 1175 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
| 1176 | if (tmp->addr >= area->addr) | ||
| 1177 | break; | ||
| 1178 | } | ||
| 1179 | area->next = *p; | ||
| 1180 | *p = area; | ||
| 1181 | write_unlock(&vmlist_lock); | ||
| 1182 | |||
| 1183 | return area; | 1198 | return area; |
| 1184 | } | 1199 | } |
| 1185 | 1200 | ||
| 1186 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1201 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
| 1187 | unsigned long start, unsigned long end) | 1202 | unsigned long start, unsigned long end) |
| 1188 | { | 1203 | { |
| 1189 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, | 1204 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
| 1190 | __builtin_return_address(0)); | 1205 | __builtin_return_address(0)); |
| 1191 | } | 1206 | } |
| 1192 | EXPORT_SYMBOL_GPL(__get_vm_area); | 1207 | EXPORT_SYMBOL_GPL(__get_vm_area); |
| @@ -1195,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
| 1195 | unsigned long start, unsigned long end, | 1210 | unsigned long start, unsigned long end, |
| 1196 | void *caller) | 1211 | void *caller) |
| 1197 | { | 1212 | { |
| 1198 | return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, | 1213 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
| 1199 | caller); | 1214 | caller); |
| 1200 | } | 1215 | } |
| 1201 | 1216 | ||
| @@ -1210,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | |||
| 1210 | */ | 1225 | */ |
| 1211 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | 1226 | struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) |
| 1212 | { | 1227 | { |
| 1213 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, | 1228 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
| 1214 | -1, GFP_KERNEL, __builtin_return_address(0)); | 1229 | -1, GFP_KERNEL, __builtin_return_address(0)); |
| 1215 | } | 1230 | } |
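A toy, heavily hedged sketch of how a VM_IOREMAP reservation from get_vm_area() is typically paired with a page-table fill; ioremap_page_range() and PAGE_KERNEL_NOCACHE are assumptions here (the latter is x86-specific), and real ioremap() implementations live per-architecture.

    #include <linux/vmalloc.h>
    #include <linux/io.h>

    static void __iomem *toy_ioremap(unsigned long phys, unsigned long size)
    {
            struct vm_struct *area = get_vm_area(size, VM_IOREMAP);
            unsigned long vaddr;

            if (!area)
                    return NULL;
            vaddr = (unsigned long)area->addr;
            if (ioremap_page_range(vaddr, vaddr + size, phys, PAGE_KERNEL_NOCACHE)) {
                    free_vm_area(area);
                    return NULL;
            }
            return (void __iomem *)vaddr;
    }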
| 1216 | 1231 | ||
| 1217 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1232 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
| 1218 | void *caller) | 1233 | void *caller) |
| 1219 | { | 1234 | { |
| 1220 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, | 1235 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
| 1221 | -1, GFP_KERNEL, caller); | 1236 | -1, GFP_KERNEL, caller); |
| 1222 | } | 1237 | } |
| 1223 | 1238 | ||
| 1224 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | 1239 | struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, |
| 1225 | int node, gfp_t gfp_mask) | 1240 | int node, gfp_t gfp_mask) |
| 1226 | { | 1241 | { |
| 1227 | return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, | 1242 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
| 1228 | gfp_mask, __builtin_return_address(0)); | 1243 | node, gfp_mask, __builtin_return_address(0)); |
| 1229 | } | 1244 | } |
| 1230 | 1245 | ||
| 1231 | static struct vm_struct *find_vm_area(const void *addr) | 1246 | static struct vm_struct *find_vm_area(const void *addr) |
| @@ -1255,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
| 1255 | if (va && va->flags & VM_VM_AREA) { | 1270 | if (va && va->flags & VM_VM_AREA) { |
| 1256 | struct vm_struct *vm = va->private; | 1271 | struct vm_struct *vm = va->private; |
| 1257 | struct vm_struct *tmp, **p; | 1272 | struct vm_struct *tmp, **p; |
| 1258 | 1273 | /* | |
| 1259 | vmap_debug_free_range(va->va_start, va->va_end); | 1274 | * remove from list and disallow access to this vm_struct |
| 1260 | free_unmap_vmap_area(va); | 1275 | * before unmap. (address range conflict detection is handled by |
| 1261 | vm->size -= PAGE_SIZE; | 1276 | * vmap.) |
| 1262 | 1277 | */ | |
| 1263 | write_lock(&vmlist_lock); | 1278 | write_lock(&vmlist_lock); |
| 1264 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | 1279 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) |
| 1265 | ; | 1280 | ; |
| 1266 | *p = tmp->next; | 1281 | *p = tmp->next; |
| 1267 | write_unlock(&vmlist_lock); | 1282 | write_unlock(&vmlist_lock); |
| 1268 | 1283 | ||
| 1284 | vmap_debug_free_range(va->va_start, va->va_end); | ||
| 1285 | free_unmap_vmap_area(va); | ||
| 1286 | vm->size -= PAGE_SIZE; | ||
| 1287 | |||
| 1269 | return vm; | 1288 | return vm; |
| 1270 | } | 1289 | } |
| 1271 | return NULL; | 1290 | return NULL; |
| @@ -1326,6 +1345,9 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
| 1326 | void vfree(const void *addr) | 1345 | void vfree(const void *addr) |
| 1327 | { | 1346 | { |
| 1328 | BUG_ON(in_interrupt()); | 1347 | BUG_ON(in_interrupt()); |
| 1348 | |||
| 1349 | kmemleak_free(addr); | ||
| 1350 | |||
| 1329 | __vunmap(addr, 1); | 1351 | __vunmap(addr, 1); |
| 1330 | } | 1352 | } |
| 1331 | EXPORT_SYMBOL(vfree); | 1353 | EXPORT_SYMBOL(vfree); |
| @@ -1364,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count, | |||
| 1364 | 1386 | ||
| 1365 | might_sleep(); | 1387 | might_sleep(); |
| 1366 | 1388 | ||
| 1367 | if (count > num_physpages) | 1389 | if (count > totalram_pages) |
| 1368 | return NULL; | 1390 | return NULL; |
| 1369 | 1391 | ||
| 1370 | area = get_vm_area_caller((count << PAGE_SHIFT), flags, | 1392 | area = get_vm_area_caller((count << PAGE_SHIFT), flags, |
| @@ -1381,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
| 1381 | } | 1403 | } |
| 1382 | EXPORT_SYMBOL(vmap); | 1404 | EXPORT_SYMBOL(vmap); |
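A small hedged sketch of vmap() itself (the helper name is made up): present separately allocated pages as one contiguous kernel virtual range, and tear it down with vunmap().

    #include <linux/vmalloc.h>
    #include <linux/mm.h>

    /* Hypothetical helper; returns NULL if count > totalram_pages, per the
     * check in vmap() above. */
    static void *map_pages_contig(struct page **pages, unsigned int count)
    {
            return vmap(pages, count, VM_MAP, PAGE_KERNEL);
    }

    /* ...and later, when done with the mapping: vunmap(addr); */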
| 1383 | 1405 | ||
| 1384 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 1406 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
| 1407 | gfp_t gfp_mask, pgprot_t prot, | ||
| 1385 | int node, void *caller); | 1408 | int node, void *caller); |
| 1386 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1409 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
| 1387 | pgprot_t prot, int node, void *caller) | 1410 | pgprot_t prot, int node, void *caller) |
| @@ -1395,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
| 1395 | area->nr_pages = nr_pages; | 1418 | area->nr_pages = nr_pages; |
| 1396 | /* Please note that the recursion is strictly bounded. */ | 1419 | /* Please note that the recursion is strictly bounded. */ |
| 1397 | if (array_size > PAGE_SIZE) { | 1420 | if (array_size > PAGE_SIZE) { |
| 1398 | pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, | 1421 | pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, |
| 1399 | PAGE_KERNEL, node, caller); | 1422 | PAGE_KERNEL, node, caller); |
| 1400 | area->flags |= VM_VPAGES; | 1423 | area->flags |= VM_VPAGES; |
| 1401 | } else { | 1424 | } else { |
| @@ -1438,13 +1461,23 @@ fail: | |||
| 1438 | 1461 | ||
| 1439 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | 1462 | void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) |
| 1440 | { | 1463 | { |
| 1441 | return __vmalloc_area_node(area, gfp_mask, prot, -1, | 1464 | void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, |
| 1442 | __builtin_return_address(0)); | 1465 | __builtin_return_address(0)); |
| 1466 | |||
| 1467 | /* | ||
| 1468 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
| 1469 | * structures allocated in the __get_vm_area_node() function contain | ||
| 1470 | * references to the virtual address of the vmalloc'ed block. | ||
| 1471 | */ | ||
| 1472 | kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); | ||
| 1473 | |||
| 1474 | return addr; | ||
| 1443 | } | 1475 | } |
| 1444 | 1476 | ||
| 1445 | /** | 1477 | /** |
| 1446 | * __vmalloc_node - allocate virtually contiguous memory | 1478 | * __vmalloc_node - allocate virtually contiguous memory |
| 1447 | * @size: allocation size | 1479 | * @size: allocation size |
| 1480 | * @align: desired alignment | ||
| 1448 | * @gfp_mask: flags for the page level allocator | 1481 | * @gfp_mask: flags for the page level allocator |
| 1449 | * @prot: protection mask for the allocated pages | 1482 | * @prot: protection mask for the allocated pages |
| 1450 | * @node: node to use for allocation or -1 | 1483 | * @node: node to use for allocation or -1 |
| @@ -1454,27 +1487,39 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) | |||
| 1454 | * allocator with @gfp_mask flags. Map them into contiguous | 1487 | * allocator with @gfp_mask flags. Map them into contiguous |
| 1455 | * kernel virtual space, using a pagetable protection of @prot. | 1488 | * kernel virtual space, using a pagetable protection of @prot. |
| 1456 | */ | 1489 | */ |
| 1457 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | 1490 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
| 1458 | int node, void *caller) | 1491 | gfp_t gfp_mask, pgprot_t prot, |
| 1492 | int node, void *caller) | ||
| 1459 | { | 1493 | { |
| 1460 | struct vm_struct *area; | 1494 | struct vm_struct *area; |
| 1495 | void *addr; | ||
| 1496 | unsigned long real_size = size; | ||
| 1461 | 1497 | ||
| 1462 | size = PAGE_ALIGN(size); | 1498 | size = PAGE_ALIGN(size); |
| 1463 | if (!size || (size >> PAGE_SHIFT) > num_physpages) | 1499 | if (!size || (size >> PAGE_SHIFT) > totalram_pages) |
| 1464 | return NULL; | 1500 | return NULL; |
| 1465 | 1501 | ||
| 1466 | area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, | 1502 | area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, |
| 1467 | node, gfp_mask, caller); | 1503 | VMALLOC_END, node, gfp_mask, caller); |
| 1468 | 1504 | ||
| 1469 | if (!area) | 1505 | if (!area) |
| 1470 | return NULL; | 1506 | return NULL; |
| 1471 | 1507 | ||
| 1472 | return __vmalloc_area_node(area, gfp_mask, prot, node, caller); | 1508 | addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); |
| 1509 | |||
| 1510 | /* | ||
| 1511 | * A ref_count = 3 is needed because the vm_struct and vmap_area | ||
| 1512 | * structures allocated in the __get_vm_area_node() function contain | ||
| 1513 | * references to the virtual address of the vmalloc'ed block. | ||
| 1514 | */ | ||
| 1515 | kmemleak_alloc(addr, real_size, 3, gfp_mask); | ||
| 1516 | |||
| 1517 | return addr; | ||
| 1473 | } | 1518 | } |
| 1474 | 1519 | ||
| 1475 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) | 1520 | void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) |
| 1476 | { | 1521 | { |
| 1477 | return __vmalloc_node(size, gfp_mask, prot, -1, | 1522 | return __vmalloc_node(size, 1, gfp_mask, prot, -1, |
| 1478 | __builtin_return_address(0)); | 1523 | __builtin_return_address(0)); |
| 1479 | } | 1524 | } |
| 1480 | EXPORT_SYMBOL(__vmalloc); | 1525 | EXPORT_SYMBOL(__vmalloc); |
| @@ -1490,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc); | |||
| 1490 | */ | 1535 | */ |
| 1491 | void *vmalloc(unsigned long size) | 1536 | void *vmalloc(unsigned long size) |
| 1492 | { | 1537 | { |
| 1493 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, | 1538 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
| 1494 | -1, __builtin_return_address(0)); | 1539 | -1, __builtin_return_address(0)); |
| 1495 | } | 1540 | } |
| 1496 | EXPORT_SYMBOL(vmalloc); | 1541 | EXPORT_SYMBOL(vmalloc); |
| @@ -1507,7 +1552,8 @@ void *vmalloc_user(unsigned long size) | |||
| 1507 | struct vm_struct *area; | 1552 | struct vm_struct *area; |
| 1508 | void *ret; | 1553 | void *ret; |
| 1509 | 1554 | ||
| 1510 | ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 1555 | ret = __vmalloc_node(size, SHMLBA, |
| 1556 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | ||
| 1511 | PAGE_KERNEL, -1, __builtin_return_address(0)); | 1557 | PAGE_KERNEL, -1, __builtin_return_address(0)); |
| 1512 | if (ret) { | 1558 | if (ret) { |
| 1513 | area = find_vm_area(ret); | 1559 | area = find_vm_area(ret); |
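The SHMLBA alignment added above matters when such a buffer is later mmap()ed to userspace on VIVT-cache architectures. A hedged driver-side sketch (mydev_* names are hypothetical; remap_vmalloc_range() is the existing helper in this file):

    #include <linux/vmalloc.h>
    #include <linux/mm.h>
    #include <linux/fs.h>

    static void *mydev_buf;   /* assumed: mydev_buf = vmalloc_user(buf_size) at probe */

    static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
    {
            /* Allowed because vmalloc_user() marked the area VM_USERMAP. */
            return remap_vmalloc_range(vma, mydev_buf, 0);
    }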
| @@ -1530,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user); | |||
| 1530 | */ | 1576 | */ |
| 1531 | void *vmalloc_node(unsigned long size, int node) | 1577 | void *vmalloc_node(unsigned long size, int node) |
| 1532 | { | 1578 | { |
| 1533 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, | 1579 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, |
| 1534 | node, __builtin_return_address(0)); | 1580 | node, __builtin_return_address(0)); |
| 1535 | } | 1581 | } |
| 1536 | EXPORT_SYMBOL(vmalloc_node); | 1582 | EXPORT_SYMBOL(vmalloc_node); |
| @@ -1553,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node); | |||
| 1553 | 1599 | ||
| 1554 | void *vmalloc_exec(unsigned long size) | 1600 | void *vmalloc_exec(unsigned long size) |
| 1555 | { | 1601 | { |
| 1556 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, | 1602 | return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, |
| 1557 | -1, __builtin_return_address(0)); | 1603 | -1, __builtin_return_address(0)); |
| 1558 | } | 1604 | } |
| 1559 | 1605 | ||
| @@ -1574,7 +1620,7 @@ void *vmalloc_exec(unsigned long size) | |||
| 1574 | */ | 1620 | */ |
| 1575 | void *vmalloc_32(unsigned long size) | 1621 | void *vmalloc_32(unsigned long size) |
| 1576 | { | 1622 | { |
| 1577 | return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, | 1623 | return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, |
| 1578 | -1, __builtin_return_address(0)); | 1624 | -1, __builtin_return_address(0)); |
| 1579 | } | 1625 | } |
| 1580 | EXPORT_SYMBOL(vmalloc_32); | 1626 | EXPORT_SYMBOL(vmalloc_32); |
| @@ -1591,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size) | |||
| 1591 | struct vm_struct *area; | 1637 | struct vm_struct *area; |
| 1592 | void *ret; | 1638 | void *ret; |
| 1593 | 1639 | ||
| 1594 | ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, | 1640 | ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, |
| 1595 | -1, __builtin_return_address(0)); | 1641 | -1, __builtin_return_address(0)); |
| 1596 | if (ret) { | 1642 | if (ret) { |
| 1597 | area = find_vm_area(ret); | 1643 | area = find_vm_area(ret); |
| @@ -1601,10 +1647,120 @@ void *vmalloc_32_user(unsigned long size) | |||
| 1601 | } | 1647 | } |
| 1602 | EXPORT_SYMBOL(vmalloc_32_user); | 1648 | EXPORT_SYMBOL(vmalloc_32_user); |
| 1603 | 1649 | ||
| 1650 | /* | ||
| 1651 | * Small helper routine: copy contents from addr into buf. | ||
| 1652 | * If the page is not present, fill the buffer with zeroes. | ||
| 1653 | */ | ||
| 1654 | |||
| 1655 | static int aligned_vread(char *buf, char *addr, unsigned long count) | ||
| 1656 | { | ||
| 1657 | struct page *p; | ||
| 1658 | int copied = 0; | ||
| 1659 | |||
| 1660 | while (count) { | ||
| 1661 | unsigned long offset, length; | ||
| 1662 | |||
| 1663 | offset = (unsigned long)addr & ~PAGE_MASK; | ||
| 1664 | length = PAGE_SIZE - offset; | ||
| 1665 | if (length > count) | ||
| 1666 | length = count; | ||
| 1667 | p = vmalloc_to_page(addr); | ||
| 1668 | /* | ||
| 1669 | * To do safe access to this _mapped_ area, we need a | ||
| 1670 | * lock. But taking a lock here would add overhead to | ||
| 1671 | * vmalloc()/vfree() calls just for this _debug_ | ||
| 1672 | * interface, which is rarely used. Instead, we use | ||
| 1673 | * kmap() and accept a small overhead in this access function. | ||
| 1674 | */ | ||
| 1675 | if (p) { | ||
| 1676 | /* | ||
| 1677 | * we can expect USER0 is not used (see vread/vwrite's | ||
| 1678 | * function description) | ||
| 1679 | */ | ||
| 1680 | void *map = kmap_atomic(p, KM_USER0); | ||
| 1681 | memcpy(buf, map + offset, length); | ||
| 1682 | kunmap_atomic(map, KM_USER0); | ||
| 1683 | } else | ||
| 1684 | memset(buf, 0, length); | ||
| 1685 | |||
| 1686 | addr += length; | ||
| 1687 | buf += length; | ||
| 1688 | copied += length; | ||
| 1689 | count -= length; | ||
| 1690 | } | ||
| 1691 | return copied; | ||
| 1692 | } | ||
| 1693 | |||
| 1694 | static int aligned_vwrite(char *buf, char *addr, unsigned long count) | ||
| 1695 | { | ||
| 1696 | struct page *p; | ||
| 1697 | int copied = 0; | ||
| 1698 | |||
| 1699 | while (count) { | ||
| 1700 | unsigned long offset, length; | ||
| 1701 | |||
| 1702 | offset = (unsigned long)addr & ~PAGE_MASK; | ||
| 1703 | length = PAGE_SIZE - offset; | ||
| 1704 | if (length > count) | ||
| 1705 | length = count; | ||
| 1706 | p = vmalloc_to_page(addr); | ||
| 1707 | /* | ||
| 1708 | * To do safe access to this _mapped_ area, we need a | ||
| 1709 | * lock. But taking a lock here would add overhead to | ||
| 1710 | * vmalloc()/vfree() calls just for this _debug_ | ||
| 1711 | * interface, which is rarely used. Instead, we use | ||
| 1712 | * kmap() and accept a small overhead in this access function. | ||
| 1713 | */ | ||
| 1714 | if (p) { | ||
| 1715 | /* | ||
| 1716 | * we can expect USER0 is not used (see vread/vwrite's | ||
| 1717 | * function description) | ||
| 1718 | */ | ||
| 1719 | void *map = kmap_atomic(p, KM_USER0); | ||
| 1720 | memcpy(map + offset, buf, length); | ||
| 1721 | kunmap_atomic(map, KM_USER0); | ||
| 1722 | } | ||
| 1723 | addr += length; | ||
| 1724 | buf += length; | ||
| 1725 | copied += length; | ||
| 1726 | count -= length; | ||
| 1727 | } | ||
| 1728 | return copied; | ||
| 1729 | } | ||
| 1730 | |||
| 1731 | /** | ||
| 1732 | * vread() - read vmalloc area in a safe way. | ||
| 1733 | * @buf: buffer for reading data | ||
| 1734 | * @addr: vm address. | ||
| 1735 | * @count: number of bytes to be read. | ||
| 1736 | * | ||
| 1737 | * Returns the number of bytes by which addr and buf should be | ||
| 1738 | * increased (same as @count). Returns 0 if [addr...addr+count) does | ||
| 1739 | * not intersect any live vmalloc area. | ||
| 1740 | * | ||
| 1741 | * This function checks that addr is a valid vmalloc'ed area and | ||
| 1742 | * copies data from that area to the given buffer. If the memory range | ||
| 1743 | * [addr...addr+count) includes some valid addresses, data is copied to | ||
| 1744 | * the proper area of @buf. If there are memory holes, they'll be zero-filled. | ||
| 1745 | * An IOREMAP area is treated as a memory hole and no copy is done. | ||
| 1746 | * | ||
| 1747 | * If [addr...addr+count) does not intersect any live | ||
| 1748 | * vm_struct area, this returns 0. | ||
| 1749 | * @buf must be a kernel buffer. Because this function uses KM_USER0, | ||
| 1750 | * the caller must guarantee that KM_USER0 is not otherwise in use. | ||
| 1751 | * | ||
| 1752 | * Note: In usual operation, vread() is never necessary because the caller | ||
| 1753 | * knows the vmalloc() area is valid and can use memcpy(). | ||
| 1754 | * This is for routines which have to access the vmalloc area without | ||
| 1755 | * any information, such as /dev/kmem. | ||
| 1756 | * | ||
| 1757 | */ | ||
| 1758 | |||
| 1604 | long vread(char *buf, char *addr, unsigned long count) | 1759 | long vread(char *buf, char *addr, unsigned long count) |
| 1605 | { | 1760 | { |
| 1606 | struct vm_struct *tmp; | 1761 | struct vm_struct *tmp; |
| 1607 | char *vaddr, *buf_start = buf; | 1762 | char *vaddr, *buf_start = buf; |
| 1763 | unsigned long buflen = count; | ||
| 1608 | unsigned long n; | 1764 | unsigned long n; |
| 1609 | 1765 | ||
| 1610 | /* Don't allow overflow */ | 1766 | /* Don't allow overflow */ |
| @@ -1612,7 +1768,7 @@ long vread(char *buf, char *addr, unsigned long count) | |||
| 1612 | count = -(unsigned long) addr; | 1768 | count = -(unsigned long) addr; |
| 1613 | 1769 | ||
| 1614 | read_lock(&vmlist_lock); | 1770 | read_lock(&vmlist_lock); |
| 1615 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1771 | for (tmp = vmlist; count && tmp; tmp = tmp->next) { |
| 1616 | vaddr = (char *) tmp->addr; | 1772 | vaddr = (char *) tmp->addr; |
| 1617 | if (addr >= vaddr + tmp->size - PAGE_SIZE) | 1773 | if (addr >= vaddr + tmp->size - PAGE_SIZE) |
| 1618 | continue; | 1774 | continue; |
| @@ -1625,32 +1781,72 @@ long vread(char *buf, char *addr, unsigned long count) | |||
| 1625 | count--; | 1781 | count--; |
| 1626 | } | 1782 | } |
| 1627 | n = vaddr + tmp->size - PAGE_SIZE - addr; | 1783 | n = vaddr + tmp->size - PAGE_SIZE - addr; |
| 1628 | do { | 1784 | if (n > count) |
| 1629 | if (count == 0) | 1785 | n = count; |
| 1630 | goto finished; | 1786 | if (!(tmp->flags & VM_IOREMAP)) |
| 1631 | *buf = *addr; | 1787 | aligned_vread(buf, addr, n); |
| 1632 | buf++; | 1788 | else /* IOREMAP area is treated as memory hole */ |
| 1633 | addr++; | 1789 | memset(buf, 0, n); |
| 1634 | count--; | 1790 | buf += n; |
| 1635 | } while (--n > 0); | 1791 | addr += n; |
| 1792 | count -= n; | ||
| 1636 | } | 1793 | } |
| 1637 | finished: | 1794 | finished: |
| 1638 | read_unlock(&vmlist_lock); | 1795 | read_unlock(&vmlist_lock); |
| 1639 | return buf - buf_start; | 1796 | |
| 1797 | if (buf == buf_start) | ||
| 1798 | return 0; | ||
| 1799 | /* zero-fill memory holes */ | ||
| 1800 | if (buf != buf_start + buflen) | ||
| 1801 | memset(buf, 0, buflen - (buf - buf_start)); | ||
| 1802 | |||
| 1803 | return buflen; | ||
| 1640 | } | 1804 | } |
| 1641 | 1805 | ||
| 1806 | /** | ||
| 1807 | * vwrite() - write vmalloc area in a safe way. | ||
| 1808 | * @buf: buffer for source data | ||
| 1809 | * @addr: vm address. | ||
| 1810 | * @count: number of bytes to be written. | ||
| 1811 | * | ||
| 1812 | * Returns the number of bytes by which addr and buf should be | ||
| 1813 | * increased (same as @count). | ||
| 1814 | * If [addr...addr+count) does not intersect any valid | ||
| 1815 | * vmalloc area, this returns 0. | ||
| 1816 | * | ||
| 1817 | * This function checks that addr is a valid vmalloc'ed area and | ||
| 1818 | * copies data from the buffer to the given addr. If the specified range | ||
| 1819 | * [addr...addr+count) includes some valid addresses, data is copied from | ||
| 1820 | * the proper area of @buf. Memory holes are not written to. | ||
| 1821 | * An IOREMAP area is treated as a memory hole and no copy is done. | ||
| 1822 | * | ||
| 1823 | * If [addr...addr+count) does not intersect any live | ||
| 1824 | * vm_struct area, this returns 0. | ||
| 1825 | * @buf must be a kernel buffer. Because this function uses KM_USER0, | ||
| 1826 | * the caller must guarantee that KM_USER0 is not otherwise in use. | ||
| 1827 | * | ||
| 1828 | * Note: In usual operation, vwrite() is never necessary because the caller | ||
| 1829 | * knows the vmalloc() area is valid and can use memcpy(). | ||
| 1830 | * This is for routines which have to access the vmalloc area without | ||
| 1831 | * any information, such as /dev/kmem. | ||
| 1832 | * | ||
| 1833 | * The caller should guarantee KM_USER1 is not used. | ||
| 1834 | */ | ||
| 1835 | |||
| 1642 | long vwrite(char *buf, char *addr, unsigned long count) | 1836 | long vwrite(char *buf, char *addr, unsigned long count) |
| 1643 | { | 1837 | { |
| 1644 | struct vm_struct *tmp; | 1838 | struct vm_struct *tmp; |
| 1645 | char *vaddr, *buf_start = buf; | 1839 | char *vaddr; |
| 1646 | unsigned long n; | 1840 | unsigned long n, buflen; |
| 1841 | int copied = 0; | ||
| 1647 | 1842 | ||
| 1648 | /* Don't allow overflow */ | 1843 | /* Don't allow overflow */ |
| 1649 | if ((unsigned long) addr + count < count) | 1844 | if ((unsigned long) addr + count < count) |
| 1650 | count = -(unsigned long) addr; | 1845 | count = -(unsigned long) addr; |
| 1846 | buflen = count; | ||
| 1651 | 1847 | ||
| 1652 | read_lock(&vmlist_lock); | 1848 | read_lock(&vmlist_lock); |
| 1653 | for (tmp = vmlist; tmp; tmp = tmp->next) { | 1849 | for (tmp = vmlist; count && tmp; tmp = tmp->next) { |
| 1654 | vaddr = (char *) tmp->addr; | 1850 | vaddr = (char *) tmp->addr; |
| 1655 | if (addr >= vaddr + tmp->size - PAGE_SIZE) | 1851 | if (addr >= vaddr + tmp->size - PAGE_SIZE) |
| 1656 | continue; | 1852 | continue; |
| @@ -1662,18 +1858,21 @@ long vwrite(char *buf, char *addr, unsigned long count) | |||
| 1662 | count--; | 1858 | count--; |
| 1663 | } | 1859 | } |
| 1664 | n = vaddr + tmp->size - PAGE_SIZE - addr; | 1860 | n = vaddr + tmp->size - PAGE_SIZE - addr; |
| 1665 | do { | 1861 | if (n > count) |
| 1666 | if (count == 0) | 1862 | n = count; |
| 1667 | goto finished; | 1863 | if (!(tmp->flags & VM_IOREMAP)) { |
| 1668 | *addr = *buf; | 1864 | aligned_vwrite(buf, addr, n); |
| 1669 | buf++; | 1865 | copied++; |
| 1670 | addr++; | 1866 | } |
| 1671 | count--; | 1867 | buf += n; |
| 1672 | } while (--n > 0); | 1868 | addr += n; |
| 1869 | count -= n; | ||
| 1673 | } | 1870 | } |
| 1674 | finished: | 1871 | finished: |
| 1675 | read_unlock(&vmlist_lock); | 1872 | read_unlock(&vmlist_lock); |
| 1676 | return buf - buf_start; | 1873 | if (!copied) |
| 1874 | return 0; | ||
| 1875 | return buflen; | ||
| 1677 | } | 1876 | } |
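A hedged sketch of a /dev/kmem-style consumer of vread() (the wrapper and its error convention are hypothetical): per the kerneldoc, the caller either gets back @count with holes zero-filled, or 0 when no live area intersects the range.

    #include <linux/vmalloc.h>
    #include <linux/errno.h>

    static long dump_vmalloc_range(char *kbuf, char *vmalloc_addr,
                                   unsigned long count)
    {
            long n = vread(kbuf, vmalloc_addr, count);

            if (!n)                 /* no live vmalloc area intersects the range */
                    return -ENXIO;
            return n;               /* == count; holes were zero-filled in kbuf  */
    }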
| 1678 | 1877 | ||
| 1679 | /** | 1878 | /** |
| @@ -1794,6 +1993,286 @@ void free_vm_area(struct vm_struct *area) | |||
| 1794 | } | 1993 | } |
| 1795 | EXPORT_SYMBOL_GPL(free_vm_area); | 1994 | EXPORT_SYMBOL_GPL(free_vm_area); |
| 1796 | 1995 | ||
| 1996 | static struct vmap_area *node_to_va(struct rb_node *n) | ||
| 1997 | { | ||
| 1998 | return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; | ||
| 1999 | } | ||
| 2000 | |||
| 2001 | /** | ||
| 2002 | * pvm_find_next_prev - find the next and prev vmap_area surrounding @end | ||
| 2003 | * @end: target address | ||
| 2004 | * @pnext: out arg for the next vmap_area | ||
| 2005 | * @pprev: out arg for the previous vmap_area | ||
| 2006 | * | ||
| 2007 | * Returns: %true if either or both of next and prev are found, | ||
| 2008 | * %false if no vmap_area exists | ||
| 2009 | * | ||
| 2010 | * Find the vmap_areas whose end addresses enclose @end, i.e. if not | ||
| 2011 | * NULL, (*pnext)->va_end > @end and (*pprev)->va_end <= @end. | ||
| 2012 | */ | ||
| 2013 | static bool pvm_find_next_prev(unsigned long end, | ||
| 2014 | struct vmap_area **pnext, | ||
| 2015 | struct vmap_area **pprev) | ||
| 2016 | { | ||
| 2017 | struct rb_node *n = vmap_area_root.rb_node; | ||
| 2018 | struct vmap_area *va = NULL; | ||
| 2019 | |||
| 2020 | while (n) { | ||
| 2021 | va = rb_entry(n, struct vmap_area, rb_node); | ||
| 2022 | if (end < va->va_end) | ||
| 2023 | n = n->rb_left; | ||
| 2024 | else if (end > va->va_end) | ||
| 2025 | n = n->rb_right; | ||
| 2026 | else | ||
| 2027 | break; | ||
| 2028 | } | ||
| 2029 | |||
| 2030 | if (!va) | ||
| 2031 | return false; | ||
| 2032 | |||
| 2033 | if (va->va_end > end) { | ||
| 2034 | *pnext = va; | ||
| 2035 | *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); | ||
| 2036 | } else { | ||
| 2037 | *pprev = va; | ||
| 2038 | *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); | ||
| 2039 | } | ||
| 2040 | return true; | ||
| 2041 | } | ||
| 2042 | |||
| 2043 | /** | ||
| 2044 | * pvm_determine_end - find the highest aligned address between two vmap_areas | ||
| 2045 | * @pnext: in/out arg for the next vmap_area | ||
| 2046 | * @pprev: in/out arg for the previous vmap_area | ||
| 2047 | * @align: alignment | ||
| 2048 | * | ||
| 2049 | * Returns: determined end address | ||
| 2050 | * | ||
| 2051 | * Find the highest aligned address between *@pnext and *@pprev below | ||
| 2052 | * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned | ||
| 2053 | * down address is between the end addresses of the two vmap_areas. | ||
| 2054 | * | ||
| 2055 | * Please note that the address returned by this function may fall | ||
| 2056 | * inside *@pnext vmap_area. The caller is responsible for checking | ||
| 2057 | * that. | ||
| 2058 | */ | ||
| 2059 | static unsigned long pvm_determine_end(struct vmap_area **pnext, | ||
| 2060 | struct vmap_area **pprev, | ||
| 2061 | unsigned long align) | ||
| 2062 | { | ||
| 2063 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); | ||
| 2064 | unsigned long addr; | ||
| 2065 | |||
| 2066 | if (*pnext) | ||
| 2067 | addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); | ||
| 2068 | else | ||
| 2069 | addr = vmalloc_end; | ||
| 2070 | |||
| 2071 | while (*pprev && (*pprev)->va_end > addr) { | ||
| 2072 | *pnext = *pprev; | ||
| 2073 | *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); | ||
| 2074 | } | ||
| 2075 | |||
| 2076 | return addr; | ||
| 2077 | } | ||
| 2078 | |||
| 2079 | /** | ||
| 2080 | * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator | ||
| 2081 | * @offsets: array containing offset of each area | ||
| 2082 | * @sizes: array containing size of each area | ||
| 2083 | * @nr_vms: the number of areas to allocate | ||
| 2084 | * @align: alignment, all entries in @offsets and @sizes must be aligned to this | ||
| 2085 | * @gfp_mask: allocation mask | ||
| 2086 | * | ||
| 2087 | * Returns: kmalloc'd vm_struct pointer array pointing to allocated | ||
| 2088 | * vm_structs on success, %NULL on failure | ||
| 2089 | * | ||
| 2090 | * Percpu allocator wants to use congruent vm areas so that it can | ||
| 2091 | * maintain the offsets among percpu areas. This function allocates | ||
| 2092 | * congruent vmalloc areas for it. These areas tend to be scattered | ||
| 2093 | * pretty far apart, with the distance between two areas easily | ||
| 2094 | * reaching gigabytes. To avoid interacting with regular vmallocs, | ||
| 2095 | * these areas are allocated from the top. | ||
| 2096 | * | ||
| 2097 | * Despite its complicated look, this allocator is rather simple. It | ||
| 2098 | * does everything top-down and scans areas from the end looking for | ||
| 2099 | * a matching slot. While scanning, if any of the areas overlaps with an | ||
| 2100 | * existing vmap_area, the base address is pulled down to fit the | ||
| 2101 | * area. Scanning is repeated until all the areas fit, and then all | ||
| 2102 | * necessary data structures are inserted and the result is returned. | ||
| 2103 | */ | ||
| 2104 | struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, | ||
| 2105 | const size_t *sizes, int nr_vms, | ||
| 2106 | size_t align, gfp_t gfp_mask) | ||
| 2107 | { | ||
| 2108 | const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); | ||
| 2109 | const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); | ||
| 2110 | struct vmap_area **vas, *prev, *next; | ||
| 2111 | struct vm_struct **vms; | ||
| 2112 | int area, area2, last_area, term_area; | ||
| 2113 | unsigned long base, start, end, last_end; | ||
| 2114 | bool purged = false; | ||
| 2115 | |||
| 2116 | gfp_mask &= GFP_RECLAIM_MASK; | ||
| 2117 | |||
| 2118 | /* verify parameters and allocate data structures */ | ||
| 2119 | BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); | ||
| 2120 | for (last_area = 0, area = 0; area < nr_vms; area++) { | ||
| 2121 | start = offsets[area]; | ||
| 2122 | end = start + sizes[area]; | ||
| 2123 | |||
| 2124 | /* is everything aligned properly? */ | ||
| 2125 | BUG_ON(!IS_ALIGNED(offsets[area], align)); | ||
| 2126 | BUG_ON(!IS_ALIGNED(sizes[area], align)); | ||
| 2127 | |||
| 2128 | /* detect the area with the highest address */ | ||
| 2129 | if (start > offsets[last_area]) | ||
| 2130 | last_area = area; | ||
| 2131 | |||
| 2132 | for (area2 = 0; area2 < nr_vms; area2++) { | ||
| 2133 | unsigned long start2 = offsets[area2]; | ||
| 2134 | unsigned long end2 = start2 + sizes[area2]; | ||
| 2135 | |||
| 2136 | if (area2 == area) | ||
| 2137 | continue; | ||
| 2138 | |||
| 2139 | BUG_ON(start2 >= start && start2 < end); | ||
| 2140 | BUG_ON(end2 <= end && end2 > start); | ||
| 2141 | } | ||
| 2142 | } | ||
| 2143 | last_end = offsets[last_area] + sizes[last_area]; | ||
| 2144 | |||
| 2145 | if (vmalloc_end - vmalloc_start < last_end) { | ||
| 2146 | WARN_ON(true); | ||
| 2147 | return NULL; | ||
| 2148 | } | ||
| 2149 | |||
| 2150 | vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); | ||
| 2151 | vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); | ||
| 2152 | if (!vas || !vms) | ||
| 2153 | goto err_free; | ||
| 2154 | |||
| 2155 | for (area = 0; area < nr_vms; area++) { | ||
| 2156 | vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); | ||
| 2157 | vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); | ||
| 2158 | if (!vas[area] || !vms[area]) | ||
| 2159 | goto err_free; | ||
| 2160 | } | ||
| 2161 | retry: | ||
| 2162 | spin_lock(&vmap_area_lock); | ||
| 2163 | |||
| 2164 | /* start scanning - we scan from the top, begin with the last area */ | ||
| 2165 | area = term_area = last_area; | ||
| 2166 | start = offsets[area]; | ||
| 2167 | end = start + sizes[area]; | ||
| 2168 | |||
| 2169 | if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { | ||
| 2170 | base = vmalloc_end - last_end; | ||
| 2171 | goto found; | ||
| 2172 | } | ||
| 2173 | base = pvm_determine_end(&next, &prev, align) - end; | ||
| 2174 | |||
| 2175 | while (true) { | ||
| 2176 | BUG_ON(next && next->va_end <= base + end); | ||
| 2177 | BUG_ON(prev && prev->va_end > base + end); | ||
| 2178 | |||
| 2179 | /* | ||
| 2180 | * base might have underflowed, add last_end before | ||
| 2181 | * comparing. | ||
| 2182 | */ | ||
| 2183 | if (base + last_end < vmalloc_start + last_end) { | ||
| 2184 | spin_unlock(&vmap_area_lock); | ||
| 2185 | if (!purged) { | ||
| 2186 | purge_vmap_area_lazy(); | ||
| 2187 | purged = true; | ||
| 2188 | goto retry; | ||
| 2189 | } | ||
| 2190 | goto err_free; | ||
| 2191 | } | ||
| 2192 | |||
| 2193 | /* | ||
| 2194 | * If next overlaps, move base downwards so that it's | ||
| 2195 | * right below next and then recheck. | ||
| 2196 | */ | ||
| 2197 | if (next && next->va_start < base + end) { | ||
| 2198 | base = pvm_determine_end(&next, &prev, align) - end; | ||
| 2199 | term_area = area; | ||
| 2200 | continue; | ||
| 2201 | } | ||
| 2202 | |||
| 2203 | /* | ||
| 2204 | * If prev overlaps, shift down next and prev and move | ||
| 2205 | * base so that it's right below new next and then | ||
| 2206 | * recheck. | ||
| 2207 | */ | ||
| 2208 | if (prev && prev->va_end > base + start) { | ||
| 2209 | next = prev; | ||
| 2210 | prev = node_to_va(rb_prev(&next->rb_node)); | ||
| 2211 | base = pvm_determine_end(&next, &prev, align) - end; | ||
| 2212 | term_area = area; | ||
| 2213 | continue; | ||
| 2214 | } | ||
| 2215 | |||
| 2216 | /* | ||
| 2217 | * This area fits, move on to the previous one. If | ||
| 2218 | * the previous one is the terminal one, we're done. | ||
| 2219 | */ | ||
| 2220 | area = (area + nr_vms - 1) % nr_vms; | ||
| 2221 | if (area == term_area) | ||
| 2222 | break; | ||
| 2223 | start = offsets[area]; | ||
| 2224 | end = start + sizes[area]; | ||
| 2225 | pvm_find_next_prev(base + end, &next, &prev); | ||
| 2226 | } | ||
| 2227 | found: | ||
| 2228 | /* we've found a fitting base, insert all va's */ | ||
| 2229 | for (area = 0; area < nr_vms; area++) { | ||
| 2230 | struct vmap_area *va = vas[area]; | ||
| 2231 | |||
| 2232 | va->va_start = base + offsets[area]; | ||
| 2233 | va->va_end = va->va_start + sizes[area]; | ||
| 2234 | __insert_vmap_area(va); | ||
| 2235 | } | ||
| 2236 | |||
| 2237 | vmap_area_pcpu_hole = base + offsets[last_area]; | ||
| 2238 | |||
| 2239 | spin_unlock(&vmap_area_lock); | ||
| 2240 | |||
| 2241 | /* insert all vm's */ | ||
| 2242 | for (area = 0; area < nr_vms; area++) | ||
| 2243 | insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, | ||
| 2244 | pcpu_get_vm_areas); | ||
| 2245 | |||
| 2246 | kfree(vas); | ||
| 2247 | return vms; | ||
| 2248 | |||
| 2249 | err_free: | ||
| 2250 | for (area = 0; area < nr_vms; area++) { | ||
| 2251 | if (vas) | ||
| 2252 | kfree(vas[area]); | ||
| 2253 | if (vms) | ||
| 2254 | kfree(vms[area]); | ||
| 2255 | } | ||
| 2256 | kfree(vas); | ||
| 2257 | kfree(vms); | ||
| 2258 | return NULL; | ||
| 2259 | } | ||
| 2260 | |||
| 2261 | /** | ||
| 2262 | * pcpu_free_vm_areas - free vmalloc areas for percpu allocator | ||
| 2263 | * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() | ||
| 2264 | * @nr_vms: the number of allocated areas | ||
| 2265 | * | ||
| 2266 | * Free vm_structs and the array allocated by pcpu_get_vm_areas(). | ||
| 2267 | */ | ||
| 2268 | void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) | ||
| 2269 | { | ||
| 2270 | int i; | ||
| 2271 | |||
| 2272 | for (i = 0; i < nr_vms; i++) | ||
| 2273 | free_vm_area(vms[i]); | ||
| 2274 | kfree(vms); | ||
| 2275 | } | ||
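A hedged, purely illustrative call of the pair above (the sizes and offsets are invented; the real caller is the percpu first-chunk setup code): reserve two congruent areas whose bases sit 1 MB apart, each 64 KB, with everything PAGE_SIZE-aligned as the BUG_ON checks require.

    #include <linux/vmalloc.h>
    #include <linux/gfp.h>

    static struct vm_struct **reserve_two_groups(void)
    {
            static const unsigned long offsets[] = { 0, 1UL << 20 };
            static const size_t sizes[]          = { 64 << 10, 64 << 10 };

            /* align must be a page-aligned power of two; offsets and sizes
             * must be multiples of it, and the areas must not overlap. */
            return pcpu_get_vm_areas(offsets, sizes, 2, PAGE_SIZE, GFP_KERNEL);
    }

    /* ...teardown: pcpu_free_vm_areas(vms, 2); frees both areas and the array. */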
| 1797 | 2276 | ||
| 1798 | #ifdef CONFIG_PROC_FS | 2277 | #ifdef CONFIG_PROC_FS |
| 1799 | static void *s_start(struct seq_file *m, loff_t *pos) | 2278 | static void *s_start(struct seq_file *m, loff_t *pos) |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 06e72693b458..777af57fd8c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -63,6 +63,9 @@ struct scan_control { | |||
| 63 | /* Can mapped pages be reclaimed? */ | 63 | /* Can mapped pages be reclaimed? */ |
| 64 | int may_unmap; | 64 | int may_unmap; |
| 65 | 65 | ||
| 66 | /* Can pages be swapped as part of reclaim? */ | ||
| 67 | int may_swap; | ||
| 68 | |||
| 66 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for | 69 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for |
| 67 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. | 70 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. |
| 68 | * In this context, it doesn't matter that we scan the | 71 | * In this context, it doesn't matter that we scan the |
| @@ -145,8 +148,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, | |||
| 145 | return &zone->reclaim_stat; | 148 | return &zone->reclaim_stat; |
| 146 | } | 149 | } |
| 147 | 150 | ||
| 148 | static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, | 151 | static unsigned long zone_nr_lru_pages(struct zone *zone, |
| 149 | enum lru_list lru) | 152 | struct scan_control *sc, enum lru_list lru) |
| 150 | { | 153 | { |
| 151 | if (!scanning_global_lru(sc)) | 154 | if (!scanning_global_lru(sc)) |
| 152 | return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); | 155 | return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); |
| @@ -283,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page) | |||
| 283 | 286 | ||
| 284 | static inline int is_page_cache_freeable(struct page *page) | 287 | static inline int is_page_cache_freeable(struct page *page) |
| 285 | { | 288 | { |
| 286 | return page_count(page) - !!PagePrivate(page) == 2; | 289 | /* |
| 290 | * A freeable page cache page is referenced only by the caller | ||
| 291 | * that isolated the page, the page cache radix tree and | ||
| 292 | * optional buffer heads at page->private. | ||
| 293 | */ | ||
| 294 | return page_count(page) - page_has_private(page) == 2; | ||
| 287 | } | 295 | } |
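A hedged worked example of the reference accounting in that comment (illustration only, not additional kernel code):

    /*
     * Clean pagecache page with buffer heads, just isolated by vmscan:
     *
     *   page_count()       == 3   (isolating caller + radix tree + buffer heads)
     *   page_has_private() == 1   (PG_private set for the buffer heads)
     *
     *   3 - 1 == 2  ->  freeable.  Any extra get_page() by a concurrent user
     *   breaks the equality and the page is kept instead of being reclaimed.
     */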
| 288 | 296 | ||
| 289 | static int may_write_to_queue(struct backing_dev_info *bdi) | 297 | static int may_write_to_queue(struct backing_dev_info *bdi) |
| @@ -358,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 358 | * block, for some throttling. This happens by accident, because | 366 | * block, for some throttling. This happens by accident, because |
| 359 | * swap_backing_dev_info is bust: it doesn't reflect the | 367 | * swap_backing_dev_info is bust: it doesn't reflect the |
| 360 | * congestion state of the swapdevs. Easy to fix, if needed. | 368 | * congestion state of the swapdevs. Easy to fix, if needed. |
| 361 | * See swapfile.c:page_queue_congested(). | ||
| 362 | */ | 369 | */ |
| 363 | if (!is_page_cache_freeable(page)) | 370 | if (!is_page_cache_freeable(page)) |
| 364 | return PAGE_KEEP; | 371 | return PAGE_KEEP; |
| @@ -367,7 +374,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, | |||
| 367 | * Some data journaling orphaned pages can have | 374 | * Some data journaling orphaned pages can have |
| 368 | * page->mapping == NULL while being dirty with clean buffers. | 375 | * page->mapping == NULL while being dirty with clean buffers. |
| 369 | */ | 376 | */ |
| 370 | if (PagePrivate(page)) { | 377 | if (page_has_private(page)) { |
| 371 | if (try_to_free_buffers(page)) { | 378 | if (try_to_free_buffers(page)) { |
| 372 | ClearPageDirty(page); | 379 | ClearPageDirty(page); |
| 373 | printk("%s: orphaned page\n", __func__); | 380 | printk("%s: orphaned page\n", __func__); |
| @@ -467,10 +474,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) | |||
| 467 | swp_entry_t swap = { .val = page_private(page) }; | 474 | swp_entry_t swap = { .val = page_private(page) }; |
| 468 | __delete_from_swap_cache(page); | 475 | __delete_from_swap_cache(page); |
| 469 | spin_unlock_irq(&mapping->tree_lock); | 476 | spin_unlock_irq(&mapping->tree_lock); |
| 470 | swap_free(swap); | 477 | swapcache_free(swap, page); |
| 471 | } else { | 478 | } else { |
| 472 | __remove_from_page_cache(page); | 479 | __remove_from_page_cache(page); |
| 473 | spin_unlock_irq(&mapping->tree_lock); | 480 | spin_unlock_irq(&mapping->tree_lock); |
| 481 | mem_cgroup_uncharge_cache_page(page); | ||
| 474 | } | 482 | } |
| 475 | 483 | ||
| 476 | return 1; | 484 | return 1; |
| @@ -509,7 +517,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) | |||
| 509 | * | 517 | * |
| 510 | * lru_lock must not be held, interrupts must be enabled. | 518 | * lru_lock must not be held, interrupts must be enabled. |
| 511 | */ | 519 | */ |
| 512 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 513 | void putback_lru_page(struct page *page) | 520 | void putback_lru_page(struct page *page) |
| 514 | { | 521 | { |
| 515 | int lru; | 522 | int lru; |
| @@ -528,7 +535,7 @@ redo: | |||
| 528 | * unevictable page on [in]active list. | 535 | * unevictable page on [in]active list. |
| 529 | * We know how to handle that. | 536 | * We know how to handle that. |
| 530 | */ | 537 | */ |
| 531 | lru = active + page_is_file_cache(page); | 538 | lru = active + page_lru_base_type(page); |
| 532 | lru_cache_add_lru(page, lru); | 539 | lru_cache_add_lru(page, lru); |
| 533 | } else { | 540 | } else { |
| 534 | /* | 541 | /* |
| @@ -537,6 +544,16 @@ redo: | |||
| 537 | */ | 544 | */ |
| 538 | lru = LRU_UNEVICTABLE; | 545 | lru = LRU_UNEVICTABLE; |
| 539 | add_page_to_unevictable_list(page); | 546 | add_page_to_unevictable_list(page); |
| 547 | /* | ||
| 548 | * When racing with an mlock clearing (page is | ||
| 549 | * unlocked), make sure that if the other thread does | ||
| 550 | * not observe our setting of PG_lru and fails | ||
| 551 | * isolation, we see PG_mlocked cleared below and move | ||
| 552 | * the page back to the evictable list. | ||
| 553 | * | ||
| 554 | * The other side is TestClearPageMlocked(). | ||
| 555 | */ | ||
| 556 | smp_mb(); | ||
| 540 | } | 557 | } |
| 541 | 558 | ||
| 542 | /* | 559 | /* |
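The smp_mb() added to putback_lru_page() pairs with TestClearPageMlocked() on the munlock side: each thread publishes its own update, issues a full barrier, then re-checks the other thread's state, so at least one of them notices the race and rescues the page. A hedged C11 sketch of that store/fence/load pattern; the two flags are stand-ins for the real page bits, not the kernel implementation.

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool on_unevictable_list;  /* stand-in for the LRU placement */
    static atomic_bool page_mlocked;         /* stand-in for PG_mlocked        */

    static void putback_side(void)
    {
        atomic_store(&on_unevictable_list, true);
        atomic_thread_fence(memory_order_seq_cst);   /* the new smp_mb() */
        if (!atomic_load(&page_mlocked)) {
            /* mlock was cleared concurrently: move the page back to
               the evictable list ourselves */
        }
    }

    static void munlock_side(void)
    {
        atomic_store(&page_mlocked, false);
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load(&on_unevictable_list)) {
            /* the putback already ran: isolate and rescue the page */
        }
    }

    int main(void)
    {
        putback_side();
        munlock_side();
        return 0;
    }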
| @@ -563,20 +580,6 @@ redo: | |||
| 563 | put_page(page); /* drop ref from isolate */ | 580 | put_page(page); /* drop ref from isolate */ |
| 564 | } | 581 | } |
| 565 | 582 | ||
| 566 | #else /* CONFIG_UNEVICTABLE_LRU */ | ||
| 567 | |||
| 568 | void putback_lru_page(struct page *page) | ||
| 569 | { | ||
| 570 | int lru; | ||
| 571 | VM_BUG_ON(PageLRU(page)); | ||
| 572 | |||
| 573 | lru = !!TestClearPageActive(page) + page_is_file_cache(page); | ||
| 574 | lru_cache_add_lru(page, lru); | ||
| 575 | put_page(page); | ||
| 576 | } | ||
| 577 | #endif /* CONFIG_UNEVICTABLE_LRU */ | ||
| 578 | |||
| 579 | |||
| 580 | /* | 583 | /* |
| 581 | * shrink_page_list() returns the number of reclaimed pages | 584 | * shrink_page_list() returns the number of reclaimed pages |
| 582 | */ | 585 | */ |
| @@ -588,6 +591,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 588 | struct pagevec freed_pvec; | 591 | struct pagevec freed_pvec; |
| 589 | int pgactivate = 0; | 592 | int pgactivate = 0; |
| 590 | unsigned long nr_reclaimed = 0; | 593 | unsigned long nr_reclaimed = 0; |
| 594 | unsigned long vm_flags; | ||
| 591 | 595 | ||
| 592 | cond_resched(); | 596 | cond_resched(); |
| 593 | 597 | ||
| @@ -638,10 +642,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 638 | goto keep_locked; | 642 | goto keep_locked; |
| 639 | } | 643 | } |
| 640 | 644 | ||
| 641 | referenced = page_referenced(page, 1, sc->mem_cgroup); | 645 | referenced = page_referenced(page, 1, |
| 642 | /* In active use or really unfreeable? Activate it. */ | 646 | sc->mem_cgroup, &vm_flags); |
| 647 | /* | ||
| 648 | * In active use or really unfreeable? Activate it. | ||
| 649 | * If page which have PG_mlocked lost isoltation race, | ||
| 650 | * try_to_unmap moves it to unevictable list | ||
| 651 | */ | ||
| 643 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | 652 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && |
| 644 | referenced && page_mapping_inuse(page)) | 653 | referenced && page_mapping_inuse(page) |
| 654 | && !(vm_flags & VM_LOCKED)) | ||
| 645 | goto activate_locked; | 655 | goto activate_locked; |
| 646 | 656 | ||
| 647 | /* | 657 | /* |
| @@ -663,7 +673,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 663 | * processes. Try to unmap it here. | 673 | * processes. Try to unmap it here. |
| 664 | */ | 674 | */ |
| 665 | if (page_mapped(page) && mapping) { | 675 | if (page_mapped(page) && mapping) { |
| 666 | switch (try_to_unmap(page, 0)) { | 676 | switch (try_to_unmap(page, TTU_UNMAP)) { |
| 667 | case SWAP_FAIL: | 677 | case SWAP_FAIL: |
| 668 | goto activate_locked; | 678 | goto activate_locked; |
| 669 | case SWAP_AGAIN: | 679 | case SWAP_AGAIN: |
| @@ -727,7 +737,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
| 727 | * process address space (page_count == 1) it can be freed. | 737 | * process address space (page_count == 1) it can be freed. |
| 728 | * Otherwise, leave the page on the LRU so it is swappable. | 738 | * Otherwise, leave the page on the LRU so it is swappable. |
| 729 | */ | 739 | */ |
| 730 | if (PagePrivate(page)) { | 740 | if (page_has_private(page)) { |
| 731 | if (!try_to_release_page(page, sc->gfp_mask)) | 741 | if (!try_to_release_page(page, sc->gfp_mask)) |
| 732 | goto activate_locked; | 742 | goto activate_locked; |
| 733 | if (!mapping && page_count(page) == 1) { | 743 | if (!mapping && page_count(page) == 1) { |
| @@ -825,7 +835,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
| 825 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) | 835 | if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) |
| 826 | return ret; | 836 | return ret; |
| 827 | 837 | ||
| 828 | if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) | 838 | if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) |
| 829 | return ret; | 839 | return ret; |
| 830 | 840 | ||
| 831 | /* | 841 | /* |
| @@ -846,7 +856,6 @@ int __isolate_lru_page(struct page *page, int mode, int file) | |||
| 846 | */ | 856 | */ |
| 847 | ClearPageLRU(page); | 857 | ClearPageLRU(page); |
| 848 | ret = 0; | 858 | ret = 0; |
| 849 | mem_cgroup_del_lru(page); | ||
| 850 | } | 859 | } |
| 851 | 860 | ||
| 852 | return ret; | 861 | return ret; |
| @@ -894,12 +903,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 894 | switch (__isolate_lru_page(page, mode, file)) { | 903 | switch (__isolate_lru_page(page, mode, file)) { |
| 895 | case 0: | 904 | case 0: |
| 896 | list_move(&page->lru, dst); | 905 | list_move(&page->lru, dst); |
| 906 | mem_cgroup_del_lru(page); | ||
| 897 | nr_taken++; | 907 | nr_taken++; |
| 898 | break; | 908 | break; |
| 899 | 909 | ||
| 900 | case -EBUSY: | 910 | case -EBUSY: |
| 901 | /* else it is being freed elsewhere */ | 911 | /* else it is being freed elsewhere */ |
| 902 | list_move(&page->lru, src); | 912 | list_move(&page->lru, src); |
| 913 | mem_cgroup_rotate_lru_list(page, page_lru(page)); | ||
| 903 | continue; | 914 | continue; |
| 904 | 915 | ||
| 905 | default: | 916 | default: |
| @@ -938,18 +949,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
| 938 | /* Check that we have not crossed a zone boundary. */ | 949 | /* Check that we have not crossed a zone boundary. */ |
| 939 | if (unlikely(page_zone_id(cursor_page) != zone_id)) | 950 | if (unlikely(page_zone_id(cursor_page) != zone_id)) |
| 940 | continue; | 951 | continue; |
| 941 | switch (__isolate_lru_page(cursor_page, mode, file)) { | 952 | |
| 942 | case 0: | 953 | /* |
| 954 | * If we don't have enough swap space, reclaiming anon | ||
| 955 | * pages which don't already have a swap slot is | ||
| 956 | * pointless. | ||
| 957 | */ | ||
| 958 | if (nr_swap_pages <= 0 && PageAnon(cursor_page) && | ||
| 959 | !PageSwapCache(cursor_page)) | ||
| 960 | continue; | ||
| 961 | |||
| 962 | if (__isolate_lru_page(cursor_page, mode, file) == 0) { | ||
| 943 | list_move(&cursor_page->lru, dst); | 963 | list_move(&cursor_page->lru, dst); |
| 964 | mem_cgroup_del_lru(cursor_page); | ||
| 944 | nr_taken++; | 965 | nr_taken++; |
| 945 | scan++; | 966 | scan++; |
| 946 | break; | ||
| 947 | |||
| 948 | case -EBUSY: | ||
| 949 | /* else it is being freed elsewhere */ | ||
| 950 | list_move(&cursor_page->lru, src); | ||
| 951 | default: | ||
| 952 | break; /* ! on LRU or wrong list */ | ||
| 953 | } | 967 | } |
| 954 | } | 968 | } |
| 955 | } | 969 | } |
| @@ -971,7 +985,7 @@ static unsigned long isolate_pages_global(unsigned long nr, | |||
| 971 | if (file) | 985 | if (file) |
| 972 | lru += LRU_FILE; | 986 | lru += LRU_FILE; |
| 973 | return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, | 987 | return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, |
| 974 | mode, !!file); | 988 | mode, file); |
| 975 | } | 989 | } |
| 976 | 990 | ||
| 977 | /* | 991 | /* |
| @@ -986,7 +1000,7 @@ static unsigned long clear_active_flags(struct list_head *page_list, | |||
| 986 | struct page *page; | 1000 | struct page *page; |
| 987 | 1001 | ||
| 988 | list_for_each_entry(page, page_list, lru) { | 1002 | list_for_each_entry(page, page_list, lru) { |
| 989 | lru = page_is_file_cache(page); | 1003 | lru = page_lru_base_type(page); |
| 990 | if (PageActive(page)) { | 1004 | if (PageActive(page)) { |
| 991 | lru += LRU_ACTIVE; | 1005 | lru += LRU_ACTIVE; |
| 992 | ClearPageActive(page); | 1006 | ClearPageActive(page); |
| @@ -1044,6 +1058,31 @@ int isolate_lru_page(struct page *page) | |||
| 1044 | } | 1058 | } |
| 1045 | 1059 | ||
| 1046 | /* | 1060 | /* |
| 1061 | * Are there way too many processes in the direct reclaim path already? | ||
| 1062 | */ | ||
| 1063 | static int too_many_isolated(struct zone *zone, int file, | ||
| 1064 | struct scan_control *sc) | ||
| 1065 | { | ||
| 1066 | unsigned long inactive, isolated; | ||
| 1067 | |||
| 1068 | if (current_is_kswapd()) | ||
| 1069 | return 0; | ||
| 1070 | |||
| 1071 | if (!scanning_global_lru(sc)) | ||
| 1072 | return 0; | ||
| 1073 | |||
| 1074 | if (file) { | ||
| 1075 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
| 1076 | isolated = zone_page_state(zone, NR_ISOLATED_FILE); | ||
| 1077 | } else { | ||
| 1078 | inactive = zone_page_state(zone, NR_INACTIVE_ANON); | ||
| 1079 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); | ||
| 1080 | } | ||
| 1081 | |||
| 1082 | return isolated > inactive; | ||
| 1083 | } | ||
| 1084 | |||
| 1085 | /* | ||
| 1047 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number | 1086 | * shrink_inactive_list() is a helper for shrink_zone(). It returns the number |
| 1048 | * of reclaimed pages | 1087 | * of reclaimed pages |
| 1049 | */ | 1088 | */ |
| @@ -1056,6 +1095,27 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1056 | unsigned long nr_scanned = 0; | 1095 | unsigned long nr_scanned = 0; |
| 1057 | unsigned long nr_reclaimed = 0; | 1096 | unsigned long nr_reclaimed = 0; |
| 1058 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1097 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1098 | int lumpy_reclaim = 0; | ||
| 1099 | |||
| 1100 | while (unlikely(too_many_isolated(zone, file, sc))) { | ||
| 1101 | congestion_wait(BLK_RW_ASYNC, HZ/10); | ||
| 1102 | |||
| 1103 | /* We are about to die and free our memory. Return now. */ | ||
| 1104 | if (fatal_signal_pending(current)) | ||
| 1105 | return SWAP_CLUSTER_MAX; | ||
| 1106 | } | ||
| 1107 | |||
| 1108 | /* | ||
| 1109 | * If we need a large contiguous chunk of memory, or have | ||
| 1110 | * trouble getting a small set of contiguous pages, we | ||
| 1111 | * will reclaim both active and inactive pages. | ||
| 1112 | * | ||
| 1113 | * We use the same threshold as pageout congestion_wait below. | ||
| 1114 | */ | ||
| 1115 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 1116 | lumpy_reclaim = 1; | ||
| 1117 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
| 1118 | lumpy_reclaim = 1; | ||
| 1059 | 1119 | ||
| 1060 | pagevec_init(&pvec, 1); | 1120 | pagevec_init(&pvec, 1); |
| 1061 | 1121 | ||
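shrink_inactive_list() now spins on that check with congestion_wait() (bailing out on a fatal signal), and hoists the lumpy-reclaim decision out of the scan loop: costly-order allocations, or smaller orders once the priority has dropped, isolate from both the active and inactive lists. A sketch of the mode decision; the two constants are the usual kernel values, assumed here.

    #define PAGE_ALLOC_COSTLY_ORDER 3
    #define DEF_PRIORITY            12

    /* 1 = isolate from both LRU lists (ISOLATE_BOTH), 0 = inactive only */
    int want_lumpy_reclaim(int order, int priority)
    {
        if (order > PAGE_ALLOC_COSTLY_ORDER)
            return 1;
        if (order && priority < DEF_PRIORITY - 2)
            return 1;
        return 0;
    }

Computing the mode once per call, rather than once per batch as before, does not change behaviour; it just makes the later "did lumpy reclaim stall?" check reuse the same flag.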
| @@ -1068,23 +1128,27 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1068 | unsigned long nr_freed; | 1128 | unsigned long nr_freed; |
| 1069 | unsigned long nr_active; | 1129 | unsigned long nr_active; |
| 1070 | unsigned int count[NR_LRU_LISTS] = { 0, }; | 1130 | unsigned int count[NR_LRU_LISTS] = { 0, }; |
| 1071 | int mode = ISOLATE_INACTIVE; | 1131 | int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; |
| 1072 | 1132 | unsigned long nr_anon; | |
| 1073 | /* | 1133 | unsigned long nr_file; |
| 1074 | * If we need a large contiguous chunk of memory, or have | ||
| 1075 | * trouble getting a small set of contiguous pages, we | ||
| 1076 | * will reclaim both active and inactive pages. | ||
| 1077 | * | ||
| 1078 | * We use the same threshold as pageout congestion_wait below. | ||
| 1079 | */ | ||
| 1080 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
| 1081 | mode = ISOLATE_BOTH; | ||
| 1082 | else if (sc->order && priority < DEF_PRIORITY - 2) | ||
| 1083 | mode = ISOLATE_BOTH; | ||
| 1084 | 1134 | ||
| 1085 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, | 1135 | nr_taken = sc->isolate_pages(sc->swap_cluster_max, |
| 1086 | &page_list, &nr_scan, sc->order, mode, | 1136 | &page_list, &nr_scan, sc->order, mode, |
| 1087 | zone, sc->mem_cgroup, 0, file); | 1137 | zone, sc->mem_cgroup, 0, file); |
| 1138 | |||
| 1139 | if (scanning_global_lru(sc)) { | ||
| 1140 | zone->pages_scanned += nr_scan; | ||
| 1141 | if (current_is_kswapd()) | ||
| 1142 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, | ||
| 1143 | nr_scan); | ||
| 1144 | else | ||
| 1145 | __count_zone_vm_events(PGSCAN_DIRECT, zone, | ||
| 1146 | nr_scan); | ||
| 1147 | } | ||
| 1148 | |||
| 1149 | if (nr_taken == 0) | ||
| 1150 | goto done; | ||
| 1151 | |||
| 1088 | nr_active = clear_active_flags(&page_list, count); | 1152 | nr_active = clear_active_flags(&page_list, count); |
| 1089 | __count_vm_events(PGDEACTIVATE, nr_active); | 1153 | __count_vm_events(PGDEACTIVATE, nr_active); |
| 1090 | 1154 | ||
| @@ -1097,8 +1161,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1097 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, | 1161 | __mod_zone_page_state(zone, NR_INACTIVE_ANON, |
| 1098 | -count[LRU_INACTIVE_ANON]); | 1162 | -count[LRU_INACTIVE_ANON]); |
| 1099 | 1163 | ||
| 1100 | if (scanning_global_lru(sc)) | 1164 | nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; |
| 1101 | zone->pages_scanned += nr_scan; | 1165 | nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; |
| 1166 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); | ||
| 1167 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); | ||
| 1102 | 1168 | ||
| 1103 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; | 1169 | reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; |
| 1104 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; | 1170 | reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; |
| @@ -1117,8 +1183,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1117 | * but that should be acceptable to the caller | 1183 | * but that should be acceptable to the caller |
| 1118 | */ | 1184 | */ |
| 1119 | if (nr_freed < nr_taken && !current_is_kswapd() && | 1185 | if (nr_freed < nr_taken && !current_is_kswapd() && |
| 1120 | sc->order > PAGE_ALLOC_COSTLY_ORDER) { | 1186 | lumpy_reclaim) { |
| 1121 | congestion_wait(WRITE, HZ/10); | 1187 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 1122 | 1188 | ||
| 1123 | /* | 1189 | /* |
| 1124 | * The attempt at page out may have made some | 1190 | * The attempt at page out may have made some |
| @@ -1132,18 +1198,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1132 | } | 1198 | } |
| 1133 | 1199 | ||
| 1134 | nr_reclaimed += nr_freed; | 1200 | nr_reclaimed += nr_freed; |
| 1201 | |||
| 1135 | local_irq_disable(); | 1202 | local_irq_disable(); |
| 1136 | if (current_is_kswapd()) { | 1203 | if (current_is_kswapd()) |
| 1137 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); | ||
| 1138 | __count_vm_events(KSWAPD_STEAL, nr_freed); | 1204 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
| 1139 | } else if (scanning_global_lru(sc)) | ||
| 1140 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); | ||
| 1141 | |||
| 1142 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); | 1205 | __count_zone_vm_events(PGSTEAL, zone, nr_freed); |
| 1143 | 1206 | ||
| 1144 | if (nr_taken == 0) | ||
| 1145 | goto done; | ||
| 1146 | |||
| 1147 | spin_lock(&zone->lru_lock); | 1207 | spin_lock(&zone->lru_lock); |
| 1148 | /* | 1208 | /* |
| 1149 | * Put back any unfreeable pages. | 1209 | * Put back any unfreeable pages. |
| @@ -1162,8 +1222,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1162 | SetPageLRU(page); | 1222 | SetPageLRU(page); |
| 1163 | lru = page_lru(page); | 1223 | lru = page_lru(page); |
| 1164 | add_page_to_lru_list(zone, page, lru); | 1224 | add_page_to_lru_list(zone, page, lru); |
| 1165 | if (PageActive(page)) { | 1225 | if (is_active_lru(lru)) { |
| 1166 | int file = !!page_is_file_cache(page); | 1226 | int file = is_file_lru(lru); |
| 1167 | reclaim_stat->recent_rotated[file]++; | 1227 | reclaim_stat->recent_rotated[file]++; |
| 1168 | } | 1228 | } |
| 1169 | if (!pagevec_add(&pvec, page)) { | 1229 | if (!pagevec_add(&pvec, page)) { |
| @@ -1172,10 +1232,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
| 1172 | spin_lock_irq(&zone->lru_lock); | 1232 | spin_lock_irq(&zone->lru_lock); |
| 1173 | } | 1233 | } |
| 1174 | } | 1234 | } |
| 1235 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); | ||
| 1236 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); | ||
| 1237 | |||
| 1175 | } while (nr_scanned < max_scan); | 1238 | } while (nr_scanned < max_scan); |
| 1176 | spin_unlock(&zone->lru_lock); | 1239 | |
| 1177 | done: | 1240 | done: |
| 1178 | local_irq_enable(); | 1241 | spin_unlock_irq(&zone->lru_lock); |
| 1179 | pagevec_release(&pvec); | 1242 | pagevec_release(&pvec); |
| 1180 | return nr_reclaimed; | 1243 | return nr_reclaimed; |
| 1181 | } | 1244 | } |
| @@ -1212,23 +1275,55 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) | |||
| 1212 | * But we had to alter page->flags anyway. | 1275 | * But we had to alter page->flags anyway. |
| 1213 | */ | 1276 | */ |
| 1214 | 1277 | ||
| 1278 | static void move_active_pages_to_lru(struct zone *zone, | ||
| 1279 | struct list_head *list, | ||
| 1280 | enum lru_list lru) | ||
| 1281 | { | ||
| 1282 | unsigned long pgmoved = 0; | ||
| 1283 | struct pagevec pvec; | ||
| 1284 | struct page *page; | ||
| 1285 | |||
| 1286 | pagevec_init(&pvec, 1); | ||
| 1287 | |||
| 1288 | while (!list_empty(list)) { | ||
| 1289 | page = lru_to_page(list); | ||
| 1290 | |||
| 1291 | VM_BUG_ON(PageLRU(page)); | ||
| 1292 | SetPageLRU(page); | ||
| 1293 | |||
| 1294 | list_move(&page->lru, &zone->lru[lru].list); | ||
| 1295 | mem_cgroup_add_lru_list(page, lru); | ||
| 1296 | pgmoved++; | ||
| 1297 | |||
| 1298 | if (!pagevec_add(&pvec, page) || list_empty(list)) { | ||
| 1299 | spin_unlock_irq(&zone->lru_lock); | ||
| 1300 | if (buffer_heads_over_limit) | ||
| 1301 | pagevec_strip(&pvec); | ||
| 1302 | __pagevec_release(&pvec); | ||
| 1303 | spin_lock_irq(&zone->lru_lock); | ||
| 1304 | } | ||
| 1305 | } | ||
| 1306 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
| 1307 | if (!is_active_lru(lru)) | ||
| 1308 | __count_vm_events(PGDEACTIVATE, pgmoved); | ||
| 1309 | } | ||
| 1215 | 1310 | ||
| 1216 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | 1311 | static void shrink_active_list(unsigned long nr_pages, struct zone *zone, |
| 1217 | struct scan_control *sc, int priority, int file) | 1312 | struct scan_control *sc, int priority, int file) |
| 1218 | { | 1313 | { |
| 1219 | unsigned long pgmoved; | 1314 | unsigned long nr_taken; |
| 1220 | int pgdeactivate = 0; | ||
| 1221 | unsigned long pgscanned; | 1315 | unsigned long pgscanned; |
| 1316 | unsigned long vm_flags; | ||
| 1222 | LIST_HEAD(l_hold); /* The pages which were snipped off */ | 1317 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
| 1318 | LIST_HEAD(l_active); | ||
| 1223 | LIST_HEAD(l_inactive); | 1319 | LIST_HEAD(l_inactive); |
| 1224 | struct page *page; | 1320 | struct page *page; |
| 1225 | struct pagevec pvec; | ||
| 1226 | enum lru_list lru; | ||
| 1227 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1321 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1322 | unsigned long nr_rotated = 0; | ||
| 1228 | 1323 | ||
| 1229 | lru_add_drain(); | 1324 | lru_add_drain(); |
| 1230 | spin_lock_irq(&zone->lru_lock); | 1325 | spin_lock_irq(&zone->lru_lock); |
| 1231 | pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, | 1326 | nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, |
| 1232 | ISOLATE_ACTIVE, zone, | 1327 | ISOLATE_ACTIVE, zone, |
| 1233 | sc->mem_cgroup, 1, file); | 1328 | sc->mem_cgroup, 1, file); |
| 1234 | /* | 1329 | /* |
| @@ -1238,15 +1333,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1238 | if (scanning_global_lru(sc)) { | 1333 | if (scanning_global_lru(sc)) { |
| 1239 | zone->pages_scanned += pgscanned; | 1334 | zone->pages_scanned += pgscanned; |
| 1240 | } | 1335 | } |
| 1241 | reclaim_stat->recent_scanned[!!file] += pgmoved; | 1336 | reclaim_stat->recent_scanned[file] += nr_taken; |
| 1242 | 1337 | ||
| 1338 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
| 1243 | if (file) | 1339 | if (file) |
| 1244 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); | 1340 | __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); |
| 1245 | else | 1341 | else |
| 1246 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); | 1342 | __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); |
| 1343 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); | ||
| 1247 | spin_unlock_irq(&zone->lru_lock); | 1344 | spin_unlock_irq(&zone->lru_lock); |
| 1248 | 1345 | ||
| 1249 | pgmoved = 0; | ||
| 1250 | while (!list_empty(&l_hold)) { | 1346 | while (!list_empty(&l_hold)) { |
| 1251 | cond_resched(); | 1347 | cond_resched(); |
| 1252 | page = lru_to_page(&l_hold); | 1348 | page = lru_to_page(&l_hold); |
| @@ -1259,58 +1355,45 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 1259 | 1355 | ||
| 1260 | /* page_referenced clears PageReferenced */ | 1356 | /* page_referenced clears PageReferenced */ |
| 1261 | if (page_mapping_inuse(page) && | 1357 | if (page_mapping_inuse(page) && |
| 1262 | page_referenced(page, 0, sc->mem_cgroup)) | 1358 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
| 1263 | pgmoved++; | 1359 | nr_rotated++; |
| 1360 | /* | ||
| 1361 | * Identify referenced, file-backed active pages and | ||
| 1362 | * give them one more trip around the active list, so | ||
| 1363 | * that executable code gets a better chance to stay in | ||
| 1364 | * memory under moderate memory pressure. Anon pages | ||
| 1365 | * are not likely to be evicted by use-once streaming | ||
| 1366 | * IO, plus JVM can create lots of anon VM_EXEC pages, | ||
| 1367 | * so we ignore them here. | ||
| 1368 | */ | ||
| 1369 | if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { | ||
| 1370 | list_add(&page->lru, &l_active); | ||
| 1371 | continue; | ||
| 1372 | } | ||
| 1373 | } | ||
| 1264 | 1374 | ||
| 1375 | ClearPageActive(page); /* we are de-activating */ | ||
| 1265 | list_add(&page->lru, &l_inactive); | 1376 | list_add(&page->lru, &l_inactive); |
| 1266 | } | 1377 | } |
| 1267 | 1378 | ||
| 1268 | /* | 1379 | /* |
| 1269 | * Move the pages to the [file or anon] inactive list. | 1380 | * Move pages back to the lru list. |
| 1270 | */ | 1381 | */ |
| 1271 | pagevec_init(&pvec, 1); | ||
| 1272 | lru = LRU_BASE + file * LRU_FILE; | ||
| 1273 | |||
| 1274 | spin_lock_irq(&zone->lru_lock); | 1382 | spin_lock_irq(&zone->lru_lock); |
| 1275 | /* | 1383 | /* |
| 1276 | * Count referenced pages from currently used mappings as | 1384 | * Count referenced pages from currently used mappings as rotated, |
| 1277 | * rotated, even though they are moved to the inactive list. | 1385 | * even though only some of them are actually re-activated. This |
| 1278 | * This helps balance scan pressure between file and anonymous | 1386 | * helps balance scan pressure between file and anonymous pages in |
| 1279 | * pages in get_scan_ratio. | 1387 | * get_scan_ratio. |
| 1280 | */ | 1388 | */ |
| 1281 | reclaim_stat->recent_rotated[!!file] += pgmoved; | 1389 | reclaim_stat->recent_rotated[file] += nr_rotated; |
| 1282 | 1390 | ||
| 1283 | pgmoved = 0; | 1391 | move_active_pages_to_lru(zone, &l_active, |
| 1284 | while (!list_empty(&l_inactive)) { | 1392 | LRU_ACTIVE + file * LRU_FILE); |
| 1285 | page = lru_to_page(&l_inactive); | 1393 | move_active_pages_to_lru(zone, &l_inactive, |
| 1286 | prefetchw_prev_lru_page(page, &l_inactive, flags); | 1394 | LRU_BASE + file * LRU_FILE); |
| 1287 | VM_BUG_ON(PageLRU(page)); | 1395 | __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); |
| 1288 | SetPageLRU(page); | ||
| 1289 | VM_BUG_ON(!PageActive(page)); | ||
| 1290 | ClearPageActive(page); | ||
| 1291 | |||
| 1292 | list_move(&page->lru, &zone->lru[lru].list); | ||
| 1293 | mem_cgroup_add_lru_list(page, lru); | ||
| 1294 | pgmoved++; | ||
| 1295 | if (!pagevec_add(&pvec, page)) { | ||
| 1296 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
| 1297 | spin_unlock_irq(&zone->lru_lock); | ||
| 1298 | pgdeactivate += pgmoved; | ||
| 1299 | pgmoved = 0; | ||
| 1300 | if (buffer_heads_over_limit) | ||
| 1301 | pagevec_strip(&pvec); | ||
| 1302 | __pagevec_release(&pvec); | ||
| 1303 | spin_lock_irq(&zone->lru_lock); | ||
| 1304 | } | ||
| 1305 | } | ||
| 1306 | __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); | ||
| 1307 | pgdeactivate += pgmoved; | ||
| 1308 | __count_zone_vm_events(PGREFILL, zone, pgscanned); | ||
| 1309 | __count_vm_events(PGDEACTIVATE, pgdeactivate); | ||
| 1310 | spin_unlock_irq(&zone->lru_lock); | 1396 | spin_unlock_irq(&zone->lru_lock); |
| 1311 | if (buffer_heads_over_limit) | ||
| 1312 | pagevec_strip(&pvec); | ||
| 1313 | pagevec_release(&pvec); | ||
| 1314 | } | 1397 | } |
| 1315 | 1398 | ||
| 1316 | static int inactive_anon_is_low_global(struct zone *zone) | 1399 | static int inactive_anon_is_low_global(struct zone *zone) |
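The rewritten loop in shrink_active_list() now sorts pages three ways instead of two: referenced, executable, file-backed pages keep their place on the active list, everything else is deactivated, and rotation is still counted so get_scan_ratio() sees the pressure. A small sketch of that triage; the VM_EXEC bit value here is illustrative.

    #include <stdbool.h>

    #define VM_EXEC 0x4   /* illustrative flag bit */

    enum destination { KEEP_ACTIVE, MOVE_INACTIVE };

    /* only referenced, executable, file-backed pages stay active */
    enum destination triage(bool referenced, bool file_backed,
                            unsigned long vm_flags)
    {
        if (referenced && (vm_flags & VM_EXEC) && file_backed)
            return KEEP_ACTIVE;
        return MOVE_INACTIVE;
    }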
| @@ -1345,12 +1428,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) | |||
| 1345 | return low; | 1428 | return low; |
| 1346 | } | 1429 | } |
| 1347 | 1430 | ||
| 1431 | static int inactive_file_is_low_global(struct zone *zone) | ||
| 1432 | { | ||
| 1433 | unsigned long active, inactive; | ||
| 1434 | |||
| 1435 | active = zone_page_state(zone, NR_ACTIVE_FILE); | ||
| 1436 | inactive = zone_page_state(zone, NR_INACTIVE_FILE); | ||
| 1437 | |||
| 1438 | return (active > inactive); | ||
| 1439 | } | ||
| 1440 | |||
| 1441 | /** | ||
| 1442 | * inactive_file_is_low - check if file pages need to be deactivated | ||
| 1443 | * @zone: zone to check | ||
| 1444 | * @sc: scan control of this context | ||
| 1445 | * | ||
| 1446 | * When the system is doing streaming IO, memory pressure here | ||
| 1447 | * ensures that active file pages get deactivated, until more | ||
| 1448 | * than half of the file pages are on the inactive list. | ||
| 1449 | * | ||
| 1450 | * Once we get to that situation, protect the system's working | ||
| 1451 | * set from being evicted by disabling active file page aging. | ||
| 1452 | * | ||
| 1453 | * This uses a different ratio than the anonymous pages, because | ||
| 1454 | * the page cache uses a use-once replacement algorithm. | ||
| 1455 | */ | ||
| 1456 | static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) | ||
| 1457 | { | ||
| 1458 | int low; | ||
| 1459 | |||
| 1460 | if (scanning_global_lru(sc)) | ||
| 1461 | low = inactive_file_is_low_global(zone); | ||
| 1462 | else | ||
| 1463 | low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); | ||
| 1464 | return low; | ||
| 1465 | } | ||
| 1466 | |||
| 1348 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 1467 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
| 1349 | struct zone *zone, struct scan_control *sc, int priority) | 1468 | struct zone *zone, struct scan_control *sc, int priority) |
| 1350 | { | 1469 | { |
| 1351 | int file = is_file_lru(lru); | 1470 | int file = is_file_lru(lru); |
| 1352 | 1471 | ||
| 1353 | if (lru == LRU_ACTIVE_FILE) { | 1472 | if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { |
| 1354 | shrink_active_list(nr_to_scan, zone, sc, priority, file); | 1473 | shrink_active_list(nr_to_scan, zone, sc, priority, file); |
| 1355 | return 0; | 1474 | return 0; |
| 1356 | } | 1475 | } |
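inactive_file_is_low() mirrors the existing anon check but with a simpler ratio: keep deactivating file pages until at least half of them sit on the inactive list, which suits the use-once behaviour of the page cache. The global comparison, reduced to its core:

    /* 1 = inactive file list is low, so active file pages may be deactivated */
    int inactive_file_is_low_simple(unsigned long nr_active_file,
                                    unsigned long nr_inactive_file)
    {
        return nr_active_file > nr_inactive_file;
    }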
| @@ -1379,23 +1498,16 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1379 | unsigned long ap, fp; | 1498 | unsigned long ap, fp; |
| 1380 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1499 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
| 1381 | 1500 | ||
| 1382 | /* If we have no swap space, do not bother scanning anon pages. */ | 1501 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + |
| 1383 | if (nr_swap_pages <= 0) { | 1502 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); |
| 1384 | percent[0] = 0; | 1503 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + |
| 1385 | percent[1] = 100; | 1504 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); |
| 1386 | return; | ||
| 1387 | } | ||
| 1388 | |||
| 1389 | anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + | ||
| 1390 | zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); | ||
| 1391 | file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + | ||
| 1392 | zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); | ||
| 1393 | 1505 | ||
| 1394 | if (scanning_global_lru(sc)) { | 1506 | if (scanning_global_lru(sc)) { |
| 1395 | free = zone_page_state(zone, NR_FREE_PAGES); | 1507 | free = zone_page_state(zone, NR_FREE_PAGES); |
| 1396 | /* If we have very few page cache pages, | 1508 | /* If we have very few page cache pages, |
| 1397 | force-scan anon pages. */ | 1509 | force-scan anon pages. */ |
| 1398 | if (unlikely(file + free <= zone->pages_high)) { | 1510 | if (unlikely(file + free <= high_wmark_pages(zone))) { |
| 1399 | percent[0] = 100; | 1511 | percent[0] = 100; |
| 1400 | percent[1] = 0; | 1512 | percent[1] = 0; |
| 1401 | return; | 1513 | return; |
| @@ -1450,6 +1562,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
| 1450 | percent[1] = 100 - percent[0]; | 1562 | percent[1] = 100 - percent[0]; |
| 1451 | } | 1563 | } |
| 1452 | 1564 | ||
| 1565 | /* | ||
| 1566 | * Smallish @nr_to_scan's are deposited in @nr_saved_scan, | ||
| 1567 | * until we have collected @swap_cluster_max pages to scan. | ||
| 1568 | */ | ||
| 1569 | static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, | ||
| 1570 | unsigned long *nr_saved_scan, | ||
| 1571 | unsigned long swap_cluster_max) | ||
| 1572 | { | ||
| 1573 | unsigned long nr; | ||
| 1574 | |||
| 1575 | *nr_saved_scan += nr_to_scan; | ||
| 1576 | nr = *nr_saved_scan; | ||
| 1577 | |||
| 1578 | if (nr >= swap_cluster_max) | ||
| 1579 | *nr_saved_scan = 0; | ||
| 1580 | else | ||
| 1581 | nr = 0; | ||
| 1582 | |||
| 1583 | return nr; | ||
| 1584 | } | ||
| 1453 | 1585 | ||
| 1454 | /* | 1586 | /* |
| 1455 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. | 1587 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
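nr_scan_try_batch() accumulates small scan requests in *nr_saved_scan and only hands back work once swap_cluster_max pages are pending, so small zones are not scanned a handful of pages at a time. A runnable copy of the same logic with a short usage example:

    #include <stdio.h>

    static unsigned long try_batch(unsigned long nr_to_scan,
                                   unsigned long *saved, unsigned long batch)
    {
        unsigned long nr;

        *saved += nr_to_scan;
        nr = *saved;
        if (nr >= batch)
            *saved = 0;   /* release everything accumulated so far */
        else
            nr = 0;       /* not enough yet, keep saving */
        return nr;
    }

    int main(void)
    {
        unsigned long saved = 0;

        /* with a batch of 32, three requests of 12 pages release 36 at once */
        for (int i = 0; i < 3; i++)
            printf("request 12 -> scan %lu\n", try_batch(12, &saved, 32));
        return 0;
    }

The state previously lived in zone->lru[l].nr_scan; moving it into the per-context reclaim_stat lets memory cgroups batch their scans the same way.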
| @@ -1463,27 +1595,29 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1463 | enum lru_list l; | 1595 | enum lru_list l; |
| 1464 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1596 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
| 1465 | unsigned long swap_cluster_max = sc->swap_cluster_max; | 1597 | unsigned long swap_cluster_max = sc->swap_cluster_max; |
| 1598 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | ||
| 1599 | int noswap = 0; | ||
| 1466 | 1600 | ||
| 1467 | get_scan_ratio(zone, sc, percent); | 1601 | /* If we have no swap space, do not bother scanning anon pages. */ |
| 1602 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
| 1603 | noswap = 1; | ||
| 1604 | percent[0] = 0; | ||
| 1605 | percent[1] = 100; | ||
| 1606 | } else | ||
| 1607 | get_scan_ratio(zone, sc, percent); | ||
| 1468 | 1608 | ||
| 1469 | for_each_evictable_lru(l) { | 1609 | for_each_evictable_lru(l) { |
| 1470 | int file = is_file_lru(l); | 1610 | int file = is_file_lru(l); |
| 1471 | int scan; | 1611 | unsigned long scan; |
| 1472 | 1612 | ||
| 1473 | scan = zone_nr_pages(zone, sc, l); | 1613 | scan = zone_nr_lru_pages(zone, sc, l); |
| 1474 | if (priority) { | 1614 | if (priority || noswap) { |
| 1475 | scan >>= priority; | 1615 | scan >>= priority; |
| 1476 | scan = (scan * percent[file]) / 100; | 1616 | scan = (scan * percent[file]) / 100; |
| 1477 | } | 1617 | } |
| 1478 | if (scanning_global_lru(sc)) { | 1618 | nr[l] = nr_scan_try_batch(scan, |
| 1479 | zone->lru[l].nr_scan += scan; | 1619 | &reclaim_stat->nr_saved_scan[l], |
| 1480 | nr[l] = zone->lru[l].nr_scan; | 1620 | swap_cluster_max); |
| 1481 | if (nr[l] >= swap_cluster_max) | ||
| 1482 | zone->lru[l].nr_scan = 0; | ||
| 1483 | else | ||
| 1484 | nr[l] = 0; | ||
| 1485 | } else | ||
| 1486 | nr[l] = scan; | ||
| 1487 | } | 1621 | } |
| 1488 | 1622 | ||
| 1489 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || | 1623 | while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || |
| @@ -1516,7 +1650,7 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1516 | * Even if we did not try to evict anon pages at all, we want to | 1650 | * Even if we did not try to evict anon pages at all, we want to |
| 1517 | * rebalance the anon lru active/inactive ratio. | 1651 | * rebalance the anon lru active/inactive ratio. |
| 1518 | */ | 1652 | */ |
| 1519 | if (inactive_anon_is_low(zone, sc)) | 1653 | if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) |
| 1520 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); | 1654 | shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); |
| 1521 | 1655 | ||
| 1522 | throttle_vm_writeout(sc->gfp_mask); | 1656 | throttle_vm_writeout(sc->gfp_mask); |
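Putting the pieces together, shrink_zone() now computes each list's scan target by shifting the list size down by the scan priority, applying the anon/file percentage from get_scan_ratio() (forced to 0/100 when swap is unavailable), and feeding the result through the batching helper above. The arithmetic in isolation, with the noswap special case omitted:

    /* e.g. 1048576 pages at priority 12 with a 60% share -> 153 pages */
    unsigned long scan_target(unsigned long lru_size, int priority,
                              unsigned int percent)
    {
        unsigned long scan = lru_size;

        if (priority) {
            scan >>= priority;
            scan = scan * percent / 100;
        }
        return scan;
    }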
| @@ -1527,11 +1661,13 @@ static void shrink_zone(int priority, struct zone *zone, | |||
| 1527 | * try to reclaim pages from zones which will satisfy the caller's allocation | 1661 | * try to reclaim pages from zones which will satisfy the caller's allocation |
| 1528 | * request. | 1662 | * request. |
| 1529 | * | 1663 | * |
| 1530 | * We reclaim from a zone even if that zone is over pages_high. Because: | 1664 | * We reclaim from a zone even if that zone is over high_wmark_pages(zone). |
| 1665 | * Because: | ||
| 1531 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order | 1666 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
| 1532 | * allocation or | 1667 | * allocation or |
| 1533 | * b) The zones may be over pages_high but they must go *over* pages_high to | 1668 | * b) The target zone may be at high_wmark_pages(zone) but the lower zones |
| 1534 | * satisfy the `incremental min' zone defense algorithm. | 1669 | * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' |
| 1670 | * zone defense algorithm. | ||
| 1535 | * | 1671 | * |
| 1536 | * If a zone is deemed to be full of pinned pages then just give it a light | 1672 | * If a zone is deemed to be full of pinned pages then just give it a light |
| 1537 | * scan then give up on it. | 1673 | * scan then give up on it. |
| @@ -1583,10 +1719,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
| 1583 | * | 1719 | * |
| 1584 | * If the caller is !__GFP_FS then the probability of a failure is reasonably | 1720 | * If the caller is !__GFP_FS then the probability of a failure is reasonably |
| 1585 | * high - the zone may be full of dirty or under-writeback pages, which this | 1721 | * high - the zone may be full of dirty or under-writeback pages, which this |
| 1586 | * caller can't do much about. We kick pdflush and take explicit naps in the | 1722 | * caller can't do much about. We kick the writeback threads and take explicit |
| 1587 | * hope that some of these pages can be written. But if the allocating task | 1723 | * naps in the hope that some of these pages can be written. But if the |
| 1588 | * holds filesystem locks which prevent writeout this might not work, and the | 1724 | * allocating task holds filesystem locks which prevent writeout this might not |
| 1589 | * allocation attempt will fail. | 1725 | * work, and the allocation attempt will fail. |
| 1590 | * | 1726 | * |
| 1591 | * returns: 0, if no pages reclaimed | 1727 | * returns: 0, if no pages reclaimed |
| 1592 | * else, the number of pages reclaimed | 1728 | * else, the number of pages reclaimed |
| @@ -1616,7 +1752,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1616 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | 1752 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
| 1617 | continue; | 1753 | continue; |
| 1618 | 1754 | ||
| 1619 | lru_pages += zone_lru_pages(zone); | 1755 | lru_pages += zone_reclaimable_pages(zone); |
| 1620 | } | 1756 | } |
| 1621 | } | 1757 | } |
| 1622 | 1758 | ||
| @@ -1651,13 +1787,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
| 1651 | */ | 1787 | */ |
| 1652 | if (total_scanned > sc->swap_cluster_max + | 1788 | if (total_scanned > sc->swap_cluster_max + |
| 1653 | sc->swap_cluster_max / 2) { | 1789 | sc->swap_cluster_max / 2) { |
| 1654 | wakeup_pdflush(laptop_mode ? 0 : total_scanned); | 1790 | wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); |
| 1655 | sc->may_writepage = 1; | 1791 | sc->may_writepage = 1; |
| 1656 | } | 1792 | } |
| 1657 | 1793 | ||
| 1658 | /* Take a nap, wait for some writeback to complete */ | 1794 | /* Take a nap, wait for some writeback to complete */ |
| 1659 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1795 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) |
| 1660 | congestion_wait(WRITE, HZ/10); | 1796 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 1661 | } | 1797 | } |
| 1662 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1798 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
| 1663 | if (!sc->all_unreclaimable && scanning_global_lru(sc)) | 1799 | if (!sc->all_unreclaimable && scanning_global_lru(sc)) |
| @@ -1697,6 +1833,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
| 1697 | .may_writepage = !laptop_mode, | 1833 | .may_writepage = !laptop_mode, |
| 1698 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1834 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
| 1699 | .may_unmap = 1, | 1835 | .may_unmap = 1, |
| 1836 | .may_swap = 1, | ||
| 1700 | .swappiness = vm_swappiness, | 1837 | .swappiness = vm_swappiness, |
| 1701 | .order = order, | 1838 | .order = order, |
| 1702 | .mem_cgroup = NULL, | 1839 | .mem_cgroup = NULL, |
| @@ -1709,14 +1846,49 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
| 1709 | 1846 | ||
| 1710 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 1847 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
| 1711 | 1848 | ||
| 1849 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | ||
| 1850 | gfp_t gfp_mask, bool noswap, | ||
| 1851 | unsigned int swappiness, | ||
| 1852 | struct zone *zone, int nid) | ||
| 1853 | { | ||
| 1854 | struct scan_control sc = { | ||
| 1855 | .may_writepage = !laptop_mode, | ||
| 1856 | .may_unmap = 1, | ||
| 1857 | .may_swap = !noswap, | ||
| 1858 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
| 1859 | .swappiness = swappiness, | ||
| 1860 | .order = 0, | ||
| 1861 | .mem_cgroup = mem, | ||
| 1862 | .isolate_pages = mem_cgroup_isolate_pages, | ||
| 1863 | }; | ||
| 1864 | nodemask_t nm = nodemask_of_node(nid); | ||
| 1865 | |||
| 1866 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | ||
| 1867 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | ||
| 1868 | sc.nodemask = &nm; | ||
| 1869 | sc.nr_reclaimed = 0; | ||
| 1870 | sc.nr_scanned = 0; | ||
| 1871 | /* | ||
| 1872 | * NOTE: Although we can get the priority field, using it | ||
| 1873 | * here is not a good idea, since it limits the pages we can scan. | ||
| 1874 | * if we don't reclaim here, the shrink_zone from balance_pgdat | ||
| 1875 | * will pick up pages from other mem cgroup's as well. We hack | ||
| 1876 | * the priority and make it zero. | ||
| 1877 | */ | ||
| 1878 | shrink_zone(0, zone, &sc); | ||
| 1879 | return sc.nr_reclaimed; | ||
| 1880 | } | ||
| 1881 | |||
| 1712 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | 1882 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, |
| 1713 | gfp_t gfp_mask, | 1883 | gfp_t gfp_mask, |
| 1714 | bool noswap, | 1884 | bool noswap, |
| 1715 | unsigned int swappiness) | 1885 | unsigned int swappiness) |
| 1716 | { | 1886 | { |
| 1887 | struct zonelist *zonelist; | ||
| 1717 | struct scan_control sc = { | 1888 | struct scan_control sc = { |
| 1718 | .may_writepage = !laptop_mode, | 1889 | .may_writepage = !laptop_mode, |
| 1719 | .may_unmap = 1, | 1890 | .may_unmap = 1, |
| 1891 | .may_swap = !noswap, | ||
| 1720 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1892 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
| 1721 | .swappiness = swappiness, | 1893 | .swappiness = swappiness, |
| 1722 | .order = 0, | 1894 | .order = 0, |
| @@ -1724,10 +1896,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1724 | .isolate_pages = mem_cgroup_isolate_pages, | 1896 | .isolate_pages = mem_cgroup_isolate_pages, |
| 1725 | .nodemask = NULL, /* we don't care the placement */ | 1897 | .nodemask = NULL, /* we don't care the placement */ |
| 1726 | }; | 1898 | }; |
| 1727 | struct zonelist *zonelist; | ||
| 1728 | |||
| 1729 | if (noswap) | ||
| 1730 | sc.may_unmap = 0; | ||
| 1731 | 1899 | ||
| 1732 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 1900 | sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
| 1733 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); | 1901 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); |
| @@ -1738,7 +1906,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1738 | 1906 | ||
| 1739 | /* | 1907 | /* |
| 1740 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1908 | * For kswapd, balance_pgdat() will work across all this node's zones until |
| 1741 | * they are all at pages_high. | 1909 | * they are all at high_wmark_pages(zone). |
| 1742 | * | 1910 | * |
| 1743 | * Returns the number of pages which were actually freed. | 1911 | * Returns the number of pages which were actually freed. |
| 1744 | * | 1912 | * |
| @@ -1751,11 +1919,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, | |||
| 1751 | * the zone for when the problem goes away. | 1919 | * the zone for when the problem goes away. |
| 1752 | * | 1920 | * |
| 1753 | * kswapd scans the zones in the highmem->normal->dma direction. It skips | 1921 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
| 1754 | * zones which have free_pages > pages_high, but once a zone is found to have | 1922 | * zones which have free_pages > high_wmark_pages(zone), but once a zone is |
| 1755 | * free_pages <= pages_high, we scan that zone and the lower zones regardless | 1923 | * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the |
| 1756 | * of the number of free pages in the lower zones. This interoperates with | 1924 | * lower zones regardless of the number of free pages in the lower zones. This |
| 1757 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1925 | * interoperates with the page allocator fallback scheme to ensure that aging |
| 1758 | * across the zones. | 1926 | * of pages is balanced across the zones. |
| 1759 | */ | 1927 | */ |
| 1760 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | 1928 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
| 1761 | { | 1929 | { |
| @@ -1767,6 +1935,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
| 1767 | struct scan_control sc = { | 1935 | struct scan_control sc = { |
| 1768 | .gfp_mask = GFP_KERNEL, | 1936 | .gfp_mask = GFP_KERNEL, |
| 1769 | .may_unmap = 1, | 1937 | .may_unmap = 1, |
| 1938 | .may_swap = 1, | ||
| 1770 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 1939 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
| 1771 | .swappiness = vm_swappiness, | 1940 | .swappiness = vm_swappiness, |
| 1772 | .order = order, | 1941 | .order = order, |
| @@ -1775,7 +1944,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) | |||
| 1775 | }; | 1944 | }; |
| 1776 | /* | 1945 | /* |
| 1777 | * temp_priority is used to remember the scanning priority at which | 1946 | * temp_priority is used to remember the scanning priority at which |
| 1778 | * this zone was successfully refilled to free_pages == pages_high. | 1947 | * this zone was successfully refilled to |
| 1948 | * free_pages == high_wmark_pages(zone). | ||
| 1779 | */ | 1949 | */ |
| 1780 | int temp_priority[MAX_NR_ZONES]; | 1950 | int temp_priority[MAX_NR_ZONES]; |
| 1781 | 1951 | ||
| @@ -1820,8 +1990,8 @@ loop_again: | |||
| 1820 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 1990 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
| 1821 | &sc, priority, 0); | 1991 | &sc, priority, 0); |
| 1822 | 1992 | ||
| 1823 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 1993 | if (!zone_watermark_ok(zone, order, |
| 1824 | 0, 0)) { | 1994 | high_wmark_pages(zone), 0, 0)) { |
| 1825 | end_zone = i; | 1995 | end_zone = i; |
| 1826 | break; | 1996 | break; |
| 1827 | } | 1997 | } |
| @@ -1832,7 +2002,7 @@ loop_again: | |||
| 1832 | for (i = 0; i <= end_zone; i++) { | 2002 | for (i = 0; i <= end_zone; i++) { |
| 1833 | struct zone *zone = pgdat->node_zones + i; | 2003 | struct zone *zone = pgdat->node_zones + i; |
| 1834 | 2004 | ||
| 1835 | lru_pages += zone_lru_pages(zone); | 2005 | lru_pages += zone_reclaimable_pages(zone); |
| 1836 | } | 2006 | } |
| 1837 | 2007 | ||
| 1838 | /* | 2008 | /* |
| @@ -1847,6 +2017,7 @@ loop_again: | |||
| 1847 | for (i = 0; i <= end_zone; i++) { | 2017 | for (i = 0; i <= end_zone; i++) { |
| 1848 | struct zone *zone = pgdat->node_zones + i; | 2018 | struct zone *zone = pgdat->node_zones + i; |
| 1849 | int nr_slab; | 2019 | int nr_slab; |
| 2020 | int nid, zid; | ||
| 1850 | 2021 | ||
| 1851 | if (!populated_zone(zone)) | 2022 | if (!populated_zone(zone)) |
| 1852 | continue; | 2023 | continue; |
| @@ -1855,18 +2026,27 @@ loop_again: | |||
| 1855 | priority != DEF_PRIORITY) | 2026 | priority != DEF_PRIORITY) |
| 1856 | continue; | 2027 | continue; |
| 1857 | 2028 | ||
| 1858 | if (!zone_watermark_ok(zone, order, zone->pages_high, | 2029 | if (!zone_watermark_ok(zone, order, |
| 1859 | end_zone, 0)) | 2030 | high_wmark_pages(zone), end_zone, 0)) |
| 1860 | all_zones_ok = 0; | 2031 | all_zones_ok = 0; |
| 1861 | temp_priority[i] = priority; | 2032 | temp_priority[i] = priority; |
| 1862 | sc.nr_scanned = 0; | 2033 | sc.nr_scanned = 0; |
| 1863 | note_zone_scanning_priority(zone, priority); | 2034 | note_zone_scanning_priority(zone, priority); |
| 2035 | |||
| 2036 | nid = pgdat->node_id; | ||
| 2037 | zid = zone_idx(zone); | ||
| 2038 | /* | ||
| 2039 | * Call soft limit reclaim before calling shrink_zone. | ||
| 2040 | * For now we ignore the return value | ||
| 2041 | */ | ||
| 2042 | mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, | ||
| 2043 | nid, zid); | ||
| 1864 | /* | 2044 | /* |
| 1865 | * We put equal pressure on every zone, unless one | 2045 | * We put equal pressure on every zone, unless one |
| 1866 | * zone has way too many pages free already. | 2046 | * zone has way too many pages free already. |
| 1867 | */ | 2047 | */ |
| 1868 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | 2048 | if (!zone_watermark_ok(zone, order, |
| 1869 | end_zone, 0)) | 2049 | 8*high_wmark_pages(zone), end_zone, 0)) |
| 1870 | shrink_zone(priority, zone, &sc); | 2050 | shrink_zone(priority, zone, &sc); |
| 1871 | reclaim_state->reclaimed_slab = 0; | 2051 | reclaim_state->reclaimed_slab = 0; |
| 1872 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 2052 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
| @@ -1876,7 +2056,7 @@ loop_again: | |||
| 1876 | if (zone_is_all_unreclaimable(zone)) | 2056 | if (zone_is_all_unreclaimable(zone)) |
| 1877 | continue; | 2057 | continue; |
| 1878 | if (nr_slab == 0 && zone->pages_scanned >= | 2058 | if (nr_slab == 0 && zone->pages_scanned >= |
| 1879 | (zone_lru_pages(zone) * 6)) | 2059 | (zone_reclaimable_pages(zone) * 6)) |
| 1880 | zone_set_flag(zone, | 2060 | zone_set_flag(zone, |
| 1881 | ZONE_ALL_UNRECLAIMABLE); | 2061 | ZONE_ALL_UNRECLAIMABLE); |
| 1882 | /* | 2062 | /* |
| @@ -1895,7 +2075,7 @@ loop_again: | |||
| 1895 | * another pass across the zones. | 2075 | * another pass across the zones. |
| 1896 | */ | 2076 | */ |
| 1897 | if (total_scanned && priority < DEF_PRIORITY - 2) | 2077 | if (total_scanned && priority < DEF_PRIORITY - 2) |
| 1898 | congestion_wait(WRITE, HZ/10); | 2078 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
| 1899 | 2079 | ||
| 1900 | /* | 2080 | /* |
| 1901 | * We do this so kswapd doesn't build up large priorities for | 2081 | * We do this so kswapd doesn't build up large priorities for |
| @@ -1967,7 +2147,7 @@ static int kswapd(void *p) | |||
| 1967 | struct reclaim_state reclaim_state = { | 2147 | struct reclaim_state reclaim_state = { |
| 1968 | .reclaimed_slab = 0, | 2148 | .reclaimed_slab = 0, |
| 1969 | }; | 2149 | }; |
| 1970 | node_to_cpumask_ptr(cpumask, pgdat->node_id); | 2150 | const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
| 1971 | 2151 | ||
| 1972 | lockdep_set_current_reclaim_state(GFP_KERNEL); | 2152 | lockdep_set_current_reclaim_state(GFP_KERNEL); |
| 1973 | 2153 | ||
| @@ -2032,7 +2212,7 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
| 2032 | return; | 2212 | return; |
| 2033 | 2213 | ||
| 2034 | pgdat = zone->zone_pgdat; | 2214 | pgdat = zone->zone_pgdat; |
| 2035 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) | 2215 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) |
| 2036 | return; | 2216 | return; |
| 2037 | if (pgdat->kswapd_max_order < order) | 2217 | if (pgdat->kswapd_max_order < order) |
| 2038 | pgdat->kswapd_max_order = order; | 2218 | pgdat->kswapd_max_order = order; |
| @@ -2043,15 +2223,42 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
| 2043 | wake_up_interruptible(&pgdat->kswapd_wait); | 2223 | wake_up_interruptible(&pgdat->kswapd_wait); |
| 2044 | } | 2224 | } |
| 2045 | 2225 | ||
| 2046 | unsigned long global_lru_pages(void) | 2226 | /* |
| 2227 | * The reclaimable count should be mostly accurate. | ||
| 2228 | * The less reclaimable pages are | ||
| 2229 | * - mlocked pages, which will be moved to the unevictable list when encountered | ||
| 2230 | * - mapped pages, which may require several passes to be reclaimed | ||
| 2231 | * - dirty pages, which are not "instantly" reclaimable | ||
| 2232 | */ | ||
| 2233 | unsigned long global_reclaimable_pages(void) | ||
| 2047 | { | 2234 | { |
| 2048 | return global_page_state(NR_ACTIVE_ANON) | 2235 | int nr; |
| 2049 | + global_page_state(NR_ACTIVE_FILE) | 2236 | |
| 2050 | + global_page_state(NR_INACTIVE_ANON) | 2237 | nr = global_page_state(NR_ACTIVE_FILE) + |
| 2051 | + global_page_state(NR_INACTIVE_FILE); | 2238 | global_page_state(NR_INACTIVE_FILE); |
| 2239 | |||
| 2240 | if (nr_swap_pages > 0) | ||
| 2241 | nr += global_page_state(NR_ACTIVE_ANON) + | ||
| 2242 | global_page_state(NR_INACTIVE_ANON); | ||
| 2243 | |||
| 2244 | return nr; | ||
| 2245 | } | ||
| 2246 | |||
| 2247 | unsigned long zone_reclaimable_pages(struct zone *zone) | ||
| 2248 | { | ||
| 2249 | int nr; | ||
| 2250 | |||
| 2251 | nr = zone_page_state(zone, NR_ACTIVE_FILE) + | ||
| 2252 | zone_page_state(zone, NR_INACTIVE_FILE); | ||
| 2253 | |||
| 2254 | if (nr_swap_pages > 0) | ||
| 2255 | nr += zone_page_state(zone, NR_ACTIVE_ANON) + | ||
| 2256 | zone_page_state(zone, NR_INACTIVE_ANON); | ||
| 2257 | |||
| 2258 | return nr; | ||
| 2052 | } | 2259 | } |
| 2053 | 2260 | ||
| 2054 | #ifdef CONFIG_PM | 2261 | #ifdef CONFIG_HIBERNATION |
| 2055 | /* | 2262 | /* |
| 2056 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2263 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
| 2057 | * from LRU lists system-wide, for given pass and priority. | 2264 | * from LRU lists system-wide, for given pass and priority. |
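global_lru_pages() becomes global_reclaimable_pages()/zone_reclaimable_pages(): file LRU pages are always counted, while anonymous LRU pages only count as long as swap space remains, since without swap they cannot be reclaimed at all. The accounting as a plain function over a snapshot struct (names assumed for illustration):

    struct lru_counts {
        unsigned long active_file, inactive_file;
        unsigned long active_anon, inactive_anon;
    };

    unsigned long reclaimable_pages(const struct lru_counts *c,
                                    long nr_swap_pages)
    {
        unsigned long nr = c->active_file + c->inactive_file;

        if (nr_swap_pages > 0)   /* anon is only reclaimable with swap left */
            nr += c->active_anon + c->inactive_anon;
        return nr;
    }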
| @@ -2063,6 +2270,7 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, | |||
| 2063 | { | 2270 | { |
| 2064 | struct zone *zone; | 2271 | struct zone *zone; |
| 2065 | unsigned long nr_reclaimed = 0; | 2272 | unsigned long nr_reclaimed = 0; |
| 2273 | struct zone_reclaim_stat *reclaim_stat; | ||
| 2066 | 2274 | ||
| 2067 | for_each_populated_zone(zone) { | 2275 | for_each_populated_zone(zone) { |
| 2068 | enum lru_list l; | 2276 | enum lru_list l; |
| @@ -2079,22 +2287,25 @@ static void shrink_all_zones(unsigned long nr_pages, int prio, | |||
| 2079 | l == LRU_ACTIVE_FILE)) | 2287 | l == LRU_ACTIVE_FILE)) |
| 2080 | continue; | 2288 | continue; |
| 2081 | 2289 | ||
| 2082 | zone->lru[l].nr_scan += (lru_pages >> prio) + 1; | 2290 | reclaim_stat = get_reclaim_stat(zone, sc); |
| 2083 | if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { | 2291 | reclaim_stat->nr_saved_scan[l] += |
| 2292 | (lru_pages >> prio) + 1; | ||
| 2293 | if (reclaim_stat->nr_saved_scan[l] | ||
| 2294 | >= nr_pages || pass > 3) { | ||
| 2084 | unsigned long nr_to_scan; | 2295 | unsigned long nr_to_scan; |
| 2085 | 2296 | ||
| 2086 | zone->lru[l].nr_scan = 0; | 2297 | reclaim_stat->nr_saved_scan[l] = 0; |
| 2087 | nr_to_scan = min(nr_pages, lru_pages); | 2298 | nr_to_scan = min(nr_pages, lru_pages); |
| 2088 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | 2299 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, |
| 2089 | sc, prio); | 2300 | sc, prio); |
| 2090 | if (nr_reclaimed >= nr_pages) { | 2301 | if (nr_reclaimed >= nr_pages) { |
| 2091 | sc->nr_reclaimed = nr_reclaimed; | 2302 | sc->nr_reclaimed += nr_reclaimed; |
| 2092 | return; | 2303 | return; |
| 2093 | } | 2304 | } |
| 2094 | } | 2305 | } |
| 2095 | } | 2306 | } |
| 2096 | } | 2307 | } |
| 2097 | sc->nr_reclaimed = nr_reclaimed; | 2308 | sc->nr_reclaimed += nr_reclaimed; |
| 2098 | } | 2309 | } |
| 2099 | 2310 | ||
| 2100 | /* | 2311 | /* |
| @@ -2115,11 +2326,12 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 2115 | .may_unmap = 0, | 2326 | .may_unmap = 0, |
| 2116 | .may_writepage = 1, | 2327 | .may_writepage = 1, |
| 2117 | .isolate_pages = isolate_pages_global, | 2328 | .isolate_pages = isolate_pages_global, |
| 2329 | .nr_reclaimed = 0, | ||
| 2118 | }; | 2330 | }; |
| 2119 | 2331 | ||
| 2120 | current->reclaim_state = &reclaim_state; | 2332 | current->reclaim_state = &reclaim_state; |
| 2121 | 2333 | ||
| 2122 | lru_pages = global_lru_pages(); | 2334 | lru_pages = global_reclaimable_pages(); |
| 2123 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2335 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); |
| 2124 | /* If slab caches are huge, it's better to hit them first */ | 2336 | /* If slab caches are huge, it's better to hit them first */ |
| 2125 | while (nr_slab >= lru_pages) { | 2337 | while (nr_slab >= lru_pages) { |
| @@ -2161,13 +2373,13 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 2161 | 2373 | ||
| 2162 | reclaim_state.reclaimed_slab = 0; | 2374 | reclaim_state.reclaimed_slab = 0; |
| 2163 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | 2375 | shrink_slab(sc.nr_scanned, sc.gfp_mask, |
| 2164 | global_lru_pages()); | 2376 | global_reclaimable_pages()); |
| 2165 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | 2377 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; |
| 2166 | if (sc.nr_reclaimed >= nr_pages) | 2378 | if (sc.nr_reclaimed >= nr_pages) |
| 2167 | goto out; | 2379 | goto out; |
| 2168 | 2380 | ||
| 2169 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | 2381 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) |
| 2170 | congestion_wait(WRITE, HZ / 10); | 2382 | congestion_wait(BLK_RW_ASYNC, HZ / 10); |
| 2171 | } | 2383 | } |
| 2172 | } | 2384 | } |
| 2173 | 2385 | ||
| @@ -2178,7 +2390,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) | |||
| 2178 | if (!sc.nr_reclaimed) { | 2390 | if (!sc.nr_reclaimed) { |
| 2179 | do { | 2391 | do { |
| 2180 | reclaim_state.reclaimed_slab = 0; | 2392 | reclaim_state.reclaimed_slab = 0; |
| 2181 | shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); | 2393 | shrink_slab(nr_pages, sc.gfp_mask, |
| 2394 | global_reclaimable_pages()); | ||
| 2182 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | 2395 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; |
| 2183 | } while (sc.nr_reclaimed < nr_pages && | 2396 | } while (sc.nr_reclaimed < nr_pages && |
| 2184 | reclaim_state.reclaimed_slab > 0); | 2397 | reclaim_state.reclaimed_slab > 0); |
| @@ -2190,7 +2403,7 @@ out: | |||
| 2190 | 2403 | ||
| 2191 | return sc.nr_reclaimed; | 2404 | return sc.nr_reclaimed; |
| 2192 | } | 2405 | } |
| 2193 | #endif | 2406 | #endif /* CONFIG_HIBERNATION */ |
| 2194 | 2407 | ||
| 2195 | /* It's optimal to keep kswapds on the same CPUs as their memory, but | 2408 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
| 2196 | not required for correctness. So if the last cpu in a node goes | 2409 | not required for correctness. So if the last cpu in a node goes |
| @@ -2204,7 +2417,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
| 2204 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 2417 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
| 2205 | for_each_node_state(nid, N_HIGH_MEMORY) { | 2418 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 2206 | pg_data_t *pgdat = NODE_DATA(nid); | 2419 | pg_data_t *pgdat = NODE_DATA(nid); |
| 2207 | node_to_cpumask_ptr(mask, pgdat->node_id); | 2420 | const struct cpumask *mask; |
| 2421 | |||
| 2422 | mask = cpumask_of_node(pgdat->node_id); | ||
| 2208 | 2423 | ||
| 2209 | if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) | 2424 | if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) |
| 2210 | /* One of our CPUs online: restore mask */ | 2425 | /* One of our CPUs online: restore mask */ |
| @@ -2282,6 +2497,48 @@ int sysctl_min_unmapped_ratio = 1; | |||
| 2282 | */ | 2497 | */ |
| 2283 | int sysctl_min_slab_ratio = 5; | 2498 | int sysctl_min_slab_ratio = 5; |
| 2284 | 2499 | ||
| 2500 | static inline unsigned long zone_unmapped_file_pages(struct zone *zone) | ||
| 2501 | { | ||
| 2502 | unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); | ||
| 2503 | unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + | ||
| 2504 | zone_page_state(zone, NR_ACTIVE_FILE); | ||
| 2505 | |||
| 2506 | /* | ||
| 2507 | * It's possible for there to be more file mapped pages than | ||
| 2508 | * accounted for by the pages on the file LRU lists because | ||
| 2509 | * tmpfs pages accounted for as ANON can also be FILE_MAPPED | ||
| 2510 | */ | ||
| 2511 | return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; | ||
| 2512 | } | ||
| 2513 | |||
| 2514 | /* Work out how many page cache pages we can reclaim in this reclaim_mode */ | ||
| 2515 | static long zone_pagecache_reclaimable(struct zone *zone) | ||
| 2516 | { | ||
| 2517 | long nr_pagecache_reclaimable; | ||
| 2518 | long delta = 0; | ||
| 2519 | |||
| 2520 | /* | ||
| 2521 | * If RECLAIM_SWAP is set, then all file pages are considered | ||
| 2522 | * potentially reclaimable. Otherwise, we have to worry about | ||
| 2523 | * pages like swapcache and zone_unmapped_file_pages() provides | ||
| 2524 | * a better estimate | ||
| 2525 | */ | ||
| 2526 | if (zone_reclaim_mode & RECLAIM_SWAP) | ||
| 2527 | nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); | ||
| 2528 | else | ||
| 2529 | nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); | ||
| 2530 | |||
| 2531 | /* If we can't clean pages, remove dirty pages from consideration */ | ||
| 2532 | if (!(zone_reclaim_mode & RECLAIM_WRITE)) | ||
| 2533 | delta += zone_page_state(zone, NR_FILE_DIRTY); | ||
| 2534 | |||
| 2535 | /* Watch for any possible underflows due to delta */ | ||
| 2536 | if (unlikely(delta > nr_pagecache_reclaimable)) | ||
| 2537 | delta = nr_pagecache_reclaimable; | ||
| 2538 | |||
| 2539 | return nr_pagecache_reclaimable - delta; | ||
| 2540 | } | ||
| 2541 | |||
| 2285 | /* | 2542 | /* |
| 2286 | * Try to free up some pages from this zone through reclaim. | 2543 | * Try to free up some pages from this zone through reclaim. |
| 2287 | */ | 2544 | */ |
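zone_unmapped_file_pages() and zone_pagecache_reclaimable(), added above, estimate how much page cache zone reclaim can realistically free under the current zone_reclaim_mode. The standalone program below replays that arithmetic on made-up sample counts; reclaimable(), its flat parameters and the sample numbers are inventions for illustration, and the RECLAIM_WRITE/RECLAIM_SWAP bit values are assumed to follow the usual zone_reclaim_mode convention rather than being copied from the tree.

#include <stdio.h>

#define RECLAIM_WRITE	(1 << 1)	/* assumed zone_reclaim_mode bits */
#define RECLAIM_SWAP	(1 << 2)

static long reclaimable(int mode, long file_pages, long file_lru,
			long file_mapped, long dirty)
{
	long nr, delta = 0;

	if (mode & RECLAIM_SWAP)
		nr = file_pages;	/* everything is fair game */
	else
		nr = (file_lru > file_mapped) ? file_lru - file_mapped : 0;

	if (!(mode & RECLAIM_WRITE))
		delta += dirty;		/* cannot clean pages, skip dirty ones */
	if (delta > nr)			/* watch for underflow */
		delta = nr;

	return nr - delta;
}

int main(void)
{
	/* sample zone: 10000 file pages, 9000 on the file LRUs,
	 * 6000 of them mapped, 500 dirty */
	printf("mode 0:            %ld reclaimable\n",
	       reclaimable(0, 10000, 9000, 6000, 500));
	printf("mode RECLAIM_SWAP: %ld reclaimable\n",
	       reclaimable(RECLAIM_SWAP, 10000, 9000, 6000, 500));
	return 0;
}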
| @@ -2295,6 +2552,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2295 | struct scan_control sc = { | 2552 | struct scan_control sc = { |
| 2296 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 2553 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
| 2297 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 2554 | .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
| 2555 | .may_swap = 1, | ||
| 2298 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 2556 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
| 2299 | SWAP_CLUSTER_MAX), | 2557 | SWAP_CLUSTER_MAX), |
| 2300 | .gfp_mask = gfp_mask, | 2558 | .gfp_mask = gfp_mask, |
| @@ -2315,9 +2573,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2315 | reclaim_state.reclaimed_slab = 0; | 2573 | reclaim_state.reclaimed_slab = 0; |
| 2316 | p->reclaim_state = &reclaim_state; | 2574 | p->reclaim_state = &reclaim_state; |
| 2317 | 2575 | ||
| 2318 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2576 | if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { |
| 2319 | zone_page_state(zone, NR_FILE_MAPPED) > | ||
| 2320 | zone->min_unmapped_pages) { | ||
| 2321 | /* | 2577 | /* |
| 2322 | * Free memory by calling shrink zone with increasing | 2578 | * Free memory by calling shrink zone with increasing |
| 2323 | * priorities until we have enough memory freed. | 2579 | * priorities until we have enough memory freed. |
| @@ -2375,20 +2631,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2375 | * if less than a specified percentage of the zone is used by | 2631 | * if less than a specified percentage of the zone is used by |
| 2376 | * unmapped file backed pages. | 2632 | * unmapped file backed pages. |
| 2377 | */ | 2633 | */ |
| 2378 | if (zone_page_state(zone, NR_FILE_PAGES) - | 2634 | if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && |
| 2379 | zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages | 2635 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
| 2380 | && zone_page_state(zone, NR_SLAB_RECLAIMABLE) | 2636 | return ZONE_RECLAIM_FULL; |
| 2381 | <= zone->min_slab_pages) | ||
| 2382 | return 0; | ||
| 2383 | 2637 | ||
| 2384 | if (zone_is_all_unreclaimable(zone)) | 2638 | if (zone_is_all_unreclaimable(zone)) |
| 2385 | return 0; | 2639 | return ZONE_RECLAIM_FULL; |
| 2386 | 2640 | ||
| 2387 | /* | 2641 | /* |
| 2388 | * Do not scan if the allocation should not be delayed. | 2642 | * Do not scan if the allocation should not be delayed. |
| 2389 | */ | 2643 | */ |
| 2390 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) | 2644 | if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) |
| 2391 | return 0; | 2645 | return ZONE_RECLAIM_NOSCAN; |
| 2392 | 2646 | ||
| 2393 | /* | 2647 | /* |
| 2394 | * Only run zone reclaim on the local zone or on zones that do not | 2648 | * Only run zone reclaim on the local zone or on zones that do not |
| @@ -2398,18 +2652,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 2398 | */ | 2652 | */ |
| 2399 | node_id = zone_to_nid(zone); | 2653 | node_id = zone_to_nid(zone); |
| 2400 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 2654 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
| 2401 | return 0; | 2655 | return ZONE_RECLAIM_NOSCAN; |
| 2402 | 2656 | ||
| 2403 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 2657 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) |
| 2404 | return 0; | 2658 | return ZONE_RECLAIM_NOSCAN; |
| 2659 | |||
| 2405 | ret = __zone_reclaim(zone, gfp_mask, order); | 2660 | ret = __zone_reclaim(zone, gfp_mask, order); |
| 2406 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 2661 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); |
| 2407 | 2662 | ||
| 2663 | if (!ret) | ||
| 2664 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | ||
| 2665 | |||
| 2408 | return ret; | 2666 | return ret; |
| 2409 | } | 2667 | } |
| 2410 | #endif | 2668 | #endif |
| 2411 | 2669 | ||
| 2412 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 2413 | /* | 2670 | /* |
| 2414 | * page_evictable - test whether a page is evictable | 2671 | * page_evictable - test whether a page is evictable |
| 2415 | * @page: the page to test | 2672 | * @page: the page to test |
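The return-value changes above let a caller tell "did not scan at all" (ZONE_RECLAIM_NOSCAN) apart from "scanned but freed nothing" (ZONE_RECLAIM_FULL), with the new PGSCAN_ZONE_RECLAIM_FAILED event counting the failures. Below is a small userspace model of that three-way convention; the enum values, the ZONE_RECLAIM_SOME name and the decision strings are illustrative placeholders, not the kernel's definitions.

#include <stdio.h>

enum {
	ZONE_RECLAIM_NOSCAN,	/* skipped: wrong node, lock held, or !__GFP_WAIT */
	ZONE_RECLAIM_FULL,	/* scanned, but nothing worth reclaiming */
	ZONE_RECLAIM_SOME,	/* made progress: recheck the watermark */
};

static const char *decision(int ret)
{
	switch (ret) {
	case ZONE_RECLAIM_NOSCAN:
		return "try the next zone";
	case ZONE_RECLAIM_FULL:
		return "treat this zone as full";
	default:
		return "recheck the zone watermark";
	}
}

int main(void)
{
	int ret;

	for (ret = ZONE_RECLAIM_NOSCAN; ret <= ZONE_RECLAIM_SOME; ret++)
		printf("zone_reclaim -> %d: %s\n", ret, decision(ret));
	return 0;
}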
| @@ -2454,7 +2711,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) | |||
| 2454 | retry: | 2711 | retry: |
| 2455 | ClearPageUnevictable(page); | 2712 | ClearPageUnevictable(page); |
| 2456 | if (page_evictable(page, NULL)) { | 2713 | if (page_evictable(page, NULL)) { |
| 2457 | enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); | 2714 | enum lru_list l = page_lru_base_type(page); |
| 2458 | 2715 | ||
| 2459 | __dec_zone_state(zone, NR_UNEVICTABLE); | 2716 | __dec_zone_state(zone, NR_UNEVICTABLE); |
| 2460 | list_move(&page->lru, &zone->lru[l].list); | 2717 | list_move(&page->lru, &zone->lru[l].list); |
| @@ -2597,10 +2854,10 @@ static void scan_all_zones_unevictable_pages(void) | |||
| 2597 | unsigned long scan_unevictable_pages; | 2854 | unsigned long scan_unevictable_pages; |
| 2598 | 2855 | ||
| 2599 | int scan_unevictable_handler(struct ctl_table *table, int write, | 2856 | int scan_unevictable_handler(struct ctl_table *table, int write, |
| 2600 | struct file *file, void __user *buffer, | 2857 | void __user *buffer, |
| 2601 | size_t *length, loff_t *ppos) | 2858 | size_t *length, loff_t *ppos) |
| 2602 | { | 2859 | { |
| 2603 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 2860 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
| 2604 | 2861 | ||
| 2605 | if (write && *(unsigned long *)table->data) | 2862 | if (write && *(unsigned long *)table->data) |
| 2606 | scan_all_zones_unevictable_pages(); | 2863 | scan_all_zones_unevictable_pages(); |
| @@ -2656,4 +2913,3 @@ void scan_unevictable_unregister_node(struct node *node) | |||
| 2656 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); | 2913 | sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); |
| 2657 | } | 2914 | } |
| 2658 | 2915 | ||
| 2659 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index 9826766f1274..c81321f9feec 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -509,22 +509,11 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, | |||
| 509 | continue; | 509 | continue; |
| 510 | 510 | ||
| 511 | page = pfn_to_page(pfn); | 511 | page = pfn_to_page(pfn); |
| 512 | #ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES | 512 | |
| 513 | /* | 513 | /* Watch for unexpected holes punched in the memmap */ |
| 514 | * Ordinarily, memory holes in flatmem still have a valid | 514 | if (!memmap_valid_within(pfn, page, zone)) |
| 515 | * memmap for the PFN range. However, an architecture for | ||
| 516 | * embedded systems (e.g. ARM) can free up the memmap backing | ||
| 517 | * holes to save memory on the assumption the memmap is | ||
| 518 | * never used. The page_zone linkages are then broken even | ||
| 519 | * though pfn_valid() returns true. Skip the page if the | ||
| 520 | * linkages are broken. Even if this test passed, the impact | ||
| 521 | * is that the counters for the movable type are off but | ||
| 522 | * fragmentation monitoring is likely meaningless on small | ||
| 523 | * systems. | ||
| 524 | */ | ||
| 525 | if (page_zone(page) != zone) | ||
| 526 | continue; | 515 | continue; |
| 527 | #endif | 516 | |
| 528 | mtype = get_pageblock_migratetype(page); | 517 | mtype = get_pageblock_migratetype(page); |
| 529 | 518 | ||
| 530 | if (mtype < MIGRATE_TYPES) | 519 | if (mtype < MIGRATE_TYPES) |
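pagetypeinfo_showblockcount_print() now delegates the hole check to memmap_valid_within(). The kernel-context sketch below shows what such a check amounts to, under the assumption that the real helper (introduced elsewhere in this series) verifies the pfn and zone linkage roughly like this; it is not a copy of the actual function.

#include <linux/mm.h>
#include <linux/mmzone.h>

/* Sketch only: confirm a pfn_valid() pfn really has a usable memmap entry. */
static int memmap_valid_within_sketch(unsigned long pfn,
				      struct page *page, struct zone *zone)
{
	if (page_to_pfn(page) != pfn)	/* memmap backing this hole was freed */
		return 0;
	if (page_zone(page) != zone)	/* zone linkage is broken */
		return 0;
	return 1;
}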
| @@ -640,10 +629,8 @@ static const char * const vmstat_text[] = { | |||
| 640 | "nr_active_anon", | 629 | "nr_active_anon", |
| 641 | "nr_inactive_file", | 630 | "nr_inactive_file", |
| 642 | "nr_active_file", | 631 | "nr_active_file", |
| 643 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 644 | "nr_unevictable", | 632 | "nr_unevictable", |
| 645 | "nr_mlock", | 633 | "nr_mlock", |
| 646 | #endif | ||
| 647 | "nr_anon_pages", | 634 | "nr_anon_pages", |
| 648 | "nr_mapped", | 635 | "nr_mapped", |
| 649 | "nr_file_pages", | 636 | "nr_file_pages", |
| @@ -652,11 +639,14 @@ static const char * const vmstat_text[] = { | |||
| 652 | "nr_slab_reclaimable", | 639 | "nr_slab_reclaimable", |
| 653 | "nr_slab_unreclaimable", | 640 | "nr_slab_unreclaimable", |
| 654 | "nr_page_table_pages", | 641 | "nr_page_table_pages", |
| 642 | "nr_kernel_stack", | ||
| 655 | "nr_unstable", | 643 | "nr_unstable", |
| 656 | "nr_bounce", | 644 | "nr_bounce", |
| 657 | "nr_vmscan_write", | 645 | "nr_vmscan_write", |
| 658 | "nr_writeback_temp", | 646 | "nr_writeback_temp", |
| 659 | | 647 | "nr_isolated_anon", |
| 648 | "nr_isolated_file", | ||
| 649 | "nr_shmem", | ||
| 660 | #ifdef CONFIG_NUMA | 650 | #ifdef CONFIG_NUMA |
| 661 | "numa_hit", | 651 | "numa_hit", |
| 662 | "numa_miss", | 652 | "numa_miss", |
| @@ -686,6 +676,9 @@ static const char * const vmstat_text[] = { | |||
| 686 | TEXTS_FOR_ZONES("pgscan_kswapd") | 676 | TEXTS_FOR_ZONES("pgscan_kswapd") |
| 687 | TEXTS_FOR_ZONES("pgscan_direct") | 677 | TEXTS_FOR_ZONES("pgscan_direct") |
| 688 | 678 | ||
| 679 | #ifdef CONFIG_NUMA | ||
| 680 | "zone_reclaim_failed", | ||
| 681 | #endif | ||
| 689 | "pginodesteal", | 682 | "pginodesteal", |
| 690 | "slabs_scanned", | 683 | "slabs_scanned", |
| 691 | "kswapd_steal", | 684 | "kswapd_steal", |
| @@ -698,7 +691,6 @@ static const char * const vmstat_text[] = { | |||
| 698 | "htlb_buddy_alloc_success", | 691 | "htlb_buddy_alloc_success", |
| 699 | "htlb_buddy_alloc_fail", | 692 | "htlb_buddy_alloc_fail", |
| 700 | #endif | 693 | #endif |
| 701 | #ifdef CONFIG_UNEVICTABLE_LRU | ||
| 702 | "unevictable_pgs_culled", | 694 | "unevictable_pgs_culled", |
| 703 | "unevictable_pgs_scanned", | 695 | "unevictable_pgs_scanned", |
| 704 | "unevictable_pgs_rescued", | 696 | "unevictable_pgs_rescued", |
| @@ -708,7 +700,6 @@ static const char * const vmstat_text[] = { | |||
| 708 | "unevictable_pgs_stranded", | 700 | "unevictable_pgs_stranded", |
| 709 | "unevictable_pgs_mlockfreed", | 701 | "unevictable_pgs_mlockfreed", |
| 710 | #endif | 702 | #endif |
| 711 | #endif | ||
| 712 | }; | 703 | }; |
| 713 | 704 | ||
| 714 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | 705 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
| @@ -721,18 +712,14 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 721 | "\n min %lu" | 712 | "\n min %lu" |
| 722 | "\n low %lu" | 713 | "\n low %lu" |
| 723 | "\n high %lu" | 714 | "\n high %lu" |
| 724 | "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" | 715 | "\n scanned %lu" |
| 725 | "\n spanned %lu" | 716 | "\n spanned %lu" |
| 726 | "\n present %lu", | 717 | "\n present %lu", |
| 727 | zone_page_state(zone, NR_FREE_PAGES), | 718 | zone_page_state(zone, NR_FREE_PAGES), |
| 728 | zone->pages_min, | 719 | min_wmark_pages(zone), |
| 729 | zone->pages_low, | 720 | low_wmark_pages(zone), |
| 730 | zone->pages_high, | 721 | high_wmark_pages(zone), |
| 731 | zone->pages_scanned, | 722 | zone->pages_scanned, |
| 732 | zone->lru[LRU_ACTIVE_ANON].nr_scan, | ||
| 733 | zone->lru[LRU_INACTIVE_ANON].nr_scan, | ||
| 734 | zone->lru[LRU_ACTIVE_FILE].nr_scan, | ||
| 735 | zone->lru[LRU_INACTIVE_FILE].nr_scan, | ||
| 736 | zone->spanned_pages, | 723 | zone->spanned_pages, |
| 737 | zone->present_pages); | 724 | zone->present_pages); |
| 738 | 725 | ||
| @@ -891,7 +878,7 @@ static void vmstat_update(struct work_struct *w) | |||
| 891 | { | 878 | { |
| 892 | refresh_cpu_vm_stats(smp_processor_id()); | 879 | refresh_cpu_vm_stats(smp_processor_id()); |
| 893 | schedule_delayed_work(&__get_cpu_var(vmstat_work), | 880 | schedule_delayed_work(&__get_cpu_var(vmstat_work), |
| 894 | sysctl_stat_interval); | 881 | round_jiffies_relative(sysctl_stat_interval)); |
| 895 | } | 882 | } |
| 896 | 883 | ||
| 897 | static void __cpuinit start_cpu_timer(int cpu) | 884 | static void __cpuinit start_cpu_timer(int cpu) |
| @@ -899,7 +886,8 @@ static void __cpuinit start_cpu_timer(int cpu) | |||
| 899 | struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); | 886 | struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); |
| 900 | 887 | ||
| 901 | INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); | 888 | INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); |
| 902 | schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); | 889 | schedule_delayed_work_on(cpu, vmstat_work, |
| 890 | __round_jiffies_relative(HZ, cpu)); | ||
| 903 | } | 891 | } |
| 904 | 892 | ||
| 905 | /* | 893 | /* |
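Both timer hunks above round the vmstat work delays, so the periodic per-cpu updates fire on whole-second boundaries (with __round_jiffies_relative() adding a per-cpu skew) instead of waking an otherwise idle CPU at arbitrary offsets. A kernel-context sketch of the same rearm pattern follows; my_work and the 5-second interval are invented for illustration.

#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_work, my_work_fn);

static void my_work_fn(struct work_struct *work)
{
	/* periodic bookkeeping would go here */

	/* re-arm on a rounded deadline so wakeups from different
	 * subsystems can batch together */
	schedule_delayed_work(&my_work, round_jiffies_relative(5 * HZ));
}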
