| field | value |
|---|---|
| author | Jens Axboe <axboe@kernel.dk>, 2012-07-30 03:03:10 -0400 |
| committer | Jens Axboe <axboe@kernel.dk>, 2012-07-30 03:03:10 -0400 |
| commit | 72ea1f74fcdf874cca6d2c0962379523bbd99e2c (patch) |
| tree | 4c67be6c73356086ff44ef1b8b1c9479702689ca /mm |
| parent | b1af9be5ef77898c05667bb9dbf3b180d91d3292 (diff) |
| parent | a73ff3231df59a4b92ccd0dd4e73897c5822489b (diff) |
Merge branch 'for-jens' of git://git.drbd.org/linux-drbd into for-3.6/drivers
Diffstat (limited to 'mm')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mm/Kconfig | 17 |
| -rw-r--r-- | mm/Makefile | 1 |
| -rw-r--r-- | mm/bootmem.c | 6 |
| -rw-r--r-- | mm/compaction.c | 5 |
| -rw-r--r-- | mm/frontswap.c | 314 |
| -rw-r--r-- | mm/madvise.c | 18 |
| -rw-r--r-- | mm/memblock.c | 115 |
| -rw-r--r-- | mm/memcontrol.c | 6 |
| -rw-r--r-- | mm/memory.c | 12 |
| -rw-r--r-- | mm/memory_hotplug.c | 2 |
| -rw-r--r-- | mm/mempolicy.c | 2 |
| -rw-r--r-- | mm/nobootmem.c | 40 |
| -rw-r--r-- | mm/nommu.c | 2 |
| -rw-r--r-- | mm/oom_kill.c | 21 |
| -rw-r--r-- | mm/page_alloc.c | 7 |
| -rw-r--r-- | mm/page_cgroup.c | 4 |
| -rw-r--r-- | mm/page_io.c | 12 |
| -rw-r--r-- | mm/pagewalk.c | 1 |
| -rw-r--r-- | mm/percpu-vm.c | 1 |
| -rw-r--r-- | mm/shmem.c | 248 |
| -rw-r--r-- | mm/sparse.c | 20 |
| -rw-r--r-- | mm/swapfile.c | 66 |
| -rw-r--r-- | mm/vmscan.c | 12 |
23 files changed, 666 insertions, 266 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b2176374b98e..82fed4eb2b6f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -389,3 +389,20 @@ config CLEANCACHE | |||
| 389 | in a negligible performance hit. | 389 | in a negligible performance hit. |
| 390 | 390 | ||
| 391 | If unsure, say Y to enable cleancache | 391 | If unsure, say Y to enable cleancache |
| 392 | |||
| 393 | config FRONTSWAP | ||
| 394 | bool "Enable frontswap to cache swap pages if tmem is present" | ||
| 395 | depends on SWAP | ||
| 396 | default n | ||
| 397 | help | ||
| 398 | Frontswap is so named because it can be thought of as the opposite | ||
| 399 | of a "backing" store for a swap device. The data is stored into | ||
| 400 | "transcendent memory", memory that is not directly accessible or | ||
| 401 | addressable by the kernel and is of unknown and possibly | ||
| 402 | time-varying size. When space in transcendent memory is available, | ||
| 403 | a significant swap I/O reduction may be achieved. When none is | ||
| 404 | available, all frontswap calls are reduced to a single pointer- | ||
| 405 | compare-against-NULL resulting in a negligible performance hit | ||
| 406 | and swap data is stored as normal on the matching swap device. | ||
| 407 | |||
| 408 | If unsure, say Y to enable frontswap. | ||
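The "negligible performance hit" promised by this help text comes from splitting the API into a cheap guard and the real work: the exported `frontswap_enabled` flag added below in mm/frontswap.c gates the `__frontswap_*` entry points. A minimal sketch of that pattern, in the style of a header-level wrapper (the actual wrapper lives in include/linux/frontswap.h, which is not part of this diff, so the exact form shown here is an assumption):

```c
/*
 * Illustrative fragment only: modeled on the frontswap_enabled flag and
 * __frontswap_store() added in mm/frontswap.c below; the real inline
 * wrapper in include/linux/frontswap.h may differ in detail.
 */
extern bool frontswap_enabled;                  /* exported from mm/frontswap.c */
extern int __frontswap_store(struct page *page);

static inline int frontswap_store(struct page *page)
{
        int ret = -1;           /* -1: not stored, fall back to the swap device */

        if (frontswap_enabled) /* true only once a backend has registered */
                ret = __frontswap_store(page);
        return ret;
}
```

With no backend registered, the whole call collapses to one flag test and a constant return, which is what the help text is describing.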
diff --git a/mm/Makefile b/mm/Makefile
index a156285ce88d..2e2fbbefb99f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -29,6 +29,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | |||
| 29 | 29 | ||
| 30 | obj-$(CONFIG_BOUNCE) += bounce.o | 30 | obj-$(CONFIG_BOUNCE) += bounce.o |
| 31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
| 32 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | ||
| 32 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 33 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
| 33 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 34 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
| 34 | obj-$(CONFIG_NUMA) += mempolicy.o | 35 | obj-$(CONFIG_NUMA) += mempolicy.o |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ec4fcb7a56c8..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
| @@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
| 698 | return ___alloc_bootmem(size, align, goal, limit); | 698 | return ___alloc_bootmem(size, align, goal, limit); |
| 699 | } | 699 | } |
| 700 | 700 | ||
| 701 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | 701 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
| 702 | unsigned long size, unsigned long align, | 702 | unsigned long size, unsigned long align, |
| 703 | unsigned long goal, unsigned long limit) | 703 | unsigned long goal, unsigned long limit) |
| 704 | { | 704 | { |
| @@ -710,6 +710,10 @@ again: | |||
| 710 | if (ptr) | 710 | if (ptr) |
| 711 | return ptr; | 711 | return ptr; |
| 712 | 712 | ||
| 713 | /* do not panic in alloc_bootmem_bdata() */ | ||
| 714 | if (limit && goal + size > limit) | ||
| 715 | limit = 0; | ||
| 716 | |||
| 713 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); | 717 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); |
| 714 | if (ptr) | 718 | if (ptr) |
| 715 | return ptr; | 719 | return ptr; |
diff --git a/mm/compaction.c b/mm/compaction.c
index 7ea259d82a99..2f42d9528539 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
| @@ -701,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
| 701 | if (err) { | 701 | if (err) { |
| 702 | putback_lru_pages(&cc->migratepages); | 702 | putback_lru_pages(&cc->migratepages); |
| 703 | cc->nr_migratepages = 0; | 703 | cc->nr_migratepages = 0; |
| 704 | if (err == -ENOMEM) { | ||
| 705 | ret = COMPACT_PARTIAL; | ||
| 706 | goto out; | ||
| 707 | } | ||
| 704 | } | 708 | } |
| 705 | |||
| 706 | } | 709 | } |
| 707 | 710 | ||
| 708 | out: | 711 | out: |
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..e25025574a02
--- /dev/null
+++ b/mm/frontswap.c
| @@ -0,0 +1,314 @@ | |||
| 1 | /* | ||
| 2 | * Frontswap frontend | ||
| 3 | * | ||
| 4 | * This code provides the generic "frontend" layer to call a matching | ||
| 5 | * "backend" driver implementation of frontswap. See | ||
| 6 | * Documentation/vm/frontswap.txt for more information. | ||
| 7 | * | ||
| 8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
| 9 | * Author: Dan Magenheimer | ||
| 10 | * | ||
| 11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
| 12 | */ | ||
| 13 | |||
| 14 | #include <linux/mm.h> | ||
| 15 | #include <linux/mman.h> | ||
| 16 | #include <linux/swap.h> | ||
| 17 | #include <linux/swapops.h> | ||
| 18 | #include <linux/proc_fs.h> | ||
| 19 | #include <linux/security.h> | ||
| 20 | #include <linux/capability.h> | ||
| 21 | #include <linux/module.h> | ||
| 22 | #include <linux/uaccess.h> | ||
| 23 | #include <linux/debugfs.h> | ||
| 24 | #include <linux/frontswap.h> | ||
| 25 | #include <linux/swapfile.h> | ||
| 26 | |||
| 27 | /* | ||
| 28 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | ||
| 29 | * to the frontswap "backend" implementation functions. | ||
| 30 | */ | ||
| 31 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
| 32 | |||
| 33 | /* | ||
| 34 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
| 35 | * has not been registered, so is preferred to the slower alternative: a | ||
| 36 | * function call that checks a non-global. | ||
| 37 | */ | ||
| 38 | bool frontswap_enabled __read_mostly; | ||
| 39 | EXPORT_SYMBOL(frontswap_enabled); | ||
| 40 | |||
| 41 | /* | ||
| 42 | * If enabled, frontswap_store will return failure even on success. As | ||
| 43 | * a result, the swap subsystem will always write the page to swap, in | ||
| 44 | * effect converting frontswap into a writethrough cache. In this mode, | ||
| 45 | * there is no direct reduction in swap writes, but a frontswap backend | ||
| 46 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
| 47 | * providing increases control over maximum memory usage due to frontswap. | ||
| 48 | */ | ||
| 49 | static bool frontswap_writethrough_enabled __read_mostly; | ||
| 50 | |||
| 51 | #ifdef CONFIG_DEBUG_FS | ||
| 52 | /* | ||
| 53 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | ||
| 54 | * properly configured). These are for information only so are not protected | ||
| 55 | * against increment races. | ||
| 56 | */ | ||
| 57 | static u64 frontswap_loads; | ||
| 58 | static u64 frontswap_succ_stores; | ||
| 59 | static u64 frontswap_failed_stores; | ||
| 60 | static u64 frontswap_invalidates; | ||
| 61 | |||
| 62 | static inline void inc_frontswap_loads(void) { | ||
| 63 | frontswap_loads++; | ||
| 64 | } | ||
| 65 | static inline void inc_frontswap_succ_stores(void) { | ||
| 66 | frontswap_succ_stores++; | ||
| 67 | } | ||
| 68 | static inline void inc_frontswap_failed_stores(void) { | ||
| 69 | frontswap_failed_stores++; | ||
| 70 | } | ||
| 71 | static inline void inc_frontswap_invalidates(void) { | ||
| 72 | frontswap_invalidates++; | ||
| 73 | } | ||
| 74 | #else | ||
| 75 | static inline void inc_frontswap_loads(void) { } | ||
| 76 | static inline void inc_frontswap_succ_stores(void) { } | ||
| 77 | static inline void inc_frontswap_failed_stores(void) { } | ||
| 78 | static inline void inc_frontswap_invalidates(void) { } | ||
| 79 | #endif | ||
| 80 | /* | ||
| 81 | * Register operations for frontswap, returning previous thus allowing | ||
| 82 | * detection of multiple backends and possible nesting. | ||
| 83 | */ | ||
| 84 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
| 85 | { | ||
| 86 | struct frontswap_ops old = frontswap_ops; | ||
| 87 | |||
| 88 | frontswap_ops = *ops; | ||
| 89 | frontswap_enabled = true; | ||
| 90 | return old; | ||
| 91 | } | ||
| 92 | EXPORT_SYMBOL(frontswap_register_ops); | ||
| 93 | |||
| 94 | /* | ||
| 95 | * Enable/disable frontswap writethrough (see above). | ||
| 96 | */ | ||
| 97 | void frontswap_writethrough(bool enable) | ||
| 98 | { | ||
| 99 | frontswap_writethrough_enabled = enable; | ||
| 100 | } | ||
| 101 | EXPORT_SYMBOL(frontswap_writethrough); | ||
| 102 | |||
| 103 | /* | ||
| 104 | * Called when a swap device is swapon'd. | ||
| 105 | */ | ||
| 106 | void __frontswap_init(unsigned type) | ||
| 107 | { | ||
| 108 | struct swap_info_struct *sis = swap_info[type]; | ||
| 109 | |||
| 110 | BUG_ON(sis == NULL); | ||
| 111 | if (sis->frontswap_map == NULL) | ||
| 112 | return; | ||
| 113 | if (frontswap_enabled) | ||
| 114 | (*frontswap_ops.init)(type); | ||
| 115 | } | ||
| 116 | EXPORT_SYMBOL(__frontswap_init); | ||
| 117 | |||
| 118 | /* | ||
| 119 | * "Store" data from a page to frontswap and associate it with the page's | ||
| 120 | * swaptype and offset. Page must be locked and in the swap cache. | ||
| 121 | * If frontswap already contains a page with matching swaptype and | ||
| 122 | * offset, the frontswap implmentation may either overwrite the data and | ||
| 123 | * return success or invalidate the page from frontswap and return failure. | ||
| 124 | */ | ||
| 125 | int __frontswap_store(struct page *page) | ||
| 126 | { | ||
| 127 | int ret = -1, dup = 0; | ||
| 128 | swp_entry_t entry = { .val = page_private(page), }; | ||
| 129 | int type = swp_type(entry); | ||
| 130 | struct swap_info_struct *sis = swap_info[type]; | ||
| 131 | pgoff_t offset = swp_offset(entry); | ||
| 132 | |||
| 133 | BUG_ON(!PageLocked(page)); | ||
| 134 | BUG_ON(sis == NULL); | ||
| 135 | if (frontswap_test(sis, offset)) | ||
| 136 | dup = 1; | ||
| 137 | ret = (*frontswap_ops.store)(type, offset, page); | ||
| 138 | if (ret == 0) { | ||
| 139 | frontswap_set(sis, offset); | ||
| 140 | inc_frontswap_succ_stores(); | ||
| 141 | if (!dup) | ||
| 142 | atomic_inc(&sis->frontswap_pages); | ||
| 143 | } else if (dup) { | ||
| 144 | /* | ||
| 145 | failed dup always results in automatic invalidate of | ||
| 146 | the (older) page from frontswap | ||
| 147 | */ | ||
| 148 | frontswap_clear(sis, offset); | ||
| 149 | atomic_dec(&sis->frontswap_pages); | ||
| 150 | inc_frontswap_failed_stores(); | ||
| 151 | } else | ||
| 152 | inc_frontswap_failed_stores(); | ||
| 153 | if (frontswap_writethrough_enabled) | ||
| 154 | /* report failure so swap also writes to swap device */ | ||
| 155 | ret = -1; | ||
| 156 | return ret; | ||
| 157 | } | ||
| 158 | EXPORT_SYMBOL(__frontswap_store); | ||
| 159 | |||
| 160 | /* | ||
| 161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
| 162 | * specified when the data was put to frontswap and use it to fill the | ||
| 163 | * specified page with data. Page must be locked and in the swap cache. | ||
| 164 | */ | ||
| 165 | int __frontswap_load(struct page *page) | ||
| 166 | { | ||
| 167 | int ret = -1; | ||
| 168 | swp_entry_t entry = { .val = page_private(page), }; | ||
| 169 | int type = swp_type(entry); | ||
| 170 | struct swap_info_struct *sis = swap_info[type]; | ||
| 171 | pgoff_t offset = swp_offset(entry); | ||
| 172 | |||
| 173 | BUG_ON(!PageLocked(page)); | ||
| 174 | BUG_ON(sis == NULL); | ||
| 175 | if (frontswap_test(sis, offset)) | ||
| 176 | ret = (*frontswap_ops.load)(type, offset, page); | ||
| 177 | if (ret == 0) | ||
| 178 | inc_frontswap_loads(); | ||
| 179 | return ret; | ||
| 180 | } | ||
| 181 | EXPORT_SYMBOL(__frontswap_load); | ||
| 182 | |||
| 183 | /* | ||
| 184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
| 185 | * and offset so that a subsequent "get" will fail. | ||
| 186 | */ | ||
| 187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
| 188 | { | ||
| 189 | struct swap_info_struct *sis = swap_info[type]; | ||
| 190 | |||
| 191 | BUG_ON(sis == NULL); | ||
| 192 | if (frontswap_test(sis, offset)) { | ||
| 193 | (*frontswap_ops.invalidate_page)(type, offset); | ||
| 194 | atomic_dec(&sis->frontswap_pages); | ||
| 195 | frontswap_clear(sis, offset); | ||
| 196 | inc_frontswap_invalidates(); | ||
| 197 | } | ||
| 198 | } | ||
| 199 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
| 200 | |||
| 201 | /* | ||
| 202 | * Invalidate all data from frontswap associated with all offsets for the | ||
| 203 | * specified swaptype. | ||
| 204 | */ | ||
| 205 | void __frontswap_invalidate_area(unsigned type) | ||
| 206 | { | ||
| 207 | struct swap_info_struct *sis = swap_info[type]; | ||
| 208 | |||
| 209 | BUG_ON(sis == NULL); | ||
| 210 | if (sis->frontswap_map == NULL) | ||
| 211 | return; | ||
| 212 | (*frontswap_ops.invalidate_area)(type); | ||
| 213 | atomic_set(&sis->frontswap_pages, 0); | ||
| 214 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
| 215 | } | ||
| 216 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
| 217 | |||
| 218 | /* | ||
| 219 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
| 220 | * under certain circumstances; "shrink" frontswap is essentially a | ||
| 221 | * "partial swapoff" and works by calling try_to_unuse to attempt to | ||
| 222 | * unuse enough frontswap pages to attempt to -- subject to memory | ||
| 223 | * constraints -- reduce the number of pages in frontswap to the | ||
| 224 | * number given in the parameter target_pages. | ||
| 225 | */ | ||
| 226 | void frontswap_shrink(unsigned long target_pages) | ||
| 227 | { | ||
| 228 | struct swap_info_struct *si = NULL; | ||
| 229 | int si_frontswap_pages; | ||
| 230 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
| 231 | unsigned long pages = 0, pages_to_unuse = 0; | ||
| 232 | int type; | ||
| 233 | bool locked = false; | ||
| 234 | |||
| 235 | /* | ||
| 236 | * we don't want to hold swap_lock while doing a very | ||
| 237 | * lengthy try_to_unuse, but swap_list may change | ||
| 238 | * so restart scan from swap_list.head each time | ||
| 239 | */ | ||
| 240 | spin_lock(&swap_lock); | ||
| 241 | locked = true; | ||
| 242 | total_pages = 0; | ||
| 243 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 244 | si = swap_info[type]; | ||
| 245 | total_pages += atomic_read(&si->frontswap_pages); | ||
| 246 | } | ||
| 247 | if (total_pages <= target_pages) | ||
| 248 | goto out; | ||
| 249 | total_pages_to_unuse = total_pages - target_pages; | ||
| 250 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 251 | si = swap_info[type]; | ||
| 252 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
| 253 | if (total_pages_to_unuse < si_frontswap_pages) | ||
| 254 | pages = pages_to_unuse = total_pages_to_unuse; | ||
| 255 | else { | ||
| 256 | pages = si_frontswap_pages; | ||
| 257 | pages_to_unuse = 0; /* unuse all */ | ||
| 258 | } | ||
| 259 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
| 260 | if (security_vm_enough_memory_mm(current->mm, pages)) | ||
| 261 | continue; | ||
| 262 | vm_unacct_memory(pages); | ||
| 263 | break; | ||
| 264 | } | ||
| 265 | if (type < 0) | ||
| 266 | goto out; | ||
| 267 | locked = false; | ||
| 268 | spin_unlock(&swap_lock); | ||
| 269 | try_to_unuse(type, true, pages_to_unuse); | ||
| 270 | out: | ||
| 271 | if (locked) | ||
| 272 | spin_unlock(&swap_lock); | ||
| 273 | return; | ||
| 274 | } | ||
| 275 | EXPORT_SYMBOL(frontswap_shrink); | ||
| 276 | |||
| 277 | /* | ||
| 278 | * Count and return the number of frontswap pages across all | ||
| 279 | * swap devices. This is exported so that backend drivers can | ||
| 280 | * determine current usage without reading debugfs. | ||
| 281 | */ | ||
| 282 | unsigned long frontswap_curr_pages(void) | ||
| 283 | { | ||
| 284 | int type; | ||
| 285 | unsigned long totalpages = 0; | ||
| 286 | struct swap_info_struct *si = NULL; | ||
| 287 | |||
| 288 | spin_lock(&swap_lock); | ||
| 289 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
| 290 | si = swap_info[type]; | ||
| 291 | totalpages += atomic_read(&si->frontswap_pages); | ||
| 292 | } | ||
| 293 | spin_unlock(&swap_lock); | ||
| 294 | return totalpages; | ||
| 295 | } | ||
| 296 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
| 297 | |||
| 298 | static int __init init_frontswap(void) | ||
| 299 | { | ||
| 300 | #ifdef CONFIG_DEBUG_FS | ||
| 301 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
| 302 | if (root == NULL) | ||
| 303 | return -ENXIO; | ||
| 304 | debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); | ||
| 305 | debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); | ||
| 306 | debugfs_create_u64("failed_stores", S_IRUGO, root, | ||
| 307 | &frontswap_failed_stores); | ||
| 308 | debugfs_create_u64("invalidates", S_IRUGO, | ||
| 309 | root, &frontswap_invalidates); | ||
| 310 | #endif | ||
| 311 | return 0; | ||
| 312 | } | ||
| 313 | |||
| 314 | module_init(init_frontswap); | ||
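For context on how the hooks in this new file are meant to be consumed, here is a hypothetical backend skeleton. The `example_*` names are invented, and the member prototypes are inferred from the calls `__frontswap_store()`, `__frontswap_load()`, `__frontswap_invalidate_page()` and `__frontswap_invalidate_area()` make through `frontswap_ops`; the authoritative struct definition lives in include/linux/frontswap.h and is not part of this diff:

```c
/* Hypothetical frontswap backend skeleton -- illustration only. */
#include <linux/module.h>
#include <linux/frontswap.h>

static void example_init(unsigned type)
{
        /* set up per-swap-device state; called from __frontswap_init() */
}

static int example_store(unsigned type, pgoff_t offset, struct page *page)
{
        /* copy @page into the backend: 0 = stored, nonzero = refused */
        return -1;
}

static int example_load(unsigned type, pgoff_t offset, struct page *page)
{
        /* fill @page from the backend: 0 = hit, nonzero = miss */
        return -1;
}

static void example_invalidate_page(unsigned type, pgoff_t offset)
{
        /* drop whatever is held for (type, offset) */
}

static void example_invalidate_area(unsigned type)
{
        /* drop everything held for this swap type, e.g. at swapoff */
}

static struct frontswap_ops example_ops = {
        .init             = example_init,
        .store            = example_store,
        .load             = example_load,
        .invalidate_page  = example_invalidate_page,
        .invalidate_area  = example_invalidate_area,
};

static int __init example_register(void)
{
        /* the previous ops are returned, so backends could in principle chain */
        struct frontswap_ops old = frontswap_register_ops(&example_ops);

        (void)old;
        return 0;
}
module_init(example_register);
MODULE_LICENSE("GPL");
```

Registering the ops flips `frontswap_enabled`, after which swap_writepage()/swap_readpage() (patched in mm/page_io.c later in this diff) start offering swap pages to the backend before issuing real I/O.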
diff --git a/mm/madvise.c b/mm/madvise.c
index deff1b64a08c..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
| @@ -15,6 +15,7 @@ | |||
| 15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
| 16 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
| 17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
| 18 | #include <linux/file.h> | ||
| 18 | 19 | ||
| 19 | /* | 20 | /* |
| 20 | * Any behaviour which results in changes to the vma->vm_flags needs to | 21 | * Any behaviour which results in changes to the vma->vm_flags needs to |
| @@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 204 | { | 205 | { |
| 205 | loff_t offset; | 206 | loff_t offset; |
| 206 | int error; | 207 | int error; |
| 208 | struct file *f; | ||
| 207 | 209 | ||
| 208 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 210 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
| 209 | 211 | ||
| 210 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 212 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) |
| 211 | return -EINVAL; | 213 | return -EINVAL; |
| 212 | 214 | ||
| 213 | if (!vma->vm_file || !vma->vm_file->f_mapping | 215 | f = vma->vm_file; |
| 214 | || !vma->vm_file->f_mapping->host) { | 216 | |
| 217 | if (!f || !f->f_mapping || !f->f_mapping->host) { | ||
| 215 | return -EINVAL; | 218 | return -EINVAL; |
| 216 | } | 219 | } |
| 217 | 220 | ||
| @@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
| 221 | offset = (loff_t)(start - vma->vm_start) | 224 | offset = (loff_t)(start - vma->vm_start) |
| 222 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
| 223 | 226 | ||
| 224 | /* filesystem's fallocate may need to take i_mutex */ | 227 | /* |
| 228 | * Filesystem's fallocate may need to take i_mutex. We need to | ||
| 229 | * explicitly grab a reference because the vma (and hence the | ||
| 230 | * vma's reference to the file) can go away as soon as we drop | ||
| 231 | * mmap_sem. | ||
| 232 | */ | ||
| 233 | get_file(f); | ||
| 225 | up_read(¤t->mm->mmap_sem); | 234 | up_read(¤t->mm->mmap_sem); |
| 226 | error = do_fallocate(vma->vm_file, | 235 | error = do_fallocate(f, |
| 227 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | 236 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, |
| 228 | offset, end - start); | 237 | offset, end - start); |
| 238 | fput(f); | ||
| 229 | down_read(¤t->mm->mmap_sem); | 239 | down_read(¤t->mm->mmap_sem); |
| 230 | return error; | 240 | return error; |
| 231 | } | 241 | } |
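The hunk above hardens the kernel side of MADV_REMOVE; from userspace the path it services looks like the sketch below. This is an illustration and not part of the patch: the /dev/shm path and 4 KiB size are assumptions, and MADV_REMOVE only succeeds on filesystems that support hole punching (tmpfs does):

```c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const size_t len = 4096;
        char path[] = "/dev/shm/madv_remove_demo_XXXXXX"; /* assumed tmpfs mount */
        int fd = mkstemp(path);

        if (fd < 0 || ftruncate(fd, len) < 0) {
                perror("setup");
                return 1;
        }
        unlink(path);                    /* file lives on only via fd/mapping */

        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        memset(p, 0xaa, len);            /* dirty the shared mapping */
        if (madvise(p, len, MADV_REMOVE) != 0)
                perror("madvise(MADV_REMOVE)"); /* fs may not support punching */

        printf("first byte after MADV_REMOVE: 0x%02x\n", (unsigned char)p[0]);
        return 0;
}
```

On tmpfs the punched range reads back as zeroes, so the program prints 0x00; in the kernel this request is what reaches madvise_remove() and, via do_fallocate(), the filesystem's hole-punch path.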
diff --git a/mm/memblock.c b/mm/memblock.c
index 952123eba433..5cc6731b00cc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
| @@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
| 143 | MAX_NUMNODES); | 143 | MAX_NUMNODES); |
| 144 | } | 144 | } |
| 145 | 145 | ||
| 146 | /* | ||
| 147 | * Free memblock.reserved.regions | ||
| 148 | */ | ||
| 149 | int __init_memblock memblock_free_reserved_regions(void) | ||
| 150 | { | ||
| 151 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
| 152 | return 0; | ||
| 153 | |||
| 154 | return memblock_free(__pa(memblock.reserved.regions), | ||
| 155 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
| 156 | } | ||
| 157 | |||
| 158 | /* | ||
| 159 | * Reserve memblock.reserved.regions | ||
| 160 | */ | ||
| 161 | int __init_memblock memblock_reserve_reserved_regions(void) | ||
| 162 | { | ||
| 163 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
| 164 | return 0; | ||
| 165 | |||
| 166 | return memblock_reserve(__pa(memblock.reserved.regions), | ||
| 167 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
| 168 | } | ||
| 169 | |||
| 170 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 146 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
| 171 | { | 147 | { |
| 172 | type->total_size -= type->regions[r].size; | 148 | type->total_size -= type->regions[r].size; |
| @@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
| 184 | } | 160 | } |
| 185 | } | 161 | } |
| 186 | 162 | ||
| 187 | static int __init_memblock memblock_double_array(struct memblock_type *type) | 163 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( |
| 164 | phys_addr_t *addr) | ||
| 165 | { | ||
| 166 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
| 167 | return 0; | ||
| 168 | |||
| 169 | *addr = __pa(memblock.reserved.regions); | ||
| 170 | |||
| 171 | return PAGE_ALIGN(sizeof(struct memblock_region) * | ||
| 172 | memblock.reserved.max); | ||
| 173 | } | ||
| 174 | |||
| 175 | /** | ||
| 176 | * memblock_double_array - double the size of the memblock regions array | ||
| 177 | * @type: memblock type of the regions array being doubled | ||
| 178 | * @new_area_start: starting address of memory range to avoid overlap with | ||
| 179 | * @new_area_size: size of memory range to avoid overlap with | ||
| 180 | * | ||
| 181 | * Double the size of the @type regions array. If memblock is being used to | ||
| 182 | * allocate memory for a new reserved regions array and there is a previously | ||
| 183 | * allocated memory range [@new_area_start,@new_area_start+@new_area_size] | ||
| 184 | * waiting to be reserved, ensure the memory used by the new array does | ||
| 185 | * not overlap. | ||
| 186 | * | ||
| 187 | * RETURNS: | ||
| 188 | * 0 on success, -1 on failure. | ||
| 189 | */ | ||
| 190 | static int __init_memblock memblock_double_array(struct memblock_type *type, | ||
| 191 | phys_addr_t new_area_start, | ||
| 192 | phys_addr_t new_area_size) | ||
| 188 | { | 193 | { |
| 189 | struct memblock_region *new_array, *old_array; | 194 | struct memblock_region *new_array, *old_array; |
| 195 | phys_addr_t old_alloc_size, new_alloc_size; | ||
| 190 | phys_addr_t old_size, new_size, addr; | 196 | phys_addr_t old_size, new_size, addr; |
| 191 | int use_slab = slab_is_available(); | 197 | int use_slab = slab_is_available(); |
| 192 | int *in_slab; | 198 | int *in_slab; |
| @@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
| 200 | /* Calculate new doubled size */ | 206 | /* Calculate new doubled size */ |
| 201 | old_size = type->max * sizeof(struct memblock_region); | 207 | old_size = type->max * sizeof(struct memblock_region); |
| 202 | new_size = old_size << 1; | 208 | new_size = old_size << 1; |
| 209 | /* | ||
| 210 | * We need to allocated new one align to PAGE_SIZE, | ||
| 211 | * so we can free them completely later. | ||
| 212 | */ | ||
| 213 | old_alloc_size = PAGE_ALIGN(old_size); | ||
| 214 | new_alloc_size = PAGE_ALIGN(new_size); | ||
| 203 | 215 | ||
| 204 | /* Retrieve the slab flag */ | 216 | /* Retrieve the slab flag */ |
| 205 | if (type == &memblock.memory) | 217 | if (type == &memblock.memory) |
| @@ -222,7 +234,18 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
| 222 | new_array = kmalloc(new_size, GFP_KERNEL); | 234 | new_array = kmalloc(new_size, GFP_KERNEL); |
| 223 | addr = new_array ? __pa(new_array) : 0; | 235 | addr = new_array ? __pa(new_array) : 0; |
| 224 | } else { | 236 | } else { |
| 225 | addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); | 237 | /* only exclude range when trying to double reserved.regions */ |
| 238 | if (type != &memblock.reserved) | ||
| 239 | new_area_start = new_area_size = 0; | ||
| 240 | |||
| 241 | addr = memblock_find_in_range(new_area_start + new_area_size, | ||
| 242 | memblock.current_limit, | ||
| 243 | new_alloc_size, PAGE_SIZE); | ||
| 244 | if (!addr && new_area_size) | ||
| 245 | addr = memblock_find_in_range(0, | ||
| 246 | min(new_area_start, memblock.current_limit), | ||
| 247 | new_alloc_size, PAGE_SIZE); | ||
| 248 | |||
| 226 | new_array = addr ? __va(addr) : 0; | 249 | new_array = addr ? __va(addr) : 0; |
| 227 | } | 250 | } |
| 228 | if (!addr) { | 251 | if (!addr) { |
| @@ -251,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
| 251 | kfree(old_array); | 274 | kfree(old_array); |
| 252 | else if (old_array != memblock_memory_init_regions && | 275 | else if (old_array != memblock_memory_init_regions && |
| 253 | old_array != memblock_reserved_init_regions) | 276 | old_array != memblock_reserved_init_regions) |
| 254 | memblock_free(__pa(old_array), old_size); | 277 | memblock_free(__pa(old_array), old_alloc_size); |
| 255 | 278 | ||
| 256 | /* Reserve the new array if that comes from the memblock. | 279 | /* Reserve the new array if that comes from the memblock. |
| 257 | * Otherwise, we needn't do it | 280 | * Otherwise, we needn't do it |
| 258 | */ | 281 | */ |
| 259 | if (!use_slab) | 282 | if (!use_slab) |
| 260 | BUG_ON(memblock_reserve(addr, new_size)); | 283 | BUG_ON(memblock_reserve(addr, new_alloc_size)); |
| 261 | 284 | ||
| 262 | /* Update slab flag */ | 285 | /* Update slab flag */ |
| 263 | *in_slab = use_slab; | 286 | *in_slab = use_slab; |
| @@ -399,7 +422,7 @@ repeat: | |||
| 399 | */ | 422 | */ |
| 400 | if (!insert) { | 423 | if (!insert) { |
| 401 | while (type->cnt + nr_new > type->max) | 424 | while (type->cnt + nr_new > type->max) |
| 402 | if (memblock_double_array(type) < 0) | 425 | if (memblock_double_array(type, obase, size) < 0) |
| 403 | return -ENOMEM; | 426 | return -ENOMEM; |
| 404 | insert = true; | 427 | insert = true; |
| 405 | goto repeat; | 428 | goto repeat; |
| @@ -450,7 +473,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
| 450 | 473 | ||
| 451 | /* we'll create at most two more regions */ | 474 | /* we'll create at most two more regions */ |
| 452 | while (type->cnt + 2 > type->max) | 475 | while (type->cnt + 2 > type->max) |
| 453 | if (memblock_double_array(type) < 0) | 476 | if (memblock_double_array(type, base, size) < 0) |
| 454 | return -ENOMEM; | 477 | return -ENOMEM; |
| 455 | 478 | ||
| 456 | for (i = 0; i < type->cnt; i++) { | 479 | for (i = 0; i < type->cnt; i++) { |
| @@ -540,9 +563,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
| 540 | * __next_free_mem_range - next function for for_each_free_mem_range() | 563 | * __next_free_mem_range - next function for for_each_free_mem_range() |
| 541 | * @idx: pointer to u64 loop variable | 564 | * @idx: pointer to u64 loop variable |
| 542 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 565 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
| 543 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 566 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
| 544 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 567 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
| 545 | * @p_nid: ptr to int for nid of the range, can be %NULL | 568 | * @out_nid: ptr to int for nid of the range, can be %NULL |
| 546 | * | 569 | * |
| 547 | * Find the first free area from *@idx which matches @nid, fill the out | 570 | * Find the first free area from *@idx which matches @nid, fill the out |
| 548 | * parameters, and update *@idx for the next iteration. The lower 32bit of | 571 | * parameters, and update *@idx for the next iteration. The lower 32bit of |
| @@ -616,9 +639,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
| 616 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 639 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
| 617 | * @idx: pointer to u64 loop variable | 640 | * @idx: pointer to u64 loop variable |
| 618 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 641 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
| 619 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 642 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
| 620 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 643 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
| 621 | * @p_nid: ptr to int for nid of the range, can be %NULL | 644 | * @out_nid: ptr to int for nid of the range, can be %NULL |
| 622 | * | 645 | * |
| 623 | * Reverse of __next_free_mem_range(). | 646 | * Reverse of __next_free_mem_range(). |
| 624 | */ | 647 | */ |
| @@ -867,6 +890,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) | |||
| 867 | return memblock_search(&memblock.memory, addr) != -1; | 890 | return memblock_search(&memblock.memory, addr) != -1; |
| 868 | } | 891 | } |
| 869 | 892 | ||
| 893 | /** | ||
| 894 | * memblock_is_region_memory - check if a region is a subset of memory | ||
| 895 | * @base: base of region to check | ||
| 896 | * @size: size of region to check | ||
| 897 | * | ||
| 898 | * Check if the region [@base, @base+@size) is a subset of a memory block. | ||
| 899 | * | ||
| 900 | * RETURNS: | ||
| 901 | * 0 if false, non-zero if true | ||
| 902 | */ | ||
| 870 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) | 903 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) |
| 871 | { | 904 | { |
| 872 | int idx = memblock_search(&memblock.memory, base); | 905 | int idx = memblock_search(&memblock.memory, base); |
| @@ -879,6 +912,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size | |||
| 879 | memblock.memory.regions[idx].size) >= end; | 912 | memblock.memory.regions[idx].size) >= end; |
| 880 | } | 913 | } |
| 881 | 914 | ||
| 915 | /** | ||
| 916 | * memblock_is_region_reserved - check if a region intersects reserved memory | ||
| 917 | * @base: base of region to check | ||
| 918 | * @size: size of region to check | ||
| 919 | * | ||
| 920 | * Check if the region [@base, @base+@size) intersects a reserved memory block. | ||
| 921 | * | ||
| 922 | * RETURNS: | ||
| 923 | * 0 if false, non-zero if true | ||
| 924 | */ | ||
| 882 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) | 925 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) |
| 883 | { | 926 | { |
| 884 | memblock_cap_size(base, &size); | 927 | memblock_cap_size(base, &size); |
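The new memblock_double_array() kernel-doc describes a placement policy (avoid the range that is waiting to be reserved, prefer space above it, fall back below it) rather than showing it end to end, so here is a toy userspace model of just that policy. Everything in it (`toy_find_in_range()`, the address constants) is invented for illustration and has no counterpart in the real allocator:

```c
#include <stdio.h>

/* "Allocate" at the lowest address of [lo, hi) that fits; 0 means failure. */
static unsigned long toy_find_in_range(unsigned long lo, unsigned long hi,
                                       unsigned long want)
{
        return (hi > lo && hi - lo >= want) ? lo : 0;
}

int main(void)
{
        unsigned long limit = 1UL << 20;        /* stand-in for current_limit   */
        unsigned long new_area_start = 0x40000; /* range waiting to be reserved */
        unsigned long new_area_size  = 0x20000;
        unsigned long new_alloc_size = 0x1000;  /* PAGE_ALIGN(new_size)         */
        unsigned long addr;

        /* first try: strictly above the pending reservation */
        addr = toy_find_in_range(new_area_start + new_area_size, limit,
                                 new_alloc_size);
        /* fallback: below it, mirroring the second memblock_find_in_range() */
        if (!addr && new_area_size)
                addr = toy_find_in_range(0x1000, /* toy floor so 0 can mean failure */
                                new_area_start < limit ? new_area_start : limit,
                                new_alloc_size);

        printf("doubled array placed at %#lx\n", addr);
        return 0;
}
```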
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ac35bccadb7b..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
| @@ -1148,7 +1148,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | |||
| 1148 | { | 1148 | { |
| 1149 | if (root_memcg == memcg) | 1149 | if (root_memcg == memcg) |
| 1150 | return true; | 1150 | return true; |
| 1151 | if (!root_memcg->use_hierarchy) | 1151 | if (!root_memcg->use_hierarchy || !memcg) |
| 1152 | return false; | 1152 | return false; |
| 1153 | return css_is_ancestor(&memcg->css, &root_memcg->css); | 1153 | return css_is_ancestor(&memcg->css, &root_memcg->css); |
| 1154 | } | 1154 | } |
| @@ -1234,7 +1234,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | |||
| 1234 | 1234 | ||
| 1235 | /** | 1235 | /** |
| 1236 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1236 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
| 1237 | * @mem: the memory cgroup | 1237 | * @memcg: the memory cgroup |
| 1238 | * | 1238 | * |
| 1239 | * Returns the maximum amount of memory @mem can be charged with, in | 1239 | * Returns the maximum amount of memory @mem can be charged with, in |
| 1240 | * pages. | 1240 | * pages. |
| @@ -1508,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
| 1508 | 1508 | ||
| 1509 | /** | 1509 | /** |
| 1510 | * test_mem_cgroup_node_reclaimable | 1510 | * test_mem_cgroup_node_reclaimable |
| 1511 | * @mem: the target memcg | 1511 | * @memcg: the target memcg |
| 1512 | * @nid: the node ID to be checked. | 1512 | * @nid: the node ID to be checked. |
| 1513 | * @noswap : specify true here if the user wants flle only information. | 1513 | * @noswap : specify true here if the user wants flle only information. |
| 1514 | * | 1514 | * |
diff --git a/mm/memory.c b/mm/memory.c
index 1b7dc662bf9f..2466d1250231 100644
--- a/mm/memory.c
+++ b/mm/memory.c
| @@ -1225,7 +1225,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
| 1225 | next = pmd_addr_end(addr, end); | 1225 | next = pmd_addr_end(addr, end); |
| 1226 | if (pmd_trans_huge(*pmd)) { | 1226 | if (pmd_trans_huge(*pmd)) { |
| 1227 | if (next - addr != HPAGE_PMD_SIZE) { | 1227 | if (next - addr != HPAGE_PMD_SIZE) { |
| 1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1228 | #ifdef CONFIG_DEBUG_VM |
| 1229 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { | ||
| 1230 | pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", | ||
| 1231 | __func__, addr, end, | ||
| 1232 | vma->vm_start, | ||
| 1233 | vma->vm_end); | ||
| 1234 | BUG(); | ||
| 1235 | } | ||
| 1236 | #endif | ||
| 1229 | split_huge_page_pmd(vma->vm_mm, pmd); | 1237 | split_huge_page_pmd(vma->vm_mm, pmd); |
| 1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1238 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
| 1231 | goto next; | 1239 | goto next; |
| @@ -1366,7 +1374,7 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
| 1366 | /** | 1374 | /** |
| 1367 | * zap_page_range - remove user pages in a given range | 1375 | * zap_page_range - remove user pages in a given range |
| 1368 | * @vma: vm_area_struct holding the applicable pages | 1376 | * @vma: vm_area_struct holding the applicable pages |
| 1369 | * @address: starting address of pages to zap | 1377 | * @start: starting address of pages to zap |
| 1370 | * @size: number of bytes to zap | 1378 | * @size: number of bytes to zap |
| 1371 | * @details: details of nonlinear truncation or shared cache invalidation | 1379 | * @details: details of nonlinear truncation or shared cache invalidation |
| 1372 | * | 1380 | * |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0d7e3ec8e0f3..427bb291dd0f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
| @@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
| 618 | pgdat = hotadd_new_pgdat(nid, start); | 618 | pgdat = hotadd_new_pgdat(nid, start); |
| 619 | ret = -ENOMEM; | 619 | ret = -ENOMEM; |
| 620 | if (!pgdat) | 620 | if (!pgdat) |
| 621 | goto out; | 621 | goto error; |
| 622 | new_pgdat = 1; | 622 | new_pgdat = 1; |
| 623 | } | 623 | } |
| 624 | 624 | ||
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f15c1b24ca18..1d771e4200d2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
| @@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
| 1177 | if (!list_empty(&pagelist)) { | 1177 | if (!list_empty(&pagelist)) { |
| 1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
| 1179 | (unsigned long)vma, | 1179 | (unsigned long)vma, |
| 1180 | false, true); | 1180 | false, MIGRATE_SYNC); |
| 1181 | if (nr_failed) | 1181 | if (nr_failed) |
| 1182 | putback_lru_pages(&pagelist); | 1182 | putback_lru_pages(&pagelist); |
| 1183 | } | 1183 | } |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index d23415c001bc..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
| @@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) | |||
| 105 | __free_pages_bootmem(pfn_to_page(i), 0); | 105 | __free_pages_bootmem(pfn_to_page(i), 0); |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | static unsigned long __init __free_memory_core(phys_addr_t start, | ||
| 109 | phys_addr_t end) | ||
| 110 | { | ||
| 111 | unsigned long start_pfn = PFN_UP(start); | ||
| 112 | unsigned long end_pfn = min_t(unsigned long, | ||
| 113 | PFN_DOWN(end), max_low_pfn); | ||
| 114 | |||
| 115 | if (start_pfn > end_pfn) | ||
| 116 | return 0; | ||
| 117 | |||
| 118 | __free_pages_memory(start_pfn, end_pfn); | ||
| 119 | |||
| 120 | return end_pfn - start_pfn; | ||
| 121 | } | ||
| 122 | |||
| 108 | unsigned long __init free_low_memory_core_early(int nodeid) | 123 | unsigned long __init free_low_memory_core_early(int nodeid) |
| 109 | { | 124 | { |
| 110 | unsigned long count = 0; | 125 | unsigned long count = 0; |
| 111 | phys_addr_t start, end; | 126 | phys_addr_t start, end, size; |
| 112 | u64 i; | 127 | u64 i; |
| 113 | 128 | ||
| 114 | /* free reserved array temporarily so that it's treated as free area */ | 129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
| 115 | memblock_free_reserved_regions(); | 130 | count += __free_memory_core(start, end); |
| 116 | 131 | ||
| 117 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { | 132 | /* free range that is used for reserved array if we allocate it */ |
| 118 | unsigned long start_pfn = PFN_UP(start); | 133 | size = get_allocated_memblock_reserved_regions_info(&start); |
| 119 | unsigned long end_pfn = min_t(unsigned long, | 134 | if (size) |
| 120 | PFN_DOWN(end), max_low_pfn); | 135 | count += __free_memory_core(start, start + size); |
| 121 | if (start_pfn < end_pfn) { | ||
| 122 | __free_pages_memory(start_pfn, end_pfn); | ||
| 123 | count += end_pfn - start_pfn; | ||
| 124 | } | ||
| 125 | } | ||
| 126 | 136 | ||
| 127 | /* put region array back? */ | ||
| 128 | memblock_reserve_reserved_regions(); | ||
| 129 | return count; | 137 | return count; |
| 130 | } | 138 | } |
| 131 | 139 | ||
| @@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
| 274 | return ___alloc_bootmem(size, align, goal, limit); | 282 | return ___alloc_bootmem(size, align, goal, limit); |
| 275 | } | 283 | } |
| 276 | 284 | ||
| 277 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | 285 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
| 278 | unsigned long size, | 286 | unsigned long size, |
| 279 | unsigned long align, | 287 | unsigned long align, |
| 280 | unsigned long goal, | 288 | unsigned long goal, |
diff --git a/mm/nommu.c b/mm/nommu.c
index c4acfbc09972..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
| @@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
| 1486 | 1486 | ||
| 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
| 1488 | 1488 | ||
| 1489 | ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); | 1489 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
| 1490 | 1490 | ||
| 1491 | if (file) | 1491 | if (file) |
| 1492 | fput(file); | 1492 | fput(file); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ed0e19677360..ac300c99baf6 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
| @@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
| 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
| 184 | const nodemask_t *nodemask, unsigned long totalpages) | 184 | const nodemask_t *nodemask, unsigned long totalpages) |
| 185 | { | 185 | { |
| 186 | unsigned long points; | 186 | long points; |
| 187 | long adj; | ||
| 187 | 188 | ||
| 188 | if (oom_unkillable_task(p, memcg, nodemask)) | 189 | if (oom_unkillable_task(p, memcg, nodemask)) |
| 189 | return 0; | 190 | return 0; |
| @@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
| 192 | if (!p) | 193 | if (!p) |
| 193 | return 0; | 194 | return 0; |
| 194 | 195 | ||
| 195 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | 196 | adj = p->signal->oom_score_adj; |
| 197 | if (adj == OOM_SCORE_ADJ_MIN) { | ||
| 196 | task_unlock(p); | 198 | task_unlock(p); |
| 197 | return 0; | 199 | return 0; |
| 198 | } | 200 | } |
| @@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
| 210 | * implementation used by LSMs. | 212 | * implementation used by LSMs. |
| 211 | */ | 213 | */ |
| 212 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 214 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
| 213 | points -= 30 * totalpages / 1000; | 215 | adj -= 30; |
| 214 | 216 | ||
| 215 | /* | 217 | /* Normalize to oom_score_adj units */ |
| 216 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may | 218 | adj *= totalpages / 1000; |
| 217 | * either completely disable oom killing or always prefer a certain | 219 | points += adj; |
| 218 | * task. | ||
| 219 | */ | ||
| 220 | points += p->signal->oom_score_adj * totalpages / 1000; | ||
| 221 | 220 | ||
| 222 | /* | 221 | /* |
| 223 | * Never return 0 for an eligible task regardless of the root bonus and | 222 | * Never return 0 for an eligible task regardless of the root bonus and |
| 224 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). | 223 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). |
| 225 | */ | 224 | */ |
| 226 | return points ? points : 1; | 225 | return points > 0 ? points : 1; |
| 227 | } | 226 | } |
| 228 | 227 | ||
| 229 | /* | 228 | /* |
| @@ -366,7 +365,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
| 366 | 365 | ||
| 367 | /** | 366 | /** |
| 368 | * dump_tasks - dump current memory state of all system tasks | 367 | * dump_tasks - dump current memory state of all system tasks |
| 369 | * @mem: current's memory controller, if constrained | 368 | * @memcg: current's memory controller, if constrained |
| 370 | * @nodemask: nodemask passed to page allocator for mempolicy ooms | 369 | * @nodemask: nodemask passed to page allocator for mempolicy ooms |
| 371 | * | 370 | * |
| 372 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 371 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
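The oom_badness() hunk above replaces per-heuristic scaling with one normalization step: work in oom_score_adj units, subtract the root bonus there, then multiply by totalpages/1000 before adding to the score. A small standalone model of just that adjustment (the base score derived from rss/swap is not shown in the hunk, so `base_points` and the sample numbers below are made up):

```c
#include <stdio.h>

/* Simplified model of the adjustment step changed in the hunk above. */
static long toy_badness(long base_points, long oom_score_adj,
                        int has_cap_sys_admin, unsigned long totalpages)
{
        long points = base_points;
        long adj = oom_score_adj;               /* -1000 .. +1000 */

        if (has_cap_sys_admin)
                adj -= 30;                      /* root bonus, now in adj units */

        adj *= totalpages / 1000;               /* normalize to pages */
        points += adj;

        /* never return 0 for an eligible task */
        return points > 0 ? points : 1;
}

int main(void)
{
        unsigned long totalpages = 1000000;     /* pretend 1M pages of RAM+swap */

        printf("%ld\n", toy_badness(50000, -999, 0, totalpages));
        printf("%ld\n", toy_badness(50000,  500, 0, totalpages));
        return 0;
}
```

With totalpages = 1,000,000, an oom_score_adj of -999 swamps a 50,000-page footprint and the score clamps to 1, while +500 adds half a million points; signed `long` arithmetic is what keeps the negative intermediate result from wrapping, which is the point of the `unsigned long points` to `long points` change.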
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 44030096da63..4a4f9219683f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
| @@ -5635,7 +5635,12 @@ static struct page * | |||
| 5635 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | 5635 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, |
| 5636 | int **resultp) | 5636 | int **resultp) |
| 5637 | { | 5637 | { |
| 5638 | return alloc_page(GFP_HIGHUSER_MOVABLE); | 5638 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; |
| 5639 | |||
| 5640 | if (PageHighMem(page)) | ||
| 5641 | gfp_mask |= __GFP_HIGHMEM; | ||
| 5642 | |||
| 5643 | return alloc_page(gfp_mask); | ||
| 5639 | } | 5644 | } |
| 5640 | 5645 | ||
| 5641 | /* [start, end) must belong to a single zone. */ | 5646 | /* [start, end) must belong to a single zone. */ |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..eb750f851395 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
| @@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | |||
| 392 | 392 | ||
| 393 | /** | 393 | /** |
| 394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | 394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. |
| 395 | * @end: swap entry to be cmpxchged | 395 | * @ent: swap entry to be cmpxchged |
| 396 | * @old: old id | 396 | * @old: old id |
| 397 | * @new: new id | 397 | * @new: new id |
| 398 | * | 398 | * |
| @@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |||
| 422 | /** | 422 | /** |
| 423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
| 424 | * @ent: swap entry to be recorded into | 424 | * @ent: swap entry to be recorded into |
| 425 | * @mem: mem_cgroup to be recorded | 425 | * @id: mem_cgroup to be recorded |
| 426 | * | 426 | * |
| 427 | * Returns old value at success, 0 at failure. | 427 | * Returns old value at success, 0 at failure. |
| 428 | * (Of course, old value can be 0.) | 428 | * (Of course, old value can be 0.) |
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..34f02923744c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
| @@ -18,6 +18,7 @@ | |||
| 18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
| 19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
| 20 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
| 21 | #include <linux/frontswap.h> | ||
| 21 | #include <asm/pgtable.h> | 22 | #include <asm/pgtable.h> |
| 22 | 23 | ||
| 23 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 24 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
| @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
| 98 | unlock_page(page); | 99 | unlock_page(page); |
| 99 | goto out; | 100 | goto out; |
| 100 | } | 101 | } |
| 102 | if (frontswap_store(page) == 0) { | ||
| 103 | set_page_writeback(page); | ||
| 104 | unlock_page(page); | ||
| 105 | end_page_writeback(page); | ||
| 106 | goto out; | ||
| 107 | } | ||
| 101 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 108 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
| 102 | if (bio == NULL) { | 109 | if (bio == NULL) { |
| 103 | set_page_dirty(page); | 110 | set_page_dirty(page); |
| @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) | |||
| 122 | 129 | ||
| 123 | VM_BUG_ON(!PageLocked(page)); | 130 | VM_BUG_ON(!PageLocked(page)); |
| 124 | VM_BUG_ON(PageUptodate(page)); | 131 | VM_BUG_ON(PageUptodate(page)); |
| 132 | if (frontswap_load(page) == 0) { | ||
| 133 | SetPageUptodate(page); | ||
| 134 | unlock_page(page); | ||
| 135 | goto out; | ||
| 136 | } | ||
| 125 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 137 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
| 126 | if (bio == NULL) { | 138 | if (bio == NULL) { |
| 127 | unlock_page(page); | 139 | unlock_page(page); |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
| @@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
| 162 | 162 | ||
| 163 | /** | 163 | /** |
| 164 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
| 165 | * @mm: memory map to walk | ||
| 166 | * @addr: starting address | 165 | * @addr: starting address |
| 167 | * @end: ending address | 166 | * @end: ending address |
| 168 | * @walk: set of callbacks to invoke for each level of the tree | 167 | * @walk: set of callbacks to invoke for each level of the tree |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
| @@ -360,7 +360,6 @@ err_free: | |||
| 360 | * @chunk: chunk to depopulate | 360 | * @chunk: chunk to depopulate |
| 361 | * @off: offset to the area to depopulate | 361 | * @off: offset to the area to depopulate |
| 362 | * @size: size of the area to depopulate in bytes | 362 | * @size: size of the area to depopulate in bytes |
| 363 | * @flush: whether to flush cache and tlb or not | ||
| 364 | * | 363 | * |
| 365 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 364 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
| 366 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 365 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
diff --git a/mm/shmem.c b/mm/shmem.c
index c244e93a70fa..bd106361be4b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
| @@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, | |||
| 264 | } | 264 | } |
| 265 | 265 | ||
| 266 | /* | 266 | /* |
| 267 | * Sometimes, before we decide whether to proceed or to fail, we must check | ||
| 268 | * that an entry was not already brought back from swap by a racing thread. | ||
| 269 | * | ||
| 270 | * Checking page is not enough: by the time a SwapCache page is locked, it | ||
| 271 | * might be reused, and again be SwapCache, using the same swap as before. | ||
| 272 | */ | ||
| 273 | static bool shmem_confirm_swap(struct address_space *mapping, | ||
| 274 | pgoff_t index, swp_entry_t swap) | ||
| 275 | { | ||
| 276 | void *item; | ||
| 277 | |||
| 278 | rcu_read_lock(); | ||
| 279 | item = radix_tree_lookup(&mapping->page_tree, index); | ||
| 280 | rcu_read_unlock(); | ||
| 281 | return item == swp_to_radix_entry(swap); | ||
| 282 | } | ||
| 283 | |||
| 284 | /* | ||
| 267 | * Like add_to_page_cache_locked, but error if expected item has gone. | 285 | * Like add_to_page_cache_locked, but error if expected item has gone. |
| 268 | */ | 286 | */ |
| 269 | static int shmem_add_to_page_cache(struct page *page, | 287 | static int shmem_add_to_page_cache(struct page *page, |
| 270 | struct address_space *mapping, | 288 | struct address_space *mapping, |
| 271 | pgoff_t index, gfp_t gfp, void *expected) | 289 | pgoff_t index, gfp_t gfp, void *expected) |
| 272 | { | 290 | { |
| 273 | int error = 0; | 291 | int error; |
| 274 | 292 | ||
| 275 | VM_BUG_ON(!PageLocked(page)); | 293 | VM_BUG_ON(!PageLocked(page)); |
| 276 | VM_BUG_ON(!PageSwapBacked(page)); | 294 | VM_BUG_ON(!PageSwapBacked(page)); |
| 277 | 295 | ||
| 296 | page_cache_get(page); | ||
| 297 | page->mapping = mapping; | ||
| 298 | page->index = index; | ||
| 299 | |||
| 300 | spin_lock_irq(&mapping->tree_lock); | ||
| 278 | if (!expected) | 301 | if (!expected) |
| 279 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 302 | error = radix_tree_insert(&mapping->page_tree, index, page); |
| 303 | else | ||
| 304 | error = shmem_radix_tree_replace(mapping, index, expected, | ||
| 305 | page); | ||
| 280 | if (!error) { | 306 | if (!error) { |
| 281 | page_cache_get(page); | 307 | mapping->nrpages++; |
| 282 | page->mapping = mapping; | 308 | __inc_zone_page_state(page, NR_FILE_PAGES); |
| 283 | page->index = index; | 309 | __inc_zone_page_state(page, NR_SHMEM); |
| 284 | 310 | spin_unlock_irq(&mapping->tree_lock); | |
| 285 | spin_lock_irq(&mapping->tree_lock); | 311 | } else { |
| 286 | if (!expected) | 312 | page->mapping = NULL; |
| 287 | error = radix_tree_insert(&mapping->page_tree, | 313 | spin_unlock_irq(&mapping->tree_lock); |
| 288 | index, page); | 314 | page_cache_release(page); |
| 289 | else | ||
| 290 | error = shmem_radix_tree_replace(mapping, index, | ||
| 291 | expected, page); | ||
| 292 | if (!error) { | ||
| 293 | mapping->nrpages++; | ||
| 294 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
| 295 | __inc_zone_page_state(page, NR_SHMEM); | ||
| 296 | spin_unlock_irq(&mapping->tree_lock); | ||
| 297 | } else { | ||
| 298 | page->mapping = NULL; | ||
| 299 | spin_unlock_irq(&mapping->tree_lock); | ||
| 300 | page_cache_release(page); | ||
| 301 | } | ||
| 302 | if (!expected) | ||
| 303 | radix_tree_preload_end(); | ||
| 304 | } | 315 | } |
| 305 | if (error) | ||
| 306 | mem_cgroup_uncharge_cache_page(page); | ||
| 307 | return error; | 316 | return error; |
| 308 | } | 317 | } |
| 309 | 318 | ||
| @@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
| 683 | mutex_lock(&shmem_swaplist_mutex); | 692 | mutex_lock(&shmem_swaplist_mutex); |
| 684 | /* | 693 | /* |
| 685 | * We needed to drop mutex to make that restrictive page | 694 | * We needed to drop mutex to make that restrictive page |
| 686 | * allocation; but the inode might already be freed by now, | 695 | * allocation, but the inode might have been freed while we |
| 687 | * and we cannot refer to inode or mapping or info to check. | 696 | * dropped it: although a racing shmem_evict_inode() cannot |
| 688 | * However, we do hold page lock on the PageSwapCache page, | 697 | * complete without emptying the radix_tree, our page lock |
| 689 | * so can check if that still has our reference remaining. | 698 | * on this swapcache page is not enough to prevent that - |
| 699 | * free_swap_and_cache() of our swap entry will only | ||
| 700 | * trylock_page(), removing swap from radix_tree whatever. | ||
| 701 | * | ||
| 702 | * We must not proceed to shmem_add_to_page_cache() if the | ||
| 703 | * inode has been freed, but of course we cannot rely on | ||
| 704 | * inode or mapping or info to check that. However, we can | ||
| 705 | * safely check if our swap entry is still in use (and here | ||
| 706 | * it can't have got reused for another page): if it's still | ||
| 707 | * in use, then the inode cannot have been freed yet, and we | ||
| 708 | * can safely proceed (if it's no longer in use, that tells | ||
| 709 | * nothing about the inode, but we don't need to unuse swap). | ||
| 690 | */ | 710 | */ |
| 691 | if (!page_swapcount(*pagep)) | 711 | if (!page_swapcount(*pagep)) |
| 692 | error = -ENOENT; | 712 | error = -ENOENT; |
| @@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
| 730 | 750 | ||
| 731 | /* | 751 | /* |
| 732 | * There's a faint possibility that swap page was replaced before | 752 | * There's a faint possibility that swap page was replaced before |
| 733 | * caller locked it: it will come back later with the right page. | 753 | * caller locked it: caller will come back later with the right page. |
| 734 | */ | 754 | */ |
| 735 | if (unlikely(!PageSwapCache(page))) | 755 | if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) |
| 736 | goto out; | 756 | goto out; |
| 737 | 757 | ||
| 738 | /* | 758 | /* |
| @@ -995,21 +1015,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
| 995 | newpage = shmem_alloc_page(gfp, info, index); | 1015 | newpage = shmem_alloc_page(gfp, info, index); |
| 996 | if (!newpage) | 1016 | if (!newpage) |
| 997 | return -ENOMEM; | 1017 | return -ENOMEM; |
| 998 | VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); | ||
| 999 | 1018 | ||
| 1000 | *pagep = newpage; | ||
| 1001 | page_cache_get(newpage); | 1019 | page_cache_get(newpage); |
| 1002 | copy_highpage(newpage, oldpage); | 1020 | copy_highpage(newpage, oldpage); |
| 1021 | flush_dcache_page(newpage); | ||
| 1003 | 1022 | ||
| 1004 | VM_BUG_ON(!PageLocked(oldpage)); | ||
| 1005 | __set_page_locked(newpage); | 1023 | __set_page_locked(newpage); |
| 1006 | VM_BUG_ON(!PageUptodate(oldpage)); | ||
| 1007 | SetPageUptodate(newpage); | 1024 | SetPageUptodate(newpage); |
| 1008 | VM_BUG_ON(!PageSwapBacked(oldpage)); | ||
| 1009 | SetPageSwapBacked(newpage); | 1025 | SetPageSwapBacked(newpage); |
| 1010 | VM_BUG_ON(!swap_index); | ||
| 1011 | set_page_private(newpage, swap_index); | 1026 | set_page_private(newpage, swap_index); |
| 1012 | VM_BUG_ON(!PageSwapCache(oldpage)); | ||
| 1013 | SetPageSwapCache(newpage); | 1027 | SetPageSwapCache(newpage); |
| 1014 | 1028 | ||
| 1015 | /* | 1029 | /* |
| @@ -1019,13 +1033,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
| 1019 | spin_lock_irq(&swap_mapping->tree_lock); | 1033 | spin_lock_irq(&swap_mapping->tree_lock); |
| 1020 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | 1034 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, |
| 1021 | newpage); | 1035 | newpage); |
| 1022 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 1036 | if (!error) { |
| 1023 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | 1037 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
| 1038 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | ||
| 1039 | } | ||
| 1024 | spin_unlock_irq(&swap_mapping->tree_lock); | 1040 | spin_unlock_irq(&swap_mapping->tree_lock); |
| 1025 | BUG_ON(error); | ||
| 1026 | 1041 | ||
| 1027 | mem_cgroup_replace_page_cache(oldpage, newpage); | 1042 | if (unlikely(error)) { |
| 1028 | lru_cache_add_anon(newpage); | 1043 | /* |
| 1044 | * Is this possible? I think not, now that our callers check | ||
| 1045 | * both PageSwapCache and page_private after getting page lock; | ||
| 1046 | * but be defensive. Reverse old to newpage for clear and free. | ||
| 1047 | */ | ||
| 1048 | oldpage = newpage; | ||
| 1049 | } else { | ||
| 1050 | mem_cgroup_replace_page_cache(oldpage, newpage); | ||
| 1051 | lru_cache_add_anon(newpage); | ||
| 1052 | *pagep = newpage; | ||
| 1053 | } | ||
| 1029 | 1054 | ||
| 1030 | ClearPageSwapCache(oldpage); | 1055 | ClearPageSwapCache(oldpage); |
| 1031 | set_page_private(oldpage, 0); | 1056 | set_page_private(oldpage, 0); |
| @@ -1033,7 +1058,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
| 1033 | unlock_page(oldpage); | 1058 | unlock_page(oldpage); |
| 1034 | page_cache_release(oldpage); | 1059 | page_cache_release(oldpage); |
| 1035 | page_cache_release(oldpage); | 1060 | page_cache_release(oldpage); |
| 1036 | return 0; | 1061 | return error; |
| 1037 | } | 1062 | } |
| 1038 | 1063 | ||
| 1039 | /* | 1064 | /* |
| @@ -1107,9 +1132,10 @@ repeat: | |||
| 1107 | 1132 | ||
| 1108 | /* We have to do this with page locked to prevent races */ | 1133 | /* We have to do this with page locked to prevent races */ |
| 1109 | lock_page(page); | 1134 | lock_page(page); |
| 1110 | if (!PageSwapCache(page) || page->mapping) { | 1135 | if (!PageSwapCache(page) || page_private(page) != swap.val || |
| 1136 | !shmem_confirm_swap(mapping, index, swap)) { | ||
| 1111 | error = -EEXIST; /* try again */ | 1137 | error = -EEXIST; /* try again */ |
| 1112 | goto failed; | 1138 | goto unlock; |
| 1113 | } | 1139 | } |
| 1114 | if (!PageUptodate(page)) { | 1140 | if (!PageUptodate(page)) { |
| 1115 | error = -EIO; | 1141 | error = -EIO; |
| @@ -1125,9 +1151,12 @@ repeat: | |||
| 1125 | 1151 | ||
| 1126 | error = mem_cgroup_cache_charge(page, current->mm, | 1152 | error = mem_cgroup_cache_charge(page, current->mm, |
| 1127 | gfp & GFP_RECLAIM_MASK); | 1153 | gfp & GFP_RECLAIM_MASK); |
| 1128 | if (!error) | 1154 | if (!error) { |
| 1129 | error = shmem_add_to_page_cache(page, mapping, index, | 1155 | error = shmem_add_to_page_cache(page, mapping, index, |
| 1130 | gfp, swp_to_radix_entry(swap)); | 1156 | gfp, swp_to_radix_entry(swap)); |
| 1157 | /* We already confirmed swap, and make no allocation */ | ||
| 1158 | VM_BUG_ON(error); | ||
| 1159 | } | ||
| 1131 | if (error) | 1160 | if (error) |
| 1132 | goto failed; | 1161 | goto failed; |
| 1133 | 1162 | ||
| @@ -1164,11 +1193,18 @@ repeat: | |||
| 1164 | __set_page_locked(page); | 1193 | __set_page_locked(page); |
| 1165 | error = mem_cgroup_cache_charge(page, current->mm, | 1194 | error = mem_cgroup_cache_charge(page, current->mm, |
| 1166 | gfp & GFP_RECLAIM_MASK); | 1195 | gfp & GFP_RECLAIM_MASK); |
| 1167 | if (!error) | ||
| 1168 | error = shmem_add_to_page_cache(page, mapping, index, | ||
| 1169 | gfp, NULL); | ||
| 1170 | if (error) | 1196 | if (error) |
| 1171 | goto decused; | 1197 | goto decused; |
| 1198 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | ||
| 1199 | if (!error) { | ||
| 1200 | error = shmem_add_to_page_cache(page, mapping, index, | ||
| 1201 | gfp, NULL); | ||
| 1202 | radix_tree_preload_end(); | ||
| 1203 | } | ||
| 1204 | if (error) { | ||
| 1205 | mem_cgroup_uncharge_cache_page(page); | ||
| 1206 | goto decused; | ||
| 1207 | } | ||
| 1172 | lru_cache_add_anon(page); | 1208 | lru_cache_add_anon(page); |
| 1173 | 1209 | ||
| 1174 | spin_lock(&info->lock); | 1210 | spin_lock(&info->lock); |
| @@ -1228,14 +1264,10 @@ decused: | |||
| 1228 | unacct: | 1264 | unacct: |
| 1229 | shmem_unacct_blocks(info->flags, 1); | 1265 | shmem_unacct_blocks(info->flags, 1); |
| 1230 | failed: | 1266 | failed: |
| 1231 | if (swap.val && error != -EINVAL) { | 1267 | if (swap.val && error != -EINVAL && |
| 1232 | struct page *test = find_get_page(mapping, index); | 1268 | !shmem_confirm_swap(mapping, index, swap)) |
| 1233 | if (test && !radix_tree_exceptional_entry(test)) | 1269 | error = -EEXIST; |
| 1234 | page_cache_release(test); | 1270 | unlock: |
| 1235 | /* Have another try if the entry has changed */ | ||
| 1236 | if (test != swp_to_radix_entry(swap)) | ||
| 1237 | error = -EEXIST; | ||
| 1238 | } | ||
| 1239 | if (page) { | 1271 | if (page) { |
| 1240 | unlock_page(page); | 1272 | unlock_page(page); |
| 1241 | page_cache_release(page); | 1273 | page_cache_release(page); |
| @@ -1247,7 +1279,7 @@ failed: | |||
| 1247 | spin_unlock(&info->lock); | 1279 | spin_unlock(&info->lock); |
| 1248 | goto repeat; | 1280 | goto repeat; |
| 1249 | } | 1281 | } |
| 1250 | if (error == -EEXIST) | 1282 | if (error == -EEXIST) /* from above or from radix_tree_insert */ |
| 1251 | goto repeat; | 1283 | goto repeat; |
| 1252 | return error; | 1284 | return error; |
| 1253 | } | 1285 | } |
| @@ -1675,98 +1707,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
| 1675 | return error; | 1707 | return error; |
| 1676 | } | 1708 | } |
| 1677 | 1709 | ||
| 1678 | /* | ||
| 1679 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
| 1680 | */ | ||
| 1681 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
| 1682 | pgoff_t index, pgoff_t end, int origin) | ||
| 1683 | { | ||
| 1684 | struct page *page; | ||
| 1685 | struct pagevec pvec; | ||
| 1686 | pgoff_t indices[PAGEVEC_SIZE]; | ||
| 1687 | bool done = false; | ||
| 1688 | int i; | ||
| 1689 | |||
| 1690 | pagevec_init(&pvec, 0); | ||
| 1691 | pvec.nr = 1; /* start small: we may be there already */ | ||
| 1692 | while (!done) { | ||
| 1693 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
| 1694 | pvec.nr, pvec.pages, indices); | ||
| 1695 | if (!pvec.nr) { | ||
| 1696 | if (origin == SEEK_DATA) | ||
| 1697 | index = end; | ||
| 1698 | break; | ||
| 1699 | } | ||
| 1700 | for (i = 0; i < pvec.nr; i++, index++) { | ||
| 1701 | if (index < indices[i]) { | ||
| 1702 | if (origin == SEEK_HOLE) { | ||
| 1703 | done = true; | ||
| 1704 | break; | ||
| 1705 | } | ||
| 1706 | index = indices[i]; | ||
| 1707 | } | ||
| 1708 | page = pvec.pages[i]; | ||
| 1709 | if (page && !radix_tree_exceptional_entry(page)) { | ||
| 1710 | if (!PageUptodate(page)) | ||
| 1711 | page = NULL; | ||
| 1712 | } | ||
| 1713 | if (index >= end || | ||
| 1714 | (page && origin == SEEK_DATA) || | ||
| 1715 | (!page && origin == SEEK_HOLE)) { | ||
| 1716 | done = true; | ||
| 1717 | break; | ||
| 1718 | } | ||
| 1719 | } | ||
| 1720 | shmem_deswap_pagevec(&pvec); | ||
| 1721 | pagevec_release(&pvec); | ||
| 1722 | pvec.nr = PAGEVEC_SIZE; | ||
| 1723 | cond_resched(); | ||
| 1724 | } | ||
| 1725 | return index; | ||
| 1726 | } | ||
| 1727 | |||
| 1728 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) | ||
| 1729 | { | ||
| 1730 | struct address_space *mapping; | ||
| 1731 | struct inode *inode; | ||
| 1732 | pgoff_t start, end; | ||
| 1733 | loff_t new_offset; | ||
| 1734 | |||
| 1735 | if (origin != SEEK_DATA && origin != SEEK_HOLE) | ||
| 1736 | return generic_file_llseek_size(file, offset, origin, | ||
| 1737 | MAX_LFS_FILESIZE); | ||
| 1738 | mapping = file->f_mapping; | ||
| 1739 | inode = mapping->host; | ||
| 1740 | mutex_lock(&inode->i_mutex); | ||
| 1741 | /* We're holding i_mutex so we can access i_size directly */ | ||
| 1742 | |||
| 1743 | if (offset < 0) | ||
| 1744 | offset = -EINVAL; | ||
| 1745 | else if (offset >= inode->i_size) | ||
| 1746 | offset = -ENXIO; | ||
| 1747 | else { | ||
| 1748 | start = offset >> PAGE_CACHE_SHIFT; | ||
| 1749 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1750 | new_offset = shmem_seek_hole_data(mapping, start, end, origin); | ||
| 1751 | new_offset <<= PAGE_CACHE_SHIFT; | ||
| 1752 | if (new_offset > offset) { | ||
| 1753 | if (new_offset < inode->i_size) | ||
| 1754 | offset = new_offset; | ||
| 1755 | else if (origin == SEEK_DATA) | ||
| 1756 | offset = -ENXIO; | ||
| 1757 | else | ||
| 1758 | offset = inode->i_size; | ||
| 1759 | } | ||
| 1760 | } | ||
| 1761 | |||
| 1762 | if (offset >= 0 && offset != file->f_pos) { | ||
| 1763 | file->f_pos = offset; | ||
| 1764 | file->f_version = 0; | ||
| 1765 | } | ||
| 1766 | mutex_unlock(&inode->i_mutex); | ||
| 1767 | return offset; | ||
| 1768 | } | ||
| 1769 | |||
| 1770 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1710 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
| 1771 | loff_t len) | 1711 | loff_t len) |
| 1772 | { | 1712 | { |
| @@ -2770,7 +2710,7 @@ static const struct address_space_operations shmem_aops = { | |||
| 2770 | static const struct file_operations shmem_file_operations = { | 2710 | static const struct file_operations shmem_file_operations = { |
| 2771 | .mmap = shmem_mmap, | 2711 | .mmap = shmem_mmap, |
| 2772 | #ifdef CONFIG_TMPFS | 2712 | #ifdef CONFIG_TMPFS |
| 2773 | .llseek = shmem_file_llseek, | 2713 | .llseek = generic_file_llseek, |
| 2774 | .read = do_sync_read, | 2714 | .read = do_sync_read, |
| 2775 | .write = do_sync_write, | 2715 | .write = do_sync_write, |
| 2776 | .aio_read = shmem_file_aio_read, | 2716 | .aio_read = shmem_file_aio_read, |
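
The shmem.c hunks above keep tightening one pattern: after lock_page(), the code re-validates that the page is still the swapcache page for the swap entry it looked up (PageSwapCache, page_private(page) == swap.val, and shmem_confirm_swap() against the radix_tree) before doing anything with it, returning -EEXIST so the caller retries otherwise. The sketch below is a plain user-space model of that check-under-lock-then-retry idea; the types and names are invented for illustration, not kernel structures.

```c
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's page lock, PageSwapCache and
 * page_private(): a slot guarded by a mutex, plus the identity recorded
 * when the caller first looked the page up. */
struct page_slot {
	pthread_mutex_t lock;
	unsigned long swap_val;		/* identity seen at lookup time */
	bool in_swapcache;
};

/* Only trust the slot if, under the lock, it still matches what we saw. */
static bool revalidate(const struct page_slot *slot, unsigned long expected)
{
	return slot->in_swapcache && slot->swap_val == expected;
}

static int use_slot(struct page_slot *slot, unsigned long expected)
{
	pthread_mutex_lock(&slot->lock);
	if (!revalidate(slot, expected)) {
		pthread_mutex_unlock(&slot->lock);
		return -1;		/* caller retries, like -EEXIST above */
	}
	/* ... safe to operate on the slot here ... */
	pthread_mutex_unlock(&slot->lock);
	return 0;
}

int main(void)
{
	struct page_slot s = { PTHREAD_MUTEX_INITIALIZER, 42, true };

	printf("matching entry: %s\n", use_slot(&s, 42) == 0 ? "ok" : "retry");
	printf("stale entry:    %s\n", use_slot(&s, 7) == 0 ? "ok" : "retry");
	return 0;
}
```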
diff --git a/mm/sparse.c b/mm/sparse.c index 6a4bf9160e85..c7bb952400c8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -275,8 +275,9 @@ static unsigned long * __init | |||
| 275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
| 276 | unsigned long size) | 276 | unsigned long size) |
| 277 | { | 277 | { |
| 278 | pg_data_t *host_pgdat; | 278 | unsigned long goal, limit; |
| 279 | unsigned long goal; | 279 | unsigned long *p; |
| 280 | int nid; | ||
| 280 | /* | 281 | /* |
| 281 | * A page may contain usemaps for other sections preventing the | 282 | * A page may contain usemaps for other sections preventing the |
| 282 | * page being freed and making a section unremovable while | 283 | * page being freed and making a section unremovable while |
| @@ -287,10 +288,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
| 287 | * from the same section as the pgdat where possible to avoid | 288 | * from the same section as the pgdat where possible to avoid |
| 288 | * this problem. | 289 | * this problem. |
| 289 | */ | 290 | */ |
| 290 | goal = __pa(pgdat) & PAGE_SECTION_MASK; | 291 | goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); |
| 291 | host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); | 292 | limit = goal + (1UL << PA_SECTION_SHIFT); |
| 292 | return __alloc_bootmem_node_nopanic(host_pgdat, size, | 293 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
| 293 | SMP_CACHE_BYTES, goal); | 294 | again: |
| 295 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | ||
| 296 | SMP_CACHE_BYTES, goal, limit); | ||
| 297 | if (!p && limit) { | ||
| 298 | limit = 0; | ||
| 299 | goto again; | ||
| 300 | } | ||
| 301 | return p; | ||
| 294 | } | 302 | } |
| 295 | 303 | ||
| 296 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 304 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
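
The sparse.c hunk above replaces the single bootmem call with a two-pass allocation: first try within [goal, goal + one section) so the usemap lands in the same section as the pgdat, then retry with limit 0 (no upper bound) if that fails. Below is a toy user-space model of that constrained-then-unconstrained fallback; the bump allocator and all names are invented stand-ins, not ___alloc_bootmem_node_nopanic().

```c
#include <stddef.h>
#include <stdio.h>

/* Toy bump allocator over a static arena. limit == 0 means "no upper bound",
 * mirroring how the hunk retries with limit 0 when the section-local attempt
 * fails. Everything here is illustrative, not bootmem. */
static unsigned char arena[1 << 16];
static size_t next_free = 48 * 1024;	/* pretend most of the arena is used */

static void *alloc_in_range(size_t size, size_t goal, size_t limit)
{
	size_t start = next_free > goal ? next_free : goal;

	if (start + size > sizeof(arena))
		return NULL;			/* arena exhausted */
	if (limit && start + size > limit)
		return NULL;			/* does not fit below the limit */
	next_free = start + size;
	return arena + start;
}

static void *alloc_usemap(size_t size, size_t goal, size_t limit)
{
	void *p = alloc_in_range(size, goal, limit);

	if (!p && limit)			/* drop the preferred limit and retry */
		p = alloc_in_range(size, goal, 0);
	return p;
}

int main(void)
{
	size_t goal = 0, limit = 56 * 1024;	/* "same section as the pgdat" */
	unsigned char *p = alloc_usemap(16 * 1024, goal, limit);

	if (p)
		printf("usemap at offset %zu (%s the preferred range)\n",
		       (size_t)(p - arena),
		       (size_t)(p - arena) + 16 * 1024 <= limit ? "inside" : "outside");
	return 0;
}
```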
diff --git a/mm/swapfile.c b/mm/swapfile.c index 457b10baef59..71373d03fcee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -31,6 +31,8 @@ | |||
| 31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
| 32 | #include <linux/poll.h> | 32 | #include <linux/poll.h> |
| 33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
| 34 | #include <linux/frontswap.h> | ||
| 35 | #include <linux/swapfile.h> | ||
| 34 | 36 | ||
| 35 | #include <asm/pgtable.h> | 37 | #include <asm/pgtable.h> |
| 36 | #include <asm/tlbflush.h> | 38 | #include <asm/tlbflush.h> |
| @@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | |||
| 42 | static void free_swap_count_continuations(struct swap_info_struct *); | 44 | static void free_swap_count_continuations(struct swap_info_struct *); |
| 43 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | 45 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); |
| 44 | 46 | ||
| 45 | static DEFINE_SPINLOCK(swap_lock); | 47 | DEFINE_SPINLOCK(swap_lock); |
| 46 | static unsigned int nr_swapfiles; | 48 | static unsigned int nr_swapfiles; |
| 47 | long nr_swap_pages; | 49 | long nr_swap_pages; |
| 48 | long total_swap_pages; | 50 | long total_swap_pages; |
| @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry "; | |||
| 53 | static const char Bad_offset[] = "Bad swap offset entry "; | 55 | static const char Bad_offset[] = "Bad swap offset entry "; |
| 54 | static const char Unused_offset[] = "Unused swap offset entry "; | 56 | static const char Unused_offset[] = "Unused swap offset entry "; |
| 55 | 57 | ||
| 56 | static struct swap_list_t swap_list = {-1, -1}; | 58 | struct swap_list_t swap_list = {-1, -1}; |
| 57 | 59 | ||
| 58 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 60 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
| 59 | 61 | ||
| 60 | static DEFINE_MUTEX(swapon_mutex); | 62 | static DEFINE_MUTEX(swapon_mutex); |
| 61 | 63 | ||
| @@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
| 556 | swap_list.next = p->type; | 558 | swap_list.next = p->type; |
| 557 | nr_swap_pages++; | 559 | nr_swap_pages++; |
| 558 | p->inuse_pages--; | 560 | p->inuse_pages--; |
| 561 | frontswap_invalidate_page(p->type, offset); | ||
| 559 | if ((p->flags & SWP_BLKDEV) && | 562 | if ((p->flags & SWP_BLKDEV) && |
| 560 | disk->fops->swap_slot_free_notify) | 563 | disk->fops->swap_slot_free_notify) |
| 561 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 564 | disk->fops->swap_slot_free_notify(p->bdev, offset); |
| @@ -985,11 +988,12 @@ static int unuse_mm(struct mm_struct *mm, | |||
| 985 | } | 988 | } |
| 986 | 989 | ||
| 987 | /* | 990 | /* |
| 988 | * Scan swap_map from current position to next entry still in use. | 991 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
| 992 | * from current position to next entry still in use. | ||
| 989 | * Recycle to start on reaching the end, returning 0 when empty. | 993 | * Recycle to start on reaching the end, returning 0 when empty. |
| 990 | */ | 994 | */ |
| 991 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 995 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
| 992 | unsigned int prev) | 996 | unsigned int prev, bool frontswap) |
| 993 | { | 997 | { |
| 994 | unsigned int max = si->max; | 998 | unsigned int max = si->max; |
| 995 | unsigned int i = prev; | 999 | unsigned int i = prev; |
| @@ -1015,6 +1019,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 1015 | prev = 0; | 1019 | prev = 0; |
| 1016 | i = 1; | 1020 | i = 1; |
| 1017 | } | 1021 | } |
| 1022 | if (frontswap) { | ||
| 1023 | if (frontswap_test(si, i)) | ||
| 1024 | break; | ||
| 1025 | else | ||
| 1026 | continue; | ||
| 1027 | } | ||
| 1018 | count = si->swap_map[i]; | 1028 | count = si->swap_map[i]; |
| 1019 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1029 | if (count && swap_count(count) != SWAP_MAP_BAD) |
| 1020 | break; | 1030 | break; |
| @@ -1026,8 +1036,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 1026 | * We completely avoid races by reading each swap page in advance, | 1036 | * We completely avoid races by reading each swap page in advance, |
| 1027 | * and then search for the process using it. All the necessary | 1037 | * and then search for the process using it. All the necessary |
| 1028 | * page table adjustments can then be made atomically. | 1038 | * page table adjustments can then be made atomically. |
| 1039 | * | ||
| 1040 | * if the boolean frontswap is true, only unuse pages_to_unuse pages; | ||
| 1041 | * pages_to_unuse==0 means all pages; ignored if frontswap is false | ||
| 1029 | */ | 1042 | */ |
| 1030 | static int try_to_unuse(unsigned int type) | 1043 | int try_to_unuse(unsigned int type, bool frontswap, |
| 1044 | unsigned long pages_to_unuse) | ||
| 1031 | { | 1045 | { |
| 1032 | struct swap_info_struct *si = swap_info[type]; | 1046 | struct swap_info_struct *si = swap_info[type]; |
| 1033 | struct mm_struct *start_mm; | 1047 | struct mm_struct *start_mm; |
| @@ -1060,7 +1074,7 @@ static int try_to_unuse(unsigned int type) | |||
| 1060 | * one pass through swap_map is enough, but not necessarily: | 1074 | * one pass through swap_map is enough, but not necessarily: |
| 1061 | * there are races when an instance of an entry might be missed. | 1075 | * there are races when an instance of an entry might be missed. |
| 1062 | */ | 1076 | */ |
| 1063 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1077 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
| 1064 | if (signal_pending(current)) { | 1078 | if (signal_pending(current)) { |
| 1065 | retval = -EINTR; | 1079 | retval = -EINTR; |
| 1066 | break; | 1080 | break; |
| @@ -1227,6 +1241,10 @@ static int try_to_unuse(unsigned int type) | |||
| 1227 | * interactive performance. | 1241 | * interactive performance. |
| 1228 | */ | 1242 | */ |
| 1229 | cond_resched(); | 1243 | cond_resched(); |
| 1244 | if (frontswap && pages_to_unuse > 0) { | ||
| 1245 | if (!--pages_to_unuse) | ||
| 1246 | break; | ||
| 1247 | } | ||
| 1230 | } | 1248 | } |
| 1231 | 1249 | ||
| 1232 | mmput(start_mm); | 1250 | mmput(start_mm); |
| @@ -1486,7 +1504,8 @@ bad_bmap: | |||
| 1486 | } | 1504 | } |
| 1487 | 1505 | ||
| 1488 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1506 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
| 1489 | unsigned char *swap_map) | 1507 | unsigned char *swap_map, |
| 1508 | unsigned long *frontswap_map) | ||
| 1490 | { | 1509 | { |
| 1491 | int i, prev; | 1510 | int i, prev; |
| 1492 | 1511 | ||
| @@ -1496,6 +1515,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
| 1496 | else | 1515 | else |
| 1497 | p->prio = --least_priority; | 1516 | p->prio = --least_priority; |
| 1498 | p->swap_map = swap_map; | 1517 | p->swap_map = swap_map; |
| 1518 | frontswap_map_set(p, frontswap_map); | ||
| 1499 | p->flags |= SWP_WRITEOK; | 1519 | p->flags |= SWP_WRITEOK; |
| 1500 | nr_swap_pages += p->pages; | 1520 | nr_swap_pages += p->pages; |
| 1501 | total_swap_pages += p->pages; | 1521 | total_swap_pages += p->pages; |
| @@ -1512,6 +1532,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
| 1512 | swap_list.head = swap_list.next = p->type; | 1532 | swap_list.head = swap_list.next = p->type; |
| 1513 | else | 1533 | else |
| 1514 | swap_info[prev]->next = p->type; | 1534 | swap_info[prev]->next = p->type; |
| 1535 | frontswap_init(p->type); | ||
| 1515 | spin_unlock(&swap_lock); | 1536 | spin_unlock(&swap_lock); |
| 1516 | } | 1537 | } |
| 1517 | 1538 | ||
| @@ -1585,7 +1606,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1585 | spin_unlock(&swap_lock); | 1606 | spin_unlock(&swap_lock); |
| 1586 | 1607 | ||
| 1587 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1608 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
| 1588 | err = try_to_unuse(type); | 1609 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
| 1589 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1610 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
| 1590 | 1611 | ||
| 1591 | if (err) { | 1612 | if (err) { |
| @@ -1596,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1596 | * sys_swapoff for this swap_info_struct at this point. | 1617 | * sys_swapoff for this swap_info_struct at this point. |
| 1597 | */ | 1618 | */ |
| 1598 | /* re-insert swap space back into swap_list */ | 1619 | /* re-insert swap space back into swap_list */ |
| 1599 | enable_swap_info(p, p->prio, p->swap_map); | 1620 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
| 1600 | goto out_dput; | 1621 | goto out_dput; |
| 1601 | } | 1622 | } |
| 1602 | 1623 | ||
| @@ -1622,9 +1643,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1622 | swap_map = p->swap_map; | 1643 | swap_map = p->swap_map; |
| 1623 | p->swap_map = NULL; | 1644 | p->swap_map = NULL; |
| 1624 | p->flags = 0; | 1645 | p->flags = 0; |
| 1646 | frontswap_invalidate_area(type); | ||
| 1625 | spin_unlock(&swap_lock); | 1647 | spin_unlock(&swap_lock); |
| 1626 | mutex_unlock(&swapon_mutex); | 1648 | mutex_unlock(&swapon_mutex); |
| 1627 | vfree(swap_map); | 1649 | vfree(swap_map); |
| 1650 | vfree(frontswap_map_get(p)); | ||
| 1628 | /* Destroy swap account information */ | 1651 |
| 1629 | swap_cgroup_swapoff(type); | 1652 | swap_cgroup_swapoff(type); |
| 1630 | 1653 | ||
| @@ -1893,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
| 1893 | 1916 | ||
| 1894 | /* | 1917 | /* |
| 1895 | * Find out how many pages are allowed for a single swap | 1918 | * Find out how many pages are allowed for a single swap |
| 1896 | * device. There are three limiting factors: 1) the number | 1919 | * device. There are two limiting factors: 1) the number |
| 1897 | * of bits for the swap offset in the swp_entry_t type, and | 1920 | * of bits for the swap offset in the swp_entry_t type, and |
| 1898 | * 2) the number of bits in the swap pte as defined by the | 1921 | * 2) the number of bits in the swap pte as defined by the |
| 1899 | * the different architectures, and 3) the number of free bits | 1922 | * different architectures. In order to find the |
| 1900 | * in an exceptional radix_tree entry. In order to find the | ||
| 1901 | * largest possible bit mask, a swap entry with swap type 0 | 1923 | * largest possible bit mask, a swap entry with swap type 0 |
| 1902 | * and swap offset ~0UL is created, encoded to a swap pte, | 1924 | * and swap offset ~0UL is created, encoded to a swap pte, |
| 1903 | * decoded to a swp_entry_t again, and finally the swap | 1925 | * decoded to a swp_entry_t again, and finally the swap |
| 1904 | * offset is extracted. This will mask all the bits from | 1926 | * offset is extracted. This will mask all the bits from |
| 1905 | * the initial ~0UL mask that can't be encoded in either | 1927 | * the initial ~0UL mask that can't be encoded in either |
| 1906 | * the swp_entry_t or the architecture definition of a | 1928 | * the swp_entry_t or the architecture definition of a |
| 1907 | * swap pte. Then the same is done for a radix_tree entry. | 1929 | * swap pte. |
| 1908 | */ | 1930 | */ |
| 1909 | maxpages = swp_offset(pte_to_swp_entry( | 1931 | maxpages = swp_offset(pte_to_swp_entry( |
| 1910 | swp_entry_to_pte(swp_entry(0, ~0UL)))); | 1932 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
| 1911 | maxpages = swp_offset(radix_to_swp_entry( | ||
| 1912 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
| 1913 | |||
| 1914 | if (maxpages > swap_header->info.last_page) { | 1933 | if (maxpages > swap_header->info.last_page) { |
| 1915 | maxpages = swap_header->info.last_page + 1; | 1934 | maxpages = swap_header->info.last_page + 1; |
| 1916 | /* p->max is an unsigned int: don't overflow it */ | 1935 | /* p->max is an unsigned int: don't overflow it */ |
| @@ -1988,6 +2007,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 1988 | sector_t span; | 2007 | sector_t span; |
| 1989 | unsigned long maxpages; | 2008 | unsigned long maxpages; |
| 1990 | unsigned char *swap_map = NULL; | 2009 | unsigned char *swap_map = NULL; |
| 2010 | unsigned long *frontswap_map = NULL; | ||
| 1991 | struct page *page = NULL; | 2011 | struct page *page = NULL; |
| 1992 | struct inode *inode = NULL; | 2012 | struct inode *inode = NULL; |
| 1993 | 2013 | ||
| @@ -2071,6 +2091,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 2071 | error = nr_extents; | 2091 | error = nr_extents; |
| 2072 | goto bad_swap; | 2092 | goto bad_swap; |
| 2073 | } | 2093 | } |
| 2094 | /* frontswap enabled? set up bit-per-page map for frontswap */ | ||
| 2095 | if (frontswap_enabled) | ||
| 2096 | frontswap_map = vzalloc(maxpages / sizeof(long)); | ||
| 2074 | 2097 | ||
| 2075 | if (p->bdev) { | 2098 | if (p->bdev) { |
| 2076 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2099 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
| @@ -2086,14 +2109,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
| 2086 | if (swap_flags & SWAP_FLAG_PREFER) | 2109 | if (swap_flags & SWAP_FLAG_PREFER) |
| 2087 | prio = | 2110 | prio = |
| 2088 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2111 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
| 2089 | enable_swap_info(p, prio, swap_map); | 2112 | enable_swap_info(p, prio, swap_map, frontswap_map); |
| 2090 | 2113 | ||
| 2091 | printk(KERN_INFO "Adding %uk swap on %s. " | 2114 | printk(KERN_INFO "Adding %uk swap on %s. " |
| 2092 | "Priority:%d extents:%d across:%lluk %s%s\n", | 2115 | "Priority:%d extents:%d across:%lluk %s%s%s\n", |
| 2093 | p->pages<<(PAGE_SHIFT-10), name, p->prio, | 2116 | p->pages<<(PAGE_SHIFT-10), name, p->prio, |
| 2094 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2117 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
| 2095 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2118 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
| 2096 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 2119 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
| 2120 | (frontswap_map) ? "FS" : ""); | ||
| 2097 | 2121 | ||
| 2098 | mutex_unlock(&swapon_mutex); | 2122 | mutex_unlock(&swapon_mutex); |
| 2099 | atomic_inc(&proc_poll_event); | 2123 | atomic_inc(&proc_poll_event); |
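
The swapfile.c changes wire a frontswap bitmap through the swap lifecycle: swapon allocates one bit per swap page when frontswap_enabled, find_next_to_unuse() can restrict its scan to offsets whose bit is set, swap_entry_free() invalidates the per-page copy, and swapoff frees the map. The following is a self-contained user-space sketch of such a bit-per-page map; the names and sizing are illustrative, not the kernel's frontswap API.

```c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal bit-per-page map, one bit per swap offset. The kernel sizes and
 * drives its map differently; all names below are invented. */
struct page_bitmap {
	unsigned long *bits;
	unsigned long npages;
};

#define BITS_PER_ULONG	(sizeof(unsigned long) * CHAR_BIT)

static int bitmap_init(struct page_bitmap *map, unsigned long npages)
{
	unsigned long nlongs = (npages + BITS_PER_ULONG - 1) / BITS_PER_ULONG;

	map->bits = calloc(nlongs, sizeof(unsigned long));
	map->npages = npages;
	return map->bits ? 0 : -1;
}

static void bitmap_set_page(struct page_bitmap *map, unsigned long offset)
{
	map->bits[offset / BITS_PER_ULONG] |= 1UL << (offset % BITS_PER_ULONG);
}

static void bitmap_clear_page(struct page_bitmap *map, unsigned long offset)
{
	map->bits[offset / BITS_PER_ULONG] &= ~(1UL << (offset % BITS_PER_ULONG));
}

static int bitmap_test_page(const struct page_bitmap *map, unsigned long offset)
{
	return (map->bits[offset / BITS_PER_ULONG] >> (offset % BITS_PER_ULONG)) & 1;
}

int main(void)
{
	struct page_bitmap map;

	if (bitmap_init(&map, 1024))		/* e.g. a 1024-page swap area */
		return 1;
	bitmap_set_page(&map, 7);		/* page stored in "frontswap" */
	printf("offset 7 present: %d\n", bitmap_test_page(&map, 7));
	bitmap_clear_page(&map, 7);		/* invalidate when the entry is freed */
	printf("offset 7 present: %d\n", bitmap_test_page(&map, 7));
	free(map.bits);
	return 0;
}
```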
diff --git a/mm/vmscan.c b/mm/vmscan.c index eeb3bc9d1d36..66e431060c05 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -2688,7 +2688,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
| 2688 | * them before going back to sleep. | 2688 | * them before going back to sleep. |
| 2689 | */ | 2689 | */ |
| 2690 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2690 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
| 2691 | schedule(); | 2691 | |
| 2692 | if (!kthread_should_stop()) | ||
| 2693 | schedule(); | ||
| 2694 | |||
| 2692 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2695 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
| 2693 | } else { | 2696 | } else { |
| 2694 | if (remaining) | 2697 | if (remaining) |
| @@ -2955,14 +2958,17 @@ int kswapd_run(int nid) | |||
| 2955 | } | 2958 | } |
| 2956 | 2959 | ||
| 2957 | /* | 2960 | /* |
| 2958 | * Called by memory hotplug when all memory in a node is offlined. | 2961 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
| 2962 | * hold lock_memory_hotplug(). | ||
| 2959 | */ | 2963 | */ |
| 2960 | void kswapd_stop(int nid) | 2964 | void kswapd_stop(int nid) |
| 2961 | { | 2965 | { |
| 2962 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 2966 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
| 2963 | 2967 | ||
| 2964 | if (kswapd) | 2968 | if (kswapd) { |
| 2965 | kthread_stop(kswapd); | 2969 | kthread_stop(kswapd); |
| 2970 | NODE_DATA(nid)->kswapd = NULL; | ||
| 2971 | } | ||
| 2966 | } | 2972 | } |
| 2967 | 2973 | ||
| 2968 | static int __init kswapd_init(void) | 2974 | static int __init kswapd_init(void) |
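
The vmscan.c hunks make kswapd check kthread_should_stop() before calling schedule(), so a stop request arriving around wake-up cannot be slept through, and kswapd_stop() clears the node's kswapd pointer once the thread is gone. Below is a user-space pthread analogue of the "re-check the stop flag before blocking" part; all names are invented.

```c
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static atomic_bool should_stop;		/* stands in for kthread_should_stop() */

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	/* Re-check the stop flag before every sleep, never sleep past a stop. */
	while (!atomic_load(&should_stop))
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	pthread_mutex_lock(&lock);
	atomic_store(&should_stop, true);	/* stands in for kthread_stop() */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(&tid, NULL);
	puts("worker stopped");
	return 0;
}
```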
