Diffstat (limited to 'mm')
| -rw-r--r-- | mm/Kconfig | 6 |
| -rw-r--r-- | mm/Makefile | 4 |
| -rw-r--r-- | mm/cleancache.c | 276 |
| -rw-r--r-- | mm/cma.c | 62 |
| -rw-r--r-- | mm/cma.h | 24 |
| -rw-r--r-- | mm/cma_debug.c | 205 |
| -rw-r--r-- | mm/compaction.c | 75 |
| -rw-r--r-- | mm/filemap.c | 130 |
| -rw-r--r-- | mm/gup.c | 128 |
| -rw-r--r-- | mm/huge_memory.c | 136 |
| -rw-r--r-- | mm/hugetlb.c | 250 |
| -rw-r--r-- | mm/internal.h | 8 |
| -rw-r--r-- | mm/iov_iter.c | 753 |
| -rw-r--r-- | mm/kasan/kasan.c | 27 |
| -rw-r--r-- | mm/ksm.c | 10 |
| -rw-r--r-- | mm/memblock.c | 22 |
| -rw-r--r-- | mm/memcontrol.c | 245 |
| -rw-r--r-- | mm/memory-failure.c | 122 |
| -rw-r--r-- | mm/memory.c | 436 |
| -rw-r--r-- | mm/memory_hotplug.c | 50 |
| -rw-r--r-- | mm/mempolicy.c | 6 |
| -rw-r--r-- | mm/mempool.c | 127 |
| -rw-r--r-- | mm/memtest.c | 118 |
| -rw-r--r-- | mm/migrate.c | 40 |
| -rw-r--r-- | mm/mlock.c | 135 |
| -rw-r--r-- | mm/mmap.c | 29 |
| -rw-r--r-- | mm/mprotect.c | 3 |
| -rw-r--r-- | mm/mremap.c | 35 |
| -rw-r--r-- | mm/nommu.c | 5 |
| -rw-r--r-- | mm/oom_kill.c | 9 |
| -rw-r--r-- | mm/page-writeback.c | 29 |
| -rw-r--r-- | mm/page_alloc.c | 256 |
| -rw-r--r-- | mm/page_io.c | 7 |
| -rw-r--r-- | mm/page_isolation.c | 1 |
| -rw-r--r-- | mm/pagewalk.c | 9 |
| -rw-r--r-- | mm/percpu.c | 4 |
| -rw-r--r-- | mm/process_vm_access.c | 35 |
| -rw-r--r-- | mm/rmap.c | 13 |
| -rw-r--r-- | mm/shmem.c | 34 |
| -rw-r--r-- | mm/slab.c | 22 |
| -rw-r--r-- | mm/slob.c | 3 |
| -rw-r--r-- | mm/slub.c | 38 |
| -rw-r--r-- | mm/swap.c | 34 |
| -rw-r--r-- | mm/swap_state.c | 2 |
| -rw-r--r-- | mm/swapfile.c | 2 |
| -rw-r--r-- | mm/truncate.c | 39 |
| -rw-r--r-- | mm/util.c | 41 |
| -rw-r--r-- | mm/vmalloc.c | 104 |
| -rw-r--r-- | mm/zsmalloc.c | 971 |
49 files changed, 2807 insertions, 2313 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a03131b6ba8e..390214da4546 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -517,6 +517,12 @@ config CMA_DEBUG | |||
| 517 | processing calls such as dma_alloc_from_contiguous(). | 517 | processing calls such as dma_alloc_from_contiguous(). |
| 518 | This option does not affect warning and error messages. | 518 | This option does not affect warning and error messages. |
| 519 | 519 | ||
| 520 | config CMA_DEBUGFS | ||
| 521 | bool "CMA debugfs interface" | ||
| 522 | depends on CMA && DEBUG_FS | ||
| 523 | help | ||
| 524 | Turns on the DebugFS interface for CMA. | ||
| 525 | |||
| 520 | config CMA_AREAS | 526 | config CMA_AREAS |
| 521 | int "Maximum count of the CMA areas" | 527 | int "Maximum count of the CMA areas" |
| 522 | depends on CMA | 528 | depends on CMA |
diff --git a/mm/Makefile b/mm/Makefile
index 3c1caa2693bd..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -21,7 +21,7 @@ obj-y := filemap.o mempool.o oom_kill.o \ | |||
| 21 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 21 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
| 22 | compaction.o vmacache.o \ | 22 | compaction.o vmacache.o \ |
| 23 | interval_tree.o list_lru.o workingset.o \ | 23 | interval_tree.o list_lru.o workingset.o \ |
| 24 | iov_iter.o debug.o $(mmu-y) | 24 | debug.o $(mmu-y) |
| 25 | 25 | ||
| 26 | obj-y += init-mm.o | 26 | obj-y += init-mm.o |
| 27 | 27 | ||
| @@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o | |||
| 55 | obj-$(CONFIG_KASAN) += kasan/ | 55 | obj-$(CONFIG_KASAN) += kasan/ |
| 56 | obj-$(CONFIG_FAILSLAB) += failslab.o | 56 | obj-$(CONFIG_FAILSLAB) += failslab.o |
| 57 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 57 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
| 58 | obj-$(CONFIG_MEMTEST) += memtest.o | ||
| 58 | obj-$(CONFIG_MIGRATION) += migrate.o | 59 | obj-$(CONFIG_MIGRATION) += migrate.o |
| 59 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 60 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
| 60 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 61 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
| @@ -76,3 +77,4 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o | |||
| 76 | obj-$(CONFIG_CMA) += cma.o | 77 | obj-$(CONFIG_CMA) += cma.o |
| 77 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | 78 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o |
| 78 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o | 79 | obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o |
| 80 | obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o | ||
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 053bcd8f12fb..8fc50811119b 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
| @@ -19,7 +19,7 @@ | |||
| 19 | #include <linux/cleancache.h> | 19 | #include <linux/cleancache.h> |
| 20 | 20 | ||
| 21 | /* | 21 | /* |
| 22 | * cleancache_ops is set by cleancache_ops_register to contain the pointers | 22 | * cleancache_ops is set by cleancache_register_ops to contain the pointers |
| 23 | * to the cleancache "backend" implementation functions. | 23 | * to the cleancache "backend" implementation functions. |
| 24 | */ | 24 | */ |
| 25 | static struct cleancache_ops *cleancache_ops __read_mostly; | 25 | static struct cleancache_ops *cleancache_ops __read_mostly; |
| @@ -34,145 +34,107 @@ static u64 cleancache_failed_gets; | |||
| 34 | static u64 cleancache_puts; | 34 | static u64 cleancache_puts; |
| 35 | static u64 cleancache_invalidates; | 35 | static u64 cleancache_invalidates; |
| 36 | 36 | ||
| 37 | /* | 37 | static void cleancache_register_ops_sb(struct super_block *sb, void *unused) |
| 38 | * When no backend is registered all calls to init_fs and init_shared_fs | 38 | { |
| 39 | * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or | 39 | switch (sb->cleancache_poolid) { |
| 40 | * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array | 40 | case CLEANCACHE_NO_BACKEND: |
| 41 | * [shared_|]fs_poolid_map) are given to the respective super block | 41 | __cleancache_init_fs(sb); |
| 42 | * (sb->cleancache_poolid) and no tmem_pools are created. When a backend | 42 | break; |
| 43 | * registers with cleancache the previous calls to init_fs and init_shared_fs | 43 | case CLEANCACHE_NO_BACKEND_SHARED: |
| 44 | * are executed to create tmem_pools and set the respective poolids. While no | 44 | __cleancache_init_shared_fs(sb); |
| 45 | * backend is registered all "puts", "gets" and "flushes" are ignored or failed. | 45 | break; |
| 46 | */ | 46 | } |
| 47 | #define MAX_INITIALIZABLE_FS 32 | 47 | } |
| 48 | #define FAKE_FS_POOLID_OFFSET 1000 | ||
| 49 | #define FAKE_SHARED_FS_POOLID_OFFSET 2000 | ||
| 50 | |||
| 51 | #define FS_NO_BACKEND (-1) | ||
| 52 | #define FS_UNKNOWN (-2) | ||
| 53 | static int fs_poolid_map[MAX_INITIALIZABLE_FS]; | ||
| 54 | static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS]; | ||
| 55 | static char *uuids[MAX_INITIALIZABLE_FS]; | ||
| 56 | /* | ||
| 57 | * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads | ||
| 58 | * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple | ||
| 59 | * threads calling mount (and ending up in __cleancache_init_[shared|]fs). | ||
| 60 | */ | ||
| 61 | static DEFINE_MUTEX(poolid_mutex); | ||
| 62 | /* | ||
| 63 | * When set to false (default) all calls to the cleancache functions, except | ||
| 64 | * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded | ||
| 65 | * by the if (!cleancache_ops) return. This means multiple threads (from | ||
| 66 | * different filesystems) will be checking cleancache_ops. The usage of a | ||
| 67 | * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are | ||
| 68 | * OK if the time between the backend's have been initialized (and | ||
| 69 | * cleancache_ops has been set to not NULL) and when the filesystems start | ||
| 70 | * actually calling the backends. The inverse (when unloading) is obviously | ||
| 71 | * not good - but this shim does not do that (yet). | ||
| 72 | */ | ||
| 73 | |||
| 74 | /* | ||
| 75 | * The backends and filesystems work all asynchronously. This is b/c the | ||
| 76 | * backends can be built as modules. | ||
| 77 | * The usual sequence of events is: | ||
| 78 | * a) mount / -> __cleancache_init_fs is called. We set the | ||
| 79 | * [shared_|]fs_poolid_map and uuids for. | ||
| 80 | * | ||
| 81 | * b). user does I/Os -> we call the rest of __cleancache_* functions | ||
| 82 | * which return immediately as cleancache_ops is false. | ||
| 83 | * | ||
| 84 | * c). modprobe zcache -> cleancache_register_ops. We init the backend | ||
| 85 | * and set cleancache_ops to true, and for any fs_poolid_map | ||
| 86 | * (which is set by __cleancache_init_fs) we initialize the poolid. | ||
| 87 | * | ||
| 88 | * d). user does I/Os -> now that cleancache_ops is true all the | ||
| 89 | * __cleancache_* functions can call the backend. They all check | ||
| 90 | * that fs_poolid_map is valid and if so invoke the backend. | ||
| 91 | * | ||
| 92 | * e). umount / -> __cleancache_invalidate_fs, the fs_poolid_map is | ||
| 93 | * reset (which is the second check in the __cleancache_* ops | ||
| 94 | * to call the backend). | ||
| 95 | * | ||
| 96 | * The sequence of event could also be c), followed by a), and d). and e). The | ||
| 97 | * c) would not happen anymore. There is also the chance of c), and one thread | ||
| 98 | * doing a) + d), and another doing e). For that case we depend on the | ||
| 99 | * filesystem calling __cleancache_invalidate_fs in the proper sequence (so | ||
| 100 | * that it handles all I/Os before it invalidates the fs (which is last part | ||
| 101 | * of unmounting process). | ||
| 102 | * | ||
| 103 | * Note: The acute reader will notice that there is no "rmmod zcache" case. | ||
| 104 | * This is b/c the functionality for that is not yet implemented and when | ||
| 105 | * done, will require some extra locking not yet devised. | ||
| 106 | */ | ||
| 107 | 48 | ||
| 108 | /* | 49 | /* |
| 109 | * Register operations for cleancache, returning previous thus allowing | 50 | * Register operations for cleancache. Returns 0 on success. |
| 110 | * detection of multiple backends and possible nesting. | ||
| 111 | */ | 51 | */ |
| 112 | struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops) | 52 | int cleancache_register_ops(struct cleancache_ops *ops) |
| 113 | { | 53 | { |
| 114 | struct cleancache_ops *old = cleancache_ops; | 54 | if (cmpxchg(&cleancache_ops, NULL, ops)) |
| 115 | int i; | 55 | return -EBUSY; |
| 116 | 56 | ||
| 117 | mutex_lock(&poolid_mutex); | ||
| 118 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | ||
| 119 | if (fs_poolid_map[i] == FS_NO_BACKEND) | ||
| 120 | fs_poolid_map[i] = ops->init_fs(PAGE_SIZE); | ||
| 121 | if (shared_fs_poolid_map[i] == FS_NO_BACKEND) | ||
| 122 | shared_fs_poolid_map[i] = ops->init_shared_fs | ||
| 123 | (uuids[i], PAGE_SIZE); | ||
| 124 | } | ||
| 125 | /* | 57 | /* |
| 126 | * We MUST set cleancache_ops _after_ we have called the backends | 58 | * A cleancache backend can be built as a module and hence loaded after |
| 127 | * init_fs or init_shared_fs functions. Otherwise the compiler might | 59 | * a cleancache enabled filesystem has called cleancache_init_fs. To |
| 128 | * re-order where cleancache_ops is set in this function. | 60 | * handle such a scenario, here we call ->init_fs or ->init_shared_fs |
| 61 | * for each active super block. To differentiate between local and | ||
| 62 | * shared filesystems, we temporarily initialize sb->cleancache_poolid | ||
| 63 | * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED | ||
| 64 | * respectively in case there is no backend registered at the time | ||
| 65 | * cleancache_init_fs or cleancache_init_shared_fs is called. | ||
| 66 | * | ||
| 67 | * Since filesystems can be mounted concurrently with cleancache | ||
| 68 | * backend registration, we have to be careful to guarantee that all | ||
| 69 | * cleancache enabled filesystems that has been mounted by the time | ||
| 70 | * cleancache_register_ops is called has got and all mounted later will | ||
| 71 | * get cleancache_poolid. This is assured by the following statements | ||
| 72 | * tied together: | ||
| 73 | * | ||
| 74 | * a) iterate_supers skips only those super blocks that has started | ||
| 75 | * ->kill_sb | ||
| 76 | * | ||
| 77 | * b) if iterate_supers encounters a super block that has not finished | ||
| 78 | * ->mount yet, it waits until it is finished | ||
| 79 | * | ||
| 80 | * c) cleancache_init_fs is called from ->mount and | ||
| 81 | * cleancache_invalidate_fs is called from ->kill_sb | ||
| 82 | * | ||
| 83 | * d) we call iterate_supers after cleancache_ops has been set | ||
| 84 | * | ||
| 85 | * From a) it follows that if iterate_supers skips a super block, then | ||
| 86 | * either the super block is already dead, in which case we do not need | ||
| 87 | * to bother initializing cleancache for it, or it was mounted after we | ||
| 88 | * initiated iterate_supers. In the latter case, it must have seen | ||
| 89 | * cleancache_ops set according to d) and initialized cleancache from | ||
| 90 | * ->mount by itself according to c). This proves that we call | ||
| 91 | * ->init_fs at least once for each active super block. | ||
| 92 | * | ||
| 93 | * From b) and c) it follows that if iterate_supers encounters a super | ||
| 94 | * block that has already started ->init_fs, it will wait until ->mount | ||
| 95 | * and hence ->init_fs has finished, then check cleancache_poolid, see | ||
| 96 | * that it has already been set and therefore do nothing. This proves | ||
| 97 | * that we call ->init_fs no more than once for each super block. | ||
| 98 | * | ||
| 99 | * Combined together, the last two paragraphs prove the function | ||
| 100 | * correctness. | ||
| 101 | * | ||
| 102 | * Note that various cleancache callbacks may proceed before this | ||
| 103 | * function is called or even concurrently with it, but since | ||
| 104 | * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop | ||
| 105 | * until the corresponding ->init_fs has been actually called and | ||
| 106 | * cleancache_ops has been set. | ||
| 129 | */ | 107 | */ |
| 130 | barrier(); | 108 | iterate_supers(cleancache_register_ops_sb, NULL); |
| 131 | cleancache_ops = ops; | 109 | return 0; |
| 132 | mutex_unlock(&poolid_mutex); | ||
| 133 | return old; | ||
| 134 | } | 110 | } |
| 135 | EXPORT_SYMBOL(cleancache_register_ops); | 111 | EXPORT_SYMBOL(cleancache_register_ops); |
| 136 | 112 | ||
| 137 | /* Called by a cleancache-enabled filesystem at time of mount */ | 113 | /* Called by a cleancache-enabled filesystem at time of mount */ |
| 138 | void __cleancache_init_fs(struct super_block *sb) | 114 | void __cleancache_init_fs(struct super_block *sb) |
| 139 | { | 115 | { |
| 140 | int i; | 116 | int pool_id = CLEANCACHE_NO_BACKEND; |
| 141 | 117 | ||
| 142 | mutex_lock(&poolid_mutex); | 118 | if (cleancache_ops) { |
| 143 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | 119 | pool_id = cleancache_ops->init_fs(PAGE_SIZE); |
| 144 | if (fs_poolid_map[i] == FS_UNKNOWN) { | 120 | if (pool_id < 0) |
| 145 | sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET; | 121 | pool_id = CLEANCACHE_NO_POOL; |
| 146 | if (cleancache_ops) | ||
| 147 | fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE); | ||
| 148 | else | ||
| 149 | fs_poolid_map[i] = FS_NO_BACKEND; | ||
| 150 | break; | ||
| 151 | } | ||
| 152 | } | 122 | } |
| 153 | mutex_unlock(&poolid_mutex); | 123 | sb->cleancache_poolid = pool_id; |
| 154 | } | 124 | } |
| 155 | EXPORT_SYMBOL(__cleancache_init_fs); | 125 | EXPORT_SYMBOL(__cleancache_init_fs); |
| 156 | 126 | ||
| 157 | /* Called by a cleancache-enabled clustered filesystem at time of mount */ | 127 | /* Called by a cleancache-enabled clustered filesystem at time of mount */ |
| 158 | void __cleancache_init_shared_fs(char *uuid, struct super_block *sb) | 128 | void __cleancache_init_shared_fs(struct super_block *sb) |
| 159 | { | 129 | { |
| 160 | int i; | 130 | int pool_id = CLEANCACHE_NO_BACKEND_SHARED; |
| 161 | 131 | ||
| 162 | mutex_lock(&poolid_mutex); | 132 | if (cleancache_ops) { |
| 163 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | 133 | pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE); |
| 164 | if (shared_fs_poolid_map[i] == FS_UNKNOWN) { | 134 | if (pool_id < 0) |
| 165 | sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET; | 135 | pool_id = CLEANCACHE_NO_POOL; |
| 166 | uuids[i] = uuid; | ||
| 167 | if (cleancache_ops) | ||
| 168 | shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs | ||
| 169 | (uuid, PAGE_SIZE); | ||
| 170 | else | ||
| 171 | shared_fs_poolid_map[i] = FS_NO_BACKEND; | ||
| 172 | break; | ||
| 173 | } | ||
| 174 | } | 136 | } |
| 175 | mutex_unlock(&poolid_mutex); | 137 | sb->cleancache_poolid = pool_id; |
| 176 | } | 138 | } |
| 177 | EXPORT_SYMBOL(__cleancache_init_shared_fs); | 139 | EXPORT_SYMBOL(__cleancache_init_shared_fs); |
| 178 | 140 | ||
| @@ -202,19 +164,6 @@ static int cleancache_get_key(struct inode *inode, | |||
| 202 | } | 164 | } |
| 203 | 165 | ||
| 204 | /* | 166 | /* |
| 205 | * Returns a pool_id that is associated with a given fake poolid. | ||
| 206 | */ | ||
| 207 | static int get_poolid_from_fake(int fake_pool_id) | ||
| 208 | { | ||
| 209 | if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) | ||
| 210 | return shared_fs_poolid_map[fake_pool_id - | ||
| 211 | FAKE_SHARED_FS_POOLID_OFFSET]; | ||
| 212 | else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) | ||
| 213 | return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET]; | ||
| 214 | return FS_NO_BACKEND; | ||
| 215 | } | ||
| 216 | |||
| 217 | /* | ||
| 218 | * "Get" data from cleancache associated with the poolid/inode/index | 167 | * "Get" data from cleancache associated with the poolid/inode/index |
| 219 | * that were specified when the data was put to cleanache and, if | 168 | * that were specified when the data was put to cleanache and, if |
| 220 | * successful, use it to fill the specified page with data and return 0. | 169 | * successful, use it to fill the specified page with data and return 0. |
| @@ -229,7 +178,6 @@ int __cleancache_get_page(struct page *page) | |||
| 229 | { | 178 | { |
| 230 | int ret = -1; | 179 | int ret = -1; |
| 231 | int pool_id; | 180 | int pool_id; |
| 232 | int fake_pool_id; | ||
| 233 | struct cleancache_filekey key = { .u.key = { 0 } }; | 181 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 234 | 182 | ||
| 235 | if (!cleancache_ops) { | 183 | if (!cleancache_ops) { |
| @@ -238,17 +186,14 @@ int __cleancache_get_page(struct page *page) | |||
| 238 | } | 186 | } |
| 239 | 187 | ||
| 240 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 188 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 241 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; | 189 | pool_id = page->mapping->host->i_sb->cleancache_poolid; |
| 242 | if (fake_pool_id < 0) | 190 | if (pool_id < 0) |
| 243 | goto out; | 191 | goto out; |
| 244 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 245 | 192 | ||
| 246 | if (cleancache_get_key(page->mapping->host, &key) < 0) | 193 | if (cleancache_get_key(page->mapping->host, &key) < 0) |
| 247 | goto out; | 194 | goto out; |
| 248 | 195 | ||
| 249 | if (pool_id >= 0) | 196 | ret = cleancache_ops->get_page(pool_id, key, page->index, page); |
| 250 | ret = cleancache_ops->get_page(pool_id, | ||
| 251 | key, page->index, page); | ||
| 252 | if (ret == 0) | 197 | if (ret == 0) |
| 253 | cleancache_succ_gets++; | 198 | cleancache_succ_gets++; |
| 254 | else | 199 | else |
| @@ -271,7 +216,6 @@ EXPORT_SYMBOL(__cleancache_get_page); | |||
| 271 | void __cleancache_put_page(struct page *page) | 216 | void __cleancache_put_page(struct page *page) |
| 272 | { | 217 | { |
| 273 | int pool_id; | 218 | int pool_id; |
| 274 | int fake_pool_id; | ||
| 275 | struct cleancache_filekey key = { .u.key = { 0 } }; | 219 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 276 | 220 | ||
| 277 | if (!cleancache_ops) { | 221 | if (!cleancache_ops) { |
| @@ -280,12 +224,7 @@ void __cleancache_put_page(struct page *page) | |||
| 280 | } | 224 | } |
| 281 | 225 | ||
| 282 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 226 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 283 | fake_pool_id = page->mapping->host->i_sb->cleancache_poolid; | 227 | pool_id = page->mapping->host->i_sb->cleancache_poolid; |
| 284 | if (fake_pool_id < 0) | ||
| 285 | return; | ||
| 286 | |||
| 287 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 288 | |||
| 289 | if (pool_id >= 0 && | 228 | if (pool_id >= 0 && |
| 290 | cleancache_get_key(page->mapping->host, &key) >= 0) { | 229 | cleancache_get_key(page->mapping->host, &key) >= 0) { |
| 291 | cleancache_ops->put_page(pool_id, key, page->index, page); | 230 | cleancache_ops->put_page(pool_id, key, page->index, page); |
| @@ -306,18 +245,13 @@ void __cleancache_invalidate_page(struct address_space *mapping, | |||
| 306 | struct page *page) | 245 | struct page *page) |
| 307 | { | 246 | { |
| 308 | /* careful... page->mapping is NULL sometimes when this is called */ | 247 | /* careful... page->mapping is NULL sometimes when this is called */ |
| 309 | int pool_id; | 248 | int pool_id = mapping->host->i_sb->cleancache_poolid; |
| 310 | int fake_pool_id = mapping->host->i_sb->cleancache_poolid; | ||
| 311 | struct cleancache_filekey key = { .u.key = { 0 } }; | 249 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 312 | 250 | ||
| 313 | if (!cleancache_ops) | 251 | if (!cleancache_ops) |
| 314 | return; | 252 | return; |
| 315 | 253 | ||
| 316 | if (fake_pool_id >= 0) { | 254 | if (pool_id >= 0) { |
| 317 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 318 | if (pool_id < 0) | ||
| 319 | return; | ||
| 320 | |||
| 321 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 255 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
| 322 | if (cleancache_get_key(mapping->host, &key) >= 0) { | 256 | if (cleancache_get_key(mapping->host, &key) >= 0) { |
| 323 | cleancache_ops->invalidate_page(pool_id, | 257 | cleancache_ops->invalidate_page(pool_id, |
| @@ -339,18 +273,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page); | |||
| 339 | */ | 273 | */ |
| 340 | void __cleancache_invalidate_inode(struct address_space *mapping) | 274 | void __cleancache_invalidate_inode(struct address_space *mapping) |
| 341 | { | 275 | { |
| 342 | int pool_id; | 276 | int pool_id = mapping->host->i_sb->cleancache_poolid; |
| 343 | int fake_pool_id = mapping->host->i_sb->cleancache_poolid; | ||
| 344 | struct cleancache_filekey key = { .u.key = { 0 } }; | 277 | struct cleancache_filekey key = { .u.key = { 0 } }; |
| 345 | 278 | ||
| 346 | if (!cleancache_ops) | 279 | if (!cleancache_ops) |
| 347 | return; | 280 | return; |
| 348 | 281 | ||
| 349 | if (fake_pool_id < 0) | ||
| 350 | return; | ||
| 351 | |||
| 352 | pool_id = get_poolid_from_fake(fake_pool_id); | ||
| 353 | |||
| 354 | if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) | 282 | if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0) |
| 355 | cleancache_ops->invalidate_inode(pool_id, key); | 283 | cleancache_ops->invalidate_inode(pool_id, key); |
| 356 | } | 284 | } |
| @@ -363,32 +291,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode); | |||
| 363 | */ | 291 | */ |
| 364 | void __cleancache_invalidate_fs(struct super_block *sb) | 292 | void __cleancache_invalidate_fs(struct super_block *sb) |
| 365 | { | 293 | { |
| 366 | int index; | 294 | int pool_id; |
| 367 | int fake_pool_id = sb->cleancache_poolid; | ||
| 368 | int old_poolid = fake_pool_id; | ||
| 369 | 295 | ||
| 370 | mutex_lock(&poolid_mutex); | 296 | pool_id = sb->cleancache_poolid; |
| 371 | if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) { | 297 | sb->cleancache_poolid = CLEANCACHE_NO_POOL; |
| 372 | index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET; | 298 | |
| 373 | old_poolid = shared_fs_poolid_map[index]; | 299 | if (cleancache_ops && pool_id >= 0) |
| 374 | shared_fs_poolid_map[index] = FS_UNKNOWN; | 300 | cleancache_ops->invalidate_fs(pool_id); |
| 375 | uuids[index] = NULL; | ||
| 376 | } else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) { | ||
| 377 | index = fake_pool_id - FAKE_FS_POOLID_OFFSET; | ||
| 378 | old_poolid = fs_poolid_map[index]; | ||
| 379 | fs_poolid_map[index] = FS_UNKNOWN; | ||
| 380 | } | ||
| 381 | sb->cleancache_poolid = -1; | ||
| 382 | if (cleancache_ops) | ||
| 383 | cleancache_ops->invalidate_fs(old_poolid); | ||
| 384 | mutex_unlock(&poolid_mutex); | ||
| 385 | } | 301 | } |
| 386 | EXPORT_SYMBOL(__cleancache_invalidate_fs); | 302 | EXPORT_SYMBOL(__cleancache_invalidate_fs); |
| 387 | 303 | ||
| 388 | static int __init init_cleancache(void) | 304 | static int __init init_cleancache(void) |
| 389 | { | 305 | { |
| 390 | int i; | ||
| 391 | |||
| 392 | #ifdef CONFIG_DEBUG_FS | 306 | #ifdef CONFIG_DEBUG_FS |
| 393 | struct dentry *root = debugfs_create_dir("cleancache", NULL); | 307 | struct dentry *root = debugfs_create_dir("cleancache", NULL); |
| 394 | if (root == NULL) | 308 | if (root == NULL) |
| @@ -400,10 +314,6 @@ static int __init init_cleancache(void) | |||
| 400 | debugfs_create_u64("invalidates", S_IRUGO, | 314 | debugfs_create_u64("invalidates", S_IRUGO, |
| 401 | root, &cleancache_invalidates); | 315 | root, &cleancache_invalidates); |
| 402 | #endif | 316 | #endif |
| 403 | for (i = 0; i < MAX_INITIALIZABLE_FS; i++) { | ||
| 404 | fs_poolid_map[i] = FS_UNKNOWN; | ||
| 405 | shared_fs_poolid_map[i] = FS_UNKNOWN; | ||
| 406 | } | ||
| 407 | return 0; | 317 | return 0; |
| 408 | } | 318 | } |
| 409 | module_init(init_cleancache) | 319 | module_init(init_cleancache) |
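
With the rewrite above, backend registration reduces to a single call that atomically claims the cleancache_ops pointer and then walks the already-mounted super blocks. A minimal sketch of a backend module using the new return-value convention (the myback_* names are hypothetical; only cleancache_register_ops() itself comes from this code):

#include <linux/module.h>
#include <linux/cleancache.h>

/*
 * "myback_ops" stands in for a fully populated struct cleancache_ops
 * (init_fs, init_shared_fs, get_page, put_page and the invalidate_*
 * callbacks); the callback bodies are out of scope for this sketch.
 */
static struct cleancache_ops myback_ops;

static int __init myback_init(void)
{
	int ret;

	/* Single registration point; fails with -EBUSY if a backend already won. */
	ret = cleancache_register_ops(&myback_ops);
	if (ret)
		return ret;

	pr_info("myback: cleancache backend registered\n");
	return 0;
}
module_init(myback_init);
MODULE_LICENSE("GPL");

Unlike the old shim, the previous ops pointer is not handed back, so stacking or nesting backends is no longer expressible; the first registration wins and later ones see -EBUSY.
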
diff --git a/mm/cma.c b/mm/cma.c
--- a/mm/cma.c
+++ b/mm/cma.c
| @@ -23,6 +23,7 @@ | |||
| 23 | # define DEBUG | 23 | # define DEBUG |
| 24 | #endif | 24 | #endif |
| 25 | #endif | 25 | #endif |
| 26 | #define CREATE_TRACE_POINTS | ||
| 26 | 27 | ||
| 27 | #include <linux/memblock.h> | 28 | #include <linux/memblock.h> |
| 28 | #include <linux/err.h> | 29 | #include <linux/err.h> |
| @@ -34,59 +35,54 @@ | |||
| 34 | #include <linux/cma.h> | 35 | #include <linux/cma.h> |
| 35 | #include <linux/highmem.h> | 36 | #include <linux/highmem.h> |
| 36 | #include <linux/io.h> | 37 | #include <linux/io.h> |
| 38 | #include <trace/events/cma.h> | ||
| 37 | 39 | ||
| 38 | struct cma { | 40 | #include "cma.h" |
| 39 | unsigned long base_pfn; | ||
| 40 | unsigned long count; | ||
| 41 | unsigned long *bitmap; | ||
| 42 | unsigned int order_per_bit; /* Order of pages represented by one bit */ | ||
| 43 | struct mutex lock; | ||
| 44 | }; | ||
| 45 | 41 | ||
| 46 | static struct cma cma_areas[MAX_CMA_AREAS]; | 42 | struct cma cma_areas[MAX_CMA_AREAS]; |
| 47 | static unsigned cma_area_count; | 43 | unsigned cma_area_count; |
| 48 | static DEFINE_MUTEX(cma_mutex); | 44 | static DEFINE_MUTEX(cma_mutex); |
| 49 | 45 | ||
| 50 | phys_addr_t cma_get_base(struct cma *cma) | 46 | phys_addr_t cma_get_base(const struct cma *cma) |
| 51 | { | 47 | { |
| 52 | return PFN_PHYS(cma->base_pfn); | 48 | return PFN_PHYS(cma->base_pfn); |
| 53 | } | 49 | } |
| 54 | 50 | ||
| 55 | unsigned long cma_get_size(struct cma *cma) | 51 | unsigned long cma_get_size(const struct cma *cma) |
| 56 | { | 52 | { |
| 57 | return cma->count << PAGE_SHIFT; | 53 | return cma->count << PAGE_SHIFT; |
| 58 | } | 54 | } |
| 59 | 55 | ||
| 60 | static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) | 56 | static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, |
| 57 | int align_order) | ||
| 61 | { | 58 | { |
| 62 | if (align_order <= cma->order_per_bit) | 59 | if (align_order <= cma->order_per_bit) |
| 63 | return 0; | 60 | return 0; |
| 64 | return (1UL << (align_order - cma->order_per_bit)) - 1; | 61 | return (1UL << (align_order - cma->order_per_bit)) - 1; |
| 65 | } | 62 | } |
| 66 | 63 | ||
| 67 | static unsigned long cma_bitmap_aligned_offset(struct cma *cma, int align_order) | 64 | /* |
| 65 | * Find a PFN aligned to the specified order and return an offset represented in | ||
| 66 | * order_per_bits. | ||
| 67 | */ | ||
| 68 | static unsigned long cma_bitmap_aligned_offset(const struct cma *cma, | ||
| 69 | int align_order) | ||
| 68 | { | 70 | { |
| 69 | unsigned int alignment; | ||
| 70 | |||
| 71 | if (align_order <= cma->order_per_bit) | 71 | if (align_order <= cma->order_per_bit) |
| 72 | return 0; | 72 | return 0; |
| 73 | alignment = 1UL << (align_order - cma->order_per_bit); | ||
| 74 | return ALIGN(cma->base_pfn, alignment) - | ||
| 75 | (cma->base_pfn >> cma->order_per_bit); | ||
| 76 | } | ||
| 77 | 73 | ||
| 78 | static unsigned long cma_bitmap_maxno(struct cma *cma) | 74 | return (ALIGN(cma->base_pfn, (1UL << align_order)) |
| 79 | { | 75 | - cma->base_pfn) >> cma->order_per_bit; |
| 80 | return cma->count >> cma->order_per_bit; | ||
| 81 | } | 76 | } |
| 82 | 77 | ||
| 83 | static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, | 78 | static unsigned long cma_bitmap_pages_to_bits(const struct cma *cma, |
| 84 | unsigned long pages) | 79 | unsigned long pages) |
| 85 | { | 80 | { |
| 86 | return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; | 81 | return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; |
| 87 | } | 82 | } |
| 88 | 83 | ||
| 89 | static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) | 84 | static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, |
| 85 | unsigned int count) | ||
| 90 | { | 86 | { |
| 91 | unsigned long bitmap_no, bitmap_count; | 87 | unsigned long bitmap_no, bitmap_count; |
| 92 | 88 | ||
| @@ -132,6 +128,12 @@ static int __init cma_activate_area(struct cma *cma) | |||
| 132 | } while (--i); | 128 | } while (--i); |
| 133 | 129 | ||
| 134 | mutex_init(&cma->lock); | 130 | mutex_init(&cma->lock); |
| 131 | |||
| 132 | #ifdef CONFIG_CMA_DEBUGFS | ||
| 133 | INIT_HLIST_HEAD(&cma->mem_head); | ||
| 134 | spin_lock_init(&cma->mem_head_lock); | ||
| 135 | #endif | ||
| 136 | |||
| 135 | return 0; | 137 | return 0; |
| 136 | 138 | ||
| 137 | err: | 139 | err: |
| @@ -165,7 +167,8 @@ core_initcall(cma_init_reserved_areas); | |||
| 165 | * This function creates custom contiguous area from already reserved memory. | 167 | * This function creates custom contiguous area from already reserved memory. |
| 166 | */ | 168 | */ |
| 167 | int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, | 169 | int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, |
| 168 | int order_per_bit, struct cma **res_cma) | 170 | unsigned int order_per_bit, |
| 171 | struct cma **res_cma) | ||
| 169 | { | 172 | { |
| 170 | struct cma *cma; | 173 | struct cma *cma; |
| 171 | phys_addr_t alignment; | 174 | phys_addr_t alignment; |
| @@ -356,7 +359,7 @@ err: | |||
| 356 | * This function allocates part of contiguous memory on specific | 359 | * This function allocates part of contiguous memory on specific |
| 357 | * contiguous memory area. | 360 | * contiguous memory area. |
| 358 | */ | 361 | */ |
| 359 | struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | 362 | struct page *cma_alloc(struct cma *cma, unsigned int count, unsigned int align) |
| 360 | { | 363 | { |
| 361 | unsigned long mask, offset, pfn, start = 0; | 364 | unsigned long mask, offset, pfn, start = 0; |
| 362 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; | 365 | unsigned long bitmap_maxno, bitmap_no, bitmap_count; |
| @@ -413,6 +416,8 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | |||
| 413 | start = bitmap_no + mask + 1; | 416 | start = bitmap_no + mask + 1; |
| 414 | } | 417 | } |
| 415 | 418 | ||
| 419 | trace_cma_alloc(page ? pfn : -1UL, page, count, align); | ||
| 420 | |||
| 416 | pr_debug("%s(): returned %p\n", __func__, page); | 421 | pr_debug("%s(): returned %p\n", __func__, page); |
| 417 | return page; | 422 | return page; |
| 418 | } | 423 | } |
| @@ -427,7 +432,7 @@ struct page *cma_alloc(struct cma *cma, int count, unsigned int align) | |||
| 427 | * It returns false when provided pages do not belong to contiguous area and | 432 | * It returns false when provided pages do not belong to contiguous area and |
| 428 | * true otherwise. | 433 | * true otherwise. |
| 429 | */ | 434 | */ |
| 430 | bool cma_release(struct cma *cma, struct page *pages, int count) | 435 | bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) |
| 431 | { | 436 | { |
| 432 | unsigned long pfn; | 437 | unsigned long pfn; |
| 433 | 438 | ||
| @@ -445,6 +450,7 @@ bool cma_release(struct cma *cma, struct page *pages, int count) | |||
| 445 | 450 | ||
| 446 | free_contig_range(pfn, count); | 451 | free_contig_range(pfn, count); |
| 447 | cma_clear_bitmap(cma, pfn, count); | 452 | cma_clear_bitmap(cma, pfn, count); |
| 453 | trace_cma_release(pfn, pages, count); | ||
| 448 | 454 | ||
| 449 | return true; | 455 | return true; |
| 450 | } | 456 | } |
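
The reworked cma_bitmap_aligned_offset() keeps the subtraction in PFN units and only then shifts down by order_per_bit, where the old version subtracted a bit index from a page frame number. A small standalone sketch of the arithmetic (userspace re-implementation for illustration only; ALIGN mirrors the kernel macro and the parameters stand in for the struct cma fields of the same name):

#include <stdio.h>

/* Userspace illustration of the reworked cma_bitmap_aligned_offset()
 * arithmetic; ALIGN mirrors the kernel macro for power-of-two alignment. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

static unsigned long bitmap_aligned_offset(unsigned long base_pfn,
					   unsigned int order_per_bit,
					   int align_order)
{
	if (align_order <= order_per_bit)
		return 0;

	return (ALIGN(base_pfn, 1UL << align_order) - base_pfn)
		>> order_per_bit;
}

int main(void)
{
	/* Area starting at pfn 1000, one page per bit, request aligned to
	 * order 6 (64 pages): the first aligned pfn is 1024, so the scan
	 * starts 24 bits into the bitmap. Prints "24". */
	printf("%lu\n", bitmap_aligned_offset(1000, 0, 6));
	return 0;
}
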
diff --git a/mm/cma.h b/mm/cma.h
new file mode 100644
index 000000000000..1132d733556d
--- /dev/null
+++ b/mm/cma.h
| @@ -0,0 +1,24 @@ | |||
| 1 | #ifndef __MM_CMA_H__ | ||
| 2 | #define __MM_CMA_H__ | ||
| 3 | |||
| 4 | struct cma { | ||
| 5 | unsigned long base_pfn; | ||
| 6 | unsigned long count; | ||
| 7 | unsigned long *bitmap; | ||
| 8 | unsigned int order_per_bit; /* Order of pages represented by one bit */ | ||
| 9 | struct mutex lock; | ||
| 10 | #ifdef CONFIG_CMA_DEBUGFS | ||
| 11 | struct hlist_head mem_head; | ||
| 12 | spinlock_t mem_head_lock; | ||
| 13 | #endif | ||
| 14 | }; | ||
| 15 | |||
| 16 | extern struct cma cma_areas[MAX_CMA_AREAS]; | ||
| 17 | extern unsigned cma_area_count; | ||
| 18 | |||
| 19 | static unsigned long cma_bitmap_maxno(struct cma *cma) | ||
| 20 | { | ||
| 21 | return cma->count >> cma->order_per_bit; | ||
| 22 | } | ||
| 23 | |||
| 24 | #endif | ||
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
new file mode 100644
index 000000000000..7621ee34daa0
--- /dev/null
+++ b/mm/cma_debug.c
| @@ -0,0 +1,205 @@ | |||
| 1 | /* | ||
| 2 | * CMA DebugFS Interface | ||
| 3 | * | ||
| 4 | * Copyright (c) 2015 Sasha Levin <sasha.levin@oracle.com> | ||
| 5 | */ | ||
| 6 | |||
| 7 | |||
| 8 | #include <linux/debugfs.h> | ||
| 9 | #include <linux/cma.h> | ||
| 10 | #include <linux/list.h> | ||
| 11 | #include <linux/kernel.h> | ||
| 12 | #include <linux/slab.h> | ||
| 13 | #include <linux/mm_types.h> | ||
| 14 | |||
| 15 | #include "cma.h" | ||
| 16 | |||
| 17 | struct cma_mem { | ||
| 18 | struct hlist_node node; | ||
| 19 | struct page *p; | ||
| 20 | unsigned long n; | ||
| 21 | }; | ||
| 22 | |||
| 23 | static struct dentry *cma_debugfs_root; | ||
| 24 | |||
| 25 | static int cma_debugfs_get(void *data, u64 *val) | ||
| 26 | { | ||
| 27 | unsigned long *p = data; | ||
| 28 | |||
| 29 | *val = *p; | ||
| 30 | |||
| 31 | return 0; | ||
| 32 | } | ||
| 33 | DEFINE_SIMPLE_ATTRIBUTE(cma_debugfs_fops, cma_debugfs_get, NULL, "%llu\n"); | ||
| 34 | |||
| 35 | static int cma_used_get(void *data, u64 *val) | ||
| 36 | { | ||
| 37 | struct cma *cma = data; | ||
| 38 | unsigned long used; | ||
| 39 | |||
| 40 | mutex_lock(&cma->lock); | ||
| 41 | /* pages counter is smaller than sizeof(int) */ | ||
| 42 | used = bitmap_weight(cma->bitmap, (int)cma->count); | ||
| 43 | mutex_unlock(&cma->lock); | ||
| 44 | *val = (u64)used << cma->order_per_bit; | ||
| 45 | |||
| 46 | return 0; | ||
| 47 | } | ||
| 48 | DEFINE_SIMPLE_ATTRIBUTE(cma_used_fops, cma_used_get, NULL, "%llu\n"); | ||
| 49 | |||
| 50 | static int cma_maxchunk_get(void *data, u64 *val) | ||
| 51 | { | ||
| 52 | struct cma *cma = data; | ||
| 53 | unsigned long maxchunk = 0; | ||
| 54 | unsigned long start, end = 0; | ||
| 55 | |||
| 56 | mutex_lock(&cma->lock); | ||
| 57 | for (;;) { | ||
| 58 | start = find_next_zero_bit(cma->bitmap, cma->count, end); | ||
| 59 | if (start >= cma->count) | ||
| 60 | break; | ||
| 61 | end = find_next_bit(cma->bitmap, cma->count, start); | ||
| 62 | maxchunk = max(end - start, maxchunk); | ||
| 63 | } | ||
| 64 | mutex_unlock(&cma->lock); | ||
| 65 | *val = (u64)maxchunk << cma->order_per_bit; | ||
| 66 | |||
| 67 | return 0; | ||
| 68 | } | ||
| 69 | DEFINE_SIMPLE_ATTRIBUTE(cma_maxchunk_fops, cma_maxchunk_get, NULL, "%llu\n"); | ||
| 70 | |||
| 71 | static void cma_add_to_cma_mem_list(struct cma *cma, struct cma_mem *mem) | ||
| 72 | { | ||
| 73 | spin_lock(&cma->mem_head_lock); | ||
| 74 | hlist_add_head(&mem->node, &cma->mem_head); | ||
| 75 | spin_unlock(&cma->mem_head_lock); | ||
| 76 | } | ||
| 77 | |||
| 78 | static struct cma_mem *cma_get_entry_from_list(struct cma *cma) | ||
| 79 | { | ||
| 80 | struct cma_mem *mem = NULL; | ||
| 81 | |||
| 82 | spin_lock(&cma->mem_head_lock); | ||
| 83 | if (!hlist_empty(&cma->mem_head)) { | ||
| 84 | mem = hlist_entry(cma->mem_head.first, struct cma_mem, node); | ||
| 85 | hlist_del_init(&mem->node); | ||
| 86 | } | ||
| 87 | spin_unlock(&cma->mem_head_lock); | ||
| 88 | |||
| 89 | return mem; | ||
| 90 | } | ||
| 91 | |||
| 92 | static int cma_free_mem(struct cma *cma, int count) | ||
| 93 | { | ||
| 94 | struct cma_mem *mem = NULL; | ||
| 95 | |||
| 96 | while (count) { | ||
| 97 | mem = cma_get_entry_from_list(cma); | ||
| 98 | if (mem == NULL) | ||
| 99 | return 0; | ||
| 100 | |||
| 101 | if (mem->n <= count) { | ||
| 102 | cma_release(cma, mem->p, mem->n); | ||
| 103 | count -= mem->n; | ||
| 104 | kfree(mem); | ||
| 105 | } else if (cma->order_per_bit == 0) { | ||
| 106 | cma_release(cma, mem->p, count); | ||
| 107 | mem->p += count; | ||
| 108 | mem->n -= count; | ||
| 109 | count = 0; | ||
| 110 | cma_add_to_cma_mem_list(cma, mem); | ||
| 111 | } else { | ||
| 112 | pr_debug("cma: cannot release partial block when order_per_bit != 0\n"); | ||
| 113 | cma_add_to_cma_mem_list(cma, mem); | ||
| 114 | break; | ||
| 115 | } | ||
| 116 | } | ||
| 117 | |||
| 118 | return 0; | ||
| 119 | |||
| 120 | } | ||
| 121 | |||
| 122 | static int cma_free_write(void *data, u64 val) | ||
| 123 | { | ||
| 124 | int pages = val; | ||
| 125 | struct cma *cma = data; | ||
| 126 | |||
| 127 | return cma_free_mem(cma, pages); | ||
| 128 | } | ||
| 129 | DEFINE_SIMPLE_ATTRIBUTE(cma_free_fops, NULL, cma_free_write, "%llu\n"); | ||
| 130 | |||
| 131 | static int cma_alloc_mem(struct cma *cma, int count) | ||
| 132 | { | ||
| 133 | struct cma_mem *mem; | ||
| 134 | struct page *p; | ||
| 135 | |||
| 136 | mem = kzalloc(sizeof(*mem), GFP_KERNEL); | ||
| 137 | if (!mem) | ||
| 138 | return -ENOMEM; | ||
| 139 | |||
| 140 | p = cma_alloc(cma, count, 0); | ||
| 141 | if (!p) { | ||
| 142 | kfree(mem); | ||
| 143 | return -ENOMEM; | ||
| 144 | } | ||
| 145 | |||
| 146 | mem->p = p; | ||
| 147 | mem->n = count; | ||
| 148 | |||
| 149 | cma_add_to_cma_mem_list(cma, mem); | ||
| 150 | |||
| 151 | return 0; | ||
| 152 | } | ||
| 153 | |||
| 154 | static int cma_alloc_write(void *data, u64 val) | ||
| 155 | { | ||
| 156 | int pages = val; | ||
| 157 | struct cma *cma = data; | ||
| 158 | |||
| 159 | return cma_alloc_mem(cma, pages); | ||
| 160 | } | ||
| 161 | DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); | ||
| 162 | |||
| 163 | static void cma_debugfs_add_one(struct cma *cma, int idx) | ||
| 164 | { | ||
| 165 | struct dentry *tmp; | ||
| 166 | char name[16]; | ||
| 167 | int u32s; | ||
| 168 | |||
| 169 | sprintf(name, "cma-%d", idx); | ||
| 170 | |||
| 171 | tmp = debugfs_create_dir(name, cma_debugfs_root); | ||
| 172 | |||
| 173 | debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma, | ||
| 174 | &cma_alloc_fops); | ||
| 175 | |||
| 176 | debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma, | ||
| 177 | &cma_free_fops); | ||
| 178 | |||
| 179 | debugfs_create_file("base_pfn", S_IRUGO, tmp, | ||
| 180 | &cma->base_pfn, &cma_debugfs_fops); | ||
| 181 | debugfs_create_file("count", S_IRUGO, tmp, | ||
| 182 | &cma->count, &cma_debugfs_fops); | ||
| 183 | debugfs_create_file("order_per_bit", S_IRUGO, tmp, | ||
| 184 | &cma->order_per_bit, &cma_debugfs_fops); | ||
| 185 | debugfs_create_file("used", S_IRUGO, tmp, cma, &cma_used_fops); | ||
| 186 | debugfs_create_file("maxchunk", S_IRUGO, tmp, cma, &cma_maxchunk_fops); | ||
| 187 | |||
| 188 | u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32)); | ||
| 189 | debugfs_create_u32_array("bitmap", S_IRUGO, tmp, (u32*)cma->bitmap, u32s); | ||
| 190 | } | ||
| 191 | |||
| 192 | static int __init cma_debugfs_init(void) | ||
| 193 | { | ||
| 194 | int i; | ||
| 195 | |||
| 196 | cma_debugfs_root = debugfs_create_dir("cma", NULL); | ||
| 197 | if (!cma_debugfs_root) | ||
| 198 | return -ENOMEM; | ||
| 199 | |||
| 200 | for (i = 0; i < cma_area_count; i++) | ||
| 201 | cma_debugfs_add_one(&cma_areas[i], i); | ||
| 202 | |||
| 203 | return 0; | ||
| 204 | } | ||
| 205 | late_initcall(cma_debugfs_init); | ||
diff --git a/mm/compaction.c b/mm/compaction.c
index 8c0d9459b54a..018f08da99a2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
| @@ -391,28 +391,6 @@ static inline bool compact_should_abort(struct compact_control *cc) | |||
| 391 | return false; | 391 | return false; |
| 392 | } | 392 | } |
| 393 | 393 | ||
| 394 | /* Returns true if the page is within a block suitable for migration to */ | ||
| 395 | static bool suitable_migration_target(struct page *page) | ||
| 396 | { | ||
| 397 | /* If the page is a large free page, then disallow migration */ | ||
| 398 | if (PageBuddy(page)) { | ||
| 399 | /* | ||
| 400 | * We are checking page_order without zone->lock taken. But | ||
| 401 | * the only small danger is that we skip a potentially suitable | ||
| 402 | * pageblock, so it's not worth to check order for valid range. | ||
| 403 | */ | ||
| 404 | if (page_order_unsafe(page) >= pageblock_order) | ||
| 405 | return false; | ||
| 406 | } | ||
| 407 | |||
| 408 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
| 409 | if (migrate_async_suitable(get_pageblock_migratetype(page))) | ||
| 410 | return true; | ||
| 411 | |||
| 412 | /* Otherwise skip the block */ | ||
| 413 | return false; | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | 394 | /* |
| 417 | * Isolate free pages onto a private freelist. If @strict is true, will abort | 395 | * Isolate free pages onto a private freelist. If @strict is true, will abort |
| 418 | * returning 0 on any invalid PFNs or non-free pages inside of the pageblock | 396 | * returning 0 on any invalid PFNs or non-free pages inside of the pageblock |
| @@ -896,6 +874,29 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, | |||
| 896 | 874 | ||
| 897 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 875 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
| 898 | #ifdef CONFIG_COMPACTION | 876 | #ifdef CONFIG_COMPACTION |
| 877 | |||
| 878 | /* Returns true if the page is within a block suitable for migration to */ | ||
| 879 | static bool suitable_migration_target(struct page *page) | ||
| 880 | { | ||
| 881 | /* If the page is a large free page, then disallow migration */ | ||
| 882 | if (PageBuddy(page)) { | ||
| 883 | /* | ||
| 884 | * We are checking page_order without zone->lock taken. But | ||
| 885 | * the only small danger is that we skip a potentially suitable | ||
| 886 | * pageblock, so it's not worth to check order for valid range. | ||
| 887 | */ | ||
| 888 | if (page_order_unsafe(page) >= pageblock_order) | ||
| 889 | return false; | ||
| 890 | } | ||
| 891 | |||
| 892 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | ||
| 893 | if (migrate_async_suitable(get_pageblock_migratetype(page))) | ||
| 894 | return true; | ||
| 895 | |||
| 896 | /* Otherwise skip the block */ | ||
| 897 | return false; | ||
| 898 | } | ||
| 899 | |||
| 899 | /* | 900 | /* |
| 900 | * Based on information in the current compact_control, find blocks | 901 | * Based on information in the current compact_control, find blocks |
| 901 | * suitable for isolating free pages from and then isolate them. | 902 | * suitable for isolating free pages from and then isolate them. |
| @@ -1047,6 +1048,12 @@ typedef enum { | |||
| 1047 | } isolate_migrate_t; | 1048 | } isolate_migrate_t; |
| 1048 | 1049 | ||
| 1049 | /* | 1050 | /* |
| 1051 | * Allow userspace to control policy on scanning the unevictable LRU for | ||
| 1052 | * compactable pages. | ||
| 1053 | */ | ||
| 1054 | int sysctl_compact_unevictable_allowed __read_mostly = 1; | ||
| 1055 | |||
| 1056 | /* | ||
| 1050 | * Isolate all pages that can be migrated from the first suitable block, | 1057 | * Isolate all pages that can be migrated from the first suitable block, |
| 1051 | * starting at the block pointed to by the migrate scanner pfn within | 1058 | * starting at the block pointed to by the migrate scanner pfn within |
| 1052 | * compact_control. | 1059 | * compact_control. |
| @@ -1057,6 +1064,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
| 1057 | unsigned long low_pfn, end_pfn; | 1064 | unsigned long low_pfn, end_pfn; |
| 1058 | struct page *page; | 1065 | struct page *page; |
| 1059 | const isolate_mode_t isolate_mode = | 1066 | const isolate_mode_t isolate_mode = |
| 1067 | (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | | ||
| 1060 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); | 1068 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); |
| 1061 | 1069 | ||
| 1062 | /* | 1070 | /* |
| @@ -1174,13 +1182,24 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, | |||
| 1174 | /* Direct compactor: Is a suitable page free? */ | 1182 | /* Direct compactor: Is a suitable page free? */ |
| 1175 | for (order = cc->order; order < MAX_ORDER; order++) { | 1183 | for (order = cc->order; order < MAX_ORDER; order++) { |
| 1176 | struct free_area *area = &zone->free_area[order]; | 1184 | struct free_area *area = &zone->free_area[order]; |
| 1185 | bool can_steal; | ||
| 1177 | 1186 | ||
| 1178 | /* Job done if page is free of the right migratetype */ | 1187 | /* Job done if page is free of the right migratetype */ |
| 1179 | if (!list_empty(&area->free_list[migratetype])) | 1188 | if (!list_empty(&area->free_list[migratetype])) |
| 1180 | return COMPACT_PARTIAL; | 1189 | return COMPACT_PARTIAL; |
| 1181 | 1190 | ||
| 1182 | /* Job done if allocation would set block type */ | 1191 | #ifdef CONFIG_CMA |
| 1183 | if (order >= pageblock_order && area->nr_free) | 1192 | /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */ |
| 1193 | if (migratetype == MIGRATE_MOVABLE && | ||
| 1194 | !list_empty(&area->free_list[MIGRATE_CMA])) | ||
| 1195 | return COMPACT_PARTIAL; | ||
| 1196 | #endif | ||
| 1197 | /* | ||
| 1198 | * Job done if allocation would steal freepages from | ||
| 1199 | * other migratetype buddy lists. | ||
| 1200 | */ | ||
| 1201 | if (find_suitable_fallback(area, order, migratetype, | ||
| 1202 | true, &can_steal) != -1) | ||
| 1184 | return COMPACT_PARTIAL; | 1203 | return COMPACT_PARTIAL; |
| 1185 | } | 1204 | } |
| 1186 | 1205 | ||
| @@ -1587,6 +1606,14 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) | |||
| 1587 | INIT_LIST_HEAD(&cc->freepages); | 1606 | INIT_LIST_HEAD(&cc->freepages); |
| 1588 | INIT_LIST_HEAD(&cc->migratepages); | 1607 | INIT_LIST_HEAD(&cc->migratepages); |
| 1589 | 1608 | ||
| 1609 | /* | ||
| 1610 | * When called via /proc/sys/vm/compact_memory | ||
| 1611 | * this makes sure we compact the whole zone regardless of | ||
| 1612 | * cached scanner positions. | ||
| 1613 | */ | ||
| 1614 | if (cc->order == -1) | ||
| 1615 | __reset_isolation_suitable(zone); | ||
| 1616 | |||
| 1590 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) | 1617 | if (cc->order == -1 || !compaction_deferred(zone, cc->order)) |
| 1591 | compact_zone(zone, cc); | 1618 | compact_zone(zone, cc); |
| 1592 | 1619 | ||
diff --git a/mm/filemap.c b/mm/filemap.c
index ad7242043bdb..6bf5e42d560a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/compiler.h> | 13 | #include <linux/compiler.h> |
| 14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
| 15 | #include <linux/uaccess.h> | 15 | #include <linux/uaccess.h> |
| 16 | #include <linux/aio.h> | ||
| 17 | #include <linux/capability.h> | 16 | #include <linux/capability.h> |
| 18 | #include <linux/kernel_stat.h> | 17 | #include <linux/kernel_stat.h> |
| 19 | #include <linux/gfp.h> | 18 | #include <linux/gfp.h> |
| @@ -203,16 +202,15 @@ void __delete_from_page_cache(struct page *page, void *shadow) | |||
| 203 | BUG_ON(page_mapped(page)); | 202 | BUG_ON(page_mapped(page)); |
| 204 | 203 | ||
| 205 | /* | 204 | /* |
| 206 | * Some filesystems seem to re-dirty the page even after | 205 | * At this point page must be either written or cleaned by truncate. |
| 207 | * the VM has canceled the dirty bit (eg ext3 journaling). | 206 | * Dirty page here signals a bug and loss of unwritten data. |
| 208 | * | 207 | * |
| 209 | * Fix it up by doing a final dirty accounting check after | 208 | * This fixes dirty accounting after removing the page entirely but |
| 210 | * having removed the page entirely. | 209 | * leaves PageDirty set: it has no effect for truncated page and |
| 210 | * anyway will be cleared before returning page into buddy allocator. | ||
| 211 | */ | 211 | */ |
| 212 | if (PageDirty(page) && mapping_cap_account_dirty(mapping)) { | 212 | if (WARN_ON_ONCE(PageDirty(page))) |
| 213 | dec_zone_page_state(page, NR_FILE_DIRTY); | 213 | account_page_cleaned(page, mapping); |
| 214 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); | ||
| 215 | } | ||
| 216 | } | 214 | } |
| 217 | 215 | ||
| 218 | /** | 216 | /** |
| @@ -1695,7 +1693,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
| 1695 | loff_t *ppos = &iocb->ki_pos; | 1693 | loff_t *ppos = &iocb->ki_pos; |
| 1696 | loff_t pos = *ppos; | 1694 | loff_t pos = *ppos; |
| 1697 | 1695 | ||
| 1698 | if (io_is_direct(file)) { | 1696 | if (iocb->ki_flags & IOCB_DIRECT) { |
| 1699 | struct address_space *mapping = file->f_mapping; | 1697 | struct address_space *mapping = file->f_mapping; |
| 1700 | struct inode *inode = mapping->host; | 1698 | struct inode *inode = mapping->host; |
| 1701 | size_t count = iov_iter_count(iter); | 1699 | size_t count = iov_iter_count(iter); |
| @@ -1708,7 +1706,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) | |||
| 1708 | pos + count - 1); | 1706 | pos + count - 1); |
| 1709 | if (!retval) { | 1707 | if (!retval) { |
| 1710 | struct iov_iter data = *iter; | 1708 | struct iov_iter data = *iter; |
| 1711 | retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); | 1709 | retval = mapping->a_ops->direct_IO(iocb, &data, pos); |
| 1712 | } | 1710 | } |
| 1713 | 1711 | ||
| 1714 | if (retval > 0) { | 1712 | if (retval > 0) { |
| @@ -2261,41 +2259,38 @@ EXPORT_SYMBOL(read_cache_page_gfp); | |||
| 2261 | * Returns appropriate error code that caller should return or | 2259 | * Returns appropriate error code that caller should return or |
| 2262 | * zero in case that write should be allowed. | 2260 | * zero in case that write should be allowed. |
| 2263 | */ | 2261 | */ |
| 2264 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) | 2262 | inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) |
| 2265 | { | 2263 | { |
| 2264 | struct file *file = iocb->ki_filp; | ||
| 2266 | struct inode *inode = file->f_mapping->host; | 2265 | struct inode *inode = file->f_mapping->host; |
| 2267 | unsigned long limit = rlimit(RLIMIT_FSIZE); | 2266 | unsigned long limit = rlimit(RLIMIT_FSIZE); |
| 2267 | loff_t pos; | ||
| 2268 | 2268 | ||
| 2269 | if (unlikely(*pos < 0)) | 2269 | if (!iov_iter_count(from)) |
| 2270 | return -EINVAL; | 2270 | return 0; |
| 2271 | 2271 | ||
| 2272 | if (!isblk) { | 2272 | /* FIXME: this is for backwards compatibility with 2.4 */ |
| 2273 | /* FIXME: this is for backwards compatibility with 2.4 */ | 2273 | if (iocb->ki_flags & IOCB_APPEND) |
| 2274 | if (file->f_flags & O_APPEND) | 2274 | iocb->ki_pos = i_size_read(inode); |
| 2275 | *pos = i_size_read(inode); | ||
| 2276 | 2275 | ||
| 2277 | if (limit != RLIM_INFINITY) { | 2276 | pos = iocb->ki_pos; |
| 2278 | if (*pos >= limit) { | 2277 | |
| 2279 | send_sig(SIGXFSZ, current, 0); | 2278 | if (limit != RLIM_INFINITY) { |
| 2280 | return -EFBIG; | 2279 | if (iocb->ki_pos >= limit) { |
| 2281 | } | 2280 | send_sig(SIGXFSZ, current, 0); |
| 2282 | if (*count > limit - (typeof(limit))*pos) { | 2281 | return -EFBIG; |
| 2283 | *count = limit - (typeof(limit))*pos; | ||
| 2284 | } | ||
| 2285 | } | 2282 | } |
| 2283 | iov_iter_truncate(from, limit - (unsigned long)pos); | ||
| 2286 | } | 2284 | } |
| 2287 | 2285 | ||
| 2288 | /* | 2286 | /* |
| 2289 | * LFS rule | 2287 | * LFS rule |
| 2290 | */ | 2288 | */ |
| 2291 | if (unlikely(*pos + *count > MAX_NON_LFS && | 2289 | if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS && |
| 2292 | !(file->f_flags & O_LARGEFILE))) { | 2290 | !(file->f_flags & O_LARGEFILE))) { |
| 2293 | if (*pos >= MAX_NON_LFS) { | 2291 | if (pos >= MAX_NON_LFS) |
| 2294 | return -EFBIG; | 2292 | return -EFBIG; |
| 2295 | } | 2293 | iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos); |
| 2296 | if (*count > MAX_NON_LFS - (unsigned long)*pos) { | ||
| 2297 | *count = MAX_NON_LFS - (unsigned long)*pos; | ||
| 2298 | } | ||
| 2299 | } | 2294 | } |
| 2300 | 2295 | ||
| 2301 | /* | 2296 | /* |
| @@ -2305,34 +2300,11 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
| 2305 | * exceeded without writing data we send a signal and return EFBIG. | 2300 | * exceeded without writing data we send a signal and return EFBIG. |
| 2306 | * Linus frestrict idea will clean these up nicely.. | 2301 | * Linus frestrict idea will clean these up nicely.. |
| 2307 | */ | 2302 | */ |
| 2308 | if (likely(!isblk)) { | 2303 | if (unlikely(pos >= inode->i_sb->s_maxbytes)) |
| 2309 | if (unlikely(*pos >= inode->i_sb->s_maxbytes)) { | 2304 | return -EFBIG; |
| 2310 | if (*count || *pos > inode->i_sb->s_maxbytes) { | ||
| 2311 | return -EFBIG; | ||
| 2312 | } | ||
| 2313 | /* zero-length writes at ->s_maxbytes are OK */ | ||
| 2314 | } | ||
| 2315 | |||
| 2316 | if (unlikely(*pos + *count > inode->i_sb->s_maxbytes)) | ||
| 2317 | *count = inode->i_sb->s_maxbytes - *pos; | ||
| 2318 | } else { | ||
| 2319 | #ifdef CONFIG_BLOCK | ||
| 2320 | loff_t isize; | ||
| 2321 | if (bdev_read_only(I_BDEV(inode))) | ||
| 2322 | return -EPERM; | ||
| 2323 | isize = i_size_read(inode); | ||
| 2324 | if (*pos >= isize) { | ||
| 2325 | if (*count || *pos > isize) | ||
| 2326 | return -ENOSPC; | ||
| 2327 | } | ||
| 2328 | 2305 | ||
| 2329 | if (*pos + *count > isize) | 2306 | iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos); |
| 2330 | *count = isize - *pos; | 2307 | return iov_iter_count(from); |
| 2331 | #else | ||
| 2332 | return -EPERM; | ||
| 2333 | #endif | ||
| 2334 | } | ||
| 2335 | return 0; | ||
| 2336 | } | 2308 | } |
| 2337 | EXPORT_SYMBOL(generic_write_checks); | 2309 | EXPORT_SYMBOL(generic_write_checks); |
| 2338 | 2310 | ||
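
generic_write_checks() now takes the kiocb and the iov_iter directly and reports back through its ssize_t return: negative for an error, zero when nothing is left to write, otherwise the (possibly truncated) byte count, with iocb->ki_pos repositioned for append writes and the iterator clamped against rlimit, MAX_NON_LFS and s_maxbytes. A sketch of the resulting calling convention in a filesystem's ->write_iter; foo_file_write_iter is hypothetical and leaves out the inode locking and generic_write_sync() handling a real implementation would need:

#include <linux/fs.h>
#include <linux/uio.h>

/*
 * Sketch of the new generic_write_checks() calling convention in a
 * filesystem's ->write_iter; "foo_file_write_iter" is hypothetical.
 */
static ssize_t foo_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;

	ret = generic_write_checks(iocb, from);
	if (ret <= 0)		/* error, or nothing left after clamping */
		return ret;

	/* ki_pos and *from are already adjusted; do the actual write. */
	return __generic_file_write_iter(iocb, from);
}
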
| @@ -2396,7 +2368,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | |||
| 2396 | } | 2368 | } |
| 2397 | 2369 | ||
| 2398 | data = *from; | 2370 | data = *from; |
| 2399 | written = mapping->a_ops->direct_IO(WRITE, iocb, &data, pos); | 2371 | written = mapping->a_ops->direct_IO(iocb, &data, pos); |
| 2400 | 2372 | ||
| 2401 | /* | 2373 | /* |
| 2402 | * Finally, try again to invalidate clean pages which might have been | 2374 | * Finally, try again to invalidate clean pages which might have been |
| @@ -2558,23 +2530,12 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2558 | struct file *file = iocb->ki_filp; | 2530 | struct file *file = iocb->ki_filp; |
| 2559 | struct address_space * mapping = file->f_mapping; | 2531 | struct address_space * mapping = file->f_mapping; |
| 2560 | struct inode *inode = mapping->host; | 2532 | struct inode *inode = mapping->host; |
| 2561 | loff_t pos = iocb->ki_pos; | ||
| 2562 | ssize_t written = 0; | 2533 | ssize_t written = 0; |
| 2563 | ssize_t err; | 2534 | ssize_t err; |
| 2564 | ssize_t status; | 2535 | ssize_t status; |
| 2565 | size_t count = iov_iter_count(from); | ||
| 2566 | 2536 | ||
| 2567 | /* We can write back this queue in page reclaim */ | 2537 | /* We can write back this queue in page reclaim */ |
| 2568 | current->backing_dev_info = inode_to_bdi(inode); | 2538 | current->backing_dev_info = inode_to_bdi(inode); |
| 2569 | err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); | ||
| 2570 | if (err) | ||
| 2571 | goto out; | ||
| 2572 | |||
| 2573 | if (count == 0) | ||
| 2574 | goto out; | ||
| 2575 | |||
| 2576 | iov_iter_truncate(from, count); | ||
| 2577 | |||
| 2578 | err = file_remove_suid(file); | 2539 | err = file_remove_suid(file); |
| 2579 | if (err) | 2540 | if (err) |
| 2580 | goto out; | 2541 | goto out; |
| @@ -2583,10 +2544,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2583 | if (err) | 2544 | if (err) |
| 2584 | goto out; | 2545 | goto out; |
| 2585 | 2546 | ||
| 2586 | if (io_is_direct(file)) { | 2547 | if (iocb->ki_flags & IOCB_DIRECT) { |
| 2587 | loff_t endbyte; | 2548 | loff_t pos, endbyte; |
| 2588 | 2549 | ||
| 2589 | written = generic_file_direct_write(iocb, from, pos); | 2550 | written = generic_file_direct_write(iocb, from, iocb->ki_pos); |
| 2590 | /* | 2551 | /* |
| 2591 | * If the write stopped short of completing, fall back to | 2552 | * If the write stopped short of completing, fall back to |
| 2592 | * buffered writes. Some filesystems do this for writes to | 2553 | * buffered writes. Some filesystems do this for writes to |
| @@ -2594,13 +2555,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2594 | * not succeed (even if it did, DAX does not handle dirty | 2555 | * not succeed (even if it did, DAX does not handle dirty |
| 2595 | * page-cache pages correctly). | 2556 | * page-cache pages correctly). |
| 2596 | */ | 2557 | */ |
| 2597 | if (written < 0 || written == count || IS_DAX(inode)) | 2558 | if (written < 0 || !iov_iter_count(from) || IS_DAX(inode)) |
| 2598 | goto out; | 2559 | goto out; |
| 2599 | 2560 | ||
| 2600 | pos += written; | 2561 | status = generic_perform_write(file, from, pos = iocb->ki_pos); |
| 2601 | count -= written; | ||
| 2602 | |||
| 2603 | status = generic_perform_write(file, from, pos); | ||
| 2604 | /* | 2562 | /* |
| 2605 | * If generic_perform_write() returned a synchronous error | 2563 | * If generic_perform_write() returned a synchronous error |
| 2606 | * then we want to return the number of bytes which were | 2564 | * then we want to return the number of bytes which were |
| @@ -2612,15 +2570,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2612 | err = status; | 2570 | err = status; |
| 2613 | goto out; | 2571 | goto out; |
| 2614 | } | 2572 | } |
| 2615 | iocb->ki_pos = pos + status; | ||
| 2616 | /* | 2573 | /* |
| 2617 | * We need to ensure that the page cache pages are written to | 2574 | * We need to ensure that the page cache pages are written to |
| 2618 | * disk and invalidated to preserve the expected O_DIRECT | 2575 | * disk and invalidated to preserve the expected O_DIRECT |
| 2619 | * semantics. | 2576 | * semantics. |
| 2620 | */ | 2577 | */ |
| 2621 | endbyte = pos + status - 1; | 2578 | endbyte = pos + status - 1; |
| 2622 | err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); | 2579 | err = filemap_write_and_wait_range(mapping, pos, endbyte); |
| 2623 | if (err == 0) { | 2580 | if (err == 0) { |
| 2581 | iocb->ki_pos = endbyte + 1; | ||
| 2624 | written += status; | 2582 | written += status; |
| 2625 | invalidate_mapping_pages(mapping, | 2583 | invalidate_mapping_pages(mapping, |
| 2626 | pos >> PAGE_CACHE_SHIFT, | 2584 | pos >> PAGE_CACHE_SHIFT, |
| @@ -2632,9 +2590,9 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2632 | */ | 2590 | */ |
| 2633 | } | 2591 | } |
| 2634 | } else { | 2592 | } else { |
| 2635 | written = generic_perform_write(file, from, pos); | 2593 | written = generic_perform_write(file, from, iocb->ki_pos); |
| 2636 | if (likely(written >= 0)) | 2594 | if (likely(written > 0)) |
| 2637 | iocb->ki_pos = pos + written; | 2595 | iocb->ki_pos += written; |
| 2638 | } | 2596 | } |
| 2639 | out: | 2597 | out: |
| 2640 | current->backing_dev_info = NULL; | 2598 | current->backing_dev_info = NULL; |
| @@ -2658,7 +2616,9 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
| 2658 | ssize_t ret; | 2616 | ssize_t ret; |
| 2659 | 2617 | ||
| 2660 | mutex_lock(&inode->i_mutex); | 2618 | mutex_lock(&inode->i_mutex); |
| 2661 | ret = __generic_file_write_iter(iocb, from); | 2619 | ret = generic_write_checks(iocb, from); |
| 2620 | if (ret > 0) | ||
| 2621 | ret = __generic_file_write_iter(iocb, from); | ||
| 2662 | mutex_unlock(&inode->i_mutex); | 2622 | mutex_unlock(&inode->i_mutex); |
| 2663 | 2623 | ||
| 2664 | if (ret > 0) { | 2624 | if (ret > 0) { |
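With generic_write_checks() now taking the kiocb and iov_iter, the checks clamp the iterator in place and return the number of writable bytes (0 means nothing to do, negative is an error), so callers no longer juggle a separate pos/count pair. A sketch of the resulting caller shape for a filesystem that reuses the generic path, mirroring what generic_file_write_iter() itself now does (hypothetical myfs_file_write_iter; generic_write_sync() still takes file/pos/count at this point):

	static ssize_t myfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
	{
		struct file *file = iocb->ki_filp;
		struct inode *inode = file->f_mapping->host;
		ssize_t ret;

		mutex_lock(&inode->i_mutex);
		ret = generic_write_checks(iocb, from);	/* clamps 'from', returns bytes left */
		if (ret > 0)
			ret = __generic_file_write_iter(iocb, from);
		mutex_unlock(&inode->i_mutex);

		if (ret > 0) {
			ssize_t err = generic_write_sync(file, iocb->ki_pos - ret, ret);
			if (err < 0)
				ret = err;
		}
		return ret;
	}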
| @@ -92,7 +92,7 @@ retry: | |||
| 92 | */ | 92 | */ |
| 93 | mark_page_accessed(page); | 93 | mark_page_accessed(page); |
| 94 | } | 94 | } |
| 95 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 95 | if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { |
| 96 | /* | 96 | /* |
| 97 | * The preliminary mapping check is mainly to avoid the | 97 | * The preliminary mapping check is mainly to avoid the |
| 98 | * pointless overhead of lock_page on the ZERO_PAGE | 98 | * pointless overhead of lock_page on the ZERO_PAGE |
| @@ -265,8 +265,8 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, | |||
| 265 | unsigned int fault_flags = 0; | 265 | unsigned int fault_flags = 0; |
| 266 | int ret; | 266 | int ret; |
| 267 | 267 | ||
| 268 | /* For mlock, just skip the stack guard page. */ | 268 | /* For mm_populate(), just skip the stack guard page. */ |
| 269 | if ((*flags & FOLL_MLOCK) && | 269 | if ((*flags & FOLL_POPULATE) && |
| 270 | (stack_guard_page_start(vma, address) || | 270 | (stack_guard_page_start(vma, address) || |
| 271 | stack_guard_page_end(vma, address + PAGE_SIZE))) | 271 | stack_guard_page_end(vma, address + PAGE_SIZE))) |
| 272 | return -ENOENT; | 272 | return -ENOENT; |
| @@ -819,6 +819,124 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 819 | EXPORT_SYMBOL(get_user_pages); | 819 | EXPORT_SYMBOL(get_user_pages); |
| 820 | 820 | ||
| 821 | /** | 821 | /** |
| 822 | * populate_vma_page_range() - populate a range of pages in the vma. | ||
| 823 | * @vma: target vma | ||
| 824 | * @start: start address | ||
| 825 | * @end: end address | ||
| 826 | * @nonblocking: | ||
| 827 | * | ||
| 828 | * This takes care of mlocking the pages too if VM_LOCKED is set. | ||
| 829 | * | ||
| 830 | * return 0 on success, negative error code on error. | ||
| 831 | * | ||
| 832 | * vma->vm_mm->mmap_sem must be held. | ||
| 833 | * | ||
| 834 | * If @nonblocking is NULL, it may be held for read or write and will | ||
| 835 | * be unperturbed. | ||
| 836 | * | ||
| 837 | * If @nonblocking is non-NULL, it must be held for read only and may be | ||
| 838 | * released. If it's released, *@nonblocking will be set to 0. | ||
| 839 | */ | ||
| 840 | long populate_vma_page_range(struct vm_area_struct *vma, | ||
| 841 | unsigned long start, unsigned long end, int *nonblocking) | ||
| 842 | { | ||
| 843 | struct mm_struct *mm = vma->vm_mm; | ||
| 844 | unsigned long nr_pages = (end - start) / PAGE_SIZE; | ||
| 845 | int gup_flags; | ||
| 846 | |||
| 847 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 848 | VM_BUG_ON(end & ~PAGE_MASK); | ||
| 849 | VM_BUG_ON_VMA(start < vma->vm_start, vma); | ||
| 850 | VM_BUG_ON_VMA(end > vma->vm_end, vma); | ||
| 851 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
| 852 | |||
| 853 | gup_flags = FOLL_TOUCH | FOLL_POPULATE; | ||
| 854 | /* | ||
| 855 | * We want to touch writable mappings with a write fault in order | ||
| 856 | * to break COW, except for shared mappings because these don't COW | ||
| 857 | * and we would not want to dirty them for nothing. | ||
| 858 | */ | ||
| 859 | if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) | ||
| 860 | gup_flags |= FOLL_WRITE; | ||
| 861 | |||
| 862 | /* | ||
| 863 | * We want mlock to succeed for regions that have any permissions | ||
| 864 | * other than PROT_NONE. | ||
| 865 | */ | ||
| 866 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | ||
| 867 | gup_flags |= FOLL_FORCE; | ||
| 868 | |||
| 869 | /* | ||
| 870 | * We made sure addr is within a VMA, so the following will | ||
| 871 | * not result in a stack expansion that recurses back here. | ||
| 872 | */ | ||
| 873 | return __get_user_pages(current, mm, start, nr_pages, gup_flags, | ||
| 874 | NULL, NULL, nonblocking); | ||
| 875 | } | ||
| 876 | |||
| 877 | /* | ||
| 878 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
| 879 | * | ||
| 880 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
| 881 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
| 882 | * mmap_sem must not be held. | ||
| 883 | */ | ||
| 884 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
| 885 | { | ||
| 886 | struct mm_struct *mm = current->mm; | ||
| 887 | unsigned long end, nstart, nend; | ||
| 888 | struct vm_area_struct *vma = NULL; | ||
| 889 | int locked = 0; | ||
| 890 | long ret = 0; | ||
| 891 | |||
| 892 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 893 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
| 894 | end = start + len; | ||
| 895 | |||
| 896 | for (nstart = start; nstart < end; nstart = nend) { | ||
| 897 | /* | ||
| 898 | * We want to fault in pages for [nstart; end) address range. | ||
| 899 | * Find first corresponding VMA. | ||
| 900 | */ | ||
| 901 | if (!locked) { | ||
| 902 | locked = 1; | ||
| 903 | down_read(&mm->mmap_sem); | ||
| 904 | vma = find_vma(mm, nstart); | ||
| 905 | } else if (nstart >= vma->vm_end) | ||
| 906 | vma = vma->vm_next; | ||
| 907 | if (!vma || vma->vm_start >= end) | ||
| 908 | break; | ||
| 909 | /* | ||
| 910 | * Set [nstart; nend) to intersection of desired address | ||
| 911 | * range with the first VMA. Also, skip undesirable VMA types. | ||
| 912 | */ | ||
| 913 | nend = min(end, vma->vm_end); | ||
| 914 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
| 915 | continue; | ||
| 916 | if (nstart < vma->vm_start) | ||
| 917 | nstart = vma->vm_start; | ||
| 918 | /* | ||
| 919 | * Now fault in a range of pages. populate_vma_page_range() | ||
| 920 | * double checks the vma flags, so that it won't mlock pages | ||
| 921 | * if the vma was already munlocked. | ||
| 922 | */ | ||
| 923 | ret = populate_vma_page_range(vma, nstart, nend, &locked); | ||
| 924 | if (ret < 0) { | ||
| 925 | if (ignore_errors) { | ||
| 926 | ret = 0; | ||
| 927 | continue; /* continue at next VMA */ | ||
| 928 | } | ||
| 929 | break; | ||
| 930 | } | ||
| 931 | nend = nstart + ret * PAGE_SIZE; | ||
| 932 | ret = 0; | ||
| 933 | } | ||
| 934 | if (locked) | ||
| 935 | up_read(&mm->mmap_sem); | ||
| 936 | return ret; /* 0 or negative error code */ | ||
| 937 | } | ||
| 938 | |||
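populate_vma_page_range() and __mm_populate() appear here as the mlock helpers moved out of mm/mlock.c (the mm/internal.h hunk below renames __mlock_vma_pages_range() accordingly); the calling convention is otherwise unchanged. A hypothetical caller, mirroring what mm_populate()/mlock() do with the helper, under the documented constraints that the VMAs already carry the wanted vm_flags (e.g. VM_LOCKED) and mmap_sem is not held:

	static int example_populate(unsigned long addr, unsigned long len)
	{
		/* page-align the request the same way the mlock() path does */
		len = PAGE_ALIGN(len + (addr & ~PAGE_MASK));
		addr &= PAGE_MASK;

		return __mm_populate(addr, len, 1 /* ignore_errors */);
	}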
| 939 | /** | ||
| 822 | * get_dump_page() - pin user page in memory while writing it to core dump | 940 | * get_dump_page() - pin user page in memory while writing it to core dump |
| 823 | * @addr: user address | 941 | * @addr: user address |
| 824 | * | 942 | * |
| @@ -901,7 +1019,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
| 901 | * | 1019 | * |
| 902 | * for an example see gup_get_pte in arch/x86/mm/gup.c | 1020 | * for an example see gup_get_pte in arch/x86/mm/gup.c |
| 903 | */ | 1021 | */ |
| 904 | pte_t pte = ACCESS_ONCE(*ptep); | 1022 | pte_t pte = READ_ONCE(*ptep); |
| 905 | struct page *page; | 1023 | struct page *page; |
| 906 | 1024 | ||
| 907 | /* | 1025 | /* |
| @@ -1191,7 +1309,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | |||
| 1191 | local_irq_save(flags); | 1309 | local_irq_save(flags); |
| 1192 | pgdp = pgd_offset(mm, addr); | 1310 | pgdp = pgd_offset(mm, addr); |
| 1193 | do { | 1311 | do { |
| 1194 | pgd_t pgd = ACCESS_ONCE(*pgdp); | 1312 | pgd_t pgd = READ_ONCE(*pgdp); |
| 1195 | 1313 | ||
| 1196 | next = pgd_addr_end(addr, end); | 1314 | next = pgd_addr_end(addr, end); |
| 1197 | if (pgd_none(pgd)) | 1315 | if (pgd_none(pgd)) |
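The ACCESS_ONCE() to READ_ONCE() conversions in this lockless fast path are not cosmetic: ACCESS_ONCE() relies on a volatile scalar cast, which is unreliable when the page-table entry type is an aggregate (as pte_t/pgd_t can be on some configurations), while READ_ONCE() also copes with non-scalar sizes. A sketch of the intent, assuming the usual page-table and <linux/compiler.h> headers:

	/*
	 * Take one compiler-stable snapshot of the entry; the compiler may
	 * not reload *ptep behind our back, even if pte_t is a struct.
	 */
	static inline pte_t gup_read_pte_once(pte_t *ptep)
	{
		return READ_ONCE(*ptep);
	}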
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fc00c8cb5a82..078832cf3636 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -67,6 +67,7 @@ static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | |||
| 67 | 67 | ||
| 68 | static int khugepaged(void *none); | 68 | static int khugepaged(void *none); |
| 69 | static int khugepaged_slab_init(void); | 69 | static int khugepaged_slab_init(void); |
| 70 | static void khugepaged_slab_exit(void); | ||
| 70 | 71 | ||
| 71 | #define MM_SLOTS_HASH_BITS 10 | 72 | #define MM_SLOTS_HASH_BITS 10 |
| 72 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); | 73 | static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); |
| @@ -109,9 +110,6 @@ static int set_recommended_min_free_kbytes(void) | |||
| 109 | int nr_zones = 0; | 110 | int nr_zones = 0; |
| 110 | unsigned long recommended_min; | 111 | unsigned long recommended_min; |
| 111 | 112 | ||
| 112 | if (!khugepaged_enabled()) | ||
| 113 | return 0; | ||
| 114 | |||
| 115 | for_each_populated_zone(zone) | 113 | for_each_populated_zone(zone) |
| 116 | nr_zones++; | 114 | nr_zones++; |
| 117 | 115 | ||
| @@ -143,9 +141,8 @@ static int set_recommended_min_free_kbytes(void) | |||
| 143 | setup_per_zone_wmarks(); | 141 | setup_per_zone_wmarks(); |
| 144 | return 0; | 142 | return 0; |
| 145 | } | 143 | } |
| 146 | late_initcall(set_recommended_min_free_kbytes); | ||
| 147 | 144 | ||
| 148 | static int start_khugepaged(void) | 145 | static int start_stop_khugepaged(void) |
| 149 | { | 146 | { |
| 150 | int err = 0; | 147 | int err = 0; |
| 151 | if (khugepaged_enabled()) { | 148 | if (khugepaged_enabled()) { |
| @@ -156,6 +153,7 @@ static int start_khugepaged(void) | |||
| 156 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); | 153 | pr_err("khugepaged: kthread_run(khugepaged) failed\n"); |
| 157 | err = PTR_ERR(khugepaged_thread); | 154 | err = PTR_ERR(khugepaged_thread); |
| 158 | khugepaged_thread = NULL; | 155 | khugepaged_thread = NULL; |
| 156 | goto fail; | ||
| 159 | } | 157 | } |
| 160 | 158 | ||
| 161 | if (!list_empty(&khugepaged_scan.mm_head)) | 159 | if (!list_empty(&khugepaged_scan.mm_head)) |
| @@ -166,7 +164,7 @@ static int start_khugepaged(void) | |||
| 166 | kthread_stop(khugepaged_thread); | 164 | kthread_stop(khugepaged_thread); |
| 167 | khugepaged_thread = NULL; | 165 | khugepaged_thread = NULL; |
| 168 | } | 166 | } |
| 169 | 167 | fail: | |
| 170 | return err; | 168 | return err; |
| 171 | } | 169 | } |
| 172 | 170 | ||
| @@ -183,7 +181,7 @@ static struct page *get_huge_zero_page(void) | |||
| 183 | struct page *zero_page; | 181 | struct page *zero_page; |
| 184 | retry: | 182 | retry: |
| 185 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | 183 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) |
| 186 | return ACCESS_ONCE(huge_zero_page); | 184 | return READ_ONCE(huge_zero_page); |
| 187 | 185 | ||
| 188 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | 186 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, |
| 189 | HPAGE_PMD_ORDER); | 187 | HPAGE_PMD_ORDER); |
| @@ -202,7 +200,7 @@ retry: | |||
| 202 | /* We take additional reference here. It will be put back by shrinker */ | 200 | /* We take additional reference here. It will be put back by shrinker */ |
| 203 | atomic_set(&huge_zero_refcount, 2); | 201 | atomic_set(&huge_zero_refcount, 2); |
| 204 | preempt_enable(); | 202 | preempt_enable(); |
| 205 | return ACCESS_ONCE(huge_zero_page); | 203 | return READ_ONCE(huge_zero_page); |
| 206 | } | 204 | } |
| 207 | 205 | ||
| 208 | static void put_huge_zero_page(void) | 206 | static void put_huge_zero_page(void) |
| @@ -300,7 +298,7 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
| 300 | int err; | 298 | int err; |
| 301 | 299 | ||
| 302 | mutex_lock(&khugepaged_mutex); | 300 | mutex_lock(&khugepaged_mutex); |
| 303 | err = start_khugepaged(); | 301 | err = start_stop_khugepaged(); |
| 304 | mutex_unlock(&khugepaged_mutex); | 302 | mutex_unlock(&khugepaged_mutex); |
| 305 | 303 | ||
| 306 | if (err) | 304 | if (err) |
| @@ -634,27 +632,38 @@ static int __init hugepage_init(void) | |||
| 634 | 632 | ||
| 635 | err = hugepage_init_sysfs(&hugepage_kobj); | 633 | err = hugepage_init_sysfs(&hugepage_kobj); |
| 636 | if (err) | 634 | if (err) |
| 637 | return err; | 635 | goto err_sysfs; |
| 638 | 636 | ||
| 639 | err = khugepaged_slab_init(); | 637 | err = khugepaged_slab_init(); |
| 640 | if (err) | 638 | if (err) |
| 641 | goto out; | 639 | goto err_slab; |
| 642 | 640 | ||
| 643 | register_shrinker(&huge_zero_page_shrinker); | 641 | err = register_shrinker(&huge_zero_page_shrinker); |
| 642 | if (err) | ||
| 643 | goto err_hzp_shrinker; | ||
| 644 | 644 | ||
| 645 | /* | 645 | /* |
| 646 | * By default disable transparent hugepages on smaller systems, | 646 | * By default disable transparent hugepages on smaller systems, |
| 647 | * where the extra memory used could hurt more than TLB overhead | 647 | * where the extra memory used could hurt more than TLB overhead |
| 648 | * is likely to save. The admin can still enable it through /sys. | 648 | * is likely to save. The admin can still enable it through /sys. |
| 649 | */ | 649 | */ |
| 650 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) | 650 | if (totalram_pages < (512 << (20 - PAGE_SHIFT))) { |
| 651 | transparent_hugepage_flags = 0; | 651 | transparent_hugepage_flags = 0; |
| 652 | return 0; | ||
| 653 | } | ||
| 652 | 654 | ||
| 653 | start_khugepaged(); | 655 | err = start_stop_khugepaged(); |
| 656 | if (err) | ||
| 657 | goto err_khugepaged; | ||
| 654 | 658 | ||
| 655 | return 0; | 659 | return 0; |
| 656 | out: | 660 | err_khugepaged: |
| 661 | unregister_shrinker(&huge_zero_page_shrinker); | ||
| 662 | err_hzp_shrinker: | ||
| 663 | khugepaged_slab_exit(); | ||
| 664 | err_slab: | ||
| 657 | hugepage_exit_sysfs(hugepage_kobj); | 665 | hugepage_exit_sysfs(hugepage_kobj); |
| 666 | err_sysfs: | ||
| 658 | return err; | 667 | return err; |
| 659 | } | 668 | } |
| 660 | subsys_initcall(hugepage_init); | 669 | subsys_initcall(hugepage_init); |
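The reworked hugepage_init() above now checks register_shrinker() and start_stop_khugepaged() and unwinds in reverse order of setup, the standard init-path pattern. The general shape, with hypothetical setup_*/teardown_* steps standing in for the real ones:

	static int __init example_init(void)
	{
		int err;

		err = setup_a();
		if (err)
			goto err_a;
		err = setup_b();
		if (err)
			goto err_b;
		err = setup_c();
		if (err)
			goto err_c;
		return 0;

	err_c:
		teardown_b();	/* undo only what was already set up */
	err_b:
		teardown_a();
	err_a:
		return err;
	}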
| @@ -708,7 +717,7 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) | |||
| 708 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 717 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
| 709 | struct vm_area_struct *vma, | 718 | struct vm_area_struct *vma, |
| 710 | unsigned long haddr, pmd_t *pmd, | 719 | unsigned long haddr, pmd_t *pmd, |
| 711 | struct page *page) | 720 | struct page *page, gfp_t gfp) |
| 712 | { | 721 | { |
| 713 | struct mem_cgroup *memcg; | 722 | struct mem_cgroup *memcg; |
| 714 | pgtable_t pgtable; | 723 | pgtable_t pgtable; |
| @@ -716,7 +725,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
| 716 | 725 | ||
| 717 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 726 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
| 718 | 727 | ||
| 719 | if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) | 728 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) |
| 720 | return VM_FAULT_OOM; | 729 | return VM_FAULT_OOM; |
| 721 | 730 | ||
| 722 | pgtable = pte_alloc_one(mm, haddr); | 731 | pgtable = pte_alloc_one(mm, haddr); |
| @@ -822,7 +831,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 822 | count_vm_event(THP_FAULT_FALLBACK); | 831 | count_vm_event(THP_FAULT_FALLBACK); |
| 823 | return VM_FAULT_FALLBACK; | 832 | return VM_FAULT_FALLBACK; |
| 824 | } | 833 | } |
| 825 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { | 834 | if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page, gfp))) { |
| 826 | put_page(page); | 835 | put_page(page); |
| 827 | count_vm_event(THP_FAULT_FALLBACK); | 836 | count_vm_event(THP_FAULT_FALLBACK); |
| 828 | return VM_FAULT_FALLBACK; | 837 | return VM_FAULT_FALLBACK; |
| @@ -1080,6 +1089,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1080 | unsigned long haddr; | 1089 | unsigned long haddr; |
| 1081 | unsigned long mmun_start; /* For mmu_notifiers */ | 1090 | unsigned long mmun_start; /* For mmu_notifiers */ |
| 1082 | unsigned long mmun_end; /* For mmu_notifiers */ | 1091 | unsigned long mmun_end; /* For mmu_notifiers */ |
| 1092 | gfp_t huge_gfp; /* for allocation and charge */ | ||
| 1083 | 1093 | ||
| 1084 | ptl = pmd_lockptr(mm, pmd); | 1094 | ptl = pmd_lockptr(mm, pmd); |
| 1085 | VM_BUG_ON_VMA(!vma->anon_vma, vma); | 1095 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
| @@ -1106,10 +1116,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1106 | alloc: | 1116 | alloc: |
| 1107 | if (transparent_hugepage_enabled(vma) && | 1117 | if (transparent_hugepage_enabled(vma) && |
| 1108 | !transparent_hugepage_debug_cow()) { | 1118 | !transparent_hugepage_debug_cow()) { |
| 1109 | gfp_t gfp; | 1119 | huge_gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); |
| 1110 | 1120 | new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER); | |
| 1111 | gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0); | ||
| 1112 | new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER); | ||
| 1113 | } else | 1121 | } else |
| 1114 | new_page = NULL; | 1122 | new_page = NULL; |
| 1115 | 1123 | ||
| @@ -1130,8 +1138,7 @@ alloc: | |||
| 1130 | goto out; | 1138 | goto out; |
| 1131 | } | 1139 | } |
| 1132 | 1140 | ||
| 1133 | if (unlikely(mem_cgroup_try_charge(new_page, mm, | 1141 | if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { |
| 1134 | GFP_TRANSHUGE, &memcg))) { | ||
| 1135 | put_page(new_page); | 1142 | put_page(new_page); |
| 1136 | if (page) { | 1143 | if (page) { |
| 1137 | split_huge_page(page); | 1144 | split_huge_page(page); |
| @@ -1231,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
| 1231 | pmd, _pmd, 1)) | 1238 | pmd, _pmd, 1)) |
| 1232 | update_mmu_cache_pmd(vma, addr, pmd); | 1239 | update_mmu_cache_pmd(vma, addr, pmd); |
| 1233 | } | 1240 | } |
| 1234 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 1241 | if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) { |
| 1235 | if (page->mapping && trylock_page(page)) { | 1242 | if (page->mapping && trylock_page(page)) { |
| 1236 | lru_add_drain(); | 1243 | lru_add_drain(); |
| 1237 | if (page->mapping) | 1244 | if (page->mapping) |
| @@ -1260,6 +1267,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1260 | int target_nid, last_cpupid = -1; | 1267 | int target_nid, last_cpupid = -1; |
| 1261 | bool page_locked; | 1268 | bool page_locked; |
| 1262 | bool migrated = false; | 1269 | bool migrated = false; |
| 1270 | bool was_writable; | ||
| 1263 | int flags = 0; | 1271 | int flags = 0; |
| 1264 | 1272 | ||
| 1265 | /* A PROT_NONE fault should not end up here */ | 1273 | /* A PROT_NONE fault should not end up here */ |
| @@ -1291,12 +1299,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1291 | flags |= TNF_FAULT_LOCAL; | 1299 | flags |= TNF_FAULT_LOCAL; |
| 1292 | } | 1300 | } |
| 1293 | 1301 | ||
| 1294 | /* | 1302 | /* See similar comment in do_numa_page for explanation */ |
| 1295 | * Avoid grouping on DSO/COW pages in specific and RO pages | 1303 | if (!(vma->vm_flags & VM_WRITE)) |
| 1296 | * in general, RO pages shouldn't hurt as much anyway since | ||
| 1297 | * they can be in shared cache state. | ||
| 1298 | */ | ||
| 1299 | if (!pmd_write(pmd)) | ||
| 1300 | flags |= TNF_NO_GROUP; | 1304 | flags |= TNF_NO_GROUP; |
| 1301 | 1305 | ||
| 1302 | /* | 1306 | /* |
| @@ -1353,12 +1357,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1353 | if (migrated) { | 1357 | if (migrated) { |
| 1354 | flags |= TNF_MIGRATED; | 1358 | flags |= TNF_MIGRATED; |
| 1355 | page_nid = target_nid; | 1359 | page_nid = target_nid; |
| 1356 | } | 1360 | } else |
| 1361 | flags |= TNF_MIGRATE_FAIL; | ||
| 1357 | 1362 | ||
| 1358 | goto out; | 1363 | goto out; |
| 1359 | clear_pmdnuma: | 1364 | clear_pmdnuma: |
| 1360 | BUG_ON(!PageLocked(page)); | 1365 | BUG_ON(!PageLocked(page)); |
| 1366 | was_writable = pmd_write(pmd); | ||
| 1361 | pmd = pmd_modify(pmd, vma->vm_page_prot); | 1367 | pmd = pmd_modify(pmd, vma->vm_page_prot); |
| 1368 | pmd = pmd_mkyoung(pmd); | ||
| 1369 | if (was_writable) | ||
| 1370 | pmd = pmd_mkwrite(pmd); | ||
| 1362 | set_pmd_at(mm, haddr, pmdp, pmd); | 1371 | set_pmd_at(mm, haddr, pmdp, pmd); |
| 1363 | update_mmu_cache_pmd(vma, addr, pmdp); | 1372 | update_mmu_cache_pmd(vma, addr, pmdp); |
| 1364 | unlock_page(page); | 1373 | unlock_page(page); |
| @@ -1482,6 +1491,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 1482 | 1491 | ||
| 1483 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1492 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { |
| 1484 | pmd_t entry; | 1493 | pmd_t entry; |
| 1494 | bool preserve_write = prot_numa && pmd_write(*pmd); | ||
| 1495 | ret = 1; | ||
| 1485 | 1496 | ||
| 1486 | /* | 1497 | /* |
| 1487 | * Avoid trapping faults against the zero page. The read-only | 1498 | * Avoid trapping faults against the zero page. The read-only |
| @@ -1490,16 +1501,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 1490 | */ | 1501 | */ |
| 1491 | if (prot_numa && is_huge_zero_pmd(*pmd)) { | 1502 | if (prot_numa && is_huge_zero_pmd(*pmd)) { |
| 1492 | spin_unlock(ptl); | 1503 | spin_unlock(ptl); |
| 1493 | return 0; | 1504 | return ret; |
| 1494 | } | 1505 | } |
| 1495 | 1506 | ||
| 1496 | if (!prot_numa || !pmd_protnone(*pmd)) { | 1507 | if (!prot_numa || !pmd_protnone(*pmd)) { |
| 1497 | ret = 1; | ||
| 1498 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); | 1508 | entry = pmdp_get_and_clear_notify(mm, addr, pmd); |
| 1499 | entry = pmd_modify(entry, newprot); | 1509 | entry = pmd_modify(entry, newprot); |
| 1510 | if (preserve_write) | ||
| 1511 | entry = pmd_mkwrite(entry); | ||
| 1500 | ret = HPAGE_PMD_NR; | 1512 | ret = HPAGE_PMD_NR; |
| 1501 | set_pmd_at(mm, addr, pmd, entry); | 1513 | set_pmd_at(mm, addr, pmd, entry); |
| 1502 | BUG_ON(pmd_write(entry)); | 1514 | BUG_ON(!preserve_write && pmd_write(entry)); |
| 1503 | } | 1515 | } |
| 1504 | spin_unlock(ptl); | 1516 | spin_unlock(ptl); |
| 1505 | } | 1517 | } |
| @@ -1971,6 +1983,11 @@ static int __init khugepaged_slab_init(void) | |||
| 1971 | return 0; | 1983 | return 0; |
| 1972 | } | 1984 | } |
| 1973 | 1985 | ||
| 1986 | static void __init khugepaged_slab_exit(void) | ||
| 1987 | { | ||
| 1988 | kmem_cache_destroy(mm_slot_cache); | ||
| 1989 | } | ||
| 1990 | |||
| 1974 | static inline struct mm_slot *alloc_mm_slot(void) | 1991 | static inline struct mm_slot *alloc_mm_slot(void) |
| 1975 | { | 1992 | { |
| 1976 | if (!mm_slot_cache) /* initialization failed */ | 1993 | if (!mm_slot_cache) /* initialization failed */ |
| @@ -2104,7 +2121,7 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) | |||
| 2104 | { | 2121 | { |
| 2105 | while (--_pte >= pte) { | 2122 | while (--_pte >= pte) { |
| 2106 | pte_t pteval = *_pte; | 2123 | pte_t pteval = *_pte; |
| 2107 | if (!pte_none(pteval)) | 2124 | if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval))) |
| 2108 | release_pte_page(pte_page(pteval)); | 2125 | release_pte_page(pte_page(pteval)); |
| 2109 | } | 2126 | } |
| 2110 | } | 2127 | } |
| @@ -2115,13 +2132,13 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | |||
| 2115 | { | 2132 | { |
| 2116 | struct page *page; | 2133 | struct page *page; |
| 2117 | pte_t *_pte; | 2134 | pte_t *_pte; |
| 2118 | int none = 0; | 2135 | int none_or_zero = 0; |
| 2119 | bool referenced = false, writable = false; | 2136 | bool referenced = false, writable = false; |
| 2120 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | 2137 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; |
| 2121 | _pte++, address += PAGE_SIZE) { | 2138 | _pte++, address += PAGE_SIZE) { |
| 2122 | pte_t pteval = *_pte; | 2139 | pte_t pteval = *_pte; |
| 2123 | if (pte_none(pteval)) { | 2140 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
| 2124 | if (++none <= khugepaged_max_ptes_none) | 2141 | if (++none_or_zero <= khugepaged_max_ptes_none) |
| 2125 | continue; | 2142 | continue; |
| 2126 | else | 2143 | else |
| 2127 | goto out; | 2144 | goto out; |
| @@ -2202,9 +2219,21 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
| 2202 | pte_t pteval = *_pte; | 2219 | pte_t pteval = *_pte; |
| 2203 | struct page *src_page; | 2220 | struct page *src_page; |
| 2204 | 2221 | ||
| 2205 | if (pte_none(pteval)) { | 2222 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
| 2206 | clear_user_highpage(page, address); | 2223 | clear_user_highpage(page, address); |
| 2207 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | 2224 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); |
| 2225 | if (is_zero_pfn(pte_pfn(pteval))) { | ||
| 2226 | /* | ||
| 2227 | * ptl mostly unnecessary. | ||
| 2228 | */ | ||
| 2229 | spin_lock(ptl); | ||
| 2230 | /* | ||
| 2231 | * paravirt calls inside pte_clear here are | ||
| 2232 | * superfluous. | ||
| 2233 | */ | ||
| 2234 | pte_clear(vma->vm_mm, address, _pte); | ||
| 2235 | spin_unlock(ptl); | ||
| 2236 | } | ||
| 2208 | } else { | 2237 | } else { |
| 2209 | src_page = pte_page(pteval); | 2238 | src_page = pte_page(pteval); |
| 2210 | copy_user_highpage(page, src_page, address, vma); | 2239 | copy_user_highpage(page, src_page, address, vma); |
| @@ -2306,8 +2335,8 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | |||
| 2306 | return true; | 2335 | return true; |
| 2307 | } | 2336 | } |
| 2308 | 2337 | ||
| 2309 | static struct page | 2338 | static struct page * |
| 2310 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2339 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, |
| 2311 | struct vm_area_struct *vma, unsigned long address, | 2340 | struct vm_area_struct *vma, unsigned long address, |
| 2312 | int node) | 2341 | int node) |
| 2313 | { | 2342 | { |
| @@ -2321,8 +2350,7 @@ static struct page | |||
| 2321 | */ | 2350 | */ |
| 2322 | up_read(&mm->mmap_sem); | 2351 | up_read(&mm->mmap_sem); |
| 2323 | 2352 | ||
| 2324 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( | 2353 | *hpage = alloc_pages_exact_node(node, gfp, HPAGE_PMD_ORDER); |
| 2325 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); | ||
| 2326 | if (unlikely(!*hpage)) { | 2354 | if (unlikely(!*hpage)) { |
| 2327 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2355 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
| 2328 | *hpage = ERR_PTR(-ENOMEM); | 2356 | *hpage = ERR_PTR(-ENOMEM); |
| @@ -2375,13 +2403,14 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) | |||
| 2375 | return true; | 2403 | return true; |
| 2376 | } | 2404 | } |
| 2377 | 2405 | ||
| 2378 | static struct page | 2406 | static struct page * |
| 2379 | *khugepaged_alloc_page(struct page **hpage, struct mm_struct *mm, | 2407 | khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, |
| 2380 | struct vm_area_struct *vma, unsigned long address, | 2408 | struct vm_area_struct *vma, unsigned long address, |
| 2381 | int node) | 2409 | int node) |
| 2382 | { | 2410 | { |
| 2383 | up_read(&mm->mmap_sem); | 2411 | up_read(&mm->mmap_sem); |
| 2384 | VM_BUG_ON(!*hpage); | 2412 | VM_BUG_ON(!*hpage); |
| 2413 | |||
| 2385 | return *hpage; | 2414 | return *hpage; |
| 2386 | } | 2415 | } |
| 2387 | #endif | 2416 | #endif |
| @@ -2416,16 +2445,21 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
| 2416 | struct mem_cgroup *memcg; | 2445 | struct mem_cgroup *memcg; |
| 2417 | unsigned long mmun_start; /* For mmu_notifiers */ | 2446 | unsigned long mmun_start; /* For mmu_notifiers */ |
| 2418 | unsigned long mmun_end; /* For mmu_notifiers */ | 2447 | unsigned long mmun_end; /* For mmu_notifiers */ |
| 2448 | gfp_t gfp; | ||
| 2419 | 2449 | ||
| 2420 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 2450 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
| 2421 | 2451 | ||
| 2452 | /* Only allocate from the target node */ | ||
| 2453 | gfp = alloc_hugepage_gfpmask(khugepaged_defrag(), __GFP_OTHER_NODE) | | ||
| 2454 | __GFP_THISNODE; | ||
| 2455 | |||
| 2422 | /* release the mmap_sem read lock. */ | 2456 | /* release the mmap_sem read lock. */ |
| 2423 | new_page = khugepaged_alloc_page(hpage, mm, vma, address, node); | 2457 | new_page = khugepaged_alloc_page(hpage, gfp, mm, vma, address, node); |
| 2424 | if (!new_page) | 2458 | if (!new_page) |
| 2425 | return; | 2459 | return; |
| 2426 | 2460 | ||
| 2427 | if (unlikely(mem_cgroup_try_charge(new_page, mm, | 2461 | if (unlikely(mem_cgroup_try_charge(new_page, mm, |
| 2428 | GFP_TRANSHUGE, &memcg))) | 2462 | gfp, &memcg))) |
| 2429 | return; | 2463 | return; |
| 2430 | 2464 | ||
| 2431 | /* | 2465 | /* |
| @@ -2538,7 +2572,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2538 | { | 2572 | { |
| 2539 | pmd_t *pmd; | 2573 | pmd_t *pmd; |
| 2540 | pte_t *pte, *_pte; | 2574 | pte_t *pte, *_pte; |
| 2541 | int ret = 0, none = 0; | 2575 | int ret = 0, none_or_zero = 0; |
| 2542 | struct page *page; | 2576 | struct page *page; |
| 2543 | unsigned long _address; | 2577 | unsigned long _address; |
| 2544 | spinlock_t *ptl; | 2578 | spinlock_t *ptl; |
| @@ -2556,8 +2590,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
| 2556 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | 2590 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; |
| 2557 | _pte++, _address += PAGE_SIZE) { | 2591 | _pte++, _address += PAGE_SIZE) { |
| 2558 | pte_t pteval = *_pte; | 2592 | pte_t pteval = *_pte; |
| 2559 | if (pte_none(pteval)) { | 2593 | if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { |
| 2560 | if (++none <= khugepaged_max_ptes_none) | 2594 | if (++none_or_zero <= khugepaged_max_ptes_none) |
| 2561 | continue; | 2595 | continue; |
| 2562 | else | 2596 | else |
| 2563 | goto out_unmap; | 2597 | goto out_unmap; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0a9ac6c26832..271e4432734c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock); | |||
| 61 | static int num_fault_mutexes; | 61 | static int num_fault_mutexes; |
| 62 | static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; | 62 | static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp; |
| 63 | 63 | ||
| 64 | /* Forward declaration */ | ||
| 65 | static int hugetlb_acct_memory(struct hstate *h, long delta); | ||
| 66 | |||
| 64 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | 67 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) |
| 65 | { | 68 | { |
| 66 | bool free = (spool->count == 0) && (spool->used_hpages == 0); | 69 | bool free = (spool->count == 0) && (spool->used_hpages == 0); |
| @@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | |||
| 68 | spin_unlock(&spool->lock); | 71 | spin_unlock(&spool->lock); |
| 69 | 72 | ||
| 70 | /* If no pages are used, and no other handles to the subpool | 73 | /* If no pages are used, and no other handles to the subpool |
| 71 | * remain, free the subpool the subpool remain */ | 74 | * remain, give up any reservations based on minimum size and |
| 72 | if (free) | 75 | * free the subpool */ |
| 76 | if (free) { | ||
| 77 | if (spool->min_hpages != -1) | ||
| 78 | hugetlb_acct_memory(spool->hstate, | ||
| 79 | -spool->min_hpages); | ||
| 73 | kfree(spool); | 80 | kfree(spool); |
| 81 | } | ||
| 74 | } | 82 | } |
| 75 | 83 | ||
| 76 | struct hugepage_subpool *hugepage_new_subpool(long nr_blocks) | 84 | struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, |
| 85 | long min_hpages) | ||
| 77 | { | 86 | { |
| 78 | struct hugepage_subpool *spool; | 87 | struct hugepage_subpool *spool; |
| 79 | 88 | ||
| 80 | spool = kmalloc(sizeof(*spool), GFP_KERNEL); | 89 | spool = kzalloc(sizeof(*spool), GFP_KERNEL); |
| 81 | if (!spool) | 90 | if (!spool) |
| 82 | return NULL; | 91 | return NULL; |
| 83 | 92 | ||
| 84 | spin_lock_init(&spool->lock); | 93 | spin_lock_init(&spool->lock); |
| 85 | spool->count = 1; | 94 | spool->count = 1; |
| 86 | spool->max_hpages = nr_blocks; | 95 | spool->max_hpages = max_hpages; |
| 87 | spool->used_hpages = 0; | 96 | spool->hstate = h; |
| 97 | spool->min_hpages = min_hpages; | ||
| 98 | |||
| 99 | if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { | ||
| 100 | kfree(spool); | ||
| 101 | return NULL; | ||
| 102 | } | ||
| 103 | spool->rsv_hpages = min_hpages; | ||
| 88 | 104 | ||
| 89 | return spool; | 105 | return spool; |
| 90 | } | 106 | } |
| @@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) | |||
| 97 | unlock_or_release_subpool(spool); | 113 | unlock_or_release_subpool(spool); |
| 98 | } | 114 | } |
| 99 | 115 | ||
| 100 | static int hugepage_subpool_get_pages(struct hugepage_subpool *spool, | 116 | /* |
| 117 | * Subpool accounting for allocating and reserving pages. | ||
| 118 | * Return -ENOMEM if there are not enough resources to satisfy the | ||
| 119 | * request. Otherwise, return the number of pages by which the | ||
| 120 | * global pools must be adjusted (upward). The returned value may | ||
| 121 | * only be different than the passed value (delta) in the case where | ||
| 122 | * a subpool minimum size must be maintained. | ||
| 123 | */ | ||
| 124 | static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, | ||
| 101 | long delta) | 125 | long delta) |
| 102 | { | 126 | { |
| 103 | int ret = 0; | 127 | long ret = delta; |
| 104 | 128 | ||
| 105 | if (!spool) | 129 | if (!spool) |
| 106 | return 0; | 130 | return ret; |
| 107 | 131 | ||
| 108 | spin_lock(&spool->lock); | 132 | spin_lock(&spool->lock); |
| 109 | if ((spool->used_hpages + delta) <= spool->max_hpages) { | 133 | |
| 110 | spool->used_hpages += delta; | 134 | if (spool->max_hpages != -1) { /* maximum size accounting */ |
| 111 | } else { | 135 | if ((spool->used_hpages + delta) <= spool->max_hpages) |
| 112 | ret = -ENOMEM; | 136 | spool->used_hpages += delta; |
| 137 | else { | ||
| 138 | ret = -ENOMEM; | ||
| 139 | goto unlock_ret; | ||
| 140 | } | ||
| 113 | } | 141 | } |
| 114 | spin_unlock(&spool->lock); | ||
| 115 | 142 | ||
| 143 | if (spool->min_hpages != -1) { /* minimum size accounting */ | ||
| 144 | if (delta > spool->rsv_hpages) { | ||
| 145 | /* | ||
| 146 | * Asking for more reserves than those already taken on | ||
| 147 | * behalf of subpool. Return difference. | ||
| 148 | */ | ||
| 149 | ret = delta - spool->rsv_hpages; | ||
| 150 | spool->rsv_hpages = 0; | ||
| 151 | } else { | ||
| 152 | ret = 0; /* reserves already accounted for */ | ||
| 153 | spool->rsv_hpages -= delta; | ||
| 154 | } | ||
| 155 | } | ||
| 156 | |||
| 157 | unlock_ret: | ||
| 158 | spin_unlock(&spool->lock); | ||
| 116 | return ret; | 159 | return ret; |
| 117 | } | 160 | } |
| 118 | 161 | ||
| 119 | static void hugepage_subpool_put_pages(struct hugepage_subpool *spool, | 162 | /* |
| 163 | * Subpool accounting for freeing and unreserving pages. | ||
| 164 | * Return the number of global page reservations that must be dropped. | ||
| 165 | * The return value may only be different than the passed value (delta) | ||
| 166 | * in the case where a subpool minimum size must be maintained. | ||
| 167 | */ | ||
| 168 | static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, | ||
| 120 | long delta) | 169 | long delta) |
| 121 | { | 170 | { |
| 171 | long ret = delta; | ||
| 172 | |||
| 122 | if (!spool) | 173 | if (!spool) |
| 123 | return; | 174 | return delta; |
| 124 | 175 | ||
| 125 | spin_lock(&spool->lock); | 176 | spin_lock(&spool->lock); |
| 126 | spool->used_hpages -= delta; | 177 | |
| 127 | /* If hugetlbfs_put_super couldn't free spool due to | 178 | if (spool->max_hpages != -1) /* maximum size accounting */ |
| 128 | * an outstanding quota reference, free it now. */ | 179 | spool->used_hpages -= delta; |
| 180 | |||
| 181 | if (spool->min_hpages != -1) { /* minimum size accounting */ | ||
| 182 | if (spool->rsv_hpages + delta <= spool->min_hpages) | ||
| 183 | ret = 0; | ||
| 184 | else | ||
| 185 | ret = spool->rsv_hpages + delta - spool->min_hpages; | ||
| 186 | |||
| 187 | spool->rsv_hpages += delta; | ||
| 188 | if (spool->rsv_hpages > spool->min_hpages) | ||
| 189 | spool->rsv_hpages = spool->min_hpages; | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * If hugetlbfs_put_super couldn't free spool due to an outstanding | ||
| 194 | * quota reference, free it now. | ||
| 195 | */ | ||
| 129 | unlock_or_release_subpool(spool); | 196 | unlock_or_release_subpool(spool); |
| 197 | |||
| 198 | return ret; | ||
| 130 | } | 199 | } |
| 131 | 200 | ||
| 132 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) | 201 | static inline struct hugepage_subpool *subpool_inode(struct inode *inode) |
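To make the new return convention concrete, a worked example with made-up numbers: a subpool created with a minimum of 4 huge pages and no maximum charges those 4 pages against the global pool up front, and the get/put helpers then report only the delta that the global pool still has to absorb or release:

	struct hugepage_subpool *spool;

	/* hypothetical usage; h is some hstate, -1 means no maximum */
	spool = hugepage_new_subpool(h, -1, 4);	/* rsv_hpages = 4, charged globally up front */

	hugepage_subpool_get_pages(spool, 6);	/* returns 2: the reserve covers 4 of the 6 */
	hugepage_subpool_put_pages(spool, 6);	/* returns 2: reserve refills to 4, 2 go back globally */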
| @@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size) | |||
| 855 | return NULL; | 924 | return NULL; |
| 856 | } | 925 | } |
| 857 | 926 | ||
| 927 | /* | ||
| 928 | * Test to determine whether the hugepage is "active/in-use" (i.e. being linked | ||
| 929 | * to hstate->hugepage_activelist.) | ||
| 930 | * | ||
| 931 | * This function can be called for tail pages, but never returns true for them. | ||
| 932 | */ | ||
| 933 | bool page_huge_active(struct page *page) | ||
| 934 | { | ||
| 935 | VM_BUG_ON_PAGE(!PageHuge(page), page); | ||
| 936 | return PageHead(page) && PagePrivate(&page[1]); | ||
| 937 | } | ||
| 938 | |||
| 939 | /* never called for tail page */ | ||
| 940 | static void set_page_huge_active(struct page *page) | ||
| 941 | { | ||
| 942 | VM_BUG_ON_PAGE(!PageHeadHuge(page), page); | ||
| 943 | SetPagePrivate(&page[1]); | ||
| 944 | } | ||
| 945 | |||
| 946 | static void clear_page_huge_active(struct page *page) | ||
| 947 | { | ||
| 948 | VM_BUG_ON_PAGE(!PageHeadHuge(page), page); | ||
| 949 | ClearPagePrivate(&page[1]); | ||
| 950 | } | ||
| 951 | |||
| 858 | void free_huge_page(struct page *page) | 952 | void free_huge_page(struct page *page) |
| 859 | { | 953 | { |
| 860 | /* | 954 | /* |
| @@ -874,7 +968,16 @@ void free_huge_page(struct page *page) | |||
| 874 | restore_reserve = PagePrivate(page); | 968 | restore_reserve = PagePrivate(page); |
| 875 | ClearPagePrivate(page); | 969 | ClearPagePrivate(page); |
| 876 | 970 | ||
| 971 | /* | ||
| 972 | * A return code of zero implies that the subpool will be under its | ||
| 973 | * minimum size if the reservation is not restored after page is free. | ||
| 974 | * Therefore, force restore_reserve operation. | ||
| 975 | */ | ||
| 976 | if (hugepage_subpool_put_pages(spool, 1) == 0) | ||
| 977 | restore_reserve = true; | ||
| 978 | |||
| 877 | spin_lock(&hugetlb_lock); | 979 | spin_lock(&hugetlb_lock); |
| 980 | clear_page_huge_active(page); | ||
| 878 | hugetlb_cgroup_uncharge_page(hstate_index(h), | 981 | hugetlb_cgroup_uncharge_page(hstate_index(h), |
| 879 | pages_per_huge_page(h), page); | 982 | pages_per_huge_page(h), page); |
| 880 | if (restore_reserve) | 983 | if (restore_reserve) |
| @@ -891,7 +994,6 @@ void free_huge_page(struct page *page) | |||
| 891 | enqueue_huge_page(h, page); | 994 | enqueue_huge_page(h, page); |
| 892 | } | 995 | } |
| 893 | spin_unlock(&hugetlb_lock); | 996 | spin_unlock(&hugetlb_lock); |
| 894 | hugepage_subpool_put_pages(spool, 1); | ||
| 895 | } | 997 | } |
| 896 | 998 | ||
| 897 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 999 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
| @@ -917,7 +1019,6 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
| 917 | __SetPageHead(page); | 1019 | __SetPageHead(page); |
| 918 | __ClearPageReserved(page); | 1020 | __ClearPageReserved(page); |
| 919 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | 1021 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
| 920 | __SetPageTail(p); | ||
| 921 | /* | 1022 | /* |
| 922 | * For gigantic hugepages allocated through bootmem at | 1023 | * For gigantic hugepages allocated through bootmem at |
| 923 | * boot, it's safer to be consistent with the not-gigantic | 1024 | * boot, it's safer to be consistent with the not-gigantic |
| @@ -933,6 +1034,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
| 933 | __ClearPageReserved(p); | 1034 | __ClearPageReserved(p); |
| 934 | set_page_count(p, 0); | 1035 | set_page_count(p, 0); |
| 935 | p->first_page = page; | 1036 | p->first_page = page; |
| 1037 | /* Make sure p->first_page is always valid for PageTail() */ | ||
| 1038 | smp_wmb(); | ||
| 1039 | __SetPageTail(p); | ||
| 936 | } | 1040 | } |
| 937 | } | 1041 | } |
| 938 | 1042 | ||
| @@ -1384,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1384 | if (chg < 0) | 1488 | if (chg < 0) |
| 1385 | return ERR_PTR(-ENOMEM); | 1489 | return ERR_PTR(-ENOMEM); |
| 1386 | if (chg || avoid_reserve) | 1490 | if (chg || avoid_reserve) |
| 1387 | if (hugepage_subpool_get_pages(spool, 1)) | 1491 | if (hugepage_subpool_get_pages(spool, 1) < 0) |
| 1388 | return ERR_PTR(-ENOSPC); | 1492 | return ERR_PTR(-ENOSPC); |
| 1389 | 1493 | ||
| 1390 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | 1494 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); |
| @@ -2452,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
| 2452 | struct resv_map *resv = vma_resv_map(vma); | 2556 | struct resv_map *resv = vma_resv_map(vma); |
| 2453 | struct hugepage_subpool *spool = subpool_vma(vma); | 2557 | struct hugepage_subpool *spool = subpool_vma(vma); |
| 2454 | unsigned long reserve, start, end; | 2558 | unsigned long reserve, start, end; |
| 2559 | long gbl_reserve; | ||
| 2455 | 2560 | ||
| 2456 | if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) | 2561 | if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) |
| 2457 | return; | 2562 | return; |
| @@ -2464,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) | |||
| 2464 | kref_put(&resv->refs, resv_map_release); | 2569 | kref_put(&resv->refs, resv_map_release); |
| 2465 | 2570 | ||
| 2466 | if (reserve) { | 2571 | if (reserve) { |
| 2467 | hugetlb_acct_memory(h, -reserve); | 2572 | /* |
| 2468 | hugepage_subpool_put_pages(spool, reserve); | 2573 | * Decrement reserve counts. The global reserve count may be |
| 2574 | * adjusted if the subpool has a minimum size. | ||
| 2575 | */ | ||
| 2576 | gbl_reserve = hugepage_subpool_put_pages(spool, reserve); | ||
| 2577 | hugetlb_acct_memory(h, -gbl_reserve); | ||
| 2469 | } | 2578 | } |
| 2470 | } | 2579 | } |
| 2471 | 2580 | ||
| @@ -2889,6 +2998,7 @@ retry_avoidcopy: | |||
| 2889 | copy_user_huge_page(new_page, old_page, address, vma, | 2998 | copy_user_huge_page(new_page, old_page, address, vma, |
| 2890 | pages_per_huge_page(h)); | 2999 | pages_per_huge_page(h)); |
| 2891 | __SetPageUptodate(new_page); | 3000 | __SetPageUptodate(new_page); |
| 3001 | set_page_huge_active(new_page); | ||
| 2892 | 3002 | ||
| 2893 | mmun_start = address & huge_page_mask(h); | 3003 | mmun_start = address & huge_page_mask(h); |
| 2894 | mmun_end = mmun_start + huge_page_size(h); | 3004 | mmun_end = mmun_start + huge_page_size(h); |
| @@ -3001,6 +3111,7 @@ retry: | |||
| 3001 | } | 3111 | } |
| 3002 | clear_huge_page(page, address, pages_per_huge_page(h)); | 3112 | clear_huge_page(page, address, pages_per_huge_page(h)); |
| 3003 | __SetPageUptodate(page); | 3113 | __SetPageUptodate(page); |
| 3114 | set_page_huge_active(page); | ||
| 3004 | 3115 | ||
| 3005 | if (vma->vm_flags & VM_MAYSHARE) { | 3116 | if (vma->vm_flags & VM_MAYSHARE) { |
| 3006 | int err; | 3117 | int err; |
| @@ -3276,6 +3387,15 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3276 | struct page *page; | 3387 | struct page *page; |
| 3277 | 3388 | ||
| 3278 | /* | 3389 | /* |
| 3390 | * If we have a pending SIGKILL, don't keep faulting pages and | ||
| 3391 | * potentially allocating memory. | ||
| 3392 | */ | ||
| 3393 | if (unlikely(fatal_signal_pending(current))) { | ||
| 3394 | remainder = 0; | ||
| 3395 | break; | ||
| 3396 | } | ||
| 3397 | |||
| 3398 | /* | ||
| 3279 | * Some archs (sparc64, sh*) have multiple pte_ts to | 3399 | * Some archs (sparc64, sh*) have multiple pte_ts to |
| 3280 | * each hugepage. We have to make sure we get the | 3400 | * each hugepage. We have to make sure we get the |
| 3281 | * first, for the page indexing below to work. | 3401 | * first, for the page indexing below to work. |
| @@ -3436,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 3436 | struct hstate *h = hstate_inode(inode); | 3556 | struct hstate *h = hstate_inode(inode); |
| 3437 | struct hugepage_subpool *spool = subpool_inode(inode); | 3557 | struct hugepage_subpool *spool = subpool_inode(inode); |
| 3438 | struct resv_map *resv_map; | 3558 | struct resv_map *resv_map; |
| 3559 | long gbl_reserve; | ||
| 3439 | 3560 | ||
| 3440 | /* | 3561 | /* |
| 3441 | * Only apply hugepage reservation if asked. At fault time, an | 3562 | * Only apply hugepage reservation if asked. At fault time, an |
| @@ -3472,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 3472 | goto out_err; | 3593 | goto out_err; |
| 3473 | } | 3594 | } |
| 3474 | 3595 | ||
| 3475 | /* There must be enough pages in the subpool for the mapping */ | 3596 | /* |
| 3476 | if (hugepage_subpool_get_pages(spool, chg)) { | 3597 | * There must be enough pages in the subpool for the mapping. If |
| 3598 | * the subpool has a minimum size, there may be some global | ||
| 3599 | * reservations already in place (gbl_reserve). | ||
| 3600 | */ | ||
| 3601 | gbl_reserve = hugepage_subpool_get_pages(spool, chg); | ||
| 3602 | if (gbl_reserve < 0) { | ||
| 3477 | ret = -ENOSPC; | 3603 | ret = -ENOSPC; |
| 3478 | goto out_err; | 3604 | goto out_err; |
| 3479 | } | 3605 | } |
| @@ -3482,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode, | |||
| 3482 | * Check enough hugepages are available for the reservation. | 3608 | * Check enough hugepages are available for the reservation. |
| 3483 | * Hand the pages back to the subpool if there are not | 3609 | * Hand the pages back to the subpool if there are not |
| 3484 | */ | 3610 | */ |
| 3485 | ret = hugetlb_acct_memory(h, chg); | 3611 | ret = hugetlb_acct_memory(h, gbl_reserve); |
| 3486 | if (ret < 0) { | 3612 | if (ret < 0) { |
| 3487 | hugepage_subpool_put_pages(spool, chg); | 3613 | /* put back original number of pages, chg */ |
| 3614 | (void)hugepage_subpool_put_pages(spool, chg); | ||
| 3488 | goto out_err; | 3615 | goto out_err; |
| 3489 | } | 3616 | } |
| 3490 | 3617 | ||
| @@ -3514,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 3514 | struct resv_map *resv_map = inode_resv_map(inode); | 3641 | struct resv_map *resv_map = inode_resv_map(inode); |
| 3515 | long chg = 0; | 3642 | long chg = 0; |
| 3516 | struct hugepage_subpool *spool = subpool_inode(inode); | 3643 | struct hugepage_subpool *spool = subpool_inode(inode); |
| 3644 | long gbl_reserve; | ||
| 3517 | 3645 | ||
| 3518 | if (resv_map) | 3646 | if (resv_map) |
| 3519 | chg = region_truncate(resv_map, offset); | 3647 | chg = region_truncate(resv_map, offset); |
| @@ -3521,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | |||
| 3521 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); | 3649 | inode->i_blocks -= (blocks_per_huge_page(h) * freed); |
| 3522 | spin_unlock(&inode->i_lock); | 3650 | spin_unlock(&inode->i_lock); |
| 3523 | 3651 | ||
| 3524 | hugepage_subpool_put_pages(spool, (chg - freed)); | 3652 | /* |
| 3525 | hugetlb_acct_memory(h, -(chg - freed)); | 3653 | * If the subpool has a minimum size, the number of global |
| 3654 | * reservations to be released may be adjusted. | ||
| 3655 | */ | ||
| 3656 | gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); | ||
| 3657 | hugetlb_acct_memory(h, -gbl_reserve); | ||
| 3526 | } | 3658 | } |
| 3527 | 3659 | ||
| 3528 | #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE | 3660 | #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE |
| @@ -3733,8 +3865,7 @@ retry: | |||
| 3733 | if (!pmd_huge(*pmd)) | 3865 | if (!pmd_huge(*pmd)) |
| 3734 | goto out; | 3866 | goto out; |
| 3735 | if (pmd_present(*pmd)) { | 3867 | if (pmd_present(*pmd)) { |
| 3736 | page = pte_page(*(pte_t *)pmd) + | 3868 | page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT); |
| 3737 | ((address & ~PMD_MASK) >> PAGE_SHIFT); | ||
| 3738 | if (flags & FOLL_GET) | 3869 | if (flags & FOLL_GET) |
| 3739 | get_page(page); | 3870 | get_page(page); |
| 3740 | } else { | 3871 | } else { |
| @@ -3765,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
| 3765 | 3896 | ||
| 3766 | #ifdef CONFIG_MEMORY_FAILURE | 3897 | #ifdef CONFIG_MEMORY_FAILURE |
| 3767 | 3898 | ||
| 3768 | /* Should be called in hugetlb_lock */ | ||
| 3769 | static int is_hugepage_on_freelist(struct page *hpage) | ||
| 3770 | { | ||
| 3771 | struct page *page; | ||
| 3772 | struct page *tmp; | ||
| 3773 | struct hstate *h = page_hstate(hpage); | ||
| 3774 | int nid = page_to_nid(hpage); | ||
| 3775 | |||
| 3776 | list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) | ||
| 3777 | if (page == hpage) | ||
| 3778 | return 1; | ||
| 3779 | return 0; | ||
| 3780 | } | ||
| 3781 | |||
| 3782 | /* | 3899 | /* |
| 3783 | * This function is called from memory failure code. | 3900 | * This function is called from memory failure code. |
| 3784 | * Assume the caller holds page lock of the head page. | 3901 | * Assume the caller holds page lock of the head page. |
| @@ -3790,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
| 3790 | int ret = -EBUSY; | 3907 | int ret = -EBUSY; |
| 3791 | 3908 | ||
| 3792 | spin_lock(&hugetlb_lock); | 3909 | spin_lock(&hugetlb_lock); |
| 3793 | if (is_hugepage_on_freelist(hpage)) { | 3910 | /* |
| 3911 | * Just checking !page_huge_active is not enough, because that could be | ||
| 3912 | * an isolated/hwpoisoned hugepage (which have >0 refcount). | ||
| 3913 | */ | ||
| 3914 | if (!page_huge_active(hpage) && !page_count(hpage)) { | ||
| 3794 | /* | 3915 | /* |
| 3795 | * Hwpoisoned hugepage isn't linked to activelist or freelist, | 3916 | * Hwpoisoned hugepage isn't linked to activelist or freelist, |
| 3796 | * but dangling hpage->lru can trigger list-debug warnings | 3917 | * but dangling hpage->lru can trigger list-debug warnings |
| @@ -3810,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
| 3810 | 3931 | ||
| 3811 | bool isolate_huge_page(struct page *page, struct list_head *list) | 3932 | bool isolate_huge_page(struct page *page, struct list_head *list) |
| 3812 | { | 3933 | { |
| 3934 | bool ret = true; | ||
| 3935 | |||
| 3813 | VM_BUG_ON_PAGE(!PageHead(page), page); | 3936 | VM_BUG_ON_PAGE(!PageHead(page), page); |
| 3814 | if (!get_page_unless_zero(page)) | ||
| 3815 | return false; | ||
| 3816 | spin_lock(&hugetlb_lock); | 3937 | spin_lock(&hugetlb_lock); |
| 3938 | if (!page_huge_active(page) || !get_page_unless_zero(page)) { | ||
| 3939 | ret = false; | ||
| 3940 | goto unlock; | ||
| 3941 | } | ||
| 3942 | clear_page_huge_active(page); | ||
| 3817 | list_move_tail(&page->lru, list); | 3943 | list_move_tail(&page->lru, list); |
| 3944 | unlock: | ||
| 3818 | spin_unlock(&hugetlb_lock); | 3945 | spin_unlock(&hugetlb_lock); |
| 3819 | return true; | 3946 | return ret; |
| 3820 | } | 3947 | } |
| 3821 | 3948 | ||
| 3822 | void putback_active_hugepage(struct page *page) | 3949 | void putback_active_hugepage(struct page *page) |
| 3823 | { | 3950 | { |
| 3824 | VM_BUG_ON_PAGE(!PageHead(page), page); | 3951 | VM_BUG_ON_PAGE(!PageHead(page), page); |
| 3825 | spin_lock(&hugetlb_lock); | 3952 | spin_lock(&hugetlb_lock); |
| 3953 | set_page_huge_active(page); | ||
| 3826 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); | 3954 | list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); |
| 3827 | spin_unlock(&hugetlb_lock); | 3955 | spin_unlock(&hugetlb_lock); |
| 3828 | put_page(page); | 3956 | put_page(page); |
| 3829 | } | 3957 | } |
| 3830 | |||
| 3831 | bool is_hugepage_active(struct page *page) | ||
| 3832 | { | ||
| 3833 | VM_BUG_ON_PAGE(!PageHuge(page), page); | ||
| 3834 | /* | ||
| 3835 | * This function can be called for a tail page because the caller, | ||
| 3836 | * scan_movable_pages, scans through a given pfn-range which typically | ||
| 3837 | * covers one memory block. In systems using gigantic hugepage (1GB | ||
| 3838 | * for x86_64,) a hugepage is larger than a memory block, and we don't | ||
| 3839 | * support migrating such large hugepages for now, so return false | ||
| 3840 | * when called for tail pages. | ||
| 3841 | */ | ||
| 3842 | if (PageTail(page)) | ||
| 3843 | return false; | ||
| 3844 | /* | ||
| 3845 | * Refcount of a hwpoisoned hugepages is 1, but they are not active, | ||
| 3846 | * so we should return false for them. | ||
| 3847 | */ | ||
| 3848 | if (unlikely(PageHWPoison(page))) | ||
| 3849 | return false; | ||
| 3850 | return page_count(page) > 0; | ||
| 3851 | } | ||
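The hugetlb changes above replace the old freelist walk and the is_hugepage_active() heuristic with a page_huge_active flag that is tested and cleared under hugetlb_lock. A minimal caller-side sketch of the resulting contract, assuming a migration-style user and omitting the actual call into the migration core (illustrative only, not part of the patch):

	#include <linux/list.h>
	#include <linux/hugetlb.h>

	/* Sketch only: isolate one active hugepage onto a private list and
	 * put it back afterwards.  Free, already-isolated and hwpoisoned
	 * hugepages now make isolate_huge_page() return false under
	 * hugetlb_lock instead of racing with the free/dissolve paths. */
	static int demo_isolate_hugepage(struct page *hpage)
	{
		LIST_HEAD(pagelist);

		if (!isolate_huge_page(hpage, &pagelist))
			return -EBUSY;		/* not huge-active: refuse */

		/* ... hand &pagelist to migrate_pages() here ... */

		/* Undo the isolation: re-set the active flag, drop our ref. */
		putback_active_hugepage(hpage);
		return 0;
	}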
diff --git a/mm/internal.h b/mm/internal.h index a96da5b0029d..a25e359a4039 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -200,6 +200,8 @@ isolate_freepages_range(struct compact_control *cc, | |||
| 200 | unsigned long | 200 | unsigned long |
| 201 | isolate_migratepages_range(struct compact_control *cc, | 201 | isolate_migratepages_range(struct compact_control *cc, |
| 202 | unsigned long low_pfn, unsigned long end_pfn); | 202 | unsigned long low_pfn, unsigned long end_pfn); |
| 203 | int find_suitable_fallback(struct free_area *area, unsigned int order, | ||
| 204 | int migratetype, bool only_stealable, bool *can_steal); | ||
| 203 | 205 | ||
| 204 | #endif | 206 | #endif |
| 205 | 207 | ||
| @@ -222,13 +224,13 @@ static inline unsigned long page_order(struct page *page) | |||
| 222 | * PageBuddy() should be checked first by the caller to minimize race window, | 224 | * PageBuddy() should be checked first by the caller to minimize race window, |
| 223 | * and invalid values must be handled gracefully. | 225 | * and invalid values must be handled gracefully. |
| 224 | * | 226 | * |
| 225 | * ACCESS_ONCE is used so that if the caller assigns the result into a local | 227 | * READ_ONCE is used so that if the caller assigns the result into a local |
| 226 | * variable and e.g. tests it for valid range before using, the compiler cannot | 228 | * variable and e.g. tests it for valid range before using, the compiler cannot |
| 227 | * decide to remove the variable and inline the page_private(page) multiple | 229 | * decide to remove the variable and inline the page_private(page) multiple |
| 228 | * times, potentially observing different values in the tests and the actual | 230 | * times, potentially observing different values in the tests and the actual |
| 229 | * use of the result. | 231 | * use of the result. |
| 230 | */ | 232 | */ |
| 231 | #define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) | 233 | #define page_order_unsafe(page) READ_ONCE(page_private(page)) |
| 232 | 234 | ||
| 233 | static inline bool is_cow_mapping(vm_flags_t flags) | 235 | static inline bool is_cow_mapping(vm_flags_t flags) |
| 234 | { | 236 | { |
| @@ -240,7 +242,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 240 | struct vm_area_struct *prev, struct rb_node *rb_parent); | 242 | struct vm_area_struct *prev, struct rb_node *rb_parent); |
| 241 | 243 | ||
| 242 | #ifdef CONFIG_MMU | 244 | #ifdef CONFIG_MMU |
| 243 | extern long __mlock_vma_pages_range(struct vm_area_struct *vma, | 245 | extern long populate_vma_page_range(struct vm_area_struct *vma, |
| 244 | unsigned long start, unsigned long end, int *nonblocking); | 246 | unsigned long start, unsigned long end, int *nonblocking); |
| 245 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, | 247 | extern void munlock_vma_pages_range(struct vm_area_struct *vma, |
| 246 | unsigned long start, unsigned long end); | 248 | unsigned long start, unsigned long end); |
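The ACCESS_ONCE() to READ_ONCE() switch in page_order_unsafe() matters because of the usage pattern the comment above describes. A sketch of that pattern, modelled on the compaction caller and assuming it sits inside mm/ where mm/internal.h is available (illustrative only):

	/* Snapshot the racy order exactly once, validate the local copy,
	 * and only then use it; the compiler can no longer re-read
	 * page_private() between the range check and the use. */
	static unsigned long demo_skip_free_block(struct page *page, unsigned long pfn)
	{
		if (PageBuddy(page)) {
			unsigned long freepage_order = page_order_unsafe(page);

			if (freepage_order > 0 && freepage_order < MAX_ORDER)
				pfn += (1UL << freepage_order) - 1;
		}
		return pfn;
	}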
diff --git a/mm/iov_iter.c b/mm/iov_iter.c deleted file mode 100644 index 827732047da1..000000000000 --- a/mm/iov_iter.c +++ /dev/null | |||
| @@ -1,753 +0,0 @@ | |||
| 1 | #include <linux/export.h> | ||
| 2 | #include <linux/uio.h> | ||
| 3 | #include <linux/pagemap.h> | ||
| 4 | #include <linux/slab.h> | ||
| 5 | #include <linux/vmalloc.h> | ||
| 6 | #include <net/checksum.h> | ||
| 7 | |||
| 8 | #define iterate_iovec(i, n, __v, __p, skip, STEP) { \ | ||
| 9 | size_t left; \ | ||
| 10 | size_t wanted = n; \ | ||
| 11 | __p = i->iov; \ | ||
| 12 | __v.iov_len = min(n, __p->iov_len - skip); \ | ||
| 13 | if (likely(__v.iov_len)) { \ | ||
| 14 | __v.iov_base = __p->iov_base + skip; \ | ||
| 15 | left = (STEP); \ | ||
| 16 | __v.iov_len -= left; \ | ||
| 17 | skip += __v.iov_len; \ | ||
| 18 | n -= __v.iov_len; \ | ||
| 19 | } else { \ | ||
| 20 | left = 0; \ | ||
| 21 | } \ | ||
| 22 | while (unlikely(!left && n)) { \ | ||
| 23 | __p++; \ | ||
| 24 | __v.iov_len = min(n, __p->iov_len); \ | ||
| 25 | if (unlikely(!__v.iov_len)) \ | ||
| 26 | continue; \ | ||
| 27 | __v.iov_base = __p->iov_base; \ | ||
| 28 | left = (STEP); \ | ||
| 29 | __v.iov_len -= left; \ | ||
| 30 | skip = __v.iov_len; \ | ||
| 31 | n -= __v.iov_len; \ | ||
| 32 | } \ | ||
| 33 | n = wanted - n; \ | ||
| 34 | } | ||
| 35 | |||
| 36 | #define iterate_kvec(i, n, __v, __p, skip, STEP) { \ | ||
| 37 | size_t wanted = n; \ | ||
| 38 | __p = i->kvec; \ | ||
| 39 | __v.iov_len = min(n, __p->iov_len - skip); \ | ||
| 40 | if (likely(__v.iov_len)) { \ | ||
| 41 | __v.iov_base = __p->iov_base + skip; \ | ||
| 42 | (void)(STEP); \ | ||
| 43 | skip += __v.iov_len; \ | ||
| 44 | n -= __v.iov_len; \ | ||
| 45 | } \ | ||
| 46 | while (unlikely(n)) { \ | ||
| 47 | __p++; \ | ||
| 48 | __v.iov_len = min(n, __p->iov_len); \ | ||
| 49 | if (unlikely(!__v.iov_len)) \ | ||
| 50 | continue; \ | ||
| 51 | __v.iov_base = __p->iov_base; \ | ||
| 52 | (void)(STEP); \ | ||
| 53 | skip = __v.iov_len; \ | ||
| 54 | n -= __v.iov_len; \ | ||
| 55 | } \ | ||
| 56 | n = wanted; \ | ||
| 57 | } | ||
| 58 | |||
| 59 | #define iterate_bvec(i, n, __v, __p, skip, STEP) { \ | ||
| 60 | size_t wanted = n; \ | ||
| 61 | __p = i->bvec; \ | ||
| 62 | __v.bv_len = min_t(size_t, n, __p->bv_len - skip); \ | ||
| 63 | if (likely(__v.bv_len)) { \ | ||
| 64 | __v.bv_page = __p->bv_page; \ | ||
| 65 | __v.bv_offset = __p->bv_offset + skip; \ | ||
| 66 | (void)(STEP); \ | ||
| 67 | skip += __v.bv_len; \ | ||
| 68 | n -= __v.bv_len; \ | ||
| 69 | } \ | ||
| 70 | while (unlikely(n)) { \ | ||
| 71 | __p++; \ | ||
| 72 | __v.bv_len = min_t(size_t, n, __p->bv_len); \ | ||
| 73 | if (unlikely(!__v.bv_len)) \ | ||
| 74 | continue; \ | ||
| 75 | __v.bv_page = __p->bv_page; \ | ||
| 76 | __v.bv_offset = __p->bv_offset; \ | ||
| 77 | (void)(STEP); \ | ||
| 78 | skip = __v.bv_len; \ | ||
| 79 | n -= __v.bv_len; \ | ||
| 80 | } \ | ||
| 81 | n = wanted; \ | ||
| 82 | } | ||
| 83 | |||
| 84 | #define iterate_all_kinds(i, n, v, I, B, K) { \ | ||
| 85 | size_t skip = i->iov_offset; \ | ||
| 86 | if (unlikely(i->type & ITER_BVEC)) { \ | ||
| 87 | const struct bio_vec *bvec; \ | ||
| 88 | struct bio_vec v; \ | ||
| 89 | iterate_bvec(i, n, v, bvec, skip, (B)) \ | ||
| 90 | } else if (unlikely(i->type & ITER_KVEC)) { \ | ||
| 91 | const struct kvec *kvec; \ | ||
| 92 | struct kvec v; \ | ||
| 93 | iterate_kvec(i, n, v, kvec, skip, (K)) \ | ||
| 94 | } else { \ | ||
| 95 | const struct iovec *iov; \ | ||
| 96 | struct iovec v; \ | ||
| 97 | iterate_iovec(i, n, v, iov, skip, (I)) \ | ||
| 98 | } \ | ||
| 99 | } | ||
| 100 | |||
| 101 | #define iterate_and_advance(i, n, v, I, B, K) { \ | ||
| 102 | size_t skip = i->iov_offset; \ | ||
| 103 | if (unlikely(i->type & ITER_BVEC)) { \ | ||
| 104 | const struct bio_vec *bvec; \ | ||
| 105 | struct bio_vec v; \ | ||
| 106 | iterate_bvec(i, n, v, bvec, skip, (B)) \ | ||
| 107 | if (skip == bvec->bv_len) { \ | ||
| 108 | bvec++; \ | ||
| 109 | skip = 0; \ | ||
| 110 | } \ | ||
| 111 | i->nr_segs -= bvec - i->bvec; \ | ||
| 112 | i->bvec = bvec; \ | ||
| 113 | } else if (unlikely(i->type & ITER_KVEC)) { \ | ||
| 114 | const struct kvec *kvec; \ | ||
| 115 | struct kvec v; \ | ||
| 116 | iterate_kvec(i, n, v, kvec, skip, (K)) \ | ||
| 117 | if (skip == kvec->iov_len) { \ | ||
| 118 | kvec++; \ | ||
| 119 | skip = 0; \ | ||
| 120 | } \ | ||
| 121 | i->nr_segs -= kvec - i->kvec; \ | ||
| 122 | i->kvec = kvec; \ | ||
| 123 | } else { \ | ||
| 124 | const struct iovec *iov; \ | ||
| 125 | struct iovec v; \ | ||
| 126 | iterate_iovec(i, n, v, iov, skip, (I)) \ | ||
| 127 | if (skip == iov->iov_len) { \ | ||
| 128 | iov++; \ | ||
| 129 | skip = 0; \ | ||
| 130 | } \ | ||
| 131 | i->nr_segs -= iov - i->iov; \ | ||
| 132 | i->iov = iov; \ | ||
| 133 | } \ | ||
| 134 | i->count -= n; \ | ||
| 135 | i->iov_offset = skip; \ | ||
| 136 | } | ||
| 137 | |||
| 138 | static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes, | ||
| 139 | struct iov_iter *i) | ||
| 140 | { | ||
| 141 | size_t skip, copy, left, wanted; | ||
| 142 | const struct iovec *iov; | ||
| 143 | char __user *buf; | ||
| 144 | void *kaddr, *from; | ||
| 145 | |||
| 146 | if (unlikely(bytes > i->count)) | ||
| 147 | bytes = i->count; | ||
| 148 | |||
| 149 | if (unlikely(!bytes)) | ||
| 150 | return 0; | ||
| 151 | |||
| 152 | wanted = bytes; | ||
| 153 | iov = i->iov; | ||
| 154 | skip = i->iov_offset; | ||
| 155 | buf = iov->iov_base + skip; | ||
| 156 | copy = min(bytes, iov->iov_len - skip); | ||
| 157 | |||
| 158 | if (!fault_in_pages_writeable(buf, copy)) { | ||
| 159 | kaddr = kmap_atomic(page); | ||
| 160 | from = kaddr + offset; | ||
| 161 | |||
| 162 | /* first chunk, usually the only one */ | ||
| 163 | left = __copy_to_user_inatomic(buf, from, copy); | ||
| 164 | copy -= left; | ||
| 165 | skip += copy; | ||
| 166 | from += copy; | ||
| 167 | bytes -= copy; | ||
| 168 | |||
| 169 | while (unlikely(!left && bytes)) { | ||
| 170 | iov++; | ||
| 171 | buf = iov->iov_base; | ||
| 172 | copy = min(bytes, iov->iov_len); | ||
| 173 | left = __copy_to_user_inatomic(buf, from, copy); | ||
| 174 | copy -= left; | ||
| 175 | skip = copy; | ||
| 176 | from += copy; | ||
| 177 | bytes -= copy; | ||
| 178 | } | ||
| 179 | if (likely(!bytes)) { | ||
| 180 | kunmap_atomic(kaddr); | ||
| 181 | goto done; | ||
| 182 | } | ||
| 183 | offset = from - kaddr; | ||
| 184 | buf += copy; | ||
| 185 | kunmap_atomic(kaddr); | ||
| 186 | copy = min(bytes, iov->iov_len - skip); | ||
| 187 | } | ||
| 188 | /* Too bad - revert to non-atomic kmap */ | ||
| 189 | kaddr = kmap(page); | ||
| 190 | from = kaddr + offset; | ||
| 191 | left = __copy_to_user(buf, from, copy); | ||
| 192 | copy -= left; | ||
| 193 | skip += copy; | ||
| 194 | from += copy; | ||
| 195 | bytes -= copy; | ||
| 196 | while (unlikely(!left && bytes)) { | ||
| 197 | iov++; | ||
| 198 | buf = iov->iov_base; | ||
| 199 | copy = min(bytes, iov->iov_len); | ||
| 200 | left = __copy_to_user(buf, from, copy); | ||
| 201 | copy -= left; | ||
| 202 | skip = copy; | ||
| 203 | from += copy; | ||
| 204 | bytes -= copy; | ||
| 205 | } | ||
| 206 | kunmap(page); | ||
| 207 | done: | ||
| 208 | if (skip == iov->iov_len) { | ||
| 209 | iov++; | ||
| 210 | skip = 0; | ||
| 211 | } | ||
| 212 | i->count -= wanted - bytes; | ||
| 213 | i->nr_segs -= iov - i->iov; | ||
| 214 | i->iov = iov; | ||
| 215 | i->iov_offset = skip; | ||
| 216 | return wanted - bytes; | ||
| 217 | } | ||
| 218 | |||
| 219 | static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes, | ||
| 220 | struct iov_iter *i) | ||
| 221 | { | ||
| 222 | size_t skip, copy, left, wanted; | ||
| 223 | const struct iovec *iov; | ||
| 224 | char __user *buf; | ||
| 225 | void *kaddr, *to; | ||
| 226 | |||
| 227 | if (unlikely(bytes > i->count)) | ||
| 228 | bytes = i->count; | ||
| 229 | |||
| 230 | if (unlikely(!bytes)) | ||
| 231 | return 0; | ||
| 232 | |||
| 233 | wanted = bytes; | ||
| 234 | iov = i->iov; | ||
| 235 | skip = i->iov_offset; | ||
| 236 | buf = iov->iov_base + skip; | ||
| 237 | copy = min(bytes, iov->iov_len - skip); | ||
| 238 | |||
| 239 | if (!fault_in_pages_readable(buf, copy)) { | ||
| 240 | kaddr = kmap_atomic(page); | ||
| 241 | to = kaddr + offset; | ||
| 242 | |||
| 243 | /* first chunk, usually the only one */ | ||
| 244 | left = __copy_from_user_inatomic(to, buf, copy); | ||
| 245 | copy -= left; | ||
| 246 | skip += copy; | ||
| 247 | to += copy; | ||
| 248 | bytes -= copy; | ||
| 249 | |||
| 250 | while (unlikely(!left && bytes)) { | ||
| 251 | iov++; | ||
| 252 | buf = iov->iov_base; | ||
| 253 | copy = min(bytes, iov->iov_len); | ||
| 254 | left = __copy_from_user_inatomic(to, buf, copy); | ||
| 255 | copy -= left; | ||
| 256 | skip = copy; | ||
| 257 | to += copy; | ||
| 258 | bytes -= copy; | ||
| 259 | } | ||
| 260 | if (likely(!bytes)) { | ||
| 261 | kunmap_atomic(kaddr); | ||
| 262 | goto done; | ||
| 263 | } | ||
| 264 | offset = to - kaddr; | ||
| 265 | buf += copy; | ||
| 266 | kunmap_atomic(kaddr); | ||
| 267 | copy = min(bytes, iov->iov_len - skip); | ||
| 268 | } | ||
| 269 | /* Too bad - revert to non-atomic kmap */ | ||
| 270 | kaddr = kmap(page); | ||
| 271 | to = kaddr + offset; | ||
| 272 | left = __copy_from_user(to, buf, copy); | ||
| 273 | copy -= left; | ||
| 274 | skip += copy; | ||
| 275 | to += copy; | ||
| 276 | bytes -= copy; | ||
| 277 | while (unlikely(!left && bytes)) { | ||
| 278 | iov++; | ||
| 279 | buf = iov->iov_base; | ||
| 280 | copy = min(bytes, iov->iov_len); | ||
| 281 | left = __copy_from_user(to, buf, copy); | ||
| 282 | copy -= left; | ||
| 283 | skip = copy; | ||
| 284 | to += copy; | ||
| 285 | bytes -= copy; | ||
| 286 | } | ||
| 287 | kunmap(page); | ||
| 288 | done: | ||
| 289 | if (skip == iov->iov_len) { | ||
| 290 | iov++; | ||
| 291 | skip = 0; | ||
| 292 | } | ||
| 293 | i->count -= wanted - bytes; | ||
| 294 | i->nr_segs -= iov - i->iov; | ||
| 295 | i->iov = iov; | ||
| 296 | i->iov_offset = skip; | ||
| 297 | return wanted - bytes; | ||
| 298 | } | ||
| 299 | |||
| 300 | /* | ||
| 301 | * Fault in the first iovec of the given iov_iter, to a maximum length | ||
| 302 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | ||
| 303 | * accessed (ie. because it is an invalid address). | ||
| 304 | * | ||
| 305 | * writev-intensive code may want this to prefault several iovecs -- that | ||
| 306 | * would be possible (callers must not rely on the fact that _only_ the | ||
| 307 | * first iovec will be faulted with the current implementation). | ||
| 308 | */ | ||
| 309 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | ||
| 310 | { | ||
| 311 | if (!(i->type & (ITER_BVEC|ITER_KVEC))) { | ||
| 312 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
| 313 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); | ||
| 314 | return fault_in_pages_readable(buf, bytes); | ||
| 315 | } | ||
| 316 | return 0; | ||
| 317 | } | ||
| 318 | EXPORT_SYMBOL(iov_iter_fault_in_readable); | ||
| 319 | |||
| 320 | void iov_iter_init(struct iov_iter *i, int direction, | ||
| 321 | const struct iovec *iov, unsigned long nr_segs, | ||
| 322 | size_t count) | ||
| 323 | { | ||
| 324 | /* It will get better. Eventually... */ | ||
| 325 | if (segment_eq(get_fs(), KERNEL_DS)) { | ||
| 326 | direction |= ITER_KVEC; | ||
| 327 | i->type = direction; | ||
| 328 | i->kvec = (struct kvec *)iov; | ||
| 329 | } else { | ||
| 330 | i->type = direction; | ||
| 331 | i->iov = iov; | ||
| 332 | } | ||
| 333 | i->nr_segs = nr_segs; | ||
| 334 | i->iov_offset = 0; | ||
| 335 | i->count = count; | ||
| 336 | } | ||
| 337 | EXPORT_SYMBOL(iov_iter_init); | ||
| 338 | |||
| 339 | static void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) | ||
| 340 | { | ||
| 341 | char *from = kmap_atomic(page); | ||
| 342 | memcpy(to, from + offset, len); | ||
| 343 | kunmap_atomic(from); | ||
| 344 | } | ||
| 345 | |||
| 346 | static void memcpy_to_page(struct page *page, size_t offset, char *from, size_t len) | ||
| 347 | { | ||
| 348 | char *to = kmap_atomic(page); | ||
| 349 | memcpy(to + offset, from, len); | ||
| 350 | kunmap_atomic(to); | ||
| 351 | } | ||
| 352 | |||
| 353 | static void memzero_page(struct page *page, size_t offset, size_t len) | ||
| 354 | { | ||
| 355 | char *addr = kmap_atomic(page); | ||
| 356 | memset(addr + offset, 0, len); | ||
| 357 | kunmap_atomic(addr); | ||
| 358 | } | ||
| 359 | |||
| 360 | size_t copy_to_iter(void *addr, size_t bytes, struct iov_iter *i) | ||
| 361 | { | ||
| 362 | char *from = addr; | ||
| 363 | if (unlikely(bytes > i->count)) | ||
| 364 | bytes = i->count; | ||
| 365 | |||
| 366 | if (unlikely(!bytes)) | ||
| 367 | return 0; | ||
| 368 | |||
| 369 | iterate_and_advance(i, bytes, v, | ||
| 370 | __copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len, | ||
| 371 | v.iov_len), | ||
| 372 | memcpy_to_page(v.bv_page, v.bv_offset, | ||
| 373 | (from += v.bv_len) - v.bv_len, v.bv_len), | ||
| 374 | memcpy(v.iov_base, (from += v.iov_len) - v.iov_len, v.iov_len) | ||
| 375 | ) | ||
| 376 | |||
| 377 | return bytes; | ||
| 378 | } | ||
| 379 | EXPORT_SYMBOL(copy_to_iter); | ||
| 380 | |||
| 381 | size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) | ||
| 382 | { | ||
| 383 | char *to = addr; | ||
| 384 | if (unlikely(bytes > i->count)) | ||
| 385 | bytes = i->count; | ||
| 386 | |||
| 387 | if (unlikely(!bytes)) | ||
| 388 | return 0; | ||
| 389 | |||
| 390 | iterate_and_advance(i, bytes, v, | ||
| 391 | __copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base, | ||
| 392 | v.iov_len), | ||
| 393 | memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, | ||
| 394 | v.bv_offset, v.bv_len), | ||
| 395 | memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) | ||
| 396 | ) | ||
| 397 | |||
| 398 | return bytes; | ||
| 399 | } | ||
| 400 | EXPORT_SYMBOL(copy_from_iter); | ||
| 401 | |||
| 402 | size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) | ||
| 403 | { | ||
| 404 | char *to = addr; | ||
| 405 | if (unlikely(bytes > i->count)) | ||
| 406 | bytes = i->count; | ||
| 407 | |||
| 408 | if (unlikely(!bytes)) | ||
| 409 | return 0; | ||
| 410 | |||
| 411 | iterate_and_advance(i, bytes, v, | ||
| 412 | __copy_from_user_nocache((to += v.iov_len) - v.iov_len, | ||
| 413 | v.iov_base, v.iov_len), | ||
| 414 | memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, | ||
| 415 | v.bv_offset, v.bv_len), | ||
| 416 | memcpy((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) | ||
| 417 | ) | ||
| 418 | |||
| 419 | return bytes; | ||
| 420 | } | ||
| 421 | EXPORT_SYMBOL(copy_from_iter_nocache); | ||
| 422 | |||
| 423 | size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, | ||
| 424 | struct iov_iter *i) | ||
| 425 | { | ||
| 426 | if (i->type & (ITER_BVEC|ITER_KVEC)) { | ||
| 427 | void *kaddr = kmap_atomic(page); | ||
| 428 | size_t wanted = copy_to_iter(kaddr + offset, bytes, i); | ||
| 429 | kunmap_atomic(kaddr); | ||
| 430 | return wanted; | ||
| 431 | } else | ||
| 432 | return copy_page_to_iter_iovec(page, offset, bytes, i); | ||
| 433 | } | ||
| 434 | EXPORT_SYMBOL(copy_page_to_iter); | ||
| 435 | |||
| 436 | size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, | ||
| 437 | struct iov_iter *i) | ||
| 438 | { | ||
| 439 | if (i->type & (ITER_BVEC|ITER_KVEC)) { | ||
| 440 | void *kaddr = kmap_atomic(page); | ||
| 441 | size_t wanted = copy_from_iter(kaddr + offset, bytes, i); | ||
| 442 | kunmap_atomic(kaddr); | ||
| 443 | return wanted; | ||
| 444 | } else | ||
| 445 | return copy_page_from_iter_iovec(page, offset, bytes, i); | ||
| 446 | } | ||
| 447 | EXPORT_SYMBOL(copy_page_from_iter); | ||
| 448 | |||
| 449 | size_t iov_iter_zero(size_t bytes, struct iov_iter *i) | ||
| 450 | { | ||
| 451 | if (unlikely(bytes > i->count)) | ||
| 452 | bytes = i->count; | ||
| 453 | |||
| 454 | if (unlikely(!bytes)) | ||
| 455 | return 0; | ||
| 456 | |||
| 457 | iterate_and_advance(i, bytes, v, | ||
| 458 | __clear_user(v.iov_base, v.iov_len), | ||
| 459 | memzero_page(v.bv_page, v.bv_offset, v.bv_len), | ||
| 460 | memset(v.iov_base, 0, v.iov_len) | ||
| 461 | ) | ||
| 462 | |||
| 463 | return bytes; | ||
| 464 | } | ||
| 465 | EXPORT_SYMBOL(iov_iter_zero); | ||
| 466 | |||
| 467 | size_t iov_iter_copy_from_user_atomic(struct page *page, | ||
| 468 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
| 469 | { | ||
| 470 | char *kaddr = kmap_atomic(page), *p = kaddr + offset; | ||
| 471 | iterate_all_kinds(i, bytes, v, | ||
| 472 | __copy_from_user_inatomic((p += v.iov_len) - v.iov_len, | ||
| 473 | v.iov_base, v.iov_len), | ||
| 474 | memcpy_from_page((p += v.bv_len) - v.bv_len, v.bv_page, | ||
| 475 | v.bv_offset, v.bv_len), | ||
| 476 | memcpy((p += v.iov_len) - v.iov_len, v.iov_base, v.iov_len) | ||
| 477 | ) | ||
| 478 | kunmap_atomic(kaddr); | ||
| 479 | return bytes; | ||
| 480 | } | ||
| 481 | EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); | ||
| 482 | |||
| 483 | void iov_iter_advance(struct iov_iter *i, size_t size) | ||
| 484 | { | ||
| 485 | iterate_and_advance(i, size, v, 0, 0, 0) | ||
| 486 | } | ||
| 487 | EXPORT_SYMBOL(iov_iter_advance); | ||
| 488 | |||
| 489 | /* | ||
| 490 | * Return the count of just the current iov_iter segment. | ||
| 491 | */ | ||
| 492 | size_t iov_iter_single_seg_count(const struct iov_iter *i) | ||
| 493 | { | ||
| 494 | if (i->nr_segs == 1) | ||
| 495 | return i->count; | ||
| 496 | else if (i->type & ITER_BVEC) | ||
| 497 | return min(i->count, i->bvec->bv_len - i->iov_offset); | ||
| 498 | else | ||
| 499 | return min(i->count, i->iov->iov_len - i->iov_offset); | ||
| 500 | } | ||
| 501 | EXPORT_SYMBOL(iov_iter_single_seg_count); | ||
| 502 | |||
| 503 | void iov_iter_kvec(struct iov_iter *i, int direction, | ||
| 504 | const struct kvec *kvec, unsigned long nr_segs, | ||
| 505 | size_t count) | ||
| 506 | { | ||
| 507 | BUG_ON(!(direction & ITER_KVEC)); | ||
| 508 | i->type = direction; | ||
| 509 | i->kvec = kvec; | ||
| 510 | i->nr_segs = nr_segs; | ||
| 511 | i->iov_offset = 0; | ||
| 512 | i->count = count; | ||
| 513 | } | ||
| 514 | EXPORT_SYMBOL(iov_iter_kvec); | ||
| 515 | |||
| 516 | void iov_iter_bvec(struct iov_iter *i, int direction, | ||
| 517 | const struct bio_vec *bvec, unsigned long nr_segs, | ||
| 518 | size_t count) | ||
| 519 | { | ||
| 520 | BUG_ON(!(direction & ITER_BVEC)); | ||
| 521 | i->type = direction; | ||
| 522 | i->bvec = bvec; | ||
| 523 | i->nr_segs = nr_segs; | ||
| 524 | i->iov_offset = 0; | ||
| 525 | i->count = count; | ||
| 526 | } | ||
| 527 | EXPORT_SYMBOL(iov_iter_bvec); | ||
| 528 | |||
| 529 | unsigned long iov_iter_alignment(const struct iov_iter *i) | ||
| 530 | { | ||
| 531 | unsigned long res = 0; | ||
| 532 | size_t size = i->count; | ||
| 533 | |||
| 534 | if (!size) | ||
| 535 | return 0; | ||
| 536 | |||
| 537 | iterate_all_kinds(i, size, v, | ||
| 538 | (res |= (unsigned long)v.iov_base | v.iov_len, 0), | ||
| 539 | res |= v.bv_offset | v.bv_len, | ||
| 540 | res |= (unsigned long)v.iov_base | v.iov_len | ||
| 541 | ) | ||
| 542 | return res; | ||
| 543 | } | ||
| 544 | EXPORT_SYMBOL(iov_iter_alignment); | ||
| 545 | |||
| 546 | ssize_t iov_iter_get_pages(struct iov_iter *i, | ||
| 547 | struct page **pages, size_t maxsize, unsigned maxpages, | ||
| 548 | size_t *start) | ||
| 549 | { | ||
| 550 | if (maxsize > i->count) | ||
| 551 | maxsize = i->count; | ||
| 552 | |||
| 553 | if (!maxsize) | ||
| 554 | return 0; | ||
| 555 | |||
| 556 | iterate_all_kinds(i, maxsize, v, ({ | ||
| 557 | unsigned long addr = (unsigned long)v.iov_base; | ||
| 558 | size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); | ||
| 559 | int n; | ||
| 560 | int res; | ||
| 561 | |||
| 562 | if (len > maxpages * PAGE_SIZE) | ||
| 563 | len = maxpages * PAGE_SIZE; | ||
| 564 | addr &= ~(PAGE_SIZE - 1); | ||
| 565 | n = DIV_ROUND_UP(len, PAGE_SIZE); | ||
| 566 | res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, pages); | ||
| 567 | if (unlikely(res < 0)) | ||
| 568 | return res; | ||
| 569 | return (res == n ? len : res * PAGE_SIZE) - *start; | ||
| 570 | 0;}),({ | ||
| 571 | /* can't be more than PAGE_SIZE */ | ||
| 572 | *start = v.bv_offset; | ||
| 573 | get_page(*pages = v.bv_page); | ||
| 574 | return v.bv_len; | ||
| 575 | }),({ | ||
| 576 | return -EFAULT; | ||
| 577 | }) | ||
| 578 | ) | ||
| 579 | return 0; | ||
| 580 | } | ||
| 581 | EXPORT_SYMBOL(iov_iter_get_pages); | ||
| 582 | |||
| 583 | static struct page **get_pages_array(size_t n) | ||
| 584 | { | ||
| 585 | struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); | ||
| 586 | if (!p) | ||
| 587 | p = vmalloc(n * sizeof(struct page *)); | ||
| 588 | return p; | ||
| 589 | } | ||
| 590 | |||
| 591 | ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, | ||
| 592 | struct page ***pages, size_t maxsize, | ||
| 593 | size_t *start) | ||
| 594 | { | ||
| 595 | struct page **p; | ||
| 596 | |||
| 597 | if (maxsize > i->count) | ||
| 598 | maxsize = i->count; | ||
| 599 | |||
| 600 | if (!maxsize) | ||
| 601 | return 0; | ||
| 602 | |||
| 603 | iterate_all_kinds(i, maxsize, v, ({ | ||
| 604 | unsigned long addr = (unsigned long)v.iov_base; | ||
| 605 | size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1)); | ||
| 606 | int n; | ||
| 607 | int res; | ||
| 608 | |||
| 609 | addr &= ~(PAGE_SIZE - 1); | ||
| 610 | n = DIV_ROUND_UP(len, PAGE_SIZE); | ||
| 611 | p = get_pages_array(n); | ||
| 612 | if (!p) | ||
| 613 | return -ENOMEM; | ||
| 614 | res = get_user_pages_fast(addr, n, (i->type & WRITE) != WRITE, p); | ||
| 615 | if (unlikely(res < 0)) { | ||
| 616 | kvfree(p); | ||
| 617 | return res; | ||
| 618 | } | ||
| 619 | *pages = p; | ||
| 620 | return (res == n ? len : res * PAGE_SIZE) - *start; | ||
| 621 | 0;}),({ | ||
| 622 | /* can't be more than PAGE_SIZE */ | ||
| 623 | *start = v.bv_offset; | ||
| 624 | *pages = p = get_pages_array(1); | ||
| 625 | if (!p) | ||
| 626 | return -ENOMEM; | ||
| 627 | get_page(*p = v.bv_page); | ||
| 628 | return v.bv_len; | ||
| 629 | }),({ | ||
| 630 | return -EFAULT; | ||
| 631 | }) | ||
| 632 | ) | ||
| 633 | return 0; | ||
| 634 | } | ||
| 635 | EXPORT_SYMBOL(iov_iter_get_pages_alloc); | ||
| 636 | |||
| 637 | size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, | ||
| 638 | struct iov_iter *i) | ||
| 639 | { | ||
| 640 | char *to = addr; | ||
| 641 | __wsum sum, next; | ||
| 642 | size_t off = 0; | ||
| 643 | if (unlikely(bytes > i->count)) | ||
| 644 | bytes = i->count; | ||
| 645 | |||
| 646 | if (unlikely(!bytes)) | ||
| 647 | return 0; | ||
| 648 | |||
| 649 | sum = *csum; | ||
| 650 | iterate_and_advance(i, bytes, v, ({ | ||
| 651 | int err = 0; | ||
| 652 | next = csum_and_copy_from_user(v.iov_base, | ||
| 653 | (to += v.iov_len) - v.iov_len, | ||
| 654 | v.iov_len, 0, &err); | ||
| 655 | if (!err) { | ||
| 656 | sum = csum_block_add(sum, next, off); | ||
| 657 | off += v.iov_len; | ||
| 658 | } | ||
| 659 | err ? v.iov_len : 0; | ||
| 660 | }), ({ | ||
| 661 | char *p = kmap_atomic(v.bv_page); | ||
| 662 | next = csum_partial_copy_nocheck(p + v.bv_offset, | ||
| 663 | (to += v.bv_len) - v.bv_len, | ||
| 664 | v.bv_len, 0); | ||
| 665 | kunmap_atomic(p); | ||
| 666 | sum = csum_block_add(sum, next, off); | ||
| 667 | off += v.bv_len; | ||
| 668 | }),({ | ||
| 669 | next = csum_partial_copy_nocheck(v.iov_base, | ||
| 670 | (to += v.iov_len) - v.iov_len, | ||
| 671 | v.iov_len, 0); | ||
| 672 | sum = csum_block_add(sum, next, off); | ||
| 673 | off += v.iov_len; | ||
| 674 | }) | ||
| 675 | ) | ||
| 676 | *csum = sum; | ||
| 677 | return bytes; | ||
| 678 | } | ||
| 679 | EXPORT_SYMBOL(csum_and_copy_from_iter); | ||
| 680 | |||
| 681 | size_t csum_and_copy_to_iter(void *addr, size_t bytes, __wsum *csum, | ||
| 682 | struct iov_iter *i) | ||
| 683 | { | ||
| 684 | char *from = addr; | ||
| 685 | __wsum sum, next; | ||
| 686 | size_t off = 0; | ||
| 687 | if (unlikely(bytes > i->count)) | ||
| 688 | bytes = i->count; | ||
| 689 | |||
| 690 | if (unlikely(!bytes)) | ||
| 691 | return 0; | ||
| 692 | |||
| 693 | sum = *csum; | ||
| 694 | iterate_and_advance(i, bytes, v, ({ | ||
| 695 | int err = 0; | ||
| 696 | next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len, | ||
| 697 | v.iov_base, | ||
| 698 | v.iov_len, 0, &err); | ||
| 699 | if (!err) { | ||
| 700 | sum = csum_block_add(sum, next, off); | ||
| 701 | off += v.iov_len; | ||
| 702 | } | ||
| 703 | err ? v.iov_len : 0; | ||
| 704 | }), ({ | ||
| 705 | char *p = kmap_atomic(v.bv_page); | ||
| 706 | next = csum_partial_copy_nocheck((from += v.bv_len) - v.bv_len, | ||
| 707 | p + v.bv_offset, | ||
| 708 | v.bv_len, 0); | ||
| 709 | kunmap_atomic(p); | ||
| 710 | sum = csum_block_add(sum, next, off); | ||
| 711 | off += v.bv_len; | ||
| 712 | }),({ | ||
| 713 | next = csum_partial_copy_nocheck((from += v.iov_len) - v.iov_len, | ||
| 714 | v.iov_base, | ||
| 715 | v.iov_len, 0); | ||
| 716 | sum = csum_block_add(sum, next, off); | ||
| 717 | off += v.iov_len; | ||
| 718 | }) | ||
| 719 | ) | ||
| 720 | *csum = sum; | ||
| 721 | return bytes; | ||
| 722 | } | ||
| 723 | EXPORT_SYMBOL(csum_and_copy_to_iter); | ||
| 724 | |||
| 725 | int iov_iter_npages(const struct iov_iter *i, int maxpages) | ||
| 726 | { | ||
| 727 | size_t size = i->count; | ||
| 728 | int npages = 0; | ||
| 729 | |||
| 730 | if (!size) | ||
| 731 | return 0; | ||
| 732 | |||
| 733 | iterate_all_kinds(i, size, v, ({ | ||
| 734 | unsigned long p = (unsigned long)v.iov_base; | ||
| 735 | npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) | ||
| 736 | - p / PAGE_SIZE; | ||
| 737 | if (npages >= maxpages) | ||
| 738 | return maxpages; | ||
| 739 | 0;}),({ | ||
| 740 | npages++; | ||
| 741 | if (npages >= maxpages) | ||
| 742 | return maxpages; | ||
| 743 | }),({ | ||
| 744 | unsigned long p = (unsigned long)v.iov_base; | ||
| 745 | npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE) | ||
| 746 | - p / PAGE_SIZE; | ||
| 747 | if (npages >= maxpages) | ||
| 748 | return maxpages; | ||
| 749 | }) | ||
| 750 | ) | ||
| 751 | return npages; | ||
| 752 | } | ||
| 753 | EXPORT_SYMBOL(iov_iter_npages); | ||
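The whole mm/ copy of the iov_iter implementation is deleted here; the API itself survives (historically this deletion corresponds to the code moving to lib/iov_iter.c rather than going away). For orientation, a sketch of how kernel code drives the interface defined above, copying the start of a page into a kernel buffer through an ITER_KVEC iterator; the function name and error handling are illustrative only:

	#include <linux/uio.h>

	static int demo_copy_page_prefix(struct page *page, void *buf, size_t len)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;
		size_t copied;

		/* The direction word must carry ITER_KVEC for iov_iter_kvec(). */
		iov_iter_kvec(&iter, READ | ITER_KVEC, &kv, 1, len);

		copied = copy_page_to_iter(page, 0, len, &iter);
		return copied == len ? 0 : -EFAULT;
	}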
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 78fee632a7ee..6c513a63ea84 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | #include <linux/stacktrace.h> | 29 | #include <linux/stacktrace.h> |
| 30 | #include <linux/string.h> | 30 | #include <linux/string.h> |
| 31 | #include <linux/types.h> | 31 | #include <linux/types.h> |
| 32 | #include <linux/vmalloc.h> | ||
| 32 | #include <linux/kasan.h> | 33 | #include <linux/kasan.h> |
| 33 | 34 | ||
| 34 | #include "kasan.h" | 35 | #include "kasan.h" |
| @@ -388,6 +389,19 @@ void kasan_krealloc(const void *object, size_t size) | |||
| 388 | kasan_kmalloc(page->slab_cache, object, size); | 389 | kasan_kmalloc(page->slab_cache, object, size); |
| 389 | } | 390 | } |
| 390 | 391 | ||
| 392 | void kasan_kfree(void *ptr) | ||
| 393 | { | ||
| 394 | struct page *page; | ||
| 395 | |||
| 396 | page = virt_to_head_page(ptr); | ||
| 397 | |||
| 398 | if (unlikely(!PageSlab(page))) | ||
| 399 | kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), | ||
| 400 | KASAN_FREE_PAGE); | ||
| 401 | else | ||
| 402 | kasan_slab_free(page->slab_cache, ptr); | ||
| 403 | } | ||
| 404 | |||
| 391 | void kasan_kfree_large(const void *ptr) | 405 | void kasan_kfree_large(const void *ptr) |
| 392 | { | 406 | { |
| 393 | struct page *page = virt_to_page(ptr); | 407 | struct page *page = virt_to_page(ptr); |
| @@ -414,12 +428,19 @@ int kasan_module_alloc(void *addr, size_t size) | |||
| 414 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, | 428 | GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, |
| 415 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, | 429 | PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, |
| 416 | __builtin_return_address(0)); | 430 | __builtin_return_address(0)); |
| 417 | return ret ? 0 : -ENOMEM; | 431 | |
| 432 | if (ret) { | ||
| 433 | find_vm_area(addr)->flags |= VM_KASAN; | ||
| 434 | return 0; | ||
| 435 | } | ||
| 436 | |||
| 437 | return -ENOMEM; | ||
| 418 | } | 438 | } |
| 419 | 439 | ||
| 420 | void kasan_module_free(void *addr) | 440 | void kasan_free_shadow(const struct vm_struct *vm) |
| 421 | { | 441 | { |
| 422 | vfree(kasan_mem_to_shadow(addr)); | 442 | if (vm->flags & VM_KASAN) |
| 443 | vfree(kasan_mem_to_shadow(vm->addr)); | ||
| 423 | } | 444 | } |
| 424 | 445 | ||
| 425 | static void register_global(struct kasan_global *global) | 446 | static void register_global(struct kasan_global *global) |
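Two things change in kasan.c: kasan_kfree() is added so a freed pointer can be poisoned whether it came from the slab or the page allocator, and module shadow teardown moves from kasan_module_free(addr) to kasan_free_shadow(vm), keyed off the new VM_KASAN flag set in kasan_module_alloc(). A sketch of the consumer side of the latter, roughly mirroring what a vmalloc-style teardown path would do; simplified and illustrative only:

	#include <linux/vmalloc.h>
	#include <linux/kasan.h>
	#include <linux/slab.h>

	static void demo_free_mapping(const void *addr)
	{
		struct vm_struct *area = remove_vm_area(addr);

		if (!area)
			return;

		/* No-op unless kasan_module_alloc() tagged the area VM_KASAN. */
		kasan_free_shadow(area);

		/* ... unmap and free the underlying pages here ... */
		kfree(area);
	}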
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
| @@ -542,7 +542,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it) | |||
| 542 | expected_mapping = (void *)stable_node + | 542 | expected_mapping = (void *)stable_node + |
| 543 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 543 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); |
| 544 | again: | 544 | again: |
| 545 | kpfn = ACCESS_ONCE(stable_node->kpfn); | 545 | kpfn = READ_ONCE(stable_node->kpfn); |
| 546 | page = pfn_to_page(kpfn); | 546 | page = pfn_to_page(kpfn); |
| 547 | 547 | ||
| 548 | /* | 548 | /* |
| @@ -551,7 +551,7 @@ again: | |||
| 551 | * but on Alpha we need to be more careful. | 551 | * but on Alpha we need to be more careful. |
| 552 | */ | 552 | */ |
| 553 | smp_read_barrier_depends(); | 553 | smp_read_barrier_depends(); |
| 554 | if (ACCESS_ONCE(page->mapping) != expected_mapping) | 554 | if (READ_ONCE(page->mapping) != expected_mapping) |
| 555 | goto stale; | 555 | goto stale; |
| 556 | 556 | ||
| 557 | /* | 557 | /* |
| @@ -577,14 +577,14 @@ again: | |||
| 577 | cpu_relax(); | 577 | cpu_relax(); |
| 578 | } | 578 | } |
| 579 | 579 | ||
| 580 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | 580 | if (READ_ONCE(page->mapping) != expected_mapping) { |
| 581 | put_page(page); | 581 | put_page(page); |
| 582 | goto stale; | 582 | goto stale; |
| 583 | } | 583 | } |
| 584 | 584 | ||
| 585 | if (lock_it) { | 585 | if (lock_it) { |
| 586 | lock_page(page); | 586 | lock_page(page); |
| 587 | if (ACCESS_ONCE(page->mapping) != expected_mapping) { | 587 | if (READ_ONCE(page->mapping) != expected_mapping) { |
| 588 | unlock_page(page); | 588 | unlock_page(page); |
| 589 | put_page(page); | 589 | put_page(page); |
| 590 | goto stale; | 590 | goto stale; |
| @@ -600,7 +600,7 @@ stale: | |||
| 600 | * before checking whether node->kpfn has been changed. | 600 | * before checking whether node->kpfn has been changed. |
| 601 | */ | 601 | */ |
| 602 | smp_rmb(); | 602 | smp_rmb(); |
| 603 | if (ACCESS_ONCE(stable_node->kpfn) != kpfn) | 603 | if (READ_ONCE(stable_node->kpfn) != kpfn) |
| 604 | goto again; | 604 | goto again; |
| 605 | remove_node_from_stable_tree(stable_node); | 605 | remove_node_from_stable_tree(stable_node); |
| 606 | return NULL; | 606 | return NULL; |
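The ksm.c hunks are part of the tree-wide ACCESS_ONCE() to READ_ONCE() conversion. The motivation, paraphrased: ACCESS_ONCE() forces a single access by casting the lvalue to volatile, which quietly stops working for non-scalar types on some compilers, while READ_ONCE() copies the value out through a helper and handles any size. Simplified shapes of the two macros, paraphrased from linux/compiler.h of that era and not verbatim:

	#define ACCESS_ONCE(x)	(*(volatile typeof(x) *)&(x))	/* scalar lvalues only */

	#define READ_ONCE(x)						\
	({								\
		union { typeof(x) __val; char __c[1]; } __u;		\
		__read_once_size(&(x), __u.__c, sizeof(x));		\
		__u.__val;						\
	})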
diff --git a/mm/memblock.c b/mm/memblock.c index 252b77bdf65e..9318b567ed79 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
| @@ -580,10 +580,24 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, | |||
| 580 | return memblock_add_range(&memblock.memory, base, size, nid, 0); | 580 | return memblock_add_range(&memblock.memory, base, size, nid, 0); |
| 581 | } | 581 | } |
| 582 | 582 | ||
| 583 | static int __init_memblock memblock_add_region(phys_addr_t base, | ||
| 584 | phys_addr_t size, | ||
| 585 | int nid, | ||
| 586 | unsigned long flags) | ||
| 587 | { | ||
| 588 | struct memblock_type *_rgn = &memblock.memory; | ||
| 589 | |||
| 590 | memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", | ||
| 591 | (unsigned long long)base, | ||
| 592 | (unsigned long long)base + size - 1, | ||
| 593 | flags, (void *)_RET_IP_); | ||
| 594 | |||
| 595 | return memblock_add_range(_rgn, base, size, nid, flags); | ||
| 596 | } | ||
| 597 | |||
| 583 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) | 598 | int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) |
| 584 | { | 599 | { |
| 585 | return memblock_add_range(&memblock.memory, base, size, | 600 | return memblock_add_region(base, size, MAX_NUMNODES, 0); |
| 586 | MAX_NUMNODES, 0); | ||
| 587 | } | 601 | } |
| 588 | 602 | ||
| 589 | /** | 603 | /** |
| @@ -699,14 +713,14 @@ static int __init_memblock memblock_reserve_region(phys_addr_t base, | |||
| 699 | int nid, | 713 | int nid, |
| 700 | unsigned long flags) | 714 | unsigned long flags) |
| 701 | { | 715 | { |
| 702 | struct memblock_type *_rgn = &memblock.reserved; | 716 | struct memblock_type *type = &memblock.reserved; |
| 703 | 717 | ||
| 704 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", | 718 | memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", |
| 705 | (unsigned long long)base, | 719 | (unsigned long long)base, |
| 706 | (unsigned long long)base + size - 1, | 720 | (unsigned long long)base + size - 1, |
| 707 | flags, (void *)_RET_IP_); | 721 | flags, (void *)_RET_IP_); |
| 708 | 722 | ||
| 709 | return memblock_add_range(_rgn, base, size, nid, flags); | 723 | return memblock_add_range(type, base, size, nid, flags); |
| 710 | } | 724 | } |
| 711 | 725 | ||
| 712 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | 726 | int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) |
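With memblock_add() now routed through the new memblock_add_region() wrapper, both public registration entry points log symmetrically when the kernel is booted with memblock=debug. A small early-boot usage sketch; the addresses and sizes are made up for the example:

	#include <linux/memblock.h>
	#include <linux/sizes.h>

	void __init demo_register_memory(void)
	{
		/* A 512 MiB bank of RAM at 64 MiB ("memblock_add: ..." line). */
		memblock_add(SZ_64M, SZ_512M);

		/* Carve 16 MiB out of it for firmware ("memblock_reserve: ..."). */
		memblock_reserve(SZ_64M, SZ_16M);
	}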
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9fe07692eaad..14c2f2017e37 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -14,6 +14,12 @@ | |||
| 14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. | 14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. |
| 15 | * Authors: Glauber Costa and Suleiman Souhlal | 15 | * Authors: Glauber Costa and Suleiman Souhlal |
| 16 | * | 16 | * |
| 17 | * Native page reclaim | ||
| 18 | * Charge lifetime sanitation | ||
| 19 | * Lockless page tracking & accounting | ||
| 20 | * Unified hierarchy configuration model | ||
| 21 | * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner | ||
| 22 | * | ||
| 17 | * This program is free software; you can redistribute it and/or modify | 23 | * This program is free software; you can redistribute it and/or modify |
| 18 | * it under the terms of the GNU General Public License as published by | 24 | * it under the terms of the GNU General Public License as published by |
| 19 | * the Free Software Foundation; either version 2 of the License, or | 25 | * the Free Software Foundation; either version 2 of the License, or |
| @@ -253,11 +259,6 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); | |||
| 253 | * page cache and RSS per cgroup. We would eventually like to provide | 259 | * page cache and RSS per cgroup. We would eventually like to provide |
| 254 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 260 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
| 255 | * to help the administrator determine what knobs to tune. | 261 | * to help the administrator determine what knobs to tune. |
| 256 | * | ||
| 257 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | ||
| 258 | * we hit the water mark. May be even add a low water mark, such that | ||
| 259 | * no reclaim occurs from a cgroup at it's low water mark, this is | ||
| 260 | * a feature that will be implemented much later in the future. | ||
| 261 | */ | 262 | */ |
| 262 | struct mem_cgroup { | 263 | struct mem_cgroup { |
| 263 | struct cgroup_subsys_state css; | 264 | struct cgroup_subsys_state css; |
| @@ -454,6 +455,12 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) | |||
| 454 | return memcg->css.id; | 455 | return memcg->css.id; |
| 455 | } | 456 | } |
| 456 | 457 | ||
| 458 | /* | ||
| 459 | * A helper function to get mem_cgroup from ID. must be called under | ||
| 460 | * rcu_read_lock(). The caller is responsible for calling | ||
| 461 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | ||
| 462 | * refcnt from swap can be called against removed memcg.) | ||
| 463 | */ | ||
| 457 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) | 464 | static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) |
| 458 | { | 465 | { |
| 459 | struct cgroup_subsys_state *css; | 466 | struct cgroup_subsys_state *css; |
| @@ -667,7 +674,7 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, | |||
| 667 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) | 674 | static unsigned long soft_limit_excess(struct mem_cgroup *memcg) |
| 668 | { | 675 | { |
| 669 | unsigned long nr_pages = page_counter_read(&memcg->memory); | 676 | unsigned long nr_pages = page_counter_read(&memcg->memory); |
| 670 | unsigned long soft_limit = ACCESS_ONCE(memcg->soft_limit); | 677 | unsigned long soft_limit = READ_ONCE(memcg->soft_limit); |
| 671 | unsigned long excess = 0; | 678 | unsigned long excess = 0; |
| 672 | 679 | ||
| 673 | if (nr_pages > soft_limit) | 680 | if (nr_pages > soft_limit) |
| @@ -1035,7 +1042,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
| 1035 | goto out_unlock; | 1042 | goto out_unlock; |
| 1036 | 1043 | ||
| 1037 | do { | 1044 | do { |
| 1038 | pos = ACCESS_ONCE(iter->position); | 1045 | pos = READ_ONCE(iter->position); |
| 1039 | /* | 1046 | /* |
| 1040 | * A racing update may change the position and | 1047 | * A racing update may change the position and |
| 1041 | * put the last reference, hence css_tryget(), | 1048 | * put the last reference, hence css_tryget(), |
| @@ -1352,13 +1359,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) | |||
| 1352 | unsigned long limit; | 1359 | unsigned long limit; |
| 1353 | 1360 | ||
| 1354 | count = page_counter_read(&memcg->memory); | 1361 | count = page_counter_read(&memcg->memory); |
| 1355 | limit = ACCESS_ONCE(memcg->memory.limit); | 1362 | limit = READ_ONCE(memcg->memory.limit); |
| 1356 | if (count < limit) | 1363 | if (count < limit) |
| 1357 | margin = limit - count; | 1364 | margin = limit - count; |
| 1358 | 1365 | ||
| 1359 | if (do_swap_account) { | 1366 | if (do_swap_account) { |
| 1360 | count = page_counter_read(&memcg->memsw); | 1367 | count = page_counter_read(&memcg->memsw); |
| 1361 | limit = ACCESS_ONCE(memcg->memsw.limit); | 1368 | limit = READ_ONCE(memcg->memsw.limit); |
| 1362 | if (count <= limit) | 1369 | if (count <= limit) |
| 1363 | margin = min(margin, limit - count); | 1370 | margin = min(margin, limit - count); |
| 1364 | } | 1371 | } |
| @@ -1436,15 +1443,17 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
| 1436 | struct mem_cgroup *iter; | 1443 | struct mem_cgroup *iter; |
| 1437 | unsigned int i; | 1444 | unsigned int i; |
| 1438 | 1445 | ||
| 1439 | if (!p) | ||
| 1440 | return; | ||
| 1441 | |||
| 1442 | mutex_lock(&oom_info_lock); | 1446 | mutex_lock(&oom_info_lock); |
| 1443 | rcu_read_lock(); | 1447 | rcu_read_lock(); |
| 1444 | 1448 | ||
| 1445 | pr_info("Task in "); | 1449 | if (p) { |
| 1446 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); | 1450 | pr_info("Task in "); |
| 1447 | pr_cont(" killed as a result of limit of "); | 1451 | pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id)); |
| 1452 | pr_cont(" killed as a result of limit of "); | ||
| 1453 | } else { | ||
| 1454 | pr_info("Memory limit reached of cgroup "); | ||
| 1455 | } | ||
| 1456 | |||
| 1448 | pr_cont_cgroup_path(memcg->css.cgroup); | 1457 | pr_cont_cgroup_path(memcg->css.cgroup); |
| 1449 | pr_cont("\n"); | 1458 | pr_cont("\n"); |
| 1450 | 1459 | ||
| @@ -1531,7 +1540,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
| 1531 | return; | 1540 | return; |
| 1532 | } | 1541 | } |
| 1533 | 1542 | ||
| 1534 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 1543 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg); |
| 1535 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; | 1544 | totalpages = mem_cgroup_get_limit(memcg) ? : 1; |
| 1536 | for_each_mem_cgroup_tree(iter, memcg) { | 1545 | for_each_mem_cgroup_tree(iter, memcg) { |
| 1537 | struct css_task_iter it; | 1546 | struct css_task_iter it; |
| @@ -2341,20 +2350,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) | |||
| 2341 | } | 2350 | } |
| 2342 | 2351 | ||
| 2343 | /* | 2352 | /* |
| 2344 | * A helper function to get mem_cgroup from ID. must be called under | ||
| 2345 | * rcu_read_lock(). The caller is responsible for calling | ||
| 2346 | * css_tryget_online() if the mem_cgroup is used for charging. (dropping | ||
| 2347 | * refcnt from swap can be called against removed memcg.) | ||
| 2348 | */ | ||
| 2349 | static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | ||
| 2350 | { | ||
| 2351 | /* ID 0 is unused ID */ | ||
| 2352 | if (!id) | ||
| 2353 | return NULL; | ||
| 2354 | return mem_cgroup_from_id(id); | ||
| 2355 | } | ||
| 2356 | |||
| 2357 | /* | ||
| 2358 | * try_get_mem_cgroup_from_page - look up page's memcg association | 2353 | * try_get_mem_cgroup_from_page - look up page's memcg association |
| 2359 | * @page: the page | 2354 | * @page: the page |
| 2360 | * | 2355 | * |
| @@ -2380,7 +2375,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | |||
| 2380 | ent.val = page_private(page); | 2375 | ent.val = page_private(page); |
| 2381 | id = lookup_swap_cgroup_id(ent); | 2376 | id = lookup_swap_cgroup_id(ent); |
| 2382 | rcu_read_lock(); | 2377 | rcu_read_lock(); |
| 2383 | memcg = mem_cgroup_lookup(id); | 2378 | memcg = mem_cgroup_from_id(id); |
| 2384 | if (memcg && !css_tryget_online(&memcg->css)) | 2379 | if (memcg && !css_tryget_online(&memcg->css)) |
| 2385 | memcg = NULL; | 2380 | memcg = NULL; |
| 2386 | rcu_read_unlock(); | 2381 | rcu_read_unlock(); |
| @@ -2642,7 +2637,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep) | |||
| 2642 | return cachep; | 2637 | return cachep; |
| 2643 | 2638 | ||
| 2644 | memcg = get_mem_cgroup_from_mm(current->mm); | 2639 | memcg = get_mem_cgroup_from_mm(current->mm); |
| 2645 | kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); | 2640 | kmemcg_id = READ_ONCE(memcg->kmemcg_id); |
| 2646 | if (kmemcg_id < 0) | 2641 | if (kmemcg_id < 0) |
| 2647 | goto out; | 2642 | goto out; |
| 2648 | 2643 | ||
| @@ -2779,92 +2774,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) | |||
| 2779 | } | 2774 | } |
| 2780 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 2775 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
| 2781 | 2776 | ||
| 2782 | /** | ||
| 2783 | * mem_cgroup_move_account - move account of the page | ||
| 2784 | * @page: the page | ||
| 2785 | * @nr_pages: number of regular pages (>1 for huge pages) | ||
| 2786 | * @from: mem_cgroup which the page is moved from. | ||
| 2787 | * @to: mem_cgroup which the page is moved to. @from != @to. | ||
| 2788 | * | ||
| 2789 | * The caller must confirm following. | ||
| 2790 | * - page is not on LRU (isolate_page() is useful.) | ||
| 2791 | * - compound_lock is held when nr_pages > 1 | ||
| 2792 | * | ||
| 2793 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" | ||
| 2794 | * from old cgroup. | ||
| 2795 | */ | ||
| 2796 | static int mem_cgroup_move_account(struct page *page, | ||
| 2797 | unsigned int nr_pages, | ||
| 2798 | struct mem_cgroup *from, | ||
| 2799 | struct mem_cgroup *to) | ||
| 2800 | { | ||
| 2801 | unsigned long flags; | ||
| 2802 | int ret; | ||
| 2803 | |||
| 2804 | VM_BUG_ON(from == to); | ||
| 2805 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
| 2806 | /* | ||
| 2807 | * The page is isolated from LRU. So, collapse function | ||
| 2808 | * will not handle this page. But page splitting can happen. | ||
| 2809 | * Do this check under compound_page_lock(). The caller should | ||
| 2810 | * hold it. | ||
| 2811 | */ | ||
| 2812 | ret = -EBUSY; | ||
| 2813 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
| 2814 | goto out; | ||
| 2815 | |||
| 2816 | /* | ||
| 2817 | * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup | ||
| 2818 | * of its source page while we change it: page migration takes | ||
| 2819 | * both pages off the LRU, but page cache replacement doesn't. | ||
| 2820 | */ | ||
| 2821 | if (!trylock_page(page)) | ||
| 2822 | goto out; | ||
| 2823 | |||
| 2824 | ret = -EINVAL; | ||
| 2825 | if (page->mem_cgroup != from) | ||
| 2826 | goto out_unlock; | ||
| 2827 | |||
| 2828 | spin_lock_irqsave(&from->move_lock, flags); | ||
| 2829 | |||
| 2830 | if (!PageAnon(page) && page_mapped(page)) { | ||
| 2831 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 2832 | nr_pages); | ||
| 2833 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 2834 | nr_pages); | ||
| 2835 | } | ||
| 2836 | |||
| 2837 | if (PageWriteback(page)) { | ||
| 2838 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 2839 | nr_pages); | ||
| 2840 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 2841 | nr_pages); | ||
| 2842 | } | ||
| 2843 | |||
| 2844 | /* | ||
| 2845 | * It is safe to change page->mem_cgroup here because the page | ||
| 2846 | * is referenced, charged, and isolated - we can't race with | ||
| 2847 | * uncharging, charging, migration, or LRU putback. | ||
| 2848 | */ | ||
| 2849 | |||
| 2850 | /* caller should have done css_get */ | ||
| 2851 | page->mem_cgroup = to; | ||
| 2852 | spin_unlock_irqrestore(&from->move_lock, flags); | ||
| 2853 | |||
| 2854 | ret = 0; | ||
| 2855 | |||
| 2856 | local_irq_disable(); | ||
| 2857 | mem_cgroup_charge_statistics(to, page, nr_pages); | ||
| 2858 | memcg_check_events(to, page); | ||
| 2859 | mem_cgroup_charge_statistics(from, page, -nr_pages); | ||
| 2860 | memcg_check_events(from, page); | ||
| 2861 | local_irq_enable(); | ||
| 2862 | out_unlock: | ||
| 2863 | unlock_page(page); | ||
| 2864 | out: | ||
| 2865 | return ret; | ||
| 2866 | } | ||
| 2867 | |||
| 2868 | #ifdef CONFIG_MEMCG_SWAP | 2777 | #ifdef CONFIG_MEMCG_SWAP |
| 2869 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | 2778 | static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, |
| 2870 | bool charge) | 2779 | bool charge) |
| @@ -4816,6 +4725,92 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
| 4816 | return page; | 4725 | return page; |
| 4817 | } | 4726 | } |
| 4818 | 4727 | ||
| 4728 | /** | ||
| 4729 | * mem_cgroup_move_account - move account of the page | ||
| 4730 | * @page: the page | ||
| 4731 | * @nr_pages: number of regular pages (>1 for huge pages) | ||
| 4732 | * @from: mem_cgroup which the page is moved from. | ||
| 4733 | * @to: mem_cgroup which the page is moved to. @from != @to. | ||
| 4734 | * | ||
| 4735 | * The caller must confirm following. | ||
| 4736 | * - page is not on LRU (isolate_page() is useful.) | ||
| 4737 | * - compound_lock is held when nr_pages > 1 | ||
| 4738 | * | ||
| 4739 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" | ||
| 4740 | * from old cgroup. | ||
| 4741 | */ | ||
| 4742 | static int mem_cgroup_move_account(struct page *page, | ||
| 4743 | unsigned int nr_pages, | ||
| 4744 | struct mem_cgroup *from, | ||
| 4745 | struct mem_cgroup *to) | ||
| 4746 | { | ||
| 4747 | unsigned long flags; | ||
| 4748 | int ret; | ||
| 4749 | |||
| 4750 | VM_BUG_ON(from == to); | ||
| 4751 | VM_BUG_ON_PAGE(PageLRU(page), page); | ||
| 4752 | /* | ||
| 4753 | * The page is isolated from LRU. So, collapse function | ||
| 4754 | * will not handle this page. But page splitting can happen. | ||
| 4755 | * Do this check under compound_page_lock(). The caller should | ||
| 4756 | * hold it. | ||
| 4757 | */ | ||
| 4758 | ret = -EBUSY; | ||
| 4759 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
| 4760 | goto out; | ||
| 4761 | |||
| 4762 | /* | ||
| 4763 | * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup | ||
| 4764 | * of its source page while we change it: page migration takes | ||
| 4765 | * both pages off the LRU, but page cache replacement doesn't. | ||
| 4766 | */ | ||
| 4767 | if (!trylock_page(page)) | ||
| 4768 | goto out; | ||
| 4769 | |||
| 4770 | ret = -EINVAL; | ||
| 4771 | if (page->mem_cgroup != from) | ||
| 4772 | goto out_unlock; | ||
| 4773 | |||
| 4774 | spin_lock_irqsave(&from->move_lock, flags); | ||
| 4775 | |||
| 4776 | if (!PageAnon(page) && page_mapped(page)) { | ||
| 4777 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 4778 | nr_pages); | ||
| 4779 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], | ||
| 4780 | nr_pages); | ||
| 4781 | } | ||
| 4782 | |||
| 4783 | if (PageWriteback(page)) { | ||
| 4784 | __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 4785 | nr_pages); | ||
| 4786 | __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], | ||
| 4787 | nr_pages); | ||
| 4788 | } | ||
| 4789 | |||
| 4790 | /* | ||
| 4791 | * It is safe to change page->mem_cgroup here because the page | ||
| 4792 | * is referenced, charged, and isolated - we can't race with | ||
| 4793 | * uncharging, charging, migration, or LRU putback. | ||
| 4794 | */ | ||
| 4795 | |||
| 4796 | /* caller should have done css_get */ | ||
| 4797 | page->mem_cgroup = to; | ||
| 4798 | spin_unlock_irqrestore(&from->move_lock, flags); | ||
| 4799 | |||
| 4800 | ret = 0; | ||
| 4801 | |||
| 4802 | local_irq_disable(); | ||
| 4803 | mem_cgroup_charge_statistics(to, page, nr_pages); | ||
| 4804 | memcg_check_events(to, page); | ||
| 4805 | mem_cgroup_charge_statistics(from, page, -nr_pages); | ||
| 4806 | memcg_check_events(from, page); | ||
| 4807 | local_irq_enable(); | ||
| 4808 | out_unlock: | ||
| 4809 | unlock_page(page); | ||
| 4810 | out: | ||
| 4811 | return ret; | ||
| 4812 | } | ||
| 4813 | |||
| 4819 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, | 4814 | static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, |
| 4820 | unsigned long addr, pte_t ptent, union mc_target *target) | 4815 | unsigned long addr, pte_t ptent, union mc_target *target) |
| 4821 | { | 4816 | { |
| @@ -5012,7 +5007,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, | |||
| 5012 | * tunable will only affect upcoming migrations, not the current one. | 5007 | * tunable will only affect upcoming migrations, not the current one. |
| 5013 | * So we need to save it, and keep it going. | 5008 | * So we need to save it, and keep it going. |
| 5014 | */ | 5009 | */ |
| 5015 | move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate); | 5010 | move_flags = READ_ONCE(memcg->move_charge_at_immigrate); |
| 5016 | if (move_flags) { | 5011 | if (move_flags) { |
| 5017 | struct mm_struct *mm; | 5012 | struct mm_struct *mm; |
| 5018 | struct mem_cgroup *from = mem_cgroup_from_task(p); | 5013 | struct mem_cgroup *from = mem_cgroup_from_task(p); |
| @@ -5232,7 +5227,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) | |||
| 5232 | * on for the root memcg is enough. | 5227 | * on for the root memcg is enough. |
| 5233 | */ | 5228 | */ |
| 5234 | if (cgroup_on_dfl(root_css->cgroup)) | 5229 | if (cgroup_on_dfl(root_css->cgroup)) |
| 5235 | mem_cgroup_from_css(root_css)->use_hierarchy = true; | 5230 | root_mem_cgroup->use_hierarchy = true; |
| 5231 | else | ||
| 5232 | root_mem_cgroup->use_hierarchy = false; | ||
| 5236 | } | 5233 | } |
| 5237 | 5234 | ||
| 5238 | static u64 memory_current_read(struct cgroup_subsys_state *css, | 5235 | static u64 memory_current_read(struct cgroup_subsys_state *css, |
| @@ -5244,7 +5241,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, | |||
| 5244 | static int memory_low_show(struct seq_file *m, void *v) | 5241 | static int memory_low_show(struct seq_file *m, void *v) |
| 5245 | { | 5242 | { |
| 5246 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5243 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5247 | unsigned long low = ACCESS_ONCE(memcg->low); | 5244 | unsigned long low = READ_ONCE(memcg->low); |
| 5248 | 5245 | ||
| 5249 | if (low == PAGE_COUNTER_MAX) | 5246 | if (low == PAGE_COUNTER_MAX) |
| 5250 | seq_puts(m, "max\n"); | 5247 | seq_puts(m, "max\n"); |
| @@ -5274,7 +5271,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, | |||
| 5274 | static int memory_high_show(struct seq_file *m, void *v) | 5271 | static int memory_high_show(struct seq_file *m, void *v) |
| 5275 | { | 5272 | { |
| 5276 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5273 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5277 | unsigned long high = ACCESS_ONCE(memcg->high); | 5274 | unsigned long high = READ_ONCE(memcg->high); |
| 5278 | 5275 | ||
| 5279 | if (high == PAGE_COUNTER_MAX) | 5276 | if (high == PAGE_COUNTER_MAX) |
| 5280 | seq_puts(m, "max\n"); | 5277 | seq_puts(m, "max\n"); |
| @@ -5304,7 +5301,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, | |||
| 5304 | static int memory_max_show(struct seq_file *m, void *v) | 5301 | static int memory_max_show(struct seq_file *m, void *v) |
| 5305 | { | 5302 | { |
| 5306 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 5303 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
| 5307 | unsigned long max = ACCESS_ONCE(memcg->memory.limit); | 5304 | unsigned long max = READ_ONCE(memcg->memory.limit); |
| 5308 | 5305 | ||
| 5309 | if (max == PAGE_COUNTER_MAX) | 5306 | if (max == PAGE_COUNTER_MAX) |
| 5310 | seq_puts(m, "max\n"); | 5307 | seq_puts(m, "max\n"); |
| @@ -5859,7 +5856,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry) | |||
| 5859 | 5856 | ||
| 5860 | id = swap_cgroup_record(entry, 0); | 5857 | id = swap_cgroup_record(entry, 0); |
| 5861 | rcu_read_lock(); | 5858 | rcu_read_lock(); |
| 5862 | memcg = mem_cgroup_lookup(id); | 5859 | memcg = mem_cgroup_from_id(id); |
| 5863 | if (memcg) { | 5860 | if (memcg) { |
| 5864 | if (!mem_cgroup_is_root(memcg)) | 5861 | if (!mem_cgroup_is_root(memcg)) |
| 5865 | page_counter_uncharge(&memcg->memsw, 1); | 5862 | page_counter_uncharge(&memcg->memsw, 1); |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d487f8dc6d39..d9359b770cd9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
| @@ -521,6 +521,52 @@ static const char *action_name[] = { | |||
| 521 | [RECOVERED] = "Recovered", | 521 | [RECOVERED] = "Recovered", |
| 522 | }; | 522 | }; |
| 523 | 523 | ||
| 524 | enum action_page_type { | ||
| 525 | MSG_KERNEL, | ||
| 526 | MSG_KERNEL_HIGH_ORDER, | ||
| 527 | MSG_SLAB, | ||
| 528 | MSG_DIFFERENT_COMPOUND, | ||
| 529 | MSG_POISONED_HUGE, | ||
| 530 | MSG_HUGE, | ||
| 531 | MSG_FREE_HUGE, | ||
| 532 | MSG_UNMAP_FAILED, | ||
| 533 | MSG_DIRTY_SWAPCACHE, | ||
| 534 | MSG_CLEAN_SWAPCACHE, | ||
| 535 | MSG_DIRTY_MLOCKED_LRU, | ||
| 536 | MSG_CLEAN_MLOCKED_LRU, | ||
| 537 | MSG_DIRTY_UNEVICTABLE_LRU, | ||
| 538 | MSG_CLEAN_UNEVICTABLE_LRU, | ||
| 539 | MSG_DIRTY_LRU, | ||
| 540 | MSG_CLEAN_LRU, | ||
| 541 | MSG_TRUNCATED_LRU, | ||
| 542 | MSG_BUDDY, | ||
| 543 | MSG_BUDDY_2ND, | ||
| 544 | MSG_UNKNOWN, | ||
| 545 | }; | ||
| 546 | |||
| 547 | static const char * const action_page_types[] = { | ||
| 548 | [MSG_KERNEL] = "reserved kernel page", | ||
| 549 | [MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", | ||
| 550 | [MSG_SLAB] = "kernel slab page", | ||
| 551 | [MSG_DIFFERENT_COMPOUND] = "different compound page after locking", | ||
| 552 | [MSG_POISONED_HUGE] = "huge page already hardware poisoned", | ||
| 553 | [MSG_HUGE] = "huge page", | ||
| 554 | [MSG_FREE_HUGE] = "free huge page", | ||
| 555 | [MSG_UNMAP_FAILED] = "unmapping failed page", | ||
| 556 | [MSG_DIRTY_SWAPCACHE] = "dirty swapcache page", | ||
| 557 | [MSG_CLEAN_SWAPCACHE] = "clean swapcache page", | ||
| 558 | [MSG_DIRTY_MLOCKED_LRU] = "dirty mlocked LRU page", | ||
| 559 | [MSG_CLEAN_MLOCKED_LRU] = "clean mlocked LRU page", | ||
| 560 | [MSG_DIRTY_UNEVICTABLE_LRU] = "dirty unevictable LRU page", | ||
| 561 | [MSG_CLEAN_UNEVICTABLE_LRU] = "clean unevictable LRU page", | ||
| 562 | [MSG_DIRTY_LRU] = "dirty LRU page", | ||
| 563 | [MSG_CLEAN_LRU] = "clean LRU page", | ||
| 564 | [MSG_TRUNCATED_LRU] = "already truncated LRU page", | ||
| 565 | [MSG_BUDDY] = "free buddy page", | ||
| 566 | [MSG_BUDDY_2ND] = "free buddy page (2nd try)", | ||
| 567 | [MSG_UNKNOWN] = "unknown page", | ||
| 568 | }; | ||
| 569 | |||
| 524 | /* | 570 | /* |
| 525 | * XXX: It is possible that a page is isolated from LRU cache, | 571 | * XXX: It is possible that a page is isolated from LRU cache, |
| 526 | * and then kept in swap cache or failed to remove from page cache. | 572 | * and then kept in swap cache or failed to remove from page cache. |
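Editor's note: this hunk replaces free-form message strings with an action_page_type enum plus a string table indexed by designated initializers. A small self-contained sketch of that pattern is below; the enum members and strings are a trimmed, illustrative subset.

```c
/* Userspace sketch of the enum + designated-initializer table pattern
 * used for action_page_types[]. */
#include <stdio.h>

enum msg_type {
	MSG_KERNEL,
	MSG_SLAB,
	MSG_DIRTY_LRU,
	MSG_UNKNOWN,
};

static const char * const msg_names[] = {
	[MSG_KERNEL]    = "reserved kernel page",
	[MSG_SLAB]      = "kernel slab page",
	[MSG_DIRTY_LRU] = "dirty LRU page",
	[MSG_UNKNOWN]   = "unknown page",
};

int main(void)
{
	enum msg_type t = MSG_DIRTY_LRU;

	/* Same shape as action_result(): index the table by type instead
	 * of passing raw strings around. */
	printf("recovery action for %s: %s\n", msg_names[t], "Recovered");
	return 0;
}
```

The payoff of the table is that callers pass a compact enum value and the human-readable text lives in exactly one place.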
| @@ -777,10 +823,10 @@ static int me_huge_page(struct page *p, unsigned long pfn) | |||
| 777 | static struct page_state { | 823 | static struct page_state { |
| 778 | unsigned long mask; | 824 | unsigned long mask; |
| 779 | unsigned long res; | 825 | unsigned long res; |
| 780 | char *msg; | 826 | enum action_page_type type; |
| 781 | int (*action)(struct page *p, unsigned long pfn); | 827 | int (*action)(struct page *p, unsigned long pfn); |
| 782 | } error_states[] = { | 828 | } error_states[] = { |
| 783 | { reserved, reserved, "reserved kernel", me_kernel }, | 829 | { reserved, reserved, MSG_KERNEL, me_kernel }, |
| 784 | /* | 830 | /* |
| 785 | * free pages are specially detected outside this table: | 831 | * free pages are specially detected outside this table: |
| 786 | * PG_buddy pages only make a small fraction of all free pages. | 832 | * PG_buddy pages only make a small fraction of all free pages. |
| @@ -791,31 +837,31 @@ static struct page_state { | |||
| 791 | * currently unused objects without touching them. But just | 837 | * currently unused objects without touching them. But just |
| 792 | * treat it as standard kernel for now. | 838 | * treat it as standard kernel for now. |
| 793 | */ | 839 | */ |
| 794 | { slab, slab, "kernel slab", me_kernel }, | 840 | { slab, slab, MSG_SLAB, me_kernel }, |
| 795 | 841 | ||
| 796 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | 842 | #ifdef CONFIG_PAGEFLAGS_EXTENDED |
| 797 | { head, head, "huge", me_huge_page }, | 843 | { head, head, MSG_HUGE, me_huge_page }, |
| 798 | { tail, tail, "huge", me_huge_page }, | 844 | { tail, tail, MSG_HUGE, me_huge_page }, |
| 799 | #else | 845 | #else |
| 800 | { compound, compound, "huge", me_huge_page }, | 846 | { compound, compound, MSG_HUGE, me_huge_page }, |
| 801 | #endif | 847 | #endif |
| 802 | 848 | ||
| 803 | { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, | 849 | { sc|dirty, sc|dirty, MSG_DIRTY_SWAPCACHE, me_swapcache_dirty }, |
| 804 | { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, | 850 | { sc|dirty, sc, MSG_CLEAN_SWAPCACHE, me_swapcache_clean }, |
| 805 | 851 | ||
| 806 | { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, | 852 | { mlock|dirty, mlock|dirty, MSG_DIRTY_MLOCKED_LRU, me_pagecache_dirty }, |
| 807 | { mlock|dirty, mlock, "clean mlocked LRU", me_pagecache_clean }, | 853 | { mlock|dirty, mlock, MSG_CLEAN_MLOCKED_LRU, me_pagecache_clean }, |
| 808 | 854 | ||
| 809 | { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, | 855 | { unevict|dirty, unevict|dirty, MSG_DIRTY_UNEVICTABLE_LRU, me_pagecache_dirty }, |
| 810 | { unevict|dirty, unevict, "clean unevictable LRU", me_pagecache_clean }, | 856 | { unevict|dirty, unevict, MSG_CLEAN_UNEVICTABLE_LRU, me_pagecache_clean }, |
| 811 | 857 | ||
| 812 | { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, | 858 | { lru|dirty, lru|dirty, MSG_DIRTY_LRU, me_pagecache_dirty }, |
| 813 | { lru|dirty, lru, "clean LRU", me_pagecache_clean }, | 859 | { lru|dirty, lru, MSG_CLEAN_LRU, me_pagecache_clean }, |
| 814 | 860 | ||
| 815 | /* | 861 | /* |
| 816 | * Catchall entry: must be at end. | 862 | * Catchall entry: must be at end. |
| 817 | */ | 863 | */ |
| 818 | { 0, 0, "unknown page state", me_unknown }, | 864 | { 0, 0, MSG_UNKNOWN, me_unknown }, |
| 819 | }; | 865 | }; |
| 820 | 866 | ||
| 821 | #undef dirty | 867 | #undef dirty |
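Editor's note: the error_states[] entries above are matched by walking the table until (page_flags & mask) == res, with the {0, 0, ...} catch-all terminating the scan. A sketch of that mask/res lookup, with made-up flag bits and names:

```c
/* Sketch of the error_states[]-style lookup: walk a table until
 * (flags & mask) == res, with a catch-all entry last. Flag values
 * below are invented for illustration. */
#include <stdio.h>

#define F_DIRTY 0x1UL
#define F_LRU   0x2UL
#define F_SLAB  0x4UL

struct page_state_sketch {
	unsigned long mask;
	unsigned long res;
	const char *name;
};

static const struct page_state_sketch states[] = {
	{ F_SLAB,          F_SLAB,          "kernel slab page" },
	{ F_LRU | F_DIRTY, F_LRU | F_DIRTY, "dirty LRU page"   },
	{ F_LRU | F_DIRTY, F_LRU,           "clean LRU page"   },
	{ 0, 0, "unknown page" },	/* catch-all: must stay last */
};

int main(void)
{
	unsigned long flags = F_LRU;	/* a clean LRU page */
	const struct page_state_sketch *ps;

	for (ps = states; (flags & ps->mask) != ps->res; ps++)
		;
	printf("matched: %s\n", ps->name);
	return 0;
}
```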
| @@ -835,10 +881,10 @@ static struct page_state { | |||
| 835 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of | 881 | * "Dirty/Clean" indication is not 100% accurate due to the possibility of |
| 836 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). | 882 | * setting PG_dirty outside page lock. See also comment above set_page_dirty(). |
| 837 | */ | 883 | */ |
| 838 | static void action_result(unsigned long pfn, char *msg, int result) | 884 | static void action_result(unsigned long pfn, enum action_page_type type, int result) |
| 839 | { | 885 | { |
| 840 | pr_err("MCE %#lx: %s page recovery: %s\n", | 886 | pr_err("MCE %#lx: recovery action for %s: %s\n", |
| 841 | pfn, msg, action_name[result]); | 887 | pfn, action_page_types[type], action_name[result]); |
| 842 | } | 888 | } |
| 843 | 889 | ||
| 844 | static int page_action(struct page_state *ps, struct page *p, | 890 | static int page_action(struct page_state *ps, struct page *p, |
| @@ -854,11 +900,11 @@ static int page_action(struct page_state *ps, struct page *p, | |||
| 854 | count--; | 900 | count--; |
| 855 | if (count != 0) { | 901 | if (count != 0) { |
| 856 | printk(KERN_ERR | 902 | printk(KERN_ERR |
| 857 | "MCE %#lx: %s page still referenced by %d users\n", | 903 | "MCE %#lx: %s still referenced by %d users\n", |
| 858 | pfn, ps->msg, count); | 904 | pfn, action_page_types[ps->type], count); |
| 859 | result = FAILED; | 905 | result = FAILED; |
| 860 | } | 906 | } |
| 861 | action_result(pfn, ps->msg, result); | 907 | action_result(pfn, ps->type, result); |
| 862 | 908 | ||
| 863 | /* Could do more checks here if page looks ok */ | 909 | /* Could do more checks here if page looks ok */ |
| 864 | /* | 910 | /* |
| @@ -1106,7 +1152,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1106 | if (!(flags & MF_COUNT_INCREASED) && | 1152 | if (!(flags & MF_COUNT_INCREASED) && |
| 1107 | !get_page_unless_zero(hpage)) { | 1153 | !get_page_unless_zero(hpage)) { |
| 1108 | if (is_free_buddy_page(p)) { | 1154 | if (is_free_buddy_page(p)) { |
| 1109 | action_result(pfn, "free buddy", DELAYED); | 1155 | action_result(pfn, MSG_BUDDY, DELAYED); |
| 1110 | return 0; | 1156 | return 0; |
| 1111 | } else if (PageHuge(hpage)) { | 1157 | } else if (PageHuge(hpage)) { |
| 1112 | /* | 1158 | /* |
| @@ -1123,12 +1169,12 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1123 | } | 1169 | } |
| 1124 | set_page_hwpoison_huge_page(hpage); | 1170 | set_page_hwpoison_huge_page(hpage); |
| 1125 | res = dequeue_hwpoisoned_huge_page(hpage); | 1171 | res = dequeue_hwpoisoned_huge_page(hpage); |
| 1126 | action_result(pfn, "free huge", | 1172 | action_result(pfn, MSG_FREE_HUGE, |
| 1127 | res ? IGNORED : DELAYED); | 1173 | res ? IGNORED : DELAYED); |
| 1128 | unlock_page(hpage); | 1174 | unlock_page(hpage); |
| 1129 | return res; | 1175 | return res; |
| 1130 | } else { | 1176 | } else { |
| 1131 | action_result(pfn, "high order kernel", IGNORED); | 1177 | action_result(pfn, MSG_KERNEL_HIGH_ORDER, IGNORED); |
| 1132 | return -EBUSY; | 1178 | return -EBUSY; |
| 1133 | } | 1179 | } |
| 1134 | } | 1180 | } |
| @@ -1150,9 +1196,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1150 | */ | 1196 | */ |
| 1151 | if (is_free_buddy_page(p)) { | 1197 | if (is_free_buddy_page(p)) { |
| 1152 | if (flags & MF_COUNT_INCREASED) | 1198 | if (flags & MF_COUNT_INCREASED) |
| 1153 | action_result(pfn, "free buddy", DELAYED); | 1199 | action_result(pfn, MSG_BUDDY, DELAYED); |
| 1154 | else | 1200 | else |
| 1155 | action_result(pfn, "free buddy, 2nd try", DELAYED); | 1201 | action_result(pfn, MSG_BUDDY_2ND, |
| 1202 | DELAYED); | ||
| 1156 | return 0; | 1203 | return 0; |
| 1157 | } | 1204 | } |
| 1158 | } | 1205 | } |
| @@ -1165,7 +1212,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1165 | * If this happens just bail out. | 1212 | * If this happens just bail out. |
| 1166 | */ | 1213 | */ |
| 1167 | if (compound_head(p) != hpage) { | 1214 | if (compound_head(p) != hpage) { |
| 1168 | action_result(pfn, "different compound page after locking", IGNORED); | 1215 | action_result(pfn, MSG_DIFFERENT_COMPOUND, IGNORED); |
| 1169 | res = -EBUSY; | 1216 | res = -EBUSY; |
| 1170 | goto out; | 1217 | goto out; |
| 1171 | } | 1218 | } |
| @@ -1205,8 +1252,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1205 | * on the head page to show that the hugepage is hwpoisoned | 1252 | * on the head page to show that the hugepage is hwpoisoned |
| 1206 | */ | 1253 | */ |
| 1207 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { | 1254 | if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { |
| 1208 | action_result(pfn, "hugepage already hardware poisoned", | 1255 | action_result(pfn, MSG_POISONED_HUGE, IGNORED); |
| 1209 | IGNORED); | ||
| 1210 | unlock_page(hpage); | 1256 | unlock_page(hpage); |
| 1211 | put_page(hpage); | 1257 | put_page(hpage); |
| 1212 | return 0; | 1258 | return 0; |
| @@ -1235,7 +1281,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1235 | */ | 1281 | */ |
| 1236 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) | 1282 | if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) |
| 1237 | != SWAP_SUCCESS) { | 1283 | != SWAP_SUCCESS) { |
| 1238 | action_result(pfn, "unmapping failed", IGNORED); | 1284 | action_result(pfn, MSG_UNMAP_FAILED, IGNORED); |
| 1239 | res = -EBUSY; | 1285 | res = -EBUSY; |
| 1240 | goto out; | 1286 | goto out; |
| 1241 | } | 1287 | } |
| @@ -1244,7 +1290,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
| 1244 | * Torn down by someone else? | 1290 | * Torn down by someone else? |
| 1245 | */ | 1291 | */ |
| 1246 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { | 1292 | if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { |
| 1247 | action_result(pfn, "already truncated LRU", IGNORED); | 1293 | action_result(pfn, MSG_TRUNCATED_LRU, IGNORED); |
| 1248 | res = -EBUSY; | 1294 | res = -EBUSY; |
| 1249 | goto out; | 1295 | goto out; |
| 1250 | } | 1296 | } |
| @@ -1540,8 +1586,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
| 1540 | } | 1586 | } |
| 1541 | unlock_page(hpage); | 1587 | unlock_page(hpage); |
| 1542 | 1588 | ||
| 1543 | /* Keep page count to indicate a given hugepage is isolated. */ | 1589 | ret = isolate_huge_page(hpage, &pagelist); |
| 1544 | list_move(&hpage->lru, &pagelist); | 1590 | if (ret) { |
| 1591 | /* | ||
| 1592 | * get_any_page() and isolate_huge_page() take a refcount each, | ||
| 1593 | * so we need to drop one here. | ||
| 1594 | */ | ||
| 1595 | put_page(hpage); | ||
| 1596 | } else { | ||
| 1597 | pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn); | ||
| 1598 | return -EBUSY; | ||
| 1599 | } | ||
| 1600 | |||
| 1545 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, | 1601 | ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, |
| 1546 | MIGRATE_SYNC, MR_MEMORY_FAILURE); | 1602 | MIGRATE_SYNC, MR_MEMORY_FAILURE); |
| 1547 | if (ret) { | 1603 | if (ret) { |
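Editor's note: the soft_offline_huge_page() change above takes a reference via isolate_huge_page() and then drops the reference held from get_any_page(), so exactly one reference pins the isolated hugepage during migration. A generic sketch of that "two references, drop one after a successful handoff" pattern follows; the object type and helpers are made up.

```c
/* Generic refcount-handoff sketch; the object and helpers below are
 * stand-ins, not kernel code. */
#include <stdio.h>
#include <stdbool.h>

struct obj {
	int refcount;
	bool isolated;
};

static void get_obj(struct obj *o) { o->refcount++; }
static void put_obj(struct obj *o) { o->refcount--; }

/* Pretend isolation takes its own reference on success. */
static bool isolate_obj(struct obj *o)
{
	get_obj(o);
	o->isolated = true;
	return true;
}

int main(void)
{
	struct obj page = { .refcount = 0, .isolated = false };

	get_obj(&page);			/* lookup reference */
	if (isolate_obj(&page))
		put_obj(&page);		/* drop the lookup ref; isolation holds its own */

	printf("refcount=%d isolated=%d\n", page.refcount, page.isolated);
	return 0;
}
```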
diff --git a/mm/memory.c b/mm/memory.c index 8068893697bb..22e037e3364e 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -690,12 +690,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
| 690 | /* | 690 | /* |
| 691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y | 691 | * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y |
| 692 | */ | 692 | */ |
| 693 | if (vma->vm_ops) | 693 | pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", |
| 694 | printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", | 694 | vma->vm_file, |
| 695 | vma->vm_ops->fault); | 695 | vma->vm_ops ? vma->vm_ops->fault : NULL, |
| 696 | if (vma->vm_file) | 696 | vma->vm_file ? vma->vm_file->f_op->mmap : NULL, |
| 697 | printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", | 697 | mapping ? mapping->a_ops->readpage : NULL); |
| 698 | vma->vm_file->f_op->mmap); | ||
| 699 | dump_stack(); | 698 | dump_stack(); |
| 700 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); | 699 | add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
| 701 | } | 700 | } |
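Editor's note: print_bad_pte() now emits a single pr_alert() using NULL-safe ternaries instead of several conditional printk() calls. A plain-C sketch of collapsing conditional output into one line is below; the structures are simplified stand-ins and function pointers are shown as names.

```c
/* Sketch of folding several conditional printouts into one line with
 * NULL-safe ternaries; "vma"/"ops" here are stand-in structures. */
#include <stdio.h>
#include <stddef.h>

struct ops { const char *fault; const char *mmap; };
struct vma { const char *file; const struct ops *ops; };

static void print_bad_entry(const struct vma *vma)
{
	printf("file:%s fault:%s mmap:%s\n",
	       vma->file ? vma->file : "(none)",
	       vma->ops  ? vma->ops->fault : "(null)",
	       vma->ops  ? vma->ops->mmap  : "(null)");
}

int main(void)
{
	struct ops o = { .fault = "filemap_fault", .mmap = "generic_file_mmap" };
	struct vma anon = { .file = NULL, .ops = NULL };
	struct vma filebacked = { .file = "libc.so", .ops = &o };

	print_bad_entry(&anon);
	print_bad_entry(&filebacked);
	return 0;
}
```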
| @@ -1983,167 +1982,91 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, | |||
| 1983 | } | 1982 | } |
| 1984 | 1983 | ||
| 1985 | /* | 1984 | /* |
| 1986 | * This routine handles present pages, when users try to write | 1985 | * Handle write page faults for pages that can be reused in the current vma |
| 1987 | * to a shared page. It is done by copying the page to a new address | ||
| 1988 | * and decrementing the shared-page counter for the old page. | ||
| 1989 | * | 1986 | * |
| 1990 | * Note that this routine assumes that the protection checks have been | 1987 | * This can happen either due to the mapping being with the VM_SHARED flag, |
| 1991 | * done by the caller (the low-level page fault routine in most cases). | 1988 | * or due to us being the last reference standing to the page. In either |
| 1992 | * Thus we can safely just mark it writable once we've done any necessary | 1989 | * case, all we need to do here is to mark the page as writable and update |
| 1993 | * COW. | 1990 | * any related book-keeping. |
| 1994 | * | ||
| 1995 | * We also mark the page dirty at this point even though the page will | ||
| 1996 | * change only once the write actually happens. This avoids a few races, | ||
| 1997 | * and potentially makes it more efficient. | ||
| 1998 | * | ||
| 1999 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 2000 | * but allow concurrent faults), with pte both mapped and locked. | ||
| 2001 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 2002 | */ | 1991 | */ |
| 2003 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | 1992 | static inline int wp_page_reuse(struct mm_struct *mm, |
| 2004 | unsigned long address, pte_t *page_table, pmd_t *pmd, | 1993 | struct vm_area_struct *vma, unsigned long address, |
| 2005 | spinlock_t *ptl, pte_t orig_pte) | 1994 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, |
| 1995 | struct page *page, int page_mkwrite, | ||
| 1996 | int dirty_shared) | ||
| 2006 | __releases(ptl) | 1997 | __releases(ptl) |
| 2007 | { | 1998 | { |
| 2008 | struct page *old_page, *new_page = NULL; | ||
| 2009 | pte_t entry; | 1999 | pte_t entry; |
| 2010 | int ret = 0; | ||
| 2011 | int page_mkwrite = 0; | ||
| 2012 | bool dirty_shared = false; | ||
| 2013 | unsigned long mmun_start = 0; /* For mmu_notifiers */ | ||
| 2014 | unsigned long mmun_end = 0; /* For mmu_notifiers */ | ||
| 2015 | struct mem_cgroup *memcg; | ||
| 2016 | |||
| 2017 | old_page = vm_normal_page(vma, address, orig_pte); | ||
| 2018 | if (!old_page) { | ||
| 2019 | /* | ||
| 2020 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | ||
| 2021 | * VM_PFNMAP VMA. | ||
| 2022 | * | ||
| 2023 | * We should not cow pages in a shared writeable mapping. | ||
| 2024 | * Just mark the pages writable as we can't do any dirty | ||
| 2025 | * accounting on raw pfn maps. | ||
| 2026 | */ | ||
| 2027 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2028 | (VM_WRITE|VM_SHARED)) | ||
| 2029 | goto reuse; | ||
| 2030 | goto gotten; | ||
| 2031 | } | ||
| 2032 | |||
| 2033 | /* | 2000 | /* |
| 2034 | * Take out anonymous pages first, anonymous shared vmas are | 2001 | * Clear the pages cpupid information as the existing |
| 2035 | * not dirty accountable. | 2002 | * information potentially belongs to a now completely |
| 2003 | * unrelated process. | ||
| 2036 | */ | 2004 | */ |
| 2037 | if (PageAnon(old_page) && !PageKsm(old_page)) { | 2005 | if (page) |
| 2038 | if (!trylock_page(old_page)) { | 2006 | page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); |
| 2039 | page_cache_get(old_page); | ||
| 2040 | pte_unmap_unlock(page_table, ptl); | ||
| 2041 | lock_page(old_page); | ||
| 2042 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2043 | &ptl); | ||
| 2044 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2045 | unlock_page(old_page); | ||
| 2046 | goto unlock; | ||
| 2047 | } | ||
| 2048 | page_cache_release(old_page); | ||
| 2049 | } | ||
| 2050 | if (reuse_swap_page(old_page)) { | ||
| 2051 | /* | ||
| 2052 | * The page is all ours. Move it to our anon_vma so | ||
| 2053 | * the rmap code will not search our parent or siblings. | ||
| 2054 | * Protected against the rmap code by the page lock. | ||
| 2055 | */ | ||
| 2056 | page_move_anon_rmap(old_page, vma, address); | ||
| 2057 | unlock_page(old_page); | ||
| 2058 | goto reuse; | ||
| 2059 | } | ||
| 2060 | unlock_page(old_page); | ||
| 2061 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2062 | (VM_WRITE|VM_SHARED))) { | ||
| 2063 | page_cache_get(old_page); | ||
| 2064 | /* | ||
| 2065 | * Only catch write-faults on shared writable pages, | ||
| 2066 | * read-only shared pages can get COWed by | ||
| 2067 | * get_user_pages(.write=1, .force=1). | ||
| 2068 | */ | ||
| 2069 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | ||
| 2070 | int tmp; | ||
| 2071 | |||
| 2072 | pte_unmap_unlock(page_table, ptl); | ||
| 2073 | tmp = do_page_mkwrite(vma, old_page, address); | ||
| 2074 | if (unlikely(!tmp || (tmp & | ||
| 2075 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | ||
| 2076 | page_cache_release(old_page); | ||
| 2077 | return tmp; | ||
| 2078 | } | ||
| 2079 | /* | ||
| 2080 | * Since we dropped the lock we need to revalidate | ||
| 2081 | * the PTE as someone else may have changed it. If | ||
| 2082 | * they did, we just return, as we can count on the | ||
| 2083 | * MMU to tell us if they didn't also make it writable. | ||
| 2084 | */ | ||
| 2085 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2086 | &ptl); | ||
| 2087 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2088 | unlock_page(old_page); | ||
| 2089 | goto unlock; | ||
| 2090 | } | ||
| 2091 | page_mkwrite = 1; | ||
| 2092 | } | ||
| 2093 | |||
| 2094 | dirty_shared = true; | ||
| 2095 | |||
| 2096 | reuse: | ||
| 2097 | /* | ||
| 2098 | * Clear the pages cpupid information as the existing | ||
| 2099 | * information potentially belongs to a now completely | ||
| 2100 | * unrelated process. | ||
| 2101 | */ | ||
| 2102 | if (old_page) | ||
| 2103 | page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); | ||
| 2104 | |||
| 2105 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | ||
| 2106 | entry = pte_mkyoung(orig_pte); | ||
| 2107 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 2108 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | ||
| 2109 | update_mmu_cache(vma, address, page_table); | ||
| 2110 | pte_unmap_unlock(page_table, ptl); | ||
| 2111 | ret |= VM_FAULT_WRITE; | ||
| 2112 | 2007 | ||
| 2113 | if (dirty_shared) { | 2008 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 2114 | struct address_space *mapping; | 2009 | entry = pte_mkyoung(orig_pte); |
| 2115 | int dirtied; | 2010 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2011 | if (ptep_set_access_flags(vma, address, page_table, entry, 1)) | ||
| 2012 | update_mmu_cache(vma, address, page_table); | ||
| 2013 | pte_unmap_unlock(page_table, ptl); | ||
| 2116 | 2014 | ||
| 2117 | if (!page_mkwrite) | 2015 | if (dirty_shared) { |
| 2118 | lock_page(old_page); | 2016 | struct address_space *mapping; |
| 2017 | int dirtied; | ||
| 2119 | 2018 | ||
| 2120 | dirtied = set_page_dirty(old_page); | 2019 | if (!page_mkwrite) |
| 2121 | VM_BUG_ON_PAGE(PageAnon(old_page), old_page); | 2020 | lock_page(page); |
| 2122 | mapping = old_page->mapping; | ||
| 2123 | unlock_page(old_page); | ||
| 2124 | page_cache_release(old_page); | ||
| 2125 | 2021 | ||
| 2126 | if ((dirtied || page_mkwrite) && mapping) { | 2022 | dirtied = set_page_dirty(page); |
| 2127 | /* | 2023 | VM_BUG_ON_PAGE(PageAnon(page), page); |
| 2128 | * Some device drivers do not set page.mapping | 2024 | mapping = page->mapping; |
| 2129 | * but still dirty their pages | 2025 | unlock_page(page); |
| 2130 | */ | 2026 | page_cache_release(page); |
| 2131 | balance_dirty_pages_ratelimited(mapping); | ||
| 2132 | } | ||
| 2133 | 2027 | ||
| 2134 | if (!page_mkwrite) | 2028 | if ((dirtied || page_mkwrite) && mapping) { |
| 2135 | file_update_time(vma->vm_file); | 2029 | /* |
| 2030 | * Some device drivers do not set page.mapping | ||
| 2031 | * but still dirty their pages | ||
| 2032 | */ | ||
| 2033 | balance_dirty_pages_ratelimited(mapping); | ||
| 2136 | } | 2034 | } |
| 2137 | 2035 | ||
| 2138 | return ret; | 2036 | if (!page_mkwrite) |
| 2037 | file_update_time(vma->vm_file); | ||
| 2139 | } | 2038 | } |
| 2140 | 2039 | ||
| 2141 | /* | 2040 | return VM_FAULT_WRITE; |
| 2142 | * Ok, we need to copy. Oh, well.. | 2041 | } |
| 2143 | */ | 2042 | |
| 2144 | page_cache_get(old_page); | 2043 | /* |
| 2145 | gotten: | 2044 | * Handle the case of a page which we actually need to copy to a new page. |
| 2146 | pte_unmap_unlock(page_table, ptl); | 2045 | * |
| 2046 | * Called with mmap_sem locked and the old page referenced, but | ||
| 2047 | * without the ptl held. | ||
| 2048 | * | ||
| 2049 | * High level logic flow: | ||
| 2050 | * | ||
| 2051 | * - Allocate a page, copy the content of the old page to the new one. | ||
| 2052 | * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. | ||
| 2053 | * - Take the PTL. If the pte changed, bail out and release the allocated page | ||
| 2054 | * - If the pte is still the way we remember it, update the page table and all | ||
| 2055 | * relevant references. This includes dropping the reference the page-table | ||
| 2056 | * held to the old page, as well as updating the rmap. | ||
| 2057 | * - In any case, unlock the PTL and drop the reference we took to the old page. | ||
| 2058 | */ | ||
| 2059 | static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2060 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 2061 | pte_t orig_pte, struct page *old_page) | ||
| 2062 | { | ||
| 2063 | struct page *new_page = NULL; | ||
| 2064 | spinlock_t *ptl = NULL; | ||
| 2065 | pte_t entry; | ||
| 2066 | int page_copied = 0; | ||
| 2067 | const unsigned long mmun_start = address & PAGE_MASK; /* For mmu_notifiers */ | ||
| 2068 | const unsigned long mmun_end = mmun_start + PAGE_SIZE; /* For mmu_notifiers */ | ||
| 2069 | struct mem_cgroup *memcg; | ||
| 2147 | 2070 | ||
| 2148 | if (unlikely(anon_vma_prepare(vma))) | 2071 | if (unlikely(anon_vma_prepare(vma))) |
| 2149 | goto oom; | 2072 | goto oom; |
| @@ -2163,8 +2086,6 @@ gotten: | |||
| 2163 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) | 2086 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) |
| 2164 | goto oom_free_new; | 2087 | goto oom_free_new; |
| 2165 | 2088 | ||
| 2166 | mmun_start = address & PAGE_MASK; | ||
| 2167 | mmun_end = mmun_start + PAGE_SIZE; | ||
| 2168 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2089 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 2169 | 2090 | ||
| 2170 | /* | 2091 | /* |
| @@ -2177,8 +2098,9 @@ gotten: | |||
| 2177 | dec_mm_counter_fast(mm, MM_FILEPAGES); | 2098 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
| 2178 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2099 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2179 | } | 2100 | } |
| 2180 | } else | 2101 | } else { |
| 2181 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2102 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
| 2103 | } | ||
| 2182 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2104 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 2183 | entry = mk_pte(new_page, vma->vm_page_prot); | 2105 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 2184 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2106 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| @@ -2227,29 +2149,29 @@ gotten: | |||
| 2227 | 2149 | ||
| 2228 | /* Free the old page.. */ | 2150 | /* Free the old page.. */ |
| 2229 | new_page = old_page; | 2151 | new_page = old_page; |
| 2230 | ret |= VM_FAULT_WRITE; | 2152 | page_copied = 1; |
| 2231 | } else | 2153 | } else { |
| 2232 | mem_cgroup_cancel_charge(new_page, memcg); | 2154 | mem_cgroup_cancel_charge(new_page, memcg); |
| 2155 | } | ||
| 2233 | 2156 | ||
| 2234 | if (new_page) | 2157 | if (new_page) |
| 2235 | page_cache_release(new_page); | 2158 | page_cache_release(new_page); |
| 2236 | unlock: | 2159 | |
| 2237 | pte_unmap_unlock(page_table, ptl); | 2160 | pte_unmap_unlock(page_table, ptl); |
| 2238 | if (mmun_end > mmun_start) | 2161 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
| 2239 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 2240 | if (old_page) { | 2162 | if (old_page) { |
| 2241 | /* | 2163 | /* |
| 2242 | * Don't let another task, with possibly unlocked vma, | 2164 | * Don't let another task, with possibly unlocked vma, |
| 2243 | * keep the mlocked page. | 2165 | * keep the mlocked page. |
| 2244 | */ | 2166 | */ |
| 2245 | if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { | 2167 | if (page_copied && (vma->vm_flags & VM_LOCKED)) { |
| 2246 | lock_page(old_page); /* LRU manipulation */ | 2168 | lock_page(old_page); /* LRU manipulation */ |
| 2247 | munlock_vma_page(old_page); | 2169 | munlock_vma_page(old_page); |
| 2248 | unlock_page(old_page); | 2170 | unlock_page(old_page); |
| 2249 | } | 2171 | } |
| 2250 | page_cache_release(old_page); | 2172 | page_cache_release(old_page); |
| 2251 | } | 2173 | } |
| 2252 | return ret; | 2174 | return page_copied ? VM_FAULT_WRITE : 0; |
| 2253 | oom_free_new: | 2175 | oom_free_new: |
| 2254 | page_cache_release(new_page); | 2176 | page_cache_release(new_page); |
| 2255 | oom: | 2177 | oom: |
| @@ -2258,6 +2180,179 @@ oom: | |||
| 2258 | return VM_FAULT_OOM; | 2180 | return VM_FAULT_OOM; |
| 2259 | } | 2181 | } |
| 2260 | 2182 | ||
| 2183 | /* | ||
| 2184 | * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED | ||
| 2185 | * mapping | ||
| 2186 | */ | ||
| 2187 | static int wp_pfn_shared(struct mm_struct *mm, | ||
| 2188 | struct vm_area_struct *vma, unsigned long address, | ||
| 2189 | pte_t *page_table, spinlock_t *ptl, pte_t orig_pte, | ||
| 2190 | pmd_t *pmd) | ||
| 2191 | { | ||
| 2192 | if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { | ||
| 2193 | struct vm_fault vmf = { | ||
| 2194 | .page = NULL, | ||
| 2195 | .pgoff = linear_page_index(vma, address), | ||
| 2196 | .virtual_address = (void __user *)(address & PAGE_MASK), | ||
| 2197 | .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, | ||
| 2198 | }; | ||
| 2199 | int ret; | ||
| 2200 | |||
| 2201 | pte_unmap_unlock(page_table, ptl); | ||
| 2202 | ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); | ||
| 2203 | if (ret & VM_FAULT_ERROR) | ||
| 2204 | return ret; | ||
| 2205 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2206 | /* | ||
| 2207 | * We might have raced with another page fault while we | ||
| 2208 | * released the pte_offset_map_lock. | ||
| 2209 | */ | ||
| 2210 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2211 | pte_unmap_unlock(page_table, ptl); | ||
| 2212 | return 0; | ||
| 2213 | } | ||
| 2214 | } | ||
| 2215 | return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, | ||
| 2216 | NULL, 0, 0); | ||
| 2217 | } | ||
| 2218 | |||
| 2219 | static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2220 | unsigned long address, pte_t *page_table, | ||
| 2221 | pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte, | ||
| 2222 | struct page *old_page) | ||
| 2223 | __releases(ptl) | ||
| 2224 | { | ||
| 2225 | int page_mkwrite = 0; | ||
| 2226 | |||
| 2227 | page_cache_get(old_page); | ||
| 2228 | |||
| 2229 | /* | ||
| 2230 | * Only catch write-faults on shared writable pages, | ||
| 2231 | * read-only shared pages can get COWed by | ||
| 2232 | * get_user_pages(.write=1, .force=1). | ||
| 2233 | */ | ||
| 2234 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { | ||
| 2235 | int tmp; | ||
| 2236 | |||
| 2237 | pte_unmap_unlock(page_table, ptl); | ||
| 2238 | tmp = do_page_mkwrite(vma, old_page, address); | ||
| 2239 | if (unlikely(!tmp || (tmp & | ||
| 2240 | (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { | ||
| 2241 | page_cache_release(old_page); | ||
| 2242 | return tmp; | ||
| 2243 | } | ||
| 2244 | /* | ||
| 2245 | * Since we dropped the lock we need to revalidate | ||
| 2246 | * the PTE as someone else may have changed it. If | ||
| 2247 | * they did, we just return, as we can count on the | ||
| 2248 | * MMU to tell us if they didn't also make it writable. | ||
| 2249 | */ | ||
| 2250 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2251 | &ptl); | ||
| 2252 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2253 | unlock_page(old_page); | ||
| 2254 | pte_unmap_unlock(page_table, ptl); | ||
| 2255 | page_cache_release(old_page); | ||
| 2256 | return 0; | ||
| 2257 | } | ||
| 2258 | page_mkwrite = 1; | ||
| 2259 | } | ||
| 2260 | |||
| 2261 | return wp_page_reuse(mm, vma, address, page_table, ptl, | ||
| 2262 | orig_pte, old_page, page_mkwrite, 1); | ||
| 2263 | } | ||
| 2264 | |||
| 2265 | /* | ||
| 2266 | * This routine handles present pages, when users try to write | ||
| 2267 | * to a shared page. It is done by copying the page to a new address | ||
| 2268 | * and decrementing the shared-page counter for the old page. | ||
| 2269 | * | ||
| 2270 | * Note that this routine assumes that the protection checks have been | ||
| 2271 | * done by the caller (the low-level page fault routine in most cases). | ||
| 2272 | * Thus we can safely just mark it writable once we've done any necessary | ||
| 2273 | * COW. | ||
| 2274 | * | ||
| 2275 | * We also mark the page dirty at this point even though the page will | ||
| 2276 | * change only once the write actually happens. This avoids a few races, | ||
| 2277 | * and potentially makes it more efficient. | ||
| 2278 | * | ||
| 2279 | * We enter with non-exclusive mmap_sem (to exclude vma changes, | ||
| 2280 | * but allow concurrent faults), with pte both mapped and locked. | ||
| 2281 | * We return with mmap_sem still held, but pte unmapped and unlocked. | ||
| 2282 | */ | ||
| 2283 | static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | ||
| 2284 | unsigned long address, pte_t *page_table, pmd_t *pmd, | ||
| 2285 | spinlock_t *ptl, pte_t orig_pte) | ||
| 2286 | __releases(ptl) | ||
| 2287 | { | ||
| 2288 | struct page *old_page; | ||
| 2289 | |||
| 2290 | old_page = vm_normal_page(vma, address, orig_pte); | ||
| 2291 | if (!old_page) { | ||
| 2292 | /* | ||
| 2293 | * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a | ||
| 2294 | * VM_PFNMAP VMA. | ||
| 2295 | * | ||
| 2296 | * We should not cow pages in a shared writeable mapping. | ||
| 2297 | * Just mark the pages writable and/or call ops->pfn_mkwrite. | ||
| 2298 | */ | ||
| 2299 | if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2300 | (VM_WRITE|VM_SHARED)) | ||
| 2301 | return wp_pfn_shared(mm, vma, address, page_table, ptl, | ||
| 2302 | orig_pte, pmd); | ||
| 2303 | |||
| 2304 | pte_unmap_unlock(page_table, ptl); | ||
| 2305 | return wp_page_copy(mm, vma, address, page_table, pmd, | ||
| 2306 | orig_pte, old_page); | ||
| 2307 | } | ||
| 2308 | |||
| 2309 | /* | ||
| 2310 | * Take out anonymous pages first, anonymous shared vmas are | ||
| 2311 | * not dirty accountable. | ||
| 2312 | */ | ||
| 2313 | if (PageAnon(old_page) && !PageKsm(old_page)) { | ||
| 2314 | if (!trylock_page(old_page)) { | ||
| 2315 | page_cache_get(old_page); | ||
| 2316 | pte_unmap_unlock(page_table, ptl); | ||
| 2317 | lock_page(old_page); | ||
| 2318 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
| 2319 | &ptl); | ||
| 2320 | if (!pte_same(*page_table, orig_pte)) { | ||
| 2321 | unlock_page(old_page); | ||
| 2322 | pte_unmap_unlock(page_table, ptl); | ||
| 2323 | page_cache_release(old_page); | ||
| 2324 | return 0; | ||
| 2325 | } | ||
| 2326 | page_cache_release(old_page); | ||
| 2327 | } | ||
| 2328 | if (reuse_swap_page(old_page)) { | ||
| 2329 | /* | ||
| 2330 | * The page is all ours. Move it to our anon_vma so | ||
| 2331 | * the rmap code will not search our parent or siblings. | ||
| 2332 | * Protected against the rmap code by the page lock. | ||
| 2333 | */ | ||
| 2334 | page_move_anon_rmap(old_page, vma, address); | ||
| 2335 | unlock_page(old_page); | ||
| 2336 | return wp_page_reuse(mm, vma, address, page_table, ptl, | ||
| 2337 | orig_pte, old_page, 0, 0); | ||
| 2338 | } | ||
| 2339 | unlock_page(old_page); | ||
| 2340 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | ||
| 2341 | (VM_WRITE|VM_SHARED))) { | ||
| 2342 | return wp_page_shared(mm, vma, address, page_table, pmd, | ||
| 2343 | ptl, orig_pte, old_page); | ||
| 2344 | } | ||
| 2345 | |||
| 2346 | /* | ||
| 2347 | * Ok, we need to copy. Oh, well.. | ||
| 2348 | */ | ||
| 2349 | page_cache_get(old_page); | ||
| 2350 | |||
| 2351 | pte_unmap_unlock(page_table, ptl); | ||
| 2352 | return wp_page_copy(mm, vma, address, page_table, pmd, | ||
| 2353 | orig_pte, old_page); | ||
| 2354 | } | ||
| 2355 | |||
| 2261 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, | 2356 | static void unmap_mapping_range_vma(struct vm_area_struct *vma, |
| 2262 | unsigned long start_addr, unsigned long end_addr, | 2357 | unsigned long start_addr, unsigned long end_addr, |
| 2263 | struct zap_details *details) | 2358 | struct zap_details *details) |
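Editor's note: the large memory.c hunk above splits do_wp_page() into wp_page_reuse(), wp_page_copy(), wp_pfn_shared() and wp_page_shared(). The following is a heavily compressed sketch of the resulting decision flow, with simplified flags and no locking, refcounting or mkwrite callbacks; it is not the kernel implementation.

```c
/* Compressed sketch of the refactored write-fault decision flow:
 * reuse, copy-on-write, or shared-writable handling. All types below
 * are simplified stand-ins. */
#include <stdio.h>
#include <stdbool.h>

#define VM_WRITE  0x1UL
#define VM_SHARED 0x2UL

enum wp_result { WP_REUSE, WP_COPY, WP_SHARED, WP_PFN_SHARED };

struct fake_page { bool anon; bool exclusive; };

static enum wp_result classify_wp_fault(unsigned long vm_flags,
					const struct fake_page *page)
{
	bool shared_writable =
		(vm_flags & (VM_WRITE | VM_SHARED)) == (VM_WRITE | VM_SHARED);

	if (!page)			/* pfn/mixed map without a struct page */
		return shared_writable ? WP_PFN_SHARED : WP_COPY;
	if (page->anon)			/* anonymous: reuse only if exclusive */
		return page->exclusive ? WP_REUSE : WP_COPY;
	return shared_writable ? WP_SHARED : WP_COPY;
}

int main(void)
{
	struct fake_page anon_excl = { .anon = true,  .exclusive = true  };
	struct fake_page file_page = { .anon = false, .exclusive = false };

	printf("%d %d %d\n",
	       classify_wp_fault(VM_WRITE, &anon_excl),			/* WP_REUSE */
	       classify_wp_fault(VM_WRITE | VM_SHARED, &file_page),	/* WP_SHARED */
	       classify_wp_fault(VM_WRITE, &file_page));		/* WP_COPY */
	return 0;
}
```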
| @@ -2784,7 +2879,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, | |||
| 2784 | struct vm_fault vmf; | 2879 | struct vm_fault vmf; |
| 2785 | int off; | 2880 | int off; |
| 2786 | 2881 | ||
| 2787 | nr_pages = ACCESS_ONCE(fault_around_bytes) >> PAGE_SHIFT; | 2882 | nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; |
| 2788 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; | 2883 | mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; |
| 2789 | 2884 | ||
| 2790 | start_addr = max(address & mask, vma->vm_start); | 2885 | start_addr = max(address & mask, vma->vm_start); |
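Editor's note: do_fault_around() now snapshots fault_around_bytes with READ_ONCE() and derives an alignment mask from it. A worked example of that bit arithmetic, assuming 4 KB pages and the default 64 KB window:

```c
/* Worked example of the fault-around mask arithmetic, assuming 4 KB
 * pages and a 64 KB fault_around_bytes value. */
#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned long fault_around_bytes = 65536;
	unsigned long address = 0x7f1234567abcUL;

	unsigned long nr_pages = fault_around_bytes >> PAGE_SHIFT;
	unsigned long mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long start = address & mask;

	/* nr_pages = 16; start is the faulting address rounded down to a
	 * 64 KB boundary, i.e. the beginning of the fault-around window. */
	printf("nr_pages=%lu mask=%#lx start=%#lx\n", nr_pages, mask, start);
	return 0;
}
```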
| @@ -3035,6 +3130,7 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3035 | int last_cpupid; | 3130 | int last_cpupid; |
| 3036 | int target_nid; | 3131 | int target_nid; |
| 3037 | bool migrated = false; | 3132 | bool migrated = false; |
| 3133 | bool was_writable = pte_write(pte); | ||
| 3038 | int flags = 0; | 3134 | int flags = 0; |
| 3039 | 3135 | ||
| 3040 | /* A PROT_NONE fault should not end up here */ | 3136 | /* A PROT_NONE fault should not end up here */ |
| @@ -3059,6 +3155,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3059 | /* Make it present again */ | 3155 | /* Make it present again */ |
| 3060 | pte = pte_modify(pte, vma->vm_page_prot); | 3156 | pte = pte_modify(pte, vma->vm_page_prot); |
| 3061 | pte = pte_mkyoung(pte); | 3157 | pte = pte_mkyoung(pte); |
| 3158 | if (was_writable) | ||
| 3159 | pte = pte_mkwrite(pte); | ||
| 3062 | set_pte_at(mm, addr, ptep, pte); | 3160 | set_pte_at(mm, addr, ptep, pte); |
| 3063 | update_mmu_cache(vma, addr, ptep); | 3161 | update_mmu_cache(vma, addr, ptep); |
| 3064 | 3162 | ||
| @@ -3069,11 +3167,14 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3069 | } | 3167 | } |
| 3070 | 3168 | ||
| 3071 | /* | 3169 | /* |
| 3072 | * Avoid grouping on DSO/COW pages in specific and RO pages | 3170 | * Avoid grouping on RO pages in general. RO pages shouldn't hurt as |
| 3073 | * in general, RO pages shouldn't hurt as much anyway since | 3171 | * much anyway since they can be in shared cache state. This misses |
| 3074 | * they can be in shared cache state. | 3172 | * the case where a mapping is writable but the process never writes |
| 3173 | * to it but pte_write gets cleared during protection updates and | ||
| 3174 | * pte_dirty has unpredictable behaviour between PTE scan updates, | ||
| 3175 | * background writeback, dirty balancing and application behaviour. | ||
| 3075 | */ | 3176 | */ |
| 3076 | if (!pte_write(pte)) | 3177 | if (!(vma->vm_flags & VM_WRITE)) |
| 3077 | flags |= TNF_NO_GROUP; | 3178 | flags |= TNF_NO_GROUP; |
| 3078 | 3179 | ||
| 3079 | /* | 3180 | /* |
| @@ -3097,7 +3198,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3097 | if (migrated) { | 3198 | if (migrated) { |
| 3098 | page_nid = target_nid; | 3199 | page_nid = target_nid; |
| 3099 | flags |= TNF_MIGRATED; | 3200 | flags |= TNF_MIGRATED; |
| 3100 | } | 3201 | } else |
| 3202 | flags |= TNF_MIGRATE_FAIL; | ||
| 3101 | 3203 | ||
| 3102 | out: | 3204 | out: |
| 3103 | if (page_nid != -1) | 3205 | if (page_nid != -1) |
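Editor's note: do_numa_page() now records was_writable before rebuilding the pte and re-applies pte_mkwrite() afterwards, so a NUMA hinting fault no longer silently strips write permission. A tiny sketch of the save-and-restore-a-bit pattern, with invented flag values:

```c
/* Sketch of the "remember a cleared permission bit and restore it"
 * pattern behind was_writable; flag values are made up. */
#include <stdio.h>
#include <stdbool.h>

#define PTE_PRESENT 0x1UL
#define PTE_WRITE   0x2UL
#define PTE_YOUNG   0x4UL

int main(void)
{
	unsigned long pte = PTE_PRESENT | PTE_WRITE;

	bool was_writable = pte & PTE_WRITE;

	/* NUMA hinting temporarily strips the entry... */
	pte &= ~(PTE_WRITE | PTE_PRESENT);

	/* ...and the fault handler rebuilds it, restoring write
	 * permission only if it was there before. */
	pte |= PTE_PRESENT | PTE_YOUNG;
	if (was_writable)
		pte |= PTE_WRITE;

	printf("pte=%#lx\n", pte);	/* 0x7: present | write | young */
	return 0;
}
```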
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9fab10795bea..457bde530cbe 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -104,7 +104,7 @@ void put_online_mems(void) | |||
| 104 | 104 | ||
| 105 | } | 105 | } |
| 106 | 106 | ||
| 107 | static void mem_hotplug_begin(void) | 107 | void mem_hotplug_begin(void) |
| 108 | { | 108 | { |
| 109 | mem_hotplug.active_writer = current; | 109 | mem_hotplug.active_writer = current; |
| 110 | 110 | ||
| @@ -119,7 +119,7 @@ static void mem_hotplug_begin(void) | |||
| 119 | } | 119 | } |
| 120 | } | 120 | } |
| 121 | 121 | ||
| 122 | static void mem_hotplug_done(void) | 122 | void mem_hotplug_done(void) |
| 123 | { | 123 | { |
| 124 | mem_hotplug.active_writer = NULL; | 124 | mem_hotplug.active_writer = NULL; |
| 125 | mutex_unlock(&mem_hotplug.lock); | 125 | mutex_unlock(&mem_hotplug.lock); |
| @@ -502,7 +502,7 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | |||
| 502 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 502 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
| 503 | 503 | ||
| 504 | for (i = start_sec; i <= end_sec; i++) { | 504 | for (i = start_sec; i <= end_sec; i++) { |
| 505 | err = __add_section(nid, zone, i << PFN_SECTION_SHIFT); | 505 | err = __add_section(nid, zone, section_nr_to_pfn(i)); |
| 506 | 506 | ||
| 507 | /* | 507 | /* |
| 508 | * EEXIST is finally dealt with by ioresource collision | 508 | * EEXIST is finally dealt with by ioresource collision |
| @@ -959,6 +959,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
| 959 | } | 959 | } |
| 960 | 960 | ||
| 961 | 961 | ||
| 962 | /* Must be protected by mem_hotplug_begin() */ | ||
| 962 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) | 963 | int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) |
| 963 | { | 964 | { |
| 964 | unsigned long flags; | 965 | unsigned long flags; |
| @@ -969,7 +970,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 969 | int ret; | 970 | int ret; |
| 970 | struct memory_notify arg; | 971 | struct memory_notify arg; |
| 971 | 972 | ||
| 972 | mem_hotplug_begin(); | ||
| 973 | /* | 973 | /* |
| 974 | * This doesn't need a lock to do pfn_to_page(). | 974 | * This doesn't need a lock to do pfn_to_page(). |
| 975 | * The section can't be removed here because of the | 975 | * The section can't be removed here because of the |
| @@ -977,21 +977,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 977 | */ | 977 | */ |
| 978 | zone = page_zone(pfn_to_page(pfn)); | 978 | zone = page_zone(pfn_to_page(pfn)); |
| 979 | 979 | ||
| 980 | ret = -EINVAL; | ||
| 981 | if ((zone_idx(zone) > ZONE_NORMAL || | 980 | if ((zone_idx(zone) > ZONE_NORMAL || |
| 982 | online_type == MMOP_ONLINE_MOVABLE) && | 981 | online_type == MMOP_ONLINE_MOVABLE) && |
| 983 | !can_online_high_movable(zone)) | 982 | !can_online_high_movable(zone)) |
| 984 | goto out; | 983 | return -EINVAL; |
| 985 | 984 | ||
| 986 | if (online_type == MMOP_ONLINE_KERNEL && | 985 | if (online_type == MMOP_ONLINE_KERNEL && |
| 987 | zone_idx(zone) == ZONE_MOVABLE) { | 986 | zone_idx(zone) == ZONE_MOVABLE) { |
| 988 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) | 987 | if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) |
| 989 | goto out; | 988 | return -EINVAL; |
| 990 | } | 989 | } |
| 991 | if (online_type == MMOP_ONLINE_MOVABLE && | 990 | if (online_type == MMOP_ONLINE_MOVABLE && |
| 992 | zone_idx(zone) == ZONE_MOVABLE - 1) { | 991 | zone_idx(zone) == ZONE_MOVABLE - 1) { |
| 993 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) | 992 | if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) |
| 994 | goto out; | 993 | return -EINVAL; |
| 995 | } | 994 | } |
| 996 | 995 | ||
| 997 | /* Previous code may changed the zone of the pfn range */ | 996 | /* Previous code may changed the zone of the pfn range */ |
| @@ -1007,7 +1006,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 1007 | ret = notifier_to_errno(ret); | 1006 | ret = notifier_to_errno(ret); |
| 1008 | if (ret) { | 1007 | if (ret) { |
| 1009 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1008 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
| 1010 | goto out; | 1009 | return ret; |
| 1011 | } | 1010 | } |
| 1012 | /* | 1011 | /* |
| 1013 | * If this zone is not populated, then it is not in zonelist. | 1012 | * If this zone is not populated, then it is not in zonelist. |
| @@ -1031,7 +1030,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 1031 | (((unsigned long long) pfn + nr_pages) | 1030 | (((unsigned long long) pfn + nr_pages) |
| 1032 | << PAGE_SHIFT) - 1); | 1031 | << PAGE_SHIFT) - 1); |
| 1033 | memory_notify(MEM_CANCEL_ONLINE, &arg); | 1032 | memory_notify(MEM_CANCEL_ONLINE, &arg); |
| 1034 | goto out; | 1033 | return ret; |
| 1035 | } | 1034 | } |
| 1036 | 1035 | ||
| 1037 | zone->present_pages += onlined_pages; | 1036 | zone->present_pages += onlined_pages; |
| @@ -1061,9 +1060,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 1061 | 1060 | ||
| 1062 | if (onlined_pages) | 1061 | if (onlined_pages) |
| 1063 | memory_notify(MEM_ONLINE, &arg); | 1062 | memory_notify(MEM_ONLINE, &arg); |
| 1064 | out: | 1063 | return 0; |
| 1065 | mem_hotplug_done(); | ||
| 1066 | return ret; | ||
| 1067 | } | 1064 | } |
| 1068 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ | 1065 | #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ |
| 1069 | 1066 | ||
| @@ -1092,6 +1089,10 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
| 1092 | return NULL; | 1089 | return NULL; |
| 1093 | 1090 | ||
| 1094 | arch_refresh_nodedata(nid, pgdat); | 1091 | arch_refresh_nodedata(nid, pgdat); |
| 1092 | } else { | ||
| 1093 | /* Reset the nr_zones and classzone_idx to 0 before reuse */ | ||
| 1094 | pgdat->nr_zones = 0; | ||
| 1095 | pgdat->classzone_idx = 0; | ||
| 1095 | } | 1096 | } |
| 1096 | 1097 | ||
| 1097 | /* we can use NODE_DATA(nid) from here */ | 1098 | /* we can use NODE_DATA(nid) from here */ |
| @@ -1372,7 +1373,7 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end) | |||
| 1372 | if (PageLRU(page)) | 1373 | if (PageLRU(page)) |
| 1373 | return pfn; | 1374 | return pfn; |
| 1374 | if (PageHuge(page)) { | 1375 | if (PageHuge(page)) { |
| 1375 | if (is_hugepage_active(page)) | 1376 | if (page_huge_active(page)) |
| 1376 | return pfn; | 1377 | return pfn; |
| 1377 | else | 1378 | else |
| 1378 | pfn = round_up(pfn + 1, | 1379 | pfn = round_up(pfn + 1, |
| @@ -1684,21 +1685,18 @@ static int __ref __offline_pages(unsigned long start_pfn, | |||
| 1684 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | 1685 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) |
| 1685 | return -EINVAL; | 1686 | return -EINVAL; |
| 1686 | 1687 | ||
| 1687 | mem_hotplug_begin(); | ||
| 1688 | |||
| 1689 | zone = page_zone(pfn_to_page(start_pfn)); | 1688 | zone = page_zone(pfn_to_page(start_pfn)); |
| 1690 | node = zone_to_nid(zone); | 1689 | node = zone_to_nid(zone); |
| 1691 | nr_pages = end_pfn - start_pfn; | 1690 | nr_pages = end_pfn - start_pfn; |
| 1692 | 1691 | ||
| 1693 | ret = -EINVAL; | ||
| 1694 | if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) | 1692 | if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) |
| 1695 | goto out; | 1693 | return -EINVAL; |
| 1696 | 1694 | ||
| 1697 | /* set above range as isolated */ | 1695 | /* set above range as isolated */ |
| 1698 | ret = start_isolate_page_range(start_pfn, end_pfn, | 1696 | ret = start_isolate_page_range(start_pfn, end_pfn, |
| 1699 | MIGRATE_MOVABLE, true); | 1697 | MIGRATE_MOVABLE, true); |
| 1700 | if (ret) | 1698 | if (ret) |
| 1701 | goto out; | 1699 | return ret; |
| 1702 | 1700 | ||
| 1703 | arg.start_pfn = start_pfn; | 1701 | arg.start_pfn = start_pfn; |
| 1704 | arg.nr_pages = nr_pages; | 1702 | arg.nr_pages = nr_pages; |
| @@ -1791,7 +1789,6 @@ repeat: | |||
| 1791 | writeback_set_ratelimit(); | 1789 | writeback_set_ratelimit(); |
| 1792 | 1790 | ||
| 1793 | memory_notify(MEM_OFFLINE, &arg); | 1791 | memory_notify(MEM_OFFLINE, &arg); |
| 1794 | mem_hotplug_done(); | ||
| 1795 | return 0; | 1792 | return 0; |
| 1796 | 1793 | ||
| 1797 | failed_removal: | 1794 | failed_removal: |
| @@ -1801,12 +1798,10 @@ failed_removal: | |||
| 1801 | memory_notify(MEM_CANCEL_OFFLINE, &arg); | 1798 | memory_notify(MEM_CANCEL_OFFLINE, &arg); |
| 1802 | /* pushback to free area */ | 1799 | /* pushback to free area */ |
| 1803 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1800 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
| 1804 | |||
| 1805 | out: | ||
| 1806 | mem_hotplug_done(); | ||
| 1807 | return ret; | 1801 | return ret; |
| 1808 | } | 1802 | } |
| 1809 | 1803 | ||
| 1804 | /* Must be protected by mem_hotplug_begin() */ | ||
| 1810 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) | 1805 | int offline_pages(unsigned long start_pfn, unsigned long nr_pages) |
| 1811 | { | 1806 | { |
| 1812 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); | 1807 | return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); |
| @@ -1977,15 +1972,6 @@ void try_offline_node(int nid) | |||
| 1977 | if (is_vmalloc_addr(zone->wait_table)) | 1972 | if (is_vmalloc_addr(zone->wait_table)) |
| 1978 | vfree(zone->wait_table); | 1973 | vfree(zone->wait_table); |
| 1979 | } | 1974 | } |
| 1980 | |||
| 1981 | /* | ||
| 1982 | * Since there is no way to guarentee the address of pgdat/zone is not | ||
| 1983 | * on stack of any kernel threads or used by other kernel objects | ||
| 1984 | * without reference counting or other symchronizing method, do not | ||
| 1985 | * reset node_data and free pgdat here. Just reset it to 0 and reuse | ||
| 1986 | * the memory when the node is online again. | ||
| 1987 | */ | ||
| 1988 | memset(pgdat, 0, sizeof(*pgdat)); | ||
| 1989 | } | 1975 | } |
| 1990 | EXPORT_SYMBOL(try_offline_node); | 1976 | EXPORT_SYMBOL(try_offline_node); |
| 1991 | 1977 | ||
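Editor's note: the memory_hotplug.c changes hoist mem_hotplug_begin()/mem_hotplug_done() out of online_pages() and offline_pages() and into their callers, so error paths can return directly instead of jumping to an unlock label. A pthread sketch of that caller-held-lock pattern follows; the names and checks are illustrative only.

```c
/* Sketch of moving lock acquisition from callee to caller so the
 * callee can bail out with plain returns. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

/* Must be called with hotplug_lock held, mirroring the new
 * "Must be protected by mem_hotplug_begin()" contract. */
static int do_online(int nr_pages)
{
	if (nr_pages <= 0)
		return -1;		/* early return, no unlock needed here */
	printf("onlined %d pages\n", nr_pages);
	return 0;
}

int main(void)
{
	int ret;

	pthread_mutex_lock(&hotplug_lock);	/* caller takes the lock */
	ret = do_online(128);
	pthread_mutex_unlock(&hotplug_lock);	/* caller releases it */

	return ret ? 1 : 0;
}
```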
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4721046a134a..ede26291d4aa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -945,7 +945,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x | |||
| 945 | return alloc_huge_page_node(page_hstate(compound_head(page)), | 945 | return alloc_huge_page_node(page_hstate(compound_head(page)), |
| 946 | node); | 946 | node); |
| 947 | else | 947 | else |
| 948 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); | 948 | return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE | |
| 949 | __GFP_THISNODE, 0); | ||
| 949 | } | 950 | } |
| 950 | 951 | ||
| 951 | /* | 952 | /* |
| @@ -1985,7 +1986,8 @@ retry_cpuset: | |||
| 1985 | nmask = policy_nodemask(gfp, pol); | 1986 | nmask = policy_nodemask(gfp, pol); |
| 1986 | if (!nmask || node_isset(node, *nmask)) { | 1987 | if (!nmask || node_isset(node, *nmask)) { |
| 1987 | mpol_cond_put(pol); | 1988 | mpol_cond_put(pol); |
| 1988 | page = alloc_pages_exact_node(node, gfp, order); | 1989 | page = alloc_pages_exact_node(node, |
| 1990 | gfp | __GFP_THISNODE, order); | ||
| 1989 | goto out; | 1991 | goto out; |
| 1990 | } | 1992 | } |
| 1991 | } | 1993 | } |
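Editor's note: the mempolicy.c hunks OR __GFP_THISNODE into the gfp mask so migration-target pages are allocated strictly on the requested node rather than falling back elsewhere. A trivial sketch of composing such a strictness flag into an allocation mask; the flag values below are invented, not the kernel's.

```c
/* Illustration only: composing a "stay on this node" flag into an
 * allocation mask. Values are invented for the sketch. */
#include <stdio.h>

#define GFP_HIGHUSER_MOVABLE_SKETCH 0x10U
#define GFP_THISNODE_SKETCH         0x40U	/* forbid falling back to other nodes */

int main(void)
{
	unsigned int gfp = GFP_HIGHUSER_MOVABLE_SKETCH | GFP_THISNODE_SKETCH;

	printf("gfp mask: %#x (strict-node? %s)\n",
	       gfp, (gfp & GFP_THISNODE_SKETCH) ? "yes" : "no");
	return 0;
}
```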
diff --git a/mm/mempool.c b/mm/mempool.c index e209c98c7203..2cc08de8b1db 100644 --- a/mm/mempool.c +++ b/mm/mempool.c | |||
| @@ -6,26 +6,138 @@ | |||
| 6 | * extreme VM load. | 6 | * extreme VM load. |
| 7 | * | 7 | * |
| 8 | * started by Ingo Molnar, Copyright (C) 2001 | 8 | * started by Ingo Molnar, Copyright (C) 2001 |
| 9 | * debugging by David Rientjes, Copyright (C) 2015 | ||
| 9 | */ | 10 | */ |
| 10 | 11 | ||
| 11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 12 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
| 14 | #include <linux/highmem.h> | ||
| 15 | #include <linux/kasan.h> | ||
| 13 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
| 14 | #include <linux/export.h> | 17 | #include <linux/export.h> |
| 15 | #include <linux/mempool.h> | 18 | #include <linux/mempool.h> |
| 16 | #include <linux/blkdev.h> | 19 | #include <linux/blkdev.h> |
| 17 | #include <linux/writeback.h> | 20 | #include <linux/writeback.h> |
| 21 | #include "slab.h" | ||
| 22 | |||
| 23 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON) | ||
| 24 | static void poison_error(mempool_t *pool, void *element, size_t size, | ||
| 25 | size_t byte) | ||
| 26 | { | ||
| 27 | const int nr = pool->curr_nr; | ||
| 28 | const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0); | ||
| 29 | const int end = min_t(int, byte + (BITS_PER_LONG / 8), size); | ||
| 30 | int i; | ||
| 31 | |||
| 32 | pr_err("BUG: mempool element poison mismatch\n"); | ||
| 33 | pr_err("Mempool %p size %zu\n", pool, size); | ||
| 34 | pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : ""); | ||
| 35 | for (i = start; i < end; i++) | ||
| 36 | pr_cont("%x ", *(u8 *)(element + i)); | ||
| 37 | pr_cont("%s\n", end < size ? "..." : ""); | ||
| 38 | dump_stack(); | ||
| 39 | } | ||
| 40 | |||
| 41 | static void __check_element(mempool_t *pool, void *element, size_t size) | ||
| 42 | { | ||
| 43 | u8 *obj = element; | ||
| 44 | size_t i; | ||
| 45 | |||
| 46 | for (i = 0; i < size; i++) { | ||
| 47 | u8 exp = (i < size - 1) ? POISON_FREE : POISON_END; | ||
| 48 | |||
| 49 | if (obj[i] != exp) { | ||
| 50 | poison_error(pool, element, size, i); | ||
| 51 | return; | ||
| 52 | } | ||
| 53 | } | ||
| 54 | memset(obj, POISON_INUSE, size); | ||
| 55 | } | ||
| 56 | |||
| 57 | static void check_element(mempool_t *pool, void *element) | ||
| 58 | { | ||
| 59 | /* Mempools backed by slab allocator */ | ||
| 60 | if (pool->free == mempool_free_slab || pool->free == mempool_kfree) | ||
| 61 | __check_element(pool, element, ksize(element)); | ||
| 62 | |||
| 63 | /* Mempools backed by page allocator */ | ||
| 64 | if (pool->free == mempool_free_pages) { | ||
| 65 | int order = (int)(long)pool->pool_data; | ||
| 66 | void *addr = kmap_atomic((struct page *)element); | ||
| 67 | |||
| 68 | __check_element(pool, addr, 1UL << (PAGE_SHIFT + order)); | ||
| 69 | kunmap_atomic(addr); | ||
| 70 | } | ||
| 71 | } | ||
| 72 | |||
| 73 | static void __poison_element(void *element, size_t size) | ||
| 74 | { | ||
| 75 | u8 *obj = element; | ||
| 76 | |||
| 77 | memset(obj, POISON_FREE, size - 1); | ||
| 78 | obj[size - 1] = POISON_END; | ||
| 79 | } | ||
| 80 | |||
| 81 | static void poison_element(mempool_t *pool, void *element) | ||
| 82 | { | ||
| 83 | /* Mempools backed by slab allocator */ | ||
| 84 | if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) | ||
| 85 | __poison_element(element, ksize(element)); | ||
| 86 | |||
| 87 | /* Mempools backed by page allocator */ | ||
| 88 | if (pool->alloc == mempool_alloc_pages) { | ||
| 89 | int order = (int)(long)pool->pool_data; | ||
| 90 | void *addr = kmap_atomic((struct page *)element); | ||
| 91 | |||
| 92 | __poison_element(addr, 1UL << (PAGE_SHIFT + order)); | ||
| 93 | kunmap_atomic(addr); | ||
| 94 | } | ||
| 95 | } | ||
| 96 | #else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ | ||
| 97 | static inline void check_element(mempool_t *pool, void *element) | ||
| 98 | { | ||
| 99 | } | ||
| 100 | static inline void poison_element(mempool_t *pool, void *element) | ||
| 101 | { | ||
| 102 | } | ||
| 103 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */ | ||
| 104 | |||
| 105 | static void kasan_poison_element(mempool_t *pool, void *element) | ||
| 106 | { | ||
| 107 | if (pool->alloc == mempool_alloc_slab) | ||
| 108 | kasan_slab_free(pool->pool_data, element); | ||
| 109 | if (pool->alloc == mempool_kmalloc) | ||
| 110 | kasan_kfree(element); | ||
| 111 | if (pool->alloc == mempool_alloc_pages) | ||
| 112 | kasan_free_pages(element, (unsigned long)pool->pool_data); | ||
| 113 | } | ||
| 114 | |||
| 115 | static void kasan_unpoison_element(mempool_t *pool, void *element) | ||
| 116 | { | ||
| 117 | if (pool->alloc == mempool_alloc_slab) | ||
| 118 | kasan_slab_alloc(pool->pool_data, element); | ||
| 119 | if (pool->alloc == mempool_kmalloc) | ||
| 120 | kasan_krealloc(element, (size_t)pool->pool_data); | ||
| 121 | if (pool->alloc == mempool_alloc_pages) | ||
| 122 | kasan_alloc_pages(element, (unsigned long)pool->pool_data); | ||
| 123 | } | ||
| 18 | 124 | ||
| 19 | static void add_element(mempool_t *pool, void *element) | 125 | static void add_element(mempool_t *pool, void *element) |
| 20 | { | 126 | { |
| 21 | BUG_ON(pool->curr_nr >= pool->min_nr); | 127 | BUG_ON(pool->curr_nr >= pool->min_nr); |
| 128 | poison_element(pool, element); | ||
| 129 | kasan_poison_element(pool, element); | ||
| 22 | pool->elements[pool->curr_nr++] = element; | 130 | pool->elements[pool->curr_nr++] = element; |
| 23 | } | 131 | } |
| 24 | 132 | ||
| 25 | static void *remove_element(mempool_t *pool) | 133 | static void *remove_element(mempool_t *pool) |
| 26 | { | 134 | { |
| 27 | BUG_ON(pool->curr_nr <= 0); | 135 | void *element = pool->elements[--pool->curr_nr]; |
| 28 | return pool->elements[--pool->curr_nr]; | 136 | |
| 137 | BUG_ON(pool->curr_nr < 0); | ||
| 138 | check_element(pool, element); | ||
| 139 | kasan_unpoison_element(pool, element); | ||
| 140 | return element; | ||
| 29 | } | 141 | } |
| 30 | 142 | ||
| 31 | /** | 143 | /** |
| @@ -113,23 +225,24 @@ EXPORT_SYMBOL(mempool_create_node); | |||
| 113 | * mempool_create(). | 225 | * mempool_create(). |
| 114 | * @new_min_nr: the new minimum number of elements guaranteed to be | 226 | * @new_min_nr: the new minimum number of elements guaranteed to be |
| 115 | * allocated for this pool. | 227 | * allocated for this pool. |
| 116 | * @gfp_mask: the usual allocation bitmask. | ||
| 117 | * | 228 | * |
| 118 | * This function shrinks/grows the pool. In the case of growing, | 229 | * This function shrinks/grows the pool. In the case of growing, |
| 119 | * it cannot be guaranteed that the pool will be grown to the new | 230 | * it cannot be guaranteed that the pool will be grown to the new |
| 120 | * size immediately, but new mempool_free() calls will refill it. | 231 | * size immediately, but new mempool_free() calls will refill it. |
| 232 | * This function may sleep. | ||
| 121 | * | 233 | * |
| 122 | * Note, the caller must guarantee that no mempool_destroy is called | 234 | * Note, the caller must guarantee that no mempool_destroy is called |
| 123 | * while this function is running. mempool_alloc() & mempool_free() | 235 | * while this function is running. mempool_alloc() & mempool_free() |
| 124 | * might be called (eg. from IRQ contexts) while this function executes. | 236 | * might be called (eg. from IRQ contexts) while this function executes. |
| 125 | */ | 237 | */ |
| 126 | int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) | 238 | int mempool_resize(mempool_t *pool, int new_min_nr) |
| 127 | { | 239 | { |
| 128 | void *element; | 240 | void *element; |
| 129 | void **new_elements; | 241 | void **new_elements; |
| 130 | unsigned long flags; | 242 | unsigned long flags; |
| 131 | 243 | ||
| 132 | BUG_ON(new_min_nr <= 0); | 244 | BUG_ON(new_min_nr <= 0); |
| 245 | might_sleep(); | ||
| 133 | 246 | ||
| 134 | spin_lock_irqsave(&pool->lock, flags); | 247 | spin_lock_irqsave(&pool->lock, flags); |
| 135 | if (new_min_nr <= pool->min_nr) { | 248 | if (new_min_nr <= pool->min_nr) { |
| @@ -145,7 +258,8 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) | |||
| 145 | spin_unlock_irqrestore(&pool->lock, flags); | 258 | spin_unlock_irqrestore(&pool->lock, flags); |
| 146 | 259 | ||
| 147 | /* Grow the pool */ | 260 | /* Grow the pool */ |
| 148 | new_elements = kmalloc(new_min_nr * sizeof(*new_elements), gfp_mask); | 261 | new_elements = kmalloc_array(new_min_nr, sizeof(*new_elements), |
| 262 | GFP_KERNEL); | ||
| 149 | if (!new_elements) | 263 | if (!new_elements) |
| 150 | return -ENOMEM; | 264 | return -ENOMEM; |
| 151 | 265 | ||
| @@ -164,7 +278,7 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) | |||
| 164 | 278 | ||
| 165 | while (pool->curr_nr < pool->min_nr) { | 279 | while (pool->curr_nr < pool->min_nr) { |
| 166 | spin_unlock_irqrestore(&pool->lock, flags); | 280 | spin_unlock_irqrestore(&pool->lock, flags); |
| 167 | element = pool->alloc(gfp_mask, pool->pool_data); | 281 | element = pool->alloc(GFP_KERNEL, pool->pool_data); |
| 168 | if (!element) | 282 | if (!element) |
| 169 | goto out; | 283 | goto out; |
| 170 | spin_lock_irqsave(&pool->lock, flags); | 284 | spin_lock_irqsave(&pool->lock, flags); |
| @@ -332,6 +446,7 @@ EXPORT_SYMBOL(mempool_free); | |||
| 332 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) | 446 | void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) |
| 333 | { | 447 | { |
| 334 | struct kmem_cache *mem = pool_data; | 448 | struct kmem_cache *mem = pool_data; |
| 449 | VM_BUG_ON(mem->ctor); | ||
| 335 | return kmem_cache_alloc(mem, gfp_mask); | 450 | return kmem_cache_alloc(mem, gfp_mask); |
| 336 | } | 451 | } |
| 337 | EXPORT_SYMBOL(mempool_alloc_slab); | 452 | EXPORT_SYMBOL(mempool_alloc_slab); |
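The mempool hunks above add slab-style poisoning: add_element() fills a free element with POISON_FREE bytes terminated by POISON_END, and remove_element() verifies that pattern (reporting the first corrupted offset via poison_error()) before handing the element back out. A minimal user-space sketch of the same poison/verify idea follows; the byte values and names are illustrative stand-ins, not the kernel's POISON_* constants.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define FAKE_POISON_FREE 0x6b   /* stand-in poison byte for the body */
    #define FAKE_POISON_END  0xa5   /* stand-in terminator byte */

    static void poison(uint8_t *obj, size_t size)
    {
            memset(obj, FAKE_POISON_FREE, size - 1);
            obj[size - 1] = FAKE_POISON_END;
    }

    /* Returns the offset of the first corrupted byte, or -1 if intact. */
    static long check(const uint8_t *obj, size_t size)
    {
            for (size_t i = 0; i < size; i++) {
                    uint8_t exp = (i < size - 1) ? FAKE_POISON_FREE : FAKE_POISON_END;

                    if (obj[i] != exp)
                            return (long)i;
            }
            return -1;
    }

    int main(void)
    {
            uint8_t buf[32];

            poison(buf, sizeof(buf));
            buf[7] = 0;     /* simulate a write to a supposedly free element */
            printf("corruption at offset %ld\n", check(buf, sizeof(buf)));
            return 0;
    }

The same file also drops the gfp_mask argument of mempool_resize(): the resize path now always allocates with GFP_KERNEL and declares might_sleep(), so after this change a caller simply writes something like mempool_resize(pool, 32) from a context that is allowed to sleep.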
diff --git a/mm/memtest.c b/mm/memtest.c new file mode 100644 index 000000000000..1997d934b13b --- /dev/null +++ b/mm/memtest.c | |||
| @@ -0,0 +1,118 @@ | |||
| 1 | #include <linux/kernel.h> | ||
| 2 | #include <linux/errno.h> | ||
| 3 | #include <linux/string.h> | ||
| 4 | #include <linux/types.h> | ||
| 5 | #include <linux/mm.h> | ||
| 6 | #include <linux/smp.h> | ||
| 7 | #include <linux/init.h> | ||
| 8 | #include <linux/pfn.h> | ||
| 9 | #include <linux/memblock.h> | ||
| 10 | |||
| 11 | static u64 patterns[] __initdata = { | ||
| 12 | /* The first entry has to be 0 to leave memtest with zeroed memory */ | ||
| 13 | 0, | ||
| 14 | 0xffffffffffffffffULL, | ||
| 15 | 0x5555555555555555ULL, | ||
| 16 | 0xaaaaaaaaaaaaaaaaULL, | ||
| 17 | 0x1111111111111111ULL, | ||
| 18 | 0x2222222222222222ULL, | ||
| 19 | 0x4444444444444444ULL, | ||
| 20 | 0x8888888888888888ULL, | ||
| 21 | 0x3333333333333333ULL, | ||
| 22 | 0x6666666666666666ULL, | ||
| 23 | 0x9999999999999999ULL, | ||
| 24 | 0xccccccccccccccccULL, | ||
| 25 | 0x7777777777777777ULL, | ||
| 26 | 0xbbbbbbbbbbbbbbbbULL, | ||
| 27 | 0xddddddddddddddddULL, | ||
| 28 | 0xeeeeeeeeeeeeeeeeULL, | ||
| 29 | 0x7a6c7258554e494cULL, /* yeah ;-) */ | ||
| 30 | }; | ||
| 31 | |||
| 32 | static void __init reserve_bad_mem(u64 pattern, phys_addr_t start_bad, phys_addr_t end_bad) | ||
| 33 | { | ||
| 34 | printk(KERN_INFO " %016llx bad mem addr %010llx - %010llx reserved\n", | ||
| 35 | (unsigned long long) pattern, | ||
| 36 | (unsigned long long) start_bad, | ||
| 37 | (unsigned long long) end_bad); | ||
| 38 | memblock_reserve(start_bad, end_bad - start_bad); | ||
| 39 | } | ||
| 40 | |||
| 41 | static void __init memtest(u64 pattern, phys_addr_t start_phys, phys_addr_t size) | ||
| 42 | { | ||
| 43 | u64 *p, *start, *end; | ||
| 44 | phys_addr_t start_bad, last_bad; | ||
| 45 | phys_addr_t start_phys_aligned; | ||
| 46 | const size_t incr = sizeof(pattern); | ||
| 47 | |||
| 48 | start_phys_aligned = ALIGN(start_phys, incr); | ||
| 49 | start = __va(start_phys_aligned); | ||
| 50 | end = start + (size - (start_phys_aligned - start_phys)) / incr; | ||
| 51 | start_bad = 0; | ||
| 52 | last_bad = 0; | ||
| 53 | |||
| 54 | for (p = start; p < end; p++) | ||
| 55 | *p = pattern; | ||
| 56 | |||
| 57 | for (p = start; p < end; p++, start_phys_aligned += incr) { | ||
| 58 | if (*p == pattern) | ||
| 59 | continue; | ||
| 60 | if (start_phys_aligned == last_bad + incr) { | ||
| 61 | last_bad += incr; | ||
| 62 | continue; | ||
| 63 | } | ||
| 64 | if (start_bad) | ||
| 65 | reserve_bad_mem(pattern, start_bad, last_bad + incr); | ||
| 66 | start_bad = last_bad = start_phys_aligned; | ||
| 67 | } | ||
| 68 | if (start_bad) | ||
| 69 | reserve_bad_mem(pattern, start_bad, last_bad + incr); | ||
| 70 | } | ||
| 71 | |||
| 72 | static void __init do_one_pass(u64 pattern, phys_addr_t start, phys_addr_t end) | ||
| 73 | { | ||
| 74 | u64 i; | ||
| 75 | phys_addr_t this_start, this_end; | ||
| 76 | |||
| 77 | for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) { | ||
| 78 | this_start = clamp(this_start, start, end); | ||
| 79 | this_end = clamp(this_end, start, end); | ||
| 80 | if (this_start < this_end) { | ||
| 81 | printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", | ||
| 82 | (unsigned long long)this_start, | ||
| 83 | (unsigned long long)this_end, | ||
| 84 | (unsigned long long)cpu_to_be64(pattern)); | ||
| 85 | memtest(pattern, this_start, this_end - this_start); | ||
| 86 | } | ||
| 87 | } | ||
| 88 | } | ||
| 89 | |||
| 90 | /* default is disabled */ | ||
| 91 | static int memtest_pattern __initdata; | ||
| 92 | |||
| 93 | static int __init parse_memtest(char *arg) | ||
| 94 | { | ||
| 95 | if (arg) | ||
| 96 | memtest_pattern = simple_strtoul(arg, NULL, 0); | ||
| 97 | else | ||
| 98 | memtest_pattern = ARRAY_SIZE(patterns); | ||
| 99 | |||
| 100 | return 0; | ||
| 101 | } | ||
| 102 | |||
| 103 | early_param("memtest", parse_memtest); | ||
| 104 | |||
| 105 | void __init early_memtest(phys_addr_t start, phys_addr_t end) | ||
| 106 | { | ||
| 107 | unsigned int i; | ||
| 108 | unsigned int idx = 0; | ||
| 109 | |||
| 110 | if (!memtest_pattern) | ||
| 111 | return; | ||
| 112 | |||
| 113 | printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern); | ||
| 114 | for (i = memtest_pattern-1; i < UINT_MAX; --i) { | ||
| 115 | idx = i % ARRAY_SIZE(patterns); | ||
| 116 | do_one_pass(patterns[idx], start, end); | ||
| 117 | } | ||
| 118 | } | ||
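The new mm/memtest.c writes each pattern over every free memblock range, reads it back, and reserves any region that does not return the pattern, coalescing adjacent bad addresses via reserve_bad_mem(). The number of passes comes from the memtest= early parameter parsed above, and because the loop counts down, the last pass always uses patterns[0] (all zeroes), leaving tested memory zeroed. A small host-side sketch of the pass ordering, assuming a boot with memtest=4 (the pattern count 17 matches the table above):

    #include <stdio.h>
    #include <limits.h>

    #define NR_PATTERNS 17  /* number of entries in the patterns[] table above */

    int main(void)
    {
            unsigned int memtest_pattern = 4;       /* as if booted with memtest=4 */
            unsigned int i;

            /* Same loop shape as early_memtest(): counts down, stops on unsigned wrap. */
            for (i = memtest_pattern - 1; i < UINT_MAX; --i)
                    printf("pass %u uses patterns[%u]\n",
                           memtest_pattern - i, i % NR_PATTERNS);
            return 0;
    }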
diff --git a/mm/migrate.c b/mm/migrate.c index 85e042686031..f53838fe3dfe 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -537,7 +537,8 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
| 537 | * Please do not reorder this without considering how mm/ksm.c's | 537 | * Please do not reorder this without considering how mm/ksm.c's |
| 538 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). | 538 | * get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache(). |
| 539 | */ | 539 | */ |
| 540 | ClearPageSwapCache(page); | 540 | if (PageSwapCache(page)) |
| 541 | ClearPageSwapCache(page); | ||
| 541 | ClearPagePrivate(page); | 542 | ClearPagePrivate(page); |
| 542 | set_page_private(page, 0); | 543 | set_page_private(page, 0); |
| 543 | 544 | ||
| @@ -901,12 +902,23 @@ out: | |||
| 901 | } | 902 | } |
| 902 | 903 | ||
| 903 | /* | 904 | /* |
| 905 | * gcc 4.7 and 4.8 on arm get ICEs when inlining unmap_and_move(). Work | ||
| 906 | * around it. | ||
| 907 | */ | ||
| 908 | #if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM) | ||
| 909 | #define ICE_noinline noinline | ||
| 910 | #else | ||
| 911 | #define ICE_noinline | ||
| 912 | #endif | ||
| 913 | |||
| 914 | /* | ||
| 904 | * Obtain the lock on page, remove all ptes and migrate the page | 915 | * Obtain the lock on page, remove all ptes and migrate the page |
| 905 | * to the newly allocated page in newpage. | 916 | * to the newly allocated page in newpage. |
| 906 | */ | 917 | */ |
| 907 | static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, | 918 | static ICE_noinline int unmap_and_move(new_page_t get_new_page, |
| 908 | unsigned long private, struct page *page, int force, | 919 | free_page_t put_new_page, |
| 909 | enum migrate_mode mode) | 920 | unsigned long private, struct page *page, |
| 921 | int force, enum migrate_mode mode) | ||
| 910 | { | 922 | { |
| 911 | int rc = 0; | 923 | int rc = 0; |
| 912 | int *result = NULL; | 924 | int *result = NULL; |
| @@ -1554,30 +1566,10 @@ static struct page *alloc_misplaced_dst_page(struct page *page, | |||
| 1554 | * page migration rate limiting control. | 1566 | * page migration rate limiting control. |
| 1555 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs | 1567 | * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs |
| 1556 | * window of time. Default here says do not migrate more than 1280M per second. | 1568 | * window of time. Default here says do not migrate more than 1280M per second. |
| 1557 | * If a node is rate-limited then PTE NUMA updates are also rate-limited. However | ||
| 1558 | * as it is faults that reset the window, pte updates will happen unconditionally | ||
| 1559 | * if there has not been a fault since @pteupdate_interval_millisecs after the | ||
| 1560 | * throttle window closed. | ||
| 1561 | */ | 1569 | */ |
| 1562 | static unsigned int migrate_interval_millisecs __read_mostly = 100; | 1570 | static unsigned int migrate_interval_millisecs __read_mostly = 100; |
| 1563 | static unsigned int pteupdate_interval_millisecs __read_mostly = 1000; | ||
| 1564 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); | 1571 | static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT); |
| 1565 | 1572 | ||
| 1566 | /* Returns true if NUMA migration is currently rate limited */ | ||
| 1567 | bool migrate_ratelimited(int node) | ||
| 1568 | { | ||
| 1569 | pg_data_t *pgdat = NODE_DATA(node); | ||
| 1570 | |||
| 1571 | if (time_after(jiffies, pgdat->numabalancing_migrate_next_window + | ||
| 1572 | msecs_to_jiffies(pteupdate_interval_millisecs))) | ||
| 1573 | return false; | ||
| 1574 | |||
| 1575 | if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages) | ||
| 1576 | return false; | ||
| 1577 | |||
| 1578 | return true; | ||
| 1579 | } | ||
| 1580 | |||
| 1581 | /* Returns true if the node is migrate rate-limited after the update */ | 1573 | /* Returns true if the node is migrate rate-limited after the update */ |
| 1582 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, | 1574 | static bool numamigrate_update_ratelimit(pg_data_t *pgdat, |
| 1583 | unsigned long nr_pages) | 1575 | unsigned long nr_pages) |
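The ICE_noinline definition above is gated on GCC_VERSION, which the kernel derives from the compiler's predefined version macros; the sketch below shows the assumed rough shape of that macro (the exact definition lives in the kernel's gcc compiler header) and how the gate reads once it is in place:

    /* Rough shape of GCC_VERSION, assumed from the usual kernel definition. */
    #define GCC_VERSION (__GNUC__ * 10000		\
                         + __GNUC_MINOR__ * 100	\
                         + __GNUC_PATCHLEVEL__)

    /* gcc 4.7.x encodes as 407xx and 4.8.x as 408xx, so this catches exactly
     * those two release series, and only on ARM builds. */
    #if (GCC_VERSION >= 40700 && GCC_VERSION < 40900) && defined(CONFIG_ARM)
    #define ICE_noinline noinline	/* keep unmap_and_move() out of line */
    #else
    #define ICE_noinline		/* expands to nothing everywhere else */
    #endif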
diff --git a/mm/mlock.c b/mm/mlock.c index 73cf0987088c..6fd2cf15e868 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
| @@ -26,10 +26,10 @@ | |||
| 26 | 26 | ||
| 27 | int can_do_mlock(void) | 27 | int can_do_mlock(void) |
| 28 | { | 28 | { |
| 29 | if (capable(CAP_IPC_LOCK)) | ||
| 30 | return 1; | ||
| 31 | if (rlimit(RLIMIT_MEMLOCK) != 0) | 29 | if (rlimit(RLIMIT_MEMLOCK) != 0) |
| 32 | return 1; | 30 | return 1; |
| 31 | if (capable(CAP_IPC_LOCK)) | ||
| 32 | return 1; | ||
| 33 | return 0; | 33 | return 0; |
| 34 | } | 34 | } |
| 35 | EXPORT_SYMBOL(can_do_mlock); | 35 | EXPORT_SYMBOL(can_do_mlock); |
| @@ -205,62 +205,6 @@ out: | |||
| 205 | return nr_pages - 1; | 205 | return nr_pages - 1; |
| 206 | } | 206 | } |
| 207 | 207 | ||
| 208 | /** | ||
| 209 | * __mlock_vma_pages_range() - mlock a range of pages in the vma. | ||
| 210 | * @vma: target vma | ||
| 211 | * @start: start address | ||
| 212 | * @end: end address | ||
| 213 | * @nonblocking: | ||
| 214 | * | ||
| 215 | * This takes care of making the pages present too. | ||
| 216 | * | ||
| 217 | * return 0 on success, negative error code on error. | ||
| 218 | * | ||
| 219 | * vma->vm_mm->mmap_sem must be held. | ||
| 220 | * | ||
| 221 | * If @nonblocking is NULL, it may be held for read or write and will | ||
| 222 | * be unperturbed. | ||
| 223 | * | ||
| 224 | * If @nonblocking is non-NULL, it must held for read only and may be | ||
| 225 | * released. If it's released, *@nonblocking will be set to 0. | ||
| 226 | */ | ||
| 227 | long __mlock_vma_pages_range(struct vm_area_struct *vma, | ||
| 228 | unsigned long start, unsigned long end, int *nonblocking) | ||
| 229 | { | ||
| 230 | struct mm_struct *mm = vma->vm_mm; | ||
| 231 | unsigned long nr_pages = (end - start) / PAGE_SIZE; | ||
| 232 | int gup_flags; | ||
| 233 | |||
| 234 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 235 | VM_BUG_ON(end & ~PAGE_MASK); | ||
| 236 | VM_BUG_ON_VMA(start < vma->vm_start, vma); | ||
| 237 | VM_BUG_ON_VMA(end > vma->vm_end, vma); | ||
| 238 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
| 239 | |||
| 240 | gup_flags = FOLL_TOUCH | FOLL_MLOCK; | ||
| 241 | /* | ||
| 242 | * We want to touch writable mappings with a write fault in order | ||
| 243 | * to break COW, except for shared mappings because these don't COW | ||
| 244 | * and we would not want to dirty them for nothing. | ||
| 245 | */ | ||
| 246 | if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) | ||
| 247 | gup_flags |= FOLL_WRITE; | ||
| 248 | |||
| 249 | /* | ||
| 250 | * We want mlock to succeed for regions that have any permissions | ||
| 251 | * other than PROT_NONE. | ||
| 252 | */ | ||
| 253 | if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) | ||
| 254 | gup_flags |= FOLL_FORCE; | ||
| 255 | |||
| 256 | /* | ||
| 257 | * We made sure addr is within a VMA, so the following will | ||
| 258 | * not result in a stack expansion that recurses back here. | ||
| 259 | */ | ||
| 260 | return __get_user_pages(current, mm, start, nr_pages, gup_flags, | ||
| 261 | NULL, NULL, nonblocking); | ||
| 262 | } | ||
| 263 | |||
| 264 | /* | 208 | /* |
| 265 | * convert get_user_pages() return value to posix mlock() error | 209 | * convert get_user_pages() return value to posix mlock() error |
| 266 | */ | 210 | */ |
| @@ -596,7 +540,7 @@ success: | |||
| 596 | /* | 540 | /* |
| 597 | * vm_flags is protected by the mmap_sem held in write mode. | 541 | * vm_flags is protected by the mmap_sem held in write mode. |
| 598 | * It's okay if try_to_unmap_one unmaps a page just after we | 542 | * It's okay if try_to_unmap_one unmaps a page just after we |
| 599 | * set VM_LOCKED, __mlock_vma_pages_range will bring it back. | 543 | * set VM_LOCKED, populate_vma_page_range will bring it back. |
| 600 | */ | 544 | */ |
| 601 | 545 | ||
| 602 | if (lock) | 546 | if (lock) |
| @@ -660,69 +604,6 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
| 660 | return error; | 604 | return error; |
| 661 | } | 605 | } |
| 662 | 606 | ||
| 663 | /* | ||
| 664 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
| 665 | * | ||
| 666 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
| 667 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
| 668 | * mmap_sem must not be held. | ||
| 669 | */ | ||
| 670 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
| 671 | { | ||
| 672 | struct mm_struct *mm = current->mm; | ||
| 673 | unsigned long end, nstart, nend; | ||
| 674 | struct vm_area_struct *vma = NULL; | ||
| 675 | int locked = 0; | ||
| 676 | long ret = 0; | ||
| 677 | |||
| 678 | VM_BUG_ON(start & ~PAGE_MASK); | ||
| 679 | VM_BUG_ON(len != PAGE_ALIGN(len)); | ||
| 680 | end = start + len; | ||
| 681 | |||
| 682 | for (nstart = start; nstart < end; nstart = nend) { | ||
| 683 | /* | ||
| 684 | * We want to fault in pages for [nstart; end) address range. | ||
| 685 | * Find first corresponding VMA. | ||
| 686 | */ | ||
| 687 | if (!locked) { | ||
| 688 | locked = 1; | ||
| 689 | down_read(&mm->mmap_sem); | ||
| 690 | vma = find_vma(mm, nstart); | ||
| 691 | } else if (nstart >= vma->vm_end) | ||
| 692 | vma = vma->vm_next; | ||
| 693 | if (!vma || vma->vm_start >= end) | ||
| 694 | break; | ||
| 695 | /* | ||
| 696 | * Set [nstart; nend) to intersection of desired address | ||
| 697 | * range with the first VMA. Also, skip undesirable VMA types. | ||
| 698 | */ | ||
| 699 | nend = min(end, vma->vm_end); | ||
| 700 | if (vma->vm_flags & (VM_IO | VM_PFNMAP)) | ||
| 701 | continue; | ||
| 702 | if (nstart < vma->vm_start) | ||
| 703 | nstart = vma->vm_start; | ||
| 704 | /* | ||
| 705 | * Now fault in a range of pages. __mlock_vma_pages_range() | ||
| 706 | * double checks the vma flags, so that it won't mlock pages | ||
| 707 | * if the vma was already munlocked. | ||
| 708 | */ | ||
| 709 | ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); | ||
| 710 | if (ret < 0) { | ||
| 711 | if (ignore_errors) { | ||
| 712 | ret = 0; | ||
| 713 | continue; /* continue at next VMA */ | ||
| 714 | } | ||
| 715 | ret = __mlock_posix_error_return(ret); | ||
| 716 | break; | ||
| 717 | } | ||
| 718 | nend = nstart + ret * PAGE_SIZE; | ||
| 719 | ret = 0; | ||
| 720 | } | ||
| 721 | if (locked) | ||
| 722 | up_read(&mm->mmap_sem); | ||
| 723 | return ret; /* 0 or negative error code */ | ||
| 724 | } | ||
| 725 | |||
| 726 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | 607 | SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) |
| 727 | { | 608 | { |
| 728 | unsigned long locked; | 609 | unsigned long locked; |
| @@ -750,9 +631,13 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
| 750 | error = do_mlock(start, len, 1); | 631 | error = do_mlock(start, len, 1); |
| 751 | 632 | ||
| 752 | up_write(¤t->mm->mmap_sem); | 633 | up_write(¤t->mm->mmap_sem); |
| 753 | if (!error) | 634 | if (error) |
| 754 | error = __mm_populate(start, len, 0); | 635 | return error; |
| 755 | return error; | 636 | |
| 637 | error = __mm_populate(start, len, 0); | ||
| 638 | if (error) | ||
| 639 | return __mlock_posix_error_return(error); | ||
| 640 | return 0; | ||
| 756 | } | 641 | } |
| 757 | 642 | ||
| 758 | SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) | 643 | SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) |
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
| @@ -774,10 +774,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
| 774 | 774 | ||
| 775 | importer->anon_vma = exporter->anon_vma; | 775 | importer->anon_vma = exporter->anon_vma; |
| 776 | error = anon_vma_clone(importer, exporter); | 776 | error = anon_vma_clone(importer, exporter); |
| 777 | if (error) { | 777 | if (error) |
| 778 | importer->anon_vma = NULL; | ||
| 779 | return error; | 778 | return error; |
| 780 | } | ||
| 781 | } | 779 | } |
| 782 | } | 780 | } |
| 783 | 781 | ||
| @@ -1135,7 +1133,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
| 1135 | * by another page fault trying to merge _that_. But that's ok: if it | 1133 | * by another page fault trying to merge _that_. But that's ok: if it |
| 1136 | * is being set up, that automatically means that it will be a singleton | 1134 | * is being set up, that automatically means that it will be a singleton |
| 1137 | * acceptable for merging, so we can do all of this optimistically. But | 1135 | * acceptable for merging, so we can do all of this optimistically. But |
| 1138 | * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. | 1136 | * we do that READ_ONCE() to make sure that we never re-load the pointer. |
| 1139 | * | 1137 | * |
| 1140 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only | 1138 | * IOW: that the "list_is_singular()" test on the anon_vma_chain only |
| 1141 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid | 1139 | * matters for the 'stable anon_vma' case (ie the thing we want to avoid |
| @@ -1149,7 +1147,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * | |||
| 1149 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) | 1147 | static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) |
| 1150 | { | 1148 | { |
| 1151 | if (anon_vma_compatible(a, b)) { | 1149 | if (anon_vma_compatible(a, b)) { |
| 1152 | struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); | 1150 | struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); |
| 1153 | 1151 | ||
| 1154 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) | 1152 | if (anon_vma && list_is_singular(&old->anon_vma_chain)) |
| 1155 | return anon_vma; | 1153 | return anon_vma; |
| @@ -1553,11 +1551,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, | |||
| 1553 | 1551 | ||
| 1554 | /* Clear old maps */ | 1552 | /* Clear old maps */ |
| 1555 | error = -ENOMEM; | 1553 | error = -ENOMEM; |
| 1556 | munmap_back: | 1554 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
| 1557 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 1555 | &rb_parent)) { |
| 1558 | if (do_munmap(mm, addr, len)) | 1556 | if (do_munmap(mm, addr, len)) |
| 1559 | return -ENOMEM; | 1557 | return -ENOMEM; |
| 1560 | goto munmap_back; | ||
| 1561 | } | 1558 | } |
| 1562 | 1559 | ||
| 1563 | /* | 1560 | /* |
| @@ -1573,7 +1570,8 @@ munmap_back: | |||
| 1573 | /* | 1570 | /* |
| 1574 | * Can we just expand an old mapping? | 1571 | * Can we just expand an old mapping? |
| 1575 | */ | 1572 | */ |
| 1576 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL); | 1573 | vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, |
| 1574 | NULL); | ||
| 1577 | if (vma) | 1575 | if (vma) |
| 1578 | goto out; | 1576 | goto out; |
| 1579 | 1577 | ||
| @@ -2102,7 +2100,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 2102 | actual_size = size; | 2100 | actual_size = size; |
| 2103 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) | 2101 | if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN))) |
| 2104 | actual_size -= PAGE_SIZE; | 2102 | actual_size -= PAGE_SIZE; |
| 2105 | if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) | 2103 | if (actual_size > READ_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
| 2106 | return -ENOMEM; | 2104 | return -ENOMEM; |
| 2107 | 2105 | ||
| 2108 | /* mlock limit tests */ | 2106 | /* mlock limit tests */ |
| @@ -2110,7 +2108,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
| 2110 | unsigned long locked; | 2108 | unsigned long locked; |
| 2111 | unsigned long limit; | 2109 | unsigned long limit; |
| 2112 | locked = mm->locked_vm + grow; | 2110 | locked = mm->locked_vm + grow; |
| 2113 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); | 2111 | limit = READ_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
| 2114 | limit >>= PAGE_SHIFT; | 2112 | limit >>= PAGE_SHIFT; |
| 2115 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 2113 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
| 2116 | return -ENOMEM; | 2114 | return -ENOMEM; |
| @@ -2318,7 +2316,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 2318 | if (!prev || expand_stack(prev, addr)) | 2316 | if (!prev || expand_stack(prev, addr)) |
| 2319 | return NULL; | 2317 | return NULL; |
| 2320 | if (prev->vm_flags & VM_LOCKED) | 2318 | if (prev->vm_flags & VM_LOCKED) |
| 2321 | __mlock_vma_pages_range(prev, addr, prev->vm_end, NULL); | 2319 | populate_vma_page_range(prev, addr, prev->vm_end, NULL); |
| 2322 | return prev; | 2320 | return prev; |
| 2323 | } | 2321 | } |
| 2324 | #else | 2322 | #else |
| @@ -2353,7 +2351,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
| 2353 | if (expand_stack(vma, addr)) | 2351 | if (expand_stack(vma, addr)) |
| 2354 | return NULL; | 2352 | return NULL; |
| 2355 | if (vma->vm_flags & VM_LOCKED) | 2353 | if (vma->vm_flags & VM_LOCKED) |
| 2356 | __mlock_vma_pages_range(vma, addr, start, NULL); | 2354 | populate_vma_page_range(vma, addr, start, NULL); |
| 2357 | return vma; | 2355 | return vma; |
| 2358 | } | 2356 | } |
| 2359 | #endif | 2357 | #endif |
| @@ -2741,11 +2739,10 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) | |||
| 2741 | /* | 2739 | /* |
| 2742 | * Clear old maps. this also does some error checking for us | 2740 | * Clear old maps. this also does some error checking for us |
| 2743 | */ | 2741 | */ |
| 2744 | munmap_back: | 2742 | while (find_vma_links(mm, addr, addr + len, &prev, &rb_link, |
| 2745 | if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) { | 2743 | &rb_parent)) { |
| 2746 | if (do_munmap(mm, addr, len)) | 2744 | if (do_munmap(mm, addr, len)) |
| 2747 | return -ENOMEM; | 2745 | return -ENOMEM; |
| 2748 | goto munmap_back; | ||
| 2749 | } | 2746 | } |
| 2750 | 2747 | ||
| 2751 | /* Check against address space limits *after* clearing old maps... */ | 2748 | /* Check against address space limits *after* clearing old maps... */ |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 44727811bf4c..88584838e704 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -75,6 +75,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 75 | oldpte = *pte; | 75 | oldpte = *pte; |
| 76 | if (pte_present(oldpte)) { | 76 | if (pte_present(oldpte)) { |
| 77 | pte_t ptent; | 77 | pte_t ptent; |
| 78 | bool preserve_write = prot_numa && pte_write(oldpte); | ||
| 78 | 79 | ||
| 79 | /* | 80 | /* |
| 80 | * Avoid trapping faults against the zero or KSM | 81 | * Avoid trapping faults against the zero or KSM |
| @@ -94,6 +95,8 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 94 | 95 | ||
| 95 | ptent = ptep_modify_prot_start(mm, addr, pte); | 96 | ptent = ptep_modify_prot_start(mm, addr, pte); |
| 96 | ptent = pte_modify(ptent, newprot); | 97 | ptent = pte_modify(ptent, newprot); |
| 98 | if (preserve_write) | ||
| 99 | ptent = pte_mkwrite(ptent); | ||
| 97 | 100 | ||
| 98 | /* Avoid taking write faults for known dirty pages */ | 101 | /* Avoid taking write faults for known dirty pages */ |
| 99 | if (dirty_accountable && pte_dirty(ptent) && | 102 | if (dirty_accountable && pte_dirty(ptent) && |
diff --git a/mm/mremap.c b/mm/mremap.c index 57dadc025c64..034e2d360652 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -286,8 +286,14 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
| 286 | old_len = new_len; | 286 | old_len = new_len; |
| 287 | old_addr = new_addr; | 287 | old_addr = new_addr; |
| 288 | new_addr = -ENOMEM; | 288 | new_addr = -ENOMEM; |
| 289 | } else if (vma->vm_file && vma->vm_file->f_op->mremap) | 289 | } else if (vma->vm_file && vma->vm_file->f_op->mremap) { |
| 290 | vma->vm_file->f_op->mremap(vma->vm_file, new_vma); | 290 | err = vma->vm_file->f_op->mremap(vma->vm_file, new_vma); |
| 291 | if (err < 0) { | ||
| 292 | move_page_tables(new_vma, new_addr, vma, old_addr, | ||
| 293 | moved_len, true); | ||
| 294 | return err; | ||
| 295 | } | ||
| 296 | } | ||
| 291 | 297 | ||
| 292 | /* Conceal VM_ACCOUNT so old reservation is not undone */ | 298 | /* Conceal VM_ACCOUNT so old reservation is not undone */ |
| 293 | if (vm_flags & VM_ACCOUNT) { | 299 | if (vm_flags & VM_ACCOUNT) { |
| @@ -339,25 +345,25 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
| 339 | struct vm_area_struct *vma = find_vma(mm, addr); | 345 | struct vm_area_struct *vma = find_vma(mm, addr); |
| 340 | 346 | ||
| 341 | if (!vma || vma->vm_start > addr) | 347 | if (!vma || vma->vm_start > addr) |
| 342 | goto Efault; | 348 | return ERR_PTR(-EFAULT); |
| 343 | 349 | ||
| 344 | if (is_vm_hugetlb_page(vma)) | 350 | if (is_vm_hugetlb_page(vma)) |
| 345 | goto Einval; | 351 | return ERR_PTR(-EINVAL); |
| 346 | 352 | ||
| 347 | /* We can't remap across vm area boundaries */ | 353 | /* We can't remap across vm area boundaries */ |
| 348 | if (old_len > vma->vm_end - addr) | 354 | if (old_len > vma->vm_end - addr) |
| 349 | goto Efault; | 355 | return ERR_PTR(-EFAULT); |
| 350 | 356 | ||
| 351 | /* Need to be careful about a growing mapping */ | 357 | /* Need to be careful about a growing mapping */ |
| 352 | if (new_len > old_len) { | 358 | if (new_len > old_len) { |
| 353 | unsigned long pgoff; | 359 | unsigned long pgoff; |
| 354 | 360 | ||
| 355 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) | 361 | if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) |
| 356 | goto Efault; | 362 | return ERR_PTR(-EFAULT); |
| 357 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; | 363 | pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; |
| 358 | pgoff += vma->vm_pgoff; | 364 | pgoff += vma->vm_pgoff; |
| 359 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) | 365 | if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) |
| 360 | goto Einval; | 366 | return ERR_PTR(-EINVAL); |
| 361 | } | 367 | } |
| 362 | 368 | ||
| 363 | if (vma->vm_flags & VM_LOCKED) { | 369 | if (vma->vm_flags & VM_LOCKED) { |
| @@ -366,29 +372,20 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
| 366 | lock_limit = rlimit(RLIMIT_MEMLOCK); | 372 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
| 367 | locked += new_len - old_len; | 373 | locked += new_len - old_len; |
| 368 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 374 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
| 369 | goto Eagain; | 375 | return ERR_PTR(-EAGAIN); |
| 370 | } | 376 | } |
| 371 | 377 | ||
| 372 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) | 378 | if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) |
| 373 | goto Enomem; | 379 | return ERR_PTR(-ENOMEM); |
| 374 | 380 | ||
| 375 | if (vma->vm_flags & VM_ACCOUNT) { | 381 | if (vma->vm_flags & VM_ACCOUNT) { |
| 376 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; | 382 | unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; |
| 377 | if (security_vm_enough_memory_mm(mm, charged)) | 383 | if (security_vm_enough_memory_mm(mm, charged)) |
| 378 | goto Efault; | 384 | return ERR_PTR(-ENOMEM); |
| 379 | *p = charged; | 385 | *p = charged; |
| 380 | } | 386 | } |
| 381 | 387 | ||
| 382 | return vma; | 388 | return vma; |
| 383 | |||
| 384 | Efault: /* very odd choice for most of the cases, but... */ | ||
| 385 | return ERR_PTR(-EFAULT); | ||
| 386 | Einval: | ||
| 387 | return ERR_PTR(-EINVAL); | ||
| 388 | Enomem: | ||
| 389 | return ERR_PTR(-ENOMEM); | ||
| 390 | Eagain: | ||
| 391 | return ERR_PTR(-EAGAIN); | ||
| 392 | } | 389 | } |
| 393 | 390 | ||
| 394 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, | 391 | static unsigned long mremap_to(unsigned long addr, unsigned long old_len, |
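vma_to_resize() above now reports failures by returning ERR_PTR(-EFAULT/-EINVAL/-EAGAIN/-ENOMEM) directly instead of jumping to per-error labels. Callers decode such pointers with the usual IS_ERR()/PTR_ERR() helpers; a sketch of that calling convention follows (the wrapper is illustrative, not the literal mremap_to() code, and the parameter names are abbreviated):

    #include <linux/err.h>
    #include <linux/mm.h>

    static long use_vma_to_resize(unsigned long addr, unsigned long old_len,
                                  unsigned long new_len, unsigned long *charged)
    {
            struct vm_area_struct *vma;

            vma = vma_to_resize(addr, old_len, new_len, charged);
            if (IS_ERR(vma))
                    return PTR_ERR(vma);    /* -EFAULT, -EINVAL, -EAGAIN or -ENOMEM */

            /* ... proceed with the move/expand using a valid vma ... */
            return 0;
    }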
diff --git a/mm/nommu.c b/mm/nommu.c index 3e67e7538ecf..e544508e2a4b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
| @@ -62,6 +62,7 @@ void *high_memory; | |||
| 62 | EXPORT_SYMBOL(high_memory); | 62 | EXPORT_SYMBOL(high_memory); |
| 63 | struct page *mem_map; | 63 | struct page *mem_map; |
| 64 | unsigned long max_mapnr; | 64 | unsigned long max_mapnr; |
| 65 | EXPORT_SYMBOL(max_mapnr); | ||
| 65 | unsigned long highest_memmap_pfn; | 66 | unsigned long highest_memmap_pfn; |
| 66 | struct percpu_counter vm_committed_as; | 67 | struct percpu_counter vm_committed_as; |
| 67 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | 68 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ |
| @@ -1015,7 +1016,7 @@ static int validate_mmap_request(struct file *file, | |||
| 1015 | * device */ | 1016 | * device */ |
| 1016 | if (!file->f_op->get_unmapped_area) | 1017 | if (!file->f_op->get_unmapped_area) |
| 1017 | capabilities &= ~NOMMU_MAP_DIRECT; | 1018 | capabilities &= ~NOMMU_MAP_DIRECT; |
| 1018 | if (!file->f_op->read) | 1019 | if (!(file->f_mode & FMODE_CAN_READ)) |
| 1019 | capabilities &= ~NOMMU_MAP_COPY; | 1020 | capabilities &= ~NOMMU_MAP_COPY; |
| 1020 | 1021 | ||
| 1021 | /* The file shall have been opened with read permission. */ | 1022 | /* The file shall have been opened with read permission. */ |
| @@ -1239,7 +1240,7 @@ static int do_mmap_private(struct vm_area_struct *vma, | |||
| 1239 | 1240 | ||
| 1240 | old_fs = get_fs(); | 1241 | old_fs = get_fs(); |
| 1241 | set_fs(KERNEL_DS); | 1242 | set_fs(KERNEL_DS); |
| 1242 | ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); | 1243 | ret = __vfs_read(vma->vm_file, base, len, &fpos); |
| 1243 | set_fs(old_fs); | 1244 | set_fs(old_fs); |
| 1244 | 1245 | ||
| 1245 | if (ret < 0) | 1246 | if (ret < 0) |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 642f38cb175a..2b665da1b3c9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -408,7 +408,7 @@ bool oom_killer_disabled __read_mostly; | |||
| 408 | static DECLARE_RWSEM(oom_sem); | 408 | static DECLARE_RWSEM(oom_sem); |
| 409 | 409 | ||
| 410 | /** | 410 | /** |
| 411 | * mark_tsk_oom_victim - marks the given taks as OOM victim. | 411 | * mark_tsk_oom_victim - marks the given task as OOM victim. |
| 412 | * @tsk: task to mark | 412 | * @tsk: task to mark |
| 413 | * | 413 | * |
| 414 | * Has to be called with oom_sem taken for read and never after | 414 | * Has to be called with oom_sem taken for read and never after |
| @@ -612,7 +612,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 612 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 612 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
| 613 | */ | 613 | */ |
| 614 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 614 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
| 615 | int order, const nodemask_t *nodemask) | 615 | int order, const nodemask_t *nodemask, |
| 616 | struct mem_cgroup *memcg) | ||
| 616 | { | 617 | { |
| 617 | if (likely(!sysctl_panic_on_oom)) | 618 | if (likely(!sysctl_panic_on_oom)) |
| 618 | return; | 619 | return; |
| @@ -625,7 +626,7 @@ void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
| 625 | if (constraint != CONSTRAINT_NONE) | 626 | if (constraint != CONSTRAINT_NONE) |
| 626 | return; | 627 | return; |
| 627 | } | 628 | } |
| 628 | dump_header(NULL, gfp_mask, order, NULL, nodemask); | 629 | dump_header(NULL, gfp_mask, order, memcg, nodemask); |
| 629 | panic("Out of memory: %s panic_on_oom is enabled\n", | 630 | panic("Out of memory: %s panic_on_oom is enabled\n", |
| 630 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 631 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
| 631 | } | 632 | } |
| @@ -740,7 +741,7 @@ static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
| 740 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask, | 741 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask, |
| 741 | &totalpages); | 742 | &totalpages); |
| 742 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 743 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; |
| 743 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); | 744 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask, NULL); |
| 744 | 745 | ||
| 745 | if (sysctl_oom_kill_allocating_task && current->mm && | 746 | if (sysctl_oom_kill_allocating_task && current->mm && |
| 746 | !oom_unkillable_task(current, NULL, nodemask) && | 747 | !oom_unkillable_task(current, NULL, nodemask) && |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 45e187b2d971..5daf5568b9e1 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -857,8 +857,11 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, | |||
| 857 | * bw * elapsed + write_bandwidth * (period - elapsed) | 857 | * bw * elapsed + write_bandwidth * (period - elapsed) |
| 858 | * write_bandwidth = --------------------------------------------------- | 858 | * write_bandwidth = --------------------------------------------------- |
| 859 | * period | 859 | * period |
| 860 | * | ||
| 861 | * @written may have decreased due to account_page_redirty(). | ||
| 862 | * Avoid underflowing @bw calculation. | ||
| 860 | */ | 863 | */ |
| 861 | bw = written - bdi->written_stamp; | 864 | bw = written - min(written, bdi->written_stamp); |
| 862 | bw *= HZ; | 865 | bw *= HZ; |
| 863 | if (unlikely(elapsed > period)) { | 866 | if (unlikely(elapsed > period)) { |
| 864 | do_div(bw, elapsed); | 867 | do_div(bw, elapsed); |
| @@ -922,7 +925,7 @@ static void global_update_bandwidth(unsigned long thresh, | |||
| 922 | unsigned long now) | 925 | unsigned long now) |
| 923 | { | 926 | { |
| 924 | static DEFINE_SPINLOCK(dirty_lock); | 927 | static DEFINE_SPINLOCK(dirty_lock); |
| 925 | static unsigned long update_time; | 928 | static unsigned long update_time = INITIAL_JIFFIES; |
| 926 | 929 | ||
| 927 | /* | 930 | /* |
| 928 | * check locklessly first to optimize away locking for the most time | 931 | * check locklessly first to optimize away locking for the most time |
| @@ -2108,6 +2111,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) | |||
| 2108 | EXPORT_SYMBOL(account_page_dirtied); | 2111 | EXPORT_SYMBOL(account_page_dirtied); |
| 2109 | 2112 | ||
| 2110 | /* | 2113 | /* |
| 2114 | * Helper function for de-accounting a dirty page without writeback. | ||
| 2115 | * | ||
| 2116 | * Doing this should *normally* only ever be done when a page | ||
| 2117 | * is truncated, and is not actually mapped anywhere at all. However, | ||
| 2118 | * fs/buffer.c does this when it notices that somebody has cleaned | ||
| 2119 | * out all the buffers on a page without actually doing it through | ||
| 2120 | * the VM. Can you say "ext3 is horribly ugly"? Thought you could. | ||
| 2121 | */ | ||
| 2122 | void account_page_cleaned(struct page *page, struct address_space *mapping) | ||
| 2123 | { | ||
| 2124 | if (mapping_cap_account_dirty(mapping)) { | ||
| 2125 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
| 2126 | dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); | ||
| 2127 | task_io_account_cancelled_write(PAGE_CACHE_SIZE); | ||
| 2128 | } | ||
| 2129 | } | ||
| 2130 | EXPORT_SYMBOL(account_page_cleaned); | ||
| 2131 | |||
| 2132 | /* | ||
| 2111 | * For address_spaces which do not use buffers. Just tag the page as dirty in | 2133 | * For address_spaces which do not use buffers. Just tag the page as dirty in |
| 2112 | * its radix tree. | 2134 | * its radix tree. |
| 2113 | * | 2135 | * |
| @@ -2206,7 +2228,8 @@ int set_page_dirty(struct page *page) | |||
| 2206 | * it will confuse readahead and make it restart the size rampup | 2228 | * it will confuse readahead and make it restart the size rampup |
| 2207 | * process. But it's a trivial problem. | 2229 | * process. But it's a trivial problem. |
| 2208 | */ | 2230 | */ |
| 2209 | ClearPageReclaim(page); | 2231 | if (PageReclaim(page)) |
| 2232 | ClearPageReclaim(page); | ||
| 2210 | #ifdef CONFIG_BLOCK | 2233 | #ifdef CONFIG_BLOCK |
| 2211 | if (!spd) | 2234 | if (!spd) |
| 2212 | spd = __set_page_dirty_buffers; | 2235 | spd = __set_page_dirty_buffers; |
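bdi_update_write_bandwidth() keeps a per-backing-device estimate that decays the old value over a sampling period, per the formula quoted in the comment above; the one-line change makes the sample saturate at zero when the written counter has moved backwards because of account_page_redirty(). A simplified, self-contained sketch of the arithmetic (plain integers stand in for the per-bdi state; HZ and the sample numbers are illustrative):

    #include <stdio.h>

    #define HZ 100  /* illustrative tick rate */

    /*
     * Simplified model of the update:
     *   bandwidth = (bw * elapsed + bandwidth * (period - elapsed)) / period
     * where bw is the rate observed since the last sample.
     */
    static unsigned long update_bw(unsigned long bandwidth, unsigned long period,
                                   unsigned long elapsed, unsigned long written,
                                   unsigned long written_stamp)
    {
            unsigned long long bw;

            /* written may have gone backwards (redirty accounting); avoid underflow */
            bw = written - (written < written_stamp ? written : written_stamp);
            bw *= HZ;
            bw /= elapsed ? elapsed : 1;    /* pages per second since last sample */

            return (bw * elapsed +
                    (unsigned long long)bandwidth * (period - elapsed)) / period;
    }

    int main(void)
    {
            /* e.g. 1000 pages written over half of a 200-tick period */
            printf("%lu pages/s\n", update_bw(400, 200, 100, 1000, 0));
            return 0;
    }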
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7abfa70cdc1a..ebffa0e4a9c0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -1032,11 +1032,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |||
| 1032 | static int fallbacks[MIGRATE_TYPES][4] = { | 1032 | static int fallbacks[MIGRATE_TYPES][4] = { |
| 1033 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 1033 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
| 1034 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | 1034 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, |
| 1035 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 1035 | #ifdef CONFIG_CMA | 1036 | #ifdef CONFIG_CMA |
| 1036 | [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 1037 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | 1037 | [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ |
| 1038 | #else | ||
| 1039 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 1040 | #endif | 1038 | #endif |
| 1041 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | 1039 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ |
| 1042 | #ifdef CONFIG_MEMORY_ISOLATION | 1040 | #ifdef CONFIG_MEMORY_ISOLATION |
| @@ -1044,6 +1042,17 @@ static int fallbacks[MIGRATE_TYPES][4] = { | |||
| 1044 | #endif | 1042 | #endif |
| 1045 | }; | 1043 | }; |
| 1046 | 1044 | ||
| 1045 | #ifdef CONFIG_CMA | ||
| 1046 | static struct page *__rmqueue_cma_fallback(struct zone *zone, | ||
| 1047 | unsigned int order) | ||
| 1048 | { | ||
| 1049 | return __rmqueue_smallest(zone, order, MIGRATE_CMA); | ||
| 1050 | } | ||
| 1051 | #else | ||
| 1052 | static inline struct page *__rmqueue_cma_fallback(struct zone *zone, | ||
| 1053 | unsigned int order) { return NULL; } | ||
| 1054 | #endif | ||
| 1055 | |||
| 1047 | /* | 1056 | /* |
| 1048 | * Move the free pages in a range to the free lists of the requested type. | 1057 | * Move the free pages in a range to the free lists of the requested type. |
| 1049 | * Note that start_page and end_pages are not aligned on a pageblock | 1058 | * Note that start_page and end_pages are not aligned on a pageblock |
| @@ -1136,14 +1145,40 @@ static void change_pageblock_range(struct page *pageblock_page, | |||
| 1136 | * as fragmentation caused by those allocations polluting movable pageblocks | 1145 | * as fragmentation caused by those allocations polluting movable pageblocks |
| 1137 | * is worse than movable allocations stealing from unmovable and reclaimable | 1146 | * is worse than movable allocations stealing from unmovable and reclaimable |
| 1138 | * pageblocks. | 1147 | * pageblocks. |
| 1139 | * | ||
| 1140 | * If we claim more than half of the pageblock, change pageblock's migratetype | ||
| 1141 | * as well. | ||
| 1142 | */ | 1148 | */ |
| 1143 | static void try_to_steal_freepages(struct zone *zone, struct page *page, | 1149 | static bool can_steal_fallback(unsigned int order, int start_mt) |
| 1144 | int start_type, int fallback_type) | 1150 | { |
| 1151 | /* | ||
| 1152 | * Leaving this order check in place is intentional, even though there is a | ||
| 1153 | * more relaxed check below. The reason is that we can steal the whole | ||
| 1154 | * pageblock if this condition is met, but the check below does not | ||
| 1155 | * guarantee that and is only a heuristic, so it could be changed at any | ||
| 1156 | * time. | ||
| 1157 | */ | ||
| 1158 | if (order >= pageblock_order) | ||
| 1159 | return true; | ||
| 1160 | |||
| 1161 | if (order >= pageblock_order / 2 || | ||
| 1162 | start_mt == MIGRATE_RECLAIMABLE || | ||
| 1163 | start_mt == MIGRATE_UNMOVABLE || | ||
| 1164 | page_group_by_mobility_disabled) | ||
| 1165 | return true; | ||
| 1166 | |||
| 1167 | return false; | ||
| 1168 | } | ||
| 1169 | |||
| 1170 | /* | ||
| 1171 | * This function implements the actual steal behaviour. If the order is large | ||
| 1172 | * enough, we can steal the whole pageblock. If not, we first move the free | ||
| 1173 | * pages in this pageblock and check whether half of the pages have been moved. | ||
| 1174 | * If so, we can change the migratetype of the pageblock and permanently | ||
| 1175 | * use its pages as the requested migratetype in the future. | ||
| 1176 | */ | ||
| 1177 | static void steal_suitable_fallback(struct zone *zone, struct page *page, | ||
| 1178 | int start_type) | ||
| 1145 | { | 1179 | { |
| 1146 | int current_order = page_order(page); | 1180 | int current_order = page_order(page); |
| 1181 | int pages; | ||
| 1147 | 1182 | ||
| 1148 | /* Take ownership for orders >= pageblock_order */ | 1183 | /* Take ownership for orders >= pageblock_order */ |
| 1149 | if (current_order >= pageblock_order) { | 1184 | if (current_order >= pageblock_order) { |
| @@ -1151,19 +1186,49 @@ static void try_to_steal_freepages(struct zone *zone, struct page *page, | |||
| 1151 | return; | 1186 | return; |
| 1152 | } | 1187 | } |
| 1153 | 1188 | ||
| 1154 | if (current_order >= pageblock_order / 2 || | 1189 | pages = move_freepages_block(zone, page, start_type); |
| 1155 | start_type == MIGRATE_RECLAIMABLE || | 1190 | |
| 1156 | start_type == MIGRATE_UNMOVABLE || | 1191 | /* Claim the whole block if over half of it is free */ |
| 1157 | page_group_by_mobility_disabled) { | 1192 | if (pages >= (1 << (pageblock_order-1)) || |
| 1158 | int pages; | 1193 | page_group_by_mobility_disabled) |
| 1194 | set_pageblock_migratetype(page, start_type); | ||
| 1195 | } | ||
| 1196 | |||
| 1197 | /* | ||
| 1198 | * Check whether there is a suitable fallback freepage with requested order. | ||
| 1199 | * If only_stealable is true, this function returns fallback_mt only if | ||
| 1200 | * we can steal other freepages all together. This would help to reduce | ||
| 1201 | * fragmentation due to mixed migratetype pages in one pageblock. | ||
| 1202 | */ | ||
| 1203 | int find_suitable_fallback(struct free_area *area, unsigned int order, | ||
| 1204 | int migratetype, bool only_stealable, bool *can_steal) | ||
| 1205 | { | ||
| 1206 | int i; | ||
| 1207 | int fallback_mt; | ||
| 1208 | |||
| 1209 | if (area->nr_free == 0) | ||
| 1210 | return -1; | ||
| 1211 | |||
| 1212 | *can_steal = false; | ||
| 1213 | for (i = 0;; i++) { | ||
| 1214 | fallback_mt = fallbacks[migratetype][i]; | ||
| 1215 | if (fallback_mt == MIGRATE_RESERVE) | ||
| 1216 | break; | ||
| 1217 | |||
| 1218 | if (list_empty(&area->free_list[fallback_mt])) | ||
| 1219 | continue; | ||
| 1159 | 1220 | ||
| 1160 | pages = move_freepages_block(zone, page, start_type); | 1221 | if (can_steal_fallback(order, migratetype)) |
| 1222 | *can_steal = true; | ||
| 1161 | 1223 | ||
| 1162 | /* Claim the whole block if over half of it is free */ | 1224 | if (!only_stealable) |
| 1163 | if (pages >= (1 << (pageblock_order-1)) || | 1225 | return fallback_mt; |
| 1164 | page_group_by_mobility_disabled) | 1226 | |
| 1165 | set_pageblock_migratetype(page, start_type); | 1227 | if (*can_steal) |
| 1228 | return fallback_mt; | ||
| 1166 | } | 1229 | } |
| 1230 | |||
| 1231 | return -1; | ||
| 1167 | } | 1232 | } |
| 1168 | 1233 | ||
| 1169 | /* Remove an element from the buddy allocator from the fallback list */ | 1234 | /* Remove an element from the buddy allocator from the fallback list */ |
| @@ -1173,64 +1238,45 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) | |||
| 1173 | struct free_area *area; | 1238 | struct free_area *area; |
| 1174 | unsigned int current_order; | 1239 | unsigned int current_order; |
| 1175 | struct page *page; | 1240 | struct page *page; |
| 1241 | int fallback_mt; | ||
| 1242 | bool can_steal; | ||
| 1176 | 1243 | ||
| 1177 | /* Find the largest possible block of pages in the other list */ | 1244 | /* Find the largest possible block of pages in the other list */ |
| 1178 | for (current_order = MAX_ORDER-1; | 1245 | for (current_order = MAX_ORDER-1; |
| 1179 | current_order >= order && current_order <= MAX_ORDER-1; | 1246 | current_order >= order && current_order <= MAX_ORDER-1; |
| 1180 | --current_order) { | 1247 | --current_order) { |
| 1181 | int i; | 1248 | area = &(zone->free_area[current_order]); |
| 1182 | for (i = 0;; i++) { | 1249 | fallback_mt = find_suitable_fallback(area, current_order, |
| 1183 | int migratetype = fallbacks[start_migratetype][i]; | 1250 | start_migratetype, false, &can_steal); |
| 1184 | int buddy_type = start_migratetype; | 1251 | if (fallback_mt == -1) |
| 1185 | 1252 | continue; | |
| 1186 | /* MIGRATE_RESERVE handled later if necessary */ | ||
| 1187 | if (migratetype == MIGRATE_RESERVE) | ||
| 1188 | break; | ||
| 1189 | |||
| 1190 | area = &(zone->free_area[current_order]); | ||
| 1191 | if (list_empty(&area->free_list[migratetype])) | ||
| 1192 | continue; | ||
| 1193 | |||
| 1194 | page = list_entry(area->free_list[migratetype].next, | ||
| 1195 | struct page, lru); | ||
| 1196 | area->nr_free--; | ||
| 1197 | |||
| 1198 | if (!is_migrate_cma(migratetype)) { | ||
| 1199 | try_to_steal_freepages(zone, page, | ||
| 1200 | start_migratetype, | ||
| 1201 | migratetype); | ||
| 1202 | } else { | ||
| 1203 | /* | ||
| 1204 | * When borrowing from MIGRATE_CMA, we need to | ||
| 1205 | * release the excess buddy pages to CMA | ||
| 1206 | * itself, and we do not try to steal extra | ||
| 1207 | * free pages. | ||
| 1208 | */ | ||
| 1209 | buddy_type = migratetype; | ||
| 1210 | } | ||
| 1211 | 1253 | ||
| 1212 | /* Remove the page from the freelists */ | 1254 | page = list_entry(area->free_list[fallback_mt].next, |
| 1213 | list_del(&page->lru); | 1255 | struct page, lru); |
| 1214 | rmv_page_order(page); | 1256 | if (can_steal) |
| 1257 | steal_suitable_fallback(zone, page, start_migratetype); | ||
| 1215 | 1258 | ||
| 1216 | expand(zone, page, order, current_order, area, | 1259 | /* Remove the page from the freelists */ |
| 1217 | buddy_type); | 1260 | area->nr_free--; |
| 1261 | list_del(&page->lru); | ||
| 1262 | rmv_page_order(page); | ||
| 1218 | 1263 | ||
| 1219 | /* | 1264 | expand(zone, page, order, current_order, area, |
| 1220 | * The freepage_migratetype may differ from pageblock's | 1265 | start_migratetype); |
| 1221 | * migratetype depending on the decisions in | 1266 | /* |
| 1222 | * try_to_steal_freepages(). This is OK as long as it | 1267 | * The freepage_migratetype may differ from pageblock's |
| 1223 | * does not differ for MIGRATE_CMA pageblocks. For CMA | 1268 | * migratetype depending on the decisions in |
| 1224 | * we need to make sure unallocated pages flushed from | 1269 | * try_to_steal_freepages(). This is OK as long as it |
| 1225 | * pcp lists are returned to the correct freelist. | 1270 | * does not differ for MIGRATE_CMA pageblocks. For CMA |
| 1226 | */ | 1271 | * we need to make sure unallocated pages flushed from |
| 1227 | set_freepage_migratetype(page, buddy_type); | 1272 | * pcp lists are returned to the correct freelist. |
| 1273 | */ | ||
| 1274 | set_freepage_migratetype(page, start_migratetype); | ||
| 1228 | 1275 | ||
| 1229 | trace_mm_page_alloc_extfrag(page, order, current_order, | 1276 | trace_mm_page_alloc_extfrag(page, order, current_order, |
| 1230 | start_migratetype, migratetype); | 1277 | start_migratetype, fallback_mt); |
| 1231 | 1278 | ||
| 1232 | return page; | 1279 | return page; |
| 1233 | } | ||
| 1234 | } | 1280 | } |
| 1235 | 1281 | ||
| 1236 | return NULL; | 1282 | return NULL; |
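After this refactor the fallback decision is split in two: find_suitable_fallback() scans fallbacks[] for a non-empty free list and reports via *can_steal whether can_steal_fallback() allows taking over the pageblock, while steal_suitable_fallback() does the actual move. MIGRATE_CMA also disappears from the movable fallback row because, as the __rmqueue() hunk that follows shows, movable allocations now try __rmqueue_cma_fallback() explicitly before the generic fallback path. A condensed sketch of the resulting allocation order (simplified; the real __rmqueue() retries via a goto and runs under the zone lock):

    /* Sketch only: condenses the order in which __rmqueue() tries free lists. */
    static struct page *rmqueue_order_sketch(struct zone *zone, unsigned int order,
                                             int migratetype)
    {
            struct page *page = __rmqueue_smallest(zone, order, migratetype);

            if (!page && migratetype != MIGRATE_RESERVE) {
                    if (migratetype == MIGRATE_MOVABLE)
                            page = __rmqueue_cma_fallback(zone, order); /* CMA first */
                    if (!page)
                            page = __rmqueue_fallback(zone, order, migratetype);
                    if (!page)      /* last resort rather than failing outright */
                            page = __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
            }
            return page;
    }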
| @@ -1249,7 +1295,11 @@ retry_reserve: | |||
| 1249 | page = __rmqueue_smallest(zone, order, migratetype); | 1295 | page = __rmqueue_smallest(zone, order, migratetype); |
| 1250 | 1296 | ||
| 1251 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { | 1297 | if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { |
| 1252 | page = __rmqueue_fallback(zone, order, migratetype); | 1298 | if (migratetype == MIGRATE_MOVABLE) |
| 1299 | page = __rmqueue_cma_fallback(zone, order); | ||
| 1300 | |||
| 1301 | if (!page) | ||
| 1302 | page = __rmqueue_fallback(zone, order, migratetype); | ||
| 1253 | 1303 | ||
| 1254 | /* | 1304 | /* |
| 1255 | * Use MIGRATE_RESERVE rather than fail an allocation. goto | 1305 | * Use MIGRATE_RESERVE rather than fail an allocation. goto |
| @@ -1321,7 +1371,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
| 1321 | int to_drain, batch; | 1371 | int to_drain, batch; |
| 1322 | 1372 | ||
| 1323 | local_irq_save(flags); | 1373 | local_irq_save(flags); |
| 1324 | batch = ACCESS_ONCE(pcp->batch); | 1374 | batch = READ_ONCE(pcp->batch); |
| 1325 | to_drain = min(pcp->count, batch); | 1375 | to_drain = min(pcp->count, batch); |
| 1326 | if (to_drain > 0) { | 1376 | if (to_drain > 0) { |
| 1327 | free_pcppages_bulk(zone, to_drain, pcp); | 1377 | free_pcppages_bulk(zone, to_drain, pcp); |
| @@ -1520,7 +1570,7 @@ void free_hot_cold_page(struct page *page, bool cold) | |||
| 1520 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1570 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
| 1521 | pcp->count++; | 1571 | pcp->count++; |
| 1522 | if (pcp->count >= pcp->high) { | 1572 | if (pcp->count >= pcp->high) { |
| 1523 | unsigned long batch = ACCESS_ONCE(pcp->batch); | 1573 | unsigned long batch = READ_ONCE(pcp->batch); |
| 1524 | free_pcppages_bulk(zone, batch, pcp); | 1574 | free_pcppages_bulk(zone, batch, pcp); |
| 1525 | pcp->count -= batch; | 1575 | pcp->count -= batch; |
| 1526 | } | 1576 | } |
| @@ -2362,18 +2412,13 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, | |||
| 2362 | *did_some_progress = 1; | 2412 | *did_some_progress = 1; |
| 2363 | goto out; | 2413 | goto out; |
| 2364 | } | 2414 | } |
| 2365 | /* | 2415 | /* The OOM killer may not free memory on a specific node */ |
| 2366 | * GFP_THISNODE contains __GFP_NORETRY and we never hit this. | ||
| 2367 | * Sanity check for bare calls of __GFP_THISNODE, not real OOM. | ||
| 2368 | * The caller should handle page allocation failure by itself if | ||
| 2369 | * it specifies __GFP_THISNODE. | ||
| 2370 | * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. | ||
| 2371 | */ | ||
| 2372 | if (gfp_mask & __GFP_THISNODE) | 2416 | if (gfp_mask & __GFP_THISNODE) |
| 2373 | goto out; | 2417 | goto out; |
| 2374 | } | 2418 | } |
| 2375 | /* Exhausted what can be done so it's blamo time */ | 2419 | /* Exhausted what can be done so it's blamo time */ |
| 2376 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)) | 2420 | if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false) |
| 2421 | || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) | ||
| 2377 | *did_some_progress = 1; | 2422 | *did_some_progress = 1; |
| 2378 | out: | 2423 | out: |
| 2379 | oom_zonelist_unlock(ac->zonelist, gfp_mask); | 2424 | oom_zonelist_unlock(ac->zonelist, gfp_mask); |
| @@ -2622,15 +2667,11 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
| 2622 | } | 2667 | } |
| 2623 | 2668 | ||
| 2624 | /* | 2669 | /* |
| 2625 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 2670 | * If this allocation cannot block and it is for a specific node, then |
| 2626 | * __GFP_NOWARN set) should not cause reclaim since the subsystem | 2671 | * fail early. There's no need to wakeup kswapd or retry for a |
| 2627 | * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim | 2672 | * speculative node-specific allocation. |
| 2628 | * using a larger set of nodes after it has established that the | ||
| 2629 | * allowed per node queues are empty and that nodes are | ||
| 2630 | * over allocated. | ||
| 2631 | */ | 2673 | */ |
| 2632 | if (IS_ENABLED(CONFIG_NUMA) && | 2674 | if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait) |
| 2633 | (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | ||
| 2634 | goto nopage; | 2675 | goto nopage; |
| 2635 | 2676 | ||
| 2636 | retry: | 2677 | retry: |
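The slowpath check above changes meaning: the old test matched the composite GFP_THISNODE mask (i.e. __GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN) exactly, while the new one fails any node-bound allocation that cannot sleep. A small model of the two predicates, using made-up flag values rather than the real gfp.h definitions:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag values only; the real masks live in gfp.h. */
#define F_WAIT         0x1u
#define F_THISNODE     0x2u
#define F_NORETRY      0x4u
#define F_NOWARN       0x8u
#define F_GFP_THISNODE (F_THISNODE | F_NORETRY | F_NOWARN)	/* old composite */

/* Old bail-out: only an exact match of the composite mask skipped the slowpath. */
static bool old_nopage(unsigned int gfp)
{
	return (gfp & F_GFP_THISNODE) == F_GFP_THISNODE;
}

/* New bail-out: any node-bound request that cannot sleep fails early. */
static bool new_nopage(unsigned int gfp)
{
	return (gfp & F_THISNODE) && !(gfp & F_WAIT);
}

int main(void)
{
	unsigned int atomic_thisnode = F_THISNODE;	/* no __GFP_WAIT */

	printf("old check bails out: %d, new check bails out: %d\n",
	       old_nopage(atomic_thisnode), new_nopage(atomic_thisnode));
	return 0;
}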
| @@ -2823,7 +2864,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
| 2823 | /* | 2864 | /* |
| 2824 | * Check the zones suitable for the gfp_mask contain at least one | 2865 | * Check the zones suitable for the gfp_mask contain at least one |
| 2825 | * valid zone. It's possible to have an empty zonelist as a result | 2866 | * valid zone. It's possible to have an empty zonelist as a result |
| 2826 | * of GFP_THISNODE and a memoryless node | 2867 | * of __GFP_THISNODE and a memoryless node |
| 2827 | */ | 2868 | */ |
| 2828 | if (unlikely(!zonelist->_zonerefs->zone)) | 2869 | if (unlikely(!zonelist->_zonerefs->zone)) |
| 2829 | return NULL; | 2870 | return NULL; |
| @@ -3200,38 +3241,31 @@ static void show_migration_types(unsigned char type) | |||
| 3200 | * Show free area list (used inside shift_scroll-lock stuff) | 3241 | * Show free area list (used inside shift_scroll-lock stuff) |
| 3201 | * We also calculate the percentage fragmentation. We do this by counting the | 3242 | * We also calculate the percentage fragmentation. We do this by counting the |
| 3202 | * memory on each free list with the exception of the first item on the list. | 3243 | * memory on each free list with the exception of the first item on the list. |
| 3203 | * Suppresses nodes that are not allowed by current's cpuset if | 3244 | * |
| 3204 | * SHOW_MEM_FILTER_NODES is passed. | 3245 | * Bits in @filter: |
| 3246 | * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's | ||
| 3247 | * cpuset. | ||
| 3205 | */ | 3248 | */ |
| 3206 | void show_free_areas(unsigned int filter) | 3249 | void show_free_areas(unsigned int filter) |
| 3207 | { | 3250 | { |
| 3251 | unsigned long free_pcp = 0; | ||
| 3208 | int cpu; | 3252 | int cpu; |
| 3209 | struct zone *zone; | 3253 | struct zone *zone; |
| 3210 | 3254 | ||
| 3211 | for_each_populated_zone(zone) { | 3255 | for_each_populated_zone(zone) { |
| 3212 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3256 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
| 3213 | continue; | 3257 | continue; |
| 3214 | show_node(zone); | ||
| 3215 | printk("%s per-cpu:\n", zone->name); | ||
| 3216 | 3258 | ||
| 3217 | for_each_online_cpu(cpu) { | 3259 | for_each_online_cpu(cpu) |
| 3218 | struct per_cpu_pageset *pageset; | 3260 | free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; |
| 3219 | |||
| 3220 | pageset = per_cpu_ptr(zone->pageset, cpu); | ||
| 3221 | |||
| 3222 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | ||
| 3223 | cpu, pageset->pcp.high, | ||
| 3224 | pageset->pcp.batch, pageset->pcp.count); | ||
| 3225 | } | ||
| 3226 | } | 3261 | } |
| 3227 | 3262 | ||
| 3228 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" | 3263 | printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" |
| 3229 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" | 3264 | " active_file:%lu inactive_file:%lu isolated_file:%lu\n" |
| 3230 | " unevictable:%lu" | 3265 | " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" |
| 3231 | " dirty:%lu writeback:%lu unstable:%lu\n" | 3266 | " slab_reclaimable:%lu slab_unreclaimable:%lu\n" |
| 3232 | " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" | ||
| 3233 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" | 3267 | " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" |
| 3234 | " free_cma:%lu\n", | 3268 | " free:%lu free_pcp:%lu free_cma:%lu\n", |
| 3235 | global_page_state(NR_ACTIVE_ANON), | 3269 | global_page_state(NR_ACTIVE_ANON), |
| 3236 | global_page_state(NR_INACTIVE_ANON), | 3270 | global_page_state(NR_INACTIVE_ANON), |
| 3237 | global_page_state(NR_ISOLATED_ANON), | 3271 | global_page_state(NR_ISOLATED_ANON), |
| @@ -3242,13 +3276,14 @@ void show_free_areas(unsigned int filter) | |||
| 3242 | global_page_state(NR_FILE_DIRTY), | 3276 | global_page_state(NR_FILE_DIRTY), |
| 3243 | global_page_state(NR_WRITEBACK), | 3277 | global_page_state(NR_WRITEBACK), |
| 3244 | global_page_state(NR_UNSTABLE_NFS), | 3278 | global_page_state(NR_UNSTABLE_NFS), |
| 3245 | global_page_state(NR_FREE_PAGES), | ||
| 3246 | global_page_state(NR_SLAB_RECLAIMABLE), | 3279 | global_page_state(NR_SLAB_RECLAIMABLE), |
| 3247 | global_page_state(NR_SLAB_UNRECLAIMABLE), | 3280 | global_page_state(NR_SLAB_UNRECLAIMABLE), |
| 3248 | global_page_state(NR_FILE_MAPPED), | 3281 | global_page_state(NR_FILE_MAPPED), |
| 3249 | global_page_state(NR_SHMEM), | 3282 | global_page_state(NR_SHMEM), |
| 3250 | global_page_state(NR_PAGETABLE), | 3283 | global_page_state(NR_PAGETABLE), |
| 3251 | global_page_state(NR_BOUNCE), | 3284 | global_page_state(NR_BOUNCE), |
| 3285 | global_page_state(NR_FREE_PAGES), | ||
| 3286 | free_pcp, | ||
| 3252 | global_page_state(NR_FREE_CMA_PAGES)); | 3287 | global_page_state(NR_FREE_CMA_PAGES)); |
| 3253 | 3288 | ||
| 3254 | for_each_populated_zone(zone) { | 3289 | for_each_populated_zone(zone) { |
| @@ -3256,6 +3291,11 @@ void show_free_areas(unsigned int filter) | |||
| 3256 | 3291 | ||
| 3257 | if (skip_free_areas_node(filter, zone_to_nid(zone))) | 3292 | if (skip_free_areas_node(filter, zone_to_nid(zone))) |
| 3258 | continue; | 3293 | continue; |
| 3294 | |||
| 3295 | free_pcp = 0; | ||
| 3296 | for_each_online_cpu(cpu) | ||
| 3297 | free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count; | ||
| 3298 | |||
| 3259 | show_node(zone); | 3299 | show_node(zone); |
| 3260 | printk("%s" | 3300 | printk("%s" |
| 3261 | " free:%lukB" | 3301 | " free:%lukB" |
| @@ -3282,6 +3322,8 @@ void show_free_areas(unsigned int filter) | |||
| 3282 | " pagetables:%lukB" | 3322 | " pagetables:%lukB" |
| 3283 | " unstable:%lukB" | 3323 | " unstable:%lukB" |
| 3284 | " bounce:%lukB" | 3324 | " bounce:%lukB" |
| 3325 | " free_pcp:%lukB" | ||
| 3326 | " local_pcp:%ukB" | ||
| 3285 | " free_cma:%lukB" | 3327 | " free_cma:%lukB" |
| 3286 | " writeback_tmp:%lukB" | 3328 | " writeback_tmp:%lukB" |
| 3287 | " pages_scanned:%lu" | 3329 | " pages_scanned:%lu" |
| @@ -3313,6 +3355,8 @@ void show_free_areas(unsigned int filter) | |||
| 3313 | K(zone_page_state(zone, NR_PAGETABLE)), | 3355 | K(zone_page_state(zone, NR_PAGETABLE)), |
| 3314 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), | 3356 | K(zone_page_state(zone, NR_UNSTABLE_NFS)), |
| 3315 | K(zone_page_state(zone, NR_BOUNCE)), | 3357 | K(zone_page_state(zone, NR_BOUNCE)), |
| 3358 | K(free_pcp), | ||
| 3359 | K(this_cpu_read(zone->pageset->pcp.count)), | ||
| 3316 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), | 3360 | K(zone_page_state(zone, NR_FREE_CMA_PAGES)), |
| 3317 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 3361 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
| 3318 | K(zone_page_state(zone, NR_PAGES_SCANNED)), | 3362 | K(zone_page_state(zone, NR_PAGES_SCANNED)), |
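show_free_areas() now reports aggregated free_pcp / local_pcp figures instead of one hi/btch/usd line per CPU; the new numbers are simply pcp.count summed over the online CPUs, once globally and once per zone. A compact userspace model of that aggregation (zone layout and values are invented):

#include <stdio.h>

#define NR_CPUS_MODEL 4

/* Toy stand-in for struct per_cpu_pages: only the free-page count. */
struct pcp_model {
	unsigned long count;
};

struct zone_model {
	const char *name;
	struct pcp_model pageset[NR_CPUS_MODEL];
};

/* Per-zone free_pcp: sum of the per-CPU free page counts. */
static unsigned long zone_free_pcp(const struct zone_model *z)
{
	unsigned long sum = 0;

	for (int cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
		sum += z->pageset[cpu].count;
	return sum;
}

int main(void)
{
	struct zone_model zones[] = {
		{ "DMA32",  { { 12 }, { 3 },  { 0 },  { 7 } } },
		{ "Normal", { { 40 }, { 25 }, { 31 }, { 9 } } },
	};
	unsigned long global_free_pcp = 0;

	for (int i = 0; i < 2; i++) {
		unsigned long z = zone_free_pcp(&zones[i]);

		global_free_pcp += z;
		/* K(): pages -> kB, assuming 4 KiB pages. */
		printf("%s free_pcp:%lukB\n", zones[i].name, z * 4);
	}
	printf("global free_pcp pages: %lu\n", global_free_pcp);
	return 0;
}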
| @@ -5716,7 +5760,7 @@ static void __setup_per_zone_wmarks(void) | |||
| 5716 | * value here. | 5760 | * value here. |
| 5717 | * | 5761 | * |
| 5718 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) | 5762 | * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) |
| 5719 | * deltas controls asynch page reclaim, and so should | 5763 | * deltas control asynch page reclaim, and so should |
| 5720 | * not be capped for highmem. | 5764 | * not be capped for highmem. |
| 5721 | */ | 5765 | */ |
| 5722 | unsigned long min_pages; | 5766 | unsigned long min_pages; |
| @@ -6163,7 +6207,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, | |||
| 6163 | mask <<= (BITS_PER_LONG - bitidx - 1); | 6207 | mask <<= (BITS_PER_LONG - bitidx - 1); |
| 6164 | flags <<= (BITS_PER_LONG - bitidx - 1); | 6208 | flags <<= (BITS_PER_LONG - bitidx - 1); |
| 6165 | 6209 | ||
| 6166 | word = ACCESS_ONCE(bitmap[word_bitidx]); | 6210 | word = READ_ONCE(bitmap[word_bitidx]); |
| 6167 | for (;;) { | 6211 | for (;;) { |
| 6168 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); | 6212 | old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); |
| 6169 | if (word == old_word) | 6213 | if (word == old_word) |
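set_pfnblock_flags_mask() seeds its loop with a READ_ONCE() snapshot of the bitmap word and retries the cmpxchg() until it observes the value it started from. The same lock-free read-modify-write pattern expressed with portable C11 atomics (a sketch, not the kernel code):

#include <stdatomic.h>
#include <stdio.h>

/*
 * Update a few bits of a shared word without a lock: take a snapshot,
 * compute the new value, and retry if another writer slipped in between.
 */
static void set_flags_mask(_Atomic unsigned long *word,
			   unsigned long flags, unsigned long mask)
{
	unsigned long old = atomic_load_explicit(word, memory_order_relaxed);
	unsigned long new;

	do {
		new = (old & ~mask) | flags;
		/* On failure 'old' is refreshed with the current value. */
	} while (!atomic_compare_exchange_weak(word, &old, new));
}

int main(void)
{
	_Atomic unsigned long bitmap_word = 0xf0f0UL;

	set_flags_mask(&bitmap_word, 0x5UL, 0xfUL);
	printf("word is now %#lx\n",
	       atomic_load_explicit(&bitmap_word, memory_order_relaxed));
	return 0;
}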
diff --git a/mm/page_io.c b/mm/page_io.c index e6045804c8d8..6424869e275e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
| @@ -20,8 +20,8 @@ | |||
| 20 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
| 21 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
| 22 | #include <linux/frontswap.h> | 22 | #include <linux/frontswap.h> |
| 23 | #include <linux/aio.h> | ||
| 24 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
| 24 | #include <linux/uio.h> | ||
| 25 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
| 26 | 26 | ||
| 27 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 27 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
| @@ -274,13 +274,10 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, | |||
| 274 | iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); | 274 | iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); |
| 275 | init_sync_kiocb(&kiocb, swap_file); | 275 | init_sync_kiocb(&kiocb, swap_file); |
| 276 | kiocb.ki_pos = page_file_offset(page); | 276 | kiocb.ki_pos = page_file_offset(page); |
| 277 | kiocb.ki_nbytes = PAGE_SIZE; | ||
| 278 | 277 | ||
| 279 | set_page_writeback(page); | 278 | set_page_writeback(page); |
| 280 | unlock_page(page); | 279 | unlock_page(page); |
| 281 | ret = mapping->a_ops->direct_IO(ITER_BVEC | WRITE, | 280 | ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos); |
| 282 | &kiocb, &from, | ||
| 283 | kiocb.ki_pos); | ||
| 284 | if (ret == PAGE_SIZE) { | 281 | if (ret == PAGE_SIZE) { |
| 285 | count_vm_event(PSWPOUT); | 282 | count_vm_event(PSWPOUT); |
| 286 | ret = 0; | 283 | ret = 0; |
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 72f5ac381ab3..755a42c76eb4 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
| @@ -103,6 +103,7 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) | |||
| 103 | 103 | ||
| 104 | if (!is_migrate_isolate_page(buddy)) { | 104 | if (!is_migrate_isolate_page(buddy)) { |
| 105 | __isolate_free_page(page, order); | 105 | __isolate_free_page(page, order); |
| 106 | kernel_map_pages(page, (1 << order), 1); | ||
| 106 | set_page_refcounted(page); | 107 | set_page_refcounted(page); |
| 107 | isolated_page = page; | 108 | isolated_page = page; |
| 108 | } | 109 | } |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 75c1f2878519..29f2f8b853ae 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
| @@ -265,8 +265,15 @@ int walk_page_range(unsigned long start, unsigned long end, | |||
| 265 | vma = vma->vm_next; | 265 | vma = vma->vm_next; |
| 266 | 266 | ||
| 267 | err = walk_page_test(start, next, walk); | 267 | err = walk_page_test(start, next, walk); |
| 268 | if (err > 0) | 268 | if (err > 0) { |
| 269 | /* | ||
| 270 | * positive return values are purely for | ||
| 271 | * controlling the pagewalk, so should never | ||
| 272 | * be passed to the callers. | ||
| 273 | */ | ||
| 274 | err = 0; | ||
| 269 | continue; | 275 | continue; |
| 276 | } | ||
| 270 | if (err < 0) | 277 | if (err < 0) |
| 271 | break; | 278 | break; |
| 272 | } | 279 | } |
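In the page walker, a positive return from the test callback means "skip this range", a negative one is a real error, and zero means "walk it"; the fix above resets the positive case so it can never leak out of walk_page_range(). A userspace sketch of that convention, with invented callback and VMA types:

#include <stdio.h>

struct vma_model { int skip_me; };

/* Hypothetical test callback: >0 skip the range, <0 error, 0 walk it. */
static int test_walk_model(const struct vma_model *vma)
{
	return vma->skip_me ? 1 : 0;
}

static int walk_range_model(const struct vma_model *vmas, int nr)
{
	int err = 0;

	for (int i = 0; i < nr; i++) {
		err = test_walk_model(&vmas[i]);
		if (err > 0) {
			/*
			 * Positive values only steer the walk; reset them
			 * so they are never returned to the caller.
			 */
			err = 0;
			continue;
		}
		if (err < 0)
			break;
		printf("walking vma %d\n", i);
	}
	return err;
}

int main(void)
{
	const struct vma_model vmas[] = { { 0 }, { 1 }, { 0 } };

	return walk_range_model(vmas, 3) ? 1 : 0;
}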
diff --git a/mm/percpu.c b/mm/percpu.c index 73c97a5f4495..dfd02484e8de 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
| @@ -1310,7 +1310,7 @@ bool is_kernel_percpu_address(unsigned long addr) | |||
| 1310 | * and, from the second one, the backing allocator (currently either vm or | 1310 | * and, from the second one, the backing allocator (currently either vm or |
| 1311 | * km) provides translation. | 1311 | * km) provides translation. |
| 1312 | * | 1312 | * |
| 1313 | * The addr can be tranlated simply without checking if it falls into the | 1313 | * The addr can be translated simply without checking if it falls into the |
| 1314 | * first chunk. But the current code reflects better how percpu allocator | 1314 | * first chunk. But the current code reflects better how percpu allocator |
| 1315 | * actually works, and the verification can discover both bugs in percpu | 1315 | * actually works, and the verification can discover both bugs in percpu |
| 1316 | * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current | 1316 | * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current |
| @@ -1762,7 +1762,7 @@ early_param("percpu_alloc", percpu_alloc_setup); | |||
| 1762 | * and other parameters considering needed percpu size, allocation | 1762 | * and other parameters considering needed percpu size, allocation |
| 1763 | * atom size and distances between CPUs. | 1763 | * atom size and distances between CPUs. |
| 1764 | * | 1764 | * |
| 1765 | * Groups are always mutliples of atom size and CPUs which are of | 1765 | * Groups are always multiples of atom size and CPUs which are of |
| 1766 | * LOCAL_DISTANCE both ways are grouped together and share space for | 1766 | * LOCAL_DISTANCE both ways are grouped together and share space for |
| 1767 | * units in the same group. The returned configuration is guaranteed | 1767 | * units in the same group. The returned configuration is guaranteed |
| 1768 | * to have CPUs on different nodes on different groups and >=75% usage | 1768 | * to have CPUs on different nodes on different groups and >=75% usage |
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index b1597690530c..e88d071648c2 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c | |||
| @@ -257,22 +257,18 @@ static ssize_t process_vm_rw(pid_t pid, | |||
| 257 | struct iovec *iov_r = iovstack_r; | 257 | struct iovec *iov_r = iovstack_r; |
| 258 | struct iov_iter iter; | 258 | struct iov_iter iter; |
| 259 | ssize_t rc; | 259 | ssize_t rc; |
| 260 | int dir = vm_write ? WRITE : READ; | ||
| 260 | 261 | ||
| 261 | if (flags != 0) | 262 | if (flags != 0) |
| 262 | return -EINVAL; | 263 | return -EINVAL; |
| 263 | 264 | ||
| 264 | /* Check iovecs */ | 265 | /* Check iovecs */ |
| 265 | if (vm_write) | 266 | rc = import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); |
| 266 | rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, | 267 | if (rc < 0) |
| 267 | iovstack_l, &iov_l); | 268 | return rc; |
| 268 | else | 269 | if (!iov_iter_count(&iter)) |
| 269 | rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, | ||
| 270 | iovstack_l, &iov_l); | ||
| 271 | if (rc <= 0) | ||
| 272 | goto free_iovecs; | 270 | goto free_iovecs; |
| 273 | 271 | ||
| 274 | iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); | ||
| 275 | |||
| 276 | rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, | 272 | rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV, |
| 277 | iovstack_r, &iov_r); | 273 | iovstack_r, &iov_r); |
| 278 | if (rc <= 0) | 274 | if (rc <= 0) |
| @@ -283,8 +279,7 @@ static ssize_t process_vm_rw(pid_t pid, | |||
| 283 | free_iovecs: | 279 | free_iovecs: |
| 284 | if (iov_r != iovstack_r) | 280 | if (iov_r != iovstack_r) |
| 285 | kfree(iov_r); | 281 | kfree(iov_r); |
| 286 | if (iov_l != iovstack_l) | 282 | kfree(iov_l); |
| 287 | kfree(iov_l); | ||
| 288 | 283 | ||
| 289 | return rc; | 284 | return rc; |
| 290 | } | 285 | } |
| @@ -320,21 +315,16 @@ compat_process_vm_rw(compat_pid_t pid, | |||
| 320 | struct iovec *iov_r = iovstack_r; | 315 | struct iovec *iov_r = iovstack_r; |
| 321 | struct iov_iter iter; | 316 | struct iov_iter iter; |
| 322 | ssize_t rc = -EFAULT; | 317 | ssize_t rc = -EFAULT; |
| 318 | int dir = vm_write ? WRITE : READ; | ||
| 323 | 319 | ||
| 324 | if (flags != 0) | 320 | if (flags != 0) |
| 325 | return -EINVAL; | 321 | return -EINVAL; |
| 326 | 322 | ||
| 327 | if (vm_write) | 323 | rc = compat_import_iovec(dir, lvec, liovcnt, UIO_FASTIOV, &iov_l, &iter); |
| 328 | rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, | 324 | if (rc < 0) |
| 329 | UIO_FASTIOV, iovstack_l, | 325 | return rc; |
| 330 | &iov_l); | 326 | if (!iov_iter_count(&iter)) |
| 331 | else | ||
| 332 | rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, | ||
| 333 | UIO_FASTIOV, iovstack_l, | ||
| 334 | &iov_l); | ||
| 335 | if (rc <= 0) | ||
| 336 | goto free_iovecs; | 327 | goto free_iovecs; |
| 337 | iov_iter_init(&iter, vm_write ? WRITE : READ, iov_l, liovcnt, rc); | ||
| 338 | rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, | 328 | rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, |
| 339 | UIO_FASTIOV, iovstack_r, | 329 | UIO_FASTIOV, iovstack_r, |
| 340 | &iov_r); | 330 | &iov_r); |
| @@ -346,8 +336,7 @@ compat_process_vm_rw(compat_pid_t pid, | |||
| 346 | free_iovecs: | 336 | free_iovecs: |
| 347 | if (iov_r != iovstack_r) | 337 | if (iov_r != iovstack_r) |
| 348 | kfree(iov_r); | 338 | kfree(iov_r); |
| 349 | if (iov_l != iovstack_l) | 339 | kfree(iov_l); |
| 350 | kfree(iov_l); | ||
| 351 | return rc; | 340 | return rc; |
| 352 | } | 341 | } |
| 353 | 342 | ||
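Both conversions above rely on the import_iovec()/compat_import_iovec() contract: the helper validates the user iovec, initializes the iov_iter, and leaves the returned pointer NULL whenever the small on-stack array was sufficient, so the cleanup path may kfree() it unconditionally (hence the dropped iovstack_l comparison). A userspace model of that ownership rule, with invented names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#define FAST_SEGS 8

struct seg { void *base; size_t len; };

/*
 * Model of the import step: copy nr segments, reusing the caller's small
 * on-stack array when it is big enough.  *out is set to the heap copy, or
 * left NULL when the stack array was used -- so free(*out) is always safe.
 */
static ssize_t import_segs_model(const struct seg *src, size_t nr,
				 struct seg *stack, struct seg **out)
{
	struct seg *dst = stack;

	*out = NULL;
	if (nr > FAST_SEGS) {
		dst = calloc(nr, sizeof(*dst));
		if (!dst)
			return -1;
		*out = dst;
	}
	memcpy(dst, src, nr * sizeof(*dst));
	return (ssize_t)nr;
}

int main(void)
{
	struct seg stack[FAST_SEGS], *heap;
	struct seg src[3] = { { 0 } };

	if (import_segs_model(src, 3, stack, &heap) < 0)
		return 1;
	/* ... use the imported segments ... */
	free(heap);	/* NULL here, since the fast array sufficed */
	return 0;
}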
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -287,6 +287,13 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | |||
| 287 | return 0; | 287 | return 0; |
| 288 | 288 | ||
| 289 | enomem_failure: | 289 | enomem_failure: |
| 290 | /* | ||
| 291 | * dst->anon_vma is dropped here otherwise its degree can be incorrectly | ||
| 292 | * decremented in unlink_anon_vmas(). | ||
| 293 | * We can safely do this because callers of anon_vma_clone() don't care | ||
| 294 | * about dst->anon_vma if anon_vma_clone() failed. | ||
| 295 | */ | ||
| 296 | dst->anon_vma = NULL; | ||
| 290 | unlink_anon_vmas(dst); | 297 | unlink_anon_vmas(dst); |
| 291 | return -ENOMEM; | 298 | return -ENOMEM; |
| 292 | } | 299 | } |
| @@ -449,7 +456,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) | |||
| 449 | unsigned long anon_mapping; | 456 | unsigned long anon_mapping; |
| 450 | 457 | ||
| 451 | rcu_read_lock(); | 458 | rcu_read_lock(); |
| 452 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 459 | anon_mapping = (unsigned long)READ_ONCE(page->mapping); |
| 453 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 460 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
| 454 | goto out; | 461 | goto out; |
| 455 | if (!page_mapped(page)) | 462 | if (!page_mapped(page)) |
| @@ -493,14 +500,14 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) | |||
| 493 | unsigned long anon_mapping; | 500 | unsigned long anon_mapping; |
| 494 | 501 | ||
| 495 | rcu_read_lock(); | 502 | rcu_read_lock(); |
| 496 | anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); | 503 | anon_mapping = (unsigned long)READ_ONCE(page->mapping); |
| 497 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | 504 | if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) |
| 498 | goto out; | 505 | goto out; |
| 499 | if (!page_mapped(page)) | 506 | if (!page_mapped(page)) |
| 500 | goto out; | 507 | goto out; |
| 501 | 508 | ||
| 502 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); | 509 | anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); |
| 503 | root_anon_vma = ACCESS_ONCE(anon_vma->root); | 510 | root_anon_vma = READ_ONCE(anon_vma->root); |
| 504 | if (down_read_trylock(&root_anon_vma->rwsem)) { | 511 | if (down_read_trylock(&root_anon_vma->rwsem)) { |
| 505 | /* | 512 | /* |
| 506 | * If the page is still mapped, then this anon_vma is still | 513 | * If the page is still mapped, then this anon_vma is still |
diff --git a/mm/shmem.c b/mm/shmem.c index cf2d0ca010bc..de981370fbc5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -31,7 +31,7 @@ | |||
| 31 | #include <linux/mm.h> | 31 | #include <linux/mm.h> |
| 32 | #include <linux/export.h> | 32 | #include <linux/export.h> |
| 33 | #include <linux/swap.h> | 33 | #include <linux/swap.h> |
| 34 | #include <linux/aio.h> | 34 | #include <linux/uio.h> |
| 35 | 35 | ||
| 36 | static struct vfsmount *shm_mnt; | 36 | static struct vfsmount *shm_mnt; |
| 37 | 37 | ||
| @@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); | |||
| 544 | 544 | ||
| 545 | static int shmem_setattr(struct dentry *dentry, struct iattr *attr) | 545 | static int shmem_setattr(struct dentry *dentry, struct iattr *attr) |
| 546 | { | 546 | { |
| 547 | struct inode *inode = dentry->d_inode; | 547 | struct inode *inode = d_inode(dentry); |
| 548 | struct shmem_inode_info *info = SHMEM_I(inode); | 548 | struct shmem_inode_info *info = SHMEM_I(inode); |
| 549 | int error; | 549 | int error; |
| 550 | 550 | ||
| @@ -2274,7 +2274,7 @@ static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 2274 | */ | 2274 | */ |
| 2275 | static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) | 2275 | static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) |
| 2276 | { | 2276 | { |
| 2277 | struct inode *inode = old_dentry->d_inode; | 2277 | struct inode *inode = d_inode(old_dentry); |
| 2278 | int ret; | 2278 | int ret; |
| 2279 | 2279 | ||
| 2280 | /* | 2280 | /* |
| @@ -2298,7 +2298,7 @@ out: | |||
| 2298 | 2298 | ||
| 2299 | static int shmem_unlink(struct inode *dir, struct dentry *dentry) | 2299 | static int shmem_unlink(struct inode *dir, struct dentry *dentry) |
| 2300 | { | 2300 | { |
| 2301 | struct inode *inode = dentry->d_inode; | 2301 | struct inode *inode = d_inode(dentry); |
| 2302 | 2302 | ||
| 2303 | if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) | 2303 | if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode)) |
| 2304 | shmem_free_inode(inode->i_sb); | 2304 | shmem_free_inode(inode->i_sb); |
| @@ -2315,7 +2315,7 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) | |||
| 2315 | if (!simple_empty(dentry)) | 2315 | if (!simple_empty(dentry)) |
| 2316 | return -ENOTEMPTY; | 2316 | return -ENOTEMPTY; |
| 2317 | 2317 | ||
| 2318 | drop_nlink(dentry->d_inode); | 2318 | drop_nlink(d_inode(dentry)); |
| 2319 | drop_nlink(dir); | 2319 | drop_nlink(dir); |
| 2320 | return shmem_unlink(dir, dentry); | 2320 | return shmem_unlink(dir, dentry); |
| 2321 | } | 2321 | } |
| @@ -2336,8 +2336,8 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru | |||
| 2336 | } | 2336 | } |
| 2337 | old_dir->i_ctime = old_dir->i_mtime = | 2337 | old_dir->i_ctime = old_dir->i_mtime = |
| 2338 | new_dir->i_ctime = new_dir->i_mtime = | 2338 | new_dir->i_ctime = new_dir->i_mtime = |
| 2339 | old_dentry->d_inode->i_ctime = | 2339 | d_inode(old_dentry)->i_ctime = |
| 2340 | new_dentry->d_inode->i_ctime = CURRENT_TIME; | 2340 | d_inode(new_dentry)->i_ctime = CURRENT_TIME; |
| 2341 | 2341 | ||
| 2342 | return 0; | 2342 | return 0; |
| 2343 | } | 2343 | } |
| @@ -2376,7 +2376,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) | |||
| 2376 | */ | 2376 | */ |
| 2377 | static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) | 2377 | static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) |
| 2378 | { | 2378 | { |
| 2379 | struct inode *inode = old_dentry->d_inode; | 2379 | struct inode *inode = d_inode(old_dentry); |
| 2380 | int they_are_dirs = S_ISDIR(inode->i_mode); | 2380 | int they_are_dirs = S_ISDIR(inode->i_mode); |
| 2381 | 2381 | ||
| 2382 | if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) | 2382 | if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) |
| @@ -2396,10 +2396,10 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc | |||
| 2396 | return error; | 2396 | return error; |
| 2397 | } | 2397 | } |
| 2398 | 2398 | ||
| 2399 | if (new_dentry->d_inode) { | 2399 | if (d_really_is_positive(new_dentry)) { |
| 2400 | (void) shmem_unlink(new_dir, new_dentry); | 2400 | (void) shmem_unlink(new_dir, new_dentry); |
| 2401 | if (they_are_dirs) { | 2401 | if (they_are_dirs) { |
| 2402 | drop_nlink(new_dentry->d_inode); | 2402 | drop_nlink(d_inode(new_dentry)); |
| 2403 | drop_nlink(old_dir); | 2403 | drop_nlink(old_dir); |
| 2404 | } | 2404 | } |
| 2405 | } else if (they_are_dirs) { | 2405 | } else if (they_are_dirs) { |
| @@ -2476,14 +2476,14 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
| 2476 | 2476 | ||
| 2477 | static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) | 2477 | static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd) |
| 2478 | { | 2478 | { |
| 2479 | nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink); | 2479 | nd_set_link(nd, SHMEM_I(d_inode(dentry))->symlink); |
| 2480 | return NULL; | 2480 | return NULL; |
| 2481 | } | 2481 | } |
| 2482 | 2482 | ||
| 2483 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) | 2483 | static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) |
| 2484 | { | 2484 | { |
| 2485 | struct page *page = NULL; | 2485 | struct page *page = NULL; |
| 2486 | int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); | 2486 | int error = shmem_getpage(d_inode(dentry), 0, &page, SGP_READ, NULL); |
| 2487 | nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); | 2487 | nd_set_link(nd, error ? ERR_PTR(error) : kmap(page)); |
| 2488 | if (page) | 2488 | if (page) |
| 2489 | unlock_page(page); | 2489 | unlock_page(page); |
| @@ -2574,7 +2574,7 @@ static int shmem_xattr_validate(const char *name) | |||
| 2574 | static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, | 2574 | static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, |
| 2575 | void *buffer, size_t size) | 2575 | void *buffer, size_t size) |
| 2576 | { | 2576 | { |
| 2577 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | 2577 | struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); |
| 2578 | int err; | 2578 | int err; |
| 2579 | 2579 | ||
| 2580 | /* | 2580 | /* |
| @@ -2595,7 +2595,7 @@ static ssize_t shmem_getxattr(struct dentry *dentry, const char *name, | |||
| 2595 | static int shmem_setxattr(struct dentry *dentry, const char *name, | 2595 | static int shmem_setxattr(struct dentry *dentry, const char *name, |
| 2596 | const void *value, size_t size, int flags) | 2596 | const void *value, size_t size, int flags) |
| 2597 | { | 2597 | { |
| 2598 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | 2598 | struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); |
| 2599 | int err; | 2599 | int err; |
| 2600 | 2600 | ||
| 2601 | /* | 2601 | /* |
| @@ -2615,7 +2615,7 @@ static int shmem_setxattr(struct dentry *dentry, const char *name, | |||
| 2615 | 2615 | ||
| 2616 | static int shmem_removexattr(struct dentry *dentry, const char *name) | 2616 | static int shmem_removexattr(struct dentry *dentry, const char *name) |
| 2617 | { | 2617 | { |
| 2618 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | 2618 | struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); |
| 2619 | int err; | 2619 | int err; |
| 2620 | 2620 | ||
| 2621 | /* | 2621 | /* |
| @@ -2635,7 +2635,7 @@ static int shmem_removexattr(struct dentry *dentry, const char *name) | |||
| 2635 | 2635 | ||
| 2636 | static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) | 2636 | static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) |
| 2637 | { | 2637 | { |
| 2638 | struct shmem_inode_info *info = SHMEM_I(dentry->d_inode); | 2638 | struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); |
| 2639 | return simple_xattr_list(&info->xattrs, buffer, size); | 2639 | return simple_xattr_list(&info->xattrs, buffer, size); |
| 2640 | } | 2640 | } |
| 2641 | #endif /* CONFIG_TMPFS_XATTR */ | 2641 | #endif /* CONFIG_TMPFS_XATTR */ |
| @@ -3118,8 +3118,6 @@ static const struct file_operations shmem_file_operations = { | |||
| 3118 | .mmap = shmem_mmap, | 3118 | .mmap = shmem_mmap, |
| 3119 | #ifdef CONFIG_TMPFS | 3119 | #ifdef CONFIG_TMPFS |
| 3120 | .llseek = shmem_file_llseek, | 3120 | .llseek = shmem_file_llseek, |
| 3121 | .read = new_sync_read, | ||
| 3122 | .write = new_sync_write, | ||
| 3123 | .read_iter = shmem_file_read_iter, | 3121 | .read_iter = shmem_file_read_iter, |
| 3124 | .write_iter = generic_file_write_iter, | 3122 | .write_iter = generic_file_write_iter, |
| 3125 | .fsync = noop_fsync, | 3123 | .fsync = noop_fsync, |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -857,6 +857,11 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, | |||
| 857 | return NULL; | 857 | return NULL; |
| 858 | } | 858 | } |
| 859 | 859 | ||
| 860 | static inline gfp_t gfp_exact_node(gfp_t flags) | ||
| 861 | { | ||
| 862 | return flags; | ||
| 863 | } | ||
| 864 | |||
| 860 | #else /* CONFIG_NUMA */ | 865 | #else /* CONFIG_NUMA */ |
| 861 | 866 | ||
| 862 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); | 867 | static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); |
| @@ -1023,6 +1028,15 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
| 1023 | 1028 | ||
| 1024 | return __cache_free_alien(cachep, objp, node, page_node); | 1029 | return __cache_free_alien(cachep, objp, node, page_node); |
| 1025 | } | 1030 | } |
| 1031 | |||
| 1032 | /* | ||
| 1033 | * Construct gfp mask to allocate from a specific node but do not invoke reclaim | ||
| 1034 | * or warn about failures. | ||
| 1035 | */ | ||
| 1036 | static inline gfp_t gfp_exact_node(gfp_t flags) | ||
| 1037 | { | ||
| 1038 | return (flags | __GFP_THISNODE | __GFP_NOWARN) & ~__GFP_WAIT; | ||
| 1039 | } | ||
| 1026 | #endif | 1040 | #endif |
| 1027 | 1041 | ||
| 1028 | /* | 1042 | /* |
| @@ -2825,7 +2839,7 @@ alloc_done: | |||
| 2825 | if (unlikely(!ac->avail)) { | 2839 | if (unlikely(!ac->avail)) { |
| 2826 | int x; | 2840 | int x; |
| 2827 | force_grow: | 2841 | force_grow: |
| 2828 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 2842 | x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); |
| 2829 | 2843 | ||
| 2830 | /* cache_grow can reenable interrupts, then ac could change. */ | 2844 | /* cache_grow can reenable interrupts, then ac could change. */ |
| 2831 | ac = cpu_cache_get(cachep); | 2845 | ac = cpu_cache_get(cachep); |
| @@ -3019,7 +3033,7 @@ retry: | |||
| 3019 | get_node(cache, nid) && | 3033 | get_node(cache, nid) && |
| 3020 | get_node(cache, nid)->free_objects) { | 3034 | get_node(cache, nid)->free_objects) { |
| 3021 | obj = ____cache_alloc_node(cache, | 3035 | obj = ____cache_alloc_node(cache, |
| 3022 | flags | GFP_THISNODE, nid); | 3036 | gfp_exact_node(flags), nid); |
| 3023 | if (obj) | 3037 | if (obj) |
| 3024 | break; | 3038 | break; |
| 3025 | } | 3039 | } |
| @@ -3047,7 +3061,7 @@ retry: | |||
| 3047 | nid = page_to_nid(page); | 3061 | nid = page_to_nid(page); |
| 3048 | if (cache_grow(cache, flags, nid, page)) { | 3062 | if (cache_grow(cache, flags, nid, page)) { |
| 3049 | obj = ____cache_alloc_node(cache, | 3063 | obj = ____cache_alloc_node(cache, |
| 3050 | flags | GFP_THISNODE, nid); | 3064 | gfp_exact_node(flags), nid); |
| 3051 | if (!obj) | 3065 | if (!obj) |
| 3052 | /* | 3066 | /* |
| 3053 | * Another processor may allocate the | 3067 | * Another processor may allocate the |
| @@ -3118,7 +3132,7 @@ retry: | |||
| 3118 | 3132 | ||
| 3119 | must_grow: | 3133 | must_grow: |
| 3120 | spin_unlock(&n->list_lock); | 3134 | spin_unlock(&n->list_lock); |
| 3121 | x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL); | 3135 | x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); |
| 3122 | if (x) | 3136 | if (x) |
| 3123 | goto retry; | 3137 | goto retry; |
| 3124 | 3138 | ||
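gfp_exact_node(), used in the hunks above in place of the old `flags | GFP_THISNODE`, builds the mask a node-exact slab refill wants: pinned to the node, silent on failure, and never entering reclaim. The transformation is plain bit arithmetic; a tiny model with stand-in flag values:

#include <stdio.h>

/* Stand-in flag values for illustration; the real ones are in gfp.h. */
#define F_WAIT     0x1u
#define F_THISNODE 0x2u
#define F_NOWARN   0x4u

/* Node-exact, non-reclaiming, silent variant of the caller's flags. */
static unsigned int gfp_exact_node_model(unsigned int flags)
{
	return (flags | F_THISNODE | F_NOWARN) & ~F_WAIT;
}

int main(void)
{
	unsigned int flags = F_WAIT;	/* e.g. a GFP_KERNEL-style mask */

	printf("%#x -> %#x\n", flags, gfp_exact_node_model(flags));
	return 0;
}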
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -532,7 +532,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) | |||
| 532 | return 0; | 532 | return 0; |
| 533 | } | 533 | } |
| 534 | 534 | ||
| 535 | void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | 535 | static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) |
| 536 | { | 536 | { |
| 537 | void *b; | 537 | void *b; |
| 538 | 538 | ||
| @@ -558,7 +558,6 @@ void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) | |||
| 558 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); | 558 | kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); |
| 559 | return b; | 559 | return b; |
| 560 | } | 560 | } |
| 561 | EXPORT_SYMBOL(slob_alloc_node); | ||
| 562 | 561 | ||
| 563 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 562 | void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
| 564 | { | 563 | { |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -374,7 +374,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
| 374 | if (cmpxchg_double(&page->freelist, &page->counters, | 374 | if (cmpxchg_double(&page->freelist, &page->counters, |
| 375 | freelist_old, counters_old, | 375 | freelist_old, counters_old, |
| 376 | freelist_new, counters_new)) | 376 | freelist_new, counters_new)) |
| 377 | return 1; | 377 | return true; |
| 378 | } else | 378 | } else |
| 379 | #endif | 379 | #endif |
| 380 | { | 380 | { |
| @@ -384,7 +384,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
| 384 | page->freelist = freelist_new; | 384 | page->freelist = freelist_new; |
| 385 | set_page_slub_counters(page, counters_new); | 385 | set_page_slub_counters(page, counters_new); |
| 386 | slab_unlock(page); | 386 | slab_unlock(page); |
| 387 | return 1; | 387 | return true; |
| 388 | } | 388 | } |
| 389 | slab_unlock(page); | 389 | slab_unlock(page); |
| 390 | } | 390 | } |
| @@ -396,7 +396,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page | |||
| 396 | pr_info("%s %s: cmpxchg double redo ", n, s->name); | 396 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
| 397 | #endif | 397 | #endif |
| 398 | 398 | ||
| 399 | return 0; | 399 | return false; |
| 400 | } | 400 | } |
| 401 | 401 | ||
| 402 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | 402 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, |
| @@ -410,7 +410,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
| 410 | if (cmpxchg_double(&page->freelist, &page->counters, | 410 | if (cmpxchg_double(&page->freelist, &page->counters, |
| 411 | freelist_old, counters_old, | 411 | freelist_old, counters_old, |
| 412 | freelist_new, counters_new)) | 412 | freelist_new, counters_new)) |
| 413 | return 1; | 413 | return true; |
| 414 | } else | 414 | } else |
| 415 | #endif | 415 | #endif |
| 416 | { | 416 | { |
| @@ -424,7 +424,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
| 424 | set_page_slub_counters(page, counters_new); | 424 | set_page_slub_counters(page, counters_new); |
| 425 | slab_unlock(page); | 425 | slab_unlock(page); |
| 426 | local_irq_restore(flags); | 426 | local_irq_restore(flags); |
| 427 | return 1; | 427 | return true; |
| 428 | } | 428 | } |
| 429 | slab_unlock(page); | 429 | slab_unlock(page); |
| 430 | local_irq_restore(flags); | 430 | local_irq_restore(flags); |
| @@ -437,7 +437,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
| 437 | pr_info("%s %s: cmpxchg double redo ", n, s->name); | 437 | pr_info("%s %s: cmpxchg double redo ", n, s->name); |
| 438 | #endif | 438 | #endif |
| 439 | 439 | ||
| 440 | return 0; | 440 | return false; |
| 441 | } | 441 | } |
| 442 | 442 | ||
| 443 | #ifdef CONFIG_SLUB_DEBUG | 443 | #ifdef CONFIG_SLUB_DEBUG |
| @@ -1137,15 +1137,6 @@ static int __init setup_slub_debug(char *str) | |||
| 1137 | */ | 1137 | */ |
| 1138 | goto check_slabs; | 1138 | goto check_slabs; |
| 1139 | 1139 | ||
| 1140 | if (tolower(*str) == 'o') { | ||
| 1141 | /* | ||
| 1142 | * Avoid enabling debugging on caches if its minimum order | ||
| 1143 | * would increase as a result. | ||
| 1144 | */ | ||
| 1145 | disable_higher_order_debug = 1; | ||
| 1146 | goto out; | ||
| 1147 | } | ||
| 1148 | |||
| 1149 | slub_debug = 0; | 1140 | slub_debug = 0; |
| 1150 | if (*str == '-') | 1141 | if (*str == '-') |
| 1151 | /* | 1142 | /* |
| @@ -1176,6 +1167,13 @@ static int __init setup_slub_debug(char *str) | |||
| 1176 | case 'a': | 1167 | case 'a': |
| 1177 | slub_debug |= SLAB_FAILSLAB; | 1168 | slub_debug |= SLAB_FAILSLAB; |
| 1178 | break; | 1169 | break; |
| 1170 | case 'o': | ||
| 1171 | /* | ||
| 1172 | * Avoid enabling debugging on caches if its minimum | ||
| 1173 | * order would increase as a result. | ||
| 1174 | */ | ||
| 1175 | disable_higher_order_debug = 1; | ||
| 1176 | break; | ||
| 1179 | default: | 1177 | default: |
| 1180 | pr_err("slub_debug option '%c' unknown. skipped\n", | 1178 | pr_err("slub_debug option '%c' unknown. skipped\n", |
| 1181 | *str); | 1179 | *str); |
| @@ -2449,7 +2447,8 @@ redo: | |||
| 2449 | do { | 2447 | do { |
| 2450 | tid = this_cpu_read(s->cpu_slab->tid); | 2448 | tid = this_cpu_read(s->cpu_slab->tid); |
| 2451 | c = raw_cpu_ptr(s->cpu_slab); | 2449 | c = raw_cpu_ptr(s->cpu_slab); |
| 2452 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | 2450 | } while (IS_ENABLED(CONFIG_PREEMPT) && |
| 2451 | unlikely(tid != READ_ONCE(c->tid))); | ||
| 2453 | 2452 | ||
| 2454 | /* | 2453 | /* |
| 2455 | * Irqless object alloc/free algorithm used here depends on sequence | 2454 | * Irqless object alloc/free algorithm used here depends on sequence |
| @@ -2718,7 +2717,8 @@ redo: | |||
| 2718 | do { | 2717 | do { |
| 2719 | tid = this_cpu_read(s->cpu_slab->tid); | 2718 | tid = this_cpu_read(s->cpu_slab->tid); |
| 2720 | c = raw_cpu_ptr(s->cpu_slab); | 2719 | c = raw_cpu_ptr(s->cpu_slab); |
| 2721 | } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid)); | 2720 | } while (IS_ENABLED(CONFIG_PREEMPT) && |
| 2721 | unlikely(tid != READ_ONCE(c->tid))); | ||
| 2722 | 2722 | ||
| 2723 | /* Same with comment on barrier() in slab_alloc_node() */ | 2723 | /* Same with comment on barrier() in slab_alloc_node() */ |
| 2724 | barrier(); | 2724 | barrier(); |
| @@ -4277,7 +4277,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
| 4277 | int node; | 4277 | int node; |
| 4278 | struct page *page; | 4278 | struct page *page; |
| 4279 | 4279 | ||
| 4280 | page = ACCESS_ONCE(c->page); | 4280 | page = READ_ONCE(c->page); |
| 4281 | if (!page) | 4281 | if (!page) |
| 4282 | continue; | 4282 | continue; |
| 4283 | 4283 | ||
| @@ -4292,7 +4292,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
| 4292 | total += x; | 4292 | total += x; |
| 4293 | nodes[node] += x; | 4293 | nodes[node] += x; |
| 4294 | 4294 | ||
| 4295 | page = ACCESS_ONCE(c->partial); | 4295 | page = READ_ONCE(c->partial); |
| 4296 | if (page) { | 4296 | if (page) { |
| 4297 | node = page_to_nid(page); | 4297 | node = page_to_nid(page); |
| 4298 | if (flags & SO_TOTAL) | 4298 | if (flags & SO_TOTAL) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
| 32 | #include <linux/gfp.h> | 32 | #include <linux/gfp.h> |
| 33 | #include <linux/uio.h> | 33 | #include <linux/uio.h> |
| 34 | #include <linux/hugetlb.h> | ||
| 34 | 35 | ||
| 35 | #include "internal.h" | 36 | #include "internal.h" |
| 36 | 37 | ||
| @@ -42,7 +43,7 @@ int page_cluster; | |||
| 42 | 43 | ||
| 43 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); | 44 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
| 44 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 45 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
| 45 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | 46 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); |
| 46 | 47 | ||
| 47 | /* | 48 | /* |
| 48 | * This path almost never happens for VM activity - pages are normally | 49 | * This path almost never happens for VM activity - pages are normally |
| @@ -75,7 +76,14 @@ static void __put_compound_page(struct page *page) | |||
| 75 | { | 76 | { |
| 76 | compound_page_dtor *dtor; | 77 | compound_page_dtor *dtor; |
| 77 | 78 | ||
| 78 | __page_cache_release(page); | 79 | /* |
| 80 | * __page_cache_release() is supposed to be called for thp, not for | ||
| 81 | * hugetlb. This is because hugetlb page does never have PageLRU set | ||
| 82 | * (it's never listed to any LRU lists) and no memcg routines should | ||
| 83 | * be called for hugetlb (it has a separate hugetlb_cgroup.) | ||
| 84 | */ | ||
| 85 | if (!PageHuge(page)) | ||
| 86 | __page_cache_release(page); | ||
| 79 | dtor = get_compound_page_dtor(page); | 87 | dtor = get_compound_page_dtor(page); |
| 80 | (*dtor)(page); | 88 | (*dtor)(page); |
| 81 | } | 89 | } |
| @@ -743,7 +751,7 @@ void lru_cache_add_active_or_unevictable(struct page *page, | |||
| 743 | * be write it out by flusher threads as this is much more effective | 751 | * be write it out by flusher threads as this is much more effective |
| 744 | * than the single-page writeout from reclaim. | 752 | * than the single-page writeout from reclaim. |
| 745 | */ | 753 | */ |
| 746 | static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, | 754 | static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, |
| 747 | void *arg) | 755 | void *arg) |
| 748 | { | 756 | { |
| 749 | int lru, file; | 757 | int lru, file; |
| @@ -811,36 +819,36 @@ void lru_add_drain_cpu(int cpu) | |||
| 811 | local_irq_restore(flags); | 819 | local_irq_restore(flags); |
| 812 | } | 820 | } |
| 813 | 821 | ||
| 814 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); | 822 | pvec = &per_cpu(lru_deactivate_file_pvecs, cpu); |
| 815 | if (pagevec_count(pvec)) | 823 | if (pagevec_count(pvec)) |
| 816 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 824 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
| 817 | 825 | ||
| 818 | activate_page_drain(cpu); | 826 | activate_page_drain(cpu); |
| 819 | } | 827 | } |
| 820 | 828 | ||
| 821 | /** | 829 | /** |
| 822 | * deactivate_page - forcefully deactivate a page | 830 | * deactivate_file_page - forcefully deactivate a file page |
| 823 | * @page: page to deactivate | 831 | * @page: page to deactivate |
| 824 | * | 832 | * |
| 825 | * This function hints the VM that @page is a good reclaim candidate, | 833 | * This function hints the VM that @page is a good reclaim candidate, |
| 826 | * for example if its invalidation fails due to the page being dirty | 834 | * for example if its invalidation fails due to the page being dirty |
| 827 | * or under writeback. | 835 | * or under writeback. |
| 828 | */ | 836 | */ |
| 829 | void deactivate_page(struct page *page) | 837 | void deactivate_file_page(struct page *page) |
| 830 | { | 838 | { |
| 831 | /* | 839 | /* |
| 832 | * In a workload with many unevictable page such as mprotect, unevictable | 840 | * In a workload with many unevictable page such as mprotect, |
| 833 | * page deactivation for accelerating reclaim is pointless. | 841 | * unevictable page deactivation for accelerating reclaim is pointless. |
| 834 | */ | 842 | */ |
| 835 | if (PageUnevictable(page)) | 843 | if (PageUnevictable(page)) |
| 836 | return; | 844 | return; |
| 837 | 845 | ||
| 838 | if (likely(get_page_unless_zero(page))) { | 846 | if (likely(get_page_unless_zero(page))) { |
| 839 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); | 847 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); |
| 840 | 848 | ||
| 841 | if (!pagevec_add(pvec, page)) | 849 | if (!pagevec_add(pvec, page)) |
| 842 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | 850 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
| 843 | put_cpu_var(lru_deactivate_pvecs); | 851 | put_cpu_var(lru_deactivate_file_pvecs); |
| 844 | } | 852 | } |
| 845 | } | 853 | } |
| 846 | 854 | ||
| @@ -872,7 +880,7 @@ void lru_add_drain_all(void) | |||
| 872 | 880 | ||
| 873 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | 881 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || |
| 874 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | 882 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || |
| 875 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | 883 | pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || |
| 876 | need_activate_page_drain(cpu)) { | 884 | need_activate_page_drain(cpu)) { |
| 877 | INIT_WORK(work, lru_add_drain_per_cpu); | 885 | INIT_WORK(work, lru_add_drain_per_cpu); |
| 878 | schedule_work_on(cpu, work); | 886 | schedule_work_on(cpu, work); |
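The renamed deactivate_file_page() keeps the existing per-CPU pagevec batching: pages are parked in a small array and only moved on the LRU (via lru_deactivate_file_fn) once the array fills or the CPU is drained, so the LRU lock is taken once per batch rather than once per page. A userspace model of that batch-then-flush pattern, with invented names:

#include <stdio.h>

#define PAGEVEC_SIZE 14	/* the kernel's pagevec held 14 entries at the time */

struct pagevec_model {
	int nr;
	int pages[PAGEVEC_SIZE];
};

/* Stand-in for the per-page move done under the LRU lock. */
static void move_fn(int page)
{
	printf("deactivating page %d\n", page);
}

/* One flush amortizes the expensive lock round-trip over a whole batch. */
static void drain(struct pagevec_model *pvec)
{
	for (int i = 0; i < pvec->nr; i++)
		move_fn(pvec->pages[i]);
	pvec->nr = 0;
}

/* Queue a page; only drain once the batch is full. */
static void deactivate_model(struct pagevec_model *pvec, int page)
{
	pvec->pages[pvec->nr++] = page;
	if (pvec->nr == PAGEVEC_SIZE)
		drain(pvec);
}

int main(void)
{
	struct pagevec_model pvec = { 0 };

	for (int page = 0; page < 20; page++)
		deactivate_model(&pvec, page);
	drain(&pvec);	/* drain the remainder, like lru_add_drain_cpu() */
	return 0;
}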
diff --git a/mm/swap_state.c b/mm/swap_state.c index 405923f77334..8bc8e66138da 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -390,7 +390,7 @@ static unsigned long swapin_nr_pages(unsigned long offset) | |||
| 390 | unsigned int pages, max_pages, last_ra; | 390 | unsigned int pages, max_pages, last_ra; |
| 391 | static atomic_t last_readahead_pages; | 391 | static atomic_t last_readahead_pages; |
| 392 | 392 | ||
| 393 | max_pages = 1 << ACCESS_ONCE(page_cluster); | 393 | max_pages = 1 << READ_ONCE(page_cluster); |
| 394 | if (max_pages <= 1) | 394 | if (max_pages <= 1) |
| 395 | return 1; | 395 | return 1; |
| 396 | 396 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index 63f55ccb9b26..a7e72103f23b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -1312,7 +1312,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
| 1312 | else | 1312 | else |
| 1313 | continue; | 1313 | continue; |
| 1314 | } | 1314 | } |
| 1315 | count = ACCESS_ONCE(si->swap_map[i]); | 1315 | count = READ_ONCE(si->swap_map[i]); |
| 1316 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1316 | if (count && swap_count(count) != SWAP_MAP_BAD) |
| 1317 | break; | 1317 | break; |
| 1318 | } | 1318 | } |
diff --git a/mm/truncate.c b/mm/truncate.c index ddec5a5966d7..66af9031fae8 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
| @@ -93,35 +93,6 @@ void do_invalidatepage(struct page *page, unsigned int offset, | |||
| 93 | } | 93 | } |
| 94 | 94 | ||
| 95 | /* | 95 | /* |
| 96 | * This cancels just the dirty bit on the kernel page itself, it | ||
| 97 | * does NOT actually remove dirty bits on any mmap's that may be | ||
| 98 | * around. It also leaves the page tagged dirty, so any sync | ||
| 99 | * activity will still find it on the dirty lists, and in particular, | ||
| 100 | * clear_page_dirty_for_io() will still look at the dirty bits in | ||
| 101 | * the VM. | ||
| 102 | * | ||
| 103 | * Doing this should *normally* only ever be done when a page | ||
| 104 | * is truncated, and is not actually mapped anywhere at all. However, | ||
| 105 | * fs/buffer.c does this when it notices that somebody has cleaned | ||
| 106 | * out all the buffers on a page without actually doing it through | ||
| 107 | the VM. Can you say "ext3 is horribly ugly"? Thought you could. | ||
| 108 | */ | ||
| 109 | void cancel_dirty_page(struct page *page, unsigned int account_size) | ||
| 110 | { | ||
| 111 | if (TestClearPageDirty(page)) { | ||
| 112 | struct address_space *mapping = page->mapping; | ||
| 113 | if (mapping && mapping_cap_account_dirty(mapping)) { | ||
| 114 | dec_zone_page_state(page, NR_FILE_DIRTY); | ||
| 115 | dec_bdi_stat(inode_to_bdi(mapping->host), | ||
| 116 | BDI_RECLAIMABLE); | ||
| 117 | if (account_size) | ||
| 118 | task_io_account_cancelled_write(account_size); | ||
| 119 | } | ||
| 120 | } | ||
| 121 | } | ||
| 122 | EXPORT_SYMBOL(cancel_dirty_page); | ||
| 123 | |||
| 124 | /* | ||
| 125 | * If truncate cannot remove the fs-private metadata from the page, the page | 96 | * If truncate cannot remove the fs-private metadata from the page, the page |
| 126 | * becomes orphaned. It will be left on the LRU and may even be mapped into | 97 | * becomes orphaned. It will be left on the LRU and may even be mapped into |
| 127 | * user pagetables if we're racing with filemap_fault(). | 98 | * user pagetables if we're racing with filemap_fault(). |
| @@ -140,7 +111,13 @@ truncate_complete_page(struct address_space *mapping, struct page *page) | |||
| 140 | if (page_has_private(page)) | 111 | if (page_has_private(page)) |
| 141 | do_invalidatepage(page, 0, PAGE_CACHE_SIZE); | 112 | do_invalidatepage(page, 0, PAGE_CACHE_SIZE); |
| 142 | 113 | ||
| 143 | cancel_dirty_page(page, PAGE_CACHE_SIZE); | 114 | /* |
| 115 | * Some filesystems seem to re-dirty the page even after | ||
| 116 | * the VM has canceled the dirty bit (eg ext3 journaling). | ||
| 117 | * Hence dirty accounting check is placed after invalidation. | ||
| 118 | */ | ||
| 119 | if (TestClearPageDirty(page)) | ||
| 120 | account_page_cleaned(page, mapping); | ||
| 144 | 121 | ||
| 145 | ClearPageMappedToDisk(page); | 122 | ClearPageMappedToDisk(page); |
| 146 | delete_from_page_cache(page); | 123 | delete_from_page_cache(page); |
| @@ -513,7 +490,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
| 513 | * of interest and try to speed up its reclaim. | 490 | * of interest and try to speed up its reclaim. |
| 514 | */ | 491 | */ |
| 515 | if (!ret) | 492 | if (!ret) |
| 516 | deactivate_page(page); | 493 | deactivate_file_page(page); |
| 517 | count += ret; | 494 | count += ret; |
| 518 | } | 495 | } |
| 519 | pagevec_remove_exceptionals(&pvec); | 496 | pagevec_remove_exceptionals(&pvec); |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -325,9 +325,37 @@ void kvfree(const void *addr) | |||
| 325 | } | 325 | } |
| 326 | EXPORT_SYMBOL(kvfree); | 326 | EXPORT_SYMBOL(kvfree); |
| 327 | 327 | ||
| 328 | static inline void *__page_rmapping(struct page *page) | ||
| 329 | { | ||
| 330 | unsigned long mapping; | ||
| 331 | |||
| 332 | mapping = (unsigned long)page->mapping; | ||
| 333 | mapping &= ~PAGE_MAPPING_FLAGS; | ||
| 334 | |||
| 335 | return (void *)mapping; | ||
| 336 | } | ||
| 337 | |||
| 338 | /* Neutral page->mapping pointer to address_space or anon_vma or other */ | ||
| 339 | void *page_rmapping(struct page *page) | ||
| 340 | { | ||
| 341 | page = compound_head(page); | ||
| 342 | return __page_rmapping(page); | ||
| 343 | } | ||
| 344 | |||
| 345 | struct anon_vma *page_anon_vma(struct page *page) | ||
| 346 | { | ||
| 347 | unsigned long mapping; | ||
| 348 | |||
| 349 | page = compound_head(page); | ||
| 350 | mapping = (unsigned long)page->mapping; | ||
| 351 | if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) | ||
| 352 | return NULL; | ||
| 353 | return __page_rmapping(page); | ||
| 354 | } | ||
| 355 | |||
| 328 | struct address_space *page_mapping(struct page *page) | 356 | struct address_space *page_mapping(struct page *page) |
| 329 | { | 357 | { |
| 330 | struct address_space *mapping = page->mapping; | 358 | unsigned long mapping; |
| 331 | 359 | ||
| 332 | /* This happens if someone calls flush_dcache_page on slab page */ | 360 | /* This happens if someone calls flush_dcache_page on slab page */ |
| 333 | if (unlikely(PageSlab(page))) | 361 | if (unlikely(PageSlab(page))) |
| @@ -337,10 +365,13 @@ struct address_space *page_mapping(struct page *page) | |||
| 337 | swp_entry_t entry; | 365 | swp_entry_t entry; |
| 338 | 366 | ||
| 339 | entry.val = page_private(page); | 367 | entry.val = page_private(page); |
| 340 | mapping = swap_address_space(entry); | 368 | return swap_address_space(entry); |
| 341 | } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) | 369 | } |
| 342 | mapping = NULL; | 370 | |
| 343 | return mapping; | 371 | mapping = (unsigned long)page->mapping; |
| 372 | if (mapping & PAGE_MAPPING_FLAGS) | ||
| 373 | return NULL; | ||
| 374 | return page->mapping; | ||
| 344 | } | 375 | } |
| 345 | 376 | ||
| 346 | int overcommit_ratio_handler(struct ctl_table *table, int write, | 377 | int overcommit_ratio_handler(struct ctl_table *table, int write, |
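The new page_rmapping()/page_anon_vma() helpers and the reworked page_mapping() all decode the same encoding: page->mapping is a pointer whose low PAGE_MAPPING_FLAGS bits act as a type tag, with PAGE_MAPPING_ANON marking an anon_vma rather than an address_space. A userspace model of decoding such a tagged pointer (tag values are illustrative):

#include <stdint.h>
#include <stdio.h>

/* Low-bit tags, mirroring PAGE_MAPPING_ANON / PAGE_MAPPING_FLAGS. */
#define MAPPING_ANON  0x1u
#define MAPPING_FLAGS 0x3u

struct anon_vma_model { const char *name; };

/* Strip the tag bits: the "neutral" pointer, whatever it points to. */
static void *rmapping(uintptr_t mapping)
{
	return (void *)(mapping & ~(uintptr_t)MAPPING_FLAGS);
}

/* Valid only when the anon tag -- and only that tag -- is set. */
static struct anon_vma_model *anon_vma_of(uintptr_t mapping)
{
	if ((mapping & MAPPING_FLAGS) != MAPPING_ANON)
		return NULL;
	return rmapping(mapping);
}

/* A plain file mapping has no tag bits set at all. */
static void *file_mapping_of(uintptr_t mapping)
{
	if (mapping & MAPPING_FLAGS)
		return NULL;
	return rmapping(mapping);
}

int main(void)
{
	static struct anon_vma_model av = { "anon_vma" };
	uintptr_t tagged = (uintptr_t)&av | MAPPING_ANON;

	printf("anon side: %s, file side: %p\n",
	       anon_vma_of(tagged)->name, file_mapping_of(tagged));
	return 0;
}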
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35b25e1340ca..2faaa2976447 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -29,6 +29,7 @@ | |||
| 29 | #include <linux/atomic.h> | 29 | #include <linux/atomic.h> |
| 30 | #include <linux/compiler.h> | 30 | #include <linux/compiler.h> |
| 31 | #include <linux/llist.h> | 31 | #include <linux/llist.h> |
| 32 | #include <linux/bitops.h> | ||
| 32 | 33 | ||
| 33 | #include <asm/uaccess.h> | 34 | #include <asm/uaccess.h> |
| 34 | #include <asm/tlbflush.h> | 35 | #include <asm/tlbflush.h> |
| @@ -74,6 +75,8 @@ static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) | |||
| 74 | pmd = pmd_offset(pud, addr); | 75 | pmd = pmd_offset(pud, addr); |
| 75 | do { | 76 | do { |
| 76 | next = pmd_addr_end(addr, end); | 77 | next = pmd_addr_end(addr, end); |
| 78 | if (pmd_clear_huge(pmd)) | ||
| 79 | continue; | ||
| 77 | if (pmd_none_or_clear_bad(pmd)) | 80 | if (pmd_none_or_clear_bad(pmd)) |
| 78 | continue; | 81 | continue; |
| 79 | vunmap_pte_range(pmd, addr, next); | 82 | vunmap_pte_range(pmd, addr, next); |
| @@ -88,6 +91,8 @@ static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) | |||
| 88 | pud = pud_offset(pgd, addr); | 91 | pud = pud_offset(pgd, addr); |
| 89 | do { | 92 | do { |
| 90 | next = pud_addr_end(addr, end); | 93 | next = pud_addr_end(addr, end); |
| 94 | if (pud_clear_huge(pud)) | ||
| 95 | continue; | ||
| 91 | if (pud_none_or_clear_bad(pud)) | 96 | if (pud_none_or_clear_bad(pud)) |
| 92 | continue; | 97 | continue; |
| 93 | vunmap_pmd_range(pud, addr, next); | 98 | vunmap_pmd_range(pud, addr, next); |
| @@ -760,7 +765,7 @@ struct vmap_block { | |||
| 760 | spinlock_t lock; | 765 | spinlock_t lock; |
| 761 | struct vmap_area *va; | 766 | struct vmap_area *va; |
| 762 | unsigned long free, dirty; | 767 | unsigned long free, dirty; |
| 763 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | 768 | unsigned long dirty_min, dirty_max; /*< dirty range */ |
| 764 | struct list_head free_list; | 769 | struct list_head free_list; |
| 765 | struct rcu_head rcu_head; | 770 | struct rcu_head rcu_head; |
| 766 | struct list_head purge; | 771 | struct list_head purge; |
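Replacing the per-page dirty bitmap with a dirty_min/dirty_max pair works because a vmap_block hands out space at increasing page offsets, so a single range is enough to describe what needs flushing and is cheaper to widen than a bitmap is to set and scan. A userspace sketch of that range tracking:

#include <stdio.h>

#define BLOCK_PAGES 1024UL	/* stand-in for VMAP_BBMAP_BITS */

struct vmap_block_model {
	unsigned long dirty_min;	/* first dirty page offset */
	unsigned long dirty_max;	/* one past the last dirty offset */
};

static void block_init(struct vmap_block_model *vb)
{
	/* Empty range: min above max. */
	vb->dirty_min = BLOCK_PAGES;
	vb->dirty_max = 0;
}

/* Mark [start, start + npages) dirty by widening the tracked range. */
static void mark_dirty(struct vmap_block_model *vb,
		       unsigned long start, unsigned long npages)
{
	if (start < vb->dirty_min)
		vb->dirty_min = start;
	if (start + npages > vb->dirty_max)
		vb->dirty_max = start + npages;
}

int main(void)
{
	struct vmap_block_model vb;

	block_init(&vb);
	mark_dirty(&vb, 4, 2);
	mark_dirty(&vb, 16, 8);
	if (vb.dirty_min < vb.dirty_max)
		printf("flush page offsets [%lu, %lu)\n",
		       vb.dirty_min, vb.dirty_max);
	return 0;
}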
| @@ -791,13 +796,31 @@ static unsigned long addr_to_vb_idx(unsigned long addr) | |||
| 791 | return addr; | 796 | return addr; |
| 792 | } | 797 | } |
| 793 | 798 | ||
| 794 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | 799 | static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off) |
| 800 | { | ||
| 801 | unsigned long addr; | ||
| 802 | |||
| 803 | addr = va_start + (pages_off << PAGE_SHIFT); | ||
| 804 | BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start)); | ||
| 805 | return (void *)addr; | ||
| 806 | } | ||
| 807 | |||
| 808 | /** | ||
| 809 | * new_vmap_block - allocates new vmap_block and occupies 2^order pages in this | ||
| 810 | * block. Of course pages number can't exceed VMAP_BBMAP_BITS | ||
| 811 | * @order: how many 2^order pages should be occupied in newly allocated block | ||
| 812 | * @gfp_mask: flags for the page level allocator | ||
| 813 | * | ||
| 814 | * Returns: virtual address in a newly allocated block or ERR_PTR(-errno) | ||
| 815 | */ | ||
| 816 | static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) | ||
| 795 | { | 817 | { |
| 796 | struct vmap_block_queue *vbq; | 818 | struct vmap_block_queue *vbq; |
| 797 | struct vmap_block *vb; | 819 | struct vmap_block *vb; |
| 798 | struct vmap_area *va; | 820 | struct vmap_area *va; |
| 799 | unsigned long vb_idx; | 821 | unsigned long vb_idx; |
| 800 | int node, err; | 822 | int node, err; |
| 823 | void *vaddr; | ||
| 801 | 824 | ||
| 802 | node = numa_node_id(); | 825 | node = numa_node_id(); |
| 803 | 826 | ||
| @@ -821,11 +844,15 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
| 821 | return ERR_PTR(err); | 844 | return ERR_PTR(err); |
| 822 | } | 845 | } |
| 823 | 846 | ||
| 847 | vaddr = vmap_block_vaddr(va->va_start, 0); | ||
| 824 | spin_lock_init(&vb->lock); | 848 | spin_lock_init(&vb->lock); |
| 825 | vb->va = va; | 849 | vb->va = va; |
| 826 | vb->free = VMAP_BBMAP_BITS; | 850 | /* At least something should be left free */ |
| 851 | BUG_ON(VMAP_BBMAP_BITS <= (1UL << order)); | ||
| 852 | vb->free = VMAP_BBMAP_BITS - (1UL << order); | ||
| 827 | vb->dirty = 0; | 853 | vb->dirty = 0; |
| 828 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | 854 | vb->dirty_min = VMAP_BBMAP_BITS; |
| 855 | vb->dirty_max = 0; | ||
| 829 | INIT_LIST_HEAD(&vb->free_list); | 856 | INIT_LIST_HEAD(&vb->free_list); |
| 830 | 857 | ||
| 831 | vb_idx = addr_to_vb_idx(va->va_start); | 858 | vb_idx = addr_to_vb_idx(va->va_start); |
| @@ -837,11 +864,11 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | |||
| 837 | 864 | ||
| 838 | vbq = &get_cpu_var(vmap_block_queue); | 865 | vbq = &get_cpu_var(vmap_block_queue); |
| 839 | spin_lock(&vbq->lock); | 866 | spin_lock(&vbq->lock); |
| 840 | list_add_rcu(&vb->free_list, &vbq->free); | 867 | list_add_tail_rcu(&vb->free_list, &vbq->free); |
| 841 | spin_unlock(&vbq->lock); | 868 | spin_unlock(&vbq->lock); |
| 842 | put_cpu_var(vmap_block_queue); | 869 | put_cpu_var(vmap_block_queue); |
| 843 | 870 | ||
| 844 | return vb; | 871 | return vaddr; |
| 845 | } | 872 | } |
| 846 | 873 | ||
| 847 | static void free_vmap_block(struct vmap_block *vb) | 874 | static void free_vmap_block(struct vmap_block *vb) |
| @@ -876,7 +903,8 @@ static void purge_fragmented_blocks(int cpu) | |||
| 876 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { | 903 | if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { |
| 877 | vb->free = 0; /* prevent further allocs after releasing lock */ | 904 | vb->free = 0; /* prevent further allocs after releasing lock */ |
| 878 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ | 905 | vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ |
| 879 | bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); | 906 | vb->dirty_min = 0; |
| 907 | vb->dirty_max = VMAP_BBMAP_BITS; | ||
| 880 | spin_lock(&vbq->lock); | 908 | spin_lock(&vbq->lock); |
| 881 | list_del_rcu(&vb->free_list); | 909 | list_del_rcu(&vb->free_list); |
| 882 | spin_unlock(&vbq->lock); | 910 | spin_unlock(&vbq->lock); |
| @@ -905,7 +933,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
| 905 | { | 933 | { |
| 906 | struct vmap_block_queue *vbq; | 934 | struct vmap_block_queue *vbq; |
| 907 | struct vmap_block *vb; | 935 | struct vmap_block *vb; |
| 908 | unsigned long addr = 0; | 936 | void *vaddr = NULL; |
| 909 | unsigned int order; | 937 | unsigned int order; |
| 910 | 938 | ||
| 911 | BUG_ON(size & ~PAGE_MASK); | 939 | BUG_ON(size & ~PAGE_MASK); |
| @@ -920,43 +948,38 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
| 920 | } | 948 | } |
| 921 | order = get_order(size); | 949 | order = get_order(size); |
| 922 | 950 | ||
| 923 | again: | ||
| 924 | rcu_read_lock(); | 951 | rcu_read_lock(); |
| 925 | vbq = &get_cpu_var(vmap_block_queue); | 952 | vbq = &get_cpu_var(vmap_block_queue); |
| 926 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | 953 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
| 927 | int i; | 954 | unsigned long pages_off; |
| 928 | 955 | ||
| 929 | spin_lock(&vb->lock); | 956 | spin_lock(&vb->lock); |
| 930 | if (vb->free < 1UL << order) | 957 | if (vb->free < (1UL << order)) { |
| 931 | goto next; | 958 | spin_unlock(&vb->lock); |
| 959 | continue; | ||
| 960 | } | ||
| 932 | 961 | ||
| 933 | i = VMAP_BBMAP_BITS - vb->free; | 962 | pages_off = VMAP_BBMAP_BITS - vb->free; |
| 934 | addr = vb->va->va_start + (i << PAGE_SHIFT); | 963 | vaddr = vmap_block_vaddr(vb->va->va_start, pages_off); |
| 935 | BUG_ON(addr_to_vb_idx(addr) != | ||
| 936 | addr_to_vb_idx(vb->va->va_start)); | ||
| 937 | vb->free -= 1UL << order; | 964 | vb->free -= 1UL << order; |
| 938 | if (vb->free == 0) { | 965 | if (vb->free == 0) { |
| 939 | spin_lock(&vbq->lock); | 966 | spin_lock(&vbq->lock); |
| 940 | list_del_rcu(&vb->free_list); | 967 | list_del_rcu(&vb->free_list); |
| 941 | spin_unlock(&vbq->lock); | 968 | spin_unlock(&vbq->lock); |
| 942 | } | 969 | } |
| 970 | |||
| 943 | spin_unlock(&vb->lock); | 971 | spin_unlock(&vb->lock); |
| 944 | break; | 972 | break; |
| 945 | next: | ||
| 946 | spin_unlock(&vb->lock); | ||
| 947 | } | 973 | } |
| 948 | 974 | ||
| 949 | put_cpu_var(vmap_block_queue); | 975 | put_cpu_var(vmap_block_queue); |
| 950 | rcu_read_unlock(); | 976 | rcu_read_unlock(); |
| 951 | 977 | ||
| 952 | if (!addr) { | 978 | /* Allocate new block if nothing was found */ |
| 953 | vb = new_vmap_block(gfp_mask); | 979 | if (!vaddr) |
| 954 | if (IS_ERR(vb)) | 980 | vaddr = new_vmap_block(order, gfp_mask); |
| 955 | return vb; | ||
| 956 | goto again; | ||
| 957 | } | ||
| 958 | 981 | ||
| 959 | return (void *)addr; | 982 | return vaddr; |
| 960 | } | 983 | } |
| 961 | 984 | ||
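
vb_alloc() above is now a pure bump allocator over a block: the next offset is simply VMAP_BBMAP_BITS - vb->free, and when no listed block has room, new_vmap_block() returns a block that already has the 2^order pages reserved, so the old goto-again retry disappears. A minimal userspace model of that logic, with made-up names:

/* Userspace model of the bump allocation in vb_alloc()/new_vmap_block(). */
#include <stdio.h>

#define VMAP_BBMAP_BITS 1024UL

struct block {
        unsigned long free;   /* pages still available at the tail */
};

/* Returns the page offset inside the block, or -1 if it does not fit. */
static long block_alloc(struct block *b, unsigned int order)
{
        unsigned long npages = 1UL << order;
        unsigned long off;

        if (b->free < npages)
                return -1;
        off = VMAP_BBMAP_BITS - b->free;   /* bump pointer from the left */
        b->free -= npages;
        return (long)off;
}

/* Mirrors new_vmap_block(): the fresh block already has 2^order pages taken. */
static struct block new_block(unsigned int order)
{
        struct block b = { .free = VMAP_BBMAP_BITS - (1UL << order) };
        return b;
}

int main(void)
{
        struct block b = new_block(0);             /* page 0 handed out at creation */
        printf("next: %ld\n", block_alloc(&b, 1)); /* pages 1..2 */
        printf("next: %ld\n", block_alloc(&b, 0)); /* page 3 */
        return 0;
}
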
| 962 | static void vb_free(const void *addr, unsigned long size) | 985 | static void vb_free(const void *addr, unsigned long size) |
| @@ -974,6 +997,7 @@ static void vb_free(const void *addr, unsigned long size) | |||
| 974 | order = get_order(size); | 997 | order = get_order(size); |
| 975 | 998 | ||
| 976 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | 999 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); |
| 1000 | offset >>= PAGE_SHIFT; | ||
| 977 | 1001 | ||
| 978 | vb_idx = addr_to_vb_idx((unsigned long)addr); | 1002 | vb_idx = addr_to_vb_idx((unsigned long)addr); |
| 979 | rcu_read_lock(); | 1003 | rcu_read_lock(); |
| @@ -984,7 +1008,10 @@ static void vb_free(const void *addr, unsigned long size) | |||
| 984 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); | 1008 | vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); |
| 985 | 1009 | ||
| 986 | spin_lock(&vb->lock); | 1010 | spin_lock(&vb->lock); |
| 987 | BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); | 1011 | |
| 1012 | /* Expand dirty range */ | ||
| 1013 | vb->dirty_min = min(vb->dirty_min, offset); | ||
| 1014 | vb->dirty_max = max(vb->dirty_max, offset + (1UL << order)); | ||
| 988 | 1015 | ||
| 989 | vb->dirty += 1UL << order; | 1016 | vb->dirty += 1UL << order; |
| 990 | if (vb->dirty == VMAP_BBMAP_BITS) { | 1017 | if (vb->dirty == VMAP_BBMAP_BITS) { |
| @@ -1023,25 +1050,18 @@ void vm_unmap_aliases(void) | |||
| 1023 | 1050 | ||
| 1024 | rcu_read_lock(); | 1051 | rcu_read_lock(); |
| 1025 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | 1052 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { |
| 1026 | int i, j; | ||
| 1027 | |||
| 1028 | spin_lock(&vb->lock); | 1053 | spin_lock(&vb->lock); |
| 1029 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | 1054 | if (vb->dirty) { |
| 1030 | if (i < VMAP_BBMAP_BITS) { | 1055 | unsigned long va_start = vb->va->va_start; |
| 1031 | unsigned long s, e; | 1056 | unsigned long s, e; |
| 1032 | 1057 | ||
| 1033 | j = find_last_bit(vb->dirty_map, | 1058 | s = va_start + (vb->dirty_min << PAGE_SHIFT); |
| 1034 | VMAP_BBMAP_BITS); | 1059 | e = va_start + (vb->dirty_max << PAGE_SHIFT); |
| 1035 | j = j + 1; /* need exclusive index */ | ||
| 1036 | 1060 | ||
| 1037 | s = vb->va->va_start + (i << PAGE_SHIFT); | 1061 | start = min(s, start); |
| 1038 | e = vb->va->va_start + (j << PAGE_SHIFT); | 1062 | end = max(e, end); |
| 1039 | flush = 1; | ||
| 1040 | 1063 | ||
| 1041 | if (s < start) | 1064 | flush = 1; |
| 1042 | start = s; | ||
| 1043 | if (e > end) | ||
| 1044 | end = e; | ||
| 1045 | } | 1065 | } |
| 1046 | spin_unlock(&vb->lock); | 1066 | spin_unlock(&vb->lock); |
| 1047 | } | 1067 | } |
| @@ -1314,7 +1334,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, | |||
| 1314 | 1334 | ||
| 1315 | BUG_ON(in_interrupt()); | 1335 | BUG_ON(in_interrupt()); |
| 1316 | if (flags & VM_IOREMAP) | 1336 | if (flags & VM_IOREMAP) |
| 1317 | align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); | 1337 | align = 1ul << clamp_t(int, fls_long(size), |
| 1338 | PAGE_SHIFT, IOREMAP_MAX_ORDER); | ||
| 1318 | 1339 | ||
| 1319 | size = PAGE_ALIGN(size); | 1340 | size = PAGE_ALIGN(size); |
| 1320 | if (unlikely(!size)) | 1341 | if (unlikely(!size)) |
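
For VM_IOREMAP areas the alignment is now derived with fls_long() and clamp_t(): fls_long() copes with sizes wider than 32 bits and clamp_t(int, ...) keeps all three operands in one type, which plain clamp() insists on. A standalone illustration of the computation; the PAGE_SHIFT and IOREMAP_MAX_ORDER values here are assumptions:

/* Userspace re-implementation of the VM_IOREMAP alignment computation. */
#include <stdio.h>

#define PAGE_SHIFT        12
#define IOREMAP_MAX_ORDER 24   /* arch-dependent; 16MB here for illustration */

static int fls_long(unsigned long x)
{
        return x ? (int)(8 * sizeof(x) - __builtin_clzl(x)) : 0;
}

static int clamp_int(int v, int lo, int hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long sizes[] = { 0x1000, 0x23000, 0x40000000UL };

        for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                unsigned long align =
                        1UL << clamp_int(fls_long(sizes[i]),
                                         PAGE_SHIFT, IOREMAP_MAX_ORDER);
                printf("size %#lx -> align %#lx\n", sizes[i], align);
        }
        return 0;
}
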
| @@ -1418,6 +1439,7 @@ struct vm_struct *remove_vm_area(const void *addr) | |||
| 1418 | spin_unlock(&vmap_area_lock); | 1439 | spin_unlock(&vmap_area_lock); |
| 1419 | 1440 | ||
| 1420 | vmap_debug_free_range(va->va_start, va->va_end); | 1441 | vmap_debug_free_range(va->va_start, va->va_end); |
| 1442 | kasan_free_shadow(vm); | ||
| 1421 | free_unmap_vmap_area(va); | 1443 | free_unmap_vmap_area(va); |
| 1422 | vm->size -= PAGE_SIZE; | 1444 | vm->size -= PAGE_SIZE; |
| 1423 | 1445 | ||
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 0dec1fa5f656..08bd7a3d464a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
| @@ -12,35 +12,6 @@ | |||
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * This allocator is designed for use with zram. Thus, the allocator is | ||
| 16 | * supposed to work well under low memory conditions. In particular, it | ||
| 17 | * never attempts higher order page allocation which is very likely to | ||
| 18 | * fail under memory pressure. On the other hand, if we just use single | ||
| 19 | * (0-order) pages, it would suffer from very high fragmentation -- | ||
| 20 | * any object of size PAGE_SIZE/2 or larger would occupy an entire page. | ||
| 21 | * This was one of the major issues with its predecessor (xvmalloc). | ||
| 22 | * | ||
| 23 | * To overcome these issues, zsmalloc allocates a bunch of 0-order pages | ||
| 24 | * and links them together using various 'struct page' fields. These linked | ||
| 25 | * pages act as a single higher-order page i.e. an object can span 0-order | ||
| 26 | * page boundaries. The code refers to these linked pages as a single entity | ||
| 27 | * called zspage. | ||
| 28 | * | ||
| 29 | * For simplicity, zsmalloc can only allocate objects of size up to PAGE_SIZE | ||
| 30 | * since this satisfies the requirements of all its current users (in the | ||
| 31 | * worst case, page is incompressible and is thus stored "as-is" i.e. in | ||
| 32 | * uncompressed form). For allocation requests larger than this size, failure | ||
| 33 | * is returned (see zs_malloc). | ||
| 34 | * | ||
| 35 | * Additionally, zs_malloc() does not return a dereferenceable pointer. | ||
| 36 | * Instead, it returns an opaque handle (unsigned long) which encodes actual | ||
| 37 | * location of the allocated object. The reason for this indirection is that | ||
| 38 | * zsmalloc does not keep zspages permanently mapped since that would cause | ||
| 39 | * issues on 32-bit systems where the VA region for kernel space mappings | ||
| 40 | * is very small. So, before using the allocating memory, the object has to | ||
| 41 | * be mapped using zs_map_object() to get a usable pointer and subsequently | ||
| 42 | * unmapped using zs_unmap_object(). | ||
| 43 | * | ||
| 44 | * Following is how we use various fields and flags of underlying | 15 | * Following is how we use various fields and flags of underlying |
| 45 | * struct page(s) to form a zspage. | 16 | * struct page(s) to form a zspage. |
| 46 | * | 17 | * |
| @@ -57,6 +28,8 @@ | |||
| 57 | * | 28 | * |
| 58 | * page->private (union with page->first_page): refers to the | 29 | * page->private (union with page->first_page): refers to the |
| 59 | * component page after the first page | 30 | * component page after the first page |
| 31 | * If the page is the first_page of a huge object, it stores the | ||
| 32 | * handle instead (see size_class->huge). | ||
| 60 | * page->freelist: points to the first free object in zspage. | 33 | * page->freelist: points to the first free object in zspage. |
| 61 | * Free objects are linked together using in-place | 34 | * Free objects are linked together using in-place |
| 62 | * metadata. | 35 | * metadata. |
| @@ -78,6 +51,7 @@ | |||
| 78 | 51 | ||
| 79 | #include <linux/module.h> | 52 | #include <linux/module.h> |
| 80 | #include <linux/kernel.h> | 53 | #include <linux/kernel.h> |
| 54 | #include <linux/sched.h> | ||
| 81 | #include <linux/bitops.h> | 55 | #include <linux/bitops.h> |
| 82 | #include <linux/errno.h> | 56 | #include <linux/errno.h> |
| 83 | #include <linux/highmem.h> | 57 | #include <linux/highmem.h> |
| @@ -110,6 +84,8 @@ | |||
| 110 | #define ZS_MAX_ZSPAGE_ORDER 2 | 84 | #define ZS_MAX_ZSPAGE_ORDER 2 |
| 111 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) | 85 | #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(1, UL) << ZS_MAX_ZSPAGE_ORDER) |
| 112 | 86 | ||
| 87 | #define ZS_HANDLE_SIZE (sizeof(unsigned long)) | ||
| 88 | |||
| 113 | /* | 89 | /* |
| 114 | * Object location (<PFN>, <obj_idx>) is encoded as | 90 | * Object location (<PFN>, <obj_idx>) is encoded as |
| 115 | * as single (unsigned long) handle value. | 91 | * as single (unsigned long) handle value. |
| @@ -133,13 +109,33 @@ | |||
| 133 | #endif | 109 | #endif |
| 134 | #endif | 110 | #endif |
| 135 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) | 111 | #define _PFN_BITS (MAX_PHYSMEM_BITS - PAGE_SHIFT) |
| 136 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) | 112 | |
| 113 | /* | ||
| 114 | * The memory allocated for a handle stores the object position, | ||
| 115 | * encoded as <page, obj_idx>, and the encoded value leaves its | ||
| 116 | * least significant bit free (ie, look at obj_to_location). | ||
| 117 | * We use that bit to synchronize object access between the | ||
| 118 | * user and migration. | ||
| 119 | */ | ||
| 120 | #define HANDLE_PIN_BIT 0 | ||
| 121 | |||
| 122 | /* | ||
| 123 | * Head in allocated object should have OBJ_ALLOCATED_TAG | ||
| 124 | * to identify the object was allocated or not. | ||
| 125 | * It's okay to add the status bit in the least bit because | ||
| 126 | * header keeps handle which is 4byte-aligned address so we | ||
| 127 | * have room for two bit at least. | ||
| 128 | */ | ||
| 129 | #define OBJ_ALLOCATED_TAG 1 | ||
| 130 | #define OBJ_TAG_BITS 1 | ||
| 131 | #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) | ||
| 137 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) | 132 | #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) |
| 138 | 133 | ||
| 139 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) | 134 | #define MAX(a, b) ((a) >= (b) ? (a) : (b)) |
| 140 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ | 135 | /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ |
| 141 | #define ZS_MIN_ALLOC_SIZE \ | 136 | #define ZS_MIN_ALLOC_SIZE \ |
| 142 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) | 137 | MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) |
| 138 | /* each chunk includes extra space to keep handle */ | ||
| 143 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE | 139 | #define ZS_MAX_ALLOC_SIZE PAGE_SIZE |
| 144 | 140 | ||
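
With OBJ_TAG_BITS, an object location is now encoded as ((pfn << OBJ_INDEX_BITS) | obj_idx) << OBJ_TAG_BITS, so bit 0 of the value kept in the handle slot stays clear for HANDLE_PIN_BIT, while the handle address stored in an object's header gets OBJ_ALLOCATED_TAG in its own low bit. A userspace sketch of the encode/decode; MAX_PHYSMEM_BITS is an example value:

/* Userspace sketch mirroring location_to_obj()/obj_to_location(). */
#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT        12
#define MAX_PHYSMEM_BITS  36                       /* example value */
#define _PFN_BITS         (MAX_PHYSMEM_BITS - PAGE_SHIFT)
#define OBJ_TAG_BITS      1
#define OBJ_INDEX_BITS    (8 * sizeof(unsigned long) - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK    ((1UL << OBJ_INDEX_BITS) - 1)

static unsigned long location_to_obj(unsigned long pfn, unsigned long obj_idx)
{
        unsigned long obj = (pfn << OBJ_INDEX_BITS) | (obj_idx & OBJ_INDEX_MASK);
        return obj << OBJ_TAG_BITS;            /* bit 0 left clear for the tag */
}

static void obj_to_location(unsigned long obj, unsigned long *pfn,
                            unsigned long *obj_idx)
{
        obj >>= OBJ_TAG_BITS;
        *pfn = obj >> OBJ_INDEX_BITS;
        *obj_idx = obj & OBJ_INDEX_MASK;
}

int main(void)
{
        unsigned long pfn, idx;
        unsigned long obj = location_to_obj(0x12345, 7);

        /* Bit 0 of the stored value is free, e.g. for HANDLE_PIN_BIT. */
        assert((obj & 1UL) == 0);

        obj_to_location(obj, &pfn, &idx);
        assert(pfn == 0x12345 && idx == 7);
        printf("obj=%#lx pfn=%#lx idx=%lu\n", obj, pfn, idx);
        return 0;
}
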
| 145 | /* | 141 | /* |
| @@ -172,6 +168,8 @@ enum fullness_group { | |||
| 172 | enum zs_stat_type { | 168 | enum zs_stat_type { |
| 173 | OBJ_ALLOCATED, | 169 | OBJ_ALLOCATED, |
| 174 | OBJ_USED, | 170 | OBJ_USED, |
| 171 | CLASS_ALMOST_FULL, | ||
| 172 | CLASS_ALMOST_EMPTY, | ||
| 175 | NR_ZS_STAT_TYPE, | 173 | NR_ZS_STAT_TYPE, |
| 176 | }; | 174 | }; |
| 177 | 175 | ||
| @@ -216,6 +214,8 @@ struct size_class { | |||
| 216 | 214 | ||
| 217 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ | 215 | /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ |
| 218 | int pages_per_zspage; | 216 | int pages_per_zspage; |
| 217 | /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ | ||
| 218 | bool huge; | ||
| 219 | 219 | ||
| 220 | #ifdef CONFIG_ZSMALLOC_STAT | 220 | #ifdef CONFIG_ZSMALLOC_STAT |
| 221 | struct zs_size_stat stats; | 221 | struct zs_size_stat stats; |
| @@ -233,14 +233,24 @@ struct size_class { | |||
| 233 | * This must be power of 2 and less than or equal to ZS_ALIGN | 233 | * This must be power of 2 and less than or equal to ZS_ALIGN |
| 234 | */ | 234 | */ |
| 235 | struct link_free { | 235 | struct link_free { |
| 236 | /* Handle of next free chunk (encodes <PFN, obj_idx>) */ | 236 | union { |
| 237 | void *next; | 237 | /* |
| 238 | * Position of next free chunk (encodes <PFN, obj_idx>) | ||
| 239 | * It is only valid while the object is not allocated (free) | ||
| 240 | */ | ||
| 241 | void *next; | ||
| 242 | /* | ||
| 243 | * Handle of allocated object. | ||
| 244 | */ | ||
| 245 | unsigned long handle; | ||
| 246 | }; | ||
| 238 | }; | 247 | }; |
| 239 | 248 | ||
| 240 | struct zs_pool { | 249 | struct zs_pool { |
| 241 | char *name; | 250 | char *name; |
| 242 | 251 | ||
| 243 | struct size_class **size_class; | 252 | struct size_class **size_class; |
| 253 | struct kmem_cache *handle_cachep; | ||
| 244 | 254 | ||
| 245 | gfp_t flags; /* allocation flags used when growing pool */ | 255 | gfp_t flags; /* allocation flags used when growing pool */ |
| 246 | atomic_long_t pages_allocated; | 256 | atomic_long_t pages_allocated; |
| @@ -267,8 +277,37 @@ struct mapping_area { | |||
| 267 | #endif | 277 | #endif |
| 268 | char *vm_addr; /* address of kmap_atomic()'ed pages */ | 278 | char *vm_addr; /* address of kmap_atomic()'ed pages */ |
| 269 | enum zs_mapmode vm_mm; /* mapping mode */ | 279 | enum zs_mapmode vm_mm; /* mapping mode */ |
| 280 | bool huge; | ||
| 270 | }; | 281 | }; |
| 271 | 282 | ||
| 283 | static int create_handle_cache(struct zs_pool *pool) | ||
| 284 | { | ||
| 285 | pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, | ||
| 286 | 0, 0, NULL); | ||
| 287 | return pool->handle_cachep ? 0 : 1; | ||
| 288 | } | ||
| 289 | |||
| 290 | static void destroy_handle_cache(struct zs_pool *pool) | ||
| 291 | { | ||
| 292 | kmem_cache_destroy(pool->handle_cachep); | ||
| 293 | } | ||
| 294 | |||
| 295 | static unsigned long alloc_handle(struct zs_pool *pool) | ||
| 296 | { | ||
| 297 | return (unsigned long)kmem_cache_alloc(pool->handle_cachep, | ||
| 298 | pool->flags & ~__GFP_HIGHMEM); | ||
| 299 | } | ||
| 300 | |||
| 301 | static void free_handle(struct zs_pool *pool, unsigned long handle) | ||
| 302 | { | ||
| 303 | kmem_cache_free(pool->handle_cachep, (void *)handle); | ||
| 304 | } | ||
| 305 | |||
| 306 | static void record_obj(unsigned long handle, unsigned long obj) | ||
| 307 | { | ||
| 308 | *(unsigned long *)handle = obj; | ||
| 309 | } | ||
| 310 | |||
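
From here on a handle is the address of a small zs_handle slab object rather than the encoded location itself; record_obj() writes the location into that slot and handle_to_obj() reads it back, so compaction can move an object by rewriting the slot while the caller's handle stays valid. A minimal userspace model, with malloc standing in for the kmem_cache:

/* Minimal userspace model of the handle indirection (not kernel code). */
#include <assert.h>
#include <stdlib.h>

static unsigned long alloc_handle(void)
{
        return (unsigned long)calloc(1, sizeof(unsigned long));
}

static void free_handle(unsigned long handle)
{
        free((void *)handle);
}

static void record_obj(unsigned long handle, unsigned long obj)
{
        *(unsigned long *)handle = obj;   /* handle slot -> encoded location */
}

static unsigned long handle_to_obj(unsigned long handle)
{
        return *(unsigned long *)handle;
}

int main(void)
{
        unsigned long handle = alloc_handle();

        record_obj(handle, 0xabcd00);     /* object lives at encoded location A */
        assert(handle_to_obj(handle) == 0xabcd00);

        record_obj(handle, 0xef1200);     /* compaction moved it; handle unchanged */
        assert(handle_to_obj(handle) == 0xef1200);

        free_handle(handle);
        return 0;
}
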
| 272 | /* zpool driver */ | 311 | /* zpool driver */ |
| 273 | 312 | ||
| 274 | #ifdef CONFIG_ZPOOL | 313 | #ifdef CONFIG_ZPOOL |
| @@ -346,6 +385,11 @@ static struct zpool_driver zs_zpool_driver = { | |||
| 346 | MODULE_ALIAS("zpool-zsmalloc"); | 385 | MODULE_ALIAS("zpool-zsmalloc"); |
| 347 | #endif /* CONFIG_ZPOOL */ | 386 | #endif /* CONFIG_ZPOOL */ |
| 348 | 387 | ||
| 388 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | ||
| 389 | { | ||
| 390 | return pages_per_zspage * PAGE_SIZE / size; | ||
| 391 | } | ||
| 392 | |||
| 349 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ | 393 | /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ |
| 350 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); | 394 | static DEFINE_PER_CPU(struct mapping_area, zs_map_area); |
| 351 | 395 | ||
| @@ -396,9 +440,182 @@ static int get_size_class_index(int size) | |||
| 396 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, | 440 | idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, |
| 397 | ZS_SIZE_CLASS_DELTA); | 441 | ZS_SIZE_CLASS_DELTA); |
| 398 | 442 | ||
| 399 | return idx; | 443 | return min(zs_size_classes - 1, idx); |
| 444 | } | ||
| 445 | |||
| 446 | #ifdef CONFIG_ZSMALLOC_STAT | ||
| 447 | |||
| 448 | static inline void zs_stat_inc(struct size_class *class, | ||
| 449 | enum zs_stat_type type, unsigned long cnt) | ||
| 450 | { | ||
| 451 | class->stats.objs[type] += cnt; | ||
| 452 | } | ||
| 453 | |||
| 454 | static inline void zs_stat_dec(struct size_class *class, | ||
| 455 | enum zs_stat_type type, unsigned long cnt) | ||
| 456 | { | ||
| 457 | class->stats.objs[type] -= cnt; | ||
| 458 | } | ||
| 459 | |||
| 460 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 461 | enum zs_stat_type type) | ||
| 462 | { | ||
| 463 | return class->stats.objs[type]; | ||
| 464 | } | ||
| 465 | |||
| 466 | static int __init zs_stat_init(void) | ||
| 467 | { | ||
| 468 | if (!debugfs_initialized()) | ||
| 469 | return -ENODEV; | ||
| 470 | |||
| 471 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
| 472 | if (!zs_stat_root) | ||
| 473 | return -ENOMEM; | ||
| 474 | |||
| 475 | return 0; | ||
| 476 | } | ||
| 477 | |||
| 478 | static void __exit zs_stat_exit(void) | ||
| 479 | { | ||
| 480 | debugfs_remove_recursive(zs_stat_root); | ||
| 481 | } | ||
| 482 | |||
| 483 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
| 484 | { | ||
| 485 | int i; | ||
| 486 | struct zs_pool *pool = s->private; | ||
| 487 | struct size_class *class; | ||
| 488 | int objs_per_zspage; | ||
| 489 | unsigned long class_almost_full, class_almost_empty; | ||
| 490 | unsigned long obj_allocated, obj_used, pages_used; | ||
| 491 | unsigned long total_class_almost_full = 0, total_class_almost_empty = 0; | ||
| 492 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
| 493 | |||
| 494 | seq_printf(s, " %5s %5s %11s %12s %13s %10s %10s %16s\n", | ||
| 495 | "class", "size", "almost_full", "almost_empty", | ||
| 496 | "obj_allocated", "obj_used", "pages_used", | ||
| 497 | "pages_per_zspage"); | ||
| 498 | |||
| 499 | for (i = 0; i < zs_size_classes; i++) { | ||
| 500 | class = pool->size_class[i]; | ||
| 501 | |||
| 502 | if (class->index != i) | ||
| 503 | continue; | ||
| 504 | |||
| 505 | spin_lock(&class->lock); | ||
| 506 | class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); | ||
| 507 | class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); | ||
| 508 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
| 509 | obj_used = zs_stat_get(class, OBJ_USED); | ||
| 510 | spin_unlock(&class->lock); | ||
| 511 | |||
| 512 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
| 513 | class->pages_per_zspage); | ||
| 514 | pages_used = obj_allocated / objs_per_zspage * | ||
| 515 | class->pages_per_zspage; | ||
| 516 | |||
| 517 | seq_printf(s, " %5u %5u %11lu %12lu %13lu %10lu %10lu %16d\n", | ||
| 518 | i, class->size, class_almost_full, class_almost_empty, | ||
| 519 | obj_allocated, obj_used, pages_used, | ||
| 520 | class->pages_per_zspage); | ||
| 521 | |||
| 522 | total_class_almost_full += class_almost_full; | ||
| 523 | total_class_almost_empty += class_almost_empty; | ||
| 524 | total_objs += obj_allocated; | ||
| 525 | total_used_objs += obj_used; | ||
| 526 | total_pages += pages_used; | ||
| 527 | } | ||
| 528 | |||
| 529 | seq_puts(s, "\n"); | ||
| 530 | seq_printf(s, " %5s %5s %11lu %12lu %13lu %10lu %10lu\n", | ||
| 531 | "Total", "", total_class_almost_full, | ||
| 532 | total_class_almost_empty, total_objs, | ||
| 533 | total_used_objs, total_pages); | ||
| 534 | |||
| 535 | return 0; | ||
| 536 | } | ||
| 537 | |||
| 538 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
| 539 | { | ||
| 540 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
| 541 | } | ||
| 542 | |||
| 543 | static const struct file_operations zs_stat_size_ops = { | ||
| 544 | .open = zs_stats_size_open, | ||
| 545 | .read = seq_read, | ||
| 546 | .llseek = seq_lseek, | ||
| 547 | .release = single_release, | ||
| 548 | }; | ||
| 549 | |||
| 550 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 551 | { | ||
| 552 | struct dentry *entry; | ||
| 553 | |||
| 554 | if (!zs_stat_root) | ||
| 555 | return -ENODEV; | ||
| 556 | |||
| 557 | entry = debugfs_create_dir(name, zs_stat_root); | ||
| 558 | if (!entry) { | ||
| 559 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
| 560 | return -ENOMEM; | ||
| 561 | } | ||
| 562 | pool->stat_dentry = entry; | ||
| 563 | |||
| 564 | entry = debugfs_create_file("classes", S_IFREG | S_IRUGO, | ||
| 565 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
| 566 | if (!entry) { | ||
| 567 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
| 568 | name, "classes"); | ||
| 569 | return -ENOMEM; | ||
| 570 | } | ||
| 571 | |||
| 572 | return 0; | ||
| 573 | } | ||
| 574 | |||
| 575 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 576 | { | ||
| 577 | debugfs_remove_recursive(pool->stat_dentry); | ||
| 578 | } | ||
| 579 | |||
| 580 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
| 581 | |||
| 582 | static inline void zs_stat_inc(struct size_class *class, | ||
| 583 | enum zs_stat_type type, unsigned long cnt) | ||
| 584 | { | ||
| 585 | } | ||
| 586 | |||
| 587 | static inline void zs_stat_dec(struct size_class *class, | ||
| 588 | enum zs_stat_type type, unsigned long cnt) | ||
| 589 | { | ||
| 590 | } | ||
| 591 | |||
| 592 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 593 | enum zs_stat_type type) | ||
| 594 | { | ||
| 595 | return 0; | ||
| 596 | } | ||
| 597 | |||
| 598 | static int __init zs_stat_init(void) | ||
| 599 | { | ||
| 600 | return 0; | ||
| 601 | } | ||
| 602 | |||
| 603 | static void __exit zs_stat_exit(void) | ||
| 604 | { | ||
| 605 | } | ||
| 606 | |||
| 607 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 608 | { | ||
| 609 | return 0; | ||
| 610 | } | ||
| 611 | |||
| 612 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 613 | { | ||
| 400 | } | 614 | } |
| 401 | 615 | ||
| 616 | #endif | ||
| 617 | |||
| 618 | |||
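
The per-class rows written by zs_stats_size_show() are derived arithmetically from the counters: objs_per_zspage comes from get_maxobj_per_zspage() and pages_used rounds obj_allocated down to whole zspages. A small worked example with made-up class numbers:

/* Worked example of the arithmetic behind the "classes" debugfs rows. */
#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage)
{
        return pages_per_zspage * PAGE_SIZE / size;
}

int main(void)
{
        int size = 208, pages_per_zspage = 1;           /* sample class */
        unsigned long obj_allocated = 500, obj_used = 420;

        unsigned int objs_per_zspage =
                get_maxobj_per_zspage(size, pages_per_zspage);
        unsigned long pages_used =
                obj_allocated / objs_per_zspage * pages_per_zspage;

        /* 4096/208 = 19 objects per zspage; 500/19 = 26 zspages -> 26 pages */
        printf("objs_per_zspage=%u pages_used=%lu (obj_used=%lu)\n",
               objs_per_zspage, pages_used, obj_used);
        return 0;
}
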
| 402 | /* | 619 | /* |
| 403 | * For each size class, zspages are divided into different groups | 620 | * For each size class, zspages are divided into different groups |
| 404 | * depending on how "full" they are. This was done so that we could | 621 | * depending on how "full" they are. This was done so that we could |
| @@ -419,7 +636,7 @@ static enum fullness_group get_fullness_group(struct page *page) | |||
| 419 | fg = ZS_EMPTY; | 636 | fg = ZS_EMPTY; |
| 420 | else if (inuse == max_objects) | 637 | else if (inuse == max_objects) |
| 421 | fg = ZS_FULL; | 638 | fg = ZS_FULL; |
| 422 | else if (inuse <= max_objects / fullness_threshold_frac) | 639 | else if (inuse <= 3 * max_objects / fullness_threshold_frac) |
| 423 | fg = ZS_ALMOST_EMPTY; | 640 | fg = ZS_ALMOST_EMPTY; |
| 424 | else | 641 | else |
| 425 | fg = ZS_ALMOST_FULL; | 642 | fg = ZS_ALMOST_FULL; |
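
The ALMOST_EMPTY boundary moves from 1/4 to 3/4 of a zspage's capacity (fullness_threshold_frac is 4 elsewhere in this file), which makes far more zspages eligible as compaction sources. A sketch of the grouping with the new threshold; the enum here is illustrative, not the kernel's definition:

/* Sketch of the fullness grouping after the threshold change. */
#include <stdio.h>

enum fullness_group { ZS_ALMOST_FULL, ZS_ALMOST_EMPTY, ZS_EMPTY, ZS_FULL };

static enum fullness_group get_fullness(int inuse, int max_objects)
{
        const int fullness_threshold_frac = 4;

        if (inuse == 0)
                return ZS_EMPTY;
        if (inuse == max_objects)
                return ZS_FULL;
        if (inuse <= 3 * max_objects / fullness_threshold_frac)
                return ZS_ALMOST_EMPTY;   /* now anything up to 3/4 full */
        return ZS_ALMOST_FULL;
}

int main(void)
{
        /* A zspage holding 19 objects with 10 in use was ALMOST_FULL under
         * the old 1/4 threshold; with the new 3/4 threshold it is
         * ALMOST_EMPTY and therefore a candidate source for compaction. */
        printf("group(10/19) = %d\n", get_fullness(10, 19));
        return 0;
}
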
| @@ -448,6 +665,8 @@ static void insert_zspage(struct page *page, struct size_class *class, | |||
| 448 | list_add_tail(&page->lru, &(*head)->lru); | 665 | list_add_tail(&page->lru, &(*head)->lru); |
| 449 | 666 | ||
| 450 | *head = page; | 667 | *head = page; |
| 668 | zs_stat_inc(class, fullness == ZS_ALMOST_EMPTY ? | ||
| 669 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
| 451 | } | 670 | } |
| 452 | 671 | ||
| 453 | /* | 672 | /* |
| @@ -473,6 +692,8 @@ static void remove_zspage(struct page *page, struct size_class *class, | |||
| 473 | struct page, lru); | 692 | struct page, lru); |
| 474 | 693 | ||
| 475 | list_del_init(&page->lru); | 694 | list_del_init(&page->lru); |
| 695 | zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? | ||
| 696 | CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); | ||
| 476 | } | 697 | } |
| 477 | 698 | ||
| 478 | /* | 699 | /* |
| @@ -484,11 +705,10 @@ static void remove_zspage(struct page *page, struct size_class *class, | |||
| 484 | * page from the freelist of the old fullness group to that of the new | 705 | * page from the freelist of the old fullness group to that of the new |
| 485 | * fullness group. | 706 | * fullness group. |
| 486 | */ | 707 | */ |
| 487 | static enum fullness_group fix_fullness_group(struct zs_pool *pool, | 708 | static enum fullness_group fix_fullness_group(struct size_class *class, |
| 488 | struct page *page) | 709 | struct page *page) |
| 489 | { | 710 | { |
| 490 | int class_idx; | 711 | int class_idx; |
| 491 | struct size_class *class; | ||
| 492 | enum fullness_group currfg, newfg; | 712 | enum fullness_group currfg, newfg; |
| 493 | 713 | ||
| 494 | BUG_ON(!is_first_page(page)); | 714 | BUG_ON(!is_first_page(page)); |
| @@ -498,7 +718,6 @@ static enum fullness_group fix_fullness_group(struct zs_pool *pool, | |||
| 498 | if (newfg == currfg) | 718 | if (newfg == currfg) |
| 499 | goto out; | 719 | goto out; |
| 500 | 720 | ||
| 501 | class = pool->size_class[class_idx]; | ||
| 502 | remove_zspage(page, class, currfg); | 721 | remove_zspage(page, class, currfg); |
| 503 | insert_zspage(page, class, newfg); | 722 | insert_zspage(page, class, newfg); |
| 504 | set_zspage_mapping(page, class_idx, newfg); | 723 | set_zspage_mapping(page, class_idx, newfg); |
| @@ -512,7 +731,8 @@ out: | |||
| 512 | * to form a zspage for each size class. This is important | 731 | * to form a zspage for each size class. This is important |
| 513 | * to reduce wastage due to unusable space left at end of | 732 | * to reduce wastage due to unusable space left at end of |
| 514 | * each zspage which is given as: | 733 | * each zspage which is given as: |
| 515 | * wastage = Zp - Zp % size_class | 734 | * wastage = Zp % class_size |
| 735 | * usage = Zp - wastage | ||
| 516 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... | 736 | * where Zp = zspage size = k * PAGE_SIZE where k = 1, 2, ... |
| 517 | * | 737 | * |
| 518 | * For example, for size class of 3/8 * PAGE_SIZE, we should | 738 | * For example, for size class of 3/8 * PAGE_SIZE, we should |
| @@ -571,35 +791,50 @@ static struct page *get_next_page(struct page *page) | |||
| 571 | 791 | ||
| 572 | /* | 792 | /* |
| 573 | * Encode <page, obj_idx> as a single handle value. | 793 | * Encode <page, obj_idx> as a single handle value. |
| 574 | * On hardware platforms with physical memory starting at 0x0 the pfn | 794 | * We use the least bit of handle for tagging. |
| 575 | * could be 0 so we ensure that the handle will never be 0 by adjusting the | ||
| 576 | * encoded obj_idx value before encoding. | ||
| 577 | */ | 795 | */ |
| 578 | static void *obj_location_to_handle(struct page *page, unsigned long obj_idx) | 796 | static void *location_to_obj(struct page *page, unsigned long obj_idx) |
| 579 | { | 797 | { |
| 580 | unsigned long handle; | 798 | unsigned long obj; |
| 581 | 799 | ||
| 582 | if (!page) { | 800 | if (!page) { |
| 583 | BUG_ON(obj_idx); | 801 | BUG_ON(obj_idx); |
| 584 | return NULL; | 802 | return NULL; |
| 585 | } | 803 | } |
| 586 | 804 | ||
| 587 | handle = page_to_pfn(page) << OBJ_INDEX_BITS; | 805 | obj = page_to_pfn(page) << OBJ_INDEX_BITS; |
| 588 | handle |= ((obj_idx + 1) & OBJ_INDEX_MASK); | 806 | obj |= ((obj_idx) & OBJ_INDEX_MASK); |
| 807 | obj <<= OBJ_TAG_BITS; | ||
| 589 | 808 | ||
| 590 | return (void *)handle; | 809 | return (void *)obj; |
| 591 | } | 810 | } |
| 592 | 811 | ||
| 593 | /* | 812 | /* |
| 594 | * Decode <page, obj_idx> pair from the given object handle. We adjust the | 813 | * Decode <page, obj_idx> pair from the given object handle. We adjust the |
| 595 | * decoded obj_idx back to its original value since it was adjusted in | 814 | * decoded obj_idx back to its original value since it was adjusted in |
| 596 | * obj_location_to_handle(). | 815 | * location_to_obj(). |
| 597 | */ | 816 | */ |
| 598 | static void obj_handle_to_location(unsigned long handle, struct page **page, | 817 | static void obj_to_location(unsigned long obj, struct page **page, |
| 599 | unsigned long *obj_idx) | 818 | unsigned long *obj_idx) |
| 600 | { | 819 | { |
| 601 | *page = pfn_to_page(handle >> OBJ_INDEX_BITS); | 820 | obj >>= OBJ_TAG_BITS; |
| 602 | *obj_idx = (handle & OBJ_INDEX_MASK) - 1; | 821 | *page = pfn_to_page(obj >> OBJ_INDEX_BITS); |
| 822 | *obj_idx = (obj & OBJ_INDEX_MASK); | ||
| 823 | } | ||
| 824 | |||
| 825 | static unsigned long handle_to_obj(unsigned long handle) | ||
| 826 | { | ||
| 827 | return *(unsigned long *)handle; | ||
| 828 | } | ||
| 829 | |||
| 830 | static unsigned long obj_to_head(struct size_class *class, struct page *page, | ||
| 831 | void *obj) | ||
| 832 | { | ||
| 833 | if (class->huge) { | ||
| 834 | VM_BUG_ON(!is_first_page(page)); | ||
| 835 | return *(unsigned long *)page_private(page); | ||
| 836 | } else | ||
| 837 | return *(unsigned long *)obj; | ||
| 603 | } | 838 | } |
| 604 | 839 | ||
| 605 | static unsigned long obj_idx_to_offset(struct page *page, | 840 | static unsigned long obj_idx_to_offset(struct page *page, |
| @@ -613,6 +848,25 @@ static unsigned long obj_idx_to_offset(struct page *page, | |||
| 613 | return off + obj_idx * class_size; | 848 | return off + obj_idx * class_size; |
| 614 | } | 849 | } |
| 615 | 850 | ||
| 851 | static inline int trypin_tag(unsigned long handle) | ||
| 852 | { | ||
| 853 | unsigned long *ptr = (unsigned long *)handle; | ||
| 854 | |||
| 855 | return !test_and_set_bit_lock(HANDLE_PIN_BIT, ptr); | ||
| 856 | } | ||
| 857 | |||
| 858 | static void pin_tag(unsigned long handle) | ||
| 859 | { | ||
| 860 | while (!trypin_tag(handle)); | ||
| 861 | } | ||
| 862 | |||
| 863 | static void unpin_tag(unsigned long handle) | ||
| 864 | { | ||
| 865 | unsigned long *ptr = (unsigned long *)handle; | ||
| 866 | |||
| 867 | clear_bit_unlock(HANDLE_PIN_BIT, ptr); | ||
| 868 | } | ||
| 869 | |||
| 616 | static void reset_page(struct page *page) | 870 | static void reset_page(struct page *page) |
| 617 | { | 871 | { |
| 618 | clear_bit(PG_private, &page->flags); | 872 | clear_bit(PG_private, &page->flags); |
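
pin_tag()/unpin_tag() above serialize object access and migration by spinning on HANDLE_PIN_BIT, the low bit of the value stored in the handle slot. A rough userspace approximation using GCC atomics in place of test_and_set_bit_lock():

/* Userspace approximation of pin_tag()/unpin_tag() (not the kernel bitops). */
#include <assert.h>
#include <stdlib.h>

#define HANDLE_PIN_BIT 0UL

static int trypin_tag(unsigned long handle)
{
        unsigned long *ptr = (unsigned long *)handle;
        unsigned long old = __atomic_fetch_or(ptr, 1UL << HANDLE_PIN_BIT,
                                              __ATOMIC_ACQUIRE);
        return !(old & (1UL << HANDLE_PIN_BIT));
}

static void pin_tag(unsigned long handle)
{
        while (!trypin_tag(handle))
                ;                          /* spin until the holder unpins */
}

static void unpin_tag(unsigned long handle)
{
        unsigned long *ptr = (unsigned long *)handle;
        __atomic_fetch_and(ptr, ~(1UL << HANDLE_PIN_BIT), __ATOMIC_RELEASE);
}

int main(void)
{
        unsigned long *slot = calloc(1, sizeof(*slot));
        unsigned long handle = (unsigned long)slot;

        *slot = 0x1000;                    /* encoded location, bit 0 clear */
        pin_tag(handle);                   /* e.g. zs_map_object() */
        assert(trypin_tag(handle) == 0);   /* a second pinner would spin */
        unpin_tag(handle);                 /* e.g. zs_unmap_object() */
        assert(*slot == 0x1000);           /* pin bit released, location intact */
        free(slot);
        return 0;
}
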
| @@ -674,7 +928,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
| 674 | link = (struct link_free *)vaddr + off / sizeof(*link); | 928 | link = (struct link_free *)vaddr + off / sizeof(*link); |
| 675 | 929 | ||
| 676 | while ((off += class->size) < PAGE_SIZE) { | 930 | while ((off += class->size) < PAGE_SIZE) { |
| 677 | link->next = obj_location_to_handle(page, i++); | 931 | link->next = location_to_obj(page, i++); |
| 678 | link += class->size / sizeof(*link); | 932 | link += class->size / sizeof(*link); |
| 679 | } | 933 | } |
| 680 | 934 | ||
| @@ -684,7 +938,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
| 684 | * page (if present) | 938 | * page (if present) |
| 685 | */ | 939 | */ |
| 686 | next_page = get_next_page(page); | 940 | next_page = get_next_page(page); |
| 687 | link->next = obj_location_to_handle(next_page, 0); | 941 | link->next = location_to_obj(next_page, 0); |
| 688 | kunmap_atomic(vaddr); | 942 | kunmap_atomic(vaddr); |
| 689 | page = next_page; | 943 | page = next_page; |
| 690 | off %= PAGE_SIZE; | 944 | off %= PAGE_SIZE; |
| @@ -738,7 +992,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) | |||
| 738 | 992 | ||
| 739 | init_zspage(first_page, class); | 993 | init_zspage(first_page, class); |
| 740 | 994 | ||
| 741 | first_page->freelist = obj_location_to_handle(first_page, 0); | 995 | first_page->freelist = location_to_obj(first_page, 0); |
| 742 | /* Maximum number of objects we can store in this zspage */ | 996 | /* Maximum number of objects we can store in this zspage */ |
| 743 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; | 997 | first_page->objects = class->pages_per_zspage * PAGE_SIZE / class->size; |
| 744 | 998 | ||
| @@ -860,12 +1114,19 @@ static void __zs_unmap_object(struct mapping_area *area, | |||
| 860 | { | 1114 | { |
| 861 | int sizes[2]; | 1115 | int sizes[2]; |
| 862 | void *addr; | 1116 | void *addr; |
| 863 | char *buf = area->vm_buf; | 1117 | char *buf; |
| 864 | 1118 | ||
| 865 | /* no write fastpath */ | 1119 | /* no write fastpath */ |
| 866 | if (area->vm_mm == ZS_MM_RO) | 1120 | if (area->vm_mm == ZS_MM_RO) |
| 867 | goto out; | 1121 | goto out; |
| 868 | 1122 | ||
| 1123 | buf = area->vm_buf; | ||
| 1124 | if (!area->huge) { | ||
| 1125 | buf = buf + ZS_HANDLE_SIZE; | ||
| 1126 | size -= ZS_HANDLE_SIZE; | ||
| 1127 | off += ZS_HANDLE_SIZE; | ||
| 1128 | } | ||
| 1129 | |||
| 869 | sizes[0] = PAGE_SIZE - off; | 1130 | sizes[0] = PAGE_SIZE - off; |
| 870 | sizes[1] = size - sizes[0]; | 1131 | sizes[1] = size - sizes[0]; |
| 871 | 1132 | ||
| @@ -952,11 +1213,6 @@ static void init_zs_size_classes(void) | |||
| 952 | zs_size_classes = nr; | 1213 | zs_size_classes = nr; |
| 953 | } | 1214 | } |
| 954 | 1215 | ||
| 955 | static unsigned int get_maxobj_per_zspage(int size, int pages_per_zspage) | ||
| 956 | { | ||
| 957 | return pages_per_zspage * PAGE_SIZE / size; | ||
| 958 | } | ||
| 959 | |||
| 960 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | 1216 | static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) |
| 961 | { | 1217 | { |
| 962 | if (prev->pages_per_zspage != pages_per_zspage) | 1218 | if (prev->pages_per_zspage != pages_per_zspage) |
| @@ -969,166 +1225,13 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) | |||
| 969 | return true; | 1225 | return true; |
| 970 | } | 1226 | } |
| 971 | 1227 | ||
| 972 | #ifdef CONFIG_ZSMALLOC_STAT | 1228 | static bool zspage_full(struct page *page) |
| 973 | |||
| 974 | static inline void zs_stat_inc(struct size_class *class, | ||
| 975 | enum zs_stat_type type, unsigned long cnt) | ||
| 976 | { | ||
| 977 | class->stats.objs[type] += cnt; | ||
| 978 | } | ||
| 979 | |||
| 980 | static inline void zs_stat_dec(struct size_class *class, | ||
| 981 | enum zs_stat_type type, unsigned long cnt) | ||
| 982 | { | ||
| 983 | class->stats.objs[type] -= cnt; | ||
| 984 | } | ||
| 985 | |||
| 986 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 987 | enum zs_stat_type type) | ||
| 988 | { | ||
| 989 | return class->stats.objs[type]; | ||
| 990 | } | ||
| 991 | |||
| 992 | static int __init zs_stat_init(void) | ||
| 993 | { | ||
| 994 | if (!debugfs_initialized()) | ||
| 995 | return -ENODEV; | ||
| 996 | |||
| 997 | zs_stat_root = debugfs_create_dir("zsmalloc", NULL); | ||
| 998 | if (!zs_stat_root) | ||
| 999 | return -ENOMEM; | ||
| 1000 | |||
| 1001 | return 0; | ||
| 1002 | } | ||
| 1003 | |||
| 1004 | static void __exit zs_stat_exit(void) | ||
| 1005 | { | ||
| 1006 | debugfs_remove_recursive(zs_stat_root); | ||
| 1007 | } | ||
| 1008 | |||
| 1009 | static int zs_stats_size_show(struct seq_file *s, void *v) | ||
| 1010 | { | 1229 | { |
| 1011 | int i; | 1230 | BUG_ON(!is_first_page(page)); |
| 1012 | struct zs_pool *pool = s->private; | ||
| 1013 | struct size_class *class; | ||
| 1014 | int objs_per_zspage; | ||
| 1015 | unsigned long obj_allocated, obj_used, pages_used; | ||
| 1016 | unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; | ||
| 1017 | |||
| 1018 | seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", | ||
| 1019 | "obj_allocated", "obj_used", "pages_used"); | ||
| 1020 | |||
| 1021 | for (i = 0; i < zs_size_classes; i++) { | ||
| 1022 | class = pool->size_class[i]; | ||
| 1023 | |||
| 1024 | if (class->index != i) | ||
| 1025 | continue; | ||
| 1026 | |||
| 1027 | spin_lock(&class->lock); | ||
| 1028 | obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); | ||
| 1029 | obj_used = zs_stat_get(class, OBJ_USED); | ||
| 1030 | spin_unlock(&class->lock); | ||
| 1031 | |||
| 1032 | objs_per_zspage = get_maxobj_per_zspage(class->size, | ||
| 1033 | class->pages_per_zspage); | ||
| 1034 | pages_used = obj_allocated / objs_per_zspage * | ||
| 1035 | class->pages_per_zspage; | ||
| 1036 | |||
| 1037 | seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, | ||
| 1038 | class->size, obj_allocated, obj_used, pages_used); | ||
| 1039 | |||
| 1040 | total_objs += obj_allocated; | ||
| 1041 | total_used_objs += obj_used; | ||
| 1042 | total_pages += pages_used; | ||
| 1043 | } | ||
| 1044 | |||
| 1045 | seq_puts(s, "\n"); | ||
| 1046 | seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", | ||
| 1047 | total_objs, total_used_objs, total_pages); | ||
| 1048 | |||
| 1049 | return 0; | ||
| 1050 | } | ||
| 1051 | |||
| 1052 | static int zs_stats_size_open(struct inode *inode, struct file *file) | ||
| 1053 | { | ||
| 1054 | return single_open(file, zs_stats_size_show, inode->i_private); | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | static const struct file_operations zs_stat_size_ops = { | ||
| 1058 | .open = zs_stats_size_open, | ||
| 1059 | .read = seq_read, | ||
| 1060 | .llseek = seq_lseek, | ||
| 1061 | .release = single_release, | ||
| 1062 | }; | ||
| 1063 | |||
| 1064 | static int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1065 | { | ||
| 1066 | struct dentry *entry; | ||
| 1067 | |||
| 1068 | if (!zs_stat_root) | ||
| 1069 | return -ENODEV; | ||
| 1070 | |||
| 1071 | entry = debugfs_create_dir(name, zs_stat_root); | ||
| 1072 | if (!entry) { | ||
| 1073 | pr_warn("debugfs dir <%s> creation failed\n", name); | ||
| 1074 | return -ENOMEM; | ||
| 1075 | } | ||
| 1076 | pool->stat_dentry = entry; | ||
| 1077 | |||
| 1078 | entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, | ||
| 1079 | pool->stat_dentry, pool, &zs_stat_size_ops); | ||
| 1080 | if (!entry) { | ||
| 1081 | pr_warn("%s: debugfs file entry <%s> creation failed\n", | ||
| 1082 | name, "obj_in_classes"); | ||
| 1083 | return -ENOMEM; | ||
| 1084 | } | ||
| 1085 | |||
| 1086 | return 0; | ||
| 1087 | } | ||
| 1088 | |||
| 1089 | static void zs_pool_stat_destroy(struct zs_pool *pool) | ||
| 1090 | { | ||
| 1091 | debugfs_remove_recursive(pool->stat_dentry); | ||
| 1092 | } | ||
| 1093 | |||
| 1094 | #else /* CONFIG_ZSMALLOC_STAT */ | ||
| 1095 | |||
| 1096 | static inline void zs_stat_inc(struct size_class *class, | ||
| 1097 | enum zs_stat_type type, unsigned long cnt) | ||
| 1098 | { | ||
| 1099 | } | ||
| 1100 | |||
| 1101 | static inline void zs_stat_dec(struct size_class *class, | ||
| 1102 | enum zs_stat_type type, unsigned long cnt) | ||
| 1103 | { | ||
| 1104 | } | ||
| 1105 | |||
| 1106 | static inline unsigned long zs_stat_get(struct size_class *class, | ||
| 1107 | enum zs_stat_type type) | ||
| 1108 | { | ||
| 1109 | return 0; | ||
| 1110 | } | ||
| 1111 | |||
| 1112 | static int __init zs_stat_init(void) | ||
| 1113 | { | ||
| 1114 | return 0; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | static void __exit zs_stat_exit(void) | ||
| 1118 | { | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) | ||
| 1122 | { | ||
| 1123 | return 0; | ||
| 1124 | } | ||
| 1125 | 1231 | ||
| 1126 | static inline void zs_pool_stat_destroy(struct zs_pool *pool) | 1232 | return page->inuse == page->objects; |
| 1127 | { | ||
| 1128 | } | 1233 | } |
| 1129 | 1234 | ||
| 1130 | #endif | ||
| 1131 | |||
| 1132 | unsigned long zs_get_total_pages(struct zs_pool *pool) | 1235 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
| 1133 | { | 1236 | { |
| 1134 | return atomic_long_read(&pool->pages_allocated); | 1237 | return atomic_long_read(&pool->pages_allocated); |
| @@ -1153,13 +1256,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1153 | enum zs_mapmode mm) | 1256 | enum zs_mapmode mm) |
| 1154 | { | 1257 | { |
| 1155 | struct page *page; | 1258 | struct page *page; |
| 1156 | unsigned long obj_idx, off; | 1259 | unsigned long obj, obj_idx, off; |
| 1157 | 1260 | ||
| 1158 | unsigned int class_idx; | 1261 | unsigned int class_idx; |
| 1159 | enum fullness_group fg; | 1262 | enum fullness_group fg; |
| 1160 | struct size_class *class; | 1263 | struct size_class *class; |
| 1161 | struct mapping_area *area; | 1264 | struct mapping_area *area; |
| 1162 | struct page *pages[2]; | 1265 | struct page *pages[2]; |
| 1266 | void *ret; | ||
| 1163 | 1267 | ||
| 1164 | BUG_ON(!handle); | 1268 | BUG_ON(!handle); |
| 1165 | 1269 | ||
| @@ -1170,7 +1274,11 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1170 | */ | 1274 | */ |
| 1171 | BUG_ON(in_interrupt()); | 1275 | BUG_ON(in_interrupt()); |
| 1172 | 1276 | ||
| 1173 | obj_handle_to_location(handle, &page, &obj_idx); | 1277 | /* From now on, migration cannot move the object */ |
| 1278 | pin_tag(handle); | ||
| 1279 | |||
| 1280 | obj = handle_to_obj(handle); | ||
| 1281 | obj_to_location(obj, &page, &obj_idx); | ||
| 1174 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1282 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
| 1175 | class = pool->size_class[class_idx]; | 1283 | class = pool->size_class[class_idx]; |
| 1176 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1284 | off = obj_idx_to_offset(page, obj_idx, class->size); |
| @@ -1180,7 +1288,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1180 | if (off + class->size <= PAGE_SIZE) { | 1288 | if (off + class->size <= PAGE_SIZE) { |
| 1181 | /* this object is contained entirely within a page */ | 1289 | /* this object is contained entirely within a page */ |
| 1182 | area->vm_addr = kmap_atomic(page); | 1290 | area->vm_addr = kmap_atomic(page); |
| 1183 | return area->vm_addr + off; | 1291 | ret = area->vm_addr + off; |
| 1292 | goto out; | ||
| 1184 | } | 1293 | } |
| 1185 | 1294 | ||
| 1186 | /* this object spans two pages */ | 1295 | /* this object spans two pages */ |
| @@ -1188,14 +1297,19 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
| 1188 | pages[1] = get_next_page(page); | 1297 | pages[1] = get_next_page(page); |
| 1189 | BUG_ON(!pages[1]); | 1298 | BUG_ON(!pages[1]); |
| 1190 | 1299 | ||
| 1191 | return __zs_map_object(area, pages, off, class->size); | 1300 | ret = __zs_map_object(area, pages, off, class->size); |
| 1301 | out: | ||
| 1302 | if (!class->huge) | ||
| 1303 | ret += ZS_HANDLE_SIZE; | ||
| 1304 | |||
| 1305 | return ret; | ||
| 1192 | } | 1306 | } |
| 1193 | EXPORT_SYMBOL_GPL(zs_map_object); | 1307 | EXPORT_SYMBOL_GPL(zs_map_object); |
| 1194 | 1308 | ||
| 1195 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | 1309 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle) |
| 1196 | { | 1310 | { |
| 1197 | struct page *page; | 1311 | struct page *page; |
| 1198 | unsigned long obj_idx, off; | 1312 | unsigned long obj, obj_idx, off; |
| 1199 | 1313 | ||
| 1200 | unsigned int class_idx; | 1314 | unsigned int class_idx; |
| 1201 | enum fullness_group fg; | 1315 | enum fullness_group fg; |
| @@ -1204,7 +1318,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
| 1204 | 1318 | ||
| 1205 | BUG_ON(!handle); | 1319 | BUG_ON(!handle); |
| 1206 | 1320 | ||
| 1207 | obj_handle_to_location(handle, &page, &obj_idx); | 1321 | obj = handle_to_obj(handle); |
| 1322 | obj_to_location(obj, &page, &obj_idx); | ||
| 1208 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); | 1323 | get_zspage_mapping(get_first_page(page), &class_idx, &fg); |
| 1209 | class = pool->size_class[class_idx]; | 1324 | class = pool->size_class[class_idx]; |
| 1210 | off = obj_idx_to_offset(page, obj_idx, class->size); | 1325 | off = obj_idx_to_offset(page, obj_idx, class->size); |
| @@ -1222,9 +1337,42 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
| 1222 | __zs_unmap_object(area, pages, off, class->size); | 1337 | __zs_unmap_object(area, pages, off, class->size); |
| 1223 | } | 1338 | } |
| 1224 | put_cpu_var(zs_map_area); | 1339 | put_cpu_var(zs_map_area); |
| 1340 | unpin_tag(handle); | ||
| 1225 | } | 1341 | } |
| 1226 | EXPORT_SYMBOL_GPL(zs_unmap_object); | 1342 | EXPORT_SYMBOL_GPL(zs_unmap_object); |
| 1227 | 1343 | ||
| 1344 | static unsigned long obj_malloc(struct page *first_page, | ||
| 1345 | struct size_class *class, unsigned long handle) | ||
| 1346 | { | ||
| 1347 | unsigned long obj; | ||
| 1348 | struct link_free *link; | ||
| 1349 | |||
| 1350 | struct page *m_page; | ||
| 1351 | unsigned long m_objidx, m_offset; | ||
| 1352 | void *vaddr; | ||
| 1353 | |||
| 1354 | handle |= OBJ_ALLOCATED_TAG; | ||
| 1355 | obj = (unsigned long)first_page->freelist; | ||
| 1356 | obj_to_location(obj, &m_page, &m_objidx); | ||
| 1357 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | ||
| 1358 | |||
| 1359 | vaddr = kmap_atomic(m_page); | ||
| 1360 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); | ||
| 1361 | first_page->freelist = link->next; | ||
| 1362 | if (!class->huge) | ||
| 1363 | /* record handle in the header of allocated chunk */ | ||
| 1364 | link->handle = handle; | ||
| 1365 | else | ||
| 1366 | /* record handle in first_page->private */ | ||
| 1367 | set_page_private(first_page, handle); | ||
| 1368 | kunmap_atomic(vaddr); | ||
| 1369 | first_page->inuse++; | ||
| 1370 | zs_stat_inc(class, OBJ_USED, 1); | ||
| 1371 | |||
| 1372 | return obj; | ||
| 1373 | } | ||
| 1374 | |||
| 1375 | |||
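
obj_malloc() pops the head of the zspage freelist and then parks the tagged handle either in the chunk's link_free header or, for huge classes (pages_per_zspage == 1 with a single object per zspage), in first_page->private, since such a chunk has no room for an in-object header. A toy model of that choice; the structs merely stand in for link_free and struct page:

/* Sketch of where the back-reference to the handle is kept. */
#include <assert.h>
#include <stdio.h>

#define OBJ_ALLOCATED_TAG 1UL

struct fake_first_page { unsigned long private; };   /* stands in for page->private */
struct chunk_header    { unsigned long handle;  };   /* stands in for link_free */

static void store_handle(int huge, struct fake_first_page *fp,
                         struct chunk_header *hdr, unsigned long handle)
{
        handle |= OBJ_ALLOCATED_TAG;         /* mark the object as allocated */
        if (huge)
                fp->private = handle;        /* no room for a header in the chunk */
        else
                hdr->handle = handle;        /* first word of the chunk */
}

int main(void)
{
        struct fake_first_page fp = { 0 };
        struct chunk_header hdr = { 0 };
        unsigned long handle = 0x8000;       /* 4-byte aligned, so bit 0 is free */

        store_handle(0, &fp, &hdr, handle);
        assert(hdr.handle == (handle | OBJ_ALLOCATED_TAG));

        store_handle(1, &fp, &hdr, handle);
        assert(fp.private == (handle | OBJ_ALLOCATED_TAG));
        printf("header=%#lx private=%#lx\n", hdr.handle, fp.private);
        return 0;
}
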
| 1228 | /** | 1376 | /** |
| 1229 | * zs_malloc - Allocate block of given size from pool. | 1377 | * zs_malloc - Allocate block of given size from pool. |
| 1230 | * @pool: pool to allocate from | 1378 | * @pool: pool to allocate from |
| @@ -1236,17 +1384,19 @@ EXPORT_SYMBOL_GPL(zs_unmap_object); | |||
| 1236 | */ | 1384 | */ |
| 1237 | unsigned long zs_malloc(struct zs_pool *pool, size_t size) | 1385 | unsigned long zs_malloc(struct zs_pool *pool, size_t size) |
| 1238 | { | 1386 | { |
| 1239 | unsigned long obj; | 1387 | unsigned long handle, obj; |
| 1240 | struct link_free *link; | ||
| 1241 | struct size_class *class; | 1388 | struct size_class *class; |
| 1242 | void *vaddr; | 1389 | struct page *first_page; |
| 1243 | |||
| 1244 | struct page *first_page, *m_page; | ||
| 1245 | unsigned long m_objidx, m_offset; | ||
| 1246 | 1390 | ||
| 1247 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) | 1391 | if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) |
| 1248 | return 0; | 1392 | return 0; |
| 1249 | 1393 | ||
| 1394 | handle = alloc_handle(pool); | ||
| 1395 | if (!handle) | ||
| 1396 | return 0; | ||
| 1397 | |||
| 1398 | /* extra space in chunk to keep the handle */ | ||
| 1399 | size += ZS_HANDLE_SIZE; | ||
| 1250 | class = pool->size_class[get_size_class_index(size)]; | 1400 | class = pool->size_class[get_size_class_index(size)]; |
| 1251 | 1401 | ||
| 1252 | spin_lock(&class->lock); | 1402 | spin_lock(&class->lock); |
| @@ -1255,8 +1405,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1255 | if (!first_page) { | 1405 | if (!first_page) { |
| 1256 | spin_unlock(&class->lock); | 1406 | spin_unlock(&class->lock); |
| 1257 | first_page = alloc_zspage(class, pool->flags); | 1407 | first_page = alloc_zspage(class, pool->flags); |
| 1258 | if (unlikely(!first_page)) | 1408 | if (unlikely(!first_page)) { |
| 1409 | free_handle(pool, handle); | ||
| 1259 | return 0; | 1410 | return 0; |
| 1411 | } | ||
| 1260 | 1412 | ||
| 1261 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1413 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
| 1262 | atomic_long_add(class->pages_per_zspage, | 1414 | atomic_long_add(class->pages_per_zspage, |
| @@ -1267,73 +1419,360 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
| 1267 | class->size, class->pages_per_zspage)); | 1419 | class->size, class->pages_per_zspage)); |
| 1268 | } | 1420 | } |
| 1269 | 1421 | ||
| 1270 | obj = (unsigned long)first_page->freelist; | 1422 | obj = obj_malloc(first_page, class, handle); |
| 1271 | obj_handle_to_location(obj, &m_page, &m_objidx); | ||
| 1272 | m_offset = obj_idx_to_offset(m_page, m_objidx, class->size); | ||
| 1273 | |||
| 1274 | vaddr = kmap_atomic(m_page); | ||
| 1275 | link = (struct link_free *)vaddr + m_offset / sizeof(*link); | ||
| 1276 | first_page->freelist = link->next; | ||
| 1277 | memset(link, POISON_INUSE, sizeof(*link)); | ||
| 1278 | kunmap_atomic(vaddr); | ||
| 1279 | |||
| 1280 | first_page->inuse++; | ||
| 1281 | zs_stat_inc(class, OBJ_USED, 1); | ||
| 1282 | /* Now move the zspage to another fullness group, if required */ | 1423 | /* Now move the zspage to another fullness group, if required */ |
| 1283 | fix_fullness_group(pool, first_page); | 1424 | fix_fullness_group(class, first_page); |
| 1425 | record_obj(handle, obj); | ||
| 1284 | spin_unlock(&class->lock); | 1426 | spin_unlock(&class->lock); |
| 1285 | 1427 | ||
| 1286 | return obj; | 1428 | return handle; |
| 1287 | } | 1429 | } |
| 1288 | EXPORT_SYMBOL_GPL(zs_malloc); | 1430 | EXPORT_SYMBOL_GPL(zs_malloc); |
| 1289 | 1431 | ||
| 1290 | void zs_free(struct zs_pool *pool, unsigned long obj) | 1432 | static void obj_free(struct zs_pool *pool, struct size_class *class, |
| 1433 | unsigned long obj) | ||
| 1291 | { | 1434 | { |
| 1292 | struct link_free *link; | 1435 | struct link_free *link; |
| 1293 | struct page *first_page, *f_page; | 1436 | struct page *first_page, *f_page; |
| 1294 | unsigned long f_objidx, f_offset; | 1437 | unsigned long f_objidx, f_offset; |
| 1295 | void *vaddr; | 1438 | void *vaddr; |
| 1296 | |||
| 1297 | int class_idx; | 1439 | int class_idx; |
| 1298 | struct size_class *class; | ||
| 1299 | enum fullness_group fullness; | 1440 | enum fullness_group fullness; |
| 1300 | 1441 | ||
| 1301 | if (unlikely(!obj)) | 1442 | BUG_ON(!obj); |
| 1302 | return; | ||
| 1303 | 1443 | ||
| 1304 | obj_handle_to_location(obj, &f_page, &f_objidx); | 1444 | obj &= ~OBJ_ALLOCATED_TAG; |
| 1445 | obj_to_location(obj, &f_page, &f_objidx); | ||
| 1305 | first_page = get_first_page(f_page); | 1446 | first_page = get_first_page(f_page); |
| 1306 | 1447 | ||
| 1307 | get_zspage_mapping(first_page, &class_idx, &fullness); | 1448 | get_zspage_mapping(first_page, &class_idx, &fullness); |
| 1308 | class = pool->size_class[class_idx]; | ||
| 1309 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); | 1449 | f_offset = obj_idx_to_offset(f_page, f_objidx, class->size); |
| 1310 | 1450 | ||
| 1311 | spin_lock(&class->lock); | 1451 | vaddr = kmap_atomic(f_page); |
| 1312 | 1452 | ||
| 1313 | /* Insert this object in containing zspage's freelist */ | 1453 | /* Insert this object in containing zspage's freelist */ |
| 1314 | vaddr = kmap_atomic(f_page); | ||
| 1315 | link = (struct link_free *)(vaddr + f_offset); | 1454 | link = (struct link_free *)(vaddr + f_offset); |
| 1316 | link->next = first_page->freelist; | 1455 | link->next = first_page->freelist; |
| 1456 | if (class->huge) | ||
| 1457 | set_page_private(first_page, 0); | ||
| 1317 | kunmap_atomic(vaddr); | 1458 | kunmap_atomic(vaddr); |
| 1318 | first_page->freelist = (void *)obj; | 1459 | first_page->freelist = (void *)obj; |
| 1319 | |||
| 1320 | first_page->inuse--; | 1460 | first_page->inuse--; |
| 1321 | fullness = fix_fullness_group(pool, first_page); | ||
| 1322 | |||
| 1323 | zs_stat_dec(class, OBJ_USED, 1); | 1461 | zs_stat_dec(class, OBJ_USED, 1); |
| 1324 | if (fullness == ZS_EMPTY) | 1462 | } |
| 1463 | |||
| 1464 | void zs_free(struct zs_pool *pool, unsigned long handle) | ||
| 1465 | { | ||
| 1466 | struct page *first_page, *f_page; | ||
| 1467 | unsigned long obj, f_objidx; | ||
| 1468 | int class_idx; | ||
| 1469 | struct size_class *class; | ||
| 1470 | enum fullness_group fullness; | ||
| 1471 | |||
| 1472 | if (unlikely(!handle)) | ||
| 1473 | return; | ||
| 1474 | |||
| 1475 | pin_tag(handle); | ||
| 1476 | obj = handle_to_obj(handle); | ||
| 1477 | obj_to_location(obj, &f_page, &f_objidx); | ||
| 1478 | first_page = get_first_page(f_page); | ||
| 1479 | |||
| 1480 | get_zspage_mapping(first_page, &class_idx, &fullness); | ||
| 1481 | class = pool->size_class[class_idx]; | ||
| 1482 | |||
| 1483 | spin_lock(&class->lock); | ||
| 1484 | obj_free(pool, class, obj); | ||
| 1485 | fullness = fix_fullness_group(class, first_page); | ||
| 1486 | if (fullness == ZS_EMPTY) { | ||
| 1325 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | 1487 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( |
| 1326 | class->size, class->pages_per_zspage)); | 1488 | class->size, class->pages_per_zspage)); |
| 1327 | 1489 | atomic_long_sub(class->pages_per_zspage, | |
| 1490 | &pool->pages_allocated); | ||
| 1491 | free_zspage(first_page); | ||
| 1492 | } | ||
| 1328 | spin_unlock(&class->lock); | 1493 | spin_unlock(&class->lock); |
| 1494 | unpin_tag(handle); | ||
| 1495 | |||
| 1496 | free_handle(pool, handle); | ||
| 1497 | } | ||
| 1498 | EXPORT_SYMBOL_GPL(zs_free); | ||
| 1499 | |||
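
Taken together, zs_malloc() now returns a handle that must be pinned via zs_map_object() before the data can be touched, and unpinned again before compaction may move the object. A hedged sketch of a caller's flow using only the signatures visible in this diff; pool creation is not shown here, so it is assumed to happen elsewhere (e.g. via zs_create_pool(), whose exact signature varies by kernel version):

/* Sketch only: how a zsmalloc user would store a buffer through a handle. */
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/zsmalloc.h>

static int store_compressed(struct zs_pool *pool, const void *src, size_t len)
{
        unsigned long handle;
        void *dst;

        handle = zs_malloc(pool, len);        /* opaque handle, not a pointer */
        if (!handle)
                return -ENOMEM;

        dst = zs_map_object(pool, handle, ZS_MM_WO);  /* pins the object */
        memcpy(dst, src, len);
        zs_unmap_object(pool, handle);        /* unpins; object may now migrate */

        /* ... later, when the data is no longer needed ... */
        zs_free(pool, handle);
        return 0;
}
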
| 1500 | static void zs_object_copy(unsigned long src, unsigned long dst, | ||
| 1501 | struct size_class *class) | ||
| 1502 | { | ||
| 1503 | struct page *s_page, *d_page; | ||
| 1504 | unsigned long s_objidx, d_objidx; | ||
| 1505 | unsigned long s_off, d_off; | ||
| 1506 | void *s_addr, *d_addr; | ||
| 1507 | int s_size, d_size, size; | ||
| 1508 | int written = 0; | ||
| 1509 | |||
| 1510 | s_size = d_size = class->size; | ||
| 1511 | |||
| 1512 | obj_to_location(src, &s_page, &s_objidx); | ||
| 1513 | obj_to_location(dst, &d_page, &d_objidx); | ||
| 1514 | |||
| 1515 | s_off = obj_idx_to_offset(s_page, s_objidx, class->size); | ||
| 1516 | d_off = obj_idx_to_offset(d_page, d_objidx, class->size); | ||
| 1517 | |||
| 1518 | if (s_off + class->size > PAGE_SIZE) | ||
| 1519 | s_size = PAGE_SIZE - s_off; | ||
| 1520 | |||
| 1521 | if (d_off + class->size > PAGE_SIZE) | ||
| 1522 | d_size = PAGE_SIZE - d_off; | ||
| 1523 | |||
| 1524 | s_addr = kmap_atomic(s_page); | ||
| 1525 | d_addr = kmap_atomic(d_page); | ||
| 1526 | |||
| 1527 | while (1) { | ||
| 1528 | size = min(s_size, d_size); | ||
| 1529 | memcpy(d_addr + d_off, s_addr + s_off, size); | ||
| 1530 | written += size; | ||
| 1531 | |||
| 1532 | if (written == class->size) | ||
| 1533 | break; | ||
| 1534 | |||
| 1535 | s_off += size; | ||
| 1536 | s_size -= size; | ||
| 1537 | d_off += size; | ||
| 1538 | d_size -= size; | ||
| 1539 | |||
| 1540 | if (s_off >= PAGE_SIZE) { | ||
| 1541 | kunmap_atomic(d_addr); | ||
| 1542 | kunmap_atomic(s_addr); | ||
| 1543 | s_page = get_next_page(s_page); | ||
| 1544 | BUG_ON(!s_page); | ||
| 1545 | s_addr = kmap_atomic(s_page); | ||
| 1546 | d_addr = kmap_atomic(d_page); | ||
| 1547 | s_size = class->size - written; | ||
| 1548 | s_off = 0; | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | if (d_off >= PAGE_SIZE) { | ||
| 1552 | kunmap_atomic(d_addr); | ||
| 1553 | d_page = get_next_page(d_page); | ||
| 1554 | BUG_ON(!d_page); | ||
| 1555 | d_addr = kmap_atomic(d_page); | ||
| 1556 | d_size = class->size - written; | ||
| 1557 | d_off = 0; | ||
| 1558 | } | ||
| 1559 | } | ||
| 1560 | |||
| 1561 | kunmap_atomic(d_addr); | ||
| 1562 | kunmap_atomic(s_addr); | ||
| 1563 | } | ||
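
zs_object_copy() has to cope with objects that straddle a page boundary on the source side, the destination side, or both, so it copies min(s_size, d_size) bytes at a time and re-derives the remaining length whenever an offset wraps past PAGE_SIZE. A minimal userspace model of that arithmetic (plain arrays stand in for the kmap'd pages; everything here is illustrative, not kernel code):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Copy obj_size bytes starting at (page 0, s_off) of src_pages into
 * (page 0, d_off) of dst_pages, chunking the way zs_object_copy() does. */
static void model_copy(unsigned char src_pages[2][PAGE_SIZE], size_t s_off,
		       unsigned char dst_pages[2][PAGE_SIZE], size_t d_off,
		       size_t obj_size)
{
	size_t s_size = obj_size, d_size = obj_size, written = 0;
	int s_pg = 0, d_pg = 0;

	if (s_off + obj_size > PAGE_SIZE)
		s_size = PAGE_SIZE - s_off;
	if (d_off + obj_size > PAGE_SIZE)
		d_size = PAGE_SIZE - d_off;

	while (1) {
		size_t chunk = s_size < d_size ? s_size : d_size;

		memcpy(&dst_pages[d_pg][d_off], &src_pages[s_pg][s_off], chunk);
		written += chunk;
		if (written == obj_size)
			break;

		s_off += chunk;
		s_size -= chunk;
		d_off += chunk;
		d_size -= chunk;

		if (s_off >= PAGE_SIZE) {	/* source object continues on the next page */
			s_pg++;
			s_size = obj_size - written;
			s_off = 0;
		}
		if (d_off >= PAGE_SIZE) {	/* destination slot continues on the next page */
			d_pg++;
			d_size = obj_size - written;
			d_off = 0;
		}
	}
}

int main(void)
{
	static unsigned char src[2][PAGE_SIZE], dst[2][PAGE_SIZE];
	size_t i, obj_size = 300;

	/* A 300-byte object that starts 100 bytes before the end of page 0. */
	for (i = 0; i < obj_size; i++)
		((unsigned char *)src)[PAGE_SIZE - 100 + i] = (unsigned char)i;

	model_copy(src, PAGE_SIZE - 100, dst, 50, obj_size);

	for (i = 0; i < obj_size; i++)
		if (((unsigned char *)dst)[50 + i] != (unsigned char)i)
			return 1;
	printf("copied %zu bytes across a page boundary\n", obj_size);
	return 0;
}
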
| 1564 | |||
| 1565 | /* | ||
| 1566 | * Find an allocated object in the zspage, starting at the given index, | ||
| 1567 | * and return its handle. | ||

| 1568 | */ | ||
| 1569 | static unsigned long find_alloced_obj(struct page *page, int index, | ||
| 1570 | struct size_class *class) | ||
| 1571 | { | ||
| 1572 | unsigned long head; | ||
| 1573 | int offset = 0; | ||
| 1574 | unsigned long handle = 0; | ||
| 1575 | void *addr = kmap_atomic(page); | ||
| 1576 | |||
| 1577 | if (!is_first_page(page)) | ||
| 1578 | offset = page->index; | ||
| 1579 | offset += class->size * index; | ||
| 1580 | |||
| 1581 | while (offset < PAGE_SIZE) { | ||
| 1582 | head = obj_to_head(class, page, addr + offset); | ||
| 1583 | if (head & OBJ_ALLOCATED_TAG) { | ||
| 1584 | handle = head & ~OBJ_ALLOCATED_TAG; | ||
| 1585 | if (trypin_tag(handle)) | ||
| 1586 | break; | ||
| 1587 | handle = 0; | ||
| 1588 | } | ||
| 1589 | |||
| 1590 | offset += class->size; | ||
| 1591 | index++; | ||
| 1592 | } | ||
| 1593 | |||
| 1594 | kunmap_atomic(addr); | ||
| 1595 | return handle; | ||
| 1596 | } | ||
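
find_alloced_obj() walks a page in class->size strides, reading the head word of each slot; heads of live objects carry the allocated tag and, with the tag stripped, yield the handle. A toy userspace model of that scan (the tag value and the flat heads[] array are assumptions for illustration; the kernel reads heads through kmap_atomic(), honours the first page's partial-object offset, and additionally pins the handle with trypin_tag(), all omitted here):

#include <stdio.h>

#define OBJ_ALLOCATED_TAG 1UL	/* model assumption: bit 0 marks live objects */

/* Return the handle of the first tagged head at or after 'index', or 0. */
static unsigned long first_alloced(const unsigned long *heads, int nr, int index)
{
	for (; index < nr; index++)
		if (heads[index] & OBJ_ALLOCATED_TAG)
			return heads[index] & ~OBJ_ALLOCATED_TAG;
	return 0;
}

int main(void)
{
	/* Slots 0-1 are free, slot 2 holds a live object whose handle is 0x1000. */
	unsigned long heads[4] = { 0, 0, 0x1000 | OBJ_ALLOCATED_TAG, 0 };

	printf("handle = %#lx\n", first_alloced(heads, 4, 0));
	return 0;
}
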
| 1597 | |||
| 1598 | struct zs_compact_control { | ||
| 1599 | /* Source page for migration which could be a subpage of zspage. */ | ||
| 1600 | struct page *s_page; | ||
| 1601 | /* Destination page for migration which should be the first page | ||
| 1602 | * of a zspage. */ | ||
| 1603 | struct page *d_page; | ||
| 1604 | /* Starting object index within @s_page, used to find the next live | ||
| 1605 | * object in the subpage. */ | ||
| 1606 | int index; | ||
| 1607 | /* how many of objects are migrated */ | ||
| 1608 | int nr_migrated; | ||
| 1609 | }; | ||
| 1610 | |||
| 1611 | static int migrate_zspage(struct zs_pool *pool, struct size_class *class, | ||
| 1612 | struct zs_compact_control *cc) | ||
| 1613 | { | ||
| 1614 | unsigned long used_obj, free_obj; | ||
| 1615 | unsigned long handle; | ||
| 1616 | struct page *s_page = cc->s_page; | ||
| 1617 | struct page *d_page = cc->d_page; | ||
| 1618 | unsigned long index = cc->index; | ||
| 1619 | int nr_migrated = 0; | ||
| 1620 | int ret = 0; | ||
| 1621 | |||
| 1622 | while (1) { | ||
| 1623 | handle = find_alloced_obj(s_page, index, class); | ||
| 1624 | if (!handle) { | ||
| 1625 | s_page = get_next_page(s_page); | ||
| 1626 | if (!s_page) | ||
| 1627 | break; | ||
| 1628 | index = 0; | ||
| 1629 | continue; | ||
| 1630 | } | ||
| 1631 | |||
| 1632 | /* Stop if there is no more space */ | ||
| 1633 | if (zspage_full(d_page)) { | ||
| 1634 | unpin_tag(handle); | ||
| 1635 | ret = -ENOMEM; | ||
| 1636 | break; | ||
| 1637 | } | ||
| 1638 | |||
| 1639 | used_obj = handle_to_obj(handle); | ||
| 1640 | free_obj = obj_malloc(d_page, class, handle); | ||
| 1641 | zs_object_copy(used_obj, free_obj, class); | ||
| 1642 | index++; | ||
| 1643 | record_obj(handle, free_obj); | ||
| 1644 | unpin_tag(handle); | ||
| 1645 | obj_free(pool, class, used_obj); | ||
| 1646 | nr_migrated++; | ||
| 1647 | } | ||
| 1648 | |||
| 1649 | /* Remember last position in this iteration */ | ||
| 1650 | cc->s_page = s_page; | ||
| 1651 | cc->index = index; | ||
| 1652 | cc->nr_migrated = nr_migrated; | ||
| 1653 | |||
| 1654 | return ret; | ||
| 1655 | } | ||
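
What lets migrate_zspage() move an object while its owner still holds a reference is the handle indirection introduced earlier in this series: the caller keeps the handle, the handle stores the encoded <page, index> location, and record_obj() only has to rewrite that one word after zs_object_copy(). A userspace model of the indirection (the struct and values are made up; in the kernel the handle is an unsigned long pointing at a slab-allocated word):

#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the slab-allocated word a zsmalloc handle points at. */
struct handle { unsigned long obj; };

static unsigned long handle_to_obj(struct handle *h) { return h->obj; }
static void record_obj(struct handle *h, unsigned long obj) { h->obj = obj; }

int main(void)
{
	struct handle *h = malloc(sizeof(*h));

	if (!h)
		return 1;
	h->obj = 0xaaa0;			/* original <page, index> encoding */
	printf("before compaction: %#lx\n", handle_to_obj(h));

	record_obj(h, 0xbbb0);			/* object copied to a new slot */
	printf("after compaction:  %#lx\n", handle_to_obj(h));

	free(h);
	return 0;
}
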
| 1656 | |||
| 1657 | static struct page *alloc_target_page(struct size_class *class) | ||
| 1658 | { | ||
| 1659 | int i; | ||
| 1660 | struct page *page; | ||
| 1661 | |||
| 1662 | for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { | ||
| 1663 | page = class->fullness_list[i]; | ||
| 1664 | if (page) { | ||
| 1665 | remove_zspage(page, class, i); | ||
| 1666 | break; | ||
| 1667 | } | ||
| 1668 | } | ||
| 1669 | |||
| 1670 | return page; | ||
| 1671 | } | ||
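
alloc_target_page() simply takes the first zspage it finds on any of the class's in-use fullness lists and removes it from the list while objects are migrated into it. A trivial userspace model of that "first non-empty list wins" choice (the names, group count, and list contents are invented; the kernel walks struct page lists under class->lock):

#include <stdio.h>
#include <stddef.h>

#define NR_FULLNESS_GROUPS 2	/* model assumption: only the in-use groups are scanned */

int main(void)
{
	const char *fullness_list[NR_FULLNESS_GROUPS] = { NULL, "zspage-A" };
	const char *target = NULL;
	int i;

	for (i = 0; i < NR_FULLNESS_GROUPS; i++) {
		if (fullness_list[i]) {
			target = fullness_list[i];	/* first group with a zspage */
			break;
		}
	}

	printf("migration target: %s\n", target ? target : "(none)");
	return 0;
}
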
| 1672 | |||
| 1673 | static void putback_zspage(struct zs_pool *pool, struct size_class *class, | ||
| 1674 | struct page *first_page) | ||
| 1675 | { | ||
| 1676 | enum fullness_group fullness; | ||
| 1677 | |||
| 1678 | BUG_ON(!is_first_page(first_page)); | ||
| 1679 | |||
| 1680 | fullness = get_fullness_group(first_page); | ||
| 1681 | insert_zspage(first_page, class, fullness); | ||
| 1682 | set_zspage_mapping(first_page, class->index, fullness); | ||
| 1329 | 1683 | ||
| 1330 | if (fullness == ZS_EMPTY) { | 1684 | if (fullness == ZS_EMPTY) { |
| 1685 | zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( | ||
| 1686 | class->size, class->pages_per_zspage)); | ||
| 1331 | atomic_long_sub(class->pages_per_zspage, | 1687 | atomic_long_sub(class->pages_per_zspage, |
| 1332 | &pool->pages_allocated); | 1688 | &pool->pages_allocated); |
| 1689 | |||
| 1333 | free_zspage(first_page); | 1690 | free_zspage(first_page); |
| 1334 | } | 1691 | } |
| 1335 | } | 1692 | } |
| 1336 | EXPORT_SYMBOL_GPL(zs_free); | 1693 | |
| 1694 | static struct page *isolate_source_page(struct size_class *class) | ||
| 1695 | { | ||
| 1696 | struct page *page; | ||
| 1697 | |||
| 1698 | page = class->fullness_list[ZS_ALMOST_EMPTY]; | ||
| 1699 | if (page) | ||
| 1700 | remove_zspage(page, class, ZS_ALMOST_EMPTY); | ||
| 1701 | |||
| 1702 | return page; | ||
| 1703 | } | ||
| 1704 | |||
| 1705 | static unsigned long __zs_compact(struct zs_pool *pool, | ||
| 1706 | struct size_class *class) | ||
| 1707 | { | ||
| 1708 | int nr_to_migrate; | ||
| 1709 | struct zs_compact_control cc; | ||
| 1710 | struct page *src_page; | ||
| 1711 | struct page *dst_page = NULL; | ||
| 1712 | unsigned long nr_total_migrated = 0; | ||
| 1713 | |||
| 1714 | spin_lock(&class->lock); | ||
| 1715 | while ((src_page = isolate_source_page(class))) { | ||
| 1716 | |||
| 1717 | BUG_ON(!is_first_page(src_page)); | ||
| 1718 | |||
| 1719 | /* The goal is to migrate all live objects in source page */ | ||
| 1720 | nr_to_migrate = src_page->inuse; | ||
| 1721 | cc.index = 0; | ||
| 1722 | cc.s_page = src_page; | ||
| 1723 | |||
| 1724 | while ((dst_page = alloc_target_page(class))) { | ||
| 1725 | cc.d_page = dst_page; | ||
| 1726 | /* | ||
| 1727 | * If there is no more space in dst_page, try to | ||
| 1728 | * allocate another zspage. | ||
| 1729 | */ | ||
| 1730 | if (!migrate_zspage(pool, class, &cc)) | ||
| 1731 | break; | ||
| 1732 | |||
| 1733 | putback_zspage(pool, class, dst_page); | ||
| 1734 | nr_total_migrated += cc.nr_migrated; | ||
| 1735 | nr_to_migrate -= cc.nr_migrated; | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | /* Stop if we couldn't find a slot */ | ||
| 1739 | if (dst_page == NULL) | ||
| 1740 | break; | ||
| 1741 | |||
| 1742 | putback_zspage(pool, class, dst_page); | ||
| 1743 | putback_zspage(pool, class, src_page); | ||
| 1744 | spin_unlock(&class->lock); | ||
| 1745 | nr_total_migrated += cc.nr_migrated; | ||
| 1746 | cond_resched(); | ||
| 1747 | spin_lock(&class->lock); | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | if (src_page) | ||
| 1751 | putback_zspage(pool, class, src_page); | ||
| 1752 | |||
| 1753 | spin_unlock(&class->lock); | ||
| 1754 | |||
| 1755 | return nr_total_migrated; | ||
| 1756 | } | ||
| 1757 | |||
| 1758 | unsigned long zs_compact(struct zs_pool *pool) | ||
| 1759 | { | ||
| 1760 | int i; | ||
| 1761 | unsigned long nr_migrated = 0; | ||
| 1762 | struct size_class *class; | ||
| 1763 | |||
| 1764 | for (i = zs_size_classes - 1; i >= 0; i--) { | ||
| 1765 | class = pool->size_class[i]; | ||
| 1766 | if (!class) | ||
| 1767 | continue; | ||
| 1768 | if (class->index != i) | ||
| 1769 | continue; | ||
| 1770 | nr_migrated += __zs_compact(pool, class); | ||
| 1771 | } | ||
| 1772 | |||
| 1773 | return nr_migrated; | ||
| 1774 | } | ||
| 1775 | EXPORT_SYMBOL_GPL(zs_compact); | ||
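
zs_compact() walks the size classes from largest to smallest and returns the total number of objects migrated. A hedged sketch of how a client might trigger it (the helper and the pr_info() are illustrative; in-tree callers and any sysfs plumbing are outside this hunk):

#include <linux/zsmalloc.h>
#include <linux/printk.h>

/* Illustrative helper: compact a pool and report how much work was done. */
static void try_compact(struct zs_pool *pool)
{
	unsigned long nr_migrated = zs_compact(pool);

	pr_info("zsmalloc: compaction migrated %lu objects\n", nr_migrated);
}
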
| 1337 | 1776 | ||
| 1338 | /** | 1777 | /** |
| 1339 | * zs_create_pool - Creates an allocation pool to work from. | 1778 | * zs_create_pool - Creates an allocation pool to work from. |
| @@ -1355,20 +1794,20 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) | |||
| 1355 | if (!pool) | 1794 | if (!pool) |
| 1356 | return NULL; | 1795 | return NULL; |
| 1357 | 1796 | ||
| 1358 | pool->name = kstrdup(name, GFP_KERNEL); | ||
| 1359 | if (!pool->name) { | ||
| 1360 | kfree(pool); | ||
| 1361 | return NULL; | ||
| 1362 | } | ||
| 1363 | |||
| 1364 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), | 1797 | pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), |
| 1365 | GFP_KERNEL); | 1798 | GFP_KERNEL); |
| 1366 | if (!pool->size_class) { | 1799 | if (!pool->size_class) { |
| 1367 | kfree(pool->name); | ||
| 1368 | kfree(pool); | 1800 | kfree(pool); |
| 1369 | return NULL; | 1801 | return NULL; |
| 1370 | } | 1802 | } |
| 1371 | 1803 | ||
| 1804 | pool->name = kstrdup(name, GFP_KERNEL); | ||
| 1805 | if (!pool->name) | ||
| 1806 | goto err; | ||
| 1807 | |||
| 1808 | if (create_handle_cache(pool)) | ||
| 1809 | goto err; | ||
| 1810 | |||
| 1372 | /* | 1811 | /* |
| 1373 | * Iterate in reverse, because the size of the size_class we want to use | 1812 |
| 1374 | * for merging should be larger than or equal to the current size. | 1813 |
| @@ -1406,6 +1845,9 @@ struct zs_pool *zs_create_pool(char *name, gfp_t flags) | |||
| 1406 | class->size = size; | 1845 | class->size = size; |
| 1407 | class->index = i; | 1846 | class->index = i; |
| 1408 | class->pages_per_zspage = pages_per_zspage; | 1847 | class->pages_per_zspage = pages_per_zspage; |
| 1848 | if (pages_per_zspage == 1 && | ||
| 1849 | get_maxobj_per_zspage(size, pages_per_zspage) == 1) | ||
| 1850 | class->huge = true; | ||
| 1409 | spin_lock_init(&class->lock); | 1851 | spin_lock_init(&class->lock); |
| 1410 | pool->size_class[i] = class; | 1852 | pool->size_class[i] = class; |
| 1411 | 1853 | ||
| @@ -1450,6 +1892,7 @@ void zs_destroy_pool(struct zs_pool *pool) | |||
| 1450 | kfree(class); | 1892 | kfree(class); |
| 1451 | } | 1893 | } |
| 1452 | 1894 | ||
| 1895 | destroy_handle_cache(pool); | ||
| 1453 | kfree(pool->size_class); | 1896 | kfree(pool->size_class); |
| 1454 | kfree(pool->name); | 1897 | kfree(pool->name); |
| 1455 | kfree(pool); | 1898 | kfree(pool); |
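
With the reordering above, zs_create_pool() duplicates the name and creates the handle cache only after the size_class array exists, and both failure cases bail out through the common err label; zs_destroy_pool() now also tears down the handle cache. From a caller's point of view nothing changes. A hedged sketch of the pool lifetime (the name and GFP flags are illustrative, modelled on the sort of values zram passes):

#include <linux/zsmalloc.h>
#include <linux/gfp.h>

static char pool_name[] = "example_pool";

static struct zs_pool *example_pool_create(void)
{
	/* The flags steer the page allocations zs_malloc() makes for this pool. */
	return zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
}

static void example_pool_destroy(struct zs_pool *pool)
{
	zs_destroy_pool(pool);	/* frees classes, handle cache, name, and the pool */
}
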
