author     Andrea Arcangeli <aarcange@redhat.com>    2011-01-13 18:46:58 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>    2011-01-13 20:32:43 -0500
commit     ba76149f47d8c939efa0acc07a191237af900471
tree       162990f51dd24984f114cba14fc7169a3b54f0f1
parent     79134171df238171daa4c024a42b77b401ccb00b
thp: khugepaged
Add khugepaged to relocate fragmented pages into hugepages when new hugepages become available. (This is independent of the defrag logic, which will have to make new hugepages available.)

The fundamental reason why khugepaged is unavoidable is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge pmds in the other virtual machines that may be running on fragmented memory, to maximize CPU efficiency at all times. The scan is slow and takes nearly zero cpu time, except when it copies data (in which case we definitely want to pay that cpu time), so it seems a good tradeoff.

In addition to hugepages being released by other processes releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselves to be long lived mappings (the khugepaged scan is slow, so short lived mappings have a low probability of running into khugepaged compared to long lived mappings).

Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
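For reference, the khugepaged tunables added here are plain sysfs attributes. Assuming mm_kobj is exposed at /sys/kernel/mm (and that sysfs is mounted at /sys), the "khugepaged" group registered in hugepage_init() below shows up under /sys/kernel/mm/transparent_hugepage/khugepaged/. A minimal userspace sketch, not part of the patch, that reads the collapse counter and adjusts the scan interval could look like this:

#include <stdio.h>

/* Illustrative only; paths assume sysfs at /sys and mm_kobj at /sys/kernel/mm. */
#define KHUGEPAGED_DIR "/sys/kernel/mm/transparent_hugepage/khugepaged/"

int main(void)
{
        unsigned int collapsed = 0;
        FILE *f;

        /* pages_collapsed is the read-only counter behind pages_collapsed_show() */
        f = fopen(KHUGEPAGED_DIR "pages_collapsed", "r");
        if (f) {
                if (fscanf(f, "%u", &collapsed) != 1)
                        collapsed = 0;
                fclose(f);
        }
        printf("hugepages collapsed so far: %u\n", collapsed);

        /* scan_sleep_millisecs_store() parses a decimal value and wakes khugepaged */
        f = fopen(KHUGEPAGED_DIR "scan_sleep_millisecs", "w");
        if (f) {
                fprintf(f, "10000\n");
                fclose(f);
        }
        return 0;
}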
Diffstat (limited to 'mm/huge_memory.c')
 -rw-r--r--  mm/huge_memory.c | 1073
 1 file changed, 1063 insertions(+), 10 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7101112a5429..ae2bf08b1099 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,14 +12,111 @@
 #include <linux/mmu_notifier.h>
 #include <linux/rmap.h>
 #include <linux/swap.h>
+#include <linux/mm_inline.h>
+#include <linux/kthread.h>
+#include <linux/khugepaged.h>
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
 
+/*
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
+ */
 unsigned long transparent_hugepage_flags __read_mostly =
-        (1<<TRANSPARENT_HUGEPAGE_FLAG);
+        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
+        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+
+/* default scan 8*512 pte (or vmas) every 30 second */
+static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_collapsed;
+static unsigned int khugepaged_full_scans;
+static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
+/* during fragmentation poll the hugepage allocator once every minute */
+static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+static DEFINE_SPINLOCK(khugepaged_mm_lock);
+static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
+/*
+ * default collapse hugepages if there is at least one pte mapped like
+ * it would have happened if the vma was large enough during page
+ * fault.
+ */
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+
+static int khugepaged(void *none);
+static int mm_slots_hash_init(void);
+static int khugepaged_slab_init(void);
+static void khugepaged_slab_free(void);
+
+#define MM_SLOTS_HASH_HEADS 1024
+static struct hlist_head *mm_slots_hash __read_mostly;
+static struct kmem_cache *mm_slot_cache __read_mostly;
+
+/**
+ * struct mm_slot - hash lookup from mm to mm_slot
+ * @hash: hash collision list
+ * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
+ * @mm: the mm that this information is valid for
+ */
+struct mm_slot {
+        struct hlist_node hash;
+        struct list_head mm_node;
+        struct mm_struct *mm;
+};
+
+/**
+ * struct khugepaged_scan - cursor for scanning
+ * @mm_head: the head of the mm list to scan
+ * @mm_slot: the current mm_slot we are scanning
+ * @address: the next address inside that to be scanned
+ *
+ * There is only the one khugepaged_scan instance of this cursor structure.
+ */
+struct khugepaged_scan {
+        struct list_head mm_head;
+        struct mm_slot *mm_slot;
+        unsigned long address;
+} khugepaged_scan = {
+        .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
+};
+
+static int start_khugepaged(void)
+{
+        int err = 0;
+        if (khugepaged_enabled()) {
+                int wakeup;
+                if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+                mutex_lock(&khugepaged_mutex);
+                if (!khugepaged_thread)
+                        khugepaged_thread = kthread_run(khugepaged, NULL,
+                                                        "khugepaged");
+                if (unlikely(IS_ERR(khugepaged_thread))) {
+                        printk(KERN_ERR
+                               "khugepaged: kthread_run(khugepaged) failed\n");
+                        err = PTR_ERR(khugepaged_thread);
+                        khugepaged_thread = NULL;
+                }
+                wakeup = !list_empty(&khugepaged_scan.mm_head);
+                mutex_unlock(&khugepaged_mutex);
+                if (wakeup)
+                        wake_up_interruptible(&khugepaged_wait);
+        } else
+                /* wakeup to exit */
+                wake_up_interruptible(&khugepaged_wait);
+out:
+        return err;
+}
 
 #ifdef CONFIG_SYSFS
+
 static ssize_t double_flag_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf,
                                 enum transparent_hugepage_flag enabled,
@@ -68,9 +165,19 @@ static ssize_t enabled_store(struct kobject *kobj,
                              struct kobj_attribute *attr,
                              const char *buf, size_t count)
 {
-        return double_flag_store(kobj, attr, buf, count,
-                                 TRANSPARENT_HUGEPAGE_FLAG,
-                                 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+        ssize_t ret;
+
+        ret = double_flag_store(kobj, attr, buf, count,
+                                TRANSPARENT_HUGEPAGE_FLAG,
+                                TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
+
+        if (ret > 0) {
+                int err = start_khugepaged();
+                if (err)
+                        ret = err;
+        }
+
+        return ret;
 }
 static struct kobj_attribute enabled_attr =
         __ATTR(enabled, 0644, enabled_show, enabled_store);
@@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = {
 
 static struct attribute_group hugepage_attr_group = {
         .attrs = hugepage_attr,
-        .name = "transparent_hugepage",
+};
+
+static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
+                                         struct kobj_attribute *attr,
+                                         char *buf)
+{
+        return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+}
+
+static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
+                                          struct kobj_attribute *attr,
+                                          const char *buf, size_t count)
+{
+        unsigned long msecs;
+        int err;
+
+        err = strict_strtoul(buf, 10, &msecs);
+        if (err || msecs > UINT_MAX)
+                return -EINVAL;
+
+        khugepaged_scan_sleep_millisecs = msecs;
+        wake_up_interruptible(&khugepaged_wait);
+
+        return count;
+}
+static struct kobj_attribute scan_sleep_millisecs_attr =
+        __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
+               scan_sleep_millisecs_store);
+
+static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
+                                          struct kobj_attribute *attr,
+                                          char *buf)
+{
+        return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+}
+
+static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
+                                           struct kobj_attribute *attr,
+                                           const char *buf, size_t count)
+{
+        unsigned long msecs;
+        int err;
+
+        err = strict_strtoul(buf, 10, &msecs);
+        if (err || msecs > UINT_MAX)
+                return -EINVAL;
+
+        khugepaged_alloc_sleep_millisecs = msecs;
+        wake_up_interruptible(&khugepaged_wait);
+
+        return count;
+}
+static struct kobj_attribute alloc_sleep_millisecs_attr =
+        __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
+               alloc_sleep_millisecs_store);
+
+static ssize_t pages_to_scan_show(struct kobject *kobj,
+                                  struct kobj_attribute *attr,
+                                  char *buf)
+{
+        return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+}
+static ssize_t pages_to_scan_store(struct kobject *kobj,
+                                   struct kobj_attribute *attr,
+                                   const char *buf, size_t count)
+{
+        int err;
+        unsigned long pages;
+
+        err = strict_strtoul(buf, 10, &pages);
+        if (err || !pages || pages > UINT_MAX)
+                return -EINVAL;
+
+        khugepaged_pages_to_scan = pages;
+
+        return count;
+}
+static struct kobj_attribute pages_to_scan_attr =
+        __ATTR(pages_to_scan, 0644, pages_to_scan_show,
+               pages_to_scan_store);
+
+static ssize_t pages_collapsed_show(struct kobject *kobj,
+                                    struct kobj_attribute *attr,
+                                    char *buf)
+{
+        return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+}
+static struct kobj_attribute pages_collapsed_attr =
+        __ATTR_RO(pages_collapsed);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+                               struct kobj_attribute *attr,
+                               char *buf)
+{
+        return sprintf(buf, "%u\n", khugepaged_full_scans);
+}
+static struct kobj_attribute full_scans_attr =
+        __ATTR_RO(full_scans);
+
+static ssize_t khugepaged_defrag_show(struct kobject *kobj,
+                                      struct kobj_attribute *attr, char *buf)
+{
+        return single_flag_show(kobj, attr, buf,
+                                TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static ssize_t khugepaged_defrag_store(struct kobject *kobj,
+                                       struct kobj_attribute *attr,
+                                       const char *buf, size_t count)
+{
+        return single_flag_store(kobj, attr, buf, count,
+                                 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+}
+static struct kobj_attribute khugepaged_defrag_attr =
+        __ATTR(defrag, 0644, khugepaged_defrag_show,
+               khugepaged_defrag_store);
+
+/*
+ * max_ptes_none controls if khugepaged should collapse hugepages over
+ * any unmapped ptes in turn potentially increasing the memory
+ * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
+ * reduce the available free memory in the system as it
+ * runs. Increasing max_ptes_none will instead potentially reduce the
+ * free memory in the system during the khugepaged scan.
+ */
+static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
+                                             struct kobj_attribute *attr,
+                                             char *buf)
+{
+        return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+}
+static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
+                                              struct kobj_attribute *attr,
+                                              const char *buf, size_t count)
+{
+        int err;
+        unsigned long max_ptes_none;
+
+        err = strict_strtoul(buf, 10, &max_ptes_none);
+        if (err || max_ptes_none > HPAGE_PMD_NR-1)
+                return -EINVAL;
+
+        khugepaged_max_ptes_none = max_ptes_none;
+
+        return count;
+}
+static struct kobj_attribute khugepaged_max_ptes_none_attr =
+        __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
+               khugepaged_max_ptes_none_store);
+
+static struct attribute *khugepaged_attr[] = {
+        &khugepaged_defrag_attr.attr,
+        &khugepaged_max_ptes_none_attr.attr,
+        &pages_to_scan_attr.attr,
+        &pages_collapsed_attr.attr,
+        &full_scans_attr.attr,
+        &scan_sleep_millisecs_attr.attr,
+        &alloc_sleep_millisecs_attr.attr,
+        NULL,
+};
+
+static struct attribute_group khugepaged_attr_group = {
+        .attrs = khugepaged_attr,
+        .name = "khugepaged",
 };
 #endif /* CONFIG_SYSFS */
 
 static int __init hugepage_init(void)
 {
-#ifdef CONFIG_SYSFS
         int err;
+#ifdef CONFIG_SYSFS
+        static struct kobject *hugepage_kobj;
 
-        err = sysfs_create_group(mm_kobj, &hugepage_attr_group);
-        if (err)
-                printk(KERN_ERR "hugepage: register sysfs failed\n");
+        err = -ENOMEM;
+        hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
+        if (unlikely(!hugepage_kobj)) {
+                printk(KERN_ERR "hugepage: failed kobject create\n");
+                goto out;
+        }
+
+        err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
+        if (err) {
+                printk(KERN_ERR "hugepage: failed register hugeage group\n");
+                goto out;
+        }
+
+        err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
+        if (err) {
+                printk(KERN_ERR "hugepage: failed register hugeage group\n");
+                goto out;
+        }
 #endif
-        return 0;
+
+        err = khugepaged_slab_init();
+        if (err)
+                goto out;
+
+        err = mm_slots_hash_init();
+        if (err) {
+                khugepaged_slab_free();
+                goto out;
+        }
+
+        start_khugepaged();
+
+out:
+        return err;
 }
 module_init(hugepage_init)
 
@@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
                 if (unlikely(anon_vma_prepare(vma)))
                         return VM_FAULT_OOM;
+                if (unlikely(khugepaged_enter(vma)))
+                        return VM_FAULT_OOM;
                 page = alloc_hugepage(transparent_hugepage_defrag(vma));
                 if (unlikely(!page))
                         goto out;
@@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags)
         return 0;
 }
 
+static int __init khugepaged_slab_init(void)
+{
+        mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
+                                          sizeof(struct mm_slot),
+                                          __alignof__(struct mm_slot), 0, NULL);
+        if (!mm_slot_cache)
+                return -ENOMEM;
+
+        return 0;
+}
+
+static void __init khugepaged_slab_free(void)
+{
+        kmem_cache_destroy(mm_slot_cache);
+        mm_slot_cache = NULL;
+}
+
+static inline struct mm_slot *alloc_mm_slot(void)
+{
+        if (!mm_slot_cache)     /* initialization failed */
+                return NULL;
+        return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
+}
+
+static inline void free_mm_slot(struct mm_slot *mm_slot)
+{
+        kmem_cache_free(mm_slot_cache, mm_slot);
+}
+
+static int __init mm_slots_hash_init(void)
+{
+        mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
+                                GFP_KERNEL);
+        if (!mm_slots_hash)
+                return -ENOMEM;
+        return 0;
+}
+
+#if 0
+static void __init mm_slots_hash_free(void)
+{
+        kfree(mm_slots_hash);
+        mm_slots_hash = NULL;
+}
+#endif
+
+static struct mm_slot *get_mm_slot(struct mm_struct *mm)
+{
+        struct mm_slot *mm_slot;
+        struct hlist_head *bucket;
+        struct hlist_node *node;
+
+        bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+                                % MM_SLOTS_HASH_HEADS];
+        hlist_for_each_entry(mm_slot, node, bucket, hash) {
+                if (mm == mm_slot->mm)
+                        return mm_slot;
+        }
+        return NULL;
+}
+
+static void insert_to_mm_slots_hash(struct mm_struct *mm,
+                                    struct mm_slot *mm_slot)
+{
+        struct hlist_head *bucket;
+
+        bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
+                                % MM_SLOTS_HASH_HEADS];
+        mm_slot->mm = mm;
+        hlist_add_head(&mm_slot->hash, bucket);
+}
+
+static inline int khugepaged_test_exit(struct mm_struct *mm)
+{
+        return atomic_read(&mm->mm_users) == 0;
+}
+
+int __khugepaged_enter(struct mm_struct *mm)
+{
+        struct mm_slot *mm_slot;
+        int wakeup;
+
+        mm_slot = alloc_mm_slot();
+        if (!mm_slot)
+                return -ENOMEM;
+
+        /* __khugepaged_exit() must not run from under us */
+        VM_BUG_ON(khugepaged_test_exit(mm));
+        if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
+                free_mm_slot(mm_slot);
+                return 0;
+        }
+
+        spin_lock(&khugepaged_mm_lock);
+        insert_to_mm_slots_hash(mm, mm_slot);
+        /*
+         * Insert just behind the scanning cursor, to let the area settle
+         * down a little.
+         */
+        wakeup = list_empty(&khugepaged_scan.mm_head);
+        list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
+        spin_unlock(&khugepaged_mm_lock);
+
+        atomic_inc(&mm->mm_count);
+        if (wakeup)
+                wake_up_interruptible(&khugepaged_wait);
+
+        return 0;
+}
+
+int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
+{
+        unsigned long hstart, hend;
+        if (!vma->anon_vma)
+                /*
+                 * Not yet faulted in so we will register later in the
+                 * page fault if needed.
+                 */
+                return 0;
+        if (vma->vm_file || vma->vm_ops)
+                /* khugepaged not yet working on file or special mappings */
+                return 0;
+        VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+        hend = vma->vm_end & HPAGE_PMD_MASK;
+        if (hstart < hend)
+                return khugepaged_enter(vma);
+        return 0;
+}
+
+void __khugepaged_exit(struct mm_struct *mm)
+{
+        struct mm_slot *mm_slot;
+        int free = 0;
+
+        spin_lock(&khugepaged_mm_lock);
+        mm_slot = get_mm_slot(mm);
+        if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
+                hlist_del(&mm_slot->hash);
+                list_del(&mm_slot->mm_node);
+                free = 1;
+        }
+
+        if (free) {
+                spin_unlock(&khugepaged_mm_lock);
+                clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+                free_mm_slot(mm_slot);
+                mmdrop(mm);
+        } else if (mm_slot) {
+                spin_unlock(&khugepaged_mm_lock);
+                /*
+                 * This is required to serialize against
+                 * khugepaged_test_exit() (which is guaranteed to run
+                 * under mmap sem read mode). Stop here (after we
+                 * return all pagetables will be destroyed) until
+                 * khugepaged has finished working on the pagetables
+                 * under the mmap_sem.
+                 */
+                down_write(&mm->mmap_sem);
+                up_write(&mm->mmap_sem);
+        } else
+                spin_unlock(&khugepaged_mm_lock);
+}
+
+static void release_pte_page(struct page *page)
+{
+        /* 0 stands for page_is_file_cache(page) == false */
+        dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
+        unlock_page(page);
+        putback_lru_page(page);
+}
+
+static void release_pte_pages(pte_t *pte, pte_t *_pte)
+{
+        while (--_pte >= pte) {
+                pte_t pteval = *_pte;
+                if (!pte_none(pteval))
+                        release_pte_page(pte_page(pteval));
+        }
+}
+
+static void release_all_pte_pages(pte_t *pte)
+{
+        release_pte_pages(pte, pte + HPAGE_PMD_NR);
+}
+
+static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
+                                        unsigned long address,
+                                        pte_t *pte)
+{
+        struct page *page;
+        pte_t *_pte;
+        int referenced = 0, isolated = 0, none = 0;
+        for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+             _pte++, address += PAGE_SIZE) {
+                pte_t pteval = *_pte;
+                if (pte_none(pteval)) {
+                        if (++none <= khugepaged_max_ptes_none)
+                                continue;
+                        else {
+                                release_pte_pages(pte, _pte);
+                                goto out;
+                        }
+                }
+                if (!pte_present(pteval) || !pte_write(pteval)) {
+                        release_pte_pages(pte, _pte);
+                        goto out;
+                }
+                page = vm_normal_page(vma, address, pteval);
+                if (unlikely(!page)) {
+                        release_pte_pages(pte, _pte);
+                        goto out;
+                }
+                VM_BUG_ON(PageCompound(page));
+                BUG_ON(!PageAnon(page));
+                VM_BUG_ON(!PageSwapBacked(page));
+
+                /* cannot use mapcount: can't collapse if there's a gup pin */
+                if (page_count(page) != 1) {
+                        release_pte_pages(pte, _pte);
+                        goto out;
+                }
+                /*
+                 * We can do it before isolate_lru_page because the
+                 * page can't be freed from under us. NOTE: PG_lock
+                 * is needed to serialize against split_huge_page
+                 * when invoked from the VM.
+                 */
+                if (!trylock_page(page)) {
+                        release_pte_pages(pte, _pte);
+                        goto out;
+                }
+                /*
+                 * Isolate the page to avoid collapsing an hugepage
+                 * currently in use by the VM.
+                 */
+                if (isolate_lru_page(page)) {
+                        unlock_page(page);
+                        release_pte_pages(pte, _pte);
+                        goto out;
+                }
+                /* 0 stands for page_is_file_cache(page) == false */
+                inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
+                VM_BUG_ON(!PageLocked(page));
+                VM_BUG_ON(PageLRU(page));
+
+                /* If there is no mapped pte young don't collapse the page */
+                if (pte_young(pteval))
+                        referenced = 1;
+        }
+        if (unlikely(!referenced))
+                release_all_pte_pages(pte);
+        else
+                isolated = 1;
+out:
+        return isolated;
+}
+
+static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
+                                      struct vm_area_struct *vma,
+                                      unsigned long address,
+                                      spinlock_t *ptl)
+{
+        pte_t *_pte;
+        for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
+                pte_t pteval = *_pte;
+                struct page *src_page;
+
+                if (pte_none(pteval)) {
+                        clear_user_highpage(page, address);
+                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
+                } else {
+                        src_page = pte_page(pteval);
+                        copy_user_highpage(page, src_page, address, vma);
+                        VM_BUG_ON(page_mapcount(src_page) != 1);
+                        VM_BUG_ON(page_count(src_page) != 2);
+                        release_pte_page(src_page);
+                        /*
+                         * ptl mostly unnecessary, but preempt has to
+                         * be disabled to update the per-cpu stats
+                         * inside page_remove_rmap().
+                         */
+                        spin_lock(ptl);
+                        /*
+                         * paravirt calls inside pte_clear here are
+                         * superfluous.
+                         */
+                        pte_clear(vma->vm_mm, address, _pte);
+                        page_remove_rmap(src_page);
+                        spin_unlock(ptl);
+                        free_page_and_swap_cache(src_page);
+                }
+
+                address += PAGE_SIZE;
+                page++;
+        }
+}
+
+static void collapse_huge_page(struct mm_struct *mm,
+                               unsigned long address,
+                               struct page **hpage)
+{
+        struct vm_area_struct *vma;
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd, _pmd;
+        pte_t *pte;
+        pgtable_t pgtable;
+        struct page *new_page;
+        spinlock_t *ptl;
+        int isolated;
+        unsigned long hstart, hend;
+
+        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+        VM_BUG_ON(!*hpage);
+
+        /*
+         * Prevent all access to pagetables with the exception of
+         * gup_fast later handled by the ptep_clear_flush and the VM
+         * handled by the anon_vma lock + PG_lock.
+         */
+        down_write(&mm->mmap_sem);
+        if (unlikely(khugepaged_test_exit(mm)))
+                goto out;
+
+        vma = find_vma(mm, address);
+        hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+        hend = vma->vm_end & HPAGE_PMD_MASK;
+        if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+                goto out;
+
+        if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always())
+                goto out;
+
+        /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+        if (!vma->anon_vma || vma->vm_ops || vma->vm_file)
+                goto out;
+        VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+        pgd = pgd_offset(mm, address);
+        if (!pgd_present(*pgd))
+                goto out;
+
+        pud = pud_offset(pgd, address);
+        if (!pud_present(*pud))
+                goto out;
+
+        pmd = pmd_offset(pud, address);
+        /* pmd can't go away or become huge under us */
+        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+                goto out;
+
+        new_page = *hpage;
+        if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+                goto out;
+
+        anon_vma_lock(vma->anon_vma);
+
+        pte = pte_offset_map(pmd, address);
+        ptl = pte_lockptr(mm, pmd);
+
+        spin_lock(&mm->page_table_lock); /* probably unnecessary */
+        /*
+         * After this gup_fast can't run anymore. This also removes
+         * any huge TLB entry from the CPU so we won't allow
+         * huge and small TLB entries for the same virtual address
+         * to avoid the risk of CPU bugs in that area.
+         */
+        _pmd = pmdp_clear_flush_notify(vma, address, pmd);
+        spin_unlock(&mm->page_table_lock);
+
+        spin_lock(ptl);
+        isolated = __collapse_huge_page_isolate(vma, address, pte);
+        spin_unlock(ptl);
+        pte_unmap(pte);
+
+        if (unlikely(!isolated)) {
+                spin_lock(&mm->page_table_lock);
+                BUG_ON(!pmd_none(*pmd));
+                set_pmd_at(mm, address, pmd, _pmd);
+                spin_unlock(&mm->page_table_lock);
+                anon_vma_unlock(vma->anon_vma);
+                mem_cgroup_uncharge_page(new_page);
+                goto out;
+        }
+
+        /*
+         * All pages are isolated and locked so anon_vma rmap
+         * can't run anymore.
+         */
+        anon_vma_unlock(vma->anon_vma);
+
+        __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
+        __SetPageUptodate(new_page);
+        pgtable = pmd_pgtable(_pmd);
+        VM_BUG_ON(page_count(pgtable) != 1);
+        VM_BUG_ON(page_mapcount(pgtable) != 0);
+
+        _pmd = mk_pmd(new_page, vma->vm_page_prot);
+        _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
+        _pmd = pmd_mkhuge(_pmd);
+
+        /*
+         * spin_lock() below is not the equivalent of smp_wmb(), so
+         * this is needed to avoid the copy_huge_page writes to become
+         * visible after the set_pmd_at() write.
+         */
+        smp_wmb();
+
+        spin_lock(&mm->page_table_lock);
+        BUG_ON(!pmd_none(*pmd));
+        page_add_new_anon_rmap(new_page, vma, address);
+        set_pmd_at(mm, address, pmd, _pmd);
+        update_mmu_cache(vma, address, entry);
+        prepare_pmd_huge_pte(pgtable, mm);
+        mm->nr_ptes--;
+        spin_unlock(&mm->page_table_lock);
+
+        *hpage = NULL;
+        khugepaged_pages_collapsed++;
+out:
+        up_write(&mm->mmap_sem);
+}
+
+static int khugepaged_scan_pmd(struct mm_struct *mm,
+                               struct vm_area_struct *vma,
+                               unsigned long address,
+                               struct page **hpage)
+{
+        pgd_t *pgd;
+        pud_t *pud;
+        pmd_t *pmd;
+        pte_t *pte, *_pte;
+        int ret = 0, referenced = 0, none = 0;
+        struct page *page;
+        unsigned long _address;
+        spinlock_t *ptl;
+
+        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+        pgd = pgd_offset(mm, address);
+        if (!pgd_present(*pgd))
+                goto out;
+
+        pud = pud_offset(pgd, address);
+        if (!pud_present(*pud))
+                goto out;
+
+        pmd = pmd_offset(pud, address);
+        if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
+                goto out;
+
+        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
+        for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+             _pte++, _address += PAGE_SIZE) {
+                pte_t pteval = *_pte;
+                if (pte_none(pteval)) {
+                        if (++none <= khugepaged_max_ptes_none)
+                                continue;
+                        else
+                                goto out_unmap;
+                }
+                if (!pte_present(pteval) || !pte_write(pteval))
+                        goto out_unmap;
+                page = vm_normal_page(vma, _address, pteval);
+                if (unlikely(!page))
+                        goto out_unmap;
+                VM_BUG_ON(PageCompound(page));
+                if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
+                        goto out_unmap;
+                /* cannot use mapcount: can't collapse if there's a gup pin */
+                if (page_count(page) != 1)
+                        goto out_unmap;
+                if (pte_young(pteval))
+                        referenced = 1;
+        }
+        if (referenced)
+                ret = 1;
+out_unmap:
+        pte_unmap_unlock(pte, ptl);
+        if (ret) {
+                up_read(&mm->mmap_sem);
+                collapse_huge_page(mm, address, hpage);
+        }
+out:
+        return ret;
+}
+
+static void collect_mm_slot(struct mm_slot *mm_slot)
+{
+        struct mm_struct *mm = mm_slot->mm;
+
+        VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+        if (khugepaged_test_exit(mm)) {
+                /* free mm_slot */
+                hlist_del(&mm_slot->hash);
+                list_del(&mm_slot->mm_node);
+
+                /*
+                 * Not strictly needed because the mm exited already.
+                 *
+                 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
+                 */
+
+                /* khugepaged_mm_lock actually not necessary for the below */
+                free_mm_slot(mm_slot);
+                mmdrop(mm);
+        }
+}
+
+static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
+                                            struct page **hpage)
+{
+        struct mm_slot *mm_slot;
+        struct mm_struct *mm;
+        struct vm_area_struct *vma;
+        int progress = 0;
+
+        VM_BUG_ON(!pages);
+        VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
+
+        if (khugepaged_scan.mm_slot)
+                mm_slot = khugepaged_scan.mm_slot;
+        else {
+                mm_slot = list_entry(khugepaged_scan.mm_head.next,
+                                     struct mm_slot, mm_node);
+                khugepaged_scan.address = 0;
+                khugepaged_scan.mm_slot = mm_slot;
+        }
+        spin_unlock(&khugepaged_mm_lock);
+
+        mm = mm_slot->mm;
+        down_read(&mm->mmap_sem);
+        if (unlikely(khugepaged_test_exit(mm)))
+                vma = NULL;
+        else
+                vma = find_vma(mm, khugepaged_scan.address);
+
+        progress++;
+        for (; vma; vma = vma->vm_next) {
+                unsigned long hstart, hend;
+
+                cond_resched();
+                if (unlikely(khugepaged_test_exit(mm))) {
+                        progress++;
+                        break;
+                }
+
+                if (!(vma->vm_flags & VM_HUGEPAGE) &&
+                    !khugepaged_always()) {
+                        progress++;
+                        continue;
+                }
+
+                /* VM_PFNMAP vmas may have vm_ops null but vm_file set */
+                if (!vma->anon_vma || vma->vm_ops || vma->vm_file) {
+                        khugepaged_scan.address = vma->vm_end;
+                        progress++;
+                        continue;
+                }
+                VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma));
+
+                hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
+                hend = vma->vm_end & HPAGE_PMD_MASK;
+                if (hstart >= hend) {
+                        progress++;
+                        continue;
+                }
+                if (khugepaged_scan.address < hstart)
+                        khugepaged_scan.address = hstart;
+                if (khugepaged_scan.address > hend) {
+                        khugepaged_scan.address = hend + HPAGE_PMD_SIZE;
+                        progress++;
+                        continue;
+                }
+                BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
+
+                while (khugepaged_scan.address < hend) {
+                        int ret;
+                        cond_resched();
+                        if (unlikely(khugepaged_test_exit(mm)))
+                                goto breakouterloop;
+
+                        VM_BUG_ON(khugepaged_scan.address < hstart ||
+                                  khugepaged_scan.address + HPAGE_PMD_SIZE >
+                                  hend);
+                        ret = khugepaged_scan_pmd(mm, vma,
+                                                  khugepaged_scan.address,
+                                                  hpage);
+                        /* move to next address */
+                        khugepaged_scan.address += HPAGE_PMD_SIZE;
+                        progress += HPAGE_PMD_NR;
+                        if (ret)
+                                /* we released mmap_sem so break loop */
+                                goto breakouterloop_mmap_sem;
+                        if (progress >= pages)
+                                goto breakouterloop;
+                }
+        }
+breakouterloop:
+        up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_sem:
+
+        spin_lock(&khugepaged_mm_lock);
+        BUG_ON(khugepaged_scan.mm_slot != mm_slot);
+        /*
+         * Release the current mm_slot if this mm is about to die, or
+         * if we scanned all vmas of this mm.
+         */
+        if (khugepaged_test_exit(mm) || !vma) {
+                /*
+                 * Make sure that if mm_users is reaching zero while
+                 * khugepaged runs here, khugepaged_exit will find
+                 * mm_slot not pointing to the exiting mm.
+                 */
+                if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
+                        khugepaged_scan.mm_slot = list_entry(
+                                mm_slot->mm_node.next,
+                                struct mm_slot, mm_node);
+                        khugepaged_scan.address = 0;
+                } else {
+                        khugepaged_scan.mm_slot = NULL;
+                        khugepaged_full_scans++;
+                }
+
+                collect_mm_slot(mm_slot);
+        }
+
+        return progress;
+}
+
+static int khugepaged_has_work(void)
+{
+        return !list_empty(&khugepaged_scan.mm_head) &&
+                khugepaged_enabled();
+}
+
+static int khugepaged_wait_event(void)
+{
+        return !list_empty(&khugepaged_scan.mm_head) ||
+                !khugepaged_enabled();
+}
+
+static void khugepaged_do_scan(struct page **hpage)
+{
+        unsigned int progress = 0, pass_through_head = 0;
+        unsigned int pages = khugepaged_pages_to_scan;
+
+        barrier(); /* write khugepaged_pages_to_scan to local stack */
+
+        while (progress < pages) {
+                cond_resched();
+
+                if (!*hpage) {
+                        *hpage = alloc_hugepage(khugepaged_defrag());
+                        if (unlikely(!*hpage))
+                                break;
+                }
+
+                spin_lock(&khugepaged_mm_lock);
+                if (!khugepaged_scan.mm_slot)
+                        pass_through_head++;
+                if (khugepaged_has_work() &&
+                    pass_through_head < 2)
+                        progress += khugepaged_scan_mm_slot(pages - progress,
+                                                            hpage);
+                else
+                        progress = pages;
+                spin_unlock(&khugepaged_mm_lock);
+        }
+}
+
+static struct page *khugepaged_alloc_hugepage(void)
+{
+        struct page *hpage;
+
+        do {
+                hpage = alloc_hugepage(khugepaged_defrag());
+                if (!hpage) {
+                        DEFINE_WAIT(wait);
+                        add_wait_queue(&khugepaged_wait, &wait);
+                        schedule_timeout_interruptible(
+                                msecs_to_jiffies(
+                                        khugepaged_alloc_sleep_millisecs));
+                        remove_wait_queue(&khugepaged_wait, &wait);
+                }
+        } while (unlikely(!hpage) &&
+                 likely(khugepaged_enabled()));
+        return hpage;
+}
+
+static void khugepaged_loop(void)
+{
+        struct page *hpage;
+
+        while (likely(khugepaged_enabled())) {
+                hpage = khugepaged_alloc_hugepage();
+                if (unlikely(!hpage))
+                        break;
+
+                khugepaged_do_scan(&hpage);
+                if (hpage)
+                        put_page(hpage);
+                if (khugepaged_has_work()) {
+                        DEFINE_WAIT(wait);
+                        if (!khugepaged_scan_sleep_millisecs)
+                                continue;
+                        add_wait_queue(&khugepaged_wait, &wait);
+                        schedule_timeout_interruptible(
+                                msecs_to_jiffies(
+                                        khugepaged_scan_sleep_millisecs));
+                        remove_wait_queue(&khugepaged_wait, &wait);
+                } else if (khugepaged_enabled())
+                        wait_event_interruptible(khugepaged_wait,
+                                                 khugepaged_wait_event());
+        }
+}
+
+static int khugepaged(void *none)
+{
+        struct mm_slot *mm_slot;
+
+        set_user_nice(current, 19);
+
+        /* serialize with start_khugepaged() */
+        mutex_lock(&khugepaged_mutex);
+
+        for (;;) {
+                mutex_unlock(&khugepaged_mutex);
+                BUG_ON(khugepaged_thread != current);
+                khugepaged_loop();
+                BUG_ON(khugepaged_thread != current);
+
+                mutex_lock(&khugepaged_mutex);
+                if (!khugepaged_enabled())
+                        break;
+        }
+
+        spin_lock(&khugepaged_mm_lock);
+        mm_slot = khugepaged_scan.mm_slot;
+        khugepaged_scan.mm_slot = NULL;
+        if (mm_slot)
+                collect_mm_slot(mm_slot);
+        spin_unlock(&khugepaged_mm_lock);
+
+        khugepaged_thread = NULL;
+        mutex_unlock(&khugepaged_mutex);
+
+        return 0;
+}
+
 void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
 {
         struct page *page;