-rw-r--r-- | include/linux/huge_mm.h | 1 |
-rw-r--r-- | include/linux/khugepaged.h | 66 |
-rw-r--r-- | include/linux/sched.h | 1 |
-rw-r--r-- | kernel/fork.c | 5 |
-rw-r--r-- | mm/huge_memory.c | 1073 |
5 files changed, 1136 insertions, 10 deletions
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d9ab70d776e2..43a694ef8904 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -25,6 +25,7 @@ enum transparent_hugepage_flag { | |||
25 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 25 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
26 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, | 26 | TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, |
27 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, | 27 | TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, |
28 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, | ||
28 | #ifdef CONFIG_DEBUG_VM | 29 | #ifdef CONFIG_DEBUG_VM |
29 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, | 30 | TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, |
30 | #endif | 31 | #endif |
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h
new file mode 100644
index 000000000000..552f3184756c
--- /dev/null
+++ b/include/linux/khugepaged.h
@@ -0,0 +1,66 @@ | |||
1 | #ifndef _LINUX_KHUGEPAGED_H | ||
2 | #define _LINUX_KHUGEPAGED_H | ||
3 | |||
4 | #include <linux/sched.h> /* MMF_VM_HUGEPAGE */ | ||
5 | |||
6 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
7 | extern int __khugepaged_enter(struct mm_struct *mm); | ||
8 | extern void __khugepaged_exit(struct mm_struct *mm); | ||
9 | extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); | ||
10 | |||
11 | #define khugepaged_enabled() \ | ||
12 | (transparent_hugepage_flags & \ | ||
13 | ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ | ||
14 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) | ||
15 | #define khugepaged_always() \ | ||
16 | (transparent_hugepage_flags & \ | ||
17 | (1<<TRANSPARENT_HUGEPAGE_FLAG)) | ||
18 | #define khugepaged_req_madv() \ | ||
19 | (transparent_hugepage_flags & \ | ||
20 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) | ||
21 | #define khugepaged_defrag() \ | ||
22 | (transparent_hugepage_flags & \ | ||
23 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)) | ||
24 | |||
25 | static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) | ||
26 | { | ||
27 | if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) | ||
28 | return __khugepaged_enter(mm); | ||
29 | return 0; | ||
30 | } | ||
31 | |||
32 | static inline void khugepaged_exit(struct mm_struct *mm) | ||
33 | { | ||
34 | if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) | ||
35 | __khugepaged_exit(mm); | ||
36 | } | ||
37 | |||
38 | static inline int khugepaged_enter(struct vm_area_struct *vma) | ||
39 | { | ||
40 | if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) | ||
41 | if (khugepaged_always() || | ||
42 | (khugepaged_req_madv() && | ||
43 | vma->vm_flags & VM_HUGEPAGE)) | ||
44 | if (__khugepaged_enter(vma->vm_mm)) | ||
45 | return -ENOMEM; | ||
46 | return 0; | ||
47 | } | ||
48 | #else /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
49 | static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) | ||
50 | { | ||
51 | return 0; | ||
52 | } | ||
53 | static inline void khugepaged_exit(struct mm_struct *mm) | ||
54 | { | ||
55 | } | ||
56 | static inline int khugepaged_enter(struct vm_area_struct *vma) | ||
57 | { | ||
58 | return 0; | ||
59 | } | ||
60 | static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | ||
61 | { | ||
62 | return 0; | ||
63 | } | ||
64 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
65 | |||
66 | #endif /* _LINUX_KHUGEPAGED_H */ | ||
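For orientation, and not part of the patch: when only the madvise mode is enabled, a mapping reaches __khugepaged_enter() via madvise(MADV_HUGEPAGE), which sets VM_HUGEPAGE on the vma, followed by a huge-page-sized fault that calls khugepaged_enter() above. A minimal userspace sketch of that sequence (the MADV_HUGEPAGE value is an assumption taken from the asm-generic definition in this series):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* assumed value from asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 16UL << 20;	/* 16MB, several PMD-sized ranges */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* hugepage_madvise() sets VM_HUGEPAGE on the vma */
	if (madvise(p, len, MADV_HUGEPAGE))
		perror("madvise(MADV_HUGEPAGE)");
	/* the first write fault then runs khugepaged_enter() for this mm */
	memset(p, 1, len);
	return 0;
}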
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f23b5bb6f52e..d747f948b34e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); | |||
434 | #endif | 434 | #endif |
435 | /* leave room for more dump flags */ | 435 | /* leave room for more dump flags */ |
436 | #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ | 436 | #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ |
437 | #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ | ||
437 | 438 | ||
438 | #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) | 439 | #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) |
439 | 440 | ||
diff --git a/kernel/fork.c b/kernel/fork.c
index f78f50ba6cb2..25e429152ddc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -66,6 +66,7 @@ | |||
66 | #include <linux/posix-timers.h> | 66 | #include <linux/posix-timers.h> |
67 | #include <linux/user-return-notifier.h> | 67 | #include <linux/user-return-notifier.h> |
68 | #include <linux/oom.h> | 68 | #include <linux/oom.h> |
69 | #include <linux/khugepaged.h> | ||
69 | 70 | ||
70 | #include <asm/pgtable.h> | 71 | #include <asm/pgtable.h> |
71 | #include <asm/pgalloc.h> | 72 | #include <asm/pgalloc.h> |
@@ -330,6 +331,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
330 | retval = ksm_fork(mm, oldmm); | 331 | retval = ksm_fork(mm, oldmm); |
331 | if (retval) | 332 | if (retval) |
332 | goto out; | 333 | goto out; |
334 | retval = khugepaged_fork(mm, oldmm); | ||
335 | if (retval) | ||
336 | goto out; | ||
333 | 337 | ||
334 | prev = NULL; | 338 | prev = NULL; |
335 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { | 339 | for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { |
@@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) | |||
546 | if (atomic_dec_and_test(&mm->mm_users)) { | 550 | if (atomic_dec_and_test(&mm->mm_users)) { |
547 | exit_aio(mm); | 551 | exit_aio(mm); |
548 | ksm_exit(mm); | 552 | ksm_exit(mm); |
553 | khugepaged_exit(mm); /* must run before exit_mmap */ | ||
549 | exit_mmap(mm); | 554 | exit_mmap(mm); |
550 | set_mm_exe_file(mm, NULL); | 555 | set_mm_exe_file(mm, NULL); |
551 | if (!list_empty(&mm->mmlist)) { | 556 | if (!list_empty(&mm->mmlist)) { |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7101112a5429..ae2bf08b1099 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -12,14 +12,111 @@ | |||
12 | #include <linux/mmu_notifier.h> | 12 | #include <linux/mmu_notifier.h> |
13 | #include <linux/rmap.h> | 13 | #include <linux/rmap.h> |
14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
15 | #include <linux/mm_inline.h> | ||
16 | #include <linux/kthread.h> | ||
17 | #include <linux/khugepaged.h> | ||
15 | #include <asm/tlb.h> | 18 | #include <asm/tlb.h> |
16 | #include <asm/pgalloc.h> | 19 | #include <asm/pgalloc.h> |
17 | #include "internal.h" | 20 | #include "internal.h" |
18 | 21 | ||
22 | /* | ||
23 | * By default transparent hugepage support is enabled for all mappings | ||
24 | * and khugepaged scans all mappings. Defrag is only invoked by | ||
25 | * khugepaged hugepage allocations and by page faults inside | ||
26 | * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived | ||
27 | * allocations. | ||
28 | */ | ||
19 | unsigned long transparent_hugepage_flags __read_mostly = | 29 | unsigned long transparent_hugepage_flags __read_mostly = |
20 | (1<<TRANSPARENT_HUGEPAGE_FLAG); | 30 | (1<<TRANSPARENT_HUGEPAGE_FLAG)| |
31 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
32 | |||
33 | /* by default scan 8*512 ptes (or vmas) every 10 seconds */ | ||
34 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | ||
35 | static unsigned int khugepaged_pages_collapsed; | ||
36 | static unsigned int khugepaged_full_scans; | ||
37 | static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; | ||
38 | /* during fragmentation poll the hugepage allocator once every minute */ | ||
39 | static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; | ||
40 | static struct task_struct *khugepaged_thread __read_mostly; | ||
41 | static DEFINE_MUTEX(khugepaged_mutex); | ||
42 | static DEFINE_SPINLOCK(khugepaged_mm_lock); | ||
43 | static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); | ||
44 | /* | ||
45 | * By default, collapse a range into a hugepage if at least one pte | ||
46 | * in it is mapped as it would have been had the vma been large | ||
47 | * enough during the page fault. | ||
48 | */ | ||
49 | static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1; | ||
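As a worked example of that default (the numbers assume x86-64 with 2MB PMD-sized hugepages, which the patch itself does not spell out):

/*
 * HPAGE_PMD_NR             = 2MB / 4kB = 512
 * khugepaged_max_ptes_none = 512 - 1   = 511  (default)
 *
 * so a single mapped pte inside a 2MB-aligned range is already enough
 * for khugepaged to collapse it, clearing up to 511 previously
 * unmapped pages in the process.
 */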
50 | |||
51 | static int khugepaged(void *none); | ||
52 | static int mm_slots_hash_init(void); | ||
53 | static int khugepaged_slab_init(void); | ||
54 | static void khugepaged_slab_free(void); | ||
55 | |||
56 | #define MM_SLOTS_HASH_HEADS 1024 | ||
57 | static struct hlist_head *mm_slots_hash __read_mostly; | ||
58 | static struct kmem_cache *mm_slot_cache __read_mostly; | ||
59 | |||
60 | /** | ||
61 | * struct mm_slot - hash lookup from mm to mm_slot | ||
62 | * @hash: hash collision list | ||
63 | * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head | ||
64 | * @mm: the mm that this information is valid for | ||
65 | */ | ||
66 | struct mm_slot { | ||
67 | struct hlist_node hash; | ||
68 | struct list_head mm_node; | ||
69 | struct mm_struct *mm; | ||
70 | }; | ||
71 | |||
72 | /** | ||
73 | * struct khugepaged_scan - cursor for scanning | ||
74 | * @mm_head: the head of the mm list to scan | ||
75 | * @mm_slot: the current mm_slot we are scanning | ||
76 | * @address: the next address inside that to be scanned | ||
77 | * | ||
78 | * There is only the one khugepaged_scan instance of this cursor structure. | ||
79 | */ | ||
80 | struct khugepaged_scan { | ||
81 | struct list_head mm_head; | ||
82 | struct mm_slot *mm_slot; | ||
83 | unsigned long address; | ||
84 | } khugepaged_scan = { | ||
85 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | ||
86 | }; | ||
87 | |||
88 | static int start_khugepaged(void) | ||
89 | { | ||
90 | int err = 0; | ||
91 | if (khugepaged_enabled()) { | ||
92 | int wakeup; | ||
93 | if (unlikely(!mm_slot_cache || !mm_slots_hash)) { | ||
94 | err = -ENOMEM; | ||
95 | goto out; | ||
96 | } | ||
97 | mutex_lock(&khugepaged_mutex); | ||
98 | if (!khugepaged_thread) | ||
99 | khugepaged_thread = kthread_run(khugepaged, NULL, | ||
100 | "khugepaged"); | ||
101 | if (unlikely(IS_ERR(khugepaged_thread))) { | ||
102 | printk(KERN_ERR | ||
103 | "khugepaged: kthread_run(khugepaged) failed\n"); | ||
104 | err = PTR_ERR(khugepaged_thread); | ||
105 | khugepaged_thread = NULL; | ||
106 | } | ||
107 | wakeup = !list_empty(&khugepaged_scan.mm_head); | ||
108 | mutex_unlock(&khugepaged_mutex); | ||
109 | if (wakeup) | ||
110 | wake_up_interruptible(&khugepaged_wait); | ||
111 | } else | ||
112 | /* wakeup to exit */ | ||
113 | wake_up_interruptible(&khugepaged_wait); | ||
114 | out: | ||
115 | return err; | ||
116 | } | ||
21 | 117 | ||
22 | #ifdef CONFIG_SYSFS | 118 | #ifdef CONFIG_SYSFS |
119 | |||
23 | static ssize_t double_flag_show(struct kobject *kobj, | 120 | static ssize_t double_flag_show(struct kobject *kobj, |
24 | struct kobj_attribute *attr, char *buf, | 121 | struct kobj_attribute *attr, char *buf, |
25 | enum transparent_hugepage_flag enabled, | 122 | enum transparent_hugepage_flag enabled, |
@@ -68,9 +165,19 @@ static ssize_t enabled_store(struct kobject *kobj, | |||
68 | struct kobj_attribute *attr, | 165 | struct kobj_attribute *attr, |
69 | const char *buf, size_t count) | 166 | const char *buf, size_t count) |
70 | { | 167 | { |
71 | return double_flag_store(kobj, attr, buf, count, | 168 | ssize_t ret; |
72 | TRANSPARENT_HUGEPAGE_FLAG, | 169 | |
73 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | 170 | ret = double_flag_store(kobj, attr, buf, count, |
171 | TRANSPARENT_HUGEPAGE_FLAG, | ||
172 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG); | ||
173 | |||
174 | if (ret > 0) { | ||
175 | int err = start_khugepaged(); | ||
176 | if (err) | ||
177 | ret = err; | ||
178 | } | ||
179 | |||
180 | return ret; | ||
74 | } | 181 | } |
75 | static struct kobj_attribute enabled_attr = | 182 | static struct kobj_attribute enabled_attr = |
76 | __ATTR(enabled, 0644, enabled_show, enabled_store); | 183 | __ATTR(enabled, 0644, enabled_show, enabled_store); |
@@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = { | |||
153 | 260 | ||
154 | static struct attribute_group hugepage_attr_group = { | 261 | static struct attribute_group hugepage_attr_group = { |
155 | .attrs = hugepage_attr, | 262 | .attrs = hugepage_attr, |
156 | .name = "transparent_hugepage", | 263 | }; |
264 | |||
265 | static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, | ||
266 | struct kobj_attribute *attr, | ||
267 | char *buf) | ||
268 | { | ||
269 | return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); | ||
270 | } | ||
271 | |||
272 | static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, | ||
273 | struct kobj_attribute *attr, | ||
274 | const char *buf, size_t count) | ||
275 | { | ||
276 | unsigned long msecs; | ||
277 | int err; | ||
278 | |||
279 | err = strict_strtoul(buf, 10, &msecs); | ||
280 | if (err || msecs > UINT_MAX) | ||
281 | return -EINVAL; | ||
282 | |||
283 | khugepaged_scan_sleep_millisecs = msecs; | ||
284 | wake_up_interruptible(&khugepaged_wait); | ||
285 | |||
286 | return count; | ||
287 | } | ||
288 | static struct kobj_attribute scan_sleep_millisecs_attr = | ||
289 | __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, | ||
290 | scan_sleep_millisecs_store); | ||
291 | |||
292 | static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, | ||
293 | struct kobj_attribute *attr, | ||
294 | char *buf) | ||
295 | { | ||
296 | return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); | ||
297 | } | ||
298 | |||
299 | static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, | ||
300 | struct kobj_attribute *attr, | ||
301 | const char *buf, size_t count) | ||
302 | { | ||
303 | unsigned long msecs; | ||
304 | int err; | ||
305 | |||
306 | err = strict_strtoul(buf, 10, &msecs); | ||
307 | if (err || msecs > UINT_MAX) | ||
308 | return -EINVAL; | ||
309 | |||
310 | khugepaged_alloc_sleep_millisecs = msecs; | ||
311 | wake_up_interruptible(&khugepaged_wait); | ||
312 | |||
313 | return count; | ||
314 | } | ||
315 | static struct kobj_attribute alloc_sleep_millisecs_attr = | ||
316 | __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, | ||
317 | alloc_sleep_millisecs_store); | ||
318 | |||
319 | static ssize_t pages_to_scan_show(struct kobject *kobj, | ||
320 | struct kobj_attribute *attr, | ||
321 | char *buf) | ||
322 | { | ||
323 | return sprintf(buf, "%u\n", khugepaged_pages_to_scan); | ||
324 | } | ||
325 | static ssize_t pages_to_scan_store(struct kobject *kobj, | ||
326 | struct kobj_attribute *attr, | ||
327 | const char *buf, size_t count) | ||
328 | { | ||
329 | int err; | ||
330 | unsigned long pages; | ||
331 | |||
332 | err = strict_strtoul(buf, 10, &pages); | ||
333 | if (err || !pages || pages > UINT_MAX) | ||
334 | return -EINVAL; | ||
335 | |||
336 | khugepaged_pages_to_scan = pages; | ||
337 | |||
338 | return count; | ||
339 | } | ||
340 | static struct kobj_attribute pages_to_scan_attr = | ||
341 | __ATTR(pages_to_scan, 0644, pages_to_scan_show, | ||
342 | pages_to_scan_store); | ||
343 | |||
344 | static ssize_t pages_collapsed_show(struct kobject *kobj, | ||
345 | struct kobj_attribute *attr, | ||
346 | char *buf) | ||
347 | { | ||
348 | return sprintf(buf, "%u\n", khugepaged_pages_collapsed); | ||
349 | } | ||
350 | static struct kobj_attribute pages_collapsed_attr = | ||
351 | __ATTR_RO(pages_collapsed); | ||
352 | |||
353 | static ssize_t full_scans_show(struct kobject *kobj, | ||
354 | struct kobj_attribute *attr, | ||
355 | char *buf) | ||
356 | { | ||
357 | return sprintf(buf, "%u\n", khugepaged_full_scans); | ||
358 | } | ||
359 | static struct kobj_attribute full_scans_attr = | ||
360 | __ATTR_RO(full_scans); | ||
361 | |||
362 | static ssize_t khugepaged_defrag_show(struct kobject *kobj, | ||
363 | struct kobj_attribute *attr, char *buf) | ||
364 | { | ||
365 | return single_flag_show(kobj, attr, buf, | ||
366 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
367 | } | ||
368 | static ssize_t khugepaged_defrag_store(struct kobject *kobj, | ||
369 | struct kobj_attribute *attr, | ||
370 | const char *buf, size_t count) | ||
371 | { | ||
372 | return single_flag_store(kobj, attr, buf, count, | ||
373 | TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | ||
374 | } | ||
375 | static struct kobj_attribute khugepaged_defrag_attr = | ||
376 | __ATTR(defrag, 0644, khugepaged_defrag_show, | ||
377 | khugepaged_defrag_store); | ||
378 | |||
379 | /* | ||
380 | * max_ptes_none controls whether khugepaged may collapse ranges that | ||
381 | * still contain unmapped ptes, which can increase the memory | ||
382 | * footprint of the vmas. With max_ptes_none set to 0, khugepaged only | ||
383 | * collapses fully mapped ranges and so does not reduce the free | ||
384 | * memory in the system as it runs. Larger values instead let the | ||
385 | * khugepaged scan consume additional free memory. | ||
386 | */ | ||
387 | static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, | ||
388 | struct kobj_attribute *attr, | ||
389 | char *buf) | ||
390 | { | ||
391 | return sprintf(buf, "%u\n", khugepaged_max_ptes_none); | ||
392 | } | ||
393 | static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, | ||
394 | struct kobj_attribute *attr, | ||
395 | const char *buf, size_t count) | ||
396 | { | ||
397 | int err; | ||
398 | unsigned long max_ptes_none; | ||
399 | |||
400 | err = strict_strtoul(buf, 10, &max_ptes_none); | ||
401 | if (err || max_ptes_none > HPAGE_PMD_NR-1) | ||
402 | return -EINVAL; | ||
403 | |||
404 | khugepaged_max_ptes_none = max_ptes_none; | ||
405 | |||
406 | return count; | ||
407 | } | ||
408 | static struct kobj_attribute khugepaged_max_ptes_none_attr = | ||
409 | __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, | ||
410 | khugepaged_max_ptes_none_store); | ||
411 | |||
412 | static struct attribute *khugepaged_attr[] = { | ||
413 | &khugepaged_defrag_attr.attr, | ||
414 | &khugepaged_max_ptes_none_attr.attr, | ||
415 | &pages_to_scan_attr.attr, | ||
416 | &pages_collapsed_attr.attr, | ||
417 | &full_scans_attr.attr, | ||
418 | &scan_sleep_millisecs_attr.attr, | ||
419 | &alloc_sleep_millisecs_attr.attr, | ||
420 | NULL, | ||
421 | }; | ||
422 | |||
423 | static struct attribute_group khugepaged_attr_group = { | ||
424 | .attrs = khugepaged_attr, | ||
425 | .name = "khugepaged", | ||
157 | }; | 426 | }; |
158 | #endif /* CONFIG_SYSFS */ | 427 | #endif /* CONFIG_SYSFS */ |
159 | 428 | ||
160 | static int __init hugepage_init(void) | 429 | static int __init hugepage_init(void) |
161 | { | 430 | { |
162 | #ifdef CONFIG_SYSFS | ||
163 | int err; | 431 | int err; |
432 | #ifdef CONFIG_SYSFS | ||
433 | static struct kobject *hugepage_kobj; | ||
164 | 434 | ||
165 | err = sysfs_create_group(mm_kobj, &hugepage_attr_group); | 435 | err = -ENOMEM; |
166 | if (err) | 436 | hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); |
167 | printk(KERN_ERR "hugepage: register sysfs failed\n"); | 437 | if (unlikely(!hugepage_kobj)) { |
438 | printk(KERN_ERR "hugepage: failed kobject create\n"); | ||
439 | goto out; | ||
440 | } | ||
441 | |||
442 | err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); | ||
443 | if (err) { | ||
444 | printk(KERN_ERR "hugepage: failed to register hugepage group\n"); | ||
445 | goto out; | ||
446 | } | ||
447 | |||
448 | err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); | ||
449 | if (err) { | ||
450 | printk(KERN_ERR "hugepage: failed to register khugepaged group\n"); | ||
451 | goto out; | ||
452 | } | ||
168 | #endif | 453 | #endif |
169 | return 0; | 454 | |
455 | err = khugepaged_slab_init(); | ||
456 | if (err) | ||
457 | goto out; | ||
458 | |||
459 | err = mm_slots_hash_init(); | ||
460 | if (err) { | ||
461 | khugepaged_slab_free(); | ||
462 | goto out; | ||
463 | } | ||
464 | |||
465 | start_khugepaged(); | ||
466 | |||
467 | out: | ||
468 | return err; | ||
170 | } | 469 | } |
171 | module_init(hugepage_init) | 470 | module_init(hugepage_init) |
172 | 471 | ||
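For illustration only (not part of the patch): the khugepaged_attr_group registered above carries .name = "khugepaged" under the transparent_hugepage kobject, so the tunables should surface as /sys/kernel/mm/transparent_hugepage/khugepaged/*. A small userspace sketch, assuming that layout, which shortens the scan interval:

#include <stdio.h>

int main(void)
{
	/* path assumed from the kobject and attribute group names above */
	const char *path = "/sys/kernel/mm/transparent_hugepage/"
			   "khugepaged/scan_sleep_millisecs";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "1000\n");	/* wake khugepaged every second, not every 10s */
	return fclose(f) ? 1 : 0;
}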
@@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
285 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { | 584 | if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { |
286 | if (unlikely(anon_vma_prepare(vma))) | 585 | if (unlikely(anon_vma_prepare(vma))) |
287 | return VM_FAULT_OOM; | 586 | return VM_FAULT_OOM; |
587 | if (unlikely(khugepaged_enter(vma))) | ||
588 | return VM_FAULT_OOM; | ||
288 | page = alloc_hugepage(transparent_hugepage_defrag(vma)); | 589 | page = alloc_hugepage(transparent_hugepage_defrag(vma)); |
289 | if (unlikely(!page)) | 590 | if (unlikely(!page)) |
290 | goto out; | 591 | goto out; |
@@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags) | |||
941 | return 0; | 1242 | return 0; |
942 | } | 1243 | } |
943 | 1244 | ||
1245 | static int __init khugepaged_slab_init(void) | ||
1246 | { | ||
1247 | mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", | ||
1248 | sizeof(struct mm_slot), | ||
1249 | __alignof__(struct mm_slot), 0, NULL); | ||
1250 | if (!mm_slot_cache) | ||
1251 | return -ENOMEM; | ||
1252 | |||
1253 | return 0; | ||
1254 | } | ||
1255 | |||
1256 | static void __init khugepaged_slab_free(void) | ||
1257 | { | ||
1258 | kmem_cache_destroy(mm_slot_cache); | ||
1259 | mm_slot_cache = NULL; | ||
1260 | } | ||
1261 | |||
1262 | static inline struct mm_slot *alloc_mm_slot(void) | ||
1263 | { | ||
1264 | if (!mm_slot_cache) /* initialization failed */ | ||
1265 | return NULL; | ||
1266 | return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); | ||
1267 | } | ||
1268 | |||
1269 | static inline void free_mm_slot(struct mm_slot *mm_slot) | ||
1270 | { | ||
1271 | kmem_cache_free(mm_slot_cache, mm_slot); | ||
1272 | } | ||
1273 | |||
1274 | static int __init mm_slots_hash_init(void) | ||
1275 | { | ||
1276 | mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), | ||
1277 | GFP_KERNEL); | ||
1278 | if (!mm_slots_hash) | ||
1279 | return -ENOMEM; | ||
1280 | return 0; | ||
1281 | } | ||
1282 | |||
1283 | #if 0 | ||
1284 | static void __init mm_slots_hash_free(void) | ||
1285 | { | ||
1286 | kfree(mm_slots_hash); | ||
1287 | mm_slots_hash = NULL; | ||
1288 | } | ||
1289 | #endif | ||
1290 | |||
1291 | static struct mm_slot *get_mm_slot(struct mm_struct *mm) | ||
1292 | { | ||
1293 | struct mm_slot *mm_slot; | ||
1294 | struct hlist_head *bucket; | ||
1295 | struct hlist_node *node; | ||
1296 | |||
1297 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1298 | % MM_SLOTS_HASH_HEADS]; | ||
1299 | hlist_for_each_entry(mm_slot, node, bucket, hash) { | ||
1300 | if (mm == mm_slot->mm) | ||
1301 | return mm_slot; | ||
1302 | } | ||
1303 | return NULL; | ||
1304 | } | ||
1305 | |||
1306 | static void insert_to_mm_slots_hash(struct mm_struct *mm, | ||
1307 | struct mm_slot *mm_slot) | ||
1308 | { | ||
1309 | struct hlist_head *bucket; | ||
1310 | |||
1311 | bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) | ||
1312 | % MM_SLOTS_HASH_HEADS]; | ||
1313 | mm_slot->mm = mm; | ||
1314 | hlist_add_head(&mm_slot->hash, bucket); | ||
1315 | } | ||
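Dividing the mm pointer by sizeof(struct mm_struct) before the modulo spreads consecutively allocated mm_structs across different buckets. A standalone sketch of the bucket arithmetic, using a made-up base address and a hypothetical structure size:

#include <stdio.h>

#define MM_SLOTS_HASH_HEADS 1024

int main(void)
{
	/* hypothetical slab addresses and size; real values differ */
	unsigned long sz = 896;		/* pretend sizeof(struct mm_struct) */
	unsigned long base = 0x12340000UL;
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long mm = base + i * sz;
		printf("mm %#lx -> bucket %lu\n", mm,
		       (mm / sz) % MM_SLOTS_HASH_HEADS);
	}
	return 0;
}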
1316 | |||
1317 | static inline int khugepaged_test_exit(struct mm_struct *mm) | ||
1318 | { | ||
1319 | return atomic_read(&mm->mm_users) == 0; | ||
1320 | } | ||
1321 | |||
1322 | int __khugepaged_enter(struct mm_struct *mm) | ||
1323 | { | ||
1324 | struct mm_slot *mm_slot; | ||
1325 | int wakeup; | ||
1326 | |||
1327 | mm_slot = alloc_mm_slot(); | ||
1328 | if (!mm_slot) | ||
1329 | return -ENOMEM; | ||
1330 | |||
1331 | /* __khugepaged_exit() must not run from under us */ | ||
1332 | VM_BUG_ON(khugepaged_test_exit(mm)); | ||
1333 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | ||
1334 | free_mm_slot(mm_slot); | ||
1335 | return 0; | ||
1336 | } | ||
1337 | |||
1338 | spin_lock(&khugepaged_mm_lock); | ||
1339 | insert_to_mm_slots_hash(mm, mm_slot); | ||
1340 | /* | ||
1341 | * Insert just behind the scanning cursor, to let the area settle | ||
1342 | * down a little. | ||
1343 | */ | ||
1344 | wakeup = list_empty(&khugepaged_scan.mm_head); | ||
1345 | list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); | ||
1346 | spin_unlock(&khugepaged_mm_lock); | ||
1347 | |||
1348 | atomic_inc(&mm->mm_count); | ||
1349 | if (wakeup) | ||
1350 | wake_up_interruptible(&khugepaged_wait); | ||
1351 | |||
1352 | return 0; | ||
1353 | } | ||
1354 | |||
1355 | int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | ||
1356 | { | ||
1357 | unsigned long hstart, hend; | ||
1358 | if (!vma->anon_vma) | ||
1359 | /* | ||
1360 | * Not yet faulted in so we will register later in the | ||
1361 | * page fault if needed. | ||
1362 | */ | ||
1363 | return 0; | ||
1364 | if (vma->vm_file || vma->vm_ops) | ||
1365 | /* khugepaged not yet working on file or special mappings */ | ||
1366 | return 0; | ||
1367 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
1368 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1369 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1370 | if (hstart < hend) | ||
1371 | return khugepaged_enter(vma); | ||
1372 | return 0; | ||
1373 | } | ||
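A worked example of the rounding above, assuming HPAGE_PMD_SIZE is 2MB (0x200000), so ~HPAGE_PMD_MASK is 0x1fffff:

/*
 * vma [0x00605000, 0x00a01000):
 *   hstart = (0x00605000 + 0x1fffff) & ~0x1fffff = 0x00800000
 *   hend   =  0x00a01000             & ~0x1fffff = 0x00a00000
 * hstart < hend, so the 2MB-aligned range [0x00800000, 0x00a00000)
 * is eligible and khugepaged_enter() registers the mm.
 */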
1374 | |||
1375 | void __khugepaged_exit(struct mm_struct *mm) | ||
1376 | { | ||
1377 | struct mm_slot *mm_slot; | ||
1378 | int free = 0; | ||
1379 | |||
1380 | spin_lock(&khugepaged_mm_lock); | ||
1381 | mm_slot = get_mm_slot(mm); | ||
1382 | if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { | ||
1383 | hlist_del(&mm_slot->hash); | ||
1384 | list_del(&mm_slot->mm_node); | ||
1385 | free = 1; | ||
1386 | } | ||
1387 | |||
1388 | if (free) { | ||
1389 | spin_unlock(&khugepaged_mm_lock); | ||
1390 | clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1391 | free_mm_slot(mm_slot); | ||
1392 | mmdrop(mm); | ||
1393 | } else if (mm_slot) { | ||
1394 | spin_unlock(&khugepaged_mm_lock); | ||
1395 | /* | ||
1396 | * This is required to serialize against | ||
1397 | * khugepaged_test_exit() (which is guaranteed to run | ||
1398 | * under mmap sem read mode). Stop here (after we | ||
1399 | * return all pagetables will be destroyed) until | ||
1400 | * khugepaged has finished working on the pagetables | ||
1401 | * under the mmap_sem. | ||
1402 | */ | ||
1403 | down_write(&mm->mmap_sem); | ||
1404 | up_write(&mm->mmap_sem); | ||
1405 | } else | ||
1406 | spin_unlock(&khugepaged_mm_lock); | ||
1407 | } | ||
1408 | |||
1409 | static void release_pte_page(struct page *page) | ||
1410 | { | ||
1411 | /* 0 stands for page_is_file_cache(page) == false */ | ||
1412 | dec_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
1413 | unlock_page(page); | ||
1414 | putback_lru_page(page); | ||
1415 | } | ||
1416 | |||
1417 | static void release_pte_pages(pte_t *pte, pte_t *_pte) | ||
1418 | { | ||
1419 | while (--_pte >= pte) { | ||
1420 | pte_t pteval = *_pte; | ||
1421 | if (!pte_none(pteval)) | ||
1422 | release_pte_page(pte_page(pteval)); | ||
1423 | } | ||
1424 | } | ||
1425 | |||
1426 | static void release_all_pte_pages(pte_t *pte) | ||
1427 | { | ||
1428 | release_pte_pages(pte, pte + HPAGE_PMD_NR); | ||
1429 | } | ||
1430 | |||
1431 | static int __collapse_huge_page_isolate(struct vm_area_struct *vma, | ||
1432 | unsigned long address, | ||
1433 | pte_t *pte) | ||
1434 | { | ||
1435 | struct page *page; | ||
1436 | pte_t *_pte; | ||
1437 | int referenced = 0, isolated = 0, none = 0; | ||
1438 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
1439 | _pte++, address += PAGE_SIZE) { | ||
1440 | pte_t pteval = *_pte; | ||
1441 | if (pte_none(pteval)) { | ||
1442 | if (++none <= khugepaged_max_ptes_none) | ||
1443 | continue; | ||
1444 | else { | ||
1445 | release_pte_pages(pte, _pte); | ||
1446 | goto out; | ||
1447 | } | ||
1448 | } | ||
1449 | if (!pte_present(pteval) || !pte_write(pteval)) { | ||
1450 | release_pte_pages(pte, _pte); | ||
1451 | goto out; | ||
1452 | } | ||
1453 | page = vm_normal_page(vma, address, pteval); | ||
1454 | if (unlikely(!page)) { | ||
1455 | release_pte_pages(pte, _pte); | ||
1456 | goto out; | ||
1457 | } | ||
1458 | VM_BUG_ON(PageCompound(page)); | ||
1459 | BUG_ON(!PageAnon(page)); | ||
1460 | VM_BUG_ON(!PageSwapBacked(page)); | ||
1461 | |||
1462 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
1463 | if (page_count(page) != 1) { | ||
1464 | release_pte_pages(pte, _pte); | ||
1465 | goto out; | ||
1466 | } | ||
1467 | /* | ||
1468 | * We can do it before isolate_lru_page because the | ||
1469 | * page can't be freed from under us. NOTE: PG_lock | ||
1470 | * is needed to serialize against split_huge_page | ||
1471 | * when invoked from the VM. | ||
1472 | */ | ||
1473 | if (!trylock_page(page)) { | ||
1474 | release_pte_pages(pte, _pte); | ||
1475 | goto out; | ||
1476 | } | ||
1477 | /* | ||
1478 | * Isolate the page to avoid collapsing an hugepage | ||
1479 | * currently in use by the VM. | ||
1480 | */ | ||
1481 | if (isolate_lru_page(page)) { | ||
1482 | unlock_page(page); | ||
1483 | release_pte_pages(pte, _pte); | ||
1484 | goto out; | ||
1485 | } | ||
1486 | /* 0 stands for page_is_file_cache(page) == false */ | ||
1487 | inc_zone_page_state(page, NR_ISOLATED_ANON + 0); | ||
1488 | VM_BUG_ON(!PageLocked(page)); | ||
1489 | VM_BUG_ON(PageLRU(page)); | ||
1490 | |||
1491 | /* If no mapped pte is young, don't collapse the hugepage */ | ||
1492 | if (pte_young(pteval)) | ||
1493 | referenced = 1; | ||
1494 | } | ||
1495 | if (unlikely(!referenced)) | ||
1496 | release_all_pte_pages(pte); | ||
1497 | else | ||
1498 | isolated = 1; | ||
1499 | out: | ||
1500 | return isolated; | ||
1501 | } | ||
1502 | |||
1503 | static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | ||
1504 | struct vm_area_struct *vma, | ||
1505 | unsigned long address, | ||
1506 | spinlock_t *ptl) | ||
1507 | { | ||
1508 | pte_t *_pte; | ||
1509 | for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { | ||
1510 | pte_t pteval = *_pte; | ||
1511 | struct page *src_page; | ||
1512 | |||
1513 | if (pte_none(pteval)) { | ||
1514 | clear_user_highpage(page, address); | ||
1515 | add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); | ||
1516 | } else { | ||
1517 | src_page = pte_page(pteval); | ||
1518 | copy_user_highpage(page, src_page, address, vma); | ||
1519 | VM_BUG_ON(page_mapcount(src_page) != 1); | ||
1520 | VM_BUG_ON(page_count(src_page) != 2); | ||
1521 | release_pte_page(src_page); | ||
1522 | /* | ||
1523 | * ptl mostly unnecessary, but preempt has to | ||
1524 | * be disabled to update the per-cpu stats | ||
1525 | * inside page_remove_rmap(). | ||
1526 | */ | ||
1527 | spin_lock(ptl); | ||
1528 | /* | ||
1529 | * paravirt calls inside pte_clear here are | ||
1530 | * superfluous. | ||
1531 | */ | ||
1532 | pte_clear(vma->vm_mm, address, _pte); | ||
1533 | page_remove_rmap(src_page); | ||
1534 | spin_unlock(ptl); | ||
1535 | free_page_and_swap_cache(src_page); | ||
1536 | } | ||
1537 | |||
1538 | address += PAGE_SIZE; | ||
1539 | page++; | ||
1540 | } | ||
1541 | } | ||
1542 | |||
1543 | static void collapse_huge_page(struct mm_struct *mm, | ||
1544 | unsigned long address, | ||
1545 | struct page **hpage) | ||
1546 | { | ||
1547 | struct vm_area_struct *vma; | ||
1548 | pgd_t *pgd; | ||
1549 | pud_t *pud; | ||
1550 | pmd_t *pmd, _pmd; | ||
1551 | pte_t *pte; | ||
1552 | pgtable_t pgtable; | ||
1553 | struct page *new_page; | ||
1554 | spinlock_t *ptl; | ||
1555 | int isolated; | ||
1556 | unsigned long hstart, hend; | ||
1557 | |||
1558 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1559 | VM_BUG_ON(!*hpage); | ||
1560 | |||
1561 | /* | ||
1562 | * Prevent all access to pagetables with the exception of | ||
1563 | * gup_fast later handled by the ptep_clear_flush and the VM | ||
1564 | * handled by the anon_vma lock + PG_lock. | ||
1565 | */ | ||
1566 | down_write(&mm->mmap_sem); | ||
1567 | if (unlikely(khugepaged_test_exit(mm))) | ||
1568 | goto out; | ||
1569 | |||
1570 | vma = find_vma(mm, address); | ||
1571 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1572 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1573 | if (address < hstart || address + HPAGE_PMD_SIZE > hend) | ||
1574 | goto out; | ||
1575 | |||
1576 | if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) | ||
1577 | goto out; | ||
1578 | |||
1579 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
1580 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) | ||
1581 | goto out; | ||
1582 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
1583 | |||
1584 | pgd = pgd_offset(mm, address); | ||
1585 | if (!pgd_present(*pgd)) | ||
1586 | goto out; | ||
1587 | |||
1588 | pud = pud_offset(pgd, address); | ||
1589 | if (!pud_present(*pud)) | ||
1590 | goto out; | ||
1591 | |||
1592 | pmd = pmd_offset(pud, address); | ||
1593 | /* pmd can't go away or become huge under us */ | ||
1594 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1595 | goto out; | ||
1596 | |||
1597 | new_page = *hpage; | ||
1598 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) | ||
1599 | goto out; | ||
1600 | |||
1601 | anon_vma_lock(vma->anon_vma); | ||
1602 | |||
1603 | pte = pte_offset_map(pmd, address); | ||
1604 | ptl = pte_lockptr(mm, pmd); | ||
1605 | |||
1606 | spin_lock(&mm->page_table_lock); /* probably unnecessary */ | ||
1607 | /* | ||
1608 | * After this gup_fast can't run anymore. This also removes | ||
1609 | * any huge TLB entry from the CPU so we won't allow | ||
1610 | * huge and small TLB entries for the same virtual address | ||
1611 | * to avoid the risk of CPU bugs in that area. | ||
1612 | */ | ||
1613 | _pmd = pmdp_clear_flush_notify(vma, address, pmd); | ||
1614 | spin_unlock(&mm->page_table_lock); | ||
1615 | |||
1616 | spin_lock(ptl); | ||
1617 | isolated = __collapse_huge_page_isolate(vma, address, pte); | ||
1618 | spin_unlock(ptl); | ||
1619 | pte_unmap(pte); | ||
1620 | |||
1621 | if (unlikely(!isolated)) { | ||
1622 | spin_lock(&mm->page_table_lock); | ||
1623 | BUG_ON(!pmd_none(*pmd)); | ||
1624 | set_pmd_at(mm, address, pmd, _pmd); | ||
1625 | spin_unlock(&mm->page_table_lock); | ||
1626 | anon_vma_unlock(vma->anon_vma); | ||
1627 | mem_cgroup_uncharge_page(new_page); | ||
1628 | goto out; | ||
1629 | } | ||
1630 | |||
1631 | /* | ||
1632 | * All pages are isolated and locked so anon_vma rmap | ||
1633 | * can't run anymore. | ||
1634 | */ | ||
1635 | anon_vma_unlock(vma->anon_vma); | ||
1636 | |||
1637 | __collapse_huge_page_copy(pte, new_page, vma, address, ptl); | ||
1638 | __SetPageUptodate(new_page); | ||
1639 | pgtable = pmd_pgtable(_pmd); | ||
1640 | VM_BUG_ON(page_count(pgtable) != 1); | ||
1641 | VM_BUG_ON(page_mapcount(pgtable) != 0); | ||
1642 | |||
1643 | _pmd = mk_pmd(new_page, vma->vm_page_prot); | ||
1644 | _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); | ||
1645 | _pmd = pmd_mkhuge(_pmd); | ||
1646 | |||
1647 | /* | ||
1648 | * spin_lock() below is not the equivalent of smp_wmb(), so | ||
1649 | * this is needed to avoid the copy_huge_page writes to become | ||
1650 | * visible after the set_pmd_at() write. | ||
1651 | */ | ||
1652 | smp_wmb(); | ||
1653 | |||
1654 | spin_lock(&mm->page_table_lock); | ||
1655 | BUG_ON(!pmd_none(*pmd)); | ||
1656 | page_add_new_anon_rmap(new_page, vma, address); | ||
1657 | set_pmd_at(mm, address, pmd, _pmd); | ||
1658 | update_mmu_cache(vma, address, _pmd); | ||
1659 | prepare_pmd_huge_pte(pgtable, mm); | ||
1660 | mm->nr_ptes--; | ||
1661 | spin_unlock(&mm->page_table_lock); | ||
1662 | |||
1663 | *hpage = NULL; | ||
1664 | khugepaged_pages_collapsed++; | ||
1665 | out: | ||
1666 | up_write(&mm->mmap_sem); | ||
1667 | } | ||
1668 | |||
1669 | static int khugepaged_scan_pmd(struct mm_struct *mm, | ||
1670 | struct vm_area_struct *vma, | ||
1671 | unsigned long address, | ||
1672 | struct page **hpage) | ||
1673 | { | ||
1674 | pgd_t *pgd; | ||
1675 | pud_t *pud; | ||
1676 | pmd_t *pmd; | ||
1677 | pte_t *pte, *_pte; | ||
1678 | int ret = 0, referenced = 0, none = 0; | ||
1679 | struct page *page; | ||
1680 | unsigned long _address; | ||
1681 | spinlock_t *ptl; | ||
1682 | |||
1683 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1684 | |||
1685 | pgd = pgd_offset(mm, address); | ||
1686 | if (!pgd_present(*pgd)) | ||
1687 | goto out; | ||
1688 | |||
1689 | pud = pud_offset(pgd, address); | ||
1690 | if (!pud_present(*pud)) | ||
1691 | goto out; | ||
1692 | |||
1693 | pmd = pmd_offset(pud, address); | ||
1694 | if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) | ||
1695 | goto out; | ||
1696 | |||
1697 | pte = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
1698 | for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; | ||
1699 | _pte++, _address += PAGE_SIZE) { | ||
1700 | pte_t pteval = *_pte; | ||
1701 | if (pte_none(pteval)) { | ||
1702 | if (++none <= khugepaged_max_ptes_none) | ||
1703 | continue; | ||
1704 | else | ||
1705 | goto out_unmap; | ||
1706 | } | ||
1707 | if (!pte_present(pteval) || !pte_write(pteval)) | ||
1708 | goto out_unmap; | ||
1709 | page = vm_normal_page(vma, _address, pteval); | ||
1710 | if (unlikely(!page)) | ||
1711 | goto out_unmap; | ||
1712 | VM_BUG_ON(PageCompound(page)); | ||
1713 | if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) | ||
1714 | goto out_unmap; | ||
1715 | /* cannot use mapcount: can't collapse if there's a gup pin */ | ||
1716 | if (page_count(page) != 1) | ||
1717 | goto out_unmap; | ||
1718 | if (pte_young(pteval)) | ||
1719 | referenced = 1; | ||
1720 | } | ||
1721 | if (referenced) | ||
1722 | ret = 1; | ||
1723 | out_unmap: | ||
1724 | pte_unmap_unlock(pte, ptl); | ||
1725 | if (ret) { | ||
1726 | up_read(&mm->mmap_sem); | ||
1727 | collapse_huge_page(mm, address, hpage); | ||
1728 | } | ||
1729 | out: | ||
1730 | return ret; | ||
1731 | } | ||
1732 | |||
1733 | static void collect_mm_slot(struct mm_slot *mm_slot) | ||
1734 | { | ||
1735 | struct mm_struct *mm = mm_slot->mm; | ||
1736 | |||
1737 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
1738 | |||
1739 | if (khugepaged_test_exit(mm)) { | ||
1740 | /* free mm_slot */ | ||
1741 | hlist_del(&mm_slot->hash); | ||
1742 | list_del(&mm_slot->mm_node); | ||
1743 | |||
1744 | /* | ||
1745 | * Not strictly needed because the mm exited already. | ||
1746 | * | ||
1747 | * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); | ||
1748 | */ | ||
1749 | |||
1750 | /* khugepaged_mm_lock actually not necessary for the below */ | ||
1751 | free_mm_slot(mm_slot); | ||
1752 | mmdrop(mm); | ||
1753 | } | ||
1754 | } | ||
1755 | |||
1756 | static unsigned int khugepaged_scan_mm_slot(unsigned int pages, | ||
1757 | struct page **hpage) | ||
1758 | { | ||
1759 | struct mm_slot *mm_slot; | ||
1760 | struct mm_struct *mm; | ||
1761 | struct vm_area_struct *vma; | ||
1762 | int progress = 0; | ||
1763 | |||
1764 | VM_BUG_ON(!pages); | ||
1765 | VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); | ||
1766 | |||
1767 | if (khugepaged_scan.mm_slot) | ||
1768 | mm_slot = khugepaged_scan.mm_slot; | ||
1769 | else { | ||
1770 | mm_slot = list_entry(khugepaged_scan.mm_head.next, | ||
1771 | struct mm_slot, mm_node); | ||
1772 | khugepaged_scan.address = 0; | ||
1773 | khugepaged_scan.mm_slot = mm_slot; | ||
1774 | } | ||
1775 | spin_unlock(&khugepaged_mm_lock); | ||
1776 | |||
1777 | mm = mm_slot->mm; | ||
1778 | down_read(&mm->mmap_sem); | ||
1779 | if (unlikely(khugepaged_test_exit(mm))) | ||
1780 | vma = NULL; | ||
1781 | else | ||
1782 | vma = find_vma(mm, khugepaged_scan.address); | ||
1783 | |||
1784 | progress++; | ||
1785 | for (; vma; vma = vma->vm_next) { | ||
1786 | unsigned long hstart, hend; | ||
1787 | |||
1788 | cond_resched(); | ||
1789 | if (unlikely(khugepaged_test_exit(mm))) { | ||
1790 | progress++; | ||
1791 | break; | ||
1792 | } | ||
1793 | |||
1794 | if (!(vma->vm_flags & VM_HUGEPAGE) && | ||
1795 | !khugepaged_always()) { | ||
1796 | progress++; | ||
1797 | continue; | ||
1798 | } | ||
1799 | |||
1800 | /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ | ||
1801 | if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { | ||
1802 | khugepaged_scan.address = vma->vm_end; | ||
1803 | progress++; | ||
1804 | continue; | ||
1805 | } | ||
1806 | VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); | ||
1807 | |||
1808 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | ||
1809 | hend = vma->vm_end & HPAGE_PMD_MASK; | ||
1810 | if (hstart >= hend) { | ||
1811 | progress++; | ||
1812 | continue; | ||
1813 | } | ||
1814 | if (khugepaged_scan.address < hstart) | ||
1815 | khugepaged_scan.address = hstart; | ||
1816 | if (khugepaged_scan.address > hend) { | ||
1817 | khugepaged_scan.address = hend + HPAGE_PMD_SIZE; | ||
1818 | progress++; | ||
1819 | continue; | ||
1820 | } | ||
1821 | BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); | ||
1822 | |||
1823 | while (khugepaged_scan.address < hend) { | ||
1824 | int ret; | ||
1825 | cond_resched(); | ||
1826 | if (unlikely(khugepaged_test_exit(mm))) | ||
1827 | goto breakouterloop; | ||
1828 | |||
1829 | VM_BUG_ON(khugepaged_scan.address < hstart || | ||
1830 | khugepaged_scan.address + HPAGE_PMD_SIZE > | ||
1831 | hend); | ||
1832 | ret = khugepaged_scan_pmd(mm, vma, | ||
1833 | khugepaged_scan.address, | ||
1834 | hpage); | ||
1835 | /* move to next address */ | ||
1836 | khugepaged_scan.address += HPAGE_PMD_SIZE; | ||
1837 | progress += HPAGE_PMD_NR; | ||
1838 | if (ret) | ||
1839 | /* we released mmap_sem so break loop */ | ||
1840 | goto breakouterloop_mmap_sem; | ||
1841 | if (progress >= pages) | ||
1842 | goto breakouterloop; | ||
1843 | } | ||
1844 | } | ||
1845 | breakouterloop: | ||
1846 | up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ | ||
1847 | breakouterloop_mmap_sem: | ||
1848 | |||
1849 | spin_lock(&khugepaged_mm_lock); | ||
1850 | BUG_ON(khugepaged_scan.mm_slot != mm_slot); | ||
1851 | /* | ||
1852 | * Release the current mm_slot if this mm is about to die, or | ||
1853 | * if we scanned all vmas of this mm. | ||
1854 | */ | ||
1855 | if (khugepaged_test_exit(mm) || !vma) { | ||
1856 | /* | ||
1857 | * Make sure that if mm_users is reaching zero while | ||
1858 | * khugepaged runs here, khugepaged_exit will find | ||
1859 | * mm_slot not pointing to the exiting mm. | ||
1860 | */ | ||
1861 | if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { | ||
1862 | khugepaged_scan.mm_slot = list_entry( | ||
1863 | mm_slot->mm_node.next, | ||
1864 | struct mm_slot, mm_node); | ||
1865 | khugepaged_scan.address = 0; | ||
1866 | } else { | ||
1867 | khugepaged_scan.mm_slot = NULL; | ||
1868 | khugepaged_full_scans++; | ||
1869 | } | ||
1870 | |||
1871 | collect_mm_slot(mm_slot); | ||
1872 | } | ||
1873 | |||
1874 | return progress; | ||
1875 | } | ||
1876 | |||
1877 | static int khugepaged_has_work(void) | ||
1878 | { | ||
1879 | return !list_empty(&khugepaged_scan.mm_head) && | ||
1880 | khugepaged_enabled(); | ||
1881 | } | ||
1882 | |||
1883 | static int khugepaged_wait_event(void) | ||
1884 | { | ||
1885 | return !list_empty(&khugepaged_scan.mm_head) || | ||
1886 | !khugepaged_enabled(); | ||
1887 | } | ||
1888 | |||
1889 | static void khugepaged_do_scan(struct page **hpage) | ||
1890 | { | ||
1891 | unsigned int progress = 0, pass_through_head = 0; | ||
1892 | unsigned int pages = khugepaged_pages_to_scan; | ||
1893 | |||
1894 | barrier(); /* write khugepaged_pages_to_scan to local stack */ | ||
1895 | |||
1896 | while (progress < pages) { | ||
1897 | cond_resched(); | ||
1898 | |||
1899 | if (!*hpage) { | ||
1900 | *hpage = alloc_hugepage(khugepaged_defrag()); | ||
1901 | if (unlikely(!*hpage)) | ||
1902 | break; | ||
1903 | } | ||
1904 | |||
1905 | spin_lock(&khugepaged_mm_lock); | ||
1906 | if (!khugepaged_scan.mm_slot) | ||
1907 | pass_through_head++; | ||
1908 | if (khugepaged_has_work() && | ||
1909 | pass_through_head < 2) | ||
1910 | progress += khugepaged_scan_mm_slot(pages - progress, | ||
1911 | hpage); | ||
1912 | else | ||
1913 | progress = pages; | ||
1914 | spin_unlock(&khugepaged_mm_lock); | ||
1915 | } | ||
1916 | } | ||
1917 | |||
1918 | static struct page *khugepaged_alloc_hugepage(void) | ||
1919 | { | ||
1920 | struct page *hpage; | ||
1921 | |||
1922 | do { | ||
1923 | hpage = alloc_hugepage(khugepaged_defrag()); | ||
1924 | if (!hpage) { | ||
1925 | DEFINE_WAIT(wait); | ||
1926 | add_wait_queue(&khugepaged_wait, &wait); | ||
1927 | schedule_timeout_interruptible( | ||
1928 | msecs_to_jiffies( | ||
1929 | khugepaged_alloc_sleep_millisecs)); | ||
1930 | remove_wait_queue(&khugepaged_wait, &wait); | ||
1931 | } | ||
1932 | } while (unlikely(!hpage) && | ||
1933 | likely(khugepaged_enabled())); | ||
1934 | return hpage; | ||
1935 | } | ||
1936 | |||
1937 | static void khugepaged_loop(void) | ||
1938 | { | ||
1939 | struct page *hpage; | ||
1940 | |||
1941 | while (likely(khugepaged_enabled())) { | ||
1942 | hpage = khugepaged_alloc_hugepage(); | ||
1943 | if (unlikely(!hpage)) | ||
1944 | break; | ||
1945 | |||
1946 | khugepaged_do_scan(&hpage); | ||
1947 | if (hpage) | ||
1948 | put_page(hpage); | ||
1949 | if (khugepaged_has_work()) { | ||
1950 | DEFINE_WAIT(wait); | ||
1951 | if (!khugepaged_scan_sleep_millisecs) | ||
1952 | continue; | ||
1953 | add_wait_queue(&khugepaged_wait, &wait); | ||
1954 | schedule_timeout_interruptible( | ||
1955 | msecs_to_jiffies( | ||
1956 | khugepaged_scan_sleep_millisecs)); | ||
1957 | remove_wait_queue(&khugepaged_wait, &wait); | ||
1958 | } else if (khugepaged_enabled()) | ||
1959 | wait_event_interruptible(khugepaged_wait, | ||
1960 | khugepaged_wait_event()); | ||
1961 | } | ||
1962 | } | ||
1963 | |||
1964 | static int khugepaged(void *none) | ||
1965 | { | ||
1966 | struct mm_slot *mm_slot; | ||
1967 | |||
1968 | set_user_nice(current, 19); | ||
1969 | |||
1970 | /* serialize with start_khugepaged() */ | ||
1971 | mutex_lock(&khugepaged_mutex); | ||
1972 | |||
1973 | for (;;) { | ||
1974 | mutex_unlock(&khugepaged_mutex); | ||
1975 | BUG_ON(khugepaged_thread != current); | ||
1976 | khugepaged_loop(); | ||
1977 | BUG_ON(khugepaged_thread != current); | ||
1978 | |||
1979 | mutex_lock(&khugepaged_mutex); | ||
1980 | if (!khugepaged_enabled()) | ||
1981 | break; | ||
1982 | } | ||
1983 | |||
1984 | spin_lock(&khugepaged_mm_lock); | ||
1985 | mm_slot = khugepaged_scan.mm_slot; | ||
1986 | khugepaged_scan.mm_slot = NULL; | ||
1987 | if (mm_slot) | ||
1988 | collect_mm_slot(mm_slot); | ||
1989 | spin_unlock(&khugepaged_mm_lock); | ||
1990 | |||
1991 | khugepaged_thread = NULL; | ||
1992 | mutex_unlock(&khugepaged_mutex); | ||
1993 | |||
1994 | return 0; | ||
1995 | } | ||
1996 | |||
944 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | 1997 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) |
945 | { | 1998 | { |
946 | struct page *page; | 1999 | struct page *page; |