52 files changed, 996 insertions(+), 422 deletions(-)
diff --git a/Documentation/cgroups/cpusets.txt b/Documentation/cgroups/cpusets.txt
index cefd3d8bbd1..12e01d432bf 100644
--- a/Documentation/cgroups/cpusets.txt
+++ b/Documentation/cgroups/cpusets.txt
@@ -218,7 +218,7 @@ and name space for cpusets, with a minimum of additional kernel code.
 The cpus and mems files in the root (top_cpuset) cpuset are
 read-only.  The cpus file automatically tracks the value of
 cpu_online_mask using a CPU hotplug notifier, and the mems file
-automatically tracks the value of node_states[N_HIGH_MEMORY]--i.e.,
+automatically tracks the value of node_states[N_MEMORY]--i.e.,
 nodes with memory--using the cpuset_track_online_nodes() hook.
 
 
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index c6f993d491b..8e5eacbdcfa 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -390,6 +390,7 @@ struct memory_notify {
         unsigned long start_pfn;
         unsigned long nr_pages;
         int status_change_nid_normal;
+        int status_change_nid_high;
         int status_change_nid;
 }
 
@@ -397,7 +398,9 @@ start_pfn is start_pfn of online/offline memory.
 nr_pages is # of pages of online/offline memory.
 status_change_nid_normal is set node id when N_NORMAL_MEMORY of nodemask
 is (will be) set/clear, if this is -1, then nodemask status is not changed.
-status_change_nid is set node id when N_HIGH_MEMORY of nodemask is (will be)
+status_change_nid_high is set node id when N_HIGH_MEMORY of nodemask
+is (will be) set/clear, if this is -1, then nodemask status is not changed.
+status_change_nid is set node id when N_MEMORY of nodemask is (will be)
 set/clear. It means a new(memoryless) node gets new memory by online and a
 node loses all memory. If this is -1, then nodemask status is not changed.
 If status_changed_nid* >= 0, callback should create/discard structures for the
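As a worked illustration, here is an editor's sketch (not part of this series; the callback name and messages are hypothetical, while the notifier API, struct memory_notify, and the MEM_ONLINE/MEM_OFFLINE actions are the kernel's real ones) of a hotplug callback consuming these fields:

static int example_mem_notify(struct notifier_block *self,
                              unsigned long action, void *arg)
{
        struct memory_notify *mn = arg;

        switch (action) {
        case MEM_ONLINE:
                /* A node gained its first high memory: build structures. */
                if (mn->status_change_nid_high >= 0)
                        pr_info("node %d gained high memory\n",
                                mn->status_change_nid_high);
                break;
        case MEM_OFFLINE:
                /* A node is losing its last memory: tear structures down. */
                if (mn->status_change_nid >= 0)
                        pr_info("node %d lost all memory\n",
                                mn->status_change_nid);
                break;
        }
        return NOTIFY_OK;
}

/* Registration, as done elsewhere in this series for cpusets:
 *         hotplug_memory_notifier(example_mem_notify, 0);
 */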
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index f734bb2a78d..8785fb87d9c 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -116,6 +116,13 @@ echo always >/sys/kernel/mm/transparent_hugepage/defrag
 echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
 echo never >/sys/kernel/mm/transparent_hugepage/defrag
 
+By default kernel tries to use huge zero page on read page fault.
+It's possible to disable huge zero page by writing 0 or enable it
+back by writing 1:
+
+echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
+echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page
+
 khugepaged will be automatically started when
 transparent_hugepage/enabled is set to "always" or "madvise, and it'll
 be automatically shutdown if it's set to "never".
@@ -197,6 +204,14 @@ thp_split is incremented every time a huge page is split into base
 pages. This can happen for a variety of reasons but a common
 reason is that a huge page is old and is being reclaimed.
 
+thp_zero_page_alloc is incremented every time a huge zero page is
+        successfully allocated. It includes allocations which where
+        dropped due race with other allocation. Note, it doesn't count
+        every map of the huge zero page, only its allocation.
+
+thp_zero_page_alloc_failed is incremented if kernel fails to allocate
+        huge zero page and falls back to using small pages.
+
 As the system ages, allocating huge pages may be expensive as the
 system uses memory compaction to copy data around memory to free a
 huge page for use. There are some counters in /proc/vmstat to help
@@ -276,7 +291,7 @@ unaffected. libhugetlbfs will also work fine as usual.
 == Graceful fallback ==
 
 Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(mm, pmd) where the pmd is the one returned by
+split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
 pmd_offset. It's trivial to make the code transparent hugepage aware
 by just grepping for "pmd_offset" and adding split_huge_page_pmd where
 missing after pmd_offset returns the pmd. Thanks to the graceful
@@ -299,7 +314,7 @@ diff --git a/mm/mremap.c b/mm/mremap.c
                 return NULL;
 
         pmd = pmd_offset(pud, addr);
-+        split_huge_page_pmd(mm, pmd);
++        split_huge_page_pmd(vma, addr, pmd);
         if (pmd_none_or_clear_bad(pmd))
                 return NULL;
 
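Since the knob is a plain sysfs file, it can also be flipped programmatically. A minimal userspace sketch (an editor's example, assuming root and a kernel with this series applied):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *knob =
                "/sys/kernel/mm/transparent_hugepage/khugepaged/use_zero_page";
        int fd = open(knob, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Write "0" to disable the huge zero page, "1" to re-enable it. */
        if (write(fd, "0", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}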
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index c02158be836..14490e9443a 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -76,16 +76,7 @@ extern unsigned long zero_page_mask;
 
 #define ZERO_PAGE(vaddr) \
         (virt_to_page((void *)(empty_zero_page + (((unsigned long)(vaddr)) & zero_page_mask))))
-
-#define is_zero_pfn is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
-        extern unsigned long zero_pfn;
-        unsigned long offset_from_zero_pfn = pfn - zero_pfn;
-        return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
-}
-
-#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))
+#define __HAVE_COLOR_ZERO_PAGE
 
 extern void paging_init(void);
 
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 0a6b28336eb..3a8489a354e 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -113,19 +113,6 @@ static int store_updates_sp(struct pt_regs *regs)
 #define MM_FAULT_CONTINUE        -1
 #define MM_FAULT_ERR(sig)        (sig)
 
-static int out_of_memory(struct pt_regs *regs)
-{
-        /*
-         * We ran out of memory, or some other thing happened to us that made
-         * us unable to handle the page fault gracefully.
-         */
-        up_read(&current->mm->mmap_sem);
-        if (!user_mode(regs))
-                return MM_FAULT_ERR(SIGKILL);
-        pagefault_out_of_memory();
-        return MM_FAULT_RETURN;
-}
-
 static int do_sigbus(struct pt_regs *regs, unsigned long address)
 {
         siginfo_t info;
@@ -169,8 +156,18 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
                 return MM_FAULT_CONTINUE;
 
         /* Out of memory */
-        if (fault & VM_FAULT_OOM)
-                return out_of_memory(regs);
+        if (fault & VM_FAULT_OOM) {
+                up_read(&current->mm->mmap_sem);
+
+                /*
+                 * We ran out of memory, or some other thing happened to us that
+                 * made us unable to handle the page fault gracefully.
+                 */
+                if (!user_mode(regs))
+                        return MM_FAULT_ERR(SIGKILL);
+                pagefault_out_of_memory();
+                return MM_FAULT_RETURN;
+        }
 
         /* Bus error. x86 handles HWPOISON here, we'll add this if/when
          * we support the feature in HW
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 2d3b7cb2600..c814e6f5b57 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -55,16 +55,7 @@ extern unsigned long zero_page_mask;
 #define ZERO_PAGE(vaddr) \
         (virt_to_page((void *)(empty_zero_page + \
          (((unsigned long)(vaddr)) & zero_page_mask))))
-
-#define is_zero_pfn is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
-        extern unsigned long zero_pfn;
-        unsigned long offset_from_zero_pfn = pfn - zero_pfn;
-        return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
-}
-
-#define my_zero_pfn(addr)        page_to_pfn(ZERO_PAGE(addr))
+#define __HAVE_COLOR_ZERO_PAGE
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
index cbbdcad8fcb..1f49c28affa 100644
--- a/arch/sh/mm/fault.c
+++ b/arch/sh/mm/fault.c
@@ -301,17 +301,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
         __bad_area(regs, error_code, address, SEGV_ACCERR);
 }
 
-static void out_of_memory(void)
-{
-        /*
-         * We ran out of memory, call the OOM killer, and return the userspace
-         * (which will retry the fault, or kill us if we got oom-killed):
-         */
-        up_read(&current->mm->mmap_sem);
-
-        pagefault_out_of_memory();
-}
-
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
@@ -353,8 +342,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
                         no_context(regs, error_code, address);
                         return 1;
                 }
+                up_read(&current->mm->mmap_sem);
 
-                out_of_memory();
+                /*
+                 * We ran out of memory, call the OOM killer, and return the
+                 * userspace (which will retry the fault, or kill us if we got
+                 * oom-killed):
+                 */
+                pagefault_out_of_memory();
         } else {
                 if (fault & VM_FAULT_SIGBUS)
                         do_sigbus(regs, error_code, address);
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5c9687b1bde..1dfe69cc78a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -182,7 +182,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
         if (pud_none_or_clear_bad(pud))
                 goto out;
         pmd = pmd_offset(pud, 0xA0000);
-        split_huge_page_pmd(mm, pmd);
+        split_huge_page_pmd_mm(mm, 0xA0000, pmd);
         if (pmd_none_or_clear_bad(pmd))
                 goto out;
         pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 7a529cbab7a..027088f2f7d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -803,20 +803,6 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
         __bad_area(regs, error_code, address, SEGV_ACCERR);
 }
 
-/* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
-static void
-out_of_memory(struct pt_regs *regs, unsigned long error_code,
-              unsigned long address)
-{
-        /*
-         * We ran out of memory, call the OOM killer, and return the userspace
-         * (which will retry the fault, or kill us if we got oom-killed):
-         */
-        up_read(&current->mm->mmap_sem);
-
-        pagefault_out_of_memory();
-}
-
 static void
 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
           unsigned int fault)
@@ -879,7 +865,14 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
                         return 1;
                 }
 
-                out_of_memory(regs, error_code, address);
+                up_read(&current->mm->mmap_sem);
+
+                /*
+                 * We ran out of memory, call the OOM killer, and return the
+                 * userspace (which will retry the fault, or kill us if we got
+                 * oom-killed):
+                 */
+                pagefault_out_of_memory();
         } else {
                 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
                              VM_FAULT_HWPOISON_LARGE))
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3baff255ada..2ead3c8a4c8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -630,7 +630,9 @@ void __init paging_init(void)
          * numa support is not compiled in, and later node_set_state
          * will not set it back.
          */
-        node_clear_state(0, N_NORMAL_MEMORY);
+        node_clear_state(0, N_MEMORY);
+        if (N_MEMORY != N_NORMAL_MEMORY)
+                node_clear_state(0, N_NORMAL_MEMORY);
 
         zone_sizes_init();
 }
diff --git a/drivers/base/node.c b/drivers/base/node.c
index 294e3162621..fac124a7e1c 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -227,7 +227,7 @@ static node_registration_func_t __hugetlb_unregister_node;
 static inline bool hugetlb_register_node(struct node *node)
 {
         if (__hugetlb_register_node &&
-            node_state(node->dev.id, N_HIGH_MEMORY)) {
+            node_state(node->dev.id, N_MEMORY)) {
                 __hugetlb_register_node(node);
                 return true;
         }
@@ -644,6 +644,9 @@ static struct node_attr node_state_attr[] = {
 #ifdef CONFIG_HIGHMEM
         [N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+        [N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
+#endif
         [N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
 };
 
@@ -654,6 +657,9 @@ static struct attribute *node_state_attrs[] = {
 #ifdef CONFIG_HIGHMEM
         &node_state_attr[N_HIGH_MEMORY].attr.attr,
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+        &node_state_attr[N_MEMORY].attr.attr,
+#endif
         &node_state_attr[N_CPU].attr.attr,
         NULL
 };
diff --git a/fs/buffer.c b/fs/buffer.c
index 6e9ed48064f..c017a2dfb90 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 {
         bh->b_end_io = handler;
         bh->b_private = private;
@@ -850,13 +849,10 @@ try_again:
                 if (!bh)
                         goto no_grow;
 
-                bh->b_bdev = NULL;
                 bh->b_this_page = head;
                 bh->b_blocknr = -1;
                 head = bh;
 
-                bh->b_state = 0;
-                atomic_set(&bh->b_count, 0);
                 bh->b_size = size;
 
                 /* Link the buffer to its page */
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3e3422f7f0a..310972b72a6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1034,7 +1034,7 @@ int bdi_writeback_thread(void *data)
         while (!kthread_freezable_should_stop(NULL)) {
                 /*
                  * Remove own delayed wake-up timer, since we are already awake
-                 * and we'll take care of the preriodic write-back.
+                 * and we'll take care of the periodic write-back.
                  */
                 del_timer(&wb->wakeup_timer);
 
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 86c67eee439..e96d4f18ca3 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -249,7 +249,7 @@ static int kcore_update_ram(void)
         /* Not inialized....update now */
         /* find out "max pfn" */
         end_pfn = 0;
-        for_each_node_state(nid, N_HIGH_MEMORY) {
+        for_each_node_state(nid, N_MEMORY) {
                 unsigned long node_end;
                 node_end = NODE_DATA(nid)->node_start_pfn +
                         NODE_DATA(nid)->node_spanned_pages;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 90c63f9392a..48775628abb 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
         spinlock_t *ptl;
         struct page *page;
 
-        split_huge_page_pmd(walk->mm, pmd);
+        split_huge_page_pmd(vma, addr, pmd);
         if (pmd_trans_unstable(pmd))
                 return 0;
 
@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
                 return NULL;
 
         nid = page_to_nid(page);
-        if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
+        if (!node_isset(nid, node_states[N_MEMORY]))
                 return NULL;
 
         return page;
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
         if (md->writeback)
                 seq_printf(m, " writeback=%lu", md->writeback);
 
-        for_each_node_state(n, N_HIGH_MEMORY)
+        for_each_node_state(n, N_MEMORY)
                 if (md->node[n])
                         seq_printf(m, " N%d=%lu", n, md->node[n]);
 out:
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index b36ce40bd1c..284e80831d2 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -449,6 +449,32 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
                         unsigned long size);
 #endif
 
+#ifdef __HAVE_COLOR_ZERO_PAGE
+static inline int is_zero_pfn(unsigned long pfn)
+{
+        extern unsigned long zero_pfn;
+        unsigned long offset_from_zero_pfn = pfn - zero_pfn;
+        return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+        return page_to_pfn(ZERO_PAGE(addr));
+}
+#else
+static inline int is_zero_pfn(unsigned long pfn)
+{
+        extern unsigned long zero_pfn;
+        return pfn == zero_pfn;
+}
+
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+        extern unsigned long zero_pfn;
+        return zero_pfn;
+}
+#endif
+
 #ifdef CONFIG_MMU
 
 #ifndef CONFIG_TRANSPARENT_HUGEPAGE
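To see why the single <= comparison covers a whole block of colored zero pages, here is an editor's standalone demonstration of the same arithmetic (the pfn values and the 16-page zero block are invented for illustration, not taken from real hardware):

#include <assert.h>

#define PAGE_SHIFT 12
static unsigned long zero_pfn = 0x1000;       /* first zero page */
static unsigned long zero_page_mask = 0xf000; /* 16-page colored block */

static int is_zero_pfn(unsigned long pfn)
{
        unsigned long offset_from_zero_pfn = pfn - zero_pfn;
        return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
}

int main(void)
{
        assert(is_zero_pfn(0x1000));  /* first page of the block */
        assert(is_zero_pfn(0x100f));  /* last page of the block */
        assert(!is_zero_pfn(0x1010)); /* one past the block */
        assert(!is_zero_pfn(0x0fff)); /* below: offset wraps, test fails */
        return 0;
}

The unsigned subtraction makes pfns below zero_pfn wrap to huge offsets, so one comparison handles both bounds.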
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 7b74452c531..3f778c27f82 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -137,9 +137,6 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 #define alloc_bootmem_low_pages_node(pgdat, x) \
         __alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
 
-extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
-                                   int flags);
-
 #ifdef CONFIG_HAVE_ARCH_ALLOC_REMAP
 extern void *alloc_remap(int nid, unsigned long size);
 #else
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 838320fc3d1..8c8a60d2940 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -144,7 +144,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
         return node_possible_map;
 }
 
-#define cpuset_current_mems_allowed (node_states[N_HIGH_MEMORY])
+#define cpuset_current_mems_allowed (node_states[N_MEMORY])
 static inline void cpuset_init_current_mems_allowed(void) {}
 
 static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 31e8041274f..f74856e17e4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -34,6 +34,7 @@ struct vm_area_struct;
 #define ___GFP_NO_KSWAPD        0x400000u
 #define ___GFP_OTHER_NODE       0x800000u
 #define ___GFP_WRITE            0x1000000u
+/* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
  * GFP bitmasks..
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 1af47755245..092dc5305a3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@ enum transparent_hugepage_flag {
         TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
         TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
         TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG,
+        TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG,
 #ifdef CONFIG_DEBUG_VM
         TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG,
 #endif
@@ -78,6 +79,9 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);
           (transparent_hugepage_flags &                                 \
            (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG) &&            \
            (__vma)->vm_flags & VM_HUGEPAGE))
+#define transparent_hugepage_use_zero_page()                            \
+        (transparent_hugepage_flags &                                   \
+         (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG))
 #ifdef CONFIG_DEBUG_VM
 #define transparent_hugepage_debug_cow()                                \
         (transparent_hugepage_flags &                                   \
@@ -95,12 +99,14 @@ extern int handle_pte_fault(struct mm_struct *mm,
                          struct vm_area_struct *vma, unsigned long address,
                          pte_t *pte, pmd_t *pmd, unsigned int flags);
 extern int split_huge_page(struct page *page);
-extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
-#define split_huge_page_pmd(__mm, __pmd)                                \
+extern void __split_huge_page_pmd(struct vm_area_struct *vma,
+                unsigned long address, pmd_t *pmd);
+#define split_huge_page_pmd(__vma, __address, __pmd)                    \
         do {                                                            \
                 pmd_t *____pmd = (__pmd);                               \
                 if (unlikely(pmd_trans_huge(*____pmd)))                 \
-                        __split_huge_page_pmd(__mm, ____pmd);           \
+                        __split_huge_page_pmd(__vma, __address,         \
+                                        ____pmd);                       \
         } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)                         \
         do {                                                            \
@@ -110,6 +116,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
                 BUG_ON(pmd_trans_splitting(*____pmd) ||                 \
                        pmd_trans_huge(*____pmd));                       \
         } while (0)
+extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
+                pmd_t *pmd);
 #if HPAGE_PMD_ORDER > MAX_ORDER
 #error "hugepages can't be allocated by the buddy allocator"
 #endif
@@ -177,10 +185,12 @@ static inline int split_huge_page(struct page *page)
 {
         return 0;
 }
-#define split_huge_page_pmd(__mm, __pmd)        \
+#define split_huge_page_pmd(__vma, __address, __pmd)        \
         do { } while (0)
 #define wait_split_huge_page(__anon_vma, __pmd)        \
         do { } while (0)
+#define split_huge_page_pmd_mm(__mm, __address, __pmd)        \
+        do { } while (0)
 #define compound_trans_head(page) compound_head(page)
 static inline int hugepage_madvise(struct vm_area_struct *vma,
                                    unsigned long *vm_flags, int advice)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 11ddc7ffeba..e98a74c0c9c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -181,7 +181,14 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
                                             gfp_t gfp_mask,
                                             unsigned long *total_scanned);
 
-void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
+void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx);
+static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
+                                             enum vm_event_item idx)
+{
+        if (mem_cgroup_disabled())
+                return;
+        __mem_cgroup_count_vm_event(mm, idx);
+}
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head);
 #endif
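The point of the wrapper is that a disabled memcg costs only an inlined flag test, never an out-of-line call. An editor's plain-C analogue of the pattern (names hypothetical, compilable as a standalone program):

#include <stdbool.h>
#include <stdio.h>

static bool counting_disabled = true;

static void __count_event(int idx)      /* out-of-line, the "expensive" part */
{
        printf("counted event %d\n", idx);
}

static inline void count_event(int idx) /* inlined at every call site */
{
        if (counting_disabled)
                return;                 /* hot path: no call is made */
        __count_event(idx);
}

int main(void)
{
        count_event(1);                 /* skipped while disabled */
        counting_disabled = false;
        count_event(2);                 /* now reaches __count_event() */
        return 0;
}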
diff --git a/include/linux/memory.h b/include/linux/memory.h
index a09216d0dcc..45e93b46887 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -54,6 +54,7 @@ struct memory_notify {
         unsigned long start_pfn;
         unsigned long nr_pages;
         int status_change_nid_normal;
+        int status_change_nid_high;
         int status_change_nid;
 };
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c0b1d608a6..cd55dad56aa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -460,17 +460,44 @@ struct zone {
         unsigned long           zone_start_pfn;
 
         /*
-         * zone_start_pfn, spanned_pages and present_pages are all
-         * protected by span_seqlock.  It is a seqlock because it has
-         * to be read outside of zone->lock, and it is done in the main
-         * allocator path.  But, it is written quite infrequently.
+         * spanned_pages is the total pages spanned by the zone, including
+         * holes, which is calculated as:
+         *      spanned_pages = zone_end_pfn - zone_start_pfn;
          *
-         * The lock is declared along with zone->lock because it is
+         * present_pages is physical pages existing within the zone, which
+         * is calculated as:
+         *      present_pages = spanned_pages - absent_pages(pags in holes);
+         *
+         * managed_pages is present pages managed by the buddy system, which
+         * is calculated as (reserved_pages includes pages allocated by the
+         * bootmem allocator):
+         *      managed_pages = present_pages - reserved_pages;
+         *
+         * So present_pages may be used by memory hotplug or memory power
+         * management logic to figure out unmanaged pages by checking
+         * (present_pages - managed_pages). And managed_pages should be used
+         * by page allocator and vm scanner to calculate all kinds of watermarks
+         * and thresholds.
+         *
+         * Locking rules:
+         *
+         * zone_start_pfn and spanned_pages are protected by span_seqlock.
+         * It is a seqlock because it has to be read outside of zone->lock,
+         * and it is done in the main allocator path.  But, it is written
+         * quite infrequently.
+         *
+         * The span_seq lock is declared along with zone->lock because it is
          * frequently read in proximity to zone->lock.  It's good to
          * give them a chance of being in the same cacheline.
+         *
+         * Write access to present_pages and managed_pages at runtime should
+         * be protected by lock_memory_hotplug()/unlock_memory_hotplug().
+         * Any reader who can't tolerant drift of present_pages and
+         * managed_pages should hold memory hotplug lock to get a stable value.
          */
-        unsigned long           spanned_pages;  /* total size, including holes */
-        unsigned long           present_pages;  /* amount of memory (excluding holes) */
+        unsigned long           spanned_pages;
+        unsigned long           present_pages;
+        unsigned long           managed_pages;
 
         /*
          * rarely used fields:
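The three counters nest, each derived from the previous by subtracting what the next level cannot use. An editor's numeric illustration (all values invented):

#include <stdio.h>

int main(void)
{
        unsigned long zone_start_pfn = 0;
        unsigned long zone_end_pfn = 1048576; /* 4GiB of 4KiB pages */
        unsigned long absent_pages = 16384;   /* pages in holes */
        unsigned long reserved_pages = 8192;  /* incl. bootmem allocations */

        unsigned long spanned_pages = zone_end_pfn - zone_start_pfn;
        unsigned long present_pages = spanned_pages - absent_pages;
        unsigned long managed_pages = present_pages - reserved_pages;

        /* prints: spanned=1048576 present=1032192 managed=1024000 */
        printf("spanned=%lu present=%lu managed=%lu\n",
               spanned_pages, present_pages, managed_pages);
        return 0;
}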
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 7afc36334d5..4e2cbfa640b 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -380,6 +380,11 @@ enum node_states {
 #else
         N_HIGH_MEMORY = N_NORMAL_MEMORY,
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+        N_MEMORY,               /* The node has memory(regular, high, movable) */
+#else
+        N_MEMORY = N_HIGH_MEMORY,
+#endif
         N_CPU,          /* The node has one or more cpus */
         NR_NODE_STATES
 };
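With CONFIG_MOVABLE_NODE off, N_MEMORY aliases N_HIGH_MEMORY and nothing changes; with it on, N_MEMORY is a distinct state that also covers nodes whose memory is entirely movable. A minimal sketch of the intended use (kernel context, not a standalone program; the message is illustrative):

int nid;

/* Visit every node with any memory: regular, high or movable. */
for_each_node_state(nid, N_MEMORY)
        pr_info("node %d contributes memory\n", nid);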
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 7d7fbe2ef78..6f54e40fa21 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -74,14 +74,9 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
                 const char __user *buf, size_t nbytes, loff_t *pos,
                 int (*read_strategy)(unsigned long long val, char *s));
 
-typedef int (*write_strategy_fn)(const char *buf, unsigned long long *val);
-
 int res_counter_memparse_write_strategy(const char *buf,
                                         unsigned long long *res);
 
-int res_counter_write(struct res_counter *counter, int member,
-                      const char *buffer, write_strategy_fn write_strategy);
-
 /*
  * the field descriptors. one for each member of res_counter
  */
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 3d311459437..fe786f07d2b 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -58,6 +58,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                 THP_COLLAPSE_ALLOC,
                 THP_COLLAPSE_ALLOC_FAILED,
                 THP_SPLIT,
+                THP_ZERO_PAGE_ALLOC,
+                THP_ZERO_PAGE_ALLOC_FAILED,
 #endif
                 NR_VM_EVENT_ITEMS
 };
diff --git a/init/main.c b/init/main.c
index e33e09df3cb..63ae904a99a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -857,7 +857,7 @@ static void __init kernel_init_freeable(void)
         /*
          * init can allocate pages on any node
          */
-        set_mems_allowed(node_states[N_HIGH_MEMORY]);
+        set_mems_allowed(node_states[N_MEMORY]);
         /*
          * init can run on any cpu.
          */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b017887d632..7bb63eea6eb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -302,10 +302,10 @@ static void guarantee_online_cpus(const struct cpuset *cs,
  * are online, with memory.  If none are online with memory, walk
  * up the cpuset hierarchy until we find one that does have some
  * online mems.  If we get all the way to the top and still haven't
- * found any online mems, return node_states[N_HIGH_MEMORY].
+ * found any online mems, return node_states[N_MEMORY].
  *
  * One way or another, we guarantee to return some non-empty subset
- * of node_states[N_HIGH_MEMORY].
+ * of node_states[N_MEMORY].
  *
  * Call with callback_mutex held.
  */
@@ -313,14 +313,14 @@ static void guarantee_online_cpus(const struct cpuset *cs,
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 {
         while (cs && !nodes_intersects(cs->mems_allowed,
-                                       node_states[N_HIGH_MEMORY]))
+                                       node_states[N_MEMORY]))
                 cs = cs->parent;
         if (cs)
                 nodes_and(*pmask, cs->mems_allowed,
-                          node_states[N_HIGH_MEMORY]);
+                          node_states[N_MEMORY]);
         else
-                *pmask = node_states[N_HIGH_MEMORY];
-        BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
+                *pmask = node_states[N_MEMORY];
+        BUG_ON(!nodes_intersects(*pmask, node_states[N_MEMORY]));
 }
 
 /*
@@ -1100,7 +1100,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                 return -ENOMEM;
 
         /*
          * top_cpuset.mems_allowed tracks node_stats[N_MEMORY];
          * it's read-only
          */
         if (cs == &top_cpuset) {
@@ -1122,7 +1122,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
                 goto done;
 
         if (!nodes_subset(trialcs->mems_allowed,
-                          node_states[N_HIGH_MEMORY])) {
+                          node_states[N_MEMORY])) {
                 retval = -EINVAL;
                 goto done;
         }
@@ -2026,7 +2026,7 @@ static struct cpuset *cpuset_next(struct list_head *queue)
  * before dropping down to the next.  It always processes a node before
  * any of its children.
  *
- * In the case of memory hot-unplug, it will remove nodes from N_HIGH_MEMORY
+ * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
  * if all present pages from a node are offlined.
  */
 static void
@@ -2065,7 +2065,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
 
                 /* Continue past cpusets with all mems online */
                 if (nodes_subset(cp->mems_allowed,
-                                 node_states[N_HIGH_MEMORY]))
+                                 node_states[N_MEMORY]))
                         continue;
 
                 oldmems = cp->mems_allowed;
@@ -2073,7 +2073,7 @@ scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
                 /* Remove offline mems from this cpuset. */
                 mutex_lock(&callback_mutex);
                 nodes_and(cp->mems_allowed, cp->mems_allowed,
-                          node_states[N_HIGH_MEMORY]);
+                          node_states[N_MEMORY]);
                 mutex_unlock(&callback_mutex);
 
                 /* Move tasks from the empty cpuset to a parent */
@@ -2126,8 +2126,8 @@ void cpuset_update_active_cpus(bool cpu_online)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
- * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
+ * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
+ * Call this routine anytime after node_states[N_MEMORY] changes.
  * See cpuset_update_active_cpus() for CPU hotplug handling.
  */
 static int cpuset_track_online_nodes(struct notifier_block *self,
@@ -2140,7 +2140,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
         case MEM_ONLINE:
                 oldmems = top_cpuset.mems_allowed;
                 mutex_lock(&callback_mutex);
-                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+                top_cpuset.mems_allowed = node_states[N_MEMORY];
                 mutex_unlock(&callback_mutex);
                 update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
                 break;
@@ -2169,7 +2169,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 void __init cpuset_init_smp(void)
 {
         cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
-        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+        top_cpuset.mems_allowed = node_states[N_MEMORY];
 
         hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
@@ -2237,7 +2237,7 @@ void cpuset_init_current_mems_allowed(void)
  *
  * Description: Returns the nodemask_t mems_allowed of the cpuset
  * attached to the specified @tsk.  Guaranteed to return some non-empty
- * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
+ * subset of node_states[N_MEMORY], even if this means going outside the
  * tasks cpuset.
  **/
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 29fb60caecb..691dc2ef9ba 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -428,7 +428,7 @@ int kthreadd(void *unused)
         set_task_comm(tsk, "kthreadd");
         ignore_signals(tsk);
         set_cpus_allowed_ptr(tsk, cpu_all_mask);
-        set_mems_allowed(node_states[N_HIGH_MEMORY]);
+        set_mems_allowed(node_states[N_MEMORY]);
 
         current->flags |= PF_NOFREEZE;
 
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index ad581aa2369..3920d593e63 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -192,25 +192,3 @@ int res_counter_memparse_write_strategy(const char *buf,
         *res = PAGE_ALIGN(*res);
         return 0;
 }
-
-int res_counter_write(struct res_counter *counter, int member,
-                      const char *buf, write_strategy_fn write_strategy)
-{
-        char *end;
-        unsigned long flags;
-        unsigned long long tmp, *val;
-
-        if (write_strategy) {
-                if (write_strategy(buf, &tmp))
-                        return -EINVAL;
-        } else {
-                tmp = simple_strtoull(buf, &end, 10);
-                if (*end != '\0')
-                        return -EINVAL;
-        }
-        spin_lock_irqsave(&counter->lock, flags);
-        val = res_counter_member(counter, member);
-        *val = tmp;
-        spin_unlock_irqrestore(&counter->lock, flags);
-        return 0;
-}
diff --git a/mm/Kconfig b/mm/Kconfig
index e6651c5de14..71259e052ce 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,14 @@ config NO_BOOTMEM
 config MEMORY_ISOLATION
         boolean
 
+config MOVABLE_NODE
+        boolean "Enable to assign a node which has only movable memory"
+        depends on HAVE_MEMBLOCK
+        depends on NO_BOOTMEM
+        depends on X86_64
+        depends on NUMA
+        depends on BROKEN
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
         bool "Allow for memory hot-add"
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ecc45958ac0..1324cd74fae 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -229,6 +229,22 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
         return count;
 }
 
+static void reset_node_lowmem_managed_pages(pg_data_t *pgdat)
+{
+        struct zone *z;
+
+        /*
+         * In free_area_init_core(), highmem zone's managed_pages is set to
+         * present_pages, and bootmem allocator doesn't allocate from highmem
+         * zones. So there's no need to recalculate managed_pages because all
+         * highmem pages will be managed by the buddy system. Here highmem
+         * zone also includes highmem movable zone.
+         */
+        for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+                if (!is_highmem(z))
+                        z->managed_pages = 0;
+}
+
 /**
  * free_all_bootmem_node - release a node's free pages to the buddy allocator
  * @pgdat: node to be released
@@ -238,6 +254,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 {
         register_page_bootmem_info_node(pgdat);
+        reset_node_lowmem_managed_pages(pgdat);
         return free_all_bootmem_core(pgdat->bdata);
 }
 
@@ -250,6 +267,10 @@ unsigned long __init free_all_bootmem(void)
 {
         unsigned long total_pages = 0;
         bootmem_data_t *bdata;
+        struct pglist_data *pgdat;
+
+        for_each_online_pgdat(pgdat)
+                reset_node_lowmem_managed_pages(pgdat);
 
         list_for_each_entry(bdata, &bdata_list, list)
                 total_pages += free_all_bootmem_core(bdata);
@@ -439,12 +460,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
         return mark_bootmem(start, end, 1, flags);
 }
 
-int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
-                                          int flags)
-{
-        return reserve_bootmem(phys, len, flags);
-}
-
 static unsigned long __init align_idx(struct bootmem_data *bdata,
                                       unsigned long idx, unsigned long step)
 {
@@ -575,27 +590,6 @@ find_block:
         return NULL;
 }
 
-static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
-                                unsigned long size, unsigned long align,
-                                unsigned long goal, unsigned long limit)
-{
-        if (WARN_ON_ONCE(slab_is_available()))
-                return kzalloc(size, GFP_NOWAIT);
-
-#ifdef CONFIG_HAVE_ARCH_BOOTMEM
-        {
-                bootmem_data_t *p_bdata;
-
-                p_bdata = bootmem_arch_preferred_node(bdata, size, align,
-                                                      goal, limit);
-                if (p_bdata)
-                        return alloc_bootmem_bdata(p_bdata, size, align,
-                                                   goal, limit);
-        }
-#endif
-        return NULL;
-}
-
 static void * __init alloc_bootmem_core(unsigned long size,
                                         unsigned long align,
                                         unsigned long goal,
@@ -604,9 +598,8 @@ static void * __init alloc_bootmem_core(unsigned long size,
         bootmem_data_t *bdata;
         void *region;
 
-        region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
-        if (region)
-                return region;
+        if (WARN_ON_ONCE(slab_is_available()))
+                return kzalloc(size, GFP_NOWAIT);
 
         list_for_each_entry(bdata, &bdata_list, list) {
                 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
@@ -704,11 +697,9 @@ void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 {
         void *ptr;
 
+        if (WARN_ON_ONCE(slab_is_available()))
+                return kzalloc(size, GFP_NOWAIT);
 again:
-        ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
-                                           align, goal, limit);
-        if (ptr)
-                return ptr;
 
         /* do not panic in alloc_bootmem_bdata() */
         if (limit && goal + size > limit)
diff --git a/mm/compaction.c b/mm/compaction.c index d24dd2d7bad..12979121822 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
| @@ -215,60 +215,6 @@ static bool suitable_migration_target(struct page *page) | |||
| 215 | return false; | 215 | return false; |
| 216 | } | 216 | } |
| 217 | 217 | ||
| 218 | static void compact_capture_page(struct compact_control *cc) | ||
| 219 | { | ||
| 220 | unsigned long flags; | ||
| 221 | int mtype, mtype_low, mtype_high; | ||
| 222 | |||
| 223 | if (!cc->page || *cc->page) | ||
| 224 | return; | ||
| 225 | |||
| 226 | /* | ||
| 227 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
| 228 | * regardless of the migratetype of the freelist is is captured from. | ||
| 229 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
| 230 | * allocation is typically at least a pageblock size and overall | ||
| 231 | * fragmentation is not impaired. Other allocation types must | ||
| 232 | * capture pages from their own migratelist because otherwise they | ||
| 233 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
| 234 | * difficult to move pages and making fragmentation worse overall. | ||
| 235 | */ | ||
| 236 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
| 237 | mtype_low = 0; | ||
| 238 | mtype_high = MIGRATE_PCPTYPES; | ||
| 239 | } else { | ||
| 240 | mtype_low = cc->migratetype; | ||
| 241 | mtype_high = cc->migratetype + 1; | ||
| 242 | } | ||
| 243 | |||
| 244 | /* Speculatively examine the free lists without zone lock */ | ||
| 245 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
| 246 | int order; | ||
| 247 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
| 248 | struct page *page; | ||
| 249 | struct free_area *area; | ||
| 250 | area = &(cc->zone->free_area[order]); | ||
| 251 | if (list_empty(&area->free_list[mtype])) | ||
| 252 | continue; | ||
| 253 | |||
| 254 | /* Take the lock and attempt capture of the page */ | ||
| 255 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
| 256 | return; | ||
| 257 | if (!list_empty(&area->free_list[mtype])) { | ||
| 258 | page = list_entry(area->free_list[mtype].next, | ||
| 259 | struct page, lru); | ||
| 260 | if (capture_free_page(page, cc->order, mtype)) { | ||
| 261 | spin_unlock_irqrestore(&cc->zone->lock, | ||
| 262 | flags); | ||
| 263 | *cc->page = page; | ||
| 264 | return; | ||
| 265 | } | ||
| 266 | } | ||
| 267 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
| 268 | } | ||
| 269 | } | ||
| 270 | } | ||
| 271 | |||
| 272 | /* | 218 | /* |
| 273 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. | 219 | * Isolate free pages onto a private freelist. Caller must hold zone->lock. |
| 274 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free | 220 | * If @strict is true, will abort returning 0 on any invalid PFNs or non-free |
| @@ -953,6 +899,60 @@ unsigned long compaction_suitable(struct zone *zone, int order) | |||
| 953 | return COMPACT_CONTINUE; | 899 | return COMPACT_CONTINUE; |
| 954 | } | 900 | } |
| 955 | 901 | ||
| 902 | static void compact_capture_page(struct compact_control *cc) | ||
| 903 | { | ||
| 904 | unsigned long flags; | ||
| 905 | int mtype, mtype_low, mtype_high; | ||
| 906 | |||
| 907 | if (!cc->page || *cc->page) | ||
| 908 | return; | ||
| 909 | |||
| 910 | /* | ||
| 911 | * For MIGRATE_MOVABLE allocations we capture a suitable page ASAP | ||
| 912 | * regardless of the migratetype of the freelist it is captured from. | ||
| 913 | * This is fine because the order for a high-order MIGRATE_MOVABLE | ||
| 914 | * allocation is typically at least a pageblock size and overall | ||
| 915 | * fragmentation is not impaired. Other allocation types must | ||
| 916 | * capture pages from their own migratelist because otherwise they | ||
| 917 | * could pollute other pageblocks like MIGRATE_MOVABLE with | ||
| 918 | * difficult-to-move pages, making fragmentation worse overall. | ||
| 919 | */ | ||
| 920 | if (cc->migratetype == MIGRATE_MOVABLE) { | ||
| 921 | mtype_low = 0; | ||
| 922 | mtype_high = MIGRATE_PCPTYPES; | ||
| 923 | } else { | ||
| 924 | mtype_low = cc->migratetype; | ||
| 925 | mtype_high = cc->migratetype + 1; | ||
| 926 | } | ||
| 927 | |||
| 928 | /* Speculatively examine the free lists without zone lock */ | ||
| 929 | for (mtype = mtype_low; mtype < mtype_high; mtype++) { | ||
| 930 | int order; | ||
| 931 | for (order = cc->order; order < MAX_ORDER; order++) { | ||
| 932 | struct page *page; | ||
| 933 | struct free_area *area; | ||
| 934 | area = &(cc->zone->free_area[order]); | ||
| 935 | if (list_empty(&area->free_list[mtype])) | ||
| 936 | continue; | ||
| 937 | |||
| 938 | /* Take the lock and attempt capture of the page */ | ||
| 939 | if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc)) | ||
| 940 | return; | ||
| 941 | if (!list_empty(&area->free_list[mtype])) { | ||
| 942 | page = list_entry(area->free_list[mtype].next, | ||
| 943 | struct page, lru); | ||
| 944 | if (capture_free_page(page, cc->order, mtype)) { | ||
| 945 | spin_unlock_irqrestore(&cc->zone->lock, | ||
| 946 | flags); | ||
| 947 | *cc->page = page; | ||
| 948 | return; | ||
| 949 | } | ||
| 950 | } | ||
| 951 | spin_unlock_irqrestore(&cc->zone->lock, flags); | ||
| 952 | } | ||
| 953 | } | ||
| 954 | } | ||
| 955 | |||
| 956 | static int compact_zone(struct zone *zone, struct compact_control *cc) | 956 | static int compact_zone(struct zone *zone, struct compact_control *cc) |
| 957 | { | 957 | { |
| 958 | int ret; | 958 | int ret; |
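compact_capture_page() itself is unchanged; the hunks above only move it past compaction_suitable(), presumably so later callers in compact_zone() can reach it without a forward declaration. The idiom it relies on deserves a note: the free lists are scanned without zone->lock, and the lock is taken, and the emptiness re-checked, only once a candidate looks usable. Condensed to its core (try_capture() is an illustrative stand-in for the locked body, not a kernel function):

    /* unlocked peek: cheap and racy, but never dereferences entries */
    if (list_empty(&area->free_list[mtype]))
        continue;
    /* take the lock only for promising candidates ... */
    if (!compact_trylock_irqsave(&cc->zone->lock, &flags, cc))
        return;
    /* ... and let the locked re-check be the authoritative one */
    if (!list_empty(&area->free_list[mtype]))
        try_capture(area, cc);
    spin_unlock_irqrestore(&cc->zone->lock, flags);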
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5f902e20e8c..827d9c81305 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -12,12 +12,14 @@ | |||
| 12 | #include <linux/mmu_notifier.h> | 12 | #include <linux/mmu_notifier.h> |
| 13 | #include <linux/rmap.h> | 13 | #include <linux/rmap.h> |
| 14 | #include <linux/swap.h> | 14 | #include <linux/swap.h> |
| 15 | #include <linux/shrinker.h> | ||
| 15 | #include <linux/mm_inline.h> | 16 | #include <linux/mm_inline.h> |
| 16 | #include <linux/kthread.h> | 17 | #include <linux/kthread.h> |
| 17 | #include <linux/khugepaged.h> | 18 | #include <linux/khugepaged.h> |
| 18 | #include <linux/freezer.h> | 19 | #include <linux/freezer.h> |
| 19 | #include <linux/mman.h> | 20 | #include <linux/mman.h> |
| 20 | #include <linux/pagemap.h> | 21 | #include <linux/pagemap.h> |
| 22 | |||
| 21 | #include <asm/tlb.h> | 23 | #include <asm/tlb.h> |
| 22 | #include <asm/pgalloc.h> | 24 | #include <asm/pgalloc.h> |
| 23 | #include "internal.h" | 25 | #include "internal.h" |
| @@ -37,7 +39,8 @@ unsigned long transparent_hugepage_flags __read_mostly = | |||
| 37 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| | 39 | (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)| |
| 38 | #endif | 40 | #endif |
| 39 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| | 41 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)| |
| 40 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); | 42 | (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)| |
| 43 | (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
| 41 | 44 | ||
| 42 | /* default scan 8*512 pte (or vmas) every 30 second */ | 45 | /* default scan 8*512 pte (or vmas) every 30 second */ |
| 43 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; | 46 | static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8; |
| @@ -159,6 +162,77 @@ static int start_khugepaged(void) | |||
| 159 | return err; | 162 | return err; |
| 160 | } | 163 | } |
| 161 | 164 | ||
| 165 | static atomic_t huge_zero_refcount; | ||
| 166 | static unsigned long huge_zero_pfn __read_mostly; | ||
| 167 | |||
| 168 | static inline bool is_huge_zero_pfn(unsigned long pfn) | ||
| 169 | { | ||
| 170 | unsigned long zero_pfn = ACCESS_ONCE(huge_zero_pfn); | ||
| 171 | return zero_pfn && pfn == zero_pfn; | ||
| 172 | } | ||
| 173 | |||
| 174 | static inline bool is_huge_zero_pmd(pmd_t pmd) | ||
| 175 | { | ||
| 176 | return is_huge_zero_pfn(pmd_pfn(pmd)); | ||
| 177 | } | ||
| 178 | |||
| 179 | static unsigned long get_huge_zero_page(void) | ||
| 180 | { | ||
| 181 | struct page *zero_page; | ||
| 182 | retry: | ||
| 183 | if (likely(atomic_inc_not_zero(&huge_zero_refcount))) | ||
| 184 | return ACCESS_ONCE(huge_zero_pfn); | ||
| 185 | |||
| 186 | zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE, | ||
| 187 | HPAGE_PMD_ORDER); | ||
| 188 | if (!zero_page) { | ||
| 189 | count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); | ||
| 190 | return 0; | ||
| 191 | } | ||
| 192 | count_vm_event(THP_ZERO_PAGE_ALLOC); | ||
| 193 | preempt_disable(); | ||
| 194 | if (cmpxchg(&huge_zero_pfn, 0, page_to_pfn(zero_page))) { | ||
| 195 | preempt_enable(); | ||
| 196 | __free_page(zero_page); | ||
| 197 | goto retry; | ||
| 198 | } | ||
| 199 | |||
| 200 | /* We take an additional reference here. It will be put back by the shrinker */ | ||
| 201 | atomic_set(&huge_zero_refcount, 2); | ||
| 202 | preempt_enable(); | ||
| 203 | return ACCESS_ONCE(huge_zero_pfn); | ||
| 204 | } | ||
| 205 | |||
| 206 | static void put_huge_zero_page(void) | ||
| 207 | { | ||
| 208 | /* | ||
| 209 | * The counter should never go to zero here. Only the shrinker can put | ||
| 210 | * the last reference. | ||
| 211 | */ | ||
| 212 | BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); | ||
| 213 | } | ||
| 214 | |||
| 215 | static int shrink_huge_zero_page(struct shrinker *shrink, | ||
| 216 | struct shrink_control *sc) | ||
| 217 | { | ||
| 218 | if (!sc->nr_to_scan) | ||
| 219 | /* we can free the zero page only if the last reference remains */ | ||
| 220 | return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0; | ||
| 221 | |||
| 222 | if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { | ||
| 223 | unsigned long zero_pfn = xchg(&huge_zero_pfn, 0); | ||
| 224 | BUG_ON(zero_pfn == 0); | ||
| 225 | __free_page(__pfn_to_page(zero_pfn)); | ||
| 226 | } | ||
| 227 | |||
| 228 | return 0; | ||
| 229 | } | ||
| 230 | |||
| 231 | static struct shrinker huge_zero_page_shrinker = { | ||
| 232 | .shrink = shrink_huge_zero_page, | ||
| 233 | .seeks = DEFAULT_SEEKS, | ||
| 234 | }; | ||
| 235 | |||
| 162 | #ifdef CONFIG_SYSFS | 236 | #ifdef CONFIG_SYSFS |
| 163 | 237 | ||
| 164 | static ssize_t double_flag_show(struct kobject *kobj, | 238 | static ssize_t double_flag_show(struct kobject *kobj, |
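The huge zero page introduced above is a lazily allocated, refcounted singleton: atomic_inc_not_zero() is the fast path, the first user races to publish the pfn with cmpxchg(), and the refcount is primed to 2 so one reference stays with the subsystem until the registered shrinker sees it as the sole remaining reference and frees the page under memory pressure. The lifecycle, stripped of the THP specifics (names here are illustrative, not kernel API):

    static atomic_t ref;                    /* 0 <=> not allocated */
    static unsigned long singleton_pfn;

    static unsigned long get_singleton(void)
    {
        struct page *p;
    retry:
        if (atomic_inc_not_zero(&ref))      /* fast path: already live */
            return ACCESS_ONCE(singleton_pfn);
        p = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!p)
            return 0;
        if (cmpxchg(&singleton_pfn, 0, page_to_pfn(p))) {
            __free_page(p);                 /* lost the publish race */
            goto retry;
        }
        atomic_set(&ref, 2);    /* caller + the shrinker-owned reference */
        return singleton_pfn;
    }

In the hunk itself, preempt_disable() additionally brackets the publish and the atomic_set(), presumably to keep short the window in which concurrent callers spin through the retry path before the refcount is primed.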
| @@ -284,6 +358,20 @@ static ssize_t defrag_store(struct kobject *kobj, | |||
| 284 | static struct kobj_attribute defrag_attr = | 358 | static struct kobj_attribute defrag_attr = |
| 285 | __ATTR(defrag, 0644, defrag_show, defrag_store); | 359 | __ATTR(defrag, 0644, defrag_show, defrag_store); |
| 286 | 360 | ||
| 361 | static ssize_t use_zero_page_show(struct kobject *kobj, | ||
| 362 | struct kobj_attribute *attr, char *buf) | ||
| 363 | { | ||
| 364 | return single_flag_show(kobj, attr, buf, | ||
| 365 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
| 366 | } | ||
| 367 | static ssize_t use_zero_page_store(struct kobject *kobj, | ||
| 368 | struct kobj_attribute *attr, const char *buf, size_t count) | ||
| 369 | { | ||
| 370 | return single_flag_store(kobj, attr, buf, count, | ||
| 371 | TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG); | ||
| 372 | } | ||
| 373 | static struct kobj_attribute use_zero_page_attr = | ||
| 374 | __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); | ||
| 287 | #ifdef CONFIG_DEBUG_VM | 375 | #ifdef CONFIG_DEBUG_VM |
| 288 | static ssize_t debug_cow_show(struct kobject *kobj, | 376 | static ssize_t debug_cow_show(struct kobject *kobj, |
| 289 | struct kobj_attribute *attr, char *buf) | 377 | struct kobj_attribute *attr, char *buf) |
| @@ -305,6 +393,7 @@ static struct kobj_attribute debug_cow_attr = | |||
| 305 | static struct attribute *hugepage_attr[] = { | 393 | static struct attribute *hugepage_attr[] = { |
| 306 | &enabled_attr.attr, | 394 | &enabled_attr.attr, |
| 307 | &defrag_attr.attr, | 395 | &defrag_attr.attr, |
| 396 | &use_zero_page_attr.attr, | ||
| 308 | #ifdef CONFIG_DEBUG_VM | 397 | #ifdef CONFIG_DEBUG_VM |
| 309 | &debug_cow_attr.attr, | 398 | &debug_cow_attr.attr, |
| 310 | #endif | 399 | #endif |
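Together, the attribute definition and its slot in hugepage_attr[] expose a runtime knob alongside enabled and defrag. Assuming the standard sysfs location used by the other THP knobs, toggling it looks like:

echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page

with 1 being the default, set via TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG in transparent_hugepage_flags in the earlier hunk.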
| @@ -550,6 +639,8 @@ static int __init hugepage_init(void) | |||
| 550 | goto out; | 639 | goto out; |
| 551 | } | 640 | } |
| 552 | 641 | ||
| 642 | register_shrinker(&huge_zero_page_shrinker); | ||
| 643 | |||
| 553 | /* | 644 | /* |
| 554 | * By default disable transparent hugepages on smaller systems, | 645 | * By default disable transparent hugepages on smaller systems, |
| 555 | * where the extra memory used could hurt more than TLB overhead | 646 | * where the extra memory used could hurt more than TLB overhead |
| @@ -678,6 +769,22 @@ static inline struct page *alloc_hugepage(int defrag) | |||
| 678 | } | 769 | } |
| 679 | #endif | 770 | #endif |
| 680 | 771 | ||
| 772 | static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, | ||
| 773 | struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, | ||
| 774 | unsigned long zero_pfn) | ||
| 775 | { | ||
| 776 | pmd_t entry; | ||
| 777 | if (!pmd_none(*pmd)) | ||
| 778 | return false; | ||
| 779 | entry = pfn_pmd(zero_pfn, vma->vm_page_prot); | ||
| 780 | entry = pmd_wrprotect(entry); | ||
| 781 | entry = pmd_mkhuge(entry); | ||
| 782 | set_pmd_at(mm, haddr, pmd, entry); | ||
| 783 | pgtable_trans_huge_deposit(mm, pgtable); | ||
| 784 | mm->nr_ptes++; | ||
| 785 | return true; | ||
| 786 | } | ||
| 787 | |||
| 681 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | 788 | int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, |
| 682 | unsigned long address, pmd_t *pmd, | 789 | unsigned long address, pmd_t *pmd, |
| 683 | unsigned int flags) | 790 | unsigned int flags) |
| @@ -691,6 +798,30 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 691 | return VM_FAULT_OOM; | 798 | return VM_FAULT_OOM; |
| 692 | if (unlikely(khugepaged_enter(vma))) | 799 | if (unlikely(khugepaged_enter(vma))) |
| 693 | return VM_FAULT_OOM; | 800 | return VM_FAULT_OOM; |
| 801 | if (!(flags & FAULT_FLAG_WRITE) && | ||
| 802 | transparent_hugepage_use_zero_page()) { | ||
| 803 | pgtable_t pgtable; | ||
| 804 | unsigned long zero_pfn; | ||
| 805 | bool set; | ||
| 806 | pgtable = pte_alloc_one(mm, haddr); | ||
| 807 | if (unlikely(!pgtable)) | ||
| 808 | return VM_FAULT_OOM; | ||
| 809 | zero_pfn = get_huge_zero_page(); | ||
| 810 | if (unlikely(!zero_pfn)) { | ||
| 811 | pte_free(mm, pgtable); | ||
| 812 | count_vm_event(THP_FAULT_FALLBACK); | ||
| 813 | goto out; | ||
| 814 | } | ||
| 815 | spin_lock(&mm->page_table_lock); | ||
| 816 | set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, | ||
| 817 | zero_pfn); | ||
| 818 | spin_unlock(&mm->page_table_lock); | ||
| 819 | if (!set) { | ||
| 820 | pte_free(mm, pgtable); | ||
| 821 | put_huge_zero_page(); | ||
| 822 | } | ||
| 823 | return 0; | ||
| 824 | } | ||
| 694 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 825 | page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
| 695 | vma, haddr, numa_node_id(), 0); | 826 | vma, haddr, numa_node_id(), 0); |
| 696 | if (unlikely(!page)) { | 827 | if (unlikely(!page)) { |
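The new branch changes what a read-only first touch of an anonymous PMD-sized range costs: instead of allocating and zeroing a 2M page, the fault installs the shared huge zero page write-protected, and only a later write allocates for real. A small userspace sketch of the behaviour this enables (assumes a THP-enabled kernel and that the mapping happens to be PMD-aligned, which plain mmap does not guarantee):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 2UL << 20;                 /* one PMD worth */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            return 1;
        volatile char c = p[0];  /* read fault: may map the huge zero page */
        (void)c;
        memset(p, 1, len);       /* write fault: COW away from the zero page */
        printf("first byte after write: %d\n", p[0]);
        return 0;
    }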
| @@ -755,6 +886,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 755 | pte_free(dst_mm, pgtable); | 886 | pte_free(dst_mm, pgtable); |
| 756 | goto out_unlock; | 887 | goto out_unlock; |
| 757 | } | 888 | } |
| 889 | /* | ||
| 890 | * mm->page_table_lock is enough to be sure that huge zero pmd is not | ||
| 891 | * under splitting since we don't split the page itself, only pmd to | ||
| 892 | * a page table. | ||
| 893 | */ | ||
| 894 | if (is_huge_zero_pmd(pmd)) { | ||
| 895 | unsigned long zero_pfn; | ||
| 896 | bool set; | ||
| 897 | /* | ||
| 898 | * get_huge_zero_page() will never allocate a new page here, | ||
| 899 | * since we already have a zero page to copy. It just takes a | ||
| 900 | * reference. | ||
| 901 | */ | ||
| 902 | zero_pfn = get_huge_zero_page(); | ||
| 903 | set = set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, | ||
| 904 | zero_pfn); | ||
| 905 | BUG_ON(!set); /* unexpected !pmd_none(dst_pmd) */ | ||
| 906 | ret = 0; | ||
| 907 | goto out_unlock; | ||
| 908 | } | ||
| 758 | if (unlikely(pmd_trans_splitting(pmd))) { | 909 | if (unlikely(pmd_trans_splitting(pmd))) { |
| 759 | /* split huge page running from under us */ | 910 | /* split huge page running from under us */ |
| 760 | spin_unlock(&src_mm->page_table_lock); | 911 | spin_unlock(&src_mm->page_table_lock); |
| @@ -806,6 +957,80 @@ unlock: | |||
| 806 | spin_unlock(&mm->page_table_lock); | 957 | spin_unlock(&mm->page_table_lock); |
| 807 | } | 958 | } |
| 808 | 959 | ||
| 960 | static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, | ||
| 961 | struct vm_area_struct *vma, unsigned long address, | ||
| 962 | pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) | ||
| 963 | { | ||
| 964 | pgtable_t pgtable; | ||
| 965 | pmd_t _pmd; | ||
| 966 | struct page *page; | ||
| 967 | int i, ret = 0; | ||
| 968 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
| 969 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
| 970 | |||
| 971 | page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
| 972 | if (!page) { | ||
| 973 | ret |= VM_FAULT_OOM; | ||
| 974 | goto out; | ||
| 975 | } | ||
| 976 | |||
| 977 | if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { | ||
| 978 | put_page(page); | ||
| 979 | ret |= VM_FAULT_OOM; | ||
| 980 | goto out; | ||
| 981 | } | ||
| 982 | |||
| 983 | clear_user_highpage(page, address); | ||
| 984 | __SetPageUptodate(page); | ||
| 985 | |||
| 986 | mmun_start = haddr; | ||
| 987 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
| 988 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
| 989 | |||
| 990 | spin_lock(&mm->page_table_lock); | ||
| 991 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | ||
| 992 | goto out_free_page; | ||
| 993 | |||
| 994 | pmdp_clear_flush(vma, haddr, pmd); | ||
| 995 | /* leave pmd empty until pte is filled */ | ||
| 996 | |||
| 997 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
| 998 | pmd_populate(mm, &_pmd, pgtable); | ||
| 999 | |||
| 1000 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
| 1001 | pte_t *pte, entry; | ||
| 1002 | if (haddr == (address & PAGE_MASK)) { | ||
| 1003 | entry = mk_pte(page, vma->vm_page_prot); | ||
| 1004 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 1005 | page_add_new_anon_rmap(page, vma, haddr); | ||
| 1006 | } else { | ||
| 1007 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
| 1008 | entry = pte_mkspecial(entry); | ||
| 1009 | } | ||
| 1010 | pte = pte_offset_map(&_pmd, haddr); | ||
| 1011 | VM_BUG_ON(!pte_none(*pte)); | ||
| 1012 | set_pte_at(mm, haddr, pte, entry); | ||
| 1013 | pte_unmap(pte); | ||
| 1014 | } | ||
| 1015 | smp_wmb(); /* make pte visible before pmd */ | ||
| 1016 | pmd_populate(mm, pmd, pgtable); | ||
| 1017 | spin_unlock(&mm->page_table_lock); | ||
| 1018 | put_huge_zero_page(); | ||
| 1019 | inc_mm_counter(mm, MM_ANONPAGES); | ||
| 1020 | |||
| 1021 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 1022 | |||
| 1023 | ret |= VM_FAULT_WRITE; | ||
| 1024 | out: | ||
| 1025 | return ret; | ||
| 1026 | out_free_page: | ||
| 1027 | spin_unlock(&mm->page_table_lock); | ||
| 1028 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 1029 | mem_cgroup_uncharge_page(page); | ||
| 1030 | put_page(page); | ||
| 1031 | goto out; | ||
| 1032 | } | ||
| 1033 | |||
| 809 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 1034 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
| 810 | struct vm_area_struct *vma, | 1035 | struct vm_area_struct *vma, |
| 811 | unsigned long address, | 1036 | unsigned long address, |
| @@ -912,19 +1137,21 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 912 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) | 1137 | unsigned long address, pmd_t *pmd, pmd_t orig_pmd) |
| 913 | { | 1138 | { |
| 914 | int ret = 0; | 1139 | int ret = 0; |
| 915 | struct page *page, *new_page; | 1140 | struct page *page = NULL, *new_page; |
| 916 | unsigned long haddr; | 1141 | unsigned long haddr; |
| 917 | unsigned long mmun_start; /* For mmu_notifiers */ | 1142 | unsigned long mmun_start; /* For mmu_notifiers */ |
| 918 | unsigned long mmun_end; /* For mmu_notifiers */ | 1143 | unsigned long mmun_end; /* For mmu_notifiers */ |
| 919 | 1144 | ||
| 920 | VM_BUG_ON(!vma->anon_vma); | 1145 | VM_BUG_ON(!vma->anon_vma); |
| 1146 | haddr = address & HPAGE_PMD_MASK; | ||
| 1147 | if (is_huge_zero_pmd(orig_pmd)) | ||
| 1148 | goto alloc; | ||
| 921 | spin_lock(&mm->page_table_lock); | 1149 | spin_lock(&mm->page_table_lock); |
| 922 | if (unlikely(!pmd_same(*pmd, orig_pmd))) | 1150 | if (unlikely(!pmd_same(*pmd, orig_pmd))) |
| 923 | goto out_unlock; | 1151 | goto out_unlock; |
| 924 | 1152 | ||
| 925 | page = pmd_page(orig_pmd); | 1153 | page = pmd_page(orig_pmd); |
| 926 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); | 1154 | VM_BUG_ON(!PageCompound(page) || !PageHead(page)); |
| 927 | haddr = address & HPAGE_PMD_MASK; | ||
| 928 | if (page_mapcount(page) == 1) { | 1155 | if (page_mapcount(page) == 1) { |
| 929 | pmd_t entry; | 1156 | pmd_t entry; |
| 930 | entry = pmd_mkyoung(orig_pmd); | 1157 | entry = pmd_mkyoung(orig_pmd); |
| @@ -936,7 +1163,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 936 | } | 1163 | } |
| 937 | get_page(page); | 1164 | get_page(page); |
| 938 | spin_unlock(&mm->page_table_lock); | 1165 | spin_unlock(&mm->page_table_lock); |
| 939 | 1166 | alloc: | |
| 940 | if (transparent_hugepage_enabled(vma) && | 1167 | if (transparent_hugepage_enabled(vma) && |
| 941 | !transparent_hugepage_debug_cow()) | 1168 | !transparent_hugepage_debug_cow()) |
| 942 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), | 1169 | new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), |
| @@ -946,24 +1173,34 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 946 | 1173 | ||
| 947 | if (unlikely(!new_page)) { | 1174 | if (unlikely(!new_page)) { |
| 948 | count_vm_event(THP_FAULT_FALLBACK); | 1175 | count_vm_event(THP_FAULT_FALLBACK); |
| 949 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 1176 | if (is_huge_zero_pmd(orig_pmd)) { |
| 950 | pmd, orig_pmd, page, haddr); | 1177 | ret = do_huge_pmd_wp_zero_page_fallback(mm, vma, |
| 951 | if (ret & VM_FAULT_OOM) | 1178 | address, pmd, orig_pmd, haddr); |
| 952 | split_huge_page(page); | 1179 | } else { |
| 953 | put_page(page); | 1180 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
| 1181 | pmd, orig_pmd, page, haddr); | ||
| 1182 | if (ret & VM_FAULT_OOM) | ||
| 1183 | split_huge_page(page); | ||
| 1184 | put_page(page); | ||
| 1185 | } | ||
| 954 | goto out; | 1186 | goto out; |
| 955 | } | 1187 | } |
| 956 | count_vm_event(THP_FAULT_ALLOC); | 1188 | count_vm_event(THP_FAULT_ALLOC); |
| 957 | 1189 | ||
| 958 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { | 1190 | if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { |
| 959 | put_page(new_page); | 1191 | put_page(new_page); |
| 960 | split_huge_page(page); | 1192 | if (page) { |
| 961 | put_page(page); | 1193 | split_huge_page(page); |
| 1194 | put_page(page); | ||
| 1195 | } | ||
| 962 | ret |= VM_FAULT_OOM; | 1196 | ret |= VM_FAULT_OOM; |
| 963 | goto out; | 1197 | goto out; |
| 964 | } | 1198 | } |
| 965 | 1199 | ||
| 966 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | 1200 | if (is_huge_zero_pmd(orig_pmd)) |
| 1201 | clear_huge_page(new_page, haddr, HPAGE_PMD_NR); | ||
| 1202 | else | ||
| 1203 | copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); | ||
| 967 | __SetPageUptodate(new_page); | 1204 | __SetPageUptodate(new_page); |
| 968 | 1205 | ||
| 969 | mmun_start = haddr; | 1206 | mmun_start = haddr; |
| @@ -971,7 +1208,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 971 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 1208 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 972 | 1209 | ||
| 973 | spin_lock(&mm->page_table_lock); | 1210 | spin_lock(&mm->page_table_lock); |
| 974 | put_page(page); | 1211 | if (page) |
| 1212 | put_page(page); | ||
| 975 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1213 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
| 976 | spin_unlock(&mm->page_table_lock); | 1214 | spin_unlock(&mm->page_table_lock); |
| 977 | mem_cgroup_uncharge_page(new_page); | 1215 | mem_cgroup_uncharge_page(new_page); |
| @@ -979,14 +1217,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 979 | goto out_mn; | 1217 | goto out_mn; |
| 980 | } else { | 1218 | } else { |
| 981 | pmd_t entry; | 1219 | pmd_t entry; |
| 982 | VM_BUG_ON(!PageHead(page)); | ||
| 983 | entry = mk_huge_pmd(new_page, vma); | 1220 | entry = mk_huge_pmd(new_page, vma); |
| 984 | pmdp_clear_flush(vma, haddr, pmd); | 1221 | pmdp_clear_flush(vma, haddr, pmd); |
| 985 | page_add_new_anon_rmap(new_page, vma, haddr); | 1222 | page_add_new_anon_rmap(new_page, vma, haddr); |
| 986 | set_pmd_at(mm, haddr, pmd, entry); | 1223 | set_pmd_at(mm, haddr, pmd, entry); |
| 987 | update_mmu_cache_pmd(vma, address, pmd); | 1224 | update_mmu_cache_pmd(vma, address, pmd); |
| 988 | page_remove_rmap(page); | 1225 | if (is_huge_zero_pmd(orig_pmd)) { |
| 989 | put_page(page); | 1226 | add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); |
| 1227 | put_huge_zero_page(); | ||
| 1228 | } else { | ||
| 1229 | VM_BUG_ON(!PageHead(page)); | ||
| 1230 | page_remove_rmap(page); | ||
| 1231 | put_page(page); | ||
| 1232 | } | ||
| 990 | ret |= VM_FAULT_WRITE; | 1233 | ret |= VM_FAULT_WRITE; |
| 991 | } | 1234 | } |
| 992 | spin_unlock(&mm->page_table_lock); | 1235 | spin_unlock(&mm->page_table_lock); |
| @@ -1055,15 +1298,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
| 1055 | pmd_t orig_pmd; | 1298 | pmd_t orig_pmd; |
| 1056 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); | 1299 | pgtable = pgtable_trans_huge_withdraw(tlb->mm); |
| 1057 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); | 1300 | orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); |
| 1058 | page = pmd_page(orig_pmd); | ||
| 1059 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | 1301 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
| 1060 | page_remove_rmap(page); | 1302 | if (is_huge_zero_pmd(orig_pmd)) { |
| 1061 | VM_BUG_ON(page_mapcount(page) < 0); | 1303 | tlb->mm->nr_ptes--; |
| 1062 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1304 | spin_unlock(&tlb->mm->page_table_lock); |
| 1063 | VM_BUG_ON(!PageHead(page)); | 1305 | put_huge_zero_page(); |
| 1064 | tlb->mm->nr_ptes--; | 1306 | } else { |
| 1065 | spin_unlock(&tlb->mm->page_table_lock); | 1307 | page = pmd_page(orig_pmd); |
| 1066 | tlb_remove_page(tlb, page); | 1308 | page_remove_rmap(page); |
| 1309 | VM_BUG_ON(page_mapcount(page) < 0); | ||
| 1310 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | ||
| 1311 | VM_BUG_ON(!PageHead(page)); | ||
| 1312 | tlb->mm->nr_ptes--; | ||
| 1313 | spin_unlock(&tlb->mm->page_table_lock); | ||
| 1314 | tlb_remove_page(tlb, page); | ||
| 1315 | } | ||
| 1067 | pte_free(tlb->mm, pgtable); | 1316 | pte_free(tlb->mm, pgtable); |
| 1068 | ret = 1; | 1317 | ret = 1; |
| 1069 | } | 1318 | } |
| @@ -1135,6 +1384,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 1135 | pmd_t entry; | 1384 | pmd_t entry; |
| 1136 | entry = pmdp_get_and_clear(mm, addr, pmd); | 1385 | entry = pmdp_get_and_clear(mm, addr, pmd); |
| 1137 | entry = pmd_modify(entry, newprot); | 1386 | entry = pmd_modify(entry, newprot); |
| 1387 | BUG_ON(pmd_write(entry)); | ||
| 1138 | set_pmd_at(mm, addr, pmd, entry); | 1388 | set_pmd_at(mm, addr, pmd, entry); |
| 1139 | spin_unlock(&vma->vm_mm->page_table_lock); | 1389 | spin_unlock(&vma->vm_mm->page_table_lock); |
| 1140 | ret = 1; | 1390 | ret = 1; |
| @@ -1477,6 +1727,7 @@ int split_huge_page(struct page *page) | |||
| 1477 | struct anon_vma *anon_vma; | 1727 | struct anon_vma *anon_vma; |
| 1478 | int ret = 1; | 1728 | int ret = 1; |
| 1479 | 1729 | ||
| 1730 | BUG_ON(is_huge_zero_pfn(page_to_pfn(page))); | ||
| 1480 | BUG_ON(!PageAnon(page)); | 1731 | BUG_ON(!PageAnon(page)); |
| 1481 | anon_vma = page_lock_anon_vma(page); | 1732 | anon_vma = page_lock_anon_vma(page); |
| 1482 | if (!anon_vma) | 1733 | if (!anon_vma) |
| @@ -2336,19 +2587,65 @@ static int khugepaged(void *none) | |||
| 2336 | return 0; | 2587 | return 0; |
| 2337 | } | 2588 | } |
| 2338 | 2589 | ||
| 2339 | void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | 2590 | static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, |
| 2591 | unsigned long haddr, pmd_t *pmd) | ||
| 2592 | { | ||
| 2593 | struct mm_struct *mm = vma->vm_mm; | ||
| 2594 | pgtable_t pgtable; | ||
| 2595 | pmd_t _pmd; | ||
| 2596 | int i; | ||
| 2597 | |||
| 2598 | pmdp_clear_flush(vma, haddr, pmd); | ||
| 2599 | /* leave pmd empty until pte is filled */ | ||
| 2600 | |||
| 2601 | pgtable = pgtable_trans_huge_withdraw(mm); | ||
| 2602 | pmd_populate(mm, &_pmd, pgtable); | ||
| 2603 | |||
| 2604 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
| 2605 | pte_t *pte, entry; | ||
| 2606 | entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot); | ||
| 2607 | entry = pte_mkspecial(entry); | ||
| 2608 | pte = pte_offset_map(&_pmd, haddr); | ||
| 2609 | VM_BUG_ON(!pte_none(*pte)); | ||
| 2610 | set_pte_at(mm, haddr, pte, entry); | ||
| 2611 | pte_unmap(pte); | ||
| 2612 | } | ||
| 2613 | smp_wmb(); /* make pte visible before pmd */ | ||
| 2614 | pmd_populate(mm, pmd, pgtable); | ||
| 2615 | put_huge_zero_page(); | ||
| 2616 | } | ||
| 2617 | |||
| 2618 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | ||
| 2619 | pmd_t *pmd) | ||
| 2340 | { | 2620 | { |
| 2341 | struct page *page; | 2621 | struct page *page; |
| 2622 | struct mm_struct *mm = vma->vm_mm; | ||
| 2623 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
| 2624 | unsigned long mmun_start; /* For mmu_notifiers */ | ||
| 2625 | unsigned long mmun_end; /* For mmu_notifiers */ | ||
| 2626 | |||
| 2627 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); | ||
| 2342 | 2628 | ||
| 2629 | mmun_start = haddr; | ||
| 2630 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
| 2631 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
| 2343 | spin_lock(&mm->page_table_lock); | 2632 | spin_lock(&mm->page_table_lock); |
| 2344 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2633 | if (unlikely(!pmd_trans_huge(*pmd))) { |
| 2345 | spin_unlock(&mm->page_table_lock); | 2634 | spin_unlock(&mm->page_table_lock); |
| 2635 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 2636 | return; | ||
| 2637 | } | ||
| 2638 | if (is_huge_zero_pmd(*pmd)) { | ||
| 2639 | __split_huge_zero_page_pmd(vma, haddr, pmd); | ||
| 2640 | spin_unlock(&mm->page_table_lock); | ||
| 2641 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 2346 | return; | 2642 | return; |
| 2347 | } | 2643 | } |
| 2348 | page = pmd_page(*pmd); | 2644 | page = pmd_page(*pmd); |
| 2349 | VM_BUG_ON(!page_count(page)); | 2645 | VM_BUG_ON(!page_count(page)); |
| 2350 | get_page(page); | 2646 | get_page(page); |
| 2351 | spin_unlock(&mm->page_table_lock); | 2647 | spin_unlock(&mm->page_table_lock); |
| 2648 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 2352 | 2649 | ||
| 2353 | split_huge_page(page); | 2650 | split_huge_page(page); |
| 2354 | 2651 | ||
| @@ -2356,6 +2653,16 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) | |||
| 2356 | BUG_ON(pmd_trans_huge(*pmd)); | 2653 | BUG_ON(pmd_trans_huge(*pmd)); |
| 2357 | } | 2654 | } |
| 2358 | 2655 | ||
| 2656 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | ||
| 2657 | pmd_t *pmd) | ||
| 2658 | { | ||
| 2659 | struct vm_area_struct *vma; | ||
| 2660 | |||
| 2661 | vma = find_vma(mm, address); | ||
| 2662 | BUG_ON(vma == NULL); | ||
| 2663 | split_huge_page_pmd(vma, address, pmd); | ||
| 2664 | } | ||
| 2665 | |||
| 2359 | static void split_huge_page_address(struct mm_struct *mm, | 2666 | static void split_huge_page_address(struct mm_struct *mm, |
| 2360 | unsigned long address) | 2667 | unsigned long address) |
| 2361 | { | 2668 | { |
| @@ -2370,7 +2677,7 @@ static void split_huge_page_address(struct mm_struct *mm, | |||
| 2370 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2677 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
| 2371 | * materialize from under us. | 2678 | * materialize from under us. |
| 2372 | */ | 2679 | */ |
| 2373 | split_huge_page_pmd(mm, pmd); | 2680 | split_huge_page_pmd_mm(mm, address, pmd); |
| 2374 | } | 2681 | } |
| 2375 | 2682 | ||
| 2376 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, | 2683 | void __vma_adjust_trans_huge(struct vm_area_struct *vma, |
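Note the interface change running through the rest of the series: __split_huge_page_pmd() now takes (vma, address, pmd) instead of (mm, pmd), since splitting a huge zero pmd needs vma->vm_page_prot and the address range for the mmu notifiers, and split_huge_page_pmd_mm() serves callers that only hold an mm. The matching huge_mm.h wrapper is not part of this diff; presumably it now reads roughly like this (a sketch, not the verbatim header change):

    #define split_huge_page_pmd(__vma, __address, __pmd)            \
        do {                                                        \
            pmd_t *____pmd = (__pmd);                               \
            if (unlikely(pmd_trans_huge(*____pmd)))                 \
                __split_huge_page_pmd(__vma, __address, ____pmd);   \
        } while (0)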
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 34f372ad89d..88e7293b96b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -1057,7 +1057,7 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
| 1057 | * on-line nodes with memory and will handle the hstate accounting. | 1057 | * on-line nodes with memory and will handle the hstate accounting. |
| 1058 | */ | 1058 | */ |
| 1059 | while (nr_pages--) { | 1059 | while (nr_pages--) { |
| 1060 | if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) | 1060 | if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1)) |
| 1061 | break; | 1061 | break; |
| 1062 | } | 1062 | } |
| 1063 | } | 1063 | } |
| @@ -1180,14 +1180,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) | 1180 | int __weak alloc_bootmem_huge_page(struct hstate *h) |
| 1181 | { | 1181 | { |
| 1182 | struct huge_bootmem_page *m; | 1182 | struct huge_bootmem_page *m; |
| 1183 | int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 1183 | int nr_nodes = nodes_weight(node_states[N_MEMORY]); |
| 1184 | 1184 | ||
| 1185 | while (nr_nodes) { | 1185 | while (nr_nodes) { |
| 1186 | void *addr; | 1186 | void *addr; |
| 1187 | 1187 | ||
| 1188 | addr = __alloc_bootmem_node_nopanic( | 1188 | addr = __alloc_bootmem_node_nopanic( |
| 1189 | NODE_DATA(hstate_next_node_to_alloc(h, | 1189 | NODE_DATA(hstate_next_node_to_alloc(h, |
| 1190 | &node_states[N_HIGH_MEMORY])), | 1190 | &node_states[N_MEMORY])), |
| 1191 | huge_page_size(h), huge_page_size(h), 0); | 1191 | huge_page_size(h), huge_page_size(h), 0); |
| 1192 | 1192 | ||
| 1193 | if (addr) { | 1193 | if (addr) { |
| @@ -1259,7 +1259,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) | |||
| 1259 | if (!alloc_bootmem_huge_page(h)) | 1259 | if (!alloc_bootmem_huge_page(h)) |
| 1260 | break; | 1260 | break; |
| 1261 | } else if (!alloc_fresh_huge_page(h, | 1261 | } else if (!alloc_fresh_huge_page(h, |
| 1262 | &node_states[N_HIGH_MEMORY])) | 1262 | &node_states[N_MEMORY])) |
| 1263 | break; | 1263 | break; |
| 1264 | } | 1264 | } |
| 1265 | h->max_huge_pages = i; | 1265 | h->max_huge_pages = i; |
| @@ -1527,7 +1527,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
| 1527 | if (!(obey_mempolicy && | 1527 | if (!(obey_mempolicy && |
| 1528 | init_nodemask_of_mempolicy(nodes_allowed))) { | 1528 | init_nodemask_of_mempolicy(nodes_allowed))) { |
| 1529 | NODEMASK_FREE(nodes_allowed); | 1529 | NODEMASK_FREE(nodes_allowed); |
| 1530 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1530 | nodes_allowed = &node_states[N_MEMORY]; |
| 1531 | } | 1531 | } |
| 1532 | } else if (nodes_allowed) { | 1532 | } else if (nodes_allowed) { |
| 1533 | /* | 1533 | /* |
| @@ -1537,11 +1537,11 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, | |||
| 1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; | 1537 | count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; |
| 1538 | init_nodemask_of_node(nodes_allowed, nid); | 1538 | init_nodemask_of_node(nodes_allowed, nid); |
| 1539 | } else | 1539 | } else |
| 1540 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 1540 | nodes_allowed = &node_states[N_MEMORY]; |
| 1541 | 1541 | ||
| 1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); | 1542 | h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); |
| 1543 | 1543 | ||
| 1544 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 1544 | if (nodes_allowed != &node_states[N_MEMORY]) |
| 1545 | NODEMASK_FREE(nodes_allowed); | 1545 | NODEMASK_FREE(nodes_allowed); |
| 1546 | 1546 | ||
| 1547 | return len; | 1547 | return len; |
| @@ -1844,7 +1844,7 @@ static void hugetlb_register_all_nodes(void) | |||
| 1844 | { | 1844 | { |
| 1845 | int nid; | 1845 | int nid; |
| 1846 | 1846 | ||
| 1847 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1847 | for_each_node_state(nid, N_MEMORY) { |
| 1848 | struct node *node = node_devices[nid]; | 1848 | struct node *node = node_devices[nid]; |
| 1849 | if (node->dev.id == nid) | 1849 | if (node->dev.id == nid) |
| 1850 | hugetlb_register_node(node); | 1850 | hugetlb_register_node(node); |
| @@ -1939,8 +1939,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
| 1939 | for (i = 0; i < MAX_NUMNODES; ++i) | 1939 | for (i = 0; i < MAX_NUMNODES; ++i) |
| 1940 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1940 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
| 1941 | INIT_LIST_HEAD(&h->hugepage_activelist); | 1941 | INIT_LIST_HEAD(&h->hugepage_activelist); |
| 1942 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1942 | h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); |
| 1943 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1943 | h->next_nid_to_free = first_node(node_states[N_MEMORY]); |
| 1944 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1944 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
| 1945 | huge_page_size(h)/1024); | 1945 | huge_page_size(h)/1024); |
| 1946 | /* | 1946 | /* |
| @@ -2035,11 +2035,11 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, | |||
| 2035 | if (!(obey_mempolicy && | 2035 | if (!(obey_mempolicy && |
| 2036 | init_nodemask_of_mempolicy(nodes_allowed))) { | 2036 | init_nodemask_of_mempolicy(nodes_allowed))) { |
| 2037 | NODEMASK_FREE(nodes_allowed); | 2037 | NODEMASK_FREE(nodes_allowed); |
| 2038 | nodes_allowed = &node_states[N_HIGH_MEMORY]; | 2038 | nodes_allowed = &node_states[N_MEMORY]; |
| 2039 | } | 2039 | } |
| 2040 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); | 2040 | h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); |
| 2041 | 2041 | ||
| 2042 | if (nodes_allowed != &node_states[N_HIGH_MEMORY]) | 2042 | if (nodes_allowed != &node_states[N_MEMORY]) |
| 2043 | NODEMASK_FREE(nodes_allowed); | 2043 | NODEMASK_FREE(nodes_allowed); |
| 2044 | } | 2044 | } |
| 2045 | out: | 2045 | out: |
| @@ -2386,8 +2386,10 @@ again: | |||
| 2386 | /* | 2386 | /* |
| 2387 | * HWPoisoned hugepage is already unmapped and dropped reference | 2387 | * HWPoisoned hugepage is already unmapped and dropped reference |
| 2388 | */ | 2388 | */ |
| 2389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) | 2389 | if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { |
| 2390 | pte_clear(mm, address, ptep); | ||
| 2390 | continue; | 2391 | continue; |
| 2392 | } | ||
| 2391 | 2393 | ||
| 2392 | page = pte_page(pte); | 2394 | page = pte_page(pte); |
| 2393 | /* | 2395 | /* |
| @@ -3170,7 +3172,13 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) | |||
| 3170 | 3172 | ||
| 3171 | spin_lock(&hugetlb_lock); | 3173 | spin_lock(&hugetlb_lock); |
| 3172 | if (is_hugepage_on_freelist(hpage)) { | 3174 | if (is_hugepage_on_freelist(hpage)) { |
| 3173 | list_del(&hpage->lru); | 3175 | /* |
| 3176 | * Hwpoisoned hugepage isn't linked to activelist or freelist, | ||
| 3177 | * but dangling hpage->lru can trigger list-debug warnings | ||
| 3178 | * (this happens when we call unpoison_memory() on it), | ||
| 3179 | * so let it point to itself with list_del_init(). | ||
| 3180 | */ | ||
| 3181 | list_del_init(&hpage->lru); | ||
| 3174 | set_page_refcounted(hpage); | 3182 | set_page_refcounted(hpage); |
| 3175 | h->free_huge_pages--; | 3183 | h->free_huge_pages--; |
| 3176 | h->free_huge_pages_node[nid]--; | 3184 | h->free_huge_pages_node[nid]--; |
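The list_del() to list_del_init() switch is about CONFIG_DEBUG_LIST semantics: list_del() leaves the entry poisoned, so a later list operation on the same hpage->lru (reachable via unpoison_memory()) trips the debug checks, while list_del_init() leaves a valid empty node. Side by side:

    list_del(&hpage->lru);       /* ->next/->prev = LIST_POISON1/2;
                                    re-using the entry warns or crashes */
    list_del_init(&hpage->lru);  /* entry points at itself: empty,
                                    safe to test, link or delete again */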
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 12307b3838f..6c055929c8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -59,6 +59,8 @@ | |||
| 59 | #include <trace/events/vmscan.h> | 59 | #include <trace/events/vmscan.h> |
| 60 | 60 | ||
| 61 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 61 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
| 62 | EXPORT_SYMBOL(mem_cgroup_subsys); | ||
| 63 | |||
| 62 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 64 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
| 63 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 65 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
| 64 | 66 | ||
| @@ -800,7 +802,7 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, | |||
| 800 | int nid; | 802 | int nid; |
| 801 | u64 total = 0; | 803 | u64 total = 0; |
| 802 | 804 | ||
| 803 | for_each_node_state(nid, N_HIGH_MEMORY) | 805 | for_each_node_state(nid, N_MEMORY) |
| 804 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); | 806 | total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); |
| 805 | return total; | 807 | return total; |
| 806 | } | 808 | } |
| @@ -1015,13 +1017,10 @@ void mem_cgroup_iter_break(struct mem_cgroup *root, | |||
| 1015 | iter != NULL; \ | 1017 | iter != NULL; \ |
| 1016 | iter = mem_cgroup_iter(NULL, iter, NULL)) | 1018 | iter = mem_cgroup_iter(NULL, iter, NULL)) |
| 1017 | 1019 | ||
| 1018 | void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | 1020 | void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) |
| 1019 | { | 1021 | { |
| 1020 | struct mem_cgroup *memcg; | 1022 | struct mem_cgroup *memcg; |
| 1021 | 1023 | ||
| 1022 | if (!mm) | ||
| 1023 | return; | ||
| 1024 | |||
| 1025 | rcu_read_lock(); | 1024 | rcu_read_lock(); |
| 1026 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1025 | memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); |
| 1027 | if (unlikely(!memcg)) | 1026 | if (unlikely(!memcg)) |
| @@ -1040,7 +1039,7 @@ void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) | |||
| 1040 | out: | 1039 | out: |
| 1041 | rcu_read_unlock(); | 1040 | rcu_read_unlock(); |
| 1042 | } | 1041 | } |
| 1043 | EXPORT_SYMBOL(mem_cgroup_count_vm_event); | 1042 | EXPORT_SYMBOL(__mem_cgroup_count_vm_event); |
| 1044 | 1043 | ||
| 1045 | /** | 1044 | /** |
| 1046 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg | 1045 | * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg |
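Dropping the NULL-mm bailout while renaming to __mem_cgroup_count_vm_event() suggests the cheap check moved into a static inline wrapper in include/linux/memcontrol.h, which this diff does not quote. A plausible shape for it, offered as an assumption rather than the verbatim header change:

    static inline void mem_cgroup_count_vm_event(struct mm_struct *mm,
                                                 enum vm_event_item idx)
    {
        if (mm)     /* inlined at call sites; skips the out-of-line call */
            __mem_cgroup_count_vm_event(mm, idx);
    }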
| @@ -1644,9 +1643,9 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) | |||
| 1644 | return; | 1643 | return; |
| 1645 | 1644 | ||
| 1646 | /* make a nodemask where this memcg uses memory from */ | 1645 | /* make a nodemask where this memcg uses memory from */ |
| 1647 | memcg->scan_nodes = node_states[N_HIGH_MEMORY]; | 1646 | memcg->scan_nodes = node_states[N_MEMORY]; |
| 1648 | 1647 | ||
| 1649 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | 1648 | for_each_node_mask(nid, node_states[N_MEMORY]) { |
| 1650 | 1649 | ||
| 1651 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) | 1650 | if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) |
| 1652 | node_clear(nid, memcg->scan_nodes); | 1651 | node_clear(nid, memcg->scan_nodes); |
| @@ -1717,7 +1716,7 @@ static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) | |||
| 1717 | /* | 1716 | /* |
| 1718 | * Check rest of nodes. | 1717 | * Check rest of nodes. |
| 1719 | */ | 1718 | */ |
| 1720 | for_each_node_state(nid, N_HIGH_MEMORY) { | 1719 | for_each_node_state(nid, N_MEMORY) { |
| 1721 | if (node_isset(nid, memcg->scan_nodes)) | 1720 | if (node_isset(nid, memcg->scan_nodes)) |
| 1722 | continue; | 1721 | continue; |
| 1723 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) | 1722 | if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) |
| @@ -3776,7 +3775,7 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | |||
| 3776 | lru_add_drain_all(); | 3775 | lru_add_drain_all(); |
| 3777 | drain_all_stock_sync(memcg); | 3776 | drain_all_stock_sync(memcg); |
| 3778 | mem_cgroup_start_move(memcg); | 3777 | mem_cgroup_start_move(memcg); |
| 3779 | for_each_node_state(node, N_HIGH_MEMORY) { | 3778 | for_each_node_state(node, N_MEMORY) { |
| 3780 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 3779 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
| 3781 | enum lru_list lru; | 3780 | enum lru_list lru; |
| 3782 | for_each_lru(lru) { | 3781 | for_each_lru(lru) { |
| @@ -4122,7 +4121,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 4122 | 4121 | ||
| 4123 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); | 4122 | total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); |
| 4124 | seq_printf(m, "total=%lu", total_nr); | 4123 | seq_printf(m, "total=%lu", total_nr); |
| 4125 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4124 | for_each_node_state(nid, N_MEMORY) { |
| 4126 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); | 4125 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); |
| 4127 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4126 | seq_printf(m, " N%d=%lu", nid, node_nr); |
| 4128 | } | 4127 | } |
| @@ -4130,7 +4129,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 4130 | 4129 | ||
| 4131 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); | 4130 | file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); |
| 4132 | seq_printf(m, "file=%lu", file_nr); | 4131 | seq_printf(m, "file=%lu", file_nr); |
| 4133 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4132 | for_each_node_state(nid, N_MEMORY) { |
| 4134 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 4133 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
| 4135 | LRU_ALL_FILE); | 4134 | LRU_ALL_FILE); |
| 4136 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4135 | seq_printf(m, " N%d=%lu", nid, node_nr); |
| @@ -4139,7 +4138,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 4139 | 4138 | ||
| 4140 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); | 4139 | anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); |
| 4141 | seq_printf(m, "anon=%lu", anon_nr); | 4140 | seq_printf(m, "anon=%lu", anon_nr); |
| 4142 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4141 | for_each_node_state(nid, N_MEMORY) { |
| 4143 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 4142 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
| 4144 | LRU_ALL_ANON); | 4143 | LRU_ALL_ANON); |
| 4145 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4144 | seq_printf(m, " N%d=%lu", nid, node_nr); |
| @@ -4148,7 +4147,7 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 4148 | 4147 | ||
| 4149 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); | 4148 | unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); |
| 4150 | seq_printf(m, "unevictable=%lu", unevictable_nr); | 4149 | seq_printf(m, "unevictable=%lu", unevictable_nr); |
| 4151 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4150 | for_each_node_state(nid, N_MEMORY) { |
| 4152 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, | 4151 | node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, |
| 4153 | BIT(LRU_UNEVICTABLE)); | 4152 | BIT(LRU_UNEVICTABLE)); |
| 4154 | seq_printf(m, " N%d=%lu", nid, node_nr); | 4153 | seq_printf(m, " N%d=%lu", nid, node_nr); |
diff --git a/mm/memory.c b/mm/memory.c index 76537738563..db2e9e797a0 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -717,20 +717,6 @@ static inline bool is_cow_mapping(vm_flags_t flags) | |||
| 717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 717 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
| 718 | } | 718 | } |
| 719 | 719 | ||
| 720 | #ifndef is_zero_pfn | ||
| 721 | static inline int is_zero_pfn(unsigned long pfn) | ||
| 722 | { | ||
| 723 | return pfn == zero_pfn; | ||
| 724 | } | ||
| 725 | #endif | ||
| 726 | |||
| 727 | #ifndef my_zero_pfn | ||
| 728 | static inline unsigned long my_zero_pfn(unsigned long addr) | ||
| 729 | { | ||
| 730 | return zero_pfn; | ||
| 731 | } | ||
| 732 | #endif | ||
| 733 | |||
| 734 | /* | 720 | /* |
| 735 | * vm_normal_page -- This function gets the "struct page" associated with a pte. | 721 | * vm_normal_page -- This function gets the "struct page" associated with a pte. |
| 736 | * | 722 | * |
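The removed #ifndef fallbacks do not simply vanish: mm/huge_memory.c now uses my_zero_pfn() as well (see do_huge_pmd_wp_zero_page_fallback() and __split_huge_zero_page_pmd() in the hunks above), so these helpers presumably moved, unchanged, into a shared header that both files include; the header hunk is not part of this diff. For reference, their definitions:

    #ifndef is_zero_pfn
    static inline int is_zero_pfn(unsigned long pfn)
    {
        return pfn == zero_pfn;
    }
    #endif

    #ifndef my_zero_pfn
    static inline unsigned long my_zero_pfn(unsigned long addr)
    {
        return zero_pfn;
    }
    #endif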
| @@ -1250,7 +1236,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
| 1250 | BUG(); | 1236 | BUG(); |
| 1251 | } | 1237 | } |
| 1252 | #endif | 1238 | #endif |
| 1253 | split_huge_page_pmd(vma->vm_mm, pmd); | 1239 | split_huge_page_pmd(vma, addr, pmd); |
| 1254 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1240 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
| 1255 | goto next; | 1241 | goto next; |
| 1256 | /* fall through */ | 1242 | /* fall through */ |
| @@ -1519,7 +1505,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, | |||
| 1519 | } | 1505 | } |
| 1520 | if (pmd_trans_huge(*pmd)) { | 1506 | if (pmd_trans_huge(*pmd)) { |
| 1521 | if (flags & FOLL_SPLIT) { | 1507 | if (flags & FOLL_SPLIT) { |
| 1522 | split_huge_page_pmd(mm, pmd); | 1508 | split_huge_page_pmd(vma, address, pmd); |
| 1523 | goto split_fallthrough; | 1509 | goto split_fallthrough; |
| 1524 | } | 1510 | } |
| 1525 | spin_lock(&mm->page_table_lock); | 1511 | spin_lock(&mm->page_table_lock); |
| @@ -2794,13 +2780,8 @@ unlock: | |||
| 2794 | oom_free_new: | 2780 | oom_free_new: |
| 2795 | page_cache_release(new_page); | 2781 | page_cache_release(new_page); |
| 2796 | oom: | 2782 | oom: |
| 2797 | if (old_page) { | 2783 | if (old_page) |
| 2798 | if (page_mkwrite) { | ||
| 2799 | unlock_page(old_page); | ||
| 2800 | page_cache_release(old_page); | ||
| 2801 | } | ||
| 2802 | page_cache_release(old_page); | 2784 | page_cache_release(old_page); |
| 2803 | } | ||
| 2804 | return VM_FAULT_OOM; | 2785 | return VM_FAULT_OOM; |
| 2805 | 2786 | ||
| 2806 | unwritable_page: | 2787 | unwritable_page: |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c3e66ae411f..518baa896e8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -106,6 +106,7 @@ static void get_page_bootmem(unsigned long info, struct page *page, | |||
| 106 | void __ref put_page_bootmem(struct page *page) | 106 | void __ref put_page_bootmem(struct page *page) |
| 107 | { | 107 | { |
| 108 | unsigned long type; | 108 | unsigned long type; |
| 109 | static DEFINE_MUTEX(ppb_lock); | ||
| 109 | 110 | ||
| 110 | type = (unsigned long) page->lru.next; | 111 | type = (unsigned long) page->lru.next; |
| 111 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || | 112 | BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || |
| @@ -115,7 +116,14 @@ void __ref put_page_bootmem(struct page *page) | |||
| 115 | ClearPagePrivate(page); | 116 | ClearPagePrivate(page); |
| 116 | set_page_private(page, 0); | 117 | set_page_private(page, 0); |
| 117 | INIT_LIST_HEAD(&page->lru); | 118 | INIT_LIST_HEAD(&page->lru); |
| 119 | |||
| 120 | /* | ||
| 121 | * Please refer to the comment for __free_pages_bootmem() | ||
| 122 | * for why we serialize here. | ||
| 123 | */ | ||
| 124 | mutex_lock(&ppb_lock); | ||
| 118 | __free_pages_bootmem(page, 0); | 125 | __free_pages_bootmem(page, 0); |
| 126 | mutex_unlock(&ppb_lock); | ||
| 119 | } | 127 | } |
| 120 | 128 | ||
| 121 | } | 129 | } |
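put_page_bootmem() is now reachable from memory hotplug, i.e. from potentially concurrent callers, while __free_pages_bootmem() was written for the single-threaded boot path. The function-local static mutex serializes only this late path and leaves boot-time callers untouched. Condensed to the locking (the surrounding type checks are unchanged):

    void __ref put_page_bootmem(struct page *page)
    {
        static DEFINE_MUTEX(ppb_lock);      /* one lock, all callers */

        mutex_lock(&ppb_lock);
        __free_pages_bootmem(page, 0);      /* not self-serializing */
        mutex_unlock(&ppb_lock);
    }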
| @@ -581,11 +589,19 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
| 581 | return 0; | 589 | return 0; |
| 582 | } | 590 | } |
| 583 | 591 | ||
| 592 | #ifdef CONFIG_MOVABLE_NODE | ||
| 593 | /* with CONFIG_MOVABLE_NODE, a node may be onlined without normal memory */ | ||
| 594 | static bool can_online_high_movable(struct zone *zone) | ||
| 595 | { | ||
| 596 | return true; | ||
| 597 | } | ||
| 598 | #else /* #ifdef CONFIG_MOVABLE_NODE */ | ||
| 584 | /* ensure every online node has NORMAL memory */ | 599 | /* ensure every online node has NORMAL memory */ |
| 585 | static bool can_online_high_movable(struct zone *zone) | 600 | static bool can_online_high_movable(struct zone *zone) |
| 586 | { | 601 | { |
| 587 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 602 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); |
| 588 | } | 603 | } |
| 604 | #endif /* #ifdef CONFIG_MOVABLE_NODE */ | ||
| 589 | 605 | ||
| 590 | /* check which state of node_states will be changed when online memory */ | 606 | /* check which state of node_states will be changed when online memory */ |
| 591 | static void node_states_check_changes_online(unsigned long nr_pages, | 607 | static void node_states_check_changes_online(unsigned long nr_pages, |
| @@ -595,13 +611,15 @@ static void node_states_check_changes_online(unsigned long nr_pages, | |||
| 595 | enum zone_type zone_last = ZONE_NORMAL; | 611 | enum zone_type zone_last = ZONE_NORMAL; |
| 596 | 612 | ||
| 597 | /* | 613 | /* |
| 598 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 614 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] |
| 599 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | 615 | * contains nodes which have zones of 0...ZONE_NORMAL, |
| 616 | * set zone_last to ZONE_NORMAL. | ||
| 600 | * | 617 | * |
| 601 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 618 | * If we have neither HIGHMEM nor movable node, |
| 602 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | 619 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of |
| 620 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
| 603 | */ | 621 | */ |
| 604 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | 622 | if (N_MEMORY == N_NORMAL_MEMORY) |
| 605 | zone_last = ZONE_MOVABLE; | 623 | zone_last = ZONE_MOVABLE; |
| 606 | 624 | ||
| 607 | /* | 625 | /* |
| @@ -615,12 +633,34 @@ static void node_states_check_changes_online(unsigned long nr_pages, | |||
| 615 | else | 633 | else |
| 616 | arg->status_change_nid_normal = -1; | 634 | arg->status_change_nid_normal = -1; |
| 617 | 635 | ||
| 636 | #ifdef CONFIG_HIGHMEM | ||
| 637 | /* | ||
| 638 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
| 639 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
| 640 | * set zone_last to ZONE_HIGHMEM. | ||
| 641 | * | ||
| 642 | * If we don't have movable node, node_states[N_NORMAL_MEMORY] | ||
| 643 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
| 644 | * set zone_last to ZONE_MOVABLE. | ||
| 645 | */ | ||
| 646 | zone_last = ZONE_HIGHMEM; | ||
| 647 | if (N_MEMORY == N_HIGH_MEMORY) | ||
| 648 | zone_last = ZONE_MOVABLE; | ||
| 649 | |||
| 650 | if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY)) | ||
| 651 | arg->status_change_nid_high = nid; | ||
| 652 | else | ||
| 653 | arg->status_change_nid_high = -1; | ||
| 654 | #else | ||
| 655 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
| 656 | #endif | ||
| 657 | |||
| 618 | /* | 658 | /* |
| 619 | * if the node doesn't have memory before being onlined, we will need to | 659 | * if the node doesn't have memory before being onlined, we will need to |
| 620 | * set the node to node_states[N_HIGH_MEMORY] after the memory | 660 | * set the node to node_states[N_MEMORY] after the memory |
| 621 | * is online. | 661 | * is online. |
| 622 | */ | 662 | */ |
| 623 | if (!node_state(nid, N_HIGH_MEMORY)) | 663 | if (!node_state(nid, N_MEMORY)) |
| 624 | arg->status_change_nid = nid; | 664 | arg->status_change_nid = nid; |
| 625 | else | 665 | else |
| 626 | arg->status_change_nid = -1; | 666 | arg->status_change_nid = -1; |
| @@ -631,7 +671,10 @@ static void node_states_set_node(int node, struct memory_notify *arg) | |||
| 631 | if (arg->status_change_nid_normal >= 0) | 671 | if (arg->status_change_nid_normal >= 0) |
| 632 | node_set_state(node, N_NORMAL_MEMORY); | 672 | node_set_state(node, N_NORMAL_MEMORY); |
| 633 | 673 | ||
| 634 | node_set_state(node, N_HIGH_MEMORY); | 674 | if (arg->status_change_nid_high >= 0) |
| 675 | node_set_state(node, N_HIGH_MEMORY); | ||
| 676 | |||
| 677 | node_set_state(node, N_MEMORY); | ||
| 635 | } | 678 | } |
| 636 | 679 | ||
| 637 | 680 | ||
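With status_change_nid_high wired up here, a hotplug notifier can distinguish node_states transitions at three granularities, each field reported as -1 when nothing changes. A hedged sketch of a callback consuming them (the my_node_* helpers are illustrative, not kernel API):

    static int my_mem_callback(struct notifier_block *nb,
                               unsigned long action, void *data)
    {
        struct memory_notify *arg = data;

        if (action == MEM_ONLINE && arg->status_change_nid >= 0)
            my_node_add(arg->status_change_nid);   /* first memory online */
        if (action == MEM_OFFLINE && arg->status_change_nid >= 0)
            my_node_del(arg->status_change_nid);   /* last memory offline */
        /*
         * status_change_nid_normal and status_change_nid_high report
         * N_NORMAL_MEMORY and N_HIGH_MEMORY transitions the same way.
         */
        return NOTIFY_OK;
    }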
| @@ -713,6 +756,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ | |||
| 713 | return ret; | 756 | return ret; |
| 714 | } | 757 | } |
| 715 | 758 | ||
| 759 | zone->managed_pages += onlined_pages; | ||
| 716 | zone->present_pages += onlined_pages; | 760 | zone->present_pages += onlined_pages; |
| 717 | zone->zone_pgdat->node_present_pages += onlined_pages; | 761 | zone->zone_pgdat->node_present_pages += onlined_pages; |
| 718 | if (onlined_pages) { | 762 | if (onlined_pages) { |
| @@ -1066,6 +1110,13 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
| 1066 | return offlined; | 1110 | return offlined; |
| 1067 | } | 1111 | } |
| 1068 | 1112 | ||
| 1113 | #ifdef CONFIG_MOVABLE_NODE | ||
| 1114 | /* with CONFIG_MOVABLE_NODE, a node may stay online without normal memory */ | ||
| 1115 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | ||
| 1116 | { | ||
| 1117 | return true; | ||
| 1118 | } | ||
| 1119 | #else /* #ifdef CONFIG_MOVABLE_NODE */ | ||
| 1069 | /* ensure the node has NORMAL memory if it is still online */ | 1120 | /* ensure the node has NORMAL memory if it is still online */ |
| 1070 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | 1121 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) |
| 1071 | { | 1122 | { |
| @@ -1089,6 +1140,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | |||
| 1089 | */ | 1140 | */ |
| 1090 | return present_pages == 0; | 1141 | return present_pages == 0; |
| 1091 | } | 1142 | } |
| 1143 | #endif /* #ifdef CONFIG_MOVABLE_NODE */ | ||
| 1092 | 1144 | ||
| 1093 | /* check which state of node_states will be changed when offline memory */ | 1145 | /* check which state of node_states will be changed when offline memory */ |
| 1094 | static void node_states_check_changes_offline(unsigned long nr_pages, | 1146 | static void node_states_check_changes_offline(unsigned long nr_pages, |
| @@ -1099,13 +1151,15 @@ static void node_states_check_changes_offline(unsigned long nr_pages, | |||
| 1099 | enum zone_type zt, zone_last = ZONE_NORMAL; | 1151 | enum zone_type zt, zone_last = ZONE_NORMAL; |
| 1100 | 1152 | ||
| 1101 | /* | 1153 | /* |
| 1102 | * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 1154 | * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY] |
| 1103 | * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. | 1155 | * contains nodes which have zones of 0...ZONE_NORMAL, |
| 1156 | * set zone_last to ZONE_NORMAL. | ||
| 1104 | * | 1157 | * |
| 1105 | * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes | 1158 | * If we have neither HIGHMEM nor movable node, |
| 1106 | * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | 1159 | * node_states[N_NORMAL_MEMORY] contains nodes which have zones of |
| 1160 | * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. | ||
| 1107 | */ | 1161 | */ |
| 1108 | if (N_HIGH_MEMORY == N_NORMAL_MEMORY) | 1162 | if (N_MEMORY == N_NORMAL_MEMORY) |
| 1109 | zone_last = ZONE_MOVABLE; | 1163 | zone_last = ZONE_MOVABLE; |
| 1110 | 1164 | ||
| 1111 | /* | 1165 | /* |
| @@ -1122,6 +1176,30 @@ static void node_states_check_changes_offline(unsigned long nr_pages, | |||
| 1122 | else | 1176 | else |
| 1123 | arg->status_change_nid_normal = -1; | 1177 | arg->status_change_nid_normal = -1; |
| 1124 | 1178 | ||
| 1179 | #ifdef CONFIG_HIGHMEM | ||
| 1180 | /* | ||
| 1181 | * If we have movable node, node_states[N_HIGH_MEMORY] | ||
| 1182 | * contains nodes which have zones of 0...ZONE_HIGHMEM, | ||
| 1183 | * set zone_last to ZONE_HIGHMEM. | ||
| 1184 | * | ||
| 1185 | * If we don't have movable node, node_states[N_HIGH_MEMORY] | ||
| 1186 | * contains nodes which have zones of 0...ZONE_MOVABLE, | ||
| 1187 | * set zone_last to ZONE_MOVABLE. | ||
| 1188 | */ | ||
| 1189 | zone_last = ZONE_HIGHMEM; | ||
| 1190 | if (N_MEMORY == N_HIGH_MEMORY) | ||
| 1191 | zone_last = ZONE_MOVABLE; | ||
| 1192 | |||
| 1193 | for (; zt <= zone_last; zt++) | ||
| 1194 | present_pages += pgdat->node_zones[zt].present_pages; | ||
| 1195 | if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) | ||
| 1196 | arg->status_change_nid_high = zone_to_nid(zone); | ||
| 1197 | else | ||
| 1198 | arg->status_change_nid_high = -1; | ||
| 1199 | #else | ||
| 1200 | arg->status_change_nid_high = arg->status_change_nid_normal; | ||
| 1201 | #endif | ||
| 1202 | |||
| 1125 | /* | 1203 | /* |
| 1126 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE | 1204 | * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE |
| 1127 | */ | 1205 | */ |
| @@ -1146,9 +1224,13 @@ static void node_states_clear_node(int node, struct memory_notify *arg) | |||
| 1146 | if (arg->status_change_nid_normal >= 0) | 1224 | if (arg->status_change_nid_normal >= 0) |
| 1147 | node_clear_state(node, N_NORMAL_MEMORY); | 1225 | node_clear_state(node, N_NORMAL_MEMORY); |
| 1148 | 1226 | ||
| 1149 | if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && | 1227 | if ((N_MEMORY != N_NORMAL_MEMORY) && |
| 1150 | (arg->status_change_nid >= 0)) | 1228 | (arg->status_change_nid_high >= 0)) |
| 1151 | node_clear_state(node, N_HIGH_MEMORY); | 1229 | node_clear_state(node, N_HIGH_MEMORY); |
| 1230 | |||
| 1231 | if ((N_MEMORY != N_HIGH_MEMORY) && | ||
| 1232 | (arg->status_change_nid >= 0)) | ||
| 1233 | node_clear_state(node, N_MEMORY); | ||
| 1152 | } | 1234 | } |
| 1153 | 1235 | ||
| 1154 | static int __ref __offline_pages(unsigned long start_pfn, | 1236 | static int __ref __offline_pages(unsigned long start_pfn, |
| @@ -1248,6 +1330,7 @@ repeat: | |||
| 1248 | /* reset pagetype flags and makes migrate type to be MOVABLE */ | 1330 | /* reset pagetype flags and makes migrate type to be MOVABLE */ |
| 1249 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); | 1331 | undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); |
| 1250 | /* removal success */ | 1332 | /* removal success */ |
| 1333 | zone->managed_pages -= offlined_pages; | ||
| 1251 | zone->present_pages -= offlined_pages; | 1334 | zone->present_pages -= offlined_pages; |
| 1252 | zone->zone_pgdat->node_present_pages -= offlined_pages; | 1335 | zone->zone_pgdat->node_present_pages -= offlined_pages; |
| 1253 | totalram_pages -= offlined_pages; | 1336 | totalram_pages -= offlined_pages; |
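Subsystems observe these node-state transitions through the memory-hotplug notifier chain; a minimal sketch of a callback consuming the new field (the callback name and message are hypothetical):

    #include <linux/memory.h>
    #include <linux/notifier.h>
    #include <linux/printk.h>

    /* Hypothetical notifier: react when a node gains high memory. */
    static int example_mem_callback(struct notifier_block *self,
                                    unsigned long action, void *data)
    {
        struct memory_notify *arg = data;

        if (action == MEM_ONLINE && arg->status_change_nid_high >= 0)
            pr_info("node %d now has high memory\n",
                    arg->status_change_nid_high);
        return NOTIFY_OK;
    }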
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 05b28361a39..aaf54566cb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -212,9 +212,9 @@ static int mpol_set_nodemask(struct mempolicy *pol, | |||
| 212 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ | 212 | /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ |
| 213 | if (pol == NULL) | 213 | if (pol == NULL) |
| 214 | return 0; | 214 | return 0; |
| 215 | /* Check N_HIGH_MEMORY */ | 215 | /* Check N_MEMORY */ |
| 216 | nodes_and(nsc->mask1, | 216 | nodes_and(nsc->mask1, |
| 217 | cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); | 217 | cpuset_current_mems_allowed, node_states[N_MEMORY]); |
| 218 | 218 | ||
| 219 | VM_BUG_ON(!nodes); | 219 | VM_BUG_ON(!nodes); |
| 220 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) | 220 | if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) |
| @@ -511,7 +511,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
| 511 | pmd = pmd_offset(pud, addr); | 511 | pmd = pmd_offset(pud, addr); |
| 512 | do { | 512 | do { |
| 513 | next = pmd_addr_end(addr, end); | 513 | next = pmd_addr_end(addr, end); |
| 514 | split_huge_page_pmd(vma->vm_mm, pmd); | 514 | split_huge_page_pmd(vma, addr, pmd); |
| 515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 515 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
| 516 | continue; | 516 | continue; |
| 517 | if (check_pte_range(vma, pmd, addr, next, nodes, | 517 | if (check_pte_range(vma, pmd, addr, next, nodes, |
| @@ -1388,7 +1388,7 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, | |||
| 1388 | goto out_put; | 1388 | goto out_put; |
| 1389 | } | 1389 | } |
| 1390 | 1390 | ||
| 1391 | if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { | 1391 | if (!nodes_subset(*new, node_states[N_MEMORY])) { |
| 1392 | err = -EINVAL; | 1392 | err = -EINVAL; |
| 1393 | goto out_put; | 1393 | goto out_put; |
| 1394 | } | 1394 | } |
| @@ -2326,7 +2326,7 @@ void __init numa_policy_init(void) | |||
| 2326 | * fall back to the largest node if they're all smaller. | 2326 | * fall back to the largest node if they're all smaller. |
| 2327 | */ | 2327 | */ |
| 2328 | nodes_clear(interleave_nodes); | 2328 | nodes_clear(interleave_nodes); |
| 2329 | for_each_node_state(nid, N_HIGH_MEMORY) { | 2329 | for_each_node_state(nid, N_MEMORY) { |
| 2330 | unsigned long total_pages = node_present_pages(nid); | 2330 | unsigned long total_pages = node_present_pages(nid); |
| 2331 | 2331 | ||
| 2332 | /* Preserve the largest node */ | 2332 | /* Preserve the largest node */ |
| @@ -2407,7 +2407,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2407 | *nodelist++ = '\0'; | 2407 | *nodelist++ = '\0'; |
| 2408 | if (nodelist_parse(nodelist, nodes)) | 2408 | if (nodelist_parse(nodelist, nodes)) |
| 2409 | goto out; | 2409 | goto out; |
| 2410 | if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY])) | 2410 | if (!nodes_subset(nodes, node_states[N_MEMORY])) |
| 2411 | goto out; | 2411 | goto out; |
| 2412 | } else | 2412 | } else |
| 2413 | nodes_clear(nodes); | 2413 | nodes_clear(nodes); |
| @@ -2441,7 +2441,7 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) | |||
| 2441 | * Default to online nodes with memory if no nodelist | 2441 | * Default to online nodes with memory if no nodelist |
| 2442 | */ | 2442 | */ |
| 2443 | if (!nodelist) | 2443 | if (!nodelist) |
| 2444 | nodes = node_states[N_HIGH_MEMORY]; | 2444 | nodes = node_states[N_MEMORY]; |
| 2445 | break; | 2445 | break; |
| 2446 | case MPOL_LOCAL: | 2446 | case MPOL_LOCAL: |
| 2447 | /* | 2447 | /* |
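All of these mm/mempolicy.c call sites enforce one rule, now against N_MEMORY: a user-supplied nodemask may only name nodes that actually have memory. Condensed into a hypothetical helper:

    #include <linux/errno.h>
    #include <linux/nodemask.h>

    /* Hypothetical condensation of the shared validation pattern. */
    static int validate_user_nodes(const nodemask_t *user_nodes)
    {
        if (!nodes_subset(*user_nodes, node_states[N_MEMORY]))
            return -EINVAL;        /* names a memoryless node */
        return 0;
    }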
diff --git a/mm/migrate.c b/mm/migrate.c index 3f675ca0827..cae02711181 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -1238,7 +1238,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, | |||
| 1238 | if (node < 0 || node >= MAX_NUMNODES) | 1238 | if (node < 0 || node >= MAX_NUMNODES) |
| 1239 | goto out_pm; | 1239 | goto out_pm; |
| 1240 | 1240 | ||
| 1241 | if (!node_state(node, N_HIGH_MEMORY)) | 1241 | if (!node_state(node, N_MEMORY)) |
| 1242 | goto out_pm; | 1242 | goto out_pm; |
| 1243 | 1243 | ||
| 1244 | err = -EACCES; | 1244 | err = -EACCES; |
| @@ -1488,7 +1488,11 @@ munmap_back: | |||
| 1488 | * | 1488 | * |
| 1489 | * Answer: Yes, several device drivers can do it in their | 1489 | * Answer: Yes, several device drivers can do it in their |
| 1490 | * f_op->mmap method. -DaveM | 1490 | * f_op->mmap method. -DaveM |
| 1491 | * Bug: If addr is changed, prev, rb_link, rb_parent should | ||
| 1492 | * be updated for vma_link() | ||
| 1491 | */ | 1493 | */ |
| 1494 | WARN_ON_ONCE(addr != vma->vm_start); | ||
| 1495 | |||
| 1492 | addr = vma->vm_start; | 1496 | addr = vma->vm_start; |
| 1493 | pgoff = vma->vm_pgoff; | 1497 | pgoff = vma->vm_pgoff; |
| 1494 | vm_flags = vma->vm_flags; | 1498 | vm_flags = vma->vm_flags; |
| @@ -2065,6 +2069,18 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
| 2065 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { | 2069 | if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { |
| 2066 | error = acct_stack_growth(vma, size, grow); | 2070 | error = acct_stack_growth(vma, size, grow); |
| 2067 | if (!error) { | 2071 | if (!error) { |
| 2072 | /* | ||
| 2073 | * vma_gap_update() doesn't support concurrent | ||
| 2074 | * updates, but we only hold a shared mmap_sem | ||
| 2075 | * lock here, so we need to protect against | ||
| 2076 | * concurrent vma expansions. | ||
| 2077 | * vma_lock_anon_vma() doesn't help here, as | ||
| 2078 | * we don't guarantee that all growable vmas | ||
| 2079 | * in a mm share the same root anon vma. | ||
| 2080 | * So, we reuse mm->page_table_lock to guard | ||
| 2081 | * against concurrent vma expansions. | ||
| 2082 | */ | ||
| 2083 | spin_lock(&vma->vm_mm->page_table_lock); | ||
| 2068 | anon_vma_interval_tree_pre_update_vma(vma); | 2084 | anon_vma_interval_tree_pre_update_vma(vma); |
| 2069 | vma->vm_end = address; | 2085 | vma->vm_end = address; |
| 2070 | anon_vma_interval_tree_post_update_vma(vma); | 2086 | anon_vma_interval_tree_post_update_vma(vma); |
| @@ -2072,6 +2088,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) | |||
| 2072 | vma_gap_update(vma->vm_next); | 2088 | vma_gap_update(vma->vm_next); |
| 2073 | else | 2089 | else |
| 2074 | vma->vm_mm->highest_vm_end = address; | 2090 | vma->vm_mm->highest_vm_end = address; |
| 2091 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
| 2092 | |||
| 2075 | perf_event_mmap(vma); | 2093 | perf_event_mmap(vma); |
| 2076 | } | 2094 | } |
| 2077 | } | 2095 | } |
| @@ -2122,11 +2140,25 @@ int expand_downwards(struct vm_area_struct *vma, | |||
| 2122 | if (grow <= vma->vm_pgoff) { | 2140 | if (grow <= vma->vm_pgoff) { |
| 2123 | error = acct_stack_growth(vma, size, grow); | 2141 | error = acct_stack_growth(vma, size, grow); |
| 2124 | if (!error) { | 2142 | if (!error) { |
| 2143 | /* | ||
| 2144 | * vma_gap_update() doesn't support concurrent | ||
| 2145 | * updates, but we only hold a shared mmap_sem | ||
| 2146 | * lock here, so we need to protect against | ||
| 2147 | * concurrent vma expansions. | ||
| 2148 | * vma_lock_anon_vma() doesn't help here, as | ||
| 2149 | * we don't guarantee that all growable vmas | ||
| 2150 | * in a mm share the same root anon vma. | ||
| 2151 | * So, we reuse mm->page_table_lock to guard | ||
| 2152 | * against concurrent vma expansions. | ||
| 2153 | */ | ||
| 2154 | spin_lock(&vma->vm_mm->page_table_lock); | ||
| 2125 | anon_vma_interval_tree_pre_update_vma(vma); | 2155 | anon_vma_interval_tree_pre_update_vma(vma); |
| 2126 | vma->vm_start = address; | 2156 | vma->vm_start = address; |
| 2127 | vma->vm_pgoff -= grow; | 2157 | vma->vm_pgoff -= grow; |
| 2128 | anon_vma_interval_tree_post_update_vma(vma); | 2158 | anon_vma_interval_tree_post_update_vma(vma); |
| 2129 | vma_gap_update(vma); | 2159 | vma_gap_update(vma); |
| 2160 | spin_unlock(&vma->vm_mm->page_table_lock); | ||
| 2161 | |||
| 2130 | perf_event_mmap(vma); | 2162 | perf_event_mmap(vma); |
| 2131 | } | 2163 | } |
| 2132 | } | 2164 | } |
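Both growth directions wrap the vma update in the same critical section; condensed, the shared pattern is (a sketch reusing the names from the hunks above; in mm/mmap.c these interval-tree helpers are file-local):

    spin_lock(&vma->vm_mm->page_table_lock);
    anon_vma_interval_tree_pre_update_vma(vma);
    /* ... move vm_end up, or vm_start/vm_pgoff down ... */
    anon_vma_interval_tree_post_update_vma(vma);
    vma_gap_update(vma);                 /* rbtree gap bookkeeping */
    spin_unlock(&vma->vm_mm->page_table_lock);

page_table_lock stands in for a per-anon_vma lock because, as the added comment notes, growable vmas in one mm need not share an anon_vma root.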
diff --git a/mm/mprotect.c b/mm/mprotect.c index a40992610ab..e8c3938db6f 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -90,7 +90,7 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
| 90 | next = pmd_addr_end(addr, end); | 90 | next = pmd_addr_end(addr, end); |
| 91 | if (pmd_trans_huge(*pmd)) { | 91 | if (pmd_trans_huge(*pmd)) { |
| 92 | if (next - addr != HPAGE_PMD_SIZE) | 92 | if (next - addr != HPAGE_PMD_SIZE) |
| 93 | split_huge_page_pmd(vma->vm_mm, pmd); | 93 | split_huge_page_pmd(vma, addr, pmd); |
| 94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) | 94 | else if (change_huge_pmd(vma, pmd, addr, newprot)) |
| 95 | continue; | 95 | continue; |
| 96 | /* fall through */ | 96 | /* fall through */ |
diff --git a/mm/mremap.c b/mm/mremap.c index 1b61c2d3307..eabb24da6c9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -182,7 +182,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
| 182 | need_flush = true; | 182 | need_flush = true; |
| 183 | continue; | 183 | continue; |
| 184 | } else if (!err) { | 184 | } else if (!err) { |
| 185 | split_huge_page_pmd(vma->vm_mm, old_pmd); | 185 | split_huge_page_pmd(vma, old_addr, old_pmd); |
| 186 | } | 186 | } |
| 187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | 187 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); |
| 188 | } | 188 | } |
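The split_huge_page_pmd() conversion repeated across mm/mempolicy.c, mm/mprotect.c, mm/mremap.c and mm/pagewalk.c switches the interface from (mm, pmd) to (vma, address, pmd), preparing for the huge-zero-page work visible in the thp_zero_page_alloc counters added to mm/vmstat.c below. The calling conventions side by side (fragments matching the hunks):

    /* old: mm-based, the vma was out of reach */
    split_huge_page_pmd(vma->vm_mm, pmd);

    /* new: vma plus the address being operated on */
    split_huge_page_pmd(vma, addr, pmd);

    /* for walkers that genuinely only have the mm (mm/pagewalk.c) */
    split_huge_page_pmd_mm(mm, addr, pmd);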
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bd82f6b3141..b8294fc03df 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
| @@ -137,6 +137,22 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
| 137 | return count; | 137 | return count; |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) | ||
| 141 | { | ||
| 142 | struct zone *z; | ||
| 143 | |||
| 144 | /* | ||
| 145 | * In free_area_init_core(), highmem zone's managed_pages is set to | ||
| 146 | * present_pages, and the bootmem allocator doesn't allocate from | ||
| 147 | * highmem zones. So there's no need to recalculate managed_pages | ||
| 148 | * because all highmem pages will be managed by the buddy system. | ||
| 149 | * "Highmem" here also covers a movable zone placed in highmem. | ||
| 150 | */ | ||
| 151 | for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) | ||
| 152 | if (!is_highmem(z)) | ||
| 153 | z->managed_pages = 0; | ||
| 154 | } | ||
| 155 | |||
| 140 | /** | 156 | /** |
| 141 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 157 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
| 142 | * @pgdat: node to be released | 158 | * @pgdat: node to be released |
| @@ -146,6 +162,7 @@ unsigned long __init free_low_memory_core_early(int nodeid) | |||
| 146 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 162 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
| 147 | { | 163 | { |
| 148 | register_page_bootmem_info_node(pgdat); | 164 | register_page_bootmem_info_node(pgdat); |
| 165 | reset_node_lowmem_managed_pages(pgdat); | ||
| 149 | 166 | ||
| 150 | /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ | 167 | /* free_low_memory_core_early(MAX_NUMNODES) will be called later */ |
| 151 | return 0; | 168 | return 0; |
| @@ -158,6 +175,11 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
| 158 | */ | 175 | */ |
| 159 | unsigned long __init free_all_bootmem(void) | 176 | unsigned long __init free_all_bootmem(void) |
| 160 | { | 177 | { |
| 178 | struct pglist_data *pgdat; | ||
| 179 | |||
| 180 | for_each_online_pgdat(pgdat) | ||
| 181 | reset_node_lowmem_managed_pages(pgdat); | ||
| 182 | |||
| 161 | /* | 183 | /* |
| 162 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id | 184 | * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id |
| 163 | * because in some case like Node0 doesn't have RAM installed | 185 | * because in some case like Node0 doesn't have RAM installed |
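Together with the __free_pages_bootmem() change in mm/page_alloc.c below, the boot-time sequence for the new counter is: zero the lowmem managed_pages first, then let every page released to the buddy allocator rebuild the count. Roughly (a sketch, not literal kernel code):

    /* 1. zero lowmem counters; highmem keeps its init-time value */
    for_each_online_pgdat(pgdat)
        reset_node_lowmem_managed_pages(pgdat);

    /* 2. every range the bootmem allocator releases re-accumulates it */
    __free_pages_bootmem(page, order);   /* managed_pages += 1 << order */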
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 18f1ae2b45d..0399f146ae4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -215,7 +215,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
| 215 | * the page allocator means a mempolicy is in effect. Cpuset policy | 215 | * the page allocator means a mempolicy is in effect. Cpuset policy |
| 216 | * is enforced in get_page_from_freelist(). | 216 | * is enforced in get_page_from_freelist(). |
| 217 | */ | 217 | */ |
| 218 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { | 218 | if (nodemask && !nodes_subset(node_states[N_MEMORY], *nodemask)) { |
| 219 | *totalpages = total_swap_pages; | 219 | *totalpages = total_swap_pages; |
| 220 | for_each_node_mask(nid, *nodemask) | 220 | for_each_node_mask(nid, *nodemask) |
| 221 | *totalpages += node_spanned_pages(nid); | 221 | *totalpages += node_spanned_pages(nid); |
| @@ -591,43 +591,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 591 | spin_unlock(&zone_scan_lock); | 591 | spin_unlock(&zone_scan_lock); |
| 592 | } | 592 | } |
| 593 | 593 | ||
| 594 | /* | ||
| 595 | * Try to acquire the oom killer lock for all system zones. Returns zero if a | ||
| 596 | * parallel oom killing is taking place, otherwise locks all zones and returns | ||
| 597 | * non-zero. | ||
| 598 | */ | ||
| 599 | static int try_set_system_oom(void) | ||
| 600 | { | ||
| 601 | struct zone *zone; | ||
| 602 | int ret = 1; | ||
| 603 | |||
| 604 | spin_lock(&zone_scan_lock); | ||
| 605 | for_each_populated_zone(zone) | ||
| 606 | if (zone_is_oom_locked(zone)) { | ||
| 607 | ret = 0; | ||
| 608 | goto out; | ||
| 609 | } | ||
| 610 | for_each_populated_zone(zone) | ||
| 611 | zone_set_flag(zone, ZONE_OOM_LOCKED); | ||
| 612 | out: | ||
| 613 | spin_unlock(&zone_scan_lock); | ||
| 614 | return ret; | ||
| 615 | } | ||
| 616 | |||
| 617 | /* | ||
| 618 | * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation | ||
| 619 | * attempts or page faults may now recall the oom killer, if necessary. | ||
| 620 | */ | ||
| 621 | static void clear_system_oom(void) | ||
| 622 | { | ||
| 623 | struct zone *zone; | ||
| 624 | |||
| 625 | spin_lock(&zone_scan_lock); | ||
| 626 | for_each_populated_zone(zone) | ||
| 627 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | ||
| 628 | spin_unlock(&zone_scan_lock); | ||
| 629 | } | ||
| 630 | |||
| 631 | /** | 594 | /** |
| 632 | * out_of_memory - kill the "best" process when we run out of memory | 595 | * out_of_memory - kill the "best" process when we run out of memory |
| 633 | * @zonelist: zonelist pointer | 596 | * @zonelist: zonelist pointer |
| @@ -708,15 +671,16 @@ out: | |||
| 708 | 671 | ||
| 709 | /* | 672 | /* |
| 710 | * The pagefault handler calls here because it is out of memory, so kill a | 673 | * The pagefault handler calls here because it is out of memory, so kill a |
| 711 | * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel | 674 | * memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a |
| 712 | * oom killing is already in progress so do nothing. If a task is found with | 675 | * parallel oom killing is already in progress so do nothing. |
| 713 | * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. | ||
| 714 | */ | 676 | */ |
| 715 | void pagefault_out_of_memory(void) | 677 | void pagefault_out_of_memory(void) |
| 716 | { | 678 | { |
| 717 | if (try_set_system_oom()) { | 679 | struct zonelist *zonelist = node_zonelist(first_online_node, |
| 680 | GFP_KERNEL); | ||
| 681 | |||
| 682 | if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) { | ||
| 718 | out_of_memory(NULL, 0, 0, NULL, false); | 683 | out_of_memory(NULL, 0, 0, NULL, false); |
| 719 | clear_system_oom(); | 684 | clear_zonelist_oom(zonelist, GFP_KERNEL); |
| 720 | } | 685 | } |
| 721 | schedule_timeout_killable(1); | ||
| 722 | } | 686 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee3efa58c9..83637dfba11 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
| 90 | #ifdef CONFIG_HIGHMEM | 90 | #ifdef CONFIG_HIGHMEM |
| 91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
| 92 | #endif | 92 | #endif |
| 93 | #ifdef CONFIG_MOVABLE_NODE | ||
| 94 | [N_MEMORY] = { { [0] = 1UL } }, | ||
| 95 | #endif | ||
| 93 | [N_CPU] = { { [0] = 1UL } }, | 96 | [N_CPU] = { { [0] = 1UL } }, |
| 94 | #endif /* NUMA */ | 97 | #endif /* NUMA */ |
| 95 | }; | 98 | }; |
| @@ -732,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
| 732 | local_irq_restore(flags); | 735 | local_irq_restore(flags); |
| 733 | } | 736 | } |
| 734 | 737 | ||
| 738 | /* | ||
| 739 | * Read access to zone->managed_pages is safe because it's unsigned long, | ||
| 740 | * but we still need to serialize writers. Currently all callers of | ||
| 741 | * __free_pages_bootmem() except put_page_bootmem() run only at boot | ||
| 742 | * time. So, to keep boot fast, we shift the serialization burden | ||
| 743 | * to put_page_bootmem(). | ||
| 744 | */ | ||
| 735 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 745 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
| 736 | { | 746 | { |
| 737 | unsigned int nr_pages = 1 << order; | 747 | unsigned int nr_pages = 1 << order; |
| @@ -747,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
| 747 | set_page_count(p, 0); | 757 | set_page_count(p, 0); |
| 748 | } | 758 | } |
| 749 | 759 | ||
| 760 | page_zone(page)->managed_pages += 1 << order; | ||
| 750 | set_page_refcounted(page); | 761 | set_page_refcounted(page); |
| 751 | __free_pages(page, order); | 762 | __free_pages(page, order); |
| 752 | } | 763 | } |
| @@ -1695,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
| 1695 | * | 1706 | * |
| 1696 | * If the zonelist cache is present in the passed in zonelist, then | 1707 | * If the zonelist cache is present in the passed in zonelist, then |
| 1697 | * returns a pointer to the allowed node mask (either the current | 1708 | * returns a pointer to the allowed node mask (either the current |
| 1698 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) | 1709 | * tasks mems_allowed, or node_states[N_MEMORY].) |
| 1699 | * | 1710 | * |
| 1700 | * If the zonelist cache is not available for this zonelist, does | 1711 | * If the zonelist cache is not available for this zonelist, does |
| 1701 | * nothing and returns NULL. | 1712 | * nothing and returns NULL. |
| @@ -1724,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
| 1724 | 1735 | ||
| 1725 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1736 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
| 1726 | &cpuset_current_mems_allowed : | 1737 | &cpuset_current_mems_allowed : |
| 1727 | &node_states[N_HIGH_MEMORY]; | 1738 | &node_states[N_MEMORY]; |
| 1728 | return allowednodes; | 1739 | return allowednodes; |
| 1729 | } | 1740 | } |
| 1730 | 1741 | ||
| @@ -2981,6 +2992,7 @@ void show_free_areas(unsigned int filter) | |||
| 2981 | " isolated(anon):%lukB" | 2992 | " isolated(anon):%lukB" |
| 2982 | " isolated(file):%lukB" | 2993 | " isolated(file):%lukB" |
| 2983 | " present:%lukB" | 2994 | " present:%lukB" |
| 2995 | " managed:%lukB" | ||
| 2984 | " mlocked:%lukB" | 2996 | " mlocked:%lukB" |
| 2985 | " dirty:%lukB" | 2997 | " dirty:%lukB" |
| 2986 | " writeback:%lukB" | 2998 | " writeback:%lukB" |
| @@ -3010,6 +3022,7 @@ void show_free_areas(unsigned int filter) | |||
| 3010 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 3022 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
| 3011 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | 3023 | K(zone_page_state(zone, NR_ISOLATED_FILE)), |
| 3012 | K(zone->present_pages), | 3024 | K(zone->present_pages), |
| 3025 | K(zone->managed_pages), | ||
| 3013 | K(zone_page_state(zone, NR_MLOCK)), | 3026 | K(zone_page_state(zone, NR_MLOCK)), |
| 3014 | K(zone_page_state(zone, NR_FILE_DIRTY)), | 3027 | K(zone_page_state(zone, NR_FILE_DIRTY)), |
| 3015 | K(zone_page_state(zone, NR_WRITEBACK)), | 3028 | K(zone_page_state(zone, NR_WRITEBACK)), |
| @@ -3238,7 +3251,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
| 3238 | return node; | 3251 | return node; |
| 3239 | } | 3252 | } |
| 3240 | 3253 | ||
| 3241 | for_each_node_state(n, N_HIGH_MEMORY) { | 3254 | for_each_node_state(n, N_MEMORY) { |
| 3242 | 3255 | ||
| 3243 | /* Don't want a node to appear more than once */ | 3256 | /* Don't want a node to appear more than once */ |
| 3244 | if (node_isset(n, *used_node_mask)) | 3257 | if (node_isset(n, *used_node_mask)) |
| @@ -3380,7 +3393,7 @@ static int default_zonelist_order(void) | |||
| 3380 | * local memory, NODE_ORDER may be suitable. | 3393 | * local memory, NODE_ORDER may be suitable. |
| 3381 | */ | 3394 | */ |
| 3382 | average_size = total_size / | 3395 | average_size = total_size / |
| 3383 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 3396 | (nodes_weight(node_states[N_MEMORY]) + 1); |
| 3384 | for_each_online_node(nid) { | 3397 | for_each_online_node(nid) { |
| 3385 | low_kmem_size = 0; | 3398 | low_kmem_size = 0; |
| 3386 | total_size = 0; | 3399 | total_size = 0; |
| @@ -4476,6 +4489,26 @@ void __init set_pageblock_order(void) | |||
| 4476 | 4489 | ||
| 4477 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4490 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
| 4478 | 4491 | ||
| 4492 | static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | ||
| 4493 | unsigned long present_pages) | ||
| 4494 | { | ||
| 4495 | unsigned long pages = spanned_pages; | ||
| 4496 | |||
| 4497 | /* | ||
| 4498 | * Provide a more accurate estimation if there are holes within | ||
| 4499 | * the zone and SPARSEMEM is in use. If there are holes within the | ||
| 4500 | * zone, each populated memory region may cost us one or two extra | ||
| 4501 | * memmap pages due to alignment because memmap pages for each | ||
| 4502 | * populated region may not be naturally aligned on a page boundary. | ||
| 4503 | * So the (present_pages >> 4) heuristic is a tradeoff for that. | ||
| 4504 | */ | ||
| 4505 | if (spanned_pages > present_pages + (present_pages >> 4) && | ||
| 4506 | IS_ENABLED(CONFIG_SPARSEMEM)) | ||
| 4507 | pages = present_pages; | ||
| 4508 | |||
| 4509 | return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; | ||
| 4510 | } | ||
| 4511 | |||
| 4479 | /* | 4512 | /* |
| 4480 | * Set up the zone data structures: | 4513 | * Set up the zone data structures: |
| 4481 | * - mark all pages reserved | 4514 | * - mark all pages reserved |
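The present_pages >> 4 slack means the fallback to present_pages only triggers once holes exceed one sixteenth of the span (about 6%). A worked example under stated assumptions (4 KiB pages, 64-byte struct page; numbers are illustrative):

    /* spanned = 1048576 pages (a 4 GiB span)
     * present =  917504 pages (a 512 MiB hole)
     * spanned > present + (present >> 4) == 974848  ->  use present
     * memmap  = PAGE_ALIGN(917504 * 64) >> PAGE_SHIFT
     *         = 14336 pages (56 MiB), not 16384 for the full span
     */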
| @@ -4499,48 +4532,56 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
| 4499 | 4532 | ||
| 4500 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4533 | for (j = 0; j < MAX_NR_ZONES; j++) { |
| 4501 | struct zone *zone = pgdat->node_zones + j; | 4534 | struct zone *zone = pgdat->node_zones + j; |
| 4502 | unsigned long size, realsize, memmap_pages; | 4535 | unsigned long size, realsize, freesize, memmap_pages; |
| 4503 | 4536 | ||
| 4504 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4537 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
| 4505 | realsize = size - zone_absent_pages_in_node(nid, j, | 4538 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, |
| 4506 | zholes_size); | 4539 | zholes_size); |
| 4507 | 4540 | ||
| 4508 | /* | 4541 | /* |
| 4509 | * Adjust realsize so that it accounts for how much memory | 4542 | * Adjust freesize so that it accounts for how much memory |
| 4510 | * is used by this zone for memmap. This affects the watermark | 4543 | * is used by this zone for memmap. This affects the watermark |
| 4511 | * and per-cpu initialisations | 4544 | * and per-cpu initialisations |
| 4512 | */ | 4545 | */ |
| 4513 | memmap_pages = | 4546 | memmap_pages = calc_memmap_size(size, realsize); |
| 4514 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 4547 | if (freesize >= memmap_pages) { |
| 4515 | if (realsize >= memmap_pages) { | 4548 | freesize -= memmap_pages; |
| 4516 | realsize -= memmap_pages; | ||
| 4517 | if (memmap_pages) | 4549 | if (memmap_pages) |
| 4518 | printk(KERN_DEBUG | 4550 | printk(KERN_DEBUG |
| 4519 | " %s zone: %lu pages used for memmap\n", | 4551 | " %s zone: %lu pages used for memmap\n", |
| 4520 | zone_names[j], memmap_pages); | 4552 | zone_names[j], memmap_pages); |
| 4521 | } else | 4553 | } else |
| 4522 | printk(KERN_WARNING | 4554 | printk(KERN_WARNING |
| 4523 | " %s zone: %lu pages exceeds realsize %lu\n", | 4555 | " %s zone: %lu pages exceeds freesize %lu\n", |
| 4524 | zone_names[j], memmap_pages, realsize); | 4556 | zone_names[j], memmap_pages, freesize); |
| 4525 | 4557 | ||
| 4526 | /* Account for reserved pages */ | 4558 | /* Account for reserved pages */ |
| 4527 | if (j == 0 && realsize > dma_reserve) { | 4559 | if (j == 0 && freesize > dma_reserve) { |
| 4528 | realsize -= dma_reserve; | 4560 | freesize -= dma_reserve; |
| 4529 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 4561 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
| 4530 | zone_names[0], dma_reserve); | 4562 | zone_names[0], dma_reserve); |
| 4531 | } | 4563 | } |
| 4532 | 4564 | ||
| 4533 | if (!is_highmem_idx(j)) | 4565 | if (!is_highmem_idx(j)) |
| 4534 | nr_kernel_pages += realsize; | 4566 | nr_kernel_pages += freesize; |
| 4535 | nr_all_pages += realsize; | 4567 | /* Charge for highmem memmap if there are enough kernel pages */ |
| 4568 | else if (nr_kernel_pages > memmap_pages * 2) | ||
| 4569 | nr_kernel_pages -= memmap_pages; | ||
| 4570 | nr_all_pages += freesize; | ||
| 4536 | 4571 | ||
| 4537 | zone->spanned_pages = size; | 4572 | zone->spanned_pages = size; |
| 4538 | zone->present_pages = realsize; | 4573 | zone->present_pages = freesize; |
| 4574 | /* | ||
| 4575 | * Set an approximate value for lowmem here, it will be adjusted | ||
| 4576 | * when the bootmem allocator frees pages into the buddy system. | ||
| 4577 | * And all highmem pages will be managed by the buddy system. | ||
| 4578 | */ | ||
| 4579 | zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; | ||
| 4539 | #ifdef CONFIG_NUMA | 4580 | #ifdef CONFIG_NUMA |
| 4540 | zone->node = nid; | 4581 | zone->node = nid; |
| 4541 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4582 | zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) |
| 4542 | / 100; | 4583 | / 100; |
| 4543 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 4584 | zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; |
| 4544 | #endif | 4585 | #endif |
| 4545 | zone->name = zone_names[j]; | 4586 | zone->name = zone_names[j]; |
| 4546 | spin_lock_init(&zone->lock); | 4587 | spin_lock_init(&zone->lock); |
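After this rework, realsize keeps the raw non-hole page count while freesize is what survives the memmap and DMA deductions; a highmem zone keeps managed_pages = realsize because its memmap is charged against lowmem (nr_kernel_pages) instead. The bookkeeping compressed into a few lines (sketch of the hunk above):

    realsize = freesize = size - holes;
    freesize -= calc_memmap_size(size, realsize);   /* memmap cost */
    if (j == 0 && freesize > dma_reserve)
        freesize -= dma_reserve;                    /* ZONE_DMA reservation */
    zone->present_pages = freesize;
    zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;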
| @@ -4731,7 +4772,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
| 4731 | /* | 4772 | /* |
| 4732 | * early_calculate_totalpages() | 4773 | * early_calculate_totalpages() |
| 4733 | * Sum pages in active regions for movable zone. | 4774 | * Sum pages in active regions for movable zone. |
| 4734 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 4775 | * Populate N_MEMORY for calculating usable_nodes. |
| 4735 | */ | 4776 | */ |
| 4736 | static unsigned long __init early_calculate_totalpages(void) | 4777 | static unsigned long __init early_calculate_totalpages(void) |
| 4737 | { | 4778 | { |
| @@ -4744,7 +4785,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
| 4744 | 4785 | ||
| 4745 | totalpages += pages; | 4786 | totalpages += pages; |
| 4746 | if (pages) | 4787 | if (pages) |
| 4747 | node_set_state(nid, N_HIGH_MEMORY); | 4788 | node_set_state(nid, N_MEMORY); |
| 4748 | } | 4789 | } |
| 4749 | return totalpages; | 4790 | return totalpages; |
| 4750 | } | 4791 | } |
| @@ -4761,9 +4802,9 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
| 4761 | unsigned long usable_startpfn; | 4802 | unsigned long usable_startpfn; |
| 4762 | unsigned long kernelcore_node, kernelcore_remaining; | 4803 | unsigned long kernelcore_node, kernelcore_remaining; |
| 4763 | /* save the state before borrow the nodemask */ | 4804 | /* save the state before borrow the nodemask */ |
| 4764 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | 4805 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
| 4765 | unsigned long totalpages = early_calculate_totalpages(); | 4806 | unsigned long totalpages = early_calculate_totalpages(); |
| 4766 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4807 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
| 4767 | 4808 | ||
| 4768 | /* | 4809 | /* |
| 4769 | * If movablecore was specified, calculate what size of | 4810 | * If movablecore was specified, calculate what size of |
| @@ -4798,7 +4839,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
| 4798 | restart: | 4839 | restart: |
| 4799 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 4840 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
| 4800 | kernelcore_node = required_kernelcore / usable_nodes; | 4841 | kernelcore_node = required_kernelcore / usable_nodes; |
| 4801 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4842 | for_each_node_state(nid, N_MEMORY) { |
| 4802 | unsigned long start_pfn, end_pfn; | 4843 | unsigned long start_pfn, end_pfn; |
| 4803 | 4844 | ||
| 4804 | /* | 4845 | /* |
| @@ -4890,23 +4931,27 @@ restart: | |||
| 4890 | 4931 | ||
| 4891 | out: | 4932 | out: |
| 4892 | /* restore the node_state */ | 4933 | /* restore the node_state */ |
| 4893 | node_states[N_HIGH_MEMORY] = saved_node_state; | 4934 | node_states[N_MEMORY] = saved_node_state; |
| 4894 | } | 4935 | } |
| 4895 | 4936 | ||
| 4896 | /* Any regular memory on that node ? */ | 4937 | /* Any regular or high memory on that node ? */ |
| 4897 | static void __init check_for_regular_memory(pg_data_t *pgdat) | 4938 | static void check_for_memory(pg_data_t *pgdat, int nid) |
| 4898 | { | 4939 | { |
| 4899 | #ifdef CONFIG_HIGHMEM | ||
| 4900 | enum zone_type zone_type; | 4940 | enum zone_type zone_type; |
| 4901 | 4941 | ||
| 4902 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4942 | if (N_MEMORY == N_NORMAL_MEMORY) |
| 4943 | return; | ||
| 4944 | |||
| 4945 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | ||
| 4903 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4946 | struct zone *zone = &pgdat->node_zones[zone_type]; |
| 4904 | if (zone->present_pages) { | 4947 | if (zone->present_pages) { |
| 4905 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4948 | node_set_state(nid, N_HIGH_MEMORY); |
| 4949 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | ||
| 4950 | zone_type <= ZONE_NORMAL) | ||
| 4951 | node_set_state(nid, N_NORMAL_MEMORY); | ||
| 4906 | break; | 4952 | break; |
| 4907 | } | 4953 | } |
| 4908 | } | 4954 | } |
| 4909 | #endif | ||
| 4910 | } | 4955 | } |
| 4911 | 4956 | ||
| 4912 | /** | 4957 | /** |
| @@ -4989,8 +5034,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 4989 | 5034 | ||
| 4990 | /* Any memory on that node */ | 5035 | /* Any memory on that node */ |
| 4991 | if (pgdat->node_present_pages) | 5036 | if (pgdat->node_present_pages) |
| 4992 | node_set_state(nid, N_HIGH_MEMORY); | 5037 | node_set_state(nid, N_MEMORY); |
| 4993 | check_for_regular_memory(pgdat); | 5038 | check_for_memory(pgdat, nid); |
| 4994 | } | 5039 | } |
| 4995 | } | 5040 | } |
| 4996 | 5041 | ||
| @@ -5727,7 +5772,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
| 5727 | unsigned int tries = 0; | 5772 | unsigned int tries = 0; |
| 5728 | int ret = 0; | 5773 | int ret = 0; |
| 5729 | 5774 | ||
| 5730 | migrate_prep_local(); | 5775 | migrate_prep(); |
| 5731 | 5776 | ||
| 5732 | while (pfn < end || !list_empty(&cc->migratepages)) { | 5777 | while (pfn < end || !list_empty(&cc->migratepages)) { |
| 5733 | if (fatal_signal_pending(current)) { | 5778 | if (fatal_signal_pending(current)) { |
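migrate_prep() differs from the dropped migrate_prep_local() only in scope: it drains the per-CPU LRU-add caches on every CPU rather than just the caller's, which alloc_contig_range() needs because target pages may sit on another CPU's pagevec. Paraphrased from mm/migrate.c:

    int migrate_prep(void)       { lru_add_drain_all(); return 0; } /* all CPUs */
    int migrate_prep_local(void) { lru_add_drain();     return 0; } /* this CPU */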
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 44db00e253e..6d757e3a872 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
| @@ -274,7 +274,7 @@ void __init page_cgroup_init(void) | |||
| 274 | if (mem_cgroup_disabled()) | 274 | if (mem_cgroup_disabled()) |
| 275 | return; | 275 | return; |
| 276 | 276 | ||
| 277 | for_each_node_state(nid, N_HIGH_MEMORY) { | 277 | for_each_node_state(nid, N_MEMORY) { |
| 278 | unsigned long start_pfn, end_pfn; | 278 | unsigned long start_pfn, end_pfn; |
| 279 | 279 | ||
| 280 | start_pfn = node_start_pfn(nid); | 280 | start_pfn = node_start_pfn(nid); |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 6c118d012bb..35aa294656c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
| @@ -58,7 +58,7 @@ again: | |||
| 58 | if (!walk->pte_entry) | 58 | if (!walk->pte_entry) |
| 59 | continue; | 59 | continue; |
| 60 | 60 | ||
| 61 | split_huge_page_pmd(walk->mm, pmd); | 61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); |
| 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) | 62 | if (pmd_none_or_trans_huge_or_clear_bad(pmd)) |
| 63 | goto again; | 63 | goto again; |
| 64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -1249,12 +1249,14 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
| 1249 | update_hiwater_rss(mm); | 1249 | update_hiwater_rss(mm); |
| 1250 | 1250 | ||
| 1251 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 1251 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
| 1252 | if (PageAnon(page)) | 1252 | if (!PageHuge(page)) { |
| 1253 | dec_mm_counter(mm, MM_ANONPAGES); | 1253 | if (PageAnon(page)) |
| 1254 | else | 1254 | dec_mm_counter(mm, MM_ANONPAGES); |
| 1255 | dec_mm_counter(mm, MM_FILEPAGES); | 1255 | else |
| 1256 | dec_mm_counter(mm, MM_FILEPAGES); | ||
| 1257 | } | ||
| 1256 | set_pte_at(mm, address, pte, | 1258 | set_pte_at(mm, address, pte, |
| 1257 | swp_entry_to_pte(make_hwpoison_entry(page))); | 1259 | swp_entry_to_pte(make_hwpoison_entry(page))); |
| 1258 | } else if (PageAnon(page)) { | 1260 | } else if (PageAnon(page)) { |
| 1259 | swp_entry_t entry = { .val = page_private(page) }; | 1261 | swp_entry_t entry = { .val = page_private(page) }; |
| 1260 | 1262 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index 50c5b8f3a35..03f9ba8fb8e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -1715,6 +1715,96 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
| 1715 | return error; | 1715 | return error; |
| 1716 | } | 1716 | } |
| 1717 | 1717 | ||
| 1718 | /* | ||
| 1719 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
| 1720 | */ | ||
| 1721 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
| 1722 | pgoff_t index, pgoff_t end, int origin) | ||
| 1723 | { | ||
| 1724 | struct page *page; | ||
| 1725 | struct pagevec pvec; | ||
| 1726 | pgoff_t indices[PAGEVEC_SIZE]; | ||
| 1727 | bool done = false; | ||
| 1728 | int i; | ||
| 1729 | |||
| 1730 | pagevec_init(&pvec, 0); | ||
| 1731 | pvec.nr = 1; /* start small: we may be there already */ | ||
| 1732 | while (!done) { | ||
| 1733 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
| 1734 | pvec.nr, pvec.pages, indices); | ||
| 1735 | if (!pvec.nr) { | ||
| 1736 | if (origin == SEEK_DATA) | ||
| 1737 | index = end; | ||
| 1738 | break; | ||
| 1739 | } | ||
| 1740 | for (i = 0; i < pvec.nr; i++, index++) { | ||
| 1741 | if (index < indices[i]) { | ||
| 1742 | if (origin == SEEK_HOLE) { | ||
| 1743 | done = true; | ||
| 1744 | break; | ||
| 1745 | } | ||
| 1746 | index = indices[i]; | ||
| 1747 | } | ||
| 1748 | page = pvec.pages[i]; | ||
| 1749 | if (page && !radix_tree_exceptional_entry(page)) { | ||
| 1750 | if (!PageUptodate(page)) | ||
| 1751 | page = NULL; | ||
| 1752 | } | ||
| 1753 | if (index >= end || | ||
| 1754 | (page && origin == SEEK_DATA) || | ||
| 1755 | (!page && origin == SEEK_HOLE)) { | ||
| 1756 | done = true; | ||
| 1757 | break; | ||
| 1758 | } | ||
| 1759 | } | ||
| 1760 | shmem_deswap_pagevec(&pvec); | ||
| 1761 | pagevec_release(&pvec); | ||
| 1762 | pvec.nr = PAGEVEC_SIZE; | ||
| 1763 | cond_resched(); | ||
| 1764 | } | ||
| 1765 | return index; | ||
| 1766 | } | ||
| 1767 | |||
| 1768 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) | ||
| 1769 | { | ||
| 1770 | struct address_space *mapping = file->f_mapping; | ||
| 1771 | struct inode *inode = mapping->host; | ||
| 1772 | pgoff_t start, end; | ||
| 1773 | loff_t new_offset; | ||
| 1774 | |||
| 1775 | if (origin != SEEK_DATA && origin != SEEK_HOLE) | ||
| 1776 | return generic_file_llseek_size(file, offset, origin, | ||
| 1777 | MAX_LFS_FILESIZE, i_size_read(inode)); | ||
| 1778 | mutex_lock(&inode->i_mutex); | ||
| 1779 | /* We're holding i_mutex so we can access i_size directly */ | ||
| 1780 | |||
| 1781 | if (offset < 0) | ||
| 1782 | offset = -EINVAL; | ||
| 1783 | else if (offset >= inode->i_size) | ||
| 1784 | offset = -ENXIO; | ||
| 1785 | else { | ||
| 1786 | start = offset >> PAGE_CACHE_SHIFT; | ||
| 1787 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1788 | new_offset = shmem_seek_hole_data(mapping, start, end, origin); | ||
| 1789 | new_offset <<= PAGE_CACHE_SHIFT; | ||
| 1790 | if (new_offset > offset) { | ||
| 1791 | if (new_offset < inode->i_size) | ||
| 1792 | offset = new_offset; | ||
| 1793 | else if (origin == SEEK_DATA) | ||
| 1794 | offset = -ENXIO; | ||
| 1795 | else | ||
| 1796 | offset = inode->i_size; | ||
| 1797 | } | ||
| 1798 | } | ||
| 1799 | |||
| 1800 | if (offset >= 0 && offset != file->f_pos) { | ||
| 1801 | file->f_pos = offset; | ||
| 1802 | file->f_version = 0; | ||
| 1803 | } | ||
| 1804 | mutex_unlock(&inode->i_mutex); | ||
| 1805 | return offset; | ||
| 1806 | } | ||
| 1807 | |||
| 1718 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1808 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
| 1719 | loff_t len) | 1809 | loff_t len) |
| 1720 | { | 1810 | { |
| @@ -2586,7 +2676,7 @@ static const struct address_space_operations shmem_aops = { | |||
| 2586 | static const struct file_operations shmem_file_operations = { | 2676 | static const struct file_operations shmem_file_operations = { |
| 2587 | .mmap = shmem_mmap, | 2677 | .mmap = shmem_mmap, |
| 2588 | #ifdef CONFIG_TMPFS | 2678 | #ifdef CONFIG_TMPFS |
| 2589 | .llseek = generic_file_llseek, | 2679 | .llseek = shmem_file_llseek, |
| 2590 | .read = do_sync_read, | 2680 | .read = do_sync_read, |
| 2591 | .write = do_sync_write, | 2681 | .write = do_sync_write, |
| 2592 | .aio_read = shmem_file_aio_read, | 2682 | .aio_read = shmem_file_aio_read, |
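With shmem_file_llseek() wired into shmem_file_operations, tmpfs now answers lseek(SEEK_DATA/SEEK_HOLE) from its radix tree instead of falling back to generic_file_llseek(). A small hypothetical userspace probe (assumes a libc that exposes SEEK_DATA/SEEK_HOLE):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Print the first data and first hole offset of a sparse tmpfs file. */
    int main(int argc, char **argv)
    {
        int fd;
        off_t data, hole;

        if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0)
            return 1;
        data = lseek(fd, 0, SEEK_DATA); /* -1 (ENXIO) if no data found */
        hole = lseek(fd, 0, SEEK_HOLE); /* i_size if the file has no hole */
        printf("first data: %lld, first hole: %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
    }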
diff --git a/mm/vmscan.c b/mm/vmscan.c index 157bb116dec..7f3096137b8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -3131,7 +3131,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
| 3131 | int nid; | 3131 | int nid; |
| 3132 | 3132 | ||
| 3133 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 3133 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
| 3134 | for_each_node_state(nid, N_HIGH_MEMORY) { | 3134 | for_each_node_state(nid, N_MEMORY) { |
| 3135 | pg_data_t *pgdat = NODE_DATA(nid); | 3135 | pg_data_t *pgdat = NODE_DATA(nid); |
| 3136 | const struct cpumask *mask; | 3136 | const struct cpumask *mask; |
| 3137 | 3137 | ||
| @@ -3187,7 +3187,7 @@ static int __init kswapd_init(void) | |||
| 3187 | int nid; | 3187 | int nid; |
| 3188 | 3188 | ||
| 3189 | swap_setup(); | 3189 | swap_setup(); |
| 3190 | for_each_node_state(nid, N_HIGH_MEMORY) | 3190 | for_each_node_state(nid, N_MEMORY) |
| 3191 | kswapd_run(nid); | 3191 | kswapd_run(nid); |
| 3192 | hotcpu_notifier(cpu_callback, 0); | 3192 | hotcpu_notifier(cpu_callback, 0); |
| 3193 | return 0; | 3193 | return 0; |
diff --git a/mm/vmstat.c b/mm/vmstat.c index c7370579111..df14808f0a3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -801,6 +801,8 @@ const char * const vmstat_text[] = { | |||
| 801 | "thp_collapse_alloc", | 801 | "thp_collapse_alloc", |
| 802 | "thp_collapse_alloc_failed", | 802 | "thp_collapse_alloc_failed", |
| 803 | "thp_split", | 803 | "thp_split", |
| 804 | "thp_zero_page_alloc", | ||
| 805 | "thp_zero_page_alloc_failed", | ||
| 804 | #endif | 806 | #endif |
| 805 | 807 | ||
| 806 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ | 808 | #endif /* CONFIG_VM_EVENTS_COUNTERS */ |
| @@ -930,7 +932,7 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg) | |||
| 930 | pg_data_t *pgdat = (pg_data_t *)arg; | 932 | pg_data_t *pgdat = (pg_data_t *)arg; |
| 931 | 933 | ||
| 932 | /* check memoryless node */ | 934 | /* check memoryless node */ |
| 933 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 935 | if (!node_state(pgdat->node_id, N_MEMORY)) |
| 934 | return 0; | 936 | return 0; |
| 935 | 937 | ||
| 936 | seq_printf(m, "Page block order: %d\n", pageblock_order); | 938 | seq_printf(m, "Page block order: %d\n", pageblock_order); |
| @@ -992,14 +994,16 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
| 992 | "\n high %lu" | 994 | "\n high %lu" |
| 993 | "\n scanned %lu" | 995 | "\n scanned %lu" |
| 994 | "\n spanned %lu" | 996 | "\n spanned %lu" |
| 995 | "\n present %lu", | 997 | "\n present %lu" |
| 998 | "\n managed %lu", | ||
| 996 | zone_page_state(zone, NR_FREE_PAGES), | 999 | zone_page_state(zone, NR_FREE_PAGES), |
| 997 | min_wmark_pages(zone), | 1000 | min_wmark_pages(zone), |
| 998 | low_wmark_pages(zone), | 1001 | low_wmark_pages(zone), |
| 999 | high_wmark_pages(zone), | 1002 | high_wmark_pages(zone), |
| 1000 | zone->pages_scanned, | 1003 | zone->pages_scanned, |
| 1001 | zone->spanned_pages, | 1004 | zone->spanned_pages, |
| 1002 | zone->present_pages); | 1005 | zone->present_pages, |
| 1006 | zone->managed_pages); | ||
| 1003 | 1007 | ||
| 1004 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 1008 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
| 1005 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 1009 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
| @@ -1292,7 +1296,7 @@ static int unusable_show(struct seq_file *m, void *arg) | |||
| 1292 | pg_data_t *pgdat = (pg_data_t *)arg; | 1296 | pg_data_t *pgdat = (pg_data_t *)arg; |
| 1293 | 1297 | ||
| 1294 | /* check memoryless node */ | 1298 | /* check memoryless node */ |
| 1295 | if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) | 1299 | if (!node_state(pgdat->node_id, N_MEMORY)) |
| 1296 | return 0; | 1300 | return 0; |
| 1297 | 1301 | ||
| 1298 | walk_zones_in_node(m, pgdat, unusable_show_print); | 1302 | walk_zones_in_node(m, pgdat, unusable_show_print); |
