177 files changed, 4103 insertions, 2866 deletions
diff --git a/Documentation/ABI/stable/sysfs-devices-node b/Documentation/ABI/stable/sysfs-devices-node
index ce259c13c36a..5b2d0f08867c 100644
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -85,14 +85,6 @@ Description:
         will be compacted. When it completes, memory will be freed
         into blocks which have as many contiguous pages as possible

-What: /sys/devices/system/node/nodeX/scan_unevictable_pages
-Date: October 2008
-Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
-Description:
-        When set, it triggers scanning the node's unevictable lists
-        and move any pages that have become evictable onto the respective
-        zone's inactive list. See mm/vmscan.c
-
 What: /sys/devices/system/node/nodeX/hugepages/hugepages-<size>/
 Date: December 2009
 Contact: Lee Schermerhorn <lee.schermerhorn@hp.com>
diff --git a/Documentation/ABI/testing/sysfs-block-zram b/Documentation/ABI/testing/sysfs-block-zram
index 70ec992514d0..a6148eaf91e5 100644
--- a/Documentation/ABI/testing/sysfs-block-zram
+++ b/Documentation/ABI/testing/sysfs-block-zram
@@ -77,11 +77,14 @@ What: /sys/block/zram<id>/notify_free
 Date: August 2010
 Contact: Nitin Gupta <ngupta@vflare.org>
 Description:
-        The notify_free file is read-only and specifies the number of
-        swap slot free notifications received by this device. These
-        notifications are sent to a swap block device when a swap slot
-        is freed. This statistic is applicable only when this disk is
-        being used as a swap disk.
+        The notify_free file is read-only. Depending on device usage
+        scenario it may account a) the number of pages freed because
+        of swap slot free notifications or b) the number of pages freed
+        because of REQ_DISCARD requests sent by bio. The former ones
+        are sent to a swap block device when a swap slot is freed, which
+        implies that this disk is being used as a swap disk. The latter
+        ones are sent by filesystem mounted with discard option,
+        whenever some data blocks are getting discarded.

 What: /sys/block/zram<id>/zero_pages
 Date: August 2010
@@ -119,3 +122,22 @@ Description:
         efficiency can be calculated using compr_data_size and this
         statistic.
         Unit: bytes
+
+What: /sys/block/zram<id>/mem_used_max
+Date: August 2014
+Contact: Minchan Kim <minchan@kernel.org>
+Description:
+        The mem_used_max file is read/write and specifies the amount
+        of maximum memory zram have consumed to store compressed data.
+        For resetting the value, you should write "0". Otherwise,
+        you could see -EINVAL.
+        Unit: bytes
+
+What: /sys/block/zram<id>/mem_limit
+Date: August 2014
+Contact: Minchan Kim <minchan@kernel.org>
+Description:
+        The mem_limit file is read/write and specifies the maximum
+        amount of memory ZRAM can use to store the compressed data. The
+        limit could be changed in run time and "0" means disable the
+        limit. No limit is the initial state. Unit: bytes
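An illustrative shell session for the two new attributes documented above (not part of the patch itself; it assumes a zram device has already been set up as /dev/zram0):

        # cap the memory used for compressed data at 64 MB; writing "0" removes the cap
        echo 64M > /sys/block/zram0/mem_limit

        # peak amount of memory zram has used for compressed data, in bytes
        cat /sys/block/zram0/mem_used_max

        # reset the high-water mark; any value other than "0" returns -EINVAL
        echo 0 > /sys/block/zram0/mem_used_max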
diff --git a/Documentation/ABI/testing/sysfs-devices-memory b/Documentation/ABI/testing/sysfs-devices-memory
index 7405de26ee60..deef3b5723cf 100644
--- a/Documentation/ABI/testing/sysfs-devices-memory
+++ b/Documentation/ABI/testing/sysfs-devices-memory
@@ -61,6 +61,14 @@ Users: hotplug memory remove tools
         http://www.ibm.com/developerworks/wikis/display/LinuxP/powerpc-utils


+What: /sys/devices/system/memory/memoryX/valid_zones
+Date: July 2014
+Contact: Zhang Zhen <zhenzhang.zhang@huawei.com>
+Description:
+        The file /sys/devices/system/memory/memoryX/valid_zones is
+        read-only and is designed to show which zone this memory
+        block can be onlined to.
+
 What: /sys/devices/system/memoryX/nodeY
 Date: October 2009
 Contact: Linux Memory Management list <linux-mm@kvack.org>
diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 0595c3f56ccf..7fcf9c6592ec 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -74,14 +74,30 @@ There is little point creating a zram of greater than twice the size of memory
 since we expect a 2:1 compression ratio. Note that zram uses about 0.1% of the
 size of the disk when not in use so a huge zram is wasteful.

-5) Activate:
+5) Set memory limit: Optional
+        Set memory limit by writing the value to sysfs node 'mem_limit'.
+        The value can be either in bytes or you can use mem suffixes.
+        In addition, you could change the value in runtime.
+        Examples:
+            # limit /dev/zram0 with 50MB memory
+            echo $((50*1024*1024)) > /sys/block/zram0/mem_limit
+
+            # Using mem suffixes
+            echo 256K > /sys/block/zram0/mem_limit
+            echo 512M > /sys/block/zram0/mem_limit
+            echo 1G > /sys/block/zram0/mem_limit
+
+            # To disable memory limit
+            echo 0 > /sys/block/zram0/mem_limit
+
+6) Activate:
         mkswap /dev/zram0
         swapon /dev/zram0

         mkfs.ext4 /dev/zram1
         mount /dev/zram1 /tmp

-6) Stats:
+7) Stats:
         Per-device statistics are exported as various nodes under
         /sys/block/zram<id>/
                 disksize
@@ -95,12 +111,13 @@ size of the disk when not in use so a huge zram is wasteful.
                 orig_data_size
                 compr_data_size
                 mem_used_total
+                mem_used_max

-7) Deactivate:
+8) Deactivate:
         swapoff /dev/zram0
         umount /dev/zram1

-8) Reset:
+9) Reset:
         Write any positive value to 'reset' sysfs node
         echo 1 > /sys/block/zram0/reset
         echo 1 > /sys/block/zram1/reset
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index cc4ab2517abc..41f7ec1fcf61 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -656,7 +656,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                 Sets the size of kernel global memory area for
                 contiguous memory allocations and optionally the
                 placement constraint by the physical address range of
-                memory allocations. For more information, see
+                memory allocations. A value of 0 disables CMA
+                altogether. For more information, see
                 include/linux/dma-contiguous.h

         cmo_free_hint= [PPC] Format: { yes | no }
@@ -3158,6 +3159,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.

         slram= [HW,MTD]

+        slab_nomerge [MM]
+                Disable merging of slabs with similar size. May be
+                necessary if there is some reason to distinguish
+                allocs to different slabs. Debug options disable
+                merging on their own.
+                For more information see Documentation/vm/slub.txt.
+
         slab_max_order= [MM, SLAB]
                 Determines the maximum allowed order for slabs.
                 A high setting may cause OOMs due to memory
@@ -3193,11 +3201,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                 For more information see Documentation/vm/slub.txt.

         slub_nomerge [MM, SLUB]
-                Disable merging of slabs with similar size. May be
-                necessary if there is some reason to distinguish
-                allocs to different slabs. Debug options disable
-                merging on their own.
-                For more information see Documentation/vm/slub.txt.
+                Same with slab_nomerge. This is supported for legacy.
+                See slab_nomerge for more information.

         smart2= [HW]
                 Format: <io1>[,<io2>[,...,<io8>]]
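Purely as an illustration (not part of the patch): slab_nomerge and the cma= override documented above are ordinary boot parameters, so they can be tried by appending them to the kernel command line and rebooting. The grub file path and update command below are assumptions and vary by distribution.

        # /etc/default/grub (then regenerate the config, e.g. with update-grub, and reboot)
        GRUB_CMDLINE_LINUX="slab_nomerge cma=0"

        # after boot, confirm the parameters were picked up
        cat /proc/cmdline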
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 45134dc23854..ea03abfc97e9 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -155,6 +155,7 @@ Under each memory block, you can see 4 files:
 /sys/devices/system/memory/memoryXXX/phys_device
 /sys/devices/system/memory/memoryXXX/state
 /sys/devices/system/memory/memoryXXX/removable
+/sys/devices/system/memory/memoryXXX/valid_zones

 'phys_index' : read-only and contains memory block id, same as XXX.
 'state' : read-write
@@ -170,6 +171,15 @@ Under each memory block, you can see 4 files:
               block is removable and a value of 0 indicates that
               it is not removable. A memory block is removable only if
               every section in the block is removable.
+'valid_zones' : read-only: designed to show which zones this memory block
+              can be onlined to.
+              The first column shows it's default zone.
+              "memory6/valid_zones: Normal Movable" shows this memoryblock
+              can be onlined to ZONE_NORMAL by default and to ZONE_MOVABLE
+              by online_movable.
+              "memory7/valid_zones: Movable Normal" shows this memoryblock
+              can be onlined to ZONE_MOVABLE by default and to ZONE_NORMAL
+              by online_kernel.

 NOTE:
 These directories/files appear after physical memory hotplug phase.
@@ -408,7 +418,6 @@ node if necessary.
 - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
   sysctl or new control file.
 - showing memory block and physical device relationship.
-- showing memory block is under ZONE_MOVABLE or not
 - test and make it better memory offlining.
 - support HugeTLB page migration and offlining.
 - memmap removing at memory offline.
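A short illustrative session with the new valid_zones file (not part of the patch; the block number memory6 and the sample output are borrowed from the example text above and will differ per machine):

        # zones this block may be onlined to; the first column is the default
        cat /sys/devices/system/memory/memory6/valid_zones
        Normal Movable

        # online the (currently offline) block into ZONE_MOVABLE instead of the default
        echo online_movable > /sys/devices/system/memory/memory6/state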
diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild
index a52cbf178c3a..25b49725df07 100644
--- a/arch/alpha/include/asm/Kbuild
+++ b/arch/alpha/include/asm/Kbuild
@@ -8,4 +8,5 @@ generic-y += irq_work.h
 generic-y += mcs_spinlock.h
 generic-y += preempt.h
 generic-y += scatterlist.h
+generic-y += sections.h
 generic-y += trace_clock.h
diff --git a/arch/alpha/include/asm/sections.h b/arch/alpha/include/asm/sections.h
deleted file mode 100644
index 43b40edd6e44..000000000000
--- a/arch/alpha/include/asm/sections.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef _ALPHA_SECTIONS_H
-#define _ALPHA_SECTIONS_H
-
-/* nothing to see, move along */
-#include <asm-generic/sections.h>
-
-#endif
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d9d32de9628c..18f392f8b744 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -14,6 +14,7 @@ config ARM
         select CLONE_BACKWARDS
         select CPU_PM if (SUSPEND || CPU_IDLE)
         select DCACHE_WORD_ACCESS if HAVE_EFFICIENT_UNALIGNED_ACCESS
+        select GENERIC_ALLOCATOR
         select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI)
         select GENERIC_CLOCKEVENTS_BROADCAST if SMP
         select GENERIC_IDLE_POLL_SETUP
@@ -61,6 +62,7 @@ config ARM
         select HAVE_PERF_EVENTS
         select HAVE_PERF_REGS
         select HAVE_PERF_USER_STACK_DUMP
+        select HAVE_RCU_TABLE_FREE if (SMP && ARM_LPAE)
         select HAVE_REGS_AND_STACK_ACCESS_API
         select HAVE_SYSCALL_TRACEPOINTS
         select HAVE_UID16
@@ -1659,6 +1661,10 @@ config ARCH_SELECT_MEMORY_MODEL
 config HAVE_ARCH_PFN_VALID
         def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM

+config HAVE_GENERIC_RCU_GUP
+        def_bool y
+        depends on ARM_LPAE
+
 config HIGHMEM
         bool "High Memory Support"
         depends on MMU
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
index 219ac88a9542..f0279411847d 100644
--- a/arch/arm/include/asm/pgtable-2level.h
+++ b/arch/arm/include/asm/pgtable-2level.h
@@ -182,6 +182,8 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 #define pmd_addr_end(addr,end) (end)

 #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
+#define pte_special(pte) (0)
+static inline pte_t pte_mkspecial(pte_t pte) { return pte; }

 /*
  * We don't have huge page support for short descriptors, for the moment
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index 06e0bc0f8b00..a31ecdad4b59 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -213,10 +213,19 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 #define pmd_isclear(pmd, val) (!(pmd_val(pmd) & (val)))

 #define pmd_young(pmd) (pmd_isset((pmd), PMD_SECT_AF))
+#define pte_special(pte) (pte_isset((pte), L_PTE_SPECIAL))
+static inline pte_t pte_mkspecial(pte_t pte)
+{
+        pte_val(pte) |= L_PTE_SPECIAL;
+        return pte;
+}
+#define __HAVE_ARCH_PTE_SPECIAL

 #define __HAVE_ARCH_PMD_WRITE
 #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY))
 #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY))
+#define pud_page(pud) pmd_page(__pmd(pud_val(pud)))
+#define pud_write(pud) pmd_write(__pmd(pud_val(pud)))

 #define pmd_hugewillfault(pmd) (!pmd_young(pmd) || !pmd_write(pmd))
 #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd))
@@ -224,6 +233,12 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd) (pmd_val(pmd) && !pmd_table(pmd))
 #define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+                          pmd_t *pmdp);
+#endif
 #endif

 #define PMD_BIT_FUNC(fn,op) \
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index 01baef07cd0c..90aa4583b308 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -226,7 +226,6 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
 #define pte_dirty(pte) (pte_isset((pte), L_PTE_DIRTY))
 #define pte_young(pte) (pte_isset((pte), L_PTE_YOUNG))
 #define pte_exec(pte) (pte_isclear((pte), L_PTE_XN))
-#define pte_special(pte) (0)

 #define pte_valid_user(pte) \
         (pte_valid(pte) && pte_isset((pte), L_PTE_USER) && pte_young(pte))
@@ -245,7 +244,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
         unsigned long ext = 0;

         if (addr < TASK_SIZE && pte_valid_user(pteval)) {
-                __sync_icache_dcache(pteval);
+                if (!pte_special(pteval))
+                        __sync_icache_dcache(pteval);
                 ext |= PTE_EXT_NG;
         }

@@ -264,8 +264,6 @@ PTE_BIT_FUNC(mkyoung, |= L_PTE_YOUNG);
 PTE_BIT_FUNC(mkexec, &= ~L_PTE_XN);
 PTE_BIT_FUNC(mknexec, |= L_PTE_XN);

-static inline pte_t pte_mkspecial(pte_t pte) { return pte; }
-
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
         const pteval_t mask = L_PTE_XN | L_PTE_RDONLY | L_PTE_USER |
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index f1a0dace3efe..3cadb726ec88 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -35,12 +35,39 @@

 #define MMU_GATHER_BUNDLE 8

+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+static inline void __tlb_remove_table(void *_table)
+{
+        free_page_and_swap_cache((struct page *)_table);
+}
+
+struct mmu_table_batch {
+        struct rcu_head rcu;
+        unsigned int nr;
+        void *tables[0];
+};
+
+#define MAX_TABLE_BATCH \
+        ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *))
+
+extern void tlb_table_flush(struct mmu_gather *tlb);
+extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
+
+#define tlb_remove_entry(tlb, entry) tlb_remove_table(tlb, entry)
+#else
+#define tlb_remove_entry(tlb, entry) tlb_remove_page(tlb, entry)
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+
 /*
  * TLB handling. This allows us to remove pages from the page
  * tables, and efficiently handle the TLB issues.
  */
 struct mmu_gather {
         struct mm_struct *mm;
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+        struct mmu_table_batch *batch;
+        unsigned int need_flush;
+#endif
         unsigned int fullmm;
         struct vm_area_struct *vma;
         unsigned long start, end;
@@ -101,6 +128,9 @@ static inline void __tlb_alloc_page(struct mmu_gather *tlb)
 static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 {
         tlb_flush(tlb);
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+        tlb_table_flush(tlb);
+#endif
 }

 static inline void tlb_flush_mmu_free(struct mmu_gather *tlb)
@@ -129,6 +159,10 @@ tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start
         tlb->pages = tlb->local;
         tlb->nr = 0;
         __tlb_alloc_page(tlb);
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+        tlb->batch = NULL;
+#endif
 }

 static inline void
@@ -205,7 +239,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
         tlb_add_flush(tlb, addr + SZ_1M);
 #endif

-        tlb_remove_page(tlb, pte);
+        tlb_remove_entry(tlb, pte);
 }

 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
@@ -213,7 +247,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
 {
 #ifdef CONFIG_ARM_LPAE
         tlb_add_flush(tlb, addr);
-        tlb_remove_page(tlb, virt_to_page(pmdp));
+        tlb_remove_entry(tlb, virt_to_page(pmdp));
 #endif
 }

diff --git a/arch/arm/kernel/hibernate.c b/arch/arm/kernel/hibernate.c
index bb8b79648643..c4cc50e58c13 100644
--- a/arch/arm/kernel/hibernate.c
+++ b/arch/arm/kernel/hibernate.c
@@ -21,8 +21,7 @@
 #include <asm/idmap.h>
 #include <asm/suspend.h>
 #include <asm/memory.h>
-
-extern const void __nosave_begin, __nosave_end;
+#include <asm/sections.h>

 int pfn_is_nosave(unsigned long pfn)
 {
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 7a996aaa061e..c245d903927f 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -12,6 +12,7 @@
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/genalloc.h>
 #include <linux/gfp.h>
 #include <linux/errno.h>
 #include <linux/list.h>
@@ -298,57 +299,29 @@ static void *
 __dma_alloc_remap(struct page *page, size_t size, gfp_t gfp, pgprot_t prot,
         const void *caller)
 {
-        struct vm_struct *area;
-        unsigned long addr;
-
         /*
          * DMA allocation can be mapped to user space, so lets
          * set VM_USERMAP flags too.
          */
-        area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
-                                  caller);
-        if (!area)
-                return NULL;
-        addr = (unsigned long)area->addr;
-        area->phys_addr = __pfn_to_phys(page_to_pfn(page));
-
-        if (ioremap_page_range(addr, addr + size, area->phys_addr, prot)) {
-                vunmap((void *)addr);
-                return NULL;
-        }
-        return (void *)addr;
+        return dma_common_contiguous_remap(page, size,
+                        VM_ARM_DMA_CONSISTENT | VM_USERMAP,
+                        prot, caller);
 }

 static void __dma_free_remap(void *cpu_addr, size_t size)
 {
-        unsigned int flags = VM_ARM_DMA_CONSISTENT | VM_USERMAP;
-        struct vm_struct *area = find_vm_area(cpu_addr);
-        if (!area || (area->flags & flags) != flags) {
-                WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr);
-                return;
-        }
-        unmap_kernel_range((unsigned long)cpu_addr, size);
-        vunmap(cpu_addr);
+        dma_common_free_remap(cpu_addr, size,
+                        VM_ARM_DMA_CONSISTENT | VM_USERMAP);
 }

 #define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
+static struct gen_pool *atomic_pool;

-struct dma_pool {
-        size_t size;
-        spinlock_t lock;
-        unsigned long *bitmap;
-        unsigned long nr_pages;
-        void *vaddr;
-        struct page **pages;
-};
-
-static struct dma_pool atomic_pool = {
-        .size = DEFAULT_DMA_COHERENT_POOL_SIZE,
-};
+static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;

 static int __init early_coherent_pool(char *p)
 {
-        atomic_pool.size = memparse(p, &p);
+        atomic_pool_size = memparse(p, &p);
         return 0;
 }
 early_param("coherent_pool", early_coherent_pool);
@@ -358,14 +331,14 @@ void __init init_dma_coherent_pool_size(unsigned long size)
         /*
          * Catch any attempt to set the pool size too late.
          */
-        BUG_ON(atomic_pool.vaddr);
+        BUG_ON(atomic_pool);

         /*
          * Set architecture specific coherent pool size only if
          * it has not been changed by kernel command line parameter.
          */
-        if (atomic_pool.size == DEFAULT_DMA_COHERENT_POOL_SIZE)
-                atomic_pool.size = size;
+        if (atomic_pool_size == DEFAULT_DMA_COHERENT_POOL_SIZE)
+                atomic_pool_size = size;
 }

 /*
@@ -373,52 +346,44 @@
  */
 static int __init atomic_pool_init(void)
 {
-        struct dma_pool *pool = &atomic_pool;
         pgprot_t prot = pgprot_dmacoherent(PAGE_KERNEL);
         gfp_t gfp = GFP_KERNEL | GFP_DMA;
-        unsigned long nr_pages = pool->size >> PAGE_SHIFT;
-        unsigned long *bitmap;
         struct page *page;
-        struct page **pages;
         void *ptr;
-        int bitmap_size = BITS_TO_LONGS(nr_pages) * sizeof(long);

-        bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-        if (!bitmap)
-                goto no_bitmap;
-
-        pages = kzalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
-        if (!pages)
-                goto no_pages;
+        atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+        if (!atomic_pool)
+                goto out;

         if (dev_get_cma_area(NULL))
-                ptr = __alloc_from_contiguous(NULL, pool->size, prot, &page,
-                                              atomic_pool_init);
+                ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot,
+                                              &page, atomic_pool_init);
         else
-                ptr = __alloc_remap_buffer(NULL, pool->size, gfp, prot, &page,
-                                           atomic_pool_init);
+                ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot,
+                                           &page, atomic_pool_init);
         if (ptr) {
-                int i;
+                int ret;

-                for (i = 0; i < nr_pages; i++)
-                        pages[i] = page + i;
-
-                spin_lock_init(&pool->lock);
-                pool->vaddr = ptr;
-                pool->pages = pages;
-                pool->bitmap = bitmap;
-                pool->nr_pages = nr_pages;
-                pr_info("DMA: preallocated %u KiB pool for atomic coherent allocations\n",
-                        (unsigned)pool->size / 1024);
+                ret = gen_pool_add_virt(atomic_pool, (unsigned long)ptr,
+                                        page_to_phys(page),
+                                        atomic_pool_size, -1);
+                if (ret)
+                        goto destroy_genpool;
+
+                gen_pool_set_algo(atomic_pool,
+                                  gen_pool_first_fit_order_align,
+                                  (void *)PAGE_SHIFT);
+                pr_info("DMA: preallocated %zd KiB pool for atomic coherent allocations\n",
+                        atomic_pool_size / 1024);
                 return 0;
         }

-        kfree(pages);
-no_pages:
-        kfree(bitmap);
-no_bitmap:
-        pr_err("DMA: failed to allocate %u KiB pool for atomic coherent allocation\n",
-               (unsigned)pool->size / 1024);
+destroy_genpool:
+        gen_pool_destroy(atomic_pool);
+        atomic_pool = NULL;
+out:
+        pr_err("DMA: failed to allocate %zx KiB pool for atomic coherent allocation\n",
+               atomic_pool_size / 1024);
         return -ENOMEM;
 }
 /*
@@ -522,76 +487,36 @@ static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,

 static void *__alloc_from_pool(size_t size, struct page **ret_page)
 {
-        struct dma_pool *pool = &atomic_pool;
-        unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
-        unsigned int pageno;
-        unsigned long flags;
+        unsigned long val;
         void *ptr = NULL;
-        unsigned long align_mask;

-        if (!pool->vaddr) {
+        if (!atomic_pool) {
                 WARN(1, "coherent pool not initialised!\n");
                 return NULL;
         }

-        /*
-         * Align the region allocation - allocations from pool are rather
-         * small, so align them to their order in pages, minimum is a page
-         * size. This helps reduce fragmentation of the DMA space.
-         */
-        align_mask = (1 << get_order(size)) - 1;
-
-        spin_lock_irqsave(&pool->lock, flags);
-        pageno = bitmap_find_next_zero_area(pool->bitmap, pool->nr_pages,
-                                            0, count, align_mask);
-        if (pageno < pool->nr_pages) {
-                bitmap_set(pool->bitmap, pageno, count);
-                ptr = pool->vaddr + PAGE_SIZE * pageno;
-                *ret_page = pool->pages[pageno];
-        } else {
-                pr_err_once("ERROR: %u KiB atomic DMA coherent pool is too small!\n"
-                            "Please increase it with coherent_pool= kernel parameter!\n",
-                            (unsigned)pool->size / 1024);
+        val = gen_pool_alloc(atomic_pool, size);
+        if (val) {
+                phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+                *ret_page = phys_to_page(phys);
+                ptr = (void *)val;
         }
-        spin_unlock_irqrestore(&pool->lock, flags);

         return ptr;
 }

 static bool __in_atomic_pool(void *start, size_t size)
 {
-        struct dma_pool *pool = &atomic_pool;
-        void *end = start + size;
-        void *pool_start = pool->vaddr;
-        void *pool_end = pool->vaddr + pool->size;
-
-        if (start < pool_start || start >= pool_end)
-                return false;
-
-        if (end <= pool_end)
-                return true;
-
-        WARN(1, "Wrong coherent size(%p-%p) from atomic pool(%p-%p)\n",
-             start, end - 1, pool_start, pool_end - 1);
-
-        return false;
+        return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
 }

 static int __free_from_pool(void *start, size_t size)
 {
-        struct dma_pool *pool = &atomic_pool;
-        unsigned long pageno, count;
-        unsigned long flags;
-
         if (!__in_atomic_pool(start, size))
                 return 0;

-        pageno = (start - pool->vaddr) >> PAGE_SHIFT;
-        count = size >> PAGE_SHIFT;
-
-        spin_lock_irqsave(&pool->lock, flags);
-        bitmap_clear(pool->bitmap, pageno, count);
-        spin_unlock_irqrestore(&pool->lock, flags);
-
         return 1;
 }
@@ -1271,29 +1196,8 @@ static void *
 __iommu_alloc_remap(struct page **pages, size_t size, gfp_t gfp, pgprot_t prot,
                     const void *caller)
 {
-        unsigned int i, nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
-        struct vm_struct *area;
-        unsigned long p;
-
-        area = get_vm_area_caller(size, VM_ARM_DMA_CONSISTENT | VM_USERMAP,
-                                  caller);
-        if (!area)
-                return NULL;
-
-        area->pages = pages;
-        area->nr_pages = nr_pages;
-        p = (unsigned long)area->addr;
-
-        for (i = 0; i < nr_pages; i++) {
-                phys_addr_t phys = __pfn_to_phys(page_to_pfn(pages[i]));
-                if (ioremap_page_range(p, p + PAGE_SIZE, phys, prot))
-                        goto err;
-                p += PAGE_SIZE;
-        }
-        return area->addr;
-err:
-        unmap_kernel_range((unsigned long)area->addr, size);
-        vunmap(area->addr);
+        return dma_common_pages_remap(pages, size,
+                        VM_ARM_DMA_CONSISTENT | VM_USERMAP, prot, caller);
         return NULL;
 }

@@ -1355,11 +1259,13 @@ static int __iommu_remove_mapping(struct device *dev, dma_addr_t iova, size_t si

 static struct page **__atomic_get_pages(void *addr)
 {
-        struct dma_pool *pool = &atomic_pool;
-        struct page **pages = pool->pages;
-        int offs = (addr - pool->vaddr) >> PAGE_SHIFT;
+        struct page *page;
+        phys_addr_t phys;
+
+        phys = gen_pool_virt_to_phys(atomic_pool, (unsigned long)addr);
+        page = phys_to_page(phys);

-        return pages + offs;
+        return (struct page **)page;
 }

 static struct page **__iommu_get_pages(void *cpu_addr, struct dma_attrs *attrs)
@@ -1501,8 +1407,8 @@ void arm_iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
         }

         if (!dma_get_attr(DMA_ATTR_NO_KERNEL_MAPPING, attrs)) {
-                unmap_kernel_range((unsigned long)cpu_addr, size);
-                vunmap(cpu_addr);
+                dma_common_free_remap(cpu_addr, size,
+                        VM_ARM_DMA_CONSISTENT | VM_USERMAP);
         }

         __iommu_remove_mapping(dev, handle, size);
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 43d54f5b26b9..265b836b3bd1 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -400,3 +400,18 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
          */
         __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
 }
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+                          pmd_t *pmdp)
+{
+        pmd_t pmd = pmd_mksplitting(*pmdp);
+        VM_BUG_ON(address & ~PMD_MASK);
+        set_pmd_at(vma->vm_mm, address, pmdp, pmd);
+
+        /* dummy IPI to serialise against fast_gup */
+        kick_all_cpus_sync();
+}
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 9221645dd192..92bba32d9230 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -322,7 +322,7 @@ void __init arm_memblock_init(const struct machine_desc *mdesc)
          * reserve memory for DMA contigouos allocations,
          * must come from DMA area inside low memory
          */
-        dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
+        dma_contiguous_reserve(arm_dma_limit);

         arm_memblock_steal_permitted = false;
         memblock_dump_all();
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3f0e854d0ff4..c49ca4c738bb 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -18,6 +18,7 @@ config ARM64
         select COMMON_CLK
         select CPU_PM if (SUSPEND || CPU_IDLE)
         select DCACHE_WORD_ACCESS
+        select GENERIC_ALLOCATOR
         select GENERIC_CLOCKEVENTS
         select GENERIC_CLOCKEVENTS_BROADCAST if SMP
         select GENERIC_CPU_AUTOPROBE
@@ -56,6 +57,7 @@ config ARM64
         select HAVE_PERF_EVENTS
         select HAVE_PERF_REGS
         select HAVE_PERF_USER_STACK_DUMP
+        select HAVE_RCU_TABLE_FREE
         select HAVE_SYSCALL_TRACEPOINTS
         select IRQ_DOMAIN
         select MODULES_USE_ELF_RELA
@@ -109,6 +111,9 @@ config GENERIC_CALIBRATE_DELAY
 config ZONE_DMA
         def_bool y

+config HAVE_GENERIC_RCU_GUP
+        def_bool y
+
 config ARCH_DMA_ADDR_T_64BIT
         def_bool y

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 77dbe1e6398d..cefd3e825612 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -244,6 +244,16 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,

 #define __HAVE_ARCH_PTE_SPECIAL

+static inline pte_t pud_pte(pud_t pud)
+{
+        return __pte(pud_val(pud));
+}
+
+static inline pmd_t pud_pmd(pud_t pud)
+{
+        return __pmd(pud_val(pud));
+}
+
 static inline pte_t pmd_pte(pmd_t pmd)
 {
         return __pte(pmd_val(pmd));
@@ -261,7 +271,13 @@ static inline pmd_t pte_pmd(pte_t pte)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd) (pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
 #define pmd_trans_splitting(pmd) pte_special(pmd_pte(pmd))
-#endif
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+struct vm_area_struct;
+void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
+                          pmd_t *pmdp);
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

 #define pmd_young(pmd) pte_young(pmd_pte(pmd))
 #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
@@ -282,6 +298,7 @@ static inline pmd_t pte_pmd(pte_t pte)
 #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot)

 #define pmd_page(pmd) pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
+#define pud_write(pud) pte_write(pud_pte(pud))
 #define pud_pfn(pud) (((pud_val(pud) & PUD_MASK) & PHYS_MASK) >> PAGE_SHIFT)

 #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
@@ -383,6 +400,8 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
         return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(addr);
 }

+#define pud_page(pud) pmd_page(pud_pmd(pud))
+
 #endif /* CONFIG_ARM64_PGTABLE_LEVELS > 2 */

 #if CONFIG_ARM64_PGTABLE_LEVELS > 3
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 62731ef9749a..a82c0c5c8b52 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -23,6 +23,20 @@

 #include <asm-generic/tlb.h>

+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+
+#define tlb_remove_entry(tlb, entry) tlb_remove_table(tlb, entry)
+static inline void __tlb_remove_table(void *_table)
+{
+        free_page_and_swap_cache((struct page *)_table);
+}
+#else
+#define tlb_remove_entry(tlb, entry) tlb_remove_page(tlb, entry)
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+
 /*
  * There's three ways the TLB shootdown code is used:
  * 1. Unmapping a range of vmas. See zap_page_range(), unmap_region().
@@ -88,7 +102,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 {
         pgtable_page_dtor(pte);
         tlb_add_flush(tlb, addr);
-        tlb_remove_page(tlb, pte);
+        tlb_remove_entry(tlb, pte);
 }

 #if CONFIG_ARM64_PGTABLE_LEVELS > 2
@@ -96,7 +110,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp,
                                   unsigned long addr)
 {
         tlb_add_flush(tlb, addr);
-        tlb_remove_page(tlb, virt_to_page(pmdp));
+        tlb_remove_entry(tlb, virt_to_page(pmdp));
 }
 #endif

@@ -105,7 +119,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp,
                                   unsigned long addr)
 {
         tlb_add_flush(tlb, addr);
-        tlb_remove_page(tlb, virt_to_page(pudp));
+        tlb_remove_entry(tlb, virt_to_page(pudp));
 }
 #endif

diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 2c71077cacfd..d92094203913 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -20,6 +20,7 @@
 #include <linux/gfp.h>
 #include <linux/export.h>
 #include <linux/slab.h>
+#include <linux/genalloc.h>
 #include <linux/dma-mapping.h>
 #include <linux/dma-contiguous.h>
 #include <linux/vmalloc.h>
@@ -38,6 +39,54 @@ static pgprot_t __get_dma_pgprot(struct dma_attrs *attrs, pgprot_t prot,
         return prot;
 }

+static struct gen_pool *atomic_pool;
+
+#define DEFAULT_DMA_COHERENT_POOL_SIZE SZ_256K
+static size_t atomic_pool_size = DEFAULT_DMA_COHERENT_POOL_SIZE;
+
+static int __init early_coherent_pool(char *p)
+{
+        atomic_pool_size = memparse(p, &p);
+        return 0;
+}
+early_param("coherent_pool", early_coherent_pool);
+
+static void *__alloc_from_pool(size_t size, struct page **ret_page)
+{
+        unsigned long val;
+        void *ptr = NULL;
+
+        if (!atomic_pool) {
+                WARN(1, "coherent pool not initialised!\n");
+                return NULL;
+        }
+
+        val = gen_pool_alloc(atomic_pool, size);
+        if (val) {
+                phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
+
+                *ret_page = phys_to_page(phys);
+                ptr = (void *)val;
+        }
+
+        return ptr;
+}
+
+static bool __in_atomic_pool(void *start, size_t size)
+{
+        return addr_in_gen_pool(atomic_pool, (unsigned long)start, size);
+}
+
+static int __free_from_pool(void *start, size_t size)
+{
+        if (!__in_atomic_pool(start, size))
+                return 0;
+
+        gen_pool_free(atomic_pool, (unsigned long)start, size);
+
+        return 1;
+}
+
 static void *__dma_alloc_coherent(struct device *dev, size_t size,
                                   dma_addr_t *dma_handle, gfp_t flags,
                                   struct dma_attrs *attrs)
@@ -50,7 +99,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size,
         if (IS_ENABLED(CONFIG_ZONE_DMA) &&
             dev->coherent_dma_mask <= DMA_BIT_MASK(32))
                 flags |= GFP_DMA;
-        if (IS_ENABLED(CONFIG_DMA_CMA)) {
+        if (IS_ENABLED(CONFIG_DMA_CMA) && (flags & __GFP_WAIT)) {
                 struct page *page;

                 size = PAGE_ALIGN(size);
@@ -70,50 +119,54 @@ static void __dma_free_coherent(struct device *dev, size_t size,
                                 void *vaddr, dma_addr_t dma_handle,
                                 struct dma_attrs *attrs)
 {
+        bool freed;
+        phys_addr_t paddr = dma_to_phys(dev, dma_handle);
+
         if (dev == NULL) {
                 WARN_ONCE(1, "Use an actual device structure for DMA allocation\n");
                 return;
         }

-        if (IS_ENABLED(CONFIG_DMA_CMA)) {
-                phys_addr_t paddr = dma_to_phys(dev, dma_handle);
-
-                dma_release_from_contiguous(dev,
+        freed = dma_release_from_contiguous(dev,
                                         phys_to_page(paddr),
                                         size >> PAGE_SHIFT);
-        } else {
+        if (!freed)
                 swiotlb_free_coherent(dev, size, vaddr, dma_handle);
-        }
 }

 static void *__dma_alloc_noncoherent(struct device *dev, size_t size,
                                      dma_addr_t *dma_handle, gfp_t flags,
                                      struct dma_attrs *attrs)
 {
-        struct page *page, **map;
+        struct page *page;
         void *ptr, *coherent_ptr;
-        int order, i;

         size = PAGE_ALIGN(size);
-        order = get_order(size);
+
+        if (!(flags & __GFP_WAIT)) {
+                struct page *page = NULL;
+                void *addr = __alloc_from_pool(size, &page);
+
+                if (addr)
+                        *dma_handle = phys_to_dma(dev, page_to_phys(page));
+
+                return addr;
+
+        }

         ptr = __dma_alloc_coherent(dev, size, dma_handle, flags, attrs);
         if (!ptr)
                 goto no_mem;
-        map = kmalloc(sizeof(struct page *) << order, flags & ~GFP_DMA);
-        if (!map)
-                goto no_map;

         /* remove any dirty cache lines on the kernel alias */
         __dma_flush_range(ptr, ptr + size);

         /* create a coherent mapping */
         page = virt_to_page(ptr);
-        for (i = 0; i < (size >> PAGE_SHIFT); i++)
-                map[i] = page + i;
-        coherent_ptr = vmap(map, size >> PAGE_SHIFT, VM_MAP,
-                            __get_dma_pgprot(attrs, __pgprot(PROT_NORMAL_NC), false));
-        kfree(map);
+        coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
+                                __get_dma_pgprot(attrs,
+                                        __pgprot(PROT_NORMAL_NC), false),
+                                NULL);
         if (!coherent_ptr)
                 goto no_map;

@@ -132,6 +185,8 @@ static void __dma_free_noncoherent(struct device *dev, size_t size,
 {
         void *swiotlb_addr = phys_to_virt(dma_to_phys(dev, dma_handle));

+        if (__free_from_pool(vaddr, size))
+                return;
         vunmap(vaddr);
         __dma_free_coherent(dev, size, swiotlb_addr, dma_handle, attrs);
 }
@@ -307,6 +362,67 @@ EXPORT_SYMBOL(coherent_swiotlb_dma_ops);

 extern int swiotlb_late_init_with_default_size(size_t default_size);

+static int __init atomic_pool_init(void)
+{
+        pgprot_t prot = __pgprot(PROT_NORMAL_NC);
+        unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
+        struct page *page;
+        void *addr;
+        unsigned int pool_size_order = get_order(atomic_pool_size);
+
+        if (dev_get_cma_area(NULL))
+                page = dma_alloc_from_contiguous(NULL, nr_pages,
+                                                 pool_size_order);
+        else
+                page = alloc_pages(GFP_DMA, pool_size_order);
+
+        if (page) {
+                int ret;
+                void *page_addr = page_address(page);
+
+                memset(page_addr, 0, atomic_pool_size);
+                __dma_flush_range(page_addr, page_addr + atomic_pool_size);
+
+                atomic_pool = gen_pool_create(PAGE_SHIFT, -1);
+                if (!atomic_pool)
+                        goto free_page;
+
+                addr = dma_common_contiguous_remap(page, atomic_pool_size,
+                                        VM_USERMAP, prot, atomic_pool_init);
+
+                if (!addr)
+                        goto destroy_genpool;
+
+                ret = gen_pool_add_virt(atomic_pool, (unsigned long)addr,
+                                        page_to_phys(page),
+                                        atomic_pool_size, -1);
+                if (ret)
+                        goto remove_mapping;
+
+                gen_pool_set_algo(atomic_pool,
+                                  gen_pool_first_fit_order_align,
+                                  (void *)PAGE_SHIFT);
+
+                pr_info("DMA: preallocated %zu KiB pool for atomic allocations\n",
+                        atomic_pool_size / 1024);
+                return 0;
+        }
+        goto out;
+
+remove_mapping:
+        dma_common_free_remap(addr, atomic_pool_size, VM_USERMAP);
+destroy_genpool:
+        gen_pool_destroy(atomic_pool);
+        atomic_pool = NULL;
+free_page:
+        if (!dma_release_from_contiguous(NULL, page, nr_pages))
+                __free_pages(page, pool_size_order);
+out:
+        pr_err("DMA: failed to allocate %zu KiB pool for atomic coherent allocation\n",
+                atomic_pool_size / 1024);
+        return -ENOMEM;
+}
+
 static int __init swiotlb_late_init(void)
 {
         size_t swiotlb_size = min(SZ_64M, MAX_ORDER_NR_PAGES << PAGE_SHIFT);
@@ -315,7 +431,17 @@ static int __init swiotlb_late_init(void) | |||
315 | 431 | ||
316 | return swiotlb_late_init_with_default_size(swiotlb_size); | 432 | return swiotlb_late_init_with_default_size(swiotlb_size); |
317 | } | 433 | } |
318 | arch_initcall(swiotlb_late_init); | 434 | |
435 | static int __init arm64_dma_init(void) | ||
436 | { | ||
437 | int ret = 0; | ||
438 | |||
439 | ret |= swiotlb_late_init(); | ||
440 | ret |= atomic_pool_init(); | ||
441 | |||
442 | return ret; | ||
443 | } | ||
444 | arch_initcall(arm64_dma_init); | ||
319 | 445 | ||
320 | #define PREALLOC_DMA_DEBUG_ENTRIES 4096 | 446 | #define PREALLOC_DMA_DEBUG_ENTRIES 4096 |
321 | 447 | ||
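Editor's note: the atomic_pool_init()/__alloc_from_pool() path added above follows a common pattern -- reserve a chunk of memory once at init time, then satisfy small non-blocking (!__GFP_WAIT) allocations from it without ever sleeping. The user-space sketch below shows the same idea with a trivial per-page bitmap in place of the kernel's genpool; pool size, page size and helper names are illustrative, not the kernel API.

#include <stdio.h>
#include <string.h>

#define POOL_PAGE_SIZE 4096
#define POOL_PAGES     64                  /* 256 KiB pool, illustrative only */

static unsigned char pool[POOL_PAGES * POOL_PAGE_SIZE];
static unsigned char pool_used[POOL_PAGES];    /* 1 = page in use */

/* Serve an allocation from the preallocated pool; never calls malloc(). */
static void *pool_alloc(size_t size)
{
    size_t pages = (size + POOL_PAGE_SIZE - 1) / POOL_PAGE_SIZE;

    for (size_t i = 0; i + pages <= POOL_PAGES; i++) {
        size_t j;

        for (j = 0; j < pages && !pool_used[i + j]; j++)
            ;
        if (j == pages) {                  /* found a free run of pages */
            memset(&pool_used[i], 1, pages);
            return &pool[i * POOL_PAGE_SIZE];
        }
        i += j;                            /* skip past the used page */
    }
    return NULL;                           /* pool exhausted */
}

static void pool_free(void *ptr, size_t size)
{
    size_t first = ((unsigned char *)ptr - pool) / POOL_PAGE_SIZE;
    size_t pages = (size + POOL_PAGE_SIZE - 1) / POOL_PAGE_SIZE;

    memset(&pool_used[first], 0, pages);
}

int main(void)
{
    void *a = pool_alloc(3 * POOL_PAGE_SIZE);
    void *b = pool_alloc(POOL_PAGE_SIZE);

    printf("a=%p b=%p\n", a, b);
    pool_free(a, 3 * POOL_PAGE_SIZE);
    pool_free(b, POOL_PAGE_SIZE);
    return 0;
}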
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 0d64089d28b5..b6f14e8d2121 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c | |||
@@ -104,3 +104,19 @@ EXPORT_SYMBOL(flush_dcache_page); | |||
104 | */ | 104 | */ |
105 | EXPORT_SYMBOL(flush_cache_all); | 105 | EXPORT_SYMBOL(flush_cache_all); |
106 | EXPORT_SYMBOL(flush_icache_range); | 106 | EXPORT_SYMBOL(flush_icache_range); |
107 | |||
108 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
109 | #ifdef CONFIG_HAVE_RCU_TABLE_FREE | ||
110 | void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | ||
111 | pmd_t *pmdp) | ||
112 | { | ||
113 | pmd_t pmd = pmd_mksplitting(*pmdp); | ||
114 | |||
115 | VM_BUG_ON(address & ~PMD_MASK); | ||
116 | set_pmd_at(vma->vm_mm, address, pmdp, pmd); | ||
117 | |||
118 | /* dummy IPI to serialise against fast_gup */ | ||
119 | kick_all_cpus_sync(); | ||
120 | } | ||
121 | #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ | ||
122 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
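Editor's note: pmdp_splitting_flush() above publishes the splitting state and then uses kick_all_cpus_sync() purely as a synchronisation point, so that lockless fast_gup walkers either observe the new state or have already left the fast path. The user-space sketch below is only a rough analogue of that idea: a reader count stands in for the dummy IPI, and every name is invented for illustration.

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int splitting;        /* analogue of the PMD "splitting" bit    */
static atomic_int fastpath_users;   /* readers currently in the lockless path */

/* Lockless reader: only trust the entry if it is not being split. */
static int fast_lookup(void)
{
    int ok;

    atomic_fetch_add(&fastpath_users, 1);
    ok = !atomic_load(&splitting);
    atomic_fetch_sub(&fastpath_users, 1);
    return ok;
}

/* Writer: publish the splitting state, then wait out in-flight readers
 * (the stand-in for kick_all_cpus_sync()). */
static void begin_split(void)
{
    atomic_store(&splitting, 1);
    while (atomic_load(&fastpath_users) != 0)
        ;
}

static void *reader(void *arg)
{
    (void)arg;
    for (int i = 0; i < 100000; i++)
        fast_lookup();
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, reader, NULL);
    usleep(1000);
    begin_split();
    printf("split published, no lockless reader still active\n");
    pthread_join(t, NULL);
    return 0;
}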
diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index 802b94c4ca86..2ca489eaadd3 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild | |||
@@ -15,6 +15,7 @@ generic-y += mcs_spinlock.h | |||
15 | generic-y += module.h | 15 | generic-y += module.h |
16 | generic-y += preempt.h | 16 | generic-y += preempt.h |
17 | generic-y += scatterlist.h | 17 | generic-y += scatterlist.h |
18 | generic-y += sections.h | ||
18 | generic-y += trace_clock.h | 19 | generic-y += trace_clock.h |
19 | generic-y += vga.h | 20 | generic-y += vga.h |
20 | generic-y += xor.h | 21 | generic-y += xor.h |
diff --git a/arch/cris/include/asm/sections.h b/arch/cris/include/asm/sections.h deleted file mode 100644 index 2c998ce8967b..000000000000 --- a/arch/cris/include/asm/sections.h +++ /dev/null | |||
@@ -1,7 +0,0 @@ | |||
1 | #ifndef _CRIS_SECTIONS_H | ||
2 | #define _CRIS_SECTIONS_H | ||
3 | |||
4 | /* nothing to see, move along */ | ||
5 | #include <asm-generic/sections.h> | ||
6 | |||
7 | #endif | ||
diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index 6554e78893f2..ae8d423e79d9 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h | |||
@@ -35,22 +35,6 @@ | |||
35 | struct task_struct; | 35 | struct task_struct; |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * CPU type and hardware bug flags. Kept separately for each CPU. | ||
39 | */ | ||
40 | struct cpuinfo_frv { | ||
41 | #ifdef CONFIG_MMU | ||
42 | unsigned long *pgd_quick; | ||
43 | unsigned long *pte_quick; | ||
44 | unsigned long pgtable_cache_sz; | ||
45 | #endif | ||
46 | } __cacheline_aligned; | ||
47 | |||
48 | extern struct cpuinfo_frv __nongprelbss boot_cpu_data; | ||
49 | |||
50 | #define cpu_data (&boot_cpu_data) | ||
51 | #define current_cpu_data boot_cpu_data | ||
52 | |||
53 | /* | ||
54 | * Bus types | 38 | * Bus types |
55 | */ | 39 | */ |
56 | #define EISA_bus 0 | 40 | #define EISA_bus 0 |
diff --git a/arch/frv/kernel/irq-mb93091.c b/arch/frv/kernel/irq-mb93091.c index 2cc327a1ca44..091b2839be90 100644 --- a/arch/frv/kernel/irq-mb93091.c +++ b/arch/frv/kernel/irq-mb93091.c | |||
@@ -107,25 +107,25 @@ static irqreturn_t fpga_interrupt(int irq, void *_mask) | |||
107 | static struct irqaction fpga_irq[4] = { | 107 | static struct irqaction fpga_irq[4] = { |
108 | [0] = { | 108 | [0] = { |
109 | .handler = fpga_interrupt, | 109 | .handler = fpga_interrupt, |
110 | .flags = IRQF_DISABLED | IRQF_SHARED, | 110 | .flags = IRQF_SHARED, |
111 | .name = "fpga.0", | 111 | .name = "fpga.0", |
112 | .dev_id = (void *) 0x0028UL, | 112 | .dev_id = (void *) 0x0028UL, |
113 | }, | 113 | }, |
114 | [1] = { | 114 | [1] = { |
115 | .handler = fpga_interrupt, | 115 | .handler = fpga_interrupt, |
116 | .flags = IRQF_DISABLED | IRQF_SHARED, | 116 | .flags = IRQF_SHARED, |
117 | .name = "fpga.1", | 117 | .name = "fpga.1", |
118 | .dev_id = (void *) 0x0050UL, | 118 | .dev_id = (void *) 0x0050UL, |
119 | }, | 119 | }, |
120 | [2] = { | 120 | [2] = { |
121 | .handler = fpga_interrupt, | 121 | .handler = fpga_interrupt, |
122 | .flags = IRQF_DISABLED | IRQF_SHARED, | 122 | .flags = IRQF_SHARED, |
123 | .name = "fpga.2", | 123 | .name = "fpga.2", |
124 | .dev_id = (void *) 0x1c00UL, | 124 | .dev_id = (void *) 0x1c00UL, |
125 | }, | 125 | }, |
126 | [3] = { | 126 | [3] = { |
127 | .handler = fpga_interrupt, | 127 | .handler = fpga_interrupt, |
128 | .flags = IRQF_DISABLED | IRQF_SHARED, | 128 | .flags = IRQF_SHARED, |
129 | .name = "fpga.3", | 129 | .name = "fpga.3", |
130 | .dev_id = (void *) 0x6386UL, | 130 | .dev_id = (void *) 0x6386UL, |
131 | } | 131 | } |
diff --git a/arch/frv/kernel/irq-mb93093.c b/arch/frv/kernel/irq-mb93093.c index 95e4eb4f1f38..1f3015cf80f5 100644 --- a/arch/frv/kernel/irq-mb93093.c +++ b/arch/frv/kernel/irq-mb93093.c | |||
@@ -105,7 +105,6 @@ static irqreturn_t fpga_interrupt(int irq, void *_mask) | |||
105 | static struct irqaction fpga_irq[1] = { | 105 | static struct irqaction fpga_irq[1] = { |
106 | [0] = { | 106 | [0] = { |
107 | .handler = fpga_interrupt, | 107 | .handler = fpga_interrupt, |
108 | .flags = IRQF_DISABLED, | ||
109 | .name = "fpga.0", | 108 | .name = "fpga.0", |
110 | .dev_id = (void *) 0x0700UL, | 109 | .dev_id = (void *) 0x0700UL, |
111 | } | 110 | } |
diff --git a/arch/frv/kernel/irq-mb93493.c b/arch/frv/kernel/irq-mb93493.c index ba648da0932d..8ca5aa4ff595 100644 --- a/arch/frv/kernel/irq-mb93493.c +++ b/arch/frv/kernel/irq-mb93493.c | |||
@@ -118,13 +118,13 @@ static irqreturn_t mb93493_interrupt(int irq, void *_piqsr) | |||
118 | static struct irqaction mb93493_irq[2] = { | 118 | static struct irqaction mb93493_irq[2] = { |
119 | [0] = { | 119 | [0] = { |
120 | .handler = mb93493_interrupt, | 120 | .handler = mb93493_interrupt, |
121 | .flags = IRQF_DISABLED | IRQF_SHARED, | 121 | .flags = IRQF_SHARED, |
122 | .name = "mb93493.0", | 122 | .name = "mb93493.0", |
123 | .dev_id = (void *) __addr_MB93493_IQSR(0), | 123 | .dev_id = (void *) __addr_MB93493_IQSR(0), |
124 | }, | 124 | }, |
125 | [1] = { | 125 | [1] = { |
126 | .handler = mb93493_interrupt, | 126 | .handler = mb93493_interrupt, |
127 | .flags = IRQF_DISABLED | IRQF_SHARED, | 127 | .flags = IRQF_SHARED, |
128 | .name = "mb93493.1", | 128 | .name = "mb93493.1", |
129 | .dev_id = (void *) __addr_MB93493_IQSR(1), | 129 | .dev_id = (void *) __addr_MB93493_IQSR(1), |
130 | } | 130 | } |
diff --git a/arch/frv/kernel/setup.c b/arch/frv/kernel/setup.c index 9f3a7a62d787..9f4a9a607dbe 100644 --- a/arch/frv/kernel/setup.c +++ b/arch/frv/kernel/setup.c | |||
@@ -104,8 +104,6 @@ unsigned long __nongprelbss dma_coherent_mem_end; | |||
104 | unsigned long __initdata __sdram_old_base; | 104 | unsigned long __initdata __sdram_old_base; |
105 | unsigned long __initdata num_mappedpages; | 105 | unsigned long __initdata num_mappedpages; |
106 | 106 | ||
107 | struct cpuinfo_frv __nongprelbss boot_cpu_data; | ||
108 | |||
109 | char __initdata command_line[COMMAND_LINE_SIZE]; | 107 | char __initdata command_line[COMMAND_LINE_SIZE]; |
110 | char __initdata redboot_command_line[COMMAND_LINE_SIZE]; | 108 | char __initdata redboot_command_line[COMMAND_LINE_SIZE]; |
111 | 109 | ||
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c index b457de496b70..332e00bf9d06 100644 --- a/arch/frv/kernel/time.c +++ b/arch/frv/kernel/time.c | |||
@@ -44,7 +44,6 @@ static irqreturn_t timer_interrupt(int irq, void *dummy); | |||
44 | 44 | ||
45 | static struct irqaction timer_irq = { | 45 | static struct irqaction timer_irq = { |
46 | .handler = timer_interrupt, | 46 | .handler = timer_interrupt, |
47 | .flags = IRQF_DISABLED, | ||
48 | .name = "timer", | 47 | .name = "timer", |
49 | }; | 48 | }; |
50 | 49 | ||
diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index e02448b0648b..3796801d6e0c 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild | |||
@@ -8,4 +8,5 @@ generic-y += mcs_spinlock.h | |||
8 | generic-y += module.h | 8 | generic-y += module.h |
9 | generic-y += preempt.h | 9 | generic-y += preempt.h |
10 | generic-y += scatterlist.h | 10 | generic-y += scatterlist.h |
11 | generic-y += sections.h | ||
11 | generic-y += trace_clock.h | 12 | generic-y += trace_clock.h |
diff --git a/arch/m32r/include/asm/sections.h b/arch/m32r/include/asm/sections.h deleted file mode 100644 index 5e5d21c4908a..000000000000 --- a/arch/m32r/include/asm/sections.h +++ /dev/null | |||
@@ -1,7 +0,0 @@ | |||
1 | #ifndef _M32R_SECTIONS_H | ||
2 | #define _M32R_SECTIONS_H | ||
3 | |||
4 | /* nothing to see, move along */ | ||
5 | #include <asm-generic/sections.h> | ||
6 | |||
7 | #endif /* _M32R_SECTIONS_H */ | ||
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c index 1a15f81ea1bd..093f2761aa51 100644 --- a/arch/m32r/kernel/time.c +++ b/arch/m32r/kernel/time.c | |||
@@ -134,7 +134,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) | |||
134 | 134 | ||
135 | static struct irqaction irq0 = { | 135 | static struct irqaction irq0 = { |
136 | .handler = timer_interrupt, | 136 | .handler = timer_interrupt, |
137 | .flags = IRQF_DISABLED, | ||
138 | .name = "MFT2", | 137 | .name = "MFT2", |
139 | }; | 138 | }; |
140 | 139 | ||
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 3a480b3df0d6..9aa01adb407f 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c | |||
@@ -376,7 +376,6 @@ cache_flush_060 (unsigned long addr, int scope, int cache, unsigned long len) | |||
376 | asmlinkage int | 376 | asmlinkage int |
377 | sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) | 377 | sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) |
378 | { | 378 | { |
379 | struct vm_area_struct *vma; | ||
380 | int ret = -EINVAL; | 379 | int ret = -EINVAL; |
381 | 380 | ||
382 | if (scope < FLUSH_SCOPE_LINE || scope > FLUSH_SCOPE_ALL || | 381 | if (scope < FLUSH_SCOPE_LINE || scope > FLUSH_SCOPE_ALL || |
@@ -389,17 +388,21 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) | |||
389 | if (!capable(CAP_SYS_ADMIN)) | 388 | if (!capable(CAP_SYS_ADMIN)) |
390 | goto out; | 389 | goto out; |
391 | } else { | 390 | } else { |
391 | struct vm_area_struct *vma; | ||
392 | |||
393 | /* Check for overflow. */ | ||
394 | if (addr + len < addr) | ||
395 | goto out; | ||
396 | |||
392 | /* | 397 | /* |
393 | * Verify that the specified address region actually belongs | 398 | * Verify that the specified address region actually belongs |
394 | * to this process. | 399 | * to this process. |
395 | */ | 400 | */ |
396 | vma = find_vma (current->mm, addr); | ||
397 | ret = -EINVAL; | 401 | ret = -EINVAL; |
398 | /* Check for overflow. */ | 402 | down_read(¤t->mm->mmap_sem); |
399 | if (addr + len < addr) | 403 | vma = find_vma(current->mm, addr); |
400 | goto out; | 404 | if (!vma || addr < vma->vm_start || addr + len > vma->vm_end) |
401 | if (vma == NULL || addr < vma->vm_start || addr + len > vma->vm_end) | 405 | goto out_unlock; |
402 | goto out; | ||
403 | } | 406 | } |
404 | 407 | ||
405 | if (CPU_IS_020_OR_030) { | 408 | if (CPU_IS_020_OR_030) { |
@@ -429,7 +432,7 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) | |||
429 | __asm__ __volatile__ ("movec %0, %%cacr" : : "r" (cacr)); | 432 | __asm__ __volatile__ ("movec %0, %%cacr" : : "r" (cacr)); |
430 | } | 433 | } |
431 | ret = 0; | 434 | ret = 0; |
432 | goto out; | 435 | goto out_unlock; |
433 | } else { | 436 | } else { |
434 | /* | 437 | /* |
435 | * 040 or 060: don't blindly trust 'scope', someone could | 438 | * 040 or 060: don't blindly trust 'scope', someone could |
@@ -446,6 +449,8 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) | |||
446 | ret = cache_flush_060 (addr, scope, cache, len); | 449 | ret = cache_flush_060 (addr, scope, cache, len); |
447 | } | 450 | } |
448 | } | 451 | } |
452 | out_unlock: | ||
453 | up_read(¤t->mm->mmap_sem); | ||
449 | out: | 454 | out: |
450 | return ret; | 455 | return ret; |
451 | } | 456 | } |
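Editor's note: the sys_cacheflush() change above takes mmap_sem for reading around find_vma() and funnels every later exit through a single out_unlock label, so the lock is always dropped exactly once. The user-space sketch below mirrors only that lock/goto structure with a pthread rwlock; the table and lookup are invented for illustration.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static const char *table[] = { "alpha", "beta", "gamma" };

/* Look up an entry under the read lock; every exit after the lock is taken
 * goes through out_unlock, like the reworked sys_cacheflush(). */
static int lookup(const char *name)
{
    int ret = -1;
    size_t i;

    pthread_rwlock_rdlock(&table_lock);
    for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        if (strcmp(table[i], name) == 0) {
            ret = (int)i;
            goto out_unlock;
        }
    }
    /* not found: fall through with ret == -1 */
out_unlock:
    pthread_rwlock_unlock(&table_lock);
    return ret;
}

int main(void)
{
    printf("beta  -> %d\n", lookup("beta"));
    printf("delta -> %d\n", lookup("delta"));
    return 0;
}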
diff --git a/arch/mips/include/asm/suspend.h b/arch/mips/include/asm/suspend.h deleted file mode 100644 index 3adac3b53d19..000000000000 --- a/arch/mips/include/asm/suspend.h +++ /dev/null | |||
@@ -1,7 +0,0 @@ | |||
1 | #ifndef __ASM_SUSPEND_H | ||
2 | #define __ASM_SUSPEND_H | ||
3 | |||
4 | /* References to section boundaries */ | ||
5 | extern const void __nosave_begin, __nosave_end; | ||
6 | |||
7 | #endif /* __ASM_SUSPEND_H */ | ||
diff --git a/arch/mips/power/cpu.c b/arch/mips/power/cpu.c index 521e5963df05..2129e67723ff 100644 --- a/arch/mips/power/cpu.c +++ b/arch/mips/power/cpu.c | |||
@@ -7,7 +7,7 @@ | |||
7 | * Author: Hu Hongbing <huhb@lemote.com> | 7 | * Author: Hu Hongbing <huhb@lemote.com> |
8 | * Wu Zhangjin <wuzhangjin@gmail.com> | 8 | * Wu Zhangjin <wuzhangjin@gmail.com> |
9 | */ | 9 | */ |
10 | #include <asm/suspend.h> | 10 | #include <asm/sections.h> |
11 | #include <asm/fpu.h> | 11 | #include <asm/fpu.h> |
12 | #include <asm/dsp.h> | 12 | #include <asm/dsp.h> |
13 | 13 | ||
diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index 77eb1a68d13b..54a062cb9f2c 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild | |||
@@ -8,4 +8,5 @@ generic-y += irq_work.h | |||
8 | generic-y += mcs_spinlock.h | 8 | generic-y += mcs_spinlock.h |
9 | generic-y += preempt.h | 9 | generic-y += preempt.h |
10 | generic-y += scatterlist.h | 10 | generic-y += scatterlist.h |
11 | generic-y += sections.h | ||
11 | generic-y += trace_clock.h | 12 | generic-y += trace_clock.h |
diff --git a/arch/mn10300/include/asm/sections.h b/arch/mn10300/include/asm/sections.h deleted file mode 100644 index 2b8c5160388f..000000000000 --- a/arch/mn10300/include/asm/sections.h +++ /dev/null | |||
@@ -1 +0,0 @@ | |||
1 | #include <asm-generic/sections.h> | ||
diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index d98c1ecc3266..f60d4ea8b50c 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h | |||
@@ -38,10 +38,9 @@ static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) | |||
38 | static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } | 38 | static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } |
39 | 39 | ||
40 | #ifdef CONFIG_NUMA_BALANCING | 40 | #ifdef CONFIG_NUMA_BALANCING |
41 | |||
42 | static inline int pte_present(pte_t pte) | 41 | static inline int pte_present(pte_t pte) |
43 | { | 42 | { |
44 | return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA); | 43 | return pte_val(pte) & _PAGE_NUMA_MASK; |
45 | } | 44 | } |
46 | 45 | ||
47 | #define pte_present_nonuma pte_present_nonuma | 46 | #define pte_present_nonuma pte_present_nonuma |
@@ -50,37 +49,6 @@ static inline int pte_present_nonuma(pte_t pte) | |||
50 | return pte_val(pte) & (_PAGE_PRESENT); | 49 | return pte_val(pte) & (_PAGE_PRESENT); |
51 | } | 50 | } |
52 | 51 | ||
53 | #define pte_numa pte_numa | ||
54 | static inline int pte_numa(pte_t pte) | ||
55 | { | ||
56 | return (pte_val(pte) & | ||
57 | (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; | ||
58 | } | ||
59 | |||
60 | #define pte_mknonnuma pte_mknonnuma | ||
61 | static inline pte_t pte_mknonnuma(pte_t pte) | ||
62 | { | ||
63 | pte_val(pte) &= ~_PAGE_NUMA; | ||
64 | pte_val(pte) |= _PAGE_PRESENT | _PAGE_ACCESSED; | ||
65 | return pte; | ||
66 | } | ||
67 | |||
68 | #define pte_mknuma pte_mknuma | ||
69 | static inline pte_t pte_mknuma(pte_t pte) | ||
70 | { | ||
71 | /* | ||
72 | * We should not set _PAGE_NUMA on non present ptes. Also clear the | ||
73 | * present bit so that hash_page will return 1 and we collect this | ||
74 | * as numa fault. | ||
75 | */ | ||
76 | if (pte_present(pte)) { | ||
77 | pte_val(pte) |= _PAGE_NUMA; | ||
78 | pte_val(pte) &= ~_PAGE_PRESENT; | ||
79 | } else | ||
80 | VM_BUG_ON(1); | ||
81 | return pte; | ||
82 | } | ||
83 | |||
84 | #define ptep_set_numa ptep_set_numa | 52 | #define ptep_set_numa ptep_set_numa |
85 | static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, | 53 | static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, |
86 | pte_t *ptep) | 54 | pte_t *ptep) |
@@ -92,12 +60,6 @@ static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, | |||
92 | return; | 60 | return; |
93 | } | 61 | } |
94 | 62 | ||
95 | #define pmd_numa pmd_numa | ||
96 | static inline int pmd_numa(pmd_t pmd) | ||
97 | { | ||
98 | return pte_numa(pmd_pte(pmd)); | ||
99 | } | ||
100 | |||
101 | #define pmdp_set_numa pmdp_set_numa | 63 | #define pmdp_set_numa pmdp_set_numa |
102 | static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, | 64 | static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, |
103 | pmd_t *pmdp) | 65 | pmd_t *pmdp) |
@@ -109,16 +71,21 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, | |||
109 | return; | 71 | return; |
110 | } | 72 | } |
111 | 73 | ||
112 | #define pmd_mknonnuma pmd_mknonnuma | 74 | /* |
113 | static inline pmd_t pmd_mknonnuma(pmd_t pmd) | 75 | * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist |
76 | * which were inherited from x86. For the purposes of powerpc, pte_basic_t and | ||
77 | * pmd_t are equivalent | ||
78 | */ | ||
79 | #define pteval_t pte_basic_t | ||
80 | #define pmdval_t pmd_t | ||
81 | static inline pteval_t ptenuma_flags(pte_t pte) | ||
114 | { | 82 | { |
115 | return pte_pmd(pte_mknonnuma(pmd_pte(pmd))); | 83 | return pte_val(pte) & _PAGE_NUMA_MASK; |
116 | } | 84 | } |
117 | 85 | ||
118 | #define pmd_mknuma pmd_mknuma | 86 | static inline pmdval_t pmdnuma_flags(pmd_t pmd) |
119 | static inline pmd_t pmd_mknuma(pmd_t pmd) | ||
120 | { | 87 | { |
121 | return pte_pmd(pte_mknuma(pmd_pte(pmd))); | 88 | return pmd_val(pmd) & _PAGE_NUMA_MASK; |
122 | } | 89 | } |
123 | 90 | ||
124 | # else | 91 | # else |
diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 8d1569c29042..e040c3595129 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h | |||
@@ -98,6 +98,11 @@ extern unsigned long bad_call_to_PMD_PAGE_SIZE(void); | |||
98 | _PAGE_USER | _PAGE_ACCESSED | \ | 98 | _PAGE_USER | _PAGE_ACCESSED | \ |
99 | _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) | 99 | _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) |
100 | 100 | ||
101 | #ifdef CONFIG_NUMA_BALANCING | ||
102 | /* Mask of bits that distinguish present and numa ptes */ | ||
103 | #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT) | ||
104 | #endif | ||
105 | |||
101 | /* | 106 | /* |
102 | * We define 2 sets of base prot bits, one for basic pages (ie, | 107 | * We define 2 sets of base prot bits, one for basic pages (ie, |
103 | * cacheable kernel and user pages) and one for non cacheable | 108 | * cacheable kernel and user pages) and one for non cacheable |
diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c index 0167d53da30c..a531154cc0f3 100644 --- a/arch/powerpc/kernel/suspend.c +++ b/arch/powerpc/kernel/suspend.c | |||
@@ -9,9 +9,7 @@ | |||
9 | 9 | ||
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <asm/page.h> | 11 | #include <asm/page.h> |
12 | 12 | #include <asm/sections.h> | |
13 | /* References to section boundaries */ | ||
14 | extern const void __nosave_begin, __nosave_end; | ||
15 | 13 | ||
16 | /* | 14 | /* |
17 | * pfn_is_nosave - check if given pfn is in the 'nosave' section | 15 | * pfn_is_nosave - check if given pfn is in the 'nosave' section |
diff --git a/arch/s390/kernel/suspend.c b/arch/s390/kernel/suspend.c index a7a7537ce1e7..1c4c5accd220 100644 --- a/arch/s390/kernel/suspend.c +++ b/arch/s390/kernel/suspend.c | |||
@@ -13,14 +13,10 @@ | |||
13 | #include <asm/ipl.h> | 13 | #include <asm/ipl.h> |
14 | #include <asm/cio.h> | 14 | #include <asm/cio.h> |
15 | #include <asm/pci.h> | 15 | #include <asm/pci.h> |
16 | #include <asm/sections.h> | ||
16 | #include "entry.h" | 17 | #include "entry.h" |
17 | 18 | ||
18 | /* | 19 | /* |
19 | * References to section boundaries | ||
20 | */ | ||
21 | extern const void __nosave_begin, __nosave_end; | ||
22 | |||
23 | /* | ||
24 | * The restore of the saved pages in an hibernation image will set | 20 | * The restore of the saved pages in an hibernation image will set |
25 | * the change and referenced bits in the storage key for each page. | 21 | * the change and referenced bits in the storage key for each page. |
26 | * Overindication of the referenced bits after an hibernation cycle | 22 | * Overindication of the referenced bits after an hibernation cycle |
diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index 3fe5681744f1..46461c19f284 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild | |||
@@ -10,6 +10,7 @@ generic-y += irq_work.h | |||
10 | generic-y += mcs_spinlock.h | 10 | generic-y += mcs_spinlock.h |
11 | generic-y += preempt.h | 11 | generic-y += preempt.h |
12 | generic-y += scatterlist.h | 12 | generic-y += scatterlist.h |
13 | generic-y += sections.h | ||
13 | generic-y += trace_clock.h | 14 | generic-y += trace_clock.h |
14 | generic-y += xor.h | 15 | generic-y += xor.h |
15 | generic-y += serial.h | 16 | generic-y += serial.h |
diff --git a/arch/score/include/asm/sections.h b/arch/score/include/asm/sections.h deleted file mode 100644 index 9441d23af005..000000000000 --- a/arch/score/include/asm/sections.h +++ /dev/null | |||
@@ -1,6 +0,0 @@ | |||
1 | #ifndef _ASM_SCORE_SECTIONS_H | ||
2 | #define _ASM_SCORE_SECTIONS_H | ||
3 | |||
4 | #include <asm-generic/sections.h> | ||
5 | |||
6 | #endif /* _ASM_SCORE_SECTIONS_H */ | ||
diff --git a/arch/sh/include/asm/sections.h b/arch/sh/include/asm/sections.h index 1b6199740e98..7a99e6af6372 100644 --- a/arch/sh/include/asm/sections.h +++ b/arch/sh/include/asm/sections.h | |||
@@ -3,7 +3,6 @@ | |||
3 | 3 | ||
4 | #include <asm-generic/sections.h> | 4 | #include <asm-generic/sections.h> |
5 | 5 | ||
6 | extern long __nosave_begin, __nosave_end; | ||
7 | extern long __machvec_start, __machvec_end; | 6 | extern long __machvec_start, __machvec_end; |
8 | extern char __uncached_start, __uncached_end; | 7 | extern char __uncached_start, __uncached_end; |
9 | extern char __start_eh_frame[], __stop_eh_frame[]; | 8 | extern char __start_eh_frame[], __stop_eh_frame[]; |
diff --git a/arch/sparc/power/hibernate.c b/arch/sparc/power/hibernate.c index 42b0b8ce699a..17bd2e167e07 100644 --- a/arch/sparc/power/hibernate.c +++ b/arch/sparc/power/hibernate.c | |||
@@ -9,11 +9,9 @@ | |||
9 | #include <asm/hibernate.h> | 9 | #include <asm/hibernate.h> |
10 | #include <asm/visasm.h> | 10 | #include <asm/visasm.h> |
11 | #include <asm/page.h> | 11 | #include <asm/page.h> |
12 | #include <asm/sections.h> | ||
12 | #include <asm/tlb.h> | 13 | #include <asm/tlb.h> |
13 | 14 | ||
14 | /* References to section boundaries */ | ||
15 | extern const void __nosave_begin, __nosave_end; | ||
16 | |||
17 | struct saved_context saved_context; | 15 | struct saved_context saved_context; |
18 | 16 | ||
19 | /* | 17 | /* |
diff --git a/arch/unicore32/include/mach/pm.h b/arch/unicore32/include/mach/pm.h index 4dcd34ae194c..77b522694e74 100644 --- a/arch/unicore32/include/mach/pm.h +++ b/arch/unicore32/include/mach/pm.h | |||
@@ -36,8 +36,5 @@ extern int puv3_pm_enter(suspend_state_t state); | |||
36 | /* Defined in hibernate_asm.S */ | 36 | /* Defined in hibernate_asm.S */ |
37 | extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist); | 37 | extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist); |
38 | 38 | ||
39 | /* References to section boundaries */ | ||
40 | extern const void __nosave_begin, __nosave_end; | ||
41 | |||
42 | extern struct pbe *restore_pblist; | 39 | extern struct pbe *restore_pblist; |
43 | #endif | 40 | #endif |
diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c index d75ef8b6cb56..9969ec374abb 100644 --- a/arch/unicore32/kernel/hibernate.c +++ b/arch/unicore32/kernel/hibernate.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/page.h> | 18 | #include <asm/page.h> |
19 | #include <asm/pgtable.h> | 19 | #include <asm/pgtable.h> |
20 | #include <asm/pgalloc.h> | 20 | #include <asm/pgalloc.h> |
21 | #include <asm/sections.h> | ||
21 | #include <asm/suspend.h> | 22 | #include <asm/suspend.h> |
22 | 23 | ||
23 | #include "mach/pm.h" | 24 | #include "mach/pm.h" |
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e4b1f431c7ed..3eb8a41509b3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -30,7 +30,6 @@ config X86 | |||
30 | select HAVE_UNSTABLE_SCHED_CLOCK | 30 | select HAVE_UNSTABLE_SCHED_CLOCK |
31 | select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 | 31 | select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 |
32 | select ARCH_SUPPORTS_INT128 if X86_64 | 32 | select ARCH_SUPPORTS_INT128 if X86_64 |
33 | select ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
34 | select HAVE_IDE | 33 | select HAVE_IDE |
35 | select HAVE_OPROFILE | 34 | select HAVE_OPROFILE |
36 | select HAVE_PCSPKR_PLATFORM | 35 | select HAVE_PCSPKR_PLATFORM |
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f216963760e5..0f9724c9c510 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -325,6 +325,20 @@ static inline pteval_t pte_flags(pte_t pte) | |||
325 | return native_pte_val(pte) & PTE_FLAGS_MASK; | 325 | return native_pte_val(pte) & PTE_FLAGS_MASK; |
326 | } | 326 | } |
327 | 327 | ||
328 | #ifdef CONFIG_NUMA_BALANCING | ||
329 | /* Set of bits that distinguishes present, prot_none and numa ptes */ | ||
330 | #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) | ||
331 | static inline pteval_t ptenuma_flags(pte_t pte) | ||
332 | { | ||
333 | return pte_flags(pte) & _PAGE_NUMA_MASK; | ||
334 | } | ||
335 | |||
336 | static inline pmdval_t pmdnuma_flags(pmd_t pmd) | ||
337 | { | ||
338 | return pmd_flags(pmd) & _PAGE_NUMA_MASK; | ||
339 | } | ||
340 | #endif /* CONFIG_NUMA_BALANCING */ | ||
341 | |||
328 | #define pgprot_val(x) ((x).pgprot) | 342 | #define pgprot_val(x) ((x).pgprot) |
329 | #define __pgprot(x) ((pgprot_t) { (x) } ) | 343 | #define __pgprot(x) ((pgprot_t) { (x) } ) |
330 | 344 | ||
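Editor's note: both this x86 hunk and the earlier powerpc one reduce a pte/pmd to the small set of flag bits in _PAGE_NUMA_MASK so that generic NUMA-balancing helpers can compare against them. The sketch below shows only that mask-then-compare idea in user space; the bit values and classification strings are made up and do not match the kernel's layout.

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit values only -- not the kernel's page-table layout. */
#define X_PRESENT   (1u << 0)
#define X_PROTNONE  (1u << 1)
#define X_NUMA      (1u << 2)
#define X_NUMA_MASK (X_PRESENT | X_PROTNONE | X_NUMA)

/* Analogue of ptenuma_flags(): keep only the bits that matter for the
 * present / prot_none / NUMA-hinting distinction. */
static uint32_t numa_flags(uint32_t pte)
{
    return pte & X_NUMA_MASK;
}

static const char *classify(uint32_t pte)
{
    uint32_t f = numa_flags(pte);

    if (f == X_NUMA)
        return "numa hinting fault pending";
    if (f & X_PRESENT)
        return "present";
    if (f == X_PROTNONE)
        return "prot_none";
    return "not present";
}

int main(void)
{
    printf("%s\n", classify(X_PRESENT | 0x80));   /* unrelated bits ignored */
    printf("%s\n", classify(X_NUMA));
    printf("%s\n", classify(0));
    return 0;
}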
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 7d28c885d238..291226b952a9 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c | |||
@@ -13,13 +13,11 @@ | |||
13 | #include <asm/page.h> | 13 | #include <asm/page.h> |
14 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
15 | #include <asm/mmzone.h> | 15 | #include <asm/mmzone.h> |
16 | #include <asm/sections.h> | ||
16 | 17 | ||
17 | /* Defined in hibernate_asm_32.S */ | 18 | /* Defined in hibernate_asm_32.S */ |
18 | extern int restore_image(void); | 19 | extern int restore_image(void); |
19 | 20 | ||
20 | /* References to section boundaries */ | ||
21 | extern const void __nosave_begin, __nosave_end; | ||
22 | |||
23 | /* Pointer to the temporary resume page tables */ | 21 | /* Pointer to the temporary resume page tables */ |
24 | pgd_t *resume_pg_dir; | 22 | pgd_t *resume_pg_dir; |
25 | 23 | ||
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 35e2bb6c0f37..009947d419a6 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c | |||
@@ -17,11 +17,9 @@ | |||
17 | #include <asm/page.h> | 17 | #include <asm/page.h> |
18 | #include <asm/pgtable.h> | 18 | #include <asm/pgtable.h> |
19 | #include <asm/mtrr.h> | 19 | #include <asm/mtrr.h> |
20 | #include <asm/sections.h> | ||
20 | #include <asm/suspend.h> | 21 | #include <asm/suspend.h> |
21 | 22 | ||
22 | /* References to section boundaries */ | ||
23 | extern __visible const void __nosave_begin, __nosave_end; | ||
24 | |||
25 | /* Defined in hibernate_asm_64.S */ | 23 | /* Defined in hibernate_asm_64.S */ |
26 | extern asmlinkage __visible int restore_image(void); | 24 | extern asmlinkage __visible int restore_image(void); |
27 | 25 | ||
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 134f763d90fd..61a33f4ba608 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig | |||
@@ -252,6 +252,9 @@ config DMA_CMA | |||
252 | to allocate big physically-contiguous blocks of memory for use with | 252 | to allocate big physically-contiguous blocks of memory for use with |
253 | hardware components that do not support I/O map nor scatter-gather. | 253 | hardware components that do not support I/O map nor scatter-gather. |
254 | 254 | ||
255 | You can disable CMA by specifying "cma=0" on the kernel's command | ||
256 | line. | ||
257 | |||
255 | For more information see <include/linux/dma-contiguous.h>. | 258 | For more information see <include/linux/dma-contiguous.h>. |
256 | If unsure, say "n". | 259 | If unsure, say "n". |
257 | 260 | ||
diff --git a/drivers/base/dma-mapping.c b/drivers/base/dma-mapping.c index 6cd08e145bfa..9e8bbdd470ca 100644 --- a/drivers/base/dma-mapping.c +++ b/drivers/base/dma-mapping.c | |||
@@ -10,6 +10,8 @@ | |||
10 | #include <linux/dma-mapping.h> | 10 | #include <linux/dma-mapping.h> |
11 | #include <linux/export.h> | 11 | #include <linux/export.h> |
12 | #include <linux/gfp.h> | 12 | #include <linux/gfp.h> |
13 | #include <linux/slab.h> | ||
14 | #include <linux/vmalloc.h> | ||
13 | #include <asm-generic/dma-coherent.h> | 15 | #include <asm-generic/dma-coherent.h> |
14 | 16 | ||
15 | /* | 17 | /* |
@@ -267,3 +269,73 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, | |||
267 | return ret; | 269 | return ret; |
268 | } | 270 | } |
269 | EXPORT_SYMBOL(dma_common_mmap); | 271 | EXPORT_SYMBOL(dma_common_mmap); |
272 | |||
273 | #ifdef CONFIG_MMU | ||
274 | /* | ||
275 | * remaps an array of PAGE_SIZE pages into another vm_area | ||
276 | * Cannot be used in non-sleeping contexts | ||
277 | */ | ||
278 | void *dma_common_pages_remap(struct page **pages, size_t size, | ||
279 | unsigned long vm_flags, pgprot_t prot, | ||
280 | const void *caller) | ||
281 | { | ||
282 | struct vm_struct *area; | ||
283 | |||
284 | area = get_vm_area_caller(size, vm_flags, caller); | ||
285 | if (!area) | ||
286 | return NULL; | ||
287 | |||
288 | area->pages = pages; | ||
289 | |||
290 | if (map_vm_area(area, prot, pages)) { | ||
291 | vunmap(area->addr); | ||
292 | return NULL; | ||
293 | } | ||
294 | |||
295 | return area->addr; | ||
296 | } | ||
297 | |||
298 | /* | ||
299 | * remaps an allocated contiguous region into another vm_area. | ||
300 | * Cannot be used in non-sleeping contexts | ||
301 | */ | ||
302 | |||
303 | void *dma_common_contiguous_remap(struct page *page, size_t size, | ||
304 | unsigned long vm_flags, | ||
305 | pgprot_t prot, const void *caller) | ||
306 | { | ||
307 | int i; | ||
308 | struct page **pages; | ||
309 | void *ptr; | ||
310 | unsigned long pfn; | ||
311 | |||
312 | pages = kmalloc(sizeof(struct page *) << get_order(size), GFP_KERNEL); | ||
313 | if (!pages) | ||
314 | return NULL; | ||
315 | |||
316 | for (i = 0, pfn = page_to_pfn(page); i < (size >> PAGE_SHIFT); i++) | ||
317 | pages[i] = pfn_to_page(pfn + i); | ||
318 | |||
319 | ptr = dma_common_pages_remap(pages, size, vm_flags, prot, caller); | ||
320 | |||
321 | kfree(pages); | ||
322 | |||
323 | return ptr; | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * unmaps a range previously mapped by dma_common_*_remap | ||
328 | */ | ||
329 | void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags) | ||
330 | { | ||
331 | struct vm_struct *area = find_vm_area(cpu_addr); | ||
332 | |||
333 | if (!area || (area->flags & vm_flags) != vm_flags) { | ||
334 | WARN(1, "trying to free invalid coherent area: %p\n", cpu_addr); | ||
335 | return; | ||
336 | } | ||
337 | |||
338 | unmap_kernel_range((unsigned long)cpu_addr, size); | ||
339 | vunmap(cpu_addr); | ||
340 | } | ||
341 | #endif | ||
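Editor's note: dma_common_contiguous_remap() above expands a physically contiguous allocation into a temporary array of per-page pointers before handing it to dma_common_pages_remap(). The user-space sketch below mirrors just that sizing and expansion step; the page size and helper name are illustrative.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096u            /* illustrative; the real value is per-arch */

/* Describe a contiguous buffer of 'size' bytes as one pointer per page. */
static void **build_page_array(void *base, size_t size, size_t *nr_pages)
{
    size_t n = (size + PAGE_SIZE - 1) / PAGE_SIZE;
    void **pages = malloc(n * sizeof(*pages));

    if (!pages)
        return NULL;
    for (size_t i = 0; i < n; i++)
        pages[i] = (char *)base + i * PAGE_SIZE;
    *nr_pages = n;
    return pages;
}

int main(void)
{
    size_t nr;
    void *buf = aligned_alloc(PAGE_SIZE, 8 * PAGE_SIZE);
    void **pages;

    if (!buf)
        return 1;
    pages = build_page_array(buf, 8 * PAGE_SIZE, &nr);
    if (!pages)
        return 1;
    printf("%zu pages, first=%p last=%p\n", nr, pages[0], pages[nr - 1]);
    free(pages);
    free(buf);
    return 0;
}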
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index a2e13e250bba..7c5d87191b28 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -373,6 +373,45 @@ static ssize_t show_phys_device(struct device *dev, | |||
373 | return sprintf(buf, "%d\n", mem->phys_device); | 373 | return sprintf(buf, "%d\n", mem->phys_device); |
374 | } | 374 | } |
375 | 375 | ||
376 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
377 | static ssize_t show_valid_zones(struct device *dev, | ||
378 | struct device_attribute *attr, char *buf) | ||
379 | { | ||
380 | struct memory_block *mem = to_memory_block(dev); | ||
381 | unsigned long start_pfn, end_pfn; | ||
382 | unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; | ||
383 | struct page *first_page; | ||
384 | struct zone *zone; | ||
385 | |||
386 | start_pfn = section_nr_to_pfn(mem->start_section_nr); | ||
387 | end_pfn = start_pfn + nr_pages; | ||
388 | first_page = pfn_to_page(start_pfn); | ||
389 | |||
390 | /* A block that contains more than one zone cannot be offlined. */ | ||
391 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | ||
392 | return sprintf(buf, "none\n"); | ||
393 | |||
394 | zone = page_zone(first_page); | ||
395 | |||
396 | if (zone_idx(zone) == ZONE_MOVABLE - 1) { | ||
397 | /* The mem block is the last memory block of this zone. */ | ||
398 | if (end_pfn == zone_end_pfn(zone)) | ||
399 | return sprintf(buf, "%s %s\n", | ||
400 | zone->name, (zone + 1)->name); | ||
401 | } | ||
402 | |||
403 | if (zone_idx(zone) == ZONE_MOVABLE) { | ||
404 | /* The mem block is the first memory block of ZONE_MOVABLE. */ | ||
405 | if (start_pfn == zone->zone_start_pfn) | ||
406 | return sprintf(buf, "%s %s\n", | ||
407 | zone->name, (zone - 1)->name); | ||
408 | } | ||
409 | |||
410 | return sprintf(buf, "%s\n", zone->name); | ||
411 | } | ||
412 | static DEVICE_ATTR(valid_zones, 0444, show_valid_zones, NULL); | ||
413 | #endif | ||
414 | |||
376 | static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); | 415 | static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); |
377 | static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); | 416 | static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); |
378 | static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); | 417 | static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); |
@@ -523,6 +562,9 @@ static struct attribute *memory_memblk_attrs[] = { | |||
523 | &dev_attr_state.attr, | 562 | &dev_attr_state.attr, |
524 | &dev_attr_phys_device.attr, | 563 | &dev_attr_phys_device.attr, |
525 | &dev_attr_removable.attr, | 564 | &dev_attr_removable.attr, |
565 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
566 | &dev_attr_valid_zones.attr, | ||
567 | #endif | ||
526 | NULL | 568 | NULL |
527 | }; | 569 | }; |
528 | 570 | ||
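Editor's note: the valid_zones attribute added above is read like any other memory-block sysfs file. The minimal reader below is a sketch: the path follows the usual /sys/devices/system/memory layout, block 0 is only an example, and the file exists only when CONFIG_MEMORY_HOTREMOVE is set.

#include <stdio.h>

int main(void)
{
    char buf[128];
    const char *path = "/sys/devices/system/memory/memory0/valid_zones";
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return 1;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("memory0 valid_zones: %s", buf);
    fclose(f);
    return 0;
}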
diff --git a/drivers/base/node.c b/drivers/base/node.c index d51c49c9bafa..472168cd0c97 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c | |||
@@ -289,8 +289,6 @@ static int register_node(struct node *node, int num, struct node *parent) | |||
289 | device_create_file(&node->dev, &dev_attr_distance); | 289 | device_create_file(&node->dev, &dev_attr_distance); |
290 | device_create_file(&node->dev, &dev_attr_vmstat); | 290 | device_create_file(&node->dev, &dev_attr_vmstat); |
291 | 291 | ||
292 | scan_unevictable_register_node(node); | ||
293 | |||
294 | hugetlb_register_node(node); | 292 | hugetlb_register_node(node); |
295 | 293 | ||
296 | compaction_register_node(node); | 294 | compaction_register_node(node); |
@@ -314,7 +312,6 @@ void unregister_node(struct node *node) | |||
314 | device_remove_file(&node->dev, &dev_attr_distance); | 312 | device_remove_file(&node->dev, &dev_attr_distance); |
315 | device_remove_file(&node->dev, &dev_attr_vmstat); | 313 | device_remove_file(&node->dev, &dev_attr_vmstat); |
316 | 314 | ||
317 | scan_unevictable_unregister_node(node); | ||
318 | hugetlb_unregister_node(node); /* no-op, if memoryless node */ | 315 | hugetlb_unregister_node(node); /* no-op, if memoryless node */ |
319 | 316 | ||
320 | device_unregister(&node->dev); | 317 | device_unregister(&node->dev); |
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d00831c3d731..3b850164c65c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
@@ -103,10 +103,10 @@ static ssize_t mem_used_total_show(struct device *dev, | |||
103 | 103 | ||
104 | down_read(&zram->init_lock); | 104 | down_read(&zram->init_lock); |
105 | if (init_done(zram)) | 105 | if (init_done(zram)) |
106 | val = zs_get_total_size_bytes(meta->mem_pool); | 106 | val = zs_get_total_pages(meta->mem_pool); |
107 | up_read(&zram->init_lock); | 107 | up_read(&zram->init_lock); |
108 | 108 | ||
109 | return scnprintf(buf, PAGE_SIZE, "%llu\n", val); | 109 | return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); |
110 | } | 110 | } |
111 | 111 | ||
112 | static ssize_t max_comp_streams_show(struct device *dev, | 112 | static ssize_t max_comp_streams_show(struct device *dev, |
@@ -122,6 +122,72 @@ static ssize_t max_comp_streams_show(struct device *dev, | |||
122 | return scnprintf(buf, PAGE_SIZE, "%d\n", val); | 122 | return scnprintf(buf, PAGE_SIZE, "%d\n", val); |
123 | } | 123 | } |
124 | 124 | ||
125 | static ssize_t mem_limit_show(struct device *dev, | ||
126 | struct device_attribute *attr, char *buf) | ||
127 | { | ||
128 | u64 val; | ||
129 | struct zram *zram = dev_to_zram(dev); | ||
130 | |||
131 | down_read(&zram->init_lock); | ||
132 | val = zram->limit_pages; | ||
133 | up_read(&zram->init_lock); | ||
134 | |||
135 | return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); | ||
136 | } | ||
137 | |||
138 | static ssize_t mem_limit_store(struct device *dev, | ||
139 | struct device_attribute *attr, const char *buf, size_t len) | ||
140 | { | ||
141 | u64 limit; | ||
142 | char *tmp; | ||
143 | struct zram *zram = dev_to_zram(dev); | ||
144 | |||
145 | limit = memparse(buf, &tmp); | ||
146 | if (buf == tmp) /* no chars parsed, invalid input */ | ||
147 | return -EINVAL; | ||
148 | |||
149 | down_write(&zram->init_lock); | ||
150 | zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT; | ||
151 | up_write(&zram->init_lock); | ||
152 | |||
153 | return len; | ||
154 | } | ||
155 | |||
156 | static ssize_t mem_used_max_show(struct device *dev, | ||
157 | struct device_attribute *attr, char *buf) | ||
158 | { | ||
159 | u64 val = 0; | ||
160 | struct zram *zram = dev_to_zram(dev); | ||
161 | |||
162 | down_read(&zram->init_lock); | ||
163 | if (init_done(zram)) | ||
164 | val = atomic_long_read(&zram->stats.max_used_pages); | ||
165 | up_read(&zram->init_lock); | ||
166 | |||
167 | return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT); | ||
168 | } | ||
169 | |||
170 | static ssize_t mem_used_max_store(struct device *dev, | ||
171 | struct device_attribute *attr, const char *buf, size_t len) | ||
172 | { | ||
173 | int err; | ||
174 | unsigned long val; | ||
175 | struct zram *zram = dev_to_zram(dev); | ||
176 | struct zram_meta *meta = zram->meta; | ||
177 | |||
178 | err = kstrtoul(buf, 10, &val); | ||
179 | if (err || val != 0) | ||
180 | return -EINVAL; | ||
181 | |||
182 | down_read(&zram->init_lock); | ||
183 | if (init_done(zram)) | ||
184 | atomic_long_set(&zram->stats.max_used_pages, | ||
185 | zs_get_total_pages(meta->mem_pool)); | ||
186 | up_read(&zram->init_lock); | ||
187 | |||
188 | return len; | ||
189 | } | ||
190 | |||
125 | static ssize_t max_comp_streams_store(struct device *dev, | 191 | static ssize_t max_comp_streams_store(struct device *dev, |
126 | struct device_attribute *attr, const char *buf, size_t len) | 192 | struct device_attribute *attr, const char *buf, size_t len) |
127 | { | 193 | { |
@@ -434,6 +500,21 @@ out_cleanup: | |||
434 | return ret; | 500 | return ret; |
435 | } | 501 | } |
436 | 502 | ||
503 | static inline void update_used_max(struct zram *zram, | ||
504 | const unsigned long pages) | ||
505 | { | ||
506 | int old_max, cur_max; | ||
507 | |||
508 | old_max = atomic_long_read(&zram->stats.max_used_pages); | ||
509 | |||
510 | do { | ||
511 | cur_max = old_max; | ||
512 | if (pages > cur_max) | ||
513 | old_max = atomic_long_cmpxchg( | ||
514 | &zram->stats.max_used_pages, cur_max, pages); | ||
515 | } while (old_max != cur_max); | ||
516 | } | ||
517 | |||
437 | static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, | 518 | static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, |
438 | int offset) | 519 | int offset) |
439 | { | 520 | { |
@@ -445,6 +526,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, | |||
445 | struct zram_meta *meta = zram->meta; | 526 | struct zram_meta *meta = zram->meta; |
446 | struct zcomp_strm *zstrm; | 527 | struct zcomp_strm *zstrm; |
447 | bool locked = false; | 528 | bool locked = false; |
529 | unsigned long alloced_pages; | ||
448 | 530 | ||
449 | page = bvec->bv_page; | 531 | page = bvec->bv_page; |
450 | if (is_partial_io(bvec)) { | 532 | if (is_partial_io(bvec)) { |
@@ -513,6 +595,16 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, | |||
513 | ret = -ENOMEM; | 595 | ret = -ENOMEM; |
514 | goto out; | 596 | goto out; |
515 | } | 597 | } |
598 | |||
599 | alloced_pages = zs_get_total_pages(meta->mem_pool); | ||
600 | if (zram->limit_pages && alloced_pages > zram->limit_pages) { | ||
601 | zs_free(meta->mem_pool, handle); | ||
602 | ret = -ENOMEM; | ||
603 | goto out; | ||
604 | } | ||
605 | |||
606 | update_used_max(zram, alloced_pages); | ||
607 | |||
516 | cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); | 608 | cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO); |
517 | 609 | ||
518 | if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { | 610 | if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) { |
@@ -606,6 +698,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, | |||
606 | bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); | 698 | bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); |
607 | zram_free_page(zram, index); | 699 | zram_free_page(zram, index); |
608 | bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); | 700 | bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); |
701 | atomic64_inc(&zram->stats.notify_free); | ||
609 | index++; | 702 | index++; |
610 | n -= PAGE_SIZE; | 703 | n -= PAGE_SIZE; |
611 | } | 704 | } |
@@ -617,6 +710,9 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) | |||
617 | struct zram_meta *meta; | 710 | struct zram_meta *meta; |
618 | 711 | ||
619 | down_write(&zram->init_lock); | 712 | down_write(&zram->init_lock); |
713 | |||
714 | zram->limit_pages = 0; | ||
715 | |||
620 | if (!init_done(zram)) { | 716 | if (!init_done(zram)) { |
621 | up_write(&zram->init_lock); | 717 | up_write(&zram->init_lock); |
622 | return; | 718 | return; |
@@ -857,6 +953,10 @@ static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); | |||
857 | static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); | 953 | static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); |
858 | static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); | 954 | static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); |
859 | static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); | 955 | static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); |
956 | static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, | ||
957 | mem_limit_store); | ||
958 | static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, | ||
959 | mem_used_max_store); | ||
860 | static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, | 960 | static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, |
861 | max_comp_streams_show, max_comp_streams_store); | 961 | max_comp_streams_show, max_comp_streams_store); |
862 | static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, | 962 | static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, |
@@ -885,6 +985,8 @@ static struct attribute *zram_disk_attrs[] = { | |||
885 | &dev_attr_orig_data_size.attr, | 985 | &dev_attr_orig_data_size.attr, |
886 | &dev_attr_compr_data_size.attr, | 986 | &dev_attr_compr_data_size.attr, |
887 | &dev_attr_mem_used_total.attr, | 987 | &dev_attr_mem_used_total.attr, |
988 | &dev_attr_mem_limit.attr, | ||
989 | &dev_attr_mem_used_max.attr, | ||
888 | &dev_attr_max_comp_streams.attr, | 990 | &dev_attr_max_comp_streams.attr, |
889 | &dev_attr_comp_algorithm.attr, | 991 | &dev_attr_comp_algorithm.attr, |
890 | NULL, | 992 | NULL, |
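Editor's note: update_used_max() above keeps a high-water mark with a lock-free compare-and-swap loop -- reread the current maximum and only store the new value while it is still larger. The stand-alone sketch below shows the same pattern in portable C11 atomics rather than the kernel's atomic_long_cmpxchg().

#include <stdatomic.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static atomic_long max_used;   /* analogue of zram->stats.max_used_pages */

/* Only ever move the watermark upward, retrying if another thread raced us. */
static void update_used_max(long pages)
{
    long cur = atomic_load(&max_used);

    while (pages > cur &&
           !atomic_compare_exchange_weak(&max_used, &cur, pages))
        ;   /* cur was reloaded by the failed CAS; retry */
}

static void *worker(void *arg)
{
    long base = (long)(intptr_t)arg * 1000;

    for (long i = 0; i < 1000; i++)
        update_used_max(base + i);
    return NULL;
}

int main(void)
{
    pthread_t t[4];

    for (int i = 0; i < 4; i++)
        pthread_create(&t[i], NULL, worker, (void *)(intptr_t)i);
    for (int i = 0; i < 4; i++)
        pthread_join(t[i], NULL);
    printf("max_used = %ld (expect 3999)\n", atomic_load(&max_used));
    return 0;
}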
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index e0f725c87cc6..c6ee271317f5 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h | |||
@@ -90,6 +90,7 @@ struct zram_stats { | |||
90 | atomic64_t notify_free; /* no. of swap slot free notifications */ | 90 | atomic64_t notify_free; /* no. of swap slot free notifications */ |
91 | atomic64_t zero_pages; /* no. of zero filled pages */ | 91 | atomic64_t zero_pages; /* no. of zero filled pages */ |
92 | atomic64_t pages_stored; /* no. of pages currently stored */ | 92 | atomic64_t pages_stored; /* no. of pages currently stored */ |
93 | atomic_long_t max_used_pages; /* no. of maximum pages stored */ | ||
93 | }; | 94 | }; |
94 | 95 | ||
95 | struct zram_meta { | 96 | struct zram_meta { |
@@ -112,6 +113,11 @@ struct zram { | |||
112 | u64 disksize; /* bytes */ | 113 | u64 disksize; /* bytes */ |
113 | int max_comp_streams; | 114 | int max_comp_streams; |
114 | struct zram_stats stats; | 115 | struct zram_stats stats; |
116 | /* | ||
117 | * the number of pages zram can consume for storing compressed data | ||
118 | */ | ||
119 | unsigned long limit_pages; | ||
120 | |||
115 | char compressor[10]; | 121 | char compressor[10]; |
116 | }; | 122 | }; |
117 | #endif | 123 | #endif |
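Editor's note: the limit_pages field added above is filled by mem_limit_store(), which accepts memparse()-style sizes such as "256M" and keeps the limit as a page count rounded up. The sketch below mimics that parsing and rounding in user space; parse_size() is a deliberately simplified stand-in for the kernel's memparse(), and the page size is illustrative.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096ul   /* illustrative */

/* Simplified stand-in for memparse(): parse "<number>[KMG]". */
static unsigned long long parse_size(const char *s, char **end)
{
    unsigned long long v = strtoull(s, end, 10);

    switch (**end) {
    case 'G': case 'g': v <<= 10; /* fall through */
    case 'M': case 'm': v <<= 10; /* fall through */
    case 'K': case 'k': v <<= 10; (*end)++; break;
    }
    return v;
}

int main(void)
{
    const char *input = "256M";
    char *end;
    unsigned long long bytes = parse_size(input, &end);
    unsigned long limit_pages;

    if (end == input) {
        fprintf(stderr, "invalid input\n");   /* mirrors the -EINVAL case */
        return 1;
    }
    /* mem_limit_store() keeps the limit as a page count, rounded up. */
    limit_pages = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
    printf("%s -> %llu bytes -> %lu pages\n", input, bytes, limit_pages);
    return 0;
}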
diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 79f18e6d9c4f..cc016c615c19 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c | |||
@@ -184,6 +184,9 @@ static int add_sysfs_fw_map_entry(struct firmware_map_entry *entry) | |||
184 | static int map_entries_nr; | 184 | static int map_entries_nr; |
185 | static struct kset *mmap_kset; | 185 | static struct kset *mmap_kset; |
186 | 186 | ||
187 | if (entry->kobj.state_in_sysfs) | ||
188 | return -EEXIST; | ||
189 | |||
187 | if (!mmap_kset) { | 190 | if (!mmap_kset) { |
188 | mmap_kset = kset_create_and_add("memmap", NULL, firmware_kobj); | 191 | mmap_kset = kset_create_and_add("memmap", NULL, firmware_kobj); |
189 | if (!mmap_kset) | 192 | if (!mmap_kset) |
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index c6683f2e396c..00b228638274 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig | |||
@@ -25,6 +25,7 @@ config VIRTIO_PCI | |||
25 | config VIRTIO_BALLOON | 25 | config VIRTIO_BALLOON |
26 | tristate "Virtio balloon driver" | 26 | tristate "Virtio balloon driver" |
27 | depends on VIRTIO | 27 | depends on VIRTIO |
28 | select MEMORY_BALLOON | ||
28 | ---help--- | 29 | ---help--- |
29 | This driver supports increasing and decreasing the amount | 30 | This driver supports increasing and decreasing the amount |
30 | of memory within a KVM guest. | 31 | of memory within a KVM guest. |
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 25ebe8eecdb7..f893148a107b 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c | |||
@@ -59,7 +59,7 @@ struct virtio_balloon | |||
59 | * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE | 59 | * Each page on this list adds VIRTIO_BALLOON_PAGES_PER_PAGE |
60 | * to num_pages above. | 60 | * to num_pages above. |
61 | */ | 61 | */ |
62 | struct balloon_dev_info *vb_dev_info; | 62 | struct balloon_dev_info vb_dev_info; |
63 | 63 | ||
64 | /* Synchronize access/update to this struct virtio_balloon elements */ | 64 | /* Synchronize access/update to this struct virtio_balloon elements */ |
65 | struct mutex balloon_lock; | 65 | struct mutex balloon_lock; |
@@ -127,7 +127,7 @@ static void set_page_pfns(u32 pfns[], struct page *page) | |||
127 | 127 | ||
128 | static void fill_balloon(struct virtio_balloon *vb, size_t num) | 128 | static void fill_balloon(struct virtio_balloon *vb, size_t num) |
129 | { | 129 | { |
130 | struct balloon_dev_info *vb_dev_info = vb->vb_dev_info; | 130 | struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; |
131 | 131 | ||
132 | /* We can only do one array worth at a time. */ | 132 | /* We can only do one array worth at a time. */ |
133 | num = min(num, ARRAY_SIZE(vb->pfns)); | 133 | num = min(num, ARRAY_SIZE(vb->pfns)); |
@@ -163,15 +163,15 @@ static void release_pages_by_pfn(const u32 pfns[], unsigned int num) | |||
163 | /* Find pfns pointing at start of each page, get pages and free them. */ | 163 | /* Find pfns pointing at start of each page, get pages and free them. */ |
164 | for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { | 164 | for (i = 0; i < num; i += VIRTIO_BALLOON_PAGES_PER_PAGE) { |
165 | struct page *page = balloon_pfn_to_page(pfns[i]); | 165 | struct page *page = balloon_pfn_to_page(pfns[i]); |
166 | balloon_page_free(page); | ||
167 | adjust_managed_page_count(page, 1); | 166 | adjust_managed_page_count(page, 1); |
167 | put_page(page); /* balloon reference */ | ||
168 | } | 168 | } |
169 | } | 169 | } |
170 | 170 | ||
171 | static void leak_balloon(struct virtio_balloon *vb, size_t num) | 171 | static void leak_balloon(struct virtio_balloon *vb, size_t num) |
172 | { | 172 | { |
173 | struct page *page; | 173 | struct page *page; |
174 | struct balloon_dev_info *vb_dev_info = vb->vb_dev_info; | 174 | struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; |
175 | 175 | ||
176 | /* We can only do one array worth at a time. */ | 176 | /* We can only do one array worth at a time. */ |
177 | num = min(num, ARRAY_SIZE(vb->pfns)); | 177 | num = min(num, ARRAY_SIZE(vb->pfns)); |
@@ -353,12 +353,11 @@ static int init_vqs(struct virtio_balloon *vb) | |||
353 | return 0; | 353 | return 0; |
354 | } | 354 | } |
355 | 355 | ||
356 | static const struct address_space_operations virtio_balloon_aops; | ||
357 | #ifdef CONFIG_BALLOON_COMPACTION | 356 | #ifdef CONFIG_BALLOON_COMPACTION |
358 | /* | 357 | /* |
359 | * virtballoon_migratepage - perform the balloon page migration on behalf of | 358 | * virtballoon_migratepage - perform the balloon page migration on behalf of |
360 | * a compaction thread. (called under page lock) | 359 | * a compaction thread. (called under page lock) |
361 | * @mapping: the page->mapping which will be assigned to the new migrated page. | 360 | * @vb_dev_info: the balloon device |
362 | * @newpage: page that will replace the isolated page after migration finishes. | 361 | * @newpage: page that will replace the isolated page after migration finishes. |
363 | * @page : the isolated (old) page that is about to be migrated to newpage. | 362 | * @page : the isolated (old) page that is about to be migrated to newpage. |
364 | * @mode : compaction mode -- not used for balloon page migration. | 363 | * @mode : compaction mode -- not used for balloon page migration. |
@@ -373,17 +372,13 @@ static const struct address_space_operations virtio_balloon_aops; | |||
373 | * This function performs the balloon page migration task. | 372 | * This function performs the balloon page migration task. |
374 | * Called through balloon_mapping->a_ops->migratepage | 373 | * Called through balloon_mapping->a_ops->migratepage |
375 | */ | 374 | */ |
376 | static int virtballoon_migratepage(struct address_space *mapping, | 375 | static int virtballoon_migratepage(struct balloon_dev_info *vb_dev_info, |
377 | struct page *newpage, struct page *page, enum migrate_mode mode) | 376 | struct page *newpage, struct page *page, enum migrate_mode mode) |
378 | { | 377 | { |
379 | struct balloon_dev_info *vb_dev_info = balloon_page_device(page); | 378 | struct virtio_balloon *vb = container_of(vb_dev_info, |
380 | struct virtio_balloon *vb; | 379 | struct virtio_balloon, vb_dev_info); |
381 | unsigned long flags; | 380 | unsigned long flags; |
382 | 381 | ||
383 | BUG_ON(!vb_dev_info); | ||
384 | |||
385 | vb = vb_dev_info->balloon_device; | ||
386 | |||
387 | /* | 382 | /* |
388 | * In order to avoid lock contention while migrating pages concurrently | 383 | * In order to avoid lock contention while migrating pages concurrently |
389 | * to leak_balloon() or fill_balloon() we just give up the balloon_lock | 384 | * to leak_balloon() or fill_balloon() we just give up the balloon_lock |
@@ -395,21 +390,19 @@ static int virtballoon_migratepage(struct address_space *mapping, | |||
395 | if (!mutex_trylock(&vb->balloon_lock)) | 390 | if (!mutex_trylock(&vb->balloon_lock)) |
396 | return -EAGAIN; | 391 | return -EAGAIN; |
397 | 392 | ||
393 | get_page(newpage); /* balloon reference */ | ||
394 | |||
398 | /* balloon's page migration 1st step -- inflate "newpage" */ | 395 | /* balloon's page migration 1st step -- inflate "newpage" */ |
399 | spin_lock_irqsave(&vb_dev_info->pages_lock, flags); | 396 | spin_lock_irqsave(&vb_dev_info->pages_lock, flags); |
400 | balloon_page_insert(newpage, mapping, &vb_dev_info->pages); | 397 | balloon_page_insert(vb_dev_info, newpage); |
401 | vb_dev_info->isolated_pages--; | 398 | vb_dev_info->isolated_pages--; |
399 | __count_vm_event(BALLOON_MIGRATE); | ||
402 | spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); | 400 | spin_unlock_irqrestore(&vb_dev_info->pages_lock, flags); |
403 | vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; | 401 | vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; |
404 | set_page_pfns(vb->pfns, newpage); | 402 | set_page_pfns(vb->pfns, newpage); |
405 | tell_host(vb, vb->inflate_vq); | 403 | tell_host(vb, vb->inflate_vq); |
406 | 404 | ||
407 | /* | 405 | /* balloon's page migration 2nd step -- deflate "page" */ |
408 | * balloon's page migration 2nd step -- deflate "page" | ||
409 | * | ||
410 | * It's safe to delete page->lru here because this page is at | ||
411 | * an isolated migration list, and this step is expected to happen here | ||
412 | */ | ||
413 | balloon_page_delete(page); | 406 | balloon_page_delete(page); |
414 | vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; | 407 | vb->num_pfns = VIRTIO_BALLOON_PAGES_PER_PAGE; |
415 | set_page_pfns(vb->pfns, page); | 408 | set_page_pfns(vb->pfns, page); |
@@ -417,20 +410,15 @@ static int virtballoon_migratepage(struct address_space *mapping, | |||
417 | 410 | ||
418 | mutex_unlock(&vb->balloon_lock); | 411 | mutex_unlock(&vb->balloon_lock); |
419 | 412 | ||
420 | return MIGRATEPAGE_BALLOON_SUCCESS; | 413 | put_page(page); /* balloon reference */ |
421 | } | ||
422 | 414 | ||
423 | /* define the balloon_mapping->a_ops callback to allow balloon page migration */ | 415 | return MIGRATEPAGE_SUCCESS; |
424 | static const struct address_space_operations virtio_balloon_aops = { | 416 | } |
425 | .migratepage = virtballoon_migratepage, | ||
426 | }; | ||
427 | #endif /* CONFIG_BALLOON_COMPACTION */ | 417 | #endif /* CONFIG_BALLOON_COMPACTION */ |
428 | 418 | ||
429 | static int virtballoon_probe(struct virtio_device *vdev) | 419 | static int virtballoon_probe(struct virtio_device *vdev) |
430 | { | 420 | { |
431 | struct virtio_balloon *vb; | 421 | struct virtio_balloon *vb; |
432 | struct address_space *vb_mapping; | ||
433 | struct balloon_dev_info *vb_devinfo; | ||
434 | int err; | 422 | int err; |
435 | 423 | ||
436 | vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); | 424 | vdev->priv = vb = kmalloc(sizeof(*vb), GFP_KERNEL); |
@@ -446,30 +434,14 @@ static int virtballoon_probe(struct virtio_device *vdev) | |||
446 | vb->vdev = vdev; | 434 | vb->vdev = vdev; |
447 | vb->need_stats_update = 0; | 435 | vb->need_stats_update = 0; |
448 | 436 | ||
449 | vb_devinfo = balloon_devinfo_alloc(vb); | 437 | balloon_devinfo_init(&vb->vb_dev_info); |
450 | if (IS_ERR(vb_devinfo)) { | 438 | #ifdef CONFIG_BALLOON_COMPACTION |
451 | err = PTR_ERR(vb_devinfo); | 439 | vb->vb_dev_info.migratepage = virtballoon_migratepage; |
452 | goto out_free_vb; | 440 | #endif |
453 | } | ||
454 | |||
455 | vb_mapping = balloon_mapping_alloc(vb_devinfo, | ||
456 | (balloon_compaction_check()) ? | ||
457 | &virtio_balloon_aops : NULL); | ||
458 | if (IS_ERR(vb_mapping)) { | ||
459 | /* | ||
460 | * IS_ERR(vb_mapping) && PTR_ERR(vb_mapping) == -EOPNOTSUPP | ||
461 | * This means !CONFIG_BALLOON_COMPACTION, otherwise we get off. | ||
462 | */ | ||
463 | err = PTR_ERR(vb_mapping); | ||
464 | if (err != -EOPNOTSUPP) | ||
465 | goto out_free_vb_devinfo; | ||
466 | } | ||
467 | |||
468 | vb->vb_dev_info = vb_devinfo; | ||
469 | 441 | ||
470 | err = init_vqs(vb); | 442 | err = init_vqs(vb); |
471 | if (err) | 443 | if (err) |
472 | goto out_free_vb_mapping; | 444 | goto out_free_vb; |
473 | 445 | ||
474 | vb->thread = kthread_run(balloon, vb, "vballoon"); | 446 | vb->thread = kthread_run(balloon, vb, "vballoon"); |
475 | if (IS_ERR(vb->thread)) { | 447 | if (IS_ERR(vb->thread)) { |
@@ -481,10 +453,6 @@ static int virtballoon_probe(struct virtio_device *vdev) | |||
481 | 453 | ||
482 | out_del_vqs: | 454 | out_del_vqs: |
483 | vdev->config->del_vqs(vdev); | 455 | vdev->config->del_vqs(vdev); |
484 | out_free_vb_mapping: | ||
485 | balloon_mapping_free(vb_mapping); | ||
486 | out_free_vb_devinfo: | ||
487 | balloon_devinfo_free(vb_devinfo); | ||
488 | out_free_vb: | 456 | out_free_vb: |
489 | kfree(vb); | 457 | kfree(vb); |
490 | out: | 458 | out: |
@@ -510,8 +478,6 @@ static void virtballoon_remove(struct virtio_device *vdev) | |||
510 | 478 | ||
511 | kthread_stop(vb->thread); | 479 | kthread_stop(vb->thread); |
512 | remove_common(vb); | 480 | remove_common(vb); |
513 | balloon_mapping_free(vb->vb_dev_info->mapping); | ||
514 | balloon_devinfo_free(vb->vb_dev_info); | ||
515 | kfree(vb); | 481 | kfree(vb); |
516 | } | 482 | } |
517 | 483 | ||
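The virtio_balloon hunks above switch from an allocated balloon mapping to a balloon_dev_info embedded in the driver's private structure, with the migratepage callback registered directly on it and the driver context recovered via container_of(). A minimal sketch of that registration pattern, using a hypothetical my_balloon driver and the post-series balloon_compaction API:

```c
#include <linux/kernel.h>
#include <linux/balloon_compaction.h>
#include <linux/migrate.h>

struct my_balloon {
	struct balloon_dev_info vb_dev_info;	/* embedded, not separately allocated */
	/* ... device-specific state ... */
};

#ifdef CONFIG_BALLOON_COMPACTION
static int my_balloon_migratepage(struct balloon_dev_info *info,
		struct page *newpage, struct page *page, enum migrate_mode mode)
{
	/* recover the enclosing driver structure from the embedded member */
	struct my_balloon *b = container_of(info, struct my_balloon, vb_dev_info);

	/* hand newpage to the device and drop page, under info->pages_lock ... */
	(void)b;
	return MIGRATEPAGE_SUCCESS;
}
#endif

static void my_balloon_setup(struct my_balloon *b)
{
	balloon_devinfo_init(&b->vb_dev_info);
#ifdef CONFIG_BALLOON_COMPACTION
	b->vb_dev_info.migratepage = my_balloon_migratepage;
#endif
}
```

The design choice mirrors the diff: no address_space, no separate allocation to fail, and compaction support reduces to one callback assignment.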
diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d7274619bf9..e2f3ad0879ce 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -304,6 +304,12 @@ static int blkdev_readpage(struct file * file, struct page * page) | |||
304 | return block_read_full_page(page, blkdev_get_block); | 304 | return block_read_full_page(page, blkdev_get_block); |
305 | } | 305 | } |
306 | 306 | ||
307 | static int blkdev_readpages(struct file *file, struct address_space *mapping, | ||
308 | struct list_head *pages, unsigned nr_pages) | ||
309 | { | ||
310 | return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); | ||
311 | } | ||
312 | |||
307 | static int blkdev_write_begin(struct file *file, struct address_space *mapping, | 313 | static int blkdev_write_begin(struct file *file, struct address_space *mapping, |
308 | loff_t pos, unsigned len, unsigned flags, | 314 | loff_t pos, unsigned len, unsigned flags, |
309 | struct page **pagep, void **fsdata) | 315 | struct page **pagep, void **fsdata) |
@@ -1622,6 +1628,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) | |||
1622 | 1628 | ||
1623 | static const struct address_space_operations def_blk_aops = { | 1629 | static const struct address_space_operations def_blk_aops = { |
1624 | .readpage = blkdev_readpage, | 1630 | .readpage = blkdev_readpage, |
1631 | .readpages = blkdev_readpages, | ||
1625 | .writepage = blkdev_writepage, | 1632 | .writepage = blkdev_writepage, |
1626 | .write_begin = blkdev_write_begin, | 1633 | .write_begin = blkdev_write_begin, |
1627 | .write_end = blkdev_write_end, | 1634 | .write_end = blkdev_write_end, |
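The new .readpages hook lets block-device readahead be batched into multipage BIOs instead of one block_read_full_page() call per page. The same shape works for any filesystem with a get_block routine; a sketch with a hypothetical trivial get_block callback:

```c
#include <linux/fs.h>
#include <linux/mpage.h>
#include <linux/buffer_head.h>

/* hypothetical get_block for a simple 1:1 block mapping */
static int my_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh, int create)
{
	map_bh(bh, inode->i_sb, iblock);
	return 0;
}

static int my_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, my_get_block);
}

static int my_readpages(struct file *file, struct address_space *mapping,
			struct list_head *pages, unsigned nr_pages)
{
	/* builds large BIOs covering contiguous blocks across the page list */
	return mpage_readpages(mapping, pages, nr_pages, my_get_block);
}

static const struct address_space_operations my_aops = {
	.readpage	= my_readpage,
	.readpages	= my_readpages,
};
```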
diff --git a/fs/buffer.c b/fs/buffer.c index 3588a80854b2..44c14a87750e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -1253,7 +1253,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) | |||
1253 | * a local interrupt disable for that. | 1253 | * a local interrupt disable for that. |
1254 | */ | 1254 | */ |
1255 | 1255 | ||
1256 | #define BH_LRU_SIZE 8 | 1256 | #define BH_LRU_SIZE 16 |
1257 | 1257 | ||
1258 | struct bh_lru { | 1258 | struct bh_lru { |
1259 | struct buffer_head *bhs[BH_LRU_SIZE]; | 1259 | struct buffer_head *bhs[BH_LRU_SIZE]; |
@@ -2956,7 +2956,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) | |||
2956 | 2956 | ||
2957 | /* | 2957 | /* |
2958 | * This allows us to do IO even on the odd last sectors | 2958 | * This allows us to do IO even on the odd last sectors |
2959 | * of a device, even if the bh block size is some multiple | 2959 | * of a device, even if the block size is some multiple |
2960 | * of the physical sector size. | 2960 | * of the physical sector size. |
2961 | * | 2961 | * |
2962 | * We'll just truncate the bio to the size of the device, | 2962 | * We'll just truncate the bio to the size of the device, |
@@ -2966,10 +2966,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) | |||
2966 | * errors, this only handles the "we need to be able to | 2966 | * errors, this only handles the "we need to be able to |
2967 | * do IO at the final sector" case. | 2967 | * do IO at the final sector" case. |
2968 | */ | 2968 | */ |
2969 | static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) | 2969 | void guard_bio_eod(int rw, struct bio *bio) |
2970 | { | 2970 | { |
2971 | sector_t maxsector; | 2971 | sector_t maxsector; |
2972 | unsigned bytes; | 2972 | struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; |
2973 | unsigned truncated_bytes; | ||
2973 | 2974 | ||
2974 | maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; | 2975 | maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; |
2975 | if (!maxsector) | 2976 | if (!maxsector) |
@@ -2984,23 +2985,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) | |||
2984 | return; | 2985 | return; |
2985 | 2986 | ||
2986 | maxsector -= bio->bi_iter.bi_sector; | 2987 | maxsector -= bio->bi_iter.bi_sector; |
2987 | bytes = bio->bi_iter.bi_size; | 2988 | if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) |
2988 | if (likely((bytes >> 9) <= maxsector)) | ||
2989 | return; | 2989 | return; |
2990 | 2990 | ||
2991 | /* Uhhuh. We've got a bh that straddles the device size! */ | 2991 | /* Uhhuh. We've got a bio that straddles the device size! */ |
2992 | bytes = maxsector << 9; | 2992 | truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); |
2993 | 2993 | ||
2994 | /* Truncate the bio.. */ | 2994 | /* Truncate the bio.. */ |
2995 | bio->bi_iter.bi_size = bytes; | 2995 | bio->bi_iter.bi_size -= truncated_bytes; |
2996 | bio->bi_io_vec[0].bv_len = bytes; | 2996 | bvec->bv_len -= truncated_bytes; |
2997 | 2997 | ||
2998 | /* ..and clear the end of the buffer for reads */ | 2998 | /* ..and clear the end of the buffer for reads */ |
2999 | if ((rw & RW_MASK) == READ) { | 2999 | if ((rw & RW_MASK) == READ) { |
3000 | void *kaddr = kmap_atomic(bh->b_page); | 3000 | zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, |
3001 | memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); | 3001 | truncated_bytes); |
3002 | kunmap_atomic(kaddr); | ||
3003 | flush_dcache_page(bh->b_page); | ||
3004 | } | 3002 | } |
3005 | } | 3003 | } |
3006 | 3004 | ||
@@ -3041,7 +3039,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) | |||
3041 | bio->bi_flags |= bio_flags; | 3039 | bio->bi_flags |= bio_flags; |
3042 | 3040 | ||
3043 | /* Take care of bh's that straddle the end of the device */ | 3041 | /* Take care of bh's that straddle the end of the device */ |
3044 | guard_bh_eod(rw, bio, bh); | 3042 | guard_bio_eod(rw, bio); |
3045 | 3043 | ||
3046 | if (buffer_meta(bh)) | 3044 | if (buffer_meta(bh)) |
3047 | rw |= REQ_META; | 3045 | rw |= REQ_META; |
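The rewritten helper trims however many bytes of the bio's last vector extend past the end of the device, rather than assuming a single-segment bio as the old buffer_head-based version did. The arithmetic is easy to check in isolation; a plain userspace illustration with made-up numbers (not kernel code):

```c
#include <stdio.h>

/* Mirror of the truncation math in guard_bio_eod(), 512-byte sectors. */
static unsigned truncated_bytes(unsigned long long maxsector,
				unsigned long long bi_sector,
				unsigned bi_size)
{
	maxsector -= bi_sector;			/* sectors left on the device */
	if ((bi_size >> 9) <= maxsector)
		return 0;			/* bio fits, nothing to trim */
	return bi_size - (unsigned)(maxsector << 9);
}

int main(void)
{
	/* 4 KiB bio starting 2 sectors before a 1000-sector device ends */
	unsigned t = truncated_bytes(1000, 998, 4096);

	printf("truncate %u bytes, submit %u\n", t, 4096 - t);	/* 3072 / 1024 */
	return 0;
}
```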
diff --git a/fs/internal.h b/fs/internal.h index e325b4f9c799..b2623200107b 100644 --- a/fs/internal.h +++ b/fs/internal.h | |||
@@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) | |||
35 | #endif | 35 | #endif |
36 | 36 | ||
37 | /* | 37 | /* |
38 | * buffer.c | ||
39 | */ | ||
40 | extern void guard_bio_eod(int rw, struct bio *bio); | ||
41 | |||
42 | /* | ||
38 | * char_dev.c | 43 | * char_dev.c |
39 | */ | 44 | */ |
40 | extern void __init chrdev_init(void); | 45 | extern void __init chrdev_init(void); |
diff --git a/fs/mpage.c b/fs/mpage.c index 5f9ed622274f..3e79220babac 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/backing-dev.h> | 28 | #include <linux/backing-dev.h> |
29 | #include <linux/pagevec.h> | 29 | #include <linux/pagevec.h> |
30 | #include <linux/cleancache.h> | 30 | #include <linux/cleancache.h> |
31 | #include "internal.h" | ||
31 | 32 | ||
32 | /* | 33 | /* |
33 | * I/O completion handler for multipage BIOs. | 34 | * I/O completion handler for multipage BIOs. |
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err) | |||
57 | static struct bio *mpage_bio_submit(int rw, struct bio *bio) | 58 | static struct bio *mpage_bio_submit(int rw, struct bio *bio) |
58 | { | 59 | { |
59 | bio->bi_end_io = mpage_end_io; | 60 | bio->bi_end_io = mpage_end_io; |
61 | guard_bio_eod(rw, bio); | ||
60 | submit_bio(rw, bio); | 62 | submit_bio(rw, bio); |
61 | return NULL; | 63 | return NULL; |
62 | } | 64 | } |
With mpage_bio_submit() now calling guard_bio_eod() before submit_bio(), the readpages path added to block devices above gets the same end-of-device protection as submit_bh().
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b13992a41bd9..c991616acca9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c | |||
@@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group, | |||
78 | 78 | ||
79 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); | 79 | pr_debug("%s: group=%p event=%p\n", __func__, group, event); |
80 | 80 | ||
81 | client_fd = get_unused_fd(); | 81 | client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); |
82 | if (client_fd < 0) | 82 | if (client_fd < 0) |
83 | return client_fd; | 83 | return client_fd; |
84 | 84 | ||
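Using get_unused_fd_flags() with the group's f_flags means the descriptor handed back honors O_CLOEXEC and similar flags requested at fanotify_init() time. The general allocate/install/unwind pattern looks like this; a sketch with a hypothetical file-creating helper, not fanotify's actual code:

```c
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/err.h>

/* hypothetical: publish a new struct file to userspace honoring f_flags */
static int my_create_fd(struct file *new_file, unsigned int f_flags)
{
	int fd;

	fd = get_unused_fd_flags(f_flags);	/* reserves a descriptor only */
	if (fd < 0)
		return fd;

	if (IS_ERR(new_file)) {
		put_unused_fd(fd);		/* release the reservation */
		return PTR_ERR(new_file);
	}

	fd_install(fd, new_file);		/* publish the file; no failure past here */
	return fd;
}
```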
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 85e7d2b431d9..9c0898c4cfe1 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h | |||
@@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, | |||
23 | struct fsnotify_group *group, struct vfsmount *mnt, | 23 | struct fsnotify_group *group, struct vfsmount *mnt, |
24 | int allow_dups); | 24 | int allow_dups); |
25 | 25 | ||
26 | /* final kfree of a group */ | ||
27 | extern void fsnotify_final_destroy_group(struct fsnotify_group *group); | ||
28 | |||
29 | /* vfsmount specific destruction of a mark */ | 26 | /* vfsmount specific destruction of a mark */ |
30 | extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); | 27 | extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); |
31 | /* inode specific destruction of a mark */ | 28 | /* inode specific destruction of a mark */ |
diff --git a/fs/notify/group.c b/fs/notify/group.c index ad1995980456..d16b62cb2854 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c | |||
@@ -31,7 +31,7 @@ | |||
31 | /* | 31 | /* |
32 | * Final freeing of a group | 32 | * Final freeing of a group |
33 | */ | 33 | */ |
34 | void fsnotify_final_destroy_group(struct fsnotify_group *group) | 34 | static void fsnotify_final_destroy_group(struct fsnotify_group *group) |
35 | { | 35 | { |
36 | if (group->ops->free_group_priv) | 36 | if (group->ops->free_group_priv) |
37 | group->ops->free_group_priv(group); | 37 | group->ops->free_group_priv(group); |
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 0f88bc0b4e6c..7d888d77d59a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
@@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group) | |||
165 | /* ideally the idr is empty and we won't hit the BUG in the callback */ | 165 | /* ideally the idr is empty and we won't hit the BUG in the callback */ |
166 | idr_for_each(&group->inotify_data.idr, idr_callback, group); | 166 | idr_for_each(&group->inotify_data.idr, idr_callback, group); |
167 | idr_destroy(&group->inotify_data.idr); | 167 | idr_destroy(&group->inotify_data.idr); |
168 | atomic_dec(&group->inotify_data.user->inotify_devs); | 168 | if (group->inotify_data.user) { |
169 | free_uid(group->inotify_data.user); | 169 | atomic_dec(&group->inotify_data.user->inotify_devs); |
170 | free_uid(group->inotify_data.user); | ||
171 | } | ||
170 | } | 172 | } |
171 | 173 | ||
172 | static void inotify_free_event(struct fsnotify_event *fsn_event) | 174 | static void inotify_free_event(struct fsnotify_event *fsn_event) |
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index dd6103cc93c1..825a54e8f490 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c | |||
@@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb, | |||
112 | /* If 1, output debug messages, and if 0, don't. */ | 112 | /* If 1, output debug messages, and if 0, don't. */ |
113 | int debug_msgs = 0; | 113 | int debug_msgs = 0; |
114 | 114 | ||
115 | void __ntfs_debug (const char *file, int line, const char *function, | 115 | void __ntfs_debug(const char *file, int line, const char *function, |
116 | const char *fmt, ...) | 116 | const char *fmt, ...) |
117 | { | 117 | { |
118 | struct va_format vaf; | 118 | struct va_format vaf; |
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f5ec1ce7a532..643faa44f22b 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. | 2 | * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. |
3 | * | 3 | * |
4 | * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. | 4 | * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. |
5 | * | 5 | * |
6 | * This program/include file is free software; you can redistribute it and/or | 6 | * This program/include file is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU General Public License as published | 7 | * modify it under the terms of the GNU General Public License as published |
@@ -410,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, | |||
410 | BUG_ON(!nr_pages); | 410 | BUG_ON(!nr_pages); |
411 | err = nr = 0; | 411 | err = nr = 0; |
412 | do { | 412 | do { |
413 | pages[nr] = find_lock_page(mapping, index); | 413 | pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | |
414 | FGP_ACCESSED); | ||
414 | if (!pages[nr]) { | 415 | if (!pages[nr]) { |
415 | if (!*cached_page) { | 416 | if (!*cached_page) { |
416 | *cached_page = page_cache_alloc(mapping); | 417 | *cached_page = page_cache_alloc(mapping); |
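find_get_page_flags() with FGP_LOCK | FGP_ACCESSED behaves like find_lock_page() but also marks the page accessed while the caller already holds it, avoiding a separate mark_page_accessed() later. A minimal lookup sketch, assuming a kernel with the FGP_* API (3.16+):

```c
#include <linux/pagemap.h>

/* Look up index in mapping, returning it locked and marked accessed. */
static struct page *lookup_locked_accessed(struct address_space *mapping,
					   pgoff_t index)
{
	struct page *page;

	page = find_get_page_flags(mapping, index, FGP_LOCK | FGP_ACCESSED);
	if (!page)
		return NULL;	/* not cached; caller may allocate and add one */

	/*
	 * page is locked and referenced here; the caller must eventually
	 * unlock_page(page) and page_cache_release(page).
	 */
	return page;
}
```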
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 6c3296e546c3..9e1e112074fb 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c | |||
@@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void) | |||
3208 | } | 3208 | } |
3209 | 3209 | ||
3210 | MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); | 3210 | MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); |
3211 | MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); | 3211 | MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); |
3212 | MODULE_VERSION(NTFS_VERSION); | 3212 | MODULE_VERSION(NTFS_VERSION); |
3213 | MODULE_LICENSE("GPL"); | 3213 | MODULE_LICENSE("GPL"); |
3214 | #ifdef DEBUG | 3214 | #ifdef DEBUG |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4a231a166cf8..1ef547e49373 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, | |||
1481 | handle_t *handle; | 1481 | handle_t *handle; |
1482 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; | 1482 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; |
1483 | 1483 | ||
1484 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | ||
1485 | if (IS_ERR(handle)) { | ||
1486 | ret = PTR_ERR(handle); | ||
1487 | mlog_errno(ret); | ||
1488 | goto out; | ||
1489 | } | ||
1490 | |||
1484 | page = find_or_create_page(mapping, 0, GFP_NOFS); | 1491 | page = find_or_create_page(mapping, 0, GFP_NOFS); |
1485 | if (!page) { | 1492 | if (!page) { |
1493 | ocfs2_commit_trans(osb, handle); | ||
1486 | ret = -ENOMEM; | 1494 | ret = -ENOMEM; |
1487 | mlog_errno(ret); | 1495 | mlog_errno(ret); |
1488 | goto out; | 1496 | goto out; |
@@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, | |||
1494 | wc->w_pages[0] = wc->w_target_page = page; | 1502 | wc->w_pages[0] = wc->w_target_page = page; |
1495 | wc->w_num_pages = 1; | 1503 | wc->w_num_pages = 1; |
1496 | 1504 | ||
1497 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | ||
1498 | if (IS_ERR(handle)) { | ||
1499 | ret = PTR_ERR(handle); | ||
1500 | mlog_errno(ret); | ||
1501 | goto out; | ||
1502 | } | ||
1503 | |||
1504 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, | 1505 | ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, |
1505 | OCFS2_JOURNAL_ACCESS_WRITE); | 1506 | OCFS2_JOURNAL_ACCESS_WRITE); |
1506 | if (ret) { | 1507 | if (ret) { |
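The reorder above establishes a consistent ordering: the journal handle is started before the page lock is taken, and every error path after the page allocation commits the handle. A reduced sketch of that ordering (hypothetical helper name, inside ocfs2 where ocfs2_start_trans() and OCFS2_INODE_UPDATE_CREDITS are available; the real write_begin keeps the handle open for the copy phase):

```c
/* Sketch: journal handle first, page lock second, unwind in reverse. */
static int write_begin_ordered(struct ocfs2_super *osb,
			       struct address_space *mapping)
{
	handle_t *handle;
	struct page *page;
	int ret = 0;

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	page = find_or_create_page(mapping, 0, GFP_NOFS);	/* locks the page */
	if (!page) {
		ret = -ENOMEM;
		goto out_commit;
	}

	/* ... journal_access + data copy under the page lock ... */

	unlock_page(page);
	page_cache_release(page);
out_commit:
	ocfs2_commit_trans(osb, handle);
	return ret;
}
```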
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 73039295d0d1..d13385448168 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num) | |||
2572 | } | 2572 | } |
2573 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); | 2573 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); |
2574 | 2574 | ||
2575 | int o2hb_check_node_heartbeating_no_sem(u8 node_num) | ||
2576 | { | ||
2577 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
2578 | unsigned long flags; | ||
2579 | |||
2580 | spin_lock_irqsave(&o2hb_live_lock, flags); | ||
2581 | o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); | ||
2582 | spin_unlock_irqrestore(&o2hb_live_lock, flags); | ||
2583 | if (!test_bit(node_num, testing_map)) { | ||
2584 | mlog(ML_HEARTBEAT, | ||
2585 | "node (%u) does not have heartbeating enabled.\n", | ||
2586 | node_num); | ||
2587 | return 0; | ||
2588 | } | ||
2589 | |||
2590 | return 1; | ||
2591 | } | ||
2592 | EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem); | ||
2593 | |||
2575 | int o2hb_check_node_heartbeating_from_callback(u8 node_num) | 2594 | int o2hb_check_node_heartbeating_from_callback(u8 node_num) |
2576 | { | 2595 | { |
2577 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 2596 | unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 00ad8e8fea51..3ef5137dc362 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h | |||
@@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map, | |||
80 | void o2hb_exit(void); | 80 | void o2hb_exit(void); |
81 | int o2hb_init(void); | 81 | int o2hb_init(void); |
82 | int o2hb_check_node_heartbeating(u8 node_num); | 82 | int o2hb_check_node_heartbeating(u8 node_num); |
83 | int o2hb_check_node_heartbeating_no_sem(u8 node_num); | ||
83 | int o2hb_check_node_heartbeating_from_callback(u8 node_num); | 84 | int o2hb_check_node_heartbeating_from_callback(u8 node_num); |
84 | int o2hb_check_local_node_heartbeating(void); | 85 | int o2hb_check_local_node_heartbeating(void); |
85 | void o2hb_stop_all_regions(void); | 86 | void o2hb_stop_all_regions(void); |
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 73ba81928bce..27d1242c8383 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c | |||
@@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = { | |||
185 | static int nst_fop_open(struct inode *inode, struct file *file) | 185 | static int nst_fop_open(struct inode *inode, struct file *file) |
186 | { | 186 | { |
187 | struct o2net_send_tracking *dummy_nst; | 187 | struct o2net_send_tracking *dummy_nst; |
188 | struct seq_file *seq; | ||
189 | int ret; | ||
190 | 188 | ||
191 | dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); | 189 | dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst)); |
192 | if (dummy_nst == NULL) { | 190 | if (!dummy_nst) |
193 | ret = -ENOMEM; | 191 | return -ENOMEM; |
194 | goto out; | ||
195 | } | ||
196 | dummy_nst->st_task = NULL; | ||
197 | |||
198 | ret = seq_open(file, &nst_seq_ops); | ||
199 | if (ret) | ||
200 | goto out; | ||
201 | |||
202 | seq = file->private_data; | ||
203 | seq->private = dummy_nst; | ||
204 | o2net_debug_add_nst(dummy_nst); | 192 | o2net_debug_add_nst(dummy_nst); |
205 | 193 | ||
206 | dummy_nst = NULL; | 194 | return 0; |
207 | |||
208 | out: | ||
209 | kfree(dummy_nst); | ||
210 | return ret; | ||
211 | } | 195 | } |
212 | 196 | ||
213 | static int nst_fop_release(struct inode *inode, struct file *file) | 197 | static int nst_fop_release(struct inode *inode, struct file *file) |
@@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = { | |||
412 | .show = sc_seq_show, | 396 | .show = sc_seq_show, |
413 | }; | 397 | }; |
414 | 398 | ||
415 | static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) | 399 | static int sc_common_open(struct file *file, int ctxt) |
416 | { | 400 | { |
401 | struct o2net_sock_debug *sd; | ||
417 | struct o2net_sock_container *dummy_sc; | 402 | struct o2net_sock_container *dummy_sc; |
418 | struct seq_file *seq; | ||
419 | int ret; | ||
420 | 403 | ||
421 | dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); | 404 | dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL); |
422 | if (dummy_sc == NULL) { | 405 | if (!dummy_sc) |
423 | ret = -ENOMEM; | 406 | return -ENOMEM; |
424 | goto out; | ||
425 | } | ||
426 | dummy_sc->sc_page = NULL; | ||
427 | 407 | ||
428 | ret = seq_open(file, &sc_seq_ops); | 408 | sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd)); |
429 | if (ret) | 409 | if (!sd) { |
430 | goto out; | 410 | kfree(dummy_sc); |
411 | return -ENOMEM; | ||
412 | } | ||
431 | 413 | ||
432 | seq = file->private_data; | 414 | sd->dbg_ctxt = ctxt; |
433 | seq->private = sd; | ||
434 | sd->dbg_sock = dummy_sc; | 415 | sd->dbg_sock = dummy_sc; |
435 | o2net_debug_add_sc(dummy_sc); | ||
436 | 416 | ||
437 | dummy_sc = NULL; | 417 | o2net_debug_add_sc(dummy_sc); |
438 | 418 | ||
439 | out: | 419 | return 0; |
440 | kfree(dummy_sc); | ||
441 | return ret; | ||
442 | } | 420 | } |
443 | 421 | ||
444 | static int sc_fop_release(struct inode *inode, struct file *file) | 422 | static int sc_fop_release(struct inode *inode, struct file *file) |
@@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) | |||
453 | 431 | ||
454 | static int stats_fop_open(struct inode *inode, struct file *file) | 432 | static int stats_fop_open(struct inode *inode, struct file *file) |
455 | { | 433 | { |
456 | struct o2net_sock_debug *sd; | 434 | return sc_common_open(file, SHOW_SOCK_STATS); |
457 | |||
458 | sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); | ||
459 | if (sd == NULL) | ||
460 | return -ENOMEM; | ||
461 | |||
462 | sd->dbg_ctxt = SHOW_SOCK_STATS; | ||
463 | sd->dbg_sock = NULL; | ||
464 | |||
465 | return sc_common_open(file, sd); | ||
466 | } | 435 | } |
467 | 436 | ||
468 | static const struct file_operations stats_seq_fops = { | 437 | static const struct file_operations stats_seq_fops = { |
@@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = { | |||
474 | 443 | ||
475 | static int sc_fop_open(struct inode *inode, struct file *file) | 444 | static int sc_fop_open(struct inode *inode, struct file *file) |
476 | { | 445 | { |
477 | struct o2net_sock_debug *sd; | 446 | return sc_common_open(file, SHOW_SOCK_CONTAINERS); |
478 | |||
479 | sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); | ||
480 | if (sd == NULL) | ||
481 | return -ENOMEM; | ||
482 | |||
483 | sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; | ||
484 | sd->dbg_sock = NULL; | ||
485 | |||
486 | return sc_common_open(file, sd); | ||
487 | } | 447 | } |
488 | 448 | ||
489 | static const struct file_operations sc_seq_fops = { | 449 | static const struct file_operations sc_seq_fops = { |
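__seq_open_private() collapses the kmalloc + seq_open + seq->private dance into one call: it allocates a zeroed private area of the requested size, opens the seq_file, and points seq->private at it; the matching teardown is seq_release_private(). A minimal sketch for a debugfs-style file with a hypothetical per-open cursor:

```c
#include <linux/seq_file.h>
#include <linux/fs.h>

struct my_iter {			/* hypothetical per-open cursor */
	unsigned long pos;
};

static const struct seq_operations my_seq_ops;	/* start/next/stop/show elsewhere */

static int my_open(struct inode *inode, struct file *file)
{
	struct my_iter *it;

	/* allocates sizeof(*it), zeroed, and wires it to seq->private */
	it = __seq_open_private(file, &my_seq_ops, sizeof(*it));
	if (!it)
		return -ENOMEM;

	it->pos = 0;
	return 0;
}

static const struct file_operations my_fops = {
	.open		= my_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_private,	/* frees seq->private too */
};
```

This is exactly the simplification applied to nst_fop_open() and sc_common_open() above, and again to the dlm and dlmglue debug files later in the series.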
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ea34952f9496..97de0fbd9f78 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
536 | if (nn->nn_persistent_error || nn->nn_sc_valid) | 536 | if (nn->nn_persistent_error || nn->nn_sc_valid) |
537 | wake_up(&nn->nn_sc_wq); | 537 | wake_up(&nn->nn_sc_wq); |
538 | 538 | ||
539 | if (!was_err && nn->nn_persistent_error) { | 539 | if (was_valid && !was_err && nn->nn_persistent_error) { |
540 | o2quo_conn_err(o2net_num_from_nn(nn)); | 540 | o2quo_conn_err(o2net_num_from_nn(nn)); |
541 | queue_delayed_work(o2net_wq, &nn->nn_still_up, | 541 | queue_delayed_work(o2net_wq, &nn->nn_still_up, |
542 | msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); | 542 | msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); |
@@ -1601,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work) | |||
1601 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; | 1601 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; |
1602 | int ret = 0, stop; | 1602 | int ret = 0, stop; |
1603 | unsigned int timeout; | 1603 | unsigned int timeout; |
1604 | unsigned int noio_flag; | ||
1604 | 1605 | ||
1606 | /* | ||
1607 | * sock_create allocates the sock with GFP_KERNEL. We must set | ||
1608 | * per-process flag PF_MEMALLOC_NOIO so that all allocations done | ||
1609 | * by this process are done as if GFP_NOIO was specified. So we | ||
1610 | * are not reentering filesystem while doing memory reclaim. | ||
1611 | */ | ||
1612 | noio_flag = memalloc_noio_save(); | ||
1605 | /* if we're greater we initiate tx, otherwise we accept */ | 1613 | /* if we're greater we initiate tx, otherwise we accept */ |
1606 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) | 1614 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) |
1607 | goto out; | 1615 | goto out; |
@@ -1710,6 +1718,7 @@ out: | |||
1710 | if (mynode) | 1718 | if (mynode) |
1711 | o2nm_node_put(mynode); | 1719 | o2nm_node_put(mynode); |
1712 | 1720 | ||
1721 | memalloc_noio_restore(noio_flag); | ||
1713 | return; | 1722 | return; |
1714 | } | 1723 | } |
1715 | 1724 | ||
@@ -1721,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work) | |||
1721 | spin_lock(&nn->nn_lock); | 1730 | spin_lock(&nn->nn_lock); |
1722 | if (!nn->nn_sc_valid) { | 1731 | if (!nn->nn_sc_valid) { |
1723 | printk(KERN_NOTICE "o2net: No connection established with " | 1732 | printk(KERN_NOTICE "o2net: No connection established with " |
1724 | "node %u after %u.%u seconds, giving up.\n", | 1733 | "node %u after %u.%u seconds, check network and" |
1734 | " cluster configuration.\n", | ||
1725 | o2net_num_from_nn(nn), | 1735 | o2net_num_from_nn(nn), |
1726 | o2net_idle_timeout() / 1000, | 1736 | o2net_idle_timeout() / 1000, |
1727 | o2net_idle_timeout() % 1000); | 1737 | o2net_idle_timeout() % 1000); |
@@ -1835,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more) | |||
1835 | struct o2nm_node *local_node = NULL; | 1845 | struct o2nm_node *local_node = NULL; |
1836 | struct o2net_sock_container *sc = NULL; | 1846 | struct o2net_sock_container *sc = NULL; |
1837 | struct o2net_node *nn; | 1847 | struct o2net_node *nn; |
1848 | unsigned int noio_flag; | ||
1849 | |||
1850 | /* | ||
1851 | * sock_create_lite allocates the sock with GFP_KERNEL. We must set | ||
1852 | * per-process flag PF_MEMALLOC_NOIO so that all allocations done | ||
1853 | * by this process are done as if GFP_NOIO was specified. So we | ||
1854 | * are not reentering filesystem while doing memory reclaim. | ||
1855 | */ | ||
1856 | noio_flag = memalloc_noio_save(); | ||
1838 | 1857 | ||
1839 | BUG_ON(sock == NULL); | 1858 | BUG_ON(sock == NULL); |
1840 | *more = 0; | 1859 | *more = 0; |
@@ -1951,6 +1970,8 @@ out: | |||
1951 | o2nm_node_put(local_node); | 1970 | o2nm_node_put(local_node); |
1952 | if (sc) | 1971 | if (sc) |
1953 | sc_put(sc); | 1972 | sc_put(sc); |
1973 | |||
1974 | memalloc_noio_restore(noio_flag); | ||
1954 | return ret; | 1975 | return ret; |
1955 | } | 1976 | } |
1956 | 1977 | ||
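sock_create() allocates with GFP_KERNEL, and doing that from the cluster stack's connect/accept paths risks recursing into the filesystem via reclaim; setting PF_MEMALLOC_NOIO makes every allocation in the bracketed section behave as GFP_NOIO. The scoped pattern, sketched around a hypothetical helper:

```c
#include <linux/sched.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/in.h>

/*
 * hypothetical: create a TCP socket from a context that must not trigger
 * I/O-issuing reclaim (e.g. a storage/cluster reconnect worker).
 */
static int create_sock_noio(struct socket **sockp)
{
	unsigned int noio_flag;
	int ret;

	noio_flag = memalloc_noio_save();	/* GFP_KERNEL now acts as GFP_NOIO */
	ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, sockp);
	memalloc_noio_restore(noio_flag);	/* restore on every exit path */

	return ret;
}
```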
@@ -2146,17 +2167,13 @@ int o2net_init(void) | |||
2146 | o2quo_init(); | 2167 | o2quo_init(); |
2147 | 2168 | ||
2148 | if (o2net_debugfs_init()) | 2169 | if (o2net_debugfs_init()) |
2149 | return -ENOMEM; | 2170 | goto out; |
2150 | 2171 | ||
2151 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); | 2172 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); |
2152 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2173 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
2153 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2174 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
2154 | if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { | 2175 | if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) |
2155 | kfree(o2net_hand); | 2176 | goto out; |
2156 | kfree(o2net_keep_req); | ||
2157 | kfree(o2net_keep_resp); | ||
2158 | return -ENOMEM; | ||
2159 | } | ||
2160 | 2177 | ||
2161 | o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); | 2178 | o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); |
2162 | o2net_hand->connector_id = cpu_to_be64(1); | 2179 | o2net_hand->connector_id = cpu_to_be64(1); |
@@ -2181,6 +2198,14 @@ int o2net_init(void) | |||
2181 | } | 2198 | } |
2182 | 2199 | ||
2183 | return 0; | 2200 | return 0; |
2201 | |||
2202 | out: | ||
2203 | kfree(o2net_hand); | ||
2204 | kfree(o2net_keep_req); | ||
2205 | kfree(o2net_keep_resp); | ||
2206 | |||
2207 | o2quo_exit(); | ||
2208 | return -ENOMEM; | ||
2184 | } | 2209 | } |
2185 | 2210 | ||
2186 | void o2net_exit(void) | 2211 | void o2net_exit(void) |
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 18f13c2e4a10..149eb556b8c6 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = { | |||
647 | static int debug_lockres_open(struct inode *inode, struct file *file) | 647 | static int debug_lockres_open(struct inode *inode, struct file *file) |
648 | { | 648 | { |
649 | struct dlm_ctxt *dlm = inode->i_private; | 649 | struct dlm_ctxt *dlm = inode->i_private; |
650 | int ret = -ENOMEM; | 650 | struct debug_lockres *dl; |
651 | struct seq_file *seq; | 651 | void *buf; |
652 | struct debug_lockres *dl = NULL; | ||
653 | 652 | ||
654 | dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); | 653 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
655 | if (!dl) { | 654 | if (!buf) |
656 | mlog_errno(ret); | ||
657 | goto bail; | 655 | goto bail; |
658 | } | ||
659 | 656 | ||
660 | dl->dl_len = PAGE_SIZE; | 657 | dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl)); |
661 | dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); | 658 | if (!dl) |
662 | if (!dl->dl_buf) { | 659 | goto bailfree; |
663 | mlog_errno(ret); | ||
664 | goto bail; | ||
665 | } | ||
666 | 660 | ||
667 | ret = seq_open(file, &debug_lockres_ops); | 661 | dl->dl_len = PAGE_SIZE; |
668 | if (ret) { | 662 | dl->dl_buf = buf; |
669 | mlog_errno(ret); | ||
670 | goto bail; | ||
671 | } | ||
672 | |||
673 | seq = file->private_data; | ||
674 | seq->private = dl; | ||
675 | 663 | ||
676 | dlm_grab(dlm); | 664 | dlm_grab(dlm); |
677 | dl->dl_ctxt = dlm; | 665 | dl->dl_ctxt = dlm; |
678 | 666 | ||
679 | return 0; | 667 | return 0; |
668 | |||
669 | bailfree: | ||
670 | kfree(buf); | ||
680 | bail: | 671 | bail: |
681 | if (dl) | 672 | mlog_errno(-ENOMEM); |
682 | kfree(dl->dl_buf); | 673 | return -ENOMEM; |
683 | kfree(dl); | ||
684 | return ret; | ||
685 | } | 674 | } |
686 | 675 | ||
687 | static int debug_lockres_release(struct inode *inode, struct file *file) | 676 | static int debug_lockres_release(struct inode *inode, struct file *file) |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 3fcf205ee900..02d315fef432 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, | |||
839 | * to back off and try again. This gives heartbeat a chance | 839 | * to back off and try again. This gives heartbeat a chance |
840 | * to catch up. | 840 | * to catch up. |
841 | */ | 841 | */ |
842 | if (!o2hb_check_node_heartbeating(query->node_idx)) { | 842 | if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) { |
843 | mlog(0, "node %u is not in our live map yet\n", | 843 | mlog(0, "node %u is not in our live map yet\n", |
844 | query->node_idx); | 844 | query->node_idx); |
845 | 845 | ||
@@ -1975,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
1975 | 1975 | ||
1976 | dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); | 1976 | dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); |
1977 | if (!dlm) { | 1977 | if (!dlm) { |
1978 | mlog_errno(-ENOMEM); | 1978 | ret = -ENOMEM; |
1979 | mlog_errno(ret); | ||
1979 | goto leave; | 1980 | goto leave; |
1980 | } | 1981 | } |
1981 | 1982 | ||
1982 | dlm->name = kstrdup(domain, GFP_KERNEL); | 1983 | dlm->name = kstrdup(domain, GFP_KERNEL); |
1983 | if (dlm->name == NULL) { | 1984 | if (dlm->name == NULL) { |
1984 | mlog_errno(-ENOMEM); | 1985 | ret = -ENOMEM; |
1985 | kfree(dlm); | 1986 | mlog_errno(ret); |
1986 | dlm = NULL; | ||
1987 | goto leave; | 1987 | goto leave; |
1988 | } | 1988 | } |
1989 | 1989 | ||
1990 | dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); | 1990 | dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); |
1991 | if (!dlm->lockres_hash) { | 1991 | if (!dlm->lockres_hash) { |
1992 | mlog_errno(-ENOMEM); | 1992 | ret = -ENOMEM; |
1993 | kfree(dlm->name); | 1993 | mlog_errno(ret); |
1994 | kfree(dlm); | ||
1995 | dlm = NULL; | ||
1996 | goto leave; | 1994 | goto leave; |
1997 | } | 1995 | } |
1998 | 1996 | ||
@@ -2002,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
2002 | dlm->master_hash = (struct hlist_head **) | 2000 | dlm->master_hash = (struct hlist_head **) |
2003 | dlm_alloc_pagevec(DLM_HASH_PAGES); | 2001 | dlm_alloc_pagevec(DLM_HASH_PAGES); |
2004 | if (!dlm->master_hash) { | 2002 | if (!dlm->master_hash) { |
2005 | mlog_errno(-ENOMEM); | 2003 | ret = -ENOMEM; |
2006 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); | 2004 | mlog_errno(ret); |
2007 | kfree(dlm->name); | ||
2008 | kfree(dlm); | ||
2009 | dlm = NULL; | ||
2010 | goto leave; | 2005 | goto leave; |
2011 | } | 2006 | } |
2012 | 2007 | ||
@@ -2017,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
2017 | dlm->node_num = o2nm_this_node(); | 2012 | dlm->node_num = o2nm_this_node(); |
2018 | 2013 | ||
2019 | ret = dlm_create_debugfs_subroot(dlm); | 2014 | ret = dlm_create_debugfs_subroot(dlm); |
2020 | if (ret < 0) { | 2015 | if (ret < 0) |
2021 | dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); | ||
2022 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); | ||
2023 | kfree(dlm->name); | ||
2024 | kfree(dlm); | ||
2025 | dlm = NULL; | ||
2026 | goto leave; | 2016 | goto leave; |
2027 | } | ||
2028 | 2017 | ||
2029 | spin_lock_init(&dlm->spinlock); | 2018 | spin_lock_init(&dlm->spinlock); |
2030 | spin_lock_init(&dlm->master_lock); | 2019 | spin_lock_init(&dlm->master_lock); |
@@ -2085,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
2085 | atomic_read(&dlm->dlm_refs.refcount)); | 2074 | atomic_read(&dlm->dlm_refs.refcount)); |
2086 | 2075 | ||
2087 | leave: | 2076 | leave: |
2077 | if (ret < 0 && dlm) { | ||
2078 | if (dlm->master_hash) | ||
2079 | dlm_free_pagevec((void **)dlm->master_hash, | ||
2080 | DLM_HASH_PAGES); | ||
2081 | |||
2082 | if (dlm->lockres_hash) | ||
2083 | dlm_free_pagevec((void **)dlm->lockres_hash, | ||
2084 | DLM_HASH_PAGES); | ||
2085 | |||
2086 | kfree(dlm->name); | ||
2087 | kfree(dlm); | ||
2088 | dlm = NULL; | ||
2089 | } | ||
2088 | return dlm; | 2090 | return dlm; |
2089 | } | 2091 | } |
2090 | 2092 | ||
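Rather than duplicating the partial teardown at every failure point, dlm_alloc_ctxt() now jumps to the single leave: label, which frees whatever was actually allocated; kzalloc plus the NULL checks (and kfree(NULL) being a no-op) make that safe. The shape of the pattern, with hypothetical resources:

```c
#include <linux/slab.h>
#include <linux/string.h>

struct my_ctxt {			/* hypothetical context with two buffers */
	char *name;
	void *table;
};

static struct my_ctxt *my_ctxt_alloc(const char *name)
{
	struct my_ctxt *c;
	int ret = 0;

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (!c) {
		ret = -ENOMEM;
		goto leave;
	}

	c->name = kstrdup(name, GFP_KERNEL);
	if (!c->name) {
		ret = -ENOMEM;
		goto leave;
	}

	c->table = kmalloc(256, GFP_KERNEL);
	if (!c->table) {
		ret = -ENOMEM;
		goto leave;
	}

leave:
	if (ret < 0 && c) {		/* single unwind point: free what exists */
		kfree(c->table);	/* kfree(NULL) is a no-op */
		kfree(c->name);
		kfree(c);
		c = NULL;
	}
	return c;
}
```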
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 12ba682fc53c..215e41abf101 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | |||
625 | return res; | 625 | return res; |
626 | 626 | ||
627 | error: | 627 | error: |
628 | if (res && res->lockname.name) | ||
629 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); | ||
630 | |||
631 | if (res) | 628 | if (res) |
632 | kmem_cache_free(dlm_lockres_cache, res); | 629 | kmem_cache_free(dlm_lockres_cache, res); |
633 | return NULL; | 630 | return NULL; |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 45067faf5695..3365839d2971 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, | |||
1710 | BUG(); | 1710 | BUG(); |
1711 | } else | 1711 | } else |
1712 | __dlm_lockres_grab_inflight_worker(dlm, res); | 1712 | __dlm_lockres_grab_inflight_worker(dlm, res); |
1713 | } else /* put.. incase we are not the master */ | 1713 | spin_unlock(&res->spinlock); |
1714 | } else { | ||
1715 | /* put.. in case we are not the master */ | ||
1716 | spin_unlock(&res->spinlock); | ||
1714 | dlm_lockres_put(res); | 1717 | dlm_lockres_put(res); |
1715 | spin_unlock(&res->spinlock); | 1718 | } |
1716 | } | 1719 | } |
1717 | spin_unlock(&dlm->spinlock); | 1720 | spin_unlock(&dlm->spinlock); |
1718 | 1721 | ||
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 52cfe99ae056..21262f2b1654 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) | |||
2892 | 2892 | ||
2893 | static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) | 2893 | static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) |
2894 | { | 2894 | { |
2895 | int ret; | ||
2896 | struct ocfs2_dlm_seq_priv *priv; | 2895 | struct ocfs2_dlm_seq_priv *priv; |
2897 | struct seq_file *seq; | ||
2898 | struct ocfs2_super *osb; | 2896 | struct ocfs2_super *osb; |
2899 | 2897 | ||
2900 | priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); | 2898 | priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv)); |
2901 | if (!priv) { | 2899 | if (!priv) { |
2902 | ret = -ENOMEM; | 2900 | mlog_errno(-ENOMEM); |
2903 | mlog_errno(ret); | 2901 | return -ENOMEM; |
2904 | goto out; | ||
2905 | } | 2902 | } |
2903 | |||
2906 | osb = inode->i_private; | 2904 | osb = inode->i_private; |
2907 | ocfs2_get_dlm_debug(osb->osb_dlm_debug); | 2905 | ocfs2_get_dlm_debug(osb->osb_dlm_debug); |
2908 | priv->p_dlm_debug = osb->osb_dlm_debug; | 2906 | priv->p_dlm_debug = osb->osb_dlm_debug; |
2909 | INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); | 2907 | INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); |
2910 | 2908 | ||
2911 | ret = seq_open(file, &ocfs2_dlm_seq_ops); | ||
2912 | if (ret) { | ||
2913 | kfree(priv); | ||
2914 | mlog_errno(ret); | ||
2915 | goto out; | ||
2916 | } | ||
2917 | |||
2918 | seq = file->private_data; | ||
2919 | seq->private = priv; | ||
2920 | |||
2921 | ocfs2_add_lockres_tracking(&priv->p_iter_res, | 2909 | ocfs2_add_lockres_tracking(&priv->p_iter_res, |
2922 | priv->p_dlm_debug); | 2910 | priv->p_dlm_debug); |
2923 | 2911 | ||
2924 | out: | 2912 | return 0; |
2925 | return ret; | ||
2926 | } | 2913 | } |
2927 | 2914 | ||
2928 | static const struct file_operations ocfs2_dlm_debug_fops = { | 2915 | static const struct file_operations ocfs2_dlm_debug_fops = { |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2930e231f3f9..682732f3f0d8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, | |||
760 | struct address_space *mapping = inode->i_mapping; | 760 | struct address_space *mapping = inode->i_mapping; |
761 | struct page *page; | 761 | struct page *page; |
762 | unsigned long index = abs_from >> PAGE_CACHE_SHIFT; | 762 | unsigned long index = abs_from >> PAGE_CACHE_SHIFT; |
763 | handle_t *handle = NULL; | 763 | handle_t *handle; |
764 | int ret = 0; | 764 | int ret = 0; |
765 | unsigned zero_from, zero_to, block_start, block_end; | 765 | unsigned zero_from, zero_to, block_start, block_end; |
766 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; | 766 | struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; |
@@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, | |||
769 | BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); | 769 | BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); |
770 | BUG_ON(abs_from & (inode->i_blkbits - 1)); | 770 | BUG_ON(abs_from & (inode->i_blkbits - 1)); |
771 | 771 | ||
772 | handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); | ||
773 | if (IS_ERR(handle)) { | ||
774 | ret = PTR_ERR(handle); | ||
775 | goto out; | ||
776 | } | ||
777 | |||
772 | page = find_or_create_page(mapping, index, GFP_NOFS); | 778 | page = find_or_create_page(mapping, index, GFP_NOFS); |
773 | if (!page) { | 779 | if (!page) { |
774 | ret = -ENOMEM; | 780 | ret = -ENOMEM; |
775 | mlog_errno(ret); | 781 | mlog_errno(ret); |
776 | goto out; | 782 | goto out_commit_trans; |
777 | } | 783 | } |
778 | 784 | ||
779 | /* Get the offsets within the page that we want to zero */ | 785 | /* Get the offsets within the page that we want to zero */ |
@@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, | |||
805 | goto out_unlock; | 811 | goto out_unlock; |
806 | } | 812 | } |
807 | 813 | ||
808 | if (!handle) { | ||
809 | handle = ocfs2_zero_start_ordered_transaction(inode, | ||
810 | di_bh); | ||
811 | if (IS_ERR(handle)) { | ||
812 | ret = PTR_ERR(handle); | ||
813 | handle = NULL; | ||
814 | break; | ||
815 | } | ||
816 | } | ||
817 | 814 | ||
818 | /* must not update i_size! */ | 815 | /* must not update i_size! */ |
819 | ret = block_commit_write(page, block_start + 1, | 816 | ret = block_commit_write(page, block_start + 1, |
@@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, | |||
824 | ret = 0; | 821 | ret = 0; |
825 | } | 822 | } |
826 | 823 | ||
824 | /* | ||
825 | * fs-writeback will release the dirty pages without page lock | ||
826 | * whose offset are over inode size, the release happens at | ||
827 | * block_write_full_page(). | ||
828 | */ | ||
829 | i_size_write(inode, abs_to); | ||
830 | inode->i_blocks = ocfs2_inode_sector_count(inode); | ||
831 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
832 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
833 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
834 | di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
835 | di->i_mtime_nsec = di->i_ctime_nsec; | ||
827 | if (handle) { | 836 | if (handle) { |
828 | /* | ||
829 | * fs-writeback will release the dirty pages without page lock | ||
830 | * whose offset are over inode size, the release happens at | ||
831 | * block_write_full_page(). | ||
832 | */ | ||
833 | i_size_write(inode, abs_to); | ||
834 | inode->i_blocks = ocfs2_inode_sector_count(inode); | ||
835 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
836 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
837 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
838 | di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
839 | di->i_mtime_nsec = di->i_ctime_nsec; | ||
840 | ocfs2_journal_dirty(handle, di_bh); | 837 | ocfs2_journal_dirty(handle, di_bh); |
841 | ocfs2_update_inode_fsync_trans(handle, inode, 1); | 838 | ocfs2_update_inode_fsync_trans(handle, inode, 1); |
842 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
843 | } | 839 | } |
844 | 840 | ||
845 | out_unlock: | 841 | out_unlock: |
846 | unlock_page(page); | 842 | unlock_page(page); |
847 | page_cache_release(page); | 843 | page_cache_release(page); |
844 | out_commit_trans: | ||
845 | if (handle) | ||
846 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
848 | out: | 847 | out: |
849 | return ret; | 848 | return ret; |
850 | } | 849 | } |
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index a6c991c0fc98..a9b76de46047 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h | |||
@@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) | |||
162 | { | 162 | { |
163 | int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; | 163 | int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; |
164 | 164 | ||
165 | return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); | 165 | return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits; |
166 | } | 166 | } |
167 | 167 | ||
168 | /* Validate that a bh contains a valid inode */ | 168 | /* Validate that a bh contains a valid inode */ |
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 6219aaadeb08..74caffeeee1d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c | |||
@@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, | |||
404 | * 'vict_blkno' was out of the valid range. | 404 | * 'vict_blkno' was out of the valid range. |
405 | */ | 405 | */ |
406 | if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || | 406 | if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || |
407 | (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << | 407 | (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << |
408 | bits_per_unit))) { | 408 | bits_per_unit))) { |
409 | ret = -EINVAL; | 409 | ret = -EINVAL; |
410 | goto out; | 410 | goto out; |
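Both the ocfs2_inode_sector_count() and ocfs2_find_victim_alloc_group() hunks fix the same class of bug: the left operand of << is a 32-bit value, so the shift happens in 32 bits and the high bits are lost before the result reaches the wider destination; casting the operand first makes the whole shift 64-bit. A userspace illustration:

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t clusters = 0x00300000;	/* ~3M clusters */
	int shift = 11;			/* e.g. 1 MiB clusters, 512-byte sectors */

	uint64_t wrong = (uint64_t)(clusters << shift);	/* shifted in 32 bits: truncates */
	uint64_t right = (uint64_t)clusters << shift;	/* widened first: correct */

	printf("wrong=0x%llx right=0x%llx\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}
```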
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 13a8537d8e8b..720aa389e0ea 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c | |||
@@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file) | |||
591 | */ | 591 | */ |
592 | ocfs2_control_this_node = -1; | 592 | ocfs2_control_this_node = -1; |
593 | running_proto.pv_major = 0; | 593 | running_proto.pv_major = 0; |
594 | running_proto.pv_major = 0; | 594 | running_proto.pv_minor = 0; |
595 | } | 595 | } |
596 | 596 | ||
597 | out: | 597 | out: |
diff --git a/fs/proc/base.c b/fs/proc/base.c index baf852b648ad..4c542b907754 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -632,29 +632,35 @@ static const struct file_operations proc_single_file_operations = { | |||
632 | .release = single_release, | 632 | .release = single_release, |
633 | }; | 633 | }; |
634 | 634 | ||
635 | static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) | 635 | |
636 | struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) | ||
636 | { | 637 | { |
637 | struct task_struct *task = get_proc_task(file_inode(file)); | 638 | struct task_struct *task = get_proc_task(inode); |
638 | struct mm_struct *mm; | 639 | struct mm_struct *mm = ERR_PTR(-ESRCH); |
639 | 640 | ||
640 | if (!task) | 641 | if (task) { |
641 | return -ESRCH; | 642 | mm = mm_access(task, mode); |
643 | put_task_struct(task); | ||
642 | 644 | ||
643 | mm = mm_access(task, mode); | 645 | if (!IS_ERR_OR_NULL(mm)) { |
644 | put_task_struct(task); | 646 | /* ensure this mm_struct can't be freed */ |
647 | atomic_inc(&mm->mm_count); | ||
648 | /* but do not pin its memory */ | ||
649 | mmput(mm); | ||
650 | } | ||
651 | } | ||
652 | |||
653 | return mm; | ||
654 | } | ||
655 | |||
656 | static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) | ||
657 | { | ||
658 | struct mm_struct *mm = proc_mem_open(inode, mode); | ||
645 | 659 | ||
646 | if (IS_ERR(mm)) | 660 | if (IS_ERR(mm)) |
647 | return PTR_ERR(mm); | 661 | return PTR_ERR(mm); |
648 | 662 | ||
649 | if (mm) { | ||
650 | /* ensure this mm_struct can't be freed */ | ||
651 | atomic_inc(&mm->mm_count); | ||
652 | /* but do not pin its memory */ | ||
653 | mmput(mm); | ||
654 | } | ||
655 | |||
656 | file->private_data = mm; | 663 | file->private_data = mm; |
657 | |||
658 | return 0; | 664 | return 0; |
659 | } | 665 | } |
660 | 666 | ||
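proc_mem_open() pins the mm_struct itself (mm_count) without pinning its address space (mm_users), so holding the proc file open cannot keep a dead task's memory alive; code that later needs to walk the mm takes a temporary mm_users reference with atomic_inc_not_zero(). A sketch of that consumer side, with a hypothetical walker name:

```c
#include <linux/sched.h>
#include <linux/mm_types.h>

/*
 * hypothetical walker over an mm obtained proc_mem_open()-style: the caller
 * holds mm_count (the struct won't be freed), but the address space may
 * already be torn down, so mm_users must be raised for the actual walk.
 */
static int walk_mm_if_alive(struct mm_struct *mm)
{
	if (!mm || !atomic_inc_not_zero(&mm->mm_users))
		return -ESRCH;			/* task already exited */

	down_read(&mm->mmap_sem);
	/* ... iterate mm->mmap ... */
	up_read(&mm->mmap_sem);

	mmput(mm);				/* drop the temporary mm_users ref */
	return 0;
}
```

This is the same check the task_mmu.c rewrite below performs in m_start() before taking mmap_sem.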
diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7da13e49128a..aa7a0ee182e1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h | |||
@@ -268,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *); | |||
268 | * task_[no]mmu.c | 268 | * task_[no]mmu.c |
269 | */ | 269 | */ |
270 | struct proc_maps_private { | 270 | struct proc_maps_private { |
271 | struct pid *pid; | 271 | struct inode *inode; |
272 | struct task_struct *task; | 272 | struct task_struct *task; |
273 | struct mm_struct *mm; | ||
273 | #ifdef CONFIG_MMU | 274 | #ifdef CONFIG_MMU |
274 | struct vm_area_struct *tail_vma; | 275 | struct vm_area_struct *tail_vma; |
275 | #endif | 276 | #endif |
@@ -278,6 +279,8 @@ struct proc_maps_private { | |||
278 | #endif | 279 | #endif |
279 | }; | 280 | }; |
280 | 281 | ||
282 | struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); | ||
283 | |||
281 | extern const struct file_operations proc_pid_maps_operations; | 284 | extern const struct file_operations proc_pid_maps_operations; |
282 | extern const struct file_operations proc_tid_maps_operations; | 285 | extern const struct file_operations proc_tid_maps_operations; |
283 | extern const struct file_operations proc_pid_numa_maps_operations; | 286 | extern const struct file_operations proc_pid_numa_maps_operations; |
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6df8d0722c97..91a4e6426321 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c | |||
@@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void) | |||
610 | struct kcore_list kcore_modules; | 610 | struct kcore_list kcore_modules; |
611 | static void __init add_modules_range(void) | 611 | static void __init add_modules_range(void) |
612 | { | 612 | { |
613 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, | 613 | if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { |
614 | kclist_add(&kcore_modules, (void *)MODULES_VADDR, | ||
614 | MODULES_END - MODULES_VADDR, KCORE_VMALLOC); | 615 | MODULES_END - MODULES_VADDR, KCORE_VMALLOC); |
616 | } | ||
615 | } | 617 | } |
616 | #else | 618 | #else |
617 | static void __init add_modules_range(void) | 619 | static void __init add_modules_range(void) |
diff --git a/fs/proc/page.c b/fs/proc/page.c index e647c55275d9..1e3187da1fed 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c | |||
@@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page) | |||
133 | if (PageBuddy(page)) | 133 | if (PageBuddy(page)) |
134 | u |= 1 << KPF_BUDDY; | 134 | u |= 1 << KPF_BUDDY; |
135 | 135 | ||
136 | if (PageBalloon(page)) | ||
137 | u |= 1 << KPF_BALLOON; | ||
138 | |||
136 | u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); | 139 | u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); |
137 | 140 | ||
138 | u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); | 141 | u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); |
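With the new flag exported, page-inspection tools can identify balloon-inflated pages. /proc/kpageflags is one 64-bit flag word per PFN; a small userspace reader sketch (KPF_BALLOON is bit 23 in this series, treat the constant as an assumption if backporting, and root is required to read the file):

```c
#include <stdio.h>
#include <stdint.h>

#define KPF_BALLOON 23		/* bit index added for balloon pages */

/* Count balloon pages among the first n PFNs via /proc/kpageflags. */
int main(void)
{
	FILE *f = fopen("/proc/kpageflags", "rb");
	uint64_t flags;
	unsigned long pfn, balloon = 0, n = 1UL << 20;

	if (!f)
		return 1;
	for (pfn = 0; pfn < n && fread(&flags, sizeof(flags), 1, f) == 1; pfn++)
		if (flags & (1ULL << KPF_BALLOON))
			balloon++;
	fclose(f);
	printf("%lu balloon pages in first %lu pfns\n", balloon, n);
	return 0;
}
```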
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c34156888d70..b7a7dc963a35 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm, | |||
87 | 87 | ||
88 | #ifdef CONFIG_NUMA | 88 | #ifdef CONFIG_NUMA |
89 | /* | 89 | /* |
90 | * These functions are for numa_maps but called in generic **maps seq_file | 90 | * Save get_task_policy() for show_numa_map(). |
91 | * ->start(), ->stop() ops. | ||
92 | * | ||
93 | * numa_maps scans all vmas under mmap_sem and checks their mempolicy. | ||
94 | * Each mempolicy object is controlled by reference counting. The problem here | ||
95 | * is how to avoid accessing dead mempolicy object. | ||
96 | * | ||
97 | * Because we're holding mmap_sem while reading seq_file, it's safe to access | ||
98 | * each vma's mempolicy, no vma objects will never drop refs to mempolicy. | ||
99 | * | ||
100 | * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy | ||
101 | * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). | ||
102 | * So, without task_lock(), we cannot trust get_vma_policy() because we cannot | ||
103 | * gurantee the task never exits under us. But taking task_lock() around | ||
104 | * get_vma_plicy() causes lock order problem. | ||
105 | * | ||
106 | * To access task->mempolicy without lock, we hold a reference count of an | ||
107 | * object pointed by task->mempolicy and remember it. This will guarantee | ||
108 | * that task->mempolicy points to an alive object or NULL in numa_maps accesses. | ||
109 | */ | 91 | */ |
110 | static void hold_task_mempolicy(struct proc_maps_private *priv) | 92 | static void hold_task_mempolicy(struct proc_maps_private *priv) |
111 | { | 93 | { |
112 | struct task_struct *task = priv->task; | 94 | struct task_struct *task = priv->task; |
113 | 95 | ||
114 | task_lock(task); | 96 | task_lock(task); |
115 | priv->task_mempolicy = task->mempolicy; | 97 | priv->task_mempolicy = get_task_policy(task); |
116 | mpol_get(priv->task_mempolicy); | 98 | mpol_get(priv->task_mempolicy); |
117 | task_unlock(task); | 99 | task_unlock(task); |
118 | } | 100 | } |
@@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv) | |||
129 | } | 111 | } |
130 | #endif | 112 | #endif |
131 | 113 | ||
132 | static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) | 114 | static void vma_stop(struct proc_maps_private *priv) |
133 | { | 115 | { |
134 | if (vma && vma != priv->tail_vma) { | 116 | struct mm_struct *mm = priv->mm; |
135 | struct mm_struct *mm = vma->vm_mm; | 117 | |
136 | release_task_mempolicy(priv); | 118 | release_task_mempolicy(priv); |
137 | up_read(&mm->mmap_sem); | 119 | up_read(&mm->mmap_sem); |
138 | mmput(mm); | 120 | mmput(mm); |
139 | } | 121 | } |
122 | |||
123 | static struct vm_area_struct * | ||
124 | m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) | ||
125 | { | ||
126 | if (vma == priv->tail_vma) | ||
127 | return NULL; | ||
128 | return vma->vm_next ?: priv->tail_vma; | ||
129 | } | ||
130 | |||
131 | static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) | ||
132 | { | ||
133 | if (m->count < m->size) /* vma is copied successfully */ | ||
134 | m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL; | ||
140 | } | 135 | } |
141 | 136 | ||
142 | static void *m_start(struct seq_file *m, loff_t *pos) | 137 | static void *m_start(struct seq_file *m, loff_t *ppos) |
143 | { | 138 | { |
144 | struct proc_maps_private *priv = m->private; | 139 | struct proc_maps_private *priv = m->private; |
145 | unsigned long last_addr = m->version; | 140 | unsigned long last_addr = m->version; |
146 | struct mm_struct *mm; | 141 | struct mm_struct *mm; |
147 | struct vm_area_struct *vma, *tail_vma = NULL; | 142 | struct vm_area_struct *vma; |
148 | loff_t l = *pos; | 143 | unsigned int pos = *ppos; |
149 | |||
150 | /* Clear the per syscall fields in priv */ | ||
151 | priv->task = NULL; | ||
152 | priv->tail_vma = NULL; | ||
153 | |||
154 | /* | ||
155 | * We remember last_addr rather than next_addr to hit with | ||
156 | * vmacache most of the time. We have zero last_addr at | ||
157 | * the beginning and also after lseek. We will have -1 last_addr | ||
158 | * after the end of the vmas. | ||
159 | */ | ||
160 | 144 | ||
145 | /* See m_cache_vma(). Zero at the start or after lseek. */ | ||
161 | if (last_addr == -1UL) | 146 | if (last_addr == -1UL) |
162 | return NULL; | 147 | return NULL; |
163 | 148 | ||
164 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); | 149 | priv->task = get_proc_task(priv->inode); |
165 | if (!priv->task) | 150 | if (!priv->task) |
166 | return ERR_PTR(-ESRCH); | 151 | return ERR_PTR(-ESRCH); |
167 | 152 | ||
168 | mm = mm_access(priv->task, PTRACE_MODE_READ); | 153 | mm = priv->mm; |
169 | if (!mm || IS_ERR(mm)) | 154 | if (!mm || !atomic_inc_not_zero(&mm->mm_users)) |
170 | return mm; | 155 | return NULL; |
171 | down_read(&mm->mmap_sem); | ||
172 | 156 | ||
173 | tail_vma = get_gate_vma(priv->task->mm); | 157 | down_read(&mm->mmap_sem); |
174 | priv->tail_vma = tail_vma; | ||
175 | hold_task_mempolicy(priv); | 158 | hold_task_mempolicy(priv); |
176 | /* Start with last addr hint */ | 159 | priv->tail_vma = get_gate_vma(mm); |
177 | vma = find_vma(mm, last_addr); | 160 | |
178 | if (last_addr && vma) { | 161 | if (last_addr) { |
179 | vma = vma->vm_next; | 162 | vma = find_vma(mm, last_addr); |
180 | goto out; | 163 | if (vma && (vma = m_next_vma(priv, vma))) |
164 | return vma; | ||
181 | } | 165 | } |
182 | 166 | ||
183 | /* | 167 | m->version = 0; |
184 | * Check the vma index is within the range and do | 168 | if (pos < mm->map_count) { |
185 | * sequential scan until m_index. | 169 | for (vma = mm->mmap; pos; pos--) { |
186 | */ | 170 | m->version = vma->vm_start; |
187 | vma = NULL; | ||
188 | if ((unsigned long)l < mm->map_count) { | ||
189 | vma = mm->mmap; | ||
190 | while (l-- && vma) | ||
191 | vma = vma->vm_next; | 171 | vma = vma->vm_next; |
192 | goto out; | 172 | } |
173 | return vma; | ||
193 | } | 174 | } |
194 | 175 | ||
195 | if (l != mm->map_count) | 176 | /* we do not bother to update m->version in this case */ |
196 | tail_vma = NULL; /* After gate vma */ | 177 | if (pos == mm->map_count && priv->tail_vma) |
197 | 178 | return priv->tail_vma; | |
198 | out: | ||
199 | if (vma) | ||
200 | return vma; | ||
201 | 179 | ||
202 | release_task_mempolicy(priv); | 180 | vma_stop(priv); |
203 | /* End of vmas has been reached */ | 181 | return NULL; |
204 | m->version = (tail_vma != NULL)? 0: -1UL; | ||
205 | up_read(&mm->mmap_sem); | ||
206 | mmput(mm); | ||
207 | return tail_vma; | ||
208 | } | 182 | } |
209 | 183 | ||
210 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) | 184 | static void *m_next(struct seq_file *m, void *v, loff_t *pos) |
211 | { | 185 | { |
212 | struct proc_maps_private *priv = m->private; | 186 | struct proc_maps_private *priv = m->private; |
213 | struct vm_area_struct *vma = v; | 187 | struct vm_area_struct *next; |
214 | struct vm_area_struct *tail_vma = priv->tail_vma; | ||
215 | 188 | ||
216 | (*pos)++; | 189 | (*pos)++; |
217 | if (vma && (vma != tail_vma) && vma->vm_next) | 190 | next = m_next_vma(priv, v); |
218 | return vma->vm_next; | 191 | if (!next) |
219 | vma_stop(priv, vma); | 192 | vma_stop(priv); |
220 | return (vma != tail_vma)? tail_vma: NULL; | 193 | return next; |
221 | } | 194 | } |
222 | 195 | ||
223 | static void m_stop(struct seq_file *m, void *v) | 196 | static void m_stop(struct seq_file *m, void *v) |
224 | { | 197 | { |
225 | struct proc_maps_private *priv = m->private; | 198 | struct proc_maps_private *priv = m->private; |
226 | struct vm_area_struct *vma = v; | ||
227 | 199 | ||
228 | if (!IS_ERR(vma)) | 200 | if (!IS_ERR_OR_NULL(v)) |
229 | vma_stop(priv, vma); | 201 | vma_stop(priv); |
230 | if (priv->task) | 202 | if (priv->task) { |
231 | put_task_struct(priv->task); | 203 | put_task_struct(priv->task); |
204 | priv->task = NULL; | ||
205 | } | ||
206 | } | ||
207 | |||
208 | static int proc_maps_open(struct inode *inode, struct file *file, | ||
209 | const struct seq_operations *ops, int psize) | ||
210 | { | ||
211 | struct proc_maps_private *priv = __seq_open_private(file, ops, psize); | ||
212 | |||
213 | if (!priv) | ||
214 | return -ENOMEM; | ||
215 | |||
216 | priv->inode = inode; | ||
217 | priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); | ||
218 | if (IS_ERR(priv->mm)) { | ||
219 | int err = PTR_ERR(priv->mm); | ||
220 | |||
221 | seq_release_private(inode, file); | ||
222 | return err; | ||
223 | } | ||
224 | |||
225 | return 0; | ||
226 | } | ||
227 | |||
228 | static int proc_map_release(struct inode *inode, struct file *file) | ||
229 | { | ||
230 | struct seq_file *seq = file->private_data; | ||
231 | struct proc_maps_private *priv = seq->private; | ||
232 | |||
233 | if (priv->mm) | ||
234 | mmdrop(priv->mm); | ||
235 | |||
236 | return seq_release_private(inode, file); | ||
232 | } | 237 | } |
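
proc_maps_open() now pins the mm structure itself for the lifetime of the open file (dropped with mmdrop() in proc_map_release()), while each read pins the address space via atomic_inc_not_zero(&mm->mm_users) in m_start() and releases it in vma_stop(). Here is a hedged user-space mock of that split lifetime; struct mock_mm and the open/start/stop/release helpers are invented stand-ins, not the kernel types.

/* Illustrative only: a "structure reference at open, user reference per read"
 * split lifetime, mirroring mm_count vs. mm_users. */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct mock_mm {
        int mm_count;    /* keeps the struct itself alive (mmgrab/mmdrop) */
        int mm_users;    /* keeps the address space alive (mmget/mmput)   */
};

static struct mock_mm *open_handle(void)           /* proc_maps_open() analogue */
{
        struct mock_mm *mm = calloc(1, sizeof(*mm));
        mm->mm_count = 1;                           /* held until release()     */
        mm->mm_users = 1;                           /* the "task" itself        */
        return mm;
}

static bool start_read(struct mock_mm *mm)          /* m_start() analogue */
{
        if (mm->mm_users == 0)                      /* atomic_inc_not_zero()    */
                return false;                       /* task already exited      */
        mm->mm_users++;
        return true;
}

static void stop_read(struct mock_mm *mm)           /* vma_stop() analogue */
{
        mm->mm_users--;
}

static void release_handle(struct mock_mm *mm)      /* proc_map_release() analogue */
{
        if (--mm->mm_count == 0)
                free(mm);
}

int main(void)
{
        struct mock_mm *mm = open_handle();

        if (start_read(mm)) {
                printf("walking maps...\n");
                stop_read(mm);
        }
        release_handle(mm);
        return 0;
}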
233 | 238 | ||
234 | static int do_maps_open(struct inode *inode, struct file *file, | 239 | static int do_maps_open(struct inode *inode, struct file *file, |
235 | const struct seq_operations *ops) | 240 | const struct seq_operations *ops) |
236 | { | 241 | { |
237 | struct proc_maps_private *priv; | 242 | return proc_maps_open(inode, file, ops, |
238 | int ret = -ENOMEM; | 243 | sizeof(struct proc_maps_private)); |
239 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 244 | } |
240 | if (priv) { | 245 | |
241 | priv->pid = proc_pid(inode); | 246 | static pid_t pid_of_stack(struct proc_maps_private *priv, |
242 | ret = seq_open(file, ops); | 247 | struct vm_area_struct *vma, bool is_pid) |
243 | if (!ret) { | 248 | { |
244 | struct seq_file *m = file->private_data; | 249 | struct inode *inode = priv->inode; |
245 | m->private = priv; | 250 | struct task_struct *task; |
246 | } else { | 251 | pid_t ret = 0; |
247 | kfree(priv); | 252 | |
248 | } | 253 | rcu_read_lock(); |
254 | task = pid_task(proc_pid(inode), PIDTYPE_PID); | ||
255 | if (task) { | ||
256 | task = task_of_stack(task, vma, is_pid); | ||
257 | if (task) | ||
258 | ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); | ||
249 | } | 259 | } |
260 | rcu_read_unlock(); | ||
261 | |||
250 | return ret; | 262 | return ret; |
251 | } | 263 | } |
252 | 264 | ||
@@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) | |||
256 | struct mm_struct *mm = vma->vm_mm; | 268 | struct mm_struct *mm = vma->vm_mm; |
257 | struct file *file = vma->vm_file; | 269 | struct file *file = vma->vm_file; |
258 | struct proc_maps_private *priv = m->private; | 270 | struct proc_maps_private *priv = m->private; |
259 | struct task_struct *task = priv->task; | ||
260 | vm_flags_t flags = vma->vm_flags; | 271 | vm_flags_t flags = vma->vm_flags; |
261 | unsigned long ino = 0; | 272 | unsigned long ino = 0; |
262 | unsigned long long pgoff = 0; | 273 | unsigned long long pgoff = 0; |
@@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) | |||
321 | goto done; | 332 | goto done; |
322 | } | 333 | } |
323 | 334 | ||
324 | tid = vm_is_stack(task, vma, is_pid); | 335 | tid = pid_of_stack(priv, vma, is_pid); |
325 | |||
326 | if (tid != 0) { | 336 | if (tid != 0) { |
327 | /* | 337 | /* |
328 | * Thread stack in /proc/PID/task/TID/maps or | 338 | * Thread stack in /proc/PID/task/TID/maps or |
@@ -349,15 +359,8 @@ done: | |||
349 | 359 | ||
350 | static int show_map(struct seq_file *m, void *v, int is_pid) | 360 | static int show_map(struct seq_file *m, void *v, int is_pid) |
351 | { | 361 | { |
352 | struct vm_area_struct *vma = v; | 362 | show_map_vma(m, v, is_pid); |
353 | struct proc_maps_private *priv = m->private; | 363 | m_cache_vma(m, v); |
354 | struct task_struct *task = priv->task; | ||
355 | |||
356 | show_map_vma(m, vma, is_pid); | ||
357 | |||
358 | if (m->count < m->size) /* vma is copied successfully */ | ||
359 | m->version = (vma != get_gate_vma(task->mm)) | ||
360 | ? vma->vm_start : 0; | ||
361 | return 0; | 364 | return 0; |
362 | } | 365 | } |
363 | 366 | ||
@@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = { | |||
399 | .open = pid_maps_open, | 402 | .open = pid_maps_open, |
400 | .read = seq_read, | 403 | .read = seq_read, |
401 | .llseek = seq_lseek, | 404 | .llseek = seq_lseek, |
402 | .release = seq_release_private, | 405 | .release = proc_map_release, |
403 | }; | 406 | }; |
404 | 407 | ||
405 | const struct file_operations proc_tid_maps_operations = { | 408 | const struct file_operations proc_tid_maps_operations = { |
406 | .open = tid_maps_open, | 409 | .open = tid_maps_open, |
407 | .read = seq_read, | 410 | .read = seq_read, |
408 | .llseek = seq_lseek, | 411 | .llseek = seq_lseek, |
409 | .release = seq_release_private, | 412 | .release = proc_map_release, |
410 | }; | 413 | }; |
411 | 414 | ||
412 | /* | 415 | /* |
@@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) | |||
583 | 586 | ||
584 | static int show_smap(struct seq_file *m, void *v, int is_pid) | 587 | static int show_smap(struct seq_file *m, void *v, int is_pid) |
585 | { | 588 | { |
586 | struct proc_maps_private *priv = m->private; | ||
587 | struct task_struct *task = priv->task; | ||
588 | struct vm_area_struct *vma = v; | 589 | struct vm_area_struct *vma = v; |
589 | struct mem_size_stats mss; | 590 | struct mem_size_stats mss; |
590 | struct mm_walk smaps_walk = { | 591 | struct mm_walk smaps_walk = { |
@@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) | |||
637 | mss.nonlinear >> 10); | 638 | mss.nonlinear >> 10); |
638 | 639 | ||
639 | show_smap_vma_flags(m, vma); | 640 | show_smap_vma_flags(m, vma); |
640 | 641 | m_cache_vma(m, vma); | |
641 | if (m->count < m->size) /* vma is copied successfully */ | ||
642 | m->version = (vma != get_gate_vma(task->mm)) | ||
643 | ? vma->vm_start : 0; | ||
644 | return 0; | 642 | return 0; |
645 | } | 643 | } |
646 | 644 | ||
@@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = { | |||
682 | .open = pid_smaps_open, | 680 | .open = pid_smaps_open, |
683 | .read = seq_read, | 681 | .read = seq_read, |
684 | .llseek = seq_lseek, | 682 | .llseek = seq_lseek, |
685 | .release = seq_release_private, | 683 | .release = proc_map_release, |
686 | }; | 684 | }; |
687 | 685 | ||
688 | const struct file_operations proc_tid_smaps_operations = { | 686 | const struct file_operations proc_tid_smaps_operations = { |
689 | .open = tid_smaps_open, | 687 | .open = tid_smaps_open, |
690 | .read = seq_read, | 688 | .read = seq_read, |
691 | .llseek = seq_lseek, | 689 | .llseek = seq_lseek, |
692 | .release = seq_release_private, | 690 | .release = proc_map_release, |
693 | }; | 691 | }; |
694 | 692 | ||
695 | /* | 693 | /* |
@@ -1029,7 +1027,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1029 | spinlock_t *ptl; | 1027 | spinlock_t *ptl; |
1030 | pte_t *pte; | 1028 | pte_t *pte; |
1031 | int err = 0; | 1029 | int err = 0; |
1032 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); | ||
1033 | 1030 | ||
1034 | /* find the first VMA at or above 'addr' */ | 1031 | /* find the first VMA at or above 'addr' */ |
1035 | vma = find_vma(walk->mm, addr); | 1032 | vma = find_vma(walk->mm, addr); |
@@ -1043,6 +1040,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1043 | 1040 | ||
1044 | for (; addr != end; addr += PAGE_SIZE) { | 1041 | for (; addr != end; addr += PAGE_SIZE) { |
1045 | unsigned long offset; | 1042 | unsigned long offset; |
1043 | pagemap_entry_t pme; | ||
1046 | 1044 | ||
1047 | offset = (addr & ~PAGEMAP_WALK_MASK) >> | 1045 | offset = (addr & ~PAGEMAP_WALK_MASK) >> |
1048 | PAGE_SHIFT; | 1046 | PAGE_SHIFT; |
@@ -1057,32 +1055,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
1057 | 1055 | ||
1058 | if (pmd_trans_unstable(pmd)) | 1056 | if (pmd_trans_unstable(pmd)) |
1059 | return 0; | 1057 | return 0; |
1060 | for (; addr != end; addr += PAGE_SIZE) { | 1058 | |
1061 | int flags2; | 1059 | while (1) { |
1062 | 1060 | /* End of address space hole, which we mark as non-present. */ | |
1063 | /* check to see if we've left 'vma' behind | 1061 | unsigned long hole_end; |
1064 | * and need a new, higher one */ | 1062 | |
1065 | if (vma && (addr >= vma->vm_end)) { | 1063 | if (vma) |
1066 | vma = find_vma(walk->mm, addr); | 1064 | hole_end = min(end, vma->vm_start); |
1067 | if (vma && (vma->vm_flags & VM_SOFTDIRTY)) | 1065 | else |
1068 | flags2 = __PM_SOFT_DIRTY; | 1066 | hole_end = end; |
1069 | else | 1067 | |
1070 | flags2 = 0; | 1068 | for (; addr < hole_end; addr += PAGE_SIZE) { |
1071 | pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); | 1069 | pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); |
1070 | |||
1071 | err = add_to_pagemap(addr, &pme, pm); | ||
1072 | if (err) | ||
1073 | return err; | ||
1072 | } | 1074 | } |
1073 | 1075 | ||
1074 | /* check that 'vma' actually covers this address, | 1076 | if (!vma || vma->vm_start >= end) |
1075 | * and that it isn't a huge page vma */ | 1077 | break; |
1076 | if (vma && (vma->vm_start <= addr) && | 1078 | /* |
1077 | !is_vm_hugetlb_page(vma)) { | 1079 | * We can't possibly be in a hugetlb VMA. In general, |
1080 | * for a mm_walk with a pmd_entry and a hugetlb_entry, | ||
1081 | * the pmd_entry can only be called on addresses in a | ||
1082 | * hugetlb if the walk starts in a non-hugetlb VMA and | ||
1083 | * spans a hugepage VMA. Since pagemap_read walks are | ||
1084 | * PMD-sized and PMD-aligned, this will never be true. | ||
1085 | */ | ||
1086 | BUG_ON(is_vm_hugetlb_page(vma)); | ||
1087 | |||
1088 | /* Addresses in the VMA. */ | ||
1089 | for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { | ||
1090 | pagemap_entry_t pme; | ||
1078 | pte = pte_offset_map(pmd, addr); | 1091 | pte = pte_offset_map(pmd, addr); |
1079 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); | 1092 | pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); |
1080 | /* unmap before userspace copy */ | ||
1081 | pte_unmap(pte); | 1093 | pte_unmap(pte); |
1094 | err = add_to_pagemap(addr, &pme, pm); | ||
1095 | if (err) | ||
1096 | return err; | ||
1082 | } | 1097 | } |
1083 | err = add_to_pagemap(addr, &pme, pm); | 1098 | |
1084 | if (err) | 1099 | if (addr == end) |
1085 | return err; | 1100 | break; |
1101 | |||
1102 | vma = find_vma(walk->mm, addr); | ||
1086 | } | 1103 | } |
1087 | 1104 | ||
1088 | cond_resched(); | 1105 | cond_resched(); |
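
The rewritten loop alternates between padding the hole below the next VMA with not-present entries and emitting one entry per page inside the VMA, instead of re-resolving the VMA on every page. A simplified, runnable sketch of that walk over a sorted range list follows; the range table, the find_vma() stand-in and the HOLE/PRESENT output format are invented for illustration.

/* Illustrative only: walk [addr, end) in page steps, emitting HOLE for gaps
 * and PRESENT for pages covered by sorted, non-overlapping ranges. */
#include <stdio.h>

#define PAGE 0x1000UL

struct range { unsigned long start, end; };

static const struct range vmas[] = { { 0x2000, 0x4000 }, { 0x6000, 0x7000 } };
#define NR_VMAS (sizeof(vmas) / sizeof(vmas[0]))

static const struct range *find_vma(unsigned long addr)
{
        for (unsigned int i = 0; i < NR_VMAS; i++)
                if (addr < vmas[i].end)
                        return &vmas[i];
        return NULL;
}

int main(void)
{
        unsigned long addr = 0x1000, end = 0x8000;
        const struct range *vma = find_vma(addr);

        while (1) {
                /* Hole before the next range (or up to 'end' if none). */
                unsigned long hole_end =
                        vma ? (vma->start < end ? vma->start : end) : end;

                for (; addr < hole_end; addr += PAGE)
                        printf("%#lx HOLE\n", addr);

                if (!vma || vma->start >= end)
                        break;

                /* Pages covered by the range itself. */
                unsigned long stop = vma->end < end ? vma->end : end;

                for (; addr < stop; addr += PAGE)
                        printf("%#lx PRESENT\n", addr);

                if (addr == end)
                        break;
                vma = find_vma(addr);
        }
        return 0;
}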
@@ -1415,7 +1432,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1415 | struct vm_area_struct *vma = v; | 1432 | struct vm_area_struct *vma = v; |
1416 | struct numa_maps *md = &numa_priv->md; | 1433 | struct numa_maps *md = &numa_priv->md; |
1417 | struct file *file = vma->vm_file; | 1434 | struct file *file = vma->vm_file; |
1418 | struct task_struct *task = proc_priv->task; | ||
1419 | struct mm_struct *mm = vma->vm_mm; | 1435 | struct mm_struct *mm = vma->vm_mm; |
1420 | struct mm_walk walk = {}; | 1436 | struct mm_walk walk = {}; |
1421 | struct mempolicy *pol; | 1437 | struct mempolicy *pol; |
@@ -1435,9 +1451,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1435 | walk.private = md; | 1451 | walk.private = md; |
1436 | walk.mm = mm; | 1452 | walk.mm = mm; |
1437 | 1453 | ||
1438 | pol = get_vma_policy(task, vma, vma->vm_start); | 1454 | pol = __get_vma_policy(vma, vma->vm_start); |
1439 | mpol_to_str(buffer, sizeof(buffer), pol); | 1455 | if (pol) { |
1440 | mpol_cond_put(pol); | 1456 | mpol_to_str(buffer, sizeof(buffer), pol); |
1457 | mpol_cond_put(pol); | ||
1458 | } else { | ||
1459 | mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); | ||
1460 | } | ||
1441 | 1461 | ||
1442 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 1462 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
1443 | 1463 | ||
@@ -1447,7 +1467,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1447 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { | 1467 | } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { |
1448 | seq_puts(m, " heap"); | 1468 | seq_puts(m, " heap"); |
1449 | } else { | 1469 | } else { |
1450 | pid_t tid = vm_is_stack(task, vma, is_pid); | 1470 | pid_t tid = pid_of_stack(proc_priv, vma, is_pid); |
1451 | if (tid != 0) { | 1471 | if (tid != 0) { |
1452 | /* | 1472 | /* |
1453 | * Thread stack in /proc/PID/task/TID/maps or | 1473 | * Thread stack in /proc/PID/task/TID/maps or |
@@ -1495,9 +1515,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1495 | seq_printf(m, " N%d=%lu", nid, md->node[nid]); | 1515 | seq_printf(m, " N%d=%lu", nid, md->node[nid]); |
1496 | out: | 1516 | out: |
1497 | seq_putc(m, '\n'); | 1517 | seq_putc(m, '\n'); |
1498 | 1518 | m_cache_vma(m, vma); | |
1499 | if (m->count < m->size) | ||
1500 | m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; | ||
1501 | return 0; | 1519 | return 0; |
1502 | } | 1520 | } |
1503 | 1521 | ||
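
show_numa_map() now asks __get_vma_policy() for a VMA-specific policy and only falls back to the task policy pinned earlier by hold_task_mempolicy() when the VMA has none. A tiny sketch of that fallback with mocked types; the policy names and helpers are invented for illustration.

/* Illustrative only: per-VMA policy if present, otherwise the pinned task policy. */
#include <stdio.h>

struct mempolicy { const char *name; };

static struct mempolicy task_policy = { "default" };
static struct mempolicy vma_policy  = { "bind:0" };

/* __get_vma_policy() analogue: may legitimately return NULL. */
static struct mempolicy *get_vma_policy(int vma_has_policy)
{
        return vma_has_policy ? &vma_policy : NULL;
}

static const char *policy_for_vma(int vma_has_policy)
{
        struct mempolicy *pol = get_vma_policy(vma_has_policy);

        /* Fall back to the task policy held for the whole walk. */
        return pol ? pol->name : task_policy.name;
}

int main(void)
{
        printf("%s\n", policy_for_vma(1));   /* bind:0  */
        printf("%s\n", policy_for_vma(0));   /* default */
        return 0;
}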
@@ -1528,20 +1546,8 @@ static const struct seq_operations proc_tid_numa_maps_op = { | |||
1528 | static int numa_maps_open(struct inode *inode, struct file *file, | 1546 | static int numa_maps_open(struct inode *inode, struct file *file, |
1529 | const struct seq_operations *ops) | 1547 | const struct seq_operations *ops) |
1530 | { | 1548 | { |
1531 | struct numa_maps_private *priv; | 1549 | return proc_maps_open(inode, file, ops, |
1532 | int ret = -ENOMEM; | 1550 | sizeof(struct numa_maps_private)); |
1533 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | ||
1534 | if (priv) { | ||
1535 | priv->proc_maps.pid = proc_pid(inode); | ||
1536 | ret = seq_open(file, ops); | ||
1537 | if (!ret) { | ||
1538 | struct seq_file *m = file->private_data; | ||
1539 | m->private = priv; | ||
1540 | } else { | ||
1541 | kfree(priv); | ||
1542 | } | ||
1543 | } | ||
1544 | return ret; | ||
1545 | } | 1551 | } |
1546 | 1552 | ||
1547 | static int pid_numa_maps_open(struct inode *inode, struct file *file) | 1553 | static int pid_numa_maps_open(struct inode *inode, struct file *file) |
@@ -1558,13 +1564,13 @@ const struct file_operations proc_pid_numa_maps_operations = { | |||
1558 | .open = pid_numa_maps_open, | 1564 | .open = pid_numa_maps_open, |
1559 | .read = seq_read, | 1565 | .read = seq_read, |
1560 | .llseek = seq_lseek, | 1566 | .llseek = seq_lseek, |
1561 | .release = seq_release_private, | 1567 | .release = proc_map_release, |
1562 | }; | 1568 | }; |
1563 | 1569 | ||
1564 | const struct file_operations proc_tid_numa_maps_operations = { | 1570 | const struct file_operations proc_tid_numa_maps_operations = { |
1565 | .open = tid_numa_maps_open, | 1571 | .open = tid_numa_maps_open, |
1566 | .read = seq_read, | 1572 | .read = seq_read, |
1567 | .llseek = seq_lseek, | 1573 | .llseek = seq_lseek, |
1568 | .release = seq_release_private, | 1574 | .release = proc_map_release, |
1569 | }; | 1575 | }; |
1570 | #endif /* CONFIG_NUMA */ | 1576 | #endif /* CONFIG_NUMA */ |
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 678455d2d683..599ec2e20104 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
@@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm, | |||
123 | return size; | 123 | return size; |
124 | } | 124 | } |
125 | 125 | ||
126 | static pid_t pid_of_stack(struct proc_maps_private *priv, | ||
127 | struct vm_area_struct *vma, bool is_pid) | ||
128 | { | ||
129 | struct inode *inode = priv->inode; | ||
130 | struct task_struct *task; | ||
131 | pid_t ret = 0; | ||
132 | |||
133 | rcu_read_lock(); | ||
134 | task = pid_task(proc_pid(inode), PIDTYPE_PID); | ||
135 | if (task) { | ||
136 | task = task_of_stack(task, vma, is_pid); | ||
137 | if (task) | ||
138 | ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); | ||
139 | } | ||
140 | rcu_read_unlock(); | ||
141 | |||
142 | return ret; | ||
143 | } | ||
144 | |||
126 | /* | 145 | /* |
127 | * display a single VMA to a sequenced file | 146 | * display a single VMA to a sequenced file |
128 | */ | 147 | */ |
@@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, | |||
163 | seq_pad(m, ' '); | 182 | seq_pad(m, ' '); |
164 | seq_path(m, &file->f_path, ""); | 183 | seq_path(m, &file->f_path, ""); |
165 | } else if (mm) { | 184 | } else if (mm) { |
166 | pid_t tid = vm_is_stack(priv->task, vma, is_pid); | 185 | pid_t tid = pid_of_stack(priv, vma, is_pid); |
167 | 186 | ||
168 | if (tid != 0) { | 187 | if (tid != 0) { |
169 | seq_pad(m, ' '); | 188 | seq_pad(m, ' '); |
@@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos) | |||
212 | loff_t n = *pos; | 231 | loff_t n = *pos; |
213 | 232 | ||
214 | /* pin the task and mm whilst we play with them */ | 233 | /* pin the task and mm whilst we play with them */ |
215 | priv->task = get_pid_task(priv->pid, PIDTYPE_PID); | 234 | priv->task = get_proc_task(priv->inode); |
216 | if (!priv->task) | 235 | if (!priv->task) |
217 | return ERR_PTR(-ESRCH); | 236 | return ERR_PTR(-ESRCH); |
218 | 237 | ||
219 | mm = mm_access(priv->task, PTRACE_MODE_READ); | 238 | mm = priv->mm; |
220 | if (!mm || IS_ERR(mm)) { | 239 | if (!mm || !atomic_inc_not_zero(&mm->mm_users)) |
221 | put_task_struct(priv->task); | 240 | return NULL; |
222 | priv->task = NULL; | ||
223 | return mm; | ||
224 | } | ||
225 | down_read(&mm->mmap_sem); | ||
226 | 241 | ||
242 | down_read(&mm->mmap_sem); | ||
227 | /* start from the Nth VMA */ | 243 | /* start from the Nth VMA */ |
228 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) | 244 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) |
229 | if (n-- == 0) | 245 | if (n-- == 0) |
230 | return p; | 246 | return p; |
247 | |||
248 | up_read(&mm->mmap_sem); | ||
249 | mmput(mm); | ||
231 | return NULL; | 250 | return NULL; |
232 | } | 251 | } |
233 | 252 | ||
@@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml) | |||
235 | { | 254 | { |
236 | struct proc_maps_private *priv = m->private; | 255 | struct proc_maps_private *priv = m->private; |
237 | 256 | ||
257 | if (!IS_ERR_OR_NULL(_vml)) { | ||
258 | up_read(&priv->mm->mmap_sem); | ||
259 | mmput(priv->mm); | ||
260 | } | ||
238 | if (priv->task) { | 261 | if (priv->task) { |
239 | struct mm_struct *mm = priv->task->mm; | ||
240 | up_read(&mm->mmap_sem); | ||
241 | mmput(mm); | ||
242 | put_task_struct(priv->task); | 262 | put_task_struct(priv->task); |
263 | priv->task = NULL; | ||
243 | } | 264 | } |
244 | } | 265 | } |
245 | 266 | ||
@@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file, | |||
269 | const struct seq_operations *ops) | 290 | const struct seq_operations *ops) |
270 | { | 291 | { |
271 | struct proc_maps_private *priv; | 292 | struct proc_maps_private *priv; |
272 | int ret = -ENOMEM; | 293 | |
273 | 294 | priv = __seq_open_private(file, ops, sizeof(*priv)); | |
274 | priv = kzalloc(sizeof(*priv), GFP_KERNEL); | 295 | if (!priv) |
275 | if (priv) { | 296 | return -ENOMEM; |
276 | priv->pid = proc_pid(inode); | 297 | |
277 | ret = seq_open(file, ops); | 298 | priv->inode = inode; |
278 | if (!ret) { | 299 | priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); |
279 | struct seq_file *m = file->private_data; | 300 | if (IS_ERR(priv->mm)) { |
280 | m->private = priv; | 301 | int err = PTR_ERR(priv->mm); |
281 | } else { | 302 | |
282 | kfree(priv); | 303 | seq_release_private(inode, file); |
283 | } | 304 | return err; |
284 | } | 305 | } |
285 | return ret; | 306 | |
307 | return 0; | ||
308 | } | ||
309 | |||
310 | |||
311 | static int map_release(struct inode *inode, struct file *file) | ||
312 | { | ||
313 | struct seq_file *seq = file->private_data; | ||
314 | struct proc_maps_private *priv = seq->private; | ||
315 | |||
316 | if (priv->mm) | ||
317 | mmdrop(priv->mm); | ||
318 | |||
319 | return seq_release_private(inode, file); | ||
286 | } | 320 | } |
287 | 321 | ||
288 | static int pid_maps_open(struct inode *inode, struct file *file) | 322 | static int pid_maps_open(struct inode *inode, struct file *file) |
@@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = { | |||
299 | .open = pid_maps_open, | 333 | .open = pid_maps_open, |
300 | .read = seq_read, | 334 | .read = seq_read, |
301 | .llseek = seq_lseek, | 335 | .llseek = seq_lseek, |
302 | .release = seq_release_private, | 336 | .release = map_release, |
303 | }; | 337 | }; |
304 | 338 | ||
305 | const struct file_operations proc_tid_maps_operations = { | 339 | const struct file_operations proc_tid_maps_operations = { |
306 | .open = tid_maps_open, | 340 | .open = tid_maps_open, |
307 | .read = seq_read, | 341 | .read = seq_read, |
308 | .llseek = seq_lseek, | 342 | .llseek = seq_lseek, |
309 | .release = seq_release_private, | 343 | .release = map_release, |
310 | }; | 344 | }; |
311 | 345 | ||
diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h index de8bf89940f8..a9fd248f5d48 100644 --- a/include/asm-generic/dma-mapping-common.h +++ b/include/asm-generic/dma-mapping-common.h | |||
@@ -179,6 +179,15 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, | |||
179 | extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, | 179 | extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, |
180 | void *cpu_addr, dma_addr_t dma_addr, size_t size); | 180 | void *cpu_addr, dma_addr_t dma_addr, size_t size); |
181 | 181 | ||
182 | void *dma_common_contiguous_remap(struct page *page, size_t size, | ||
183 | unsigned long vm_flags, | ||
184 | pgprot_t prot, const void *caller); | ||
185 | |||
186 | void *dma_common_pages_remap(struct page **pages, size_t size, | ||
187 | unsigned long vm_flags, pgprot_t prot, | ||
188 | const void *caller); | ||
189 | void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); | ||
190 | |||
182 | /** | 191 | /** |
183 | * dma_mmap_attrs - map a coherent DMA allocation into user space | 192 | * dma_mmap_attrs - map a coherent DMA allocation into user space |
184 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices | 193 | * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices |
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 977e545a64c3..081ff8826bf6 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -664,11 +664,12 @@ static inline int pmd_trans_unstable(pmd_t *pmd) | |||
664 | } | 664 | } |
665 | 665 | ||
666 | #ifdef CONFIG_NUMA_BALANCING | 666 | #ifdef CONFIG_NUMA_BALANCING |
667 | #ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE | ||
668 | /* | 667 | /* |
669 | * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the | 668 | * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that |
670 | * same bit too). It's set only when _PAGE_PRESET is not set and it's | 669 | * is protected for PROT_NONE and a NUMA hinting fault entry. If the |
671 | * never set if _PAGE_PRESENT is set. | 670 | * architecture defines __PAGE_PROTNONE then it should take that into account |
671 | * but those that do not can rely on the fact that the NUMA hinting scanner | ||
672 | * skips inaccessible VMAs. | ||
672 | * | 673 | * |
673 | * pte/pmd_present() returns true if pte/pmd_numa returns true. Page | 674 | * pte/pmd_present() returns true if pte/pmd_numa returns true. Page |
674 | * fault triggers on those regions if pte/pmd_numa returns true | 675 | * fault triggers on those regions if pte/pmd_numa returns true |
@@ -677,16 +678,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd) | |||
677 | #ifndef pte_numa | 678 | #ifndef pte_numa |
678 | static inline int pte_numa(pte_t pte) | 679 | static inline int pte_numa(pte_t pte) |
679 | { | 680 | { |
680 | return (pte_flags(pte) & | 681 | return ptenuma_flags(pte) == _PAGE_NUMA; |
681 | (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; | ||
682 | } | 682 | } |
683 | #endif | 683 | #endif |
684 | 684 | ||
685 | #ifndef pmd_numa | 685 | #ifndef pmd_numa |
686 | static inline int pmd_numa(pmd_t pmd) | 686 | static inline int pmd_numa(pmd_t pmd) |
687 | { | 687 | { |
688 | return (pmd_flags(pmd) & | 688 | return pmdnuma_flags(pmd) == _PAGE_NUMA; |
689 | (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; | ||
690 | } | 689 | } |
691 | #endif | 690 | #endif |
692 | 691 | ||
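
The new helpers reduce the NUMA test to comparing the masked flag bits against _PAGE_NUMA. Below is a standalone sketch of the same flag arithmetic, with made-up bit values standing in for the real _PAGE_PRESENT/_PAGE_PROTNONE/_PAGE_NUMA constants.

/* Illustrative only: the flag test behind pte_numa()/pmd_numa(). */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_PRESENT  0x001UL
#define PAGE_PROTNONE 0x100UL
#define PAGE_NUMA     0x200UL

/* ptenuma_flags() analogue: mask down to the bits the test cares about. */
static unsigned long numa_flags(unsigned long pteval)
{
        return pteval & (PAGE_NUMA | PAGE_PROTNONE | PAGE_PRESENT);
}

/* A NUMA hinting entry has _PAGE_NUMA set and the other two bits clear. */
static bool pte_is_numa(unsigned long pteval)
{
        return numa_flags(pteval) == PAGE_NUMA;
}

int main(void)
{
        printf("%d\n", pte_is_numa(PAGE_NUMA));                /* 1 */
        printf("%d\n", pte_is_numa(PAGE_NUMA | PAGE_PRESENT)); /* 0 */
        printf("%d\n", pte_is_numa(PAGE_PROTNONE));            /* 0 */
        return 0;
}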
@@ -726,6 +725,8 @@ static inline pte_t pte_mknuma(pte_t pte) | |||
726 | { | 725 | { |
727 | pteval_t val = pte_val(pte); | 726 | pteval_t val = pte_val(pte); |
728 | 727 | ||
728 | VM_BUG_ON(!(val & _PAGE_PRESENT)); | ||
729 | |||
729 | val &= ~_PAGE_PRESENT; | 730 | val &= ~_PAGE_PRESENT; |
730 | val |= _PAGE_NUMA; | 731 | val |= _PAGE_NUMA; |
731 | 732 | ||
@@ -769,16 +770,6 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, | |||
769 | } | 770 | } |
770 | #endif | 771 | #endif |
771 | #else | 772 | #else |
772 | extern int pte_numa(pte_t pte); | ||
773 | extern int pmd_numa(pmd_t pmd); | ||
774 | extern pte_t pte_mknonnuma(pte_t pte); | ||
775 | extern pmd_t pmd_mknonnuma(pmd_t pmd); | ||
776 | extern pte_t pte_mknuma(pte_t pte); | ||
777 | extern pmd_t pmd_mknuma(pmd_t pmd); | ||
778 | extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep); | ||
779 | extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); | ||
780 | #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ | ||
781 | #else | ||
782 | static inline int pmd_numa(pmd_t pmd) | 773 | static inline int pmd_numa(pmd_t pmd) |
783 | { | 774 | { |
784 | return 0; | 775 | return 0; |
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index f1a24b5c3b90..b58fd667f87b 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h | |||
@@ -3,6 +3,8 @@ | |||
3 | 3 | ||
4 | /* References to section boundaries */ | 4 | /* References to section boundaries */ |
5 | 5 | ||
6 | #include <linux/compiler.h> | ||
7 | |||
6 | /* | 8 | /* |
7 | * Usage guidelines: | 9 | * Usage guidelines: |
8 | * _text, _data: architecture specific, don't use them in arch-independent code | 10 | * _text, _data: architecture specific, don't use them in arch-independent code |
@@ -37,6 +39,8 @@ extern char __start_rodata[], __end_rodata[]; | |||
37 | /* Start and end of .ctors section - used for constructor calls. */ | 39 | /* Start and end of .ctors section - used for constructor calls. */ |
38 | extern char __ctors_start[], __ctors_end[]; | 40 | extern char __ctors_start[], __ctors_end[]; |
39 | 41 | ||
42 | extern __visible const void __nosave_begin, __nosave_end; | ||
43 | |||
40 | /* function descriptor handling (if any). Override | 44 | /* function descriptor handling (if any). Override |
41 | * in asm/sections.h */ | 45 | * in asm/sections.h */ |
42 | #ifndef dereference_function_descriptor | 46 | #ifndef dereference_function_descriptor |
diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index 089743ade734..9b0a15d06a4f 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h | |||
@@ -27,10 +27,13 @@ | |||
27 | * counter raised only while it is under our special handling; | 27 | * counter raised only while it is under our special handling; |
28 | * | 28 | * |
29 | * iii. after the lockless scan step has selected a potential balloon page for | 29 | * iii. after the lockless scan step has selected a potential balloon page for |
30 | * isolation, re-test the page->mapping flags and the page ref counter | 30 | * isolation, re-test the PageBalloon mark and the PagePrivate flag |
31 | * under the proper page lock, to ensure isolating a valid balloon page | 31 | * under the proper page lock, to ensure isolating a valid balloon page |
32 | * (not yet isolated, nor under release procedure) | 32 | * (not yet isolated, nor under release procedure) |
33 | * | 33 | * |
34 | * iv. isolation or dequeueing procedure must clear PagePrivate flag under | ||
35 | * page lock together with removing page from balloon device page list. | ||
36 | * | ||
34 | * The functions provided by this interface are placed to help on coping with | 37 | * The functions provided by this interface are placed to help on coping with |
35 | * the aforementioned balloon page corner case, as well as to ensure the simple | 38 | * the aforementioned balloon page corner case, as well as to ensure the simple |
36 | * set of exposed rules are satisfied while we are dealing with balloon pages | 39 | * set of exposed rules are satisfied while we are dealing with balloon pages |
@@ -54,43 +57,22 @@ | |||
54 | * balloon driver as a page book-keeper for its registered balloon devices. | 57 | * balloon driver as a page book-keeper for its registered balloon devices. |
55 | */ | 58 | */ |
56 | struct balloon_dev_info { | 59 | struct balloon_dev_info { |
57 | void *balloon_device; /* balloon device descriptor */ | ||
58 | struct address_space *mapping; /* balloon special page->mapping */ | ||
59 | unsigned long isolated_pages; /* # of isolated pages for migration */ | 60 | unsigned long isolated_pages; /* # of isolated pages for migration */ |
60 | spinlock_t pages_lock; /* Protection to pages list */ | 61 | spinlock_t pages_lock; /* Protection to pages list */ |
61 | struct list_head pages; /* Pages enqueued & handled to Host */ | 62 | struct list_head pages; /* Pages enqueued & handled to Host */ |
63 | int (*migratepage)(struct balloon_dev_info *, struct page *newpage, | ||
64 | struct page *page, enum migrate_mode mode); | ||
62 | }; | 65 | }; |
63 | 66 | ||
64 | extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); | 67 | extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info); |
65 | extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); | 68 | extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info); |
66 | extern struct balloon_dev_info *balloon_devinfo_alloc( | ||
67 | void *balloon_dev_descriptor); | ||
68 | 69 | ||
69 | static inline void balloon_devinfo_free(struct balloon_dev_info *b_dev_info) | 70 | static inline void balloon_devinfo_init(struct balloon_dev_info *balloon) |
70 | { | ||
71 | kfree(b_dev_info); | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * balloon_page_free - release a balloon page back to the page free lists | ||
76 | * @page: ballooned page to be set free | ||
77 | * | ||
78 | * This function must be used to properly set free an isolated/dequeued balloon | ||
79 | * page at the end of a successful page migration, or at the balloon driver's | ||
80 | * page release procedure. | ||
81 | */ | ||
82 | static inline void balloon_page_free(struct page *page) | ||
83 | { | 71 | { |
84 | /* | 72 | balloon->isolated_pages = 0; |
85 | * Balloon pages always get an extra refcount before being isolated | 73 | spin_lock_init(&balloon->pages_lock); |
86 | * and before being dequeued to help sort out fortuitous collisions | ||
87 | * between a thread attempting to isolate and another thread attempting | 75 | balloon->migratepage = NULL; |
88 | * to release the very same balloon page. | ||
89 | * | ||
90 | * Before we handle the page back to Buddy, lets drop its extra refcnt. | ||
91 | */ | ||
92 | put_page(page); | ||
93 | __free_page(page); | ||
94 | } | 76 | } |
95 | 77 | ||
96 | #ifdef CONFIG_BALLOON_COMPACTION | 78 | #ifdef CONFIG_BALLOON_COMPACTION |
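
With the special balloon mapping gone, a balloon driver is expected to embed struct balloon_dev_info, call balloon_devinfo_init() and install its migratepage callback directly. The sketch below shows that wiring as a user-space mock; the my_balloon driver, my_migratepage() and the locally redeclared structures are invented, and the real spinlock/list types are replaced with simple stand-ins.

/* Illustrative only: embedding and initializing the slimmed-down bookkeeping. */
#include <stdio.h>
#include <stdlib.h>

struct page;
enum migrate_mode { MIGRATE_SYNC };

struct list_head { struct list_head *next, *prev; };

struct balloon_dev_info {
        unsigned long isolated_pages;
        int pages_lock;                            /* spinlock_t stand-in */
        struct list_head pages;
        int (*migratepage)(struct balloon_dev_info *, struct page *newpage,
                           struct page *page, enum migrate_mode mode);
};

static void balloon_devinfo_init(struct balloon_dev_info *balloon)
{
        balloon->isolated_pages = 0;
        balloon->pages_lock = 0;
        balloon->pages.next = balloon->pages.prev = &balloon->pages;
        balloon->migratepage = NULL;
}

/* Driver-private state embedding the generic bookkeeping. */
struct my_balloon {
        struct balloon_dev_info vb_dev_info;
        unsigned long num_pages;
};

static int my_migratepage(struct balloon_dev_info *info, struct page *newpage,
                          struct page *page, enum migrate_mode mode)
{
        /* A real driver would tell its host about the replacement page here. */
        (void)info; (void)newpage; (void)page; (void)mode;
        return 0;
}

int main(void)
{
        struct my_balloon *vb = calloc(1, sizeof(*vb));

        balloon_devinfo_init(&vb->vb_dev_info);
        vb->vb_dev_info.migratepage = my_migratepage;  /* opt in to compaction */
        printf("isolated=%lu\n", vb->vb_dev_info.isolated_pages);
        free(vb);
        return 0;
}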
@@ -98,107 +80,58 @@ extern bool balloon_page_isolate(struct page *page); | |||
98 | extern void balloon_page_putback(struct page *page); | 80 | extern void balloon_page_putback(struct page *page); |
99 | extern int balloon_page_migrate(struct page *newpage, | 81 | extern int balloon_page_migrate(struct page *newpage, |
100 | struct page *page, enum migrate_mode mode); | 82 | struct page *page, enum migrate_mode mode); |
101 | extern struct address_space | ||
102 | *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, | ||
103 | const struct address_space_operations *a_ops); | ||
104 | |||
105 | static inline void balloon_mapping_free(struct address_space *balloon_mapping) | ||
106 | { | ||
107 | kfree(balloon_mapping); | ||
108 | } | ||
109 | 83 | ||
110 | /* | 84 | /* |
111 | * page_flags_cleared - helper to perform balloon @page ->flags tests. | 85 | * __is_movable_balloon_page - helper to perform @page PageBalloon tests |
112 | * | ||
113 | * As balloon pages are obtained from buddy and we do not play with page->flags | ||
114 | * at driver level (exception made when we get the page lock for compaction), | ||
115 | * we can safely identify a ballooned page by checking if the | ||
116 | * PAGE_FLAGS_CHECK_AT_PREP page->flags are all cleared. This approach also | ||
117 | * helps us skip ballooned pages that are locked for compaction or release, thus | ||
118 | * mitigating their racy check at balloon_page_movable() | ||
119 | */ | ||
120 | static inline bool page_flags_cleared(struct page *page) | ||
121 | { | ||
122 | return !(page->flags & PAGE_FLAGS_CHECK_AT_PREP); | ||
123 | } | ||
124 | |||
125 | /* | ||
126 | * __is_movable_balloon_page - helper to perform @page mapping->flags tests | ||
127 | */ | 86 | */ |
128 | static inline bool __is_movable_balloon_page(struct page *page) | 87 | static inline bool __is_movable_balloon_page(struct page *page) |
129 | { | 88 | { |
130 | struct address_space *mapping = page->mapping; | 89 | return PageBalloon(page); |
131 | return mapping_balloon(mapping); | ||
132 | } | 90 | } |
133 | 91 | ||
134 | /* | 92 | /* |
135 | * balloon_page_movable - test page->mapping->flags to identify balloon pages | 93 | * balloon_page_movable - test PageBalloon to identify balloon pages |
136 | * that can be moved by compaction/migration. | 94 | * and PagePrivate to check that the page is not |
137 | * | 95 | * isolated and can be moved by compaction/migration. |
138 | * This function is used at core compaction's page isolation scheme, therefore | ||
139 | * most pages exposed to it are not enlisted as balloon pages and so, to avoid | ||
140 | * undesired side effects like racing against __free_pages(), we cannot afford | ||
141 | * holding the page locked while testing page->mapping->flags here. | ||
142 | * | 96 | * |
143 | * As we might return false positives in the case of a balloon page being just | 97 | * As we might return false positives in the case of a balloon page being just |
144 | * released under us, the page->mapping->flags need to be re-tested later, | 98 | * released under us, this needs to be re-tested later, under the page lock. |
145 | * under the proper page lock, at the functions that will be coping with the | ||
146 | * balloon page case. | ||
147 | */ | 99 | */ |
148 | static inline bool balloon_page_movable(struct page *page) | 100 | static inline bool balloon_page_movable(struct page *page) |
149 | { | 101 | { |
150 | /* | 102 | return PageBalloon(page) && PagePrivate(page); |
151 | * Before dereferencing and testing mapping->flags, let's make sure | ||
152 | * this is not a page that uses ->mapping in a different way | ||
153 | */ | ||
154 | if (page_flags_cleared(page) && !page_mapped(page) && | ||
155 | page_count(page) == 1) | ||
156 | return __is_movable_balloon_page(page); | ||
157 | |||
158 | return false; | ||
159 | } | 103 | } |
160 | 104 | ||
161 | /* | 105 | /* |
162 | * isolated_balloon_page - identify an isolated balloon page on private | 106 | * isolated_balloon_page - identify an isolated balloon page on private |
163 | * compaction/migration page lists. | 107 | * compaction/migration page lists. |
164 | * | ||
165 | * After a compaction thread isolates a balloon page for migration, it raises | ||
166 | * the page refcount to prevent concurrent compaction threads from re-isolating | ||
167 | * the same page. For that reason putback_movable_pages(), or other routines | ||
168 | * that need to identify isolated balloon pages on private pagelists, cannot | ||
169 | * rely on balloon_page_movable() to accomplish the task. | ||
170 | */ | 108 | */ |
171 | static inline bool isolated_balloon_page(struct page *page) | 109 | static inline bool isolated_balloon_page(struct page *page) |
172 | { | 110 | { |
173 | /* Already isolated balloon pages, by default, have a raised refcount */ | 111 | return PageBalloon(page); |
174 | if (page_flags_cleared(page) && !page_mapped(page) && | ||
175 | page_count(page) >= 2) | ||
176 | return __is_movable_balloon_page(page); | ||
177 | |||
178 | return false; | ||
179 | } | 112 | } |
180 | 113 | ||
181 | /* | 114 | /* |
182 | * balloon_page_insert - insert a page into the balloon's page list and make | 115 | * balloon_page_insert - insert a page into the balloon's page list and make |
183 | * the page->mapping assignment accordingly. | 116 | * the page->private assignment accordingly. |
117 | * @balloon : pointer to balloon device | ||
184 | * @page : page to be assigned as a 'balloon page' | 118 | * @page : page to be assigned as a 'balloon page' |
185 | * @mapping : allocated special 'balloon_mapping' | ||
186 | * @head : balloon's device page list head | ||
187 | * | 119 | * |
188 | * Caller must ensure the page is locked and the spin_lock protecting balloon | 120 | * Caller must ensure the page is locked and the spin_lock protecting balloon |
189 | * pages list is held before inserting a page into the balloon device. | 121 | * pages list is held before inserting a page into the balloon device. |
190 | */ | 122 | */ |
191 | static inline void balloon_page_insert(struct page *page, | 123 | static inline void balloon_page_insert(struct balloon_dev_info *balloon, |
192 | struct address_space *mapping, | 124 | struct page *page) |
193 | struct list_head *head) | ||
194 | { | 125 | { |
195 | page->mapping = mapping; | 126 | __SetPageBalloon(page); |
196 | list_add(&page->lru, head); | 127 | SetPagePrivate(page); |
128 | set_page_private(page, (unsigned long)balloon); | ||
129 | list_add(&page->lru, &balloon->pages); | ||
197 | } | 130 | } |
198 | 131 | ||
199 | /* | 132 | /* |
200 | * balloon_page_delete - delete a page from balloon's page list and clear | 133 | * balloon_page_delete - delete a page from balloon's page list and clear |
201 | * the page->mapping assignment accordingly. | 134 | * the page->private assignment accordingly. |
202 | * @page : page to be released from balloon's page list | 135 | * @page : page to be released from balloon's page list |
203 | * | 136 | * |
204 | * Caller must ensure the page is locked and the spin_lock protecting balloon | 137 | * Caller must ensure the page is locked and the spin_lock protecting balloon |
@@ -206,8 +139,12 @@ static inline void balloon_page_insert(struct page *page, | |||
206 | */ | 139 | */ |
207 | static inline void balloon_page_delete(struct page *page) | 140 | static inline void balloon_page_delete(struct page *page) |
208 | { | 141 | { |
209 | page->mapping = NULL; | 142 | __ClearPageBalloon(page); |
210 | list_del(&page->lru); | 143 | set_page_private(page, 0); |
144 | if (PagePrivate(page)) { | ||
145 | ClearPagePrivate(page); | ||
146 | list_del(&page->lru); | ||
147 | } | ||
211 | } | 148 | } |
212 | 149 | ||
213 | /* | 150 | /* |
@@ -216,11 +153,7 @@ static inline void balloon_page_delete(struct page *page) | |||
216 | */ | 153 | */ |
217 | static inline struct balloon_dev_info *balloon_page_device(struct page *page) | 154 | static inline struct balloon_dev_info *balloon_page_device(struct page *page) |
218 | { | 155 | { |
219 | struct address_space *mapping = page->mapping; | 156 | return (struct balloon_dev_info *)page_private(page); |
220 | if (likely(mapping)) | ||
221 | return mapping->private_data; | ||
222 | |||
223 | return NULL; | ||
224 | } | 157 | } |
225 | 158 | ||
226 | static inline gfp_t balloon_mapping_gfp_mask(void) | 159 | static inline gfp_t balloon_mapping_gfp_mask(void) |
@@ -228,34 +161,24 @@ static inline gfp_t balloon_mapping_gfp_mask(void) | |||
228 | return GFP_HIGHUSER_MOVABLE; | 161 | return GFP_HIGHUSER_MOVABLE; |
229 | } | 162 | } |
230 | 163 | ||
231 | static inline bool balloon_compaction_check(void) | ||
232 | { | ||
233 | return true; | ||
234 | } | ||
235 | |||
236 | #else /* !CONFIG_BALLOON_COMPACTION */ | 164 | #else /* !CONFIG_BALLOON_COMPACTION */ |
237 | 165 | ||
238 | static inline void *balloon_mapping_alloc(void *balloon_device, | 166 | static inline void balloon_page_insert(struct balloon_dev_info *balloon, |
239 | const struct address_space_operations *a_ops) | 167 | struct page *page) |
240 | { | ||
241 | return ERR_PTR(-EOPNOTSUPP); | ||
242 | } | ||
243 | |||
244 | static inline void balloon_mapping_free(struct address_space *balloon_mapping) | ||
245 | { | 168 | { |
246 | return; | 169 | __SetPageBalloon(page); |
170 | list_add(&page->lru, &balloon->pages); | ||
247 | } | 171 | } |
248 | 172 | ||
249 | static inline void balloon_page_insert(struct page *page, | 173 | static inline void balloon_page_delete(struct page *page) |
250 | struct address_space *mapping, | ||
251 | struct list_head *head) | ||
252 | { | 174 | { |
253 | list_add(&page->lru, head); | 175 | __ClearPageBalloon(page); |
176 | list_del(&page->lru); | ||
254 | } | 177 | } |
255 | 178 | ||
256 | static inline void balloon_page_delete(struct page *page) | 179 | static inline bool __is_movable_balloon_page(struct page *page) |
257 | { | 180 | { |
258 | list_del(&page->lru); | 181 | return false; |
259 | } | 182 | } |
260 | 183 | ||
261 | static inline bool balloon_page_movable(struct page *page) | 184 | static inline bool balloon_page_movable(struct page *page) |
@@ -289,9 +212,5 @@ static inline gfp_t balloon_mapping_gfp_mask(void) | |||
289 | return GFP_HIGHUSER; | 212 | return GFP_HIGHUSER; |
290 | } | 213 | } |
291 | 214 | ||
292 | static inline bool balloon_compaction_check(void) | ||
293 | { | ||
294 | return false; | ||
295 | } | ||
296 | #endif /* CONFIG_BALLOON_COMPACTION */ | 215 | #endif /* CONFIG_BALLOON_COMPACTION */ |
297 | #endif /* _LINUX_BALLOON_COMPACTION_H */ | 216 | #endif /* _LINUX_BALLOON_COMPACTION_H */ |
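
The new scheme marks a balloon page with PageBalloon for identification and PagePrivate for "currently enqueued, hence movable", and stores the owning device in page->private. Here is a user-space sketch of that marking protocol with a mock page structure; the flag bit values and struct mock_page are invented, and the real delete path additionally unlinks the page from the device list only while PagePrivate is set.

/* Illustrative only: the PageBalloon + PagePrivate marking protocol. */
#include <stdbool.h>
#include <stdio.h>

#define PG_BALLOON (1u << 0)
#define PG_PRIVATE (1u << 1)

struct mock_page {
        unsigned int flags;
        unsigned long private;       /* points back to the balloon device */
};

static void balloon_page_insert(unsigned long balloon, struct mock_page *page)
{
        page->flags |= PG_BALLOON | PG_PRIVATE;
        page->private = balloon;     /* balloon_page_device() reads this back */
}

static void balloon_page_delete(struct mock_page *page)
{
        page->flags &= ~(PG_BALLOON | PG_PRIVATE);
        page->private = 0;
}

/* Movable for compaction only while enqueued (PagePrivate still set). */
static bool balloon_page_movable(const struct mock_page *page)
{
        return (page->flags & PG_BALLOON) && (page->flags & PG_PRIVATE);
}

int main(void)
{
        struct mock_page page = { 0, 0 };

        balloon_page_insert(0xdeadbeef, &page);
        printf("movable=%d\n", balloon_page_movable(&page));  /* 1 */
        balloon_page_delete(&page);
        printf("movable=%d\n", balloon_page_movable(&page));  /* 0 */
        return 0;
}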
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 518b46555b80..87be398166d3 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -1564,7 +1564,7 @@ static inline int blk_rq_map_integrity_sg(struct request_queue *q, | |||
1564 | } | 1564 | } |
1565 | static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) | 1565 | static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) |
1566 | { | 1566 | { |
1567 | return 0; | 1567 | return NULL; |
1568 | } | 1568 | } |
1569 | static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) | 1569 | static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) |
1570 | { | 1570 | { |
diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 01e3132820da..60bdf8dc02a3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h | |||
@@ -2,14 +2,24 @@ | |||
2 | #define _LINUX_COMPACTION_H | 2 | #define _LINUX_COMPACTION_H |
3 | 3 | ||
4 | /* Return values for compact_zone() and try_to_compact_pages() */ | 4 | /* Return values for compact_zone() and try_to_compact_pages() */ |
5 | /* compaction didn't start as it was deferred due to past failures */ | ||
6 | #define COMPACT_DEFERRED 0 | ||
5 | /* compaction didn't start as it was not possible or direct reclaim was more suitable */ | 7 | /* compaction didn't start as it was not possible or direct reclaim was more suitable */ |
6 | #define COMPACT_SKIPPED 0 | 8 | #define COMPACT_SKIPPED 1 |
7 | /* compaction should continue to another pageblock */ | 9 | /* compaction should continue to another pageblock */ |
8 | #define COMPACT_CONTINUE 1 | 10 | #define COMPACT_CONTINUE 2 |
9 | /* direct compaction partially compacted a zone and there are suitable pages */ | 11 | /* direct compaction partially compacted a zone and there are suitable pages */ |
10 | #define COMPACT_PARTIAL 2 | 12 | #define COMPACT_PARTIAL 3 |
11 | /* The full zone was compacted */ | 13 | /* The full zone was compacted */ |
12 | #define COMPACT_COMPLETE 3 | 14 | #define COMPACT_COMPLETE 4 |
15 | |||
16 | /* Used to signal whether compaction detected need_sched() or lock contention */ | ||
17 | /* No contention detected */ | ||
18 | #define COMPACT_CONTENDED_NONE 0 | ||
19 | /* Either need_sched() was true or fatal signal pending */ | ||
20 | #define COMPACT_CONTENDED_SCHED 1 | ||
21 | /* Zone lock or lru_lock was contended in async compaction */ | ||
22 | #define COMPACT_CONTENDED_LOCK 2 | ||
13 | 23 | ||
14 | #ifdef CONFIG_COMPACTION | 24 | #ifdef CONFIG_COMPACTION |
15 | extern int sysctl_compact_memory; | 25 | extern int sysctl_compact_memory; |
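
COMPACT_DEFERRED now occupies value 0 and shifts the other status codes up by one, and contention is reported through a tri-state int rather than a bool. The sketch below shows how a caller might interpret the two results; the constants mirror the hunk above, while report() and its messages are invented.

/* Illustrative only: reading the renumbered status and contention codes. */
#include <stdio.h>

#define COMPACT_DEFERRED 0
#define COMPACT_SKIPPED  1
#define COMPACT_CONTINUE 2
#define COMPACT_PARTIAL  3
#define COMPACT_COMPLETE 4

#define COMPACT_CONTENDED_NONE  0
#define COMPACT_CONTENDED_SCHED 1
#define COMPACT_CONTENDED_LOCK  2

static void report(unsigned long status, int contended)
{
        switch (status) {
        case COMPACT_DEFERRED:
                printf("not attempted: deferred after past failures\n");
                break;
        case COMPACT_SKIPPED:
                printf("not attempted: unsuitable or reclaim preferred\n");
                break;
        case COMPACT_PARTIAL:
        case COMPACT_COMPLETE:
                printf("ran: retrying the allocation is worthwhile\n");
                break;
        default:
                printf("still scanning\n");
        }

        if (contended == COMPACT_CONTENDED_LOCK)
                printf("async compaction backed off a contended lock\n");
        else if (contended == COMPACT_CONTENDED_SCHED)
                printf("compaction stopped to reschedule or on a signal\n");
}

int main(void)
{
        report(COMPACT_PARTIAL, COMPACT_CONTENDED_NONE);
        report(COMPACT_DEFERRED, COMPACT_CONTENDED_SCHED);
        return 0;
}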
@@ -22,7 +32,8 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, | |||
22 | extern int fragmentation_index(struct zone *zone, unsigned int order); | 32 | extern int fragmentation_index(struct zone *zone, unsigned int order); |
23 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, | 33 | extern unsigned long try_to_compact_pages(struct zonelist *zonelist, |
24 | int order, gfp_t gfp_mask, nodemask_t *mask, | 34 | int order, gfp_t gfp_mask, nodemask_t *mask, |
25 | enum migrate_mode mode, bool *contended); | 35 | enum migrate_mode mode, int *contended, |
36 | struct zone **candidate_zone); | ||
26 | extern void compact_pgdat(pg_data_t *pgdat, int order); | 37 | extern void compact_pgdat(pg_data_t *pgdat, int order); |
27 | extern void reset_isolation_suitable(pg_data_t *pgdat); | 38 | extern void reset_isolation_suitable(pg_data_t *pgdat); |
28 | extern unsigned long compaction_suitable(struct zone *zone, int order); | 39 | extern unsigned long compaction_suitable(struct zone *zone, int order); |
@@ -91,7 +102,8 @@ static inline bool compaction_restarting(struct zone *zone, int order) | |||
91 | #else | 102 | #else |
92 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, | 103 | static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, |
93 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 104 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
94 | enum migrate_mode mode, bool *contended) | 105 | enum migrate_mode mode, int *contended, |
106 | struct zone **candidate_zone) | ||
95 | { | 107 | { |
96 | return COMPACT_CONTINUE; | 108 | return COMPACT_CONTINUE; |
97 | } | 109 | } |
diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 1c2fdaa2ffc3..1ccaab44abcc 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h | |||
@@ -110,6 +110,10 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, | |||
110 | extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, | 110 | extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, |
111 | unsigned long start, unsigned int nr, void *data); | 111 | unsigned long start, unsigned int nr, void *data); |
112 | 112 | ||
113 | extern unsigned long gen_pool_first_fit_order_align(unsigned long *map, | ||
114 | unsigned long size, unsigned long start, unsigned int nr, | ||
115 | void *data); | ||
116 | |||
113 | extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, | 117 | extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, |
114 | unsigned long start, unsigned int nr, void *data); | 118 | unsigned long start, unsigned int nr, void *data); |
115 | 119 | ||
@@ -117,6 +121,9 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev, | |||
117 | int min_alloc_order, int nid); | 121 | int min_alloc_order, int nid); |
118 | extern struct gen_pool *dev_get_gen_pool(struct device *dev); | 122 | extern struct gen_pool *dev_get_gen_pool(struct device *dev); |
119 | 123 | ||
124 | bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, | ||
125 | size_t size); | ||
126 | |||
120 | #ifdef CONFIG_OF | 127 | #ifdef CONFIG_OF |
121 | extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, | 128 | extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, |
122 | const char *propname, int index); | 129 | const char *propname, int index); |
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5e7219dc0fae..41b30fd4d041 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h | |||
@@ -156,7 +156,7 @@ struct vm_area_struct; | |||
156 | #define GFP_DMA32 __GFP_DMA32 | 156 | #define GFP_DMA32 __GFP_DMA32 |
157 | 157 | ||
158 | /* Convert GFP flags to their corresponding migrate type */ | 158 | /* Convert GFP flags to their corresponding migrate type */ |
159 | static inline int allocflags_to_migratetype(gfp_t gfp_flags) | 159 | static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) |
160 | { | 160 | { |
161 | WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); | 161 | WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); |
162 | 162 | ||
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 63579cb8d3dc..ad9051bab267 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -132,7 +132,7 @@ extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, | |||
132 | static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, | 132 | static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, |
133 | spinlock_t **ptl) | 133 | spinlock_t **ptl) |
134 | { | 134 | { |
135 | VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); | 135 | VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); |
136 | if (pmd_trans_huge(*pmd)) | 136 | if (pmd_trans_huge(*pmd)) |
137 | return __pmd_trans_huge_lock(pmd, vma, ptl); | 137 | return __pmd_trans_huge_lock(pmd, vma, ptl); |
138 | else | 138 | else |
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 95624bed87ef..e9e420b6d931 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
@@ -715,23 +715,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } | |||
715 | (void) (&_max1 == &_max2); \ | 715 | (void) (&_max1 == &_max2); \ |
716 | _max1 > _max2 ? _max1 : _max2; }) | 716 | _max1 > _max2 ? _max1 : _max2; }) |
717 | 717 | ||
718 | #define min3(x, y, z) ({ \ | 718 | #define min3(x, y, z) min((typeof(x))min(x, y), z) |
719 | typeof(x) _min1 = (x); \ | 719 | #define max3(x, y, z) max((typeof(x))max(x, y), z) |
720 | typeof(y) _min2 = (y); \ | ||
721 | typeof(z) _min3 = (z); \ | ||
722 | (void) (&_min1 == &_min2); \ | ||
723 | (void) (&_min1 == &_min3); \ | ||
724 | _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ | ||
725 | (_min2 < _min3 ? _min2 : _min3); }) | ||
726 | |||
727 | #define max3(x, y, z) ({ \ | ||
728 | typeof(x) _max1 = (x); \ | ||
729 | typeof(y) _max2 = (y); \ | ||
730 | typeof(z) _max3 = (z); \ | ||
731 | (void) (&_max1 == &_max2); \ | ||
732 | (void) (&_max1 == &_max3); \ | ||
733 | _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ | ||
734 | (_max2 > _max3 ? _max2 : _max3); }) | ||
735 | 720 | ||
736 | /** | 721 | /** |
737 | * min_not_zero - return the minimum that is _not_ zero, unless both are zero | 722 | * min_not_zero - return the minimum that is _not_ zero, unless both are zero |
@@ -746,20 +731,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } | |||
746 | /** | 731 | /** |
747 | * clamp - return a value clamped to a given range with strict typechecking | 732 | * clamp - return a value clamped to a given range with strict typechecking |
748 | * @val: current value | 733 | * @val: current value |
749 | * @min: minimum allowable value | 734 | * @lo: lowest allowable value |
750 | * @max: maximum allowable value | 735 | * @hi: highest allowable value |
751 | * | 736 | * |
752 | * This macro does strict typechecking of min/max to make sure they are of the | 737 | * This macro does strict typechecking of lo/hi to make sure they are of the |
753 | * same type as val. See the unnecessary pointer comparisons. | 738 | * same type as val. See the unnecessary pointer comparisons. |
754 | */ | 739 | */ |
755 | #define clamp(val, min, max) ({ \ | 740 | #define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) |
756 | typeof(val) __val = (val); \ | ||
757 | typeof(min) __min = (min); \ | ||
758 | typeof(max) __max = (max); \ | ||
759 | (void) (&__val == &__min); \ | ||
760 | (void) (&__val == &__max); \ | ||
761 | __val = __val < __min ? __min: __val; \ | ||
762 | __val > __max ? __max: __val; }) | ||
763 | 741 | ||
764 | /* | 742 | /* |
765 | * ..and if you can't take the strict | 743 | * ..and if you can't take the strict |
@@ -781,36 +759,26 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } | |||
781 | * clamp_t - return a value clamped to a given range using a given type | 759 | * clamp_t - return a value clamped to a given range using a given type |
782 | * @type: the type of variable to use | 760 | * @type: the type of variable to use |
783 | * @val: current value | 761 | * @val: current value |
784 | * @min: minimum allowable value | 762 | * @lo: minimum allowable value |
785 | * @max: maximum allowable value | 763 | * @hi: maximum allowable value |
786 | * | 764 | * |
787 | * This macro does no typechecking and uses temporary variables of type | 765 | * This macro does no typechecking and uses temporary variables of type |
788 | * 'type' to make all the comparisons. | 766 | * 'type' to make all the comparisons. |
789 | */ | 767 | */ |
790 | #define clamp_t(type, val, min, max) ({ \ | 768 | #define clamp_t(type, val, lo, hi) min_t(type, max_t(type, val, lo), hi) |
791 | type __val = (val); \ | ||
792 | type __min = (min); \ | ||
793 | type __max = (max); \ | ||
794 | __val = __val < __min ? __min: __val; \ | ||
795 | __val > __max ? __max: __val; }) | ||
796 | 769 | ||
797 | /** | 770 | /** |
798 | * clamp_val - return a value clamped to a given range using val's type | 771 | * clamp_val - return a value clamped to a given range using val's type |
799 | * @val: current value | 772 | * @val: current value |
800 | * @min: minimum allowable value | 773 | * @lo: minimum allowable value |
801 | * @max: maximum allowable value | 774 | * @hi: maximum allowable value |
802 | * | 775 | * |
803 | * This macro does no typechecking and uses temporary variables of whatever | 776 | * This macro does no typechecking and uses temporary variables of whatever |
804 | * type the input argument 'val' is. This is useful when val is an unsigned | 777 | * type the input argument 'val' is. This is useful when val is an unsigned |
805 | * type and min and max are literals that will otherwise be assigned a signed | 778 | * type and min and max are literals that will otherwise be assigned a signed |
806 | * integer type. | 779 | * integer type. |
807 | */ | 780 | */ |
808 | #define clamp_val(val, min, max) ({ \ | 781 | #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) |
809 | typeof(val) __val = (val); \ | ||
810 | typeof(val) __min = (min); \ | ||
811 | typeof(val) __max = (max); \ | ||
812 | __val = __val < __min ? __min: __val; \ | ||
813 | __val > __max ? __max: __val; }) | ||
814 | 782 | ||
815 | 783 | ||
816 | /* | 784 | /* |
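
The kernel.h hunk above collapses min3()/max3() and the clamp() family into nested min()/max() calls. A minimal userspace sketch of the new one-line forms, using simplified min()/max() stand-ins without the kernel's pointer-comparison typechecking (GNU typeof, so GCC or Clang assumed):

/* Build with: gcc -Wall clamp_demo.c -o clamp_demo */
#include <stdio.h>

/* Simplified stand-ins for the kernel's min()/max(); the real macros
 * also do strict type checking via an unnecessary pointer comparison. */
#define min(x, y) ((x) < (y) ? (x) : (y))
#define max(x, y) ((x) > (y) ? (x) : (y))

/* The new one-line forms from the patch. */
#define min3(x, y, z) min((typeof(x))min(x, y), z)
#define max3(x, y, z) max((typeof(x))max(x, y), z)
#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

int main(void)
{
	int a = 7, b = 3, c = 5;

	printf("min3 = %d, max3 = %d\n", min3(a, b, c), max3(a, b, c));
	printf("clamp(12, 0, 10) = %d\n", clamp(12, 0, 10));	/* 10 */
	printf("clamp(-4, 0, 10) = %d\n", clamp(-4, 0, 10));	/*  0 */
	printf("clamp(6, 0, 10)  = %d\n", clamp(6, 0, 10));	/*  6 */
	return 0;
}

The typeof() cast keeps the result of the inner min()/max() anchored to the type of the first argument, which is the only subtlety the one-liners add over the old open-coded ternaries.
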
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e0752d204d9e..19df5d857411 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -440,11 +440,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order); | |||
440 | 440 | ||
441 | int memcg_cache_id(struct mem_cgroup *memcg); | 441 | int memcg_cache_id(struct mem_cgroup *memcg); |
442 | 442 | ||
443 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | ||
444 | struct kmem_cache *root_cache); | ||
445 | void memcg_free_cache_params(struct kmem_cache *s); | ||
446 | |||
447 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups); | ||
448 | void memcg_update_array_size(int num_groups); | 443 | void memcg_update_array_size(int num_groups); |
449 | 444 | ||
450 | struct kmem_cache * | 445 | struct kmem_cache * |
@@ -574,16 +569,6 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) | |||
574 | return -1; | 569 | return -1; |
575 | } | 570 | } |
576 | 571 | ||
577 | static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, | ||
578 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
579 | { | ||
580 | return 0; | ||
581 | } | ||
582 | |||
583 | static inline void memcg_free_cache_params(struct kmem_cache *s) | ||
584 | { | ||
585 | } | ||
586 | |||
587 | static inline struct kmem_cache * | 572 | static inline struct kmem_cache * |
588 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) | 573 | memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) |
589 | { | 574 | { |
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d9524c49d767..8f1a41951df9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -84,6 +84,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); | |||
84 | extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); | 84 | extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); |
85 | /* VM interface that may be used by firmware interface */ | 85 | /* VM interface that may be used by firmware interface */ |
86 | extern int online_pages(unsigned long, unsigned long, int); | 86 | extern int online_pages(unsigned long, unsigned long, int); |
87 | extern int test_pages_in_a_zone(unsigned long, unsigned long); | ||
87 | extern void __offline_isolated_pages(unsigned long, unsigned long); | 88 | extern void __offline_isolated_pages(unsigned long, unsigned long); |
88 | 89 | ||
89 | typedef void (*online_page_callback_t)(struct page *page); | 90 | typedef void (*online_page_callback_t)(struct page *page); |
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index f230a978e6ba..3d385c81c153 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h | |||
@@ -134,9 +134,10 @@ void mpol_free_shared_policy(struct shared_policy *p); | |||
134 | struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, | 134 | struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, |
135 | unsigned long idx); | 135 | unsigned long idx); |
136 | 136 | ||
137 | struct mempolicy *get_vma_policy(struct task_struct *tsk, | 137 | struct mempolicy *get_task_policy(struct task_struct *p); |
138 | struct vm_area_struct *vma, unsigned long addr); | 138 | struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, |
139 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); | 139 | unsigned long addr); |
140 | bool vma_policy_mof(struct vm_area_struct *vma); | ||
140 | 141 | ||
141 | extern void numa_default_policy(void); | 142 | extern void numa_default_policy(void); |
142 | extern void numa_policy_init(void); | 143 | extern void numa_policy_init(void); |
diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a2901c414664..01aad3ed89ec 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h | |||
@@ -13,18 +13,9 @@ typedef void free_page_t(struct page *page, unsigned long private); | |||
13 | * Return values from address_space_operations.migratepage(): | 13 | * Return values from address_space_operations.migratepage(): |
14 | * - negative errno on page migration failure; | 14 | * - negative errno on page migration failure; |
15 | * - zero on page migration success; | 15 | * - zero on page migration success; |
16 | * | ||
17 | * The balloon page migration introduces this special case where a 'distinct' | ||
18 | * return code is used to flag a successful page migration to unmap_and_move(). | ||
19 | * This approach is necessary because page migration can race against balloon | ||
20 | * deflation procedure, and for such case we could introduce a nasty page leak | ||
21 | * if a successfully migrated balloon page gets released concurrently with | ||
22 | * migration's unmap_and_move() wrap-up steps. | ||
23 | */ | 16 | */ |
24 | #define MIGRATEPAGE_SUCCESS 0 | 17 | #define MIGRATEPAGE_SUCCESS 0 |
25 | #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page | 18 | |
26 | * sucessful migration case. | ||
27 | */ | ||
28 | enum migrate_reason { | 19 | enum migrate_reason { |
29 | MR_COMPACTION, | 20 | MR_COMPACTION, |
30 | MR_MEMORY_FAILURE, | 21 | MR_MEMORY_FAILURE, |
@@ -82,9 +73,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, | |||
82 | return -ENOSYS; | 73 | return -ENOSYS; |
83 | } | 74 | } |
84 | 75 | ||
85 | /* Possible settings for the migrate_page() method in address_operations */ | ||
86 | #define migrate_page NULL | ||
87 | |||
88 | #endif /* CONFIG_MIGRATION */ | 76 | #endif /* CONFIG_MIGRATION */ |
89 | 77 | ||
90 | #ifdef CONFIG_NUMA_BALANCING | 78 | #ifdef CONFIG_NUMA_BALANCING |
diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f4196a0bc20..fa0d74e06428 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/pfn.h> | 18 | #include <linux/pfn.h> |
19 | #include <linux/bit_spinlock.h> | 19 | #include <linux/bit_spinlock.h> |
20 | #include <linux/shrinker.h> | 20 | #include <linux/shrinker.h> |
21 | #include <linux/resource.h> | ||
21 | 22 | ||
22 | struct mempolicy; | 23 | struct mempolicy; |
23 | struct anon_vma; | 24 | struct anon_vma; |
@@ -553,6 +554,25 @@ static inline void __ClearPageBuddy(struct page *page) | |||
553 | atomic_set(&page->_mapcount, -1); | 554 | atomic_set(&page->_mapcount, -1); |
554 | } | 555 | } |
555 | 556 | ||
557 | #define PAGE_BALLOON_MAPCOUNT_VALUE (-256) | ||
558 | |||
559 | static inline int PageBalloon(struct page *page) | ||
560 | { | ||
561 | return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE; | ||
562 | } | ||
563 | |||
564 | static inline void __SetPageBalloon(struct page *page) | ||
565 | { | ||
566 | VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); | ||
567 | atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE); | ||
568 | } | ||
569 | |||
570 | static inline void __ClearPageBalloon(struct page *page) | ||
571 | { | ||
572 | VM_BUG_ON_PAGE(!PageBalloon(page), page); | ||
573 | atomic_set(&page->_mapcount, -1); | ||
574 | } | ||
575 | |||
556 | void put_page(struct page *page); | 576 | void put_page(struct page *page); |
557 | void put_pages_list(struct list_head *pages); | 577 | void put_pages_list(struct list_head *pages); |
558 | 578 | ||
@@ -1247,8 +1267,8 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, | |||
1247 | !vma_growsup(vma->vm_next, addr); | 1267 | !vma_growsup(vma->vm_next, addr); |
1248 | } | 1268 | } |
1249 | 1269 | ||
1250 | extern pid_t | 1270 | extern struct task_struct *task_of_stack(struct task_struct *task, |
1251 | vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); | 1271 | struct vm_area_struct *vma, bool in_group); |
1252 | 1272 | ||
1253 | extern unsigned long move_page_tables(struct vm_area_struct *vma, | 1273 | extern unsigned long move_page_tables(struct vm_area_struct *vma, |
1254 | unsigned long old_addr, struct vm_area_struct *new_vma, | 1274 | unsigned long old_addr, struct vm_area_struct *new_vma, |
@@ -1780,6 +1800,20 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, | |||
1780 | bool *need_rmap_locks); | 1800 | bool *need_rmap_locks); |
1781 | extern void exit_mmap(struct mm_struct *); | 1801 | extern void exit_mmap(struct mm_struct *); |
1782 | 1802 | ||
1803 | static inline int check_data_rlimit(unsigned long rlim, | ||
1804 | unsigned long new, | ||
1805 | unsigned long start, | ||
1806 | unsigned long end_data, | ||
1807 | unsigned long start_data) | ||
1808 | { | ||
1809 | if (rlim < RLIM_INFINITY) { | ||
1810 | if (((new - start) + (end_data - start_data)) > rlim) | ||
1811 | return -ENOSPC; | ||
1812 | } | ||
1813 | |||
1814 | return 0; | ||
1815 | } | ||
1816 | |||
1783 | extern int mm_take_all_locks(struct mm_struct *mm); | 1817 | extern int mm_take_all_locks(struct mm_struct *mm); |
1784 | extern void mm_drop_all_locks(struct mm_struct *mm); | 1818 | extern void mm_drop_all_locks(struct mm_struct *mm); |
1785 | 1819 | ||
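
The mm.h hunk above adds check_data_rlimit(), a shared helper for validating data-segment growth against RLIMIT_DATA: it returns -ENOSPC when the new heap span plus the existing data segment would exceed a finite limit. A standalone sketch of the same arithmetic; the sample addresses are made up for illustration:

/* Build with: gcc -Wall rlimit_demo.c -o rlimit_demo */
#include <stdio.h>
#include <sys/resource.h>	/* RLIM_INFINITY, getrlimit() */

/* Userspace copy of the helper added to include/linux/mm.h. */
static inline int check_data_rlimit(unsigned long rlim,
				    unsigned long new,
				    unsigned long start,
				    unsigned long end_data,
				    unsigned long start_data)
{
	if (rlim < RLIM_INFINITY) {
		if (((new - start) + (end_data - start_data)) > rlim)
			return -1;	/* the kernel helper returns -ENOSPC here */
	}
	return 0;
}

int main(void)
{
	struct rlimit r;

	if (getrlimit(RLIMIT_DATA, &r) != 0)
		return 1;

	/* Hypothetical brk growth of 1 MiB on top of a 64 KiB data segment. */
	unsigned long start_brk = 0x601000, new_brk = start_brk + (1UL << 20);
	unsigned long start_data = 0x600000, end_data = 0x610000;

	int ret = check_data_rlimit(r.rlim_cur, new_brk, start_brk,
				    end_data, start_data);
	printf("RLIMIT_DATA cur=%lu -> %s\n", (unsigned long)r.rlim_cur,
	       ret ? "would be rejected" : "within limit");
	return 0;
}
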
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2f348d02f640..877ef226f90f 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h | |||
@@ -4,10 +4,14 @@ | |||
4 | #include <linux/stringify.h> | 4 | #include <linux/stringify.h> |
5 | 5 | ||
6 | struct page; | 6 | struct page; |
7 | struct vm_area_struct; | ||
8 | struct mm_struct; | ||
7 | 9 | ||
8 | extern void dump_page(struct page *page, const char *reason); | 10 | extern void dump_page(struct page *page, const char *reason); |
9 | extern void dump_page_badflags(struct page *page, const char *reason, | 11 | extern void dump_page_badflags(struct page *page, const char *reason, |
10 | unsigned long badflags); | 12 | unsigned long badflags); |
13 | void dump_vma(const struct vm_area_struct *vma); | ||
14 | void dump_mm(const struct mm_struct *mm); | ||
11 | 15 | ||
12 | #ifdef CONFIG_DEBUG_VM | 16 | #ifdef CONFIG_DEBUG_VM |
13 | #define VM_BUG_ON(cond) BUG_ON(cond) | 17 | #define VM_BUG_ON(cond) BUG_ON(cond) |
@@ -18,12 +22,28 @@ extern void dump_page_badflags(struct page *page, const char *reason, | |||
18 | BUG(); \ | 22 | BUG(); \ |
19 | } \ | 23 | } \ |
20 | } while (0) | 24 | } while (0) |
25 | #define VM_BUG_ON_VMA(cond, vma) \ | ||
26 | do { \ | ||
27 | if (unlikely(cond)) { \ | ||
28 | dump_vma(vma); \ | ||
29 | BUG(); \ | ||
30 | } \ | ||
31 | } while (0) | ||
32 | #define VM_BUG_ON_MM(cond, mm) \ | ||
33 | do { \ | ||
34 | if (unlikely(cond)) { \ | ||
35 | dump_mm(mm); \ | ||
36 | BUG(); \ | ||
37 | } \ | ||
38 | } while (0) | ||
21 | #define VM_WARN_ON(cond) WARN_ON(cond) | 39 | #define VM_WARN_ON(cond) WARN_ON(cond) |
22 | #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) | 40 | #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) |
23 | #define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) | 41 | #define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) |
24 | #else | 42 | #else |
25 | #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) | 43 | #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) |
26 | #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) | 44 | #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) |
45 | #define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond) | ||
46 | #define VM_BUG_ON_MM(cond, mm) VM_BUG_ON(cond) | ||
27 | #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) | 47 | #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) |
28 | #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) | 48 | #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) |
29 | #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) | 49 | #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) |
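
mmdebug.h above grows VM_BUG_ON_VMA() and VM_BUG_ON_MM(), which dump the offending vma or mm before BUG() under CONFIG_DEBUG_VM and degrade to a type-checked no-op otherwise. A userspace analog of that dump-then-abort pattern; the names below (struct range, BUG_ON_RANGE) are illustrative, not kernel API:

/* Build with: gcc -Wall bugon_demo.c -o bugon_demo
 * Add -DDEBUG_DEMO for the "CONFIG_DEBUG_VM" behaviour. */
#include <stdio.h>
#include <stdlib.h>

struct range {
	unsigned long start;
	unsigned long end;
};

static void dump_range(const struct range *r)
{
	fprintf(stderr, "range %p: start=%#lx end=%#lx\n",
		(const void *)r, r->start, r->end);
}

#ifdef DEBUG_DEMO
/* Debug build: dump the object, then abort -- like VM_BUG_ON_VMA(). */
#define BUG_ON_RANGE(cond, r)				\
	do {						\
		if (cond) {				\
			dump_range(r);			\
			abort();			\
		}					\
	} while (0)
#else
/* Non-debug build: keep the expression type-checked but dead. */
#define BUG_ON_RANGE(cond, r)	do { if (0) { (void)(cond); (void)(r); } } while (0)
#endif

int main(void)
{
	struct range good = { .start = 0x1000, .end = 0x2000 };
	struct range bad  = { .start = 0x2000, .end = 0x1000 };

	BUG_ON_RANGE(good.start >= good.end, &good);	/* never fires */
	printf("good range [%#lx, %#lx) passed\n", good.start, good.end);

	BUG_ON_RANGE(bad.start >= bad.end, &bad);	/* aborts with -DDEBUG_DEMO */
	printf("bad range slipped through (non-debug build)\n");
	return 0;
}
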
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 318df7051850..48bf12ef6620 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h | |||
@@ -521,13 +521,13 @@ struct zone { | |||
521 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | 521 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; |
522 | } ____cacheline_internodealigned_in_smp; | 522 | } ____cacheline_internodealigned_in_smp; |
523 | 523 | ||
524 | typedef enum { | 524 | enum zone_flags { |
525 | ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ | 525 | ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ |
526 | ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ | 526 | ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ |
527 | ZONE_CONGESTED, /* zone has many dirty pages backed by | 527 | ZONE_CONGESTED, /* zone has many dirty pages backed by |
528 | * a congested BDI | 528 | * a congested BDI |
529 | */ | 529 | */ |
530 | ZONE_TAIL_LRU_DIRTY, /* reclaim scanning has recently found | 530 | ZONE_DIRTY, /* reclaim scanning has recently found |
531 | * many dirty file pages at the tail | 531 | * many dirty file pages at the tail |
532 | * of the LRU. | 532 | * of the LRU. |
533 | */ | 533 | */ |
@@ -535,52 +535,7 @@ typedef enum { | |||
535 | * many pages under writeback | 535 | * many pages under writeback |
536 | */ | 536 | */ |
537 | ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ | 537 | ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ |
538 | } zone_flags_t; | 538 | }; |
539 | |||
540 | static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) | ||
541 | { | ||
542 | set_bit(flag, &zone->flags); | ||
543 | } | ||
544 | |||
545 | static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag) | ||
546 | { | ||
547 | return test_and_set_bit(flag, &zone->flags); | ||
548 | } | ||
549 | |||
550 | static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) | ||
551 | { | ||
552 | clear_bit(flag, &zone->flags); | ||
553 | } | ||
554 | |||
555 | static inline int zone_is_reclaim_congested(const struct zone *zone) | ||
556 | { | ||
557 | return test_bit(ZONE_CONGESTED, &zone->flags); | ||
558 | } | ||
559 | |||
560 | static inline int zone_is_reclaim_dirty(const struct zone *zone) | ||
561 | { | ||
562 | return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags); | ||
563 | } | ||
564 | |||
565 | static inline int zone_is_reclaim_writeback(const struct zone *zone) | ||
566 | { | ||
567 | return test_bit(ZONE_WRITEBACK, &zone->flags); | ||
568 | } | ||
569 | |||
570 | static inline int zone_is_reclaim_locked(const struct zone *zone) | ||
571 | { | ||
572 | return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); | ||
573 | } | ||
574 | |||
575 | static inline int zone_is_fair_depleted(const struct zone *zone) | ||
576 | { | ||
577 | return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); | ||
578 | } | ||
579 | |||
580 | static inline int zone_is_oom_locked(const struct zone *zone) | ||
581 | { | ||
582 | return test_bit(ZONE_OOM_LOCKED, &zone->flags); | ||
583 | } | ||
584 | 539 | ||
585 | static inline unsigned long zone_end_pfn(const struct zone *zone) | 540 | static inline unsigned long zone_end_pfn(const struct zone *zone) |
586 | { | 541 | { |
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 19191d39c4f3..7ea069cd3257 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -24,8 +24,7 @@ enum mapping_flags { | |||
24 | AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ | 24 | AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ |
25 | AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ | 25 | AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ |
26 | AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ | 26 | AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ |
27 | AS_BALLOON_MAP = __GFP_BITS_SHIFT + 4, /* balloon page special map */ | 27 | AS_EXITING = __GFP_BITS_SHIFT + 4, /* final truncate in progress */ |
28 | AS_EXITING = __GFP_BITS_SHIFT + 5, /* final truncate in progress */ | ||
29 | }; | 28 | }; |
30 | 29 | ||
31 | static inline void mapping_set_error(struct address_space *mapping, int error) | 30 | static inline void mapping_set_error(struct address_space *mapping, int error) |
@@ -55,21 +54,6 @@ static inline int mapping_unevictable(struct address_space *mapping) | |||
55 | return !!mapping; | 54 | return !!mapping; |
56 | } | 55 | } |
57 | 56 | ||
58 | static inline void mapping_set_balloon(struct address_space *mapping) | ||
59 | { | ||
60 | set_bit(AS_BALLOON_MAP, &mapping->flags); | ||
61 | } | ||
62 | |||
63 | static inline void mapping_clear_balloon(struct address_space *mapping) | ||
64 | { | ||
65 | clear_bit(AS_BALLOON_MAP, &mapping->flags); | ||
66 | } | ||
67 | |||
68 | static inline int mapping_balloon(struct address_space *mapping) | ||
69 | { | ||
70 | return mapping && test_bit(AS_BALLOON_MAP, &mapping->flags); | ||
71 | } | ||
72 | |||
73 | static inline void mapping_set_exiting(struct address_space *mapping) | 57 | static inline void mapping_set_exiting(struct address_space *mapping) |
74 | { | 58 | { |
75 | set_bit(AS_EXITING, &mapping->flags); | 59 | set_bit(AS_EXITING, &mapping->flags); |
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index be574506e6a9..c0c2bce6b0b7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -150,7 +150,7 @@ int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); | |||
150 | static inline void anon_vma_merge(struct vm_area_struct *vma, | 150 | static inline void anon_vma_merge(struct vm_area_struct *vma, |
151 | struct vm_area_struct *next) | 151 | struct vm_area_struct *next) |
152 | { | 152 | { |
153 | VM_BUG_ON(vma->anon_vma != next->anon_vma); | 153 | VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); |
154 | unlink_anon_vmas(next); | 154 | unlink_anon_vmas(next); |
155 | } | 155 | } |
156 | 156 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c6353d9e63a..5e63ba59258c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1935,11 +1935,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, | |||
1935 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) | 1935 | #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) |
1936 | #define used_math() tsk_used_math(current) | 1936 | #define used_math() tsk_used_math(current) |
1937 | 1937 | ||
1938 | /* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ | 1938 | /* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags |
1939 | * __GFP_FS is also cleared as it implies __GFP_IO. | ||
1940 | */ | ||
1939 | static inline gfp_t memalloc_noio_flags(gfp_t flags) | 1941 | static inline gfp_t memalloc_noio_flags(gfp_t flags) |
1940 | { | 1942 | { |
1941 | if (unlikely(current->flags & PF_MEMALLOC_NOIO)) | 1943 | if (unlikely(current->flags & PF_MEMALLOC_NOIO)) |
1942 | flags &= ~__GFP_IO; | 1944 | flags &= ~(__GFP_IO | __GFP_FS); |
1943 | return flags; | 1945 | return flags; |
1944 | } | 1946 | } |
1945 | 1947 | ||
diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h index 005bf3e38db5..f0f8bad54be9 100644 --- a/include/linux/screen_info.h +++ b/include/linux/screen_info.h | |||
@@ -5,12 +5,4 @@ | |||
5 | 5 | ||
6 | extern struct screen_info screen_info; | 6 | extern struct screen_info screen_info; |
7 | 7 | ||
8 | #define ORIG_X (screen_info.orig_x) | ||
9 | #define ORIG_Y (screen_info.orig_y) | ||
10 | #define ORIG_VIDEO_MODE (screen_info.orig_video_mode) | ||
11 | #define ORIG_VIDEO_COLS (screen_info.orig_video_cols) | ||
12 | #define ORIG_VIDEO_EGA_BX (screen_info.orig_video_ega_bx) | ||
13 | #define ORIG_VIDEO_LINES (screen_info.orig_video_lines) | ||
14 | #define ORIG_VIDEO_ISVGA (screen_info.orig_video_isVGA) | ||
15 | #define ORIG_VIDEO_POINTS (screen_info.orig_video_points) | ||
16 | #endif /* _SCREEN_INFO_H */ | 8 | #endif /* _SCREEN_INFO_H */ |
diff --git a/include/linux/slab.h b/include/linux/slab.h index 1d9abb7d22a0..c265bec6a57d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h | |||
@@ -158,31 +158,6 @@ size_t ksize(const void *); | |||
158 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | 158 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
159 | #endif | 159 | #endif |
160 | 160 | ||
161 | #ifdef CONFIG_SLOB | ||
162 | /* | ||
163 | * Common fields provided in kmem_cache by all slab allocators | ||
164 | * This struct is either used directly by the allocator (SLOB) | ||
165 | * or the allocator must include definitions for all fields | ||
166 | * provided in kmem_cache_common in their definition of kmem_cache. | ||
167 | * | ||
168 | * Once we can do anonymous structs (C11 standard) we could put a | ||
169 | * anonymous struct definition in these allocators so that the | ||
170 | * separate allocations in the kmem_cache structure of SLAB and | ||
171 | * SLUB is no longer needed. | ||
172 | */ | ||
173 | struct kmem_cache { | ||
174 | unsigned int object_size;/* The original size of the object */ | ||
175 | unsigned int size; /* The aligned/padded/added on size */ | ||
176 | unsigned int align; /* Alignment as calculated */ | ||
177 | unsigned long flags; /* Active flags on the slab */ | ||
178 | const char *name; /* Slab name for sysfs */ | ||
179 | int refcount; /* Use counter */ | ||
180 | void (*ctor)(void *); /* Called on object slot creation */ | ||
181 | struct list_head list; /* List of all slab caches on the system */ | ||
182 | }; | ||
183 | |||
184 | #endif /* CONFIG_SLOB */ | ||
185 | |||
186 | /* | 161 | /* |
187 | * Kmalloc array related definitions | 162 | * Kmalloc array related definitions |
188 | */ | 163 | */ |
@@ -363,14 +338,6 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, | |||
363 | } | 338 | } |
364 | #endif /* CONFIG_TRACING */ | 339 | #endif /* CONFIG_TRACING */ |
365 | 340 | ||
366 | #ifdef CONFIG_SLAB | ||
367 | #include <linux/slab_def.h> | ||
368 | #endif | ||
369 | |||
370 | #ifdef CONFIG_SLUB | ||
371 | #include <linux/slub_def.h> | ||
372 | #endif | ||
373 | |||
374 | extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); | 341 | extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); |
375 | 342 | ||
376 | #ifdef CONFIG_TRACING | 343 | #ifdef CONFIG_TRACING |
@@ -582,37 +549,15 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) | |||
582 | * allocator where we care about the real place the memory allocation | 549 | * allocator where we care about the real place the memory allocation |
583 | * request comes from. | 550 | * request comes from. |
584 | */ | 551 | */ |
585 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ | ||
586 | (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ | ||
587 | (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) | ||
588 | extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); | 552 | extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); |
589 | #define kmalloc_track_caller(size, flags) \ | 553 | #define kmalloc_track_caller(size, flags) \ |
590 | __kmalloc_track_caller(size, flags, _RET_IP_) | 554 | __kmalloc_track_caller(size, flags, _RET_IP_) |
591 | #else | ||
592 | #define kmalloc_track_caller(size, flags) \ | ||
593 | __kmalloc(size, flags) | ||
594 | #endif /* DEBUG_SLAB */ | ||
595 | 555 | ||
596 | #ifdef CONFIG_NUMA | 556 | #ifdef CONFIG_NUMA |
597 | /* | ||
598 | * kmalloc_node_track_caller is a special version of kmalloc_node that | ||
599 | * records the calling function of the routine calling it for slab leak | ||
600 | * tracking instead of just the calling function (confusing, eh?). | ||
601 | * It's useful when the call to kmalloc_node comes from a widely-used | ||
602 | * standard allocator where we care about the real place the memory | ||
603 | * allocation request comes from. | ||
604 | */ | ||
605 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ | ||
606 | (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ | ||
607 | (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) | ||
608 | extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); | 557 | extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); |
609 | #define kmalloc_node_track_caller(size, flags, node) \ | 558 | #define kmalloc_node_track_caller(size, flags, node) \ |
610 | __kmalloc_node_track_caller(size, flags, node, \ | 559 | __kmalloc_node_track_caller(size, flags, node, \ |
611 | _RET_IP_) | 560 | _RET_IP_) |
612 | #else | ||
613 | #define kmalloc_node_track_caller(size, flags, node) \ | ||
614 | __kmalloc_node(size, flags, node) | ||
615 | #endif | ||
616 | 561 | ||
617 | #else /* CONFIG_NUMA */ | 562 | #else /* CONFIG_NUMA */ |
618 | 563 | ||
@@ -650,14 +595,7 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) | |||
650 | return kmalloc_node(size, flags | __GFP_ZERO, node); | 595 | return kmalloc_node(size, flags | __GFP_ZERO, node); |
651 | } | 596 | } |
652 | 597 | ||
653 | /* | 598 | unsigned int kmem_cache_size(struct kmem_cache *s); |
654 | * Determine the size of a slab object | ||
655 | */ | ||
656 | static inline unsigned int kmem_cache_size(struct kmem_cache *s) | ||
657 | { | ||
658 | return s->object_size; | ||
659 | } | ||
660 | |||
661 | void __init kmem_cache_init_late(void); | 599 | void __init kmem_cache_init_late(void); |
662 | 600 | ||
663 | #endif /* _LINUX_SLAB_H */ | 601 | #endif /* _LINUX_SLAB_H */ |
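
With slab_def.h and slub_def.h no longer included from slab.h, struct kmem_cache becomes opaque to generic code and kmem_cache_size() turns into an out-of-line accessor. A small sketch of that opaque-handle pattern in plain C; the cache_* names are illustrative:

/* Build with: gcc -Wall cache_demo.c -o cache_demo */
#include <stdio.h>
#include <stdlib.h>

/* What a public header would expose: only a forward declaration. */
struct cache;
struct cache *cache_create(unsigned int object_size);
unsigned int cache_size(const struct cache *c);	/* out-of-line accessor */
void cache_destroy(struct cache *c);

/* What only the implementation sees: the real layout. */
struct cache {
	unsigned int object_size;
	unsigned int align;
};

struct cache *cache_create(unsigned int object_size)
{
	struct cache *c = malloc(sizeof(*c));

	if (c) {
		c->object_size = object_size;
		c->align = sizeof(void *);
	}
	return c;
}

unsigned int cache_size(const struct cache *c)
{
	return c->object_size;	/* callers never touch the struct directly */
}

void cache_destroy(struct cache *c)
{
	free(c);
}

int main(void)
{
	struct cache *c = cache_create(192);

	if (!c)
		return 1;
	printf("object size: %u\n", cache_size(c));
	cache_destroy(c);
	return 0;
}
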
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8235dfbb3b05..b869d1662ba3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h | |||
@@ -8,6 +8,8 @@ | |||
8 | */ | 8 | */ |
9 | 9 | ||
10 | struct kmem_cache { | 10 | struct kmem_cache { |
11 | struct array_cache __percpu *cpu_cache; | ||
12 | |||
11 | /* 1) Cache tunables. Protected by slab_mutex */ | 13 | /* 1) Cache tunables. Protected by slab_mutex */ |
12 | unsigned int batchcount; | 14 | unsigned int batchcount; |
13 | unsigned int limit; | 15 | unsigned int limit; |
@@ -71,23 +73,7 @@ struct kmem_cache { | |||
71 | struct memcg_cache_params *memcg_params; | 73 | struct memcg_cache_params *memcg_params; |
72 | #endif | 74 | #endif |
73 | 75 | ||
74 | /* 6) per-cpu/per-node data, touched during every alloc/free */ | 76 | struct kmem_cache_node *node[MAX_NUMNODES]; |
75 | /* | ||
76 | * We put array[] at the end of kmem_cache, because we want to size | ||
77 | * this array to nr_cpu_ids slots instead of NR_CPUS | ||
78 | * (see kmem_cache_init()) | ||
79 | * We still use [NR_CPUS] and not [1] or [0] because cache_cache | ||
80 | * is statically defined, so we reserve the max number of cpus. | ||
81 | * | ||
82 | * We also need to guarantee that the list is able to accomodate a | ||
83 | * pointer for each node since "nodelists" uses the remainder of | ||
84 | * available pointers. | ||
85 | */ | ||
86 | struct kmem_cache_node **node; | ||
87 | struct array_cache *array[NR_CPUS + MAX_NUMNODES]; | ||
88 | /* | ||
89 | * Do not add fields after array[] | ||
90 | */ | ||
91 | }; | 77 | }; |
92 | 78 | ||
93 | #endif /* _LINUX_SLAB_DEF_H */ | 79 | #endif /* _LINUX_SLAB_DEF_H */ |
diff --git a/include/linux/swap.h b/include/linux/swap.h index 1b72060f093a..37a585beef5c 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -327,8 +327,10 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, | |||
327 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 327 | extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
328 | gfp_t gfp_mask, nodemask_t *mask); | 328 | gfp_t gfp_mask, nodemask_t *mask); |
329 | extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); | 329 | extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); |
330 | extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, | 330 | extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, |
331 | gfp_t gfp_mask, bool noswap); | 331 | unsigned long nr_pages, |
332 | gfp_t gfp_mask, | ||
333 | bool may_swap); | ||
332 | extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, | 334 | extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, |
333 | gfp_t gfp_mask, bool noswap, | 335 | gfp_t gfp_mask, bool noswap, |
334 | struct zone *zone, | 336 | struct zone *zone, |
@@ -354,22 +356,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) | |||
354 | extern int page_evictable(struct page *page); | 356 | extern int page_evictable(struct page *page); |
355 | extern void check_move_unevictable_pages(struct page **, int nr_pages); | 357 | extern void check_move_unevictable_pages(struct page **, int nr_pages); |
356 | 358 | ||
357 | extern unsigned long scan_unevictable_pages; | ||
358 | extern int scan_unevictable_handler(struct ctl_table *, int, | ||
359 | void __user *, size_t *, loff_t *); | ||
360 | #ifdef CONFIG_NUMA | ||
361 | extern int scan_unevictable_register_node(struct node *node); | ||
362 | extern void scan_unevictable_unregister_node(struct node *node); | ||
363 | #else | ||
364 | static inline int scan_unevictable_register_node(struct node *node) | ||
365 | { | ||
366 | return 0; | ||
367 | } | ||
368 | static inline void scan_unevictable_unregister_node(struct node *node) | ||
369 | { | ||
370 | } | ||
371 | #endif | ||
372 | |||
373 | extern int kswapd_run(int nid); | 359 | extern int kswapd_run(int nid); |
374 | extern void kswapd_stop(int nid); | 360 | extern void kswapd_stop(int nid); |
375 | #ifdef CONFIG_MEMCG | 361 | #ifdef CONFIG_MEMCG |
diff --git a/include/linux/topology.h b/include/linux/topology.h index dda6ee521e74..909b6e43b694 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h | |||
@@ -119,11 +119,20 @@ static inline int numa_node_id(void) | |||
119 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). | 119 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). |
120 | */ | 120 | */ |
121 | DECLARE_PER_CPU(int, _numa_mem_); | 121 | DECLARE_PER_CPU(int, _numa_mem_); |
122 | extern int _node_numa_mem_[MAX_NUMNODES]; | ||
122 | 123 | ||
123 | #ifndef set_numa_mem | 124 | #ifndef set_numa_mem |
124 | static inline void set_numa_mem(int node) | 125 | static inline void set_numa_mem(int node) |
125 | { | 126 | { |
126 | this_cpu_write(_numa_mem_, node); | 127 | this_cpu_write(_numa_mem_, node); |
128 | _node_numa_mem_[numa_node_id()] = node; | ||
129 | } | ||
130 | #endif | ||
131 | |||
132 | #ifndef node_to_mem_node | ||
133 | static inline int node_to_mem_node(int node) | ||
134 | { | ||
135 | return _node_numa_mem_[node]; | ||
127 | } | 136 | } |
128 | #endif | 137 | #endif |
129 | 138 | ||
@@ -146,6 +155,7 @@ static inline int cpu_to_mem(int cpu) | |||
146 | static inline void set_cpu_numa_mem(int cpu, int node) | 155 | static inline void set_cpu_numa_mem(int cpu, int node) |
147 | { | 156 | { |
148 | per_cpu(_numa_mem_, cpu) = node; | 157 | per_cpu(_numa_mem_, cpu) = node; |
158 | _node_numa_mem_[cpu_to_node(cpu)] = node; | ||
149 | } | 159 | } |
150 | #endif | 160 | #endif |
151 | 161 | ||
@@ -159,6 +169,13 @@ static inline int numa_mem_id(void) | |||
159 | } | 169 | } |
160 | #endif | 170 | #endif |
161 | 171 | ||
172 | #ifndef node_to_mem_node | ||
173 | static inline int node_to_mem_node(int node) | ||
174 | { | ||
175 | return node; | ||
176 | } | ||
177 | #endif | ||
178 | |||
162 | #ifndef cpu_to_mem | 179 | #ifndef cpu_to_mem |
163 | static inline int cpu_to_mem(int cpu) | 180 | static inline int cpu_to_mem(int cpu) |
164 | { | 181 | { |
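
topology.h above introduces node_to_mem_node() and the _node_numa_mem_[] table so allocators can redirect a memoryless node to its nearest node with memory, with a fallback that simply returns the node itself. A userspace sketch of that lookup/fallback split; the table contents are made up:

/* Build with: gcc -Wall memnode_demo.c -o memnode_demo
 * Add -DHAVE_MEMORYLESS_NODES to exercise the table-based variant. */
#include <stdio.h>

#define MAX_NUMNODES 4

#ifdef HAVE_MEMORYLESS_NODES
/* In the kernel the table is filled by set_numa_mem()/set_cpu_numa_mem();
 * here it is initialized by hand: node 1 has no memory and borrows from
 * node 0. */
static int node_numa_mem[MAX_NUMNODES] = { 0, 0, 2, 3 };

static int node_to_mem_node(int node)
{
	return node_numa_mem[node];
}
#else
/* Fallback when memoryless nodes are not supported: every node is its
 * own memory node. */
static int node_to_mem_node(int node)
{
	return node;
}
#endif

int main(void)
{
	for (int node = 0; node < MAX_NUMNODES; node++)
		printf("node %d allocates from node %d\n",
		       node, node_to_mem_node(node));
	return 0;
}
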
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index ced92345c963..730334cdf037 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -72,6 +72,13 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
72 | THP_ZERO_PAGE_ALLOC, | 72 | THP_ZERO_PAGE_ALLOC, |
73 | THP_ZERO_PAGE_ALLOC_FAILED, | 73 | THP_ZERO_PAGE_ALLOC_FAILED, |
74 | #endif | 74 | #endif |
75 | #ifdef CONFIG_MEMORY_BALLOON | ||
76 | BALLOON_INFLATE, | ||
77 | BALLOON_DEFLATE, | ||
78 | #ifdef CONFIG_BALLOON_COMPACTION | ||
79 | BALLOON_MIGRATE, | ||
80 | #endif | ||
81 | #endif | ||
75 | #ifdef CONFIG_DEBUG_TLBFLUSH | 82 | #ifdef CONFIG_DEBUG_TLBFLUSH |
76 | #ifdef CONFIG_SMP | 83 | #ifdef CONFIG_SMP |
77 | NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ | 84 | NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ |
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index e44d634e7fb7..05c214760977 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h | |||
@@ -46,6 +46,6 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, | |||
46 | enum zs_mapmode mm); | 46 | enum zs_mapmode mm); |
47 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle); | 47 | void zs_unmap_object(struct zs_pool *pool, unsigned long handle); |
48 | 48 | ||
49 | u64 zs_get_total_size_bytes(struct zs_pool *pool); | 49 | unsigned long zs_get_total_pages(struct zs_pool *pool); |
50 | 50 | ||
51 | #endif | 51 | #endif |
diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 5116a0e48172..2f96d233c980 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h | |||
@@ -31,6 +31,7 @@ | |||
31 | 31 | ||
32 | #define KPF_KSM 21 | 32 | #define KPF_KSM 21 |
33 | #define KPF_THP 22 | 33 | #define KPF_THP 22 |
34 | #define KPF_BALLOON 23 | ||
34 | 35 | ||
35 | 36 | ||
36 | #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ | 37 | #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ |
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 58afc04c107e..513df75d0fc9 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _LINUX_PRCTL_H | 1 | #ifndef _LINUX_PRCTL_H |
2 | #define _LINUX_PRCTL_H | 2 | #define _LINUX_PRCTL_H |
3 | 3 | ||
4 | #include <linux/types.h> | ||
5 | |||
4 | /* Values to pass as first argument to prctl() */ | 6 | /* Values to pass as first argument to prctl() */ |
5 | 7 | ||
6 | #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ | 8 | #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ |
@@ -119,6 +121,31 @@ | |||
119 | # define PR_SET_MM_ENV_END 11 | 121 | # define PR_SET_MM_ENV_END 11 |
120 | # define PR_SET_MM_AUXV 12 | 122 | # define PR_SET_MM_AUXV 12 |
121 | # define PR_SET_MM_EXE_FILE 13 | 123 | # define PR_SET_MM_EXE_FILE 13 |
124 | # define PR_SET_MM_MAP 14 | ||
125 | # define PR_SET_MM_MAP_SIZE 15 | ||
126 | |||
127 | /* | ||
128 | * This structure provides a new memory descriptor | ||
129 | * map which mostly modifies /proc/pid/stat[m] | ||
130 | * output for a task. This is mostly done for the | ||
131 | * sake of checkpoint/restore functionality. | ||
132 | */ | ||
133 | struct prctl_mm_map { | ||
134 | __u64 start_code; /* code section bounds */ | ||
135 | __u64 end_code; | ||
136 | __u64 start_data; /* data section bounds */ | ||
137 | __u64 end_data; | ||
138 | __u64 start_brk; /* heap for brk() syscall */ | ||
139 | __u64 brk; | ||
140 | __u64 start_stack; /* stack starts at */ | ||
141 | __u64 arg_start; /* command line arguments bounds */ | ||
142 | __u64 arg_end; | ||
143 | __u64 env_start; /* environment variables bounds */ | ||
144 | __u64 env_end; | ||
145 | __u64 *auxv; /* auxiliary vector */ | ||
146 | __u32 auxv_size; /* vector size */ | ||
147 | __u32 exe_fd; /* /proc/$pid/exe link file */ | ||
148 | }; | ||
122 | 149 | ||
123 | /* | 150 | /* |
124 | * Set specific pid that is allowed to ptrace the current task. | 151 | * Set specific pid that is allowed to ptrace the current task. |
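
prctl.h above adds the PR_SET_MM_MAP and PR_SET_MM_MAP_SIZE sub-options of PR_SET_MM together with struct prctl_mm_map. A minimal probe, assuming a kernel built with CONFIG_CHECKPOINT_RESTORE, uapi headers new enough to carry these definitions, and that PR_SET_MM_MAP_SIZE reports back the structure size the kernel expects; a real PR_SET_MM_MAP call additionally needs a fully populated, self-consistent map and sufficient privilege:

/* Build with: gcc -Wall mm_map_probe.c -o mm_map_probe */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* PR_SET_MM_MAP*, struct prctl_mm_map */

int main(void)
{
#ifdef PR_SET_MM_MAP_SIZE
	unsigned int kernel_size = 0;

	/* Ask the kernel how large it expects struct prctl_mm_map to be. */
	if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE,
		  (unsigned long)&kernel_size, 0, 0) != 0) {
		fprintf(stderr, "PR_SET_MM_MAP_SIZE: %s\n", strerror(errno));
		return 1;
	}

	printf("kernel expects %u bytes, userspace struct is %zu bytes\n",
	       kernel_size, sizeof(struct prctl_mm_map));
	return kernel_size == sizeof(struct prctl_mm_map) ? 0 : 1;
#else
	fputs("headers predate PR_SET_MM_MAP\n", stderr);
	return 1;
#endif
}
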
diff --git a/init/Kconfig b/init/Kconfig index e25a82a291a6..d2355812ba48 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -889,17 +889,6 @@ config ARCH_SUPPORTS_INT128 | |||
889 | config ARCH_WANT_NUMA_VARIABLE_LOCALITY | 889 | config ARCH_WANT_NUMA_VARIABLE_LOCALITY |
890 | bool | 890 | bool |
891 | 891 | ||
892 | # | ||
893 | # For architectures that are willing to define _PAGE_NUMA as _PAGE_PROTNONE | ||
894 | config ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
895 | bool | ||
896 | |||
897 | config ARCH_USES_NUMA_PROT_NONE | ||
898 | bool | ||
899 | default y | ||
900 | depends on ARCH_WANTS_PROT_NUMA_PROT_NONE | ||
901 | depends on NUMA_BALANCING | ||
902 | |||
903 | config NUMA_BALANCING_DEFAULT_ENABLED | 892 | config NUMA_BALANCING_DEFAULT_ENABLED |
904 | bool "Automatically enable NUMA aware memory/task placement" | 893 | bool "Automatically enable NUMA aware memory/task placement" |
905 | default y | 894 | default y |
diff --git a/kernel/acct.c b/kernel/acct.c index b4c667d22e79..33738ef972f3 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct) | |||
472 | acct_t ac; | 472 | acct_t ac; |
473 | unsigned long flim; | 473 | unsigned long flim; |
474 | const struct cred *orig_cred; | 474 | const struct cred *orig_cred; |
475 | struct pid_namespace *ns = acct->ns; | ||
476 | struct file *file = acct->file; | 475 | struct file *file = acct->file; |
477 | 476 | ||
478 | /* | 477 | /* |
@@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct) | |||
500 | ac.ac_gid16 = ac.ac_gid; | 499 | ac.ac_gid16 = ac.ac_gid; |
501 | #endif | 500 | #endif |
502 | #if ACCT_VERSION == 3 | 501 | #if ACCT_VERSION == 3 |
503 | ac.ac_pid = task_tgid_nr_ns(current, ns); | 502 | { |
504 | rcu_read_lock(); | 503 | struct pid_namespace *ns = acct->ns; |
505 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); | 504 | |
506 | rcu_read_unlock(); | 505 | ac.ac_pid = task_tgid_nr_ns(current, ns); |
506 | rcu_read_lock(); | ||
507 | ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), | ||
508 | ns); | ||
509 | rcu_read_unlock(); | ||
510 | } | ||
507 | #endif | 511 | #endif |
508 | /* | 512 | /* |
509 | * Get freeze protection. If the fs is frozen, just skip the write | 513 | * Get freeze protection. If the fs is frozen, just skip the write |
diff --git a/kernel/async.c b/kernel/async.c index 61f023ce0228..4c3773c0bf63 100644 --- a/kernel/async.c +++ b/kernel/async.c | |||
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
115 | 115 | ||
116 | /* 1) run (and print duration) */ | 116 | /* 1) run (and print duration) */ |
117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 117 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
118 | printk(KERN_DEBUG "calling %lli_%pF @ %i\n", | 118 | pr_debug("calling %lli_%pF @ %i\n", |
119 | (long long)entry->cookie, | 119 | (long long)entry->cookie, |
120 | entry->func, task_pid_nr(current)); | 120 | entry->func, task_pid_nr(current)); |
121 | calltime = ktime_get(); | 121 | calltime = ktime_get(); |
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work) | |||
124 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 124 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
125 | rettime = ktime_get(); | 125 | rettime = ktime_get(); |
126 | delta = ktime_sub(rettime, calltime); | 126 | delta = ktime_sub(rettime, calltime); |
127 | printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", | 127 | pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n", |
128 | (long long)entry->cookie, | 128 | (long long)entry->cookie, |
129 | entry->func, | 129 | entry->func, |
130 | (long long)ktime_to_ns(delta) >> 10); | 130 | (long long)ktime_to_ns(delta) >> 10); |
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain | |||
285 | ktime_t uninitialized_var(starttime), delta, endtime; | 285 | ktime_t uninitialized_var(starttime), delta, endtime; |
286 | 286 | ||
287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { | 287 | if (initcall_debug && system_state == SYSTEM_BOOTING) { |
288 | printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); | 288 | pr_debug("async_waiting @ %i\n", task_pid_nr(current)); |
289 | starttime = ktime_get(); | 289 | starttime = ktime_get(); |
290 | } | 290 | } |
291 | 291 | ||
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain | |||
295 | endtime = ktime_get(); | 295 | endtime = ktime_get(); |
296 | delta = ktime_sub(endtime, starttime); | 296 | delta = ktime_sub(endtime, starttime); |
297 | 297 | ||
298 | printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", | 298 | pr_debug("async_continuing @ %i after %lli usec\n", |
299 | task_pid_nr(current), | 299 | task_pid_nr(current), |
300 | (long long)ktime_to_ns(delta) >> 10); | 300 | (long long)ktime_to_ns(delta) >> 10); |
301 | } | 301 | } |
diff --git a/kernel/fork.c b/kernel/fork.c index a91e47d86de2..8c162d102740 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -601,9 +601,8 @@ static void check_mm(struct mm_struct *mm) | |||
601 | printk(KERN_ALERT "BUG: Bad rss-counter state " | 601 | printk(KERN_ALERT "BUG: Bad rss-counter state " |
602 | "mm:%p idx:%d val:%ld\n", mm, i, x); | 602 | "mm:%p idx:%d val:%ld\n", mm, i, x); |
603 | } | 603 | } |
604 | |||
605 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS | 604 | #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS |
606 | VM_BUG_ON(mm->pmd_huge_pte); | 605 | VM_BUG_ON_MM(mm->pmd_huge_pte, mm); |
607 | #endif | 606 | #endif |
608 | } | 607 | } |
609 | 608 | ||
diff --git a/kernel/kthread.c b/kernel/kthread.c index ef483220e855..10e489c448fe 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
369 | { | 369 | { |
370 | struct task_struct *p; | 370 | struct task_struct *p; |
371 | 371 | ||
372 | p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, | 372 | p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, |
373 | cpu); | 373 | cpu); |
374 | if (IS_ERR(p)) | 374 | if (IS_ERR(p)) |
375 | return p; | 375 | return p; |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bfa3c86d0d68..82088b29704e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -1946,7 +1946,7 @@ void task_numa_work(struct callback_head *work) | |||
1946 | vma = mm->mmap; | 1946 | vma = mm->mmap; |
1947 | } | 1947 | } |
1948 | for (; vma; vma = vma->vm_next) { | 1948 | for (; vma; vma = vma->vm_next) { |
1949 | if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) | 1949 | if (!vma_migratable(vma) || !vma_policy_mof(vma)) |
1950 | continue; | 1950 | continue; |
1951 | 1951 | ||
1952 | /* | 1952 | /* |
diff --git a/kernel/sys.c b/kernel/sys.c index ce8129192a26..dfce4debd138 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -62,28 +62,28 @@ | |||
62 | #include <asm/unistd.h> | 62 | #include <asm/unistd.h> |
63 | 63 | ||
64 | #ifndef SET_UNALIGN_CTL | 64 | #ifndef SET_UNALIGN_CTL |
65 | # define SET_UNALIGN_CTL(a,b) (-EINVAL) | 65 | # define SET_UNALIGN_CTL(a, b) (-EINVAL) |
66 | #endif | 66 | #endif |
67 | #ifndef GET_UNALIGN_CTL | 67 | #ifndef GET_UNALIGN_CTL |
68 | # define GET_UNALIGN_CTL(a,b) (-EINVAL) | 68 | # define GET_UNALIGN_CTL(a, b) (-EINVAL) |
69 | #endif | 69 | #endif |
70 | #ifndef SET_FPEMU_CTL | 70 | #ifndef SET_FPEMU_CTL |
71 | # define SET_FPEMU_CTL(a,b) (-EINVAL) | 71 | # define SET_FPEMU_CTL(a, b) (-EINVAL) |
72 | #endif | 72 | #endif |
73 | #ifndef GET_FPEMU_CTL | 73 | #ifndef GET_FPEMU_CTL |
74 | # define GET_FPEMU_CTL(a,b) (-EINVAL) | 74 | # define GET_FPEMU_CTL(a, b) (-EINVAL) |
75 | #endif | 75 | #endif |
76 | #ifndef SET_FPEXC_CTL | 76 | #ifndef SET_FPEXC_CTL |
77 | # define SET_FPEXC_CTL(a,b) (-EINVAL) | 77 | # define SET_FPEXC_CTL(a, b) (-EINVAL) |
78 | #endif | 78 | #endif |
79 | #ifndef GET_FPEXC_CTL | 79 | #ifndef GET_FPEXC_CTL |
80 | # define GET_FPEXC_CTL(a,b) (-EINVAL) | 80 | # define GET_FPEXC_CTL(a, b) (-EINVAL) |
81 | #endif | 81 | #endif |
82 | #ifndef GET_ENDIAN | 82 | #ifndef GET_ENDIAN |
83 | # define GET_ENDIAN(a,b) (-EINVAL) | 83 | # define GET_ENDIAN(a, b) (-EINVAL) |
84 | #endif | 84 | #endif |
85 | #ifndef SET_ENDIAN | 85 | #ifndef SET_ENDIAN |
86 | # define SET_ENDIAN(a,b) (-EINVAL) | 86 | # define SET_ENDIAN(a, b) (-EINVAL) |
87 | #endif | 87 | #endif |
88 | #ifndef GET_TSC_CTL | 88 | #ifndef GET_TSC_CTL |
89 | # define GET_TSC_CTL(a) (-EINVAL) | 89 | # define GET_TSC_CTL(a) (-EINVAL) |
@@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) | |||
182 | rcu_read_lock(); | 182 | rcu_read_lock(); |
183 | read_lock(&tasklist_lock); | 183 | read_lock(&tasklist_lock); |
184 | switch (which) { | 184 | switch (which) { |
185 | case PRIO_PROCESS: | 185 | case PRIO_PROCESS: |
186 | if (who) | 186 | if (who) |
187 | p = find_task_by_vpid(who); | 187 | p = find_task_by_vpid(who); |
188 | else | 188 | else |
189 | p = current; | 189 | p = current; |
190 | if (p) | 190 | if (p) |
191 | error = set_one_prio(p, niceval, error); | 191 | error = set_one_prio(p, niceval, error); |
192 | break; | 192 | break; |
193 | case PRIO_PGRP: | 193 | case PRIO_PGRP: |
194 | if (who) | 194 | if (who) |
195 | pgrp = find_vpid(who); | 195 | pgrp = find_vpid(who); |
196 | else | 196 | else |
197 | pgrp = task_pgrp(current); | 197 | pgrp = task_pgrp(current); |
198 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | 198 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { |
199 | error = set_one_prio(p, niceval, error); | 199 | error = set_one_prio(p, niceval, error); |
200 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | 200 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); |
201 | break; | 201 | break; |
202 | case PRIO_USER: | 202 | case PRIO_USER: |
203 | uid = make_kuid(cred->user_ns, who); | 203 | uid = make_kuid(cred->user_ns, who); |
204 | user = cred->user; | 204 | user = cred->user; |
205 | if (!who) | 205 | if (!who) |
206 | uid = cred->uid; | 206 | uid = cred->uid; |
207 | else if (!uid_eq(uid, cred->uid) && | 207 | else if (!uid_eq(uid, cred->uid)) { |
208 | !(user = find_user(uid))) | 208 | user = find_user(uid); |
209 | if (!user) | ||
209 | goto out_unlock; /* No processes for this user */ | 210 | goto out_unlock; /* No processes for this user */ |
210 | 211 | } | |
211 | do_each_thread(g, p) { | 212 | do_each_thread(g, p) { |
212 | if (uid_eq(task_uid(p), uid)) | 213 | if (uid_eq(task_uid(p), uid)) |
213 | error = set_one_prio(p, niceval, error); | 214 | error = set_one_prio(p, niceval, error); |
214 | } while_each_thread(g, p); | 215 | } while_each_thread(g, p); |
215 | if (!uid_eq(uid, cred->uid)) | 216 | if (!uid_eq(uid, cred->uid)) |
216 | free_uid(user); /* For find_user() */ | 217 | free_uid(user); /* For find_user() */ |
217 | break; | 218 | break; |
218 | } | 219 | } |
219 | out_unlock: | 220 | out_unlock: |
220 | read_unlock(&tasklist_lock); | 221 | read_unlock(&tasklist_lock); |
@@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) | |||
244 | rcu_read_lock(); | 245 | rcu_read_lock(); |
245 | read_lock(&tasklist_lock); | 246 | read_lock(&tasklist_lock); |
246 | switch (which) { | 247 | switch (which) { |
247 | case PRIO_PROCESS: | 248 | case PRIO_PROCESS: |
248 | if (who) | 249 | if (who) |
249 | p = find_task_by_vpid(who); | 250 | p = find_task_by_vpid(who); |
250 | else | 251 | else |
251 | p = current; | 252 | p = current; |
252 | if (p) { | 253 | if (p) { |
254 | niceval = nice_to_rlimit(task_nice(p)); | ||
255 | if (niceval > retval) | ||
256 | retval = niceval; | ||
257 | } | ||
258 | break; | ||
259 | case PRIO_PGRP: | ||
260 | if (who) | ||
261 | pgrp = find_vpid(who); | ||
262 | else | ||
263 | pgrp = task_pgrp(current); | ||
264 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | ||
265 | niceval = nice_to_rlimit(task_nice(p)); | ||
266 | if (niceval > retval) | ||
267 | retval = niceval; | ||
268 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | ||
269 | break; | ||
270 | case PRIO_USER: | ||
271 | uid = make_kuid(cred->user_ns, who); | ||
272 | user = cred->user; | ||
273 | if (!who) | ||
274 | uid = cred->uid; | ||
275 | else if (!uid_eq(uid, cred->uid)) { | ||
276 | user = find_user(uid); | ||
277 | if (!user) | ||
278 | goto out_unlock; /* No processes for this user */ | ||
279 | } | ||
280 | do_each_thread(g, p) { | ||
281 | if (uid_eq(task_uid(p), uid)) { | ||
253 | niceval = nice_to_rlimit(task_nice(p)); | 282 | niceval = nice_to_rlimit(task_nice(p)); |
254 | if (niceval > retval) | 283 | if (niceval > retval) |
255 | retval = niceval; | 284 | retval = niceval; |
256 | } | 285 | } |
257 | break; | 286 | } while_each_thread(g, p); |
258 | case PRIO_PGRP: | 287 | if (!uid_eq(uid, cred->uid)) |
259 | if (who) | 288 | free_uid(user); /* for find_user() */ |
260 | pgrp = find_vpid(who); | 289 | break; |
261 | else | ||
262 | pgrp = task_pgrp(current); | ||
263 | do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { | ||
264 | niceval = nice_to_rlimit(task_nice(p)); | ||
265 | if (niceval > retval) | ||
266 | retval = niceval; | ||
267 | } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); | ||
268 | break; | ||
269 | case PRIO_USER: | ||
270 | uid = make_kuid(cred->user_ns, who); | ||
271 | user = cred->user; | ||
272 | if (!who) | ||
273 | uid = cred->uid; | ||
274 | else if (!uid_eq(uid, cred->uid) && | ||
275 | !(user = find_user(uid))) | ||
276 | goto out_unlock; /* No processes for this user */ | ||
277 | |||
278 | do_each_thread(g, p) { | ||
279 | if (uid_eq(task_uid(p), uid)) { | ||
280 | niceval = nice_to_rlimit(task_nice(p)); | ||
281 | if (niceval > retval) | ||
282 | retval = niceval; | ||
283 | } | ||
284 | } while_each_thread(g, p); | ||
285 | if (!uid_eq(uid, cred->uid)) | ||
286 | free_uid(user); /* for find_user() */ | ||
287 | break; | ||
288 | } | 290 | } |
289 | out_unlock: | 291 | out_unlock: |
290 | read_unlock(&tasklist_lock); | 292 | read_unlock(&tasklist_lock); |
@@ -306,7 +308,7 @@ out_unlock: | |||
306 | * | 308 | * |
307 | * The general idea is that a program which uses just setregid() will be | 309 | * The general idea is that a program which uses just setregid() will be |
308 | * 100% compatible with BSD. A program which uses just setgid() will be | 310 | * 100% compatible with BSD. A program which uses just setgid() will be |
309 | * 100% compatible with POSIX with saved IDs. | 311 | * 100% compatible with POSIX with saved IDs. |
310 | * | 312 | * |
311 | * SMP: There are no races, the GIDs are checked only by filesystem | 313 | * SMP: There are no races, the GIDs are checked only by filesystem |
312 | * operations (as far as semantic preservation is concerned). | 314 | * operations (as far as semantic preservation is concerned). |
@@ -364,7 +366,7 @@ error: | |||
364 | } | 366 | } |
365 | 367 | ||
366 | /* | 368 | /* |
367 | * setgid() is implemented like SysV w/ SAVED_IDS | 369 | * setgid() is implemented like SysV w/ SAVED_IDS |
368 | * | 370 | * |
369 | * SMP: Same implicit races as above. | 371 | * SMP: Same implicit races as above. |
370 | */ | 372 | */ |
@@ -442,7 +444,7 @@ static int set_user(struct cred *new) | |||
442 | * | 444 | * |
443 | * The general idea is that a program which uses just setreuid() will be | 445 | * The general idea is that a program which uses just setreuid() will be |
444 | * 100% compatible with BSD. A program which uses just setuid() will be | 446 | * 100% compatible with BSD. A program which uses just setuid() will be |
445 | * 100% compatible with POSIX with saved IDs. | 447 | * 100% compatible with POSIX with saved IDs. |
446 | */ | 448 | */ |
447 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) | 449 | SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) |
448 | { | 450 | { |
@@ -503,17 +505,17 @@ error: | |||
503 | abort_creds(new); | 505 | abort_creds(new); |
504 | return retval; | 506 | return retval; |
505 | } | 507 | } |
506 | 508 | ||
507 | /* | 509 | /* |
508 | * setuid() is implemented like SysV with SAVED_IDS | 510 | * setuid() is implemented like SysV with SAVED_IDS |
509 | * | 511 | * |
510 | * Note that SAVED_ID's is deficient in that a setuid root program | 512 | * Note that SAVED_ID's is deficient in that a setuid root program |
511 | * like sendmail, for example, cannot set its uid to be a normal | 513 | * like sendmail, for example, cannot set its uid to be a normal |
512 | * user and then switch back, because if you're root, setuid() sets | 514 | * user and then switch back, because if you're root, setuid() sets |
513 | * the saved uid too. If you don't like this, blame the bright people | 515 | * the saved uid too. If you don't like this, blame the bright people |
514 | * in the POSIX committee and/or USG. Note that the BSD-style setreuid() | 516 | * in the POSIX committee and/or USG. Note that the BSD-style setreuid() |
515 | * will allow a root program to temporarily drop privileges and be able to | 517 | * will allow a root program to temporarily drop privileges and be able to |
516 | * regain them by swapping the real and effective uid. | 518 | * regain them by swapping the real and effective uid. |
517 | */ | 519 | */ |
518 | SYSCALL_DEFINE1(setuid, uid_t, uid) | 520 | SYSCALL_DEFINE1(setuid, uid_t, uid) |
519 | { | 521 | { |
@@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _ | |||
637 | euid = from_kuid_munged(cred->user_ns, cred->euid); | 639 | euid = from_kuid_munged(cred->user_ns, cred->euid); |
638 | suid = from_kuid_munged(cred->user_ns, cred->suid); | 640 | suid = from_kuid_munged(cred->user_ns, cred->suid); |
639 | 641 | ||
640 | if (!(retval = put_user(ruid, ruidp)) && | 642 | retval = put_user(ruid, ruidp); |
641 | !(retval = put_user(euid, euidp))) | 643 | if (!retval) { |
642 | retval = put_user(suid, suidp); | 644 | retval = put_user(euid, euidp); |
643 | 645 | if (!retval) | |
646 | return put_user(suid, suidp); | ||
647 | } | ||
644 | return retval; | 648 | return retval; |
645 | } | 649 | } |
646 | 650 | ||
@@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _ | |||
709 | egid = from_kgid_munged(cred->user_ns, cred->egid); | 713 | egid = from_kgid_munged(cred->user_ns, cred->egid); |
710 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); | 714 | sgid = from_kgid_munged(cred->user_ns, cred->sgid); |
711 | 715 | ||
712 | if (!(retval = put_user(rgid, rgidp)) && | 716 | retval = put_user(rgid, rgidp); |
713 | !(retval = put_user(egid, egidp))) | 717 | if (!retval) { |
714 | retval = put_user(sgid, sgidp); | 718 | retval = put_user(egid, egidp); |
719 | if (!retval) | ||
720 | retval = put_user(sgid, sgidp); | ||
721 | } | ||
715 | 722 | ||
716 | return retval; | 723 | return retval; |
717 | } | 724 | } |
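The getresuid()/getresgid() rework above only restructures the put_user() error handling; the userspace view is unchanged. A small usage sketch (both calls are exposed by glibc under _GNU_SOURCE):

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/types.h>

    int main(void)
    {
        uid_t ruid, euid, suid;
        gid_t rgid, egid, sgid;

        if (getresuid(&ruid, &euid, &suid) == -1 ||
            getresgid(&rgid, &egid, &sgid) == -1) {
            perror("getres[ug]id");
            return 1;
        }
        printf("uid: real=%d effective=%d saved=%d\n",
               (int)ruid, (int)euid, (int)suid);
        printf("gid: real=%d effective=%d saved=%d\n",
               (int)rgid, (int)egid, (int)sgid);
        return 0;
    }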
@@ -1284,7 +1291,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) | |||
1284 | /* | 1291 | /* |
1285 | * Back compatibility for getrlimit. Needed for some apps. | 1292 | * Back compatibility for getrlimit. Needed for some apps. |
1286 | */ | 1293 | */ |
1287 | |||
1288 | SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | 1294 | SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, |
1289 | struct rlimit __user *, rlim) | 1295 | struct rlimit __user *, rlim) |
1290 | { | 1296 | { |
@@ -1299,7 +1305,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, | |||
1299 | x.rlim_cur = 0x7FFFFFFF; | 1305 | x.rlim_cur = 0x7FFFFFFF; |
1300 | if (x.rlim_max > 0x7FFFFFFF) | 1306 | if (x.rlim_max > 0x7FFFFFFF) |
1301 | x.rlim_max = 0x7FFFFFFF; | 1307 | x.rlim_max = 0x7FFFFFFF; |
1302 | return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; | 1308 | return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0; |
1303 | } | 1309 | } |
1304 | 1310 | ||
1305 | #endif | 1311 | #endif |
@@ -1527,7 +1533,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1527 | cputime_t tgutime, tgstime, utime, stime; | 1533 | cputime_t tgutime, tgstime, utime, stime; |
1528 | unsigned long maxrss = 0; | 1534 | unsigned long maxrss = 0; |
1529 | 1535 | ||
1530 | memset((char *) r, 0, sizeof *r); | 1536 | memset((char *)r, 0, sizeof (*r)); |
1531 | utime = stime = 0; | 1537 | utime = stime = 0; |
1532 | 1538 | ||
1533 | if (who == RUSAGE_THREAD) { | 1539 | if (who == RUSAGE_THREAD) { |
@@ -1541,41 +1547,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) | |||
1541 | return; | 1547 | return; |
1542 | 1548 | ||
1543 | switch (who) { | 1549 | switch (who) { |
1544 | case RUSAGE_BOTH: | 1550 | case RUSAGE_BOTH: |
1545 | case RUSAGE_CHILDREN: | 1551 | case RUSAGE_CHILDREN: |
1546 | utime = p->signal->cutime; | 1552 | utime = p->signal->cutime; |
1547 | stime = p->signal->cstime; | 1553 | stime = p->signal->cstime; |
1548 | r->ru_nvcsw = p->signal->cnvcsw; | 1554 | r->ru_nvcsw = p->signal->cnvcsw; |
1549 | r->ru_nivcsw = p->signal->cnivcsw; | 1555 | r->ru_nivcsw = p->signal->cnivcsw; |
1550 | r->ru_minflt = p->signal->cmin_flt; | 1556 | r->ru_minflt = p->signal->cmin_flt; |
1551 | r->ru_majflt = p->signal->cmaj_flt; | 1557 | r->ru_majflt = p->signal->cmaj_flt; |
1552 | r->ru_inblock = p->signal->cinblock; | 1558 | r->ru_inblock = p->signal->cinblock; |
1553 | r->ru_oublock = p->signal->coublock; | 1559 | r->ru_oublock = p->signal->coublock; |
1554 | maxrss = p->signal->cmaxrss; | 1560 | maxrss = p->signal->cmaxrss; |
1555 | 1561 | ||
1556 | if (who == RUSAGE_CHILDREN) | 1562 | if (who == RUSAGE_CHILDREN) |
1557 | break; | ||
1558 | |||
1559 | case RUSAGE_SELF: | ||
1560 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); | ||
1561 | utime += tgutime; | ||
1562 | stime += tgstime; | ||
1563 | r->ru_nvcsw += p->signal->nvcsw; | ||
1564 | r->ru_nivcsw += p->signal->nivcsw; | ||
1565 | r->ru_minflt += p->signal->min_flt; | ||
1566 | r->ru_majflt += p->signal->maj_flt; | ||
1567 | r->ru_inblock += p->signal->inblock; | ||
1568 | r->ru_oublock += p->signal->oublock; | ||
1569 | if (maxrss < p->signal->maxrss) | ||
1570 | maxrss = p->signal->maxrss; | ||
1571 | t = p; | ||
1572 | do { | ||
1573 | accumulate_thread_rusage(t, r); | ||
1574 | } while_each_thread(p, t); | ||
1575 | break; | 1563 | break; |
1576 | 1564 | ||
1577 | default: | 1565 | case RUSAGE_SELF: |
1578 | BUG(); | 1566 | thread_group_cputime_adjusted(p, &tgutime, &tgstime); |
1567 | utime += tgutime; | ||
1568 | stime += tgstime; | ||
1569 | r->ru_nvcsw += p->signal->nvcsw; | ||
1570 | r->ru_nivcsw += p->signal->nivcsw; | ||
1571 | r->ru_minflt += p->signal->min_flt; | ||
1572 | r->ru_majflt += p->signal->maj_flt; | ||
1573 | r->ru_inblock += p->signal->inblock; | ||
1574 | r->ru_oublock += p->signal->oublock; | ||
1575 | if (maxrss < p->signal->maxrss) | ||
1576 | maxrss = p->signal->maxrss; | ||
1577 | t = p; | ||
1578 | do { | ||
1579 | accumulate_thread_rusage(t, r); | ||
1580 | } while_each_thread(p, t); | ||
1581 | break; | ||
1582 | |||
1583 | default: | ||
1584 | BUG(); | ||
1579 | } | 1585 | } |
1580 | unlock_task_sighand(p, &flags); | 1586 | unlock_task_sighand(p, &flags); |
1581 | 1587 | ||
@@ -1585,6 +1591,7 @@ out: | |||
1585 | 1591 | ||
1586 | if (who != RUSAGE_CHILDREN) { | 1592 | if (who != RUSAGE_CHILDREN) { |
1587 | struct mm_struct *mm = get_task_mm(p); | 1593 | struct mm_struct *mm = get_task_mm(p); |
1594 | |||
1588 | if (mm) { | 1595 | if (mm) { |
1589 | setmax_mm_hiwater_rss(&maxrss, mm); | 1596 | setmax_mm_hiwater_rss(&maxrss, mm); |
1590 | mmput(mm); | 1597 | mmput(mm); |
@@ -1596,6 +1603,7 @@ out: | |||
1596 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) | 1603 | int getrusage(struct task_struct *p, int who, struct rusage __user *ru) |
1597 | { | 1604 | { |
1598 | struct rusage r; | 1605 | struct rusage r; |
1606 | |||
1599 | k_getrusage(p, who, &r); | 1607 | k_getrusage(p, who, &r); |
1600 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; | 1608 | return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; |
1601 | } | 1609 | } |
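For reference, the RUSAGE_SELF / RUSAGE_CHILDREN split that k_getrusage() implements above is what a caller sees through the ordinary wrapper; a short userspace sketch:

    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rusage self, children;

        if (getrusage(RUSAGE_SELF, &self) == -1 ||
            getrusage(RUSAGE_CHILDREN, &children) == -1) {
            perror("getrusage");
            return 1;
        }
        /* On Linux ru_maxrss is reported in kilobytes. */
        printf("self:     maxrss=%ld kB, minflt=%ld, voluntary switches=%ld\n",
               self.ru_maxrss, self.ru_minflt, self.ru_nvcsw);
        printf("children: maxrss=%ld kB, minflt=%ld\n",
               children.ru_maxrss, children.ru_minflt);
        return 0;
    }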
@@ -1628,12 +1636,14 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
1628 | return mask; | 1636 | return mask; |
1629 | } | 1637 | } |
1630 | 1638 | ||
1631 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1639 | static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd) |
1632 | { | 1640 | { |
1633 | struct fd exe; | 1641 | struct fd exe; |
1634 | struct inode *inode; | 1642 | struct inode *inode; |
1635 | int err; | 1643 | int err; |
1636 | 1644 | ||
1645 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); | ||
1646 | |||
1637 | exe = fdget(fd); | 1647 | exe = fdget(fd); |
1638 | if (!exe.file) | 1648 | if (!exe.file) |
1639 | return -EBADF; | 1649 | return -EBADF; |
@@ -1654,8 +1664,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1654 | if (err) | 1664 | if (err) |
1655 | goto exit; | 1665 | goto exit; |
1656 | 1666 | ||
1657 | down_write(&mm->mmap_sem); | ||
1658 | |||
1659 | /* | 1667 | /* |
1660 | * Forbid mm->exe_file change if old file still mapped. | 1668 | * Forbid mm->exe_file change if old file still mapped. |
1661 | */ | 1669 | */ |
@@ -1667,7 +1675,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1667 | if (vma->vm_file && | 1675 | if (vma->vm_file && |
1668 | path_equal(&vma->vm_file->f_path, | 1676 | path_equal(&vma->vm_file->f_path, |
1669 | &mm->exe_file->f_path)) | 1677 | &mm->exe_file->f_path)) |
1670 | goto exit_unlock; | 1678 | goto exit; |
1671 | } | 1679 | } |
1672 | 1680 | ||
1673 | /* | 1681 | /* |
@@ -1678,34 +1686,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
1678 | */ | 1686 | */ |
1679 | err = -EPERM; | 1687 | err = -EPERM; |
1680 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) | 1688 | if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) |
1681 | goto exit_unlock; | 1689 | goto exit; |
1682 | 1690 | ||
1683 | err = 0; | 1691 | err = 0; |
1684 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ | 1692 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ |
1685 | exit_unlock: | ||
1686 | up_write(&mm->mmap_sem); | ||
1687 | |||
1688 | exit: | 1693 | exit: |
1689 | fdput(exe); | 1694 | fdput(exe); |
1690 | return err; | 1695 | return err; |
1691 | } | 1696 | } |
1692 | 1697 | ||
1698 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1699 | /* | ||
1700 | * WARNING: we don't require any capability here, so be very careful | ||
1701 | * about what userspace is allowed to modify. | ||
1702 | */ | ||
1703 | static int validate_prctl_map(struct prctl_mm_map *prctl_map) | ||
1704 | { | ||
1705 | unsigned long mmap_max_addr = TASK_SIZE; | ||
1706 | struct mm_struct *mm = current->mm; | ||
1707 | int error = -EINVAL, i; | ||
1708 | |||
1709 | static const unsigned char offsets[] = { | ||
1710 | offsetof(struct prctl_mm_map, start_code), | ||
1711 | offsetof(struct prctl_mm_map, end_code), | ||
1712 | offsetof(struct prctl_mm_map, start_data), | ||
1713 | offsetof(struct prctl_mm_map, end_data), | ||
1714 | offsetof(struct prctl_mm_map, start_brk), | ||
1715 | offsetof(struct prctl_mm_map, brk), | ||
1716 | offsetof(struct prctl_mm_map, start_stack), | ||
1717 | offsetof(struct prctl_mm_map, arg_start), | ||
1718 | offsetof(struct prctl_mm_map, arg_end), | ||
1719 | offsetof(struct prctl_mm_map, env_start), | ||
1720 | offsetof(struct prctl_mm_map, env_end), | ||
1721 | }; | ||
1722 | |||
1723 | /* | ||
1724 | * Make sure the members are not somewhere outside | ||
1725 | * of allowed address space. | ||
1726 | */ | ||
1727 | for (i = 0; i < ARRAY_SIZE(offsets); i++) { | ||
1728 | u64 val = *(u64 *)((char *)prctl_map + offsets[i]); | ||
1729 | |||
1730 | if ((unsigned long)val >= mmap_max_addr || | ||
1731 | (unsigned long)val < mmap_min_addr) | ||
1732 | goto out; | ||
1733 | } | ||
1734 | |||
1735 | /* | ||
1736 | * Make sure the pairs are ordered. | ||
1737 | */ | ||
1738 | #define __prctl_check_order(__m1, __op, __m2) \ | ||
1739 | ((unsigned long)prctl_map->__m1 __op \ | ||
1740 | (unsigned long)prctl_map->__m2) ? 0 : -EINVAL | ||
1741 | error = __prctl_check_order(start_code, <, end_code); | ||
1742 | error |= __prctl_check_order(start_data, <, end_data); | ||
1743 | error |= __prctl_check_order(start_brk, <=, brk); | ||
1744 | error |= __prctl_check_order(arg_start, <=, arg_end); | ||
1745 | error |= __prctl_check_order(env_start, <=, env_end); | ||
1746 | if (error) | ||
1747 | goto out; | ||
1748 | #undef __prctl_check_order | ||
1749 | |||
1750 | error = -EINVAL; | ||
1751 | |||
1752 | /* | ||
1753 | * @brk should be after @end_data in traditional maps. | ||
1754 | */ | ||
1755 | if (prctl_map->start_brk <= prctl_map->end_data || | ||
1756 | prctl_map->brk <= prctl_map->end_data) | ||
1757 | goto out; | ||
1758 | |||
1759 | /* | ||
1760 | * Also, we should not allow the limits to be overridden if they are set. | ||
1761 | */ | ||
1762 | if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk, | ||
1763 | prctl_map->start_brk, prctl_map->end_data, | ||
1764 | prctl_map->start_data)) | ||
1765 | goto out; | ||
1766 | |||
1767 | /* | ||
1768 | * Someone is trying to cheat the auxv vector. | ||
1769 | */ | ||
1770 | if (prctl_map->auxv_size) { | ||
1771 | if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) | ||
1772 | goto out; | ||
1773 | } | ||
1774 | |||
1775 | /* | ||
1776 | * Finally, make sure the caller has the rights to | ||
1777 | * change /proc/pid/exe link: only local root should | ||
1778 | * be allowed to. | ||
1779 | */ | ||
1780 | if (prctl_map->exe_fd != (u32)-1) { | ||
1781 | struct user_namespace *ns = current_user_ns(); | ||
1782 | const struct cred *cred = current_cred(); | ||
1783 | |||
1784 | if (!uid_eq(cred->uid, make_kuid(ns, 0)) || | ||
1785 | !gid_eq(cred->gid, make_kgid(ns, 0))) | ||
1786 | goto out; | ||
1787 | } | ||
1788 | |||
1789 | error = 0; | ||
1790 | out: | ||
1791 | return error; | ||
1792 | } | ||
1793 | |||
1794 | static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size) | ||
1795 | { | ||
1796 | struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, }; | ||
1797 | unsigned long user_auxv[AT_VECTOR_SIZE]; | ||
1798 | struct mm_struct *mm = current->mm; | ||
1799 | int error; | ||
1800 | |||
1801 | BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv)); | ||
1802 | BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256); | ||
1803 | |||
1804 | if (opt == PR_SET_MM_MAP_SIZE) | ||
1805 | return put_user((unsigned int)sizeof(prctl_map), | ||
1806 | (unsigned int __user *)addr); | ||
1807 | |||
1808 | if (data_size != sizeof(prctl_map)) | ||
1809 | return -EINVAL; | ||
1810 | |||
1811 | if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) | ||
1812 | return -EFAULT; | ||
1813 | |||
1814 | error = validate_prctl_map(&prctl_map); | ||
1815 | if (error) | ||
1816 | return error; | ||
1817 | |||
1818 | if (prctl_map.auxv_size) { | ||
1819 | memset(user_auxv, 0, sizeof(user_auxv)); | ||
1820 | if (copy_from_user(user_auxv, | ||
1821 | (const void __user *)prctl_map.auxv, | ||
1822 | prctl_map.auxv_size)) | ||
1823 | return -EFAULT; | ||
1824 | |||
1825 | /* Last entry must be AT_NULL as specification requires */ | ||
1826 | user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL; | ||
1827 | user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; | ||
1828 | } | ||
1829 | |||
1830 | down_write(&mm->mmap_sem); | ||
1831 | if (prctl_map.exe_fd != (u32)-1) | ||
1832 | error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd); | ||
1833 | downgrade_write(&mm->mmap_sem); | ||
1834 | if (error) | ||
1835 | goto out; | ||
1836 | |||
1837 | /* | ||
1838 | * We don't validate whether these members point to real, | ||
1839 | * currently present VMAs, because the application may already | ||
1840 | * have unmapped the corresponding VMAs and the kernel mostly uses | ||
1841 | * these members for statistics output in procfs, except for | ||
1842 | * | ||
1843 | * - @start_brk/@brk, which are used in do_brk; the kernel looks up | ||
1844 | * the VMAs when updating these members, so anything bogus written | ||
1845 | * here makes the kernel complain about the userspace program but | ||
1846 | * won't lead to any problem in the kernel itself | ||
1847 | */ | ||
1848 | |||
1849 | mm->start_code = prctl_map.start_code; | ||
1850 | mm->end_code = prctl_map.end_code; | ||
1851 | mm->start_data = prctl_map.start_data; | ||
1852 | mm->end_data = prctl_map.end_data; | ||
1853 | mm->start_brk = prctl_map.start_brk; | ||
1854 | mm->brk = prctl_map.brk; | ||
1855 | mm->start_stack = prctl_map.start_stack; | ||
1856 | mm->arg_start = prctl_map.arg_start; | ||
1857 | mm->arg_end = prctl_map.arg_end; | ||
1858 | mm->env_start = prctl_map.env_start; | ||
1859 | mm->env_end = prctl_map.env_end; | ||
1860 | |||
1861 | /* | ||
1862 | * Note this update of @saved_auxv is lockless, so if | ||
1863 | * someone reads this member in procfs while we're | ||
1864 | * updating, they may see partly updated results. That | ||
1865 | * is a known and acceptable trade-off: we leave it as is | ||
1866 | * so as not to introduce additional locks here and make | ||
1867 | * the kernel more complex. | ||
1868 | */ | ||
1869 | if (prctl_map.auxv_size) | ||
1870 | memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); | ||
1871 | |||
1872 | error = 0; | ||
1873 | out: | ||
1874 | up_read(&mm->mmap_sem); | ||
1875 | return error; | ||
1876 | } | ||
1877 | #endif /* CONFIG_CHECKPOINT_RESTORE */ | ||
1878 | |||
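The new PR_SET_MM_MAP interface is aimed at checkpoint/restore tools, which set the whole mm layout in one call instead of issuing a dozen separate PR_SET_MM_* requests. A hedged userspace sketch of the two-step handshake follows; it assumes a kernel with CONFIG_CHECKPOINT_RESTORE and uapi headers new enough to provide PR_SET_MM_MAP and struct prctl_mm_map, and the second call is shown for its shape only, since dummy addresses are rejected by validate_prctl_map():

    #include <stdio.h>
    #include <string.h>
    #include <sys/prctl.h>

    int main(void)
    {
        unsigned int size = 0;
        struct prctl_mm_map map;

        /* Step 1: ask the kernel how large it expects the structure to be. */
        if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0)) {
            perror("PR_SET_MM_MAP_SIZE");
            return 1;
        }
        printf("kernel expects %u bytes, userspace struct is %zu bytes\n",
               size, sizeof(map));

        /* Step 2 (shape only): install a map describing the mm layout.
         * Real users (e.g. restore tools) fill every field with the
         * addresses of the task being restored; zeroed values like these
         * fail the validation above with -EINVAL. */
        memset(&map, 0, sizeof(map));
        map.exe_fd = (unsigned int)-1;  /* don't touch /proc/self/exe */
        if (prctl(PR_SET_MM, PR_SET_MM_MAP, (unsigned long)&map,
                  sizeof(map), 0))
            perror("PR_SET_MM_MAP (expected to fail with dummy values)");
        return 0;
    }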
1693 | static int prctl_set_mm(int opt, unsigned long addr, | 1879 | static int prctl_set_mm(int opt, unsigned long addr, |
1694 | unsigned long arg4, unsigned long arg5) | 1880 | unsigned long arg4, unsigned long arg5) |
1695 | { | 1881 | { |
1696 | unsigned long rlim = rlimit(RLIMIT_DATA); | ||
1697 | struct mm_struct *mm = current->mm; | 1882 | struct mm_struct *mm = current->mm; |
1698 | struct vm_area_struct *vma; | 1883 | struct vm_area_struct *vma; |
1699 | int error; | 1884 | int error; |
1700 | 1885 | ||
1701 | if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) | 1886 | if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV && |
1887 | opt != PR_SET_MM_MAP && | ||
1888 | opt != PR_SET_MM_MAP_SIZE))) | ||
1702 | return -EINVAL; | 1889 | return -EINVAL; |
1703 | 1890 | ||
1891 | #ifdef CONFIG_CHECKPOINT_RESTORE | ||
1892 | if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE) | ||
1893 | return prctl_set_mm_map(opt, (const void __user *)addr, arg4); | ||
1894 | #endif | ||
1895 | |||
1704 | if (!capable(CAP_SYS_RESOURCE)) | 1896 | if (!capable(CAP_SYS_RESOURCE)) |
1705 | return -EPERM; | 1897 | return -EPERM; |
1706 | 1898 | ||
1707 | if (opt == PR_SET_MM_EXE_FILE) | 1899 | if (opt == PR_SET_MM_EXE_FILE) { |
1708 | return prctl_set_mm_exe_file(mm, (unsigned int)addr); | 1900 | down_write(&mm->mmap_sem); |
1901 | error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr); | ||
1902 | up_write(&mm->mmap_sem); | ||
1903 | return error; | ||
1904 | } | ||
1709 | 1905 | ||
1710 | if (addr >= TASK_SIZE || addr < mmap_min_addr) | 1906 | if (addr >= TASK_SIZE || addr < mmap_min_addr) |
1711 | return -EINVAL; | 1907 | return -EINVAL; |
@@ -1733,9 +1929,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1733 | if (addr <= mm->end_data) | 1929 | if (addr <= mm->end_data) |
1734 | goto out; | 1930 | goto out; |
1735 | 1931 | ||
1736 | if (rlim < RLIM_INFINITY && | 1932 | if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr, |
1737 | (mm->brk - addr) + | 1933 | mm->end_data, mm->start_data)) |
1738 | (mm->end_data - mm->start_data) > rlim) | ||
1739 | goto out; | 1934 | goto out; |
1740 | 1935 | ||
1741 | mm->start_brk = addr; | 1936 | mm->start_brk = addr; |
@@ -1745,9 +1940,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
1745 | if (addr <= mm->end_data) | 1940 | if (addr <= mm->end_data) |
1746 | goto out; | 1941 | goto out; |
1747 | 1942 | ||
1748 | if (rlim < RLIM_INFINITY && | 1943 | if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk, |
1749 | (addr - mm->start_brk) + | 1944 | mm->end_data, mm->start_data)) |
1750 | (mm->end_data - mm->start_data) > rlim) | ||
1751 | goto out; | 1945 | goto out; |
1752 | 1946 | ||
1753 | mm->brk = addr; | 1947 | mm->brk = addr; |
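Both rlimit tests above are now routed through check_data_rlimit(). Going by the open-coded checks it replaces, the helper amounts to roughly the following; this is a sketch of the equivalent logic, not necessarily the exact in-tree definition, and the callers only care about zero versus non-zero:

    static inline int check_data_rlimit(unsigned long rlim,
                                        unsigned long new,
                                        unsigned long start,
                                        unsigned long end_data,
                                        unsigned long start_data)
    {
        /* Same test as the removed lines: proposed brk span plus the
         * data segment must stay within RLIMIT_DATA. */
        if (rlim < RLIM_INFINITY &&
            (new - start) + (end_data - start_data) > rlim)
            return -ENOSPC;     /* any non-zero value makes the caller bail */

        return 0;
    }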
@@ -2023,6 +2217,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, | |||
2023 | { | 2217 | { |
2024 | int err = 0; | 2218 | int err = 0; |
2025 | int cpu = raw_smp_processor_id(); | 2219 | int cpu = raw_smp_processor_id(); |
2220 | |||
2026 | if (cpup) | 2221 | if (cpup) |
2027 | err |= put_user(cpu, cpup); | 2222 | err |= put_user(cpu, cpup); |
2028 | if (nodep) | 2223 | if (nodep) |
@@ -2135,7 +2330,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info) | |||
2135 | /* Check to see if any memory value is too large for 32-bit and scale | 2330 | /* Check to see if any memory value is too large for 32-bit and scale |
2136 | * down if needed | 2331 | * down if needed |
2137 | */ | 2332 | */ |
2138 | if ((s.totalram >> 32) || (s.totalswap >> 32)) { | 2333 | if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) { |
2139 | int bitcount = 0; | 2334 | int bitcount = 0; |
2140 | 2335 | ||
2141 | while (s.mem_unit < PAGE_SIZE) { | 2336 | while (s.mem_unit < PAGE_SIZE) { |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75875a741b5e..91180987e40e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1460,13 +1460,6 @@ static struct ctl_table vm_table[] = { | |||
1460 | .extra2 = &one, | 1460 | .extra2 = &one, |
1461 | }, | 1461 | }, |
1462 | #endif | 1462 | #endif |
1463 | { | ||
1464 | .procname = "scan_unevictable_pages", | ||
1465 | .data = &scan_unevictable_pages, | ||
1466 | .maxlen = sizeof(scan_unevictable_pages), | ||
1467 | .mode = 0644, | ||
1468 | .proc_handler = scan_unevictable_handler, | ||
1469 | }, | ||
1470 | #ifdef CONFIG_MEMORY_FAILURE | 1463 | #ifdef CONFIG_MEMORY_FAILURE |
1471 | { | 1464 | { |
1472 | .procname = "memory_failure_early_kill", | 1465 | .procname = "memory_failure_early_kill", |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a8d6914030fe..7b223b212683 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
@@ -47,6 +47,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync); | |||
47 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | 47 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); |
48 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | 48 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); |
49 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | 49 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); |
50 | static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); | ||
50 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 51 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
51 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | 52 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); |
52 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 53 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
@@ -333,8 +334,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
333 | return HRTIMER_RESTART; | 334 | return HRTIMER_RESTART; |
334 | 335 | ||
335 | /* only warn once */ | 336 | /* only warn once */ |
336 | if (__this_cpu_read(soft_watchdog_warn) == true) | 337 | if (__this_cpu_read(soft_watchdog_warn) == true) { |
338 | /* | ||
339 | * When multiple processes are causing softlockups the | ||
340 | * softlockup detector only warns on the first one | ||
341 | * because the code relies on a full quiet cycle to | ||
342 | * re-arm. The second process prevents the quiet cycle | ||
343 | * and never gets reported. Use task pointers to detect | ||
344 | * this. | ||
345 | */ | ||
346 | if (__this_cpu_read(softlockup_task_ptr_saved) != | ||
347 | current) { | ||
348 | __this_cpu_write(soft_watchdog_warn, false); | ||
349 | __touch_watchdog(); | ||
350 | } | ||
337 | return HRTIMER_RESTART; | 351 | return HRTIMER_RESTART; |
352 | } | ||
338 | 353 | ||
339 | if (softlockup_all_cpu_backtrace) { | 354 | if (softlockup_all_cpu_backtrace) { |
340 | /* Prevent multiple soft-lockup reports if one cpu is already | 355 | /* Prevent multiple soft-lockup reports if one cpu is already |
@@ -350,6 +365,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
350 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", | 365 | pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", |
351 | smp_processor_id(), duration, | 366 | smp_processor_id(), duration, |
352 | current->comm, task_pid_nr(current)); | 367 | current->comm, task_pid_nr(current)); |
368 | __this_cpu_write(softlockup_task_ptr_saved, current); | ||
353 | print_modules(); | 369 | print_modules(); |
354 | print_irqtrace_events(current); | 370 | print_irqtrace_events(current); |
355 | if (regs) | 371 | if (regs) |
diff --git a/lib/genalloc.c b/lib/genalloc.c index 38d2db82228c..cce4dd68c40d 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c | |||
@@ -403,6 +403,35 @@ void gen_pool_for_each_chunk(struct gen_pool *pool, | |||
403 | EXPORT_SYMBOL(gen_pool_for_each_chunk); | 403 | EXPORT_SYMBOL(gen_pool_for_each_chunk); |
404 | 404 | ||
405 | /** | 405 | /** |
406 | * addr_in_gen_pool - checks if an address falls within the range of a pool | ||
407 | * @pool: the generic memory pool | ||
408 | * @start: start address | ||
409 | * @size: size of the region | ||
410 | * | ||
411 | * Check if the range of addresses falls within the specified pool. Returns | ||
412 | * true if the entire range is contained in the pool and false otherwise. | ||
413 | */ | ||
414 | bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, | ||
415 | size_t size) | ||
416 | { | ||
417 | bool found = false; | ||
418 | unsigned long end = start + size; | ||
419 | struct gen_pool_chunk *chunk; | ||
420 | |||
421 | rcu_read_lock(); | ||
422 | list_for_each_entry_rcu(chunk, &(pool)->chunks, next_chunk) { | ||
423 | if (start >= chunk->start_addr && start <= chunk->end_addr) { | ||
424 | if (end <= chunk->end_addr) { | ||
425 | found = true; | ||
426 | break; | ||
427 | } | ||
428 | } | ||
429 | } | ||
430 | rcu_read_unlock(); | ||
431 | return found; | ||
432 | } | ||
433 | |||
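The new addr_in_gen_pool() helper gives pool users a cheap containment test under RCU. A kernel-side usage sketch; the pool and function names here are made up for illustration:

    #include <linux/genalloc.h>

    /* Returns true only when the whole [vaddr, vaddr + size) range sits
     * inside one of the chunks backing our (hypothetical) SRAM pool. */
    static bool buffer_is_in_sram(struct gen_pool *sram_pool, void *vaddr,
                                  size_t size)
    {
        return addr_in_gen_pool(sram_pool, (unsigned long)vaddr, size);
    }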
434 | /** | ||
406 | * gen_pool_avail - get available free space of the pool | 435 | * gen_pool_avail - get available free space of the pool |
407 | * @pool: pool to get available free space | 436 | * @pool: pool to get available free space |
408 | * | 437 | * |
@@ -481,6 +510,26 @@ unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, | |||
481 | EXPORT_SYMBOL(gen_pool_first_fit); | 510 | EXPORT_SYMBOL(gen_pool_first_fit); |
482 | 511 | ||
483 | /** | 512 | /** |
513 | * gen_pool_first_fit_order_align - find the first available region | ||
514 | * of memory matching the size requirement. The region will be aligned | ||
515 | * to the order of the size specified. | ||
516 | * @map: The address to base the search on | ||
517 | * @size: The bitmap size in bits | ||
518 | * @start: The bitnumber to start searching at | ||
519 | * @nr: The number of zeroed bits we're looking for | ||
520 | * @data: additional data - unused | ||
521 | */ | ||
522 | unsigned long gen_pool_first_fit_order_align(unsigned long *map, | ||
523 | unsigned long size, unsigned long start, | ||
524 | unsigned int nr, void *data) | ||
525 | { | ||
526 | unsigned long align_mask = roundup_pow_of_two(nr) - 1; | ||
527 | |||
528 | return bitmap_find_next_zero_area(map, size, start, nr, align_mask); | ||
529 | } | ||
530 | EXPORT_SYMBOL(gen_pool_first_fit_order_align); | ||
531 | |||
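gen_pool_first_fit_order_align() is meant to be installed as a pool's allocation algorithm via gen_pool_set_algo(). A kernel-side sketch with made-up parameters:

    #include <linux/genalloc.h>

    static struct gen_pool *example_create_pool(unsigned long base, size_t size)
    {
        struct gen_pool *pool;

        /* Minimum allocation granularity of 256 bytes (order 8). */
        pool = gen_pool_create(8, -1);
        if (!pool)
            return NULL;

        /* Allocations come back aligned to the order of their size,
         * e.g. a 4 KiB request is 4 KiB aligned. */
        gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);

        if (gen_pool_add(pool, base, size, -1)) {
            gen_pool_destroy(pool);
            return NULL;
        }
        return pool;
    }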
532 | /** | ||
484 | * gen_pool_best_fit - find the best fitting region of memory | 533 | * gen_pool_best_fit - find the best fitting region of memory |
485 | * matching the size requirement (no alignment constraint) | 534 | * matching the size requirement (no alignment constraint) |
486 | * @map: The address to base the search on | 535 | * @map: The address to base the search on |
diff --git a/mm/Kconfig b/mm/Kconfig index 886db2158538..1d1ae6b078fd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -137,6 +137,9 @@ config HAVE_MEMBLOCK_NODE_MAP | |||
137 | config HAVE_MEMBLOCK_PHYS_MAP | 137 | config HAVE_MEMBLOCK_PHYS_MAP |
138 | boolean | 138 | boolean |
139 | 139 | ||
140 | config HAVE_GENERIC_RCU_GUP | ||
141 | boolean | ||
142 | |||
140 | config ARCH_DISCARD_MEMBLOCK | 143 | config ARCH_DISCARD_MEMBLOCK |
141 | boolean | 144 | boolean |
142 | 145 | ||
@@ -228,11 +231,16 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK | |||
228 | boolean | 231 | boolean |
229 | 232 | ||
230 | # | 233 | # |
234 | # support for memory balloon | ||
235 | config MEMORY_BALLOON | ||
236 | boolean | ||
237 | |||
238 | # | ||
231 | # support for memory balloon compaction | 239 | # support for memory balloon compaction |
232 | config BALLOON_COMPACTION | 240 | config BALLOON_COMPACTION |
233 | bool "Allow for balloon memory compaction/migration" | 241 | bool "Allow for balloon memory compaction/migration" |
234 | def_bool y | 242 | def_bool y |
235 | depends on COMPACTION && VIRTIO_BALLOON | 243 | depends on COMPACTION && MEMORY_BALLOON |
236 | help | 244 | help |
237 | Memory fragmentation introduced by ballooning might reduce | 245 | Memory fragmentation introduced by ballooning might reduce |
238 | significantly the number of 2MB contiguous memory blocks that can be | 246 | significantly the number of 2MB contiguous memory blocks that can be |
diff --git a/mm/Makefile b/mm/Makefile index fe7a053c0f45..1f534a7f0a71 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -16,9 +16,9 @@ obj-y := filemap.o mempool.o oom_kill.o \ | |||
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | util.o mmzone.o vmstat.o backing-dev.o \ | 17 | util.o mmzone.o vmstat.o backing-dev.o \ |
18 | mm_init.o mmu_context.o percpu.o slab_common.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o balloon_compaction.o vmacache.o \ | 19 | compaction.o vmacache.o \ |
20 | interval_tree.o list_lru.o workingset.o \ | 20 | interval_tree.o list_lru.o workingset.o \ |
21 | iov_iter.o $(mmu-y) | 21 | iov_iter.o debug.o $(mmu-y) |
22 | 22 | ||
23 | obj-y += init-mm.o | 23 | obj-y += init-mm.o |
24 | 24 | ||
@@ -67,3 +67,4 @@ obj-$(CONFIG_ZBUD) += zbud.o | |||
67 | obj-$(CONFIG_ZSMALLOC) += zsmalloc.o | 67 | obj-$(CONFIG_ZSMALLOC) += zsmalloc.o |
68 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o | 68 | obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o |
69 | obj-$(CONFIG_CMA) += cma.o | 69 | obj-$(CONFIG_CMA) += cma.o |
70 | obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 1706cbbdf5f0..b27714f1b40f 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -631,7 +631,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) | |||
631 | * of sleeping on the congestion queue | 631 | * of sleeping on the congestion queue |
632 | */ | 632 | */ |
633 | if (atomic_read(&nr_bdi_congested[sync]) == 0 || | 633 | if (atomic_read(&nr_bdi_congested[sync]) == 0 || |
634 | !zone_is_reclaim_congested(zone)) { | 634 | !test_bit(ZONE_CONGESTED, &zone->flags)) { |
635 | cond_resched(); | 635 | cond_resched(); |
636 | 636 | ||
637 | /* In case we scheduled, work out time remaining */ | 637 | /* In case we scheduled, work out time remaining */ |
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 6e45a5074bf0..b3cbe19f71b5 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c | |||
@@ -11,32 +11,6 @@ | |||
11 | #include <linux/balloon_compaction.h> | 11 | #include <linux/balloon_compaction.h> |
12 | 12 | ||
13 | /* | 13 | /* |
14 | * balloon_devinfo_alloc - allocates a balloon device information descriptor. | ||
15 | * @balloon_dev_descriptor: pointer to reference the balloon device which | ||
16 | * this struct balloon_dev_info will be servicing. | ||
17 | * | ||
18 | * Driver must call it to properly allocate and initialize an instance of | ||
19 | * struct balloon_dev_info which will be used to reference a balloon device | ||
20 | * as well as to keep track of the balloon device page list. | ||
21 | */ | ||
22 | struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) | ||
23 | { | ||
24 | struct balloon_dev_info *b_dev_info; | ||
25 | b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); | ||
26 | if (!b_dev_info) | ||
27 | return ERR_PTR(-ENOMEM); | ||
28 | |||
29 | b_dev_info->balloon_device = balloon_dev_descriptor; | ||
30 | b_dev_info->mapping = NULL; | ||
31 | b_dev_info->isolated_pages = 0; | ||
32 | spin_lock_init(&b_dev_info->pages_lock); | ||
33 | INIT_LIST_HEAD(&b_dev_info->pages); | ||
34 | |||
35 | return b_dev_info; | ||
36 | } | ||
37 | EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); | ||
38 | |||
39 | /* | ||
40 | * balloon_page_enqueue - allocates a new page and inserts it into the balloon | 14 | * balloon_page_enqueue - allocates a new page and inserts it into the balloon |
41 | * page list. | 15 | * page list. |
42 | * @b_dev_info: balloon device descriptor where we will insert a new page to | 16 | @b_dev_info: balloon device descriptor where we will insert a new page to |
@@ -61,7 +35,8 @@ struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) | |||
61 | */ | 35 | */ |
62 | BUG_ON(!trylock_page(page)); | 36 | BUG_ON(!trylock_page(page)); |
63 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 37 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
64 | balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); | 38 | balloon_page_insert(b_dev_info, page); |
39 | __count_vm_event(BALLOON_INFLATE); | ||
65 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 40 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
66 | unlock_page(page); | 41 | unlock_page(page); |
67 | return page; | 42 | return page; |
@@ -93,18 +68,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | |||
93 | * to be released by the balloon driver. | 68 | * to be released by the balloon driver. |
94 | */ | 69 | */ |
95 | if (trylock_page(page)) { | 70 | if (trylock_page(page)) { |
71 | if (!PagePrivate(page)) { | ||
72 | /* raced with isolation */ | ||
73 | unlock_page(page); | ||
74 | continue; | ||
75 | } | ||
96 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 76 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
97 | /* | ||
98 | * Raise the page refcount here to prevent any wrong | ||
99 | * attempt to isolate this page, in case of coliding | ||
100 | * with balloon_page_isolate() just after we release | ||
101 | * the page lock. | ||
102 | * | ||
103 | * balloon_page_free() will take care of dropping | ||
104 | * this extra refcount later. | ||
105 | */ | ||
106 | get_page(page); | ||
107 | balloon_page_delete(page); | 77 | balloon_page_delete(page); |
78 | __count_vm_event(BALLOON_DEFLATE); | ||
108 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 79 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
109 | unlock_page(page); | 80 | unlock_page(page); |
110 | dequeued_page = true; | 81 | dequeued_page = true; |
@@ -132,62 +103,14 @@ struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) | |||
132 | EXPORT_SYMBOL_GPL(balloon_page_dequeue); | 103 | EXPORT_SYMBOL_GPL(balloon_page_dequeue); |
133 | 104 | ||
134 | #ifdef CONFIG_BALLOON_COMPACTION | 105 | #ifdef CONFIG_BALLOON_COMPACTION |
135 | /* | ||
136 | * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. | ||
137 | * @b_dev_info: holds the balloon device information descriptor. | ||
138 | * @a_ops: balloon_mapping address_space_operations descriptor. | ||
139 | * | ||
140 | * Driver must call it to properly allocate and initialize an instance of | ||
141 | * struct address_space which will be used as the special page->mapping for | ||
142 | * balloon device enlisted page instances. | ||
143 | */ | ||
144 | struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, | ||
145 | const struct address_space_operations *a_ops) | ||
146 | { | ||
147 | struct address_space *mapping; | ||
148 | |||
149 | mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); | ||
150 | if (!mapping) | ||
151 | return ERR_PTR(-ENOMEM); | ||
152 | |||
153 | /* | ||
154 | * Give a clean 'zeroed' status to all elements of this special | ||
155 | * balloon page->mapping struct address_space instance. | ||
156 | */ | ||
157 | address_space_init_once(mapping); | ||
158 | |||
159 | /* | ||
160 | * Set mapping->flags appropriately, to allow balloon pages | ||
161 | * ->mapping identification. | ||
162 | */ | ||
163 | mapping_set_balloon(mapping); | ||
164 | mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); | ||
165 | |||
166 | /* balloon's page->mapping->a_ops callback descriptor */ | ||
167 | mapping->a_ops = a_ops; | ||
168 | |||
169 | /* | ||
170 | * Establish a pointer reference back to the balloon device descriptor | ||
171 | * this particular page->mapping will be servicing. | ||
172 | * This is used by compaction / migration procedures to identify and | ||
173 | * access the balloon device pageset while isolating / migrating pages. | ||
174 | * | ||
175 | * As some balloon drivers can register multiple balloon devices | ||
176 | * for a single guest, this also helps compaction / migration to | ||
177 | * properly deal with multiple balloon pagesets, when required. | ||
178 | */ | ||
179 | mapping->private_data = b_dev_info; | ||
180 | b_dev_info->mapping = mapping; | ||
181 | |||
182 | return mapping; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(balloon_mapping_alloc); | ||
185 | 106 | ||
186 | static inline void __isolate_balloon_page(struct page *page) | 107 | static inline void __isolate_balloon_page(struct page *page) |
187 | { | 108 | { |
188 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | 109 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); |
189 | unsigned long flags; | 110 | unsigned long flags; |
111 | |||
190 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 112 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
113 | ClearPagePrivate(page); | ||
191 | list_del(&page->lru); | 114 | list_del(&page->lru); |
192 | b_dev_info->isolated_pages++; | 115 | b_dev_info->isolated_pages++; |
193 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 116 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
@@ -195,20 +118,16 @@ static inline void __isolate_balloon_page(struct page *page) | |||
195 | 118 | ||
196 | static inline void __putback_balloon_page(struct page *page) | 119 | static inline void __putback_balloon_page(struct page *page) |
197 | { | 120 | { |
198 | struct balloon_dev_info *b_dev_info = page->mapping->private_data; | 121 | struct balloon_dev_info *b_dev_info = balloon_page_device(page); |
199 | unsigned long flags; | 122 | unsigned long flags; |
123 | |||
200 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); | 124 | spin_lock_irqsave(&b_dev_info->pages_lock, flags); |
125 | SetPagePrivate(page); | ||
201 | list_add(&page->lru, &b_dev_info->pages); | 126 | list_add(&page->lru, &b_dev_info->pages); |
202 | b_dev_info->isolated_pages--; | 127 | b_dev_info->isolated_pages--; |
203 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); | 128 | spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); |
204 | } | 129 | } |
205 | 130 | ||
206 | static inline int __migrate_balloon_page(struct address_space *mapping, | ||
207 | struct page *newpage, struct page *page, enum migrate_mode mode) | ||
208 | { | ||
209 | return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); | ||
210 | } | ||
211 | |||
212 | /* __isolate_lru_page() counterpart for a ballooned page */ | 131 | /* __isolate_lru_page() counterpart for a ballooned page */ |
213 | bool balloon_page_isolate(struct page *page) | 132 | bool balloon_page_isolate(struct page *page) |
214 | { | 133 | { |
@@ -235,12 +154,11 @@ bool balloon_page_isolate(struct page *page) | |||
235 | */ | 154 | */ |
236 | if (likely(trylock_page(page))) { | 155 | if (likely(trylock_page(page))) { |
237 | /* | 156 | /* |
238 | * A ballooned page, by default, has just one refcount. | 157 | * A ballooned page, by default, has PagePrivate set. |
239 | * Prevent concurrent compaction threads from isolating | 158 | * Prevent concurrent compaction threads from isolating |
240 | * an already isolated balloon page by refcount check. | 159 | * an already isolated balloon page by clearing it. |
241 | */ | 160 | */ |
242 | if (__is_movable_balloon_page(page) && | 161 | if (balloon_page_movable(page)) { |
243 | page_count(page) == 2) { | ||
244 | __isolate_balloon_page(page); | 162 | __isolate_balloon_page(page); |
245 | unlock_page(page); | 163 | unlock_page(page); |
246 | return true; | 164 | return true; |
@@ -276,7 +194,7 @@ void balloon_page_putback(struct page *page) | |||
276 | int balloon_page_migrate(struct page *newpage, | 194 | int balloon_page_migrate(struct page *newpage, |
277 | struct page *page, enum migrate_mode mode) | 195 | struct page *page, enum migrate_mode mode) |
278 | { | 196 | { |
279 | struct address_space *mapping; | 197 | struct balloon_dev_info *balloon = balloon_page_device(page); |
280 | int rc = -EAGAIN; | 198 | int rc = -EAGAIN; |
281 | 199 | ||
282 | /* | 200 | /* |
@@ -292,9 +210,8 @@ int balloon_page_migrate(struct page *newpage, | |||
292 | return rc; | 210 | return rc; |
293 | } | 211 | } |
294 | 212 | ||
295 | mapping = page->mapping; | 213 | if (balloon && balloon->migratepage) |
296 | if (mapping) | 214 | rc = balloon->migratepage(balloon, newpage, page, mode); |
297 | rc = __migrate_balloon_page(mapping, newpage, page, mode); | ||
298 | 215 | ||
299 | unlock_page(newpage); | 216 | unlock_page(newpage); |
300 | return rc; | 217 | return rc; |
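With the special ->mapping scheme removed, a balloon driver is expected to embed struct balloon_dev_info and set its migratepage callback directly; the callback signature is visible in balloon_page_migrate() above. A hedged sketch follows: balloon_devinfo_init() is the initializer added elsewhere in this series and is an assumption here, as is the driver naming.

    #include <linux/balloon_compaction.h>
    #include <linux/migrate.h>

    struct my_balloon {
        struct balloon_dev_info vb_dev_info;    /* embedded, no kmalloc */
        /* ... device specific state ... */
    };

    static int my_balloon_migratepage(struct balloon_dev_info *info,
                                      struct page *newpage, struct page *page,
                                      enum migrate_mode mode)
    {
        /* Tell the device that 'page' moved to 'newpage', then fix up the
         * balloon's own bookkeeping. */
        return MIGRATEPAGE_SUCCESS;     /* placeholder */
    }

    static void my_balloon_setup(struct my_balloon *b)
    {
        /* Assumption: balloon_devinfo_init() replaces the removed
         * balloon_devinfo_alloc()/balloon_mapping_alloc() pair. */
        balloon_devinfo_init(&b->vb_dev_info);
    #ifdef CONFIG_BALLOON_COMPACTION
        b->vb_dev_info.migratepage = my_balloon_migratepage;
    #endif
    }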
diff --git a/mm/bootmem.c b/mm/bootmem.c index 90bd3507b413..8a000cebb0d7 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -16,9 +16,9 @@ | |||
16 | #include <linux/kmemleak.h> | 16 | #include <linux/kmemleak.h> |
17 | #include <linux/range.h> | 17 | #include <linux/range.h> |
18 | #include <linux/memblock.h> | 18 | #include <linux/memblock.h> |
19 | #include <linux/bug.h> | ||
20 | #include <linux/io.h> | ||
19 | 21 | ||
20 | #include <asm/bug.h> | ||
21 | #include <asm/io.h> | ||
22 | #include <asm/processor.h> | 22 | #include <asm/processor.h> |
23 | 23 | ||
24 | #include "internal.h" | 24 | #include "internal.h" |
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
33 | #include <linux/log2.h> | 33 | #include <linux/log2.h> |
34 | #include <linux/cma.h> | 34 | #include <linux/cma.h> |
35 | #include <linux/highmem.h> | ||
35 | 36 | ||
36 | struct cma { | 37 | struct cma { |
37 | unsigned long base_pfn; | 38 | unsigned long base_pfn; |
@@ -163,6 +164,8 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
163 | bool fixed, struct cma **res_cma) | 164 | bool fixed, struct cma **res_cma) |
164 | { | 165 | { |
165 | struct cma *cma; | 166 | struct cma *cma; |
167 | phys_addr_t memblock_end = memblock_end_of_DRAM(); | ||
168 | phys_addr_t highmem_start = __pa(high_memory); | ||
166 | int ret = 0; | 169 | int ret = 0; |
167 | 170 | ||
168 | pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", | 171 | pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", |
@@ -196,6 +199,24 @@ int __init cma_declare_contiguous(phys_addr_t base, | |||
196 | if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) | 199 | if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) |
197 | return -EINVAL; | 200 | return -EINVAL; |
198 | 201 | ||
202 | /* | ||
203 | * adjust limit to avoid crossing low/high memory boundary for | ||
204 | * automatically allocated regions | ||
205 | */ | ||
206 | if (((limit == 0 || limit > memblock_end) && | ||
207 | (memblock_end - size < highmem_start && | ||
208 | memblock_end > highmem_start)) || | ||
209 | (!fixed && limit > highmem_start && limit - size < highmem_start)) { | ||
210 | limit = highmem_start; | ||
211 | } | ||
212 | |||
213 | if (fixed && base < highmem_start && base+size > highmem_start) { | ||
214 | ret = -EINVAL; | ||
215 | pr_err("Region at %08lx defined on low/high memory boundary (%08lx)\n", | ||
216 | (unsigned long)base, (unsigned long)highmem_start); | ||
217 | goto err; | ||
218 | } | ||
219 | |||
199 | /* Reserve memory */ | 220 | /* Reserve memory */ |
200 | if (base && fixed) { | 221 | if (base && fixed) { |
201 | if (memblock_is_region_reserved(base, size) || | 222 | if (memblock_is_region_reserved(base, size) || |
diff --git a/mm/compaction.c b/mm/compaction.c index 21bf292b642a..edba18aed173 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -67,6 +67,49 @@ static inline bool migrate_async_suitable(int migratetype) | |||
67 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; | 67 | return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; |
68 | } | 68 | } |
69 | 69 | ||
70 | /* | ||
71 | * Check that the whole (or subset of) a pageblock given by the interval of | ||
72 | * [start_pfn, end_pfn) is valid and within the same zone, before scanning it | ||
73 | * with the migration or free compaction scanner. The scanners then need to | ||
74 | * use only pfn_valid_within() check for arches that allow holes within | ||
75 | * pageblocks. | ||
76 | * | ||
77 | * Return struct page pointer of start_pfn, or NULL if checks were not passed. | ||
78 | * | ||
79 | * It's possible on some configurations to have a setup like node0 node1 node0 | ||
80 | * i.e. it's possible that all pages within a zones range of pages do not | ||
81 | * i.e. it's possible that all pages within a zone's range of pages do not | ||
82 | * can occur within a single pageblock, but not a node0 node1 node0 | ||
83 | * interleaving within a single pageblock. It is therefore sufficient to check | ||
84 | * the first and last page of a pageblock and avoid checking each individual | ||
85 | * page in a pageblock. | ||
86 | */ | ||
87 | static struct page *pageblock_pfn_to_page(unsigned long start_pfn, | ||
88 | unsigned long end_pfn, struct zone *zone) | ||
89 | { | ||
90 | struct page *start_page; | ||
91 | struct page *end_page; | ||
92 | |||
93 | /* end_pfn is one past the range we are checking */ | ||
94 | end_pfn--; | ||
95 | |||
96 | if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) | ||
97 | return NULL; | ||
98 | |||
99 | start_page = pfn_to_page(start_pfn); | ||
100 | |||
101 | if (page_zone(start_page) != zone) | ||
102 | return NULL; | ||
103 | |||
104 | end_page = pfn_to_page(end_pfn); | ||
105 | |||
106 | /* This gives a shorter code than deriving page_zone(end_page) */ | ||
107 | if (page_zone_id(start_page) != page_zone_id(end_page)) | ||
108 | return NULL; | ||
109 | |||
110 | return start_page; | ||
111 | } | ||
112 | |||
70 | #ifdef CONFIG_COMPACTION | 113 | #ifdef CONFIG_COMPACTION |
71 | /* Returns true if the pageblock should be scanned for pages to isolate. */ | 114 | /* Returns true if the pageblock should be scanned for pages to isolate. */ |
72 | static inline bool isolation_suitable(struct compact_control *cc, | 115 | static inline bool isolation_suitable(struct compact_control *cc, |
@@ -132,7 +175,7 @@ void reset_isolation_suitable(pg_data_t *pgdat) | |||
132 | */ | 175 | */ |
133 | static void update_pageblock_skip(struct compact_control *cc, | 176 | static void update_pageblock_skip(struct compact_control *cc, |
134 | struct page *page, unsigned long nr_isolated, | 177 | struct page *page, unsigned long nr_isolated, |
135 | bool set_unsuitable, bool migrate_scanner) | 178 | bool migrate_scanner) |
136 | { | 179 | { |
137 | struct zone *zone = cc->zone; | 180 | struct zone *zone = cc->zone; |
138 | unsigned long pfn; | 181 | unsigned long pfn; |
@@ -146,12 +189,7 @@ static void update_pageblock_skip(struct compact_control *cc, | |||
146 | if (nr_isolated) | 189 | if (nr_isolated) |
147 | return; | 190 | return; |
148 | 191 | ||
149 | /* | 192 | set_pageblock_skip(page); |
150 | * Only skip pageblocks when all forms of compaction will be known to | ||
151 | * fail in the near future. | ||
152 | */ | ||
153 | if (set_unsuitable) | ||
154 | set_pageblock_skip(page); | ||
155 | 193 | ||
156 | pfn = page_to_pfn(page); | 194 | pfn = page_to_pfn(page); |
157 | 195 | ||
@@ -180,52 +218,77 @@ static inline bool isolation_suitable(struct compact_control *cc, | |||
180 | 218 | ||
181 | static void update_pageblock_skip(struct compact_control *cc, | 219 | static void update_pageblock_skip(struct compact_control *cc, |
182 | struct page *page, unsigned long nr_isolated, | 220 | struct page *page, unsigned long nr_isolated, |
183 | bool set_unsuitable, bool migrate_scanner) | 221 | bool migrate_scanner) |
184 | { | 222 | { |
185 | } | 223 | } |
186 | #endif /* CONFIG_COMPACTION */ | 224 | #endif /* CONFIG_COMPACTION */ |
187 | 225 | ||
188 | static inline bool should_release_lock(spinlock_t *lock) | 226 | /* |
227 | * Compaction requires the taking of some coarse locks that are potentially | ||
228 | * very heavily contended. For async compaction, back out if the lock cannot | ||
229 | * be taken immediately. For sync compaction, spin on the lock if needed. | ||
230 | * | ||
231 | * Returns true if the lock is held | ||
232 | * Returns false if the lock is not held and compaction should abort | ||
233 | */ | ||
234 | static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags, | ||
235 | struct compact_control *cc) | ||
189 | { | 236 | { |
190 | return need_resched() || spin_is_contended(lock); | 237 | if (cc->mode == MIGRATE_ASYNC) { |
238 | if (!spin_trylock_irqsave(lock, *flags)) { | ||
239 | cc->contended = COMPACT_CONTENDED_LOCK; | ||
240 | return false; | ||
241 | } | ||
242 | } else { | ||
243 | spin_lock_irqsave(lock, *flags); | ||
244 | } | ||
245 | |||
246 | return true; | ||
191 | } | 247 | } |
192 | 248 | ||
193 | /* | 249 | /* |
194 | * Compaction requires the taking of some coarse locks that are potentially | 250 | * Compaction requires the taking of some coarse locks that are potentially |
195 | * very heavily contended. Check if the process needs to be scheduled or | 251 | * very heavily contended. The lock should be periodically unlocked to avoid |
196 | * if the lock is contended. For async compaction, back out in the event | 252 | * having disabled IRQs for a long time, even when there is nobody waiting on |
197 | * if contention is severe. For sync compaction, schedule. | 253 | * the lock. It might also be that allowing the IRQs will result in |
254 | * need_resched() becoming true. If scheduling is needed, async compaction | ||
255 | * aborts. Sync compaction schedules. | ||
256 | * Either compaction type will also abort if a fatal signal is pending. | ||
257 | * In either case if the lock was locked, it is dropped and not regained. | ||
198 | * | 258 | * |
199 | * Returns true if the lock is held. | 259 | * Returns true if compaction should abort due to fatal signal pending, or |
200 | * Returns false if the lock is released and compaction should abort | 260 | * async compaction due to need_resched() |
261 | * Returns false when compaction can continue (sync compaction might have | ||
262 | * scheduled) | ||
201 | */ | 263 | */ |
202 | static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, | 264 | static bool compact_unlock_should_abort(spinlock_t *lock, |
203 | bool locked, struct compact_control *cc) | 265 | unsigned long flags, bool *locked, struct compact_control *cc) |
204 | { | 266 | { |
205 | if (should_release_lock(lock)) { | 267 | if (*locked) { |
206 | if (locked) { | 268 | spin_unlock_irqrestore(lock, flags); |
207 | spin_unlock_irqrestore(lock, *flags); | 269 | *locked = false; |
208 | locked = false; | 270 | } |
209 | } | 271 | |
272 | if (fatal_signal_pending(current)) { | ||
273 | cc->contended = COMPACT_CONTENDED_SCHED; | ||
274 | return true; | ||
275 | } | ||
210 | 276 | ||
211 | /* async aborts if taking too long or contended */ | 277 | if (need_resched()) { |
212 | if (cc->mode == MIGRATE_ASYNC) { | 278 | if (cc->mode == MIGRATE_ASYNC) { |
213 | cc->contended = true; | 279 | cc->contended = COMPACT_CONTENDED_SCHED; |
214 | return false; | 280 | return true; |
215 | } | 281 | } |
216 | |||
217 | cond_resched(); | 282 | cond_resched(); |
218 | } | 283 | } |
219 | 284 | ||
220 | if (!locked) | 285 | return false; |
221 | spin_lock_irqsave(lock, *flags); | ||
222 | return true; | ||
223 | } | 286 | } |
224 | 287 | ||
225 | /* | 288 | /* |
226 | * Aside from avoiding lock contention, compaction also periodically checks | 289 | * Aside from avoiding lock contention, compaction also periodically checks |
227 | * need_resched() and either schedules in sync compaction or aborts async | 290 | * need_resched() and either schedules in sync compaction or aborts async |
228 | * compaction. This is similar to what compact_checklock_irqsave() does, but | 291 | * compaction. This is similar to what compact_unlock_should_abort() does, but |
229 | * is used where no lock is concerned. | 292 | * is used where no lock is concerned. |
230 | * | 293 | * |
231 | * Returns false when no scheduling was needed, or sync compaction scheduled. | 294 | * Returns false when no scheduling was needed, or sync compaction scheduled. |
@@ -236,7 +299,7 @@ static inline bool compact_should_abort(struct compact_control *cc) | |||
236 | /* async compaction aborts if contended */ | 299 | /* async compaction aborts if contended */ |
237 | if (need_resched()) { | 300 | if (need_resched()) { |
238 | if (cc->mode == MIGRATE_ASYNC) { | 301 | if (cc->mode == MIGRATE_ASYNC) { |
239 | cc->contended = true; | 302 | cc->contended = COMPACT_CONTENDED_SCHED; |
240 | return true; | 303 | return true; |
241 | } | 304 | } |
242 | 305 | ||
@@ -250,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc) | |||
250 | static bool suitable_migration_target(struct page *page) | 313 | static bool suitable_migration_target(struct page *page) |
251 | { | 314 | { |
252 | /* If the page is a large free page, then disallow migration */ | 315 | /* If the page is a large free page, then disallow migration */ |
253 | if (PageBuddy(page) && page_order(page) >= pageblock_order) | 316 | if (PageBuddy(page)) { |
254 | return false; | 317 | /* |
318 | * We are checking page_order without zone->lock taken. But | ||
319 | * the only small danger is that we skip a potentially suitable | ||
320 | * pageblock, so it's not worth to check order for valid range. | ||
321 | */ | ||
322 | if (page_order_unsafe(page) >= pageblock_order) | ||
323 | return false; | ||
324 | } | ||
255 | 325 | ||
256 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ | 326 | /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ |
257 | if (migrate_async_suitable(get_pageblock_migratetype(page))) | 327 | if (migrate_async_suitable(get_pageblock_migratetype(page))) |
@@ -267,16 +337,16 @@ static bool suitable_migration_target(struct page *page) | |||
267 | * (even though it may still end up isolating some pages). | 337 | * (even though it may still end up isolating some pages). |
268 | */ | 338 | */ |
269 | static unsigned long isolate_freepages_block(struct compact_control *cc, | 339 | static unsigned long isolate_freepages_block(struct compact_control *cc, |
270 | unsigned long blockpfn, | 340 | unsigned long *start_pfn, |
271 | unsigned long end_pfn, | 341 | unsigned long end_pfn, |
272 | struct list_head *freelist, | 342 | struct list_head *freelist, |
273 | bool strict) | 343 | bool strict) |
274 | { | 344 | { |
275 | int nr_scanned = 0, total_isolated = 0; | 345 | int nr_scanned = 0, total_isolated = 0; |
276 | struct page *cursor, *valid_page = NULL; | 346 | struct page *cursor, *valid_page = NULL; |
277 | unsigned long flags; | 347 | unsigned long flags = 0; |
278 | bool locked = false; | 348 | bool locked = false; |
279 | bool checked_pageblock = false; | 349 | unsigned long blockpfn = *start_pfn; |
280 | 350 | ||
281 | cursor = pfn_to_page(blockpfn); | 351 | cursor = pfn_to_page(blockpfn); |
282 | 352 | ||
@@ -285,6 +355,16 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
285 | int isolated, i; | 355 | int isolated, i; |
286 | struct page *page = cursor; | 356 | struct page *page = cursor; |
287 | 357 | ||
358 | /* | ||
359 | * Periodically drop the lock (if held) regardless of its | ||
360 | * contention, to give chance to IRQs. Abort if fatal signal | ||
361 | * pending or async compaction detects need_resched() | ||
362 | */ | ||
363 | if (!(blockpfn % SWAP_CLUSTER_MAX) | ||
364 | && compact_unlock_should_abort(&cc->zone->lock, flags, | ||
365 | &locked, cc)) | ||
366 | break; | ||
367 | |||
288 | nr_scanned++; | 368 | nr_scanned++; |
289 | if (!pfn_valid_within(blockpfn)) | 369 | if (!pfn_valid_within(blockpfn)) |
290 | goto isolate_fail; | 370 | goto isolate_fail; |
@@ -295,33 +375,30 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, | |||
295 | goto isolate_fail; | 375 | goto isolate_fail; |
296 | 376 | ||
297 | /* | 377 | /* |
298 | * The zone lock must be held to isolate freepages. | 378 | * If we already hold the lock, we can skip some rechecking. |
299 | * Unfortunately this is a very coarse lock and can be | 379 | * Note that if we hold the lock now, checked_pageblock was |
300 | * heavily contended if there are parallel allocations | 380 | * already set in some previous iteration (or strict is true), |
301 | * or parallel compactions. For async compaction do not | 381 | * so it is correct to skip the suitable migration target |
302 | * spin on the lock and we acquire the lock as late as | 382 | * recheck as well. |
303 | * possible. | ||
304 | */ | 383 | */ |
305 | locked = compact_checklock_irqsave(&cc->zone->lock, &flags, | 384 | if (!locked) { |
306 | locked, cc); | ||
307 | if (!locked) | ||
308 | break; | ||
309 | |||
310 | /* Recheck this is a suitable migration target under lock */ | ||
311 | if (!strict && !checked_pageblock) { | ||
312 | /* | 385 | /* |
313 | * We need to check suitability of pageblock only once | 386 | * The zone lock must be held to isolate freepages. |
314 | * and this isolate_freepages_block() is called with | 387 | * Unfortunately this is a very coarse lock and can be |
315 | * pageblock range, so just check once is sufficient. | 388 | * heavily contended if there are parallel allocations |
389 | * or parallel compactions. For async compaction do not | ||
390 | * spin on the lock and we acquire the lock as late as | ||
391 | * possible. | ||
316 | */ | 392 | */ |
317 | checked_pageblock = true; | 393 | locked = compact_trylock_irqsave(&cc->zone->lock, |
318 | if (!suitable_migration_target(page)) | 394 | &flags, cc); |
395 | if (!locked) | ||
319 | break; | 396 | break; |
320 | } | ||
321 | 397 | ||
322 | /* Recheck this is a buddy page under lock */ | 398 | /* Recheck this is a buddy page under lock */ |
323 | if (!PageBuddy(page)) | 399 | if (!PageBuddy(page)) |
324 | goto isolate_fail; | 400 | goto isolate_fail; |
401 | } | ||
325 | 402 | ||
326 | /* Found a free page, break it into order-0 pages */ | 403 | /* Found a free page, break it into order-0 pages */ |
327 | isolated = split_free_page(page); | 404 | isolated = split_free_page(page); |
@@ -346,6 +423,9 @@ isolate_fail: | |||
346 | 423 | ||
347 | } | 424 | } |
348 | 425 | ||
426 | /* Record how far we have got within the block */ | ||
427 | *start_pfn = blockpfn; | ||
428 | |||
349 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); | 429 | trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); |
350 | 430 | ||
351 | /* | 431 | /* |
@@ -361,8 +441,7 @@ isolate_fail: | |||
361 | 441 | ||
362 | /* Update the pageblock-skip if the whole pageblock was scanned */ | 442 | /* Update the pageblock-skip if the whole pageblock was scanned */ |
363 | if (blockpfn == end_pfn) | 443 | if (blockpfn == end_pfn) |
364 | update_pageblock_skip(cc, valid_page, total_isolated, true, | 444 | update_pageblock_skip(cc, valid_page, total_isolated, false); |
365 | false); | ||
366 | 445 | ||
367 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); | 446 | count_compact_events(COMPACTFREE_SCANNED, nr_scanned); |
368 | if (total_isolated) | 447 | if (total_isolated) |
@@ -390,19 +469,21 @@ isolate_freepages_range(struct compact_control *cc, | |||
390 | unsigned long isolated, pfn, block_end_pfn; | 469 | unsigned long isolated, pfn, block_end_pfn; |
391 | LIST_HEAD(freelist); | 470 | LIST_HEAD(freelist); |
392 | 471 | ||
393 | for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) { | 472 | pfn = start_pfn; |
394 | if (!pfn_valid(pfn) || cc->zone != page_zone(pfn_to_page(pfn))) | 473 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); |
395 | break; | 474 | |
475 | for (; pfn < end_pfn; pfn += isolated, | ||
476 | block_end_pfn += pageblock_nr_pages) { | ||
477 | /* Protect pfn from being changed by isolate_freepages_block */ | ||
478 | unsigned long isolate_start_pfn = pfn; | ||
396 | 479 | ||
397 | /* | ||
398 | * On subsequent iterations ALIGN() is actually not needed, | ||
399 | * but we keep it that we not to complicate the code. | ||
400 | */ | ||
401 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
402 | block_end_pfn = min(block_end_pfn, end_pfn); | 480 | block_end_pfn = min(block_end_pfn, end_pfn); |
403 | 481 | ||
404 | isolated = isolate_freepages_block(cc, pfn, block_end_pfn, | 482 | if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) |
405 | &freelist, true); | 483 | break; |
484 | |||
485 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, | ||
486 | block_end_pfn, &freelist, true); | ||
406 | 487 | ||
407 | /* | 488 | /* |
408 | * In strict mode, isolate_freepages_block() returns 0 if | 489 | * In strict mode, isolate_freepages_block() returns 0 if |
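For reference, the block boundaries in the rewritten loop come from straightforward pageblock arithmetic; a worked example assuming the common pageblock_order of 9, i.e. pageblock_nr_pages == 512 (numbers are illustrative, not from the patch):

	/*
	 *   start_pfn = 1000  ->  block_end_pfn = ALIGN(1000 + 1, 512) = 1024
	 *
	 * so the first iteration covers the partial block [1000, 1024), each later
	 * iteration advances block_end_pfn by a full 512-page block, and
	 * min(block_end_pfn, end_pfn) clips the final, possibly partial, block.
	 */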
@@ -433,22 +514,19 @@ isolate_freepages_range(struct compact_control *cc, | |||
433 | } | 514 | } |
434 | 515 | ||
435 | /* Update the number of anon and file isolated pages in the zone */ | 516 | /* Update the number of anon and file isolated pages in the zone */ |
436 | static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc) | 517 | static void acct_isolated(struct zone *zone, struct compact_control *cc) |
437 | { | 518 | { |
438 | struct page *page; | 519 | struct page *page; |
439 | unsigned int count[2] = { 0, }; | 520 | unsigned int count[2] = { 0, }; |
440 | 521 | ||
522 | if (list_empty(&cc->migratepages)) | ||
523 | return; | ||
524 | |||
441 | list_for_each_entry(page, &cc->migratepages, lru) | 525 | list_for_each_entry(page, &cc->migratepages, lru) |
442 | count[!!page_is_file_cache(page)]++; | 526 | count[!!page_is_file_cache(page)]++; |
443 | 527 | ||
444 | /* If locked we can use the interrupt unsafe versions */ | 528 | mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); |
445 | if (locked) { | 529 | mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); |
446 | __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
447 | __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
448 | } else { | ||
449 | mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); | ||
450 | mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); | ||
451 | } | ||
452 | } | 530 | } |
453 | 531 | ||
454 | /* Similar to reclaim, but different enough that they don't share logic */ | 532 | /* Similar to reclaim, but different enough that they don't share logic */ |
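A note on the counting idiom kept by the simplified acct_isolated() above: the double negation turns any non-zero page_is_file_cache() return into exactly 1, so a two-element array splits the tally between anon and file-backed pages without branching:

	struct page *page;
	unsigned int count[2] = { 0, };		/* [0] = anon, [1] = file-backed */

	list_for_each_entry(page, &cc->migratepages, lru)
		count[!!page_is_file_cache(page)]++;	/* !! maps any non-zero value to 1 */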
@@ -467,40 +545,34 @@ static bool too_many_isolated(struct zone *zone) | |||
467 | } | 545 | } |
468 | 546 | ||
469 | /** | 547 | /** |
470 | * isolate_migratepages_range() - isolate all migrate-able pages in range. | 548 | * isolate_migratepages_block() - isolate all migrate-able pages within |
471 | * @zone: Zone pages are in. | 549 | * a single pageblock |
472 | * @cc: Compaction control structure. | 550 | * @cc: Compaction control structure. |
473 | * @low_pfn: The first PFN of the range. | 551 | * @low_pfn: The first PFN to isolate |
474 | * @end_pfn: The one-past-the-last PFN of the range. | 552 | * @end_pfn: The one-past-the-last PFN to isolate, within same pageblock |
475 | * @unevictable: true if it allows to isolate unevictable pages | 553 | * @isolate_mode: Isolation mode to be used. |
476 | * | 554 | * |
477 | * Isolate all pages that can be migrated from the range specified by | 555 | * Isolate all pages that can be migrated from the range specified by |
478 | * [low_pfn, end_pfn). Returns zero if there is a fatal signal | 556 | * [low_pfn, end_pfn). The range is expected to be within same pageblock. |
479 | * pending), otherwise PFN of the first page that was not scanned | 557 | * Returns zero if there is a fatal signal pending, otherwise the PFN of the |
480 | * (which may be both less, equal to or more then end_pfn). | 558 | * first page that was not scanned (which may be less than, equal to, or |
559 | * more than end_pfn). | ||
481 | * | 560 | * |
482 | * Assumes that cc->migratepages is empty and cc->nr_migratepages is | 561 | * The pages are isolated on cc->migratepages list (not required to be empty), |
483 | * zero. | 562 | * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field |
484 | * | 563 | * is neither read nor updated. |
485 | * Apart from cc->migratepages and cc->nr_migratetypes this function | ||
486 | * does not modify any cc's fields, in particular it does not modify | ||
487 | * (or read for that matter) cc->migrate_pfn. | ||
488 | */ | 564 | */ |
489 | unsigned long | 565 | static unsigned long |
490 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 566 | isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, |
491 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable) | 567 | unsigned long end_pfn, isolate_mode_t isolate_mode) |
492 | { | 568 | { |
493 | unsigned long last_pageblock_nr = 0, pageblock_nr; | 569 | struct zone *zone = cc->zone; |
494 | unsigned long nr_scanned = 0, nr_isolated = 0; | 570 | unsigned long nr_scanned = 0, nr_isolated = 0; |
495 | struct list_head *migratelist = &cc->migratepages; | 571 | struct list_head *migratelist = &cc->migratepages; |
496 | struct lruvec *lruvec; | 572 | struct lruvec *lruvec; |
497 | unsigned long flags; | 573 | unsigned long flags = 0; |
498 | bool locked = false; | 574 | bool locked = false; |
499 | struct page *page = NULL, *valid_page = NULL; | 575 | struct page *page = NULL, *valid_page = NULL; |
500 | bool set_unsuitable = true; | ||
501 | const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? | ||
502 | ISOLATE_ASYNC_MIGRATE : 0) | | ||
503 | (unevictable ? ISOLATE_UNEVICTABLE : 0); | ||
504 | 576 | ||
505 | /* | 577 | /* |
506 | * Ensure that there are not too many pages isolated from the LRU | 578 | * Ensure that there are not too many pages isolated from the LRU |
@@ -523,72 +595,43 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
523 | 595 | ||
524 | /* Time to isolate some pages for migration */ | 596 | /* Time to isolate some pages for migration */ |
525 | for (; low_pfn < end_pfn; low_pfn++) { | 597 | for (; low_pfn < end_pfn; low_pfn++) { |
526 | /* give a chance to irqs before checking need_resched() */ | ||
527 | if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { | ||
528 | if (should_release_lock(&zone->lru_lock)) { | ||
529 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
530 | locked = false; | ||
531 | } | ||
532 | } | ||
533 | |||
534 | /* | 598 | /* |
535 | * migrate_pfn does not necessarily start aligned to a | 599 | * Periodically drop the lock (if held) regardless of its |
536 | * pageblock. Ensure that pfn_valid is called when moving | 600 | * contention, to give IRQs a chance. Abort async compaction |
537 | * into a new MAX_ORDER_NR_PAGES range in case of large | 601 | * if contended. |
538 | * memory holes within the zone | ||
539 | */ | 602 | */ |
540 | if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) { | 603 | if (!(low_pfn % SWAP_CLUSTER_MAX) |
541 | if (!pfn_valid(low_pfn)) { | 604 | && compact_unlock_should_abort(&zone->lru_lock, flags, |
542 | low_pfn += MAX_ORDER_NR_PAGES - 1; | 605 | &locked, cc)) |
543 | continue; | 606 | break; |
544 | } | ||
545 | } | ||
546 | 607 | ||
547 | if (!pfn_valid_within(low_pfn)) | 608 | if (!pfn_valid_within(low_pfn)) |
548 | continue; | 609 | continue; |
549 | nr_scanned++; | 610 | nr_scanned++; |
550 | 611 | ||
551 | /* | ||
552 | * Get the page and ensure the page is within the same zone. | ||
553 | * See the comment in isolate_freepages about overlapping | ||
554 | * nodes. It is deliberate that the new zone lock is not taken | ||
555 | * as memory compaction should not move pages between nodes. | ||
556 | */ | ||
557 | page = pfn_to_page(low_pfn); | 612 | page = pfn_to_page(low_pfn); |
558 | if (page_zone(page) != zone) | ||
559 | continue; | ||
560 | 613 | ||
561 | if (!valid_page) | 614 | if (!valid_page) |
562 | valid_page = page; | 615 | valid_page = page; |
563 | 616 | ||
564 | /* If isolation recently failed, do not retry */ | 617 | /* |
565 | pageblock_nr = low_pfn >> pageblock_order; | 618 | * Skip if free. We read page order here without zone lock |
566 | if (last_pageblock_nr != pageblock_nr) { | 619 | * which is generally unsafe, but the race window is small and |
567 | int mt; | 620 | * the worst thing that can happen is that we skip some |
568 | 621 | * potential isolation targets. | |
569 | last_pageblock_nr = pageblock_nr; | 622 | */ |
570 | if (!isolation_suitable(cc, page)) | 623 | if (PageBuddy(page)) { |
571 | goto next_pageblock; | 624 | unsigned long freepage_order = page_order_unsafe(page); |
572 | 625 | ||
573 | /* | 626 | /* |
574 | * For async migration, also only scan in MOVABLE | 627 | * Without lock, we cannot be sure that what we got is |
575 | * blocks. Async migration is optimistic to see if | 628 | * a valid page order. Consider only values in the |
576 | * the minimum amount of work satisfies the allocation | 629 | * valid order range to prevent low_pfn overflow. |
577 | */ | 630 | */ |
578 | mt = get_pageblock_migratetype(page); | 631 | if (freepage_order > 0 && freepage_order < MAX_ORDER) |
579 | if (cc->mode == MIGRATE_ASYNC && | 632 | low_pfn += (1UL << freepage_order) - 1; |
580 | !migrate_async_suitable(mt)) { | ||
581 | set_unsuitable = false; | ||
582 | goto next_pageblock; | ||
583 | } | ||
584 | } | ||
585 | |||
586 | /* | ||
587 | * Skip if free. page_order cannot be used without zone->lock | ||
588 | * as nothing prevents parallel allocations or buddy merging. | ||
589 | */ | ||
590 | if (PageBuddy(page)) | ||
591 | continue; | 633 | continue; |
634 | } | ||
592 | 635 | ||
593 | /* | 636 | /* |
594 | * Check may be lockless but that's ok as we recheck later. | 637 | * Check may be lockless but that's ok as we recheck later. |
@@ -597,7 +640,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
597 | */ | 640 | */ |
598 | if (!PageLRU(page)) { | 641 | if (!PageLRU(page)) { |
599 | if (unlikely(balloon_page_movable(page))) { | 642 | if (unlikely(balloon_page_movable(page))) { |
600 | if (locked && balloon_page_isolate(page)) { | 643 | if (balloon_page_isolate(page)) { |
601 | /* Successfully isolated */ | 644 | /* Successfully isolated */ |
602 | goto isolate_success; | 645 | goto isolate_success; |
603 | } | 646 | } |
@@ -617,8 +660,11 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
617 | */ | 660 | */ |
618 | if (PageTransHuge(page)) { | 661 | if (PageTransHuge(page)) { |
619 | if (!locked) | 662 | if (!locked) |
620 | goto next_pageblock; | 663 | low_pfn = ALIGN(low_pfn + 1, |
621 | low_pfn += (1 << compound_order(page)) - 1; | 664 | pageblock_nr_pages) - 1; |
665 | else | ||
666 | low_pfn += (1 << compound_order(page)) - 1; | ||
667 | |||
622 | continue; | 668 | continue; |
623 | } | 669 | } |
624 | 670 | ||
@@ -631,24 +677,26 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
631 | page_count(page) > page_mapcount(page)) | 677 | page_count(page) > page_mapcount(page)) |
632 | continue; | 678 | continue; |
633 | 679 | ||
634 | /* Check if it is ok to still hold the lock */ | 680 | /* If we already hold the lock, we can skip some rechecking */ |
635 | locked = compact_checklock_irqsave(&zone->lru_lock, &flags, | 681 | if (!locked) { |
636 | locked, cc); | 682 | locked = compact_trylock_irqsave(&zone->lru_lock, |
637 | if (!locked || fatal_signal_pending(current)) | 683 | &flags, cc); |
638 | break; | 684 | if (!locked) |
685 | break; | ||
639 | 686 | ||
640 | /* Recheck PageLRU and PageTransHuge under lock */ | 687 | /* Recheck PageLRU and PageTransHuge under lock */ |
641 | if (!PageLRU(page)) | 688 | if (!PageLRU(page)) |
642 | continue; | 689 | continue; |
643 | if (PageTransHuge(page)) { | 690 | if (PageTransHuge(page)) { |
644 | low_pfn += (1 << compound_order(page)) - 1; | 691 | low_pfn += (1 << compound_order(page)) - 1; |
645 | continue; | 692 | continue; |
693 | } | ||
646 | } | 694 | } |
647 | 695 | ||
648 | lruvec = mem_cgroup_page_lruvec(page, zone); | 696 | lruvec = mem_cgroup_page_lruvec(page, zone); |
649 | 697 | ||
650 | /* Try isolate the page */ | 698 | /* Try isolate the page */ |
651 | if (__isolate_lru_page(page, mode) != 0) | 699 | if (__isolate_lru_page(page, isolate_mode) != 0) |
652 | continue; | 700 | continue; |
653 | 701 | ||
654 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | 702 | VM_BUG_ON_PAGE(PageTransCompound(page), page); |
@@ -667,14 +715,14 @@ isolate_success: | |||
667 | ++low_pfn; | 715 | ++low_pfn; |
668 | break; | 716 | break; |
669 | } | 717 | } |
670 | |||
671 | continue; | ||
672 | |||
673 | next_pageblock: | ||
674 | low_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages) - 1; | ||
675 | } | 718 | } |
676 | 719 | ||
677 | acct_isolated(zone, locked, cc); | 720 | /* |
721 | * The PageBuddy() check could have potentially brought us outside | ||
722 | * the range to be scanned. | ||
723 | */ | ||
724 | if (unlikely(low_pfn > end_pfn)) | ||
725 | low_pfn = end_pfn; | ||
678 | 726 | ||
679 | if (locked) | 727 | if (locked) |
680 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 728 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -684,8 +732,7 @@ next_pageblock: | |||
684 | * if the whole pageblock was scanned without isolating any page. | 732 | * if the whole pageblock was scanned without isolating any page. |
685 | */ | 733 | */ |
686 | if (low_pfn == end_pfn) | 734 | if (low_pfn == end_pfn) |
687 | update_pageblock_skip(cc, valid_page, nr_isolated, | 735 | update_pageblock_skip(cc, valid_page, nr_isolated, true); |
688 | set_unsuitable, true); | ||
689 | 736 | ||
690 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); | 737 | trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); |
691 | 738 | ||
@@ -696,17 +743,65 @@ next_pageblock: | |||
696 | return low_pfn; | 743 | return low_pfn; |
697 | } | 744 | } |
698 | 745 | ||
746 | /** | ||
747 | * isolate_migratepages_range() - isolate migrate-able pages in a PFN range | ||
748 | * @cc: Compaction control structure. | ||
749 | * @start_pfn: The first PFN to start isolating. | ||
750 | * @end_pfn: The one-past-last PFN. | ||
751 | * | ||
752 | * Returns zero if isolation fails fatally due to e.g. a pending signal. | ||
753 | * Otherwise, returns the one-past-the-last PFN of the isolated pages (which | ||
754 | * may be greater than end_pfn if the end fell in the middle of a THP page). | ||
755 | */ | ||
756 | unsigned long | ||
757 | isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, | ||
758 | unsigned long end_pfn) | ||
759 | { | ||
760 | unsigned long pfn, block_end_pfn; | ||
761 | |||
762 | /* Scan block by block. First and last block may be incomplete */ | ||
763 | pfn = start_pfn; | ||
764 | block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); | ||
765 | |||
766 | for (; pfn < end_pfn; pfn = block_end_pfn, | ||
767 | block_end_pfn += pageblock_nr_pages) { | ||
768 | |||
769 | block_end_pfn = min(block_end_pfn, end_pfn); | ||
770 | |||
771 | if (!pageblock_pfn_to_page(pfn, block_end_pfn, cc->zone)) | ||
772 | continue; | ||
773 | |||
774 | pfn = isolate_migratepages_block(cc, pfn, block_end_pfn, | ||
775 | ISOLATE_UNEVICTABLE); | ||
776 | |||
777 | /* | ||
778 | * In case of fatal failure, release everything that might | ||
779 | * have been isolated in the previous iteration, and signal | ||
780 | * the failure back to caller. | ||
781 | */ | ||
782 | if (!pfn) { | ||
783 | putback_movable_pages(&cc->migratepages); | ||
784 | cc->nr_migratepages = 0; | ||
785 | break; | ||
786 | } | ||
787 | } | ||
788 | acct_isolated(cc->zone, cc); | ||
789 | |||
790 | return pfn; | ||
791 | } | ||
792 | |||
699 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ | 793 | #endif /* CONFIG_COMPACTION || CONFIG_CMA */ |
700 | #ifdef CONFIG_COMPACTION | 794 | #ifdef CONFIG_COMPACTION |
701 | /* | 795 | /* |
702 | * Based on information in the current compact_control, find blocks | 796 | * Based on information in the current compact_control, find blocks |
703 | * suitable for isolating free pages from and then isolate them. | 797 | * suitable for isolating free pages from and then isolate them. |
704 | */ | 798 | */ |
705 | static void isolate_freepages(struct zone *zone, | 799 | static void isolate_freepages(struct compact_control *cc) |
706 | struct compact_control *cc) | ||
707 | { | 800 | { |
801 | struct zone *zone = cc->zone; | ||
708 | struct page *page; | 802 | struct page *page; |
709 | unsigned long block_start_pfn; /* start of current pageblock */ | 803 | unsigned long block_start_pfn; /* start of current pageblock */ |
804 | unsigned long isolate_start_pfn; /* exact pfn we start at */ | ||
710 | unsigned long block_end_pfn; /* end of current pageblock */ | 805 | unsigned long block_end_pfn; /* end of current pageblock */ |
711 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ | 806 | unsigned long low_pfn; /* lowest pfn scanner is able to scan */ |
712 | int nr_freepages = cc->nr_freepages; | 807 | int nr_freepages = cc->nr_freepages; |
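For orientation: the isolate_migratepages_range() wrapper added in the hunk above is the entry point kept for CMA-style range allocation, while compaction itself now calls isolate_migratepages_block() directly. A hedged usage sketch of the wrapper (the caller shape is illustrative and assumes a locally prepared struct compact_control cc; it is not code from this patch):

	unsigned long pfn;

	pfn = isolate_migratepages_range(&cc, start, end);
	if (!pfn)
		return -EINTR;	/* fatal signal: isolated pages were already put back */
	/* cc.migratepages now holds the movable pages found in [start, end);
	 * a real caller would migrate them and repeat until the range is clear. */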
@@ -715,14 +810,15 @@ static void isolate_freepages(struct zone *zone, | |||
715 | /* | 810 | /* |
716 | * Initialise the free scanner. The starting point is where we last | 811 | * Initialise the free scanner. The starting point is where we last |
717 | * successfully isolated from, zone-cached value, or the end of the | 812 | * successfully isolated from, zone-cached value, or the end of the |
718 | * zone when isolating for the first time. We need this aligned to | 813 | * zone when isolating for the first time. For looping we also need |
719 | * the pageblock boundary, because we do | 814 | * this pfn aligned down to the pageblock boundary, because we do |
720 | * block_start_pfn -= pageblock_nr_pages in the for loop. | 815 | * block_start_pfn -= pageblock_nr_pages in the for loop. |
721 | * For ending point, take care when isolating in the last pageblock of | 816 | * For ending point, take care when isolating in the last pageblock of |
722 | * a zone which ends in the middle of a pageblock. | 817 | * a zone which ends in the middle of a pageblock. |
723 | * The low boundary is the end of the pageblock the migration scanner | 818 | * The low boundary is the end of the pageblock the migration scanner |
724 | * is using. | 819 | * is using. |
725 | */ | 820 | */ |
821 | isolate_start_pfn = cc->free_pfn; | ||
726 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); | 822 | block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); |
727 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, | 823 | block_end_pfn = min(block_start_pfn + pageblock_nr_pages, |
728 | zone_end_pfn(zone)); | 824 | zone_end_pfn(zone)); |
@@ -735,7 +831,8 @@ static void isolate_freepages(struct zone *zone, | |||
735 | */ | 831 | */ |
736 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; | 832 | for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; |
737 | block_end_pfn = block_start_pfn, | 833 | block_end_pfn = block_start_pfn, |
738 | block_start_pfn -= pageblock_nr_pages) { | 834 | block_start_pfn -= pageblock_nr_pages, |
835 | isolate_start_pfn = block_start_pfn) { | ||
739 | unsigned long isolated; | 836 | unsigned long isolated; |
740 | 837 | ||
741 | /* | 838 | /* |
@@ -747,18 +844,9 @@ static void isolate_freepages(struct zone *zone, | |||
747 | && compact_should_abort(cc)) | 844 | && compact_should_abort(cc)) |
748 | break; | 845 | break; |
749 | 846 | ||
750 | if (!pfn_valid(block_start_pfn)) | 847 | page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, |
751 | continue; | 848 | zone); |
752 | 849 | if (!page) | |
753 | /* | ||
754 | * Check for overlapping nodes/zones. It's possible on some | ||
755 | * configurations to have a setup like | ||
756 | * node0 node1 node0 | ||
757 | * i.e. it's possible that all pages within a zones range of | ||
758 | * pages do not belong to a single zone. | ||
759 | */ | ||
760 | page = pfn_to_page(block_start_pfn); | ||
761 | if (page_zone(page) != zone) | ||
762 | continue; | 850 | continue; |
763 | 851 | ||
764 | /* Check the block is suitable for migration */ | 852 | /* Check the block is suitable for migration */ |
@@ -769,13 +857,25 @@ static void isolate_freepages(struct zone *zone, | |||
769 | if (!isolation_suitable(cc, page)) | 857 | if (!isolation_suitable(cc, page)) |
770 | continue; | 858 | continue; |
771 | 859 | ||
772 | /* Found a block suitable for isolating free pages from */ | 860 | /* Found a block suitable for isolating free pages from. */ |
773 | cc->free_pfn = block_start_pfn; | 861 | isolated = isolate_freepages_block(cc, &isolate_start_pfn, |
774 | isolated = isolate_freepages_block(cc, block_start_pfn, | ||
775 | block_end_pfn, freelist, false); | 862 | block_end_pfn, freelist, false); |
776 | nr_freepages += isolated; | 863 | nr_freepages += isolated; |
777 | 864 | ||
778 | /* | 865 | /* |
866 | * Remember where the free scanner should restart next time, | ||
867 | * which is where isolate_freepages_block() left off. | ||
868 | * But if it scanned the whole pageblock, isolate_start_pfn | ||
869 | * now points at block_end_pfn, which is the start of the next | ||
870 | * pageblock. | ||
871 | * In that case we will however want to restart at the start | ||
872 | * of the previous pageblock. | ||
873 | */ | ||
874 | cc->free_pfn = (isolate_start_pfn < block_end_pfn) ? | ||
875 | isolate_start_pfn : | ||
876 | block_start_pfn - pageblock_nr_pages; | ||
877 | |||
878 | /* | ||
779 | * Set a flag that we successfully isolated in this pageblock. | 879 | * Set a flag that we successfully isolated in this pageblock. |
780 | * In the next loop iteration, zone->compact_cached_free_pfn | 880 | * In the next loop iteration, zone->compact_cached_free_pfn |
781 | * will not be updated and thus it will effectively contain the | 881 | * will not be updated and thus it will effectively contain the |
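The new cc->free_pfn bookkeeping above is easiest to follow with numbers (again assuming pageblock_nr_pages == 512; the figures are illustrative):

	/*
	 *   block_start_pfn = 2048, block_end_pfn = 2560
	 *
	 *   - scan stopped early, isolate_start_pfn == 2300 (< block_end_pfn)
	 *       -> cc->free_pfn = 2300, resume inside the same block next time
	 *   - whole block scanned, isolate_start_pfn == 2560 (== block_end_pfn)
	 *       -> cc->free_pfn = 2048 - 512 = 1536, i.e. the previous block
	 */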
@@ -822,7 +922,7 @@ static struct page *compaction_alloc(struct page *migratepage, | |||
822 | */ | 922 | */ |
823 | if (list_empty(&cc->freepages)) { | 923 | if (list_empty(&cc->freepages)) { |
824 | if (!cc->contended) | 924 | if (!cc->contended) |
825 | isolate_freepages(cc->zone, cc); | 925 | isolate_freepages(cc); |
826 | 926 | ||
827 | if (list_empty(&cc->freepages)) | 927 | if (list_empty(&cc->freepages)) |
828 | return NULL; | 928 | return NULL; |
@@ -856,38 +956,84 @@ typedef enum { | |||
856 | } isolate_migrate_t; | 956 | } isolate_migrate_t; |
857 | 957 | ||
858 | /* | 958 | /* |
859 | * Isolate all pages that can be migrated from the block pointed to by | 959 | * Isolate all pages that can be migrated from the first suitable block, |
860 | * the migrate scanner within compact_control. | 960 | * starting at the block pointed to by the migrate scanner pfn within |
961 | * compact_control. | ||
861 | */ | 962 | */ |
862 | static isolate_migrate_t isolate_migratepages(struct zone *zone, | 963 | static isolate_migrate_t isolate_migratepages(struct zone *zone, |
863 | struct compact_control *cc) | 964 | struct compact_control *cc) |
864 | { | 965 | { |
865 | unsigned long low_pfn, end_pfn; | 966 | unsigned long low_pfn, end_pfn; |
967 | struct page *page; | ||
968 | const isolate_mode_t isolate_mode = | ||
969 | (cc->mode == MIGRATE_ASYNC ? ISOLATE_ASYNC_MIGRATE : 0); | ||
866 | 970 | ||
867 | /* Do not scan outside zone boundaries */ | 971 | /* |
868 | low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); | 972 | * Start at where we last stopped, or beginning of the zone as |
973 | * initialized by compact_zone() | ||
974 | */ | ||
975 | low_pfn = cc->migrate_pfn; | ||
869 | 976 | ||
870 | /* Only scan within a pageblock boundary */ | 977 | /* Only scan within a pageblock boundary */ |
871 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); | 978 | end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); |
872 | 979 | ||
873 | /* Do not cross the free scanner or scan within a memory hole */ | 980 | /* |
874 | if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { | 981 | * Iterate over whole pageblocks until we find the first suitable. |
875 | cc->migrate_pfn = end_pfn; | 982 | * Do not cross the free scanner. |
876 | return ISOLATE_NONE; | 983 | */ |
877 | } | 984 | for (; end_pfn <= cc->free_pfn; |
985 | low_pfn = end_pfn, end_pfn += pageblock_nr_pages) { | ||
878 | 986 | ||
879 | /* Perform the isolation */ | 987 | /* |
880 | low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn, false); | 988 | * This can potentially iterate a massively long zone with |
881 | if (!low_pfn || cc->contended) | 989 | * many pageblocks unsuitable, so periodically check if we |
882 | return ISOLATE_ABORT; | 990 | * need to schedule, or even abort async compaction. |
991 | */ | ||
992 | if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) | ||
993 | && compact_should_abort(cc)) | ||
994 | break; | ||
995 | |||
996 | page = pageblock_pfn_to_page(low_pfn, end_pfn, zone); | ||
997 | if (!page) | ||
998 | continue; | ||
999 | |||
1000 | /* If isolation recently failed, do not retry */ | ||
1001 | if (!isolation_suitable(cc, page)) | ||
1002 | continue; | ||
1003 | |||
1004 | /* | ||
1005 | * For async compaction, also only scan in MOVABLE blocks. | ||
1006 | * Async compaction is optimistic to see if the minimum amount | ||
1007 | * of work satisfies the allocation. | ||
1008 | */ | ||
1009 | if (cc->mode == MIGRATE_ASYNC && | ||
1010 | !migrate_async_suitable(get_pageblock_migratetype(page))) | ||
1011 | continue; | ||
1012 | |||
1013 | /* Perform the isolation */ | ||
1014 | low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, | ||
1015 | isolate_mode); | ||
883 | 1016 | ||
1017 | if (!low_pfn || cc->contended) | ||
1018 | return ISOLATE_ABORT; | ||
1019 | |||
1020 | /* | ||
1021 | * Either we isolated something and proceed with migration. Or | ||
1022 | * we failed and compact_zone should decide if we should | ||
1023 | * continue or not. | ||
1024 | */ | ||
1025 | break; | ||
1026 | } | ||
1027 | |||
1028 | acct_isolated(zone, cc); | ||
1029 | /* Record where migration scanner will be restarted */ | ||
884 | cc->migrate_pfn = low_pfn; | 1030 | cc->migrate_pfn = low_pfn; |
885 | 1031 | ||
886 | return ISOLATE_SUCCESS; | 1032 | return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE; |
887 | } | 1033 | } |
888 | 1034 | ||
889 | static int compact_finished(struct zone *zone, | 1035 | static int compact_finished(struct zone *zone, struct compact_control *cc, |
890 | struct compact_control *cc) | 1036 | const int migratetype) |
891 | { | 1037 | { |
892 | unsigned int order; | 1038 | unsigned int order; |
893 | unsigned long watermark; | 1039 | unsigned long watermark; |
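The pageblock-hunting loop added to isolate_migratepages() above only calls compact_should_abort() once per SWAP_CLUSTER_MAX pageblocks. With the usual constants, a back-of-the-envelope interval (not stated in the patch) comes out as:

	/*
	 *   SWAP_CLUSTER_MAX (32) * pageblock_nr_pages (512) = 16384 pfns,
	 *   i.e. 64MB of address range per check with 4KB pages, even when
	 *   every pageblock in between is skipped as unsuitable.
	 */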
@@ -933,7 +1079,7 @@ static int compact_finished(struct zone *zone, | |||
933 | struct free_area *area = &zone->free_area[order]; | 1079 | struct free_area *area = &zone->free_area[order]; |
934 | 1080 | ||
935 | /* Job done if page is free of the right migratetype */ | 1081 | /* Job done if page is free of the right migratetype */ |
936 | if (!list_empty(&area->free_list[cc->migratetype])) | 1082 | if (!list_empty(&area->free_list[migratetype])) |
937 | return COMPACT_PARTIAL; | 1083 | return COMPACT_PARTIAL; |
938 | 1084 | ||
939 | /* Job done if allocation would set block type */ | 1085 | /* Job done if allocation would set block type */ |
@@ -999,6 +1145,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
999 | int ret; | 1145 | int ret; |
1000 | unsigned long start_pfn = zone->zone_start_pfn; | 1146 | unsigned long start_pfn = zone->zone_start_pfn; |
1001 | unsigned long end_pfn = zone_end_pfn(zone); | 1147 | unsigned long end_pfn = zone_end_pfn(zone); |
1148 | const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); | ||
1002 | const bool sync = cc->mode != MIGRATE_ASYNC; | 1149 | const bool sync = cc->mode != MIGRATE_ASYNC; |
1003 | 1150 | ||
1004 | ret = compaction_suitable(zone, cc->order); | 1151 | ret = compaction_suitable(zone, cc->order); |
@@ -1041,7 +1188,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1041 | 1188 | ||
1042 | migrate_prep_local(); | 1189 | migrate_prep_local(); |
1043 | 1190 | ||
1044 | while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { | 1191 | while ((ret = compact_finished(zone, cc, migratetype)) == |
1192 | COMPACT_CONTINUE) { | ||
1045 | int err; | 1193 | int err; |
1046 | 1194 | ||
1047 | switch (isolate_migratepages(zone, cc)) { | 1195 | switch (isolate_migratepages(zone, cc)) { |
@@ -1056,9 +1204,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
1056 | ; | 1204 | ; |
1057 | } | 1205 | } |
1058 | 1206 | ||
1059 | if (!cc->nr_migratepages) | ||
1060 | continue; | ||
1061 | |||
1062 | err = migrate_pages(&cc->migratepages, compaction_alloc, | 1207 | err = migrate_pages(&cc->migratepages, compaction_alloc, |
1063 | compaction_free, (unsigned long)cc, cc->mode, | 1208 | compaction_free, (unsigned long)cc, cc->mode, |
1064 | MR_COMPACTION); | 1209 | MR_COMPACTION); |
@@ -1092,14 +1237,14 @@ out: | |||
1092 | } | 1237 | } |
1093 | 1238 | ||
1094 | static unsigned long compact_zone_order(struct zone *zone, int order, | 1239 | static unsigned long compact_zone_order(struct zone *zone, int order, |
1095 | gfp_t gfp_mask, enum migrate_mode mode, bool *contended) | 1240 | gfp_t gfp_mask, enum migrate_mode mode, int *contended) |
1096 | { | 1241 | { |
1097 | unsigned long ret; | 1242 | unsigned long ret; |
1098 | struct compact_control cc = { | 1243 | struct compact_control cc = { |
1099 | .nr_freepages = 0, | 1244 | .nr_freepages = 0, |
1100 | .nr_migratepages = 0, | 1245 | .nr_migratepages = 0, |
1101 | .order = order, | 1246 | .order = order, |
1102 | .migratetype = allocflags_to_migratetype(gfp_mask), | 1247 | .gfp_mask = gfp_mask, |
1103 | .zone = zone, | 1248 | .zone = zone, |
1104 | .mode = mode, | 1249 | .mode = mode, |
1105 | }; | 1250 | }; |
@@ -1124,48 +1269,117 @@ int sysctl_extfrag_threshold = 500; | |||
1124 | * @gfp_mask: The GFP mask of the current allocation | 1269 | * @gfp_mask: The GFP mask of the current allocation |
1125 | * @nodemask: The allowed nodes to allocate from | 1270 | * @nodemask: The allowed nodes to allocate from |
1126 | * @mode: The migration mode for async, sync light, or sync migration | 1271 | * @mode: The migration mode for async, sync light, or sync migration |
1127 | * @contended: Return value that is true if compaction was aborted due to lock contention | 1272 | * @contended: Return value that determines if compaction was aborted due to |
1128 | * @page: Optionally capture a free page of the requested order during compaction | 1273 | * need_resched() or lock contention |
1274 | * @candidate_zone: Return the zone where we think allocation should succeed | ||
1129 | * | 1275 | * |
1130 | * This is the main entry point for direct page compaction. | 1276 | * This is the main entry point for direct page compaction. |
1131 | */ | 1277 | */ |
1132 | unsigned long try_to_compact_pages(struct zonelist *zonelist, | 1278 | unsigned long try_to_compact_pages(struct zonelist *zonelist, |
1133 | int order, gfp_t gfp_mask, nodemask_t *nodemask, | 1279 | int order, gfp_t gfp_mask, nodemask_t *nodemask, |
1134 | enum migrate_mode mode, bool *contended) | 1280 | enum migrate_mode mode, int *contended, |
1281 | struct zone **candidate_zone) | ||
1135 | { | 1282 | { |
1136 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1283 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
1137 | int may_enter_fs = gfp_mask & __GFP_FS; | 1284 | int may_enter_fs = gfp_mask & __GFP_FS; |
1138 | int may_perform_io = gfp_mask & __GFP_IO; | 1285 | int may_perform_io = gfp_mask & __GFP_IO; |
1139 | struct zoneref *z; | 1286 | struct zoneref *z; |
1140 | struct zone *zone; | 1287 | struct zone *zone; |
1141 | int rc = COMPACT_SKIPPED; | 1288 | int rc = COMPACT_DEFERRED; |
1142 | int alloc_flags = 0; | 1289 | int alloc_flags = 0; |
1290 | int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ | ||
1291 | |||
1292 | *contended = COMPACT_CONTENDED_NONE; | ||
1143 | 1293 | ||
1144 | /* Check if the GFP flags allow compaction */ | 1294 | /* Check if the GFP flags allow compaction */ |
1145 | if (!order || !may_enter_fs || !may_perform_io) | 1295 | if (!order || !may_enter_fs || !may_perform_io) |
1146 | return rc; | 1296 | return COMPACT_SKIPPED; |
1147 | |||
1148 | count_compact_event(COMPACTSTALL); | ||
1149 | 1297 | ||
1150 | #ifdef CONFIG_CMA | 1298 | #ifdef CONFIG_CMA |
1151 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 1299 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
1152 | alloc_flags |= ALLOC_CMA; | 1300 | alloc_flags |= ALLOC_CMA; |
1153 | #endif | 1301 | #endif |
1154 | /* Compact each zone in the list */ | 1302 | /* Compact each zone in the list */ |
1155 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, | 1303 | for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, |
1156 | nodemask) { | 1304 | nodemask) { |
1157 | int status; | 1305 | int status; |
1306 | int zone_contended; | ||
1307 | |||
1308 | if (compaction_deferred(zone, order)) | ||
1309 | continue; | ||
1158 | 1310 | ||
1159 | status = compact_zone_order(zone, order, gfp_mask, mode, | 1311 | status = compact_zone_order(zone, order, gfp_mask, mode, |
1160 | contended); | 1312 | &zone_contended); |
1161 | rc = max(status, rc); | 1313 | rc = max(status, rc); |
1314 | /* | ||
1315 | * It takes at least one zone that wasn't lock contended | ||
1316 | * to clear all_zones_contended. | ||
1317 | */ | ||
1318 | all_zones_contended &= zone_contended; | ||
1162 | 1319 | ||
1163 | /* If a normal allocation would succeed, stop compacting */ | 1320 | /* If a normal allocation would succeed, stop compacting */ |
1164 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, | 1321 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, |
1165 | alloc_flags)) | 1322 | alloc_flags)) { |
1166 | break; | 1323 | *candidate_zone = zone; |
1324 | /* | ||
1325 | * We think the allocation will succeed in this zone, | ||
1326 | * but it is not certain, hence the false. The caller | ||
1327 | * will repeat this with true if allocation indeed | ||
1328 | * succeeds in this zone. | ||
1329 | */ | ||
1330 | compaction_defer_reset(zone, order, false); | ||
1331 | /* | ||
1332 | * It is possible that async compaction aborted due to | ||
1333 | * need_resched() and the watermarks were ok thanks to | ||
1334 | * somebody else freeing memory. The allocation can | ||
1335 | * however still fail so we better signal the | ||
1336 | * need_resched() contention anyway (this will not | ||
1337 | * prevent the allocation attempt). | ||
1338 | */ | ||
1339 | if (zone_contended == COMPACT_CONTENDED_SCHED) | ||
1340 | *contended = COMPACT_CONTENDED_SCHED; | ||
1341 | |||
1342 | goto break_loop; | ||
1343 | } | ||
1344 | |||
1345 | if (mode != MIGRATE_ASYNC) { | ||
1346 | /* | ||
1347 | * We think that allocation won't succeed in this zone | ||
1348 | * so we defer compaction there. If it ends up | ||
1349 | * succeeding after all, it will be reset. | ||
1350 | */ | ||
1351 | defer_compaction(zone, order); | ||
1352 | } | ||
1353 | |||
1354 | /* | ||
1355 | * We might have stopped compacting due to need_resched() in | ||
1356 | * async compaction, or due to a fatal signal detected. In that | ||
1357 | * case do not try further zones and signal need_resched() | ||
1358 | * contention. | ||
1359 | */ | ||
1360 | if ((zone_contended == COMPACT_CONTENDED_SCHED) | ||
1361 | || fatal_signal_pending(current)) { | ||
1362 | *contended = COMPACT_CONTENDED_SCHED; | ||
1363 | goto break_loop; | ||
1364 | } | ||
1365 | |||
1366 | continue; | ||
1367 | break_loop: | ||
1368 | /* | ||
1369 | * We might not have tried all the zones, so be conservative | ||
1370 | * and assume they are not all lock contended. | ||
1371 | */ | ||
1372 | all_zones_contended = 0; | ||
1373 | break; | ||
1167 | } | 1374 | } |
1168 | 1375 | ||
1376 | /* | ||
1377 | * If at least one zone wasn't deferred or skipped, we report if all | ||
1378 | * zones that were tried were lock contended. | ||
1379 | */ | ||
1380 | if (rc > COMPACT_SKIPPED && all_zones_contended) | ||
1381 | *contended = COMPACT_CONTENDED_LOCK; | ||
1382 | |||
1169 | return rc; | 1383 | return rc; |
1170 | } | 1384 | } |
1171 | 1385 | ||
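The contention tracking added to try_to_compact_pages() relies on a small AND trick: all_zones_contended starts as COMPACT_CONTENDED_LOCK and is &='d with each zone's result, so it only survives if every tried zone reported lock contention. Assuming the enum values this series appears to define elsewhere (NONE = 0, SCHED = 1, LOCK = 2 -- an assumption, they are not shown in this hunk):

	/*
	 *   LOCK (2) & LOCK (2)  = 2  -> still "all zones lock contended"
	 *   LOCK (2) & SCHED (1) = 0  -> cleared: not pure lock contention
	 *   LOCK (2) & NONE (0)  = 0  -> cleared: some zone ran uncontended
	 */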
diff --git a/mm/debug.c b/mm/debug.c new file mode 100644 index 000000000000..5ce45c9a29b5 --- /dev/null +++ b/mm/debug.c | |||
@@ -0,0 +1,237 @@ | |||
1 | /* | ||
2 | * mm/debug.c | ||
3 | * | ||
4 | * mm/ specific debug routines. | ||
5 | * | ||
6 | */ | ||
7 | |||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/mm.h> | ||
10 | #include <linux/ftrace_event.h> | ||
11 | #include <linux/memcontrol.h> | ||
12 | |||
13 | static const struct trace_print_flags pageflag_names[] = { | ||
14 | {1UL << PG_locked, "locked" }, | ||
15 | {1UL << PG_error, "error" }, | ||
16 | {1UL << PG_referenced, "referenced" }, | ||
17 | {1UL << PG_uptodate, "uptodate" }, | ||
18 | {1UL << PG_dirty, "dirty" }, | ||
19 | {1UL << PG_lru, "lru" }, | ||
20 | {1UL << PG_active, "active" }, | ||
21 | {1UL << PG_slab, "slab" }, | ||
22 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
23 | {1UL << PG_arch_1, "arch_1" }, | ||
24 | {1UL << PG_reserved, "reserved" }, | ||
25 | {1UL << PG_private, "private" }, | ||
26 | {1UL << PG_private_2, "private_2" }, | ||
27 | {1UL << PG_writeback, "writeback" }, | ||
28 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
29 | {1UL << PG_head, "head" }, | ||
30 | {1UL << PG_tail, "tail" }, | ||
31 | #else | ||
32 | {1UL << PG_compound, "compound" }, | ||
33 | #endif | ||
34 | {1UL << PG_swapcache, "swapcache" }, | ||
35 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
36 | {1UL << PG_reclaim, "reclaim" }, | ||
37 | {1UL << PG_swapbacked, "swapbacked" }, | ||
38 | {1UL << PG_unevictable, "unevictable" }, | ||
39 | #ifdef CONFIG_MMU | ||
40 | {1UL << PG_mlocked, "mlocked" }, | ||
41 | #endif | ||
42 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
43 | {1UL << PG_uncached, "uncached" }, | ||
44 | #endif | ||
45 | #ifdef CONFIG_MEMORY_FAILURE | ||
46 | {1UL << PG_hwpoison, "hwpoison" }, | ||
47 | #endif | ||
48 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
49 | {1UL << PG_compound_lock, "compound_lock" }, | ||
50 | #endif | ||
51 | }; | ||
52 | |||
53 | static void dump_flags(unsigned long flags, | ||
54 | const struct trace_print_flags *names, int count) | ||
55 | { | ||
56 | const char *delim = ""; | ||
57 | unsigned long mask; | ||
58 | int i; | ||
59 | |||
60 | pr_emerg("flags: %#lx(", flags); | ||
61 | |||
62 | /* remove zone id */ | ||
63 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
64 | |||
65 | for (i = 0; i < count && flags; i++) { | ||
66 | |||
67 | mask = names[i].mask; | ||
68 | if ((flags & mask) != mask) | ||
69 | continue; | ||
70 | |||
71 | flags &= ~mask; | ||
72 | pr_cont("%s%s", delim, names[i].name); | ||
73 | delim = "|"; | ||
74 | } | ||
75 | |||
76 | /* check for left over flags */ | ||
77 | if (flags) | ||
78 | pr_cont("%s%#lx", delim, flags); | ||
79 | |||
80 | pr_cont(")\n"); | ||
81 | } | ||
82 | |||
83 | void dump_page_badflags(struct page *page, const char *reason, | ||
84 | unsigned long badflags) | ||
85 | { | ||
86 | pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
87 | page, atomic_read(&page->_count), page_mapcount(page), | ||
88 | page->mapping, page->index); | ||
89 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
90 | dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); | ||
91 | if (reason) | ||
92 | pr_alert("page dumped because: %s\n", reason); | ||
93 | if (page->flags & badflags) { | ||
94 | pr_alert("bad because of flags:\n"); | ||
95 | dump_flags(page->flags & badflags, | ||
96 | pageflag_names, ARRAY_SIZE(pageflag_names)); | ||
97 | } | ||
98 | mem_cgroup_print_bad_page(page); | ||
99 | } | ||
100 | |||
101 | void dump_page(struct page *page, const char *reason) | ||
102 | { | ||
103 | dump_page_badflags(page, reason, 0); | ||
104 | } | ||
105 | EXPORT_SYMBOL(dump_page); | ||
106 | |||
107 | #ifdef CONFIG_DEBUG_VM | ||
108 | |||
109 | static const struct trace_print_flags vmaflags_names[] = { | ||
110 | {VM_READ, "read" }, | ||
111 | {VM_WRITE, "write" }, | ||
112 | {VM_EXEC, "exec" }, | ||
113 | {VM_SHARED, "shared" }, | ||
114 | {VM_MAYREAD, "mayread" }, | ||
115 | {VM_MAYWRITE, "maywrite" }, | ||
116 | {VM_MAYEXEC, "mayexec" }, | ||
117 | {VM_MAYSHARE, "mayshare" }, | ||
118 | {VM_GROWSDOWN, "growsdown" }, | ||
119 | {VM_PFNMAP, "pfnmap" }, | ||
120 | {VM_DENYWRITE, "denywrite" }, | ||
121 | {VM_LOCKED, "locked" }, | ||
122 | {VM_IO, "io" }, | ||
123 | {VM_SEQ_READ, "seqread" }, | ||
124 | {VM_RAND_READ, "randread" }, | ||
125 | {VM_DONTCOPY, "dontcopy" }, | ||
126 | {VM_DONTEXPAND, "dontexpand" }, | ||
127 | {VM_ACCOUNT, "account" }, | ||
128 | {VM_NORESERVE, "noreserve" }, | ||
129 | {VM_HUGETLB, "hugetlb" }, | ||
130 | {VM_NONLINEAR, "nonlinear" }, | ||
131 | #if defined(CONFIG_X86) | ||
132 | {VM_PAT, "pat" }, | ||
133 | #elif defined(CONFIG_PPC) | ||
134 | {VM_SAO, "sao" }, | ||
135 | #elif defined(CONFIG_PARISC) || defined(CONFIG_METAG) || defined(CONFIG_IA64) | ||
136 | {VM_GROWSUP, "growsup" }, | ||
137 | #elif !defined(CONFIG_MMU) | ||
138 | {VM_MAPPED_COPY, "mappedcopy" }, | ||
139 | #else | ||
140 | {VM_ARCH_1, "arch_1" }, | ||
141 | #endif | ||
142 | {VM_DONTDUMP, "dontdump" }, | ||
143 | #ifdef CONFIG_MEM_SOFT_DIRTY | ||
144 | {VM_SOFTDIRTY, "softdirty" }, | ||
145 | #endif | ||
146 | {VM_MIXEDMAP, "mixedmap" }, | ||
147 | {VM_HUGEPAGE, "hugepage" }, | ||
148 | {VM_NOHUGEPAGE, "nohugepage" }, | ||
149 | {VM_MERGEABLE, "mergeable" }, | ||
150 | }; | ||
151 | |||
152 | void dump_vma(const struct vm_area_struct *vma) | ||
153 | { | ||
154 | pr_emerg("vma %p start %p end %p\n" | ||
155 | "next %p prev %p mm %p\n" | ||
156 | "prot %lx anon_vma %p vm_ops %p\n" | ||
157 | "pgoff %lx file %p private_data %p\n", | ||
158 | vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, | ||
159 | vma->vm_prev, vma->vm_mm, | ||
160 | (unsigned long)pgprot_val(vma->vm_page_prot), | ||
161 | vma->anon_vma, vma->vm_ops, vma->vm_pgoff, | ||
162 | vma->vm_file, vma->vm_private_data); | ||
163 | dump_flags(vma->vm_flags, vmaflags_names, ARRAY_SIZE(vmaflags_names)); | ||
164 | } | ||
165 | EXPORT_SYMBOL(dump_vma); | ||
166 | |||
167 | void dump_mm(const struct mm_struct *mm) | ||
168 | { | ||
169 | pr_emerg("mm %p mmap %p seqnum %d task_size %lu\n" | ||
170 | #ifdef CONFIG_MMU | ||
171 | "get_unmapped_area %p\n" | ||
172 | #endif | ||
173 | "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" | ||
174 | "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n" | ||
175 | "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" | ||
176 | "pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n" | ||
177 | "start_code %lx end_code %lx start_data %lx end_data %lx\n" | ||
178 | "start_brk %lx brk %lx start_stack %lx\n" | ||
179 | "arg_start %lx arg_end %lx env_start %lx env_end %lx\n" | ||
180 | "binfmt %p flags %lx core_state %p\n" | ||
181 | #ifdef CONFIG_AIO | ||
182 | "ioctx_table %p\n" | ||
183 | #endif | ||
184 | #ifdef CONFIG_MEMCG | ||
185 | "owner %p " | ||
186 | #endif | ||
187 | "exe_file %p\n" | ||
188 | #ifdef CONFIG_MMU_NOTIFIER | ||
189 | "mmu_notifier_mm %p\n" | ||
190 | #endif | ||
191 | #ifdef CONFIG_NUMA_BALANCING | ||
192 | "numa_next_scan %lu numa_scan_offset %lu numa_scan_seq %d\n" | ||
193 | #endif | ||
194 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) | ||
195 | "tlb_flush_pending %d\n" | ||
196 | #endif | ||
197 | "%s", /* This is here to hold the comma */ | ||
198 | |||
199 | mm, mm->mmap, mm->vmacache_seqnum, mm->task_size, | ||
200 | #ifdef CONFIG_MMU | ||
201 | mm->get_unmapped_area, | ||
202 | #endif | ||
203 | mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, | ||
204 | mm->pgd, atomic_read(&mm->mm_users), | ||
205 | atomic_read(&mm->mm_count), | ||
206 | atomic_long_read((atomic_long_t *)&mm->nr_ptes), | ||
207 | mm->map_count, | ||
208 | mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, | ||
209 | mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm, | ||
210 | mm->start_code, mm->end_code, mm->start_data, mm->end_data, | ||
211 | mm->start_brk, mm->brk, mm->start_stack, | ||
212 | mm->arg_start, mm->arg_end, mm->env_start, mm->env_end, | ||
213 | mm->binfmt, mm->flags, mm->core_state, | ||
214 | #ifdef CONFIG_AIO | ||
215 | mm->ioctx_table, | ||
216 | #endif | ||
217 | #ifdef CONFIG_MEMCG | ||
218 | mm->owner, | ||
219 | #endif | ||
220 | mm->exe_file, | ||
221 | #ifdef CONFIG_MMU_NOTIFIER | ||
222 | mm->mmu_notifier_mm, | ||
223 | #endif | ||
224 | #ifdef CONFIG_NUMA_BALANCING | ||
225 | mm->numa_next_scan, mm->numa_scan_offset, mm->numa_scan_seq, | ||
226 | #endif | ||
227 | #if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION) | ||
228 | mm->tlb_flush_pending, | ||
229 | #endif | ||
230 | "" /* This is here to not have a comma! */ | ||
231 | ); | ||
232 | |||
233 | dump_flags(mm->def_flags, vmaflags_names, | ||
234 | ARRAY_SIZE(vmaflags_names)); | ||
235 | } | ||
236 | |||
237 | #endif /* CONFIG_DEBUG_VM */ | ||
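One non-obvious detail in the new mm/debug.c: dump_flags() masks page->flags before matching names because the upper bits of that word encode the zone (and, depending on configuration, node and section) rather than flag bits. A short illustration plus a typical call-site shape (the WARN_ON condition is illustrative, not from the patch):

	flags &= (1UL << NR_PAGEFLAGS) - 1;	/* keep only real flag bits */

	if (WARN_ON(PageLRU(page)))
		dump_page(page, "page unexpectedly on LRU");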
diff --git a/mm/dmapool.c b/mm/dmapool.c index ba8019b063e1..fd5fe4342e93 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c | |||
@@ -62,6 +62,7 @@ struct dma_page { /* cacheable header for 'allocation' bytes */ | |||
62 | }; | 62 | }; |
63 | 63 | ||
64 | static DEFINE_MUTEX(pools_lock); | 64 | static DEFINE_MUTEX(pools_lock); |
65 | static DEFINE_MUTEX(pools_reg_lock); | ||
65 | 66 | ||
66 | static ssize_t | 67 | static ssize_t |
67 | show_pools(struct device *dev, struct device_attribute *attr, char *buf) | 68 | show_pools(struct device *dev, struct device_attribute *attr, char *buf) |
@@ -132,29 +133,27 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
132 | { | 133 | { |
133 | struct dma_pool *retval; | 134 | struct dma_pool *retval; |
134 | size_t allocation; | 135 | size_t allocation; |
136 | bool empty = false; | ||
135 | 137 | ||
136 | if (align == 0) { | 138 | if (align == 0) |
137 | align = 1; | 139 | align = 1; |
138 | } else if (align & (align - 1)) { | 140 | else if (align & (align - 1)) |
139 | return NULL; | 141 | return NULL; |
140 | } | ||
141 | 142 | ||
142 | if (size == 0) { | 143 | if (size == 0) |
143 | return NULL; | 144 | return NULL; |
144 | } else if (size < 4) { | 145 | else if (size < 4) |
145 | size = 4; | 146 | size = 4; |
146 | } | ||
147 | 147 | ||
148 | if ((size % align) != 0) | 148 | if ((size % align) != 0) |
149 | size = ALIGN(size, align); | 149 | size = ALIGN(size, align); |
150 | 150 | ||
151 | allocation = max_t(size_t, size, PAGE_SIZE); | 151 | allocation = max_t(size_t, size, PAGE_SIZE); |
152 | 152 | ||
153 | if (!boundary) { | 153 | if (!boundary) |
154 | boundary = allocation; | 154 | boundary = allocation; |
155 | } else if ((boundary < size) || (boundary & (boundary - 1))) { | 155 | else if ((boundary < size) || (boundary & (boundary - 1))) |
156 | return NULL; | 156 | return NULL; |
157 | } | ||
158 | 157 | ||
159 | retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); | 158 | retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); |
160 | if (!retval) | 159 | if (!retval) |
@@ -172,15 +171,34 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, | |||
172 | 171 | ||
173 | INIT_LIST_HEAD(&retval->pools); | 172 | INIT_LIST_HEAD(&retval->pools); |
174 | 173 | ||
174 | /* | ||
175 | * pools_lock ensures that the ->dma_pools list does not get corrupted. | ||
176 | * pools_reg_lock ensures that there is not a race between | ||
177 | * dma_pool_create() and dma_pool_destroy() or within dma_pool_create() | ||
178 | * when the first invocation of dma_pool_create() failed on | ||
179 | * device_create_file() and the second assumes that it has been done (I | ||
180 | * know it is a short window). | ||
181 | */ | ||
182 | mutex_lock(&pools_reg_lock); | ||
175 | mutex_lock(&pools_lock); | 183 | mutex_lock(&pools_lock); |
176 | if (list_empty(&dev->dma_pools) && | 184 | if (list_empty(&dev->dma_pools)) |
177 | device_create_file(dev, &dev_attr_pools)) { | 185 | empty = true; |
178 | kfree(retval); | 186 | list_add(&retval->pools, &dev->dma_pools); |
179 | retval = NULL; | ||
180 | } else | ||
181 | list_add(&retval->pools, &dev->dma_pools); | ||
182 | mutex_unlock(&pools_lock); | 187 | mutex_unlock(&pools_lock); |
183 | 188 | if (empty) { | |
189 | int err; | ||
190 | |||
191 | err = device_create_file(dev, &dev_attr_pools); | ||
192 | if (err) { | ||
193 | mutex_lock(&pools_lock); | ||
194 | list_del(&retval->pools); | ||
195 | mutex_unlock(&pools_lock); | ||
196 | mutex_unlock(&pools_reg_lock); | ||
197 | kfree(retval); | ||
198 | return NULL; | ||
199 | } | ||
200 | } | ||
201 | mutex_unlock(&pools_reg_lock); | ||
184 | return retval; | 202 | return retval; |
185 | } | 203 | } |
186 | EXPORT_SYMBOL(dma_pool_create); | 204 | EXPORT_SYMBOL(dma_pool_create); |
@@ -251,11 +269,17 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page) | |||
251 | */ | 269 | */ |
252 | void dma_pool_destroy(struct dma_pool *pool) | 270 | void dma_pool_destroy(struct dma_pool *pool) |
253 | { | 271 | { |
272 | bool empty = false; | ||
273 | |||
274 | mutex_lock(&pools_reg_lock); | ||
254 | mutex_lock(&pools_lock); | 275 | mutex_lock(&pools_lock); |
255 | list_del(&pool->pools); | 276 | list_del(&pool->pools); |
256 | if (pool->dev && list_empty(&pool->dev->dma_pools)) | 277 | if (pool->dev && list_empty(&pool->dev->dma_pools)) |
257 | device_remove_file(pool->dev, &dev_attr_pools); | 278 | empty = true; |
258 | mutex_unlock(&pools_lock); | 279 | mutex_unlock(&pools_lock); |
280 | if (empty) | ||
281 | device_remove_file(pool->dev, &dev_attr_pools); | ||
282 | mutex_unlock(&pools_reg_lock); | ||
259 | 283 | ||
260 | while (!list_empty(&pool->page_list)) { | 284 | while (!list_empty(&pool->page_list)) { |
261 | struct dma_page *page; | 285 | struct dma_page *page; |
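The dmapool change is an instance of a common locking pattern: decide under the inner list lock whether sysfs work is needed, but call the sleeping device_create_file()/device_remove_file() only under the outer registration mutex, never with the list lock held, so create and destroy cannot race on the sysfs attribute. Stripped-down sketch of the create path (error handling elided, names as in the hunk):

	mutex_lock(&pools_reg_lock);
	mutex_lock(&pools_lock);
	empty = list_empty(&dev->dma_pools);	/* first pool for this device? */
	list_add(&retval->pools, &dev->dma_pools);
	mutex_unlock(&pools_lock);
	if (empty && device_create_file(dev, &dev_attr_pools)) {
		/* on failure: re-take pools_lock, list_del(&retval->pools), free retval */
	}
	mutex_unlock(&pools_reg_lock);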
diff --git a/mm/filemap.c b/mm/filemap.c index 0ab0a3ea5721..14b4642279f1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -1753,7 +1753,7 @@ EXPORT_SYMBOL(generic_file_read_iter); | |||
1753 | static int page_cache_read(struct file *file, pgoff_t offset) | 1753 | static int page_cache_read(struct file *file, pgoff_t offset) |
1754 | { | 1754 | { |
1755 | struct address_space *mapping = file->f_mapping; | 1755 | struct address_space *mapping = file->f_mapping; |
1756 | struct page *page; | 1756 | struct page *page; |
1757 | int ret; | 1757 | int ret; |
1758 | 1758 | ||
1759 | do { | 1759 | do { |
@@ -1770,7 +1770,7 @@ static int page_cache_read(struct file *file, pgoff_t offset) | |||
1770 | page_cache_release(page); | 1770 | page_cache_release(page); |
1771 | 1771 | ||
1772 | } while (ret == AOP_TRUNCATED_PAGE); | 1772 | } while (ret == AOP_TRUNCATED_PAGE); |
1773 | 1773 | ||
1774 | return ret; | 1774 | return ret; |
1775 | } | 1775 | } |
1776 | 1776 | ||
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -10,6 +10,10 @@ | |||
10 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
11 | #include <linux/swapops.h> | 11 | #include <linux/swapops.h> |
12 | 12 | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/rwsem.h> | ||
15 | #include <asm/pgtable.h> | ||
16 | |||
13 | #include "internal.h" | 17 | #include "internal.h" |
14 | 18 | ||
15 | static struct page *no_page_table(struct vm_area_struct *vma, | 19 | static struct page *no_page_table(struct vm_area_struct *vma, |
@@ -676,3 +680,353 @@ struct page *get_dump_page(unsigned long addr) | |||
676 | return page; | 680 | return page; |
677 | } | 681 | } |
678 | #endif /* CONFIG_ELF_CORE */ | 682 | #endif /* CONFIG_ELF_CORE */ |
683 | |||
684 | /* | ||
685 | * Generic RCU Fast GUP | ||
686 | * | ||
687 | * get_user_pages_fast attempts to pin user pages by walking the page | ||
688 | * tables directly and avoids taking locks. Thus the walker needs to be | ||
689 | * protected from page table pages being freed from under it, and should | ||
690 | * block any THP splits. | ||
691 | * | ||
692 | * One way to achieve this is to have the walker disable interrupts, and | ||
693 | * rely on IPIs from the TLB flushing code blocking before the page table | ||
694 | * pages are freed. This is unsuitable for architectures that do not need | ||
695 | * to broadcast an IPI when invalidating TLBs. | ||
696 | * | ||
697 | * Another way to achieve this is to batch up page-table-containing pages | ||
698 | * belonging to more than one mm_user, then rcu_sched a callback to free those | ||
699 | * pages. Disabling interrupts will allow the fast_gup walker to both block | ||
700 | * the rcu_sched callback, and an IPI that we broadcast for splitting THPs | ||
701 | * (which is a relatively rare event). The code below adopts this strategy. | ||
702 | * | ||
703 | * Before activating this code, please be aware that the following assumptions | ||
704 | * are currently made: | ||
705 | * | ||
706 | * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free | ||
707 | * pages containing page tables. | ||
708 | * | ||
709 | * *) THP splits will broadcast an IPI, this can be achieved by overriding | ||
710 | * pmdp_splitting_flush. | ||
711 | * | ||
712 | * *) ptes can be read atomically by the architecture. | ||
713 | * | ||
714 | * *) access_ok is sufficient to validate userspace address ranges. | ||
715 | * | ||
716 | * The last two assumptions can be relaxed by the addition of helper functions. | ||
717 | * | ||
718 | * This code is based heavily on the PowerPC implementation by Nick Piggin. | ||
719 | */ | ||
720 | #ifdef CONFIG_HAVE_GENERIC_RCU_GUP | ||
721 | |||
722 | #ifdef __HAVE_ARCH_PTE_SPECIAL | ||
723 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | ||
724 | int write, struct page **pages, int *nr) | ||
725 | { | ||
726 | pte_t *ptep, *ptem; | ||
727 | int ret = 0; | ||
728 | |||
729 | ptem = ptep = pte_offset_map(&pmd, addr); | ||
730 | do { | ||
731 | /* | ||
732 | * In the line below we are assuming that the pte can be read | ||
733 | * atomically. If this is not the case for your architecture, | ||
734 | * please wrap this in a helper function! | ||
735 | * | ||
736 | * for an example see gup_get_pte in arch/x86/mm/gup.c | ||
737 | */ | ||
738 | pte_t pte = ACCESS_ONCE(*ptep); | ||
739 | struct page *page; | ||
740 | |||
741 | /* | ||
742 | * Similar to the PMD case below, NUMA hinting must take slow | ||
743 | * path | ||
744 | */ | ||
745 | if (!pte_present(pte) || pte_special(pte) || | ||
746 | pte_numa(pte) || (write && !pte_write(pte))) | ||
747 | goto pte_unmap; | ||
748 | |||
749 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | ||
750 | page = pte_page(pte); | ||
751 | |||
752 | if (!page_cache_get_speculative(page)) | ||
753 | goto pte_unmap; | ||
754 | |||
755 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | ||
756 | put_page(page); | ||
757 | goto pte_unmap; | ||
758 | } | ||
759 | |||
760 | pages[*nr] = page; | ||
761 | (*nr)++; | ||
762 | |||
763 | } while (ptep++, addr += PAGE_SIZE, addr != end); | ||
764 | |||
765 | ret = 1; | ||
766 | |||
767 | pte_unmap: | ||
768 | pte_unmap(ptem); | ||
769 | return ret; | ||
770 | } | ||
771 | #else | ||
772 | |||
773 | /* | ||
774 | * If we can't determine whether or not a pte is special, then fail immediately | ||
775 | * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not | ||
776 | * to be special. | ||
777 | * | ||
778 | * For a futex to be placed on a THP tail page, get_futex_key requires a | ||
779 | * __get_user_pages_fast implementation that can pin pages. Thus it's still | ||
780 | * useful to have gup_huge_pmd even if we can't operate on ptes. | ||
781 | */ | ||
782 | static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | ||
783 | int write, struct page **pages, int *nr) | ||
784 | { | ||
785 | return 0; | ||
786 | } | ||
787 | #endif /* __HAVE_ARCH_PTE_SPECIAL */ | ||
788 | |||
789 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | ||
790 | unsigned long end, int write, struct page **pages, int *nr) | ||
791 | { | ||
792 | struct page *head, *page, *tail; | ||
793 | int refs; | ||
794 | |||
795 | if (write && !pmd_write(orig)) | ||
796 | return 0; | ||
797 | |||
798 | refs = 0; | ||
799 | head = pmd_page(orig); | ||
800 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | ||
801 | tail = page; | ||
802 | do { | ||
803 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
804 | pages[*nr] = page; | ||
805 | (*nr)++; | ||
806 | page++; | ||
807 | refs++; | ||
808 | } while (addr += PAGE_SIZE, addr != end); | ||
809 | |||
810 | if (!page_cache_add_speculative(head, refs)) { | ||
811 | *nr -= refs; | ||
812 | return 0; | ||
813 | } | ||
814 | |||
815 | if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) { | ||
816 | *nr -= refs; | ||
817 | while (refs--) | ||
818 | put_page(head); | ||
819 | return 0; | ||
820 | } | ||
821 | |||
822 | /* | ||
823 | * Any tail pages need their mapcount reference taken before we | ||
824 | * return. (This allows the THP code to bump their ref count when | ||
825 | * they are split into base pages). | ||
826 | */ | ||
827 | while (refs--) { | ||
828 | if (PageTail(tail)) | ||
829 | get_huge_page_tail(tail); | ||
830 | tail++; | ||
831 | } | ||
832 | |||
833 | return 1; | ||
834 | } | ||
835 | |||
836 | static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | ||
837 | unsigned long end, int write, struct page **pages, int *nr) | ||
838 | { | ||
839 | struct page *head, *page, *tail; | ||
840 | int refs; | ||
841 | |||
842 | if (write && !pud_write(orig)) | ||
843 | return 0; | ||
844 | |||
845 | refs = 0; | ||
846 | head = pud_page(orig); | ||
847 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | ||
848 | tail = page; | ||
849 | do { | ||
850 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
851 | pages[*nr] = page; | ||
852 | (*nr)++; | ||
853 | page++; | ||
854 | refs++; | ||
855 | } while (addr += PAGE_SIZE, addr != end); | ||
856 | |||
857 | if (!page_cache_add_speculative(head, refs)) { | ||
858 | *nr -= refs; | ||
859 | return 0; | ||
860 | } | ||
861 | |||
862 | if (unlikely(pud_val(orig) != pud_val(*pudp))) { | ||
863 | *nr -= refs; | ||
864 | while (refs--) | ||
865 | put_page(head); | ||
866 | return 0; | ||
867 | } | ||
868 | |||
869 | while (refs--) { | ||
870 | if (PageTail(tail)) | ||
871 | get_huge_page_tail(tail); | ||
872 | tail++; | ||
873 | } | ||
874 | |||
875 | return 1; | ||
876 | } | ||
877 | |||
878 | static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | ||
879 | int write, struct page **pages, int *nr) | ||
880 | { | ||
881 | unsigned long next; | ||
882 | pmd_t *pmdp; | ||
883 | |||
884 | pmdp = pmd_offset(&pud, addr); | ||
885 | do { | ||
886 | pmd_t pmd = ACCESS_ONCE(*pmdp); | ||
887 | |||
888 | next = pmd_addr_end(addr, end); | ||
889 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
890 | return 0; | ||
891 | |||
892 | if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { | ||
893 | /* | ||
894 | * NUMA hinting faults need to be handled in the GUP | ||
895 | * slowpath for accounting purposes and so that they | ||
896 | * can be serialised against THP migration. | ||
897 | */ | ||
898 | if (pmd_numa(pmd)) | ||
899 | return 0; | ||
900 | |||
901 | if (!gup_huge_pmd(pmd, pmdp, addr, next, write, | ||
902 | pages, nr)) | ||
903 | return 0; | ||
904 | |||
905 | } else if (!gup_pte_range(pmd, addr, next, write, pages, nr)) | ||
906 | return 0; | ||
907 | } while (pmdp++, addr = next, addr != end); | ||
908 | |||
909 | return 1; | ||
910 | } | ||
911 | |||
912 | static int gup_pud_range(pgd_t *pgdp, unsigned long addr, unsigned long end, | ||
913 | int write, struct page **pages, int *nr) | ||
914 | { | ||
915 | unsigned long next; | ||
916 | pud_t *pudp; | ||
917 | |||
918 | pudp = pud_offset(pgdp, addr); | ||
919 | do { | ||
920 | pud_t pud = ACCESS_ONCE(*pudp); | ||
921 | |||
922 | next = pud_addr_end(addr, end); | ||
923 | if (pud_none(pud)) | ||
924 | return 0; | ||
925 | if (pud_huge(pud)) { | ||
926 | if (!gup_huge_pud(pud, pudp, addr, next, write, | ||
927 | pages, nr)) | ||
928 | return 0; | ||
929 | } else if (!gup_pmd_range(pud, addr, next, write, pages, nr)) | ||
930 | return 0; | ||
931 | } while (pudp++, addr = next, addr != end); | ||
932 | |||
933 | return 1; | ||
934 | } | ||
935 | |||
936 | /* | ||
937 | * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to | ||
938 | * the regular GUP. It will only return non-negative values. | ||
939 | */ | ||
940 | int __get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
941 | struct page **pages) | ||
942 | { | ||
943 | struct mm_struct *mm = current->mm; | ||
944 | unsigned long addr, len, end; | ||
945 | unsigned long next, flags; | ||
946 | pgd_t *pgdp; | ||
947 | int nr = 0; | ||
948 | |||
949 | start &= PAGE_MASK; | ||
950 | addr = start; | ||
951 | len = (unsigned long) nr_pages << PAGE_SHIFT; | ||
952 | end = start + len; | ||
953 | |||
954 | if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, | ||
955 | start, len))) | ||
956 | return 0; | ||
957 | |||
958 | /* | ||
959 | * Disable interrupts. We use the nested form as we can already have | ||
960 | * interrupts disabled by get_futex_key. | ||
961 | * | ||
962 | * With interrupts disabled, we block page table pages from being | ||
963 | * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h | ||
964 | * for more details. | ||
965 | * | ||
966 | * We do not adopt an rcu_read_lock(.) here as we also want to | ||
967 | * block IPIs that come from THPs splitting. | ||
968 | */ | ||
969 | |||
970 | local_irq_save(flags); | ||
971 | pgdp = pgd_offset(mm, addr); | ||
972 | do { | ||
973 | next = pgd_addr_end(addr, end); | ||
974 | if (pgd_none(*pgdp)) | ||
975 | break; | ||
976 | else if (!gup_pud_range(pgdp, addr, next, write, pages, &nr)) | ||
977 | break; | ||
978 | } while (pgdp++, addr = next, addr != end); | ||
979 | local_irq_restore(flags); | ||
980 | |||
981 | return nr; | ||
982 | } | ||
983 | |||
984 | /** | ||
985 | * get_user_pages_fast() - pin user pages in memory | ||
986 | * @start: starting user address | ||
987 | * @nr_pages: number of pages from start to pin | ||
988 | * @write: whether pages will be written to | ||
989 | * @pages: array that receives pointers to the pages pinned. | ||
990 | * Should be at least nr_pages long. | ||
991 | * | ||
992 | * Attempt to pin user pages in memory without taking mm->mmap_sem. | ||
993 | * If not successful, it will fall back to taking the lock and | ||
994 | * calling get_user_pages(). | ||
995 | * | ||
996 | * Returns number of pages pinned. This may be fewer than the number | ||
997 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
998 | * were pinned, returns -errno. | ||
999 | */ | ||
1000 | int get_user_pages_fast(unsigned long start, int nr_pages, int write, | ||
1001 | struct page **pages) | ||
1002 | { | ||
1003 | struct mm_struct *mm = current->mm; | ||
1004 | int nr, ret; | ||
1005 | |||
1006 | start &= PAGE_MASK; | ||
1007 | nr = __get_user_pages_fast(start, nr_pages, write, pages); | ||
1008 | ret = nr; | ||
1009 | |||
1010 | if (nr < nr_pages) { | ||
1011 | /* Try to get the remaining pages with get_user_pages */ | ||
1012 | start += nr << PAGE_SHIFT; | ||
1013 | pages += nr; | ||
1014 | |||
1015 | down_read(&mm->mmap_sem); | ||
1016 | ret = get_user_pages(current, mm, start, | ||
1017 | nr_pages - nr, write, 0, pages, NULL); | ||
1018 | up_read(&mm->mmap_sem); | ||
1019 | |||
1020 | /* Have to be a bit careful with return values */ | ||
1021 | if (nr > 0) { | ||
1022 | if (ret < 0) | ||
1023 | ret = nr; | ||
1024 | else | ||
1025 | ret += nr; | ||
1026 | } | ||
1027 | } | ||
1028 | |||
1029 | return ret; | ||
1030 | } | ||
1031 | |||
1032 | #endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ | ||
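The get_user_pages_fast() kerneldoc above defines the calling contract: the fast walk may pin only some of the requested pages, the remainder is retried under mmap_sem, and a negative errno comes back only when nothing at all was pinned (for example, if the fast walk pinned 3 pages and the slow path then failed, the caller still gets 3 back and owns those 3 references). A minimal caller sketch follows; it is an editorial illustration, not part of this patch, and the function name pin_user_buffer and its parameters are invented for the example.

	/* Pin the pages backing a user buffer, use them, then release them. */
	static int pin_user_buffer(unsigned long uaddr, size_t len, int write)
	{
		int nr_pages = DIV_ROUND_UP(len + (uaddr & ~PAGE_MASK), PAGE_SIZE);
		struct page **pages;
		int i, pinned;

		pages = kcalloc(nr_pages, sizeof(*pages), GFP_KERNEL);
		if (!pages)
			return -ENOMEM;

		pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);
		if (pinned < 0) {
			/* Nothing was pinned at all. */
			kfree(pages);
			return pinned;
		}

		/*
		 * pinned may be less than nr_pages; the caller owns exactly
		 * that many references and must drop each of them.
		 */
		for (i = 0; i < pinned; i++) {
			/* ... access page contents, e.g. via kmap(pages[i]) ... */
			put_page(pages[i]);
		}

		kfree(pages);
		return pinned == nr_pages ? 0 : -EFAULT;
	}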
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8ffd9412ec5..74c78aa8bc2f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -1096,7 +1096,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1096 | unsigned long mmun_end; /* For mmu_notifiers */ | 1096 | unsigned long mmun_end; /* For mmu_notifiers */ |
1097 | 1097 | ||
1098 | ptl = pmd_lockptr(mm, pmd); | 1098 | ptl = pmd_lockptr(mm, pmd); |
1099 | VM_BUG_ON(!vma->anon_vma); | 1099 | VM_BUG_ON_VMA(!vma->anon_vma, vma); |
1100 | haddr = address & HPAGE_PMD_MASK; | 1100 | haddr = address & HPAGE_PMD_MASK; |
1101 | if (is_huge_zero_pmd(orig_pmd)) | 1101 | if (is_huge_zero_pmd(orig_pmd)) |
1102 | goto alloc; | 1102 | goto alloc; |
@@ -2048,7 +2048,7 @@ int __khugepaged_enter(struct mm_struct *mm) | |||
2048 | return -ENOMEM; | 2048 | return -ENOMEM; |
2049 | 2049 | ||
2050 | /* __khugepaged_exit() must not run from under us */ | 2050 | /* __khugepaged_exit() must not run from under us */ |
2051 | VM_BUG_ON(khugepaged_test_exit(mm)); | 2051 | VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); |
2052 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { | 2052 | if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { |
2053 | free_mm_slot(mm_slot); | 2053 | free_mm_slot(mm_slot); |
2054 | return 0; | 2054 | return 0; |
@@ -2083,7 +2083,7 @@ int khugepaged_enter_vma_merge(struct vm_area_struct *vma) | |||
2083 | if (vma->vm_ops) | 2083 | if (vma->vm_ops) |
2084 | /* khugepaged not yet working on file or special mappings */ | 2084 | /* khugepaged not yet working on file or special mappings */ |
2085 | return 0; | 2085 | return 0; |
2086 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2086 | VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); |
2087 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; | 2087 | hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; |
2088 | hend = vma->vm_end & HPAGE_PMD_MASK; | 2088 | hend = vma->vm_end & HPAGE_PMD_MASK; |
2089 | if (hstart < hend) | 2089 | if (hstart < hend) |
@@ -2322,23 +2322,17 @@ static struct page | |||
2322 | int node) | 2322 | int node) |
2323 | { | 2323 | { |
2324 | VM_BUG_ON_PAGE(*hpage, *hpage); | 2324 | VM_BUG_ON_PAGE(*hpage, *hpage); |
2325 | |||
2325 | /* | 2326 | /* |
2326 | * Allocate the page while the vma is still valid and under | 2327 | * Before allocating the hugepage, release the mmap_sem read lock. |
2327 | * the mmap_sem read mode so there is no memory allocation | 2328 | * The allocation can potentially take a long time if it involves |
2328 | * later when we take the mmap_sem in write mode. This is more | 2329 | * sync compaction, and we do not need to hold the mmap_sem during |
2329 | * friendly behavior (OTOH it may actually hide bugs) to | 2330 | * that. We will recheck the vma after taking it again in write mode. |
2330 | * filesystems in userland with daemons allocating memory in | ||
2331 | * the userland I/O paths. Allocating memory with the | ||
2332 | * mmap_sem in read mode is good idea also to allow greater | ||
2333 | * scalability. | ||
2334 | */ | 2331 | */ |
2332 | up_read(&mm->mmap_sem); | ||
2333 | |||
2335 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( | 2334 | *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( |
2336 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); | 2335 | khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); |
2337 | /* | ||
2338 | * After allocating the hugepage, release the mmap_sem read lock in | ||
2339 | * preparation for taking it in write mode. | ||
2340 | */ | ||
2341 | up_read(&mm->mmap_sem); | ||
2342 | if (unlikely(!*hpage)) { | 2336 | if (unlikely(!*hpage)) { |
2343 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); | 2337 | count_vm_event(THP_COLLAPSE_ALLOC_FAILED); |
2344 | *hpage = ERR_PTR(-ENOMEM); | 2338 | *hpage = ERR_PTR(-ENOMEM); |
@@ -2412,7 +2406,7 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) | |||
2412 | return false; | 2406 | return false; |
2413 | if (is_vma_temporary_stack(vma)) | 2407 | if (is_vma_temporary_stack(vma)) |
2414 | return false; | 2408 | return false; |
2415 | VM_BUG_ON(vma->vm_flags & VM_NO_THP); | 2409 | VM_BUG_ON_VMA(vma->vm_flags & VM_NO_THP, vma); |
2416 | return true; | 2410 | return true; |
2417 | } | 2411 | } |
2418 | 2412 | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eeceeeb09019..9fd722769927 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -434,7 +434,7 @@ static inline struct resv_map *inode_resv_map(struct inode *inode) | |||
434 | 434 | ||
435 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | 435 | static struct resv_map *vma_resv_map(struct vm_area_struct *vma) |
436 | { | 436 | { |
437 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 437 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
438 | if (vma->vm_flags & VM_MAYSHARE) { | 438 | if (vma->vm_flags & VM_MAYSHARE) { |
439 | struct address_space *mapping = vma->vm_file->f_mapping; | 439 | struct address_space *mapping = vma->vm_file->f_mapping; |
440 | struct inode *inode = mapping->host; | 440 | struct inode *inode = mapping->host; |
@@ -449,8 +449,8 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) | |||
449 | 449 | ||
450 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | 450 | static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) |
451 | { | 451 | { |
452 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 452 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
453 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); | 453 | VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); |
454 | 454 | ||
455 | set_vma_private_data(vma, (get_vma_private_data(vma) & | 455 | set_vma_private_data(vma, (get_vma_private_data(vma) & |
456 | HPAGE_RESV_MASK) | (unsigned long)map); | 456 | HPAGE_RESV_MASK) | (unsigned long)map); |
@@ -458,15 +458,15 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) | |||
458 | 458 | ||
459 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) | 459 | static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) |
460 | { | 460 | { |
461 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 461 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
462 | VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); | 462 | VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma); |
463 | 463 | ||
464 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); | 464 | set_vma_private_data(vma, get_vma_private_data(vma) | flags); |
465 | } | 465 | } |
466 | 466 | ||
467 | static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | 467 | static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) |
468 | { | 468 | { |
469 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 469 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
470 | 470 | ||
471 | return (get_vma_private_data(vma) & flag) != 0; | 471 | return (get_vma_private_data(vma) & flag) != 0; |
472 | } | 472 | } |
@@ -474,7 +474,7 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) | |||
474 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ | 474 | /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ |
475 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) | 475 | void reset_vma_resv_huge_pages(struct vm_area_struct *vma) |
476 | { | 476 | { |
477 | VM_BUG_ON(!is_vm_hugetlb_page(vma)); | 477 | VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); |
478 | if (!(vma->vm_flags & VM_MAYSHARE)) | 478 | if (!(vma->vm_flags & VM_MAYSHARE)) |
479 | vma->vm_private_data = (void *)0; | 479 | vma->vm_private_data = (void *)0; |
480 | } | 480 | } |
diff --git a/mm/internal.h b/mm/internal.h index a1b651b11c5f..829304090b90 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -142,10 +142,10 @@ struct compact_control { | |||
142 | bool finished_update_migrate; | 142 | bool finished_update_migrate; |
143 | 143 | ||
144 | int order; /* order a direct compactor needs */ | 144 | int order; /* order a direct compactor needs */ |
145 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 145 | const gfp_t gfp_mask; /* gfp mask of a direct compactor */ |
146 | struct zone *zone; | 146 | struct zone *zone; |
147 | bool contended; /* True if a lock was contended, or | 147 | int contended; /* Signal need_resched() or lock |
148 | * need_resched() true during async | 148 | * contention detected during |
149 | * compaction | 149 | * compaction |
150 | */ | 150 | */ |
151 | }; | 151 | }; |
@@ -154,8 +154,8 @@ unsigned long | |||
154 | isolate_freepages_range(struct compact_control *cc, | 154 | isolate_freepages_range(struct compact_control *cc, |
155 | unsigned long start_pfn, unsigned long end_pfn); | 155 | unsigned long start_pfn, unsigned long end_pfn); |
156 | unsigned long | 156 | unsigned long |
157 | isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | 157 | isolate_migratepages_range(struct compact_control *cc, |
158 | unsigned long low_pfn, unsigned long end_pfn, bool unevictable); | 158 | unsigned long low_pfn, unsigned long end_pfn); |
159 | 159 | ||
160 | #endif | 160 | #endif |
161 | 161 | ||
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, | |||
164 | * general, page_zone(page)->lock must be held by the caller to prevent the | 164 | * general, page_zone(page)->lock must be held by the caller to prevent the |
165 | * page from being allocated in parallel and returning garbage as the order. | 165 | * page from being allocated in parallel and returning garbage as the order. |
166 | * If a caller does not hold page_zone(page)->lock, it must guarantee that the | 166 | * If a caller does not hold page_zone(page)->lock, it must guarantee that the |
167 | * page cannot be allocated or merged in parallel. | 167 | * page cannot be allocated or merged in parallel. Alternatively, it must |
168 | * handle invalid values gracefully, and use page_order_unsafe() below. | ||
168 | */ | 169 | */ |
169 | static inline unsigned long page_order(struct page *page) | 170 | static inline unsigned long page_order(struct page *page) |
170 | { | 171 | { |
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page) | |||
172 | return page_private(page); | 173 | return page_private(page); |
173 | } | 174 | } |
174 | 175 | ||
176 | /* | ||
177 | * Like page_order(), but for callers who cannot afford to hold the zone lock. | ||
178 | * PageBuddy() should be checked first by the caller to minimize the race window, | ||
179 | * and invalid values must be handled gracefully. | ||
180 | * | ||
181 | * ACCESS_ONCE is used so that if the caller assigns the result into a local | ||
182 | * variable and e.g. tests it for valid range before using, the compiler cannot | ||
183 | * decide to remove the variable and inline the page_private(page) multiple | ||
184 | * times, potentially observing different values in the tests and the actual | ||
185 | * use of the result. | ||
186 | */ | ||
187 | #define page_order_unsafe(page) ACCESS_ONCE(page_private(page)) | ||
188 | |||
175 | static inline bool is_cow_mapping(vm_flags_t flags) | 189 | static inline bool is_cow_mapping(vm_flags_t flags) |
176 | { | 190 | { |
177 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; | 191 | return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; |
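The page_order_unsafe() comment in the hunk above describes a calling pattern without showing it. As an editorial illustration (not part of this patch), a lockless caller such as the compaction page scanner is expected to do roughly the following, where page and low_pfn stand for the scanner's current page and pfn cursor:

	if (PageBuddy(page)) {
		/*
		 * Without zone->lock the order can be stale or garbage, so
		 * read it once into a local and range-check it before use.
		 */
		unsigned long freepage_order = page_order_unsafe(page);

		if (freepage_order > 0 && freepage_order < MAX_ORDER)
			/* Safe to skip over the rest of the free block. */
			low_pfn += (1UL << freepage_order) - 1;
	}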
diff --git a/mm/interval_tree.c b/mm/interval_tree.c index 4a5822a586e6..8da581fa9060 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c | |||
@@ -34,7 +34,7 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node, | |||
34 | struct vm_area_struct *parent; | 34 | struct vm_area_struct *parent; |
35 | unsigned long last = vma_last_pgoff(node); | 35 | unsigned long last = vma_last_pgoff(node); |
36 | 36 | ||
37 | VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev)); | 37 | VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node); |
38 | 38 | ||
39 | if (!prev->shared.linear.rb.rb_right) { | 39 | if (!prev->shared.linear.rb.rb_right) { |
40 | parent = prev; | 40 | parent = prev; |
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index fd814fd61319..cab58bb592d8 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c | |||
@@ -2,6 +2,7 @@ | |||
2 | #include <linux/mm_types.h> | 2 | #include <linux/mm_types.h> |
3 | #include <linux/mm.h> | 3 | #include <linux/mm.h> |
4 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
5 | #include "slab.h" | ||
5 | #include <linux/kmemcheck.h> | 6 | #include <linux/kmemcheck.h> |
6 | 7 | ||
7 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) | 8 | void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) |
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -2310,7 +2310,7 @@ static int __init ksm_init(void) | |||
2310 | 2310 | ||
2311 | ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); | 2311 | ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); |
2312 | if (IS_ERR(ksm_thread)) { | 2312 | if (IS_ERR(ksm_thread)) { |
2313 | printk(KERN_ERR "ksm: creating kthread failed\n"); | 2313 | pr_err("ksm: creating kthread failed\n"); |
2314 | err = PTR_ERR(ksm_thread); | 2314 | err = PTR_ERR(ksm_thread); |
2315 | goto out_free; | 2315 | goto out_free; |
2316 | } | 2316 | } |
@@ -2318,7 +2318,7 @@ static int __init ksm_init(void) | |||
2318 | #ifdef CONFIG_SYSFS | 2318 | #ifdef CONFIG_SYSFS |
2319 | err = sysfs_create_group(mm_kobj, &ksm_attr_group); | 2319 | err = sysfs_create_group(mm_kobj, &ksm_attr_group); |
2320 | if (err) { | 2320 | if (err) { |
2321 | printk(KERN_ERR "ksm: register sysfs failed\n"); | 2321 | pr_err("ksm: register sysfs failed\n"); |
2322 | kthread_stop(ksm_thread); | 2322 | kthread_stop(ksm_thread); |
2323 | goto out_free; | 2323 | goto out_free; |
2324 | } | 2324 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 28928ce9b07f..23976fd885fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -318,9 +318,6 @@ struct mem_cgroup { | |||
318 | /* OOM-Killer disable */ | 318 | /* OOM-Killer disable */ |
319 | int oom_kill_disable; | 319 | int oom_kill_disable; |
320 | 320 | ||
321 | /* set when res.limit == memsw.limit */ | ||
322 | bool memsw_is_minimum; | ||
323 | |||
324 | /* protect arrays of thresholds */ | 321 | /* protect arrays of thresholds */ |
325 | struct mutex thresholds_lock; | 322 | struct mutex thresholds_lock; |
326 | 323 | ||
@@ -484,14 +481,6 @@ enum res_type { | |||
484 | #define OOM_CONTROL (0) | 481 | #define OOM_CONTROL (0) |
485 | 482 | ||
486 | /* | 483 | /* |
487 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | ||
488 | */ | ||
489 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | ||
490 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | ||
491 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | ||
492 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | ||
493 | |||
494 | /* | ||
495 | * The memcg_create_mutex will be held whenever a new cgroup is created. | 484 | * The memcg_create_mutex will be held whenever a new cgroup is created. |
496 | * As a consequence, any change that needs to protect against new child cgroups | 485 | * As a consequence, any change that needs to protect against new child cgroups |
497 | * appearing has to hold it as well. | 486 | * appearing has to hold it as well. |
@@ -649,11 +638,13 @@ int memcg_limited_groups_array_size; | |||
649 | struct static_key memcg_kmem_enabled_key; | 638 | struct static_key memcg_kmem_enabled_key; |
650 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | 639 | EXPORT_SYMBOL(memcg_kmem_enabled_key); |
651 | 640 | ||
641 | static void memcg_free_cache_id(int id); | ||
642 | |||
652 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | 643 | static void disarm_kmem_keys(struct mem_cgroup *memcg) |
653 | { | 644 | { |
654 | if (memcg_kmem_is_active(memcg)) { | 645 | if (memcg_kmem_is_active(memcg)) { |
655 | static_key_slow_dec(&memcg_kmem_enabled_key); | 646 | static_key_slow_dec(&memcg_kmem_enabled_key); |
656 | ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); | 647 | memcg_free_cache_id(memcg->kmemcg_id); |
657 | } | 648 | } |
658 | /* | 649 | /* |
659 | * This check can't live in kmem destruction function, | 650 | * This check can't live in kmem destruction function, |
@@ -1806,42 +1797,6 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
1806 | NULL, "Memory cgroup out of memory"); | 1797 | NULL, "Memory cgroup out of memory"); |
1807 | } | 1798 | } |
1808 | 1799 | ||
1809 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | ||
1810 | gfp_t gfp_mask, | ||
1811 | unsigned long flags) | ||
1812 | { | ||
1813 | unsigned long total = 0; | ||
1814 | bool noswap = false; | ||
1815 | int loop; | ||
1816 | |||
1817 | if (flags & MEM_CGROUP_RECLAIM_NOSWAP) | ||
1818 | noswap = true; | ||
1819 | if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) | ||
1820 | noswap = true; | ||
1821 | |||
1822 | for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { | ||
1823 | if (loop) | ||
1824 | drain_all_stock_async(memcg); | ||
1825 | total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); | ||
1826 | /* | ||
1827 | * Allow limit shrinkers, which are triggered directly | ||
1828 | * by userspace, to catch signals and stop reclaim | ||
1829 | * after minimal progress, regardless of the margin. | ||
1830 | */ | ||
1831 | if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) | ||
1832 | break; | ||
1833 | if (mem_cgroup_margin(memcg)) | ||
1834 | break; | ||
1835 | /* | ||
1836 | * If nothing was reclaimed after two attempts, there | ||
1837 | * may be no reclaimable pages in this hierarchy. | ||
1838 | */ | ||
1839 | if (loop && !total) | ||
1840 | break; | ||
1841 | } | ||
1842 | return total; | ||
1843 | } | ||
1844 | |||
1845 | /** | 1800 | /** |
1846 | * test_mem_cgroup_node_reclaimable | 1801 | * test_mem_cgroup_node_reclaimable |
1847 | * @memcg: the target memcg | 1802 | * @memcg: the target memcg |
@@ -2544,8 +2499,9 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2544 | struct mem_cgroup *mem_over_limit; | 2499 | struct mem_cgroup *mem_over_limit; |
2545 | struct res_counter *fail_res; | 2500 | struct res_counter *fail_res; |
2546 | unsigned long nr_reclaimed; | 2501 | unsigned long nr_reclaimed; |
2547 | unsigned long flags = 0; | ||
2548 | unsigned long long size; | 2502 | unsigned long long size; |
2503 | bool may_swap = true; | ||
2504 | bool drained = false; | ||
2549 | int ret = 0; | 2505 | int ret = 0; |
2550 | 2506 | ||
2551 | if (mem_cgroup_is_root(memcg)) | 2507 | if (mem_cgroup_is_root(memcg)) |
@@ -2555,16 +2511,17 @@ retry: | |||
2555 | goto done; | 2511 | goto done; |
2556 | 2512 | ||
2557 | size = batch * PAGE_SIZE; | 2513 | size = batch * PAGE_SIZE; |
2558 | if (!res_counter_charge(&memcg->res, size, &fail_res)) { | 2514 | if (!do_swap_account || |
2559 | if (!do_swap_account) | 2515 | !res_counter_charge(&memcg->memsw, size, &fail_res)) { |
2560 | goto done_restock; | 2516 | if (!res_counter_charge(&memcg->res, size, &fail_res)) |
2561 | if (!res_counter_charge(&memcg->memsw, size, &fail_res)) | ||
2562 | goto done_restock; | 2517 | goto done_restock; |
2563 | res_counter_uncharge(&memcg->res, size); | 2518 | if (do_swap_account) |
2564 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | 2519 | res_counter_uncharge(&memcg->memsw, size); |
2565 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | ||
2566 | } else | ||
2567 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2520 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
2521 | } else { | ||
2522 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); | ||
2523 | may_swap = false; | ||
2524 | } | ||
2568 | 2525 | ||
2569 | if (batch > nr_pages) { | 2526 | if (batch > nr_pages) { |
2570 | batch = nr_pages; | 2527 | batch = nr_pages; |
@@ -2588,11 +2545,18 @@ retry: | |||
2588 | if (!(gfp_mask & __GFP_WAIT)) | 2545 | if (!(gfp_mask & __GFP_WAIT)) |
2589 | goto nomem; | 2546 | goto nomem; |
2590 | 2547 | ||
2591 | nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); | 2548 | nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, |
2549 | gfp_mask, may_swap); | ||
2592 | 2550 | ||
2593 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2551 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2594 | goto retry; | 2552 | goto retry; |
2595 | 2553 | ||
2554 | if (!drained) { | ||
2555 | drain_all_stock_async(mem_over_limit); | ||
2556 | drained = true; | ||
2557 | goto retry; | ||
2558 | } | ||
2559 | |||
2596 | if (gfp_mask & __GFP_NORETRY) | 2560 | if (gfp_mask & __GFP_NORETRY) |
2597 | goto nomem; | 2561 | goto nomem; |
2598 | /* | 2562 | /* |
@@ -2798,12 +2762,6 @@ static DEFINE_MUTEX(memcg_slab_mutex); | |||
2798 | 2762 | ||
2799 | static DEFINE_MUTEX(activate_kmem_mutex); | 2763 | static DEFINE_MUTEX(activate_kmem_mutex); |
2800 | 2764 | ||
2801 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | ||
2802 | { | ||
2803 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | ||
2804 | memcg_kmem_is_active(memcg); | ||
2805 | } | ||
2806 | |||
2807 | /* | 2765 | /* |
2808 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | 2766 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer |
2809 | * in the memcg_cache_params struct. | 2767 | * in the memcg_cache_params struct. |
@@ -2823,7 +2781,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) | |||
2823 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); | 2781 | struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); |
2824 | struct memcg_cache_params *params; | 2782 | struct memcg_cache_params *params; |
2825 | 2783 | ||
2826 | if (!memcg_can_account_kmem(memcg)) | 2784 | if (!memcg_kmem_is_active(memcg)) |
2827 | return -EIO; | 2785 | return -EIO; |
2828 | 2786 | ||
2829 | print_slabinfo_header(m); | 2787 | print_slabinfo_header(m); |
@@ -2906,19 +2864,44 @@ int memcg_cache_id(struct mem_cgroup *memcg) | |||
2906 | return memcg ? memcg->kmemcg_id : -1; | 2864 | return memcg ? memcg->kmemcg_id : -1; |
2907 | } | 2865 | } |
2908 | 2866 | ||
2909 | static size_t memcg_caches_array_size(int num_groups) | 2867 | static int memcg_alloc_cache_id(void) |
2910 | { | 2868 | { |
2911 | ssize_t size; | 2869 | int id, size; |
2912 | if (num_groups <= 0) | 2870 | int err; |
2913 | return 0; | 2871 | |
2872 | id = ida_simple_get(&kmem_limited_groups, | ||
2873 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
2874 | if (id < 0) | ||
2875 | return id; | ||
2876 | |||
2877 | if (id < memcg_limited_groups_array_size) | ||
2878 | return id; | ||
2879 | |||
2880 | /* | ||
2881 | * There's no space for the new id in memcg_caches arrays, | ||
2882 | * so we have to grow them. | ||
2883 | */ | ||
2914 | 2884 | ||
2915 | size = 2 * num_groups; | 2885 | size = 2 * (id + 1); |
2916 | if (size < MEMCG_CACHES_MIN_SIZE) | 2886 | if (size < MEMCG_CACHES_MIN_SIZE) |
2917 | size = MEMCG_CACHES_MIN_SIZE; | 2887 | size = MEMCG_CACHES_MIN_SIZE; |
2918 | else if (size > MEMCG_CACHES_MAX_SIZE) | 2888 | else if (size > MEMCG_CACHES_MAX_SIZE) |
2919 | size = MEMCG_CACHES_MAX_SIZE; | 2889 | size = MEMCG_CACHES_MAX_SIZE; |
2920 | 2890 | ||
2921 | return size; | 2891 | mutex_lock(&memcg_slab_mutex); |
2892 | err = memcg_update_all_caches(size); | ||
2893 | mutex_unlock(&memcg_slab_mutex); | ||
2894 | |||
2895 | if (err) { | ||
2896 | ida_simple_remove(&kmem_limited_groups, id); | ||
2897 | return err; | ||
2898 | } | ||
2899 | return id; | ||
2900 | } | ||
2901 | |||
2902 | static void memcg_free_cache_id(int id) | ||
2903 | { | ||
2904 | ida_simple_remove(&kmem_limited_groups, id); | ||
2922 | } | 2905 | } |
2923 | 2906 | ||
2924 | /* | 2907 | /* |
@@ -2928,97 +2911,7 @@ static size_t memcg_caches_array_size(int num_groups) | |||
2928 | */ | 2911 | */ |
2929 | void memcg_update_array_size(int num) | 2912 | void memcg_update_array_size(int num) |
2930 | { | 2913 | { |
2931 | if (num > memcg_limited_groups_array_size) | 2914 | memcg_limited_groups_array_size = num; |
2932 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | ||
2933 | } | ||
2934 | |||
2935 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | ||
2936 | { | ||
2937 | struct memcg_cache_params *cur_params = s->memcg_params; | ||
2938 | |||
2939 | VM_BUG_ON(!is_root_cache(s)); | ||
2940 | |||
2941 | if (num_groups > memcg_limited_groups_array_size) { | ||
2942 | int i; | ||
2943 | struct memcg_cache_params *new_params; | ||
2944 | ssize_t size = memcg_caches_array_size(num_groups); | ||
2945 | |||
2946 | size *= sizeof(void *); | ||
2947 | size += offsetof(struct memcg_cache_params, memcg_caches); | ||
2948 | |||
2949 | new_params = kzalloc(size, GFP_KERNEL); | ||
2950 | if (!new_params) | ||
2951 | return -ENOMEM; | ||
2952 | |||
2953 | new_params->is_root_cache = true; | ||
2954 | |||
2955 | /* | ||
2956 | * There is the chance it will be bigger than | ||
2957 | * memcg_limited_groups_array_size, if we failed an allocation | ||
2958 | * in a cache, in which case all caches updated before it, will | ||
2959 | * have a bigger array. | ||
2960 | * | ||
2961 | * But if that is the case, the data after | ||
2962 | * memcg_limited_groups_array_size is certainly unused | ||
2963 | */ | ||
2964 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
2965 | if (!cur_params->memcg_caches[i]) | ||
2966 | continue; | ||
2967 | new_params->memcg_caches[i] = | ||
2968 | cur_params->memcg_caches[i]; | ||
2969 | } | ||
2970 | |||
2971 | /* | ||
2972 | * Ideally, we would wait until all caches succeed, and only | ||
2973 | * then free the old one. But this is not worth the extra | ||
2974 | * pointer per-cache we'd have to have for this. | ||
2975 | * | ||
2976 | * It is not a big deal if some caches are left with a size | ||
2977 | * bigger than the others. And all updates will reset this | ||
2978 | * anyway. | ||
2979 | */ | ||
2980 | rcu_assign_pointer(s->memcg_params, new_params); | ||
2981 | if (cur_params) | ||
2982 | kfree_rcu(cur_params, rcu_head); | ||
2983 | } | ||
2984 | return 0; | ||
2985 | } | ||
2986 | |||
2987 | int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, | ||
2988 | struct kmem_cache *root_cache) | ||
2989 | { | ||
2990 | size_t size; | ||
2991 | |||
2992 | if (!memcg_kmem_enabled()) | ||
2993 | return 0; | ||
2994 | |||
2995 | if (!memcg) { | ||
2996 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
2997 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
2998 | } else | ||
2999 | size = sizeof(struct memcg_cache_params); | ||
3000 | |||
3001 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
3002 | if (!s->memcg_params) | ||
3003 | return -ENOMEM; | ||
3004 | |||
3005 | if (memcg) { | ||
3006 | s->memcg_params->memcg = memcg; | ||
3007 | s->memcg_params->root_cache = root_cache; | ||
3008 | css_get(&memcg->css); | ||
3009 | } else | ||
3010 | s->memcg_params->is_root_cache = true; | ||
3011 | |||
3012 | return 0; | ||
3013 | } | ||
3014 | |||
3015 | void memcg_free_cache_params(struct kmem_cache *s) | ||
3016 | { | ||
3017 | if (!s->memcg_params) | ||
3018 | return; | ||
3019 | if (!s->memcg_params->is_root_cache) | ||
3020 | css_put(&s->memcg_params->memcg->css); | ||
3021 | kfree(s->memcg_params); | ||
3022 | } | 2915 | } |
3023 | 2916 | ||
3024 | static void memcg_register_cache(struct mem_cgroup *memcg, | 2917 | static void memcg_register_cache(struct mem_cgroup *memcg, |
@@ -3051,6 +2944,7 @@ static void memcg_register_cache(struct mem_cgroup *memcg, | |||
3051 | if (!cachep) | 2944 | if (!cachep) |
3052 | return; | 2945 | return; |
3053 | 2946 | ||
2947 | css_get(&memcg->css); | ||
3054 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | 2948 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); |
3055 | 2949 | ||
3056 | /* | 2950 | /* |
@@ -3084,6 +2978,9 @@ static void memcg_unregister_cache(struct kmem_cache *cachep) | |||
3084 | list_del(&cachep->memcg_params->list); | 2978 | list_del(&cachep->memcg_params->list); |
3085 | 2979 | ||
3086 | kmem_cache_destroy(cachep); | 2980 | kmem_cache_destroy(cachep); |
2981 | |||
2982 | /* drop the reference taken in memcg_register_cache */ | ||
2983 | css_put(&memcg->css); | ||
3087 | } | 2984 | } |
3088 | 2985 | ||
3089 | /* | 2986 | /* |
@@ -3261,7 +3158,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | |||
3261 | rcu_read_lock(); | 3158 | rcu_read_lock(); |
3262 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | 3159 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); |
3263 | 3160 | ||
3264 | if (!memcg_can_account_kmem(memcg)) | 3161 | if (!memcg_kmem_is_active(memcg)) |
3265 | goto out; | 3162 | goto out; |
3266 | 3163 | ||
3267 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); | 3164 | memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); |
@@ -3346,7 +3243,7 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | |||
3346 | 3243 | ||
3347 | memcg = get_mem_cgroup_from_mm(current->mm); | 3244 | memcg = get_mem_cgroup_from_mm(current->mm); |
3348 | 3245 | ||
3349 | if (!memcg_can_account_kmem(memcg)) { | 3246 | if (!memcg_kmem_is_active(memcg)) { |
3350 | css_put(&memcg->css); | 3247 | css_put(&memcg->css); |
3351 | return true; | 3248 | return true; |
3352 | } | 3249 | } |
@@ -3688,7 +3585,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3688 | unsigned long long val) | 3585 | unsigned long long val) |
3689 | { | 3586 | { |
3690 | int retry_count; | 3587 | int retry_count; |
3691 | u64 memswlimit, memlimit; | ||
3692 | int ret = 0; | 3588 | int ret = 0; |
3693 | int children = mem_cgroup_count_children(memcg); | 3589 | int children = mem_cgroup_count_children(memcg); |
3694 | u64 curusage, oldusage; | 3590 | u64 curusage, oldusage; |
@@ -3715,31 +3611,23 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3715 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3611 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3716 | */ | 3612 | */ |
3717 | mutex_lock(&set_limit_mutex); | 3613 | mutex_lock(&set_limit_mutex); |
3718 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3614 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) { |
3719 | if (memswlimit < val) { | ||
3720 | ret = -EINVAL; | 3615 | ret = -EINVAL; |
3721 | mutex_unlock(&set_limit_mutex); | 3616 | mutex_unlock(&set_limit_mutex); |
3722 | break; | 3617 | break; |
3723 | } | 3618 | } |
3724 | 3619 | ||
3725 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3620 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) < val) |
3726 | if (memlimit < val) | ||
3727 | enlarge = 1; | 3621 | enlarge = 1; |
3728 | 3622 | ||
3729 | ret = res_counter_set_limit(&memcg->res, val); | 3623 | ret = res_counter_set_limit(&memcg->res, val); |
3730 | if (!ret) { | ||
3731 | if (memswlimit == val) | ||
3732 | memcg->memsw_is_minimum = true; | ||
3733 | else | ||
3734 | memcg->memsw_is_minimum = false; | ||
3735 | } | ||
3736 | mutex_unlock(&set_limit_mutex); | 3624 | mutex_unlock(&set_limit_mutex); |
3737 | 3625 | ||
3738 | if (!ret) | 3626 | if (!ret) |
3739 | break; | 3627 | break; |
3740 | 3628 | ||
3741 | mem_cgroup_reclaim(memcg, GFP_KERNEL, | 3629 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); |
3742 | MEM_CGROUP_RECLAIM_SHRINK); | 3630 | |
3743 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 3631 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
3744 | /* Usage is reduced ? */ | 3632 | /* Usage is reduced ? */ |
3745 | if (curusage >= oldusage) | 3633 | if (curusage >= oldusage) |
@@ -3757,7 +3645,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3757 | unsigned long long val) | 3645 | unsigned long long val) |
3758 | { | 3646 | { |
3759 | int retry_count; | 3647 | int retry_count; |
3760 | u64 memlimit, memswlimit, oldusage, curusage; | 3648 | u64 oldusage, curusage; |
3761 | int children = mem_cgroup_count_children(memcg); | 3649 | int children = mem_cgroup_count_children(memcg); |
3762 | int ret = -EBUSY; | 3650 | int ret = -EBUSY; |
3763 | int enlarge = 0; | 3651 | int enlarge = 0; |
@@ -3776,30 +3664,21 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3776 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. | 3664 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3777 | */ | 3665 | */ |
3778 | mutex_lock(&set_limit_mutex); | 3666 | mutex_lock(&set_limit_mutex); |
3779 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3667 | if (res_counter_read_u64(&memcg->res, RES_LIMIT) > val) { |
3780 | if (memlimit > val) { | ||
3781 | ret = -EINVAL; | 3668 | ret = -EINVAL; |
3782 | mutex_unlock(&set_limit_mutex); | 3669 | mutex_unlock(&set_limit_mutex); |
3783 | break; | 3670 | break; |
3784 | } | 3671 | } |
3785 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3672 | if (res_counter_read_u64(&memcg->memsw, RES_LIMIT) < val) |
3786 | if (memswlimit < val) | ||
3787 | enlarge = 1; | 3673 | enlarge = 1; |
3788 | ret = res_counter_set_limit(&memcg->memsw, val); | 3674 | ret = res_counter_set_limit(&memcg->memsw, val); |
3789 | if (!ret) { | ||
3790 | if (memlimit == val) | ||
3791 | memcg->memsw_is_minimum = true; | ||
3792 | else | ||
3793 | memcg->memsw_is_minimum = false; | ||
3794 | } | ||
3795 | mutex_unlock(&set_limit_mutex); | 3675 | mutex_unlock(&set_limit_mutex); |
3796 | 3676 | ||
3797 | if (!ret) | 3677 | if (!ret) |
3798 | break; | 3678 | break; |
3799 | 3679 | ||
3800 | mem_cgroup_reclaim(memcg, GFP_KERNEL, | 3680 | try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); |
3801 | MEM_CGROUP_RECLAIM_NOSWAP | | 3681 | |
3802 | MEM_CGROUP_RECLAIM_SHRINK); | ||
3803 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 3682 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
3804 | /* Usage is reduced ? */ | 3683 | /* Usage is reduced ? */ |
3805 | if (curusage >= oldusage) | 3684 | if (curusage >= oldusage) |
@@ -4048,8 +3927,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) | |||
4048 | if (signal_pending(current)) | 3927 | if (signal_pending(current)) |
4049 | return -EINTR; | 3928 | return -EINTR; |
4050 | 3929 | ||
4051 | progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, | 3930 | progress = try_to_free_mem_cgroup_pages(memcg, 1, |
4052 | false); | 3931 | GFP_KERNEL, true); |
4053 | if (!progress) { | 3932 | if (!progress) { |
4054 | nr_retries--; | 3933 | nr_retries--; |
4055 | /* maybe some writeback is necessary */ | 3934 | /* maybe some writeback is necessary */ |
@@ -4214,23 +4093,12 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
4214 | if (err) | 4093 | if (err) |
4215 | goto out; | 4094 | goto out; |
4216 | 4095 | ||
4217 | memcg_id = ida_simple_get(&kmem_limited_groups, | 4096 | memcg_id = memcg_alloc_cache_id(); |
4218 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
4219 | if (memcg_id < 0) { | 4097 | if (memcg_id < 0) { |
4220 | err = memcg_id; | 4098 | err = memcg_id; |
4221 | goto out; | 4099 | goto out; |
4222 | } | 4100 | } |
4223 | 4101 | ||
4224 | /* | ||
4225 | * Make sure we have enough space for this cgroup in each root cache's | ||
4226 | * memcg_params. | ||
4227 | */ | ||
4228 | mutex_lock(&memcg_slab_mutex); | ||
4229 | err = memcg_update_all_caches(memcg_id + 1); | ||
4230 | mutex_unlock(&memcg_slab_mutex); | ||
4231 | if (err) | ||
4232 | goto out_rmid; | ||
4233 | |||
4234 | memcg->kmemcg_id = memcg_id; | 4102 | memcg->kmemcg_id = memcg_id; |
4235 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | 4103 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); |
4236 | 4104 | ||
@@ -4251,10 +4119,6 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, | |||
4251 | out: | 4119 | out: |
4252 | memcg_resume_kmem_account(); | 4120 | memcg_resume_kmem_account(); |
4253 | return err; | 4121 | return err; |
4254 | |||
4255 | out_rmid: | ||
4256 | ida_simple_remove(&kmem_limited_groups, memcg_id); | ||
4257 | goto out; | ||
4258 | } | 4122 | } |
4259 | 4123 | ||
4260 | static int memcg_activate_kmem(struct mem_cgroup *memcg, | 4124 | static int memcg_activate_kmem(struct mem_cgroup *memcg, |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2ff8c2325e96..29d8693d0c61 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -1307,7 +1307,7 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) | |||
1307 | /* | 1307 | /* |
1308 | * Confirm all pages in a range [start, end) belong to the same zone. | 1308 | * Confirm all pages in a range [start, end) belong to the same zone. |
1309 | */ | 1309 | */ |
1310 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | 1310 | int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) |
1311 | { | 1311 | { |
1312 | unsigned long pfn; | 1312 | unsigned long pfn; |
1313 | struct zone *zone = NULL; | 1313 | struct zone *zone = NULL; |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8f5330d74f47..e58725aff7e9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -123,25 +123,23 @@ static struct mempolicy default_policy = { | |||
123 | 123 | ||
124 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; | 124 | static struct mempolicy preferred_node_policy[MAX_NUMNODES]; |
125 | 125 | ||
126 | static struct mempolicy *get_task_policy(struct task_struct *p) | 126 | struct mempolicy *get_task_policy(struct task_struct *p) |
127 | { | 127 | { |
128 | struct mempolicy *pol = p->mempolicy; | 128 | struct mempolicy *pol = p->mempolicy; |
129 | int node; | ||
129 | 130 | ||
130 | if (!pol) { | 131 | if (pol) |
131 | int node = numa_node_id(); | 132 | return pol; |
132 | 133 | ||
133 | if (node != NUMA_NO_NODE) { | 134 | node = numa_node_id(); |
134 | pol = &preferred_node_policy[node]; | 135 | if (node != NUMA_NO_NODE) { |
135 | /* | 136 | pol = &preferred_node_policy[node]; |
136 | * preferred_node_policy is not initialised early in | 137 | /* preferred_node_policy is not initialised early in boot */ |
137 | * boot | 138 | if (pol->mode) |
138 | */ | 139 | return pol; |
139 | if (!pol->mode) | ||
140 | pol = NULL; | ||
141 | } | ||
142 | } | 140 | } |
143 | 141 | ||
144 | return pol; | 142 | return &default_policy; |
145 | } | 143 | } |
146 | 144 | ||
147 | static const struct mempolicy_operations { | 145 | static const struct mempolicy_operations { |
@@ -683,7 +681,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
683 | } | 681 | } |
684 | 682 | ||
685 | if (flags & MPOL_MF_LAZY) { | 683 | if (flags & MPOL_MF_LAZY) { |
686 | change_prot_numa(vma, start, endvma); | 684 | /* Similar to task_numa_work, skip inaccessible VMAs */ |
685 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
686 | change_prot_numa(vma, start, endvma); | ||
687 | goto next; | 687 | goto next; |
688 | } | 688 | } |
689 | 689 | ||
@@ -804,7 +804,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
804 | nodemask_t *nodes) | 804 | nodemask_t *nodes) |
805 | { | 805 | { |
806 | struct mempolicy *new, *old; | 806 | struct mempolicy *new, *old; |
807 | struct mm_struct *mm = current->mm; | ||
808 | NODEMASK_SCRATCH(scratch); | 807 | NODEMASK_SCRATCH(scratch); |
809 | int ret; | 808 | int ret; |
810 | 809 | ||
@@ -816,20 +815,11 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
816 | ret = PTR_ERR(new); | 815 | ret = PTR_ERR(new); |
817 | goto out; | 816 | goto out; |
818 | } | 817 | } |
819 | /* | 818 | |
820 | * prevent changing our mempolicy while show_numa_maps() | ||
821 | * is using it. | ||
822 | * Note: do_set_mempolicy() can be called at init time | ||
823 | * with no 'mm'. | ||
824 | */ | ||
825 | if (mm) | ||
826 | down_write(&mm->mmap_sem); | ||
827 | task_lock(current); | 819 | task_lock(current); |
828 | ret = mpol_set_nodemask(new, nodes, scratch); | 820 | ret = mpol_set_nodemask(new, nodes, scratch); |
829 | if (ret) { | 821 | if (ret) { |
830 | task_unlock(current); | 822 | task_unlock(current); |
831 | if (mm) | ||
832 | up_write(&mm->mmap_sem); | ||
833 | mpol_put(new); | 823 | mpol_put(new); |
834 | goto out; | 824 | goto out; |
835 | } | 825 | } |
@@ -839,9 +829,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, | |||
839 | nodes_weight(new->v.nodes)) | 829 | nodes_weight(new->v.nodes)) |
840 | current->il_next = first_node(new->v.nodes); | 830 | current->il_next = first_node(new->v.nodes); |
841 | task_unlock(current); | 831 | task_unlock(current); |
842 | if (mm) | ||
843 | up_write(&mm->mmap_sem); | ||
844 | |||
845 | mpol_put(old); | 832 | mpol_put(old); |
846 | ret = 0; | 833 | ret = 0; |
847 | out: | 834 | out: |
@@ -1605,32 +1592,14 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, | |||
1605 | 1592 | ||
1606 | #endif | 1593 | #endif |
1607 | 1594 | ||
1608 | /* | 1595 | struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, |
1609 | * get_vma_policy(@task, @vma, @addr) | 1596 | unsigned long addr) |
1610 | * @task: task for fallback if vma policy == default | ||
1611 | * @vma: virtual memory area whose policy is sought | ||
1612 | * @addr: address in @vma for shared policy lookup | ||
1613 | * | ||
1614 | * Returns effective policy for a VMA at specified address. | ||
1615 | * Falls back to @task or system default policy, as necessary. | ||
1616 | * Current or other task's task mempolicy and non-shared vma policies must be | ||
1617 | * protected by task_lock(task) by the caller. | ||
1618 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference | ||
1619 | * count--added by the get_policy() vm_op, as appropriate--to protect against | ||
1620 | * freeing by another task. It is the caller's responsibility to free the | ||
1621 | * extra reference for shared policies. | ||
1622 | */ | ||
1623 | struct mempolicy *get_vma_policy(struct task_struct *task, | ||
1624 | struct vm_area_struct *vma, unsigned long addr) | ||
1625 | { | 1597 | { |
1626 | struct mempolicy *pol = get_task_policy(task); | 1598 | struct mempolicy *pol = NULL; |
1627 | 1599 | ||
1628 | if (vma) { | 1600 | if (vma) { |
1629 | if (vma->vm_ops && vma->vm_ops->get_policy) { | 1601 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
1630 | struct mempolicy *vpol = vma->vm_ops->get_policy(vma, | 1602 | pol = vma->vm_ops->get_policy(vma, addr); |
1631 | addr); | ||
1632 | if (vpol) | ||
1633 | pol = vpol; | ||
1634 | } else if (vma->vm_policy) { | 1603 | } else if (vma->vm_policy) { |
1635 | pol = vma->vm_policy; | 1604 | pol = vma->vm_policy; |
1636 | 1605 | ||
@@ -1644,31 +1613,51 @@ struct mempolicy *get_vma_policy(struct task_struct *task, | |||
1644 | mpol_get(pol); | 1613 | mpol_get(pol); |
1645 | } | 1614 | } |
1646 | } | 1615 | } |
1616 | |||
1617 | return pol; | ||
1618 | } | ||
1619 | |||
1620 | /* | ||
1621 | * get_vma_policy(@vma, @addr) | ||
1622 | * @vma: virtual memory area whose policy is sought | ||
1623 | * @addr: address in @vma for shared policy lookup | ||
1624 | * | ||
1625 | * Returns effective policy for a VMA at specified address. | ||
1626 | * Falls back to current->mempolicy or system default policy, as necessary. | ||
1627 | * Shared policies [those marked as MPOL_F_SHARED] require an extra reference | ||
1628 | * count--added by the get_policy() vm_op, as appropriate--to protect against | ||
1629 | * freeing by another task. It is the caller's responsibility to free the | ||
1630 | * extra reference for shared policies. | ||
1631 | */ | ||
1632 | static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, | ||
1633 | unsigned long addr) | ||
1634 | { | ||
1635 | struct mempolicy *pol = __get_vma_policy(vma, addr); | ||
1636 | |||
1647 | if (!pol) | 1637 | if (!pol) |
1648 | pol = &default_policy; | 1638 | pol = get_task_policy(current); |
1639 | |||
1649 | return pol; | 1640 | return pol; |
1650 | } | 1641 | } |
1651 | 1642 | ||
1652 | bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) | 1643 | bool vma_policy_mof(struct vm_area_struct *vma) |
1653 | { | 1644 | { |
1654 | struct mempolicy *pol = get_task_policy(task); | 1645 | struct mempolicy *pol; |
1655 | if (vma) { | ||
1656 | if (vma->vm_ops && vma->vm_ops->get_policy) { | ||
1657 | bool ret = false; | ||
1658 | 1646 | ||
1659 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); | 1647 | if (vma->vm_ops && vma->vm_ops->get_policy) { |
1660 | if (pol && (pol->flags & MPOL_F_MOF)) | 1648 | bool ret = false; |
1661 | ret = true; | ||
1662 | mpol_cond_put(pol); | ||
1663 | 1649 | ||
1664 | return ret; | 1650 | pol = vma->vm_ops->get_policy(vma, vma->vm_start); |
1665 | } else if (vma->vm_policy) { | 1651 | if (pol && (pol->flags & MPOL_F_MOF)) |
1666 | pol = vma->vm_policy; | 1652 | ret = true; |
1667 | } | 1653 | mpol_cond_put(pol); |
1654 | |||
1655 | return ret; | ||
1668 | } | 1656 | } |
1669 | 1657 | ||
1658 | pol = vma->vm_policy; | ||
1670 | if (!pol) | 1659 | if (!pol) |
1671 | return default_policy.flags & MPOL_F_MOF; | 1660 | pol = get_task_policy(current); |
1672 | 1661 | ||
1673 | return pol->flags & MPOL_F_MOF; | 1662 | return pol->flags & MPOL_F_MOF; |
1674 | } | 1663 | } |
@@ -1874,7 +1863,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, | |||
1874 | { | 1863 | { |
1875 | struct zonelist *zl; | 1864 | struct zonelist *zl; |
1876 | 1865 | ||
1877 | *mpol = get_vma_policy(current, vma, addr); | 1866 | *mpol = get_vma_policy(vma, addr); |
1878 | *nodemask = NULL; /* assume !MPOL_BIND */ | 1867 | *nodemask = NULL; /* assume !MPOL_BIND */ |
1879 | 1868 | ||
1880 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { | 1869 | if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { |
@@ -2029,7 +2018,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, | |||
2029 | unsigned int cpuset_mems_cookie; | 2018 | unsigned int cpuset_mems_cookie; |
2030 | 2019 | ||
2031 | retry_cpuset: | 2020 | retry_cpuset: |
2032 | pol = get_vma_policy(current, vma, addr); | 2021 | pol = get_vma_policy(vma, addr); |
2033 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2022 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2034 | 2023 | ||
2035 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { | 2024 | if (unlikely(pol->mode == MPOL_INTERLEAVE)) { |
@@ -2046,8 +2035,7 @@ retry_cpuset: | |||
2046 | page = __alloc_pages_nodemask(gfp, order, | 2035 | page = __alloc_pages_nodemask(gfp, order, |
2047 | policy_zonelist(gfp, pol, node), | 2036 | policy_zonelist(gfp, pol, node), |
2048 | policy_nodemask(gfp, pol)); | 2037 | policy_nodemask(gfp, pol)); |
2049 | if (unlikely(mpol_needs_cond_ref(pol))) | 2038 | mpol_cond_put(pol); |
2050 | __mpol_put(pol); | ||
2051 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) | 2039 | if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) |
2052 | goto retry_cpuset; | 2040 | goto retry_cpuset; |
2053 | return page; | 2041 | return page; |
@@ -2074,12 +2062,12 @@ retry_cpuset: | |||
2074 | */ | 2062 | */ |
2075 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) | 2063 | struct page *alloc_pages_current(gfp_t gfp, unsigned order) |
2076 | { | 2064 | { |
2077 | struct mempolicy *pol = get_task_policy(current); | 2065 | struct mempolicy *pol = &default_policy; |
2078 | struct page *page; | 2066 | struct page *page; |
2079 | unsigned int cpuset_mems_cookie; | 2067 | unsigned int cpuset_mems_cookie; |
2080 | 2068 | ||
2081 | if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) | 2069 | if (!in_interrupt() && !(gfp & __GFP_THISNODE)) |
2082 | pol = &default_policy; | 2070 | pol = get_task_policy(current); |
2083 | 2071 | ||
2084 | retry_cpuset: | 2072 | retry_cpuset: |
2085 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2073 | cpuset_mems_cookie = read_mems_allowed_begin(); |
@@ -2296,7 +2284,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long | |||
2296 | 2284 | ||
2297 | BUG_ON(!vma); | 2285 | BUG_ON(!vma); |
2298 | 2286 | ||
2299 | pol = get_vma_policy(current, vma, addr); | 2287 | pol = get_vma_policy(vma, addr); |
2300 | if (!(pol->flags & MPOL_F_MOF)) | 2288 | if (!(pol->flags & MPOL_F_MOF)) |
2301 | goto out; | 2289 | goto out; |
2302 | 2290 | ||
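The mempolicy hunks above drop the task argument from get_vma_policy() and make the fallback the calling task's policy instead of default_policy. A minimal sketch of the helper those call sites imply; only the (vma, addr) signature and the get_task_policy(current) fallback are visible in this diff, and the real helper also manages the conditional policy reference that callers release with mpol_cond_put(), which this sketch omits:

	static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
						unsigned long addr)
	{
		struct mempolicy *pol = NULL;

		if (vma) {
			if (vma->vm_ops && vma->vm_ops->get_policy)
				pol = vma->vm_ops->get_policy(vma, addr);
			else
				pol = vma->vm_policy;
		}
		if (!pol)
			pol = get_task_policy(current);	/* shared task/system fallback */

		return pol;
	}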
diff --git a/mm/migrate.c b/mm/migrate.c index 2740360cd216..01439953abf5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -876,7 +876,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
876 | } | 876 | } |
877 | } | 877 | } |
878 | 878 | ||
879 | if (unlikely(balloon_page_movable(page))) { | 879 | if (unlikely(isolated_balloon_page(page))) { |
880 | /* | 880 | /* |
881 | * A ballooned page does not need any special attention from | 881 | * A ballooned page does not need any special attention from |
882 | * physical to virtual reverse mapping procedures. | 882 | * physical to virtual reverse mapping procedures. |
@@ -955,17 +955,6 @@ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, | |||
955 | 955 | ||
956 | rc = __unmap_and_move(page, newpage, force, mode); | 956 | rc = __unmap_and_move(page, newpage, force, mode); |
957 | 957 | ||
958 | if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { | ||
959 | /* | ||
960 | * A ballooned page has been migrated already. | ||
961 | * Now, it's the time to wrap-up counters, | ||
962 | * handle the page back to Buddy and return. | ||
963 | */ | ||
964 | dec_zone_page_state(page, NR_ISOLATED_ANON + | ||
965 | page_is_file_cache(page)); | ||
966 | balloon_page_free(page); | ||
967 | return MIGRATEPAGE_SUCCESS; | ||
968 | } | ||
969 | out: | 958 | out: |
970 | if (rc != -EAGAIN) { | 959 | if (rc != -EAGAIN) { |
971 | /* | 960 | /* |
@@ -988,6 +977,9 @@ out: | |||
988 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { | 977 | if (rc != MIGRATEPAGE_SUCCESS && put_new_page) { |
989 | ClearPageSwapBacked(newpage); | 978 | ClearPageSwapBacked(newpage); |
990 | put_new_page(newpage, private); | 979 | put_new_page(newpage, private); |
980 | } else if (unlikely(__is_movable_balloon_page(newpage))) { | ||
981 | /* drop our reference, page already in the balloon */ | ||
982 | put_page(newpage); | ||
991 | } else | 983 | } else |
992 | putback_lru_page(newpage); | 984 | putback_lru_page(newpage); |
993 | 985 | ||
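Read linearly, the destination-page cleanup in unmap_and_move() now has three outcomes. A plain restatement of the hunk above, with the conditions spelled out (helper names as they appear in the diff):

	if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
		/* constructor-supplied release routine takes the page back */
		ClearPageSwapBacked(newpage);
		put_new_page(newpage, private);
	} else if (unlikely(__is_movable_balloon_page(newpage))) {
		/* the balloon already owns the page again; drop our reference */
		put_page(newpage);
	} else {
		/* normal case: return the unused destination page to the LRU */
		putback_lru_page(newpage);
	}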
diff --git a/mm/mlock.c b/mm/mlock.c index ce84cb0b83ef..03aa8512723b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -233,9 +233,9 @@ long __mlock_vma_pages_range(struct vm_area_struct *vma, | |||
233 | 233 | ||
234 | VM_BUG_ON(start & ~PAGE_MASK); | 234 | VM_BUG_ON(start & ~PAGE_MASK); |
235 | VM_BUG_ON(end & ~PAGE_MASK); | 235 | VM_BUG_ON(end & ~PAGE_MASK); |
236 | VM_BUG_ON(start < vma->vm_start); | 236 | VM_BUG_ON_VMA(start < vma->vm_start, vma); |
237 | VM_BUG_ON(end > vma->vm_end); | 237 | VM_BUG_ON_VMA(end > vma->vm_end, vma); |
238 | VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); | 238 | VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm); |
239 | 239 | ||
240 | gup_flags = FOLL_TOUCH | FOLL_MLOCK; | 240 | gup_flags = FOLL_TOUCH | FOLL_MLOCK; |
241 | /* | 241 | /* |
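VM_BUG_ON_VMA() and VM_BUG_ON_MM() replace bare VM_BUG_ON() so the offending vma or mm is dumped before the BUG fires. A sketch of how such macros are commonly defined; the dump_vma()/dump_mm() helpers and the !CONFIG_DEBUG_VM fallback are assumptions, not shown in this diff:

	#ifdef CONFIG_DEBUG_VM
	#define VM_BUG_ON_VMA(cond, vma)				\
		do {							\
			if (unlikely(cond)) {				\
				dump_vma(vma);	/* print the culprit */	\
				VM_BUG_ON(cond);			\
			}						\
		} while (0)
	#define VM_BUG_ON_MM(cond, mm)					\
		do {							\
			if (unlikely(cond)) {				\
				dump_mm(mm);				\
				VM_BUG_ON(cond);			\
			}						\
		} while (0)
	#else
	#define VM_BUG_ON_VMA(cond, vma)	BUILD_BUG_ON_INVALID(cond)
	#define VM_BUG_ON_MM(cond, mm)		BUILD_BUG_ON_INVALID(cond)
	#endif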
@@ -70,7 +70,7 @@ static void unmap_region(struct mm_struct *mm, | |||
70 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 70 | * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
71 | * w: (no) no w: (no) no w: (yes) yes w: (no) no | 71 | * w: (no) no w: (no) no w: (yes) yes w: (no) no |
72 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 72 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
73 | * | 73 | * |
74 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes | 74 | * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes |
75 | * w: (no) no w: (no) no w: (copy) copy w: (no) no | 75 | * w: (no) no w: (no) no w: (copy) copy w: (no) no |
76 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes | 76 | * x: (no) no x: (no) yes x: (no) yes x: (yes) yes |
@@ -268,7 +268,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len); | |||
268 | 268 | ||
269 | SYSCALL_DEFINE1(brk, unsigned long, brk) | 269 | SYSCALL_DEFINE1(brk, unsigned long, brk) |
270 | { | 270 | { |
271 | unsigned long rlim, retval; | 271 | unsigned long retval; |
272 | unsigned long newbrk, oldbrk; | 272 | unsigned long newbrk, oldbrk; |
273 | struct mm_struct *mm = current->mm; | 273 | struct mm_struct *mm = current->mm; |
274 | unsigned long min_brk; | 274 | unsigned long min_brk; |
@@ -298,9 +298,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
298 | * segment grow beyond its set limit in the case where the limit is | 298 | * segment grow beyond its set limit in the case where the limit is |
299 | * not page aligned -Ram Gupta | 299 | * not page aligned -Ram Gupta |
300 | */ | 300 | */ |
301 | rlim = rlimit(RLIMIT_DATA); | 301 | if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk, |
302 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 302 | mm->end_data, mm->start_data)) |
303 | (mm->end_data - mm->start_data) > rlim) | ||
304 | goto out; | 303 | goto out; |
305 | 304 | ||
306 | newbrk = PAGE_ALIGN(brk); | 305 | newbrk = PAGE_ALIGN(brk); |
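check_data_rlimit() packages the RLIMIT_DATA test that sys_brk() used to open-code. A sketch that is arithmetically equivalent to the check removed above; the parameter names and the exact error value are assumptions, but the caller only tests for a nonzero return:

	static inline int check_data_rlimit(unsigned long rlim,
					    unsigned long new_brk,
					    unsigned long start_brk,
					    unsigned long end_data,
					    unsigned long start_data)
	{
		if (rlim < RLIM_INFINITY &&
		    (new_brk - start_brk) + (end_data - start_data) > rlim)
			return -ENOSPC;	/* over the data segment limit */
		return 0;
	}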
@@ -369,16 +368,18 @@ static int browse_rb(struct rb_root *root) | |||
369 | struct vm_area_struct *vma; | 368 | struct vm_area_struct *vma; |
370 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 369 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
371 | if (vma->vm_start < prev) { | 370 | if (vma->vm_start < prev) { |
372 | pr_emerg("vm_start %lx prev %lx\n", vma->vm_start, prev); | 371 | pr_emerg("vm_start %lx < prev %lx\n", |
372 | vma->vm_start, prev); | ||
373 | bug = 1; | 373 | bug = 1; |
374 | } | 374 | } |
375 | if (vma->vm_start < pend) { | 375 | if (vma->vm_start < pend) { |
376 | pr_emerg("vm_start %lx pend %lx\n", vma->vm_start, pend); | 376 | pr_emerg("vm_start %lx < pend %lx\n", |
377 | vma->vm_start, pend); | ||
377 | bug = 1; | 378 | bug = 1; |
378 | } | 379 | } |
379 | if (vma->vm_start > vma->vm_end) { | 380 | if (vma->vm_start > vma->vm_end) { |
380 | pr_emerg("vm_end %lx < vm_start %lx\n", | 381 | pr_emerg("vm_start %lx > vm_end %lx\n", |
381 | vma->vm_end, vma->vm_start); | 382 | vma->vm_start, vma->vm_end); |
382 | bug = 1; | 383 | bug = 1; |
383 | } | 384 | } |
384 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { | 385 | if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { |
@@ -409,8 +410,9 @@ static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) | |||
409 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { | 410 | for (nd = rb_first(root); nd; nd = rb_next(nd)) { |
410 | struct vm_area_struct *vma; | 411 | struct vm_area_struct *vma; |
411 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); | 412 | vma = rb_entry(nd, struct vm_area_struct, vm_rb); |
412 | BUG_ON(vma != ignore && | 413 | VM_BUG_ON_VMA(vma != ignore && |
413 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); | 414 | vma->rb_subtree_gap != vma_compute_subtree_gap(vma), |
415 | vma); | ||
414 | } | 416 | } |
415 | } | 417 | } |
416 | 418 | ||
@@ -420,8 +422,10 @@ static void validate_mm(struct mm_struct *mm) | |||
420 | int i = 0; | 422 | int i = 0; |
421 | unsigned long highest_address = 0; | 423 | unsigned long highest_address = 0; |
422 | struct vm_area_struct *vma = mm->mmap; | 424 | struct vm_area_struct *vma = mm->mmap; |
425 | |||
423 | while (vma) { | 426 | while (vma) { |
424 | struct anon_vma_chain *avc; | 427 | struct anon_vma_chain *avc; |
428 | |||
425 | vma_lock_anon_vma(vma); | 429 | vma_lock_anon_vma(vma); |
426 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) | 430 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
427 | anon_vma_interval_tree_verify(avc); | 431 | anon_vma_interval_tree_verify(avc); |
@@ -436,15 +440,16 @@ static void validate_mm(struct mm_struct *mm) | |||
436 | } | 440 | } |
437 | if (highest_address != mm->highest_vm_end) { | 441 | if (highest_address != mm->highest_vm_end) { |
438 | pr_emerg("mm->highest_vm_end %lx, found %lx\n", | 442 | pr_emerg("mm->highest_vm_end %lx, found %lx\n", |
439 | mm->highest_vm_end, highest_address); | 443 | mm->highest_vm_end, highest_address); |
440 | bug = 1; | 444 | bug = 1; |
441 | } | 445 | } |
442 | i = browse_rb(&mm->mm_rb); | 446 | i = browse_rb(&mm->mm_rb); |
443 | if (i != mm->map_count) { | 447 | if (i != mm->map_count) { |
444 | pr_emerg("map_count %d rb %d\n", mm->map_count, i); | 448 | if (i != -1) |
449 | pr_emerg("map_count %d rb %d\n", mm->map_count, i); | ||
445 | bug = 1; | 450 | bug = 1; |
446 | } | 451 | } |
447 | BUG_ON(bug); | 452 | VM_BUG_ON_MM(bug, mm); |
448 | } | 453 | } |
449 | #else | 454 | #else |
450 | #define validate_mm_rb(root, ignore) do { } while (0) | 455 | #define validate_mm_rb(root, ignore) do { } while (0) |
@@ -741,7 +746,7 @@ again: remove_next = 1 + (end > next->vm_end); | |||
741 | * split_vma inserting another: so it must be | 746 | * split_vma inserting another: so it must be |
742 | * mprotect case 4 shifting the boundary down. | 747 | * mprotect case 4 shifting the boundary down. |
743 | */ | 748 | */ |
744 | adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); | 749 | adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT); |
745 | exporter = vma; | 750 | exporter = vma; |
746 | importer = next; | 751 | importer = next; |
747 | } | 752 | } |
@@ -787,8 +792,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
787 | if (!anon_vma && adjust_next) | 792 | if (!anon_vma && adjust_next) |
788 | anon_vma = next->anon_vma; | 793 | anon_vma = next->anon_vma; |
789 | if (anon_vma) { | 794 | if (anon_vma) { |
790 | VM_BUG_ON(adjust_next && next->anon_vma && | 795 | VM_BUG_ON_VMA(adjust_next && next->anon_vma && |
791 | anon_vma != next->anon_vma); | 796 | anon_vma != next->anon_vma, next); |
792 | anon_vma_lock_write(anon_vma); | 797 | anon_vma_lock_write(anon_vma); |
793 | anon_vma_interval_tree_pre_update_vma(vma); | 798 | anon_vma_interval_tree_pre_update_vma(vma); |
794 | if (adjust_next) | 799 | if (adjust_next) |
@@ -1010,7 +1015,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, | |||
1010 | struct vm_area_struct *vma_merge(struct mm_struct *mm, | 1015 | struct vm_area_struct *vma_merge(struct mm_struct *mm, |
1011 | struct vm_area_struct *prev, unsigned long addr, | 1016 | struct vm_area_struct *prev, unsigned long addr, |
1012 | unsigned long end, unsigned long vm_flags, | 1017 | unsigned long end, unsigned long vm_flags, |
1013 | struct anon_vma *anon_vma, struct file *file, | 1018 | struct anon_vma *anon_vma, struct file *file, |
1014 | pgoff_t pgoff, struct mempolicy *policy) | 1019 | pgoff_t pgoff, struct mempolicy *policy) |
1015 | { | 1020 | { |
1016 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 1021 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
@@ -1036,7 +1041,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1036 | * Can it merge with the predecessor? | 1041 | * Can it merge with the predecessor? |
1037 | */ | 1042 | */ |
1038 | if (prev && prev->vm_end == addr && | 1043 | if (prev && prev->vm_end == addr && |
1039 | mpol_equal(vma_policy(prev), policy) && | 1044 | mpol_equal(vma_policy(prev), policy) && |
1040 | can_vma_merge_after(prev, vm_flags, | 1045 | can_vma_merge_after(prev, vm_flags, |
1041 | anon_vma, file, pgoff)) { | 1046 | anon_vma, file, pgoff)) { |
1042 | /* | 1047 | /* |
@@ -1064,7 +1069,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
1064 | * Can this new request be merged in front of next? | 1069 | * Can this new request be merged in front of next? |
1065 | */ | 1070 | */ |
1066 | if (next && end == next->vm_start && | 1071 | if (next && end == next->vm_start && |
1067 | mpol_equal(policy, vma_policy(next)) && | 1072 | mpol_equal(policy, vma_policy(next)) && |
1068 | can_vma_merge_before(next, vm_flags, | 1073 | can_vma_merge_before(next, vm_flags, |
1069 | anon_vma, file, pgoff+pglen)) { | 1074 | anon_vma, file, pgoff+pglen)) { |
1070 | if (prev && addr < prev->vm_end) /* case 4 */ | 1075 | if (prev && addr < prev->vm_end) /* case 4 */ |
@@ -1235,7 +1240,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1235 | unsigned long flags, unsigned long pgoff, | 1240 | unsigned long flags, unsigned long pgoff, |
1236 | unsigned long *populate) | 1241 | unsigned long *populate) |
1237 | { | 1242 | { |
1238 | struct mm_struct * mm = current->mm; | 1243 | struct mm_struct *mm = current->mm; |
1239 | vm_flags_t vm_flags; | 1244 | vm_flags_t vm_flags; |
1240 | 1245 | ||
1241 | *populate = 0; | 1246 | *populate = 0; |
@@ -1263,7 +1268,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1263 | 1268 | ||
1264 | /* offset overflow? */ | 1269 | /* offset overflow? */ |
1265 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) | 1270 | if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) |
1266 | return -EOVERFLOW; | 1271 | return -EOVERFLOW; |
1267 | 1272 | ||
1268 | /* Too many mappings? */ | 1273 | /* Too many mappings? */ |
1269 | if (mm->map_count > sysctl_max_map_count) | 1274 | if (mm->map_count > sysctl_max_map_count) |
@@ -1921,7 +1926,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, | |||
1921 | info.align_mask = 0; | 1926 | info.align_mask = 0; |
1922 | return vm_unmapped_area(&info); | 1927 | return vm_unmapped_area(&info); |
1923 | } | 1928 | } |
1924 | #endif | 1929 | #endif |
1925 | 1930 | ||
1926 | /* | 1931 | /* |
1927 | * This mmap-allocator allocates new areas top-down from below the | 1932 | * This mmap-allocator allocates new areas top-down from below the |
@@ -2321,13 +2326,13 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address) | |||
2321 | } | 2326 | } |
2322 | 2327 | ||
2323 | struct vm_area_struct * | 2328 | struct vm_area_struct * |
2324 | find_extend_vma(struct mm_struct * mm, unsigned long addr) | 2329 | find_extend_vma(struct mm_struct *mm, unsigned long addr) |
2325 | { | 2330 | { |
2326 | struct vm_area_struct * vma; | 2331 | struct vm_area_struct *vma; |
2327 | unsigned long start; | 2332 | unsigned long start; |
2328 | 2333 | ||
2329 | addr &= PAGE_MASK; | 2334 | addr &= PAGE_MASK; |
2330 | vma = find_vma(mm,addr); | 2335 | vma = find_vma(mm, addr); |
2331 | if (!vma) | 2336 | if (!vma) |
2332 | return NULL; | 2337 | return NULL; |
2333 | if (vma->vm_start <= addr) | 2338 | if (vma->vm_start <= addr) |
@@ -2376,7 +2381,7 @@ static void unmap_region(struct mm_struct *mm, | |||
2376 | struct vm_area_struct *vma, struct vm_area_struct *prev, | 2381 | struct vm_area_struct *vma, struct vm_area_struct *prev, |
2377 | unsigned long start, unsigned long end) | 2382 | unsigned long start, unsigned long end) |
2378 | { | 2383 | { |
2379 | struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; | 2384 | struct vm_area_struct *next = prev ? prev->vm_next : mm->mmap; |
2380 | struct mmu_gather tlb; | 2385 | struct mmu_gather tlb; |
2381 | 2386 | ||
2382 | lru_add_drain(); | 2387 | lru_add_drain(); |
@@ -2423,7 +2428,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2423 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the | 2428 | * __split_vma() bypasses sysctl_max_map_count checking. We use this on the |
2424 | * munmap path where it doesn't make sense to fail. | 2429 | * munmap path where it doesn't make sense to fail. |
2425 | */ | 2430 | */ |
2426 | static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 2431 | static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, |
2427 | unsigned long addr, int new_below) | 2432 | unsigned long addr, int new_below) |
2428 | { | 2433 | { |
2429 | struct vm_area_struct *new; | 2434 | struct vm_area_struct *new; |
@@ -2512,7 +2517,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2512 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) | 2517 | if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start) |
2513 | return -EINVAL; | 2518 | return -EINVAL; |
2514 | 2519 | ||
2515 | if ((len = PAGE_ALIGN(len)) == 0) | 2520 | len = PAGE_ALIGN(len); |
2521 | if (len == 0) | ||
2516 | return -EINVAL; | 2522 | return -EINVAL; |
2517 | 2523 | ||
2518 | /* Find the first overlapping VMA */ | 2524 | /* Find the first overlapping VMA */ |
@@ -2558,7 +2564,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) | |||
2558 | if (error) | 2564 | if (error) |
2559 | return error; | 2565 | return error; |
2560 | } | 2566 | } |
2561 | vma = prev? prev->vm_next: mm->mmap; | 2567 | vma = prev ? prev->vm_next : mm->mmap; |
2562 | 2568 | ||
2563 | /* | 2569 | /* |
2564 | * unlock any mlock()ed ranges before detaching vmas | 2570 | * unlock any mlock()ed ranges before detaching vmas |
@@ -2621,10 +2627,10 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) | |||
2621 | */ | 2627 | */ |
2622 | static unsigned long do_brk(unsigned long addr, unsigned long len) | 2628 | static unsigned long do_brk(unsigned long addr, unsigned long len) |
2623 | { | 2629 | { |
2624 | struct mm_struct * mm = current->mm; | 2630 | struct mm_struct *mm = current->mm; |
2625 | struct vm_area_struct * vma, * prev; | 2631 | struct vm_area_struct *vma, *prev; |
2626 | unsigned long flags; | 2632 | unsigned long flags; |
2627 | struct rb_node ** rb_link, * rb_parent; | 2633 | struct rb_node **rb_link, *rb_parent; |
2628 | pgoff_t pgoff = addr >> PAGE_SHIFT; | 2634 | pgoff_t pgoff = addr >> PAGE_SHIFT; |
2629 | int error; | 2635 | int error; |
2630 | 2636 | ||
@@ -2848,7 +2854,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2848 | * safe. It is only safe to keep the vm_pgoff | 2854 | * safe. It is only safe to keep the vm_pgoff |
2849 | * linear if there are no pages mapped yet. | 2855 | * linear if there are no pages mapped yet. |
2850 | */ | 2856 | */ |
2851 | VM_BUG_ON(faulted_in_anon_vma); | 2857 | VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); |
2852 | *vmap = vma = new_vma; | 2858 | *vmap = vma = new_vma; |
2853 | } | 2859 | } |
2854 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); | 2860 | *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); |
diff --git a/mm/mremap.c b/mm/mremap.c index 05f1180e9f21..b147f66f4c40 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -21,8 +21,8 @@ | |||
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | #include <linux/mmu_notifier.h> | 22 | #include <linux/mmu_notifier.h> |
23 | #include <linux/sched/sysctl.h> | 23 | #include <linux/sched/sysctl.h> |
24 | #include <linux/uaccess.h> | ||
24 | 25 | ||
25 | #include <asm/uaccess.h> | ||
26 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
28 | 28 | ||
@@ -195,7 +195,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
195 | if (pmd_trans_huge(*old_pmd)) { | 195 | if (pmd_trans_huge(*old_pmd)) { |
196 | int err = 0; | 196 | int err = 0; |
197 | if (extent == HPAGE_PMD_SIZE) { | 197 | if (extent == HPAGE_PMD_SIZE) { |
198 | VM_BUG_ON(vma->vm_file || !vma->anon_vma); | 198 | VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, |
199 | vma); | ||
199 | /* See comment in move_ptes() */ | 200 | /* See comment in move_ptes() */ |
200 | if (need_rmap_locks) | 201 | if (need_rmap_locks) |
201 | anon_vma_lock_write(vma->anon_vma); | 202 | anon_vma_lock_write(vma->anon_vma); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e11df8fa7ec..bbf405a3a18f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -565,7 +565,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
565 | 565 | ||
566 | spin_lock(&zone_scan_lock); | 566 | spin_lock(&zone_scan_lock); |
567 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | 567 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
568 | if (zone_is_oom_locked(zone)) { | 568 | if (test_bit(ZONE_OOM_LOCKED, &zone->flags)) { |
569 | ret = false; | 569 | ret = false; |
570 | goto out; | 570 | goto out; |
571 | } | 571 | } |
@@ -575,7 +575,7 @@ bool oom_zonelist_trylock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
575 | * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. | 575 | * call to oom_zonelist_trylock() doesn't succeed when it shouldn't. |
576 | */ | 576 | */ |
577 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | 577 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
578 | zone_set_flag(zone, ZONE_OOM_LOCKED); | 578 | set_bit(ZONE_OOM_LOCKED, &zone->flags); |
579 | 579 | ||
580 | out: | 580 | out: |
581 | spin_unlock(&zone_scan_lock); | 581 | spin_unlock(&zone_scan_lock); |
@@ -594,7 +594,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask) | |||
594 | 594 | ||
595 | spin_lock(&zone_scan_lock); | 595 | spin_lock(&zone_scan_lock); |
596 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) | 596 | for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) |
597 | zone_clear_flag(zone, ZONE_OOM_LOCKED); | 597 | clear_bit(ZONE_OOM_LOCKED, &zone->flags); |
598 | spin_unlock(&zone_scan_lock); | 598 | spin_unlock(&zone_scan_lock); |
599 | } | 599 | } |
600 | 600 | ||
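With zone_set_flag()/zone_clear_flag()/zone_is_oom_locked() gone, zone->flags is driven through the generic bitops directly; the conversions above are mechanical:

	/* old wrapper                    new spelling */
	/* zone_is_oom_locked(zone)   */  test_bit(ZONE_OOM_LOCKED, &zone->flags);
	/* zone_set_flag(zone, f)     */  set_bit(ZONE_OOM_LOCKED, &zone->flags);
	/* zone_clear_flag(zone, f)   */  clear_bit(ZONE_OOM_LOCKED, &zone->flags);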
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 91d73ef1744d..35ca7102d421 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1075,13 +1075,13 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
1075 | } | 1075 | } |
1076 | 1076 | ||
1077 | if (dirty < setpoint) { | 1077 | if (dirty < setpoint) { |
1078 | x = min(bdi->balanced_dirty_ratelimit, | 1078 | x = min3(bdi->balanced_dirty_ratelimit, |
1079 | min(balanced_dirty_ratelimit, task_ratelimit)); | 1079 | balanced_dirty_ratelimit, task_ratelimit); |
1080 | if (dirty_ratelimit < x) | 1080 | if (dirty_ratelimit < x) |
1081 | step = x - dirty_ratelimit; | 1081 | step = x - dirty_ratelimit; |
1082 | } else { | 1082 | } else { |
1083 | x = max(bdi->balanced_dirty_ratelimit, | 1083 | x = max3(bdi->balanced_dirty_ratelimit, |
1084 | max(balanced_dirty_ratelimit, task_ratelimit)); | 1084 | balanced_dirty_ratelimit, task_ratelimit); |
1085 | if (dirty_ratelimit > x) | 1085 | if (dirty_ratelimit > x) |
1086 | step = dirty_ratelimit - x; | 1086 | step = dirty_ratelimit - x; |
1087 | } | 1087 | } |
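min3() and max3() from <linux/kernel.h> fold the nested min()/max() pair into one call with the same type checking, so the hunk above is behaviour-preserving. The two forms compute the same value:

	x = min(bdi->balanced_dirty_ratelimit,
		min(balanced_dirty_ratelimit, task_ratelimit));

	x = min3(bdi->balanced_dirty_ratelimit,
		 balanced_dirty_ratelimit, task_ratelimit);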
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index eee961958021..c9710c9bbee2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -53,8 +53,6 @@ | |||
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
55 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
56 | #include <linux/ftrace_event.h> | ||
57 | #include <linux/memcontrol.h> | ||
58 | #include <linux/prefetch.h> | 56 | #include <linux/prefetch.h> |
59 | #include <linux/mm_inline.h> | 57 | #include <linux/mm_inline.h> |
60 | #include <linux/migrate.h> | 58 | #include <linux/migrate.h> |
@@ -85,6 +83,7 @@ EXPORT_PER_CPU_SYMBOL(numa_node); | |||
85 | */ | 83 | */ |
86 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ | 84 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
87 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); | 85 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
86 | int _node_numa_mem_[MAX_NUMNODES]; | ||
88 | #endif | 87 | #endif |
89 | 88 | ||
90 | /* | 89 | /* |
@@ -1014,7 +1013,7 @@ int move_freepages(struct zone *zone, | |||
1014 | * Remove at a later date when no bug reports exist related to | 1013 | * Remove at a later date when no bug reports exist related to |
1015 | * grouping pages by mobility | 1014 | * grouping pages by mobility |
1016 | */ | 1015 | */ |
1017 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | 1016 | VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); |
1018 | #endif | 1017 | #endif |
1019 | 1018 | ||
1020 | for (page = start_page; page <= end_page;) { | 1019 | for (page = start_page; page <= end_page;) { |
@@ -1613,8 +1612,8 @@ again: | |||
1613 | 1612 | ||
1614 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); | 1613 | __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); |
1615 | if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && | 1614 | if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && |
1616 | !zone_is_fair_depleted(zone)) | 1615 | !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) |
1617 | zone_set_flag(zone, ZONE_FAIR_DEPLETED); | 1616 | set_bit(ZONE_FAIR_DEPLETED, &zone->flags); |
1618 | 1617 | ||
1619 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1618 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1620 | zone_statistics(preferred_zone, zone, gfp_flags); | 1619 | zone_statistics(preferred_zone, zone, gfp_flags); |
@@ -1934,7 +1933,7 @@ static void reset_alloc_batches(struct zone *preferred_zone) | |||
1934 | mod_zone_page_state(zone, NR_ALLOC_BATCH, | 1933 | mod_zone_page_state(zone, NR_ALLOC_BATCH, |
1935 | high_wmark_pages(zone) - low_wmark_pages(zone) - | 1934 | high_wmark_pages(zone) - low_wmark_pages(zone) - |
1936 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | 1935 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); |
1937 | zone_clear_flag(zone, ZONE_FAIR_DEPLETED); | 1936 | clear_bit(ZONE_FAIR_DEPLETED, &zone->flags); |
1938 | } while (zone++ != preferred_zone); | 1937 | } while (zone++ != preferred_zone); |
1939 | } | 1938 | } |
1940 | 1939 | ||
@@ -1985,7 +1984,7 @@ zonelist_scan: | |||
1985 | if (alloc_flags & ALLOC_FAIR) { | 1984 | if (alloc_flags & ALLOC_FAIR) { |
1986 | if (!zone_local(preferred_zone, zone)) | 1985 | if (!zone_local(preferred_zone, zone)) |
1987 | break; | 1986 | break; |
1988 | if (zone_is_fair_depleted(zone)) { | 1987 | if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { |
1989 | nr_fair_skipped++; | 1988 | nr_fair_skipped++; |
1990 | continue; | 1989 | continue; |
1991 | } | 1990 | } |
@@ -2296,58 +2295,72 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2296 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2295 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2297 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2296 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2298 | int classzone_idx, int migratetype, enum migrate_mode mode, | 2297 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2299 | bool *contended_compaction, bool *deferred_compaction, | 2298 | int *contended_compaction, bool *deferred_compaction) |
2300 | unsigned long *did_some_progress) | ||
2301 | { | 2299 | { |
2302 | if (!order) | 2300 | struct zone *last_compact_zone = NULL; |
2303 | return NULL; | 2301 | unsigned long compact_result; |
2302 | struct page *page; | ||
2304 | 2303 | ||
2305 | if (compaction_deferred(preferred_zone, order)) { | 2304 | if (!order) |
2306 | *deferred_compaction = true; | ||
2307 | return NULL; | 2305 | return NULL; |
2308 | } | ||
2309 | 2306 | ||
2310 | current->flags |= PF_MEMALLOC; | 2307 | current->flags |= PF_MEMALLOC; |
2311 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2308 | compact_result = try_to_compact_pages(zonelist, order, gfp_mask, |
2312 | nodemask, mode, | 2309 | nodemask, mode, |
2313 | contended_compaction); | 2310 | contended_compaction, |
2311 | &last_compact_zone); | ||
2314 | current->flags &= ~PF_MEMALLOC; | 2312 | current->flags &= ~PF_MEMALLOC; |
2315 | 2313 | ||
2316 | if (*did_some_progress != COMPACT_SKIPPED) { | 2314 | switch (compact_result) { |
2317 | struct page *page; | 2315 | case COMPACT_DEFERRED: |
2316 | *deferred_compaction = true; | ||
2317 | /* fall-through */ | ||
2318 | case COMPACT_SKIPPED: | ||
2319 | return NULL; | ||
2320 | default: | ||
2321 | break; | ||
2322 | } | ||
2318 | 2323 | ||
2319 | /* Page migration frees to the PCP lists but we want merging */ | 2324 | /* |
2320 | drain_pages(get_cpu()); | 2325 | * At least in one zone compaction wasn't deferred or skipped, so let's |
2321 | put_cpu(); | 2326 | * count a compaction stall |
2327 | */ | ||
2328 | count_vm_event(COMPACTSTALL); | ||
2322 | 2329 | ||
2323 | page = get_page_from_freelist(gfp_mask, nodemask, | 2330 | /* Page migration frees to the PCP lists but we want merging */ |
2324 | order, zonelist, high_zoneidx, | 2331 | drain_pages(get_cpu()); |
2325 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2332 | put_cpu(); |
2326 | preferred_zone, classzone_idx, migratetype); | ||
2327 | if (page) { | ||
2328 | preferred_zone->compact_blockskip_flush = false; | ||
2329 | compaction_defer_reset(preferred_zone, order, true); | ||
2330 | count_vm_event(COMPACTSUCCESS); | ||
2331 | return page; | ||
2332 | } | ||
2333 | 2333 | ||
2334 | /* | 2334 | page = get_page_from_freelist(gfp_mask, nodemask, |
2335 | * It's bad if compaction run occurs and fails. | 2335 | order, zonelist, high_zoneidx, |
2336 | * The most likely reason is that pages exist, | 2336 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2337 | * but not enough to satisfy watermarks. | 2337 | preferred_zone, classzone_idx, migratetype); |
2338 | */ | ||
2339 | count_vm_event(COMPACTFAIL); | ||
2340 | 2338 | ||
2341 | /* | 2339 | if (page) { |
2342 | * As async compaction considers a subset of pageblocks, only | 2340 | struct zone *zone = page_zone(page); |
2343 | * defer if the failure was a sync compaction failure. | ||
2344 | */ | ||
2345 | if (mode != MIGRATE_ASYNC) | ||
2346 | defer_compaction(preferred_zone, order); | ||
2347 | 2341 | ||
2348 | cond_resched(); | 2342 | zone->compact_blockskip_flush = false; |
2343 | compaction_defer_reset(zone, order, true); | ||
2344 | count_vm_event(COMPACTSUCCESS); | ||
2345 | return page; | ||
2349 | } | 2346 | } |
2350 | 2347 | ||
2348 | /* | ||
2349 | * last_compact_zone is where try_to_compact_pages thought allocation | ||
2350 | * should succeed, so it did not defer compaction. But here we know | ||
2351 | * that it didn't succeed, so we do the defer. | ||
2352 | */ | ||
2353 | if (last_compact_zone && mode != MIGRATE_ASYNC) | ||
2354 | defer_compaction(last_compact_zone, order); | ||
2355 | |||
2356 | /* | ||
2357 | * It's bad if compaction run occurs and fails. The most likely reason | ||
2358 | * is that pages exist, but not enough to satisfy watermarks. | ||
2359 | */ | ||
2360 | count_vm_event(COMPACTFAIL); | ||
2361 | |||
2362 | cond_resched(); | ||
2363 | |||
2351 | return NULL; | 2364 | return NULL; |
2352 | } | 2365 | } |
2353 | #else | 2366 | #else |
@@ -2355,9 +2368,8 @@ static inline struct page * | |||
2355 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | 2368 | __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, |
2356 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2369 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
2357 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, | 2370 | nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, |
2358 | int classzone_idx, int migratetype, | 2371 | int classzone_idx, int migratetype, enum migrate_mode mode, |
2359 | enum migrate_mode mode, bool *contended_compaction, | 2372 | int *contended_compaction, bool *deferred_compaction) |
2360 | bool *deferred_compaction, unsigned long *did_some_progress) | ||
2361 | { | 2373 | { |
2362 | return NULL; | 2374 | return NULL; |
2363 | } | 2375 | } |
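try_to_compact_pages() now reports its outcome as a compaction status code rather than through *did_some_progress, and the caller branches on it before touching the free lists. A condensed view of the new control flow in __alloc_pages_direct_compact(), with COMPACT_DEFERRED assumed to be a newly added status alongside the existing COMPACT_SKIPPED:

	compact_result = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
					      mode, contended_compaction,
					      &last_compact_zone);
	switch (compact_result) {
	case COMPACT_DEFERRED:
		*deferred_compaction = true;	/* tell the slowpath why we bailed */
		/* fall through */
	case COMPACT_SKIPPED:
		return NULL;			/* compaction did not run at all */
	default:
		break;				/* it ran: count the stall, try the freelists */
	}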
@@ -2457,12 +2469,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | |||
2457 | static void wake_all_kswapds(unsigned int order, | 2469 | static void wake_all_kswapds(unsigned int order, |
2458 | struct zonelist *zonelist, | 2470 | struct zonelist *zonelist, |
2459 | enum zone_type high_zoneidx, | 2471 | enum zone_type high_zoneidx, |
2460 | struct zone *preferred_zone) | 2472 | struct zone *preferred_zone, |
2473 | nodemask_t *nodemask) | ||
2461 | { | 2474 | { |
2462 | struct zoneref *z; | 2475 | struct zoneref *z; |
2463 | struct zone *zone; | 2476 | struct zone *zone; |
2464 | 2477 | ||
2465 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 2478 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
2479 | high_zoneidx, nodemask) | ||
2466 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); | 2480 | wakeup_kswapd(zone, order, zone_idx(preferred_zone)); |
2467 | } | 2481 | } |
2468 | 2482 | ||
@@ -2509,7 +2523,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2509 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2523 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2510 | } | 2524 | } |
2511 | #ifdef CONFIG_CMA | 2525 | #ifdef CONFIG_CMA |
2512 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | 2526 | if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) |
2513 | alloc_flags |= ALLOC_CMA; | 2527 | alloc_flags |= ALLOC_CMA; |
2514 | #endif | 2528 | #endif |
2515 | return alloc_flags; | 2529 | return alloc_flags; |
@@ -2533,7 +2547,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2533 | unsigned long did_some_progress; | 2547 | unsigned long did_some_progress; |
2534 | enum migrate_mode migration_mode = MIGRATE_ASYNC; | 2548 | enum migrate_mode migration_mode = MIGRATE_ASYNC; |
2535 | bool deferred_compaction = false; | 2549 | bool deferred_compaction = false; |
2536 | bool contended_compaction = false; | 2550 | int contended_compaction = COMPACT_CONTENDED_NONE; |
2537 | 2551 | ||
2538 | /* | 2552 | /* |
2539 | * In the slowpath, we sanity check order to avoid ever trying to | 2553 | * In the slowpath, we sanity check order to avoid ever trying to |
@@ -2560,7 +2574,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | |||
2560 | 2574 | ||
2561 | restart: | 2575 | restart: |
2562 | if (!(gfp_mask & __GFP_NO_KSWAPD)) | 2576 | if (!(gfp_mask & __GFP_NO_KSWAPD)) |
2563 | wake_all_kswapds(order, zonelist, high_zoneidx, preferred_zone); | 2577 | wake_all_kswapds(order, zonelist, high_zoneidx, |
2578 | preferred_zone, nodemask); | ||
2564 | 2579 | ||
2565 | /* | 2580 | /* |
2566 | * OK, we're below the kswapd watermark and have kicked background | 2581 | * OK, we're below the kswapd watermark and have kicked background |
@@ -2633,20 +2648,40 @@ rebalance: | |||
2633 | preferred_zone, | 2648 | preferred_zone, |
2634 | classzone_idx, migratetype, | 2649 | classzone_idx, migratetype, |
2635 | migration_mode, &contended_compaction, | 2650 | migration_mode, &contended_compaction, |
2636 | &deferred_compaction, | 2651 | &deferred_compaction); |
2637 | &did_some_progress); | ||
2638 | if (page) | 2652 | if (page) |
2639 | goto got_pg; | 2653 | goto got_pg; |
2640 | 2654 | ||
2641 | /* | 2655 | /* Checks for THP-specific high-order allocations */ |
2642 | * If compaction is deferred for high-order allocations, it is because | 2656 | if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) { |
2643 | * sync compaction recently failed. In this is the case and the caller | 2657 | /* |
2644 | * requested a movable allocation that does not heavily disrupt the | 2658 | * If compaction is deferred for high-order allocations, it is |
2645 | * system then fail the allocation instead of entering direct reclaim. | 2659 | * because sync compaction recently failed. If this is the case |
2646 | */ | 2660 | * and the caller requested a THP allocation, we do not want |
2647 | if ((deferred_compaction || contended_compaction) && | 2661 | * to heavily disrupt the system, so we fail the allocation |
2648 | (gfp_mask & __GFP_NO_KSWAPD)) | 2662 | * instead of entering direct reclaim. |
2649 | goto nopage; | 2663 | */ |
2664 | if (deferred_compaction) | ||
2665 | goto nopage; | ||
2666 | |||
2667 | /* | ||
2668 | * In all zones where compaction was attempted (and not | ||
2669 | * deferred or skipped), lock contention has been detected. | ||
2670 | * For THP allocation we do not want to disrupt the others | ||
2671 | * so we fallback to base pages instead. | ||
2672 | */ | ||
2673 | if (contended_compaction == COMPACT_CONTENDED_LOCK) | ||
2674 | goto nopage; | ||
2675 | |||
2676 | /* | ||
2677 | * If compaction was aborted due to need_resched(), we do not | ||
2678 | * want to further increase allocation latency, unless it is | ||
2679 | * khugepaged trying to collapse. | ||
2680 | */ | ||
2681 | if (contended_compaction == COMPACT_CONTENDED_SCHED | ||
2682 | && !(current->flags & PF_KTHREAD)) | ||
2683 | goto nopage; | ||
2684 | } | ||
2650 | 2685 | ||
2651 | /* | 2686 | /* |
2652 | * It can become very expensive to allocate transparent hugepages at | 2687 | * It can become very expensive to allocate transparent hugepages at |
@@ -2726,8 +2761,7 @@ rebalance: | |||
2726 | preferred_zone, | 2761 | preferred_zone, |
2727 | classzone_idx, migratetype, | 2762 | classzone_idx, migratetype, |
2728 | migration_mode, &contended_compaction, | 2763 | migration_mode, &contended_compaction, |
2729 | &deferred_compaction, | 2764 | &deferred_compaction); |
2730 | &did_some_progress); | ||
2731 | if (page) | 2765 | if (page) |
2732 | goto got_pg; | 2766 | goto got_pg; |
2733 | } | 2767 | } |
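contended_compaction changes from a bool to an int carrying one of three states, letting the THP path treat lock contention and need_resched() differently. A sketch of the enum these checks imply; the LOCK/SCHED names appear above, while the NONE value and the ordering are assumptions consistent with the COMPACT_CONTENDED_NONE initialiser:

	enum compact_contended {
		COMPACT_CONTENDED_NONE = 0,	/* no contention detected */
		COMPACT_CONTENDED_SCHED,	/* aborted because of need_resched() */
		COMPACT_CONTENDED_LOCK,		/* aborted because a zone/lru lock was contended */
	};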
@@ -2753,7 +2787,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2753 | struct zone *preferred_zone; | 2787 | struct zone *preferred_zone; |
2754 | struct zoneref *preferred_zoneref; | 2788 | struct zoneref *preferred_zoneref; |
2755 | struct page *page = NULL; | 2789 | struct page *page = NULL; |
2756 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2790 | int migratetype = gfpflags_to_migratetype(gfp_mask); |
2757 | unsigned int cpuset_mems_cookie; | 2791 | unsigned int cpuset_mems_cookie; |
2758 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; | 2792 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; |
2759 | int classzone_idx; | 2793 | int classzone_idx; |
@@ -2775,6 +2809,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2775 | if (unlikely(!zonelist->_zonerefs->zone)) | 2809 | if (unlikely(!zonelist->_zonerefs->zone)) |
2776 | return NULL; | 2810 | return NULL; |
2777 | 2811 | ||
2812 | if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE) | ||
2813 | alloc_flags |= ALLOC_CMA; | ||
2814 | |||
2778 | retry_cpuset: | 2815 | retry_cpuset: |
2779 | cpuset_mems_cookie = read_mems_allowed_begin(); | 2816 | cpuset_mems_cookie = read_mems_allowed_begin(); |
2780 | 2817 | ||
@@ -2786,10 +2823,6 @@ retry_cpuset: | |||
2786 | goto out; | 2823 | goto out; |
2787 | classzone_idx = zonelist_zone_idx(preferred_zoneref); | 2824 | classzone_idx = zonelist_zone_idx(preferred_zoneref); |
2788 | 2825 | ||
2789 | #ifdef CONFIG_CMA | ||
2790 | if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) | ||
2791 | alloc_flags |= ALLOC_CMA; | ||
2792 | #endif | ||
2793 | /* First allocation attempt */ | 2826 | /* First allocation attempt */ |
2794 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 2827 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, |
2795 | zonelist, high_zoneidx, alloc_flags, | 2828 | zonelist, high_zoneidx, alloc_flags, |
@@ -3579,68 +3612,30 @@ static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | |||
3579 | zonelist->_zonerefs[pos].zone_idx = 0; | 3612 | zonelist->_zonerefs[pos].zone_idx = 0; |
3580 | } | 3613 | } |
3581 | 3614 | ||
3615 | #if defined(CONFIG_64BIT) | ||
3616 | /* | ||
3617 | * Devices that require DMA32/DMA are relatively rare and do not justify a | ||
3618 | * penalty to every machine in case the specialised case applies. Default | ||
3619 | * to Node-ordering on 64-bit NUMA machines | ||
3620 | */ | ||
3621 | static int default_zonelist_order(void) | ||
3622 | { | ||
3623 | return ZONELIST_ORDER_NODE; | ||
3624 | } | ||
3625 | #else | ||
3626 | /* | ||
3627 | * On 32-bit, the Normal zone needs to be preserved for allocations accessible | ||
3628 | * by the kernel. If processes running on node 0 deplete the low memory zone | ||
3629 | * then reclaim will occur more frequently, increasing stalls and potentially | ||
3630 | * be easier to OOM if a large percentage of the zone is under writeback or | ||
3631 | * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set. | ||
3632 | * Hence, default to zone ordering on 32-bit. | ||
3633 | */ | ||
3582 | static int default_zonelist_order(void) | 3634 | static int default_zonelist_order(void) |
3583 | { | 3635 | { |
3584 | int nid, zone_type; | ||
3585 | unsigned long low_kmem_size, total_size; | ||
3586 | struct zone *z; | ||
3587 | int average_size; | ||
3588 | /* | ||
3589 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. | ||
3590 | * If they are really small and used heavily, the system can fall | ||
3591 | * into OOM very easily. | ||
3592 | * This function detect ZONE_DMA/DMA32 size and configures zone order. | ||
3593 | */ | ||
3594 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | ||
3595 | low_kmem_size = 0; | ||
3596 | total_size = 0; | ||
3597 | for_each_online_node(nid) { | ||
3598 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
3599 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
3600 | if (populated_zone(z)) { | ||
3601 | if (zone_type < ZONE_NORMAL) | ||
3602 | low_kmem_size += z->managed_pages; | ||
3603 | total_size += z->managed_pages; | ||
3604 | } else if (zone_type == ZONE_NORMAL) { | ||
3605 | /* | ||
3606 | * If any node has only lowmem, then node order | ||
3607 | * is preferred to allow kernel allocations | ||
3608 | * locally; otherwise, they can easily infringe | ||
3609 | * on other nodes when there is an abundance of | ||
3610 | * lowmem available to allocate from. | ||
3611 | */ | ||
3612 | return ZONELIST_ORDER_NODE; | ||
3613 | } | ||
3614 | } | ||
3615 | } | ||
3616 | if (!low_kmem_size || /* there are no DMA area. */ | ||
3617 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ | ||
3618 | return ZONELIST_ORDER_NODE; | ||
3619 | /* | ||
3620 | * look into each node's config. | ||
3621 | * If there is a node whose DMA/DMA32 memory is very big area on | ||
3622 | * local memory, NODE_ORDER may be suitable. | ||
3623 | */ | ||
3624 | average_size = total_size / | ||
3625 | (nodes_weight(node_states[N_MEMORY]) + 1); | ||
3626 | for_each_online_node(nid) { | ||
3627 | low_kmem_size = 0; | ||
3628 | total_size = 0; | ||
3629 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
3630 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
3631 | if (populated_zone(z)) { | ||
3632 | if (zone_type < ZONE_NORMAL) | ||
3633 | low_kmem_size += z->present_pages; | ||
3634 | total_size += z->present_pages; | ||
3635 | } | ||
3636 | } | ||
3637 | if (low_kmem_size && | ||
3638 | total_size > average_size && /* ignore small node */ | ||
3639 | low_kmem_size > total_size * 70/100) | ||
3640 | return ZONELIST_ORDER_NODE; | ||
3641 | } | ||
3642 | return ZONELIST_ORDER_ZONE; | 3636 | return ZONELIST_ORDER_ZONE; |
3643 | } | 3637 | } |
3638 | #endif /* CONFIG_64BIT */ | ||
3644 | 3639 | ||
3645 | static void set_zonelist_order(void) | 3640 | static void set_zonelist_order(void) |
3646 | { | 3641 | { |
@@ -6277,8 +6272,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
6277 | 6272 | ||
6278 | if (list_empty(&cc->migratepages)) { | 6273 | if (list_empty(&cc->migratepages)) { |
6279 | cc->nr_migratepages = 0; | 6274 | cc->nr_migratepages = 0; |
6280 | pfn = isolate_migratepages_range(cc->zone, cc, | 6275 | pfn = isolate_migratepages_range(cc, pfn, end); |
6281 | pfn, end, true); | ||
6282 | if (!pfn) { | 6276 | if (!pfn) { |
6283 | ret = -EINTR; | 6277 | ret = -EINTR; |
6284 | break; | 6278 | break; |
@@ -6554,97 +6548,3 @@ bool is_free_buddy_page(struct page *page) | |||
6554 | return order < MAX_ORDER; | 6548 | return order < MAX_ORDER; |
6555 | } | 6549 | } |
6556 | #endif | 6550 | #endif |
6557 | |||
6558 | static const struct trace_print_flags pageflag_names[] = { | ||
6559 | {1UL << PG_locked, "locked" }, | ||
6560 | {1UL << PG_error, "error" }, | ||
6561 | {1UL << PG_referenced, "referenced" }, | ||
6562 | {1UL << PG_uptodate, "uptodate" }, | ||
6563 | {1UL << PG_dirty, "dirty" }, | ||
6564 | {1UL << PG_lru, "lru" }, | ||
6565 | {1UL << PG_active, "active" }, | ||
6566 | {1UL << PG_slab, "slab" }, | ||
6567 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
6568 | {1UL << PG_arch_1, "arch_1" }, | ||
6569 | {1UL << PG_reserved, "reserved" }, | ||
6570 | {1UL << PG_private, "private" }, | ||
6571 | {1UL << PG_private_2, "private_2" }, | ||
6572 | {1UL << PG_writeback, "writeback" }, | ||
6573 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
6574 | {1UL << PG_head, "head" }, | ||
6575 | {1UL << PG_tail, "tail" }, | ||
6576 | #else | ||
6577 | {1UL << PG_compound, "compound" }, | ||
6578 | #endif | ||
6579 | {1UL << PG_swapcache, "swapcache" }, | ||
6580 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
6581 | {1UL << PG_reclaim, "reclaim" }, | ||
6582 | {1UL << PG_swapbacked, "swapbacked" }, | ||
6583 | {1UL << PG_unevictable, "unevictable" }, | ||
6584 | #ifdef CONFIG_MMU | ||
6585 | {1UL << PG_mlocked, "mlocked" }, | ||
6586 | #endif | ||
6587 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
6588 | {1UL << PG_uncached, "uncached" }, | ||
6589 | #endif | ||
6590 | #ifdef CONFIG_MEMORY_FAILURE | ||
6591 | {1UL << PG_hwpoison, "hwpoison" }, | ||
6592 | #endif | ||
6593 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
6594 | {1UL << PG_compound_lock, "compound_lock" }, | ||
6595 | #endif | ||
6596 | }; | ||
6597 | |||
6598 | static void dump_page_flags(unsigned long flags) | ||
6599 | { | ||
6600 | const char *delim = ""; | ||
6601 | unsigned long mask; | ||
6602 | int i; | ||
6603 | |||
6604 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | ||
6605 | |||
6606 | printk(KERN_ALERT "page flags: %#lx(", flags); | ||
6607 | |||
6608 | /* remove zone id */ | ||
6609 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
6610 | |||
6611 | for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) { | ||
6612 | |||
6613 | mask = pageflag_names[i].mask; | ||
6614 | if ((flags & mask) != mask) | ||
6615 | continue; | ||
6616 | |||
6617 | flags &= ~mask; | ||
6618 | printk("%s%s", delim, pageflag_names[i].name); | ||
6619 | delim = "|"; | ||
6620 | } | ||
6621 | |||
6622 | /* check for left over flags */ | ||
6623 | if (flags) | ||
6624 | printk("%s%#lx", delim, flags); | ||
6625 | |||
6626 | printk(")\n"); | ||
6627 | } | ||
6628 | |||
6629 | void dump_page_badflags(struct page *page, const char *reason, | ||
6630 | unsigned long badflags) | ||
6631 | { | ||
6632 | printk(KERN_ALERT | ||
6633 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
6634 | page, atomic_read(&page->_count), page_mapcount(page), | ||
6635 | page->mapping, page->index); | ||
6636 | dump_page_flags(page->flags); | ||
6637 | if (reason) | ||
6638 | pr_alert("page dumped because: %s\n", reason); | ||
6639 | if (page->flags & badflags) { | ||
6640 | pr_alert("bad because of flags:\n"); | ||
6641 | dump_page_flags(page->flags & badflags); | ||
6642 | } | ||
6643 | mem_cgroup_print_bad_page(page); | ||
6644 | } | ||
6645 | |||
6646 | void dump_page(struct page *page, const char *reason) | ||
6647 | { | ||
6648 | dump_page_badflags(page, reason, 0); | ||
6649 | } | ||
6650 | EXPORT_SYMBOL(dump_page); | ||
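The pageflag_names table and the dump_page()/dump_page_badflags() helpers are deleted from page_alloc.c here. This reads as a relocation (presumably into a dedicated debug source file elsewhere in this series) rather than a removal of the facility, since the interface used by callers stays the same:

	/* caller-side usage is unaffected by the move */
	if (unlikely(page_mapcount(page) < 0))
		dump_page(page, "negative mapcount");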
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2beeabf502c5..ad83195521f2 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -177,7 +177,7 @@ int walk_page_range(unsigned long addr, unsigned long end, | |||
177 | if (!walk->mm) | 177 | if (!walk->mm) |
178 | return -EINVAL; | 178 | return -EINVAL; |
179 | 179 | ||
180 | VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); | 180 | VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm); |
181 | 181 | ||
182 | pgd = pgd_offset(walk->mm, addr); | 182 | pgd = pgd_offset(walk->mm, addr); |
183 | do { | 183 | do { |
@@ -527,7 +527,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
527 | unsigned long address = __vma_address(page, vma); | 527 | unsigned long address = __vma_address(page, vma); |
528 | 528 | ||
529 | /* page should be within @vma mapping range */ | 529 | /* page should be within @vma mapping range */ |
530 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 530 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); |
531 | 531 | ||
532 | return address; | 532 | return address; |
533 | } | 533 | } |
@@ -897,7 +897,7 @@ void page_move_anon_rmap(struct page *page, | |||
897 | struct anon_vma *anon_vma = vma->anon_vma; | 897 | struct anon_vma *anon_vma = vma->anon_vma; |
898 | 898 | ||
899 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 899 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
900 | VM_BUG_ON(!anon_vma); | 900 | VM_BUG_ON_VMA(!anon_vma, vma); |
901 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); | 901 | VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); |
902 | 902 | ||
903 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 903 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
@@ -1024,7 +1024,7 @@ void do_page_add_anon_rmap(struct page *page, | |||
1024 | void page_add_new_anon_rmap(struct page *page, | 1024 | void page_add_new_anon_rmap(struct page *page, |
1025 | struct vm_area_struct *vma, unsigned long address) | 1025 | struct vm_area_struct *vma, unsigned long address) |
1026 | { | 1026 | { |
1027 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1027 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); |
1028 | SetPageSwapBacked(page); | 1028 | SetPageSwapBacked(page); |
1029 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1029 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
1030 | if (PageTransHuge(page)) | 1030 | if (PageTransHuge(page)) |
@@ -1670,7 +1670,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) | |||
1670 | * structure at mapping cannot be freed and reused yet, | 1670 | * structure at mapping cannot be freed and reused yet, |
1671 | * so we can safely take mapping->i_mmap_mutex. | 1671 | * so we can safely take mapping->i_mmap_mutex. |
1672 | */ | 1672 | */ |
1673 | VM_BUG_ON(!PageLocked(page)); | 1673 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1674 | 1674 | ||
1675 | if (!mapping) | 1675 | if (!mapping) |
1676 | return ret; | 1676 | return ret; |
diff --git a/mm/shmem.c b/mm/shmem.c index 469f90d56051..4fad61bb41e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -3077,7 +3077,9 @@ static const struct address_space_operations shmem_aops = { | |||
3077 | .write_begin = shmem_write_begin, | 3077 | .write_begin = shmem_write_begin, |
3078 | .write_end = shmem_write_end, | 3078 | .write_end = shmem_write_end, |
3079 | #endif | 3079 | #endif |
3080 | #ifdef CONFIG_MIGRATION | ||
3080 | .migratepage = migrate_page, | 3081 | .migratepage = migrate_page, |
3082 | #endif | ||
3081 | .error_remove_page = generic_error_remove_page, | 3083 | .error_remove_page = generic_error_remove_page, |
3082 | }; | 3084 | }; |
3083 | 3085 | ||
@@ -237,11 +237,10 @@ struct arraycache_init { | |||
237 | /* | 237 | /* |
238 | * Need this for bootstrapping a per node allocator. | 238 | * Need this for bootstrapping a per node allocator. |
239 | */ | 239 | */ |
240 | #define NUM_INIT_LISTS (3 * MAX_NUMNODES) | 240 | #define NUM_INIT_LISTS (2 * MAX_NUMNODES) |
241 | static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; | 241 | static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; |
242 | #define CACHE_CACHE 0 | 242 | #define CACHE_CACHE 0 |
243 | #define SIZE_AC MAX_NUMNODES | 243 | #define SIZE_NODE (MAX_NUMNODES) |
244 | #define SIZE_NODE (2 * MAX_NUMNODES) | ||
245 | 244 | ||
246 | static int drain_freelist(struct kmem_cache *cache, | 245 | static int drain_freelist(struct kmem_cache *cache, |
247 | struct kmem_cache_node *n, int tofree); | 246 | struct kmem_cache_node *n, int tofree); |
@@ -253,7 +252,6 @@ static void cache_reap(struct work_struct *unused); | |||
253 | 252 | ||
254 | static int slab_early_init = 1; | 253 | static int slab_early_init = 1; |
255 | 254 | ||
256 | #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) | ||
257 | #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) | 255 | #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) |
258 | 256 | ||
259 | static void kmem_cache_node_init(struct kmem_cache_node *parent) | 257 | static void kmem_cache_node_init(struct kmem_cache_node *parent) |
@@ -458,9 +456,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, | |||
458 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); | 456 | return reciprocal_divide(offset, cache->reciprocal_buffer_size); |
459 | } | 457 | } |
460 | 458 | ||
461 | static struct arraycache_init initarray_generic = | ||
462 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | ||
463 | |||
464 | /* internal cache of cache description objs */ | 459 | /* internal cache of cache description objs */ |
465 | static struct kmem_cache kmem_cache_boot = { | 460 | static struct kmem_cache kmem_cache_boot = { |
466 | .batchcount = 1, | 461 | .batchcount = 1, |
@@ -476,7 +471,7 @@ static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | |||
476 | 471 | ||
477 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 472 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
478 | { | 473 | { |
479 | return cachep->array[smp_processor_id()]; | 474 | return this_cpu_ptr(cachep->cpu_cache); |
480 | } | 475 | } |
481 | 476 | ||
482 | static size_t calculate_freelist_size(int nr_objs, size_t align) | 477 | static size_t calculate_freelist_size(int nr_objs, size_t align) |
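cpu_cache_get() moving from cachep->array[smp_processor_id()] to this_cpu_ptr(cachep->cpu_cache) implies the per-CPU array_cache slots become a genuine percpu allocation. A sketch of what that would look like; the allocation size and call site are guesses, only the this_cpu_ptr()/per_cpu_ptr() accesses are visible in this diff:

	/* setup (assumed): one array_cache, plus its entries, per possible CPU */
	cachep->cpu_cache = __alloc_percpu(sizeof(struct array_cache) +
					   limit * sizeof(void *),
					   sizeof(void *));

	/* hot path: the local CPU's cache */
	struct array_cache *ac = this_cpu_ptr(cachep->cpu_cache);

	/* e.g. CPU-hotplug teardown: a specific remote CPU's cache */
	struct array_cache *nc = per_cpu_ptr(cachep->cpu_cache, cpu);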
@@ -785,8 +780,8 @@ static inline void *ac_get_obj(struct kmem_cache *cachep, | |||
785 | return objp; | 780 | return objp; |
786 | } | 781 | } |
787 | 782 | ||
788 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | 783 | static noinline void *__ac_put_obj(struct kmem_cache *cachep, |
789 | void *objp) | 784 | struct array_cache *ac, void *objp) |
790 | { | 785 | { |
791 | if (unlikely(pfmemalloc_active)) { | 786 | if (unlikely(pfmemalloc_active)) { |
792 | /* Some pfmemalloc slabs exist, check if this is one */ | 787 | /* Some pfmemalloc slabs exist, check if this is one */ |
@@ -984,46 +979,50 @@ static void drain_alien_cache(struct kmem_cache *cachep, | |||
984 | } | 979 | } |
985 | } | 980 | } |
986 | 981 | ||
987 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | 982 | static int __cache_free_alien(struct kmem_cache *cachep, void *objp, |
983 | int node, int page_node) | ||
988 | { | 984 | { |
989 | int nodeid = page_to_nid(virt_to_page(objp)); | ||
990 | struct kmem_cache_node *n; | 985 | struct kmem_cache_node *n; |
991 | struct alien_cache *alien = NULL; | 986 | struct alien_cache *alien = NULL; |
992 | struct array_cache *ac; | 987 | struct array_cache *ac; |
993 | int node; | ||
994 | LIST_HEAD(list); | 988 | LIST_HEAD(list); |
995 | 989 | ||
996 | node = numa_mem_id(); | ||
997 | |||
998 | /* | ||
999 | * Make sure we are not freeing a object from another node to the array | ||
1000 | * cache on this cpu. | ||
1001 | */ | ||
1002 | if (likely(nodeid == node)) | ||
1003 | return 0; | ||
1004 | |||
1005 | n = get_node(cachep, node); | 990 | n = get_node(cachep, node); |
1006 | STATS_INC_NODEFREES(cachep); | 991 | STATS_INC_NODEFREES(cachep); |
1007 | if (n->alien && n->alien[nodeid]) { | 992 | if (n->alien && n->alien[page_node]) { |
1008 | alien = n->alien[nodeid]; | 993 | alien = n->alien[page_node]; |
1009 | ac = &alien->ac; | 994 | ac = &alien->ac; |
1010 | spin_lock(&alien->lock); | 995 | spin_lock(&alien->lock); |
1011 | if (unlikely(ac->avail == ac->limit)) { | 996 | if (unlikely(ac->avail == ac->limit)) { |
1012 | STATS_INC_ACOVERFLOW(cachep); | 997 | STATS_INC_ACOVERFLOW(cachep); |
1013 | __drain_alien_cache(cachep, ac, nodeid, &list); | 998 | __drain_alien_cache(cachep, ac, page_node, &list); |
1014 | } | 999 | } |
1015 | ac_put_obj(cachep, ac, objp); | 1000 | ac_put_obj(cachep, ac, objp); |
1016 | spin_unlock(&alien->lock); | 1001 | spin_unlock(&alien->lock); |
1017 | slabs_destroy(cachep, &list); | 1002 | slabs_destroy(cachep, &list); |
1018 | } else { | 1003 | } else { |
1019 | n = get_node(cachep, nodeid); | 1004 | n = get_node(cachep, page_node); |
1020 | spin_lock(&n->list_lock); | 1005 | spin_lock(&n->list_lock); |
1021 | free_block(cachep, &objp, 1, nodeid, &list); | 1006 | free_block(cachep, &objp, 1, page_node, &list); |
1022 | spin_unlock(&n->list_lock); | 1007 | spin_unlock(&n->list_lock); |
1023 | slabs_destroy(cachep, &list); | 1008 | slabs_destroy(cachep, &list); |
1024 | } | 1009 | } |
1025 | return 1; | 1010 | return 1; |
1026 | } | 1011 | } |
1012 | |||
1013 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1014 | { | ||
1015 | int page_node = page_to_nid(virt_to_page(objp)); | ||
1016 | int node = numa_mem_id(); | ||
1017 | /* | ||
1018 | * Make sure we are not freeing a object from another node to the array | ||
1019 | * cache on this cpu. | ||
1020 | */ | ||
1021 | if (likely(node == page_node)) | ||
1022 | return 0; | ||
1023 | |||
1024 | return __cache_free_alien(cachep, objp, node, page_node); | ||
1025 | } | ||
1027 | #endif | 1026 | #endif |
1028 | 1027 | ||
1029 | /* | 1028 | /* |
@@ -1092,24 +1091,25 @@ static void cpuup_canceled(long cpu) | |||
1092 | struct alien_cache **alien; | 1091 | struct alien_cache **alien; |
1093 | LIST_HEAD(list); | 1092 | LIST_HEAD(list); |
1094 | 1093 | ||
1095 | /* cpu is dead; no one can alloc from it. */ | ||
1096 | nc = cachep->array[cpu]; | ||
1097 | cachep->array[cpu] = NULL; | ||
1098 | n = get_node(cachep, node); | 1094 | n = get_node(cachep, node); |
1099 | |||
1100 | if (!n) | 1095 | if (!n) |
1101 | goto free_array_cache; | 1096 | continue; |
1102 | 1097 | ||
1103 | spin_lock_irq(&n->list_lock); | 1098 | spin_lock_irq(&n->list_lock); |
1104 | 1099 | ||
1105 | /* Free limit for this kmem_cache_node */ | 1100 | /* Free limit for this kmem_cache_node */ |
1106 | n->free_limit -= cachep->batchcount; | 1101 | n->free_limit -= cachep->batchcount; |
1107 | if (nc) | 1102 | |
1103 | /* cpu is dead; no one can alloc from it. */ | ||
1104 | nc = per_cpu_ptr(cachep->cpu_cache, cpu); | ||
1105 | if (nc) { | ||
1108 | free_block(cachep, nc->entry, nc->avail, node, &list); | 1106 | free_block(cachep, nc->entry, nc->avail, node, &list); |
1107 | nc->avail = 0; | ||
1108 | } | ||
1109 | 1109 | ||
1110 | if (!cpumask_empty(mask)) { | 1110 | if (!cpumask_empty(mask)) { |
1111 | spin_unlock_irq(&n->list_lock); | 1111 | spin_unlock_irq(&n->list_lock); |
1112 | goto free_array_cache; | 1112 | goto free_slab; |
1113 | } | 1113 | } |
1114 | 1114 | ||
1115 | shared = n->shared; | 1115 | shared = n->shared; |
@@ -1129,9 +1129,9 @@ static void cpuup_canceled(long cpu) | |||
1129 | drain_alien_cache(cachep, alien); | 1129 | drain_alien_cache(cachep, alien); |
1130 | free_alien_cache(alien); | 1130 | free_alien_cache(alien); |
1131 | } | 1131 | } |
1132 | free_array_cache: | 1132 | |
1133 | free_slab: | ||
1133 | slabs_destroy(cachep, &list); | 1134 | slabs_destroy(cachep, &list); |
1134 | kfree(nc); | ||
1135 | } | 1135 | } |
1136 | /* | 1136 | /* |
1137 | * In the previous loop, all the objects were freed to | 1137 | * In the previous loop, all the objects were freed to |
@@ -1168,32 +1168,23 @@ static int cpuup_prepare(long cpu) | |||
1168 | * array caches | 1168 | * array caches |
1169 | */ | 1169 | */ |
1170 | list_for_each_entry(cachep, &slab_caches, list) { | 1170 | list_for_each_entry(cachep, &slab_caches, list) { |
1171 | struct array_cache *nc; | ||
1172 | struct array_cache *shared = NULL; | 1171 | struct array_cache *shared = NULL; |
1173 | struct alien_cache **alien = NULL; | 1172 | struct alien_cache **alien = NULL; |
1174 | 1173 | ||
1175 | nc = alloc_arraycache(node, cachep->limit, | ||
1176 | cachep->batchcount, GFP_KERNEL); | ||
1177 | if (!nc) | ||
1178 | goto bad; | ||
1179 | if (cachep->shared) { | 1174 | if (cachep->shared) { |
1180 | shared = alloc_arraycache(node, | 1175 | shared = alloc_arraycache(node, |
1181 | cachep->shared * cachep->batchcount, | 1176 | cachep->shared * cachep->batchcount, |
1182 | 0xbaadf00d, GFP_KERNEL); | 1177 | 0xbaadf00d, GFP_KERNEL); |
1183 | if (!shared) { | 1178 | if (!shared) |
1184 | kfree(nc); | ||
1185 | goto bad; | 1179 | goto bad; |
1186 | } | ||
1187 | } | 1180 | } |
1188 | if (use_alien_caches) { | 1181 | if (use_alien_caches) { |
1189 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); | 1182 | alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); |
1190 | if (!alien) { | 1183 | if (!alien) { |
1191 | kfree(shared); | 1184 | kfree(shared); |
1192 | kfree(nc); | ||
1193 | goto bad; | 1185 | goto bad; |
1194 | } | 1186 | } |
1195 | } | 1187 | } |
1196 | cachep->array[cpu] = nc; | ||
1197 | n = get_node(cachep, node); | 1188 | n = get_node(cachep, node); |
1198 | BUG_ON(!n); | 1189 | BUG_ON(!n); |
1199 | 1190 | ||
@@ -1385,15 +1376,6 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) | |||
1385 | } | 1376 | } |
1386 | 1377 | ||
1387 | /* | 1378 | /* |
1388 | * The memory after the last cpu cache pointer is used for the | ||
1389 | * the node pointer. | ||
1390 | */ | ||
1391 | static void setup_node_pointer(struct kmem_cache *cachep) | ||
1392 | { | ||
1393 | cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; | ||
1394 | } | ||
1395 | |||
1396 | /* | ||
1397 | * Initialisation. Called after the page allocator have been initialised and | 1379 | * Initialisation. Called after the page allocator have been initialised and |
1398 | * before smp_init(). | 1380 | * before smp_init(). |
1399 | */ | 1381 | */ |
@@ -1404,7 +1386,6 @@ void __init kmem_cache_init(void) | |||
1404 | BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < | 1386 | BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < |
1405 | sizeof(struct rcu_head)); | 1387 | sizeof(struct rcu_head)); |
1406 | kmem_cache = &kmem_cache_boot; | 1388 | kmem_cache = &kmem_cache_boot; |
1407 | setup_node_pointer(kmem_cache); | ||
1408 | 1389 | ||
1409 | if (num_possible_nodes() == 1) | 1390 | if (num_possible_nodes() == 1) |
1410 | use_alien_caches = 0; | 1391 | use_alien_caches = 0; |
@@ -1412,8 +1393,6 @@ void __init kmem_cache_init(void) | |||
1412 | for (i = 0; i < NUM_INIT_LISTS; i++) | 1393 | for (i = 0; i < NUM_INIT_LISTS; i++) |
1413 | kmem_cache_node_init(&init_kmem_cache_node[i]); | 1394 | kmem_cache_node_init(&init_kmem_cache_node[i]); |
1414 | 1395 | ||
1415 | set_up_node(kmem_cache, CACHE_CACHE); | ||
1416 | |||
1417 | /* | 1396 | /* |
1418 | * Fragmentation resistance on low memory - only use bigger | 1397 | * Fragmentation resistance on low memory - only use bigger |
1419 | * page orders on machines with more than 32MB of memory if | 1398 | * page orders on machines with more than 32MB of memory if |
@@ -1448,49 +1427,22 @@ void __init kmem_cache_init(void) | |||
1448 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1427 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1449 | */ | 1428 | */ |
1450 | create_boot_cache(kmem_cache, "kmem_cache", | 1429 | create_boot_cache(kmem_cache, "kmem_cache", |
1451 | offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1430 | offsetof(struct kmem_cache, node) + |
1452 | nr_node_ids * sizeof(struct kmem_cache_node *), | 1431 | nr_node_ids * sizeof(struct kmem_cache_node *), |
1453 | SLAB_HWCACHE_ALIGN); | 1432 | SLAB_HWCACHE_ALIGN); |
1454 | list_add(&kmem_cache->list, &slab_caches); | 1433 | list_add(&kmem_cache->list, &slab_caches); |
1455 | 1434 | slab_state = PARTIAL; | |
1456 | /* 2+3) create the kmalloc caches */ | ||
1457 | 1435 | ||
1458 | /* | 1436 | /* |
1459 | * Initialize the caches that provide memory for the array cache and the | 1437 | * Initialize the caches that provide memory for the kmem_cache_node |
1460 | * kmem_cache_node structures first. Without this, further allocations will | 1438 | * structures first. Without this, further allocations will bug. |
1461 | * bug. | ||
1462 | */ | 1439 | */ |
1463 | 1440 | kmalloc_caches[INDEX_NODE] = create_kmalloc_cache("kmalloc-node", | |
1464 | kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", | ||
1465 | kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); | ||
1466 | |||
1467 | if (INDEX_AC != INDEX_NODE) | ||
1468 | kmalloc_caches[INDEX_NODE] = | ||
1469 | create_kmalloc_cache("kmalloc-node", | ||
1470 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); | 1441 | kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS); |
1442 | slab_state = PARTIAL_NODE; | ||
1471 | 1443 | ||
1472 | slab_early_init = 0; | 1444 | slab_early_init = 0; |
1473 | 1445 | ||
1474 | /* 4) Replace the bootstrap head arrays */ | ||
1475 | { | ||
1476 | struct array_cache *ptr; | ||
1477 | |||
1478 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | ||
1479 | |||
1480 | memcpy(ptr, cpu_cache_get(kmem_cache), | ||
1481 | sizeof(struct arraycache_init)); | ||
1482 | |||
1483 | kmem_cache->array[smp_processor_id()] = ptr; | ||
1484 | |||
1485 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); | ||
1486 | |||
1487 | BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) | ||
1488 | != &initarray_generic.cache); | ||
1489 | memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), | ||
1490 | sizeof(struct arraycache_init)); | ||
1491 | |||
1492 | kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; | ||
1493 | } | ||
1494 | /* 5) Replace the bootstrap kmem_cache_node */ | 1446 | /* 5) Replace the bootstrap kmem_cache_node */ |
1495 | { | 1447 | { |
1496 | int nid; | 1448 | int nid; |
@@ -1498,13 +1450,8 @@ void __init kmem_cache_init(void) | |||
1498 | for_each_online_node(nid) { | 1450 | for_each_online_node(nid) { |
1499 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); | 1451 | init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid); |
1500 | 1452 | ||
1501 | init_list(kmalloc_caches[INDEX_AC], | 1453 | init_list(kmalloc_caches[INDEX_NODE], |
1502 | &init_kmem_cache_node[SIZE_AC + nid], nid); | ||
1503 | |||
1504 | if (INDEX_AC != INDEX_NODE) { | ||
1505 | init_list(kmalloc_caches[INDEX_NODE], | ||
1506 | &init_kmem_cache_node[SIZE_NODE + nid], nid); | 1454 | &init_kmem_cache_node[SIZE_NODE + nid], nid); |
1507 | } | ||
1508 | } | 1455 | } |
1509 | } | 1456 | } |
1510 | 1457 | ||
@@ -2037,56 +1984,53 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2037 | return left_over; | 1984 | return left_over; |
2038 | } | 1985 | } |
2039 | 1986 | ||
1987 | static struct array_cache __percpu *alloc_kmem_cache_cpus( | ||
1988 | struct kmem_cache *cachep, int entries, int batchcount) | ||
1989 | { | ||
1990 | int cpu; | ||
1991 | size_t size; | ||
1992 | struct array_cache __percpu *cpu_cache; | ||
1993 | |||
1994 | size = sizeof(void *) * entries + sizeof(struct array_cache); | ||
1995 | cpu_cache = __alloc_percpu(size, 0); | ||
1996 | |||
1997 | if (!cpu_cache) | ||
1998 | return NULL; | ||
1999 | |||
2000 | for_each_possible_cpu(cpu) { | ||
2001 | init_arraycache(per_cpu_ptr(cpu_cache, cpu), | ||
2002 | entries, batchcount); | ||
2003 | } | ||
2004 | |||
2005 | return cpu_cache; | ||
2006 | } | ||
2007 | |||
2040 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | 2008 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2041 | { | 2009 | { |
2042 | if (slab_state >= FULL) | 2010 | if (slab_state >= FULL) |
2043 | return enable_cpucache(cachep, gfp); | 2011 | return enable_cpucache(cachep, gfp); |
2044 | 2012 | ||
2013 | cachep->cpu_cache = alloc_kmem_cache_cpus(cachep, 1, 1); | ||
2014 | if (!cachep->cpu_cache) | ||
2015 | return 1; | ||
2016 | |||
2045 | if (slab_state == DOWN) { | 2017 | if (slab_state == DOWN) { |
2046 | /* | 2018 | /* Creation of first cache (kmem_cache). */ |
2047 | * Note: Creation of first cache (kmem_cache). | 2019 | set_up_node(kmem_cache, CACHE_CACHE); |
2048 | * The setup_node is taken care | ||
2049 | * of by the caller of __kmem_cache_create | ||
2050 | */ | ||
2051 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
2052 | slab_state = PARTIAL; | ||
2053 | } else if (slab_state == PARTIAL) { | 2020 | } else if (slab_state == PARTIAL) { |
2054 | /* | 2021 | /* For kmem_cache_node */ |
2055 | * Note: the second kmem_cache_create must create the cache | 2022 | set_up_node(cachep, SIZE_NODE); |
2056 | * that's used by kmalloc(24), otherwise the creation of | ||
2057 | * further caches will BUG(). | ||
2058 | */ | ||
2059 | cachep->array[smp_processor_id()] = &initarray_generic.cache; | ||
2060 | |||
2061 | /* | ||
2062 | * If the cache that's used by kmalloc(sizeof(kmem_cache_node)) is | ||
2063 | * the second cache, then we need to set up all its node/, | ||
2064 | * otherwise the creation of further caches will BUG(). | ||
2065 | */ | ||
2066 | set_up_node(cachep, SIZE_AC); | ||
2067 | if (INDEX_AC == INDEX_NODE) | ||
2068 | slab_state = PARTIAL_NODE; | ||
2069 | else | ||
2070 | slab_state = PARTIAL_ARRAYCACHE; | ||
2071 | } else { | 2023 | } else { |
2072 | /* Remaining boot caches */ | 2024 | int node; |
2073 | cachep->array[smp_processor_id()] = | ||
2074 | kmalloc(sizeof(struct arraycache_init), gfp); | ||
2075 | 2025 | ||
2076 | if (slab_state == PARTIAL_ARRAYCACHE) { | 2026 | for_each_online_node(node) { |
2077 | set_up_node(cachep, SIZE_NODE); | 2027 | cachep->node[node] = kmalloc_node( |
2078 | slab_state = PARTIAL_NODE; | 2028 | sizeof(struct kmem_cache_node), gfp, node); |
2079 | } else { | 2029 | BUG_ON(!cachep->node[node]); |
2080 | int node; | 2030 | kmem_cache_node_init(cachep->node[node]); |
2081 | for_each_online_node(node) { | ||
2082 | cachep->node[node] = | ||
2083 | kmalloc_node(sizeof(struct kmem_cache_node), | ||
2084 | gfp, node); | ||
2085 | BUG_ON(!cachep->node[node]); | ||
2086 | kmem_cache_node_init(cachep->node[node]); | ||
2087 | } | ||
2088 | } | 2031 | } |
2089 | } | 2032 | } |
2033 | |||
2090 | cachep->node[numa_mem_id()]->next_reap = | 2034 | cachep->node[numa_mem_id()]->next_reap = |
2091 | jiffies + REAPTIMEOUT_NODE + | 2035 | jiffies + REAPTIMEOUT_NODE + |
2092 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; | 2036 | ((unsigned long)cachep) % REAPTIMEOUT_NODE; |
@@ -2100,6 +2044,32 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2100 | return 0; | 2044 | return 0; |
2101 | } | 2045 | } |
2102 | 2046 | ||
2047 | unsigned long kmem_cache_flags(unsigned long object_size, | ||
2048 | unsigned long flags, const char *name, | ||
2049 | void (*ctor)(void *)) | ||
2050 | { | ||
2051 | return flags; | ||
2052 | } | ||
2053 | |||
2054 | struct kmem_cache * | ||
2055 | __kmem_cache_alias(const char *name, size_t size, size_t align, | ||
2056 | unsigned long flags, void (*ctor)(void *)) | ||
2057 | { | ||
2058 | struct kmem_cache *cachep; | ||
2059 | |||
2060 | cachep = find_mergeable(size, align, flags, name, ctor); | ||
2061 | if (cachep) { | ||
2062 | cachep->refcount++; | ||
2063 | |||
2064 | /* | ||
2065 | * Adjust the object sizes so that we clear | ||
2066 | * the complete object on kzalloc. | ||
2067 | */ | ||
2068 | cachep->object_size = max_t(int, cachep->object_size, size); | ||
2069 | } | ||
2070 | return cachep; | ||
2071 | } | ||
2072 | |||
2103 | /** | 2073 | /** |
2104 | * __kmem_cache_create - Create a cache. | 2074 | * __kmem_cache_create - Create a cache. |
2105 | * @cachep: cache management descriptor | 2075 | * @cachep: cache management descriptor |
@@ -2183,7 +2153,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2183 | else | 2153 | else |
2184 | gfp = GFP_NOWAIT; | 2154 | gfp = GFP_NOWAIT; |
2185 | 2155 | ||
2186 | setup_node_pointer(cachep); | ||
2187 | #if DEBUG | 2156 | #if DEBUG |
2188 | 2157 | ||
2189 | /* | 2158 | /* |
@@ -2440,8 +2409,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) | |||
2440 | if (rc) | 2409 | if (rc) |
2441 | return rc; | 2410 | return rc; |
2442 | 2411 | ||
2443 | for_each_online_cpu(i) | 2412 | free_percpu(cachep->cpu_cache); |
2444 | kfree(cachep->array[i]); | ||
2445 | 2413 | ||
2446 | /* NUMA: free the node structures */ | 2414 | /* NUMA: free the node structures */ |
2447 | for_each_kmem_cache_node(cachep, i, n) { | 2415 | for_each_kmem_cache_node(cachep, i, n) { |
@@ -3399,7 +3367,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3399 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) | 3367 | if (nr_online_nodes > 1 && cache_free_alien(cachep, objp)) |
3400 | return; | 3368 | return; |
3401 | 3369 | ||
3402 | if (likely(ac->avail < ac->limit)) { | 3370 | if (ac->avail < ac->limit) { |
3403 | STATS_INC_FREEHIT(cachep); | 3371 | STATS_INC_FREEHIT(cachep); |
3404 | } else { | 3372 | } else { |
3405 | STATS_INC_FREEMISS(cachep); | 3373 | STATS_INC_FREEMISS(cachep); |
@@ -3496,7 +3464,6 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) | |||
3496 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); | 3464 | return kmem_cache_alloc_node_trace(cachep, flags, node, size); |
3497 | } | 3465 | } |
3498 | 3466 | ||
3499 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | ||
3500 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 3467 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
3501 | { | 3468 | { |
3502 | return __do_kmalloc_node(size, flags, node, _RET_IP_); | 3469 | return __do_kmalloc_node(size, flags, node, _RET_IP_); |
@@ -3509,13 +3476,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, | |||
3509 | return __do_kmalloc_node(size, flags, node, caller); | 3476 | return __do_kmalloc_node(size, flags, node, caller); |
3510 | } | 3477 | } |
3511 | EXPORT_SYMBOL(__kmalloc_node_track_caller); | 3478 | EXPORT_SYMBOL(__kmalloc_node_track_caller); |
3512 | #else | ||
3513 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | ||
3514 | { | ||
3515 | return __do_kmalloc_node(size, flags, node, 0); | ||
3516 | } | ||
3517 | EXPORT_SYMBOL(__kmalloc_node); | ||
3518 | #endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ | ||
3519 | #endif /* CONFIG_NUMA */ | 3479 | #endif /* CONFIG_NUMA */ |
3520 | 3480 | ||
3521 | /** | 3481 | /** |
@@ -3541,8 +3501,6 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3541 | return ret; | 3501 | return ret; |
3542 | } | 3502 | } |
3543 | 3503 | ||
3544 | |||
3545 | #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) | ||
3546 | void *__kmalloc(size_t size, gfp_t flags) | 3504 | void *__kmalloc(size_t size, gfp_t flags) |
3547 | { | 3505 | { |
3548 | return __do_kmalloc(size, flags, _RET_IP_); | 3506 | return __do_kmalloc(size, flags, _RET_IP_); |
@@ -3555,14 +3513,6 @@ void *__kmalloc_track_caller(size_t size, gfp_t flags, unsigned long caller) | |||
3555 | } | 3513 | } |
3556 | EXPORT_SYMBOL(__kmalloc_track_caller); | 3514 | EXPORT_SYMBOL(__kmalloc_track_caller); |
3557 | 3515 | ||
3558 | #else | ||
3559 | void *__kmalloc(size_t size, gfp_t flags) | ||
3560 | { | ||
3561 | return __do_kmalloc(size, flags, 0); | ||
3562 | } | ||
3563 | EXPORT_SYMBOL(__kmalloc); | ||
3564 | #endif | ||
3565 | |||
3566 | /** | 3516 | /** |
3567 | * kmem_cache_free - Deallocate an object | 3517 | * kmem_cache_free - Deallocate an object |
3568 | * @cachep: The cache the allocation was from. | 3518 | * @cachep: The cache the allocation was from. |
@@ -3707,72 +3657,45 @@ fail: | |||
3707 | return -ENOMEM; | 3657 | return -ENOMEM; |
3708 | } | 3658 | } |
3709 | 3659 | ||
3710 | struct ccupdate_struct { | ||
3711 | struct kmem_cache *cachep; | ||
3712 | struct array_cache *new[0]; | ||
3713 | }; | ||
3714 | |||
3715 | static void do_ccupdate_local(void *info) | ||
3716 | { | ||
3717 | struct ccupdate_struct *new = info; | ||
3718 | struct array_cache *old; | ||
3719 | |||
3720 | check_irq_off(); | ||
3721 | old = cpu_cache_get(new->cachep); | ||
3722 | |||
3723 | new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; | ||
3724 | new->new[smp_processor_id()] = old; | ||
3725 | } | ||
3726 | |||
3727 | /* Always called with the slab_mutex held */ | 3660 | /* Always called with the slab_mutex held */ |
3728 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, | 3661 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, |
3729 | int batchcount, int shared, gfp_t gfp) | 3662 | int batchcount, int shared, gfp_t gfp) |
3730 | { | 3663 | { |
3731 | struct ccupdate_struct *new; | 3664 | struct array_cache __percpu *cpu_cache, *prev; |
3732 | int i; | 3665 | int cpu; |
3733 | 3666 | ||
3734 | new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *), | 3667 | cpu_cache = alloc_kmem_cache_cpus(cachep, limit, batchcount); |
3735 | gfp); | 3668 | if (!cpu_cache) |
3736 | if (!new) | ||
3737 | return -ENOMEM; | 3669 | return -ENOMEM; |
3738 | 3670 | ||
3739 | for_each_online_cpu(i) { | 3671 | prev = cachep->cpu_cache; |
3740 | new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, | 3672 | cachep->cpu_cache = cpu_cache; |
3741 | batchcount, gfp); | 3673 | kick_all_cpus_sync(); |
3742 | if (!new->new[i]) { | ||
3743 | for (i--; i >= 0; i--) | ||
3744 | kfree(new->new[i]); | ||
3745 | kfree(new); | ||
3746 | return -ENOMEM; | ||
3747 | } | ||
3748 | } | ||
3749 | new->cachep = cachep; | ||
3750 | |||
3751 | on_each_cpu(do_ccupdate_local, (void *)new, 1); | ||
3752 | 3674 | ||
3753 | check_irq_on(); | 3675 | check_irq_on(); |
3754 | cachep->batchcount = batchcount; | 3676 | cachep->batchcount = batchcount; |
3755 | cachep->limit = limit; | 3677 | cachep->limit = limit; |
3756 | cachep->shared = shared; | 3678 | cachep->shared = shared; |
3757 | 3679 | ||
3758 | for_each_online_cpu(i) { | 3680 | if (!prev) |
3681 | goto alloc_node; | ||
3682 | |||
3683 | for_each_online_cpu(cpu) { | ||
3759 | LIST_HEAD(list); | 3684 | LIST_HEAD(list); |
3760 | struct array_cache *ccold = new->new[i]; | ||
3761 | int node; | 3685 | int node; |
3762 | struct kmem_cache_node *n; | 3686 | struct kmem_cache_node *n; |
3687 | struct array_cache *ac = per_cpu_ptr(prev, cpu); | ||
3763 | 3688 | ||
3764 | if (!ccold) | 3689 | node = cpu_to_mem(cpu); |
3765 | continue; | ||
3766 | |||
3767 | node = cpu_to_mem(i); | ||
3768 | n = get_node(cachep, node); | 3690 | n = get_node(cachep, node); |
3769 | spin_lock_irq(&n->list_lock); | 3691 | spin_lock_irq(&n->list_lock); |
3770 | free_block(cachep, ccold->entry, ccold->avail, node, &list); | 3692 | free_block(cachep, ac->entry, ac->avail, node, &list); |
3771 | spin_unlock_irq(&n->list_lock); | 3693 | spin_unlock_irq(&n->list_lock); |
3772 | slabs_destroy(cachep, &list); | 3694 | slabs_destroy(cachep, &list); |
3773 | kfree(ccold); | ||
3774 | } | 3695 | } |
3775 | kfree(new); | 3696 | free_percpu(prev); |
3697 | |||
3698 | alloc_node: | ||
3776 | return alloc_kmem_cache_node(cachep, gfp); | 3699 | return alloc_kmem_cache_node(cachep, gfp); |
3777 | } | 3700 | } |
3778 | 3701 | ||
@@ -4255,19 +4178,15 @@ static const struct seq_operations slabstats_op = { | |||
4255 | 4178 | ||
4256 | static int slabstats_open(struct inode *inode, struct file *file) | 4179 | static int slabstats_open(struct inode *inode, struct file *file) |
4257 | { | 4180 | { |
4258 | unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL); | 4181 | unsigned long *n; |
4259 | int ret = -ENOMEM; | 4182 | |
4260 | if (n) { | 4183 | n = __seq_open_private(file, &slabstats_op, PAGE_SIZE); |
4261 | ret = seq_open(file, &slabstats_op); | 4184 | if (!n) |
4262 | if (!ret) { | 4185 | return -ENOMEM; |
4263 | struct seq_file *m = file->private_data; | 4186 | |
4264 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); | 4187 | *n = PAGE_SIZE / (2 * sizeof(unsigned long)); |
4265 | m->private = n; | 4188 | |
4266 | n = NULL; | 4189 | return 0; |
4267 | } | ||
4268 | kfree(n); | ||
4269 | } | ||
4270 | return ret; | ||
4271 | } | 4190 | } |
4272 | 4191 | ||
4273 | static const struct file_operations proc_slabstats_operations = { | 4192 | static const struct file_operations proc_slabstats_operations = { |
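The mm/slab.c hunks above replace the NR_CPUS-sized pointer array at the end of struct kmem_cache (cachep->array[smp_processor_id()]) with a single percpu allocation, cachep->cpu_cache, built in alloc_kmem_cache_cpus(), read through this_cpu_ptr()/per_cpu_ptr() and released with free_percpu(). The sketch below is only a plain userspace analogue of that sizing and per-CPU initialisation: alloc_cpu_caches() and the struct layout are invented for illustration, and calloc() stands in for __alloc_percpu().

/* Userspace analogue of the per-CPU array-cache layout: one cache per
 * "CPU", each sized for 'entries' object pointers.  Not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct array_cache {
	unsigned int avail;       /* objects currently cached */
	unsigned int limit;       /* capacity of entry[] */
	unsigned int batchcount;
	void *entry[];            /* flexible array, sized at allocation */
};

static struct array_cache **alloc_cpu_caches(int nr_cpus, int entries,
					     int batchcount)
{
	/* Same sizing idea as alloc_kmem_cache_cpus() above. */
	size_t size = sizeof(struct array_cache) + sizeof(void *) * entries;
	struct array_cache **caches = calloc(nr_cpus, sizeof(*caches));
	int cpu;

	if (!caches)
		return NULL;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		caches[cpu] = calloc(1, size);
		if (!caches[cpu])
			goto fail;
		caches[cpu]->limit = entries;
		caches[cpu]->batchcount = batchcount;
	}
	return caches;
fail:
	while (cpu--)
		free(caches[cpu]);
	free(caches);
	return NULL;
}

int main(void)
{
	struct array_cache **caches = alloc_cpu_caches(4, 120, 60);

	if (!caches)
		return 1;
	printf("each per-cpu cache: %zu bytes\n",
	       sizeof(struct array_cache) + sizeof(void *) * 120);
	for (int cpu = 0; cpu < 4; cpu++)
		free(caches[cpu]);
	free(caches);
	return 0;
}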
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -4,6 +4,41 @@ | |||
4 | * Internal slab definitions | 4 | * Internal slab definitions |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #ifdef CONFIG_SLOB | ||
8 | /* | ||
9 | * Common fields provided in kmem_cache by all slab allocators | ||
10 | * This struct is either used directly by the allocator (SLOB) | ||
11 | * or the allocator must include definitions for all fields | ||
12 | * provided in kmem_cache_common in their definition of kmem_cache. | ||
13 | * | ||
14 | * Once we can do anonymous structs (C11 standard) we could put a | ||
15 | * anonymous struct definition in these allocators so that the | ||
16 | * separate allocations in the kmem_cache structure of SLAB and | ||
17 | * SLUB is no longer needed. | ||
18 | */ | ||
19 | struct kmem_cache { | ||
20 | unsigned int object_size;/* The original size of the object */ | ||
21 | unsigned int size; /* The aligned/padded/added on size */ | ||
22 | unsigned int align; /* Alignment as calculated */ | ||
23 | unsigned long flags; /* Active flags on the slab */ | ||
24 | const char *name; /* Slab name for sysfs */ | ||
25 | int refcount; /* Use counter */ | ||
26 | void (*ctor)(void *); /* Called on object slot creation */ | ||
27 | struct list_head list; /* List of all slab caches on the system */ | ||
28 | }; | ||
29 | |||
30 | #endif /* CONFIG_SLOB */ | ||
31 | |||
32 | #ifdef CONFIG_SLAB | ||
33 | #include <linux/slab_def.h> | ||
34 | #endif | ||
35 | |||
36 | #ifdef CONFIG_SLUB | ||
37 | #include <linux/slub_def.h> | ||
38 | #endif | ||
39 | |||
40 | #include <linux/memcontrol.h> | ||
41 | |||
7 | /* | 42 | /* |
8 | * State of the slab allocator. | 43 | * State of the slab allocator. |
9 | * | 44 | * |
@@ -15,7 +50,6 @@ | |||
15 | enum slab_state { | 50 | enum slab_state { |
16 | DOWN, /* No slab functionality yet */ | 51 | DOWN, /* No slab functionality yet */ |
17 | PARTIAL, /* SLUB: kmem_cache_node available */ | 52 | PARTIAL, /* SLUB: kmem_cache_node available */ |
18 | PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ | ||
19 | PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ | 53 | PARTIAL_NODE, /* SLAB: kmalloc size for node struct available */ |
20 | UP, /* Slab caches usable but not all extras yet */ | 54 | UP, /* Slab caches usable but not all extras yet */ |
21 | FULL /* Everything is working */ | 55 | FULL /* Everything is working */ |
@@ -53,15 +87,30 @@ extern void create_boot_cache(struct kmem_cache *, const char *name, | |||
53 | size_t size, unsigned long flags); | 87 | size_t size, unsigned long flags); |
54 | 88 | ||
55 | struct mem_cgroup; | 89 | struct mem_cgroup; |
56 | #ifdef CONFIG_SLUB | 90 | |
91 | int slab_unmergeable(struct kmem_cache *s); | ||
92 | struct kmem_cache *find_mergeable(size_t size, size_t align, | ||
93 | unsigned long flags, const char *name, void (*ctor)(void *)); | ||
94 | #ifndef CONFIG_SLOB | ||
57 | struct kmem_cache * | 95 | struct kmem_cache * |
58 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 96 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
59 | unsigned long flags, void (*ctor)(void *)); | 97 | unsigned long flags, void (*ctor)(void *)); |
98 | |||
99 | unsigned long kmem_cache_flags(unsigned long object_size, | ||
100 | unsigned long flags, const char *name, | ||
101 | void (*ctor)(void *)); | ||
60 | #else | 102 | #else |
61 | static inline struct kmem_cache * | 103 | static inline struct kmem_cache * |
62 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 104 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
63 | unsigned long flags, void (*ctor)(void *)) | 105 | unsigned long flags, void (*ctor)(void *)) |
64 | { return NULL; } | 106 | { return NULL; } |
107 | |||
108 | static inline unsigned long kmem_cache_flags(unsigned long object_size, | ||
109 | unsigned long flags, const char *name, | ||
110 | void (*ctor)(void *)) | ||
111 | { | ||
112 | return flags; | ||
113 | } | ||
65 | #endif | 114 | #endif |
66 | 115 | ||
67 | 116 | ||
@@ -303,8 +352,8 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
303 | * a kmem_cache_node structure allocated (which is true for all online nodes) | 352 | * a kmem_cache_node structure allocated (which is true for all online nodes) |
304 | */ | 353 | */ |
305 | #define for_each_kmem_cache_node(__s, __node, __n) \ | 354 | #define for_each_kmem_cache_node(__s, __node, __n) \ |
306 | for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ | 355 | for (__node = 0; __node < nr_node_ids; __node++) \ |
307 | if (__n) | 356 | if ((__n = get_node(__s, __node))) |
308 | 357 | ||
309 | #endif | 358 | #endif |
310 | 359 | ||
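The for_each_kmem_cache_node() change just above reorders the loop so the node index is range-checked before get_node() is evaluated; the old form evaluated get_node(__s, __node) once with __node == nr_node_ids before the comparison stopped the loop. A minimal userspace demo of the corrected pattern, with invented names:

/* Bound test first, then the (possibly NULL) fetch, then skip empty
 * slots.  Everything here is made up for illustration. */
#include <stdio.h>

#define NR_SLOTS 4

static const char *slots[NR_SLOTS] = { "node0", NULL, "node2", NULL };

static const char *get_slot(int idx)
{
	return slots[idx];	/* only valid for idx < NR_SLOTS */
}

#define for_each_present_slot(idx, ptr)			\
	for (idx = 0; idx < NR_SLOTS; idx++)		\
		if ((ptr = get_slot(idx)))

int main(void)
{
	int idx;
	const char *ptr;

	for_each_present_slot(idx, ptr)
		printf("slot %d: %s\n", idx, ptr);
	return 0;
}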
diff --git a/mm/slab_common.c b/mm/slab_common.c index d319502b2403..3a6e0cfdf03a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -30,6 +30,43 @@ LIST_HEAD(slab_caches); | |||
30 | DEFINE_MUTEX(slab_mutex); | 30 | DEFINE_MUTEX(slab_mutex); |
31 | struct kmem_cache *kmem_cache; | 31 | struct kmem_cache *kmem_cache; |
32 | 32 | ||
33 | /* | ||
34 | * Set of flags that will prevent slab merging | ||
35 | */ | ||
36 | #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
37 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | ||
38 | SLAB_FAILSLAB) | ||
39 | |||
40 | #define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | ||
41 | SLAB_CACHE_DMA | SLAB_NOTRACK) | ||
42 | |||
43 | /* | ||
44 | * Merge control. If this is set then no merging of slab caches will occur. | ||
45 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
46 | */ | ||
47 | static int slab_nomerge; | ||
48 | |||
49 | static int __init setup_slab_nomerge(char *str) | ||
50 | { | ||
51 | slab_nomerge = 1; | ||
52 | return 1; | ||
53 | } | ||
54 | |||
55 | #ifdef CONFIG_SLUB | ||
56 | __setup_param("slub_nomerge", slub_nomerge, setup_slab_nomerge, 0); | ||
57 | #endif | ||
58 | |||
59 | __setup("slab_nomerge", setup_slab_nomerge); | ||
60 | |||
61 | /* | ||
62 | * Determine the size of a slab object | ||
63 | */ | ||
64 | unsigned int kmem_cache_size(struct kmem_cache *s) | ||
65 | { | ||
66 | return s->object_size; | ||
67 | } | ||
68 | EXPORT_SYMBOL(kmem_cache_size); | ||
69 | |||
33 | #ifdef CONFIG_DEBUG_VM | 70 | #ifdef CONFIG_DEBUG_VM |
34 | static int kmem_cache_sanity_check(const char *name, size_t size) | 71 | static int kmem_cache_sanity_check(const char *name, size_t size) |
35 | { | 72 | { |
@@ -79,6 +116,65 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size) | |||
79 | #endif | 116 | #endif |
80 | 117 | ||
81 | #ifdef CONFIG_MEMCG_KMEM | 118 | #ifdef CONFIG_MEMCG_KMEM |
119 | static int memcg_alloc_cache_params(struct mem_cgroup *memcg, | ||
120 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
121 | { | ||
122 | size_t size; | ||
123 | |||
124 | if (!memcg_kmem_enabled()) | ||
125 | return 0; | ||
126 | |||
127 | if (!memcg) { | ||
128 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
129 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
130 | } else | ||
131 | size = sizeof(struct memcg_cache_params); | ||
132 | |||
133 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
134 | if (!s->memcg_params) | ||
135 | return -ENOMEM; | ||
136 | |||
137 | if (memcg) { | ||
138 | s->memcg_params->memcg = memcg; | ||
139 | s->memcg_params->root_cache = root_cache; | ||
140 | } else | ||
141 | s->memcg_params->is_root_cache = true; | ||
142 | |||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | static void memcg_free_cache_params(struct kmem_cache *s) | ||
147 | { | ||
148 | kfree(s->memcg_params); | ||
149 | } | ||
150 | |||
151 | static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) | ||
152 | { | ||
153 | int size; | ||
154 | struct memcg_cache_params *new_params, *cur_params; | ||
155 | |||
156 | BUG_ON(!is_root_cache(s)); | ||
157 | |||
158 | size = offsetof(struct memcg_cache_params, memcg_caches); | ||
159 | size += num_memcgs * sizeof(void *); | ||
160 | |||
161 | new_params = kzalloc(size, GFP_KERNEL); | ||
162 | if (!new_params) | ||
163 | return -ENOMEM; | ||
164 | |||
165 | cur_params = s->memcg_params; | ||
166 | memcpy(new_params->memcg_caches, cur_params->memcg_caches, | ||
167 | memcg_limited_groups_array_size * sizeof(void *)); | ||
168 | |||
169 | new_params->is_root_cache = true; | ||
170 | |||
171 | rcu_assign_pointer(s->memcg_params, new_params); | ||
172 | if (cur_params) | ||
173 | kfree_rcu(cur_params, rcu_head); | ||
174 | |||
175 | return 0; | ||
176 | } | ||
177 | |||
82 | int memcg_update_all_caches(int num_memcgs) | 178 | int memcg_update_all_caches(int num_memcgs) |
83 | { | 179 | { |
84 | struct kmem_cache *s; | 180 | struct kmem_cache *s; |
@@ -89,9 +185,8 @@ int memcg_update_all_caches(int num_memcgs) | |||
89 | if (!is_root_cache(s)) | 185 | if (!is_root_cache(s)) |
90 | continue; | 186 | continue; |
91 | 187 | ||
92 | ret = memcg_update_cache_size(s, num_memcgs); | 188 | ret = memcg_update_cache_params(s, num_memcgs); |
93 | /* | 189 | /* |
94 | * See comment in memcontrol.c, memcg_update_cache_size: | ||
95 | * Instead of freeing the memory, we'll just leave the caches | 190 | * Instead of freeing the memory, we'll just leave the caches |
96 | * up to this point in an updated state. | 191 | * up to this point in an updated state. |
97 | */ | 192 | */ |
@@ -104,7 +199,80 @@ out: | |||
104 | mutex_unlock(&slab_mutex); | 199 | mutex_unlock(&slab_mutex); |
105 | return ret; | 200 | return ret; |
106 | } | 201 | } |
107 | #endif | 202 | #else |
203 | static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, | ||
204 | struct kmem_cache *s, struct kmem_cache *root_cache) | ||
205 | { | ||
206 | return 0; | ||
207 | } | ||
208 | |||
209 | static inline void memcg_free_cache_params(struct kmem_cache *s) | ||
210 | { | ||
211 | } | ||
212 | #endif /* CONFIG_MEMCG_KMEM */ | ||
213 | |||
214 | /* | ||
215 | * Find a mergeable slab cache | ||
216 | */ | ||
217 | int slab_unmergeable(struct kmem_cache *s) | ||
218 | { | ||
219 | if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE)) | ||
220 | return 1; | ||
221 | |||
222 | if (!is_root_cache(s)) | ||
223 | return 1; | ||
224 | |||
225 | if (s->ctor) | ||
226 | return 1; | ||
227 | |||
228 | /* | ||
229 | * We may have set a slab to be unmergeable during bootstrap. | ||
230 | */ | ||
231 | if (s->refcount < 0) | ||
232 | return 1; | ||
233 | |||
234 | return 0; | ||
235 | } | ||
236 | |||
237 | struct kmem_cache *find_mergeable(size_t size, size_t align, | ||
238 | unsigned long flags, const char *name, void (*ctor)(void *)) | ||
239 | { | ||
240 | struct kmem_cache *s; | ||
241 | |||
242 | if (slab_nomerge || (flags & SLAB_NEVER_MERGE)) | ||
243 | return NULL; | ||
244 | |||
245 | if (ctor) | ||
246 | return NULL; | ||
247 | |||
248 | size = ALIGN(size, sizeof(void *)); | ||
249 | align = calculate_alignment(flags, align, size); | ||
250 | size = ALIGN(size, align); | ||
251 | flags = kmem_cache_flags(size, flags, name, NULL); | ||
252 | |||
253 | list_for_each_entry(s, &slab_caches, list) { | ||
254 | if (slab_unmergeable(s)) | ||
255 | continue; | ||
256 | |||
257 | if (size > s->size) | ||
258 | continue; | ||
259 | |||
260 | if ((flags & SLAB_MERGE_SAME) != (s->flags & SLAB_MERGE_SAME)) | ||
261 | continue; | ||
262 | /* | ||
263 | * Check if alignment is compatible. | ||
264 | * Courtesy of Adrian Drzewiecki | ||
265 | */ | ||
266 | if ((s->size & ~(align - 1)) != s->size) | ||
267 | continue; | ||
268 | |||
269 | if (s->size - size >= sizeof(void *)) | ||
270 | continue; | ||
271 | |||
272 | return s; | ||
273 | } | ||
274 | return NULL; | ||
275 | } | ||
108 | 276 | ||
109 | /* | 277 | /* |
110 | * Figure out what the alignment of the objects will be given a set of | 278 | * Figure out what the alignment of the objects will be given a set of |
@@ -211,8 +379,10 @@ kmem_cache_create(const char *name, size_t size, size_t align, | |||
211 | mutex_lock(&slab_mutex); | 379 | mutex_lock(&slab_mutex); |
212 | 380 | ||
213 | err = kmem_cache_sanity_check(name, size); | 381 | err = kmem_cache_sanity_check(name, size); |
214 | if (err) | 382 | if (err) { |
383 | s = NULL; /* suppress uninit var warning */ | ||
215 | goto out_unlock; | 384 | goto out_unlock; |
385 | } | ||
216 | 386 | ||
217 | /* | 387 | /* |
218 | * Some allocators will constraint the set of valid flags to a subset | 388 | * Some allocators will constraint the set of valid flags to a subset |
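With this file, slab_unmergeable() and find_mergeable() live in mm/slab_common.c and are shared by SLAB (via the new __kmem_cache_alias() added to mm/slab.c above) and SLUB, whose private copies are deleted further down. The merge test boils down to: no debug/ctor caches, the requested object must fit, the "must match" flags must agree, alignment must be compatible, and little space may be wasted. A rough userspace sketch of that decision, with made-up flag bits standing in for SLAB_NEVER_MERGE/SLAB_MERGE_SAME:

/* Illustrative only: the struct and flag values are stand-ins. */
#include <stddef.h>
#include <stdio.h>

#define FLAG_NEVER_MERGE  0x1UL	/* e.g. poisoning/debugging enabled */
#define FLAG_MERGE_SAME   0x2UL	/* e.g. DMA: must match to merge */

struct cache {
	const char   *name;
	size_t        size;	/* padded object size */
	size_t        align;
	unsigned long flags;
	int           refcount;
	void        (*ctor)(void *);
};

static int mergeable_with(const struct cache *s, size_t size, size_t align,
			  unsigned long flags)
{
	if ((s->flags | flags) & FLAG_NEVER_MERGE)
		return 0;
	if (s->ctor || s->refcount < 0)
		return 0;
	if (size > s->size)			/* object must fit */
		return 0;
	if ((flags & FLAG_MERGE_SAME) != (s->flags & FLAG_MERGE_SAME))
		return 0;
	if (s->size & (align - 1))		/* alignment compatible? */
		return 0;
	if (s->size - size >= sizeof(void *))	/* too much wasted space */
		return 0;
	return 1;
}

int main(void)
{
	struct cache existing = { "kmalloc-64", 64, 8, 0, 1, NULL };

	printf("60-byte cache merges into kmalloc-64: %d\n",
	       mergeable_with(&existing, 60, 8, 0));
	printf("40-byte cache merges into kmalloc-64: %d\n",
	       mergeable_with(&existing, 40, 8, 0));
	return 0;
}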
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -468,7 +468,6 @@ void *__kmalloc(size_t size, gfp_t gfp) | |||
468 | } | 468 | } |
469 | EXPORT_SYMBOL(__kmalloc); | 469 | EXPORT_SYMBOL(__kmalloc); |
470 | 470 | ||
471 | #ifdef CONFIG_TRACING | ||
472 | void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) | 471 | void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller) |
473 | { | 472 | { |
474 | return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); | 473 | return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller); |
@@ -481,7 +480,6 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp, | |||
481 | return __do_kmalloc_node(size, gfp, node, caller); | 480 | return __do_kmalloc_node(size, gfp, node, caller); |
482 | } | 481 | } |
483 | #endif | 482 | #endif |
484 | #endif | ||
485 | 483 | ||
486 | void kfree(const void *block) | 484 | void kfree(const void *block) |
487 | { | 485 | { |
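These mm/slob.c hunks, like the mm/slab.c ones earlier, drop the CONFIG_TRACING/CONFIG_DEBUG_SLAB conditionals so the *_track_caller variants are always built and __kmalloc() always records a real caller address via _RET_IP_. The userspace sketch below illustrates the caller-tracking idea with __builtin_return_address(); the helper names are invented and it is GCC/Clang-specific.

#include <stdio.h>
#include <stdlib.h>

static void *do_alloc(size_t size, unsigned long caller)
{
	void *p = malloc(size);

	/* The kernel would feed 'caller' into its debug/leak tracking. */
	printf("alloc %zu bytes for caller %#lx -> %p\n", size, caller, p);
	return p;
}

/* Like __kmalloc(): the caller is this function's own return address. */
static void * __attribute__((noinline)) my_alloc(size_t size)
{
	return do_alloc(size, (unsigned long)__builtin_return_address(0));
}

/* Like __kmalloc_track_caller(): the caller is supplied explicitly. */
static void *my_alloc_track_caller(size_t size, unsigned long caller)
{
	return do_alloc(size, caller);
}

int main(void)
{
	void *a = my_alloc(32);
	void *b = my_alloc_track_caller(64,
			(unsigned long)__builtin_return_address(0));

	free(a);
	free(b);
	return 0;
}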
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -169,16 +169,6 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) | |||
169 | */ | 169 | */ |
170 | #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) | 170 | #define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) |
171 | 171 | ||
172 | /* | ||
173 | * Set of flags that will prevent slab merging | ||
174 | */ | ||
175 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | ||
176 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ | ||
177 | SLAB_FAILSLAB) | ||
178 | |||
179 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | ||
180 | SLAB_CACHE_DMA | SLAB_NOTRACK) | ||
181 | |||
182 | #define OO_SHIFT 16 | 172 | #define OO_SHIFT 16 |
183 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 173 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
184 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ | 174 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
@@ -1176,7 +1166,7 @@ out: | |||
1176 | 1166 | ||
1177 | __setup("slub_debug", setup_slub_debug); | 1167 | __setup("slub_debug", setup_slub_debug); |
1178 | 1168 | ||
1179 | static unsigned long kmem_cache_flags(unsigned long object_size, | 1169 | unsigned long kmem_cache_flags(unsigned long object_size, |
1180 | unsigned long flags, const char *name, | 1170 | unsigned long flags, const char *name, |
1181 | void (*ctor)(void *)) | 1171 | void (*ctor)(void *)) |
1182 | { | 1172 | { |
@@ -1208,7 +1198,7 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1208 | struct page *page) {} | 1198 | struct page *page) {} |
1209 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1199 | static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1210 | struct page *page) {} | 1200 | struct page *page) {} |
1211 | static inline unsigned long kmem_cache_flags(unsigned long object_size, | 1201 | unsigned long kmem_cache_flags(unsigned long object_size, |
1212 | unsigned long flags, const char *name, | 1202 | unsigned long flags, const char *name, |
1213 | void (*ctor)(void *)) | 1203 | void (*ctor)(void *)) |
1214 | { | 1204 | { |
@@ -1699,7 +1689,12 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, | |||
1699 | struct kmem_cache_cpu *c) | 1689 | struct kmem_cache_cpu *c) |
1700 | { | 1690 | { |
1701 | void *object; | 1691 | void *object; |
1702 | int searchnode = (node == NUMA_NO_NODE) ? numa_mem_id() : node; | 1692 | int searchnode = node; |
1693 | |||
1694 | if (node == NUMA_NO_NODE) | ||
1695 | searchnode = numa_mem_id(); | ||
1696 | else if (!node_present_pages(node)) | ||
1697 | searchnode = node_to_mem_node(node); | ||
1703 | 1698 | ||
1704 | object = get_partial_node(s, get_node(s, searchnode), c, flags); | 1699 | object = get_partial_node(s, get_node(s, searchnode), c, flags); |
1705 | if (object || node != NUMA_NO_NODE) | 1700 | if (object || node != NUMA_NO_NODE) |
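The get_partial() change above redirects an allocation aimed at a memoryless node to node_to_mem_node(node), and the __slab_alloc() hunk below applies the same check before deactivating the current cpu slab. A toy model of that fallback follows; the topology tables and helper names are invented for the example.

#include <stdio.h>

#define NR_NODES	4
#define NODE_ANY	(-1)

static const int node_has_memory[NR_NODES]   = { 1, 0, 1, 0 };
/* nearest node that actually has memory, per firmware/NUMA setup */
static const int node_to_mem_node[NR_NODES]  = { 0, 0, 2, 2 };

static int pick_alloc_node(int requested, int local_node)
{
	if (requested == NODE_ANY)
		return local_node;
	if (!node_has_memory[requested])
		return node_to_mem_node[requested];	/* fallback */
	return requested;
}

int main(void)
{
	printf("request node 1 -> allocate from node %d\n",
	       pick_alloc_node(1, 0));
	printf("request node 2 -> allocate from node %d\n",
	       pick_alloc_node(2, 0));
	printf("no preference   -> allocate from node %d\n",
	       pick_alloc_node(NODE_ANY, 0));
	return 0;
}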
@@ -2280,11 +2275,18 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
2280 | redo: | 2275 | redo: |
2281 | 2276 | ||
2282 | if (unlikely(!node_match(page, node))) { | 2277 | if (unlikely(!node_match(page, node))) { |
2283 | stat(s, ALLOC_NODE_MISMATCH); | 2278 | int searchnode = node; |
2284 | deactivate_slab(s, page, c->freelist); | 2279 | |
2285 | c->page = NULL; | 2280 | if (node != NUMA_NO_NODE && !node_present_pages(node)) |
2286 | c->freelist = NULL; | 2281 | searchnode = node_to_mem_node(node); |
2287 | goto new_slab; | 2282 | |
2283 | if (unlikely(!node_match(page, searchnode))) { | ||
2284 | stat(s, ALLOC_NODE_MISMATCH); | ||
2285 | deactivate_slab(s, page, c->freelist); | ||
2286 | c->page = NULL; | ||
2287 | c->freelist = NULL; | ||
2288 | goto new_slab; | ||
2289 | } | ||
2288 | } | 2290 | } |
2289 | 2291 | ||
2290 | /* | 2292 | /* |
@@ -2707,12 +2709,6 @@ static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; | |||
2707 | static int slub_min_objects; | 2709 | static int slub_min_objects; |
2708 | 2710 | ||
2709 | /* | 2711 | /* |
2710 | * Merge control. If this is set then no merging of slab caches will occur. | ||
2711 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
2712 | */ | ||
2713 | static int slub_nomerge; | ||
2714 | |||
2715 | /* | ||
2716 | * Calculate the order of allocation given an slab object size. | 2712 | * Calculate the order of allocation given an slab object size. |
2717 | * | 2713 | * |
2718 | * The order of allocation has significant impact on performance and other | 2714 | * The order of allocation has significant impact on performance and other |
@@ -3240,14 +3236,6 @@ static int __init setup_slub_min_objects(char *str) | |||
3240 | 3236 | ||
3241 | __setup("slub_min_objects=", setup_slub_min_objects); | 3237 | __setup("slub_min_objects=", setup_slub_min_objects); |
3242 | 3238 | ||
3243 | static int __init setup_slub_nomerge(char *str) | ||
3244 | { | ||
3245 | slub_nomerge = 1; | ||
3246 | return 1; | ||
3247 | } | ||
3248 | |||
3249 | __setup("slub_nomerge", setup_slub_nomerge); | ||
3250 | |||
3251 | void *__kmalloc(size_t size, gfp_t flags) | 3239 | void *__kmalloc(size_t size, gfp_t flags) |
3252 | { | 3240 | { |
3253 | struct kmem_cache *s; | 3241 | struct kmem_cache *s; |
@@ -3625,69 +3613,6 @@ void __init kmem_cache_init_late(void) | |||
3625 | { | 3613 | { |
3626 | } | 3614 | } |
3627 | 3615 | ||
3628 | /* | ||
3629 | * Find a mergeable slab cache | ||
3630 | */ | ||
3631 | static int slab_unmergeable(struct kmem_cache *s) | ||
3632 | { | ||
3633 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) | ||
3634 | return 1; | ||
3635 | |||
3636 | if (!is_root_cache(s)) | ||
3637 | return 1; | ||
3638 | |||
3639 | if (s->ctor) | ||
3640 | return 1; | ||
3641 | |||
3642 | /* | ||
3643 | * We may have set a slab to be unmergeable during bootstrap. | ||
3644 | */ | ||
3645 | if (s->refcount < 0) | ||
3646 | return 1; | ||
3647 | |||
3648 | return 0; | ||
3649 | } | ||
3650 | |||
3651 | static struct kmem_cache *find_mergeable(size_t size, size_t align, | ||
3652 | unsigned long flags, const char *name, void (*ctor)(void *)) | ||
3653 | { | ||
3654 | struct kmem_cache *s; | ||
3655 | |||
3656 | if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) | ||
3657 | return NULL; | ||
3658 | |||
3659 | if (ctor) | ||
3660 | return NULL; | ||
3661 | |||
3662 | size = ALIGN(size, sizeof(void *)); | ||
3663 | align = calculate_alignment(flags, align, size); | ||
3664 | size = ALIGN(size, align); | ||
3665 | flags = kmem_cache_flags(size, flags, name, NULL); | ||
3666 | |||
3667 | list_for_each_entry(s, &slab_caches, list) { | ||
3668 | if (slab_unmergeable(s)) | ||
3669 | continue; | ||
3670 | |||
3671 | if (size > s->size) | ||
3672 | continue; | ||
3673 | |||
3674 | if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) | ||
3675 | continue; | ||
3676 | /* | ||
3677 | * Check if alignment is compatible. | ||
3678 | * Courtesy of Adrian Drzewiecki | ||
3679 | */ | ||
3680 | if ((s->size & ~(align - 1)) != s->size) | ||
3681 | continue; | ||
3682 | |||
3683 | if (s->size - size >= sizeof(void *)) | ||
3684 | continue; | ||
3685 | |||
3686 | return s; | ||
3687 | } | ||
3688 | return NULL; | ||
3689 | } | ||
3690 | |||
3691 | struct kmem_cache * | 3616 | struct kmem_cache * |
3692 | __kmem_cache_alias(const char *name, size_t size, size_t align, | 3617 | __kmem_cache_alias(const char *name, size_t size, size_t align, |
3693 | unsigned long flags, void (*ctor)(void *)) | 3618 | unsigned long flags, void (*ctor)(void *)) |
@@ -4604,6 +4529,14 @@ static ssize_t trace_show(struct kmem_cache *s, char *buf) | |||
4604 | static ssize_t trace_store(struct kmem_cache *s, const char *buf, | 4529 | static ssize_t trace_store(struct kmem_cache *s, const char *buf, |
4605 | size_t length) | 4530 | size_t length) |
4606 | { | 4531 | { |
4532 | /* | ||
4533 | * Tracing a merged cache is going to give confusing results | ||
4534 | * as well as cause other issues like converting a mergeable | ||
4535 | * cache into an umergeable one. | ||
4536 | */ | ||
4537 | if (s->refcount > 1) | ||
4538 | return -EINVAL; | ||
4539 | |||
4607 | s->flags &= ~SLAB_TRACE; | 4540 | s->flags &= ~SLAB_TRACE; |
4608 | if (buf[0] == '1') { | 4541 | if (buf[0] == '1') { |
4609 | s->flags &= ~__CMPXCHG_DOUBLE; | 4542 | s->flags &= ~__CMPXCHG_DOUBLE; |
@@ -4721,6 +4654,9 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) | |||
4721 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | 4654 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, |
4722 | size_t length) | 4655 | size_t length) |
4723 | { | 4656 | { |
4657 | if (s->refcount > 1) | ||
4658 | return -EINVAL; | ||
4659 | |||
4724 | s->flags &= ~SLAB_FAILSLAB; | 4660 | s->flags &= ~SLAB_FAILSLAB; |
4725 | if (buf[0] == '1') | 4661 | if (buf[0] == '1') |
4726 | s->flags |= SLAB_FAILSLAB; | 4662 | s->flags |= SLAB_FAILSLAB; |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -887,18 +887,14 @@ void lru_add_drain_all(void) | |||
887 | mutex_unlock(&lock); | 887 | mutex_unlock(&lock); |
888 | } | 888 | } |
889 | 889 | ||
890 | /* | 890 | /** |
891 | * Batched page_cache_release(). Decrement the reference count on all the | 891 | * release_pages - batched page_cache_release() |
892 | * passed pages. If it fell to zero then remove the page from the LRU and | 892 | * @pages: array of pages to release |
893 | * free it. | 893 | * @nr: number of pages |
894 | * | 894 | * @cold: whether the pages are cache cold |
895 | * Avoid taking zone->lru_lock if possible, but if it is taken, retain it | ||
896 | * for the remainder of the operation. | ||
897 | * | 895 | * |
898 | * The locking in this function is against shrink_inactive_list(): we recheck | 896 | * Decrement the reference count on all the pages in @pages. If it |
899 | * the page count inside the lock to see whether shrink_inactive_list() | 897 | * fell to zero, remove the page from the LRU and free it. |
900 | * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() | ||
901 | * will free it. | ||
902 | */ | 898 | */ |
903 | void release_pages(struct page **pages, int nr, bool cold) | 899 | void release_pages(struct page **pages, int nr, bool cold) |
904 | { | 900 | { |
@@ -907,6 +903,7 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
907 | struct zone *zone = NULL; | 903 | struct zone *zone = NULL; |
908 | struct lruvec *lruvec; | 904 | struct lruvec *lruvec; |
909 | unsigned long uninitialized_var(flags); | 905 | unsigned long uninitialized_var(flags); |
906 | unsigned int uninitialized_var(lock_batch); | ||
910 | 907 | ||
911 | for (i = 0; i < nr; i++) { | 908 | for (i = 0; i < nr; i++) { |
912 | struct page *page = pages[i]; | 909 | struct page *page = pages[i]; |
@@ -920,6 +917,16 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
920 | continue; | 917 | continue; |
921 | } | 918 | } |
922 | 919 | ||
920 | /* | ||
921 | * Make sure the IRQ-safe lock-holding time does not get | ||
922 | * excessive with a continuous string of pages from the | ||
923 | * same zone. The lock is held only if zone != NULL. | ||
924 | */ | ||
925 | if (zone && ++lock_batch == SWAP_CLUSTER_MAX) { | ||
926 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
927 | zone = NULL; | ||
928 | } | ||
929 | |||
923 | if (!put_page_testzero(page)) | 930 | if (!put_page_testzero(page)) |
924 | continue; | 931 | continue; |
925 | 932 | ||
@@ -930,6 +937,7 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
930 | if (zone) | 937 | if (zone) |
931 | spin_unlock_irqrestore(&zone->lru_lock, | 938 | spin_unlock_irqrestore(&zone->lru_lock, |
932 | flags); | 939 | flags); |
940 | lock_batch = 0; | ||
933 | zone = pagezone; | 941 | zone = pagezone; |
934 | spin_lock_irqsave(&zone->lru_lock, flags); | 942 | spin_lock_irqsave(&zone->lru_lock, flags); |
935 | } | 943 | } |
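The release_pages() hunks above add lock batching: after SWAP_CLUSTER_MAX consecutive pages from the same zone, the IRQ-disabled zone->lru_lock is dropped and re-taken so the hold time stays bounded. Below is a userspace analogue of the same pattern with a plain mutex; BATCH stands in for SWAP_CLUSTER_MAX and the work under the lock is faked.

#include <pthread.h>
#include <stdio.h>

#define NR_ITEMS 10
#define BATCH    4

static pthread_mutex_t zone_lock = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	int held = 0;			/* are we holding zone_lock? */
	unsigned int lock_batch = 0;

	for (int i = 0; i < NR_ITEMS; i++) {
		/* Take the lock lazily, resetting the batch counter. */
		if (!held) {
			pthread_mutex_lock(&zone_lock);
			held = 1;
			lock_batch = 0;
		}

		/* ... release one page under the lock ... */
		printf("item %d handled under lock\n", i);

		/* Bound the hold time: drop the lock every BATCH items. */
		if (++lock_batch == BATCH) {
			pthread_mutex_unlock(&zone_lock);
			held = 0;
		}
	}
	if (held)
		pthread_mutex_unlock(&zone_lock);
	return 0;
}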
diff --git a/mm/swap_state.c b/mm/swap_state.c index 3e0ec83d000c..154444918685 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -28,7 +28,9 @@ | |||
28 | static const struct address_space_operations swap_aops = { | 28 | static const struct address_space_operations swap_aops = { |
29 | .writepage = swap_writepage, | 29 | .writepage = swap_writepage, |
30 | .set_page_dirty = swap_set_page_dirty, | 30 | .set_page_dirty = swap_set_page_dirty, |
31 | #ifdef CONFIG_MIGRATION | ||
31 | .migratepage = migrate_page, | 32 | .migratepage = migrate_page, |
33 | #endif | ||
32 | }; | 34 | }; |
33 | 35 | ||
34 | static struct backing_dev_info swap_backing_dev_info = { | 36 | static struct backing_dev_info swap_backing_dev_info = { |
@@ -263,18 +265,12 @@ void free_page_and_swap_cache(struct page *page) | |||
263 | void free_pages_and_swap_cache(struct page **pages, int nr) | 265 | void free_pages_and_swap_cache(struct page **pages, int nr) |
264 | { | 266 | { |
265 | struct page **pagep = pages; | 267 | struct page **pagep = pages; |
268 | int i; | ||
266 | 269 | ||
267 | lru_add_drain(); | 270 | lru_add_drain(); |
268 | while (nr) { | 271 | for (i = 0; i < nr; i++) |
269 | int todo = min(nr, PAGEVEC_SIZE); | 272 | free_swap_cache(pagep[i]); |
270 | int i; | 273 | release_pages(pagep, nr, false); |
271 | |||
272 | for (i = 0; i < todo; i++) | ||
273 | free_swap_cache(pagep[i]); | ||
274 | release_pages(pagep, todo, false); | ||
275 | pagep += todo; | ||
276 | nr -= todo; | ||
277 | } | ||
278 | } | 274 | } |
279 | 275 | ||
280 | /* | 276 | /* |
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -170,32 +170,25 @@ static int vm_is_stack_for_task(struct task_struct *t, | |||
170 | /* | 170 | /* |
171 | * Check if the vma is being used as a stack. | 171 | * Check if the vma is being used as a stack. |
172 | * If is_group is non-zero, check in the entire thread group or else | 172 | * If is_group is non-zero, check in the entire thread group or else |
173 | * just check in the current task. Returns the pid of the task that | 173 | * just check in the current task. Returns the task_struct of the task |
174 | * the vma is stack for. | 174 | * that the vma is stack for. Must be called under rcu_read_lock(). |
175 | */ | 175 | */ |
176 | pid_t vm_is_stack(struct task_struct *task, | 176 | struct task_struct *task_of_stack(struct task_struct *task, |
177 | struct vm_area_struct *vma, int in_group) | 177 | struct vm_area_struct *vma, bool in_group) |
178 | { | 178 | { |
179 | pid_t ret = 0; | ||
180 | |||
181 | if (vm_is_stack_for_task(task, vma)) | 179 | if (vm_is_stack_for_task(task, vma)) |
182 | return task->pid; | 180 | return task; |
183 | 181 | ||
184 | if (in_group) { | 182 | if (in_group) { |
185 | struct task_struct *t; | 183 | struct task_struct *t; |
186 | 184 | ||
187 | rcu_read_lock(); | ||
188 | for_each_thread(task, t) { | 185 | for_each_thread(task, t) { |
189 | if (vm_is_stack_for_task(t, vma)) { | 186 | if (vm_is_stack_for_task(t, vma)) |
190 | ret = t->pid; | 187 | return t; |
191 | goto done; | ||
192 | } | ||
193 | } | 188 | } |
194 | done: | ||
195 | rcu_read_unlock(); | ||
196 | } | 189 | } |
197 | 190 | ||
198 | return ret; | 191 | return NULL; |
199 | } | 192 | } |
200 | 193 | ||
201 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) | 194 | #if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2b0aa5486092..90520af7f186 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -2646,21 +2646,11 @@ static const struct seq_operations vmalloc_op = { | |||
2646 | 2646 | ||
2647 | static int vmalloc_open(struct inode *inode, struct file *file) | 2647 | static int vmalloc_open(struct inode *inode, struct file *file) |
2648 | { | 2648 | { |
2649 | unsigned int *ptr = NULL; | 2649 | if (IS_ENABLED(CONFIG_NUMA)) |
2650 | int ret; | 2650 | return seq_open_private(file, &vmalloc_op, |
2651 | 2651 | nr_node_ids * sizeof(unsigned int)); | |
2652 | if (IS_ENABLED(CONFIG_NUMA)) { | 2652 | else |
2653 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | 2653 | return seq_open(file, &vmalloc_op); |
2654 | if (ptr == NULL) | ||
2655 | return -ENOMEM; | ||
2656 | } | ||
2657 | ret = seq_open(file, &vmalloc_op); | ||
2658 | if (!ret) { | ||
2659 | struct seq_file *m = file->private_data; | ||
2660 | m->private = ptr; | ||
2661 | } else | ||
2662 | kfree(ptr); | ||
2663 | return ret; | ||
2664 | } | 2654 | } |
2665 | 2655 | ||
2666 | static const struct file_operations proc_vmalloc_operations = { | 2656 | static const struct file_operations proc_vmalloc_operations = { |
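Both slabstats_open() earlier and vmalloc_open() here are reduced to __seq_open_private()/seq_open_private(), which allocate the per-open private buffer and attach it to the seq_file in one call, removing the hand-rolled allocate/open/attach/unwind sequence. The sketch below is only a userspace analogue of folding the private allocation into the open helper; the types and names are invented.

#include <stdio.h>
#include <stdlib.h>

struct reader {
	FILE *fp;
	void *private;		/* per-open scratch buffer */
};

/* Open 'path' and attach a zeroed private buffer of 'psize' bytes,
 * so callers get a single success/failure point. */
static struct reader *open_with_private(const char *path, size_t psize)
{
	struct reader *r = calloc(1, sizeof(*r));

	if (!r)
		return NULL;
	r->private = calloc(1, psize);
	r->fp = fopen(path, "r");
	if (!r->private || !r->fp) {
		if (r->fp)
			fclose(r->fp);
		free(r->private);
		free(r);
		return NULL;
	}
	return r;
}

static void close_reader(struct reader *r)
{
	fclose(r->fp);
	free(r->private);
	free(r);
}

int main(void)
{
	struct reader *r = open_with_private("/proc/self/status", 128);

	if (!r)
		return 1;
	/* ... use r->fp and r->private ... */
	close_reader(r);
	return 0;
}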
diff --git a/mm/vmscan.c b/mm/vmscan.c index 2836b5373b2e..dcb47074ae03 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -920,7 +920,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
920 | /* Case 1 above */ | 920 | /* Case 1 above */ |
921 | if (current_is_kswapd() && | 921 | if (current_is_kswapd() && |
922 | PageReclaim(page) && | 922 | PageReclaim(page) && |
923 | zone_is_reclaim_writeback(zone)) { | 923 | test_bit(ZONE_WRITEBACK, &zone->flags)) { |
924 | nr_immediate++; | 924 | nr_immediate++; |
925 | goto keep_locked; | 925 | goto keep_locked; |
926 | 926 | ||
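Here and in the hunks below, mm/vmscan.c drops the zone_set_flag()/zone_clear_flag()/zone_is_reclaim_*() wrappers in favour of plain set_bit()/clear_bit()/test_bit() on zone->flags, with ZONE_TAIL_LRU_DIRTY renamed to ZONE_DIRTY. The sketch below shows the general pattern of keeping state as bit numbers in one flags word; the helpers here are non-atomic stand-ins for the kernel's atomic bitops.

#include <stdio.h>

enum zone_flag_bits {
	ZONE_CONGESTED,
	ZONE_DIRTY,		/* was ZONE_TAIL_LRU_DIRTY */
	ZONE_WRITEBACK,
};

struct zone {
	unsigned long flags;
};

static void set_flag(struct zone *z, int bit)   { z->flags |=  1UL << bit; }
static void clear_flag(struct zone *z, int bit) { z->flags &= ~(1UL << bit); }
static int  test_flag(struct zone *z, int bit)  { return !!(z->flags & (1UL << bit)); }

int main(void)
{
	struct zone z = { 0 };

	set_flag(&z, ZONE_WRITEBACK);
	printf("writeback flagged: %d\n", test_flag(&z, ZONE_WRITEBACK));
	clear_flag(&z, ZONE_WRITEBACK);
	printf("writeback flagged: %d\n", test_flag(&z, ZONE_WRITEBACK));
	return 0;
}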
@@ -1002,7 +1002,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1002 | */ | 1002 | */ |
1003 | if (page_is_file_cache(page) && | 1003 | if (page_is_file_cache(page) && |
1004 | (!current_is_kswapd() || | 1004 | (!current_is_kswapd() || |
1005 | !zone_is_reclaim_dirty(zone))) { | 1005 | !test_bit(ZONE_DIRTY, &zone->flags))) { |
1006 | /* | 1006 | /* |
1007 | * Immediately reclaim when written back. | 1007 | * Immediately reclaim when written back. |
1008 | * Similar in principal to deactivate_page() | 1008 | * Similar in principal to deactivate_page() |
@@ -1563,7 +1563,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1563 | * are encountered in the nr_immediate check below. | 1563 | * are encountered in the nr_immediate check below. |
1564 | */ | 1564 | */ |
1565 | if (nr_writeback && nr_writeback == nr_taken) | 1565 | if (nr_writeback && nr_writeback == nr_taken) |
1566 | zone_set_flag(zone, ZONE_WRITEBACK); | 1566 | set_bit(ZONE_WRITEBACK, &zone->flags); |
1567 | 1567 | ||
1568 | /* | 1568 | /* |
1569 | * memcg will stall in page writeback so only consider forcibly | 1569 | * memcg will stall in page writeback so only consider forcibly |
@@ -1575,16 +1575,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1575 | * backed by a congested BDI and wait_iff_congested will stall. | 1575 | * backed by a congested BDI and wait_iff_congested will stall. |
1576 | */ | 1576 | */ |
1577 | if (nr_dirty && nr_dirty == nr_congested) | 1577 | if (nr_dirty && nr_dirty == nr_congested) |
1578 | zone_set_flag(zone, ZONE_CONGESTED); | 1578 | set_bit(ZONE_CONGESTED, &zone->flags); |
1579 | 1579 | ||
1580 | /* | 1580 | /* |
1581 | * If dirty pages are scanned that are not queued for IO, it | 1581 | * If dirty pages are scanned that are not queued for IO, it |
1582 | * implies that flushers are not keeping up. In this case, flag | 1582 | * implies that flushers are not keeping up. In this case, flag |
1583 | * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing | 1583 | * the zone ZONE_DIRTY and kswapd will start writing pages from |
1584 | * pages from reclaim context. | 1584 | * reclaim context. |
1585 | */ | 1585 | */ |
1586 | if (nr_unqueued_dirty == nr_taken) | 1586 | if (nr_unqueued_dirty == nr_taken) |
1587 | zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); | 1587 | set_bit(ZONE_DIRTY, &zone->flags); |
1588 | 1588 | ||
1589 | /* | 1589 | /* |
1590 | * If kswapd scans pages marked marked for immediate | 1590 | * If kswapd scans pages marked marked for immediate |
@@ -2315,7 +2315,10 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc) | |||
2315 | return reclaimable; | 2315 | return reclaimable; |
2316 | } | 2316 | } |
2317 | 2317 | ||
2318 | /* Returns true if compaction should go ahead for a high-order request */ | 2318 | /* |
2319 | * Returns true if compaction should go ahead for a high-order request, or | ||
2320 | * the high-order allocation would succeed without compaction. | ||
2321 | */ | ||
2319 | static inline bool compaction_ready(struct zone *zone, int order) | 2322 | static inline bool compaction_ready(struct zone *zone, int order) |
2320 | { | 2323 | { |
2321 | unsigned long balance_gap, watermark; | 2324 | unsigned long balance_gap, watermark; |
@@ -2339,8 +2342,11 @@ static inline bool compaction_ready(struct zone *zone, int order) | |||
2339 | if (compaction_deferred(zone, order)) | 2342 | if (compaction_deferred(zone, order)) |
2340 | return watermark_ok; | 2343 | return watermark_ok; |
2341 | 2344 | ||
2342 | /* If compaction is not ready to start, keep reclaiming */ | 2345 | /* |
2343 | if (!compaction_suitable(zone, order)) | 2346 | * If compaction is not ready to start and allocation is not likely |
2347 | * to succeed without it, then keep reclaiming. | ||
2348 | */ | ||
2349 | if (compaction_suitable(zone, order) == COMPACT_SKIPPED) | ||
2344 | return false; | 2350 | return false; |
2345 | 2351 | ||
2346 | return watermark_ok; | 2352 | return watermark_ok; |
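Both compaction_ready() here and zone_balanced() below now compare the result of compaction_suitable() against COMPACT_SKIPPED instead of treating it as a boolean, so reclaim only continues when compaction genuinely lacks free memory. A hedged standalone sketch of that decision; the return-value names are assumed from include/linux/compaction.h of this series and mirrored locally:

#include <stdio.h>

/* Assumed return values of compaction_suitable(), mirrored locally so
 * the decision logic can be shown standalone. */
enum demo_compact_result {
        DEMO_COMPACT_SKIPPED,   /* too little free memory to compact     */
        DEMO_COMPACT_CONTINUE,  /* compaction can run right away         */
        DEMO_COMPACT_PARTIAL,   /* the allocation should already succeed */
};

/* Keep reclaiming only for the "skipped" case; otherwise either
 * compaction or the allocation itself can go ahead. */
static int keep_reclaiming(enum demo_compact_result result)
{
        return result == DEMO_COMPACT_SKIPPED;
}

int main(void)
{
        printf("skipped -> keep reclaiming: %d\n",
               keep_reclaiming(DEMO_COMPACT_SKIPPED));          /* 1 */
        printf("partial -> keep reclaiming: %d\n",
               keep_reclaiming(DEMO_COMPACT_PARTIAL));          /* 0 */
        return 0;
}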
@@ -2753,21 +2759,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | |||
2753 | } | 2759 | } |
2754 | 2760 | ||
2755 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | 2761 | unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, |
2762 | unsigned long nr_pages, | ||
2756 | gfp_t gfp_mask, | 2763 | gfp_t gfp_mask, |
2757 | bool noswap) | 2764 | bool may_swap) |
2758 | { | 2765 | { |
2759 | struct zonelist *zonelist; | 2766 | struct zonelist *zonelist; |
2760 | unsigned long nr_reclaimed; | 2767 | unsigned long nr_reclaimed; |
2761 | int nid; | 2768 | int nid; |
2762 | struct scan_control sc = { | 2769 | struct scan_control sc = { |
2763 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2770 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
2764 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 2771 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | |
2765 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 2772 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
2766 | .target_mem_cgroup = memcg, | 2773 | .target_mem_cgroup = memcg, |
2767 | .priority = DEF_PRIORITY, | 2774 | .priority = DEF_PRIORITY, |
2768 | .may_writepage = !laptop_mode, | 2775 | .may_writepage = !laptop_mode, |
2769 | .may_unmap = 1, | 2776 | .may_unmap = 1, |
2770 | .may_swap = !noswap, | 2777 | .may_swap = may_swap, |
2771 | }; | 2778 | }; |
2772 | 2779 | ||
2773 | /* | 2780 | /* |
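try_to_free_mem_cgroup_pages() now takes an explicit page count and clamps it from below with max(), so small requests still reclaim a full swap cluster while larger memcg batch requests are honoured as asked. A standalone check of the clamping, assuming the usual in-tree SWAP_CLUSTER_MAX of 32:

#include <stdio.h>

#define DEMO_SWAP_CLUSTER_MAX 32UL      /* assumed in-tree value */

/* nr_to_reclaim as computed by the new scan_control initializer:
 * round small requests up to one swap cluster, pass larger batch
 * requests through unchanged. */
static unsigned long nr_to_reclaim(unsigned long nr_pages)
{
        return nr_pages > DEMO_SWAP_CLUSTER_MAX ?
               nr_pages : DEMO_SWAP_CLUSTER_MAX;
}

int main(void)
{
        printf("request %4lu pages -> reclaim %4lu\n", 1UL, nr_to_reclaim(1));
        printf("request %4lu pages -> reclaim %4lu\n", 512UL, nr_to_reclaim(512));
        return 0;
}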
@@ -2818,7 +2825,7 @@ static bool zone_balanced(struct zone *zone, int order, | |||
2818 | return false; | 2825 | return false; |
2819 | 2826 | ||
2820 | if (IS_ENABLED(CONFIG_COMPACTION) && order && | 2827 | if (IS_ENABLED(CONFIG_COMPACTION) && order && |
2821 | !compaction_suitable(zone, order)) | 2828 | compaction_suitable(zone, order) == COMPACT_SKIPPED) |
2822 | return false; | 2829 | return false; |
2823 | 2830 | ||
2824 | return true; | 2831 | return true; |
@@ -2978,7 +2985,7 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2978 | /* Account for the number of pages attempted to reclaim */ | 2985 | /* Account for the number of pages attempted to reclaim */ |
2979 | *nr_attempted += sc->nr_to_reclaim; | 2986 | *nr_attempted += sc->nr_to_reclaim; |
2980 | 2987 | ||
2981 | zone_clear_flag(zone, ZONE_WRITEBACK); | 2988 | clear_bit(ZONE_WRITEBACK, &zone->flags); |
2982 | 2989 | ||
2983 | /* | 2990 | /* |
2984 | * If a zone reaches its high watermark, consider it to be no longer | 2991 | * If a zone reaches its high watermark, consider it to be no longer |
@@ -2988,8 +2995,8 @@ static bool kswapd_shrink_zone(struct zone *zone, | |||
2988 | */ | 2995 | */ |
2989 | if (zone_reclaimable(zone) && | 2996 | if (zone_reclaimable(zone) && |
2990 | zone_balanced(zone, testorder, 0, classzone_idx)) { | 2997 | zone_balanced(zone, testorder, 0, classzone_idx)) { |
2991 | zone_clear_flag(zone, ZONE_CONGESTED); | 2998 | clear_bit(ZONE_CONGESTED, &zone->flags); |
2992 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | 2999 | clear_bit(ZONE_DIRTY, &zone->flags); |
2993 | } | 3000 | } |
2994 | 3001 | ||
2995 | return sc->nr_scanned >= sc->nr_to_reclaim; | 3002 | return sc->nr_scanned >= sc->nr_to_reclaim; |
@@ -3080,8 +3087,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, | |||
3080 | * If balanced, clear the dirty and congested | 3087 | * If balanced, clear the dirty and congested |
3081 | * flags | 3088 | * flags |
3082 | */ | 3089 | */ |
3083 | zone_clear_flag(zone, ZONE_CONGESTED); | 3090 | clear_bit(ZONE_CONGESTED, &zone->flags); |
3084 | zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); | 3091 | clear_bit(ZONE_DIRTY, &zone->flags); |
3085 | } | 3092 | } |
3086 | } | 3093 | } |
3087 | 3094 | ||
@@ -3708,11 +3715,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
3708 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) | 3715 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
3709 | return ZONE_RECLAIM_NOSCAN; | 3716 | return ZONE_RECLAIM_NOSCAN; |
3710 | 3717 | ||
3711 | if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) | 3718 | if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) |
3712 | return ZONE_RECLAIM_NOSCAN; | 3719 | return ZONE_RECLAIM_NOSCAN; |
3713 | 3720 | ||
3714 | ret = __zone_reclaim(zone, gfp_mask, order); | 3721 | ret = __zone_reclaim(zone, gfp_mask, order); |
3715 | zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); | 3722 | clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); |
3716 | 3723 | ||
3717 | if (!ret) | 3724 | if (!ret) |
3718 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); | 3725 | count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); |
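In this hunk ZONE_RECLAIM_LOCKED keeps acting as a per-zone try-lock, now taken with test_and_set_bit() and released with clear_bit(). A userspace sketch of the same pattern, with a C11 atomic_flag standing in for the kernel bitops and illustrative names throughout:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag zone_reclaim_locked = ATOMIC_FLAG_INIT;

/* Returns 0 after doing the work, -1 if another reclaimer already
 * holds the zone; mirrors the ZONE_RECLAIM_NOSCAN early return. */
static int zone_reclaim_demo(void)
{
        if (atomic_flag_test_and_set(&zone_reclaim_locked))
                return -1;

        puts("reclaiming zone");        /* __zone_reclaim() runs here */

        atomic_flag_clear(&zone_reclaim_locked);
        return 0;
}

int main(void)
{
        printf("first call:  %d\n", zone_reclaim_demo());
        printf("second call: %d\n", zone_reclaim_demo());
        return 0;
}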
@@ -3791,66 +3798,3 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages) | |||
3791 | } | 3798 | } |
3792 | } | 3799 | } |
3793 | #endif /* CONFIG_SHMEM */ | 3800 | #endif /* CONFIG_SHMEM */ |
3794 | |||
3795 | static void warn_scan_unevictable_pages(void) | ||
3796 | { | ||
3797 | printk_once(KERN_WARNING | ||
3798 | "%s: The scan_unevictable_pages sysctl/node-interface has been " | ||
3799 | "disabled for lack of a legitimate use case. If you have " | ||
3800 | "one, please send an email to linux-mm@kvack.org.\n", | ||
3801 | current->comm); | ||
3802 | } | ||
3803 | |||
3804 | /* | ||
3805 | * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of | ||
3806 | * all nodes' unevictable lists for evictable pages | ||
3807 | */ | ||
3808 | unsigned long scan_unevictable_pages; | ||
3809 | |||
3810 | int scan_unevictable_handler(struct ctl_table *table, int write, | ||
3811 | void __user *buffer, | ||
3812 | size_t *length, loff_t *ppos) | ||
3813 | { | ||
3814 | warn_scan_unevictable_pages(); | ||
3815 | proc_doulongvec_minmax(table, write, buffer, length, ppos); | ||
3816 | scan_unevictable_pages = 0; | ||
3817 | return 0; | ||
3818 | } | ||
3819 | |||
3820 | #ifdef CONFIG_NUMA | ||
3821 | /* | ||
3822 | * per node 'scan_unevictable_pages' attribute. On demand re-scan of | ||
3823 | * a specified node's per zone unevictable lists for evictable pages. | ||
3824 | */ | ||
3825 | |||
3826 | static ssize_t read_scan_unevictable_node(struct device *dev, | ||
3827 | struct device_attribute *attr, | ||
3828 | char *buf) | ||
3829 | { | ||
3830 | warn_scan_unevictable_pages(); | ||
3831 | return sprintf(buf, "0\n"); /* always zero; should fit... */ | ||
3832 | } | ||
3833 | |||
3834 | static ssize_t write_scan_unevictable_node(struct device *dev, | ||
3835 | struct device_attribute *attr, | ||
3836 | const char *buf, size_t count) | ||
3837 | { | ||
3838 | warn_scan_unevictable_pages(); | ||
3839 | return 1; | ||
3840 | } | ||
3841 | |||
3842 | |||
3843 | static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, | ||
3844 | read_scan_unevictable_node, | ||
3845 | write_scan_unevictable_node); | ||
3846 | |||
3847 | int scan_unevictable_register_node(struct node *node) | ||
3848 | { | ||
3849 | return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); | ||
3850 | } | ||
3851 | |||
3852 | void scan_unevictable_unregister_node(struct node *node) | ||
3853 | { | ||
3854 | device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); | ||
3855 | } | ||
3856 | #endif | ||
diff --git a/mm/vmstat.c b/mm/vmstat.c index e9ab104b956f..1b12d390dc68 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -7,6 +7,7 @@ | |||
7 | * zoned VM statistics | 7 | * zoned VM statistics |
8 | * Copyright (C) 2006 Silicon Graphics, Inc., | 8 | * Copyright (C) 2006 Silicon Graphics, Inc., |
9 | * Christoph Lameter <christoph@lameter.com> | 9 | * Christoph Lameter <christoph@lameter.com> |
10 | * Copyright (C) 2008-2014 Christoph Lameter | ||
10 | */ | 11 | */ |
11 | #include <linux/fs.h> | 12 | #include <linux/fs.h> |
12 | #include <linux/mm.h> | 13 | #include <linux/mm.h> |
@@ -14,6 +15,7 @@ | |||
14 | #include <linux/module.h> | 15 | #include <linux/module.h> |
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/cpu.h> | 17 | #include <linux/cpu.h> |
18 | #include <linux/cpumask.h> | ||
17 | #include <linux/vmstat.h> | 19 | #include <linux/vmstat.h> |
18 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
19 | #include <linux/math64.h> | 21 | #include <linux/math64.h> |
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) | |||
419 | EXPORT_SYMBOL(dec_zone_page_state); | 421 | EXPORT_SYMBOL(dec_zone_page_state); |
420 | #endif | 422 | #endif |
421 | 423 | ||
422 | static inline void fold_diff(int *diff) | 424 | |
425 | /* | ||
426 | * Fold a differential into the global counters. | ||
427 | * Returns the number of counters updated. | ||
428 | */ | ||
429 | static int fold_diff(int *diff) | ||
423 | { | 430 | { |
424 | int i; | 431 | int i; |
432 | int changes = 0; | ||
425 | 433 | ||
426 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 434 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
427 | if (diff[i]) | 435 | if (diff[i]) { |
428 | atomic_long_add(diff[i], &vm_stat[i]); | 436 | atomic_long_add(diff[i], &vm_stat[i]); |
437 | changes++; | ||
438 | } | ||
439 | return changes; | ||
429 | } | 440 | } |
430 | 441 | ||
431 | /* | 442 | /* |
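fold_diff() now reports how many global counters it actually touched; refresh_cpu_vm_stats() and vmstat_update() further down use that count to decide whether a CPU's vmstat worker should stay scheduled. A standalone sketch of the fold-and-count step, with illustrative array names rather than the kernel's:

#include <stdio.h>

/* Add a per-cpu differential into the global counters and return how
 * many entries were actually non-zero. */
static int fold_diff_demo(long *global, const int *diff, int n)
{
        int i, changes = 0;

        for (i = 0; i < n; i++)
                if (diff[i]) {
                        global[i] += diff[i];
                        changes++;
                }
        return changes;
}

int main(void)
{
        long global[4] = { 100, 200, 300, 400 };
        int diff[4] = { 0, 5, 0, -2 };

        printf("changes: %d, global[1]=%ld, global[3]=%ld\n",
               fold_diff_demo(global, diff, 4), global[1], global[3]);
        return 0;
}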
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff) | |||
441 | * statistics in the remote zone struct as well as the global cachelines | 452 | * statistics in the remote zone struct as well as the global cachelines |
442 | * with the global counters. These could cause remote node cache line | 453 | * with the global counters. These could cause remote node cache line |
443 | * bouncing and will have to be only done when necessary. | 454 | * bouncing and will have to be only done when necessary. |
455 | * | ||
456 | * The function returns the number of global counters updated. | ||
444 | */ | 457 | */ |
445 | static void refresh_cpu_vm_stats(void) | 458 | static int refresh_cpu_vm_stats(void) |
446 | { | 459 | { |
447 | struct zone *zone; | 460 | struct zone *zone; |
448 | int i; | 461 | int i; |
449 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; | 462 | int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; |
463 | int changes = 0; | ||
450 | 464 | ||
451 | for_each_populated_zone(zone) { | 465 | for_each_populated_zone(zone) { |
452 | struct per_cpu_pageset __percpu *p = zone->pageset; | 466 | struct per_cpu_pageset __percpu *p = zone->pageset; |
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void) | |||
486 | continue; | 500 | continue; |
487 | } | 501 | } |
488 | 502 | ||
489 | |||
490 | if (__this_cpu_dec_return(p->expire)) | 503 | if (__this_cpu_dec_return(p->expire)) |
491 | continue; | 504 | continue; |
492 | 505 | ||
493 | if (__this_cpu_read(p->pcp.count)) | 506 | if (__this_cpu_read(p->pcp.count)) { |
494 | drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); | 507 | drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); |
508 | changes++; | ||
509 | } | ||
495 | #endif | 510 | #endif |
496 | } | 511 | } |
497 | fold_diff(global_diff); | 512 | changes += fold_diff(global_diff); |
513 | return changes; | ||
498 | } | 514 | } |
499 | 515 | ||
500 | /* | 516 | /* |
@@ -735,7 +751,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, | |||
735 | TEXT_FOR_HIGHMEM(xx) xx "_movable", | 751 | TEXT_FOR_HIGHMEM(xx) xx "_movable", |
736 | 752 | ||
737 | const char * const vmstat_text[] = { | 753 | const char * const vmstat_text[] = { |
738 | /* Zoned VM counters */ | 754 | /* enum zone_stat_item counters */ |
739 | "nr_free_pages", | 755 | "nr_free_pages", |
740 | "nr_alloc_batch", | 756 | "nr_alloc_batch", |
741 | "nr_inactive_anon", | 757 | "nr_inactive_anon", |
@@ -778,10 +794,13 @@ const char * const vmstat_text[] = { | |||
778 | "workingset_nodereclaim", | 794 | "workingset_nodereclaim", |
779 | "nr_anon_transparent_hugepages", | 795 | "nr_anon_transparent_hugepages", |
780 | "nr_free_cma", | 796 | "nr_free_cma", |
797 | |||
798 | /* enum writeback_stat_item counters */ | ||
781 | "nr_dirty_threshold", | 799 | "nr_dirty_threshold", |
782 | "nr_dirty_background_threshold", | 800 | "nr_dirty_background_threshold", |
783 | 801 | ||
784 | #ifdef CONFIG_VM_EVENT_COUNTERS | 802 | #ifdef CONFIG_VM_EVENT_COUNTERS |
803 | /* enum vm_event_item counters */ | ||
785 | "pgpgin", | 804 | "pgpgin", |
786 | "pgpgout", | 805 | "pgpgout", |
787 | "pswpin", | 806 | "pswpin", |
@@ -860,6 +879,13 @@ const char * const vmstat_text[] = { | |||
860 | "thp_zero_page_alloc", | 879 | "thp_zero_page_alloc", |
861 | "thp_zero_page_alloc_failed", | 880 | "thp_zero_page_alloc_failed", |
862 | #endif | 881 | #endif |
882 | #ifdef CONFIG_MEMORY_BALLOON | ||
883 | "balloon_inflate", | ||
884 | "balloon_deflate", | ||
885 | #ifdef CONFIG_BALLOON_COMPACTION | ||
886 | "balloon_migrate", | ||
887 | #endif | ||
888 | #endif /* CONFIG_MEMORY_BALLOON */ | ||
863 | #ifdef CONFIG_DEBUG_TLBFLUSH | 889 | #ifdef CONFIG_DEBUG_TLBFLUSH |
864 | #ifdef CONFIG_SMP | 890 | #ifdef CONFIG_SMP |
865 | "nr_tlb_remote_flush", | 891 | "nr_tlb_remote_flush", |
@@ -1229,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = { | |||
1229 | #ifdef CONFIG_SMP | 1255 | #ifdef CONFIG_SMP |
1230 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); | 1256 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); |
1231 | int sysctl_stat_interval __read_mostly = HZ; | 1257 | int sysctl_stat_interval __read_mostly = HZ; |
1258 | static cpumask_var_t cpu_stat_off; | ||
1232 | 1259 | ||
1233 | static void vmstat_update(struct work_struct *w) | 1260 | static void vmstat_update(struct work_struct *w) |
1234 | { | 1261 | { |
1235 | refresh_cpu_vm_stats(); | 1262 | if (refresh_cpu_vm_stats()) |
1236 | schedule_delayed_work(this_cpu_ptr(&vmstat_work), | 1263 | /* |
1264 | * Counters were updated so we expect more updates | ||
1265 | * to occur in the future. Keep on running the | ||
1266 | * update worker thread. | ||
1267 | */ | ||
1268 | schedule_delayed_work(this_cpu_ptr(&vmstat_work), | ||
1269 | round_jiffies_relative(sysctl_stat_interval)); | ||
1270 | else { | ||
1271 | /* | ||
1272 | * We did not update any counters so the app may be in | ||
1273 | * a mode where it does not cause counter updates. | ||
1274 | * We may be uselessly running vmstat_update. | ||
1275 | * Defer the checking for differentials to the | ||
1276 | * shepherd thread on a different processor. | ||
1277 | */ | ||
1278 | int r; | ||
1279 | /* | ||
1280 | * Shepherd work thread does not race since it never | ||
1281 | * changes the bit if it's zero but the cpu | ||
1282 | * online / offline code may race if | ||
1283 | * worker threads are still allowed during | ||
1284 | * shutdown / startup. | ||
1285 | */ | ||
1286 | r = cpumask_test_and_set_cpu(smp_processor_id(), | ||
1287 | cpu_stat_off); | ||
1288 | VM_BUG_ON(r); | ||
1289 | } | ||
1290 | } | ||
1291 | |||
1292 | /* | ||
1293 | * Check if the diffs for a certain cpu indicate that | ||
1294 | * an update is needed. | ||
1295 | */ | ||
1296 | static bool need_update(int cpu) | ||
1297 | { | ||
1298 | struct zone *zone; | ||
1299 | |||
1300 | for_each_populated_zone(zone) { | ||
1301 | struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); | ||
1302 | |||
1303 | BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); | ||
1304 | /* | ||
1305 | * The fast way of checking if there are any vmstat diffs. | ||
1306 | * This works because the diffs are byte sized items. | ||
1307 | */ | ||
1308 | if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) | ||
1309 | return true; | ||
1310 | |||
1311 | } | ||
1312 | return false; | ||
1313 | } | ||
1314 | |||
1315 | |||
1316 | /* | ||
1317 | * Shepherd worker thread that checks the | ||
1318 | * differentials of processors that have their worker | ||
1319 | * threads for vm statistics updates disabled because of | ||
1320 | * inactivity. | ||
1321 | */ | ||
1322 | static void vmstat_shepherd(struct work_struct *w); | ||
1323 | |||
1324 | static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); | ||
1325 | |||
1326 | static void vmstat_shepherd(struct work_struct *w) | ||
1327 | { | ||
1328 | int cpu; | ||
1329 | |||
1330 | get_online_cpus(); | ||
1331 | /* Check processors whose vmstat worker threads have been disabled */ | ||
1332 | for_each_cpu(cpu, cpu_stat_off) | ||
1333 | if (need_update(cpu) && | ||
1334 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) | ||
1335 | |||
1336 | schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), | ||
1337 | __round_jiffies_relative(sysctl_stat_interval, cpu)); | ||
1338 | |||
1339 | put_online_cpus(); | ||
1340 | |||
1341 | schedule_delayed_work(&shepherd, | ||
1237 | round_jiffies_relative(sysctl_stat_interval)); | 1342 | round_jiffies_relative(sysctl_stat_interval)); |
1343 | |||
1238 | } | 1344 | } |
1239 | 1345 | ||
1240 | static void start_cpu_timer(int cpu) | 1346 | static void __init start_shepherd_timer(void) |
1241 | { | 1347 | { |
1242 | struct delayed_work *work = &per_cpu(vmstat_work, cpu); | 1348 | int cpu; |
1349 | |||
1350 | for_each_possible_cpu(cpu) | ||
1351 | INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), | ||
1352 | vmstat_update); | ||
1353 | |||
1354 | if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) | ||
1355 | BUG(); | ||
1356 | cpumask_copy(cpu_stat_off, cpu_online_mask); | ||
1243 | 1357 | ||
1244 | INIT_DEFERRABLE_WORK(work, vmstat_update); | 1358 | schedule_delayed_work(&shepherd, |
1245 | schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); | 1359 | round_jiffies_relative(sysctl_stat_interval)); |
1246 | } | 1360 | } |
1247 | 1361 | ||
1248 | static void vmstat_cpu_dead(int node) | 1362 | static void vmstat_cpu_dead(int node) |
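need_update() exploits the fact that the per-cpu diffs are byte-sized, so one memchr_inv() pass detects any pending delta, and quiet CPUs park themselves in cpu_stat_off until the shepherd restarts their worker. A userspace sketch of that byte-scan check; memchr_inv() is kernel-only, so it is open-coded here:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* True if any of the byte-sized counter diffs is non-zero; this is
 * what decides whether the shepherd restarts a CPU's vmstat worker. */
static bool any_pending(const signed char *diff, size_t n)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (diff[i])
                        return true;
        return false;
}

int main(void)
{
        signed char quiet[8] = { 0 };
        signed char busy[8]  = { 0, 0, 3, 0, 0, 0, 0, -1 };

        printf("quiet cpu needs update: %d\n", any_pending(quiet, 8));
        printf("busy  cpu needs update: %d\n", any_pending(busy, 8));
        return 0;
}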
@@ -1273,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, | |||
1273 | case CPU_ONLINE: | 1387 | case CPU_ONLINE: |
1274 | case CPU_ONLINE_FROZEN: | 1388 | case CPU_ONLINE_FROZEN: |
1275 | refresh_zone_stat_thresholds(); | 1389 | refresh_zone_stat_thresholds(); |
1276 | start_cpu_timer(cpu); | ||
1277 | node_set_state(cpu_to_node(cpu), N_CPU); | 1390 | node_set_state(cpu_to_node(cpu), N_CPU); |
1391 | cpumask_set_cpu(cpu, cpu_stat_off); | ||
1278 | break; | 1392 | break; |
1279 | case CPU_DOWN_PREPARE: | 1393 | case CPU_DOWN_PREPARE: |
1280 | case CPU_DOWN_PREPARE_FROZEN: | 1394 | case CPU_DOWN_PREPARE_FROZEN: |
1281 | cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); | 1395 | cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); |
1282 | per_cpu(vmstat_work, cpu).work.func = NULL; | 1396 | cpumask_clear_cpu(cpu, cpu_stat_off); |
1283 | break; | 1397 | break; |
1284 | case CPU_DOWN_FAILED: | 1398 | case CPU_DOWN_FAILED: |
1285 | case CPU_DOWN_FAILED_FROZEN: | 1399 | case CPU_DOWN_FAILED_FROZEN: |
1286 | start_cpu_timer(cpu); | 1400 | cpumask_set_cpu(cpu, cpu_stat_off); |
1287 | break; | 1401 | break; |
1288 | case CPU_DEAD: | 1402 | case CPU_DEAD: |
1289 | case CPU_DEAD_FROZEN: | 1403 | case CPU_DEAD_FROZEN: |
@@ -1303,15 +1417,10 @@ static struct notifier_block vmstat_notifier = | |||
1303 | static int __init setup_vmstat(void) | 1417 | static int __init setup_vmstat(void) |
1304 | { | 1418 | { |
1305 | #ifdef CONFIG_SMP | 1419 | #ifdef CONFIG_SMP |
1306 | int cpu; | ||
1307 | |||
1308 | cpu_notifier_register_begin(); | 1420 | cpu_notifier_register_begin(); |
1309 | __register_cpu_notifier(&vmstat_notifier); | 1421 | __register_cpu_notifier(&vmstat_notifier); |
1310 | 1422 | ||
1311 | for_each_online_cpu(cpu) { | 1423 | start_shepherd_timer(); |
1312 | start_cpu_timer(cpu); | ||
1313 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
1314 | } | ||
1315 | cpu_notifier_register_done(); | 1424 | cpu_notifier_register_done(); |
1316 | #endif | 1425 | #endif |
1317 | #ifdef CONFIG_PROC_FS | 1426 | #ifdef CONFIG_PROC_FS |
diff --git a/mm/zbud.c b/mm/zbud.c --- a/mm/zbud.c +++ b/mm/zbud.c | |||
@@ -60,15 +60,17 @@ | |||
60 | * NCHUNKS_ORDER determines the internal allocation granularity, effectively | 60 | * NCHUNKS_ORDER determines the internal allocation granularity, effectively |
61 | * adjusting internal fragmentation. It also determines the number of | 61 | * adjusting internal fragmentation. It also determines the number of |
62 | * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the | 62 | * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the |
63 | * allocation granularity will be in chunks of size PAGE_SIZE/64, and there | 63 | * allocation granularity will be in chunks of size PAGE_SIZE/64. Since one |
64 | * will be 64 freelists per pool. | 64 | * chunk in each allocated page is occupied by the zbud header, NCHUNKS works |
65 | * out to 63, the maximum number of free chunks in a zbud page, and there |
66 | * will also be 63 freelists per pool. |
65 | */ | 67 | */ |
66 | #define NCHUNKS_ORDER 6 | 68 | #define NCHUNKS_ORDER 6 |
67 | 69 | ||
68 | #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) | 70 | #define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) |
69 | #define CHUNK_SIZE (1 << CHUNK_SHIFT) | 71 | #define CHUNK_SIZE (1 << CHUNK_SHIFT) |
70 | #define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) | ||
71 | #define ZHDR_SIZE_ALIGNED CHUNK_SIZE | 72 | #define ZHDR_SIZE_ALIGNED CHUNK_SIZE |
73 | #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) | ||
72 | 74 | ||
73 | /** | 75 | /** |
74 | * struct zbud_pool - stores metadata for each zbud pool | 76 | * struct zbud_pool - stores metadata for each zbud pool |
@@ -268,10 +270,9 @@ static int num_free_chunks(struct zbud_header *zhdr) | |||
268 | { | 270 | { |
269 | /* | 271 | /* |
270 | * Rather than branch for different situations, just use the fact that | 272 | * Rather than branch for different situations, just use the fact that |
271 | * free buddies have a length of zero to simplify everything. -1 at the | 273 | * free buddies have a length of zero to simplify everything. |
272 | * end for the zbud header. | ||
273 | */ | 274 | */ |
274 | return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; | 275 | return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; |
275 | } | 276 | } |
276 | 277 | ||
277 | /***************** | 278 | /***************** |
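The zbud change can be sanity-checked in isolation: with a 4 KiB page and NCHUNKS_ORDER of 6, reserving one chunk for the header leaves NCHUNKS at 63, and num_free_chunks() no longer subtracts 1 by hand. A standalone recalculation, assuming PAGE_SHIFT of 12:

#include <stdio.h>

#define PAGE_SHIFT        12                    /* assumed 4 KiB pages */
#define PAGE_SIZE         (1UL << PAGE_SHIFT)
#define NCHUNKS_ORDER     6
#define CHUNK_SHIFT       (PAGE_SHIFT - NCHUNKS_ORDER)
#define ZHDR_SIZE_ALIGNED (1UL << CHUNK_SHIFT)  /* header takes one chunk */
#define NCHUNKS           ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

int main(void)
{
        unsigned long first_chunks = 10, last_chunks = 20;

        printf("NCHUNKS     = %lu\n", (unsigned long)NCHUNKS);  /* 63 */
        printf("free chunks = %lu\n",
               NCHUNKS - first_chunks - last_chunks);           /* 33 */
        return 0;
}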
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 94f38fac5e81..839a48c3ca27 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c | |||
@@ -175,7 +175,7 @@ enum fullness_group { | |||
175 | * n <= N / f, where | 175 | * n <= N / f, where |
176 | * n = number of allocated objects | 176 | * n = number of allocated objects |
177 | * N = total number of objects zspage can store | 177 | * N = total number of objects zspage can store |
178 | * f = 1/fullness_threshold_frac | 178 | * f = fullness_threshold_frac |
179 | * | 179 | * |
180 | * Similarly, we assign zspage to: | 180 | * Similarly, we assign zspage to: |
181 | * ZS_ALMOST_FULL when n > N / f | 181 | * ZS_ALMOST_FULL when n > N / f |
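With the corrected comment, the threshold reads naturally once numbers are plugged in. Assuming the in-tree fullness_threshold_frac of 4 (the constant is not shown in this hunk) and ignoring the completely full and empty edge cases, a zspage holding N objects is ZS_ALMOST_EMPTY while n <= N / 4 and ZS_ALMOST_FULL once n > N / 4:

#include <stdio.h>

int main(void)
{
        int N = 8, frac = 4, n;         /* frac = fullness_threshold_frac */

        for (n = 0; n <= N; n++)
                printf("n=%d -> %s\n", n,
                       n <= N / frac ? "ZS_ALMOST_EMPTY" : "ZS_ALMOST_FULL");
        return 0;
}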
@@ -199,9 +199,6 @@ struct size_class { | |||
199 | 199 | ||
200 | spinlock_t lock; | 200 | spinlock_t lock; |
201 | 201 | ||
202 | /* stats */ | ||
203 | u64 pages_allocated; | ||
204 | |||
205 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; | 202 | struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; |
206 | }; | 203 | }; |
207 | 204 | ||
@@ -220,6 +217,7 @@ struct zs_pool { | |||
220 | struct size_class size_class[ZS_SIZE_CLASSES]; | 217 | struct size_class size_class[ZS_SIZE_CLASSES]; |
221 | 218 | ||
222 | gfp_t flags; /* allocation flags used when growing pool */ | 219 | gfp_t flags; /* allocation flags used when growing pool */ |
220 | atomic_long_t pages_allocated; | ||
223 | }; | 221 | }; |
224 | 222 | ||
225 | /* | 223 | /* |
@@ -299,7 +297,7 @@ static void zs_zpool_unmap(void *pool, unsigned long handle) | |||
299 | 297 | ||
300 | static u64 zs_zpool_total_size(void *pool) | 298 | static u64 zs_zpool_total_size(void *pool) |
301 | { | 299 | { |
302 | return zs_get_total_size_bytes(pool); | 300 | return zs_get_total_pages(pool) << PAGE_SHIFT; |
303 | } | 301 | } |
304 | 302 | ||
305 | static struct zpool_driver zs_zpool_driver = { | 303 | static struct zpool_driver zs_zpool_driver = { |
@@ -630,7 +628,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
630 | while (page) { | 628 | while (page) { |
631 | struct page *next_page; | 629 | struct page *next_page; |
632 | struct link_free *link; | 630 | struct link_free *link; |
633 | unsigned int i, objs_on_page; | 631 | unsigned int i = 1; |
634 | 632 | ||
635 | /* | 633 | /* |
636 | * page->index stores offset of first object starting | 634 | * page->index stores offset of first object starting |
@@ -643,14 +641,10 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
643 | 641 | ||
644 | link = (struct link_free *)kmap_atomic(page) + | 642 | link = (struct link_free *)kmap_atomic(page) + |
645 | off / sizeof(*link); | 643 | off / sizeof(*link); |
646 | objs_on_page = (PAGE_SIZE - off) / class->size; | ||
647 | 644 | ||
648 | for (i = 1; i <= objs_on_page; i++) { | 645 | while ((off += class->size) < PAGE_SIZE) { |
649 | off += class->size; | 646 | link->next = obj_location_to_handle(page, i++); |
650 | if (off < PAGE_SIZE) { | 647 | link += class->size / sizeof(*link); |
651 | link->next = obj_location_to_handle(page, i); | ||
652 | link += class->size / sizeof(*link); | ||
653 | } | ||
654 | } | 648 | } |
655 | 649 | ||
656 | /* | 650 | /* |
@@ -662,7 +656,7 @@ static void init_zspage(struct page *first_page, struct size_class *class) | |||
662 | link->next = obj_location_to_handle(next_page, 0); | 656 | link->next = obj_location_to_handle(next_page, 0); |
663 | kunmap_atomic(link); | 657 | kunmap_atomic(link); |
664 | page = next_page; | 658 | page = next_page; |
665 | off = (off + class->size) % PAGE_SIZE; | 659 | off %= PAGE_SIZE; |
666 | } | 660 | } |
667 | } | 661 | } |
668 | 662 | ||
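The rewritten init_zspage() loop advances the offset by the class size and links objects for as long as the next one still starts inside the page, then carries the remainder over as the first-object offset of the following page. A standalone walk of one page with a made-up 720-byte class size:

#include <stdio.h>

#define DEMO_PAGE_SIZE 4096UL

/* Walk one page the way the simplified init_zspage() loop does: start
 * at the offset of the first object that begins in this page, advance
 * by the class size, and stop once the next object would start beyond
 * the page. */
int main(void)
{
        unsigned long class_size = 720, off = 0, objs_in_page = 1;

        while ((off += class_size) < DEMO_PAGE_SIZE)
                objs_in_page++;         /* link->next = (page, i++) */

        printf("objects starting in this page: %lu\n", objs_in_page);   /* 6 */
        printf("first object offset on next page: %lu\n",
               off % DEMO_PAGE_SIZE);                                   /* 224 */
        return 0;
}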
@@ -1028,8 +1022,9 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) | |||
1028 | return 0; | 1022 | return 0; |
1029 | 1023 | ||
1030 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); | 1024 | set_zspage_mapping(first_page, class->index, ZS_EMPTY); |
1025 | atomic_long_add(class->pages_per_zspage, | ||
1026 | &pool->pages_allocated); | ||
1031 | spin_lock(&class->lock); | 1027 | spin_lock(&class->lock); |
1032 | class->pages_allocated += class->pages_per_zspage; | ||
1033 | } | 1028 | } |
1034 | 1029 | ||
1035 | obj = (unsigned long)first_page->freelist; | 1030 | obj = (unsigned long)first_page->freelist; |
@@ -1082,14 +1077,13 @@ void zs_free(struct zs_pool *pool, unsigned long obj) | |||
1082 | 1077 | ||
1083 | first_page->inuse--; | 1078 | first_page->inuse--; |
1084 | fullness = fix_fullness_group(pool, first_page); | 1079 | fullness = fix_fullness_group(pool, first_page); |
1085 | |||
1086 | if (fullness == ZS_EMPTY) | ||
1087 | class->pages_allocated -= class->pages_per_zspage; | ||
1088 | |||
1089 | spin_unlock(&class->lock); | 1080 | spin_unlock(&class->lock); |
1090 | 1081 | ||
1091 | if (fullness == ZS_EMPTY) | 1082 | if (fullness == ZS_EMPTY) { |
1083 | atomic_long_sub(class->pages_per_zspage, | ||
1084 | &pool->pages_allocated); | ||
1092 | free_zspage(first_page); | 1085 | free_zspage(first_page); |
1086 | } | ||
1093 | } | 1087 | } |
1094 | EXPORT_SYMBOL_GPL(zs_free); | 1088 | EXPORT_SYMBOL_GPL(zs_free); |
1095 | 1089 | ||
@@ -1183,17 +1177,11 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) | |||
1183 | } | 1177 | } |
1184 | EXPORT_SYMBOL_GPL(zs_unmap_object); | 1178 | EXPORT_SYMBOL_GPL(zs_unmap_object); |
1185 | 1179 | ||
1186 | u64 zs_get_total_size_bytes(struct zs_pool *pool) | 1180 | unsigned long zs_get_total_pages(struct zs_pool *pool) |
1187 | { | 1181 | { |
1188 | int i; | 1182 | return atomic_long_read(&pool->pages_allocated); |
1189 | u64 npages = 0; | ||
1190 | |||
1191 | for (i = 0; i < ZS_SIZE_CLASSES; i++) | ||
1192 | npages += pool->size_class[i].pages_allocated; | ||
1193 | |||
1194 | return npages << PAGE_SHIFT; | ||
1195 | } | 1183 | } |
1196 | EXPORT_SYMBOL_GPL(zs_get_total_size_bytes); | 1184 | EXPORT_SYMBOL_GPL(zs_get_total_pages); |
1197 | 1185 | ||
1198 | module_init(zs_init); | 1186 | module_init(zs_init); |
1199 | module_exit(zs_exit); | 1187 | module_exit(zs_exit); |
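The zsmalloc accounting now lives in a single pool-wide atomic instead of per-class u64 fields guarded by class->lock, zs_get_total_pages() just reads it, and zs_zpool_total_size() converts pages to bytes with a shift. A userspace sketch of that scheme using C11 atomics; every name here is illustrative, and 4 KiB pages are assumed:

#include <stdatomic.h>
#include <stdio.h>

struct demo_pool {
        atomic_long pages_allocated;    /* replaces per-class u64 + lock */
};

static void pool_grow(struct demo_pool *pool, long pages_per_zspage)
{
        atomic_fetch_add(&pool->pages_allocated, pages_per_zspage);
}

static void pool_shrink(struct demo_pool *pool, long pages_per_zspage)
{
        atomic_fetch_sub(&pool->pages_allocated, pages_per_zspage);
}

static long pool_total_pages(struct demo_pool *pool)
{
        return atomic_load(&pool->pages_allocated);
}

int main(void)
{
        struct demo_pool pool = { 0 };
        long page_shift = 12;           /* assumed 4 KiB pages */

        pool_grow(&pool, 4);
        pool_grow(&pool, 4);
        pool_shrink(&pool, 4);
        printf("pages: %ld, bytes: %ld\n", pool_total_pages(&pool),
               pool_total_pages(&pool) << page_shift);
        return 0;
}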
diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 3f94e1afd6cf..4c4b1f631ecf 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile | |||
@@ -3,6 +3,7 @@ | |||
3 | CC = $(CROSS_COMPILE)gcc | 3 | CC = $(CROSS_COMPILE)gcc |
4 | CFLAGS = -Wall | 4 | CFLAGS = -Wall |
5 | BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest | 5 | BINARIES = hugepage-mmap hugepage-shm map_hugetlb thuge-gen hugetlbfstest |
6 | BINARIES += transhuge-stress | ||
6 | 7 | ||
7 | all: $(BINARIES) | 8 | all: $(BINARIES) |
8 | %: %.c | 9 | %: %.c |
diff --git a/tools/testing/selftests/vm/transhuge-stress.c b/tools/testing/selftests/vm/transhuge-stress.c new file mode 100644 index 000000000000..fd7f1b4a96f9 --- /dev/null +++ b/tools/testing/selftests/vm/transhuge-stress.c | |||
@@ -0,0 +1,144 @@ | |||
1 | /* | ||
2 | * Stress test for transparent huge pages, memory compaction and migration. | ||
3 | * | ||
4 | * Authors: Konstantin Khlebnikov <koct9i@gmail.com> | ||
5 | * | ||
6 | * This is free and unencumbered software released into the public domain. | ||
7 | */ | ||
8 | |||
9 | #include <stdlib.h> | ||
10 | #include <stdio.h> | ||
11 | #include <stdint.h> | ||
12 | #include <err.h> | ||
13 | #include <time.h> | ||
14 | #include <unistd.h> | ||
15 | #include <fcntl.h> | ||
16 | #include <string.h> | ||
17 | #include <sys/mman.h> | ||
18 | |||
19 | #define PAGE_SHIFT 12 | ||
20 | #define HPAGE_SHIFT 21 | ||
21 | |||
22 | #define PAGE_SIZE (1 << PAGE_SHIFT) | ||
23 | #define HPAGE_SIZE (1 << HPAGE_SHIFT) | ||
24 | |||
25 | #define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) | ||
26 | #define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) | ||
27 | |||
28 | int pagemap_fd; | ||
29 | |||
30 | int64_t allocate_transhuge(void *ptr) | ||
31 | { | ||
32 | uint64_t ent[2]; | ||
33 | |||
34 | /* drop pmd */ | ||
35 | if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, | ||
36 | MAP_FIXED | MAP_ANONYMOUS | | ||
37 | MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) | ||
38 | errx(2, "mmap transhuge"); | ||
39 | |||
40 | if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) | ||
41 | err(2, "MADV_HUGEPAGE"); | ||
42 | |||
43 | /* allocate transparent huge page */ | ||
44 | *(volatile void **)ptr = ptr; | ||
45 | |||
46 | if (pread(pagemap_fd, ent, sizeof(ent), | ||
47 | (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) | ||
48 | err(2, "read pagemap"); | ||
49 | |||
50 | if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && | ||
51 | PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && | ||
52 | !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) | ||
53 | return PAGEMAP_PFN(ent[0]); | ||
54 | |||
55 | return -1; | ||
56 | } | ||
57 | |||
58 | int main(int argc, char **argv) | ||
59 | { | ||
60 | size_t ram, len; | ||
61 | void *ptr, *p; | ||
62 | struct timespec a, b; | ||
63 | double s; | ||
64 | uint8_t *map; | ||
65 | size_t map_len; | ||
66 | |||
67 | ram = sysconf(_SC_PHYS_PAGES); | ||
68 | if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) | ||
69 | ram = SIZE_MAX / 4; | ||
70 | else | ||
71 | ram *= sysconf(_SC_PAGESIZE); | ||
72 | |||
73 | if (argc == 1) | ||
74 | len = ram; | ||
75 | else if (!strcmp(argv[1], "-h")) | ||
76 | errx(1, "usage: %s [size in MiB]", argv[0]); | ||
77 | else | ||
78 | len = atoll(argv[1]) << 20; | ||
79 | |||
80 | warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" | ||
81 | " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, | ||
82 | len >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); | ||
83 | |||
84 | pagemap_fd = open("/proc/self/pagemap", O_RDONLY); | ||
85 | if (pagemap_fd < 0) | ||
86 | err(2, "open pagemap"); | ||
87 | |||
88 | len -= len % HPAGE_SIZE; | ||
89 | ptr = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, | ||
90 | MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); | ||
91 | if (ptr == MAP_FAILED) | ||
92 | err(2, "initial mmap"); | ||
93 | ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; | ||
94 | |||
95 | if (madvise(ptr, len, MADV_HUGEPAGE)) | ||
96 | err(2, "MADV_HUGEPAGE"); | ||
97 | |||
98 | map_len = ram >> (HPAGE_SHIFT - 1); | ||
99 | map = malloc(map_len); | ||
100 | if (!map) | ||
101 | errx(2, "map malloc"); | ||
102 | |||
103 | while (1) { | ||
104 | int nr_succeed = 0, nr_failed = 0, nr_pages = 0; | ||
105 | |||
106 | memset(map, 0, map_len); | ||
107 | |||
108 | clock_gettime(CLOCK_MONOTONIC, &a); | ||
109 | for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { | ||
110 | int64_t pfn; | ||
111 | |||
112 | pfn = allocate_transhuge(p); | ||
113 | |||
114 | if (pfn < 0) { | ||
115 | nr_failed++; | ||
116 | } else { | ||
117 | size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); | ||
118 | |||
119 | nr_succeed++; | ||
120 | if (idx >= map_len) { | ||
121 | map = realloc(map, idx + 1); | ||
122 | if (!map) | ||
123 | errx(2, "map realloc"); | ||
124 | memset(map + map_len, 0, idx + 1 - map_len); | ||
125 | map_len = idx + 1; | ||
126 | } | ||
127 | if (!map[idx]) | ||
128 | nr_pages++; | ||
129 | map[idx] = 1; | ||
130 | } | ||
131 | |||
132 | /* split transhuge page, keep last page */ | ||
133 | if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) | ||
134 | err(2, "MADV_DONTNEED"); | ||
135 | } | ||
136 | clock_gettime(CLOCK_MONOTONIC, &b); | ||
137 | s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; | ||
138 | |||
139 | warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" | ||
140 | "%4d succeed, %4d failed, %4d different pages", | ||
141 | s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), | ||
142 | nr_succeed, nr_failed, nr_pages); | ||
143 | } | ||
144 | } | ||
diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index c4d6d2e20e0d..264fbc297e0b 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c | |||
@@ -132,6 +132,7 @@ static const char * const page_flag_names[] = { | |||
132 | [KPF_NOPAGE] = "n:nopage", | 132 | [KPF_NOPAGE] = "n:nopage", |
133 | [KPF_KSM] = "x:ksm", | 133 | [KPF_KSM] = "x:ksm", |
134 | [KPF_THP] = "t:thp", | 134 | [KPF_THP] = "t:thp", |
135 | [KPF_BALLOON] = "o:balloon", | ||
135 | 136 | ||
136 | [KPF_RESERVED] = "r:reserved", | 137 | [KPF_RESERVED] = "r:reserved", |
137 | [KPF_MLOCKED] = "m:mlocked", | 138 | [KPF_MLOCKED] = "m:mlocked", |