author		Linus Torvalds <torvalds@linux-foundation.org>	2016-01-17 15:58:52 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-01-17 15:58:52 -0500
commit		0cbeafb245ca568bc0765645aa64f0451b716657 (patch)
tree		663c09ff5a62a1b2b66a17c4dfe0413603530a36
parent		58cf279acac3080ce03eeea5ca268210b3165fe1 (diff)
parent		06b031de22d28ae76b2e5bfaf22c56a265a1e106 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge second patch-bomb from Andrew Morton:
- more MM stuff:
- Kirill's page-flags rework
- Kirill's now-allegedly-fixed THP rework
- MADV_FREE implementation
- DAX feature work (msync/fsync). This isn't quite complete but DAX
is new and it's good enough and the guys have a handle on what
needs to be done - I expect this to be wrapped in the next week or
two.
- some vsprintf maintenance work
- various other misc bits
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (145 commits)
printk: change recursion_bug type to bool
lib/vsprintf: factor out %pN[F] handler as netdev_bits()
lib/vsprintf: refactor duplicate code to special_hex_number()
printk-formats.txt: remove unimplemented %pT
printk: help pr_debug and pr_devel to optimize out arguments
lib/test_printf.c: test dentry printing
lib/test_printf.c: add test for large bitmaps
lib/test_printf.c: account for kvasprintf tests
lib/test_printf.c: add a few number() tests
lib/test_printf.c: test precision quirks
lib/test_printf.c: check for out-of-bound writes
lib/test_printf.c: don't BUG
lib/kasprintf.c: add sanity check to kvasprintf
lib/vsprintf.c: warn about too large precisions and field widths
lib/vsprintf.c: help gcc make number() smaller
lib/vsprintf.c: expand field_width to 24 bits
lib/vsprintf.c: eliminate potential race in string()
lib/vsprintf.c: move string() below widen_string()
lib/vsprintf.c: pull out padding code from dentry_name()
printk: do cond_resched() between lines while outputting to consoles
...
189 files changed, 4357 insertions, 2886 deletions
diff --git a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt b/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
deleted file mode 100644
index 26f74b457e0b..000000000000
--- a/Documentation/features/vm/pmdp_splitting_flush/arch-support.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# Feature name:          pmdp_splitting_flush
-#         Kconfig:       __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-#         description:   arch supports the pmdp_splitting_flush() VM API
-#
-    -----------------------
-    |         arch |status|
-    -----------------------
-    |       alpha: | TODO |
-    |         arc: | TODO |
-    |         arm: |  ok  |
-    |       arm64: |  ok  |
-    |       avr32: | TODO |
-    |    blackfin: | TODO |
-    |         c6x: | TODO |
-    |        cris: | TODO |
-    |         frv: | TODO |
-    |       h8300: | TODO |
-    |     hexagon: | TODO |
-    |        ia64: | TODO |
-    |        m32r: | TODO |
-    |        m68k: | TODO |
-    |       metag: | TODO |
-    |  microblaze: | TODO |
-    |        mips: |  ok  |
-    |     mn10300: | TODO |
-    |       nios2: | TODO |
-    |    openrisc: | TODO |
-    |      parisc: | TODO |
-    |     powerpc: |  ok  |
-    |        s390: |  ok  |
-    |       score: | TODO |
-    |          sh: | TODO |
-    |       sparc: | TODO |
-    |        tile: | TODO |
-    |          um: | TODO |
-    |   unicore32: | TODO |
-    |         x86: |  ok  |
-    |      xtensa: | TODO |
-    -----------------------
diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt
index 6389551bbad6..5d1128bf0282 100644
--- a/Documentation/printk-formats.txt
+++ b/Documentation/printk-formats.txt
@@ -306,15 +306,6 @@ Network device features:
 
 	Passed by reference.
 
-Command from struct task_struct
-
-	%pT	ls
-
-	For printing executable name excluding path from struct
-	task_struct.
-
-	Passed by reference.
-
 If you add other %p extensions, please extend lib/test_printf.c with
 one or more test cases, if at all feasible.
 
diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index 8a282687ee06..21cf34f3ddb2 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -35,10 +35,10 @@ miss is going to run faster.
 
 == Design ==
 
-- "graceful fallback": mm components which don't have transparent
-  hugepage knowledge fall back to breaking a transparent hugepage and
-  working on the regular pages and their respective regular pmd/pte
-  mappings
+- "graceful fallback": mm components which don't have transparent hugepage
+  knowledge fall back to breaking a huge pmd mapping into a table of ptes
+  and, if necessary, splitting a transparent hugepage. Therefore these
+  components can continue working on the regular pages or regular pte mappings.
 
 - if a hugepage allocation fails because of memory fragmentation,
   regular pages should be gracefully allocated instead and mixed in
@@ -221,9 +221,18 @@ thp_collapse_alloc_failed is incremented if khugepaged found a range
 of pages that should be collapsed into one huge page but failed
 the allocation.
 
-thp_split is incremented every time a huge page is split into base
+thp_split_page is incremented every time a huge page is split into base
 pages. This can happen for a variety of reasons but a common
 reason is that a huge page is old and is being reclaimed.
+This action implies splitting all PMDs the page is mapped with.
+
+thp_split_page_failed is incremented if the kernel fails to split a huge
+page. This can happen if the page was pinned by somebody.
+
+thp_split_pmd is incremented every time a PMD is split into a table of PTEs.
+This can happen, for instance, when an application calls mprotect() or
+munmap() on part of a huge page. It doesn't split the huge page, only the
+page table entry.
 
 thp_zero_page_alloc is incremented every time a huge zero page is
 successfully allocated. It includes allocations which where
@@ -274,10 +283,8 @@ is complete, so they won't ever notice the fact the page is huge. But
 if any driver is going to mangle over the page structure of the tail
 page (like for checking page->mapping or other bits that are relevant
 for the head page and not the tail page), it should be updated to jump
-to check head page instead (while serializing properly against
-split_huge_page() to avoid the head and tail pages to disappear from
-under it, see the futex code to see an example of that, hugetlbfs also
-needed special handling in futex code for similar reasons).
+to check head page instead. Taking a reference on any head/tail page would
+prevent the page from being split by anyone.
 
 NOTE: these aren't new constraints to the GUP API, and they match the
 same constrains that applies to hugetlbfs too, so any driver capable
@@ -312,9 +319,9 @@ unaffected. libhugetlbfs will also work fine as usual.
 == Graceful fallback ==
 
 Code walking pagetables but unware about huge pmds can simply call
-split_huge_page_pmd(vma, addr, pmd) where the pmd is the one returned by
+split_huge_pmd(vma, pmd, addr) where the pmd is the one returned by
 pmd_offset. It's trivial to make the code transparent hugepage aware
-by just grepping for "pmd_offset" and adding split_huge_page_pmd where
+by just grepping for "pmd_offset" and adding split_huge_pmd where
 missing after pmd_offset returns the pmd. Thanks to the graceful
 fallback design, with a one liner change, you can avoid to write
 hundred if not thousand of lines of complex code to make your code
@@ -323,7 +330,8 @@ hugepage aware.
 If you're not walking pagetables but you run into a physical hugepage
 but you can't handle it natively in your code, you can split it by
 calling split_huge_page(page). This is what the Linux VM does before
-it tries to swapout the hugepage for example.
+it tries to swapout the hugepage for example. split_huge_page() can fail
+if the page is pinned and you must handle this correctly.
 
 Example to make mremap.c transparent hugepage aware with a one liner
 change:
@@ -335,14 +343,14 @@ diff --git a/mm/mremap.c b/mm/mremap.c
 			return NULL;
 
 		pmd = pmd_offset(pud, addr);
-+	split_huge_page_pmd(vma, addr, pmd);
++	split_huge_pmd(vma, pmd, addr);
 		if (pmd_none_or_clear_bad(pmd))
 			return NULL;
 
 == Locking in hugepage aware code ==
 
 We want as much code as possible hugepage aware, as calling
-split_huge_page() or split_huge_page_pmd() has a cost.
+split_huge_page() or split_huge_pmd() has a cost.
 
 To make pagetable walks huge pmd aware, all you need to do is to call
 pmd_trans_huge() on the pmd returned by pmd_offset. You must hold the
@@ -351,47 +359,80 @@ created from under you by khugepaged (khugepaged collapse_huge_page
 takes the mmap_sem in write mode in addition to the anon_vma lock). If
 pmd_trans_huge returns false, you just fallback in the old code
 paths. If instead pmd_trans_huge returns true, you have to take the
-mm->page_table_lock and re-run pmd_trans_huge. Taking the
-page_table_lock will prevent the huge pmd to be converted into a
-regular pmd from under you (split_huge_page can run in parallel to the
+page table lock (pmd_lock()) and re-run pmd_trans_huge. Taking the
+page table lock will prevent the huge pmd to be converted into a
+regular pmd from under you (split_huge_pmd can run in parallel to the
 pagetable walk). If the second pmd_trans_huge returns false, you
-should just drop the page_table_lock and fallback to the old code as
-before. Otherwise you should run pmd_trans_splitting on the pmd. In
-case pmd_trans_splitting returns true, it means split_huge_page is
-already in the middle of splitting the page. So if pmd_trans_splitting
-returns true it's enough to drop the page_table_lock and call
-wait_split_huge_page and then fallback the old code paths. You are
-guaranteed by the time wait_split_huge_page returns, the pmd isn't
-huge anymore. If pmd_trans_splitting returns false, you can proceed to
-process the huge pmd and the hugepage natively. Once finished you can
-drop the page_table_lock.
+should just drop the page table lock and fallback to the old code as
+before. Otherwise you can proceed to process the huge pmd and the
+hugepage natively. Once finished you can drop the page table lock.
+
+== Refcounts and transparent huge pages ==
+
+Refcounting on THP is mostly consistent with refcounting on other compound
+pages:
+
+  - get_page()/put_page() and GUP operate on the head page's ->_count.
 
-== compound_lock, get_user_pages and put_page ==
+  - ->_count in tail pages is always zero: get_page_unless_zero() never
+    succeeds on tail pages.
+
+  - map/unmap of a page with a PTE entry increments/decrements ->_mapcount
+    on the relevant sub-page of the compound page.
+
+  - map/unmap of the whole compound page is accounted in compound_mapcount
+    (stored in the first tail page).
+
+PageDoubleMap() indicates that ->_mapcount in all subpages is offset up by one.
+This additional reference is required to get race-free detection of unmap of
+subpages when we have them mapped with both PMDs and PTEs.
+
+This optimization is required to lower the overhead of per-subpage mapcount
+tracking. The alternative is to alter ->_mapcount in all subpages on each
+map/unmap of the whole compound page.
+
+We set PG_double_map when a PMD of the page got split for the first time,
+but still have the PMD mapping. The additional references go away with the
+last compound_mapcount.
 
 split_huge_page internally has to distribute the refcounts in the head
-page to the tail pages before clearing all PG_head/tail bits from the
-page structures. It can do that easily for refcounts taken by huge pmd
-mappings. But the GUI API as created by hugetlbfs (that returns head
-and tail pages if running get_user_pages on an address backed by any
-hugepage), requires the refcount to be accounted on the tail pages and
-not only in the head pages, if we want to be able to run
-split_huge_page while there are gup pins established on any tail
-page. Failure to be able to run split_huge_page if there's any gup pin
-on any tail page, would mean having to split all hugepages upfront in
-get_user_pages which is unacceptable as too many gup users are
-performance critical and they must work natively on hugepages like
-they work natively on hugetlbfs already (hugetlbfs is simpler because
-hugetlbfs pages cannot be split so there wouldn't be requirement of
-accounting the pins on the tail pages for hugetlbfs). If we wouldn't
-account the gup refcounts on the tail pages during gup, we won't know
-anymore which tail page is pinned by gup and which is not while we run
-split_huge_page. But we still have to add the gup pin to the head page
-too, to know when we can free the compound page in case it's never
-split during its lifetime. That requires changing not just
-get_page, but put_page as well so that when put_page runs on a tail
-page (and only on a tail page) it will find its respective head page,
-and then it will decrease the head page refcount in addition to the
-tail page refcount. To obtain a head page reliably and to decrease its
-refcount without race conditions, put_page has to serialize against
-__split_huge_page_refcount using a special per-page lock called
-compound_lock.
+page to the tail pages before clearing all PG_head/tail bits from the page
+structures. It can be done easily for refcounts taken by page table
+entries. But we don't have enough information on how to distribute any
+additional pins (i.e. from get_user_pages). split_huge_page() fails any
+request to split a pinned huge page: it expects the page count to be equal
+to the sum of the mapcount of all sub-pages plus one (the split_huge_page
+caller must have a reference on the head page).
+
+split_huge_page uses migration entries to stabilize page->_count and
+page->_mapcount.
+
+We are safe against physical memory scanners too: the only legitimate way a
+scanner can get a reference to a page is get_page_unless_zero().
+
+All tail pages have zero ->_count until atomic_add(). This prevents a
+scanner from getting a reference to a tail page up to that point. After the
+atomic_add() we don't care about the ->_count value: we already know how
+many references we should uncharge from the head page.
+
+For the head page get_page_unless_zero() will succeed and we don't mind.
+It's clear where the reference should go after the split: it will stay on
+the head page.
+
+Note that split_huge_pmd() doesn't have any limitation on refcounting:
+the pmd can be split at any point and it never fails.
+
+== Partial unmap and deferred_split_huge_page() ==
+
+Unmapping part of a THP (with munmap() or another way) is not going to free
+the memory immediately. Instead, we detect that a subpage of a THP is not in
+use in page_remove_rmap() and queue the THP for splitting if memory pressure
+comes. Splitting will free up the unused subpages.
+
+Splitting the page right away is not an option due to the locking context in
+the place where we can detect a partial unmap. It also might be
+counterproductive, since in many cases the partial unmap happens during
+exit(2) when a THP crosses a VMA boundary.
+
+The function deferred_split_huge_page() is used to queue a page for
+splitting. The splitting itself will happen when we get memory pressure
+via the shrinker interface.
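
To tie the new locking rules together: a pagetable walker that wants to handle
huge pmds natively only needs the unlocked pmd_trans_huge() check, the page
table lock, and a re-check. The following is a minimal sketch written for this
note; it is not part of the patch set, it assumes the caller already holds
mmap_sem, and it uses only the helpers named in the documentation above:

    /* Sketch only: huge-pmd aware walk per the updated transhuge.txt rules. */
    static int walk_one_address(struct vm_area_struct *vma, unsigned long addr)
    {
            pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
            pud_t *pud;
            pmd_t *pmd;
            spinlock_t *ptl;

            if (pgd_none_or_clear_bad(pgd))
                    return 0;
            pud = pud_offset(pgd, addr);
            if (pud_none_or_clear_bad(pud))
                    return 0;
            pmd = pmd_offset(pud, addr);

            if (pmd_trans_huge(*pmd)) {
                    ptl = pmd_lock(vma->vm_mm, pmd);  /* stabilises the huge pmd */
                    if (pmd_trans_huge(*pmd)) {
                            /* ... process the huge pmd / hugepage natively ... */
                            spin_unlock(ptl);
                            return 1;
                    }
                    /* raced with split_huge_pmd(): fall back to the pte path */
                    spin_unlock(ptl);
            }
            /* ... regular pte processing ... */
            return 0;
    }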
diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h
index f2f949671798..ab336c06153e 100644
--- a/arch/alpha/include/uapi/asm/mman.h
+++ b/arch/alpha/include/uapi/asm/mman.h
@@ -47,8 +47,10 @@
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_SPACEAVAIL	5		/* ensure resources are available */
 #define MADV_DONTNEED	6		/* don't need these pages */
+#define MADV_FREE	7		/* free pages only if memory pressure */
 
 /* common/generic parameters */
+#define MADV_FREE	8		/* free pages only if memory pressure */
 #define MADV_REMOVE	9		/* remove these pages & resources */
 #define MADV_DONTFORK	10		/* don't inherit across fork */
 #define MADV_DOFORK	11		/* do inherit across fork */
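
The mman.h hunks in this series (alpha above, mips and parisc further down)
expose the new MADV_FREE hint to userspace. As a rough illustration of how it
is consumed -- a hypothetical example, not taken from the patch set -- an
allocator can hand back a scratch buffer like this:

    #include <stdio.h>
    #include <sys/mman.h>

    #ifndef MADV_FREE
    #define MADV_FREE 8  /* generic value added by this series; arch headers may differ */
    #endif

    int main(void)
    {
            size_t len = 64 * 4096;
            char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (buf == MAP_FAILED)
                    return 1;
            buf[0] = 1;  /* fault the first page in */

            /*
             * Hint that the contents are no longer needed. The pages are only
             * reclaimed under memory pressure, and a later write cancels the
             * lazy free, unlike MADV_DONTNEED which drops the data immediately.
             */
            if (madvise(buf, len, MADV_FREE) != 0)
                    perror("madvise(MADV_FREE)");

            munmap(buf, len);
            return 0;
    }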
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 6312f607932f..76dde9db7934 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -73,9 +73,6 @@ config STACKTRACE_SUPPORT
 	def_bool y
 	select STACKTRACE
 
-config HAVE_LATENCYTOP_SUPPORT
-	def_bool y
-
 config HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	def_bool y
 	depends on ARC_MMU_V4
diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c
index ff7ff6cbb811..b65f797e9ad6 100644
--- a/arch/arc/mm/cache.c
+++ b/arch/arc/mm/cache.c
@@ -617,7 +617,7 @@ void flush_dcache_page(struct page *page)
 	 */
 	if (!mapping_mapped(mapping)) {
 		clear_bit(PG_dc_clean, &page->flags);
-	} else if (page_mapped(page)) {
+	} else if (page_mapcount(page)) {
 
 		/* kernel reading from page with U-mapping */
 		phys_addr_t paddr = (unsigned long)page_address(page);
@@ -857,7 +857,7 @@ void copy_user_highpage(struct page *to, struct page *from,
 	 * For !VIPT cache, all of this gets compiled out as
 	 * addr_not_cache_congruent() is 0
 	 */
-	if (page_mapped(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
+	if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) {
 		__flush_dcache_page((unsigned long)kfrom, u_vaddr);
 		clean_src_k_mappings = 1;
 	}
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 4e489cc5c45e..6a889afa6a2c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -168,11 +168,6 @@ config STACKTRACE_SUPPORT
 	bool
 	default y
 
-config HAVE_LATENCYTOP_SUPPORT
-	bool
-	depends on !SMP
-	default y
-
 config LOCKDEP_SUPPORT
 	bool
 	default y
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 9203c21b4673..a520b7987a29 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -182,7 +182,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
 }
 
-static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
+					       kvm_pfn_t pfn,
 					       unsigned long size,
 					       bool ipa_uncached)
 {
@@ -246,7 +247,7 @@ static inline void __kvm_flush_dcache_pte(pte_t pte)
 static inline void __kvm_flush_dcache_pmd(pmd_t pmd)
 {
 	unsigned long size = PMD_SIZE;
-	pfn_t pfn = pmd_pfn(pmd);
+	kvm_pfn_t pfn = pmd_pfn(pmd);
 
 	while (size) {
 		void *va = kmap_atomic_pfn(pfn);
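
The pfn_t -> kvm_pfn_t conversions here and in the KVM files below exist
because the DAX work in this same patch-bomb introduces a generic pfn_t in the
core mm, which clashes with KVM's private typedef. For reference, the renamed
KVM type is still just an integer frame number; its definition in
include/linux/kvm_types.h is roughly the following (paraphrased from the
series, not quoted from the hunks above):

    /* include/linux/kvm_types.h (sketch) */
    typedef u64 kvm_pfn_t;	/* host page frame number as used by KVM */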
diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h
index a745a2a53853..dc46398bc3a5 100644
--- a/arch/arm/include/asm/pgtable-3level.h
+++ b/arch/arm/include/asm/pgtable-3level.h
@@ -88,7 +88,6 @@
 
 #define L_PMD_SECT_VALID	(_AT(pmdval_t, 1) << 0)
 #define L_PMD_SECT_DIRTY	(_AT(pmdval_t, 1) << 55)
-#define L_PMD_SECT_SPLITTING	(_AT(pmdval_t, 1) << 56)
 #define L_PMD_SECT_NONE		(_AT(pmdval_t, 1) << 57)
 #define L_PMD_SECT_RDONLY	(_AT(pteval_t, 1) << 58)
 
@@ -232,13 +231,6 @@ static inline pte_t pte_mkspecial(pte_t pte)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)	(pmd_val(pmd) && !pmd_table(pmd))
-#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-			  pmd_t *pmdp);
-#endif
 #endif
 
 #define PMD_BIT_FUNC(fn,op) \
@@ -246,9 +238,9 @@ static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
 
 PMD_BIT_FUNC(wrprotect,	|= L_PMD_SECT_RDONLY);
 PMD_BIT_FUNC(mkold,	&= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
 PMD_BIT_FUNC(mkwrite,   &= ~L_PMD_SECT_RDONLY);
 PMD_BIT_FUNC(mkdirty,   |= L_PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mkclean,   &= ~L_PMD_SECT_DIRTY);
 PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 
 #define pmd_mkhuge(pmd)		(__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
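
Given the PMD_BIT_FUNC() helper visible in the context above, the new mkclean
line expands to a small inline accessor; roughly (expansion shown here only
for illustration):

    /* PMD_BIT_FUNC(mkclean, &= ~L_PMD_SECT_DIRTY) expands to: */
    static inline pmd_t pmd_mkclean(pmd_t pmd)
    {
            pmd_val(pmd) &= ~L_PMD_SECT_DIRTY;
            return pmd;
    }

This gives the core code a way to clear the software dirty state of a huge
pmd, which the MADV_FREE work elsewhere in this series relies on.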
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 22f7fa0124ec..aba61fd3697a 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -992,9 +992,9 @@ out:
 	return ret;
 }
 
-static bool transparent_hugepage_adjust(pfn_t *pfnp, phys_addr_t *ipap)
+static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
 {
-	pfn_t pfn = *pfnp;
+	kvm_pfn_t pfn = *pfnp;
 	gfn_t gfn = *ipap >> PAGE_SHIFT;
 
 	if (PageTransCompound(pfn_to_page(pfn))) {
@@ -1201,7 +1201,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
 }
 
-static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
 				      unsigned long size, bool uncached)
 {
 	__coherent_cache_guest_page(vcpu, pfn, size, uncached);
@@ -1218,7 +1218,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	struct vm_area_struct *vma;
-	pfn_t pfn;
+	kvm_pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
 	bool fault_ipa_uncached;
 	bool logging_active = memslot_is_logging(memslot);
@@ -1346,7 +1346,7 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
 {
 	pmd_t *pmd;
 	pte_t *pte;
-	pfn_t pfn;
+	kvm_pfn_t pfn;
 	bool pfn_valid = false;
 
 	trace_kvm_access_fault(fault_ipa);
diff --git a/arch/arm/lib/uaccess_with_memcpy.c b/arch/arm/lib/uaccess_with_memcpy.c
index 588bbc288396..6bd1089b07e0 100644
--- a/arch/arm/lib/uaccess_with_memcpy.c
+++ b/arch/arm/lib/uaccess_with_memcpy.c
@@ -52,14 +52,13 @@ pin_page_for_write(const void __user *_addr, pte_t **ptep, spinlock_t **ptlp)
 	 *
 	 * Lock the page table for the destination and check
 	 * to see that it's still huge and whether or not we will
-	 * need to fault on write, or if we have a splitting THP.
+	 * need to fault on write.
 	 */
 	if (unlikely(pmd_thp_or_huge(*pmd))) {
 		ptl = &current->mm->page_table_lock;
 		spin_lock(ptl);
 		if (unlikely(!pmd_thp_or_huge(*pmd)
-			|| pmd_hugewillfault(*pmd)
-			|| pmd_trans_splitting(*pmd))) {
+			|| pmd_hugewillfault(*pmd))) {
 			spin_unlock(ptl);
 			return 0;
 		}
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index 1ec8e7590fc6..d0ba3551d49a 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -330,7 +330,7 @@ void flush_dcache_page(struct page *page)
 	mapping = page_mapping(page);
 
 	if (!cache_ops_need_broadcast() &&
-	    mapping && !page_mapped(page))
+	    mapping && !page_mapcount(page))
 		clear_bit(PG_dcache_clean, &page->flags);
 	else {
 		__flush_dcache_page(mapping, page);
@@ -415,18 +415,3 @@ void __flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned l
 	 */
 	__cpuc_flush_dcache_area(page_address(page), PAGE_SIZE);
 }
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-			  pmd_t *pmdp)
-{
-	pmd_t pmd = pmd_mksplitting(*pmdp);
-	VM_BUG_ON(address & ~PMD_MASK);
-	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
-	/* dummy IPI to serialise against fast_gup */
-	kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 0bf8b4320a91..736433912a1e 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -230,7 +230,8 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 	return (vcpu_sys_reg(vcpu, SCTLR_EL1) & 0b101) == 0b101;
 }
 
-static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu, pfn_t pfn,
+static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
+					       kvm_pfn_t pfn,
 					       unsigned long size,
 					       bool ipa_uncached)
 {
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 69d2e2f86bce..2d545d7aa80b 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -353,21 +353,14 @@ static inline pgprot_t mk_sect_prot(pgprot_t prot)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_trans_huge(pmd)	(pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd)	pte_special(pmd_pte(pmd))
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-struct vm_area_struct;
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-			  pmd_t *pmdp);
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 #define pmd_dirty(pmd)		pte_dirty(pmd_pte(pmd))
 #define pmd_young(pmd)		pte_young(pmd_pte(pmd))
 #define pmd_wrprotect(pmd)	pte_pmd(pte_wrprotect(pmd_pte(pmd)))
-#define pmd_mksplitting(pmd)	pte_pmd(pte_mkspecial(pmd_pte(pmd)))
 #define pmd_mkold(pmd)		pte_pmd(pte_mkold(pmd_pte(pmd)))
 #define pmd_mkwrite(pmd)	pte_pmd(pte_mkwrite(pmd_pte(pmd)))
+#define pmd_mkclean(pmd)	pte_pmd(pte_mkclean(pmd_pte(pmd)))
 #define pmd_mkdirty(pmd)	pte_pmd(pte_mkdirty(pmd_pte(pmd)))
 #define pmd_mkyoung(pmd)	pte_pmd(pte_mkyoung(pmd_pte(pmd)))
 #define pmd_mknotpresent(pmd)	(__pmd(pmd_val(pmd) & ~PMD_TYPE_MASK))
diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c
index 46649d6e6c5a..60585bde1264 100644
--- a/arch/arm64/mm/flush.c
+++ b/arch/arm64/mm/flush.c
@@ -102,19 +102,3 @@ EXPORT_SYMBOL(flush_dcache_page);
  * Additional functions defined in assembly.
  */
 EXPORT_SYMBOL(flush_icache_range);
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
-			  pmd_t *pmdp)
-{
-	pmd_t pmd = pmd_mksplitting(*pmdp);
-
-	VM_BUG_ON(address & ~PMD_MASK);
-	set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-
-	/* dummy IPI to serialise against fast_gup */
-	kick_all_cpus_sync();
-}
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/avr32/include/asm/page.h b/arch/avr32/include/asm/page.h
index f805d1cb11bc..c5d2a3e2c62f 100644
--- a/arch/avr32/include/asm/page.h
+++ b/arch/avr32/include/asm/page.h
@@ -83,11 +83,9 @@ static inline int get_order(unsigned long size)
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 
-#define PHYS_PFN_OFFSET		(CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
+#define ARCH_PFN_OFFSET		(CONFIG_PHYS_OFFSET >> PAGE_SHIFT)
 
-#define pfn_to_page(pfn)	(mem_map + ((pfn) - PHYS_PFN_OFFSET))
-#define page_to_pfn(page)	((unsigned long)((page) - mem_map) + PHYS_PFN_OFFSET)
-#define pfn_valid(pfn)		((pfn) >= PHYS_PFN_OFFSET && (pfn) < (PHYS_PFN_OFFSET + max_mapnr))
+#define pfn_valid(pfn)		((pfn) >= ARCH_PFN_OFFSET && (pfn) < (ARCH_PFN_OFFSET + max_mapnr))
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
 #define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
@@ -101,4 +99,6 @@ static inline int get_order(unsigned long size)
  */
 #define HIGHMEM_START		0x20000000UL
 
+#include <asm-generic/memory_model.h>
+
 #endif /* __ASM_AVR32_PAGE_H */
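
The avr32 change above drops the hand-rolled pfn_to_page()/page_to_pfn()
macros and pulls in asm-generic/memory_model.h instead, which in the FLATMEM
case derives them from ARCH_PFN_OFFSET. That is why PHYS_PFN_OFFSET is renamed
to ARCH_PFN_OFFSET here. For orientation, the generic header provides
definitions along these lines (abridged, paraphrased rather than quoted from
this patch):

    /* asm-generic/memory_model.h, CONFIG_FLATMEM case (sketch) */
    #define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
    #define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
				 ARCH_PFN_OFFSET)

    #define page_to_pfn __page_to_pfn
    #define pfn_to_page __pfn_to_page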
diff --git a/arch/frv/include/asm/page.h b/arch/frv/include/asm/page.h
index 8c97068ac8fc..688d8076a43a 100644
--- a/arch/frv/include/asm/page.h
+++ b/arch/frv/include/asm/page.h
@@ -34,7 +34,7 @@ typedef struct page *pgtable_t;
 #define pgprot_val(x)	((x).pgprot)
 
 #define __pte(x)	((pte_t) { (x) } )
-#define __pmd(x)	((pmd_t) { (x) } )
+#define __pmd(x)	((pmd_t) { { (x) } } )
 #define __pud(x)	((pud_t) { (x) } )
 #define __pgd(x)	((pgd_t) { (x) } )
 #define __pgprot(x)	((pgprot_t) { (x) } )
diff --git a/arch/ia64/include/asm/page.h b/arch/ia64/include/asm/page.h
index ec48bb9f95e1..e8c486ef0d76 100644
--- a/arch/ia64/include/asm/page.h
+++ b/arch/ia64/include/asm/page.h
@@ -105,6 +105,7 @@ extern struct page *vmem_map;
 #ifdef CONFIG_DISCONTIGMEM
 # define page_to_pfn(page)	((unsigned long) (page - vmem_map))
 # define pfn_to_page(pfn)	(vmem_map + (pfn))
+# define __pfn_to_phys(pfn)	PFN_PHYS(pfn)
 #else
 # include <asm-generic/memory_model.h>
 #endif
diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig
index 0b389a81c43a..a0fa88da3e31 100644
--- a/arch/metag/Kconfig
+++ b/arch/metag/Kconfig
@@ -36,9 +36,6 @@ config STACKTRACE_SUPPORT
 config LOCKDEP_SUPPORT
 	def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-	def_bool y
-
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
 
diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig
index 0bce820428fc..5ecd0287a874 100644
--- a/arch/microblaze/Kconfig
+++ b/arch/microblaze/Kconfig
@@ -67,9 +67,6 @@ config STACKTRACE_SUPPORT
 config LOCKDEP_SUPPORT
 	def_bool y
 
-config HAVE_LATENCYTOP_SUPPORT
-	def_bool y
-
 source "init/Kconfig"
 
 source "kernel/Kconfig.freezer"
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 6ded8d347af9..7c191443c7ea 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -101,9 +101,9 @@
 #define CAUSEF_DC			(_ULCAST_(1) << 27)
 
 extern atomic_t kvm_mips_instance;
-extern pfn_t(*kvm_mips_gfn_to_pfn) (struct kvm *kvm, gfn_t gfn);
-extern void (*kvm_mips_release_pfn_clean) (pfn_t pfn);
-extern bool(*kvm_mips_is_error_pfn) (pfn_t pfn);
+extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
+extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
+extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 
 struct kvm_vm_stat {
 	u32 remote_tlb_flush;
diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h
index ff7ad91c85db..97b313882678 100644
--- a/arch/mips/include/asm/pgtable-bits.h
+++ b/arch/mips/include/asm/pgtable-bits.h
@@ -131,14 +131,12 @@
 /* Huge TLB page */
 #define _PAGE_HUGE_SHIFT	(_PAGE_MODIFIED_SHIFT + 1)
 #define _PAGE_HUGE		(1 << _PAGE_HUGE_SHIFT)
-#define _PAGE_SPLITTING_SHIFT	(_PAGE_HUGE_SHIFT + 1)
-#define _PAGE_SPLITTING		(1 << _PAGE_SPLITTING_SHIFT)
 #endif	/* CONFIG_64BIT && CONFIG_MIPS_HUGE_TLB_SUPPORT */
 
 #if defined(CONFIG_CPU_MIPSR2) || defined(CONFIG_CPU_MIPSR6)
 /* XI - page cannot be executed */
-#ifdef _PAGE_SPLITTING_SHIFT
-#define _PAGE_NO_EXEC_SHIFT	(_PAGE_SPLITTING_SHIFT + 1)
+#ifdef _PAGE_HUGE_SHIFT
+#define _PAGE_NO_EXEC_SHIFT	(_PAGE_HUGE_SHIFT + 1)
 #else
 #define _PAGE_NO_EXEC_SHIFT	(_PAGE_MODIFIED_SHIFT + 1)
 #endif
@@ -153,8 +151,8 @@
 
 #if defined(_PAGE_NO_READ_SHIFT)
 #define _PAGE_GLOBAL_SHIFT	(_PAGE_NO_READ_SHIFT + 1)
-#elif defined(_PAGE_SPLITTING_SHIFT)
-#define _PAGE_GLOBAL_SHIFT	(_PAGE_SPLITTING_SHIFT + 1)
+#elif defined(_PAGE_HUGE_SHIFT)
+#define _PAGE_GLOBAL_SHIFT	(_PAGE_HUGE_SHIFT + 1)
 #else
 #define _PAGE_GLOBAL_SHIFT	(_PAGE_MODIFIED_SHIFT + 1)
 #endif
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 8957f15e21ec..6995b4a02e23 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -482,27 +482,9 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd)
 	return pmd;
 }
 
-static inline int pmd_trans_splitting(pmd_t pmd)
-{
-	return !!(pmd_val(pmd) & _PAGE_SPLITTING);
-}
-
-static inline pmd_t pmd_mksplitting(pmd_t pmd)
-{
-	pmd_val(pmd) |= _PAGE_SPLITTING;
-
-	return pmd;
-}
-
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 		       pmd_t *pmdp, pmd_t pmd);
 
-#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
-/* Extern to avoid header file madness */
-extern void pmdp_splitting_flush(struct vm_area_struct *vma,
-					unsigned long address,
-					pmd_t *pmdp);
-
 #define __HAVE_ARCH_PMD_WRITE
 static inline int pmd_write(pmd_t pmd)
 {
diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h
index 97c03f468924..b0ebe59f73fd 100644
--- a/arch/mips/include/uapi/asm/mman.h
+++ b/arch/mips/include/uapi/asm/mman.h
@@ -73,8 +73,10 @@
 #define MADV_SEQUENTIAL	2		/* expect sequential page references */
 #define MADV_WILLNEED	3		/* will need these pages */
 #define MADV_DONTNEED	4		/* don't need these pages */
+#define MADV_FREE	5		/* free pages only if memory pressure */
 
 /* common parameters: try to keep these consistent across architectures */
+#define MADV_FREE	8		/* free pages only if memory pressure */
 #define MADV_REMOVE	9		/* remove these pages & resources */
 #define MADV_DONTFORK	10		/* don't inherit across fork */
 #define MADV_DOFORK	11		/* do inherit across fork */
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 41b1b090f56f..1b675c7ce89f 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -1525,7 +1525,7 @@ int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
 	struct kvm *kvm = vcpu->kvm;
 	unsigned long pa;
 	gfn_t gfn;
-	pfn_t pfn;
+	kvm_pfn_t pfn;
 
 	gfn = va >> PAGE_SHIFT;
 
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index aed0ac2a4972..570479c03bdc 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -38,13 +38,13 @@ atomic_t kvm_mips_instance;
 EXPORT_SYMBOL(kvm_mips_instance);
 
 /* These function pointers are initialized once the KVM module is loaded */
-pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
+kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
 EXPORT_SYMBOL(kvm_mips_gfn_to_pfn);
 
-void (*kvm_mips_release_pfn_clean)(pfn_t pfn);
+void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
 EXPORT_SYMBOL(kvm_mips_release_pfn_clean);
 
-bool (*kvm_mips_is_error_pfn)(pfn_t pfn);
+bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 EXPORT_SYMBOL(kvm_mips_is_error_pfn);
 
 uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
@@ -144,7 +144,7 @@ EXPORT_SYMBOL(kvm_mips_dump_guest_tlbs);
 static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
 {
 	int srcu_idx, err = 0;
-	pfn_t pfn;
+	kvm_pfn_t pfn;
 
 	if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
 		return 0;
@@ -262,7 +262,7 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
 				    struct kvm_vcpu *vcpu)
 {
 	gfn_t gfn;
-	pfn_t pfn0, pfn1;
+	kvm_pfn_t pfn0, pfn1;
 	unsigned long vaddr = 0;
 	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
 	int even;
@@ -313,7 +313,7 @@ EXPORT_SYMBOL(kvm_mips_handle_kseg0_tlb_fault);
 int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
 	struct kvm_vcpu *vcpu)
 {
-	pfn_t pfn0, pfn1;
+	kvm_pfn_t pfn0, pfn1;
 	unsigned long flags, old_entryhi = 0, vaddr = 0;
 	unsigned long entrylo0 = 0, entrylo1 = 0;
 
@@ -360,7 +360,7 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
 {
 	unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
 	struct kvm *kvm = vcpu->kvm;
-	pfn_t pfn0, pfn1;
+	kvm_pfn_t pfn0, pfn1;
 
 	if ((tlb->tlb_hi & VPN2_MASK) == 0) {
 		pfn0 = 0;
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index 5d3a25e1cfae..caac3d747a90 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -587,7 +587,8 @@ static inline void local_r4k_flush_cache_page(void *args)
 		 * another ASID than the current one.
 		 */
 		map_coherent = (cpu_has_dc_aliases &&
-				page_mapped(page) && !Page_dcache_dirty(page));
+				page_mapcount(page) &&
+				!Page_dcache_dirty(page));
 		if (map_coherent)
 			vaddr = kmap_coherent(page, addr);
 		else
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index aab218c36e0d..3f159caf6dbc 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -106,7 +106,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr)
 		unsigned long addr = (unsigned long) page_address(page);
 
 		if (pages_do_alias(addr, vmaddr)) {
-			if (page_mapped(page) && !Page_dcache_dirty(page)) {
+			if (page_mapcount(page) && !Page_dcache_dirty(page)) {
 				void *kaddr;
 
 				kaddr = kmap_coherent(page, vmaddr);
diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c
index 349995d19c7f..1afd87c999b0 100644
--- a/arch/mips/mm/gup.c
+++ b/arch/mips/mm/gup.c
@@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
-			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
@@ -109,18 +107,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 
 		next = pmd_addr_end(addr, end);
-		/*
-		 * The pmd_trans_splitting() check below explains why
-		 * pmdp_splitting_flush has to flush the tlb, to stop
-		 * this gup-fast code from running while we set the
-		 * splitting bit in the pmd. Returning zero will take
-		 * the slow path that will call wait_split_huge_page()
-		 * if the pmd is still in splitting state. gup-fast
-		 * can't because it has irq disabled and
-		 * wait_split_huge_page() would never return as the
-		 * tlb flush IPI wouldn't run.
-		 */
-		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
+		if (pmd_none(pmd))
 			return 0;
 		if (unlikely(pmd_huge(pmd))) {
 			if (!gup_huge_pmd(pmd, addr, next, write, pages,nr))
@@ -153,8 +140,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end,
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
-		if (PageTail(page))
-			get_huge_page_tail(page);
 		(*nr)++;
 		page++;
 		refs++;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 8770e619185e..7e5fa0938c21 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -165,7 +165,7 @@ void copy_user_highpage(struct page *to, struct page *from,
 
 	vto = kmap_atomic(to);
 	if (cpu_has_dc_aliases &&
-	    page_mapped(from) && !Page_dcache_dirty(from)) {
+	    page_mapcount(from) && !Page_dcache_dirty(from)) {
 		vfrom = kmap_coherent(from, vaddr);
 		copy_page(vto, vfrom);
 		kunmap_coherent();
@@ -187,7 +187,7 @@ void copy_to_user_page(struct vm_area_struct *vma,
 	unsigned long len)
 {
 	if (cpu_has_dc_aliases &&
-	    page_mapped(page) && !Page_dcache_dirty(page)) {
+	    page_mapcount(page) && !Page_dcache_dirty(page)) {
 		void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
 		memcpy(vto, src, len);
 		kunmap_coherent();
@@ -205,7 +205,7 @@ void copy_from_user_page(struct vm_area_struct *vma,
 	unsigned long len)
 {
 	if (cpu_has_dc_aliases &&
-	    page_mapped(page) && !Page_dcache_dirty(page)) {
+	    page_mapcount(page) && !Page_dcache_dirty(page)) {
 		void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK);
 		memcpy(dst, vfrom, len);
 		kunmap_coherent();
diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c
index e8adc0069d66..ce4473e7c0d2 100644
--- a/arch/mips/mm/pgtable-64.c
+++ b/arch/mips/mm/pgtable-64.c
@@ -62,20 +62,6 @@ void pmd_init(unsigned long addr, unsigned long pagetable)
 }
 #endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
-void pmdp_splitting_flush(struct vm_area_struct *vma,
-			 unsigned long address,
-			 pmd_t *pmdp)
-{
-	if (!pmd_trans_splitting(*pmdp)) {
-		pmd_t pmd = pmd_mksplitting(*pmdp);
-		set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-	}
-}
-
-#endif
-
 pmd_t mk_pmd(struct page *page, pgprot_t prot)
 {
 	pmd_t pmd;
diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c
index 32e0be27673f..482192cc8f2b 100644
--- a/arch/mips/mm/tlbex.c
+++ b/arch/mips/mm/tlbex.c
@@ -240,7 +240,6 @@ static void output_pgtable_bits_defines(void)
 	pr_define("_PAGE_MODIFIED_SHIFT %d\n", _PAGE_MODIFIED_SHIFT);
 #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
 	pr_define("_PAGE_HUGE_SHIFT %d\n", _PAGE_HUGE_SHIFT);
-	pr_define("_PAGE_SPLITTING_SHIFT %d\n", _PAGE_SPLITTING_SHIFT);
 #endif
 #ifdef CONFIG_CPU_MIPSR2
 	if (cpu_has_rixi) {
diff --git a/arch/mn10300/include/asm/page.h b/arch/mn10300/include/asm/page.h index 8288e124165b..3810a6f740fd 100644 --- a/arch/mn10300/include/asm/page.h +++ b/arch/mn10300/include/asm/page.h | |||
@@ -107,6 +107,7 @@ static inline int get_order(unsigned long size) | |||
107 | #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) | 107 | #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) |
108 | #define pfn_to_page(pfn) (mem_map + ((pfn) - __pfn_disp)) | 108 | #define pfn_to_page(pfn) (mem_map + ((pfn) - __pfn_disp)) |
109 | #define page_to_pfn(page) ((unsigned long)((page) - mem_map) + __pfn_disp) | 109 | #define page_to_pfn(page) ((unsigned long)((page) - mem_map) + __pfn_disp) |
110 | #define __pfn_to_phys(pfn) PFN_PHYS(pfn) | ||
110 | 111 | ||
111 | #define pfn_valid(pfn) \ | 112 | #define pfn_valid(pfn) \ |
112 | ({ \ | 113 | ({ \ |
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 729f89163bc3..7c34cafdf301 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig | |||
@@ -79,9 +79,6 @@ config TIME_LOW_RES | |||
79 | depends on SMP | 79 | depends on SMP |
80 | default y | 80 | default y |
81 | 81 | ||
82 | config HAVE_LATENCYTOP_SUPPORT | ||
83 | def_bool y | ||
84 | |||
85 | # unless you want to implement ACPI on PA-RISC ... ;-) | 82 | # unless you want to implement ACPI on PA-RISC ... ;-) |
86 | config PM | 83 | config PM |
87 | bool | 84 | bool |
diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index dd4d1876a020..cf830d465f75 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h | |||
@@ -43,8 +43,10 @@ | |||
43 | #define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ | 43 | #define MADV_SPACEAVAIL 5 /* insure that resources are reserved */ |
44 | #define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ | 44 | #define MADV_VPS_PURGE 6 /* Purge pages from VM page cache */ |
45 | #define MADV_VPS_INHERIT 7 /* Inherit parents page size */ | 45 | #define MADV_VPS_INHERIT 7 /* Inherit parents page size */ |
46 | #define MADV_FREE 8 /* free pages only if memory pressure */ | ||
46 | 47 | ||
47 | /* common/generic parameters */ | 48 | /* common/generic parameters */ |
49 | #define MADV_FREE 8 /* free pages only if memory pressure */ | ||
48 | #define MADV_REMOVE 9 /* remove these pages & resources */ | 50 | #define MADV_REMOVE 9 /* remove these pages & resources */ |
49 | #define MADV_DONTFORK 10 /* don't inherit across fork */ | 51 | #define MADV_DONTFORK 10 /* don't inherit across fork */ |
50 | #define MADV_DOFORK 11 /* do inherit across fork */ | 52 | #define MADV_DOFORK 11 /* do inherit across fork */ |
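MADV_FREE, which this hunk moves into the common parameter block (value 8 on parisc, matching the generic definition), lets an allocator mark anonymous pages as disposable: the kernel may reclaim them lazily under memory pressure, and a later write to the range cancels the hint. A small, self-contained usage sketch for a kernel that supports MADV_FREE (it degrades gracefully where the hint is unknown):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MADV_FREE
    #define MADV_FREE 8	/* value used by the generic and parisc headers */
    #endif

    int main(void)
    {
    	size_t len = 1 << 20;
    	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
    			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    	if (buf == MAP_FAILED)
    		return 1;

    	memset(buf, 0xaa, len);		/* dirty the pages */

    	/* Contents are now disposable: under memory pressure the kernel
    	 * may drop these pages instead of swapping them out. */
    	if (madvise(buf, len, MADV_FREE) != 0)
    		fprintf(stderr, "MADV_FREE not supported: %s\n", strerror(errno));

    	/* Writing again cancels the hint for the touched page. */
    	buf[0] = 1;

    	munmap(buf, len);
    	return 0;
    }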
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 7d5a8350f913..94f6c5089e0c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig | |||
@@ -47,9 +47,6 @@ config STACKTRACE_SUPPORT | |||
47 | bool | 47 | bool |
48 | default y | 48 | default y |
49 | 49 | ||
50 | config HAVE_LATENCYTOP_SUPPORT | ||
51 | def_bool y | ||
52 | |||
53 | config TRACE_IRQFLAGS_SUPPORT | 50 | config TRACE_IRQFLAGS_SUPPORT |
54 | bool | 51 | bool |
55 | default y | 52 | default y |
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 9e55e3b1fef0..849bbec80f7b 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h | |||
@@ -256,13 +256,6 @@ static inline int pmd_trans_huge(pmd_t pmd) | |||
256 | (_PAGE_PTE | _PAGE_THP_HUGE)); | 256 | (_PAGE_PTE | _PAGE_THP_HUGE)); |
257 | } | 257 | } |
258 | 258 | ||
259 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
260 | { | ||
261 | if (pmd_trans_huge(pmd)) | ||
262 | return pmd_val(pmd) & _PAGE_SPLITTING; | ||
263 | return 0; | ||
264 | } | ||
265 | |||
266 | static inline int pmd_large(pmd_t pmd) | 259 | static inline int pmd_large(pmd_t pmd) |
267 | { | 260 | { |
268 | return !!(pmd_val(pmd) & _PAGE_PTE); | 261 | return !!(pmd_val(pmd) & _PAGE_PTE); |
@@ -273,11 +266,6 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) | |||
273 | return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); | 266 | return __pmd(pmd_val(pmd) & ~_PAGE_PRESENT); |
274 | } | 267 | } |
275 | 268 | ||
276 | static inline pmd_t pmd_mksplitting(pmd_t pmd) | ||
277 | { | ||
278 | return __pmd(pmd_val(pmd) | _PAGE_SPLITTING); | ||
279 | } | ||
280 | |||
281 | #define __HAVE_ARCH_PMD_SAME | 269 | #define __HAVE_ARCH_PMD_SAME |
282 | static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) | 270 | static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) |
283 | { | 271 | { |
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 2ff8b3df553d..06f17e778c27 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h | |||
@@ -41,11 +41,6 @@ | |||
41 | #endif | 41 | #endif |
42 | 42 | ||
43 | /* | 43 | /* |
44 | * THP pages can't be special. So use the _PAGE_SPECIAL | ||
45 | */ | ||
46 | #define _PAGE_SPLITTING _PAGE_SPECIAL | ||
47 | |||
48 | /* | ||
49 | * We need to differentiate between explicit huge page and THP huge | 44 | * We need to differentiate between explicit huge page and THP huge |
50 | * page, since THP huge page also need to track real subpage details | 45 | * page, since THP huge page also need to track real subpage details |
51 | */ | 46 | */ |
@@ -54,9 +49,8 @@ | |||
54 | /* | 49 | /* |
55 | * set of bits not changed in pmd_modify. | 50 | * set of bits not changed in pmd_modify. |
56 | */ | 51 | */ |
57 | #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | \ | 52 | #define _HPAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \ |
58 | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPLITTING | \ | 53 | _PAGE_ACCESSED | _PAGE_THP_HUGE) |
59 | _PAGE_THP_HUGE | _PAGE_PTE | _PAGE_SOFT_DIRTY) | ||
60 | 54 | ||
61 | #ifdef CONFIG_PPC_64K_PAGES | 55 | #ifdef CONFIG_PPC_64K_PAGES |
62 | #include <asm/book3s/64/hash-64k.h> | 56 | #include <asm/book3s/64/hash-64k.h> |
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index b3a5badab69f..8204b0c393aa 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h | |||
@@ -223,9 +223,11 @@ static inline pte_t *pmdp_ptep(pmd_t *pmd) | |||
223 | #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) | 223 | #define pmd_pfn(pmd) pte_pfn(pmd_pte(pmd)) |
224 | #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) | 224 | #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) |
225 | #define pmd_young(pmd) pte_young(pmd_pte(pmd)) | 225 | #define pmd_young(pmd) pte_young(pmd_pte(pmd)) |
226 | #define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd)) | ||
226 | #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) | 227 | #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) |
227 | #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) | 228 | #define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd))) |
228 | #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) | 229 | #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) |
230 | #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) | ||
229 | #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) | 231 | #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) |
230 | #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) | 232 | #define pmd_mkwrite(pmd) pte_pmd(pte_mkwrite(pmd_pte(pmd))) |
231 | 233 | ||
@@ -266,10 +268,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, | |||
266 | extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, | 268 | extern pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, |
267 | unsigned long addr, pmd_t *pmdp); | 269 | unsigned long addr, pmd_t *pmdp); |
268 | 270 | ||
269 | #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
270 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
271 | unsigned long address, pmd_t *pmdp); | ||
272 | |||
273 | extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, | 271 | extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, |
274 | unsigned long address, pmd_t *pmdp); | 272 | unsigned long address, pmd_t *pmdp); |
275 | #define pmdp_collapse_flush pmdp_collapse_flush | 273 | #define pmdp_collapse_flush pmdp_collapse_flush |
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 9fac01cb89c1..8f39796c9da8 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h | |||
@@ -154,8 +154,8 @@ extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, | |||
154 | bool upper, u32 val); | 154 | bool upper, u32 val); |
155 | extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); | 155 | extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr); |
156 | extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); | 156 | extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu); |
157 | extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, | 157 | extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, |
158 | bool *writable); | 158 | bool writing, bool *writable); |
159 | extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, | 159 | extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, |
160 | unsigned long *rmap, long pte_index, int realmode); | 160 | unsigned long *rmap, long pte_index, int realmode); |
161 | extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize); | 161 | extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize); |
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c6ef05bd0765..2241d5357129 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h | |||
@@ -515,7 +515,7 @@ void kvmppc_claim_lpid(long lpid); | |||
515 | void kvmppc_free_lpid(long lpid); | 515 | void kvmppc_free_lpid(long lpid); |
516 | void kvmppc_init_lpid(unsigned long nr_lpids); | 516 | void kvmppc_init_lpid(unsigned long nr_lpids); |
517 | 517 | ||
518 | static inline void kvmppc_mmu_flush_icache(pfn_t pfn) | 518 | static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) |
519 | { | 519 | { |
520 | struct page *page; | 520 | struct page *page; |
521 | /* | 521 | /* |
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 099c79d8c160..638c6d9be9e0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c | |||
@@ -366,7 +366,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) | |||
366 | } | 366 | } |
367 | EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); | 367 | EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); |
368 | 368 | ||
369 | pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, | 369 | kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, |
370 | bool *writable) | 370 | bool *writable) |
371 | { | 371 | { |
372 | ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM; | 372 | ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM; |
@@ -379,9 +379,9 @@ pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, | |||
379 | gpa &= ~0xFFFULL; | 379 | gpa &= ~0xFFFULL; |
380 | if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) { | 380 | if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) { |
381 | ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; | 381 | ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; |
382 | pfn_t pfn; | 382 | kvm_pfn_t pfn; |
383 | 383 | ||
384 | pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; | 384 | pfn = (kvm_pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; |
385 | get_page(pfn_to_page(pfn)); | 385 | get_page(pfn_to_page(pfn)); |
386 | if (writable) | 386 | if (writable) |
387 | *writable = true; | 387 | *writable = true; |
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index d5c9bfeb0c9c..55c4d51ea3e2 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c | |||
@@ -142,7 +142,7 @@ extern char etext[]; | |||
142 | int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, | 142 | int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, |
143 | bool iswrite) | 143 | bool iswrite) |
144 | { | 144 | { |
145 | pfn_t hpaddr; | 145 | kvm_pfn_t hpaddr; |
146 | u64 vpn; | 146 | u64 vpn; |
147 | u64 vsid; | 147 | u64 vsid; |
148 | struct kvmppc_sid_map *map; | 148 | struct kvmppc_sid_map *map; |
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 79ad35abd196..913cd2198fa6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c | |||
@@ -83,7 +83,7 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, | |||
83 | bool iswrite) | 83 | bool iswrite) |
84 | { | 84 | { |
85 | unsigned long vpn; | 85 | unsigned long vpn; |
86 | pfn_t hpaddr; | 86 | kvm_pfn_t hpaddr; |
87 | ulong hash, hpteg; | 87 | ulong hash, hpteg; |
88 | u64 vsid; | 88 | u64 vsid; |
89 | int ret; | 89 | int ret; |
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h index 72920bed3ac6..94f04fcb373e 100644 --- a/arch/powerpc/kvm/e500.h +++ b/arch/powerpc/kvm/e500.h | |||
@@ -41,7 +41,7 @@ enum vcpu_ftr { | |||
41 | #define E500_TLB_MAS2_ATTR (0x7f) | 41 | #define E500_TLB_MAS2_ATTR (0x7f) |
42 | 42 | ||
43 | struct tlbe_ref { | 43 | struct tlbe_ref { |
44 | pfn_t pfn; /* valid only for TLB0, except briefly */ | 44 | kvm_pfn_t pfn; /* valid only for TLB0, except briefly */ |
45 | unsigned int flags; /* E500_TLB_* */ | 45 | unsigned int flags; /* E500_TLB_* */ |
46 | }; | 46 | }; |
47 | 47 | ||
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 34c43fff4adb..b0333cc737dd 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c | |||
@@ -163,9 +163,9 @@ void kvmppc_map_magic(struct kvm_vcpu *vcpu) | |||
163 | struct kvm_book3e_206_tlb_entry magic; | 163 | struct kvm_book3e_206_tlb_entry magic; |
164 | ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; | 164 | ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; |
165 | unsigned int stid; | 165 | unsigned int stid; |
166 | pfn_t pfn; | 166 | kvm_pfn_t pfn; |
167 | 167 | ||
168 | pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; | 168 | pfn = (kvm_pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; |
169 | get_page(pfn_to_page(pfn)); | 169 | get_page(pfn_to_page(pfn)); |
170 | 170 | ||
171 | preempt_disable(); | 171 | preempt_disable(); |
@@ -246,7 +246,7 @@ static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) | |||
246 | 246 | ||
247 | static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, | 247 | static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, |
248 | struct kvm_book3e_206_tlb_entry *gtlbe, | 248 | struct kvm_book3e_206_tlb_entry *gtlbe, |
249 | pfn_t pfn, unsigned int wimg) | 249 | kvm_pfn_t pfn, unsigned int wimg) |
250 | { | 250 | { |
251 | ref->pfn = pfn; | 251 | ref->pfn = pfn; |
252 | ref->flags = E500_TLB_VALID; | 252 | ref->flags = E500_TLB_VALID; |
@@ -309,7 +309,7 @@ static void kvmppc_e500_setup_stlbe( | |||
309 | int tsize, struct tlbe_ref *ref, u64 gvaddr, | 309 | int tsize, struct tlbe_ref *ref, u64 gvaddr, |
310 | struct kvm_book3e_206_tlb_entry *stlbe) | 310 | struct kvm_book3e_206_tlb_entry *stlbe) |
311 | { | 311 | { |
312 | pfn_t pfn = ref->pfn; | 312 | kvm_pfn_t pfn = ref->pfn; |
313 | u32 pr = vcpu->arch.shared->msr & MSR_PR; | 313 | u32 pr = vcpu->arch.shared->msr & MSR_PR; |
314 | 314 | ||
315 | BUG_ON(!(ref->flags & E500_TLB_VALID)); | 315 | BUG_ON(!(ref->flags & E500_TLB_VALID)); |
diff --git a/arch/powerpc/kvm/trace_pr.h b/arch/powerpc/kvm/trace_pr.h index 810507cb688a..d44f324184fb 100644 --- a/arch/powerpc/kvm/trace_pr.h +++ b/arch/powerpc/kvm/trace_pr.h | |||
@@ -30,7 +30,7 @@ TRACE_EVENT(kvm_book3s_reenter, | |||
30 | #ifdef CONFIG_PPC_BOOK3S_64 | 30 | #ifdef CONFIG_PPC_BOOK3S_64 |
31 | 31 | ||
32 | TRACE_EVENT(kvm_book3s_64_mmu_map, | 32 | TRACE_EVENT(kvm_book3s_64_mmu_map, |
33 | TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, | 33 | TP_PROTO(int rflags, ulong hpteg, ulong va, kvm_pfn_t hpaddr, |
34 | struct kvmppc_pte *orig_pte), | 34 | struct kvmppc_pte *orig_pte), |
35 | TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), | 35 | TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), |
36 | 36 | ||
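The pfn_t -> kvm_pfn_t renames in the KVM files above exist because this series introduces a core pfn_t in <linux/pfn_t.h> that carries flag bits alongside the frame number, so KVM's plain integer type had to give up the name. Roughly, and simplified rather than quoting the exact kernel definitions:

    /* Core type, <linux/pfn_t.h>: a frame number plus flags such as
     * PFN_DEV encoded in the high bits (simplified). */
    typedef struct { u64 val; } pfn_t;

    /* KVM's type, <linux/kvm_types.h>: still a plain frame number, now
     * under a name that cannot collide with the core one (simplified). */
    typedef u64 kvm_pfn_t;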
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c index baf1301ded0c..49b152b0f926 100644 --- a/arch/powerpc/mm/hugepage-hash64.c +++ b/arch/powerpc/mm/hugepage-hash64.c | |||
@@ -39,9 +39,6 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid, | |||
39 | /* If PMD busy, retry the access */ | 39 | /* If PMD busy, retry the access */ |
40 | if (unlikely(old_pmd & _PAGE_BUSY)) | 40 | if (unlikely(old_pmd & _PAGE_BUSY)) |
41 | return 0; | 41 | return 0; |
42 | /* If PMD is trans splitting retry the access */ | ||
43 | if (unlikely(old_pmd & _PAGE_SPLITTING)) | ||
44 | return 0; | ||
45 | /* If PMD permissions don't match, take page fault */ | 42 | /* If PMD permissions don't match, take page fault */ |
46 | if (unlikely(access & ~old_pmd)) | 43 | if (unlikely(access & ~old_pmd)) |
47 | return 1; | 44 | return 1; |
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 61b8b7ccea4f..744e24bcb85c 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c | |||
@@ -958,10 +958,6 @@ pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, | |||
958 | /* | 958 | /* |
959 | * A hugepage collapse is captured by pmd_none, because | 959 | * A hugepage collapse is captured by pmd_none, because |
960 | * it mark the pmd none and do a hpte invalidate. | 960 | * it mark the pmd none and do a hpte invalidate. |
961 | * | ||
962 | * We don't worry about pmd_trans_splitting here, The | ||
963 | * caller if it needs to handle the splitting case | ||
964 | * should check for that. | ||
965 | */ | 961 | */ |
966 | if (pmd_none(pmd)) | 962 | if (pmd_none(pmd)) |
967 | return NULL; | 963 | return NULL; |
@@ -999,7 +995,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | |||
999 | { | 995 | { |
1000 | unsigned long mask; | 996 | unsigned long mask; |
1001 | unsigned long pte_end; | 997 | unsigned long pte_end; |
1002 | struct page *head, *page, *tail; | 998 | struct page *head, *page; |
1003 | pte_t pte; | 999 | pte_t pte; |
1004 | int refs; | 1000 | int refs; |
1005 | 1001 | ||
@@ -1022,7 +1018,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | |||
1022 | head = pte_page(pte); | 1018 | head = pte_page(pte); |
1023 | 1019 | ||
1024 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); | 1020 | page = head + ((addr & (sz-1)) >> PAGE_SHIFT); |
1025 | tail = page; | ||
1026 | do { | 1021 | do { |
1027 | VM_BUG_ON(compound_head(page) != head); | 1022 | VM_BUG_ON(compound_head(page) != head); |
1028 | pages[*nr] = page; | 1023 | pages[*nr] = page; |
@@ -1044,15 +1039,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, | |||
1044 | return 0; | 1039 | return 0; |
1045 | } | 1040 | } |
1046 | 1041 | ||
1047 | /* | ||
1048 | * Any tail page need their mapcount reference taken before we | ||
1049 | * return. | ||
1050 | */ | ||
1051 | while (refs--) { | ||
1052 | if (PageTail(tail)) | ||
1053 | get_huge_page_tail(tail); | ||
1054 | tail++; | ||
1055 | } | ||
1056 | |||
1057 | return 1; | 1042 | return 1; |
1058 | } | 1043 | } |
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index ea6bc31debb0..3124a20d0fab 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c | |||
@@ -604,55 +604,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, | |||
604 | } | 604 | } |
605 | 605 | ||
606 | /* | 606 | /* |
607 | * We mark the pmd splitting and invalidate all the hpte | ||
608 | * entries for this hugepage. | ||
609 | */ | ||
610 | void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
611 | unsigned long address, pmd_t *pmdp) | ||
612 | { | ||
613 | unsigned long old, tmp; | ||
614 | |||
615 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
616 | |||
617 | #ifdef CONFIG_DEBUG_VM | ||
618 | WARN_ON(!pmd_trans_huge(*pmdp)); | ||
619 | assert_spin_locked(&vma->vm_mm->page_table_lock); | ||
620 | #endif | ||
621 | |||
622 | #ifdef PTE_ATOMIC_UPDATES | ||
623 | |||
624 | __asm__ __volatile__( | ||
625 | "1: ldarx %0,0,%3\n\ | ||
626 | andi. %1,%0,%6\n\ | ||
627 | bne- 1b \n\ | ||
628 | oris %1,%0,%4@h \n\ | ||
629 | stdcx. %1,0,%3 \n\ | ||
630 | bne- 1b" | ||
631 | : "=&r" (old), "=&r" (tmp), "=m" (*pmdp) | ||
632 | : "r" (pmdp), "i" (_PAGE_SPLITTING), "m" (*pmdp), "i" (_PAGE_BUSY) | ||
633 | : "cc" ); | ||
634 | #else | ||
635 | old = pmd_val(*pmdp); | ||
636 | *pmdp = __pmd(old | _PAGE_SPLITTING); | ||
637 | #endif | ||
638 | /* | ||
639 | * If we didn't had the splitting flag set, go and flush the | ||
640 | * HPTE entries. | ||
641 | */ | ||
642 | trace_hugepage_splitting(address, old); | ||
643 | if (!(old & _PAGE_SPLITTING)) { | ||
644 | /* We need to flush the hpte */ | ||
645 | if (old & _PAGE_HASHPTE) | ||
646 | hpte_do_hugepage_flush(vma->vm_mm, address, pmdp, old); | ||
647 | } | ||
648 | /* | ||
649 | * This ensures that generic code that rely on IRQ disabling | ||
650 | * to prevent a parallel THP split work as expected. | ||
651 | */ | ||
652 | kick_all_cpus_sync(); | ||
653 | } | ||
654 | |||
655 | /* | ||
656 | * We want to put the pgtable in pmd and use pgtable for tracking | 607 | * We want to put the pgtable in pmd and use pgtable for tracking |
657 | * the base page size hptes | 608 | * the base page size hptes |
658 | */ | 609 | */ |
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index fa9fb5b4c66c..d5543514c1df 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c | |||
@@ -135,7 +135,7 @@ static int subpage_walk_pmd_entry(pmd_t *pmd, unsigned long addr, | |||
135 | unsigned long end, struct mm_walk *walk) | 135 | unsigned long end, struct mm_walk *walk) |
136 | { | 136 | { |
137 | struct vm_area_struct *vma = walk->vma; | 137 | struct vm_area_struct *vma = walk->vma; |
138 | split_huge_page_pmd(vma, addr, pmd); | 138 | split_huge_pmd(vma, pmd, addr); |
139 | return 0; | 139 | return 0; |
140 | } | 140 | } |
141 | 141 | ||
diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index c713b349d967..0d112b94d91d 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/types.h> | 43 | #include <linux/types.h> |
44 | #include <linux/of_device.h> | 44 | #include <linux/of_device.h> |
45 | #include <linux/of_platform.h> | 45 | #include <linux/of_platform.h> |
46 | #include <linux/pfn_t.h> | ||
46 | 47 | ||
47 | #include <asm/page.h> | 48 | #include <asm/page.h> |
48 | #include <asm/prom.h> | 49 | #include <asm/prom.h> |
@@ -142,15 +143,13 @@ axon_ram_make_request(struct request_queue *queue, struct bio *bio) | |||
142 | */ | 143 | */ |
143 | static long | 144 | static long |
144 | axon_ram_direct_access(struct block_device *device, sector_t sector, | 145 | axon_ram_direct_access(struct block_device *device, sector_t sector, |
145 | void __pmem **kaddr, unsigned long *pfn) | 146 | void __pmem **kaddr, pfn_t *pfn) |
146 | { | 147 | { |
147 | struct axon_ram_bank *bank = device->bd_disk->private_data; | 148 | struct axon_ram_bank *bank = device->bd_disk->private_data; |
148 | loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; | 149 | loff_t offset = (loff_t)sector << AXON_RAM_SECTOR_SHIFT; |
149 | void *addr = (void *)(bank->ph_addr + offset); | ||
150 | |||
151 | *kaddr = (void __pmem *)addr; | ||
152 | *pfn = virt_to_phys(addr) >> PAGE_SHIFT; | ||
153 | 150 | ||
151 | *kaddr = (void __pmem __force *) bank->io_addr + offset; | ||
152 | *pfn = phys_to_pfn_t(bank->ph_addr + offset, PFN_DEV); | ||
154 | return bank->size - offset; | 153 | return bank->size - offset; |
155 | } | 154 | } |
156 | 155 | ||
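axon_ram_direct_access() above now hands back a pfn_t built with phys_to_pfn_t(..., PFN_DEV) instead of a bare frame number, which is what the new DAX msync/fsync tracking needs in order to recognise device-backed memory. A hedged sketch of what a ->direct_access() of this shape looks like; struct my_dev and its fields are placeholders, and only phys_to_pfn_t() and PFN_DEV are taken from <linux/pfn_t.h>:

    #include <linux/blkdev.h>
    #include <linux/pfn_t.h>

    struct my_dev {			/* hypothetical device state */
    	void __pmem *virt_base;		/* mapped view of the memory */
    	phys_addr_t phys_base;		/* its physical base address */
    	size_t size;
    };

    /* Sketch only: report how many bytes are directly addressable at
     * 'sector', and where, both as a kernel address and as a pfn_t. */
    static long my_direct_access(struct block_device *bdev, sector_t sector,
    			     void __pmem **kaddr, pfn_t *pfn)
    {
    	struct my_dev *dev = bdev->bd_disk->private_data;
    	loff_t offset = (loff_t)sector << 9;	/* 512-byte sectors */

    	*kaddr = dev->virt_base + offset;
    	*pfn = phys_to_pfn_t(dev->phys_base + offset, PFN_DEV);
    	return dev->size - offset;
    }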
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 24490344c30f..dbeeb3a049f2 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig | |||
@@ -10,9 +10,6 @@ config LOCKDEP_SUPPORT | |||
10 | config STACKTRACE_SUPPORT | 10 | config STACKTRACE_SUPPORT |
11 | def_bool y | 11 | def_bool y |
12 | 12 | ||
13 | config HAVE_LATENCYTOP_SUPPORT | ||
14 | def_bool y | ||
15 | |||
16 | config RWSEM_GENERIC_SPINLOCK | 13 | config RWSEM_GENERIC_SPINLOCK |
17 | bool | 14 | bool |
18 | 15 | ||
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 024f85f947ae..64ead8091248 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h | |||
@@ -286,7 +286,6 @@ static inline int is_module_addr(void *addr) | |||
286 | 286 | ||
287 | #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ | 287 | #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ |
288 | #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ | 288 | #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ |
289 | #define _SEGMENT_ENTRY_SPLIT 0x0800 /* THP splitting bit */ | ||
290 | #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ | 289 | #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ |
291 | #define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */ | 290 | #define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */ |
292 | #define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */ | 291 | #define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */ |
@@ -318,8 +317,6 @@ static inline int is_module_addr(void *addr) | |||
318 | * SW-bits: y young, d dirty, r read, w write | 317 | * SW-bits: y young, d dirty, r read, w write |
319 | */ | 318 | */ |
320 | 319 | ||
321 | #define _SEGMENT_ENTRY_SPLIT_BIT 11 /* THP splitting bit number */ | ||
322 | |||
323 | /* Page status table bits for virtualization */ | 320 | /* Page status table bits for virtualization */ |
324 | #define PGSTE_ACC_BITS 0xf000000000000000UL | 321 | #define PGSTE_ACC_BITS 0xf000000000000000UL |
325 | #define PGSTE_FP_BIT 0x0800000000000000UL | 322 | #define PGSTE_FP_BIT 0x0800000000000000UL |
@@ -523,10 +520,6 @@ static inline int pmd_bad(pmd_t pmd) | |||
523 | return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; | 520 | return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; |
524 | } | 521 | } |
525 | 522 | ||
526 | #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
527 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
528 | unsigned long addr, pmd_t *pmdp); | ||
529 | |||
530 | #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS | 523 | #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS |
531 | extern int pmdp_set_access_flags(struct vm_area_struct *vma, | 524 | extern int pmdp_set_access_flags(struct vm_area_struct *vma, |
532 | unsigned long address, pmd_t *pmdp, | 525 | unsigned long address, pmd_t *pmdp, |
@@ -1424,8 +1417,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) | |||
1424 | if (pmd_large(pmd)) { | 1417 | if (pmd_large(pmd)) { |
1425 | pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE | | 1418 | pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE | |
1426 | _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG | | 1419 | _SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG | |
1427 | _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT | | 1420 | _SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SOFT_DIRTY; |
1428 | _SEGMENT_ENTRY_SOFT_DIRTY; | ||
1429 | pmd_val(pmd) |= massage_pgprot_pmd(newprot); | 1421 | pmd_val(pmd) |= massage_pgprot_pmd(newprot); |
1430 | if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)) | 1422 | if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY)) |
1431 | pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; | 1423 | pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT; |
@@ -1533,12 +1525,6 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, | |||
1533 | #define __HAVE_ARCH_PGTABLE_WITHDRAW | 1525 | #define __HAVE_ARCH_PGTABLE_WITHDRAW |
1534 | extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); | 1526 | extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); |
1535 | 1527 | ||
1536 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
1537 | { | ||
1538 | return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) && | ||
1539 | (pmd_val(pmd) & _SEGMENT_ENTRY_SPLIT); | ||
1540 | } | ||
1541 | |||
1542 | static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, | 1528 | static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, |
1543 | pmd_t *pmdp, pmd_t entry) | 1529 | pmd_t *pmdp, pmd_t entry) |
1544 | { | 1530 | { |
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 21c74a71e2ab..13dab0c1645c 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c | |||
@@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, | |||
55 | unsigned long end, int write, struct page **pages, int *nr) | 55 | unsigned long end, int write, struct page **pages, int *nr) |
56 | { | 56 | { |
57 | unsigned long mask, result; | 57 | unsigned long mask, result; |
58 | struct page *head, *page, *tail; | 58 | struct page *head, *page; |
59 | int refs; | 59 | int refs; |
60 | 60 | ||
61 | result = write ? 0 : _SEGMENT_ENTRY_PROTECT; | 61 | result = write ? 0 : _SEGMENT_ENTRY_PROTECT; |
@@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, | |||
67 | refs = 0; | 67 | refs = 0; |
68 | head = pmd_page(pmd); | 68 | head = pmd_page(pmd); |
69 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | 69 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
70 | tail = page; | ||
71 | do { | 70 | do { |
72 | VM_BUG_ON(compound_head(page) != head); | 71 | VM_BUG_ON(compound_head(page) != head); |
73 | pages[*nr] = page; | 72 | pages[*nr] = page; |
@@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, | |||
88 | return 0; | 87 | return 0; |
89 | } | 88 | } |
90 | 89 | ||
91 | /* | ||
92 | * Any tail page need their mapcount reference taken before we | ||
93 | * return. | ||
94 | */ | ||
95 | while (refs--) { | ||
96 | if (PageTail(tail)) | ||
97 | get_huge_page_tail(tail); | ||
98 | tail++; | ||
99 | } | ||
100 | |||
101 | return 1; | 90 | return 1; |
102 | } | 91 | } |
103 | 92 | ||
@@ -116,16 +105,7 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, | |||
116 | pmd = *pmdp; | 105 | pmd = *pmdp; |
117 | barrier(); | 106 | barrier(); |
118 | next = pmd_addr_end(addr, end); | 107 | next = pmd_addr_end(addr, end); |
119 | /* | 108 | if (pmd_none(pmd)) |
120 | * The pmd_trans_splitting() check below explains why | ||
121 | * pmdp_splitting_flush() has to serialize with | ||
122 | * smp_call_function() against our disabled IRQs, to stop | ||
123 | * this gup-fast code from running while we set the | ||
124 | * splitting bit in the pmd. Returning zero will take | ||
125 | * the slow path that will call wait_split_huge_page() | ||
126 | * if the pmd is still in splitting state. | ||
127 | */ | ||
128 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
129 | return 0; | 109 | return 0; |
130 | if (unlikely(pmd_large(pmd))) { | 110 | if (unlikely(pmd_large(pmd))) { |
131 | /* | 111 | /* |
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index aa34af0a0b26..a809fa8e6f8b 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c | |||
@@ -578,17 +578,29 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr, | |||
578 | { | 578 | { |
579 | unsigned long vmaddr; | 579 | unsigned long vmaddr; |
580 | int rc; | 580 | int rc; |
581 | bool unlocked; | ||
581 | 582 | ||
582 | down_read(&gmap->mm->mmap_sem); | 583 | down_read(&gmap->mm->mmap_sem); |
584 | |||
585 | retry: | ||
586 | unlocked = false; | ||
583 | vmaddr = __gmap_translate(gmap, gaddr); | 587 | vmaddr = __gmap_translate(gmap, gaddr); |
584 | if (IS_ERR_VALUE(vmaddr)) { | 588 | if (IS_ERR_VALUE(vmaddr)) { |
585 | rc = vmaddr; | 589 | rc = vmaddr; |
586 | goto out_up; | 590 | goto out_up; |
587 | } | 591 | } |
588 | if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) { | 592 | if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, |
593 | &unlocked)) { | ||
589 | rc = -EFAULT; | 594 | rc = -EFAULT; |
590 | goto out_up; | 595 | goto out_up; |
591 | } | 596 | } |
597 | /* | ||
598 | * In the case that fixup_user_fault unlocked the mmap_sem during | ||
599 | * faultin redo __gmap_translate to not race with a map/unmap_segment. | ||
600 | */ | ||
601 | if (unlocked) | ||
602 | goto retry; | ||
603 | |||
592 | rc = __gmap_link(gmap, gaddr, vmaddr); | 604 | rc = __gmap_link(gmap, gaddr, vmaddr); |
593 | out_up: | 605 | out_up: |
594 | up_read(&gmap->mm->mmap_sem); | 606 | up_read(&gmap->mm->mmap_sem); |
@@ -714,12 +726,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) | |||
714 | spinlock_t *ptl; | 726 | spinlock_t *ptl; |
715 | pte_t *ptep, entry; | 727 | pte_t *ptep, entry; |
716 | pgste_t pgste; | 728 | pgste_t pgste; |
729 | bool unlocked; | ||
717 | int rc = 0; | 730 | int rc = 0; |
718 | 731 | ||
719 | if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) | 732 | if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK)) |
720 | return -EINVAL; | 733 | return -EINVAL; |
721 | down_read(&gmap->mm->mmap_sem); | 734 | down_read(&gmap->mm->mmap_sem); |
722 | while (len) { | 735 | while (len) { |
736 | unlocked = false; | ||
723 | /* Convert gmap address and connect the page tables */ | 737 | /* Convert gmap address and connect the page tables */ |
724 | addr = __gmap_translate(gmap, gaddr); | 738 | addr = __gmap_translate(gmap, gaddr); |
725 | if (IS_ERR_VALUE(addr)) { | 739 | if (IS_ERR_VALUE(addr)) { |
@@ -727,10 +741,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len) | |||
727 | break; | 741 | break; |
728 | } | 742 | } |
729 | /* Get the page mapped */ | 743 | /* Get the page mapped */ |
730 | if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) { | 744 | if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE, |
745 | &unlocked)) { | ||
731 | rc = -EFAULT; | 746 | rc = -EFAULT; |
732 | break; | 747 | break; |
733 | } | 748 | } |
749 | /* While trying to map mmap_sem got unlocked. Let us retry */ | ||
750 | if (unlocked) | ||
751 | continue; | ||
734 | rc = __gmap_link(gmap, gaddr, addr); | 752 | rc = __gmap_link(gmap, gaddr, addr); |
735 | if (rc) | 753 | if (rc) |
736 | break; | 754 | break; |
@@ -791,9 +809,11 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, | |||
791 | spinlock_t *ptl; | 809 | spinlock_t *ptl; |
792 | pgste_t old, new; | 810 | pgste_t old, new; |
793 | pte_t *ptep; | 811 | pte_t *ptep; |
812 | bool unlocked; | ||
794 | 813 | ||
795 | down_read(&mm->mmap_sem); | 814 | down_read(&mm->mmap_sem); |
796 | retry: | 815 | retry: |
816 | unlocked = false; | ||
797 | ptep = get_locked_pte(mm, addr, &ptl); | 817 | ptep = get_locked_pte(mm, addr, &ptl); |
798 | if (unlikely(!ptep)) { | 818 | if (unlikely(!ptep)) { |
799 | up_read(&mm->mmap_sem); | 819 | up_read(&mm->mmap_sem); |
@@ -802,7 +822,12 @@ retry: | |||
802 | if (!(pte_val(*ptep) & _PAGE_INVALID) && | 822 | if (!(pte_val(*ptep) & _PAGE_INVALID) && |
803 | (pte_val(*ptep) & _PAGE_PROTECT)) { | 823 | (pte_val(*ptep) & _PAGE_PROTECT)) { |
804 | pte_unmap_unlock(ptep, ptl); | 824 | pte_unmap_unlock(ptep, ptl); |
805 | if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { | 825 | /* |
826 | * We do not really care about unlocked. We will retry either | ||
827 | * way. But this allows fixup_user_fault to enable userfaultfd. | ||
828 | */ | ||
829 | if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE, | ||
830 | &unlocked)) { | ||
806 | up_read(&mm->mmap_sem); | 831 | up_read(&mm->mmap_sem); |
807 | return -EFAULT; | 832 | return -EFAULT; |
808 | } | 833 | } |
@@ -1305,22 +1330,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, | |||
1305 | return 1; | 1330 | return 1; |
1306 | } | 1331 | } |
1307 | 1332 | ||
1308 | static void pmdp_splitting_flush_sync(void *arg) | ||
1309 | { | ||
1310 | /* Simply deliver the interrupt */ | ||
1311 | } | ||
1312 | |||
1313 | void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | ||
1314 | pmd_t *pmdp) | ||
1315 | { | ||
1316 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
1317 | if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT, | ||
1318 | (unsigned long *) pmdp)) { | ||
1319 | /* need to serialize against gup-fast (IRQ disabled) */ | ||
1320 | smp_call_function(pmdp_splitting_flush_sync, NULL, 1); | ||
1321 | } | ||
1322 | } | ||
1323 | |||
1324 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, | 1333 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
1325 | pgtable_t pgtable) | 1334 | pgtable_t pgtable) |
1326 | { | 1335 | { |
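fixup_user_fault() gained a bool *unlocked output in this series: it may now drop and retake mmap_sem while resolving the fault (which is what allows userfaultfd to be handled here), so the s390 callers above revalidate anything derived from the address-space layout whenever the lock was dropped. The control flow, reduced to a sketch with simplified error handling; translate() and link_range() are placeholders, not kernel APIs:

    #include <linux/mm.h>
    #include <linux/sched.h>

    /* Placeholders for steps whose results depend on mmap_sem being held
     * continuously; they must be redone if the lock was dropped. */
    extern unsigned long translate(unsigned long gaddr);
    extern int link_range(unsigned long gaddr, unsigned long vmaddr);

    static int fault_in_and_link(struct mm_struct *mm, unsigned long gaddr,
    			     unsigned int fault_flags)
    {
    	unsigned long vmaddr;
    	bool unlocked;
    	int rc;

    	down_read(&mm->mmap_sem);
    retry:
    	unlocked = false;
    	vmaddr = translate(gaddr);	/* valid only while mmap_sem stays held */
    	if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked)) {
    		rc = -EFAULT;
    		goto out;
    	}
    	if (unlocked)			/* lock was dropped: translation is stale */
    		goto retry;
    	rc = link_range(gaddr, vmaddr);	/* safe: lock held throughout */
    out:
    	up_read(&mm->mmap_sem);
    	return rc;
    }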
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index d514df7e04dd..6c391a5d3e5c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig | |||
@@ -130,9 +130,6 @@ config STACKTRACE_SUPPORT | |||
130 | config LOCKDEP_SUPPORT | 130 | config LOCKDEP_SUPPORT |
131 | def_bool y | 131 | def_bool y |
132 | 132 | ||
133 | config HAVE_LATENCYTOP_SUPPORT | ||
134 | def_bool y | ||
135 | |||
136 | config ARCH_HAS_ILOG2_U32 | 133 | config ARCH_HAS_ILOG2_U32 |
137 | def_bool n | 134 | def_bool n |
138 | 135 | ||
diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 51d8f7f31d1d..58aaa4f33b81 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c | |||
@@ -241,7 +241,7 @@ static void sh4_flush_cache_page(void *args) | |||
241 | */ | 241 | */ |
242 | map_coherent = (current_cpu_data.dcache.n_aliases && | 242 | map_coherent = (current_cpu_data.dcache.n_aliases && |
243 | test_bit(PG_dcache_clean, &page->flags) && | 243 | test_bit(PG_dcache_clean, &page->flags) && |
244 | page_mapped(page)); | 244 | page_mapcount(page)); |
245 | if (map_coherent) | 245 | if (map_coherent) |
246 | vaddr = kmap_coherent(page, address); | 246 | vaddr = kmap_coherent(page, address); |
247 | else | 247 | else |
diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index f770e3992620..e58cfbf45150 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c | |||
@@ -59,7 +59,7 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, | |||
59 | unsigned long vaddr, void *dst, const void *src, | 59 | unsigned long vaddr, void *dst, const void *src, |
60 | unsigned long len) | 60 | unsigned long len) |
61 | { | 61 | { |
62 | if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && | 62 | if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && |
63 | test_bit(PG_dcache_clean, &page->flags)) { | 63 | test_bit(PG_dcache_clean, &page->flags)) { |
64 | void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); | 64 | void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); |
65 | memcpy(vto, src, len); | 65 | memcpy(vto, src, len); |
@@ -78,7 +78,7 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, | |||
78 | unsigned long vaddr, void *dst, const void *src, | 78 | unsigned long vaddr, void *dst, const void *src, |
79 | unsigned long len) | 79 | unsigned long len) |
80 | { | 80 | { |
81 | if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && | 81 | if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && |
82 | test_bit(PG_dcache_clean, &page->flags)) { | 82 | test_bit(PG_dcache_clean, &page->flags)) { |
83 | void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); | 83 | void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); |
84 | memcpy(dst, vfrom, len); | 84 | memcpy(dst, vfrom, len); |
@@ -97,7 +97,7 @@ void copy_user_highpage(struct page *to, struct page *from, | |||
97 | 97 | ||
98 | vto = kmap_atomic(to); | 98 | vto = kmap_atomic(to); |
99 | 99 | ||
100 | if (boot_cpu_data.dcache.n_aliases && page_mapped(from) && | 100 | if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) && |
101 | test_bit(PG_dcache_clean, &from->flags)) { | 101 | test_bit(PG_dcache_clean, &from->flags)) { |
102 | vfrom = kmap_coherent(from, vaddr); | 102 | vfrom = kmap_coherent(from, vaddr); |
103 | copy_page(vto, vfrom); | 103 | copy_page(vto, vfrom); |
@@ -153,7 +153,7 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr) | |||
153 | unsigned long addr = (unsigned long) page_address(page); | 153 | unsigned long addr = (unsigned long) page_address(page); |
154 | 154 | ||
155 | if (pages_do_alias(addr, vmaddr)) { | 155 | if (pages_do_alias(addr, vmaddr)) { |
156 | if (boot_cpu_data.dcache.n_aliases && page_mapped(page) && | 156 | if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && |
157 | test_bit(PG_dcache_clean, &page->flags)) { | 157 | test_bit(PG_dcache_clean, &page->flags)) { |
158 | void *kaddr; | 158 | void *kaddr; |
159 | 159 | ||
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 56442d2d7bbc..3203e42190dd 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig | |||
@@ -101,10 +101,6 @@ config LOCKDEP_SUPPORT | |||
101 | bool | 101 | bool |
102 | default y if SPARC64 | 102 | default y if SPARC64 |
103 | 103 | ||
104 | config HAVE_LATENCYTOP_SUPPORT | ||
105 | bool | ||
106 | default y if SPARC64 | ||
107 | |||
108 | config ARCH_HIBERNATION_POSSIBLE | 104 | config ARCH_HIBERNATION_POSSIBLE |
109 | def_bool y if SPARC64 | 105 | def_bool y if SPARC64 |
110 | 106 | ||
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 131d36fcd07a..7a38d6a576c5 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h | |||
@@ -681,13 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd) | |||
681 | return pte_val(pte) & _PAGE_PMD_HUGE; | 681 | return pte_val(pte) & _PAGE_PMD_HUGE; |
682 | } | 682 | } |
683 | 683 | ||
684 | static inline unsigned long pmd_trans_splitting(pmd_t pmd) | ||
685 | { | ||
686 | pte_t pte = __pte(pmd_val(pmd)); | ||
687 | |||
688 | return pmd_trans_huge(pmd) && pte_special(pte); | ||
689 | } | ||
690 | |||
691 | #define has_transparent_hugepage() 1 | 684 | #define has_transparent_hugepage() 1 |
692 | 685 | ||
693 | static inline pmd_t pmd_mkold(pmd_t pmd) | 686 | static inline pmd_t pmd_mkold(pmd_t pmd) |
@@ -717,29 +710,29 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) | |||
717 | return __pmd(pte_val(pte)); | 710 | return __pmd(pte_val(pte)); |
718 | } | 711 | } |
719 | 712 | ||
720 | static inline pmd_t pmd_mkyoung(pmd_t pmd) | 713 | static inline pmd_t pmd_mkclean(pmd_t pmd) |
721 | { | 714 | { |
722 | pte_t pte = __pte(pmd_val(pmd)); | 715 | pte_t pte = __pte(pmd_val(pmd)); |
723 | 716 | ||
724 | pte = pte_mkyoung(pte); | 717 | pte = pte_mkclean(pte); |
725 | 718 | ||
726 | return __pmd(pte_val(pte)); | 719 | return __pmd(pte_val(pte)); |
727 | } | 720 | } |
728 | 721 | ||
729 | static inline pmd_t pmd_mkwrite(pmd_t pmd) | 722 | static inline pmd_t pmd_mkyoung(pmd_t pmd) |
730 | { | 723 | { |
731 | pte_t pte = __pte(pmd_val(pmd)); | 724 | pte_t pte = __pte(pmd_val(pmd)); |
732 | 725 | ||
733 | pte = pte_mkwrite(pte); | 726 | pte = pte_mkyoung(pte); |
734 | 727 | ||
735 | return __pmd(pte_val(pte)); | 728 | return __pmd(pte_val(pte)); |
736 | } | 729 | } |
737 | 730 | ||
738 | static inline pmd_t pmd_mksplitting(pmd_t pmd) | 731 | static inline pmd_t pmd_mkwrite(pmd_t pmd) |
739 | { | 732 | { |
740 | pte_t pte = __pte(pmd_val(pmd)); | 733 | pte_t pte = __pte(pmd_val(pmd)); |
741 | 734 | ||
742 | pte = pte_mkspecial(pte); | 735 | pte = pte_mkwrite(pte); |
743 | 736 | ||
744 | return __pmd(pte_val(pte)); | 737 | return __pmd(pte_val(pte)); |
745 | } | 738 | } |
diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c index dbabe5713a15..cb841a33da59 100644 --- a/arch/sparc/mm/fault_64.c +++ b/arch/sparc/mm/fault_64.c | |||
@@ -113,9 +113,6 @@ static unsigned int get_user_insn(unsigned long tpc) | |||
113 | 113 | ||
114 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 114 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
115 | if (pmd_trans_huge(*pmdp)) { | 115 | if (pmd_trans_huge(*pmdp)) { |
116 | if (pmd_trans_splitting(*pmdp)) | ||
117 | goto out_irq_enable; | ||
118 | |||
119 | pa = pmd_pfn(*pmdp) << PAGE_SHIFT; | 116 | pa = pmd_pfn(*pmdp) << PAGE_SHIFT; |
120 | pa += tpc & ~HPAGE_MASK; | 117 | pa += tpc & ~HPAGE_MASK; |
121 | 118 | ||
diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 2e5c4fc2daa9..eb3d8e8ebc6b 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c | |||
@@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | |||
56 | put_page(head); | 56 | put_page(head); |
57 | return 0; | 57 | return 0; |
58 | } | 58 | } |
59 | if (head != page) | ||
60 | get_huge_page_tail(page); | ||
61 | 59 | ||
62 | pages[*nr] = page; | 60 | pages[*nr] = page; |
63 | (*nr)++; | 61 | (*nr)++; |
@@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, | |||
70 | unsigned long end, int write, struct page **pages, | 68 | unsigned long end, int write, struct page **pages, |
71 | int *nr) | 69 | int *nr) |
72 | { | 70 | { |
73 | struct page *head, *page, *tail; | 71 | struct page *head, *page; |
74 | int refs; | 72 | int refs; |
75 | 73 | ||
76 | if (!(pmd_val(pmd) & _PAGE_VALID)) | 74 | if (!(pmd_val(pmd) & _PAGE_VALID)) |
@@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, | |||
82 | refs = 0; | 80 | refs = 0; |
83 | head = pmd_page(pmd); | 81 | head = pmd_page(pmd); |
84 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | 82 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
85 | tail = page; | ||
86 | do { | 83 | do { |
87 | VM_BUG_ON(compound_head(page) != head); | 84 | VM_BUG_ON(compound_head(page) != head); |
88 | pages[*nr] = page; | 85 | pages[*nr] = page; |
@@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, | |||
103 | return 0; | 100 | return 0; |
104 | } | 101 | } |
105 | 102 | ||
106 | /* Any tail page need their mapcount reference taken before we | ||
107 | * return. | ||
108 | */ | ||
109 | while (refs--) { | ||
110 | if (PageTail(tail)) | ||
111 | get_huge_page_tail(tail); | ||
112 | tail++; | ||
113 | } | ||
114 | |||
115 | return 1; | 103 | return 1; |
116 | } | 104 | } |
117 | 105 | ||
@@ -126,7 +114,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
126 | pmd_t pmd = *pmdp; | 114 | pmd_t pmd = *pmdp; |
127 | 115 | ||
128 | next = pmd_addr_end(addr, end); | 116 | next = pmd_addr_end(addr, end); |
129 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | 117 | if (pmd_none(pmd)) |
130 | return 0; | 118 | return 0; |
131 | if (unlikely(pmd_large(pmd))) { | 119 | if (unlikely(pmd_large(pmd))) { |
132 | if (!gup_huge_pmd(pmdp, pmd, addr, next, | 120 | if (!gup_huge_pmd(pmdp, pmd, addr, next, |
diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 2b05ccbebed9..96cecf55522e 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h | |||
@@ -489,16 +489,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) | |||
489 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 489 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
490 | #define has_transparent_hugepage() 1 | 490 | #define has_transparent_hugepage() 1 |
491 | #define pmd_trans_huge pmd_huge_page | 491 | #define pmd_trans_huge pmd_huge_page |
492 | |||
493 | static inline pmd_t pmd_mksplitting(pmd_t pmd) | ||
494 | { | ||
495 | return pte_pmd(hv_pte_set_client2(pmd_pte(pmd))); | ||
496 | } | ||
497 | |||
498 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
499 | { | ||
500 | return hv_pte_get_client2(pmd_pte(pmd)); | ||
501 | } | ||
502 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 492 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
503 | 493 | ||
504 | /* | 494 | /* |
diff --git a/arch/um/include/asm/page.h b/arch/um/include/asm/page.h index 71c5d132062a..e13d41c392ae 100644 --- a/arch/um/include/asm/page.h +++ b/arch/um/include/asm/page.h | |||
@@ -18,6 +18,7 @@ | |||
18 | 18 | ||
19 | struct page; | 19 | struct page; |
20 | 20 | ||
21 | #include <linux/pfn.h> | ||
21 | #include <linux/types.h> | 22 | #include <linux/types.h> |
22 | #include <asm/vm-flags.h> | 23 | #include <asm/vm-flags.h> |
23 | 24 | ||
@@ -52,7 +53,6 @@ typedef struct { unsigned long pgd; } pgd_t; | |||
52 | #define pmd_val(x) ((x).pmd) | 53 | #define pmd_val(x) ((x).pmd) |
53 | #define __pmd(x) ((pmd_t) { (x) } ) | 54 | #define __pmd(x) ((pmd_t) { (x) } ) |
54 | 55 | ||
55 | typedef unsigned long long pfn_t; | ||
56 | typedef unsigned long long phys_t; | 56 | typedef unsigned long long phys_t; |
57 | 57 | ||
58 | #else | 58 | #else |
@@ -76,7 +76,6 @@ typedef struct { unsigned long pmd; } pmd_t; | |||
76 | #define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) | 76 | #define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE)) |
77 | #define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot)) | 77 | #define pte_set_val(p, phys, prot) (p).pte = (phys | pgprot_val(prot)) |
78 | 78 | ||
79 | typedef unsigned long pfn_t; | ||
80 | typedef unsigned long phys_t; | 79 | typedef unsigned long phys_t; |
81 | 80 | ||
82 | #endif | 81 | #endif |
@@ -109,8 +108,8 @@ extern unsigned long uml_physmem; | |||
109 | #define __pa(virt) to_phys((void *) (unsigned long) (virt)) | 108 | #define __pa(virt) to_phys((void *) (unsigned long) (virt)) |
110 | #define __va(phys) to_virt((unsigned long) (phys)) | 109 | #define __va(phys) to_virt((unsigned long) (phys)) |
111 | 110 | ||
112 | #define phys_to_pfn(p) ((pfn_t) ((p) >> PAGE_SHIFT)) | 111 | #define phys_to_pfn(p) ((p) >> PAGE_SHIFT) |
113 | #define pfn_to_phys(pfn) ((phys_t) ((pfn) << PAGE_SHIFT)) | 112 | #define pfn_to_phys(pfn) PFN_PHYS(pfn) |
114 | 113 | ||
115 | #define pfn_valid(pfn) ((pfn) < max_mapnr) | 114 | #define pfn_valid(pfn) ((pfn) < max_mapnr) |
116 | #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) | 115 | #define virt_addr_valid(v) pfn_valid(phys_to_pfn(__pa(v))) |
diff --git a/arch/um/include/asm/pgtable-3level.h b/arch/um/include/asm/pgtable-3level.h index 2b4274e7c095..bae8523a162f 100644 --- a/arch/um/include/asm/pgtable-3level.h +++ b/arch/um/include/asm/pgtable-3level.h | |||
@@ -98,7 +98,7 @@ static inline unsigned long pte_pfn(pte_t pte) | |||
98 | return phys_to_pfn(pte_val(pte)); | 98 | return phys_to_pfn(pte_val(pte)); |
99 | } | 99 | } |
100 | 100 | ||
101 | static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot) | 101 | static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) |
102 | { | 102 | { |
103 | pte_t pte; | 103 | pte_t pte; |
104 | phys_t phys = pfn_to_phys(page_nr); | 104 | phys_t phys = pfn_to_phys(page_nr); |
@@ -107,7 +107,7 @@ static inline pte_t pfn_pte(pfn_t page_nr, pgprot_t pgprot) | |||
107 | return pte; | 107 | return pte; |
108 | } | 108 | } |
109 | 109 | ||
110 | static inline pmd_t pfn_pmd(pfn_t page_nr, pgprot_t pgprot) | 110 | static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) |
111 | { | 111 | { |
112 | return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); | 112 | return __pmd((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); |
113 | } | 113 | } |
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index 18eb9924dda3..7485398d0737 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h | |||
@@ -271,7 +271,7 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) | |||
271 | 271 | ||
272 | #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) | 272 | #define phys_to_page(phys) pfn_to_page(phys_to_pfn(phys)) |
273 | #define __virt_to_page(virt) phys_to_page(__pa(virt)) | 273 | #define __virt_to_page(virt) phys_to_page(__pa(virt)) |
274 | #define page_to_phys(page) pfn_to_phys((pfn_t) page_to_pfn(page)) | 274 | #define page_to_phys(page) pfn_to_phys(page_to_pfn(page)) |
275 | #define virt_to_page(addr) __virt_to_page((const unsigned long) addr) | 275 | #define virt_to_page(addr) __virt_to_page((const unsigned long) addr) |
276 | 276 | ||
277 | #define mk_pte(page, pgprot) \ | 277 | #define mk_pte(page, pgprot) \ |
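The UML hunks above drop the private pfn_t typedefs (the name now belongs to the core flag-carrying type from <linux/pfn_t.h>) and lean on the generic PFN_PHYS() helper from <linux/pfn.h>; the mn10300 change earlier similarly defines __pfn_to_phys() via PFN_PHYS(). The arithmetic those helpers capture is just a shift by PAGE_SHIFT; for illustration, with locally defined stand-ins rather than the kernel macros:

    #include <inttypes.h>
    #include <stdio.h>

    /* Stand-in values for the demo; in the kernel these come from
     * <asm/page.h> and <linux/pfn.h>. */
    #define PAGE_SHIFT	12
    #define PFN_PHYS(pfn)	((uint64_t)(pfn) << PAGE_SHIFT)
    #define PHYS_PFN(addr)	((uint64_t)(addr) >> PAGE_SHIFT)

    int main(void)
    {
    	uint64_t phys = 0x12345678;
    	uint64_t pfn = PHYS_PFN(phys);

    	printf("phys 0x%" PRIx64 " -> pfn 0x%" PRIx64 " -> page base 0x%" PRIx64 "\n",
    	       phys, pfn, PFN_PHYS(pfn));
    	return 0;
    }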
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 5dc4c0a43ccd..877342640b6e 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig | |||
@@ -34,9 +34,6 @@ config NO_IOPORT_MAP | |||
34 | config STACKTRACE_SUPPORT | 34 | config STACKTRACE_SUPPORT |
35 | def_bool y | 35 | def_bool y |
36 | 36 | ||
37 | config HAVE_LATENCYTOP_SUPPORT | ||
38 | def_bool y | ||
39 | |||
40 | config LOCKDEP_SUPPORT | 37 | config LOCKDEP_SUPPORT |
41 | def_bool y | 38 | def_bool y |
42 | 39 | ||
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 24f362bf3ec6..4a10ba9e95da 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig | |||
@@ -180,9 +180,6 @@ config LOCKDEP_SUPPORT | |||
180 | config STACKTRACE_SUPPORT | 180 | config STACKTRACE_SUPPORT |
181 | def_bool y | 181 | def_bool y |
182 | 182 | ||
183 | config HAVE_LATENCYTOP_SUPPORT | ||
184 | def_bool y | ||
185 | |||
186 | config MMU | 183 | config MMU |
187 | def_bool y | 184 | def_bool y |
188 | 185 | ||
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d3eee663c41f..0687c4748b8f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h | |||
@@ -162,20 +162,22 @@ static inline int pmd_large(pmd_t pte) | |||
162 | } | 162 | } |
163 | 163 | ||
164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 164 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
165 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
166 | { | ||
167 | return pmd_val(pmd) & _PAGE_SPLITTING; | ||
168 | } | ||
169 | |||
170 | static inline int pmd_trans_huge(pmd_t pmd) | 165 | static inline int pmd_trans_huge(pmd_t pmd) |
171 | { | 166 | { |
172 | return pmd_val(pmd) & _PAGE_PSE; | 167 | return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; |
173 | } | 168 | } |
174 | 169 | ||
175 | static inline int has_transparent_hugepage(void) | 170 | static inline int has_transparent_hugepage(void) |
176 | { | 171 | { |
177 | return cpu_has_pse; | 172 | return cpu_has_pse; |
178 | } | 173 | } |
174 | |||
175 | #ifdef __HAVE_ARCH_PTE_DEVMAP | ||
176 | static inline int pmd_devmap(pmd_t pmd) | ||
177 | { | ||
178 | return !!(pmd_val(pmd) & _PAGE_DEVMAP); | ||
179 | } | ||
180 | #endif | ||
179 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 181 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
180 | 182 | ||
181 | static inline pte_t pte_set_flags(pte_t pte, pteval_t set) | 183 | static inline pte_t pte_set_flags(pte_t pte, pteval_t set) |
@@ -252,6 +254,11 @@ static inline pte_t pte_mkspecial(pte_t pte) | |||
252 | return pte_set_flags(pte, _PAGE_SPECIAL); | 254 | return pte_set_flags(pte, _PAGE_SPECIAL); |
253 | } | 255 | } |
254 | 256 | ||
257 | static inline pte_t pte_mkdevmap(pte_t pte) | ||
258 | { | ||
259 | return pte_set_flags(pte, _PAGE_SPECIAL|_PAGE_DEVMAP); | ||
260 | } | ||
261 | |||
255 | static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) | 262 | static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) |
256 | { | 263 | { |
257 | pmdval_t v = native_pmd_val(pmd); | 264 | pmdval_t v = native_pmd_val(pmd); |
@@ -271,6 +278,11 @@ static inline pmd_t pmd_mkold(pmd_t pmd) | |||
271 | return pmd_clear_flags(pmd, _PAGE_ACCESSED); | 278 | return pmd_clear_flags(pmd, _PAGE_ACCESSED); |
272 | } | 279 | } |
273 | 280 | ||
281 | static inline pmd_t pmd_mkclean(pmd_t pmd) | ||
282 | { | ||
283 | return pmd_clear_flags(pmd, _PAGE_DIRTY); | ||
284 | } | ||
285 | |||
274 | static inline pmd_t pmd_wrprotect(pmd_t pmd) | 286 | static inline pmd_t pmd_wrprotect(pmd_t pmd) |
275 | { | 287 | { |
276 | return pmd_clear_flags(pmd, _PAGE_RW); | 288 | return pmd_clear_flags(pmd, _PAGE_RW); |
@@ -281,6 +293,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) | |||
281 | return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); | 293 | return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); |
282 | } | 294 | } |
283 | 295 | ||
296 | static inline pmd_t pmd_mkdevmap(pmd_t pmd) | ||
297 | { | ||
298 | return pmd_set_flags(pmd, _PAGE_DEVMAP); | ||
299 | } | ||
300 | |||
284 | static inline pmd_t pmd_mkhuge(pmd_t pmd) | 301 | static inline pmd_t pmd_mkhuge(pmd_t pmd) |
285 | { | 302 | { |
286 | return pmd_set_flags(pmd, _PAGE_PSE); | 303 | return pmd_set_flags(pmd, _PAGE_PSE); |
@@ -462,6 +479,13 @@ static inline int pte_present(pte_t a) | |||
462 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); | 479 | return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); |
463 | } | 480 | } |
464 | 481 | ||
482 | #ifdef __HAVE_ARCH_PTE_DEVMAP | ||
483 | static inline int pte_devmap(pte_t a) | ||
484 | { | ||
485 | return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; | ||
486 | } | ||
487 | #endif | ||
488 | |||
465 | #define pte_accessible pte_accessible | 489 | #define pte_accessible pte_accessible |
466 | static inline bool pte_accessible(struct mm_struct *mm, pte_t a) | 490 | static inline bool pte_accessible(struct mm_struct *mm, pte_t a) |
467 | { | 491 | { |
@@ -808,10 +832,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, | |||
808 | unsigned long address, pmd_t *pmdp); | 832 | unsigned long address, pmd_t *pmdp); |
809 | 833 | ||
810 | 834 | ||
811 | #define __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
812 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
813 | unsigned long addr, pmd_t *pmdp); | ||
814 | |||
815 | #define __HAVE_ARCH_PMD_WRITE | 835 | #define __HAVE_ARCH_PMD_WRITE |
816 | static inline int pmd_write(pmd_t pmd) | 836 | static inline int pmd_write(pmd_t pmd) |
817 | { | 837 | { |
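
With the _PAGE_SPLITTING machinery gone, the helpers above let page-table walkers tell an anonymous transparent huge page from a device-backed (ZONE_DEVICE) huge mapping: pmd_trans_huge() is now true only for _PAGE_PSE without _PAGE_DEVMAP, while pmd_devmap()/pte_devmap() test the new bit. A minimal caller-side sketch, not part of this commit (the helper name is made up for illustration):

	/* Illustrative only -- not added by this commit. */
	static const char *describe_pmd(pmd_t pmd)
	{
		if (pmd_trans_huge(pmd))	/* _PAGE_PSE set, _PAGE_DEVMAP clear */
			return "transparent huge page";
		if (pmd_devmap(pmd))		/* _PAGE_DEVMAP set: huge device mapping */
			return "ZONE_DEVICE huge mapping";
		return "regular pmd";
	}
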
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a471cadb9630..04c27a013165 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h | |||
@@ -22,10 +22,11 @@ | |||
22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ | 22 | #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ |
23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 | 23 | #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 |
24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 | 24 | #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 |
25 | #define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ | ||
26 | #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ | 25 | #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ |
27 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ | 26 | #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ |
28 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | 27 | #define _PAGE_BIT_SOFTW4 58 /* available for programmer */ |
28 | #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 | ||
29 | #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ | ||
29 | 30 | ||
30 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ | 31 | /* If _PAGE_BIT_PRESENT is clear, we use these: */ |
31 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ | 32 | /* - if the user mapped it with PROT_NONE; pte_present gives true */ |
@@ -46,7 +47,6 @@ | |||
46 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) | 47 | #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) |
47 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) | 48 | #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) |
48 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) | 49 | #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) |
49 | #define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) | ||
50 | #define __HAVE_ARCH_PTE_SPECIAL | 50 | #define __HAVE_ARCH_PTE_SPECIAL |
51 | 51 | ||
52 | #ifdef CONFIG_KMEMCHECK | 52 | #ifdef CONFIG_KMEMCHECK |
@@ -85,8 +85,11 @@ | |||
85 | 85 | ||
86 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) | 86 | #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) |
87 | #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) | 87 | #define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX) |
88 | #define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP) | ||
89 | #define __HAVE_ARCH_PTE_DEVMAP | ||
88 | #else | 90 | #else |
89 | #define _PAGE_NX (_AT(pteval_t, 0)) | 91 | #define _PAGE_NX (_AT(pteval_t, 0)) |
92 | #define _PAGE_DEVMAP (_AT(pteval_t, 0)) | ||
90 | #endif | 93 | #endif |
91 | 94 | ||
92 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) | 95 | #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) |
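
On 32-bit non-PAE builds there is no spare high software bit, so _PAGE_DEVMAP is defined as 0 and __HAVE_ARCH_PTE_DEVMAP stays unset; callers then see a stub pte_devmap() that always returns 0. A hedged sketch of that fallback (its exact home in the generic headers is an assumption, not shown in this hunk):

	/* Sketch of the generic fallback, assumed to live outside this diff. */
	#ifndef __HAVE_ARCH_PTE_DEVMAP
	static inline int pte_devmap(pte_t pte)
	{
		return 0;
	}
	#endif
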
diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index d8ce3ec816ab..1544fabcd7f9 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h | |||
@@ -132,12 +132,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) | |||
132 | { | 132 | { |
133 | void *vaddr = (void __force *)addr; | 133 | void *vaddr = (void __force *)addr; |
134 | 134 | ||
135 | /* TODO: implement the zeroing via non-temporal writes */ | 135 | memset(vaddr, 0, size); |
136 | if (size == PAGE_SIZE && ((unsigned long)vaddr & ~PAGE_MASK) == 0) | ||
137 | clear_page(vaddr); | ||
138 | else | ||
139 | memset(vaddr, 0, size); | ||
140 | |||
141 | __arch_wb_cache_pmem(vaddr, size); | 136 | __arch_wb_cache_pmem(vaddr, size); |
142 | } | 137 | } |
143 | 138 | ||
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 483231ebbb0b..e574b8546518 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c | |||
@@ -175,7 +175,11 @@ static void mark_screen_rdonly(struct mm_struct *mm) | |||
175 | if (pud_none_or_clear_bad(pud)) | 175 | if (pud_none_or_clear_bad(pud)) |
176 | goto out; | 176 | goto out; |
177 | pmd = pmd_offset(pud, 0xA0000); | 177 | pmd = pmd_offset(pud, 0xA0000); |
178 | split_huge_page_pmd_mm(mm, 0xA0000, pmd); | 178 | |
179 | if (pmd_trans_huge(*pmd)) { | ||
180 | struct vm_area_struct *vma = find_vma(mm, 0xA0000); | ||
181 | split_huge_pmd(vma, pmd, 0xA0000); | ||
182 | } | ||
179 | if (pmd_none_or_clear_bad(pmd)) | 183 | if (pmd_none_or_clear_bad(pmd)) |
180 | goto out; | 184 | goto out; |
181 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); | 185 | pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); |
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c index 5c520ebf6343..a22a488b4622 100644 --- a/arch/x86/kvm/iommu.c +++ b/arch/x86/kvm/iommu.c | |||
@@ -43,11 +43,11 @@ static int kvm_iommu_unmap_memslots(struct kvm *kvm); | |||
43 | static void kvm_iommu_put_pages(struct kvm *kvm, | 43 | static void kvm_iommu_put_pages(struct kvm *kvm, |
44 | gfn_t base_gfn, unsigned long npages); | 44 | gfn_t base_gfn, unsigned long npages); |
45 | 45 | ||
46 | static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, | 46 | static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, |
47 | unsigned long npages) | 47 | unsigned long npages) |
48 | { | 48 | { |
49 | gfn_t end_gfn; | 49 | gfn_t end_gfn; |
50 | pfn_t pfn; | 50 | kvm_pfn_t pfn; |
51 | 51 | ||
52 | pfn = gfn_to_pfn_memslot(slot, gfn); | 52 | pfn = gfn_to_pfn_memslot(slot, gfn); |
53 | end_gfn = gfn + npages; | 53 | end_gfn = gfn + npages; |
@@ -62,7 +62,8 @@ static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, | |||
62 | return pfn; | 62 | return pfn; |
63 | } | 63 | } |
64 | 64 | ||
65 | static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) | 65 | static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, |
66 | unsigned long npages) | ||
66 | { | 67 | { |
67 | unsigned long i; | 68 | unsigned long i; |
68 | 69 | ||
@@ -73,7 +74,7 @@ static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages) | |||
73 | int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) | 74 | int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) |
74 | { | 75 | { |
75 | gfn_t gfn, end_gfn; | 76 | gfn_t gfn, end_gfn; |
76 | pfn_t pfn; | 77 | kvm_pfn_t pfn; |
77 | int r = 0; | 78 | int r = 0; |
78 | struct iommu_domain *domain = kvm->arch.iommu_domain; | 79 | struct iommu_domain *domain = kvm->arch.iommu_domain; |
79 | int flags; | 80 | int flags; |
@@ -275,7 +276,7 @@ static void kvm_iommu_put_pages(struct kvm *kvm, | |||
275 | { | 276 | { |
276 | struct iommu_domain *domain; | 277 | struct iommu_domain *domain; |
277 | gfn_t end_gfn, gfn; | 278 | gfn_t end_gfn, gfn; |
278 | pfn_t pfn; | 279 | kvm_pfn_t pfn; |
279 | u64 phys; | 280 | u64 phys; |
280 | 281 | ||
281 | domain = kvm->arch.iommu_domain; | 282 | domain = kvm->arch.iommu_domain; |
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 420a5ca3c0ee..95a955de5964 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c | |||
@@ -259,7 +259,7 @@ static unsigned get_mmio_spte_access(u64 spte) | |||
259 | } | 259 | } |
260 | 260 | ||
261 | static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, | 261 | static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, |
262 | pfn_t pfn, unsigned access) | 262 | kvm_pfn_t pfn, unsigned access) |
263 | { | 263 | { |
264 | if (unlikely(is_noslot_pfn(pfn))) { | 264 | if (unlikely(is_noslot_pfn(pfn))) { |
265 | mark_mmio_spte(vcpu, sptep, gfn, access); | 265 | mark_mmio_spte(vcpu, sptep, gfn, access); |
@@ -320,7 +320,7 @@ static int is_last_spte(u64 pte, int level) | |||
320 | return 0; | 320 | return 0; |
321 | } | 321 | } |
322 | 322 | ||
323 | static pfn_t spte_to_pfn(u64 pte) | 323 | static kvm_pfn_t spte_to_pfn(u64 pte) |
324 | { | 324 | { |
325 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; | 325 | return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; |
326 | } | 326 | } |
@@ -582,7 +582,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) | |||
582 | */ | 582 | */ |
583 | static int mmu_spte_clear_track_bits(u64 *sptep) | 583 | static int mmu_spte_clear_track_bits(u64 *sptep) |
584 | { | 584 | { |
585 | pfn_t pfn; | 585 | kvm_pfn_t pfn; |
586 | u64 old_spte = *sptep; | 586 | u64 old_spte = *sptep; |
587 | 587 | ||
588 | if (!spte_has_volatile_bits(old_spte)) | 588 | if (!spte_has_volatile_bits(old_spte)) |
@@ -1372,7 +1372,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, | |||
1372 | int need_flush = 0; | 1372 | int need_flush = 0; |
1373 | u64 new_spte; | 1373 | u64 new_spte; |
1374 | pte_t *ptep = (pte_t *)data; | 1374 | pte_t *ptep = (pte_t *)data; |
1375 | pfn_t new_pfn; | 1375 | kvm_pfn_t new_pfn; |
1376 | 1376 | ||
1377 | WARN_ON(pte_huge(*ptep)); | 1377 | WARN_ON(pte_huge(*ptep)); |
1378 | new_pfn = pte_pfn(*ptep); | 1378 | new_pfn = pte_pfn(*ptep); |
@@ -2450,7 +2450,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, | |||
2450 | return 0; | 2450 | return 0; |
2451 | } | 2451 | } |
2452 | 2452 | ||
2453 | static bool kvm_is_mmio_pfn(pfn_t pfn) | 2453 | static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) |
2454 | { | 2454 | { |
2455 | if (pfn_valid(pfn)) | 2455 | if (pfn_valid(pfn)) |
2456 | return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); | 2456 | return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); |
@@ -2460,7 +2460,7 @@ static bool kvm_is_mmio_pfn(pfn_t pfn) | |||
2460 | 2460 | ||
2461 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, | 2461 | static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, |
2462 | unsigned pte_access, int level, | 2462 | unsigned pte_access, int level, |
2463 | gfn_t gfn, pfn_t pfn, bool speculative, | 2463 | gfn_t gfn, kvm_pfn_t pfn, bool speculative, |
2464 | bool can_unsync, bool host_writable) | 2464 | bool can_unsync, bool host_writable) |
2465 | { | 2465 | { |
2466 | u64 spte; | 2466 | u64 spte; |
@@ -2539,7 +2539,7 @@ done: | |||
2539 | } | 2539 | } |
2540 | 2540 | ||
2541 | static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, | 2541 | static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, |
2542 | int write_fault, int level, gfn_t gfn, pfn_t pfn, | 2542 | int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn, |
2543 | bool speculative, bool host_writable) | 2543 | bool speculative, bool host_writable) |
2544 | { | 2544 | { |
2545 | int was_rmapped = 0; | 2545 | int was_rmapped = 0; |
@@ -2602,7 +2602,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, | |||
2602 | return emulate; | 2602 | return emulate; |
2603 | } | 2603 | } |
2604 | 2604 | ||
2605 | static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, | 2605 | static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, |
2606 | bool no_dirty_log) | 2606 | bool no_dirty_log) |
2607 | { | 2607 | { |
2608 | struct kvm_memory_slot *slot; | 2608 | struct kvm_memory_slot *slot; |
@@ -2684,7 +2684,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) | |||
2684 | } | 2684 | } |
2685 | 2685 | ||
2686 | static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, | 2686 | static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable, |
2687 | int level, gfn_t gfn, pfn_t pfn, bool prefault) | 2687 | int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault) |
2688 | { | 2688 | { |
2689 | struct kvm_shadow_walk_iterator iterator; | 2689 | struct kvm_shadow_walk_iterator iterator; |
2690 | struct kvm_mmu_page *sp; | 2690 | struct kvm_mmu_page *sp; |
@@ -2732,7 +2732,7 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct * | |||
2732 | send_sig_info(SIGBUS, &info, tsk); | 2732 | send_sig_info(SIGBUS, &info, tsk); |
2733 | } | 2733 | } |
2734 | 2734 | ||
2735 | static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) | 2735 | static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) |
2736 | { | 2736 | { |
2737 | /* | 2737 | /* |
2738 | * Do not cache the mmio info caused by writing the readonly gfn | 2738 | * Do not cache the mmio info caused by writing the readonly gfn |
@@ -2752,9 +2752,10 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) | |||
2752 | } | 2752 | } |
2753 | 2753 | ||
2754 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | 2754 | static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, |
2755 | gfn_t *gfnp, pfn_t *pfnp, int *levelp) | 2755 | gfn_t *gfnp, kvm_pfn_t *pfnp, |
2756 | int *levelp) | ||
2756 | { | 2757 | { |
2757 | pfn_t pfn = *pfnp; | 2758 | kvm_pfn_t pfn = *pfnp; |
2758 | gfn_t gfn = *gfnp; | 2759 | gfn_t gfn = *gfnp; |
2759 | int level = *levelp; | 2760 | int level = *levelp; |
2760 | 2761 | ||
@@ -2793,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, | |||
2793 | } | 2794 | } |
2794 | 2795 | ||
2795 | static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, | 2796 | static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, |
2796 | pfn_t pfn, unsigned access, int *ret_val) | 2797 | kvm_pfn_t pfn, unsigned access, int *ret_val) |
2797 | { | 2798 | { |
2798 | bool ret = true; | 2799 | bool ret = true; |
2799 | 2800 | ||
@@ -2947,7 +2948,7 @@ exit: | |||
2947 | } | 2948 | } |
2948 | 2949 | ||
2949 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | 2950 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
2950 | gva_t gva, pfn_t *pfn, bool write, bool *writable); | 2951 | gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable); |
2951 | static void make_mmu_pages_available(struct kvm_vcpu *vcpu); | 2952 | static void make_mmu_pages_available(struct kvm_vcpu *vcpu); |
2952 | 2953 | ||
2953 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, | 2954 | static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, |
@@ -2956,7 +2957,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, | |||
2956 | int r; | 2957 | int r; |
2957 | int level; | 2958 | int level; |
2958 | bool force_pt_level = false; | 2959 | bool force_pt_level = false; |
2959 | pfn_t pfn; | 2960 | kvm_pfn_t pfn; |
2960 | unsigned long mmu_seq; | 2961 | unsigned long mmu_seq; |
2961 | bool map_writable, write = error_code & PFERR_WRITE_MASK; | 2962 | bool map_writable, write = error_code & PFERR_WRITE_MASK; |
2962 | 2963 | ||
@@ -3410,7 +3411,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) | |||
3410 | } | 3411 | } |
3411 | 3412 | ||
3412 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, | 3413 | static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, |
3413 | gva_t gva, pfn_t *pfn, bool write, bool *writable) | 3414 | gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable) |
3414 | { | 3415 | { |
3415 | struct kvm_memory_slot *slot; | 3416 | struct kvm_memory_slot *slot; |
3416 | bool async; | 3417 | bool async; |
@@ -3448,7 +3449,7 @@ check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) | |||
3448 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, | 3449 | static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, |
3449 | bool prefault) | 3450 | bool prefault) |
3450 | { | 3451 | { |
3451 | pfn_t pfn; | 3452 | kvm_pfn_t pfn; |
3452 | int r; | 3453 | int r; |
3453 | int level; | 3454 | int level; |
3454 | bool force_pt_level; | 3455 | bool force_pt_level; |
@@ -4601,7 +4602,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, | |||
4601 | u64 *sptep; | 4602 | u64 *sptep; |
4602 | struct rmap_iterator iter; | 4603 | struct rmap_iterator iter; |
4603 | int need_tlb_flush = 0; | 4604 | int need_tlb_flush = 0; |
4604 | pfn_t pfn; | 4605 | kvm_pfn_t pfn; |
4605 | struct kvm_mmu_page *sp; | 4606 | struct kvm_mmu_page *sp; |
4606 | 4607 | ||
4607 | restart: | 4608 | restart: |
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 1cee3ec20dd2..dcce533d420c 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c | |||
@@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) | |||
97 | { | 97 | { |
98 | struct kvm_mmu_page *sp; | 98 | struct kvm_mmu_page *sp; |
99 | gfn_t gfn; | 99 | gfn_t gfn; |
100 | pfn_t pfn; | 100 | kvm_pfn_t pfn; |
101 | hpa_t hpa; | 101 | hpa_t hpa; |
102 | 102 | ||
103 | sp = page_header(__pa(sptep)); | 103 | sp = page_header(__pa(sptep)); |
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 91e939b486d1..6c9fed957cce 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h | |||
@@ -456,7 +456,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, | |||
456 | { | 456 | { |
457 | unsigned pte_access; | 457 | unsigned pte_access; |
458 | gfn_t gfn; | 458 | gfn_t gfn; |
459 | pfn_t pfn; | 459 | kvm_pfn_t pfn; |
460 | 460 | ||
461 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) | 461 | if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) |
462 | return false; | 462 | return false; |
@@ -551,7 +551,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, | |||
551 | static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, | 551 | static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, |
552 | struct guest_walker *gw, | 552 | struct guest_walker *gw, |
553 | int write_fault, int hlevel, | 553 | int write_fault, int hlevel, |
554 | pfn_t pfn, bool map_writable, bool prefault) | 554 | kvm_pfn_t pfn, bool map_writable, bool prefault) |
555 | { | 555 | { |
556 | struct kvm_mmu_page *sp = NULL; | 556 | struct kvm_mmu_page *sp = NULL; |
557 | struct kvm_shadow_walk_iterator it; | 557 | struct kvm_shadow_walk_iterator it; |
@@ -694,7 +694,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, | |||
694 | int user_fault = error_code & PFERR_USER_MASK; | 694 | int user_fault = error_code & PFERR_USER_MASK; |
695 | struct guest_walker walker; | 695 | struct guest_walker walker; |
696 | int r; | 696 | int r; |
697 | pfn_t pfn; | 697 | kvm_pfn_t pfn; |
698 | int level = PT_PAGE_TABLE_LEVEL; | 698 | int level = PT_PAGE_TABLE_LEVEL; |
699 | bool force_pt_level = false; | 699 | bool force_pt_level = false; |
700 | unsigned long mmu_seq; | 700 | unsigned long mmu_seq; |
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 04d61d496b14..e2951b6edbbc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c | |||
@@ -4251,7 +4251,7 @@ out: | |||
4251 | static int init_rmode_identity_map(struct kvm *kvm) | 4251 | static int init_rmode_identity_map(struct kvm *kvm) |
4252 | { | 4252 | { |
4253 | int i, idx, r = 0; | 4253 | int i, idx, r = 0; |
4254 | pfn_t identity_map_pfn; | 4254 | kvm_pfn_t identity_map_pfn; |
4255 | u32 tmp; | 4255 | u32 tmp; |
4256 | 4256 | ||
4257 | if (!enable_ept) | 4257 | if (!enable_ept) |
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f53f5b13c677..4244c2baf57d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c | |||
@@ -5148,7 +5148,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2, | |||
5148 | int emulation_type) | 5148 | int emulation_type) |
5149 | { | 5149 | { |
5150 | gpa_t gpa = cr2; | 5150 | gpa_t gpa = cr2; |
5151 | pfn_t pfn; | 5151 | kvm_pfn_t pfn; |
5152 | 5152 | ||
5153 | if (emulation_type & EMULTYPE_NO_REEXECUTE) | 5153 | if (emulation_type & EMULTYPE_NO_REEXECUTE) |
5154 | return false; | 5154 | return false; |
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index ae9a37bf1371..6d5eb5900372 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include <linux/vmstat.h> | 9 | #include <linux/vmstat.h> |
10 | #include <linux/highmem.h> | 10 | #include <linux/highmem.h> |
11 | #include <linux/swap.h> | 11 | #include <linux/swap.h> |
12 | #include <linux/memremap.h> | ||
12 | 13 | ||
13 | #include <asm/pgtable.h> | 14 | #include <asm/pgtable.h> |
14 | 15 | ||
@@ -63,6 +64,16 @@ retry: | |||
63 | #endif | 64 | #endif |
64 | } | 65 | } |
65 | 66 | ||
67 | static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) | ||
68 | { | ||
69 | while ((*nr) - nr_start) { | ||
70 | struct page *page = pages[--(*nr)]; | ||
71 | |||
72 | ClearPageReferenced(page); | ||
73 | put_page(page); | ||
74 | } | ||
75 | } | ||
76 | |||
66 | /* | 77 | /* |
67 | * The performance critical leaf functions are made noinline otherwise gcc | 78 | * The performance critical leaf functions are made noinline otherwise gcc |
68 | * inlines everything into a single function which results in too much | 79 | * inlines everything into a single function which results in too much |
@@ -71,7 +82,9 @@ retry: | |||
71 | static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | 82 | static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, |
72 | unsigned long end, int write, struct page **pages, int *nr) | 83 | unsigned long end, int write, struct page **pages, int *nr) |
73 | { | 84 | { |
85 | struct dev_pagemap *pgmap = NULL; | ||
74 | unsigned long mask; | 86 | unsigned long mask; |
87 | int nr_start = *nr; | ||
75 | pte_t *ptep; | 88 | pte_t *ptep; |
76 | 89 | ||
77 | mask = _PAGE_PRESENT|_PAGE_USER; | 90 | mask = _PAGE_PRESENT|_PAGE_USER; |
@@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, | |||
89 | return 0; | 102 | return 0; |
90 | } | 103 | } |
91 | 104 | ||
92 | if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { | 105 | page = pte_page(pte); |
106 | if (pte_devmap(pte)) { | ||
107 | pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); | ||
108 | if (unlikely(!pgmap)) { | ||
109 | undo_dev_pagemap(nr, nr_start, pages); | ||
110 | pte_unmap(ptep); | ||
111 | return 0; | ||
112 | } | ||
113 | } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { | ||
93 | pte_unmap(ptep); | 114 | pte_unmap(ptep); |
94 | return 0; | 115 | return 0; |
95 | } | 116 | } |
96 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 117 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
97 | page = pte_page(pte); | ||
98 | get_page(page); | 118 | get_page(page); |
119 | put_dev_pagemap(pgmap); | ||
99 | SetPageReferenced(page); | 120 | SetPageReferenced(page); |
100 | pages[*nr] = page; | 121 | pages[*nr] = page; |
101 | (*nr)++; | 122 | (*nr)++; |
@@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr) | |||
114 | SetPageReferenced(page); | 135 | SetPageReferenced(page); |
115 | } | 136 | } |
116 | 137 | ||
138 | static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, | ||
139 | unsigned long end, struct page **pages, int *nr) | ||
140 | { | ||
141 | int nr_start = *nr; | ||
142 | unsigned long pfn = pmd_pfn(pmd); | ||
143 | struct dev_pagemap *pgmap = NULL; | ||
144 | |||
145 | pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; | ||
146 | do { | ||
147 | struct page *page = pfn_to_page(pfn); | ||
148 | |||
149 | pgmap = get_dev_pagemap(pfn, pgmap); | ||
150 | if (unlikely(!pgmap)) { | ||
151 | undo_dev_pagemap(nr, nr_start, pages); | ||
152 | return 0; | ||
153 | } | ||
154 | SetPageReferenced(page); | ||
155 | pages[*nr] = page; | ||
156 | get_page(page); | ||
157 | put_dev_pagemap(pgmap); | ||
158 | (*nr)++; | ||
159 | pfn++; | ||
160 | } while (addr += PAGE_SIZE, addr != end); | ||
161 | return 1; | ||
162 | } | ||
163 | |||
117 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | 164 | static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, |
118 | unsigned long end, int write, struct page **pages, int *nr) | 165 | unsigned long end, int write, struct page **pages, int *nr) |
119 | { | 166 | { |
@@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | |||
126 | mask |= _PAGE_RW; | 173 | mask |= _PAGE_RW; |
127 | if ((pmd_flags(pmd) & mask) != mask) | 174 | if ((pmd_flags(pmd) & mask) != mask) |
128 | return 0; | 175 | return 0; |
176 | |||
177 | VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); | ||
178 | if (pmd_devmap(pmd)) | ||
179 | return __gup_device_huge_pmd(pmd, addr, end, pages, nr); | ||
180 | |||
129 | /* hugepages are never "special" */ | 181 | /* hugepages are never "special" */ |
130 | VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); | 182 | VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); |
131 | VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); | ||
132 | 183 | ||
133 | refs = 0; | 184 | refs = 0; |
134 | head = pmd_page(pmd); | 185 | head = pmd_page(pmd); |
@@ -136,8 +187,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, | |||
136 | do { | 187 | do { |
137 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | 188 | VM_BUG_ON_PAGE(compound_head(page) != head, page); |
138 | pages[*nr] = page; | 189 | pages[*nr] = page; |
139 | if (PageTail(page)) | ||
140 | get_huge_page_tail(page); | ||
141 | (*nr)++; | 190 | (*nr)++; |
142 | page++; | 191 | page++; |
143 | refs++; | 192 | refs++; |
@@ -158,18 +207,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
158 | pmd_t pmd = *pmdp; | 207 | pmd_t pmd = *pmdp; |
159 | 208 | ||
160 | next = pmd_addr_end(addr, end); | 209 | next = pmd_addr_end(addr, end); |
161 | /* | 210 | if (pmd_none(pmd)) |
162 | * The pmd_trans_splitting() check below explains why | ||
163 | * pmdp_splitting_flush has to flush the tlb, to stop | ||
164 | * this gup-fast code from running while we set the | ||
165 | * splitting bit in the pmd. Returning zero will take | ||
166 | * the slow path that will call wait_split_huge_page() | ||
167 | * if the pmd is still in splitting state. gup-fast | ||
168 | * can't because it has irq disabled and | ||
169 | * wait_split_huge_page() would never return as the | ||
170 | * tlb flush IPI wouldn't run. | ||
171 | */ | ||
172 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | ||
173 | return 0; | 211 | return 0; |
174 | if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { | 212 | if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { |
175 | /* | 213 | /* |
@@ -212,8 +250,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, | |||
212 | do { | 250 | do { |
213 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | 251 | VM_BUG_ON_PAGE(compound_head(page) != head, page); |
214 | pages[*nr] = page; | 252 | pages[*nr] = page; |
215 | if (PageTail(page)) | ||
216 | get_huge_page_tail(page); | ||
217 | (*nr)++; | 253 | (*nr)++; |
218 | page++; | 254 | page++; |
219 | refs++; | 255 | refs++; |
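
The gup-fast changes above follow one pattern: before taking a page reference on a ZONE_DEVICE page, pin its hosting struct dev_pagemap with get_dev_pagemap(); if the pagemap is already being torn down, undo_dev_pagemap() drops the references taken so far and the fast path bails out to the slow path. Condensed into one illustrative helper (the wrapper function itself is not part of this commit):

	/* Condensed from gup_pte_range()/__gup_device_huge_pmd() above. */
	static int gup_one_devmap_page(unsigned long pfn, struct page **pages,
				       int *nr, int nr_start)
	{
		struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);
		struct page *page = pfn_to_page(pfn);

		if (unlikely(!pgmap)) {
			/* pagemap going away: drop refs taken so far, use slow gup */
			undo_dev_pagemap(nr, nr_start, pages);
			return 0;
		}
		get_page(page);
		/* the pgmap reference only needs to cover the get_page() */
		put_dev_pagemap(pgmap);
		SetPageReferenced(page);
		pages[(*nr)++] = page;
		return 1;
	}
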
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482d69ec..5488d21123bd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c | |||
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/module.h> | 30 | #include <linux/module.h> |
31 | #include <linux/memory.h> | 31 | #include <linux/memory.h> |
32 | #include <linux/memory_hotplug.h> | 32 | #include <linux/memory_hotplug.h> |
33 | #include <linux/memremap.h> | ||
33 | #include <linux/nmi.h> | 34 | #include <linux/nmi.h> |
34 | #include <linux/gfp.h> | 35 | #include <linux/gfp.h> |
35 | #include <linux/kcore.h> | 36 | #include <linux/kcore.h> |
@@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order) | |||
714 | { | 715 | { |
715 | unsigned long magic; | 716 | unsigned long magic; |
716 | unsigned int nr_pages = 1 << order; | 717 | unsigned int nr_pages = 1 << order; |
718 | struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page); | ||
719 | |||
720 | if (altmap) { | ||
721 | vmem_altmap_free(altmap, nr_pages); | ||
722 | return; | ||
723 | } | ||
717 | 724 | ||
718 | /* bootmem page has reserved flag */ | 725 | /* bootmem page has reserved flag */ |
719 | if (PageReserved(page)) { | 726 | if (PageReserved(page)) { |
@@ -1017,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size) | |||
1017 | { | 1024 | { |
1018 | unsigned long start_pfn = start >> PAGE_SHIFT; | 1025 | unsigned long start_pfn = start >> PAGE_SHIFT; |
1019 | unsigned long nr_pages = size >> PAGE_SHIFT; | 1026 | unsigned long nr_pages = size >> PAGE_SHIFT; |
1027 | struct page *page = pfn_to_page(start_pfn); | ||
1028 | struct vmem_altmap *altmap; | ||
1020 | struct zone *zone; | 1029 | struct zone *zone; |
1021 | int ret; | 1030 | int ret; |
1022 | 1031 | ||
1023 | zone = page_zone(pfn_to_page(start_pfn)); | 1032 | /* With altmap the first mapped page is offset from @start */ |
1024 | kernel_physical_mapping_remove(start, start + size); | 1033 | altmap = to_vmem_altmap((unsigned long) page); |
1034 | if (altmap) | ||
1035 | page += vmem_altmap_offset(altmap); | ||
1036 | zone = page_zone(page); | ||
1025 | ret = __remove_pages(zone, start_pfn, nr_pages); | 1037 | ret = __remove_pages(zone, start_pfn, nr_pages); |
1026 | WARN_ON_ONCE(ret); | 1038 | WARN_ON_ONCE(ret); |
1039 | kernel_physical_mapping_remove(start, start + size); | ||
1027 | 1040 | ||
1028 | return ret; | 1041 | return ret; |
1029 | } | 1042 | } |
@@ -1235,7 +1248,7 @@ static void __meminitdata *p_start, *p_end; | |||
1235 | static int __meminitdata node_start; | 1248 | static int __meminitdata node_start; |
1236 | 1249 | ||
1237 | static int __meminit vmemmap_populate_hugepages(unsigned long start, | 1250 | static int __meminit vmemmap_populate_hugepages(unsigned long start, |
1238 | unsigned long end, int node) | 1251 | unsigned long end, int node, struct vmem_altmap *altmap) |
1239 | { | 1252 | { |
1240 | unsigned long addr; | 1253 | unsigned long addr; |
1241 | unsigned long next; | 1254 | unsigned long next; |
@@ -1258,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, | |||
1258 | if (pmd_none(*pmd)) { | 1271 | if (pmd_none(*pmd)) { |
1259 | void *p; | 1272 | void *p; |
1260 | 1273 | ||
1261 | p = vmemmap_alloc_block_buf(PMD_SIZE, node); | 1274 | p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); |
1262 | if (p) { | 1275 | if (p) { |
1263 | pte_t entry; | 1276 | pte_t entry; |
1264 | 1277 | ||
@@ -1279,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, | |||
1279 | addr_end = addr + PMD_SIZE; | 1292 | addr_end = addr + PMD_SIZE; |
1280 | p_end = p + PMD_SIZE; | 1293 | p_end = p + PMD_SIZE; |
1281 | continue; | 1294 | continue; |
1282 | } | 1295 | } else if (altmap) |
1296 | return -ENOMEM; /* no fallback */ | ||
1283 | } else if (pmd_large(*pmd)) { | 1297 | } else if (pmd_large(*pmd)) { |
1284 | vmemmap_verify((pte_t *)pmd, node, addr, next); | 1298 | vmemmap_verify((pte_t *)pmd, node, addr, next); |
1285 | continue; | 1299 | continue; |
@@ -1293,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, | |||
1293 | 1307 | ||
1294 | int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) | 1308 | int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) |
1295 | { | 1309 | { |
1310 | struct vmem_altmap *altmap = to_vmem_altmap(start); | ||
1296 | int err; | 1311 | int err; |
1297 | 1312 | ||
1298 | if (cpu_has_pse) | 1313 | if (cpu_has_pse) |
1299 | err = vmemmap_populate_hugepages(start, end, node); | 1314 | err = vmemmap_populate_hugepages(start, end, node, altmap); |
1300 | else | 1315 | else if (altmap) { |
1316 | pr_err_once("%s: no cpu support for altmap allocations\n", | ||
1317 | __func__); | ||
1318 | err = -ENOMEM; | ||
1319 | } else | ||
1301 | err = vmemmap_populate_basepages(start, end, node); | 1320 | err = vmemmap_populate_basepages(start, end, node); |
1302 | if (!err) | 1321 | if (!err) |
1303 | sync_global_pgds(start, end - 1, 0); | 1322 | sync_global_pgds(start, end - 1, 0); |
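
Both init_64.c hunks account for the struct vmem_altmap case, where the struct page array for a hot-added (pmem) range is carved out of the range itself: freed vmemmap blocks go back to the altmap rather than the page allocator, and teardown has to skip past the pages consumed by the memmap before looking up the zone. A small hedged sketch of that offset calculation (the helper name is illustrative):

	/* Illustrative helper mirroring arch_remove_memory() above. */
	static struct page *first_data_page(u64 start)
	{
		struct page *page = pfn_to_page(start >> PAGE_SHIFT);
		struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page);

		/* with an altmap, the start of the range holds the memmap itself */
		if (altmap)
			page += vmem_altmap_offset(altmap);
		return page;
	}
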
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 031782e74231..f4ae536b0914 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/debugfs.h> | 12 | #include <linux/debugfs.h> |
13 | #include <linux/kernel.h> | 13 | #include <linux/kernel.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/pfn_t.h> | ||
15 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
16 | #include <linux/mm.h> | 17 | #include <linux/mm.h> |
17 | #include <linux/fs.h> | 18 | #include <linux/fs.h> |
@@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, | |||
949 | } | 950 | } |
950 | 951 | ||
951 | int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, | 952 | int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, |
952 | unsigned long pfn) | 953 | pfn_t pfn) |
953 | { | 954 | { |
954 | enum page_cache_mode pcm; | 955 | enum page_cache_mode pcm; |
955 | 956 | ||
@@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, | |||
957 | return 0; | 958 | return 0; |
958 | 959 | ||
959 | /* Set prot based on lookup */ | 960 | /* Set prot based on lookup */ |
960 | pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); | 961 | pcm = lookup_memtype(pfn_t_to_phys(pfn)); |
961 | *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | | 962 | *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | |
962 | cachemode2protval(pcm)); | 963 | cachemode2protval(pcm)); |
963 | 964 | ||
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ee9c2e3a7199..4eb287e25043 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c | |||
@@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, | |||
505 | 505 | ||
506 | return young; | 506 | return young; |
507 | } | 507 | } |
508 | |||
509 | void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
510 | unsigned long address, pmd_t *pmdp) | ||
511 | { | ||
512 | int set; | ||
513 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
514 | set = !test_and_set_bit(_PAGE_BIT_SPLITTING, | ||
515 | (unsigned long *)pmdp); | ||
516 | if (set) { | ||
517 | /* need tlb flush only to serialize against gup-fast */ | ||
518 | flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
519 | } | ||
520 | } | ||
521 | #endif | 508 | #endif |
522 | 509 | ||
523 | /** | 510 | /** |
diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 360944e1da52..d030594ed22b 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h | |||
@@ -86,8 +86,10 @@ | |||
86 | #define MADV_SEQUENTIAL 2 /* expect sequential page references */ | 86 | #define MADV_SEQUENTIAL 2 /* expect sequential page references */ |
87 | #define MADV_WILLNEED 3 /* will need these pages */ | 87 | #define MADV_WILLNEED 3 /* will need these pages */ |
88 | #define MADV_DONTNEED 4 /* don't need these pages */ | 88 | #define MADV_DONTNEED 4 /* don't need these pages */ |
89 | #define MADV_FREE 5 /* free pages only if memory pressure */ | ||
89 | 90 | ||
90 | /* common parameters: try to keep these consistent across architectures */ | 91 | /* common parameters: try to keep these consistent across architectures */ |
92 | #define MADV_FREE 8 /* free pages only if memory pressure */ | ||
91 | #define MADV_REMOVE 9 /* remove these pages & resources */ | 93 | #define MADV_REMOVE 9 /* remove these pages & resources */ |
92 | #define MADV_DONTFORK 10 /* don't inherit across fork */ | 94 | #define MADV_DONTFORK 10 /* don't inherit across fork */ |
93 | #define MADV_DOFORK 11 /* do inherit across fork */ | 95 | #define MADV_DOFORK 11 /* do inherit across fork */ |
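
This hunk moves xtensa's MADV_FREE out of its arch-private slot (5) into the common madvise value block as 8, so user space sees the same number on every architecture. From user space the advice is issued like any other madvise() hint; a minimal sketch, assuming a kernel and libc that already expose MADV_FREE:

	/* Minimal user-space sketch; assumes MADV_FREE is available. */
	#include <stddef.h>
	#include <sys/mman.h>

	int lazily_free(void *buf, size_t len)
	{
		/*
		 * The kernel may reclaim these pages under memory pressure;
		 * writing to them again cancels the hint for the touched pages.
		 */
		return madvise(buf, len, MADV_FREE);
	}
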
diff --git a/arch/xtensa/mm/tlb.c b/arch/xtensa/mm/tlb.c index 5ece856c5725..35c822286bbe 100644 --- a/arch/xtensa/mm/tlb.c +++ b/arch/xtensa/mm/tlb.c | |||
@@ -245,7 +245,7 @@ static int check_tlb_entry(unsigned w, unsigned e, bool dtlb) | |||
245 | page_mapcount(p)); | 245 | page_mapcount(p)); |
246 | if (!page_count(p)) | 246 | if (!page_count(p)) |
247 | rc |= TLB_INSANE; | 247 | rc |= TLB_INSANE; |
248 | else if (page_mapped(p)) | 248 | else if (page_mapcount(p)) |
249 | rc |= TLB_SUSPICIOUS; | 249 | rc |= TLB_SUSPICIOUS; |
250 | } else { | 250 | } else { |
251 | rc |= TLB_INSANE; | 251 | rc |= TLB_INSANE; |
diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 619fe584a44c..213456c2b123 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c | |||
@@ -647,6 +647,13 @@ static int add_memory_block(int base_section_nr) | |||
647 | return 0; | 647 | return 0; |
648 | } | 648 | } |
649 | 649 | ||
650 | static bool is_zone_device_section(struct mem_section *ms) | ||
651 | { | ||
652 | struct page *page; | ||
653 | |||
654 | page = sparse_decode_mem_map(ms->section_mem_map, __section_nr(ms)); | ||
655 | return is_zone_device_page(page); | ||
656 | } | ||
650 | 657 | ||
651 | /* | 658 | /* |
652 | * need an interface for the VM to add new memory regions, | 659 | * need an interface for the VM to add new memory regions, |
@@ -657,6 +664,9 @@ int register_new_memory(int nid, struct mem_section *section) | |||
657 | int ret = 0; | 664 | int ret = 0; |
658 | struct memory_block *mem; | 665 | struct memory_block *mem; |
659 | 666 | ||
667 | if (is_zone_device_section(section)) | ||
668 | return 0; | ||
669 | |||
660 | mutex_lock(&mem_sysfs_mutex); | 670 | mutex_lock(&mem_sysfs_mutex); |
661 | 671 | ||
662 | mem = find_memory_block(section); | 672 | mem = find_memory_block(section); |
@@ -693,6 +703,9 @@ static int remove_memory_section(unsigned long node_id, | |||
693 | { | 703 | { |
694 | struct memory_block *mem; | 704 | struct memory_block *mem; |
695 | 705 | ||
706 | if (is_zone_device_section(section)) | ||
707 | return 0; | ||
708 | |||
696 | mutex_lock(&mem_sysfs_mutex); | 709 | mutex_lock(&mem_sysfs_mutex); |
697 | mem = find_memory_block(section); | 710 | mem = find_memory_block(section); |
698 | unregister_mem_sect_under_nodes(mem, __section_nr(section)); | 711 | unregister_mem_sect_under_nodes(mem, __section_nr(section)); |
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a5880f4ab40e..cb27190e9f39 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c | |||
@@ -19,6 +19,9 @@ | |||
19 | #include <linux/radix-tree.h> | 19 | #include <linux/radix-tree.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/slab.h> | 21 | #include <linux/slab.h> |
22 | #ifdef CONFIG_BLK_DEV_RAM_DAX | ||
23 | #include <linux/pfn_t.h> | ||
24 | #endif | ||
22 | 25 | ||
23 | #include <asm/uaccess.h> | 26 | #include <asm/uaccess.h> |
24 | 27 | ||
@@ -378,7 +381,7 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector, | |||
378 | 381 | ||
379 | #ifdef CONFIG_BLK_DEV_RAM_DAX | 382 | #ifdef CONFIG_BLK_DEV_RAM_DAX |
380 | static long brd_direct_access(struct block_device *bdev, sector_t sector, | 383 | static long brd_direct_access(struct block_device *bdev, sector_t sector, |
381 | void __pmem **kaddr, unsigned long *pfn) | 384 | void __pmem **kaddr, pfn_t *pfn) |
382 | { | 385 | { |
383 | struct brd_device *brd = bdev->bd_disk->private_data; | 386 | struct brd_device *brd = bdev->bd_disk->private_data; |
384 | struct page *page; | 387 | struct page *page; |
@@ -389,7 +392,7 @@ static long brd_direct_access(struct block_device *bdev, sector_t sector, | |||
389 | if (!page) | 392 | if (!page) |
390 | return -ENOSPC; | 393 | return -ENOSPC; |
391 | *kaddr = (void __pmem *)page_address(page); | 394 | *kaddr = (void __pmem *)page_address(page); |
392 | *pfn = page_to_pfn(page); | 395 | *pfn = page_to_pfn_t(page); |
393 | 396 | ||
394 | return PAGE_SIZE; | 397 | return PAGE_SIZE; |
395 | } | 398 | } |
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 47915d736f8d..370c2f76016d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c | |||
@@ -1325,7 +1325,6 @@ static int zram_remove(struct zram *zram) | |||
1325 | 1325 | ||
1326 | pr_info("Removed device: %s\n", zram->disk->disk_name); | 1326 | pr_info("Removed device: %s\n", zram->disk->disk_name); |
1327 | 1327 | ||
1328 | idr_remove(&zram_index_idr, zram->disk->first_minor); | ||
1329 | blk_cleanup_queue(zram->disk->queue); | 1328 | blk_cleanup_queue(zram->disk->queue); |
1330 | del_gendisk(zram->disk); | 1329 | del_gendisk(zram->disk); |
1331 | put_disk(zram->disk); | 1330 | put_disk(zram->disk); |
@@ -1367,10 +1366,12 @@ static ssize_t hot_remove_store(struct class *class, | |||
1367 | mutex_lock(&zram_index_mutex); | 1366 | mutex_lock(&zram_index_mutex); |
1368 | 1367 | ||
1369 | zram = idr_find(&zram_index_idr, dev_id); | 1368 | zram = idr_find(&zram_index_idr, dev_id); |
1370 | if (zram) | 1369 | if (zram) { |
1371 | ret = zram_remove(zram); | 1370 | ret = zram_remove(zram); |
1372 | else | 1371 | idr_remove(&zram_index_idr, dev_id); |
1372 | } else { | ||
1373 | ret = -ENODEV; | 1373 | ret = -ENODEV; |
1374 | } | ||
1374 | 1375 | ||
1375 | mutex_unlock(&zram_index_mutex); | 1376 | mutex_unlock(&zram_index_mutex); |
1376 | return ret ? ret : count; | 1377 | return ret ? ret : count; |
diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index 252eb301470c..32358c5e3db4 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c | |||
@@ -14,6 +14,7 @@ | |||
14 | 14 | ||
15 | #include <linux/shmem_fs.h> | 15 | #include <linux/shmem_fs.h> |
16 | #include <linux/dma-buf.h> | 16 | #include <linux/dma-buf.h> |
17 | #include <linux/pfn_t.h> | ||
17 | #include <drm/exynos_drm.h> | 18 | #include <drm/exynos_drm.h> |
18 | 19 | ||
19 | #include "exynos_drm_drv.h" | 20 | #include "exynos_drm_drv.h" |
@@ -490,7 +491,8 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
490 | } | 491 | } |
491 | 492 | ||
492 | pfn = page_to_pfn(exynos_gem->pages[page_offset]); | 493 | pfn = page_to_pfn(exynos_gem->pages[page_offset]); |
493 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); | 494 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, |
495 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
494 | 496 | ||
495 | out: | 497 | out: |
496 | switch (ret) { | 498 | switch (ret) { |
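
This and the following GPU-driver hunks (gma500, msm, omapdrm, ttm) are the same mechanical conversion: vm_insert_mixed() now takes a pfn_t, so each fault handler wraps its raw pfn with __pfn_to_pfn_t(pfn, PFN_DEV). A hedged fragment of the resulting pattern (the function name is illustrative, not taken from any one driver):

	/* Illustrative fault-handler fragment, not from any single driver above. */
	static int example_gem_insert(struct vm_area_struct *vma,
				      struct vm_fault *vmf, unsigned long pfn)
	{
		/* PFN_DEV flags a device pfn not covered by the normal memmap */
		return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
				       __pfn_to_pfn_t(pfn, PFN_DEV));
	}
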
diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index ee95c03a8c54..cb95765050cc 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/kernel.h> | 21 | #include <linux/kernel.h> |
22 | #include <linux/errno.h> | 22 | #include <linux/errno.h> |
23 | #include <linux/string.h> | 23 | #include <linux/string.h> |
24 | #include <linux/pfn_t.h> | ||
24 | #include <linux/mm.h> | 25 | #include <linux/mm.h> |
25 | #include <linux/tty.h> | 26 | #include <linux/tty.h> |
26 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
@@ -132,7 +133,8 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
132 | for (i = 0; i < page_num; i++) { | 133 | for (i = 0; i < page_num; i++) { |
133 | pfn = (phys_addr >> PAGE_SHIFT); | 134 | pfn = (phys_addr >> PAGE_SHIFT); |
134 | 135 | ||
135 | ret = vm_insert_mixed(vma, address, pfn); | 136 | ret = vm_insert_mixed(vma, address, |
137 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
136 | if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0))) | 138 | if (unlikely((ret == -EBUSY) || (ret != 0 && i > 0))) |
137 | break; | 139 | break; |
138 | else if (unlikely(ret != 0)) { | 140 | else if (unlikely(ret != 0)) { |
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index c76cc853b08a..3cedb8d5c855 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <linux/spinlock.h> | 18 | #include <linux/spinlock.h> |
19 | #include <linux/shmem_fs.h> | 19 | #include <linux/shmem_fs.h> |
20 | #include <linux/dma-buf.h> | 20 | #include <linux/dma-buf.h> |
21 | #include <linux/pfn_t.h> | ||
21 | 22 | ||
22 | #include "msm_drv.h" | 23 | #include "msm_drv.h" |
23 | #include "msm_gem.h" | 24 | #include "msm_gem.h" |
@@ -222,7 +223,8 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
222 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 223 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, |
223 | pfn, pfn << PAGE_SHIFT); | 224 | pfn, pfn << PAGE_SHIFT); |
224 | 225 | ||
225 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); | 226 | ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, |
227 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
226 | 228 | ||
227 | out_unlock: | 229 | out_unlock: |
228 | mutex_unlock(&dev->struct_mutex); | 230 | mutex_unlock(&dev->struct_mutex); |
diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index 7ed08fdc4c42..ceba5459ceb7 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c | |||
@@ -19,6 +19,7 @@ | |||
19 | 19 | ||
20 | #include <linux/shmem_fs.h> | 20 | #include <linux/shmem_fs.h> |
21 | #include <linux/spinlock.h> | 21 | #include <linux/spinlock.h> |
22 | #include <linux/pfn_t.h> | ||
22 | 23 | ||
23 | #include <drm/drm_vma_manager.h> | 24 | #include <drm/drm_vma_manager.h> |
24 | 25 | ||
@@ -385,7 +386,8 @@ static int fault_1d(struct drm_gem_object *obj, | |||
385 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, | 386 | VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, |
386 | pfn, pfn << PAGE_SHIFT); | 387 | pfn, pfn << PAGE_SHIFT); |
387 | 388 | ||
388 | return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, pfn); | 389 | return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, |
390 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
389 | } | 391 | } |
390 | 392 | ||
391 | /* Special handling for the case of faulting in 2d tiled buffers */ | 393 | /* Special handling for the case of faulting in 2d tiled buffers */ |
@@ -478,7 +480,8 @@ static int fault_2d(struct drm_gem_object *obj, | |||
478 | pfn, pfn << PAGE_SHIFT); | 480 | pfn, pfn << PAGE_SHIFT); |
479 | 481 | ||
480 | for (i = n; i > 0; i--) { | 482 | for (i = n; i > 0; i--) { |
481 | vm_insert_mixed(vma, (unsigned long)vaddr, pfn); | 483 | vm_insert_mixed(vma, (unsigned long)vaddr, |
484 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
482 | pfn += usergart[fmt].stride_pfn; | 485 | pfn += usergart[fmt].stride_pfn; |
483 | vaddr += PAGE_SIZE * m; | 486 | vaddr += PAGE_SIZE * m; |
484 | } | 487 | } |
diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 8fb7213277cc..06d26dc438b2 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <ttm/ttm_placement.h> | 35 | #include <ttm/ttm_placement.h> |
36 | #include <drm/drm_vma_manager.h> | 36 | #include <drm/drm_vma_manager.h> |
37 | #include <linux/mm.h> | 37 | #include <linux/mm.h> |
38 | #include <linux/pfn_t.h> | ||
38 | #include <linux/rbtree.h> | 39 | #include <linux/rbtree.h> |
39 | #include <linux/module.h> | 40 | #include <linux/module.h> |
40 | #include <linux/uaccess.h> | 41 | #include <linux/uaccess.h> |
@@ -229,7 +230,8 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
229 | } | 230 | } |
230 | 231 | ||
231 | if (vma->vm_flags & VM_MIXEDMAP) | 232 | if (vma->vm_flags & VM_MIXEDMAP) |
232 | ret = vm_insert_mixed(&cvma, address, pfn); | 233 | ret = vm_insert_mixed(&cvma, address, |
234 | __pfn_to_pfn_t(pfn, PFN_DEV)); | ||
233 | else | 235 | else |
234 | ret = vm_insert_pfn(&cvma, address, pfn); | 236 | ret = vm_insert_pfn(&cvma, address, pfn); |
235 | 237 | ||
diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index fd01f3493fc7..af7cc1e65656 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c | |||
@@ -433,16 +433,15 @@ ssize_t iio_format_value(char *buf, unsigned int type, int size, int *vals) | |||
433 | scale_db = true; | 433 | scale_db = true; |
434 | case IIO_VAL_INT_PLUS_MICRO: | 434 | case IIO_VAL_INT_PLUS_MICRO: |
435 | if (vals[1] < 0) | 435 | if (vals[1] < 0) |
436 | return sprintf(buf, "-%ld.%06u%s\n", abs(vals[0]), | 436 | return sprintf(buf, "-%d.%06u%s\n", abs(vals[0]), |
437 | -vals[1], | 437 | -vals[1], scale_db ? " dB" : ""); |
438 | scale_db ? " dB" : ""); | ||
439 | else | 438 | else |
440 | return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1], | 439 | return sprintf(buf, "%d.%06u%s\n", vals[0], vals[1], |
441 | scale_db ? " dB" : ""); | 440 | scale_db ? " dB" : ""); |
442 | case IIO_VAL_INT_PLUS_NANO: | 441 | case IIO_VAL_INT_PLUS_NANO: |
443 | if (vals[1] < 0) | 442 | if (vals[1] < 0) |
444 | return sprintf(buf, "-%ld.%09u\n", abs(vals[0]), | 443 | return sprintf(buf, "-%d.%09u\n", abs(vals[0]), |
445 | -vals[1]); | 444 | -vals[1]); |
446 | else | 445 | else |
447 | return sprintf(buf, "%d.%09u\n", vals[0], vals[1]); | 446 | return sprintf(buf, "%d.%09u\n", vals[0], vals[1]); |
448 | case IIO_VAL_FRACTIONAL: | 447 | case IIO_VAL_FRACTIONAL: |
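
The format-string changes here (and the (s32) cast in the iwlwifi hunk that follows) appear to track a change elsewhere in this series that makes abs() return a value of its argument's type: abs() applied to an int is an int again, so "%ld" and (long) no longer match. A self-contained user-space sketch of the specifier/type pairing being fixed:

	/* Self-contained sketch; abs(int) evaluates to int here. */
	#include <stdio.h>
	#include <stdlib.h>

	int main(void)
	{
		int v = -42;

		printf("%d\n", abs(v));	/* "%ld" would mismatch: abs(v) is an int */
		return 0;
	}
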
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c index 07a4c644fb9b..e9cef9de9ed8 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/calib.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/calib.c | |||
@@ -901,7 +901,7 @@ static void iwlagn_gain_computation(struct iwl_priv *priv, | |||
901 | /* bound gain by 2 bits value max, 3rd bit is sign */ | 901 | /* bound gain by 2 bits value max, 3rd bit is sign */ |
902 | data->delta_gain_code[i] = | 902 | data->delta_gain_code[i] = |
903 | min(abs(delta_g), | 903 | min(abs(delta_g), |
904 | (long) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); | 904 | (s32) CHAIN_NOISE_MAX_DELTA_GAIN_CODE); |
905 | 905 | ||
906 | if (delta_g < 0) | 906 | if (delta_g < 0) |
907 | /* | 907 | /* |
diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index f9b674bc49db..0cc9048b86e2 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c | |||
@@ -83,8 +83,7 @@ static ssize_t mode_store(struct device *dev, | |||
83 | 83 | ||
84 | if (strncmp(buf, "pmem\n", n) == 0 | 84 | if (strncmp(buf, "pmem\n", n) == 0 |
85 | || strncmp(buf, "pmem", n) == 0) { | 85 | || strncmp(buf, "pmem", n) == 0) { |
86 | /* TODO: allocate from PMEM support */ | 86 | nd_pfn->mode = PFN_MODE_PMEM; |
87 | rc = -ENOTTY; | ||
88 | } else if (strncmp(buf, "ram\n", n) == 0 | 87 | } else if (strncmp(buf, "ram\n", n) == 0 |
89 | || strncmp(buf, "ram", n) == 0) | 88 | || strncmp(buf, "ram", n) == 0) |
90 | nd_pfn->mode = PFN_MODE_RAM; | 89 | nd_pfn->mode = PFN_MODE_RAM; |
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index b493ff3fccb2..7edf31671dab 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c | |||
@@ -21,10 +21,11 @@ | |||
21 | #include <linux/init.h> | 21 | #include <linux/init.h> |
22 | #include <linux/platform_device.h> | 22 | #include <linux/platform_device.h> |
23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
24 | #include <linux/memory_hotplug.h> | ||
25 | #include <linux/moduleparam.h> | 24 | #include <linux/moduleparam.h> |
26 | #include <linux/badblocks.h> | 25 | #include <linux/badblocks.h> |
26 | #include <linux/memremap.h> | ||
27 | #include <linux/vmalloc.h> | 27 | #include <linux/vmalloc.h> |
28 | #include <linux/pfn_t.h> | ||
28 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
29 | #include <linux/pmem.h> | 30 | #include <linux/pmem.h> |
30 | #include <linux/nd.h> | 31 | #include <linux/nd.h> |
@@ -40,6 +41,7 @@ struct pmem_device { | |||
40 | phys_addr_t phys_addr; | 41 | phys_addr_t phys_addr; |
41 | /* when non-zero this device is hosting a 'pfn' instance */ | 42 | /* when non-zero this device is hosting a 'pfn' instance */ |
42 | phys_addr_t data_offset; | 43 | phys_addr_t data_offset; |
44 | unsigned long pfn_flags; | ||
43 | void __pmem *virt_addr; | 45 | void __pmem *virt_addr; |
44 | size_t size; | 46 | size_t size; |
45 | struct badblocks bb; | 47 | struct badblocks bb; |
@@ -135,13 +137,13 @@ static int pmem_rw_page(struct block_device *bdev, sector_t sector, | |||
135 | } | 137 | } |
136 | 138 | ||
137 | static long pmem_direct_access(struct block_device *bdev, sector_t sector, | 139 | static long pmem_direct_access(struct block_device *bdev, sector_t sector, |
138 | void __pmem **kaddr, unsigned long *pfn) | 140 | void __pmem **kaddr, pfn_t *pfn) |
139 | { | 141 | { |
140 | struct pmem_device *pmem = bdev->bd_disk->private_data; | 142 | struct pmem_device *pmem = bdev->bd_disk->private_data; |
141 | resource_size_t offset = sector * 512 + pmem->data_offset; | 143 | resource_size_t offset = sector * 512 + pmem->data_offset; |
142 | 144 | ||
143 | *kaddr = pmem->virt_addr + offset; | 145 | *kaddr = pmem->virt_addr + offset; |
144 | *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT; | 146 | *pfn = phys_to_pfn_t(pmem->phys_addr + offset, pmem->pfn_flags); |
145 | 147 | ||
146 | return pmem->size - offset; | 148 | return pmem->size - offset; |
147 | } | 149 | } |
@@ -157,6 +159,7 @@ static struct pmem_device *pmem_alloc(struct device *dev, | |||
157 | struct resource *res, int id) | 159 | struct resource *res, int id) |
158 | { | 160 | { |
159 | struct pmem_device *pmem; | 161 | struct pmem_device *pmem; |
162 | struct request_queue *q; | ||
160 | 163 | ||
161 | pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); | 164 | pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); |
162 | if (!pmem) | 165 | if (!pmem) |
@@ -174,16 +177,26 @@ static struct pmem_device *pmem_alloc(struct device *dev, | |||
174 | return ERR_PTR(-EBUSY); | 177 | return ERR_PTR(-EBUSY); |
175 | } | 178 | } |
176 | 179 | ||
177 | if (pmem_should_map_pages(dev)) | 180 | q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); |
178 | pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); | 181 | if (!q) |
179 | else | 182 | return ERR_PTR(-ENOMEM); |
183 | |||
184 | pmem->pfn_flags = PFN_DEV; | ||
185 | if (pmem_should_map_pages(dev)) { | ||
186 | pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res, | ||
187 | &q->q_usage_counter, NULL); | ||
188 | pmem->pfn_flags |= PFN_MAP; | ||
189 | } else | ||
180 | pmem->virt_addr = (void __pmem *) devm_memremap(dev, | 190 | pmem->virt_addr = (void __pmem *) devm_memremap(dev, |
181 | pmem->phys_addr, pmem->size, | 191 | pmem->phys_addr, pmem->size, |
182 | ARCH_MEMREMAP_PMEM); | 192 | ARCH_MEMREMAP_PMEM); |
183 | 193 | ||
184 | if (IS_ERR(pmem->virt_addr)) | 194 | if (IS_ERR(pmem->virt_addr)) { |
195 | blk_cleanup_queue(q); | ||
185 | return (void __force *) pmem->virt_addr; | 196 | return (void __force *) pmem->virt_addr; |
197 | } | ||
186 | 198 | ||
199 | pmem->pmem_queue = q; | ||
187 | return pmem; | 200 | return pmem; |
188 | } | 201 | } |
189 | 202 | ||
@@ -203,10 +216,6 @@ static int pmem_attach_disk(struct device *dev, | |||
203 | int nid = dev_to_node(dev); | 216 | int nid = dev_to_node(dev); |
204 | struct gendisk *disk; | 217 | struct gendisk *disk; |
205 | 218 | ||
206 | pmem->pmem_queue = blk_alloc_queue_node(GFP_KERNEL, nid); | ||
207 | if (!pmem->pmem_queue) | ||
208 | return -ENOMEM; | ||
209 | |||
210 | blk_queue_make_request(pmem->pmem_queue, pmem_make_request); | 219 | blk_queue_make_request(pmem->pmem_queue, pmem_make_request); |
211 | blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); | 220 | blk_queue_physical_block_size(pmem->pmem_queue, PAGE_SIZE); |
212 | blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); | 221 | blk_queue_max_hw_sectors(pmem->pmem_queue, UINT_MAX); |
@@ -352,12 +361,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) | |||
352 | struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); | 361 | struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); |
353 | struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); | 362 | struct nd_pfn *nd_pfn = to_nd_pfn(ndns->claim); |
354 | struct device *dev = &nd_pfn->dev; | 363 | struct device *dev = &nd_pfn->dev; |
355 | struct vmem_altmap *altmap; | ||
356 | struct nd_region *nd_region; | 364 | struct nd_region *nd_region; |
365 | struct vmem_altmap *altmap; | ||
357 | struct nd_pfn_sb *pfn_sb; | 366 | struct nd_pfn_sb *pfn_sb; |
358 | struct pmem_device *pmem; | 367 | struct pmem_device *pmem; |
368 | struct request_queue *q; | ||
359 | phys_addr_t offset; | 369 | phys_addr_t offset; |
360 | int rc; | 370 | int rc; |
371 | struct vmem_altmap __altmap = { | ||
372 | .base_pfn = __phys_to_pfn(nsio->res.start), | ||
373 | .reserve = __phys_to_pfn(SZ_8K), | ||
374 | }; | ||
361 | 375 | ||
362 | if (!nd_pfn->uuid || !nd_pfn->ndns) | 376 | if (!nd_pfn->uuid || !nd_pfn->ndns) |
363 | return -ENODEV; | 377 | return -ENODEV; |
@@ -375,6 +389,17 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) | |||
375 | return -EINVAL; | 389 | return -EINVAL; |
376 | nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); | 390 | nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); |
377 | altmap = NULL; | 391 | altmap = NULL; |
392 | } else if (nd_pfn->mode == PFN_MODE_PMEM) { | ||
393 | nd_pfn->npfns = (resource_size(&nsio->res) - offset) | ||
394 | / PAGE_SIZE; | ||
395 | if (le64_to_cpu(nd_pfn->pfn_sb->npfns) > nd_pfn->npfns) | ||
396 | dev_info(&nd_pfn->dev, | ||
397 | "number of pfns truncated from %lld to %ld\n", | ||
398 | le64_to_cpu(nd_pfn->pfn_sb->npfns), | ||
399 | nd_pfn->npfns); | ||
400 | altmap = & __altmap; | ||
401 | altmap->free = __phys_to_pfn(offset - SZ_8K); | ||
402 | altmap->alloc = 0; | ||
378 | } else { | 403 | } else { |
379 | rc = -ENXIO; | 404 | rc = -ENXIO; |
380 | goto err; | 405 | goto err; |
@@ -382,8 +407,11 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) | |||
382 | 407 | ||
383 | /* establish pfn range for lookup, and switch to direct map */ | 408 | /* establish pfn range for lookup, and switch to direct map */ |
384 | pmem = dev_get_drvdata(dev); | 409 | pmem = dev_get_drvdata(dev); |
410 | q = pmem->pmem_queue; | ||
385 | devm_memunmap(dev, (void __force *) pmem->virt_addr); | 411 | devm_memunmap(dev, (void __force *) pmem->virt_addr); |
386 | pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); | 412 | pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res, |
413 | &q->q_usage_counter, altmap); | ||
414 | pmem->pfn_flags |= PFN_MAP; | ||
387 | if (IS_ERR(pmem->virt_addr)) { | 415 | if (IS_ERR(pmem->virt_addr)) { |
388 | rc = PTR_ERR(pmem->virt_addr); | 416 | rc = PTR_ERR(pmem->virt_addr); |
389 | goto err; | 417 | goto err; |
@@ -424,19 +452,22 @@ static int nd_pmem_probe(struct device *dev) | |||
424 | return -ENOMEM; | 452 | return -ENOMEM; |
425 | nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); | 453 | nvdimm_namespace_add_poison(ndns, &pmem->bb, 0); |
426 | 454 | ||
427 | if (is_nd_btt(dev)) | 455 | if (is_nd_btt(dev)) { |
456 | /* btt allocates its own request_queue */ | ||
457 | blk_cleanup_queue(pmem->pmem_queue); | ||
458 | pmem->pmem_queue = NULL; | ||
428 | return nvdimm_namespace_attach_btt(ndns); | 459 | return nvdimm_namespace_attach_btt(ndns); |
460 | } | ||
429 | 461 | ||
430 | if (is_nd_pfn(dev)) | 462 | if (is_nd_pfn(dev)) |
431 | return nvdimm_namespace_attach_pfn(ndns); | 463 | return nvdimm_namespace_attach_pfn(ndns); |
432 | 464 | ||
433 | if (nd_btt_probe(ndns, pmem) == 0) { | 465 | if (nd_btt_probe(ndns, pmem) == 0 || nd_pfn_probe(ndns, pmem) == 0) { |
434 | /* we'll come back as btt-pmem */ | 466 | /* |
435 | return -ENXIO; | 467 | * We'll come back as either btt-pmem, or pfn-pmem, so |
436 | } | 468 | * drop the queue allocation for now. |
437 | 469 | */ | |
438 | if (nd_pfn_probe(ndns, pmem) == 0) { | 470 | blk_cleanup_queue(pmem->pmem_queue); |
439 | /* we'll come back as pfn-pmem */ | ||
440 | return -ENXIO; | 471 | return -ENXIO; |
441 | } | 472 | } |
442 | 473 | ||
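The pmem hunks above are the template for the ->direct_access() change that runs through this series: the raw "unsigned long *pfn" out-parameter becomes an opaque pfn_t built with phys_to_pfn_t(), carrying PFN_DEV and, once devm_memremap_pages() has provided struct page backing, PFN_MAP as well. A minimal sketch of the new driver side, with a hypothetical foo_device standing in for pmem_device:

    static long foo_direct_access(struct block_device *bdev, sector_t sector,
                    void __pmem **kaddr, pfn_t *pfn)
    {
            struct foo_device *foo = bdev->bd_disk->private_data; /* hypothetical */
            resource_size_t offset = sector * 512 + foo->data_offset;

            *kaddr = foo->virt_addr + offset;
            /* pfn_flags is PFN_DEV, plus PFN_MAP when pages were mapped */
            *pfn = phys_to_pfn_t(foo->phys_addr + offset, foo->pfn_flags);

            return foo->size - offset;      /* bytes addressable at *kaddr */
    }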
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 94a8f4ab57bc..ce7b70181740 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/completion.h> | 17 | #include <linux/completion.h> |
18 | #include <linux/interrupt.h> | 18 | #include <linux/interrupt.h> |
19 | #include <linux/platform_device.h> | 19 | #include <linux/platform_device.h> |
20 | #include <linux/pfn_t.h> | ||
20 | #include <asm/extmem.h> | 21 | #include <asm/extmem.h> |
21 | #include <asm/io.h> | 22 | #include <asm/io.h> |
22 | 23 | ||
@@ -30,7 +31,7 @@ static void dcssblk_release(struct gendisk *disk, fmode_t mode); | |||
30 | static blk_qc_t dcssblk_make_request(struct request_queue *q, | 31 | static blk_qc_t dcssblk_make_request(struct request_queue *q, |
31 | struct bio *bio); | 32 | struct bio *bio); |
32 | static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, | 33 | static long dcssblk_direct_access(struct block_device *bdev, sector_t secnum, |
33 | void __pmem **kaddr, unsigned long *pfn); | 34 | void __pmem **kaddr, pfn_t *pfn); |
34 | 35 | ||
35 | static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; | 36 | static char dcssblk_segments[DCSSBLK_PARM_LEN] = "\0"; |
36 | 37 | ||
@@ -883,20 +884,18 @@ fail: | |||
883 | 884 | ||
884 | static long | 885 | static long |
885 | dcssblk_direct_access (struct block_device *bdev, sector_t secnum, | 886 | dcssblk_direct_access (struct block_device *bdev, sector_t secnum, |
886 | void __pmem **kaddr, unsigned long *pfn) | 887 | void __pmem **kaddr, pfn_t *pfn) |
887 | { | 888 | { |
888 | struct dcssblk_dev_info *dev_info; | 889 | struct dcssblk_dev_info *dev_info; |
889 | unsigned long offset, dev_sz; | 890 | unsigned long offset, dev_sz; |
890 | void *addr; | ||
891 | 891 | ||
892 | dev_info = bdev->bd_disk->private_data; | 892 | dev_info = bdev->bd_disk->private_data; |
893 | if (!dev_info) | 893 | if (!dev_info) |
894 | return -ENODEV; | 894 | return -ENODEV; |
895 | dev_sz = dev_info->end - dev_info->start; | 895 | dev_sz = dev_info->end - dev_info->start; |
896 | offset = secnum * 512; | 896 | offset = secnum * 512; |
897 | addr = (void *) (dev_info->start + offset); | 897 | *kaddr = (void __pmem *) (dev_info->start + offset); |
898 | *pfn = virt_to_phys(addr) >> PAGE_SHIFT; | 898 | *pfn = __pfn_to_pfn_t(PFN_DOWN(dev_info->start + offset), PFN_DEV); |
899 | *kaddr = (void __pmem *) addr; | ||
900 | 899 | ||
901 | return dev_sz - offset; | 900 | return dev_sz - offset; |
902 | } | 901 | } |
diff --git a/fs/Kconfig b/fs/Kconfig index 2bb1ef86c411..9adee0d7536e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig | |||
@@ -50,7 +50,8 @@ config FS_DAX_PMD | |||
50 | bool | 50 | bool |
51 | default FS_DAX | 51 | default FS_DAX |
52 | depends on FS_DAX | 52 | depends on FS_DAX |
53 | depends on BROKEN | 53 | depends on ZONE_DEVICE |
54 | depends on TRANSPARENT_HUGEPAGE | ||
54 | 55 | ||
55 | endif # BLOCK | 56 | endif # BLOCK |
56 | 57 | ||
diff --git a/fs/block_dev.c b/fs/block_dev.c index 81c0705558be..530145b607c4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c | |||
@@ -455,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page); | |||
455 | /** | 455 | /** |
456 | * bdev_direct_access() - Get the address for directly-accessible memory | 456 | * bdev_direct_access() - Get the address for directly-accessible memory |
457 | * @bdev: The device containing the memory | 457 | * @bdev: The device containing the memory |
458 | * @sector: The offset within the device | 458 | * @dax: control and output parameters for ->direct_access |
459 | * @addr: Where to put the address of the memory | ||
460 | * @pfn: The Page Frame Number for the memory | ||
461 | * @size: The number of bytes requested | ||
462 | * | 459 | * |
463 | * If a block device is made up of directly addressable memory, this function | 460 | * If a block device is made up of directly addressable memory, this function |
464 | * will tell the caller the PFN and the address of the memory. The address | 461 | * will tell the caller the PFN and the address of the memory. The address |
@@ -469,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page); | |||
469 | * Return: negative errno if an error occurs, otherwise the number of bytes | 466 | * Return: negative errno if an error occurs, otherwise the number of bytes |
470 | * accessible at this address. | 467 | * accessible at this address. |
471 | */ | 468 | */ |
472 | long bdev_direct_access(struct block_device *bdev, sector_t sector, | 469 | long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) |
473 | void __pmem **addr, unsigned long *pfn, long size) | ||
474 | { | 470 | { |
475 | long avail; | 471 | sector_t sector = dax->sector; |
472 | long avail, size = dax->size; | ||
476 | const struct block_device_operations *ops = bdev->bd_disk->fops; | 473 | const struct block_device_operations *ops = bdev->bd_disk->fops; |
477 | 474 | ||
478 | /* | 475 | /* |
@@ -491,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, | |||
491 | sector += get_start_sect(bdev); | 488 | sector += get_start_sect(bdev); |
492 | if (sector % (PAGE_SIZE / 512)) | 489 | if (sector % (PAGE_SIZE / 512)) |
493 | return -EINVAL; | 490 | return -EINVAL; |
494 | avail = ops->direct_access(bdev, sector, addr, pfn); | 491 | avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); |
495 | if (!avail) | 492 | if (!avail) |
496 | return -ERANGE; | 493 | return -ERANGE; |
494 | if (avail > 0 && avail & ~PAGE_MASK) | ||
495 | return -ENXIO; | ||
497 | return min(avail, size); | 496 | return min(avail, size); |
498 | } | 497 | } |
499 | EXPORT_SYMBOL_GPL(bdev_direct_access); | 498 | EXPORT_SYMBOL_GPL(bdev_direct_access); |
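bdev_direct_access() now takes a single struct blk_dax_ctl (defined later in the blkdev.h hunk) instead of four separate parameters, and additionally returns -ENXIO when a driver reports an availability that is not a page multiple. A hedged sketch of a caller under the new convention; fs/dax.c wraps this in dax_map_atomic()/dax_unmap_atomic(), which also take the request_queue reference omitted here:

    struct blk_dax_ctl dax = {
            .sector = sector,       /* page-aligned sector within the bdev */
            .size = PAGE_SIZE,      /* bytes requested */
    };
    long avail = bdev_direct_access(bdev, &dax);

    if (avail < 0)
            return avail;           /* -EOPNOTSUPP, -EINVAL, -ERANGE or -ENXIO */
    /* on success dax.addr is the kernel mapping, dax.pfn the flagged pfn_t */
    clear_pmem(dax.addr, avail);
    wmb_pmem();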
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0068e82217c3..0a2752b79e72 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, | |||
3391 | * should have access to this page, we're safe to simply set | 3391 | * should have access to this page, we're safe to simply set |
3392 | * PG_locked without checking it first. | 3392 | * PG_locked without checking it first. |
3393 | */ | 3393 | */ |
3394 | __set_page_locked(page); | 3394 | __SetPageLocked(page); |
3395 | rc = add_to_page_cache_locked(page, mapping, | 3395 | rc = add_to_page_cache_locked(page, mapping, |
3396 | page->index, gfp); | 3396 | page->index, gfp); |
3397 | 3397 | ||
3398 | /* give up if we can't stick it in the cache */ | 3398 | /* give up if we can't stick it in the cache */ |
3399 | if (rc) { | 3399 | if (rc) { |
3400 | __clear_page_locked(page); | 3400 | __ClearPageLocked(page); |
3401 | return rc; | 3401 | return rc; |
3402 | } | 3402 | } |
3403 | 3403 | ||
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, | |||
3418 | if (*bytes + PAGE_CACHE_SIZE > rsize) | 3418 | if (*bytes + PAGE_CACHE_SIZE > rsize) |
3419 | break; | 3419 | break; |
3420 | 3420 | ||
3421 | __set_page_locked(page); | 3421 | __SetPageLocked(page); |
3422 | if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { | 3422 | if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { |
3423 | __clear_page_locked(page); | 3423 | __ClearPageLocked(page); |
3424 | break; | 3424 | break; |
3425 | } | 3425 | } |
3426 | list_move_tail(&page->lru, tmplist); | 3426 | list_move_tail(&page->lru, tmplist); |
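The cifs change is part of the page-flags rework: the old __set_page_locked()/__clear_page_locked() helpers are replaced by the macro-generated __SetPageLocked()/__ClearPageLocked(). Usage is unchanged; a small sketch of the common insert-new-page pattern (allocation and gfp choice are illustrative):

    struct page *page = page_cache_alloc_cold(mapping);

    if (!page)
            return -ENOMEM;
    /* brand-new page, not yet visible to anyone: the non-atomic set is safe */
    __SetPageLocked(page);
    if (add_to_page_cache_locked(page, mapping, index, GFP_KERNEL)) {
            __ClearPageLocked(page);        /* undo before freeing */
            page_cache_release(page);
            return -ENOMEM;
    }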
diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c | |||
@@ -28,54 +28,68 @@ | |||
28 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
29 | #include <linux/uio.h> | 29 | #include <linux/uio.h> |
30 | #include <linux/vmstat.h> | 30 | #include <linux/vmstat.h> |
31 | #include <linux/pfn_t.h> | ||
32 | #include <linux/sizes.h> | ||
33 | |||
34 | static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) | ||
35 | { | ||
36 | struct request_queue *q = bdev->bd_queue; | ||
37 | long rc = -EIO; | ||
38 | |||
39 | dax->addr = (void __pmem *) ERR_PTR(-EIO); | ||
40 | if (blk_queue_enter(q, true) != 0) | ||
41 | return rc; | ||
42 | |||
43 | rc = bdev_direct_access(bdev, dax); | ||
44 | if (rc < 0) { | ||
45 | dax->addr = (void __pmem *) ERR_PTR(rc); | ||
46 | blk_queue_exit(q); | ||
47 | return rc; | ||
48 | } | ||
49 | return rc; | ||
50 | } | ||
51 | |||
52 | static void dax_unmap_atomic(struct block_device *bdev, | ||
53 | const struct blk_dax_ctl *dax) | ||
54 | { | ||
55 | if (IS_ERR(dax->addr)) | ||
56 | return; | ||
57 | blk_queue_exit(bdev->bd_queue); | ||
58 | } | ||
31 | 59 | ||
32 | /* | 60 | /* |
33 | * dax_clear_blocks() is called from within transaction context from XFS, | 61 | * dax_clear_blocks() is called from within transaction context from XFS, |
34 | * and hence this means the stack from this point must follow GFP_NOFS | 62 | * and hence this means the stack from this point must follow GFP_NOFS |
35 | * semantics for all operations. | 63 | * semantics for all operations. |
36 | */ | 64 | */ |
37 | int dax_clear_blocks(struct inode *inode, sector_t block, long size) | 65 | int dax_clear_blocks(struct inode *inode, sector_t block, long _size) |
38 | { | 66 | { |
39 | struct block_device *bdev = inode->i_sb->s_bdev; | 67 | struct block_device *bdev = inode->i_sb->s_bdev; |
40 | sector_t sector = block << (inode->i_blkbits - 9); | 68 | struct blk_dax_ctl dax = { |
69 | .sector = block << (inode->i_blkbits - 9), | ||
70 | .size = _size, | ||
71 | }; | ||
41 | 72 | ||
42 | might_sleep(); | 73 | might_sleep(); |
43 | do { | 74 | do { |
44 | void __pmem *addr; | 75 | long count, sz; |
45 | unsigned long pfn; | ||
46 | long count; | ||
47 | 76 | ||
48 | count = bdev_direct_access(bdev, sector, &addr, &pfn, size); | 77 | count = dax_map_atomic(bdev, &dax); |
49 | if (count < 0) | 78 | if (count < 0) |
50 | return count; | 79 | return count; |
51 | BUG_ON(size < count); | 80 | sz = min_t(long, count, SZ_128K); |
52 | while (count > 0) { | 81 | clear_pmem(dax.addr, sz); |
53 | unsigned pgsz = PAGE_SIZE - offset_in_page(addr); | 82 | dax.size -= sz; |
54 | if (pgsz > count) | 83 | dax.sector += sz / 512; |
55 | pgsz = count; | 84 | dax_unmap_atomic(bdev, &dax); |
56 | clear_pmem(addr, pgsz); | 85 | cond_resched(); |
57 | addr += pgsz; | 86 | } while (dax.size); |
58 | size -= pgsz; | ||
59 | count -= pgsz; | ||
60 | BUG_ON(pgsz & 511); | ||
61 | sector += pgsz / 512; | ||
62 | cond_resched(); | ||
63 | } | ||
64 | } while (size); | ||
65 | 87 | ||
66 | wmb_pmem(); | 88 | wmb_pmem(); |
67 | return 0; | 89 | return 0; |
68 | } | 90 | } |
69 | EXPORT_SYMBOL_GPL(dax_clear_blocks); | 91 | EXPORT_SYMBOL_GPL(dax_clear_blocks); |
70 | 92 | ||
71 | static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, | ||
72 | unsigned blkbits) | ||
73 | { | ||
74 | unsigned long pfn; | ||
75 | sector_t sector = bh->b_blocknr << (blkbits - 9); | ||
76 | return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); | ||
77 | } | ||
78 | |||
79 | /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ | 93 | /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ |
80 | static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, | 94 | static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, |
81 | loff_t pos, loff_t end) | 95 | loff_t pos, loff_t end) |
@@ -105,19 +119,29 @@ static bool buffer_size_valid(struct buffer_head *bh) | |||
105 | return bh->b_state != 0; | 119 | return bh->b_state != 0; |
106 | } | 120 | } |
107 | 121 | ||
122 | |||
123 | static sector_t to_sector(const struct buffer_head *bh, | ||
124 | const struct inode *inode) | ||
125 | { | ||
126 | sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); | ||
127 | |||
128 | return sector; | ||
129 | } | ||
130 | |||
108 | static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, | 131 | static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, |
109 | loff_t start, loff_t end, get_block_t get_block, | 132 | loff_t start, loff_t end, get_block_t get_block, |
110 | struct buffer_head *bh) | 133 | struct buffer_head *bh) |
111 | { | 134 | { |
112 | ssize_t retval = 0; | 135 | loff_t pos = start, max = start, bh_max = start; |
113 | loff_t pos = start; | 136 | bool hole = false, need_wmb = false; |
114 | loff_t max = start; | 137 | struct block_device *bdev = NULL; |
115 | loff_t bh_max = start; | 138 | int rw = iov_iter_rw(iter), rc; |
116 | void __pmem *addr; | 139 | long map_len = 0; |
117 | bool hole = false; | 140 | struct blk_dax_ctl dax = { |
118 | bool need_wmb = false; | 141 | .addr = (void __pmem *) ERR_PTR(-EIO), |
119 | 142 | }; | |
120 | if (iov_iter_rw(iter) != WRITE) | 143 | |
144 | if (rw == READ) | ||
121 | end = min(end, i_size_read(inode)); | 145 | end = min(end, i_size_read(inode)); |
122 | 146 | ||
123 | while (pos < end) { | 147 | while (pos < end) { |
@@ -132,13 +156,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, | |||
132 | if (pos == bh_max) { | 156 | if (pos == bh_max) { |
133 | bh->b_size = PAGE_ALIGN(end - pos); | 157 | bh->b_size = PAGE_ALIGN(end - pos); |
134 | bh->b_state = 0; | 158 | bh->b_state = 0; |
135 | retval = get_block(inode, block, bh, | 159 | rc = get_block(inode, block, bh, rw == WRITE); |
136 | iov_iter_rw(iter) == WRITE); | 160 | if (rc) |
137 | if (retval) | ||
138 | break; | 161 | break; |
139 | if (!buffer_size_valid(bh)) | 162 | if (!buffer_size_valid(bh)) |
140 | bh->b_size = 1 << blkbits; | 163 | bh->b_size = 1 << blkbits; |
141 | bh_max = pos - first + bh->b_size; | 164 | bh_max = pos - first + bh->b_size; |
165 | bdev = bh->b_bdev; | ||
142 | } else { | 166 | } else { |
143 | unsigned done = bh->b_size - | 167 | unsigned done = bh->b_size - |
144 | (bh_max - (pos - first)); | 168 | (bh_max - (pos - first)); |
@@ -146,47 +170,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, | |||
146 | bh->b_size -= done; | 170 | bh->b_size -= done; |
147 | } | 171 | } |
148 | 172 | ||
149 | hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); | 173 | hole = rw == READ && !buffer_written(bh); |
150 | if (hole) { | 174 | if (hole) { |
151 | addr = NULL; | ||
152 | size = bh->b_size - first; | 175 | size = bh->b_size - first; |
153 | } else { | 176 | } else { |
154 | retval = dax_get_addr(bh, &addr, blkbits); | 177 | dax_unmap_atomic(bdev, &dax); |
155 | if (retval < 0) | 178 | dax.sector = to_sector(bh, inode); |
179 | dax.size = bh->b_size; | ||
180 | map_len = dax_map_atomic(bdev, &dax); | ||
181 | if (map_len < 0) { | ||
182 | rc = map_len; | ||
156 | break; | 183 | break; |
184 | } | ||
157 | if (buffer_unwritten(bh) || buffer_new(bh)) { | 185 | if (buffer_unwritten(bh) || buffer_new(bh)) { |
158 | dax_new_buf(addr, retval, first, pos, | 186 | dax_new_buf(dax.addr, map_len, first, |
159 | end); | 187 | pos, end); |
160 | need_wmb = true; | 188 | need_wmb = true; |
161 | } | 189 | } |
162 | addr += first; | 190 | dax.addr += first; |
163 | size = retval - first; | 191 | size = map_len - first; |
164 | } | 192 | } |
165 | max = min(pos + size, end); | 193 | max = min(pos + size, end); |
166 | } | 194 | } |
167 | 195 | ||
168 | if (iov_iter_rw(iter) == WRITE) { | 196 | if (iov_iter_rw(iter) == WRITE) { |
169 | len = copy_from_iter_pmem(addr, max - pos, iter); | 197 | len = copy_from_iter_pmem(dax.addr, max - pos, iter); |
170 | need_wmb = true; | 198 | need_wmb = true; |
171 | } else if (!hole) | 199 | } else if (!hole) |
172 | len = copy_to_iter((void __force *)addr, max - pos, | 200 | len = copy_to_iter((void __force *) dax.addr, max - pos, |
173 | iter); | 201 | iter); |
174 | else | 202 | else |
175 | len = iov_iter_zero(max - pos, iter); | 203 | len = iov_iter_zero(max - pos, iter); |
176 | 204 | ||
177 | if (!len) { | 205 | if (!len) { |
178 | retval = -EFAULT; | 206 | rc = -EFAULT; |
179 | break; | 207 | break; |
180 | } | 208 | } |
181 | 209 | ||
182 | pos += len; | 210 | pos += len; |
183 | addr += len; | 211 | if (!IS_ERR(dax.addr)) |
212 | dax.addr += len; | ||
184 | } | 213 | } |
185 | 214 | ||
186 | if (need_wmb) | 215 | if (need_wmb) |
187 | wmb_pmem(); | 216 | wmb_pmem(); |
217 | dax_unmap_atomic(bdev, &dax); | ||
188 | 218 | ||
189 | return (pos == start) ? retval : pos - start; | 219 | return (pos == start) ? rc : pos - start; |
190 | } | 220 | } |
191 | 221 | ||
192 | /** | 222 | /** |
@@ -275,28 +305,35 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, | |||
275 | return VM_FAULT_LOCKED; | 305 | return VM_FAULT_LOCKED; |
276 | } | 306 | } |
277 | 307 | ||
278 | static int copy_user_bh(struct page *to, struct buffer_head *bh, | 308 | static int copy_user_bh(struct page *to, struct inode *inode, |
279 | unsigned blkbits, unsigned long vaddr) | 309 | struct buffer_head *bh, unsigned long vaddr) |
280 | { | 310 | { |
281 | void __pmem *vfrom; | 311 | struct blk_dax_ctl dax = { |
312 | .sector = to_sector(bh, inode), | ||
313 | .size = bh->b_size, | ||
314 | }; | ||
315 | struct block_device *bdev = bh->b_bdev; | ||
282 | void *vto; | 316 | void *vto; |
283 | 317 | ||
284 | if (dax_get_addr(bh, &vfrom, blkbits) < 0) | 318 | if (dax_map_atomic(bdev, &dax) < 0) |
285 | return -EIO; | 319 | return PTR_ERR(dax.addr); |
286 | vto = kmap_atomic(to); | 320 | vto = kmap_atomic(to); |
287 | copy_user_page(vto, (void __force *)vfrom, vaddr, to); | 321 | copy_user_page(vto, (void __force *)dax.addr, vaddr, to); |
288 | kunmap_atomic(vto); | 322 | kunmap_atomic(vto); |
323 | dax_unmap_atomic(bdev, &dax); | ||
289 | return 0; | 324 | return 0; |
290 | } | 325 | } |
291 | 326 | ||
292 | static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, | 327 | static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, |
293 | struct vm_area_struct *vma, struct vm_fault *vmf) | 328 | struct vm_area_struct *vma, struct vm_fault *vmf) |
294 | { | 329 | { |
295 | struct address_space *mapping = inode->i_mapping; | ||
296 | sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); | ||
297 | unsigned long vaddr = (unsigned long)vmf->virtual_address; | 330 | unsigned long vaddr = (unsigned long)vmf->virtual_address; |
298 | void __pmem *addr; | 331 | struct address_space *mapping = inode->i_mapping; |
299 | unsigned long pfn; | 332 | struct block_device *bdev = bh->b_bdev; |
333 | struct blk_dax_ctl dax = { | ||
334 | .sector = to_sector(bh, inode), | ||
335 | .size = bh->b_size, | ||
336 | }; | ||
300 | pgoff_t size; | 337 | pgoff_t size; |
301 | int error; | 338 | int error; |
302 | 339 | ||
@@ -315,20 +352,18 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, | |||
315 | goto out; | 352 | goto out; |
316 | } | 353 | } |
317 | 354 | ||
318 | error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); | 355 | if (dax_map_atomic(bdev, &dax) < 0) { |
319 | if (error < 0) | 356 | error = PTR_ERR(dax.addr); |
320 | goto out; | ||
321 | if (error < PAGE_SIZE) { | ||
322 | error = -EIO; | ||
323 | goto out; | 357 | goto out; |
324 | } | 358 | } |
325 | 359 | ||
326 | if (buffer_unwritten(bh) || buffer_new(bh)) { | 360 | if (buffer_unwritten(bh) || buffer_new(bh)) { |
327 | clear_pmem(addr, PAGE_SIZE); | 361 | clear_pmem(dax.addr, PAGE_SIZE); |
328 | wmb_pmem(); | 362 | wmb_pmem(); |
329 | } | 363 | } |
364 | dax_unmap_atomic(bdev, &dax); | ||
330 | 365 | ||
331 | error = vm_insert_mixed(vma, vaddr, pfn); | 366 | error = vm_insert_mixed(vma, vaddr, dax.pfn); |
332 | 367 | ||
333 | out: | 368 | out: |
334 | i_mmap_unlock_read(mapping); | 369 | i_mmap_unlock_read(mapping); |
@@ -422,7 +457,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
422 | if (vmf->cow_page) { | 457 | if (vmf->cow_page) { |
423 | struct page *new_page = vmf->cow_page; | 458 | struct page *new_page = vmf->cow_page; |
424 | if (buffer_written(&bh)) | 459 | if (buffer_written(&bh)) |
425 | error = copy_user_bh(new_page, &bh, blkbits, vaddr); | 460 | error = copy_user_bh(new_page, inode, &bh, vaddr); |
426 | else | 461 | else |
427 | clear_user_highpage(new_page, vaddr); | 462 | clear_user_highpage(new_page, vaddr); |
428 | if (error) | 463 | if (error) |
@@ -523,6 +558,24 @@ EXPORT_SYMBOL_GPL(dax_fault); | |||
523 | */ | 558 | */ |
524 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) | 559 | #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) |
525 | 560 | ||
561 | static void __dax_dbg(struct buffer_head *bh, unsigned long address, | ||
562 | const char *reason, const char *fn) | ||
563 | { | ||
564 | if (bh) { | ||
565 | char bname[BDEVNAME_SIZE]; | ||
566 | bdevname(bh->b_bdev, bname); | ||
567 | pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " | ||
568 | "length %zd fallback: %s\n", fn, current->comm, | ||
569 | address, bname, bh->b_state, (u64)bh->b_blocknr, | ||
570 | bh->b_size, reason); | ||
571 | } else { | ||
572 | pr_debug("%s: %s addr: %lx fallback: %s\n", fn, | ||
573 | current->comm, address, reason); | ||
574 | } | ||
575 | } | ||
576 | |||
577 | #define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") | ||
578 | |||
526 | int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | 579 | int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, |
527 | pmd_t *pmd, unsigned int flags, get_block_t get_block, | 580 | pmd_t *pmd, unsigned int flags, get_block_t get_block, |
528 | dax_iodone_t complete_unwritten) | 581 | dax_iodone_t complete_unwritten) |
@@ -534,41 +587,49 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | |||
534 | unsigned blkbits = inode->i_blkbits; | 587 | unsigned blkbits = inode->i_blkbits; |
535 | unsigned long pmd_addr = address & PMD_MASK; | 588 | unsigned long pmd_addr = address & PMD_MASK; |
536 | bool write = flags & FAULT_FLAG_WRITE; | 589 | bool write = flags & FAULT_FLAG_WRITE; |
537 | long length; | 590 | struct block_device *bdev; |
538 | void __pmem *kaddr; | ||
539 | pgoff_t size, pgoff; | 591 | pgoff_t size, pgoff; |
540 | sector_t block, sector; | 592 | sector_t block; |
541 | unsigned long pfn; | ||
542 | int result = 0; | 593 | int result = 0; |
543 | 594 | ||
544 | /* dax pmd mappings are broken wrt gup and fork */ | 595 | /* dax pmd mappings require pfn_t_devmap() */ |
545 | if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) | 596 | if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) |
546 | return VM_FAULT_FALLBACK; | 597 | return VM_FAULT_FALLBACK; |
547 | 598 | ||
548 | /* Fall back to PTEs if we're going to COW */ | 599 | /* Fall back to PTEs if we're going to COW */ |
549 | if (write && !(vma->vm_flags & VM_SHARED)) | 600 | if (write && !(vma->vm_flags & VM_SHARED)) { |
601 | split_huge_pmd(vma, pmd, address); | ||
602 | dax_pmd_dbg(NULL, address, "cow write"); | ||
550 | return VM_FAULT_FALLBACK; | 603 | return VM_FAULT_FALLBACK; |
604 | } | ||
551 | /* If the PMD would extend outside the VMA */ | 605 | /* If the PMD would extend outside the VMA */ |
552 | if (pmd_addr < vma->vm_start) | 606 | if (pmd_addr < vma->vm_start) { |
607 | dax_pmd_dbg(NULL, address, "vma start unaligned"); | ||
553 | return VM_FAULT_FALLBACK; | 608 | return VM_FAULT_FALLBACK; |
554 | if ((pmd_addr + PMD_SIZE) > vma->vm_end) | 609 | } |
610 | if ((pmd_addr + PMD_SIZE) > vma->vm_end) { | ||
611 | dax_pmd_dbg(NULL, address, "vma end unaligned"); | ||
555 | return VM_FAULT_FALLBACK; | 612 | return VM_FAULT_FALLBACK; |
613 | } | ||
556 | 614 | ||
557 | pgoff = linear_page_index(vma, pmd_addr); | 615 | pgoff = linear_page_index(vma, pmd_addr); |
558 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; | 616 | size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; |
559 | if (pgoff >= size) | 617 | if (pgoff >= size) |
560 | return VM_FAULT_SIGBUS; | 618 | return VM_FAULT_SIGBUS; |
561 | /* If the PMD would cover blocks out of the file */ | 619 | /* If the PMD would cover blocks out of the file */ |
562 | if ((pgoff | PG_PMD_COLOUR) >= size) | 620 | if ((pgoff | PG_PMD_COLOUR) >= size) { |
621 | dax_pmd_dbg(NULL, address, | ||
622 | "offset + huge page size > file size"); | ||
563 | return VM_FAULT_FALLBACK; | 623 | return VM_FAULT_FALLBACK; |
624 | } | ||
564 | 625 | ||
565 | memset(&bh, 0, sizeof(bh)); | 626 | memset(&bh, 0, sizeof(bh)); |
566 | block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); | 627 | block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); |
567 | 628 | ||
568 | bh.b_size = PMD_SIZE; | 629 | bh.b_size = PMD_SIZE; |
569 | length = get_block(inode, block, &bh, write); | 630 | if (get_block(inode, block, &bh, write) != 0) |
570 | if (length) | ||
571 | return VM_FAULT_SIGBUS; | 631 | return VM_FAULT_SIGBUS; |
632 | bdev = bh.b_bdev; | ||
572 | i_mmap_lock_read(mapping); | 633 | i_mmap_lock_read(mapping); |
573 | 634 | ||
574 | /* | 635 | /* |
@@ -576,8 +637,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | |||
576 | * just fall back to PTEs. Calling get_block 512 times in a loop | 637 | * just fall back to PTEs. Calling get_block 512 times in a loop |
577 | * would be silly. | 638 | * would be silly. |
578 | */ | 639 | */ |
579 | if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) | 640 | if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { |
641 | dax_pmd_dbg(&bh, address, "allocated block too small"); | ||
580 | goto fallback; | 642 | goto fallback; |
643 | } | ||
581 | 644 | ||
582 | /* | 645 | /* |
583 | * If we allocated new storage, make sure no process has any | 646 | * If we allocated new storage, make sure no process has any |
@@ -600,57 +663,82 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, | |||
600 | result = VM_FAULT_SIGBUS; | 663 | result = VM_FAULT_SIGBUS; |
601 | goto out; | 664 | goto out; |
602 | } | 665 | } |
603 | if ((pgoff | PG_PMD_COLOUR) >= size) | 666 | if ((pgoff | PG_PMD_COLOUR) >= size) { |
667 | dax_pmd_dbg(&bh, address, "pgoff unaligned"); | ||
604 | goto fallback; | 668 | goto fallback; |
669 | } | ||
605 | 670 | ||
606 | if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { | 671 | if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { |
607 | spinlock_t *ptl; | 672 | spinlock_t *ptl; |
608 | pmd_t entry; | 673 | pmd_t entry; |
609 | struct page *zero_page = get_huge_zero_page(); | 674 | struct page *zero_page = get_huge_zero_page(); |
610 | 675 | ||
611 | if (unlikely(!zero_page)) | 676 | if (unlikely(!zero_page)) { |
677 | dax_pmd_dbg(&bh, address, "no zero page"); | ||
612 | goto fallback; | 678 | goto fallback; |
679 | } | ||
613 | 680 | ||
614 | ptl = pmd_lock(vma->vm_mm, pmd); | 681 | ptl = pmd_lock(vma->vm_mm, pmd); |
615 | if (!pmd_none(*pmd)) { | 682 | if (!pmd_none(*pmd)) { |
616 | spin_unlock(ptl); | 683 | spin_unlock(ptl); |
684 | dax_pmd_dbg(&bh, address, "pmd already present"); | ||
617 | goto fallback; | 685 | goto fallback; |
618 | } | 686 | } |
619 | 687 | ||
688 | dev_dbg(part_to_dev(bdev->bd_part), | ||
689 | "%s: %s addr: %lx pfn: <zero> sect: %llx\n", | ||
690 | __func__, current->comm, address, | ||
691 | (unsigned long long) to_sector(&bh, inode)); | ||
692 | |||
620 | entry = mk_pmd(zero_page, vma->vm_page_prot); | 693 | entry = mk_pmd(zero_page, vma->vm_page_prot); |
621 | entry = pmd_mkhuge(entry); | 694 | entry = pmd_mkhuge(entry); |
622 | set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); | 695 | set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); |
623 | result = VM_FAULT_NOPAGE; | 696 | result = VM_FAULT_NOPAGE; |
624 | spin_unlock(ptl); | 697 | spin_unlock(ptl); |
625 | } else { | 698 | } else { |
626 | sector = bh.b_blocknr << (blkbits - 9); | 699 | struct blk_dax_ctl dax = { |
627 | length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, | 700 | .sector = to_sector(&bh, inode), |
628 | bh.b_size); | 701 | .size = PMD_SIZE, |
702 | }; | ||
703 | long length = dax_map_atomic(bdev, &dax); | ||
704 | |||
629 | if (length < 0) { | 705 | if (length < 0) { |
630 | result = VM_FAULT_SIGBUS; | 706 | result = VM_FAULT_SIGBUS; |
631 | goto out; | 707 | goto out; |
632 | } | 708 | } |
633 | if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) | 709 | if (length < PMD_SIZE) { |
710 | dax_pmd_dbg(&bh, address, "dax-length too small"); | ||
711 | dax_unmap_atomic(bdev, &dax); | ||
634 | goto fallback; | 712 | goto fallback; |
713 | } | ||
714 | if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { | ||
715 | dax_pmd_dbg(&bh, address, "pfn unaligned"); | ||
716 | dax_unmap_atomic(bdev, &dax); | ||
717 | goto fallback; | ||
718 | } | ||
635 | 719 | ||
636 | /* | 720 | if (!pfn_t_devmap(dax.pfn)) { |
637 | * TODO: teach vmf_insert_pfn_pmd() to support | 721 | dax_unmap_atomic(bdev, &dax); |
638 | * 'pte_special' for pmds | 722 | dax_pmd_dbg(&bh, address, "pfn not in memmap"); |
639 | */ | ||
640 | if (pfn_valid(pfn)) | ||
641 | goto fallback; | 723 | goto fallback; |
724 | } | ||
642 | 725 | ||
643 | if (buffer_unwritten(&bh) || buffer_new(&bh)) { | 726 | if (buffer_unwritten(&bh) || buffer_new(&bh)) { |
644 | int i; | 727 | clear_pmem(dax.addr, PMD_SIZE); |
645 | for (i = 0; i < PTRS_PER_PMD; i++) | ||
646 | clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); | ||
647 | wmb_pmem(); | 728 | wmb_pmem(); |
648 | count_vm_event(PGMAJFAULT); | 729 | count_vm_event(PGMAJFAULT); |
649 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 730 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
650 | result |= VM_FAULT_MAJOR; | 731 | result |= VM_FAULT_MAJOR; |
651 | } | 732 | } |
652 | 733 | dax_unmap_atomic(bdev, &dax); | |
653 | result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); | 734 | |
735 | dev_dbg(part_to_dev(bdev->bd_part), | ||
736 | "%s: %s addr: %lx pfn: %lx sect: %llx\n", | ||
737 | __func__, current->comm, address, | ||
738 | pfn_t_to_pfn(dax.pfn), | ||
739 | (unsigned long long) dax.sector); | ||
740 | result |= vmf_insert_pfn_pmd(vma, address, pmd, | ||
741 | dax.pfn, write); | ||
654 | } | 742 | } |
655 | 743 | ||
656 | out: | 744 | out: |
@@ -752,12 +840,17 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, | |||
752 | if (err < 0) | 840 | if (err < 0) |
753 | return err; | 841 | return err; |
754 | if (buffer_written(&bh)) { | 842 | if (buffer_written(&bh)) { |
755 | void __pmem *addr; | 843 | struct block_device *bdev = bh.b_bdev; |
756 | err = dax_get_addr(&bh, &addr, inode->i_blkbits); | 844 | struct blk_dax_ctl dax = { |
757 | if (err < 0) | 845 | .sector = to_sector(&bh, inode), |
758 | return err; | 846 | .size = PAGE_CACHE_SIZE, |
759 | clear_pmem(addr + offset, length); | 847 | }; |
848 | |||
849 | if (dax_map_atomic(bdev, &dax) < 0) | ||
850 | return PTR_ERR(dax.addr); | ||
851 | clear_pmem(dax.addr + offset, length); | ||
760 | wmb_pmem(); | 852 | wmb_pmem(); |
853 | dax_unmap_atomic(bdev, &dax); | ||
761 | } | 854 | } |
762 | 855 | ||
763 | return 0; | 856 | return 0; |
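All of the fs/dax.c conversions above funnel through the new dax_map_atomic()/dax_unmap_atomic() pair, which keep the request_queue alive (via its q_usage_counter) for as long as the returned mapping is in use. The helpers are static to fs/dax.c; this only illustrates the internal idiom:

    struct blk_dax_ctl dax = {
            .sector = to_sector(bh, inode),
            .size = bh->b_size,
    };
    long map_len = dax_map_atomic(bdev, &dax);      /* takes a queue reference */

    if (map_len < 0)
            return map_len;                         /* dax.addr left as ERR_PTR() */
    clear_pmem(dax.addr, min_t(long, map_len, PAGE_SIZE));
    wmb_pmem();                                     /* order the pmem stores */
    dax_unmap_atomic(bdev, &dax);                   /* drops the queue reference */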
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 023f6a1f23cd..6915c950e6e8 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -677,9 +677,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, | |||
677 | if (!wbc->wb) | 677 | if (!wbc->wb) |
678 | return; | 678 | return; |
679 | 679 | ||
680 | rcu_read_lock(); | ||
681 | id = mem_cgroup_css_from_page(page)->id; | 680 | id = mem_cgroup_css_from_page(page)->id; |
682 | rcu_read_unlock(); | ||
683 | 681 | ||
684 | if (id == wbc->wb_id) { | 682 | if (id == wbc->wb_id) { |
685 | wbc->wb_bytes += bytes; | 683 | wbc->wb_bytes += bytes; |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 47789292a582..8bbf7f3e2a27 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page) | |||
324 | delete_from_page_cache(page); | 324 | delete_from_page_cache(page); |
325 | } | 325 | } |
326 | 326 | ||
327 | static void | ||
328 | hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) | ||
329 | { | ||
330 | struct vm_area_struct *vma; | ||
331 | |||
332 | /* | ||
333 | * end == 0 indicates that the entire range after | ||
334 | * start should be unmapped. | ||
335 | */ | ||
336 | vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { | ||
337 | unsigned long v_offset; | ||
338 | unsigned long v_end; | ||
339 | |||
340 | /* | ||
341 | * Can the expression below overflow on 32-bit arches? | ||
342 | * No, because the interval tree returns us only those vmas | ||
343 | * which overlap the truncated area starting at pgoff, | ||
344 | * and no vma on a 32-bit arch can span beyond the 4GB. | ||
345 | */ | ||
346 | if (vma->vm_pgoff < start) | ||
347 | v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; | ||
348 | else | ||
349 | v_offset = 0; | ||
350 | |||
351 | if (!end) | ||
352 | v_end = vma->vm_end; | ||
353 | else { | ||
354 | v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) | ||
355 | + vma->vm_start; | ||
356 | if (v_end > vma->vm_end) | ||
357 | v_end = vma->vm_end; | ||
358 | } | ||
359 | |||
360 | unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, | ||
361 | NULL); | ||
362 | } | ||
363 | } | ||
327 | 364 | ||
328 | /* | 365 | /* |
329 | * remove_inode_hugepages handles two distinct cases: truncation and hole | 366 | * remove_inode_hugepages handles two distinct cases: truncation and hole |
330 | * punch. There are subtle differences in operation for each case. | 367 | * punch. There are subtle differences in operation for each case. |
331 | 368 | * | |
332 | * truncation is indicated by end of range being LLONG_MAX | 369 | * truncation is indicated by end of range being LLONG_MAX |
333 | * In this case, we first scan the range and release found pages. | 370 | * In this case, we first scan the range and release found pages. |
334 | * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv | 371 | * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv |
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
379 | 416 | ||
380 | for (i = 0; i < pagevec_count(&pvec); ++i) { | 417 | for (i = 0; i < pagevec_count(&pvec); ++i) { |
381 | struct page *page = pvec.pages[i]; | 418 | struct page *page = pvec.pages[i]; |
419 | bool rsv_on_error; | ||
382 | u32 hash; | 420 | u32 hash; |
383 | 421 | ||
384 | /* | 422 | /* |
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, | |||
395 | mapping, next, 0); | 433 | mapping, next, 0); |
396 | mutex_lock(&hugetlb_fault_mutex_table[hash]); | 434 | mutex_lock(&hugetlb_fault_mutex_table[hash]); |
397 | 435 | ||
398 | lock_page(page); | 436 | /* |
399 | if (likely(!page_mapped(page))) { | 437 | * If page is mapped, it was faulted in after being |
400 | bool rsv_on_error = !PagePrivate(page); | 438 | * unmapped in caller. Unmap (again) now after taking |
401 | /* | 439 | * the fault mutex. The mutex will prevent faults |
402 | * We must free the huge page and remove | 440 | * until we finish removing the page. |
403 | * from page cache (remove_huge_page) BEFORE | 441 | * |
404 | * removing the region/reserve map | 442 | * This race can only happen in the hole punch case. |
405 | * (hugetlb_unreserve_pages). In rare out | 443 | * Getting here in a truncate operation is a bug. |
406 | * of memory conditions, removal of the | 444 | */ |
407 | * region/reserve map could fail. Before | 445 | if (unlikely(page_mapped(page))) { |
408 | * free'ing the page, note PagePrivate which | ||
409 | * is used in case of error. | ||
410 | */ | ||
411 | remove_huge_page(page); | ||
412 | freed++; | ||
413 | if (!truncate_op) { | ||
414 | if (unlikely(hugetlb_unreserve_pages( | ||
415 | inode, next, | ||
416 | next + 1, 1))) | ||
417 | hugetlb_fix_reserve_counts( | ||
418 | inode, rsv_on_error); | ||
419 | } | ||
420 | } else { | ||
421 | /* | ||
422 | * If page is mapped, it was faulted in after | ||
423 | * being unmapped. It indicates a race between | ||
424 | * hole punch and page fault. Do nothing in | ||
425 | * this case. Getting here in a truncate | ||
426 | * operation is a bug. | ||
427 | */ | ||
428 | BUG_ON(truncate_op); | 446 | BUG_ON(truncate_op); |
447 | |||
448 | i_mmap_lock_write(mapping); | ||
449 | hugetlb_vmdelete_list(&mapping->i_mmap, | ||
450 | next * pages_per_huge_page(h), | ||
451 | (next + 1) * pages_per_huge_page(h)); | ||
452 | i_mmap_unlock_write(mapping); | ||
453 | } | ||
454 | |||
455 | lock_page(page); | ||
456 | /* | ||
457 | * We must free the huge page and remove from page | ||
458 | * cache (remove_huge_page) BEFORE removing the | ||
459 | * region/reserve map (hugetlb_unreserve_pages). In | ||
460 | * rare out of memory conditions, removal of the | ||
461 | * region/reserve map could fail. Before free'ing | ||
462 | * the page, note PagePrivate which is used in case | ||
463 | * of error. | ||
464 | */ | ||
465 | rsv_on_error = !PagePrivate(page); | ||
466 | remove_huge_page(page); | ||
467 | freed++; | ||
468 | if (!truncate_op) { | ||
469 | if (unlikely(hugetlb_unreserve_pages(inode, | ||
470 | next, next + 1, 1))) | ||
471 | hugetlb_fix_reserve_counts(inode, | ||
472 | rsv_on_error); | ||
429 | } | 473 | } |
430 | 474 | ||
431 | unlock_page(page); | 475 | unlock_page(page); |
@@ -452,41 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode) | |||
452 | clear_inode(inode); | 496 | clear_inode(inode); |
453 | } | 497 | } |
454 | 498 | ||
455 | static inline void | ||
456 | hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) | ||
457 | { | ||
458 | struct vm_area_struct *vma; | ||
459 | |||
460 | /* | ||
461 | * end == 0 indicates that the entire range after | ||
462 | * start should be unmapped. | ||
463 | */ | ||
464 | vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { | ||
465 | unsigned long v_offset; | ||
466 | |||
467 | /* | ||
468 | * Can the expression below overflow on 32-bit arches? | ||
469 | * No, because the interval tree returns us only those vmas | ||
470 | * which overlap the truncated area starting at pgoff, | ||
471 | * and no vma on a 32-bit arch can span beyond the 4GB. | ||
472 | */ | ||
473 | if (vma->vm_pgoff < start) | ||
474 | v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; | ||
475 | else | ||
476 | v_offset = 0; | ||
477 | |||
478 | if (end) { | ||
479 | end = ((end - start) << PAGE_SHIFT) + | ||
480 | vma->vm_start + v_offset; | ||
481 | if (end > vma->vm_end) | ||
482 | end = vma->vm_end; | ||
483 | } else | ||
484 | end = vma->vm_end; | ||
485 | |||
486 | unmap_hugepage_range(vma, vma->vm_start + v_offset, end, NULL); | ||
487 | } | ||
488 | } | ||
489 | |||
490 | static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) | 499 | static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) |
491 | { | 500 | { |
492 | pgoff_t pgoff; | 501 | pgoff_t pgoff; |
@@ -708,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, | |||
708 | /* | 717 | /* |
709 | * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never | 718 | * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never |
710 | * be taken from reclaim -- unlike regular filesystems. This needs an | 719 | * be taken from reclaim -- unlike regular filesystems. This needs an |
711 | * annotation because huge_pmd_share() does an allocation under | 720 | * annotation because huge_pmd_share() does an allocation under hugetlb's |
712 | * i_mmap_rwsem. | 721 | * i_mmap_rwsem. |
713 | */ | 722 | */ |
714 | static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; | 723 | static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; |
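hugetlb_vmdelete_list() moves above remove_inode_hugepages() because a hole punch can now find a page mapped again -- a fault may slip in after the caller's bulk unmap -- so the loop re-unmaps just that huge page while holding the fault mutex. Condensing the index-to-address arithmetic the two hunks share (the end == 0 "to end of file" case is left out):

    /* page-cache indices (in PAGE_SIZE units) covering huge page 'next' */
    pgoff_t start = next * pages_per_huge_page(h);
    pgoff_t end = (next + 1) * pages_per_huge_page(h);

    /* inside hugetlb_vmdelete_list(): translate indices to user addresses */
    unsigned long v_offset = (vma->vm_pgoff < start) ?
                    (start - vma->vm_pgoff) << PAGE_SHIFT : 0;
    unsigned long v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;

    if (v_end > vma->vm_end)
            v_end = vma->vm_end;
    unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, NULL);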
diff --git a/fs/proc/page.c b/fs/proc/page.c index 93484034a03d..b2855eea5405 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c | |||
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page) | |||
103 | * pseudo flags for the well known (anonymous) memory mapped pages | 103 | * pseudo flags for the well known (anonymous) memory mapped pages |
104 | * | 104 | * |
105 | * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the | 105 | * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the |
106 | * simple test in page_mapped() is not enough. | 106 | * simple test in page_mapcount() is not enough. |
107 | */ | 107 | */ |
108 | if (!PageSlab(page) && page_mapped(page)) | 108 | if (!PageSlab(page) && page_mapcount(page)) |
109 | u |= 1 << KPF_MMAP; | 109 | u |= 1 << KPF_MMAP; |
110 | if (PageAnon(page)) | 110 | if (PageAnon(page)) |
111 | u |= 1 << KPF_ANON; | 111 | u |= 1 << KPF_ANON; |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a353b4c6e86e..65a1b6c69c11 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -466,9 +466,10 @@ struct mem_size_stats { | |||
466 | }; | 466 | }; |
467 | 467 | ||
468 | static void smaps_account(struct mem_size_stats *mss, struct page *page, | 468 | static void smaps_account(struct mem_size_stats *mss, struct page *page, |
469 | unsigned long size, bool young, bool dirty) | 469 | bool compound, bool young, bool dirty) |
470 | { | 470 | { |
471 | int mapcount; | 471 | int i, nr = compound ? HPAGE_PMD_NR : 1; |
472 | unsigned long size = nr * PAGE_SIZE; | ||
472 | 473 | ||
473 | if (PageAnon(page)) | 474 | if (PageAnon(page)) |
474 | mss->anonymous += size; | 475 | mss->anonymous += size; |
@@ -477,23 +478,37 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, | |||
477 | /* Accumulate the size in pages that have been accessed. */ | 478 | /* Accumulate the size in pages that have been accessed. */ |
478 | if (young || page_is_young(page) || PageReferenced(page)) | 479 | if (young || page_is_young(page) || PageReferenced(page)) |
479 | mss->referenced += size; | 480 | mss->referenced += size; |
480 | mapcount = page_mapcount(page); | ||
481 | if (mapcount >= 2) { | ||
482 | u64 pss_delta; | ||
483 | 481 | ||
484 | if (dirty || PageDirty(page)) | 482 | /* |
485 | mss->shared_dirty += size; | 483 | * page_count(page) == 1 guarantees the page is mapped exactly once. |
486 | else | 484 | * If any subpage of the compound page mapped with PTE it would elevate |
487 | mss->shared_clean += size; | 485 | * page_count(). |
488 | pss_delta = (u64)size << PSS_SHIFT; | 486 | */ |
489 | do_div(pss_delta, mapcount); | 487 | if (page_count(page) == 1) { |
490 | mss->pss += pss_delta; | ||
491 | } else { | ||
492 | if (dirty || PageDirty(page)) | 488 | if (dirty || PageDirty(page)) |
493 | mss->private_dirty += size; | 489 | mss->private_dirty += size; |
494 | else | 490 | else |
495 | mss->private_clean += size; | 491 | mss->private_clean += size; |
496 | mss->pss += (u64)size << PSS_SHIFT; | 492 | mss->pss += (u64)size << PSS_SHIFT; |
493 | return; | ||
494 | } | ||
495 | |||
496 | for (i = 0; i < nr; i++, page++) { | ||
497 | int mapcount = page_mapcount(page); | ||
498 | |||
499 | if (mapcount >= 2) { | ||
500 | if (dirty || PageDirty(page)) | ||
501 | mss->shared_dirty += PAGE_SIZE; | ||
502 | else | ||
503 | mss->shared_clean += PAGE_SIZE; | ||
504 | mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; | ||
505 | } else { | ||
506 | if (dirty || PageDirty(page)) | ||
507 | mss->private_dirty += PAGE_SIZE; | ||
508 | else | ||
509 | mss->private_clean += PAGE_SIZE; | ||
510 | mss->pss += PAGE_SIZE << PSS_SHIFT; | ||
511 | } | ||
497 | } | 512 | } |
498 | } | 513 | } |
499 | 514 | ||
@@ -554,7 +569,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, | |||
554 | 569 | ||
555 | if (!page) | 570 | if (!page) |
556 | return; | 571 | return; |
557 | smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); | 572 | |
573 | smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); | ||
558 | } | 574 | } |
559 | 575 | ||
560 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 576 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
@@ -570,8 +586,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, | |||
570 | if (IS_ERR_OR_NULL(page)) | 586 | if (IS_ERR_OR_NULL(page)) |
571 | return; | 587 | return; |
572 | mss->anonymous_thp += HPAGE_PMD_SIZE; | 588 | mss->anonymous_thp += HPAGE_PMD_SIZE; |
573 | smaps_account(mss, page, HPAGE_PMD_SIZE, | 589 | smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); |
574 | pmd_young(*pmd), pmd_dirty(*pmd)); | ||
575 | } | 590 | } |
576 | #else | 591 | #else |
577 | static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, | 592 | static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, |
@@ -587,7 +602,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
587 | pte_t *pte; | 602 | pte_t *pte; |
588 | spinlock_t *ptl; | 603 | spinlock_t *ptl; |
589 | 604 | ||
590 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 605 | if (pmd_trans_huge_lock(pmd, vma, &ptl)) { |
591 | smaps_pmd_entry(pmd, addr, walk); | 606 | smaps_pmd_entry(pmd, addr, walk); |
592 | spin_unlock(ptl); | 607 | spin_unlock(ptl); |
593 | return 0; | 608 | return 0; |
@@ -898,7 +913,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | |||
898 | spinlock_t *ptl; | 913 | spinlock_t *ptl; |
899 | struct page *page; | 914 | struct page *page; |
900 | 915 | ||
901 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 916 | if (pmd_trans_huge_lock(pmd, vma, &ptl)) { |
902 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { | 917 | if (cp->type == CLEAR_REFS_SOFT_DIRTY) { |
903 | clear_soft_dirty_pmd(vma, addr, pmd); | 918 | clear_soft_dirty_pmd(vma, addr, pmd); |
904 | goto out; | 919 | goto out; |
@@ -1172,7 +1187,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, | |||
1172 | int err = 0; | 1187 | int err = 0; |
1173 | 1188 | ||
1174 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 1189 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1175 | if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { | 1190 | if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { |
1176 | u64 flags = 0, frame = 0; | 1191 | u64 flags = 0, frame = 0; |
1177 | pmd_t pmd = *pmdp; | 1192 | pmd_t pmd = *pmdp; |
1178 | 1193 | ||
@@ -1504,7 +1519,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, | |||
1504 | pte_t *orig_pte; | 1519 | pte_t *orig_pte; |
1505 | pte_t *pte; | 1520 | pte_t *pte; |
1506 | 1521 | ||
1507 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1522 | if (pmd_trans_huge_lock(pmd, vma, &ptl)) { |
1508 | pte_t huge_pte = *(pte_t *)pmd; | 1523 | pte_t huge_pte = *(pte_t *)pmd; |
1509 | struct page *page; | 1524 | struct page *page; |
1510 | 1525 | ||
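smaps_account() now charges PSS per 4K subpage rather than once per pmd, because after the refcounting rework the subpages of a THP can be PTE-mapped with different mapcounts. A worked example under an assumed split, ignoring the PSS_SHIFT fixed-point scaling in the comments:

    /* 2M THP = 512 subpages: assume 256 have mapcount 2 and 256 have mapcount 1 */
    pss  = 256 * ((PAGE_SIZE << PSS_SHIFT) / 2);    /* shared:  256 * 2K = 512K */
    pss += 256 * ((PAGE_SIZE << PSS_SHIFT) / 1);    /* private: 256 * 4K = 1M   */
    /* 1.5M of PSS, where the old per-pmd accounting could only report 1M or 2M */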
diff --git a/fs/stat.c b/fs/stat.c --- a/fs/stat.c +++ b/fs/stat.c | |||
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat | |||
219 | # define choose_32_64(a,b) b | 219 | # define choose_32_64(a,b) b |
220 | #endif | 220 | #endif |
221 | 221 | ||
222 | #define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) | 222 | #define valid_dev(x) choose_32_64(old_valid_dev(x),true) |
223 | #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) | 223 | #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) |
224 | 224 | ||
225 | #ifndef INIT_STRUCT_STAT_PADDING | 225 | #ifndef INIT_STRUCT_STAT_PADDING |
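The valid_dev() change relies on choose_32_64(a, b) expanding to 'a' when BITS_PER_LONG == 32 and to 'b' otherwise, and on new_valid_dev() having become unconditionally true, so the check can simply disappear on 64-bit:

    /*
     * After the change, valid_dev(x) expands to:
     *   32-bit:  old_valid_dev(x)  -- x must still fit the old 16-bit encoding
     *   64-bit:  true              -- new_valid_dev() was always true, so drop it
     */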
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 3a6803cb0ec9..0b3c0d39ef75 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h | |||
@@ -1,6 +1,8 @@ | |||
1 | #ifndef _ASM_GENERIC_PGTABLE_H | 1 | #ifndef _ASM_GENERIC_PGTABLE_H |
2 | #define _ASM_GENERIC_PGTABLE_H | 2 | #define _ASM_GENERIC_PGTABLE_H |
3 | 3 | ||
4 | #include <linux/pfn.h> | ||
5 | |||
4 | #ifndef __ASSEMBLY__ | 6 | #ifndef __ASSEMBLY__ |
5 | #ifdef CONFIG_MMU | 7 | #ifdef CONFIG_MMU |
6 | 8 | ||
@@ -207,11 +209,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, | |||
207 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 209 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
208 | #endif | 210 | #endif |
209 | 211 | ||
210 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
211 | extern void pmdp_splitting_flush(struct vm_area_struct *vma, | ||
212 | unsigned long address, pmd_t *pmdp); | ||
213 | #endif | ||
214 | |||
215 | #ifndef pmdp_collapse_flush | 212 | #ifndef pmdp_collapse_flush |
216 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 213 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
217 | extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, | 214 | extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, |
@@ -554,7 +551,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, | |||
554 | * by vm_insert_pfn(). | 551 | * by vm_insert_pfn(). |
555 | */ | 552 | */ |
556 | static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, | 553 | static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, |
557 | unsigned long pfn) | 554 | pfn_t pfn) |
558 | { | 555 | { |
559 | return 0; | 556 | return 0; |
560 | } | 557 | } |
@@ -589,7 +586,7 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, | |||
589 | unsigned long pfn, unsigned long addr, | 586 | unsigned long pfn, unsigned long addr, |
590 | unsigned long size); | 587 | unsigned long size); |
591 | extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, | 588 | extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, |
592 | unsigned long pfn); | 589 | pfn_t pfn); |
593 | extern int track_pfn_copy(struct vm_area_struct *vma); | 590 | extern int track_pfn_copy(struct vm_area_struct *vma); |
594 | extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, | 591 | extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, |
595 | unsigned long size); | 592 | unsigned long size); |
@@ -627,10 +624,6 @@ static inline int pmd_trans_huge(pmd_t pmd) | |||
627 | { | 624 | { |
628 | return 0; | 625 | return 0; |
629 | } | 626 | } |
630 | static inline int pmd_trans_splitting(pmd_t pmd) | ||
631 | { | ||
632 | return 0; | ||
633 | } | ||
634 | #ifndef __HAVE_ARCH_PMD_WRITE | 627 | #ifndef __HAVE_ARCH_PMD_WRITE |
635 | static inline int pmd_write(pmd_t pmd) | 628 | static inline int pmd_write(pmd_t pmd) |
636 | { | 629 | { |
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index b58fd667f87b..af0254c09424 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h | |||
@@ -4,6 +4,7 @@ | |||
4 | /* References to section boundaries */ | 4 | /* References to section boundaries */ |
5 | 5 | ||
6 | #include <linux/compiler.h> | 6 | #include <linux/compiler.h> |
7 | #include <linux/types.h> | ||
7 | 8 | ||
8 | /* | 9 | /* |
9 | * Usage guidelines: | 10 | * Usage guidelines: |
@@ -63,4 +64,68 @@ static inline int arch_is_kernel_data(unsigned long addr) | |||
63 | } | 64 | } |
64 | #endif | 65 | #endif |
65 | 66 | ||
67 | /** | ||
68 | * memory_contains - checks if an object is contained within a memory region | ||
69 | * @begin: virtual address of the beginning of the memory region | ||
70 | * @end: virtual address of the end of the memory region | ||
71 | * @virt: virtual address of the memory object | ||
72 | * @size: size of the memory object | ||
73 | * | ||
74 | * Returns: true if the object specified by @virt and @size is entirely | ||
75 | * contained within the memory region defined by @begin and @end, false | ||
76 | * otherwise. | ||
77 | */ | ||
78 | static inline bool memory_contains(void *begin, void *end, void *virt, | ||
79 | size_t size) | ||
80 | { | ||
81 | return virt >= begin && virt + size <= end; | ||
82 | } | ||
83 | |||
84 | /** | ||
85 | * memory_intersects - checks if the region occupied by an object intersects | ||
86 | * with another memory region | ||
87 | * @begin: virtual address of the beginning of the memory region | ||
88 | * @end: virtual address of the end of the memory region | ||
89 | * @virt: virtual address of the memory object | ||
90 | * @size: size of the memory object | ||
91 | * | ||
92 | * Returns: true if an object's memory region, specified by @virt and @size, | ||
93 | * intersects with the region specified by @begin and @end, false otherwise. | ||
94 | */ | ||
95 | static inline bool memory_intersects(void *begin, void *end, void *virt, | ||
96 | size_t size) | ||
97 | { | ||
98 | void *vend = virt + size; | ||
99 | |||
100 | return (virt >= begin && virt < end) || (vend >= begin && vend < end); | ||
101 | } | ||
102 | |||
103 | /** | ||
104 | * init_section_contains - checks if an object is contained within the init | ||
105 | * section | ||
106 | * @virt: virtual address of the memory object | ||
107 | * @size: size of the memory object | ||
108 | * | ||
109 | * Returns: true if the object specified by @virt and @size is entirely | ||
110 | * contained within the init section, false otherwise. | ||
111 | */ | ||
112 | static inline bool init_section_contains(void *virt, size_t size) | ||
113 | { | ||
114 | return memory_contains(__init_begin, __init_end, virt, size); | ||
115 | } | ||
116 | |||
117 | /** | ||
118 | * init_section_intersects - checks if the region occupied by an object | ||
119 | * intersects with the init section | ||
120 | * @virt: virtual address of the memory object | ||
121 | * @size: size of the memory object | ||
122 | * | ||
123 | * Returns: true if an object's memory region, specified by @virt and @size, | ||
124 | * intersects with the init section, false otherwise. | ||
125 | */ | ||
126 | static inline bool init_section_intersects(void *virt, size_t size) | ||
127 | { | ||
128 | return memory_intersects(__init_begin, __init_end, virt, size); | ||
129 | } | ||
130 | |||
66 | #endif /* _ASM_GENERIC_SECTIONS_H_ */ | 131 | #endif /* _ASM_GENERIC_SECTIONS_H_ */ |
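The helpers added above lend themselves to simple validity checks. A minimal sketch, assuming only the new init_section_intersects() from this hunk (the function name and error handling below are hypothetical, not part of the patch):

#include <asm/sections.h>
#include <linux/errno.h>
#include <linux/types.h>

/*
 * Reject objects that overlap the init section: that memory is freed
 * after boot, so keeping a pointer into it would dangle.
 */
static int example_register_object(void *obj, size_t size)
{
	if (init_section_intersects(obj, size))
		return -EINVAL;
	/* ... proceed with registration ... */
	return 0;
}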
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c70e3588a48c..bfb64d672e19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/backing-dev-defs.h> | 15 | #include <linux/backing-dev-defs.h> |
16 | #include <linux/wait.h> | 16 | #include <linux/wait.h> |
17 | #include <linux/mempool.h> | 17 | #include <linux/mempool.h> |
18 | #include <linux/pfn.h> | ||
18 | #include <linux/bio.h> | 19 | #include <linux/bio.h> |
19 | #include <linux/stringify.h> | 20 | #include <linux/stringify.h> |
20 | #include <linux/gfp.h> | 21 | #include <linux/gfp.h> |
@@ -1617,6 +1618,20 @@ static inline bool integrity_req_gap_front_merge(struct request *req, | |||
1617 | 1618 | ||
1618 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ | 1619 | #endif /* CONFIG_BLK_DEV_INTEGRITY */ |
1619 | 1620 | ||
1621 | /** | ||
1622 | * struct blk_dax_ctl - control and output parameters for ->direct_access | ||
1623 | * @sector: (input) offset relative to a block_device | ||
1624 | * @addr: (output) kernel virtual address for @sector populated by driver | ||
1625 | * @pfn: (output) page frame number for @addr populated by driver | ||
1626 | * @size: (input) number of bytes requested | ||
1627 | */ | ||
1628 | struct blk_dax_ctl { | ||
1629 | sector_t sector; | ||
1630 | void __pmem *addr; | ||
1631 | long size; | ||
1632 | pfn_t pfn; | ||
1633 | }; | ||
1634 | |||
1620 | struct block_device_operations { | 1635 | struct block_device_operations { |
1621 | int (*open) (struct block_device *, fmode_t); | 1636 | int (*open) (struct block_device *, fmode_t); |
1622 | void (*release) (struct gendisk *, fmode_t); | 1637 | void (*release) (struct gendisk *, fmode_t); |
@@ -1624,7 +1639,7 @@ struct block_device_operations { | |||
1624 | int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); | 1639 | int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); |
1625 | int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); | 1640 | int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); |
1626 | long (*direct_access)(struct block_device *, sector_t, void __pmem **, | 1641 | long (*direct_access)(struct block_device *, sector_t, void __pmem **, |
1627 | unsigned long *pfn); | 1642 | pfn_t *); |
1628 | unsigned int (*check_events) (struct gendisk *disk, | 1643 | unsigned int (*check_events) (struct gendisk *disk, |
1629 | unsigned int clearing); | 1644 | unsigned int clearing); |
1630 | /* ->media_changed() is DEPRECATED, use ->check_events() instead */ | 1645 | /* ->media_changed() is DEPRECATED, use ->check_events() instead */ |
@@ -1643,8 +1658,7 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, | |||
1643 | extern int bdev_read_page(struct block_device *, sector_t, struct page *); | 1658 | extern int bdev_read_page(struct block_device *, sector_t, struct page *); |
1644 | extern int bdev_write_page(struct block_device *, sector_t, struct page *, | 1659 | extern int bdev_write_page(struct block_device *, sector_t, struct page *, |
1645 | struct writeback_control *); | 1660 | struct writeback_control *); |
1646 | extern long bdev_direct_access(struct block_device *, sector_t, | 1661 | extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); |
1647 | void __pmem **addr, unsigned long *pfn, long size); | ||
1648 | #else /* CONFIG_BLOCK */ | 1662 | #else /* CONFIG_BLOCK */ |
1649 | 1663 | ||
1650 | struct block_device; | 1664 | struct block_device; |
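To show how the consolidated bdev_direct_access() interface above is meant to be called, here is a sketch (the wrapper is hypothetical; the struct fields and prototype come from this hunk, and the return value is assumed to be the number of bytes available at @sector or a negative errno):

static long example_dax_map(struct block_device *bdev, sector_t sector,
			    long size, void __pmem **kaddr, pfn_t *pfn)
{
	struct blk_dax_ctl dax = {
		.sector = sector,	/* input */
		.size = size,		/* input */
	};
	long avail = bdev_direct_access(bdev, &dax);

	if (avail < 0)
		return avail;
	*kaddr = dax.addr;		/* output: kernel virtual address */
	*pfn = dax.pfn;			/* output: page frame number */
	return avail;
}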
diff --git a/include/linux/console.h b/include/linux/console.h index bd194343c346..ea731af2451e 100644 --- a/include/linux/console.h +++ b/include/linux/console.h | |||
@@ -150,6 +150,7 @@ extern int console_trylock(void); | |||
150 | extern void console_unlock(void); | 150 | extern void console_unlock(void); |
151 | extern void console_conditional_schedule(void); | 151 | extern void console_conditional_schedule(void); |
152 | extern void console_unblank(void); | 152 | extern void console_unblank(void); |
153 | extern void console_flush_on_panic(void); | ||
153 | extern struct tty_driver *console_device(int *); | 154 | extern struct tty_driver *console_device(int *); |
154 | extern void console_stop(struct console *); | 155 | extern void console_stop(struct console *); |
155 | extern void console_start(struct console *); | 156 | extern void console_start(struct console *); |
diff --git a/include/linux/err.h b/include/linux/err.h index a729120644d5..56762ab41713 100644 --- a/include/linux/err.h +++ b/include/linux/err.h | |||
@@ -37,7 +37,7 @@ static inline bool __must_check IS_ERR(__force const void *ptr) | |||
37 | 37 | ||
38 | static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) | 38 | static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) |
39 | { | 39 | { |
40 | return !ptr || IS_ERR_VALUE((unsigned long)ptr); | 40 | return unlikely(!ptr) || IS_ERR_VALUE((unsigned long)ptr); |
41 | } | 41 | } |
42 | 42 | ||
43 | /** | 43 | /** |
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ecb080d6ff42..cfe81e10bd54 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h | |||
@@ -19,13 +19,16 @@ extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
19 | unsigned long addr, | 19 | unsigned long addr, |
20 | pmd_t *pmd, | 20 | pmd_t *pmd, |
21 | unsigned int flags); | 21 | unsigned int flags); |
22 | extern int madvise_free_huge_pmd(struct mmu_gather *tlb, | ||
23 | struct vm_area_struct *vma, | ||
24 | pmd_t *pmd, unsigned long addr, unsigned long next); | ||
22 | extern int zap_huge_pmd(struct mmu_gather *tlb, | 25 | extern int zap_huge_pmd(struct mmu_gather *tlb, |
23 | struct vm_area_struct *vma, | 26 | struct vm_area_struct *vma, |
24 | pmd_t *pmd, unsigned long addr); | 27 | pmd_t *pmd, unsigned long addr); |
25 | extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | 28 | extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
26 | unsigned long addr, unsigned long end, | 29 | unsigned long addr, unsigned long end, |
27 | unsigned char *vec); | 30 | unsigned char *vec); |
28 | extern int move_huge_pmd(struct vm_area_struct *vma, | 31 | extern bool move_huge_pmd(struct vm_area_struct *vma, |
29 | struct vm_area_struct *new_vma, | 32 | struct vm_area_struct *new_vma, |
30 | unsigned long old_addr, | 33 | unsigned long old_addr, |
31 | unsigned long new_addr, unsigned long old_end, | 34 | unsigned long new_addr, unsigned long old_end, |
@@ -34,8 +37,7 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
34 | unsigned long addr, pgprot_t newprot, | 37 | unsigned long addr, pgprot_t newprot, |
35 | int prot_numa); | 38 | int prot_numa); |
36 | int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, | 39 | int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, |
37 | unsigned long pfn, bool write); | 40 | pfn_t pfn, bool write); |
38 | |||
39 | enum transparent_hugepage_flag { | 41 | enum transparent_hugepage_flag { |
40 | TRANSPARENT_HUGEPAGE_FLAG, | 42 | TRANSPARENT_HUGEPAGE_FLAG, |
41 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, | 43 | TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, |
@@ -48,21 +50,13 @@ enum transparent_hugepage_flag { | |||
48 | #endif | 50 | #endif |
49 | }; | 51 | }; |
50 | 52 | ||
51 | enum page_check_address_pmd_flag { | ||
52 | PAGE_CHECK_ADDRESS_PMD_FLAG, | ||
53 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, | ||
54 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, | ||
55 | }; | ||
56 | extern pmd_t *page_check_address_pmd(struct page *page, | ||
57 | struct mm_struct *mm, | ||
58 | unsigned long address, | ||
59 | enum page_check_address_pmd_flag flag, | ||
60 | spinlock_t **ptl); | ||
61 | |||
62 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) | 53 | #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) |
63 | #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) | 54 | #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) |
64 | 55 | ||
65 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 56 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
57 | struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, | ||
58 | pmd_t *pmd, int flags); | ||
59 | |||
66 | #define HPAGE_PMD_SHIFT PMD_SHIFT | 60 | #define HPAGE_PMD_SHIFT PMD_SHIFT |
67 | #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) | 61 | #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) |
68 | #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) | 62 | #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) |
@@ -95,30 +89,28 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); | |||
95 | #endif /* CONFIG_DEBUG_VM */ | 89 | #endif /* CONFIG_DEBUG_VM */ |
96 | 90 | ||
97 | extern unsigned long transparent_hugepage_flags; | 91 | extern unsigned long transparent_hugepage_flags; |
98 | extern int split_huge_page_to_list(struct page *page, struct list_head *list); | 92 | |
93 | extern void prep_transhuge_page(struct page *page); | ||
94 | extern void free_transhuge_page(struct page *page); | ||
95 | |||
96 | int split_huge_page_to_list(struct page *page, struct list_head *list); | ||
99 | static inline int split_huge_page(struct page *page) | 97 | static inline int split_huge_page(struct page *page) |
100 | { | 98 | { |
101 | return split_huge_page_to_list(page, NULL); | 99 | return split_huge_page_to_list(page, NULL); |
102 | } | 100 | } |
103 | extern void __split_huge_page_pmd(struct vm_area_struct *vma, | 101 | void deferred_split_huge_page(struct page *page); |
104 | unsigned long address, pmd_t *pmd); | 102 | |
105 | #define split_huge_page_pmd(__vma, __address, __pmd) \ | 103 | void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
104 | unsigned long address); | ||
105 | |||
106 | #define split_huge_pmd(__vma, __pmd, __address) \ | ||
106 | do { \ | 107 | do { \ |
107 | pmd_t *____pmd = (__pmd); \ | 108 | pmd_t *____pmd = (__pmd); \ |
108 | if (unlikely(pmd_trans_huge(*____pmd))) \ | 109 | if (pmd_trans_huge(*____pmd) \ |
109 | __split_huge_page_pmd(__vma, __address, \ | 110 | || pmd_devmap(*____pmd)) \ |
110 | ____pmd); \ | 111 | __split_huge_pmd(__vma, __pmd, __address); \ |
111 | } while (0) | 112 | } while (0) |
112 | #define wait_split_huge_page(__anon_vma, __pmd) \ | 113 | |
113 | do { \ | ||
114 | pmd_t *____pmd = (__pmd); \ | ||
115 | anon_vma_lock_write(__anon_vma); \ | ||
116 | anon_vma_unlock_write(__anon_vma); \ | ||
117 | BUG_ON(pmd_trans_splitting(*____pmd) || \ | ||
118 | pmd_trans_huge(*____pmd)); \ | ||
119 | } while (0) | ||
120 | extern void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | ||
121 | pmd_t *pmd); | ||
122 | #if HPAGE_PMD_ORDER >= MAX_ORDER | 114 | #if HPAGE_PMD_ORDER >= MAX_ORDER |
123 | #error "hugepages can't be allocated by the buddy allocator" | 115 | #error "hugepages can't be allocated by the buddy allocator" |
124 | #endif | 116 | #endif |
@@ -128,17 +120,17 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
128 | unsigned long start, | 120 | unsigned long start, |
129 | unsigned long end, | 121 | unsigned long end, |
130 | long adjust_next); | 122 | long adjust_next); |
131 | extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, | 123 | extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, |
132 | spinlock_t **ptl); | 124 | spinlock_t **ptl); |
133 | /* mmap_sem must be held on entry */ | 125 | /* mmap_sem must be held on entry */ |
134 | static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, | 126 | static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, |
135 | spinlock_t **ptl) | 127 | spinlock_t **ptl) |
136 | { | 128 | { |
137 | VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); | 129 | VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); |
138 | if (pmd_trans_huge(*pmd)) | 130 | if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) |
139 | return __pmd_trans_huge_lock(pmd, vma, ptl); | 131 | return __pmd_trans_huge_lock(pmd, vma, ptl); |
140 | else | 132 | else |
141 | return 0; | 133 | return false; |
142 | } | 134 | } |
143 | static inline int hpage_nr_pages(struct page *page) | 135 | static inline int hpage_nr_pages(struct page *page) |
144 | { | 136 | { |
@@ -183,11 +175,8 @@ static inline int split_huge_page(struct page *page) | |||
183 | { | 175 | { |
184 | return 0; | 176 | return 0; |
185 | } | 177 | } |
186 | #define split_huge_page_pmd(__vma, __address, __pmd) \ | 178 | static inline void deferred_split_huge_page(struct page *page) {} |
187 | do { } while (0) | 179 | #define split_huge_pmd(__vma, __pmd, __address) \ |
188 | #define wait_split_huge_page(__anon_vma, __pmd) \ | ||
189 | do { } while (0) | ||
190 | #define split_huge_page_pmd_mm(__mm, __address, __pmd) \ | ||
191 | do { } while (0) | 180 | do { } while (0) |
192 | static inline int hugepage_madvise(struct vm_area_struct *vma, | 181 | static inline int hugepage_madvise(struct vm_area_struct *vma, |
193 | unsigned long *vm_flags, int advice) | 182 | unsigned long *vm_flags, int advice) |
@@ -201,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
201 | long adjust_next) | 190 | long adjust_next) |
202 | { | 191 | { |
203 | } | 192 | } |
204 | static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, | 193 | static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, |
205 | spinlock_t **ptl) | 194 | spinlock_t **ptl) |
206 | { | 195 | { |
207 | return 0; | 196 | return false; |
208 | } | 197 | } |
209 | 198 | ||
210 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | 199 | static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -218,6 +207,12 @@ static inline bool is_huge_zero_page(struct page *page) | |||
218 | return false; | 207 | return false; |
219 | } | 208 | } |
220 | 209 | ||
210 | |||
211 | static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, | ||
212 | unsigned long addr, pmd_t *pmd, int flags) | ||
213 | { | ||
214 | return NULL; | ||
215 | } | ||
221 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | 216 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
222 | 217 | ||
223 | #endif /* _LINUX_HUGE_MM_H */ | 218 | #endif /* _LINUX_HUGE_MM_H */ |
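A short illustration of the new bool convention for pmd_trans_huge_lock() shown above (the walker fragment itself is hypothetical, not taken from the patch):

static int example_walk_pmd(pmd_t *pmd, struct vm_area_struct *vma)
{
	spinlock_t *ptl;

	if (pmd_trans_huge_lock(pmd, vma, &ptl)) {
		/* huge or devmap pmd: operate on it as a whole under ptl */
		spin_unlock(ptl);
		return 1;
	}
	/* not huge: fall back to per-pte processing */
	return 0;
}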
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e76574d8f9b5..7d953c2542a8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/cgroup.h> | 8 | #include <linux/cgroup.h> |
9 | #include <linux/list.h> | 9 | #include <linux/list.h> |
10 | #include <linux/kref.h> | 10 | #include <linux/kref.h> |
11 | #include <asm/pgtable.h> | ||
11 | 12 | ||
12 | struct ctl_table; | 13 | struct ctl_table; |
13 | struct user_struct; | 14 | struct user_struct; |
diff --git a/include/linux/io.h b/include/linux/io.h index de64c1e53612..fffd88d7f426 100644 --- a/include/linux/io.h +++ b/include/linux/io.h | |||
@@ -89,21 +89,6 @@ void devm_memunmap(struct device *dev, void *addr); | |||
89 | 89 | ||
90 | void *__devm_memremap_pages(struct device *dev, struct resource *res); | 90 | void *__devm_memremap_pages(struct device *dev, struct resource *res); |
91 | 91 | ||
92 | #ifdef CONFIG_ZONE_DEVICE | ||
93 | void *devm_memremap_pages(struct device *dev, struct resource *res); | ||
94 | #else | ||
95 | static inline void *devm_memremap_pages(struct device *dev, struct resource *res) | ||
96 | { | ||
97 | /* | ||
98 | * Fail attempts to call devm_memremap_pages() without | ||
99 | * ZONE_DEVICE support enabled, this requires callers to fall | ||
100 | * back to plain devm_memremap() based on config | ||
101 | */ | ||
102 | WARN_ON_ONCE(1); | ||
103 | return ERR_PTR(-ENXIO); | ||
104 | } | ||
105 | #endif | ||
106 | |||
107 | /* | 92 | /* |
108 | * Some systems do not have legacy ISA devices. | 93 | * Some systems do not have legacy ISA devices. |
109 | * /dev/port is not a valid interface on these systems. | 94 | * /dev/port is not a valid interface on these systems. |
diff --git a/include/linux/kdev_t.h b/include/linux/kdev_t.h index 052c7b32cc91..8e9e288b08c1 100644 --- a/include/linux/kdev_t.h +++ b/include/linux/kdev_t.h | |||
@@ -35,11 +35,6 @@ static inline dev_t old_decode_dev(u16 val) | |||
35 | return MKDEV((val >> 8) & 255, val & 255); | 35 | return MKDEV((val >> 8) & 255, val & 255); |
36 | } | 36 | } |
37 | 37 | ||
38 | static inline bool new_valid_dev(dev_t dev) | ||
39 | { | ||
40 | return 1; | ||
41 | } | ||
42 | |||
43 | static inline u32 new_encode_dev(dev_t dev) | 38 | static inline u32 new_encode_dev(dev_t dev) |
44 | { | 39 | { |
45 | unsigned major = MAJOR(dev); | 40 | unsigned major = MAJOR(dev); |
diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7311c3294e25..f31638c6e873 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h | |||
@@ -202,26 +202,26 @@ extern int _cond_resched(void); | |||
202 | 202 | ||
203 | /** | 203 | /** |
204 | * abs - return absolute value of an argument | 204 | * abs - return absolute value of an argument |
205 | * @x: the value. If it is unsigned type, it is converted to signed type first | 205 | * @x: the value. If it is unsigned type, it is converted to signed type first. |
206 | * (s64, long or int depending on its size). | 206 | * char is treated as if it was signed (regardless of whether it really is) |
207 | * but the macro's return type is preserved as char. | ||
207 | * | 208 | * |
208 | * Return: an absolute value of x. If x is 64-bit, macro's return type is s64, | 209 | * Return: an absolute value of x. |
209 | * otherwise it is signed long. | ||
210 | */ | 210 | */ |
211 | #define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(s64), ({ \ | 211 | #define abs(x) __abs_choose_expr(x, long long, \ |
212 | s64 __x = (x); \ | 212 | __abs_choose_expr(x, long, \ |
213 | (__x < 0) ? -__x : __x; \ | 213 | __abs_choose_expr(x, int, \ |
214 | }), ({ \ | 214 | __abs_choose_expr(x, short, \ |
215 | long ret; \ | 215 | __abs_choose_expr(x, char, \ |
216 | if (sizeof(x) == sizeof(long)) { \ | 216 | __builtin_choose_expr( \ |
217 | long __x = (x); \ | 217 | __builtin_types_compatible_p(typeof(x), char), \ |
218 | ret = (__x < 0) ? -__x : __x; \ | 218 | (char)({ signed char __x = (x); __x<0?-__x:__x; }), \ |
219 | } else { \ | 219 | ((void)0))))))) |
220 | int __x = (x); \ | 220 | |
221 | ret = (__x < 0) ? -__x : __x; \ | 221 | #define __abs_choose_expr(x, type, other) __builtin_choose_expr( \ |
222 | } \ | 222 | __builtin_types_compatible_p(typeof(x), signed type) || \ |
223 | ret; \ | 223 | __builtin_types_compatible_p(typeof(x), unsigned type), \ |
224 | })) | 224 | ({ signed type __x = (x); __x < 0 ? -__x : __x; }), other) |
225 | 225 | ||
226 | /** | 226 | /** |
227 | * reciprocal_scale - "scale" a value into range [0, ep_ro) | 227 | * reciprocal_scale - "scale" a value into range [0, ep_ro) |
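The reworked abs() above dispatches on the argument's type rather than on its size. A standalone illustration of that dispatch pattern in plain GNU C (this is not kernel code; it only mirrors the __builtin_types_compatible_p()/__builtin_choose_expr() technique with two types instead of five):

#include <stdio.h>

#define abs_int_or_long(x)						\
	__builtin_choose_expr(						\
		__builtin_types_compatible_p(typeof(x), long),		\
		({ long __l = (x); __l < 0 ? -__l : __l; }),		\
		({ int __i = (x); __i < 0 ? -__i : __i; }))

int main(void)
{
	/* selects the int branch and the long branch respectively */
	printf("%d %ld\n", abs_int_or_long(-3), abs_int_or_long(-4L));
	return 0;
}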
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f707f74055c3..861f690aa791 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h | |||
@@ -66,7 +66,7 @@ | |||
66 | * error pfns indicate that the gfn is in slot but failed to | 66 | * error pfns indicate that the gfn is in slot but failed to |
67 | * translate it to pfn on host. | 67 | * translate it to pfn on host. |
68 | */ | 68 | */ |
69 | static inline bool is_error_pfn(pfn_t pfn) | 69 | static inline bool is_error_pfn(kvm_pfn_t pfn) |
70 | { | 70 | { |
71 | return !!(pfn & KVM_PFN_ERR_MASK); | 71 | return !!(pfn & KVM_PFN_ERR_MASK); |
72 | } | 72 | } |
@@ -76,13 +76,13 @@ static inline bool is_error_pfn(pfn_t pfn) | |||
76 | * translated to pfn - it is not in slot or failed to | 76 | * translated to pfn - it is not in slot or failed to |
77 | * translate it to pfn. | 77 | * translate it to pfn. |
78 | */ | 78 | */ |
79 | static inline bool is_error_noslot_pfn(pfn_t pfn) | 79 | static inline bool is_error_noslot_pfn(kvm_pfn_t pfn) |
80 | { | 80 | { |
81 | return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); | 81 | return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); |
82 | } | 82 | } |
83 | 83 | ||
84 | /* noslot pfn indicates that the gfn is not in slot. */ | 84 | /* noslot pfn indicates that the gfn is not in slot. */ |
85 | static inline bool is_noslot_pfn(pfn_t pfn) | 85 | static inline bool is_noslot_pfn(kvm_pfn_t pfn) |
86 | { | 86 | { |
87 | return pfn == KVM_PFN_NOSLOT; | 87 | return pfn == KVM_PFN_NOSLOT; |
88 | } | 88 | } |
@@ -591,19 +591,20 @@ void kvm_release_page_clean(struct page *page); | |||
591 | void kvm_release_page_dirty(struct page *page); | 591 | void kvm_release_page_dirty(struct page *page); |
592 | void kvm_set_page_accessed(struct page *page); | 592 | void kvm_set_page_accessed(struct page *page); |
593 | 593 | ||
594 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); | 594 | kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn); |
595 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); | 595 | kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn); |
596 | pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | 596 | kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, |
597 | bool *writable); | 597 | bool *writable); |
598 | pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); | 598 | kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn); |
599 | pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); | 599 | kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn); |
600 | pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, | 600 | kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, |
601 | bool *async, bool write_fault, bool *writable); | 601 | bool atomic, bool *async, bool write_fault, |
602 | bool *writable); | ||
602 | 603 | ||
603 | void kvm_release_pfn_clean(pfn_t pfn); | 604 | void kvm_release_pfn_clean(kvm_pfn_t pfn); |
604 | void kvm_set_pfn_dirty(pfn_t pfn); | 605 | void kvm_set_pfn_dirty(kvm_pfn_t pfn); |
605 | void kvm_set_pfn_accessed(pfn_t pfn); | 606 | void kvm_set_pfn_accessed(kvm_pfn_t pfn); |
606 | void kvm_get_pfn(pfn_t pfn); | 607 | void kvm_get_pfn(kvm_pfn_t pfn); |
607 | 608 | ||
608 | int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, | 609 | int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, |
609 | int len); | 610 | int len); |
@@ -629,8 +630,8 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn); | |||
629 | 630 | ||
630 | struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); | 631 | struct kvm_memslots *kvm_vcpu_memslots(struct kvm_vcpu *vcpu); |
631 | struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); | 632 | struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn); |
632 | pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); | 633 | kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn); |
633 | pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); | 634 | kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn); |
634 | struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); | 635 | struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn); |
635 | unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); | 636 | unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn); |
636 | unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); | 637 | unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable); |
@@ -811,7 +812,7 @@ void kvm_arch_sync_events(struct kvm *kvm); | |||
811 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); | 812 | int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); |
812 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); | 813 | void kvm_vcpu_kick(struct kvm_vcpu *vcpu); |
813 | 814 | ||
814 | bool kvm_is_reserved_pfn(pfn_t pfn); | 815 | bool kvm_is_reserved_pfn(kvm_pfn_t pfn); |
815 | 816 | ||
816 | struct kvm_irq_ack_notifier { | 817 | struct kvm_irq_ack_notifier { |
817 | struct hlist_node link; | 818 | struct hlist_node link; |
@@ -965,7 +966,7 @@ static inline gfn_t gpa_to_gfn(gpa_t gpa) | |||
965 | return (gfn_t)(gpa >> PAGE_SHIFT); | 966 | return (gfn_t)(gpa >> PAGE_SHIFT); |
966 | } | 967 | } |
967 | 968 | ||
968 | static inline hpa_t pfn_to_hpa(pfn_t pfn) | 969 | static inline hpa_t pfn_to_hpa(kvm_pfn_t pfn) |
969 | { | 970 | { |
970 | return (hpa_t)pfn << PAGE_SHIFT; | 971 | return (hpa_t)pfn << PAGE_SHIFT; |
971 | } | 972 | } |
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 1b47a185c2f0..8bf259dae9f6 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h | |||
@@ -53,7 +53,7 @@ typedef unsigned long hva_t; | |||
53 | typedef u64 hpa_t; | 53 | typedef u64 hpa_t; |
54 | typedef u64 hfn_t; | 54 | typedef u64 hfn_t; |
55 | 55 | ||
56 | typedef hfn_t pfn_t; | 56 | typedef hfn_t kvm_pfn_t; |
57 | 57 | ||
58 | struct gfn_to_hva_cache { | 58 | struct gfn_to_hva_cache { |
59 | u64 generation; | 59 | u64 generation; |
diff --git a/include/linux/list.h b/include/linux/list.h index 5356f4d661a7..30cf4200ab40 100644 --- a/include/linux/list.h +++ b/include/linux/list.h | |||
@@ -113,6 +113,17 @@ extern void __list_del_entry(struct list_head *entry); | |||
113 | extern void list_del(struct list_head *entry); | 113 | extern void list_del(struct list_head *entry); |
114 | #endif | 114 | #endif |
115 | 115 | ||
116 | #ifdef CONFIG_DEBUG_LIST | ||
117 | /* | ||
118 | * See devm_memremap_pages() which wants DEBUG_LIST=y to assert if one | ||
119 | * of the pages it allocates is ever passed to list_add() | ||
120 | */ | ||
121 | extern void list_force_poison(struct list_head *entry); | ||
122 | #else | ||
123 | /* fallback to the less strict LIST_POISON* definitions */ | ||
124 | #define list_force_poison list_del | ||
125 | #endif | ||
126 | |||
116 | /** | 127 | /** |
117 | * list_replace - replace old entry by new one | 128 | * list_replace - replace old entry by new one |
118 | * @old : the element to be replaced | 129 | * @old : the element to be replaced |
diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 173fb44e22f1..3106ac1c895e 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h | |||
@@ -61,6 +61,14 @@ extern int memblock_debug; | |||
61 | extern bool movable_node_enabled; | 61 | extern bool movable_node_enabled; |
62 | #endif /* CONFIG_MOVABLE_NODE */ | 62 | #endif /* CONFIG_MOVABLE_NODE */ |
63 | 63 | ||
64 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK | ||
65 | #define __init_memblock __meminit | ||
66 | #define __initdata_memblock __meminitdata | ||
67 | #else | ||
68 | #define __init_memblock | ||
69 | #define __initdata_memblock | ||
70 | #endif | ||
71 | |||
64 | #define memblock_dbg(fmt, ...) \ | 72 | #define memblock_dbg(fmt, ...) \ |
65 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) | 73 | if (memblock_debug) printk(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__) |
66 | 74 | ||
@@ -166,7 +174,7 @@ static inline bool memblock_is_hotpluggable(struct memblock_region *m) | |||
166 | return m->flags & MEMBLOCK_HOTPLUG; | 174 | return m->flags & MEMBLOCK_HOTPLUG; |
167 | } | 175 | } |
168 | 176 | ||
169 | static inline bool movable_node_is_enabled(void) | 177 | static inline bool __init_memblock movable_node_is_enabled(void) |
170 | { | 178 | { |
171 | return movable_node_enabled; | 179 | return movable_node_enabled; |
172 | } | 180 | } |
@@ -405,14 +413,6 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo | |||
405 | for (idx = 0; idx < memblock_type->cnt; \ | 413 | for (idx = 0; idx < memblock_type->cnt; \ |
406 | idx++,rgn = &memblock_type->regions[idx]) | 414 | idx++,rgn = &memblock_type->regions[idx]) |
407 | 415 | ||
408 | #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK | ||
409 | #define __init_memblock __meminit | ||
410 | #define __initdata_memblock __meminitdata | ||
411 | #else | ||
412 | #define __init_memblock | ||
413 | #define __initdata_memblock | ||
414 | #endif | ||
415 | |||
416 | #ifdef CONFIG_MEMTEST | 416 | #ifdef CONFIG_MEMTEST |
417 | extern void early_memtest(phys_addr_t start, phys_addr_t end); | 417 | extern void early_memtest(phys_addr_t start, phys_addr_t end); |
418 | #else | 418 | #else |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2292468f2a30..189f04d4d2ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -280,10 +280,12 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg, | |||
280 | bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); | 280 | bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); |
281 | 281 | ||
282 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 282 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, |
283 | gfp_t gfp_mask, struct mem_cgroup **memcgp); | 283 | gfp_t gfp_mask, struct mem_cgroup **memcgp, |
284 | bool compound); | ||
284 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | 285 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, |
285 | bool lrucare); | 286 | bool lrucare, bool compound); |
286 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); | 287 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, |
288 | bool compound); | ||
287 | void mem_cgroup_uncharge(struct page *page); | 289 | void mem_cgroup_uncharge(struct page *page); |
288 | void mem_cgroup_uncharge_list(struct list_head *page_list); | 290 | void mem_cgroup_uncharge_list(struct list_head *page_list); |
289 | 291 | ||
@@ -515,7 +517,8 @@ static inline bool mem_cgroup_low(struct mem_cgroup *root, | |||
515 | 517 | ||
516 | static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 518 | static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, |
517 | gfp_t gfp_mask, | 519 | gfp_t gfp_mask, |
518 | struct mem_cgroup **memcgp) | 520 | struct mem_cgroup **memcgp, |
521 | bool compound) | ||
519 | { | 522 | { |
520 | *memcgp = NULL; | 523 | *memcgp = NULL; |
521 | return 0; | 524 | return 0; |
@@ -523,12 +526,13 @@ static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
523 | 526 | ||
524 | static inline void mem_cgroup_commit_charge(struct page *page, | 527 | static inline void mem_cgroup_commit_charge(struct page *page, |
525 | struct mem_cgroup *memcg, | 528 | struct mem_cgroup *memcg, |
526 | bool lrucare) | 529 | bool lrucare, bool compound) |
527 | { | 530 | { |
528 | } | 531 | } |
529 | 532 | ||
530 | static inline void mem_cgroup_cancel_charge(struct page *page, | 533 | static inline void mem_cgroup_cancel_charge(struct page *page, |
531 | struct mem_cgroup *memcg) | 534 | struct mem_cgroup *memcg, |
535 | bool compound) | ||
532 | { | 536 | { |
533 | } | 537 | } |
534 | 538 | ||
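The charge API keeps its try/commit/cancel shape; the new compound argument simply rides along at each step (false for ordinary pages, true for a THP-sized charge). A hypothetical caller, sketched for orientation only:

static int example_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp, bool compound)
{
	struct mem_cgroup *memcg;
	int err;

	err = mem_cgroup_try_charge(page, mm, gfp, &memcg, compound);
	if (err)
		return err;
	/*
	 * ... map the page, etc.; if anything fails here, drop the charge
	 * with mem_cgroup_cancel_charge(page, memcg, compound) ...
	 */
	mem_cgroup_commit_charge(page, memcg, false /* lrucare */, compound);
	return 0;
}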
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2ea574ff9714..43405992d027 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h | |||
@@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); | |||
275 | extern bool is_memblock_offlined(struct memory_block *mem); | 275 | extern bool is_memblock_offlined(struct memory_block *mem); |
276 | extern void remove_memory(int nid, u64 start, u64 size); | 276 | extern void remove_memory(int nid, u64 start, u64 size); |
277 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); | 277 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); |
278 | extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); | 278 | extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, |
279 | unsigned long map_offset); | ||
279 | extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, | 280 | extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, |
280 | unsigned long pnum); | 281 | unsigned long pnum); |
281 | 282 | ||
diff --git a/include/linux/memremap.h b/include/linux/memremap.h new file mode 100644 index 000000000000..bcaa634139a9 --- /dev/null +++ b/include/linux/memremap.h | |||
@@ -0,0 +1,114 @@ | |||
1 | #ifndef _LINUX_MEMREMAP_H_ | ||
2 | #define _LINUX_MEMREMAP_H_ | ||
3 | #include <linux/mm.h> | ||
4 | #include <linux/ioport.h> | ||
5 | #include <linux/percpu-refcount.h> | ||
6 | |||
7 | struct resource; | ||
8 | struct device; | ||
9 | |||
10 | /** | ||
11 | * struct vmem_altmap - pre-allocated storage for vmemmap_populate | ||
12 | * @base_pfn: base of the entire dev_pagemap mapping | ||
13 | * @reserve: pages mapped, but reserved for driver use (relative to @base) | ||
14 | * @free: free pages set aside in the mapping for memmap storage | ||
15 | * @align: pages reserved to meet allocation alignments | ||
16 | * @alloc: track pages consumed, private to vmemmap_populate() | ||
17 | */ | ||
18 | struct vmem_altmap { | ||
19 | const unsigned long base_pfn; | ||
20 | const unsigned long reserve; | ||
21 | unsigned long free; | ||
22 | unsigned long align; | ||
23 | unsigned long alloc; | ||
24 | }; | ||
25 | |||
26 | unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); | ||
27 | void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); | ||
28 | |||
29 | #if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) | ||
30 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); | ||
31 | #else | ||
32 | static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | ||
33 | { | ||
34 | return NULL; | ||
35 | } | ||
36 | #endif | ||
37 | |||
38 | /** | ||
39 | * struct dev_pagemap - metadata for ZONE_DEVICE mappings | ||
40 | * @altmap: pre-allocated/reserved memory for vmemmap allocations | ||
41 | * @res: physical address range covered by @ref | ||
42 | * @ref: reference count that pins the devm_memremap_pages() mapping | ||
43 | * @dev: host device of the mapping for debug | ||
44 | */ | ||
45 | struct dev_pagemap { | ||
46 | struct vmem_altmap *altmap; | ||
47 | const struct resource *res; | ||
48 | struct percpu_ref *ref; | ||
49 | struct device *dev; | ||
50 | }; | ||
51 | |||
52 | #ifdef CONFIG_ZONE_DEVICE | ||
53 | void *devm_memremap_pages(struct device *dev, struct resource *res, | ||
54 | struct percpu_ref *ref, struct vmem_altmap *altmap); | ||
55 | struct dev_pagemap *find_dev_pagemap(resource_size_t phys); | ||
56 | #else | ||
57 | static inline void *devm_memremap_pages(struct device *dev, | ||
58 | struct resource *res, struct percpu_ref *ref, | ||
59 | struct vmem_altmap *altmap) | ||
60 | { | ||
61 | /* | ||
62 | * Fail attempts to call devm_memremap_pages() without | ||
63 | * ZONE_DEVICE support enabled, this requires callers to fall | ||
64 | * back to plain devm_memremap() based on config | ||
65 | */ | ||
66 | WARN_ON_ONCE(1); | ||
67 | return ERR_PTR(-ENXIO); | ||
68 | } | ||
69 | |||
70 | static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) | ||
71 | { | ||
72 | return NULL; | ||
73 | } | ||
74 | #endif | ||
75 | |||
76 | /** | ||
77 | * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn | ||
78 | * @pfn: page frame number to lookup page_map | ||
79 | * @pgmap: optional known pgmap that already has a reference | ||
80 | * | ||
81 | * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the | ||
82 | * same mapping. | ||
83 | */ | ||
84 | static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, | ||
85 | struct dev_pagemap *pgmap) | ||
86 | { | ||
87 | const struct resource *res = pgmap ? pgmap->res : NULL; | ||
88 | resource_size_t phys = PFN_PHYS(pfn); | ||
89 | |||
90 | /* | ||
91 | * In the cached case we're already holding a live reference so | ||
92 | * we can simply do a blind increment | ||
93 | */ | ||
94 | if (res && phys >= res->start && phys <= res->end) { | ||
95 | percpu_ref_get(pgmap->ref); | ||
96 | return pgmap; | ||
97 | } | ||
98 | |||
99 | /* fall back to slow path lookup */ | ||
100 | rcu_read_lock(); | ||
101 | pgmap = find_dev_pagemap(phys); | ||
102 | if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) | ||
103 | pgmap = NULL; | ||
104 | rcu_read_unlock(); | ||
105 | |||
106 | return pgmap; | ||
107 | } | ||
108 | |||
109 | static inline void put_dev_pagemap(struct dev_pagemap *pgmap) | ||
110 | { | ||
111 | if (pgmap) | ||
112 | percpu_ref_put(pgmap->ref); | ||
113 | } | ||
114 | #endif /* _LINUX_MEMREMAP_H_ */ | ||
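Usage sketch for the reference-counting helpers above (the function is hypothetical): each successful get_dev_pagemap() pins the mapping and must be paired with put_dev_pagemap(); per the kerneldoc, a previously returned pgmap can be passed back in so that repeated lookups within the same mapping skip the RCU walk.

static bool example_pfn_has_pagemap(unsigned long pfn)
{
	struct dev_pagemap *pgmap = get_dev_pagemap(pfn, NULL);

	if (!pgmap)
		return false;		/* pfn is not ZONE_DEVICE backed */
	/* mapping is pinned here; pgmap->res and pgmap->dev are usable */
	put_dev_pagemap(pgmap);
	return true;
}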
diff --git a/include/linux/mm.h b/include/linux/mm.h index 839d9e9a1c38..f1cd22f2df1a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/mm_types.h> | 16 | #include <linux/mm_types.h> |
17 | #include <linux/range.h> | 17 | #include <linux/range.h> |
18 | #include <linux/pfn.h> | 18 | #include <linux/pfn.h> |
19 | #include <linux/percpu-refcount.h> | ||
19 | #include <linux/bit_spinlock.h> | 20 | #include <linux/bit_spinlock.h> |
20 | #include <linux/shrinker.h> | 21 | #include <linux/shrinker.h> |
21 | #include <linux/resource.h> | 22 | #include <linux/resource.h> |
@@ -329,6 +330,13 @@ struct inode; | |||
329 | #define page_private(page) ((page)->private) | 330 | #define page_private(page) ((page)->private) |
330 | #define set_page_private(page, v) ((page)->private = (v)) | 331 | #define set_page_private(page, v) ((page)->private = (v)) |
331 | 332 | ||
333 | #if !defined(__HAVE_ARCH_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) | ||
334 | static inline int pmd_devmap(pmd_t pmd) | ||
335 | { | ||
336 | return 0; | ||
337 | } | ||
338 | #endif | ||
339 | |||
332 | /* | 340 | /* |
333 | * FIXME: take this include out, include page-flags.h in | 341 | * FIXME: take this include out, include page-flags.h in |
334 | * files which need it (119 of them) | 342 | * files which need it (119 of them) |
@@ -410,39 +418,17 @@ static inline int is_vmalloc_or_module_addr(const void *x) | |||
410 | 418 | ||
411 | extern void kvfree(const void *addr); | 419 | extern void kvfree(const void *addr); |
412 | 420 | ||
413 | static inline void compound_lock(struct page *page) | 421 | static inline atomic_t *compound_mapcount_ptr(struct page *page) |
414 | { | 422 | { |
415 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 423 | return &page[1].compound_mapcount; |
416 | VM_BUG_ON_PAGE(PageSlab(page), page); | ||
417 | bit_spin_lock(PG_compound_lock, &page->flags); | ||
418 | #endif | ||
419 | } | 424 | } |
420 | 425 | ||
421 | static inline void compound_unlock(struct page *page) | 426 | static inline int compound_mapcount(struct page *page) |
422 | { | 427 | { |
423 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 428 | if (!PageCompound(page)) |
424 | VM_BUG_ON_PAGE(PageSlab(page), page); | 429 | return 0; |
425 | bit_spin_unlock(PG_compound_lock, &page->flags); | 430 | page = compound_head(page); |
426 | #endif | 431 | return atomic_read(compound_mapcount_ptr(page)) + 1; |
427 | } | ||
428 | |||
429 | static inline unsigned long compound_lock_irqsave(struct page *page) | ||
430 | { | ||
431 | unsigned long uninitialized_var(flags); | ||
432 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
433 | local_irq_save(flags); | ||
434 | compound_lock(page); | ||
435 | #endif | ||
436 | return flags; | ||
437 | } | ||
438 | |||
439 | static inline void compound_unlock_irqrestore(struct page *page, | ||
440 | unsigned long flags) | ||
441 | { | ||
442 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
443 | compound_unlock(page); | ||
444 | local_irq_restore(flags); | ||
445 | #endif | ||
446 | } | 432 | } |
447 | 433 | ||
448 | /* | 434 | /* |
@@ -455,61 +441,29 @@ static inline void page_mapcount_reset(struct page *page) | |||
455 | atomic_set(&(page)->_mapcount, -1); | 441 | atomic_set(&(page)->_mapcount, -1); |
456 | } | 442 | } |
457 | 443 | ||
444 | int __page_mapcount(struct page *page); | ||
445 | |||
458 | static inline int page_mapcount(struct page *page) | 446 | static inline int page_mapcount(struct page *page) |
459 | { | 447 | { |
460 | VM_BUG_ON_PAGE(PageSlab(page), page); | 448 | VM_BUG_ON_PAGE(PageSlab(page), page); |
461 | return atomic_read(&page->_mapcount) + 1; | ||
462 | } | ||
463 | 449 | ||
464 | static inline int page_count(struct page *page) | 450 | if (unlikely(PageCompound(page))) |
465 | { | 451 | return __page_mapcount(page); |
466 | return atomic_read(&compound_head(page)->_count); | 452 | return atomic_read(&page->_mapcount) + 1; |
467 | } | ||
468 | |||
469 | static inline bool __compound_tail_refcounted(struct page *page) | ||
470 | { | ||
471 | return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); | ||
472 | } | ||
473 | |||
474 | /* | ||
475 | * This takes a head page as parameter and tells if the | ||
476 | * tail page reference counting can be skipped. | ||
477 | * | ||
478 | * For this to be safe, PageSlab and PageHeadHuge must remain true on | ||
479 | * any given page where they return true here, until all tail pins | ||
480 | * have been released. | ||
481 | */ | ||
482 | static inline bool compound_tail_refcounted(struct page *page) | ||
483 | { | ||
484 | VM_BUG_ON_PAGE(!PageHead(page), page); | ||
485 | return __compound_tail_refcounted(page); | ||
486 | } | 453 | } |
487 | 454 | ||
488 | static inline void get_huge_page_tail(struct page *page) | 455 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
456 | int total_mapcount(struct page *page); | ||
457 | #else | ||
458 | static inline int total_mapcount(struct page *page) | ||
489 | { | 459 | { |
490 | /* | 460 | return page_mapcount(page); |
491 | * __split_huge_page_refcount() cannot run from under us. | ||
492 | */ | ||
493 | VM_BUG_ON_PAGE(!PageTail(page), page); | ||
494 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); | ||
495 | VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); | ||
496 | if (compound_tail_refcounted(compound_head(page))) | ||
497 | atomic_inc(&page->_mapcount); | ||
498 | } | 461 | } |
462 | #endif | ||
499 | 463 | ||
500 | extern bool __get_page_tail(struct page *page); | 464 | static inline int page_count(struct page *page) |
501 | |||
502 | static inline void get_page(struct page *page) | ||
503 | { | 465 | { |
504 | if (unlikely(PageTail(page))) | 466 | return atomic_read(&compound_head(page)->_count); |
505 | if (likely(__get_page_tail(page))) | ||
506 | return; | ||
507 | /* | ||
508 | * Getting a normal page or the head of a compound page | ||
509 | * requires to already have an elevated page->_count. | ||
510 | */ | ||
511 | VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); | ||
512 | atomic_inc(&page->_count); | ||
513 | } | 467 | } |
514 | 468 | ||
515 | static inline struct page *virt_to_head_page(const void *x) | 469 | static inline struct page *virt_to_head_page(const void *x) |
@@ -528,7 +482,8 @@ static inline void init_page_count(struct page *page) | |||
528 | atomic_set(&page->_count, 1); | 482 | atomic_set(&page->_count, 1); |
529 | } | 483 | } |
530 | 484 | ||
531 | void put_page(struct page *page); | 485 | void __put_page(struct page *page); |
486 | |||
532 | void put_pages_list(struct list_head *pages); | 487 | void put_pages_list(struct list_head *pages); |
533 | 488 | ||
534 | void split_page(struct page *page, unsigned int order); | 489 | void split_page(struct page *page, unsigned int order); |
@@ -548,6 +503,9 @@ enum compound_dtor_id { | |||
548 | #ifdef CONFIG_HUGETLB_PAGE | 503 | #ifdef CONFIG_HUGETLB_PAGE |
549 | HUGETLB_PAGE_DTOR, | 504 | HUGETLB_PAGE_DTOR, |
550 | #endif | 505 | #endif |
506 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
507 | TRANSHUGE_PAGE_DTOR, | ||
508 | #endif | ||
551 | NR_COMPOUND_DTORS, | 509 | NR_COMPOUND_DTORS, |
552 | }; | 510 | }; |
553 | extern compound_page_dtor * const compound_page_dtors[]; | 511 | extern compound_page_dtor * const compound_page_dtors[]; |
@@ -577,6 +535,8 @@ static inline void set_compound_order(struct page *page, unsigned int order) | |||
577 | page[1].compound_order = order; | 535 | page[1].compound_order = order; |
578 | } | 536 | } |
579 | 537 | ||
538 | void free_compound_page(struct page *page); | ||
539 | |||
580 | #ifdef CONFIG_MMU | 540 | #ifdef CONFIG_MMU |
581 | /* | 541 | /* |
582 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when | 542 | * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when |
@@ -704,6 +664,51 @@ static inline enum zone_type page_zonenum(const struct page *page) | |||
704 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; | 664 | return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; |
705 | } | 665 | } |
706 | 666 | ||
667 | #ifdef CONFIG_ZONE_DEVICE | ||
668 | void get_zone_device_page(struct page *page); | ||
669 | void put_zone_device_page(struct page *page); | ||
670 | static inline bool is_zone_device_page(const struct page *page) | ||
671 | { | ||
672 | return page_zonenum(page) == ZONE_DEVICE; | ||
673 | } | ||
674 | #else | ||
675 | static inline void get_zone_device_page(struct page *page) | ||
676 | { | ||
677 | } | ||
678 | static inline void put_zone_device_page(struct page *page) | ||
679 | { | ||
680 | } | ||
681 | static inline bool is_zone_device_page(const struct page *page) | ||
682 | { | ||
683 | return false; | ||
684 | } | ||
685 | #endif | ||
686 | |||
687 | static inline void get_page(struct page *page) | ||
688 | { | ||
689 | page = compound_head(page); | ||
690 | /* | ||
691 | * Getting a normal page or the head of a compound page | ||
692 | * requires to already have an elevated page->_count. | ||
693 | */ | ||
694 | VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); | ||
695 | atomic_inc(&page->_count); | ||
696 | |||
697 | if (unlikely(is_zone_device_page(page))) | ||
698 | get_zone_device_page(page); | ||
699 | } | ||
700 | |||
701 | static inline void put_page(struct page *page) | ||
702 | { | ||
703 | page = compound_head(page); | ||
704 | |||
705 | if (put_page_testzero(page)) | ||
706 | __put_page(page); | ||
707 | |||
708 | if (unlikely(is_zone_device_page(page))) | ||
709 | put_zone_device_page(page); | ||
710 | } | ||
711 | |||
707 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) | 712 | #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) |
708 | #define SECTION_IN_PAGE_FLAGS | 713 | #define SECTION_IN_PAGE_FLAGS |
709 | #endif | 714 | #endif |
@@ -993,10 +998,21 @@ static inline pgoff_t page_file_index(struct page *page) | |||
993 | 998 | ||
994 | /* | 999 | /* |
995 | * Return true if this page is mapped into pagetables. | 1000 | * Return true if this page is mapped into pagetables. |
1001 | * For compound page it returns true if any subpage of compound page is mapped. | ||
996 | */ | 1002 | */ |
997 | static inline int page_mapped(struct page *page) | 1003 | static inline bool page_mapped(struct page *page) |
998 | { | 1004 | { |
999 | return atomic_read(&(page)->_mapcount) >= 0; | 1005 | int i; |
1006 | if (likely(!PageCompound(page))) | ||
1007 | return atomic_read(&page->_mapcount) >= 0; | ||
1008 | page = compound_head(page); | ||
1009 | if (atomic_read(compound_mapcount_ptr(page)) >= 0) | ||
1010 | return true; | ||
1011 | for (i = 0; i < hpage_nr_pages(page); i++) { | ||
1012 | if (atomic_read(&page[i]._mapcount) >= 0) | ||
1013 | return true; | ||
1014 | } | ||
1015 | return false; | ||
1000 | } | 1016 | } |
1001 | 1017 | ||
1002 | /* | 1018 | /* |
@@ -1084,7 +1100,7 @@ static inline bool shmem_mapping(struct address_space *mapping) | |||
1084 | } | 1100 | } |
1085 | #endif | 1101 | #endif |
1086 | 1102 | ||
1087 | extern int can_do_mlock(void); | 1103 | extern bool can_do_mlock(void); |
1088 | extern int user_shm_lock(size_t, struct user_struct *); | 1104 | extern int user_shm_lock(size_t, struct user_struct *); |
1089 | extern void user_shm_unlock(size_t, struct user_struct *); | 1105 | extern void user_shm_unlock(size_t, struct user_struct *); |
1090 | 1106 | ||
@@ -1178,7 +1194,8 @@ int invalidate_inode_page(struct page *page); | |||
1178 | extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | 1194 | extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, |
1179 | unsigned long address, unsigned int flags); | 1195 | unsigned long address, unsigned int flags); |
1180 | extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | 1196 | extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, |
1181 | unsigned long address, unsigned int fault_flags); | 1197 | unsigned long address, unsigned int fault_flags, |
1198 | bool *unlocked); | ||
1182 | #else | 1199 | #else |
1183 | static inline int handle_mm_fault(struct mm_struct *mm, | 1200 | static inline int handle_mm_fault(struct mm_struct *mm, |
1184 | struct vm_area_struct *vma, unsigned long address, | 1201 | struct vm_area_struct *vma, unsigned long address, |
@@ -1190,7 +1207,7 @@ static inline int handle_mm_fault(struct mm_struct *mm, | |||
1190 | } | 1207 | } |
1191 | static inline int fixup_user_fault(struct task_struct *tsk, | 1208 | static inline int fixup_user_fault(struct task_struct *tsk, |
1192 | struct mm_struct *mm, unsigned long address, | 1209 | struct mm_struct *mm, unsigned long address, |
1193 | unsigned int fault_flags) | 1210 | unsigned int fault_flags, bool *unlocked) |
1194 | { | 1211 | { |
1195 | /* should never happen if there's no MMU */ | 1212 | /* should never happen if there's no MMU */ |
1196 | BUG(); | 1213 | BUG(); |
@@ -1444,6 +1461,13 @@ static inline void sync_mm_rss(struct mm_struct *mm) | |||
1444 | } | 1461 | } |
1445 | #endif | 1462 | #endif |
1446 | 1463 | ||
1464 | #ifndef __HAVE_ARCH_PTE_DEVMAP | ||
1465 | static inline int pte_devmap(pte_t pte) | ||
1466 | { | ||
1467 | return 0; | ||
1468 | } | ||
1469 | #endif | ||
1470 | |||
1447 | int vma_wants_writenotify(struct vm_area_struct *vma); | 1471 | int vma_wants_writenotify(struct vm_area_struct *vma); |
1448 | 1472 | ||
1449 | extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, | 1473 | extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, |
@@ -2114,7 +2138,7 @@ int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); | |||
2114 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 2138 | int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
2115 | unsigned long pfn); | 2139 | unsigned long pfn); |
2116 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 2140 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
2117 | unsigned long pfn); | 2141 | pfn_t pfn); |
2118 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); | 2142 | int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); |
2119 | 2143 | ||
2120 | 2144 | ||
@@ -2224,7 +2248,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); | |||
2224 | pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); | 2248 | pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); |
2225 | pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); | 2249 | pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); |
2226 | void *vmemmap_alloc_block(unsigned long size, int node); | 2250 | void *vmemmap_alloc_block(unsigned long size, int node); |
2227 | void *vmemmap_alloc_block_buf(unsigned long size, int node); | 2251 | struct vmem_altmap; |
2252 | void *__vmemmap_alloc_block_buf(unsigned long size, int node, | ||
2253 | struct vmem_altmap *altmap); | ||
2254 | static inline void *vmemmap_alloc_block_buf(unsigned long size, int node) | ||
2255 | { | ||
2256 | return __vmemmap_alloc_block_buf(size, node, NULL); | ||
2257 | } | ||
2258 | |||
2228 | void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); | 2259 | void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); |
2229 | int vmemmap_populate_basepages(unsigned long start, unsigned long end, | 2260 | int vmemmap_populate_basepages(unsigned long start, unsigned long end, |
2230 | int node); | 2261 | int node); |
@@ -2246,7 +2277,7 @@ extern int memory_failure(unsigned long pfn, int trapno, int flags); | |||
2246 | extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); | 2277 | extern void memory_failure_queue(unsigned long pfn, int trapno, int flags); |
2247 | extern int unpoison_memory(unsigned long pfn); | 2278 | extern int unpoison_memory(unsigned long pfn); |
2248 | extern int get_hwpoison_page(struct page *page); | 2279 | extern int get_hwpoison_page(struct page *page); |
2249 | extern void put_hwpoison_page(struct page *page); | 2280 | #define put_hwpoison_page(page) put_page(page) |
2250 | extern int sysctl_memory_failure_early_kill; | 2281 | extern int sysctl_memory_failure_early_kill; |
2251 | extern int sysctl_memory_failure_recovery; | 2282 | extern int sysctl_memory_failure_recovery; |
2252 | extern void shake_page(struct page *p, int access); | 2283 | extern void shake_page(struct page *p, int access); |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6bc9a0ce2253..d3ebb9d21a53 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -54,6 +54,8 @@ struct page { | |||
54 | * see PAGE_MAPPING_ANON below. | 54 | * see PAGE_MAPPING_ANON below. |
55 | */ | 55 | */ |
56 | void *s_mem; /* slab first object */ | 56 | void *s_mem; /* slab first object */ |
57 | atomic_t compound_mapcount; /* first tail page */ | ||
58 | /* page_deferred_list().next -- second tail page */ | ||
57 | }; | 59 | }; |
58 | 60 | ||
59 | /* Second double word */ | 61 | /* Second double word */ |
@@ -61,6 +63,7 @@ struct page { | |||
61 | union { | 63 | union { |
62 | pgoff_t index; /* Our offset within mapping. */ | 64 | pgoff_t index; /* Our offset within mapping. */ |
63 | void *freelist; /* sl[aou]b first free object */ | 65 | void *freelist; /* sl[aou]b first free object */ |
66 | /* page_deferred_list().prev -- second tail page */ | ||
64 | }; | 67 | }; |
65 | 68 | ||
66 | union { | 69 | union { |
@@ -81,20 +84,9 @@ struct page { | |||
81 | 84 | ||
82 | union { | 85 | union { |
83 | /* | 86 | /* |
84 | * Count of ptes mapped in | 87 | * Count of ptes mapped in mms, to show |
85 | * mms, to show when page is | 88 | * when page is mapped & limit reverse |
86 | * mapped & limit reverse map | 89 | * map searches. |
87 | * searches. | ||
88 | * | ||
89 | * Used also for tail pages | ||
90 | * refcounting instead of | ||
91 | * _count. Tail pages cannot | ||
92 | * be mapped and keeping the | ||
93 | * tail page _count zero at | ||
94 | * all times guarantees | ||
95 | * get_page_unless_zero() will | ||
96 | * never succeed on tail | ||
97 | * pages. | ||
98 | */ | 90 | */ |
99 | atomic_t _mapcount; | 91 | atomic_t _mapcount; |
100 | 92 | ||
@@ -124,6 +116,11 @@ struct page { | |||
124 | * Can be used as a generic list | 116 | * Can be used as a generic list |
125 | * by the page owner. | 117 | * by the page owner. |
126 | */ | 118 | */ |
119 | struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an | ||
120 | * lru or handled by a slab | ||
121 | * allocator, this points to the | ||
122 | * hosting device page map. | ||
123 | */ | ||
127 | struct { /* slub per cpu partial pages */ | 124 | struct { /* slub per cpu partial pages */ |
128 | struct page *next; /* Next partial slab */ | 125 | struct page *next; /* Next partial slab */ |
129 | #ifdef CONFIG_64BIT | 126 | #ifdef CONFIG_64BIT |
diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 772362adf471..053824b0a412 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h | |||
@@ -56,4 +56,10 @@ void dump_mm(const struct mm_struct *mm); | |||
56 | #define VIRTUAL_BUG_ON(cond) do { } while (0) | 56 | #define VIRTUAL_BUG_ON(cond) do { } while (0) |
57 | #endif | 57 | #endif |
58 | 58 | ||
59 | #ifdef CONFIG_DEBUG_VM_PGFLAGS | ||
60 | #define VM_BUG_ON_PGFLAGS(cond, page) VM_BUG_ON_PAGE(cond, page) | ||
61 | #else | ||
62 | #define VM_BUG_ON_PGFLAGS(cond, page) BUILD_BUG_ON_INVALID(cond) | ||
63 | #endif | ||
64 | |||
59 | #endif | 65 | #endif |
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index bb53c7b86315..19724e6ebd26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h | |||
@@ -101,9 +101,6 @@ enum pageflags { | |||
101 | #ifdef CONFIG_MEMORY_FAILURE | 101 | #ifdef CONFIG_MEMORY_FAILURE |
102 | PG_hwpoison, /* hardware poisoned page. Don't touch */ | 102 | PG_hwpoison, /* hardware poisoned page. Don't touch */ |
103 | #endif | 103 | #endif |
104 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
105 | PG_compound_lock, | ||
106 | #endif | ||
107 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) | 104 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) |
108 | PG_young, | 105 | PG_young, |
109 | PG_idle, | 106 | PG_idle, |
@@ -129,53 +126,104 @@ enum pageflags { | |||
129 | 126 | ||
130 | /* SLOB */ | 127 | /* SLOB */ |
131 | PG_slob_free = PG_private, | 128 | PG_slob_free = PG_private, |
129 | |||
130 | /* Compound pages. Stored in first tail page's flags */ | ||
131 | PG_double_map = PG_private_2, | ||
132 | }; | 132 | }; |
133 | 133 | ||
134 | #ifndef __GENERATING_BOUNDS_H | 134 | #ifndef __GENERATING_BOUNDS_H |
135 | 135 | ||
136 | struct page; /* forward declaration */ | ||
137 | |||
138 | static inline struct page *compound_head(struct page *page) | ||
139 | { | ||
140 | unsigned long head = READ_ONCE(page->compound_head); | ||
141 | |||
142 | if (unlikely(head & 1)) | ||
143 | return (struct page *) (head - 1); | ||
144 | return page; | ||
145 | } | ||
146 | |||
147 | static inline int PageTail(struct page *page) | ||
148 | { | ||
149 | return READ_ONCE(page->compound_head) & 1; | ||
150 | } | ||
151 | |||
152 | static inline int PageCompound(struct page *page) | ||
153 | { | ||
154 | return test_bit(PG_head, &page->flags) || PageTail(page); | ||
155 | } | ||
156 | |||
157 | /* | ||
158 | * Page flags policies wrt compound pages | ||
159 | * | ||
160 | * PF_ANY: | ||
161 | * the page flag is relevant for small, head and tail pages. | ||
162 | * | ||
163 | * PF_HEAD: | ||
164 | * for compound pages, all operations related to the page flag are | ||
165 | * applied to the head page. | ||
166 | * | ||
167 | * PF_NO_TAIL: | ||
168 | * modifications of the page flag must be done on small or head pages; | ||
169 | * checks can be done on tail pages too. | ||
170 | * | ||
171 | * PF_NO_COMPOUND: | ||
172 | * the page flag is not relevant for compound pages. | ||
173 | */ | ||
174 | #define PF_ANY(page, enforce) page | ||
175 | #define PF_HEAD(page, enforce) compound_head(page) | ||
176 | #define PF_NO_TAIL(page, enforce) ({ \ | ||
177 | VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ | ||
178 | compound_head(page);}) | ||
179 | #define PF_NO_COMPOUND(page, enforce) ({ \ | ||
180 | VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ | ||
181 | page;}) | ||
182 | |||
136 | /* | 183 | /* |
137 | * Macros to create function definitions for page flags | 184 | * Macros to create function definitions for page flags |
138 | */ | 185 | */ |
139 | #define TESTPAGEFLAG(uname, lname) \ | 186 | #define TESTPAGEFLAG(uname, lname, policy) \ |
140 | static inline int Page##uname(const struct page *page) \ | 187 | static inline int Page##uname(struct page *page) \ |
141 | { return test_bit(PG_##lname, &page->flags); } | 188 | { return test_bit(PG_##lname, &policy(page, 0)->flags); } |
142 | 189 | ||
143 | #define SETPAGEFLAG(uname, lname) \ | 190 | #define SETPAGEFLAG(uname, lname, policy) \ |
144 | static inline void SetPage##uname(struct page *page) \ | 191 | static inline void SetPage##uname(struct page *page) \ |
145 | { set_bit(PG_##lname, &page->flags); } | 192 | { set_bit(PG_##lname, &policy(page, 1)->flags); } |
146 | 193 | ||
147 | #define CLEARPAGEFLAG(uname, lname) \ | 194 | #define CLEARPAGEFLAG(uname, lname, policy) \ |
148 | static inline void ClearPage##uname(struct page *page) \ | 195 | static inline void ClearPage##uname(struct page *page) \ |
149 | { clear_bit(PG_##lname, &page->flags); } | 196 | { clear_bit(PG_##lname, &policy(page, 1)->flags); } |
150 | 197 | ||
151 | #define __SETPAGEFLAG(uname, lname) \ | 198 | #define __SETPAGEFLAG(uname, lname, policy) \ |
152 | static inline void __SetPage##uname(struct page *page) \ | 199 | static inline void __SetPage##uname(struct page *page) \ |
153 | { __set_bit(PG_##lname, &page->flags); } | 200 | { __set_bit(PG_##lname, &policy(page, 1)->flags); } |
154 | 201 | ||
155 | #define __CLEARPAGEFLAG(uname, lname) \ | 202 | #define __CLEARPAGEFLAG(uname, lname, policy) \ |
156 | static inline void __ClearPage##uname(struct page *page) \ | 203 | static inline void __ClearPage##uname(struct page *page) \ |
157 | { __clear_bit(PG_##lname, &page->flags); } | 204 | { __clear_bit(PG_##lname, &policy(page, 1)->flags); } |
158 | 205 | ||
159 | #define TESTSETFLAG(uname, lname) \ | 206 | #define TESTSETFLAG(uname, lname, policy) \ |
160 | static inline int TestSetPage##uname(struct page *page) \ | 207 | static inline int TestSetPage##uname(struct page *page) \ |
161 | { return test_and_set_bit(PG_##lname, &page->flags); } | 208 | { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } |
162 | 209 | ||
163 | #define TESTCLEARFLAG(uname, lname) \ | 210 | #define TESTCLEARFLAG(uname, lname, policy) \ |
164 | static inline int TestClearPage##uname(struct page *page) \ | 211 | static inline int TestClearPage##uname(struct page *page) \ |
165 | { return test_and_clear_bit(PG_##lname, &page->flags); } | 212 | { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } |
166 | |||
167 | #define __TESTCLEARFLAG(uname, lname) \ | ||
168 | static inline int __TestClearPage##uname(struct page *page) \ | ||
169 | { return __test_and_clear_bit(PG_##lname, &page->flags); } | ||
170 | 213 | ||
171 | #define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ | 214 | #define PAGEFLAG(uname, lname, policy) \ |
172 | SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname) | 215 | TESTPAGEFLAG(uname, lname, policy) \ |
216 | SETPAGEFLAG(uname, lname, policy) \ | ||
217 | CLEARPAGEFLAG(uname, lname, policy) | ||
173 | 218 | ||
174 | #define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ | 219 | #define __PAGEFLAG(uname, lname, policy) \ |
175 | __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) | 220 | TESTPAGEFLAG(uname, lname, policy) \ |
221 | __SETPAGEFLAG(uname, lname, policy) \ | ||
222 | __CLEARPAGEFLAG(uname, lname, policy) | ||
176 | 223 | ||
177 | #define TESTSCFLAG(uname, lname) \ | 224 | #define TESTSCFLAG(uname, lname, policy) \ |
178 | TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) | 225 | TESTSETFLAG(uname, lname, policy) \ |
226 | TESTCLEARFLAG(uname, lname, policy) | ||
179 | 227 | ||
180 | #define TESTPAGEFLAG_FALSE(uname) \ | 228 | #define TESTPAGEFLAG_FALSE(uname) \ |
181 | static inline int Page##uname(const struct page *page) { return 0; } | 229 | static inline int Page##uname(const struct page *page) { return 0; } |
@@ -195,56 +243,62 @@ static inline int TestSetPage##uname(struct page *page) { return 0; } | |||
195 | #define TESTCLEARFLAG_FALSE(uname) \ | 243 | #define TESTCLEARFLAG_FALSE(uname) \ |
196 | static inline int TestClearPage##uname(struct page *page) { return 0; } | 244 | static inline int TestClearPage##uname(struct page *page) { return 0; } |
197 | 245 | ||
198 | #define __TESTCLEARFLAG_FALSE(uname) \ | ||
199 | static inline int __TestClearPage##uname(struct page *page) { return 0; } | ||
200 | |||
201 | #define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ | 246 | #define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ |
202 | SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) | 247 | SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) |
203 | 248 | ||
204 | #define TESTSCFLAG_FALSE(uname) \ | 249 | #define TESTSCFLAG_FALSE(uname) \ |
205 | TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) | 250 | TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) |
206 | 251 | ||
207 | struct page; /* forward declaration */ | 252 | __PAGEFLAG(Locked, locked, PF_NO_TAIL) |
208 | 253 | PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND) | |
209 | TESTPAGEFLAG(Locked, locked) | 254 | PAGEFLAG(Referenced, referenced, PF_HEAD) |
210 | PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) | 255 | TESTCLEARFLAG(Referenced, referenced, PF_HEAD) |
211 | PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) | 256 | __SETPAGEFLAG(Referenced, referenced, PF_HEAD) |
212 | __SETPAGEFLAG(Referenced, referenced) | 257 | PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) |
213 | PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) | 258 | __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) |
214 | PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) | 259 | PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) |
215 | PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) | 260 | PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) |
216 | TESTCLEARFLAG(Active, active) | 261 | TESTCLEARFLAG(Active, active, PF_HEAD) |
217 | __PAGEFLAG(Slab, slab) | 262 | __PAGEFLAG(Slab, slab, PF_NO_TAIL) |
218 | PAGEFLAG(Checked, checked) /* Used by some filesystems */ | 263 | __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) |
219 | PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ | 264 | PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ |
220 | PAGEFLAG(SavePinned, savepinned); /* Xen */ | 265 | |
221 | PAGEFLAG(Foreign, foreign); /* Xen */ | 266 | /* Xen */ |
222 | PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) | 267 | PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) |
223 | PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) | 268 | TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) |
224 | __SETPAGEFLAG(SwapBacked, swapbacked) | 269 | PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); |
225 | 270 | PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); | |
226 | __PAGEFLAG(SlobFree, slob_free) | 271 | |
272 | PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) | ||
273 | __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) | ||
274 | PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) | ||
275 | __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) | ||
276 | __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) | ||
227 | 277 | ||
228 | /* | 278 | /* |
229 | * Private page markings that may be used by the filesystem that owns the page | 279 | * Private page markings that may be used by the filesystem that owns the page |
230 | * for its own purposes. | 280 | * for its own purposes. |
231 | * - PG_private and PG_private_2 cause releasepage() and co to be invoked | 281 | * - PG_private and PG_private_2 cause releasepage() and co to be invoked |
232 | */ | 282 | */ |
233 | PAGEFLAG(Private, private) __SETPAGEFLAG(Private, private) | 283 | PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) |
234 | __CLEARPAGEFLAG(Private, private) | 284 | __CLEARPAGEFLAG(Private, private, PF_ANY) |
235 | PAGEFLAG(Private2, private_2) TESTSCFLAG(Private2, private_2) | 285 | PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) |
236 | PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) | 286 | PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) |
287 | TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) | ||
237 | 288 | ||
238 | /* | 289 | /* |
239 | * Only test-and-set exist for PG_writeback. The unconditional operators are | 290 | * Only test-and-set exist for PG_writeback. The unconditional operators are |
240 | * risky: they bypass page accounting. | 291 | * risky: they bypass page accounting. |
241 | */ | 292 | */ |
242 | TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) | 293 | TESTPAGEFLAG(Writeback, writeback, PF_NO_COMPOUND) |
243 | PAGEFLAG(MappedToDisk, mappedtodisk) | 294 | TESTSCFLAG(Writeback, writeback, PF_NO_COMPOUND) |
295 | PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_COMPOUND) | ||
244 | 296 | ||
245 | /* PG_readahead is only used for reads; PG_reclaim is only for writes */ | 297 | /* PG_readahead is only used for reads; PG_reclaim is only for writes */ |
246 | PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim) | 298 | PAGEFLAG(Reclaim, reclaim, PF_NO_COMPOUND) |
247 | PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim) | 299 | TESTCLEARFLAG(Reclaim, reclaim, PF_NO_COMPOUND) |
300 | PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) | ||
301 | TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) | ||
248 | 302 | ||
249 | #ifdef CONFIG_HIGHMEM | 303 | #ifdef CONFIG_HIGHMEM |
250 | /* | 304 | /* |
@@ -257,31 +311,33 @@ PAGEFLAG_FALSE(HighMem) | |||
257 | #endif | 311 | #endif |
258 | 312 | ||
259 | #ifdef CONFIG_SWAP | 313 | #ifdef CONFIG_SWAP |
260 | PAGEFLAG(SwapCache, swapcache) | 314 | PAGEFLAG(SwapCache, swapcache, PF_NO_COMPOUND) |
261 | #else | 315 | #else |
262 | PAGEFLAG_FALSE(SwapCache) | 316 | PAGEFLAG_FALSE(SwapCache) |
263 | #endif | 317 | #endif |
264 | 318 | ||
265 | PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) | 319 | PAGEFLAG(Unevictable, unevictable, PF_HEAD) |
266 | TESTCLEARFLAG(Unevictable, unevictable) | 320 | __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) |
321 | TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) | ||
267 | 322 | ||
268 | #ifdef CONFIG_MMU | 323 | #ifdef CONFIG_MMU |
269 | PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) | 324 | PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) |
270 | TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) | 325 | __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) |
326 | TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) | ||
271 | #else | 327 | #else |
272 | PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) | 328 | PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) |
273 | TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) | 329 | TESTSCFLAG_FALSE(Mlocked) |
274 | #endif | 330 | #endif |
275 | 331 | ||
276 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | 332 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED |
277 | PAGEFLAG(Uncached, uncached) | 333 | PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) |
278 | #else | 334 | #else |
279 | PAGEFLAG_FALSE(Uncached) | 335 | PAGEFLAG_FALSE(Uncached) |
280 | #endif | 336 | #endif |
281 | 337 | ||
282 | #ifdef CONFIG_MEMORY_FAILURE | 338 | #ifdef CONFIG_MEMORY_FAILURE |
283 | PAGEFLAG(HWPoison, hwpoison) | 339 | PAGEFLAG(HWPoison, hwpoison, PF_ANY) |
284 | TESTSCFLAG(HWPoison, hwpoison) | 340 | TESTSCFLAG(HWPoison, hwpoison, PF_ANY) |
285 | #define __PG_HWPOISON (1UL << PG_hwpoison) | 341 | #define __PG_HWPOISON (1UL << PG_hwpoison) |
286 | #else | 342 | #else |
287 | PAGEFLAG_FALSE(HWPoison) | 343 | PAGEFLAG_FALSE(HWPoison) |
@@ -289,10 +345,10 @@ PAGEFLAG_FALSE(HWPoison) | |||
289 | #endif | 345 | #endif |
290 | 346 | ||
291 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) | 347 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) |
292 | TESTPAGEFLAG(Young, young) | 348 | TESTPAGEFLAG(Young, young, PF_ANY) |
293 | SETPAGEFLAG(Young, young) | 349 | SETPAGEFLAG(Young, young, PF_ANY) |
294 | TESTCLEARFLAG(Young, young) | 350 | TESTCLEARFLAG(Young, young, PF_ANY) |
295 | PAGEFLAG(Idle, idle) | 351 | PAGEFLAG(Idle, idle, PF_ANY) |
296 | #endif | 352 | #endif |
297 | 353 | ||
298 | /* | 354 | /* |
@@ -317,6 +373,7 @@ PAGEFLAG(Idle, idle) | |||
317 | 373 | ||
318 | static inline int PageAnon(struct page *page) | 374 | static inline int PageAnon(struct page *page) |
319 | { | 375 | { |
376 | page = compound_head(page); | ||
320 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; | 377 | return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; |
321 | } | 378 | } |
322 | 379 | ||
@@ -329,6 +386,7 @@ static inline int PageAnon(struct page *page) | |||
329 | */ | 386 | */ |
330 | static inline int PageKsm(struct page *page) | 387 | static inline int PageKsm(struct page *page) |
331 | { | 388 | { |
389 | page = compound_head(page); | ||
332 | return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == | 390 | return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == |
333 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); | 391 | (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); |
334 | } | 392 | } |
@@ -340,8 +398,9 @@ u64 stable_page_flags(struct page *page); | |||
340 | 398 | ||
341 | static inline int PageUptodate(struct page *page) | 399 | static inline int PageUptodate(struct page *page) |
342 | { | 400 | { |
343 | int ret = test_bit(PG_uptodate, &(page)->flags); | 401 | int ret; |
344 | 402 | page = compound_head(page); | |
403 | ret = test_bit(PG_uptodate, &(page)->flags); | ||
345 | /* | 404 | /* |
346 | * Must ensure that the data we read out of the page is loaded | 405 | * Must ensure that the data we read out of the page is loaded |
347 | * _after_ we've loaded page->flags to check for PageUptodate. | 406 | * _after_ we've loaded page->flags to check for PageUptodate. |
@@ -358,22 +417,24 @@ static inline int PageUptodate(struct page *page) | |||
358 | 417 | ||
359 | static inline void __SetPageUptodate(struct page *page) | 418 | static inline void __SetPageUptodate(struct page *page) |
360 | { | 419 | { |
420 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
361 | smp_wmb(); | 421 | smp_wmb(); |
362 | __set_bit(PG_uptodate, &(page)->flags); | 422 | __set_bit(PG_uptodate, &page->flags); |
363 | } | 423 | } |
364 | 424 | ||
365 | static inline void SetPageUptodate(struct page *page) | 425 | static inline void SetPageUptodate(struct page *page) |
366 | { | 426 | { |
427 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
367 | /* | 428 | /* |
368 | * Memory barrier must be issued before setting the PG_uptodate bit, | 429 | * Memory barrier must be issued before setting the PG_uptodate bit, |
369 | * so that all previous stores issued in order to bring the page | 430 | * so that all previous stores issued in order to bring the page |
370 | * uptodate are actually visible before PageUptodate becomes true. | 431 | * uptodate are actually visible before PageUptodate becomes true. |
371 | */ | 432 | */ |
372 | smp_wmb(); | 433 | smp_wmb(); |
373 | set_bit(PG_uptodate, &(page)->flags); | 434 | set_bit(PG_uptodate, &page->flags); |
374 | } | 435 | } |
375 | 436 | ||
376 | CLEARPAGEFLAG(Uptodate, uptodate) | 437 | CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) |
377 | 438 | ||
378 | int test_clear_page_writeback(struct page *page); | 439 | int test_clear_page_writeback(struct page *page); |
379 | int __test_set_page_writeback(struct page *page, bool keep_write); | 440 | int __test_set_page_writeback(struct page *page, bool keep_write); |
@@ -393,12 +454,7 @@ static inline void set_page_writeback_keepwrite(struct page *page) | |||
393 | test_set_page_writeback_keepwrite(page); | 454 | test_set_page_writeback_keepwrite(page); |
394 | } | 455 | } |
395 | 456 | ||
396 | __PAGEFLAG(Head, head) CLEARPAGEFLAG(Head, head) | 457 | __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) |
397 | |||
398 | static inline int PageTail(struct page *page) | ||
399 | { | ||
400 | return READ_ONCE(page->compound_head) & 1; | ||
401 | } | ||
402 | 458 | ||
403 | static inline void set_compound_head(struct page *page, struct page *head) | 459 | static inline void set_compound_head(struct page *page, struct page *head) |
404 | { | 460 | { |
@@ -410,20 +466,6 @@ static inline void clear_compound_head(struct page *page) | |||
410 | WRITE_ONCE(page->compound_head, 0); | 466 | WRITE_ONCE(page->compound_head, 0); |
411 | } | 467 | } |
412 | 468 | ||
413 | static inline struct page *compound_head(struct page *page) | ||
414 | { | ||
415 | unsigned long head = READ_ONCE(page->compound_head); | ||
416 | |||
417 | if (unlikely(head & 1)) | ||
418 | return (struct page *) (head - 1); | ||
419 | return page; | ||
420 | } | ||
421 | |||
422 | static inline int PageCompound(struct page *page) | ||
423 | { | ||
424 | return PageHead(page) || PageTail(page); | ||
425 | |||
426 | } | ||
427 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 469 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
428 | static inline void ClearPageCompound(struct page *page) | 470 | static inline void ClearPageCompound(struct page *page) |
429 | { | 471 | { |
@@ -484,22 +526,43 @@ static inline int PageTransTail(struct page *page) | |||
484 | return PageTail(page); | 526 | return PageTail(page); |
485 | } | 527 | } |
486 | 528 | ||
487 | #else | 529 | /* |
488 | 530 | * PageDoubleMap indicates that the compound page is mapped with PTEs as well | |
489 | static inline int PageTransHuge(struct page *page) | 531 | * as PMDs. |
532 | * | ||
533 | * This is required for optimization of rmap operations for THP: we can postpone | ||
534 | * per small page mapcount accounting (and its overhead from atomic operations) | ||
535 | * until the first PMD split. | ||
536 | * | ||
537 | * For such a page, PageDoubleMap means ->_mapcount in all sub-pages is | ||
538 | * offset up by one. This reference will go away with the last compound_mapcount. | ||
539 | * | ||
540 | * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). | ||
541 | */ | ||
542 | static inline int PageDoubleMap(struct page *page) | ||
490 | { | 543 | { |
491 | return 0; | 544 | return PageHead(page) && test_bit(PG_double_map, &page[1].flags); |
492 | } | 545 | } |
493 | 546 | ||
494 | static inline int PageTransCompound(struct page *page) | 547 | static inline int TestSetPageDoubleMap(struct page *page) |
495 | { | 548 | { |
496 | return 0; | 549 | VM_BUG_ON_PAGE(!PageHead(page), page); |
550 | return test_and_set_bit(PG_double_map, &page[1].flags); | ||
497 | } | 551 | } |
498 | 552 | ||
499 | static inline int PageTransTail(struct page *page) | 553 | static inline int TestClearPageDoubleMap(struct page *page) |
500 | { | 554 | { |
501 | return 0; | 555 | VM_BUG_ON_PAGE(!PageHead(page), page); |
556 | return test_and_clear_bit(PG_double_map, &page[1].flags); | ||
502 | } | 557 | } |
558 | |||
559 | #else | ||
560 | TESTPAGEFLAG_FALSE(TransHuge) | ||
561 | TESTPAGEFLAG_FALSE(TransCompound) | ||
562 | TESTPAGEFLAG_FALSE(TransTail) | ||
563 | TESTPAGEFLAG_FALSE(DoubleMap) | ||
564 | TESTSETFLAG_FALSE(DoubleMap) | ||
565 | TESTCLEARFLAG_FALSE(DoubleMap) | ||
503 | #endif | 566 | #endif |
504 | 567 | ||
505 | /* | 568 | /* |
@@ -583,12 +646,6 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) | |||
583 | #define __PG_MLOCKED 0 | 646 | #define __PG_MLOCKED 0 |
584 | #endif | 647 | #endif |
585 | 648 | ||
586 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
587 | #define __PG_COMPOUND_LOCK (1 << PG_compound_lock) | ||
588 | #else | ||
589 | #define __PG_COMPOUND_LOCK 0 | ||
590 | #endif | ||
591 | |||
592 | /* | 649 | /* |
593 | * Flags checked when a page is freed. Pages being freed should not have | 650 | * Flags checked when a page is freed. Pages being freed should not have |
594 | * these flags set. If they are, there is a problem. | 651 | * these flags set. If they are, there is a problem. |
@@ -598,8 +655,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) | |||
598 | 1 << PG_private | 1 << PG_private_2 | \ | 655 | 1 << PG_private | 1 << PG_private_2 | \ |
599 | 1 << PG_writeback | 1 << PG_reserved | \ | 656 | 1 << PG_writeback | 1 << PG_reserved | \ |
600 | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ | 657 | 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ |
601 | 1 << PG_unevictable | __PG_MLOCKED | \ | 658 | 1 << PG_unevictable | __PG_MLOCKED) |
602 | __PG_COMPOUND_LOCK) | ||
603 | 659 | ||
604 | /* | 660 | /* |
605 | * Flags checked when a page is prepped for return by the page allocator. | 661 | * Flags checked when a page is prepped for return by the page allocator. |
@@ -626,6 +682,10 @@ static inline int page_has_private(struct page *page) | |||
626 | return !!(page->flags & PAGE_FLAGS_PRIVATE); | 682 | return !!(page->flags & PAGE_FLAGS_PRIVATE); |
627 | } | 683 | } |
628 | 684 | ||
685 | #undef PF_ANY | ||
686 | #undef PF_HEAD | ||
687 | #undef PF_NO_TAIL | ||
688 | #undef PF_NO_COMPOUND | ||
629 | #endif /* !__GENERATING_BOUNDS_H */ | 689 | #endif /* !__GENERATING_BOUNDS_H */ |
630 | 690 | ||
631 | #endif /* PAGE_FLAGS_H */ | 691 | #endif /* PAGE_FLAGS_H */ |
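To see what the new policy argument buys, it helps to expand a couple of the declarations above by hand. The expansions below are illustrative paraphrases, not code from the patch: PF_HEAD silently redirects every operation to the head page, while PF_NO_TAIL additionally asserts (under CONFIG_DEBUG_VM_PGFLAGS) that nobody modifies the flag on a tail page.

    /* PAGEFLAG(LRU, lru, PF_HEAD) now generates, in effect: */
    static inline int PageLRU(struct page *page)
    {
            return test_bit(PG_lru, &compound_head(page)->flags);
    }

    /* __PAGEFLAG(Locked, locked, PF_NO_TAIL) yields __SetPageLocked(), roughly: */
    static inline void __SetPageLocked(struct page *page)
    {
            VM_BUG_ON_PGFLAGS(PageTail(page), page);    /* enforce == 1 for setters */
            __set_bit(PG_locked, &compound_head(page)->flags);
    }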
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 26eabf5ec718..4d08b6c33557 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h | |||
@@ -394,10 +394,21 @@ static inline struct page *read_mapping_page(struct address_space *mapping, | |||
394 | */ | 394 | */ |
395 | static inline pgoff_t page_to_pgoff(struct page *page) | 395 | static inline pgoff_t page_to_pgoff(struct page *page) |
396 | { | 396 | { |
397 | pgoff_t pgoff; | ||
398 | |||
397 | if (unlikely(PageHeadHuge(page))) | 399 | if (unlikely(PageHeadHuge(page))) |
398 | return page->index << compound_order(page); | 400 | return page->index << compound_order(page); |
399 | else | 401 | |
402 | if (likely(!PageTransTail(page))) | ||
400 | return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 403 | return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
404 | |||
405 | /* | ||
406 | * We don't initialize ->index for tail pages: calculate based on | ||
407 | * head page | ||
408 | */ | ||
409 | pgoff = compound_head(page)->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
410 | pgoff += page - compound_head(page); | ||
411 | return pgoff; | ||
401 | } | 412 | } |
402 | 413 | ||
403 | /* | 414 | /* |
@@ -433,18 +444,9 @@ extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, | |||
433 | unsigned int flags); | 444 | unsigned int flags); |
434 | extern void unlock_page(struct page *page); | 445 | extern void unlock_page(struct page *page); |
435 | 446 | ||
436 | static inline void __set_page_locked(struct page *page) | ||
437 | { | ||
438 | __set_bit(PG_locked, &page->flags); | ||
439 | } | ||
440 | |||
441 | static inline void __clear_page_locked(struct page *page) | ||
442 | { | ||
443 | __clear_bit(PG_locked, &page->flags); | ||
444 | } | ||
445 | |||
446 | static inline int trylock_page(struct page *page) | 447 | static inline int trylock_page(struct page *page) |
447 | { | 448 | { |
449 | page = compound_head(page); | ||
448 | return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); | 450 | return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); |
449 | } | 451 | } |
450 | 452 | ||
@@ -497,9 +499,9 @@ extern int wait_on_page_bit_killable_timeout(struct page *page, | |||
497 | 499 | ||
498 | static inline int wait_on_page_locked_killable(struct page *page) | 500 | static inline int wait_on_page_locked_killable(struct page *page) |
499 | { | 501 | { |
500 | if (PageLocked(page)) | 502 | if (!PageLocked(page)) |
501 | return wait_on_page_bit_killable(page, PG_locked); | 503 | return 0; |
502 | return 0; | 504 | return wait_on_page_bit_killable(compound_head(page), PG_locked); |
503 | } | 505 | } |
504 | 506 | ||
505 | extern wait_queue_head_t *page_waitqueue(struct page *page); | 507 | extern wait_queue_head_t *page_waitqueue(struct page *page); |
@@ -518,7 +520,7 @@ static inline void wake_up_page(struct page *page, int bit) | |||
518 | static inline void wait_on_page_locked(struct page *page) | 520 | static inline void wait_on_page_locked(struct page *page) |
519 | { | 521 | { |
520 | if (PageLocked(page)) | 522 | if (PageLocked(page)) |
521 | wait_on_page_bit(page, PG_locked); | 523 | wait_on_page_bit(compound_head(page), PG_locked); |
522 | } | 524 | } |
523 | 525 | ||
524 | /* | 526 | /* |
@@ -664,17 +666,17 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); | |||
664 | 666 | ||
665 | /* | 667 | /* |
666 | * Like add_to_page_cache_locked, but used to add newly allocated pages: | 668 | * Like add_to_page_cache_locked, but used to add newly allocated pages: |
667 | * the page is new, so we can just run __set_page_locked() against it. | 669 | * the page is new, so we can just run __SetPageLocked() against it. |
668 | */ | 670 | */ |
669 | static inline int add_to_page_cache(struct page *page, | 671 | static inline int add_to_page_cache(struct page *page, |
670 | struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) | 672 | struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) |
671 | { | 673 | { |
672 | int error; | 674 | int error; |
673 | 675 | ||
674 | __set_page_locked(page); | 676 | __SetPageLocked(page); |
675 | error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); | 677 | error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); |
676 | if (unlikely(error)) | 678 | if (unlikely(error)) |
677 | __clear_page_locked(page); | 679 | __ClearPageLocked(page); |
678 | return error; | 680 | return error; |
679 | } | 681 | } |
680 | 682 | ||
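The page_to_pgoff() change is easiest to see with numbers. Tail pages keep ->index uninitialized, so the offset of a THP sub-page is derived from the head page (PAGE_CACHE_SHIFT equals PAGE_SHIFT here, so the shift is a no-op). A hypothetical helper, assuming i stays within the compound page:

    #include <linux/pagemap.h>

    /* For a THP whose head maps file offset head->index (in pages), the
     * i-th sub-page sits i pages further into the file. */
    static pgoff_t subpage_pgoff(struct page *head, unsigned int i)
    {
            return page_to_pgoff(head + i);     /* == head->index + i */
    }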
diff --git a/include/linux/pfn.h b/include/linux/pfn.h index 97f3e88aead4..2d8e49711b63 100644 --- a/include/linux/pfn.h +++ b/include/linux/pfn.h | |||
@@ -3,6 +3,15 @@ | |||
3 | 3 | ||
4 | #ifndef __ASSEMBLY__ | 4 | #ifndef __ASSEMBLY__ |
5 | #include <linux/types.h> | 5 | #include <linux/types.h> |
6 | |||
7 | /* | ||
8 | * pfn_t: encapsulates a page-frame number that is optionally backed | ||
9 | * by memmap (struct page). Whether a pfn_t has a 'struct page' | ||
10 | * backing is indicated by flags in the high bits of the value. | ||
11 | */ | ||
12 | typedef struct { | ||
13 | unsigned long val; | ||
14 | } pfn_t; | ||
6 | #endif | 15 | #endif |
7 | 16 | ||
8 | #define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) | 17 | #define PFN_ALIGN(x) (((unsigned long)(x) + (PAGE_SIZE - 1)) & PAGE_MASK) |
diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h new file mode 100644 index 000000000000..0703b5360d31 --- /dev/null +++ b/include/linux/pfn_t.h | |||
@@ -0,0 +1,102 @@ | |||
1 | #ifndef _LINUX_PFN_T_H_ | ||
2 | #define _LINUX_PFN_T_H_ | ||
3 | #include <linux/mm.h> | ||
4 | |||
5 | /* | ||
6 | * PFN_FLAGS_MASK - mask of all the possible valid pfn_t flags | ||
7 | * PFN_SG_CHAIN - pfn is a pointer to the next scatterlist entry | ||
8 | * PFN_SG_LAST - pfn references a page and is the last scatterlist entry | ||
9 | * PFN_DEV - pfn is not covered by system memmap by default | ||
10 | * PFN_MAP - pfn has a dynamic page mapping established by a device driver | ||
11 | */ | ||
12 | #define PFN_FLAGS_MASK (((unsigned long) ~PAGE_MASK) \ | ||
13 | << (BITS_PER_LONG - PAGE_SHIFT)) | ||
14 | #define PFN_SG_CHAIN (1UL << (BITS_PER_LONG - 1)) | ||
15 | #define PFN_SG_LAST (1UL << (BITS_PER_LONG - 2)) | ||
16 | #define PFN_DEV (1UL << (BITS_PER_LONG - 3)) | ||
17 | #define PFN_MAP (1UL << (BITS_PER_LONG - 4)) | ||
18 | |||
19 | static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, unsigned long flags) | ||
20 | { | ||
21 | pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), }; | ||
22 | |||
23 | return pfn_t; | ||
24 | } | ||
25 | |||
26 | /* a default pfn to pfn_t conversion assumes that @pfn is pfn_valid() */ | ||
27 | static inline pfn_t pfn_to_pfn_t(unsigned long pfn) | ||
28 | { | ||
29 | return __pfn_to_pfn_t(pfn, 0); | ||
30 | } | ||
31 | |||
32 | extern pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags); | ||
33 | |||
34 | static inline bool pfn_t_has_page(pfn_t pfn) | ||
35 | { | ||
36 | return (pfn.val & PFN_MAP) == PFN_MAP || (pfn.val & PFN_DEV) == 0; | ||
37 | } | ||
38 | |||
39 | static inline unsigned long pfn_t_to_pfn(pfn_t pfn) | ||
40 | { | ||
41 | return pfn.val & ~PFN_FLAGS_MASK; | ||
42 | } | ||
43 | |||
44 | static inline struct page *pfn_t_to_page(pfn_t pfn) | ||
45 | { | ||
46 | if (pfn_t_has_page(pfn)) | ||
47 | return pfn_to_page(pfn_t_to_pfn(pfn)); | ||
48 | return NULL; | ||
49 | } | ||
50 | |||
51 | static inline dma_addr_t pfn_t_to_phys(pfn_t pfn) | ||
52 | { | ||
53 | return PFN_PHYS(pfn_t_to_pfn(pfn)); | ||
54 | } | ||
55 | |||
56 | static inline void *pfn_t_to_virt(pfn_t pfn) | ||
57 | { | ||
58 | if (pfn_t_has_page(pfn)) | ||
59 | return __va(pfn_t_to_phys(pfn)); | ||
60 | return NULL; | ||
61 | } | ||
62 | |||
63 | static inline pfn_t page_to_pfn_t(struct page *page) | ||
64 | { | ||
65 | return pfn_to_pfn_t(page_to_pfn(page)); | ||
66 | } | ||
67 | |||
68 | static inline int pfn_t_valid(pfn_t pfn) | ||
69 | { | ||
70 | return pfn_valid(pfn_t_to_pfn(pfn)); | ||
71 | } | ||
72 | |||
73 | #ifdef CONFIG_MMU | ||
74 | static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot) | ||
75 | { | ||
76 | return pfn_pte(pfn_t_to_pfn(pfn), pgprot); | ||
77 | } | ||
78 | #endif | ||
79 | |||
80 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
81 | static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) | ||
82 | { | ||
83 | return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); | ||
84 | } | ||
85 | #endif | ||
86 | |||
87 | #ifdef __HAVE_ARCH_PTE_DEVMAP | ||
88 | static inline bool pfn_t_devmap(pfn_t pfn) | ||
89 | { | ||
90 | const unsigned long flags = PFN_DEV|PFN_MAP; | ||
91 | |||
92 | return (pfn.val & flags) == flags; | ||
93 | } | ||
94 | #else | ||
95 | static inline bool pfn_t_devmap(pfn_t pfn) | ||
96 | { | ||
97 | return false; | ||
98 | } | ||
99 | pte_t pte_mkdevmap(pte_t pte); | ||
100 | pmd_t pmd_mkdevmap(pmd_t pmd); | ||
101 | #endif | ||
102 | #endif /* _LINUX_PFN_T_H_ */ | ||
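The pfn_t flag bits sit above the highest possible page frame number, so they travel with the value and are simply masked off by pfn_t_to_pfn(). A minimal illustrative use follows; the function and the choice of flags are made up, not taken from an in-tree driver.

    #include <linux/pfn_t.h>

    static void pfn_t_demo(dma_addr_t dev_base)
    {
            /* device memory that does have a struct page (ZONE_DEVICE) */
            pfn_t pfn = phys_to_pfn_t(dev_base, PFN_DEV | PFN_MAP);

            WARN_ON(!pfn_t_has_page(pfn));              /* PFN_MAP => memmap exists */
            WARN_ON(pfn_t_to_pfn(pfn) != dev_base >> PAGE_SHIFT);
    }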
diff --git a/include/linux/poison.h b/include/linux/poison.h index 317e16de09e5..4a27153574e2 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h | |||
@@ -27,11 +27,15 @@ | |||
27 | * Magic number "tsta" to indicate a static timer initializer | 27 | * Magic number "tsta" to indicate a static timer initializer |
28 | * for the object debugging code. | 28 | * for the object debugging code. |
29 | */ | 29 | */ |
30 | #define TIMER_ENTRY_STATIC ((void *) 0x74737461) | 30 | #define TIMER_ENTRY_STATIC ((void *) 0x300 + POISON_POINTER_DELTA) |
31 | 31 | ||
32 | /********** mm/debug-pagealloc.c **********/ | 32 | /********** mm/debug-pagealloc.c **********/ |
33 | #define PAGE_POISON 0xaa | 33 | #define PAGE_POISON 0xaa |
34 | 34 | ||
35 | /********** mm/page_alloc.c ************/ | ||
36 | |||
37 | #define TAIL_MAPPING ((void *) 0x400 + POISON_POINTER_DELTA) | ||
38 | |||
35 | /********** mm/slab.c **********/ | 39 | /********** mm/slab.c **********/ |
36 | /* | 40 | /* |
37 | * Magic nums for obj red zoning. | 41 | * Magic nums for obj red zoning. |
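TAIL_MAPPING is a poison value for the ->mapping field of tail pages: the compound-page code is expected to store it when a huge page is assembled, so a tail page whose head was split or freed underneath a caller can be detected instead of silently dereferencing a stale mapping. Conceptually (a sketch, not the actual check in mm/page_alloc.c):

    #include <linux/mm_types.h>
    #include <linux/poison.h>

    /* A tail page that still looks sane must carry the poison; anything
     * else means a race with a split or a corrupted page. */
    static bool tail_mapping_looks_sane(struct page *tail)
    {
            return tail->mapping == TAIL_MAPPING;
    }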
diff --git a/include/linux/printk.h b/include/linux/printk.h index 9729565c25ff..9ccbdf2c1453 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h | |||
@@ -106,13 +106,13 @@ struct va_format { | |||
106 | 106 | ||
107 | /* | 107 | /* |
108 | * Dummy printk for disabled debugging statements to use whilst maintaining | 108 | * Dummy printk for disabled debugging statements to use whilst maintaining |
109 | * gcc's format and side-effect checking. | 109 | * gcc's format checking. |
110 | */ | 110 | */ |
111 | static inline __printf(1, 2) | 111 | #define no_printk(fmt, ...) \ |
112 | int no_printk(const char *fmt, ...) | 112 | do { \ |
113 | { | 113 | if (0) \ |
114 | return 0; | 114 | printk(fmt, ##__VA_ARGS__); \ |
115 | } | 115 | } while (0) |
116 | 116 | ||
117 | #ifdef CONFIG_EARLY_PRINTK | 117 | #ifdef CONFIG_EARLY_PRINTK |
118 | extern asmlinkage __printf(1, 2) | 118 | extern asmlinkage __printf(1, 2) |
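The switch from an inline function to an `if (0)` statement matters for argument evaluation: a real function call must evaluate its arguments, while dead code behind `if (0)` is discarded wholesale, so a disabled pr_debug()/pr_devel() no longer forces side-effecting argument expressions to run. Format checking is kept because the compiler still sees the printk() call. A hypothetical driver snippet (the 0x10 register offset is made up):

    #include <linux/io.h>
    #include <linux/printk.h>

    static void demo_dump_counter(void __iomem *regs)
    {
            /* Without DEBUG, pr_devel() expands to no_printk(); the readl(),
             * which may have side effects on the device, is now compiled out
             * instead of being executed for nothing. */
            pr_devel("hw counter: %u\n", readl(regs + 0x10));
    }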
diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 29446aeef36e..bdf597c4f0be 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h | |||
@@ -85,6 +85,7 @@ enum ttu_flags { | |||
85 | TTU_UNMAP = 1, /* unmap mode */ | 85 | TTU_UNMAP = 1, /* unmap mode */ |
86 | TTU_MIGRATION = 2, /* migration mode */ | 86 | TTU_MIGRATION = 2, /* migration mode */ |
87 | TTU_MUNLOCK = 4, /* munlock mode */ | 87 | TTU_MUNLOCK = 4, /* munlock mode */ |
88 | TTU_LZFREE = 8, /* lazy free mode */ | ||
88 | 89 | ||
89 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ | 90 | TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ |
90 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ | 91 | TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ |
@@ -161,25 +162,31 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, | |||
161 | 162 | ||
162 | struct anon_vma *page_get_anon_vma(struct page *page); | 163 | struct anon_vma *page_get_anon_vma(struct page *page); |
163 | 164 | ||
165 | /* bitflags for do_page_add_anon_rmap() */ | ||
166 | #define RMAP_EXCLUSIVE 0x01 | ||
167 | #define RMAP_COMPOUND 0x02 | ||
168 | |||
164 | /* | 169 | /* |
165 | * rmap interfaces called when adding or removing pte of page | 170 | * rmap interfaces called when adding or removing pte of page |
166 | */ | 171 | */ |
167 | void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); | 172 | void page_move_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); |
168 | void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); | 173 | void page_add_anon_rmap(struct page *, struct vm_area_struct *, |
174 | unsigned long, bool); | ||
169 | void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, | 175 | void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, |
170 | unsigned long, int); | 176 | unsigned long, int); |
171 | void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); | 177 | void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, |
178 | unsigned long, bool); | ||
172 | void page_add_file_rmap(struct page *); | 179 | void page_add_file_rmap(struct page *); |
173 | void page_remove_rmap(struct page *); | 180 | void page_remove_rmap(struct page *, bool); |
174 | 181 | ||
175 | void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, | 182 | void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, |
176 | unsigned long); | 183 | unsigned long); |
177 | void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, | 184 | void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, |
178 | unsigned long); | 185 | unsigned long); |
179 | 186 | ||
180 | static inline void page_dup_rmap(struct page *page) | 187 | static inline void page_dup_rmap(struct page *page, bool compound) |
181 | { | 188 | { |
182 | atomic_inc(&page->_mapcount); | 189 | atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount); |
183 | } | 190 | } |
184 | 191 | ||
185 | /* | 192 | /* |
@@ -210,6 +217,25 @@ static inline pte_t *page_check_address(struct page *page, struct mm_struct *mm, | |||
210 | } | 217 | } |
211 | 218 | ||
212 | /* | 219 | /* |
220 | * Used by idle page tracking to check if a page was referenced via page | ||
221 | * tables. | ||
222 | */ | ||
223 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
224 | bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, | ||
225 | unsigned long address, pmd_t **pmdp, | ||
226 | pte_t **ptep, spinlock_t **ptlp); | ||
227 | #else | ||
228 | static inline bool page_check_address_transhuge(struct page *page, | ||
229 | struct mm_struct *mm, unsigned long address, | ||
230 | pmd_t **pmdp, pte_t **ptep, spinlock_t **ptlp) | ||
231 | { | ||
232 | *ptep = page_check_address(page, mm, address, ptlp, 0); | ||
233 | *pmdp = NULL; | ||
234 | return !!*ptep; | ||
235 | } | ||
236 | #endif | ||
237 | |||
238 | /* | ||
213 | * Used by swapoff to help locate where page is expected in vma. | 239 | * Used by swapoff to help locate where page is expected in vma. |
214 | */ | 240 | */ |
215 | unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); | 241 | unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); |
@@ -286,5 +312,6 @@ static inline int page_mkclean(struct page *page) | |||
286 | #define SWAP_AGAIN 1 | 312 | #define SWAP_AGAIN 1 |
287 | #define SWAP_FAIL 2 | 313 | #define SWAP_FAIL 2 |
288 | #define SWAP_MLOCK 3 | 314 | #define SWAP_MLOCK 3 |
315 | #define SWAP_LZFREE 4 | ||
289 | 316 | ||
290 | #endif /* _LINUX_RMAP_H */ | 317 | #endif /* _LINUX_RMAP_H */ |
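All the anon-rmap entry points now take a `compound` flag so a single PMD mapping can be accounted in the head page's compound_mapcount rather than in each sub-page's _mapcount. A simplified illustration of how fault paths are expected to call the reworked API; it is not lifted verbatim from the patch, and `haddr` stands for the PMD-aligned address of a freshly faulted THP.

    #include <linux/mm.h>
    #include <linux/rmap.h>

    static void map_new_anon_page(struct page *page, struct vm_area_struct *vma,
                                  unsigned long addr, unsigned long haddr)
    {
            if (PageTransHuge(page))
                    /* accounted once, in the head page's compound_mapcount */
                    page_add_new_anon_rmap(page, vma, haddr, true);
            else
                    /* classic per-page _mapcount */
                    page_add_new_anon_rmap(page, vma, addr, false);
    }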
diff --git a/include/linux/swap.h b/include/linux/swap.h index 066bd21765ad..414e101cd061 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h | |||
@@ -307,6 +307,7 @@ extern void lru_add_drain_cpu(int cpu); | |||
307 | extern void lru_add_drain_all(void); | 307 | extern void lru_add_drain_all(void); |
308 | extern void rotate_reclaimable_page(struct page *page); | 308 | extern void rotate_reclaimable_page(struct page *page); |
309 | extern void deactivate_file_page(struct page *page); | 309 | extern void deactivate_file_page(struct page *page); |
310 | extern void deactivate_page(struct page *page); | ||
310 | extern void swap_setup(void); | 311 | extern void swap_setup(void); |
311 | 312 | ||
312 | extern void add_page_to_unevictable_list(struct page *page); | 313 | extern void add_page_to_unevictable_list(struct page *page); |
@@ -538,7 +539,8 @@ static inline int swp_swapcount(swp_entry_t entry) | |||
538 | return 0; | 539 | return 0; |
539 | } | 540 | } |
540 | 541 | ||
541 | #define reuse_swap_page(page) (page_mapcount(page) == 1) | 542 | #define reuse_swap_page(page) \ |
543 | (!PageTransCompound(page) && page_mapcount(page) == 1) | ||
542 | 544 | ||
543 | static inline int try_to_free_swap(struct page *page) | 545 | static inline int try_to_free_swap(struct page *page) |
544 | { | 546 | { |
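Two small but telling changes here: deactivate_page() is exported so MADV_FREE can park lazily freeable anon pages on the inactive LRU, and the !CONFIG_SWAP reuse_swap_page() stub now refuses any compound page, which keeps the write-protect fault path from trying to reuse part of a THP in place. A sketch against the stub shown above, illustrative only:

    #include <linux/swap.h>

    /* With the new definition this returns false for every THP sub-page,
     * so a COW fault copies to a small page instead of reusing in place. */
    static bool can_reuse_in_place(struct page *page)
    {
            return reuse_swap_page(page);   /* !PageTransCompound() && mapcount == 1 */
    }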
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index e623d392db0c..67c1dbd19c6d 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h | |||
@@ -25,6 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
25 | FOR_ALL_ZONES(PGALLOC), | 25 | FOR_ALL_ZONES(PGALLOC), |
26 | PGFREE, PGACTIVATE, PGDEACTIVATE, | 26 | PGFREE, PGACTIVATE, PGDEACTIVATE, |
27 | PGFAULT, PGMAJFAULT, | 27 | PGFAULT, PGMAJFAULT, |
28 | PGLAZYFREED, | ||
28 | FOR_ALL_ZONES(PGREFILL), | 29 | FOR_ALL_ZONES(PGREFILL), |
29 | FOR_ALL_ZONES(PGSTEAL_KSWAPD), | 30 | FOR_ALL_ZONES(PGSTEAL_KSWAPD), |
30 | FOR_ALL_ZONES(PGSTEAL_DIRECT), | 31 | FOR_ALL_ZONES(PGSTEAL_DIRECT), |
@@ -68,7 +69,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, | |||
68 | THP_FAULT_FALLBACK, | 69 | THP_FAULT_FALLBACK, |
69 | THP_COLLAPSE_ALLOC, | 70 | THP_COLLAPSE_ALLOC, |
70 | THP_COLLAPSE_ALLOC_FAILED, | 71 | THP_COLLAPSE_ALLOC_FAILED, |
71 | THP_SPLIT, | 72 | THP_SPLIT_PAGE, |
73 | THP_SPLIT_PAGE_FAILED, | ||
74 | THP_SPLIT_PMD, | ||
72 | THP_ZERO_PAGE_ALLOC, | 75 | THP_ZERO_PAGE_ALLOC, |
73 | THP_ZERO_PAGE_ALLOC_FAILED, | 76 | THP_ZERO_PAGE_ALLOC_FAILED, |
74 | #endif | 77 | #endif |
diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 97d635cabac8..0f803d2783e3 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h | |||
@@ -22,6 +22,7 @@ | |||
22 | EM( SCAN_PAGE_LRU, "page_not_in_lru") \ | 22 | EM( SCAN_PAGE_LRU, "page_not_in_lru") \ |
23 | EM( SCAN_PAGE_LOCK, "page_locked") \ | 23 | EM( SCAN_PAGE_LOCK, "page_locked") \ |
24 | EM( SCAN_PAGE_ANON, "page_not_anon") \ | 24 | EM( SCAN_PAGE_ANON, "page_not_anon") \ |
25 | EM( SCAN_PAGE_COMPOUND, "page_compound") \ | ||
25 | EM( SCAN_ANY_PROCESS, "no_process_for_page") \ | 26 | EM( SCAN_ANY_PROCESS, "no_process_for_page") \ |
26 | EM( SCAN_VMA_NULL, "vma_null") \ | 27 | EM( SCAN_VMA_NULL, "vma_null") \ |
27 | EM( SCAN_VMA_CHECK, "vma_check_failed") \ | 28 | EM( SCAN_VMA_CHECK, "vma_check_failed") \ |
diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index a74dd84bbb6d..58274382a616 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h | |||
@@ -41,6 +41,7 @@ | |||
41 | #define MADV_DONTNEED 4 /* don't need these pages */ | 41 | #define MADV_DONTNEED 4 /* don't need these pages */ |
42 | 42 | ||
43 | /* common parameters: try to keep these consistent across architectures */ | 43 | /* common parameters: try to keep these consistent across architectures */ |
44 | #define MADV_FREE 8 /* free pages only if memory pressure */ | ||
44 | #define MADV_REMOVE 9 /* remove these pages & resources */ | 45 | #define MADV_REMOVE 9 /* remove these pages & resources */ |
45 | #define MADV_DONTFORK 10 /* don't inherit across fork */ | 46 | #define MADV_DONTFORK 10 /* don't inherit across fork */ |
46 | #define MADV_DOFORK 11 /* do inherit across fork */ | 47 | #define MADV_DOFORK 11 /* do inherit across fork */ |
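MADV_FREE is the user-visible half of the lazy-free work: the pages stay mapped and usable, but become reclaimable under memory pressure, and a page that does get reclaimed reads back as zeroes. A userspace sketch of how an allocator might use it; the fallback strategy is illustrative and not part of this patch set.

    #include <stddef.h>
    #include <sys/mman.h>

    static void chunk_release(void *addr, size_t len)
    {
            /* Prefer lazy freeing: cheap now, and a quick re-use of the chunk
             * avoids the fault-and-zero cost of MADV_DONTNEED.  On kernels
             * without MADV_FREE, fall back to the eager variant. */
            if (madvise(addr, len, MADV_FREE) != 0)
                    madvise(addr, len, MADV_DONTNEED);
    }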
diff --git a/init/Kconfig b/init/Kconfig index 5481b49e8c3f..4644217b2373 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -285,7 +285,7 @@ config FHANDLE | |||
285 | 285 | ||
286 | config USELIB | 286 | config USELIB |
287 | bool "uselib syscall" | 287 | bool "uselib syscall" |
288 | default y | 288 | def_bool ALPHA || M68K || SPARC || X86_32 || IA32_EMULATION |
289 | help | 289 | help |
290 | This option enables the uselib syscall, a system call used in the | 290 | This option enables the uselib syscall, a system call used in the |
291 | dynamic linker from libc5 and earlier. glibc does not use this | 291 | dynamic linker from libc5 and earlier. glibc does not use this |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bb0669169716..0167679182c0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
@@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
161 | const unsigned long mmun_end = addr + PAGE_SIZE; | 161 | const unsigned long mmun_end = addr + PAGE_SIZE; |
162 | struct mem_cgroup *memcg; | 162 | struct mem_cgroup *memcg; |
163 | 163 | ||
164 | err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); | 164 | err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, |
165 | false); | ||
165 | if (err) | 166 | if (err) |
166 | return err; | 167 | return err; |
167 | 168 | ||
@@ -175,8 +176,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
175 | goto unlock; | 176 | goto unlock; |
176 | 177 | ||
177 | get_page(kpage); | 178 | get_page(kpage); |
178 | page_add_new_anon_rmap(kpage, vma, addr); | 179 | page_add_new_anon_rmap(kpage, vma, addr, false); |
179 | mem_cgroup_commit_charge(kpage, memcg, false); | 180 | mem_cgroup_commit_charge(kpage, memcg, false, false); |
180 | lru_cache_add_active_or_unevictable(kpage, vma); | 181 | lru_cache_add_active_or_unevictable(kpage, vma); |
181 | 182 | ||
182 | if (!PageAnon(page)) { | 183 | if (!PageAnon(page)) { |
@@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
188 | ptep_clear_flush_notify(vma, addr, ptep); | 189 | ptep_clear_flush_notify(vma, addr, ptep); |
189 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 190 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
190 | 191 | ||
191 | page_remove_rmap(page); | 192 | page_remove_rmap(page, false); |
192 | if (!page_mapped(page)) | 193 | if (!page_mapped(page)) |
193 | try_to_free_swap(page); | 194 | try_to_free_swap(page); |
194 | pte_unmap_unlock(ptep, ptl); | 195 | pte_unmap_unlock(ptep, ptl); |
@@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
199 | 200 | ||
200 | err = 0; | 201 | err = 0; |
201 | unlock: | 202 | unlock: |
202 | mem_cgroup_cancel_charge(kpage, memcg); | 203 | mem_cgroup_cancel_charge(kpage, memcg, false); |
203 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 204 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
204 | unlock_page(page); | 205 | unlock_page(page); |
205 | return err; | 206 | return err; |
diff --git a/kernel/futex.c b/kernel/futex.c index 8a310e240cda..c6f514573b28 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) | |||
469 | { | 469 | { |
470 | unsigned long address = (unsigned long)uaddr; | 470 | unsigned long address = (unsigned long)uaddr; |
471 | struct mm_struct *mm = current->mm; | 471 | struct mm_struct *mm = current->mm; |
472 | struct page *page, *page_head; | 472 | struct page *page; |
473 | struct address_space *mapping; | ||
473 | int err, ro = 0; | 474 | int err, ro = 0; |
474 | 475 | ||
475 | /* | 476 | /* |
@@ -519,46 +520,9 @@ again: | |||
519 | else | 520 | else |
520 | err = 0; | 521 | err = 0; |
521 | 522 | ||
522 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 523 | lock_page(page); |
523 | page_head = page; | ||
524 | if (unlikely(PageTail(page))) { | ||
525 | put_page(page); | ||
526 | /* serialize against __split_huge_page_splitting() */ | ||
527 | local_irq_disable(); | ||
528 | if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { | ||
529 | page_head = compound_head(page); | ||
530 | /* | ||
531 | * page_head is valid pointer but we must pin | ||
532 | * it before taking the PG_lock and/or | ||
533 | * PG_compound_lock. The moment we re-enable | ||
534 | * irqs __split_huge_page_splitting() can | ||
535 | * return and the head page can be freed from | ||
536 | * under us. We can't take the PG_lock and/or | ||
537 | * PG_compound_lock on a page that could be | ||
538 | * freed from under us. | ||
539 | */ | ||
540 | if (page != page_head) { | ||
541 | get_page(page_head); | ||
542 | put_page(page); | ||
543 | } | ||
544 | local_irq_enable(); | ||
545 | } else { | ||
546 | local_irq_enable(); | ||
547 | goto again; | ||
548 | } | ||
549 | } | ||
550 | #else | ||
551 | page_head = compound_head(page); | ||
552 | if (page != page_head) { | ||
553 | get_page(page_head); | ||
554 | put_page(page); | ||
555 | } | ||
556 | #endif | ||
557 | |||
558 | lock_page(page_head); | ||
559 | |||
560 | /* | 524 | /* |
561 | * If page_head->mapping is NULL, then it cannot be a PageAnon | 525 | * If page->mapping is NULL, then it cannot be a PageAnon |
562 | * page; but it might be the ZERO_PAGE or in the gate area or | 526 | * page; but it might be the ZERO_PAGE or in the gate area or |
563 | * in a special mapping (all cases which we are happy to fail); | 527 | * in a special mapping (all cases which we are happy to fail); |
564 | * or it may have been a good file page when get_user_pages_fast | 528 | * or it may have been a good file page when get_user_pages_fast |
@@ -570,12 +534,13 @@ again: | |||
570 | * | 534 | * |
571 | * The case we do have to guard against is when memory pressure made | 535 | * The case we do have to guard against is when memory pressure made |
572 | * shmem_writepage move it from filecache to swapcache beneath us: | 536 | * shmem_writepage move it from filecache to swapcache beneath us: |
573 | * an unlikely race, but we do need to retry for page_head->mapping. | 537 | * an unlikely race, but we do need to retry for page->mapping. |
574 | */ | 538 | */ |
575 | if (!page_head->mapping) { | 539 | mapping = compound_head(page)->mapping; |
576 | int shmem_swizzled = PageSwapCache(page_head); | 540 | if (!mapping) { |
577 | unlock_page(page_head); | 541 | int shmem_swizzled = PageSwapCache(page); |
578 | put_page(page_head); | 542 | unlock_page(page); |
543 | put_page(page); | ||
579 | if (shmem_swizzled) | 544 | if (shmem_swizzled) |
580 | goto again; | 545 | goto again; |
581 | return -EFAULT; | 546 | return -EFAULT; |
@@ -588,7 +553,7 @@ again: | |||
588 | * it's a read-only handle, it's expected that futexes attach to | 553 | * it's a read-only handle, it's expected that futexes attach to |
589 | * the object not the particular process. | 554 | * the object not the particular process. |
590 | */ | 555 | */ |
591 | if (PageAnon(page_head)) { | 556 | if (PageAnon(page)) { |
592 | /* | 557 | /* |
593 | * A RO anonymous page will never change and thus doesn't make | 558 | * A RO anonymous page will never change and thus doesn't make |
594 | * sense for futex operations. | 559 | * sense for futex operations. |
@@ -603,15 +568,15 @@ again: | |||
603 | key->private.address = address; | 568 | key->private.address = address; |
604 | } else { | 569 | } else { |
605 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ | 570 | key->both.offset |= FUT_OFF_INODE; /* inode-based key */ |
606 | key->shared.inode = page_head->mapping->host; | 571 | key->shared.inode = mapping->host; |
607 | key->shared.pgoff = basepage_index(page); | 572 | key->shared.pgoff = basepage_index(page); |
608 | } | 573 | } |
609 | 574 | ||
610 | get_futex_key_refs(key); /* implies MB (B) */ | 575 | get_futex_key_refs(key); /* implies MB (B) */ |
611 | 576 | ||
612 | out: | 577 | out: |
613 | unlock_page(page_head); | 578 | unlock_page(page); |
614 | put_page(page_head); | 579 | put_page(page); |
615 | return err; | 580 | return err; |
616 | } | 581 | } |
617 | 582 | ||
@@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) | |||
639 | 604 | ||
640 | down_read(&mm->mmap_sem); | 605 | down_read(&mm->mmap_sem); |
641 | ret = fixup_user_fault(current, mm, (unsigned long)uaddr, | 606 | ret = fixup_user_fault(current, mm, (unsigned long)uaddr, |
642 | FAULT_FLAG_WRITE); | 607 | FAULT_FLAG_WRITE, NULL); |
643 | up_read(&mm->mmap_sem); | 608 | up_read(&mm->mmap_sem); |
644 | 609 | ||
645 | return ret < 0 ? ret : 0; | 610 | return ret < 0 ? ret : 0; |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 7658d32c5c78..e517a16cb426 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
@@ -10,8 +10,11 @@ | |||
10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
11 | * General Public License for more details. | 11 | * General Public License for more details. |
12 | */ | 12 | */ |
13 | #include <linux/radix-tree.h> | ||
14 | #include <linux/memremap.h> | ||
13 | #include <linux/device.h> | 15 | #include <linux/device.h> |
14 | #include <linux/types.h> | 16 | #include <linux/types.h> |
17 | #include <linux/pfn_t.h> | ||
15 | #include <linux/io.h> | 18 | #include <linux/io.h> |
16 | #include <linux/mm.h> | 19 | #include <linux/mm.h> |
17 | #include <linux/memory_hotplug.h> | 20 | #include <linux/memory_hotplug.h> |
@@ -147,24 +150,127 @@ void devm_memunmap(struct device *dev, void *addr) | |||
147 | } | 150 | } |
148 | EXPORT_SYMBOL(devm_memunmap); | 151 | EXPORT_SYMBOL(devm_memunmap); |
149 | 152 | ||
153 | pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) | ||
154 | { | ||
155 | return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); | ||
156 | } | ||
157 | EXPORT_SYMBOL(phys_to_pfn_t); | ||
158 | |||
150 | #ifdef CONFIG_ZONE_DEVICE | 159 | #ifdef CONFIG_ZONE_DEVICE |
160 | static DEFINE_MUTEX(pgmap_lock); | ||
161 | static RADIX_TREE(pgmap_radix, GFP_KERNEL); | ||
162 | #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) | ||
163 | #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) | ||
164 | |||
151 | struct page_map { | 165 | struct page_map { |
152 | struct resource res; | 166 | struct resource res; |
167 | struct percpu_ref *ref; | ||
168 | struct dev_pagemap pgmap; | ||
169 | struct vmem_altmap altmap; | ||
153 | }; | 170 | }; |
154 | 171 | ||
155 | static void devm_memremap_pages_release(struct device *dev, void *res) | 172 | void get_zone_device_page(struct page *page) |
173 | { | ||
174 | percpu_ref_get(page->pgmap->ref); | ||
175 | } | ||
176 | EXPORT_SYMBOL(get_zone_device_page); | ||
177 | |||
178 | void put_zone_device_page(struct page *page) | ||
179 | { | ||
180 | put_dev_pagemap(page->pgmap); | ||
181 | } | ||
182 | EXPORT_SYMBOL(put_zone_device_page); | ||
183 | |||
184 | static void pgmap_radix_release(struct resource *res) | ||
185 | { | ||
186 | resource_size_t key; | ||
187 | |||
188 | mutex_lock(&pgmap_lock); | ||
189 | for (key = res->start; key <= res->end; key += SECTION_SIZE) | ||
190 | radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); | ||
191 | mutex_unlock(&pgmap_lock); | ||
192 | } | ||
193 | |||
194 | static unsigned long pfn_first(struct page_map *page_map) | ||
195 | { | ||
196 | struct dev_pagemap *pgmap = &page_map->pgmap; | ||
197 | const struct resource *res = &page_map->res; | ||
198 | struct vmem_altmap *altmap = pgmap->altmap; | ||
199 | unsigned long pfn; | ||
200 | |||
201 | pfn = res->start >> PAGE_SHIFT; | ||
202 | if (altmap) | ||
203 | pfn += vmem_altmap_offset(altmap); | ||
204 | return pfn; | ||
205 | } | ||
206 | |||
207 | static unsigned long pfn_end(struct page_map *page_map) | ||
208 | { | ||
209 | const struct resource *res = &page_map->res; | ||
210 | |||
211 | return (res->start + resource_size(res)) >> PAGE_SHIFT; | ||
212 | } | ||
213 | |||
214 | #define for_each_device_pfn(pfn, map) \ | ||
215 | for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) | ||
216 | |||
217 | static void devm_memremap_pages_release(struct device *dev, void *data) | ||
156 | { | 218 | { |
157 | struct page_map *page_map = res; | 219 | struct page_map *page_map = data; |
220 | struct resource *res = &page_map->res; | ||
221 | resource_size_t align_start, align_size; | ||
222 | struct dev_pagemap *pgmap = &page_map->pgmap; | ||
223 | |||
224 | if (percpu_ref_tryget_live(pgmap->ref)) { | ||
225 | dev_WARN(dev, "%s: page mapping is still live!\n", __func__); | ||
226 | percpu_ref_put(pgmap->ref); | ||
227 | } | ||
228 | |||
229 | pgmap_radix_release(res); | ||
158 | 230 | ||
159 | /* pages are dead and unused, undo the arch mapping */ | 231 | /* pages are dead and unused, undo the arch mapping */ |
160 | arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); | 232 | align_start = res->start & ~(SECTION_SIZE - 1); |
233 | align_size = ALIGN(resource_size(res), SECTION_SIZE); | ||
234 | arch_remove_memory(align_start, align_size); | ||
235 | dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, | ||
236 | "%s: failed to free all reserved pages\n", __func__); | ||
237 | } | ||
238 | |||
239 | /* assumes rcu_read_lock() held at entry */ | ||
240 | struct dev_pagemap *find_dev_pagemap(resource_size_t phys) | ||
241 | { | ||
242 | struct page_map *page_map; | ||
243 | |||
244 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
245 | |||
246 | page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); | ||
247 | return page_map ? &page_map->pgmap : NULL; | ||
161 | } | 248 | } |
162 | 249 | ||
163 | void *devm_memremap_pages(struct device *dev, struct resource *res) | 250 | /** |
251 | * devm_memremap_pages - remap and provide memmap backing for the given resource | ||
252 | * @dev: hosting device for @res | ||
253 | * @res: "host memory" address range | ||
254 | * @ref: a live per-cpu reference count | ||
255 | * @altmap: optional descriptor for allocating the memmap from @res | ||
256 | * | ||
257 | * Notes: | ||
258 | * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time | ||
259 | * (or devm release event). | ||
260 | * | ||
261 | * 2/ @res is expected to be a host memory range that could feasibly be | ||
262 | * treated as a "System RAM" range, i.e. not a device mmio range, but | ||
263 | * this is not enforced. | ||
264 | */ | ||
265 | void *devm_memremap_pages(struct device *dev, struct resource *res, | ||
266 | struct percpu_ref *ref, struct vmem_altmap *altmap) | ||
164 | { | 267 | { |
165 | int is_ram = region_intersects(res->start, resource_size(res), | 268 | int is_ram = region_intersects(res->start, resource_size(res), |
166 | "System RAM"); | 269 | "System RAM"); |
270 | resource_size_t key, align_start, align_size; | ||
271 | struct dev_pagemap *pgmap; | ||
167 | struct page_map *page_map; | 272 | struct page_map *page_map; |
273 | unsigned long pfn; | ||
168 | int error, nid; | 274 | int error, nid; |
169 | 275 | ||
170 | if (is_ram == REGION_MIXED) { | 276 | if (is_ram == REGION_MIXED) { |
@@ -176,25 +282,120 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) | |||
176 | if (is_ram == REGION_INTERSECTS) | 282 | if (is_ram == REGION_INTERSECTS) |
177 | return __va(res->start); | 283 | return __va(res->start); |
178 | 284 | ||
285 | if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { | ||
286 | dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", | ||
287 | __func__); | ||
288 | return ERR_PTR(-ENXIO); | ||
289 | } | ||
290 | |||
291 | if (!ref) | ||
292 | return ERR_PTR(-EINVAL); | ||
293 | |||
179 | page_map = devres_alloc_node(devm_memremap_pages_release, | 294 | page_map = devres_alloc_node(devm_memremap_pages_release, |
180 | sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); | 295 | sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); |
181 | if (!page_map) | 296 | if (!page_map) |
182 | return ERR_PTR(-ENOMEM); | 297 | return ERR_PTR(-ENOMEM); |
298 | pgmap = &page_map->pgmap; | ||
183 | 299 | ||
184 | memcpy(&page_map->res, res, sizeof(*res)); | 300 | memcpy(&page_map->res, res, sizeof(*res)); |
185 | 301 | ||
302 | pgmap->dev = dev; | ||
303 | if (altmap) { | ||
304 | memcpy(&page_map->altmap, altmap, sizeof(*altmap)); | ||
305 | pgmap->altmap = &page_map->altmap; | ||
306 | } | ||
307 | pgmap->ref = ref; | ||
308 | pgmap->res = &page_map->res; | ||
309 | |||
310 | mutex_lock(&pgmap_lock); | ||
311 | error = 0; | ||
312 | for (key = res->start; key <= res->end; key += SECTION_SIZE) { | ||
313 | struct dev_pagemap *dup; | ||
314 | |||
315 | rcu_read_lock(); | ||
316 | dup = find_dev_pagemap(key); | ||
317 | rcu_read_unlock(); | ||
318 | if (dup) { | ||
319 | dev_err(dev, "%s: %pr collides with mapping for %s\n", | ||
320 | __func__, res, dev_name(dup->dev)); | ||
321 | error = -EBUSY; | ||
322 | break; | ||
323 | } | ||
324 | error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, | ||
325 | page_map); | ||
326 | if (error) { | ||
327 | dev_err(dev, "%s: failed: %d\n", __func__, error); | ||
328 | break; | ||
329 | } | ||
330 | } | ||
331 | mutex_unlock(&pgmap_lock); | ||
332 | if (error) | ||
333 | goto err_radix; | ||
334 | |||
186 | nid = dev_to_node(dev); | 335 | nid = dev_to_node(dev); |
187 | if (nid < 0) | 336 | if (nid < 0) |
188 | nid = numa_mem_id(); | 337 | nid = numa_mem_id(); |
189 | 338 | ||
190 | error = arch_add_memory(nid, res->start, resource_size(res), true); | 339 | align_start = res->start & ~(SECTION_SIZE - 1); |
191 | if (error) { | 340 | align_size = ALIGN(resource_size(res), SECTION_SIZE); |
192 | devres_free(page_map); | 341 | error = arch_add_memory(nid, align_start, align_size, true); |
193 | return ERR_PTR(error); | 342 | if (error) |
194 | } | 343 | goto err_add_memory; |
195 | 344 | ||
345 | for_each_device_pfn(pfn, page_map) { | ||
346 | struct page *page = pfn_to_page(pfn); | ||
347 | |||
348 | /* ZONE_DEVICE pages must never appear on a slab lru */ | ||
349 | list_force_poison(&page->lru); | ||
350 | page->pgmap = pgmap; | ||
351 | } | ||
196 | devres_add(dev, page_map); | 352 | devres_add(dev, page_map); |
197 | return __va(res->start); | 353 | return __va(res->start); |
354 | |||
355 | err_add_memory: | ||
356 | err_radix: | ||
357 | pgmap_radix_release(res); | ||
358 | devres_free(page_map); | ||
359 | return ERR_PTR(error); | ||
198 | } | 360 | } |
199 | EXPORT_SYMBOL(devm_memremap_pages); | 361 | EXPORT_SYMBOL(devm_memremap_pages); |
362 | |||
363 | unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) | ||
364 | { | ||
365 | /* number of pfns from base where pfn_to_page() is valid */ | ||
366 | return altmap->reserve + altmap->free; | ||
367 | } | ||
368 | |||
369 | void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) | ||
370 | { | ||
371 | altmap->alloc -= nr_pfns; | ||
372 | } | ||
373 | |||
374 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
375 | struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) | ||
376 | { | ||
377 | /* | ||
378 | * 'memmap_start' is the virtual address for the first "struct | ||
379 | * page" in this range of the vmemmap array. In the case of | ||
380 | * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple | ||
381 | * pointer arithmetic, so we can perform this to_vmem_altmap() | ||
382 | * conversion without concern for the initialization state of | ||
383 | * the struct page fields. | ||
384 | */ | ||
385 | struct page *page = (struct page *) memmap_start; | ||
386 | struct dev_pagemap *pgmap; | ||
387 | |||
388 | /* | ||
389 | * Unconditionally retrieve a dev_pagemap associated with the | ||
390 | * given physical address, this is only for use in the | ||
391 | * arch_{add|remove}_memory() for setting up and tearing down | ||
392 | * the memmap. | ||
393 | */ | ||
394 | rcu_read_lock(); | ||
395 | pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); | ||
396 | rcu_read_unlock(); | ||
397 | |||
398 | return pgmap ? pgmap->altmap : NULL; | ||
399 | } | ||
400 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
200 | #endif /* CONFIG_ZONE_DEVICE */ | 401 | #endif /* CONFIG_ZONE_DEVICE */ |
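
The reworked devm_memremap_pages() above now takes a live percpu_ref and an optional vmem_altmap. As a point of reference only, a caller might wire it up roughly as in the sketch below; the driver structure and names (my_pmem_device, my_pmem_attach, my_pmem_percpu_release) are illustrative and not taken from this patch.

#include <linux/completion.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>
#include <linux/slab.h>

/* Hypothetical driver state; not part of this patch. */
struct my_pmem_device {
	struct percpu_ref	ref;	/* must be live across the pages' lifetime */
	struct completion	cmp;	/* signalled once all pages are dead */
	void			*virt_addr;
};

static void my_pmem_percpu_release(struct percpu_ref *ref)
{
	struct my_pmem_device *pmem = container_of(ref, struct my_pmem_device, ref);

	complete(&pmem->cmp);
}

static int my_pmem_attach(struct device *dev, struct resource *res)
{
	struct my_pmem_device *pmem;
	void *addr;
	int rc;

	pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL);
	if (!pmem)
		return -ENOMEM;
	init_completion(&pmem->cmp);

	/* per note 1/ in the kernel-doc: the ref must be live on entry */
	rc = percpu_ref_init(&pmem->ref, my_pmem_percpu_release, 0, GFP_KERNEL);
	if (rc)
		return rc;

	/* NULL altmap: the struct page array comes from regular memory */
	addr = devm_memremap_pages(dev, res, &pmem->ref, NULL);
	if (IS_ERR(addr)) {
		percpu_ref_exit(&pmem->ref);
		return PTR_ERR(addr);
	}
	pmem->virt_addr = addr;
	return 0;
}

At teardown such a driver would kill the percpu_ref and wait on the completion before (or from) its devm release path, so that the reference is dead by the time the pages are unmapped, as note 1/ requires.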
diff --git a/kernel/panic.c b/kernel/panic.c index b333380c6bb2..d96469de72dc 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -180,8 +180,7 @@ void panic(const char *fmt, ...) | |||
180 | * panic() is not being called from OOPS. | 180 | * panic() is not being called from OOPS. |
181 | */ | 181 | */ |
182 | debug_locks_off(); | 182 | debug_locks_off(); |
183 | console_trylock(); | 183 | console_flush_on_panic(); |
184 | console_unlock(); | ||
185 | 184 | ||
186 | if (!panic_blink) | 185 | if (!panic_blink) |
187 | panic_blink = no_blink; | 186 | panic_blink = no_blink; |
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2ce8826f1053..e79439134978 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
@@ -48,6 +48,7 @@ | |||
48 | #include <linux/uio.h> | 48 | #include <linux/uio.h> |
49 | 49 | ||
50 | #include <asm/uaccess.h> | 50 | #include <asm/uaccess.h> |
51 | #include <asm-generic/sections.h> | ||
51 | 52 | ||
52 | #define CREATE_TRACE_POINTS | 53 | #define CREATE_TRACE_POINTS |
53 | #include <trace/events/printk.h> | 54 | #include <trace/events/printk.h> |
@@ -1660,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1660 | const char *dict, size_t dictlen, | 1661 | const char *dict, size_t dictlen, |
1661 | const char *fmt, va_list args) | 1662 | const char *fmt, va_list args) |
1662 | { | 1663 | { |
1663 | static int recursion_bug; | 1664 | static bool recursion_bug; |
1664 | static char textbuf[LOG_LINE_MAX]; | 1665 | static char textbuf[LOG_LINE_MAX]; |
1665 | char *text = textbuf; | 1666 | char *text = textbuf; |
1666 | size_t text_len = 0; | 1667 | size_t text_len = 0; |
@@ -1696,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1696 | * it can be printed at the next appropriate moment: | 1697 | * it can be printed at the next appropriate moment: |
1697 | */ | 1698 | */ |
1698 | if (!oops_in_progress && !lockdep_recursing(current)) { | 1699 | if (!oops_in_progress && !lockdep_recursing(current)) { |
1699 | recursion_bug = 1; | 1700 | recursion_bug = true; |
1700 | local_irq_restore(flags); | 1701 | local_irq_restore(flags); |
1701 | return 0; | 1702 | return 0; |
1702 | } | 1703 | } |
@@ -1711,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
1711 | static const char recursion_msg[] = | 1712 | static const char recursion_msg[] = |
1712 | "BUG: recent printk recursion!"; | 1713 | "BUG: recent printk recursion!"; |
1713 | 1714 | ||
1714 | recursion_bug = 0; | 1715 | recursion_bug = false; |
1715 | /* emit KERN_CRIT message */ | 1716 | /* emit KERN_CRIT message */ |
1716 | printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, | 1717 | printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, |
1717 | NULL, 0, recursion_msg, | 1718 | NULL, 0, recursion_msg, |
@@ -2233,13 +2234,24 @@ void console_unlock(void) | |||
2233 | static u64 seen_seq; | 2234 | static u64 seen_seq; |
2234 | unsigned long flags; | 2235 | unsigned long flags; |
2235 | bool wake_klogd = false; | 2236 | bool wake_klogd = false; |
2236 | bool retry; | 2237 | bool do_cond_resched, retry; |
2237 | 2238 | ||
2238 | if (console_suspended) { | 2239 | if (console_suspended) { |
2239 | up_console_sem(); | 2240 | up_console_sem(); |
2240 | return; | 2241 | return; |
2241 | } | 2242 | } |
2242 | 2243 | ||
2244 | /* | ||
2245 | * Console drivers are called under logbuf_lock, so | ||
2246 | * @console_may_schedule should be cleared before; however, we may | ||
2247 | * end up dumping a lot of lines, for example, if called from | ||
2248 | * console registration path, and should invoke cond_resched() | ||
2249 | * between lines if allowable. Not doing so can cause a very long | ||
2250 | * scheduling stall on a slow console leading to RCU stall and | ||
2251 | * softlockup warnings which exacerbate the issue with more | ||
2252 | * messages practically incapacitating the system. | ||
2253 | */ | ||
2254 | do_cond_resched = console_may_schedule; | ||
2243 | console_may_schedule = 0; | 2255 | console_may_schedule = 0; |
2244 | 2256 | ||
2245 | /* flush buffered message fragment immediately to console */ | 2257 | /* flush buffered message fragment immediately to console */ |
@@ -2311,6 +2323,9 @@ skip: | |||
2311 | call_console_drivers(level, ext_text, ext_len, text, len); | 2323 | call_console_drivers(level, ext_text, ext_len, text, len); |
2312 | start_critical_timings(); | 2324 | start_critical_timings(); |
2313 | local_irq_restore(flags); | 2325 | local_irq_restore(flags); |
2326 | |||
2327 | if (do_cond_resched) | ||
2328 | cond_resched(); | ||
2314 | } | 2329 | } |
2315 | console_locked = 0; | 2330 | console_locked = 0; |
2316 | 2331 | ||
@@ -2378,6 +2393,25 @@ void console_unblank(void) | |||
2378 | console_unlock(); | 2393 | console_unlock(); |
2379 | } | 2394 | } |
2380 | 2395 | ||
2396 | /** | ||
2397 | * console_flush_on_panic - flush console content on panic | ||
2398 | * | ||
2399 | * Immediately output all pending messages no matter what. | ||
2400 | */ | ||
2401 | void console_flush_on_panic(void) | ||
2402 | { | ||
2403 | /* | ||
2404 | * If someone else is holding the console lock, trylock will fail | ||
2405 | * and may_schedule may be set. Ignore and proceed to unlock so | ||
2406 | * that messages are flushed out. As this can be called from any | ||
2407 | * context and we don't want to get preempted while flushing, | ||
2408 | * ensure may_schedule is cleared. | ||
2409 | */ | ||
2410 | console_trylock(); | ||
2411 | console_may_schedule = 0; | ||
2412 | console_unlock(); | ||
2413 | } | ||
2414 | |||
2381 | /* | 2415 | /* |
2382 | * Return the console tty driver structure and its associated index | 2416 | * Return the console tty driver structure and its associated index |
2383 | */ | 2417 | */ |
@@ -2658,13 +2692,36 @@ int unregister_console(struct console *console) | |||
2658 | } | 2692 | } |
2659 | EXPORT_SYMBOL(unregister_console); | 2693 | EXPORT_SYMBOL(unregister_console); |
2660 | 2694 | ||
2695 | /* | ||
2696 | * Some boot consoles access data that is in the init section and which will | ||
2697 | * be discarded after the initcalls have been run. To make sure that no code | ||
2698 | * will access this data, unregister the boot consoles in a late initcall. | ||
2699 | * | ||
2700 | * If for some reason, such as deferred probe or the driver being a loadable | ||
2701 | * module, the real console hasn't registered yet at this point, there will | ||
2702 | * be a brief interval in which no messages are logged to the console, which | ||
2703 | * makes it difficult to diagnose problems that occur during this time. | ||
2704 | * | ||
2705 | * To mitigate this problem somewhat, only unregister consoles whose memory | ||
2706 | * intersects with the init section. Note that code exists elsewhere to get | ||
2707 | * rid of the boot console as soon as the proper console shows up, so there | ||
2708 | * won't be side-effects from postponing the removal. | ||
2709 | */ | ||
2661 | static int __init printk_late_init(void) | 2710 | static int __init printk_late_init(void) |
2662 | { | 2711 | { |
2663 | struct console *con; | 2712 | struct console *con; |
2664 | 2713 | ||
2665 | for_each_console(con) { | 2714 | for_each_console(con) { |
2666 | if (!keep_bootcon && con->flags & CON_BOOT) { | 2715 | if (!keep_bootcon && con->flags & CON_BOOT) { |
2667 | unregister_console(con); | 2716 | /* |
2717 | * Make sure to unregister boot consoles whose data | ||
2718 | * resides in the init section before the init section | ||
2719 | * is discarded. Boot consoles whose data will stick | ||
2720 | * around will automatically be unregistered when the | ||
2721 | * proper console replaces them. | ||
2722 | */ | ||
2723 | if (init_section_intersects(con, sizeof(*con))) | ||
2724 | unregister_console(con); | ||
2668 | } | 2725 | } |
2669 | } | 2726 | } |
2670 | hotcpu_notifier(console_cpu_notify, 0); | 2727 | hotcpu_notifier(console_cpu_notify, 0); |
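
The printk_late_init() change above keys off init_section_intersects(), pulled in via the newly included asm-generic/sections.h. Conceptually it is just a range-overlap test against the init section; a rough equivalent is sketched below (the helper name here is illustrative, not the real one).

#include <asm-generic/sections.h>	/* __init_begin, __init_end */
#include <linux/types.h>

/* Rough equivalent of the check printk_late_init() now performs on each
 * boot console before deciding whether it must be unregistered early. */
static inline bool my_init_section_intersects(const void *virt, size_t size)
{
	const char *begin = virt;
	const char *end = begin + size;

	/* overlap of [virt, virt + size) with [__init_begin, __init_end) */
	return begin < __init_end && end > __init_begin;
}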
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index edb6de4f5908..a467e6c28a3b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -529,8 +529,6 @@ static int __init cpu_stop_init(void) | |||
529 | } | 529 | } |
530 | early_initcall(cpu_stop_init); | 530 | early_initcall(cpu_stop_init); |
531 | 531 | ||
532 | #if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) | ||
533 | |||
534 | static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) | 532 | static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) |
535 | { | 533 | { |
536 | struct multi_stop_data msdata = { | 534 | struct multi_stop_data msdata = { |
@@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, | |||
628 | mutex_unlock(&stop_cpus_mutex); | 626 | mutex_unlock(&stop_cpus_mutex); |
629 | return ret ?: done.ret; | 627 | return ret ?: done.ret; |
630 | } | 628 | } |
631 | |||
632 | #endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ | ||
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ee1ac1cc082c..f75a33f29f6e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug | |||
@@ -580,6 +580,14 @@ config DEBUG_VM_RB | |||
580 | 580 | ||
581 | If unsure, say N. | 581 | If unsure, say N. |
582 | 582 | ||
583 | config DEBUG_VM_PGFLAGS | ||
584 | bool "Debug page-flags operations" | ||
585 | depends on DEBUG_VM | ||
586 | help | ||
587 | Enables extra validation on page flags operations. | ||
588 | |||
589 | If unsure, say N. | ||
590 | |||
583 | config DEBUG_VIRTUAL | 591 | config DEBUG_VIRTUAL |
584 | bool "Debug VM translations" | 592 | bool "Debug VM translations" |
585 | depends on DEBUG_KERNEL && X86 | 593 | depends on DEBUG_KERNEL && X86 |
@@ -1589,7 +1597,6 @@ config FAULT_INJECTION_STACKTRACE_FILTER | |||
1589 | 1597 | ||
1590 | config LATENCYTOP | 1598 | config LATENCYTOP |
1591 | bool "Latency measuring infrastructure" | 1599 | bool "Latency measuring infrastructure" |
1592 | depends on HAVE_LATENCYTOP_SUPPORT | ||
1593 | depends on DEBUG_KERNEL | 1600 | depends on DEBUG_KERNEL |
1594 | depends on STACKTRACE_SUPPORT | 1601 | depends on STACKTRACE_SUPPORT |
1595 | depends on PROC_FS | 1602 | depends on PROC_FS |
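
Since the new option depends on DEBUG_VM, which in turn depends on DEBUG_KERNEL, enabling the extra page-flags validation in a test build would look roughly like this .config fragment:

CONFIG_DEBUG_KERNEL=y
CONFIG_DEBUG_VM=y
CONFIG_DEBUG_VM_PGFLAGS=y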
diff --git a/lib/kasprintf.c b/lib/kasprintf.c index f194e6e593e1..7f6c506a4942 100644 --- a/lib/kasprintf.c +++ b/lib/kasprintf.c | |||
@@ -13,19 +13,21 @@ | |||
13 | /* Simplified asprintf. */ | 13 | /* Simplified asprintf. */ |
14 | char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) | 14 | char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) |
15 | { | 15 | { |
16 | unsigned int len; | 16 | unsigned int first, second; |
17 | char *p; | 17 | char *p; |
18 | va_list aq; | 18 | va_list aq; |
19 | 19 | ||
20 | va_copy(aq, ap); | 20 | va_copy(aq, ap); |
21 | len = vsnprintf(NULL, 0, fmt, aq); | 21 | first = vsnprintf(NULL, 0, fmt, aq); |
22 | va_end(aq); | 22 | va_end(aq); |
23 | 23 | ||
24 | p = kmalloc_track_caller(len+1, gfp); | 24 | p = kmalloc_track_caller(first+1, gfp); |
25 | if (!p) | 25 | if (!p) |
26 | return NULL; | 26 | return NULL; |
27 | 27 | ||
28 | vsnprintf(p, len+1, fmt, ap); | 28 | second = vsnprintf(p, first+1, fmt, ap); |
29 | WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)", | ||
30 | first, second, fmt); | ||
29 | 31 | ||
30 | return p; | 32 | return p; |
31 | } | 33 | } |
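
The added WARN catches the case where the sizing pass and the formatting pass disagree, which typically indicates that an argument changed between the two vsnprintf() calls (for example a %s string being rewritten concurrently). A minimal caller-side illustration of the two-pass pattern the check guards (the helper name is made up):

#include <linux/kernel.h>
#include <linux/slab.h>

/* kasprintf()/kvasprintf() size the result with a first
 * vsnprintf(NULL, 0, ...) pass, then format into the allocation. If
 * "name" were modified by another thread between the two passes, the
 * lengths could differ; the new WARN now reports that instead of
 * silently truncating. */
static char *make_label(int id, const char *name)
{
	return kasprintf(GFP_KERNEL, "node-%d:%s", id, name);
}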
diff --git a/lib/list_debug.c b/lib/list_debug.c index 3859bf63561c..3345a089ef7b 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c | |||
@@ -12,6 +12,13 @@ | |||
12 | #include <linux/kernel.h> | 12 | #include <linux/kernel.h> |
13 | #include <linux/rculist.h> | 13 | #include <linux/rculist.h> |
14 | 14 | ||
15 | static struct list_head force_poison; | ||
16 | void list_force_poison(struct list_head *entry) | ||
17 | { | ||
18 | entry->next = &force_poison; | ||
19 | entry->prev = &force_poison; | ||
20 | } | ||
21 | |||
15 | /* | 22 | /* |
16 | * Insert a new entry between two known consecutive entries. | 23 | * Insert a new entry between two known consecutive entries. |
17 | * | 24 | * |
@@ -23,6 +30,8 @@ void __list_add(struct list_head *new, | |||
23 | struct list_head *prev, | 30 | struct list_head *prev, |
24 | struct list_head *next) | 31 | struct list_head *next) |
25 | { | 32 | { |
33 | WARN(new->next == &force_poison || new->prev == &force_poison, | ||
34 | "list_add attempted on force-poisoned entry\n"); | ||
26 | WARN(next->prev != prev, | 35 | WARN(next->prev != prev, |
27 | "list_add corruption. next->prev should be " | 36 | "list_add corruption. next->prev should be " |
28 | "prev (%p), but was %p. (next=%p).\n", | 37 | "prev (%p), but was %p. (next=%p).\n", |
diff --git a/lib/test_printf.c b/lib/test_printf.c index c5a666af9ba5..4f6ae60433bc 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c | |||
@@ -12,10 +12,13 @@ | |||
12 | #include <linux/slab.h> | 12 | #include <linux/slab.h> |
13 | #include <linux/string.h> | 13 | #include <linux/string.h> |
14 | 14 | ||
15 | #include <linux/bitmap.h> | ||
16 | #include <linux/dcache.h> | ||
15 | #include <linux/socket.h> | 17 | #include <linux/socket.h> |
16 | #include <linux/in.h> | 18 | #include <linux/in.h> |
17 | 19 | ||
18 | #define BUF_SIZE 256 | 20 | #define BUF_SIZE 256 |
21 | #define PAD_SIZE 16 | ||
19 | #define FILL_CHAR '$' | 22 | #define FILL_CHAR '$' |
20 | 23 | ||
21 | #define PTR1 ((void*)0x01234567) | 24 | #define PTR1 ((void*)0x01234567) |
@@ -39,6 +42,7 @@ | |||
39 | static unsigned total_tests __initdata; | 42 | static unsigned total_tests __initdata; |
40 | static unsigned failed_tests __initdata; | 43 | static unsigned failed_tests __initdata; |
41 | static char *test_buffer __initdata; | 44 | static char *test_buffer __initdata; |
45 | static char *alloced_buffer __initdata; | ||
42 | 46 | ||
43 | static int __printf(4, 0) __init | 47 | static int __printf(4, 0) __init |
44 | do_test(int bufsize, const char *expect, int elen, | 48 | do_test(int bufsize, const char *expect, int elen, |
@@ -49,7 +53,7 @@ do_test(int bufsize, const char *expect, int elen, | |||
49 | 53 | ||
50 | total_tests++; | 54 | total_tests++; |
51 | 55 | ||
52 | memset(test_buffer, FILL_CHAR, BUF_SIZE); | 56 | memset(alloced_buffer, FILL_CHAR, BUF_SIZE + 2*PAD_SIZE); |
53 | va_copy(aq, ap); | 57 | va_copy(aq, ap); |
54 | ret = vsnprintf(test_buffer, bufsize, fmt, aq); | 58 | ret = vsnprintf(test_buffer, bufsize, fmt, aq); |
55 | va_end(aq); | 59 | va_end(aq); |
@@ -60,8 +64,13 @@ do_test(int bufsize, const char *expect, int elen, | |||
60 | return 1; | 64 | return 1; |
61 | } | 65 | } |
62 | 66 | ||
67 | if (memchr_inv(alloced_buffer, FILL_CHAR, PAD_SIZE)) { | ||
68 | pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote before buffer\n", bufsize, fmt); | ||
69 | return 1; | ||
70 | } | ||
71 | |||
63 | if (!bufsize) { | 72 | if (!bufsize) { |
64 | if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE)) { | 73 | if (memchr_inv(test_buffer, FILL_CHAR, BUF_SIZE + PAD_SIZE)) { |
65 | pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", | 74 | pr_warn("vsnprintf(buf, 0, \"%s\", ...) wrote to buffer\n", |
66 | fmt); | 75 | fmt); |
67 | return 1; | 76 | return 1; |
@@ -76,6 +85,12 @@ do_test(int bufsize, const char *expect, int elen, | |||
76 | return 1; | 85 | return 1; |
77 | } | 86 | } |
78 | 87 | ||
88 | if (memchr_inv(test_buffer + written + 1, FILL_CHAR, BUF_SIZE + PAD_SIZE - (written + 1))) { | ||
89 | pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote beyond the nul-terminator\n", | ||
90 | bufsize, fmt); | ||
91 | return 1; | ||
92 | } | ||
93 | |||
79 | if (memcmp(test_buffer, expect, written)) { | 94 | if (memcmp(test_buffer, expect, written)) { |
80 | pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", | 95 | pr_warn("vsnprintf(buf, %d, \"%s\", ...) wrote '%s', expected '%.*s'\n", |
81 | bufsize, fmt, test_buffer, written, expect); | 96 | bufsize, fmt, test_buffer, written, expect); |
@@ -91,7 +106,12 @@ __test(const char *expect, int elen, const char *fmt, ...) | |||
91 | int rand; | 106 | int rand; |
92 | char *p; | 107 | char *p; |
93 | 108 | ||
94 | BUG_ON(elen >= BUF_SIZE); | 109 | if (elen >= BUF_SIZE) { |
110 | pr_err("error in test suite: expected output length %d too long. Format was '%s'.\n", | ||
111 | elen, fmt); | ||
112 | failed_tests++; | ||
113 | return; | ||
114 | } | ||
95 | 115 | ||
96 | va_start(ap, fmt); | 116 | va_start(ap, fmt); |
97 | 117 | ||
@@ -109,6 +129,7 @@ __test(const char *expect, int elen, const char *fmt, ...) | |||
109 | 129 | ||
110 | p = kvasprintf(GFP_KERNEL, fmt, ap); | 130 | p = kvasprintf(GFP_KERNEL, fmt, ap); |
111 | if (p) { | 131 | if (p) { |
132 | total_tests++; | ||
112 | if (memcmp(p, expect, elen+1)) { | 133 | if (memcmp(p, expect, elen+1)) { |
113 | pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", | 134 | pr_warn("kvasprintf(..., \"%s\", ...) returned '%s', expected '%s'\n", |
114 | fmt, p, expect); | 135 | fmt, p, expect); |
@@ -140,6 +161,30 @@ test_number(void) | |||
140 | test("0x1234abcd ", "%#-12x", 0x1234abcd); | 161 | test("0x1234abcd ", "%#-12x", 0x1234abcd); |
141 | test(" 0x1234abcd", "%#12x", 0x1234abcd); | 162 | test(" 0x1234abcd", "%#12x", 0x1234abcd); |
142 | test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234); | 163 | test("0|001| 12|+123| 1234|-123|-1234", "%d|%03d|%3d|%+d|% d|%+d|% d", 0, 1, 12, 123, 1234, -123, -1234); |
164 | test("0|1|1|128|255", "%hhu|%hhu|%hhu|%hhu|%hhu", 0, 1, 257, 128, -1); | ||
165 | test("0|1|1|-128|-1", "%hhd|%hhd|%hhd|%hhd|%hhd", 0, 1, 257, 128, -1); | ||
166 | test("2015122420151225", "%ho%ho%#ho", 1037, 5282, -11627); | ||
167 | /* | ||
168 | * POSIX/C99: »The result of converting zero with an explicit | ||
169 | * precision of zero shall be no characters.« Hence the output | ||
170 | * from the below test should really be "00|0||| ". However, | ||
171 | * the kernel's printf also produces a single 0 in that | ||
172 | * case. This test case simply documents the current | ||
173 | * behaviour. | ||
174 | */ | ||
175 | test("00|0|0|0|0", "%.2d|%.1d|%.0d|%.*d|%1.0d", 0, 0, 0, 0, 0, 0); | ||
176 | #ifndef __CHAR_UNSIGNED__ | ||
177 | { | ||
178 | /* | ||
179 | * Passing a 'char' to a %02x specifier doesn't do | ||
180 | * what was presumably the intention when char is | ||
181 | * signed and the value is negative. One must either & | ||
182 | * with 0xff or cast to u8. | ||
183 | */ | ||
184 | char val = -16; | ||
185 | test("0xfffffff0|0xf0|0xf0", "%#02x|%#02x|%#02x", val, val & 0xff, (u8)val); | ||
186 | } | ||
187 | #endif | ||
143 | } | 188 | } |
144 | 189 | ||
145 | static void __init | 190 | static void __init |
@@ -148,14 +193,23 @@ test_string(void) | |||
148 | test("", "%s%.0s", "", "123"); | 193 | test("", "%s%.0s", "", "123"); |
149 | test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456"); | 194 | test("ABCD|abc|123", "%s|%.3s|%.*s", "ABCD", "abcdef", 3, "123456"); |
150 | test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5"); | 195 | test("1 | 2|3 | 4|5 ", "%-3s|%3s|%-*s|%*s|%*s", "1", "2", 3, "3", 3, "4", -3, "5"); |
196 | test("1234 ", "%-10.4s", "123456"); | ||
197 | test(" 1234", "%10.4s", "123456"); | ||
151 | /* | 198 | /* |
152 | * POSIX and C99 say that a missing precision should be | 199 | * POSIX and C99 say that a negative precision (which is only |
153 | * treated as a precision of 0. However, the kernel's printf | 200 | * possible to pass via a * argument) should be treated as if |
154 | * implementation treats this case as if the . wasn't | 201 | * the precision wasn't present, and that if the precision is |
155 | * present. Let's add a test case documenting the current | 202 | * omitted (as in %.s), the precision should be taken to be |
156 | * behaviour; should anyone ever feel the need to follow the | 203 | * 0. However, the kernel's printf does exactly the opposite, |
157 | * standards more closely, this can be revisited. | 204 | * treating a negative precision as 0 and treating an omitted |
205 | * precision specifier as if no precision was given. | ||
206 | * | ||
207 | * These test cases document the current behaviour; should | ||
208 | * anyone ever feel the need to follow the standards more | ||
209 | * closely, this can be revisited. | ||
158 | */ | 210 | */ |
211 | test(" ", "%4.*s", -5, "123456"); | ||
212 | test("123456", "%.s", "123456"); | ||
159 | test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c"); | 213 | test("a||", "%.s|%.0s|%.*s", "a", "b", 0, "c"); |
160 | test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c"); | 214 | test("a | | ", "%-3.s|%-3.0s|%-3.*s", "a", "b", 0, "c"); |
161 | } | 215 | } |
@@ -273,9 +327,35 @@ uuid(void) | |||
273 | test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid); | 327 | test("03020100-0504-0706-0809-0A0B0C0D0E0F", "%pUL", uuid); |
274 | } | 328 | } |
275 | 329 | ||
330 | static struct dentry test_dentry[4] __initdata = { | ||
331 | { .d_parent = &test_dentry[0], | ||
332 | .d_name = QSTR_INIT(test_dentry[0].d_iname, 3), | ||
333 | .d_iname = "foo" }, | ||
334 | { .d_parent = &test_dentry[0], | ||
335 | .d_name = QSTR_INIT(test_dentry[1].d_iname, 5), | ||
336 | .d_iname = "bravo" }, | ||
337 | { .d_parent = &test_dentry[1], | ||
338 | .d_name = QSTR_INIT(test_dentry[2].d_iname, 4), | ||
339 | .d_iname = "alfa" }, | ||
340 | { .d_parent = &test_dentry[2], | ||
341 | .d_name = QSTR_INIT(test_dentry[3].d_iname, 5), | ||
342 | .d_iname = "romeo" }, | ||
343 | }; | ||
344 | |||
276 | static void __init | 345 | static void __init |
277 | dentry(void) | 346 | dentry(void) |
278 | { | 347 | { |
348 | test("foo", "%pd", &test_dentry[0]); | ||
349 | test("foo", "%pd2", &test_dentry[0]); | ||
350 | |||
351 | test("romeo", "%pd", &test_dentry[3]); | ||
352 | test("alfa/romeo", "%pd2", &test_dentry[3]); | ||
353 | test("bravo/alfa/romeo", "%pd3", &test_dentry[3]); | ||
354 | test("/bravo/alfa/romeo", "%pd4", &test_dentry[3]); | ||
355 | test("/bravo/alfa", "%pd4", &test_dentry[2]); | ||
356 | |||
357 | test("bravo/alfa |bravo/alfa ", "%-12pd2|%*pd2", &test_dentry[2], -12, &test_dentry[2]); | ||
358 | test(" bravo/alfa| bravo/alfa", "%12pd2|%*pd2", &test_dentry[2], 12, &test_dentry[2]); | ||
279 | } | 359 | } |
280 | 360 | ||
281 | static void __init | 361 | static void __init |
@@ -289,6 +369,20 @@ struct_clk(void) | |||
289 | } | 369 | } |
290 | 370 | ||
291 | static void __init | 371 | static void __init |
372 | large_bitmap(void) | ||
373 | { | ||
374 | const int nbits = 1 << 16; | ||
375 | unsigned long *bits = kcalloc(BITS_TO_LONGS(nbits), sizeof(long), GFP_KERNEL); | ||
376 | if (!bits) | ||
377 | return; | ||
378 | |||
379 | bitmap_set(bits, 1, 20); | ||
380 | bitmap_set(bits, 60000, 15); | ||
381 | test("1-20,60000-60014", "%*pbl", nbits, bits); | ||
382 | kfree(bits); | ||
383 | } | ||
384 | |||
385 | static void __init | ||
292 | bitmap(void) | 386 | bitmap(void) |
293 | { | 387 | { |
294 | DECLARE_BITMAP(bits, 20); | 388 | DECLARE_BITMAP(bits, 20); |
@@ -307,6 +401,8 @@ bitmap(void) | |||
307 | bitmap_fill(bits, 20); | 401 | bitmap_fill(bits, 20); |
308 | test("fffff|fffff", "%20pb|%*pb", bits, 20, bits); | 402 | test("fffff|fffff", "%20pb|%*pb", bits, 20, bits); |
309 | test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits); | 403 | test("0-19|0-19", "%20pbl|%*pbl", bits, 20, bits); |
404 | |||
405 | large_bitmap(); | ||
310 | } | 406 | } |
311 | 407 | ||
312 | static void __init | 408 | static void __init |
@@ -337,16 +433,17 @@ test_pointer(void) | |||
337 | static int __init | 433 | static int __init |
338 | test_printf_init(void) | 434 | test_printf_init(void) |
339 | { | 435 | { |
340 | test_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); | 436 | alloced_buffer = kmalloc(BUF_SIZE + 2*PAD_SIZE, GFP_KERNEL); |
341 | if (!test_buffer) | 437 | if (!alloced_buffer) |
342 | return -ENOMEM; | 438 | return -ENOMEM; |
439 | test_buffer = alloced_buffer + PAD_SIZE; | ||
343 | 440 | ||
344 | test_basic(); | 441 | test_basic(); |
345 | test_number(); | 442 | test_number(); |
346 | test_string(); | 443 | test_string(); |
347 | test_pointer(); | 444 | test_pointer(); |
348 | 445 | ||
349 | kfree(test_buffer); | 446 | kfree(alloced_buffer); |
350 | 447 | ||
351 | if (failed_tests == 0) | 448 | if (failed_tests == 0) |
352 | pr_info("all %u tests passed\n", total_tests); | 449 | pr_info("all %u tests passed\n", total_tests); |
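
The test buffer now carries PAD_SIZE fill bytes on both sides so that memchr_inv() can detect writes before the start of the buffer as well as past the terminating NUL. Condensed, the detection scheme amounts to the stand-alone sketch below (DEMO_* names are illustrative, not the test module itself).

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>

#define DEMO_BUF	64
#define DEMO_PAD	16
#define DEMO_FILL	'$'

/* Returns true if vsnprintf() stayed within [buf, buf + written + 1). */
static bool demo_bounds_ok(const char *fmt, ...)
{
	char *alloced, *buf;
	va_list ap;
	int ret, written;
	bool ok = true;

	alloced = kmalloc(DEMO_BUF + 2 * DEMO_PAD, GFP_KERNEL);
	if (!alloced)
		return false;
	buf = alloced + DEMO_PAD;

	memset(alloced, DEMO_FILL, DEMO_BUF + 2 * DEMO_PAD);
	va_start(ap, fmt);
	ret = vsnprintf(buf, DEMO_BUF, fmt, ap);
	va_end(ap);
	written = min(ret, DEMO_BUF - 1);

	/* any non-fill byte in the leading pad means an underflow */
	if (memchr_inv(alloced, DEMO_FILL, DEMO_PAD))
		ok = false;
	/* everything beyond the NUL terminator must still be fill bytes */
	if (memchr_inv(buf + written + 1, DEMO_FILL,
		       DEMO_BUF + DEMO_PAD - (written + 1)))
		ok = false;

	kfree(alloced);
	return ok;
}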
diff --git a/lib/vsprintf.c b/lib/vsprintf.c index ac3f9476b776..48ff9c36644d 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c | |||
@@ -383,13 +383,14 @@ enum format_type { | |||
383 | }; | 383 | }; |
384 | 384 | ||
385 | struct printf_spec { | 385 | struct printf_spec { |
386 | u8 type; /* format_type enum */ | 386 | unsigned int type:8; /* format_type enum */ |
387 | u8 flags; /* flags to number() */ | 387 | signed int field_width:24; /* width of output field */ |
388 | u8 base; /* number base, 8, 10 or 16 only */ | 388 | unsigned int flags:8; /* flags to number() */ |
389 | u8 qualifier; /* number qualifier, one of 'hHlLtzZ' */ | 389 | unsigned int base:8; /* number base, 8, 10 or 16 only */ |
390 | s16 field_width; /* width of output field */ | 390 | signed int precision:16; /* # of digits/chars */ |
391 | s16 precision; /* # of digits/chars */ | 391 | } __packed; |
392 | }; | 392 | #define FIELD_WIDTH_MAX ((1 << 23) - 1) |
393 | #define PRECISION_MAX ((1 << 15) - 1) | ||
393 | 394 | ||
394 | static noinline_for_stack | 395 | static noinline_for_stack |
395 | char *number(char *buf, char *end, unsigned long long num, | 396 | char *number(char *buf, char *end, unsigned long long num, |
@@ -402,6 +403,10 @@ char *number(char *buf, char *end, unsigned long long num, | |||
402 | int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); | 403 | int need_pfx = ((spec.flags & SPECIAL) && spec.base != 10); |
403 | int i; | 404 | int i; |
404 | bool is_zero = num == 0LL; | 405 | bool is_zero = num == 0LL; |
406 | int field_width = spec.field_width; | ||
407 | int precision = spec.precision; | ||
408 | |||
409 | BUILD_BUG_ON(sizeof(struct printf_spec) != 8); | ||
405 | 410 | ||
406 | /* locase = 0 or 0x20. ORing digits or letters with 'locase' | 411 | /* locase = 0 or 0x20. ORing digits or letters with 'locase' |
407 | * produces same digits or (maybe lowercased) letters */ | 412 | * produces same digits or (maybe lowercased) letters */ |
@@ -413,20 +418,20 @@ char *number(char *buf, char *end, unsigned long long num, | |||
413 | if ((signed long long)num < 0) { | 418 | if ((signed long long)num < 0) { |
414 | sign = '-'; | 419 | sign = '-'; |
415 | num = -(signed long long)num; | 420 | num = -(signed long long)num; |
416 | spec.field_width--; | 421 | field_width--; |
417 | } else if (spec.flags & PLUS) { | 422 | } else if (spec.flags & PLUS) { |
418 | sign = '+'; | 423 | sign = '+'; |
419 | spec.field_width--; | 424 | field_width--; |
420 | } else if (spec.flags & SPACE) { | 425 | } else if (spec.flags & SPACE) { |
421 | sign = ' '; | 426 | sign = ' '; |
422 | spec.field_width--; | 427 | field_width--; |
423 | } | 428 | } |
424 | } | 429 | } |
425 | if (need_pfx) { | 430 | if (need_pfx) { |
426 | if (spec.base == 16) | 431 | if (spec.base == 16) |
427 | spec.field_width -= 2; | 432 | field_width -= 2; |
428 | else if (!is_zero) | 433 | else if (!is_zero) |
429 | spec.field_width--; | 434 | field_width--; |
430 | } | 435 | } |
431 | 436 | ||
432 | /* generate full string in tmp[], in reverse order */ | 437 | /* generate full string in tmp[], in reverse order */ |
@@ -448,12 +453,12 @@ char *number(char *buf, char *end, unsigned long long num, | |||
448 | } | 453 | } |
449 | 454 | ||
450 | /* printing 100 using %2d gives "100", not "00" */ | 455 | /* printing 100 using %2d gives "100", not "00" */ |
451 | if (i > spec.precision) | 456 | if (i > precision) |
452 | spec.precision = i; | 457 | precision = i; |
453 | /* leading space padding */ | 458 | /* leading space padding */ |
454 | spec.field_width -= spec.precision; | 459 | field_width -= precision; |
455 | if (!(spec.flags & (ZEROPAD | LEFT))) { | 460 | if (!(spec.flags & (ZEROPAD | LEFT))) { |
456 | while (--spec.field_width >= 0) { | 461 | while (--field_width >= 0) { |
457 | if (buf < end) | 462 | if (buf < end) |
458 | *buf = ' '; | 463 | *buf = ' '; |
459 | ++buf; | 464 | ++buf; |
@@ -482,14 +487,14 @@ char *number(char *buf, char *end, unsigned long long num, | |||
482 | if (!(spec.flags & LEFT)) { | 487 | if (!(spec.flags & LEFT)) { |
483 | char c = ' ' + (spec.flags & ZEROPAD); | 488 | char c = ' ' + (spec.flags & ZEROPAD); |
484 | BUILD_BUG_ON(' ' + ZEROPAD != '0'); | 489 | BUILD_BUG_ON(' ' + ZEROPAD != '0'); |
485 | while (--spec.field_width >= 0) { | 490 | while (--field_width >= 0) { |
486 | if (buf < end) | 491 | if (buf < end) |
487 | *buf = c; | 492 | *buf = c; |
488 | ++buf; | 493 | ++buf; |
489 | } | 494 | } |
490 | } | 495 | } |
491 | /* hmm even more zero padding? */ | 496 | /* hmm even more zero padding? */ |
492 | while (i <= --spec.precision) { | 497 | while (i <= --precision) { |
493 | if (buf < end) | 498 | if (buf < end) |
494 | *buf = '0'; | 499 | *buf = '0'; |
495 | ++buf; | 500 | ++buf; |
@@ -501,7 +506,7 @@ char *number(char *buf, char *end, unsigned long long num, | |||
501 | ++buf; | 506 | ++buf; |
502 | } | 507 | } |
503 | /* trailing space padding */ | 508 | /* trailing space padding */ |
504 | while (--spec.field_width >= 0) { | 509 | while (--field_width >= 0) { |
505 | if (buf < end) | 510 | if (buf < end) |
506 | *buf = ' '; | 511 | *buf = ' '; |
507 | ++buf; | 512 | ++buf; |
@@ -511,37 +516,20 @@ char *number(char *buf, char *end, unsigned long long num, | |||
511 | } | 516 | } |
512 | 517 | ||
513 | static noinline_for_stack | 518 | static noinline_for_stack |
514 | char *string(char *buf, char *end, const char *s, struct printf_spec spec) | 519 | char *special_hex_number(char *buf, char *end, unsigned long long num, int size) |
515 | { | 520 | { |
516 | int len, i; | 521 | struct printf_spec spec; |
517 | |||
518 | if ((unsigned long)s < PAGE_SIZE) | ||
519 | s = "(null)"; | ||
520 | 522 | ||
521 | len = strnlen(s, spec.precision); | 523 | spec.type = FORMAT_TYPE_PTR; |
522 | 524 | spec.field_width = 2 + 2 * size; /* 0x + hex */ | |
523 | if (!(spec.flags & LEFT)) { | 525 | spec.flags = SPECIAL | SMALL | ZEROPAD; |
524 | while (len < spec.field_width--) { | 526 | spec.base = 16; |
525 | if (buf < end) | 527 | spec.precision = -1; |
526 | *buf = ' '; | ||
527 | ++buf; | ||
528 | } | ||
529 | } | ||
530 | for (i = 0; i < len; ++i) { | ||
531 | if (buf < end) | ||
532 | *buf = *s; | ||
533 | ++buf; ++s; | ||
534 | } | ||
535 | while (len < spec.field_width--) { | ||
536 | if (buf < end) | ||
537 | *buf = ' '; | ||
538 | ++buf; | ||
539 | } | ||
540 | 528 | ||
541 | return buf; | 529 | return number(buf, end, num, spec); |
542 | } | 530 | } |
543 | 531 | ||
544 | static void widen(char *buf, char *end, unsigned len, unsigned spaces) | 532 | static void move_right(char *buf, char *end, unsigned len, unsigned spaces) |
545 | { | 533 | { |
546 | size_t size; | 534 | size_t size; |
547 | if (buf >= end) /* nowhere to put anything */ | 535 | if (buf >= end) /* nowhere to put anything */ |
@@ -559,6 +547,56 @@ static void widen(char *buf, char *end, unsigned len, unsigned spaces) | |||
559 | memset(buf, ' ', spaces); | 547 | memset(buf, ' ', spaces); |
560 | } | 548 | } |
561 | 549 | ||
550 | /* | ||
551 | * Handle field width padding for a string. | ||
552 | * @buf: current buffer position | ||
553 | * @n: length of string | ||
554 | * @end: end of output buffer | ||
555 | * @spec: for field width and flags | ||
556 | * Returns: new buffer position after padding. | ||
557 | */ | ||
558 | static noinline_for_stack | ||
559 | char *widen_string(char *buf, int n, char *end, struct printf_spec spec) | ||
560 | { | ||
561 | unsigned spaces; | ||
562 | |||
563 | if (likely(n >= spec.field_width)) | ||
564 | return buf; | ||
565 | /* we want to pad the sucker */ | ||
566 | spaces = spec.field_width - n; | ||
567 | if (!(spec.flags & LEFT)) { | ||
568 | move_right(buf - n, end, n, spaces); | ||
569 | return buf + spaces; | ||
570 | } | ||
571 | while (spaces--) { | ||
572 | if (buf < end) | ||
573 | *buf = ' '; | ||
574 | ++buf; | ||
575 | } | ||
576 | return buf; | ||
577 | } | ||
578 | |||
579 | static noinline_for_stack | ||
580 | char *string(char *buf, char *end, const char *s, struct printf_spec spec) | ||
581 | { | ||
582 | int len = 0; | ||
583 | size_t lim = spec.precision; | ||
584 | |||
585 | if ((unsigned long)s < PAGE_SIZE) | ||
586 | s = "(null)"; | ||
587 | |||
588 | while (lim--) { | ||
589 | char c = *s++; | ||
590 | if (!c) | ||
591 | break; | ||
592 | if (buf < end) | ||
593 | *buf = c; | ||
594 | ++buf; | ||
595 | ++len; | ||
596 | } | ||
597 | return widen_string(buf, len, end, spec); | ||
598 | } | ||
599 | |||
562 | static noinline_for_stack | 600 | static noinline_for_stack |
563 | char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, | 601 | char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_spec spec, |
564 | const char *fmt) | 602 | const char *fmt) |
@@ -600,20 +638,7 @@ char *dentry_name(char *buf, char *end, const struct dentry *d, struct printf_sp | |||
600 | *buf = c; | 638 | *buf = c; |
601 | } | 639 | } |
602 | rcu_read_unlock(); | 640 | rcu_read_unlock(); |
603 | if (n < spec.field_width) { | 641 | return widen_string(buf, n, end, spec); |
604 | /* we want to pad the sucker */ | ||
605 | unsigned spaces = spec.field_width - n; | ||
606 | if (!(spec.flags & LEFT)) { | ||
607 | widen(buf - n, end, n, spaces); | ||
608 | return buf + spaces; | ||
609 | } | ||
610 | while (spaces--) { | ||
611 | if (buf < end) | ||
612 | *buf = ' '; | ||
613 | ++buf; | ||
614 | } | ||
615 | } | ||
616 | return buf; | ||
617 | } | 642 | } |
618 | 643 | ||
619 | #ifdef CONFIG_BLOCK | 644 | #ifdef CONFIG_BLOCK |
@@ -659,11 +684,7 @@ char *symbol_string(char *buf, char *end, void *ptr, | |||
659 | 684 | ||
660 | return string(buf, end, sym, spec); | 685 | return string(buf, end, sym, spec); |
661 | #else | 686 | #else |
662 | spec.field_width = 2 * sizeof(void *); | 687 | return special_hex_number(buf, end, value, sizeof(void *)); |
663 | spec.flags |= SPECIAL | SMALL | ZEROPAD; | ||
664 | spec.base = 16; | ||
665 | |||
666 | return number(buf, end, value, spec); | ||
667 | #endif | 688 | #endif |
668 | } | 689 | } |
669 | 690 | ||
@@ -1324,40 +1345,45 @@ char *uuid_string(char *buf, char *end, const u8 *addr, | |||
1324 | return string(buf, end, uuid, spec); | 1345 | return string(buf, end, uuid, spec); |
1325 | } | 1346 | } |
1326 | 1347 | ||
1327 | static | 1348 | static noinline_for_stack |
1328 | char *netdev_feature_string(char *buf, char *end, const u8 *addr, | 1349 | char *netdev_bits(char *buf, char *end, const void *addr, const char *fmt) |
1329 | struct printf_spec spec) | ||
1330 | { | 1350 | { |
1331 | spec.flags |= SPECIAL | SMALL | ZEROPAD; | 1351 | unsigned long long num; |
1332 | if (spec.field_width == -1) | 1352 | int size; |
1333 | spec.field_width = 2 + 2 * sizeof(netdev_features_t); | ||
1334 | spec.base = 16; | ||
1335 | 1353 | ||
1336 | return number(buf, end, *(const netdev_features_t *)addr, spec); | 1354 | switch (fmt[1]) { |
1355 | case 'F': | ||
1356 | num = *(const netdev_features_t *)addr; | ||
1357 | size = sizeof(netdev_features_t); | ||
1358 | break; | ||
1359 | default: | ||
1360 | num = (unsigned long)addr; | ||
1361 | size = sizeof(unsigned long); | ||
1362 | break; | ||
1363 | } | ||
1364 | |||
1365 | return special_hex_number(buf, end, num, size); | ||
1337 | } | 1366 | } |
1338 | 1367 | ||
1339 | static noinline_for_stack | 1368 | static noinline_for_stack |
1340 | char *address_val(char *buf, char *end, const void *addr, | 1369 | char *address_val(char *buf, char *end, const void *addr, const char *fmt) |
1341 | struct printf_spec spec, const char *fmt) | ||
1342 | { | 1370 | { |
1343 | unsigned long long num; | 1371 | unsigned long long num; |
1344 | 1372 | int size; | |
1345 | spec.flags |= SPECIAL | SMALL | ZEROPAD; | ||
1346 | spec.base = 16; | ||
1347 | 1373 | ||
1348 | switch (fmt[1]) { | 1374 | switch (fmt[1]) { |
1349 | case 'd': | 1375 | case 'd': |
1350 | num = *(const dma_addr_t *)addr; | 1376 | num = *(const dma_addr_t *)addr; |
1351 | spec.field_width = sizeof(dma_addr_t) * 2 + 2; | 1377 | size = sizeof(dma_addr_t); |
1352 | break; | 1378 | break; |
1353 | case 'p': | 1379 | case 'p': |
1354 | default: | 1380 | default: |
1355 | num = *(const phys_addr_t *)addr; | 1381 | num = *(const phys_addr_t *)addr; |
1356 | spec.field_width = sizeof(phys_addr_t) * 2 + 2; | 1382 | size = sizeof(phys_addr_t); |
1357 | break; | 1383 | break; |
1358 | } | 1384 | } |
1359 | 1385 | ||
1360 | return number(buf, end, num, spec); | 1386 | return special_hex_number(buf, end, num, size); |
1361 | } | 1387 | } |
1362 | 1388 | ||
1363 | static noinline_for_stack | 1389 | static noinline_for_stack |
@@ -1376,10 +1402,7 @@ char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec, | |||
1376 | #ifdef CONFIG_COMMON_CLK | 1402 | #ifdef CONFIG_COMMON_CLK |
1377 | return string(buf, end, __clk_get_name(clk), spec); | 1403 | return string(buf, end, __clk_get_name(clk), spec); |
1378 | #else | 1404 | #else |
1379 | spec.base = 16; | 1405 | return special_hex_number(buf, end, (unsigned long)clk, sizeof(unsigned long)); |
1380 | spec.field_width = sizeof(unsigned long) * 2 + 2; | ||
1381 | spec.flags |= SPECIAL | SMALL | ZEROPAD; | ||
1382 | return number(buf, end, (unsigned long)clk, spec); | ||
1383 | #endif | 1406 | #endif |
1384 | } | 1407 | } |
1385 | } | 1408 | } |
@@ -1609,13 +1632,9 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, | |||
1609 | break; | 1632 | break; |
1610 | 1633 | ||
1611 | case 'N': | 1634 | case 'N': |
1612 | switch (fmt[1]) { | 1635 | return netdev_bits(buf, end, ptr, fmt); |
1613 | case 'F': | ||
1614 | return netdev_feature_string(buf, end, ptr, spec); | ||
1615 | } | ||
1616 | break; | ||
1617 | case 'a': | 1636 | case 'a': |
1618 | return address_val(buf, end, ptr, spec, fmt); | 1637 | return address_val(buf, end, ptr, fmt); |
1619 | case 'd': | 1638 | case 'd': |
1620 | return dentry_name(buf, end, ptr, spec, fmt); | 1639 | return dentry_name(buf, end, ptr, spec, fmt); |
1621 | case 'C': | 1640 | case 'C': |
@@ -1664,6 +1683,7 @@ static noinline_for_stack | |||
1664 | int format_decode(const char *fmt, struct printf_spec *spec) | 1683 | int format_decode(const char *fmt, struct printf_spec *spec) |
1665 | { | 1684 | { |
1666 | const char *start = fmt; | 1685 | const char *start = fmt; |
1686 | char qualifier; | ||
1667 | 1687 | ||
1668 | /* we finished early by reading the field width */ | 1688 | /* we finished early by reading the field width */ |
1669 | if (spec->type == FORMAT_TYPE_WIDTH) { | 1689 | if (spec->type == FORMAT_TYPE_WIDTH) { |
@@ -1746,16 +1766,16 @@ precision: | |||
1746 | 1766 | ||
1747 | qualifier: | 1767 | qualifier: |
1748 | /* get the conversion qualifier */ | 1768 | /* get the conversion qualifier */ |
1749 | spec->qualifier = -1; | 1769 | qualifier = 0; |
1750 | if (*fmt == 'h' || _tolower(*fmt) == 'l' || | 1770 | if (*fmt == 'h' || _tolower(*fmt) == 'l' || |
1751 | _tolower(*fmt) == 'z' || *fmt == 't') { | 1771 | _tolower(*fmt) == 'z' || *fmt == 't') { |
1752 | spec->qualifier = *fmt++; | 1772 | qualifier = *fmt++; |
1753 | if (unlikely(spec->qualifier == *fmt)) { | 1773 | if (unlikely(qualifier == *fmt)) { |
1754 | if (spec->qualifier == 'l') { | 1774 | if (qualifier == 'l') { |
1755 | spec->qualifier = 'L'; | 1775 | qualifier = 'L'; |
1756 | ++fmt; | 1776 | ++fmt; |
1757 | } else if (spec->qualifier == 'h') { | 1777 | } else if (qualifier == 'h') { |
1758 | spec->qualifier = 'H'; | 1778 | qualifier = 'H'; |
1759 | ++fmt; | 1779 | ++fmt; |
1760 | } | 1780 | } |
1761 | } | 1781 | } |
@@ -1812,19 +1832,19 @@ qualifier: | |||
1812 | return fmt - start; | 1832 | return fmt - start; |
1813 | } | 1833 | } |
1814 | 1834 | ||
1815 | if (spec->qualifier == 'L') | 1835 | if (qualifier == 'L') |
1816 | spec->type = FORMAT_TYPE_LONG_LONG; | 1836 | spec->type = FORMAT_TYPE_LONG_LONG; |
1817 | else if (spec->qualifier == 'l') { | 1837 | else if (qualifier == 'l') { |
1818 | BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); | 1838 | BUILD_BUG_ON(FORMAT_TYPE_ULONG + SIGN != FORMAT_TYPE_LONG); |
1819 | spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); | 1839 | spec->type = FORMAT_TYPE_ULONG + (spec->flags & SIGN); |
1820 | } else if (_tolower(spec->qualifier) == 'z') { | 1840 | } else if (_tolower(qualifier) == 'z') { |
1821 | spec->type = FORMAT_TYPE_SIZE_T; | 1841 | spec->type = FORMAT_TYPE_SIZE_T; |
1822 | } else if (spec->qualifier == 't') { | 1842 | } else if (qualifier == 't') { |
1823 | spec->type = FORMAT_TYPE_PTRDIFF; | 1843 | spec->type = FORMAT_TYPE_PTRDIFF; |
1824 | } else if (spec->qualifier == 'H') { | 1844 | } else if (qualifier == 'H') { |
1825 | BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE); | 1845 | BUILD_BUG_ON(FORMAT_TYPE_UBYTE + SIGN != FORMAT_TYPE_BYTE); |
1826 | spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); | 1846 | spec->type = FORMAT_TYPE_UBYTE + (spec->flags & SIGN); |
1827 | } else if (spec->qualifier == 'h') { | 1847 | } else if (qualifier == 'h') { |
1828 | BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT); | 1848 | BUILD_BUG_ON(FORMAT_TYPE_USHORT + SIGN != FORMAT_TYPE_SHORT); |
1829 | spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN); | 1849 | spec->type = FORMAT_TYPE_USHORT + (spec->flags & SIGN); |
1830 | } else { | 1850 | } else { |
@@ -1835,6 +1855,24 @@ qualifier: | |||
1835 | return ++fmt - start; | 1855 | return ++fmt - start; |
1836 | } | 1856 | } |
1837 | 1857 | ||
1858 | static void | ||
1859 | set_field_width(struct printf_spec *spec, int width) | ||
1860 | { | ||
1861 | spec->field_width = width; | ||
1862 | if (WARN_ONCE(spec->field_width != width, "field width %d too large", width)) { | ||
1863 | spec->field_width = clamp(width, -FIELD_WIDTH_MAX, FIELD_WIDTH_MAX); | ||
1864 | } | ||
1865 | } | ||
1866 | |||
1867 | static void | ||
1868 | set_precision(struct printf_spec *spec, int prec) | ||
1869 | { | ||
1870 | spec->precision = prec; | ||
1871 | if (WARN_ONCE(spec->precision != prec, "precision %d too large", prec)) { | ||
1872 | spec->precision = clamp(prec, 0, PRECISION_MAX); | ||
1873 | } | ||
1874 | } | ||
1875 | |||
1838 | /** | 1876 | /** |
1839 | * vsnprintf - Format a string and place it in a buffer | 1877 | * vsnprintf - Format a string and place it in a buffer |
1840 | * @buf: The buffer to place the result into | 1878 | * @buf: The buffer to place the result into |
@@ -1902,11 +1940,11 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) | |||
1902 | } | 1940 | } |
1903 | 1941 | ||
1904 | case FORMAT_TYPE_WIDTH: | 1942 | case FORMAT_TYPE_WIDTH: |
1905 | spec.field_width = va_arg(args, int); | 1943 | set_field_width(&spec, va_arg(args, int)); |
1906 | break; | 1944 | break; |
1907 | 1945 | ||
1908 | case FORMAT_TYPE_PRECISION: | 1946 | case FORMAT_TYPE_PRECISION: |
1909 | spec.precision = va_arg(args, int); | 1947 | set_precision(&spec, va_arg(args, int)); |
1910 | break; | 1948 | break; |
1911 | 1949 | ||
1912 | case FORMAT_TYPE_CHAR: { | 1950 | case FORMAT_TYPE_CHAR: { |
@@ -2346,11 +2384,11 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) | |||
2346 | } | 2384 | } |
2347 | 2385 | ||
2348 | case FORMAT_TYPE_WIDTH: | 2386 | case FORMAT_TYPE_WIDTH: |
2349 | spec.field_width = get_arg(int); | 2387 | set_field_width(&spec, get_arg(int)); |
2350 | break; | 2388 | break; |
2351 | 2389 | ||
2352 | case FORMAT_TYPE_PRECISION: | 2390 | case FORMAT_TYPE_PRECISION: |
2353 | spec.precision = get_arg(int); | 2391 | set_precision(&spec, get_arg(int)); |
2354 | break; | 2392 | break; |
2355 | 2393 | ||
2356 | case FORMAT_TYPE_CHAR: { | 2394 | case FORMAT_TYPE_CHAR: { |
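
With field_width now a signed 24-bit bitfield and precision a signed 16-bit one, out-of-range dynamic values are clamped by set_field_width()/set_precision() with a one-time warning rather than being silently truncated by an s16 store. An illustrative trigger:

#include <linux/kernel.h>

static void demo_spec_limits(void)
{
	char buf[32];

	/* 1 << 24 does not fit the 24-bit field: WARN_ONCE, then clamp
	 * to FIELD_WIDTH_MAX ((1 << 23) - 1) */
	snprintf(buf, sizeof(buf), "%*d", 1 << 24, 42);

	/* 1 << 16 does not fit the 16-bit field: WARN_ONCE, then clamp
	 * to PRECISION_MAX ((1 << 15) - 1) */
	snprintf(buf, sizeof(buf), "%.*s", 1 << 16, "x");
}

The BUILD_BUG_ON(sizeof(struct printf_spec) != 8) added in number() pins the packed layout, so the spec keeps being cheap to pass by value.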
diff --git a/mm/debug.c b/mm/debug.c index 5d2072ed8d5e..f05b2d5d6481 100644 --- a/mm/debug.c +++ b/mm/debug.c | |||
@@ -40,9 +40,6 @@ static const struct trace_print_flags pageflag_names[] = { | |||
40 | #ifdef CONFIG_MEMORY_FAILURE | 40 | #ifdef CONFIG_MEMORY_FAILURE |
41 | {1UL << PG_hwpoison, "hwpoison" }, | 41 | {1UL << PG_hwpoison, "hwpoison" }, |
42 | #endif | 42 | #endif |
43 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
44 | {1UL << PG_compound_lock, "compound_lock" }, | ||
45 | #endif | ||
46 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) | 43 | #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) |
47 | {1UL << PG_young, "young" }, | 44 | {1UL << PG_young, "young" }, |
48 | {1UL << PG_idle, "idle" }, | 45 | {1UL << PG_idle, "idle" }, |
@@ -82,9 +79,12 @@ static void dump_flags(unsigned long flags, | |||
82 | void dump_page_badflags(struct page *page, const char *reason, | 79 | void dump_page_badflags(struct page *page, const char *reason, |
83 | unsigned long badflags) | 80 | unsigned long badflags) |
84 | { | 81 | { |
85 | pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | 82 | pr_emerg("page:%p count:%d mapcount:%d mapping:%p index:%#lx", |
86 | page, atomic_read(&page->_count), page_mapcount(page), | 83 | page, atomic_read(&page->_count), page_mapcount(page), |
87 | page->mapping, page->index); | 84 | page->mapping, page->index); |
85 | if (PageCompound(page)) | ||
86 | pr_cont(" compound_mapcount: %d", compound_mapcount(page)); | ||
87 | pr_cont("\n"); | ||
88 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); | 88 | BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS); |
89 | dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); | 89 | dump_flags(page->flags, pageflag_names, ARRAY_SIZE(pageflag_names)); |
90 | if (reason) | 90 | if (reason) |
diff --git a/mm/filemap.c b/mm/filemap.c index ff42d31c891a..847ee43c2806 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -204,7 +204,7 @@ void __delete_from_page_cache(struct page *page, void *shadow, | |||
204 | __dec_zone_page_state(page, NR_FILE_PAGES); | 204 | __dec_zone_page_state(page, NR_FILE_PAGES); |
205 | if (PageSwapBacked(page)) | 205 | if (PageSwapBacked(page)) |
206 | __dec_zone_page_state(page, NR_SHMEM); | 206 | __dec_zone_page_state(page, NR_SHMEM); |
207 | BUG_ON(page_mapped(page)); | 207 | VM_BUG_ON_PAGE(page_mapped(page), page); |
208 | 208 | ||
209 | /* | 209 | /* |
210 | * At this point page must be either written or cleaned by truncate. | 210 | * At this point page must be either written or cleaned by truncate. |
@@ -618,7 +618,7 @@ static int __add_to_page_cache_locked(struct page *page, | |||
618 | 618 | ||
619 | if (!huge) { | 619 | if (!huge) { |
620 | error = mem_cgroup_try_charge(page, current->mm, | 620 | error = mem_cgroup_try_charge(page, current->mm, |
621 | gfp_mask, &memcg); | 621 | gfp_mask, &memcg, false); |
622 | if (error) | 622 | if (error) |
623 | return error; | 623 | return error; |
624 | } | 624 | } |
@@ -626,7 +626,7 @@ static int __add_to_page_cache_locked(struct page *page, | |||
626 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); | 626 | error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); |
627 | if (error) { | 627 | if (error) { |
628 | if (!huge) | 628 | if (!huge) |
629 | mem_cgroup_cancel_charge(page, memcg); | 629 | mem_cgroup_cancel_charge(page, memcg, false); |
630 | return error; | 630 | return error; |
631 | } | 631 | } |
632 | 632 | ||
@@ -645,7 +645,7 @@ static int __add_to_page_cache_locked(struct page *page, | |||
645 | __inc_zone_page_state(page, NR_FILE_PAGES); | 645 | __inc_zone_page_state(page, NR_FILE_PAGES); |
646 | spin_unlock_irq(&mapping->tree_lock); | 646 | spin_unlock_irq(&mapping->tree_lock); |
647 | if (!huge) | 647 | if (!huge) |
648 | mem_cgroup_commit_charge(page, memcg, false); | 648 | mem_cgroup_commit_charge(page, memcg, false, false); |
649 | trace_mm_filemap_add_to_page_cache(page); | 649 | trace_mm_filemap_add_to_page_cache(page); |
650 | return 0; | 650 | return 0; |
651 | err_insert: | 651 | err_insert: |
@@ -653,7 +653,7 @@ err_insert: | |||
653 | /* Leave page->index set: truncation relies upon it */ | 653 | /* Leave page->index set: truncation relies upon it */ |
654 | spin_unlock_irq(&mapping->tree_lock); | 654 | spin_unlock_irq(&mapping->tree_lock); |
655 | if (!huge) | 655 | if (!huge) |
656 | mem_cgroup_cancel_charge(page, memcg); | 656 | mem_cgroup_cancel_charge(page, memcg, false); |
657 | page_cache_release(page); | 657 | page_cache_release(page); |
658 | return error; | 658 | return error; |
659 | } | 659 | } |
@@ -682,11 +682,11 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | |||
682 | void *shadow = NULL; | 682 | void *shadow = NULL; |
683 | int ret; | 683 | int ret; |
684 | 684 | ||
685 | __set_page_locked(page); | 685 | __SetPageLocked(page); |
686 | ret = __add_to_page_cache_locked(page, mapping, offset, | 686 | ret = __add_to_page_cache_locked(page, mapping, offset, |
687 | gfp_mask, &shadow); | 687 | gfp_mask, &shadow); |
688 | if (unlikely(ret)) | 688 | if (unlikely(ret)) |
689 | __clear_page_locked(page); | 689 | __ClearPageLocked(page); |
690 | else { | 690 | else { |
691 | /* | 691 | /* |
692 | * The page might have been evicted from cache only | 692 | * The page might have been evicted from cache only |
@@ -809,6 +809,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue); | |||
809 | */ | 809 | */ |
810 | void unlock_page(struct page *page) | 810 | void unlock_page(struct page *page) |
811 | { | 811 | { |
812 | page = compound_head(page); | ||
812 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 813 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
813 | clear_bit_unlock(PG_locked, &page->flags); | 814 | clear_bit_unlock(PG_locked, &page->flags); |
814 | smp_mb__after_atomic(); | 815 | smp_mb__after_atomic(); |
@@ -873,18 +874,20 @@ EXPORT_SYMBOL_GPL(page_endio); | |||
873 | */ | 874 | */ |
874 | void __lock_page(struct page *page) | 875 | void __lock_page(struct page *page) |
875 | { | 876 | { |
876 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | 877 | struct page *page_head = compound_head(page); |
878 | DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); | ||
877 | 879 | ||
878 | __wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io, | 880 | __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io, |
879 | TASK_UNINTERRUPTIBLE); | 881 | TASK_UNINTERRUPTIBLE); |
880 | } | 882 | } |
881 | EXPORT_SYMBOL(__lock_page); | 883 | EXPORT_SYMBOL(__lock_page); |
882 | 884 | ||
883 | int __lock_page_killable(struct page *page) | 885 | int __lock_page_killable(struct page *page) |
884 | { | 886 | { |
885 | DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); | 887 | struct page *page_head = compound_head(page); |
888 | DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked); | ||
886 | 889 | ||
887 | return __wait_on_bit_lock(page_waitqueue(page), &wait, | 890 | return __wait_on_bit_lock(page_waitqueue(page_head), &wait, |
888 | bit_wait_io, TASK_KILLABLE); | 891 | bit_wait_io, TASK_KILLABLE); |
889 | } | 892 | } |
890 | EXPORT_SYMBOL_GPL(__lock_page_killable); | 893 | EXPORT_SYMBOL_GPL(__lock_page_killable); |
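The three hunks above all route the page-lock operations through compound_head(): with the reworked refcounting, tail pages no longer carry their own PG_locked bit, so locking through a tail pointer has to act on the head page. A minimal userspace sketch of that indirection follows; the struct layout and helper names are made up for illustration (the real compound_head() encoding in struct page is more involved).

#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	unsigned long flags;              /* bit 0 plays the role of PG_locked */
	struct fake_page *head;           /* NULL for a head page              */
};

static struct fake_page *head_of(struct fake_page *page)
{
	return page->head ? page->head : page;
}

static void fake_lock_page(struct fake_page *page)
{
	head_of(page)->flags |= 1UL;      /* the kernel uses atomic bit ops here */
}

static bool fake_page_locked(struct fake_page *page)
{
	return head_of(page)->flags & 1UL;
}

int main(void)
{
	struct fake_page head = { 0, NULL };
	struct fake_page tail = { 0, &head };

	fake_lock_page(&tail);            /* locking via a tail pointer ...      */
	printf("head locked: %d\n", fake_page_locked(&head)); /* ... hits the head */
	return 0;
}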
diff --git a/mm/gup.c b/mm/gup.c --- a/mm/gup.c +++ b/mm/gup.c | |||
@@ -4,6 +4,7 @@ | |||
4 | #include <linux/spinlock.h> | 4 | #include <linux/spinlock.h> |
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/memremap.h> | ||
7 | #include <linux/pagemap.h> | 8 | #include <linux/pagemap.h> |
8 | #include <linux/rmap.h> | 9 | #include <linux/rmap.h> |
9 | #include <linux/swap.h> | 10 | #include <linux/swap.h> |
@@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, | |||
62 | unsigned long address, pmd_t *pmd, unsigned int flags) | 63 | unsigned long address, pmd_t *pmd, unsigned int flags) |
63 | { | 64 | { |
64 | struct mm_struct *mm = vma->vm_mm; | 65 | struct mm_struct *mm = vma->vm_mm; |
66 | struct dev_pagemap *pgmap = NULL; | ||
65 | struct page *page; | 67 | struct page *page; |
66 | spinlock_t *ptl; | 68 | spinlock_t *ptl; |
67 | pte_t *ptep, pte; | 69 | pte_t *ptep, pte; |
@@ -98,7 +100,17 @@ retry: | |||
98 | } | 100 | } |
99 | 101 | ||
100 | page = vm_normal_page(vma, address, pte); | 102 | page = vm_normal_page(vma, address, pte); |
101 | if (unlikely(!page)) { | 103 | if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { |
104 | /* | ||
105 | * Only return device mapping pages in the FOLL_GET case since | ||
106 | * they are only valid while holding the pgmap reference. | ||
107 | */ | ||
108 | pgmap = get_dev_pagemap(pte_pfn(pte), NULL); | ||
109 | if (pgmap) | ||
110 | page = pte_page(pte); | ||
111 | else | ||
112 | goto no_page; | ||
113 | } else if (unlikely(!page)) { | ||
102 | if (flags & FOLL_DUMP) { | 114 | if (flags & FOLL_DUMP) { |
103 | /* Avoid special (like zero) pages in core dumps */ | 115 | /* Avoid special (like zero) pages in core dumps */ |
104 | page = ERR_PTR(-EFAULT); | 116 | page = ERR_PTR(-EFAULT); |
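The pgmap reference taken here is only a bridge: it keeps the ZONE_DEVICE mapping alive until the FOLL_GET reference on the page itself is taken a few lines further down, at which point it is dropped again. A toy refcount model of that hand-off; nothing below is kernel API, just plain counters standing in for the two reference counts.

#include <stdio.h>

struct refcount { int count; };

static void ref_get(struct refcount *r) { r->count++; }
static void ref_put(struct refcount *r) { r->count--; }

int main(void)
{
	struct refcount pgmap = { 1 };    /* device pagemap, held by the driver   */
	struct refcount page  = { 1 };    /* the device page itself               */

	ref_get(&pgmap);                  /* get_dev_pagemap(): pin the mapping   */
	ref_get(&page);                   /* FOLL_GET: pin the page               */
	ref_put(&pgmap);                  /* mapping can go once the page is held */

	printf("pgmap=%d page=%d\n", pgmap.count, page.count);  /* 1 and 2 */
	return 0;
}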
@@ -116,8 +128,28 @@ retry: | |||
116 | } | 128 | } |
117 | } | 129 | } |
118 | 130 | ||
119 | if (flags & FOLL_GET) | 131 | if (flags & FOLL_SPLIT && PageTransCompound(page)) { |
120 | get_page_foll(page); | 132 | int ret; |
133 | get_page(page); | ||
134 | pte_unmap_unlock(ptep, ptl); | ||
135 | lock_page(page); | ||
136 | ret = split_huge_page(page); | ||
137 | unlock_page(page); | ||
138 | put_page(page); | ||
139 | if (ret) | ||
140 | return ERR_PTR(ret); | ||
141 | goto retry; | ||
142 | } | ||
143 | |||
144 | if (flags & FOLL_GET) { | ||
145 | get_page(page); | ||
146 | |||
147 | /* drop the pgmap reference now that we hold the page */ | ||
148 | if (pgmap) { | ||
149 | put_dev_pagemap(pgmap); | ||
150 | pgmap = NULL; | ||
151 | } | ||
152 | } | ||
121 | if (flags & FOLL_TOUCH) { | 153 | if (flags & FOLL_TOUCH) { |
122 | if ((flags & FOLL_WRITE) && | 154 | if ((flags & FOLL_WRITE) && |
123 | !pte_dirty(pte) && !PageDirty(page)) | 155 | !pte_dirty(pte) && !PageDirty(page)) |
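The new FOLL_SPLIT branch cannot call split_huge_page() with the pte lock held, so it pins the page, drops the lock, splits under the page lock and then restarts the whole lookup. The retry shape, reduced to a standalone sketch; the helpers below are stand-ins, not kernel functions.

#include <stdio.h>

static int is_thp = 1;                        /* pretend state of the page      */

static int split_huge_page_model(void)        /* stand-in for split_huge_page() */
{
	is_thp = 0;
	return 0;                             /* 0: split succeeded */
}

static const char *lookup(int want_split)
{
retry:
	if (want_split && is_thp) {
		/* real code: get_page(), drop the pte lock, lock_page() first */
		if (split_huge_page_model())
			return "error";
		goto retry;                   /* walk the page table again */
	}
	return is_thp ? "huge page" : "base page";
}

int main(void)
{
	printf("%s\n", lookup(1));            /* prints "base page" */
	return 0;
}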
@@ -130,6 +162,10 @@ retry: | |||
130 | mark_page_accessed(page); | 162 | mark_page_accessed(page); |
131 | } | 163 | } |
132 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | 164 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { |
165 | /* Do not mlock pte-mapped THP */ | ||
166 | if (PageTransCompound(page)) | ||
167 | goto out; | ||
168 | |||
133 | /* | 169 | /* |
134 | * The preliminary mapping check is mainly to avoid the | 170 | * The preliminary mapping check is mainly to avoid the |
135 | * pointless overhead of lock_page on the ZERO_PAGE | 171 | * pointless overhead of lock_page on the ZERO_PAGE |
@@ -220,27 +256,45 @@ struct page *follow_page_mask(struct vm_area_struct *vma, | |||
220 | } | 256 | } |
221 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) | 257 | if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) |
222 | return no_page_table(vma, flags); | 258 | return no_page_table(vma, flags); |
223 | if (pmd_trans_huge(*pmd)) { | 259 | if (pmd_devmap(*pmd)) { |
224 | if (flags & FOLL_SPLIT) { | ||
225 | split_huge_page_pmd(vma, address, pmd); | ||
226 | return follow_page_pte(vma, address, pmd, flags); | ||
227 | } | ||
228 | ptl = pmd_lock(mm, pmd); | 260 | ptl = pmd_lock(mm, pmd); |
229 | if (likely(pmd_trans_huge(*pmd))) { | 261 | page = follow_devmap_pmd(vma, address, pmd, flags); |
230 | if (unlikely(pmd_trans_splitting(*pmd))) { | 262 | spin_unlock(ptl); |
231 | spin_unlock(ptl); | 263 | if (page) |
232 | wait_split_huge_page(vma->anon_vma, pmd); | 264 | return page; |
233 | } else { | 265 | } |
234 | page = follow_trans_huge_pmd(vma, address, | 266 | if (likely(!pmd_trans_huge(*pmd))) |
235 | pmd, flags); | 267 | return follow_page_pte(vma, address, pmd, flags); |
236 | spin_unlock(ptl); | 268 | |
237 | *page_mask = HPAGE_PMD_NR - 1; | 269 | ptl = pmd_lock(mm, pmd); |
238 | return page; | 270 | if (unlikely(!pmd_trans_huge(*pmd))) { |
239 | } | 271 | spin_unlock(ptl); |
240 | } else | 272 | return follow_page_pte(vma, address, pmd, flags); |
273 | } | ||
274 | if (flags & FOLL_SPLIT) { | ||
275 | int ret; | ||
276 | page = pmd_page(*pmd); | ||
277 | if (is_huge_zero_page(page)) { | ||
278 | spin_unlock(ptl); | ||
279 | ret = 0; | ||
280 | split_huge_pmd(vma, pmd, address); | ||
281 | } else { | ||
282 | get_page(page); | ||
241 | spin_unlock(ptl); | 283 | spin_unlock(ptl); |
284 | lock_page(page); | ||
285 | ret = split_huge_page(page); | ||
286 | unlock_page(page); | ||
287 | put_page(page); | ||
288 | } | ||
289 | |||
290 | return ret ? ERR_PTR(ret) : | ||
291 | follow_page_pte(vma, address, pmd, flags); | ||
242 | } | 292 | } |
243 | return follow_page_pte(vma, address, pmd, flags); | 293 | |
294 | page = follow_trans_huge_pmd(vma, address, pmd, flags); | ||
295 | spin_unlock(ptl); | ||
296 | *page_mask = HPAGE_PMD_NR - 1; | ||
297 | return page; | ||
244 | } | 298 | } |
245 | 299 | ||
246 | static int get_gate_page(struct mm_struct *mm, unsigned long address, | 300 | static int get_gate_page(struct mm_struct *mm, unsigned long address, |
@@ -564,6 +618,8 @@ EXPORT_SYMBOL(__get_user_pages); | |||
564 | * @mm: mm_struct of target mm | 618 | * @mm: mm_struct of target mm |
565 | * @address: user address | 619 | * @address: user address |
566 | * @fault_flags:flags to pass down to handle_mm_fault() | 620 | * @fault_flags:flags to pass down to handle_mm_fault() |
621 | * @unlocked: did we unlock the mmap_sem while retrying, may be NULL if the caller | ||
622 | * does not allow retry | ||
567 | * | 623 | * |
568 | * This is meant to be called in the specific scenario where for locking reasons | 624 | * This is meant to be called in the specific scenario where for locking reasons |
569 | * we try to access user memory in atomic context (within a pagefault_disable() | 625 | * we try to access user memory in atomic context (within a pagefault_disable() |
@@ -575,22 +631,28 @@ EXPORT_SYMBOL(__get_user_pages); | |||
575 | * The main difference with get_user_pages() is that this function will | 631 | * The main difference with get_user_pages() is that this function will |
576 | * unconditionally call handle_mm_fault() which will in turn perform all the | 632 | * unconditionally call handle_mm_fault() which will in turn perform all the |
577 | * necessary SW fixup of the dirty and young bits in the PTE, while | 633 | * necessary SW fixup of the dirty and young bits in the PTE, while |
578 | * handle_mm_fault() only guarantees to update these in the struct page. | 634 | * get_user_pages() only guarantees to update these in the struct page. |
579 | * | 635 | * |
580 | * This is important for some architectures where those bits also gate the | 636 | * This is important for some architectures where those bits also gate the |
581 | * access permission to the page because they are maintained in software. On | 637 | * access permission to the page because they are maintained in software. On |
582 | * such architectures, gup() will not be enough to make a subsequent access | 638 | * such architectures, gup() will not be enough to make a subsequent access |
583 | * succeed. | 639 | * succeed. |
584 | * | 640 | * |
585 | * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). | 641 | * This function will not return with an unlocked mmap_sem. So it does not have the |
642 | * same semantics wrt the @mm->mmap_sem as does filemap_fault(). | ||
586 | */ | 643 | */ |
587 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | 644 | int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, |
588 | unsigned long address, unsigned int fault_flags) | 645 | unsigned long address, unsigned int fault_flags, |
646 | bool *unlocked) | ||
589 | { | 647 | { |
590 | struct vm_area_struct *vma; | 648 | struct vm_area_struct *vma; |
591 | vm_flags_t vm_flags; | 649 | vm_flags_t vm_flags; |
592 | int ret; | 650 | int ret, major = 0; |
593 | 651 | ||
652 | if (unlocked) | ||
653 | fault_flags |= FAULT_FLAG_ALLOW_RETRY; | ||
654 | |||
655 | retry: | ||
594 | vma = find_extend_vma(mm, address); | 656 | vma = find_extend_vma(mm, address); |
595 | if (!vma || address < vma->vm_start) | 657 | if (!vma || address < vma->vm_start) |
596 | return -EFAULT; | 658 | return -EFAULT; |
@@ -600,6 +662,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
600 | return -EFAULT; | 662 | return -EFAULT; |
601 | 663 | ||
602 | ret = handle_mm_fault(mm, vma, address, fault_flags); | 664 | ret = handle_mm_fault(mm, vma, address, fault_flags); |
665 | major |= ret & VM_FAULT_MAJOR; | ||
603 | if (ret & VM_FAULT_ERROR) { | 666 | if (ret & VM_FAULT_ERROR) { |
604 | if (ret & VM_FAULT_OOM) | 667 | if (ret & VM_FAULT_OOM) |
605 | return -ENOMEM; | 668 | return -ENOMEM; |
@@ -609,8 +672,19 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, | |||
609 | return -EFAULT; | 672 | return -EFAULT; |
610 | BUG(); | 673 | BUG(); |
611 | } | 674 | } |
675 | |||
676 | if (ret & VM_FAULT_RETRY) { | ||
677 | down_read(&mm->mmap_sem); | ||
678 | if (!(fault_flags & FAULT_FLAG_TRIED)) { | ||
679 | *unlocked = true; | ||
680 | fault_flags &= ~FAULT_FLAG_ALLOW_RETRY; | ||
681 | fault_flags |= FAULT_FLAG_TRIED; | ||
682 | goto retry; | ||
683 | } | ||
684 | } | ||
685 | |||
612 | if (tsk) { | 686 | if (tsk) { |
613 | if (ret & VM_FAULT_MAJOR) | 687 | if (major) |
614 | tsk->maj_flt++; | 688 | tsk->maj_flt++; |
615 | else | 689 | else |
616 | tsk->min_flt++; | 690 | tsk->min_flt++; |
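The control flow added to fixup_user_fault() above, modelled as plain C so it can be stepped through: a retryable fault is attempted at most twice, the second attempt carries FAULT_FLAG_TRIED instead of FAULT_FLAG_ALLOW_RETRY, *unlocked records that the lock was dropped, and major faults are accumulated across attempts. Only the flag names are taken from the patch; everything else is a stand-in.

#include <stdbool.h>
#include <stdio.h>

#define FAULT_FLAG_ALLOW_RETRY 0x1
#define FAULT_FLAG_TRIED       0x2
#define VM_FAULT_RETRY         0x4
#define VM_FAULT_MAJOR         0x8

static int fake_handle_mm_fault(unsigned int flags)
{
	/* first attempt asks for a retry, the retried attempt succeeds */
	if ((flags & FAULT_FLAG_ALLOW_RETRY) && !(flags & FAULT_FLAG_TRIED))
		return VM_FAULT_RETRY | VM_FAULT_MAJOR;
	return 0;
}

static int fixup_model(unsigned int fault_flags, bool *unlocked)
{
	int ret, major = 0;

	if (unlocked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY;
retry:
	ret = fake_handle_mm_fault(fault_flags);
	major |= ret & VM_FAULT_MAJOR;
	if (ret & VM_FAULT_RETRY) {
		/* the real code re-takes mmap_sem here */
		if (!(fault_flags & FAULT_FLAG_TRIED)) {
			*unlocked = true;
			fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
			fault_flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}
	return major ? 1 : 0;                 /* 1: account a major fault */
}

int main(void)
{
	bool unlocked = false;
	int major = fixup_model(0, &unlocked);

	printf("major=%d unlocked=%d\n", major, unlocked);  /* major=1 unlocked=1 */
	return 0;
}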
@@ -896,7 +970,6 @@ long populate_vma_page_range(struct vm_area_struct *vma, | |||
896 | gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; | 970 | gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK; |
897 | if (vma->vm_flags & VM_LOCKONFAULT) | 971 | if (vma->vm_flags & VM_LOCKONFAULT) |
898 | gup_flags &= ~FOLL_POPULATE; | 972 | gup_flags &= ~FOLL_POPULATE; |
899 | |||
900 | /* | 973 | /* |
901 | * We want to touch writable mappings with a write fault in order | 974 | * We want to touch writable mappings with a write fault in order |
902 | * to break COW, except for shared mappings because these don't COW | 975 | * to break COW, except for shared mappings because these don't COW |
@@ -1036,9 +1109,6 @@ struct page *get_dump_page(unsigned long addr) | |||
1036 | * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free | 1109 | * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free |
1037 | * pages containing page tables. | 1110 | * pages containing page tables. |
1038 | * | 1111 | * |
1039 | * *) THP splits will broadcast an IPI, this can be achieved by overriding | ||
1040 | * pmdp_splitting_flush. | ||
1041 | * | ||
1042 | * *) ptes can be read atomically by the architecture. | 1112 | * *) ptes can be read atomically by the architecture. |
1043 | * | 1113 | * |
1044 | * *) access_ok is sufficient to validate userspace address ranges. | 1114 | * *) access_ok is sufficient to validate userspace address ranges. |
@@ -1066,7 +1136,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1066 | * for an example see gup_get_pte in arch/x86/mm/gup.c | 1136 | * for an example see gup_get_pte in arch/x86/mm/gup.c |
1067 | */ | 1137 | */ |
1068 | pte_t pte = READ_ONCE(*ptep); | 1138 | pte_t pte = READ_ONCE(*ptep); |
1069 | struct page *page; | 1139 | struct page *head, *page; |
1070 | 1140 | ||
1071 | /* | 1141 | /* |
1072 | * Similar to the PMD case below, NUMA hinting must take slow | 1142 | * Similar to the PMD case below, NUMA hinting must take slow |
@@ -1078,15 +1148,17 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1078 | 1148 | ||
1079 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); | 1149 | VM_BUG_ON(!pfn_valid(pte_pfn(pte))); |
1080 | page = pte_page(pte); | 1150 | page = pte_page(pte); |
1151 | head = compound_head(page); | ||
1081 | 1152 | ||
1082 | if (!page_cache_get_speculative(page)) | 1153 | if (!page_cache_get_speculative(head)) |
1083 | goto pte_unmap; | 1154 | goto pte_unmap; |
1084 | 1155 | ||
1085 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { | 1156 | if (unlikely(pte_val(pte) != pte_val(*ptep))) { |
1086 | put_page(page); | 1157 | put_page(head); |
1087 | goto pte_unmap; | 1158 | goto pte_unmap; |
1088 | } | 1159 | } |
1089 | 1160 | ||
1161 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | ||
1090 | pages[*nr] = page; | 1162 | pages[*nr] = page; |
1091 | (*nr)++; | 1163 | (*nr)++; |
1092 | 1164 | ||
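gup_pte_range() now pins the compound head rather than the tail page, and it keeps the existing lockless pattern: take a speculative reference, then re-read the pte and back off if it changed underneath. A userspace model of that pattern, with plain integers standing in for the atomic page counts.

#include <stdbool.h>
#include <stdio.h>

struct model_page { int refcount; };

static bool speculative_get(struct model_page *head)
{
	if (head->refcount == 0)              /* page already freed: too late */
		return false;
	head->refcount++;
	return true;
}

static bool gup_one(struct model_page *head, long pte_before, long pte_after)
{
	if (!speculative_get(head))
		return false;
	if (pte_before != pte_after) {        /* mapping changed under us */
		head->refcount--;             /* put_page() and back off  */
		return false;
	}
	return true;                          /* safe to hand out the page */
}

int main(void)
{
	struct model_page head = { .refcount = 1 };

	printf("stable pte: %d\n", gup_one(&head, 42, 42));  /* 1 */
	printf("raced pte:  %d\n", gup_one(&head, 42, 43));  /* 0 */
	return 0;
}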
@@ -1119,7 +1191,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, | |||
1119 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | 1191 | static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, |
1120 | unsigned long end, int write, struct page **pages, int *nr) | 1192 | unsigned long end, int write, struct page **pages, int *nr) |
1121 | { | 1193 | { |
1122 | struct page *head, *page, *tail; | 1194 | struct page *head, *page; |
1123 | int refs; | 1195 | int refs; |
1124 | 1196 | ||
1125 | if (write && !pmd_write(orig)) | 1197 | if (write && !pmd_write(orig)) |
@@ -1128,7 +1200,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | |||
1128 | refs = 0; | 1200 | refs = 0; |
1129 | head = pmd_page(orig); | 1201 | head = pmd_page(orig); |
1130 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); | 1202 | page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); |
1131 | tail = page; | ||
1132 | do { | 1203 | do { |
1133 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | 1204 | VM_BUG_ON_PAGE(compound_head(page) != head, page); |
1134 | pages[*nr] = page; | 1205 | pages[*nr] = page; |
@@ -1149,24 +1220,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, | |||
1149 | return 0; | 1220 | return 0; |
1150 | } | 1221 | } |
1151 | 1222 | ||
1152 | /* | ||
1153 | * Any tail pages need their mapcount reference taken before we | ||
1154 | * return. (This allows the THP code to bump their ref count when | ||
1155 | * they are split into base pages). | ||
1156 | */ | ||
1157 | while (refs--) { | ||
1158 | if (PageTail(tail)) | ||
1159 | get_huge_page_tail(tail); | ||
1160 | tail++; | ||
1161 | } | ||
1162 | |||
1163 | return 1; | 1223 | return 1; |
1164 | } | 1224 | } |
1165 | 1225 | ||
1166 | static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | 1226 | static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, |
1167 | unsigned long end, int write, struct page **pages, int *nr) | 1227 | unsigned long end, int write, struct page **pages, int *nr) |
1168 | { | 1228 | { |
1169 | struct page *head, *page, *tail; | 1229 | struct page *head, *page; |
1170 | int refs; | 1230 | int refs; |
1171 | 1231 | ||
1172 | if (write && !pud_write(orig)) | 1232 | if (write && !pud_write(orig)) |
@@ -1175,7 +1235,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | |||
1175 | refs = 0; | 1235 | refs = 0; |
1176 | head = pud_page(orig); | 1236 | head = pud_page(orig); |
1177 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); | 1237 | page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); |
1178 | tail = page; | ||
1179 | do { | 1238 | do { |
1180 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | 1239 | VM_BUG_ON_PAGE(compound_head(page) != head, page); |
1181 | pages[*nr] = page; | 1240 | pages[*nr] = page; |
@@ -1196,12 +1255,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, | |||
1196 | return 0; | 1255 | return 0; |
1197 | } | 1256 | } |
1198 | 1257 | ||
1199 | while (refs--) { | ||
1200 | if (PageTail(tail)) | ||
1201 | get_huge_page_tail(tail); | ||
1202 | tail++; | ||
1203 | } | ||
1204 | |||
1205 | return 1; | 1258 | return 1; |
1206 | } | 1259 | } |
1207 | 1260 | ||
@@ -1210,7 +1263,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1210 | struct page **pages, int *nr) | 1263 | struct page **pages, int *nr) |
1211 | { | 1264 | { |
1212 | int refs; | 1265 | int refs; |
1213 | struct page *head, *page, *tail; | 1266 | struct page *head, *page; |
1214 | 1267 | ||
1215 | if (write && !pgd_write(orig)) | 1268 | if (write && !pgd_write(orig)) |
1216 | return 0; | 1269 | return 0; |
@@ -1218,7 +1271,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1218 | refs = 0; | 1271 | refs = 0; |
1219 | head = pgd_page(orig); | 1272 | head = pgd_page(orig); |
1220 | page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); | 1273 | page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); |
1221 | tail = page; | ||
1222 | do { | 1274 | do { |
1223 | VM_BUG_ON_PAGE(compound_head(page) != head, page); | 1275 | VM_BUG_ON_PAGE(compound_head(page) != head, page); |
1224 | pages[*nr] = page; | 1276 | pages[*nr] = page; |
@@ -1239,12 +1291,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, | |||
1239 | return 0; | 1291 | return 0; |
1240 | } | 1292 | } |
1241 | 1293 | ||
1242 | while (refs--) { | ||
1243 | if (PageTail(tail)) | ||
1244 | get_huge_page_tail(tail); | ||
1245 | tail++; | ||
1246 | } | ||
1247 | |||
1248 | return 1; | 1294 | return 1; |
1249 | } | 1295 | } |
1250 | 1296 | ||
@@ -1259,7 +1305,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, | |||
1259 | pmd_t pmd = READ_ONCE(*pmdp); | 1305 | pmd_t pmd = READ_ONCE(*pmdp); |
1260 | 1306 | ||
1261 | next = pmd_addr_end(addr, end); | 1307 | next = pmd_addr_end(addr, end); |
1262 | if (pmd_none(pmd) || pmd_trans_splitting(pmd)) | 1308 | if (pmd_none(pmd)) |
1263 | return 0; | 1309 | return 0; |
1264 | 1310 | ||
1265 | if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { | 1311 | if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) { |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f952f055fdcf..b2db98136af9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
@@ -16,12 +16,16 @@ | |||
16 | #include <linux/swap.h> | 16 | #include <linux/swap.h> |
17 | #include <linux/shrinker.h> | 17 | #include <linux/shrinker.h> |
18 | #include <linux/mm_inline.h> | 18 | #include <linux/mm_inline.h> |
19 | #include <linux/swapops.h> | ||
19 | #include <linux/dax.h> | 20 | #include <linux/dax.h> |
20 | #include <linux/kthread.h> | 21 | #include <linux/kthread.h> |
21 | #include <linux/khugepaged.h> | 22 | #include <linux/khugepaged.h> |
22 | #include <linux/freezer.h> | 23 | #include <linux/freezer.h> |
24 | #include <linux/pfn_t.h> | ||
23 | #include <linux/mman.h> | 25 | #include <linux/mman.h> |
26 | #include <linux/memremap.h> | ||
24 | #include <linux/pagemap.h> | 27 | #include <linux/pagemap.h> |
28 | #include <linux/debugfs.h> | ||
25 | #include <linux/migrate.h> | 29 | #include <linux/migrate.h> |
26 | #include <linux/hashtable.h> | 30 | #include <linux/hashtable.h> |
27 | #include <linux/userfaultfd_k.h> | 31 | #include <linux/userfaultfd_k.h> |
@@ -45,6 +49,7 @@ enum scan_result { | |||
45 | SCAN_PAGE_LRU, | 49 | SCAN_PAGE_LRU, |
46 | SCAN_PAGE_LOCK, | 50 | SCAN_PAGE_LOCK, |
47 | SCAN_PAGE_ANON, | 51 | SCAN_PAGE_ANON, |
52 | SCAN_PAGE_COMPOUND, | ||
48 | SCAN_ANY_PROCESS, | 53 | SCAN_ANY_PROCESS, |
49 | SCAN_VMA_NULL, | 54 | SCAN_VMA_NULL, |
50 | SCAN_VMA_CHECK, | 55 | SCAN_VMA_CHECK, |
@@ -133,6 +138,10 @@ static struct khugepaged_scan khugepaged_scan = { | |||
133 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), | 138 | .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), |
134 | }; | 139 | }; |
135 | 140 | ||
141 | static DEFINE_SPINLOCK(split_queue_lock); | ||
142 | static LIST_HEAD(split_queue); | ||
143 | static unsigned long split_queue_len; | ||
144 | static struct shrinker deferred_split_shrinker; | ||
136 | 145 | ||
137 | static void set_recommended_min_free_kbytes(void) | 146 | static void set_recommended_min_free_kbytes(void) |
138 | { | 147 | { |
@@ -665,6 +674,9 @@ static int __init hugepage_init(void) | |||
665 | err = register_shrinker(&huge_zero_page_shrinker); | 674 | err = register_shrinker(&huge_zero_page_shrinker); |
666 | if (err) | 675 | if (err) |
667 | goto err_hzp_shrinker; | 676 | goto err_hzp_shrinker; |
677 | err = register_shrinker(&deferred_split_shrinker); | ||
678 | if (err) | ||
679 | goto err_split_shrinker; | ||
668 | 680 | ||
669 | /* | 681 | /* |
670 | * By default disable transparent hugepages on smaller systems, | 682 | * By default disable transparent hugepages on smaller systems, |
@@ -682,6 +694,8 @@ static int __init hugepage_init(void) | |||
682 | 694 | ||
683 | return 0; | 695 | return 0; |
684 | err_khugepaged: | 696 | err_khugepaged: |
697 | unregister_shrinker(&deferred_split_shrinker); | ||
698 | err_split_shrinker: | ||
685 | unregister_shrinker(&huge_zero_page_shrinker); | 699 | unregister_shrinker(&huge_zero_page_shrinker); |
686 | err_hzp_shrinker: | 700 | err_hzp_shrinker: |
687 | khugepaged_slab_exit(); | 701 | khugepaged_slab_exit(); |
@@ -738,6 +752,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) | |||
738 | return entry; | 752 | return entry; |
739 | } | 753 | } |
740 | 754 | ||
755 | static inline struct list_head *page_deferred_list(struct page *page) | ||
756 | { | ||
757 | /* | ||
758 | * ->lru in the tail pages is occupied by compound_head. | ||
759 | * Let's use ->mapping + ->index in the second tail page as list_head. | ||
760 | */ | ||
761 | return (struct list_head *)&page[2].mapping; | ||
762 | } | ||
763 | |||
764 | void prep_transhuge_page(struct page *page) | ||
765 | { | ||
766 | /* | ||
767 | * we use page->mapping and page->index in the second tail page | ||
768 | * as list_head: assuming THP order >= 2 | ||
769 | */ | ||
770 | BUILD_BUG_ON(HPAGE_PMD_ORDER < 2); | ||
771 | |||
772 | INIT_LIST_HEAD(page_deferred_list(page)); | ||
773 | set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR); | ||
774 | } | ||
775 | |||
741 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | 776 | static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, |
742 | struct vm_area_struct *vma, | 777 | struct vm_area_struct *vma, |
743 | unsigned long address, pmd_t *pmd, | 778 | unsigned long address, pmd_t *pmd, |
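page_deferred_list() relies on ->mapping and ->index being adjacent words in struct page, so together they can hold the two pointers of a list_head in the second tail page. A standalone illustration of the trick with an invented toy_page layout; the real struct page is unioned far more aggressively, and the cast is exactly the kind of type punning only the owning subsystem may do.

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

struct toy_page {
	unsigned long flags;
	void *mapping;                        /* reused as list_head.next ... */
	unsigned long index;                  /* ... and as list_head.prev    */
};

static struct list_head *deferred_list(struct toy_page *page)
{
	return (struct list_head *)&page[2].mapping;   /* second tail page */
}

int main(void)
{
	struct toy_page thp[4] = {{ 0 }};     /* head page plus three tails */
	struct list_head *lh = deferred_list(thp);

	lh->next = lh->prev = lh;             /* INIT_LIST_HEAD()           */
	printf("list empty: %d\n", lh->next == lh);              /* 1 */
	printf("fits in two words: %d\n",
	       sizeof(struct list_head) <= sizeof(void *) + sizeof(unsigned long));
	return 0;
}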
@@ -751,7 +786,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
751 | 786 | ||
752 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 787 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
753 | 788 | ||
754 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg)) { | 789 | if (mem_cgroup_try_charge(page, mm, gfp, &memcg, true)) { |
755 | put_page(page); | 790 | put_page(page); |
756 | count_vm_event(THP_FAULT_FALLBACK); | 791 | count_vm_event(THP_FAULT_FALLBACK); |
757 | return VM_FAULT_FALLBACK; | 792 | return VM_FAULT_FALLBACK; |
@@ -759,7 +794,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
759 | 794 | ||
760 | pgtable = pte_alloc_one(mm, haddr); | 795 | pgtable = pte_alloc_one(mm, haddr); |
761 | if (unlikely(!pgtable)) { | 796 | if (unlikely(!pgtable)) { |
762 | mem_cgroup_cancel_charge(page, memcg); | 797 | mem_cgroup_cancel_charge(page, memcg, true); |
763 | put_page(page); | 798 | put_page(page); |
764 | return VM_FAULT_OOM; | 799 | return VM_FAULT_OOM; |
765 | } | 800 | } |
@@ -775,7 +810,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
775 | ptl = pmd_lock(mm, pmd); | 810 | ptl = pmd_lock(mm, pmd); |
776 | if (unlikely(!pmd_none(*pmd))) { | 811 | if (unlikely(!pmd_none(*pmd))) { |
777 | spin_unlock(ptl); | 812 | spin_unlock(ptl); |
778 | mem_cgroup_cancel_charge(page, memcg); | 813 | mem_cgroup_cancel_charge(page, memcg, true); |
779 | put_page(page); | 814 | put_page(page); |
780 | pte_free(mm, pgtable); | 815 | pte_free(mm, pgtable); |
781 | } else { | 816 | } else { |
@@ -786,7 +821,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
786 | int ret; | 821 | int ret; |
787 | 822 | ||
788 | spin_unlock(ptl); | 823 | spin_unlock(ptl); |
789 | mem_cgroup_cancel_charge(page, memcg); | 824 | mem_cgroup_cancel_charge(page, memcg, true); |
790 | put_page(page); | 825 | put_page(page); |
791 | pte_free(mm, pgtable); | 826 | pte_free(mm, pgtable); |
792 | ret = handle_userfault(vma, address, flags, | 827 | ret = handle_userfault(vma, address, flags, |
@@ -797,8 +832,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, | |||
797 | 832 | ||
798 | entry = mk_huge_pmd(page, vma->vm_page_prot); | 833 | entry = mk_huge_pmd(page, vma->vm_page_prot); |
799 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 834 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
800 | page_add_new_anon_rmap(page, vma, haddr); | 835 | page_add_new_anon_rmap(page, vma, haddr, true); |
801 | mem_cgroup_commit_charge(page, memcg, false); | 836 | mem_cgroup_commit_charge(page, memcg, false, true); |
802 | lru_cache_add_active_or_unevictable(page, vma); | 837 | lru_cache_add_active_or_unevictable(page, vma); |
803 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 838 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
804 | set_pmd_at(mm, haddr, pmd, entry); | 839 | set_pmd_at(mm, haddr, pmd, entry); |
@@ -892,32 +927,33 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
892 | count_vm_event(THP_FAULT_FALLBACK); | 927 | count_vm_event(THP_FAULT_FALLBACK); |
893 | return VM_FAULT_FALLBACK; | 928 | return VM_FAULT_FALLBACK; |
894 | } | 929 | } |
930 | prep_transhuge_page(page); | ||
895 | return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, | 931 | return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp, |
896 | flags); | 932 | flags); |
897 | } | 933 | } |
898 | 934 | ||
899 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | 935 | static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
900 | pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) | 936 | pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) |
901 | { | 937 | { |
902 | struct mm_struct *mm = vma->vm_mm; | 938 | struct mm_struct *mm = vma->vm_mm; |
903 | pmd_t entry; | 939 | pmd_t entry; |
904 | spinlock_t *ptl; | 940 | spinlock_t *ptl; |
905 | 941 | ||
906 | ptl = pmd_lock(mm, pmd); | 942 | ptl = pmd_lock(mm, pmd); |
907 | if (pmd_none(*pmd)) { | 943 | entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); |
908 | entry = pmd_mkhuge(pfn_pmd(pfn, prot)); | 944 | if (pfn_t_devmap(pfn)) |
909 | if (write) { | 945 | entry = pmd_mkdevmap(entry); |
910 | entry = pmd_mkyoung(pmd_mkdirty(entry)); | 946 | if (write) { |
911 | entry = maybe_pmd_mkwrite(entry, vma); | 947 | entry = pmd_mkyoung(pmd_mkdirty(entry)); |
912 | } | 948 | entry = maybe_pmd_mkwrite(entry, vma); |
913 | set_pmd_at(mm, addr, pmd, entry); | 949 | } |
914 | update_mmu_cache_pmd(vma, addr, pmd); | 950 | set_pmd_at(mm, addr, pmd, entry); |
915 | } | 951 | update_mmu_cache_pmd(vma, addr, pmd); |
916 | spin_unlock(ptl); | 952 | spin_unlock(ptl); |
917 | } | 953 | } |
918 | 954 | ||
919 | int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | 955 | int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, |
920 | pmd_t *pmd, unsigned long pfn, bool write) | 956 | pmd_t *pmd, pfn_t pfn, bool write) |
921 | { | 957 | { |
922 | pgprot_t pgprot = vma->vm_page_prot; | 958 | pgprot_t pgprot = vma->vm_page_prot; |
923 | /* | 959 | /* |
@@ -929,7 +965,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
929 | BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == | 965 | BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == |
930 | (VM_PFNMAP|VM_MIXEDMAP)); | 966 | (VM_PFNMAP|VM_MIXEDMAP)); |
931 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); | 967 | BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); |
932 | BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); | 968 | BUG_ON(!pfn_t_devmap(pfn)); |
933 | 969 | ||
934 | if (addr < vma->vm_start || addr >= vma->vm_end) | 970 | if (addr < vma->vm_start || addr >= vma->vm_end) |
935 | return VM_FAULT_SIGBUS; | 971 | return VM_FAULT_SIGBUS; |
@@ -939,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, | |||
939 | return VM_FAULT_NOPAGE; | 975 | return VM_FAULT_NOPAGE; |
940 | } | 976 | } |
941 | 977 | ||
978 | static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, | ||
979 | pmd_t *pmd) | ||
980 | { | ||
981 | pmd_t _pmd; | ||
982 | |||
983 | /* | ||
984 | * We should set the dirty bit only for FOLL_WRITE but for now | ||
985 | * the dirty bit in the pmd is meaningless. And if the dirty | ||
986 | * bit will become meaningful and we'll only set it with | ||
987 | * FOLL_WRITE, an atomic set_bit will be required on the pmd to | ||
988 | * set the young bit, instead of the current set_pmd_at. | ||
989 | */ | ||
990 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | ||
991 | if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, | ||
992 | pmd, _pmd, 1)) | ||
993 | update_mmu_cache_pmd(vma, addr, pmd); | ||
994 | } | ||
995 | |||
996 | struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, | ||
997 | pmd_t *pmd, int flags) | ||
998 | { | ||
999 | unsigned long pfn = pmd_pfn(*pmd); | ||
1000 | struct mm_struct *mm = vma->vm_mm; | ||
1001 | struct dev_pagemap *pgmap; | ||
1002 | struct page *page; | ||
1003 | |||
1004 | assert_spin_locked(pmd_lockptr(mm, pmd)); | ||
1005 | |||
1006 | if (flags & FOLL_WRITE && !pmd_write(*pmd)) | ||
1007 | return NULL; | ||
1008 | |||
1009 | if (pmd_present(*pmd) && pmd_devmap(*pmd)) | ||
1010 | /* pass */; | ||
1011 | else | ||
1012 | return NULL; | ||
1013 | |||
1014 | if (flags & FOLL_TOUCH) | ||
1015 | touch_pmd(vma, addr, pmd); | ||
1016 | |||
1017 | /* | ||
1018 | * device mapped pages can only be returned if the | ||
1019 | * caller will manage the page reference count. | ||
1020 | */ | ||
1021 | if (!(flags & FOLL_GET)) | ||
1022 | return ERR_PTR(-EEXIST); | ||
1023 | |||
1024 | pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; | ||
1025 | pgmap = get_dev_pagemap(pfn, NULL); | ||
1026 | if (!pgmap) | ||
1027 | return ERR_PTR(-EFAULT); | ||
1028 | page = pfn_to_page(pfn); | ||
1029 | get_page(page); | ||
1030 | put_dev_pagemap(pgmap); | ||
1031 | |||
1032 | return page; | ||
1033 | } | ||
1034 | |||
942 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | 1035 | int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, |
943 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, | 1036 | pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, |
944 | struct vm_area_struct *vma) | 1037 | struct vm_area_struct *vma) |
@@ -960,7 +1053,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
960 | 1053 | ||
961 | ret = -EAGAIN; | 1054 | ret = -EAGAIN; |
962 | pmd = *src_pmd; | 1055 | pmd = *src_pmd; |
963 | if (unlikely(!pmd_trans_huge(pmd))) { | 1056 | if (unlikely(!pmd_trans_huge(pmd) && !pmd_devmap(pmd))) { |
964 | pte_free(dst_mm, pgtable); | 1057 | pte_free(dst_mm, pgtable); |
965 | goto out_unlock; | 1058 | goto out_unlock; |
966 | } | 1059 | } |
@@ -983,26 +1076,20 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
983 | goto out_unlock; | 1076 | goto out_unlock; |
984 | } | 1077 | } |
985 | 1078 | ||
986 | if (unlikely(pmd_trans_splitting(pmd))) { | 1079 | if (pmd_trans_huge(pmd)) { |
987 | /* split huge page running from under us */ | 1080 | /* thp accounting separate from pmd_devmap accounting */ |
988 | spin_unlock(src_ptl); | 1081 | src_page = pmd_page(pmd); |
989 | spin_unlock(dst_ptl); | 1082 | VM_BUG_ON_PAGE(!PageHead(src_page), src_page); |
990 | pte_free(dst_mm, pgtable); | 1083 | get_page(src_page); |
991 | 1084 | page_dup_rmap(src_page, true); | |
992 | wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ | 1085 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); |
993 | goto out; | 1086 | atomic_long_inc(&dst_mm->nr_ptes); |
1087 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | ||
994 | } | 1088 | } |
995 | src_page = pmd_page(pmd); | ||
996 | VM_BUG_ON_PAGE(!PageHead(src_page), src_page); | ||
997 | get_page(src_page); | ||
998 | page_dup_rmap(src_page); | ||
999 | add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); | ||
1000 | 1089 | ||
1001 | pmdp_set_wrprotect(src_mm, addr, src_pmd); | 1090 | pmdp_set_wrprotect(src_mm, addr, src_pmd); |
1002 | pmd = pmd_mkold(pmd_wrprotect(pmd)); | 1091 | pmd = pmd_mkold(pmd_wrprotect(pmd)); |
1003 | pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); | ||
1004 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); | 1092 | set_pmd_at(dst_mm, addr, dst_pmd, pmd); |
1005 | atomic_long_inc(&dst_mm->nr_ptes); | ||
1006 | 1093 | ||
1007 | ret = 0; | 1094 | ret = 0; |
1008 | out_unlock: | 1095 | out_unlock: |
@@ -1035,37 +1122,6 @@ unlock: | |||
1035 | spin_unlock(ptl); | 1122 | spin_unlock(ptl); |
1036 | } | 1123 | } |
1037 | 1124 | ||
1038 | /* | ||
1039 | * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages | ||
1040 | * during copy_user_huge_page()'s copy_page_rep(): in the case when | ||
1041 | * the source page gets split and a tail freed before copy completes. | ||
1042 | * Called under pmd_lock of checked pmd, so safe from splitting itself. | ||
1043 | */ | ||
1044 | static void get_user_huge_page(struct page *page) | ||
1045 | { | ||
1046 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { | ||
1047 | struct page *endpage = page + HPAGE_PMD_NR; | ||
1048 | |||
1049 | atomic_add(HPAGE_PMD_NR, &page->_count); | ||
1050 | while (++page < endpage) | ||
1051 | get_huge_page_tail(page); | ||
1052 | } else { | ||
1053 | get_page(page); | ||
1054 | } | ||
1055 | } | ||
1056 | |||
1057 | static void put_user_huge_page(struct page *page) | ||
1058 | { | ||
1059 | if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { | ||
1060 | struct page *endpage = page + HPAGE_PMD_NR; | ||
1061 | |||
1062 | while (page < endpage) | ||
1063 | put_page(page++); | ||
1064 | } else { | ||
1065 | put_page(page); | ||
1066 | } | ||
1067 | } | ||
1068 | |||
1069 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | 1125 | static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, |
1070 | struct vm_area_struct *vma, | 1126 | struct vm_area_struct *vma, |
1071 | unsigned long address, | 1127 | unsigned long address, |
@@ -1095,13 +1151,14 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1095 | vma, address, page_to_nid(page)); | 1151 | vma, address, page_to_nid(page)); |
1096 | if (unlikely(!pages[i] || | 1152 | if (unlikely(!pages[i] || |
1097 | mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, | 1153 | mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, |
1098 | &memcg))) { | 1154 | &memcg, false))) { |
1099 | if (pages[i]) | 1155 | if (pages[i]) |
1100 | put_page(pages[i]); | 1156 | put_page(pages[i]); |
1101 | while (--i >= 0) { | 1157 | while (--i >= 0) { |
1102 | memcg = (void *)page_private(pages[i]); | 1158 | memcg = (void *)page_private(pages[i]); |
1103 | set_page_private(pages[i], 0); | 1159 | set_page_private(pages[i], 0); |
1104 | mem_cgroup_cancel_charge(pages[i], memcg); | 1160 | mem_cgroup_cancel_charge(pages[i], memcg, |
1161 | false); | ||
1105 | put_page(pages[i]); | 1162 | put_page(pages[i]); |
1106 | } | 1163 | } |
1107 | kfree(pages); | 1164 | kfree(pages); |
@@ -1139,8 +1196,8 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1139 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1196 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
1140 | memcg = (void *)page_private(pages[i]); | 1197 | memcg = (void *)page_private(pages[i]); |
1141 | set_page_private(pages[i], 0); | 1198 | set_page_private(pages[i], 0); |
1142 | page_add_new_anon_rmap(pages[i], vma, haddr); | 1199 | page_add_new_anon_rmap(pages[i], vma, haddr, false); |
1143 | mem_cgroup_commit_charge(pages[i], memcg, false); | 1200 | mem_cgroup_commit_charge(pages[i], memcg, false, false); |
1144 | lru_cache_add_active_or_unevictable(pages[i], vma); | 1201 | lru_cache_add_active_or_unevictable(pages[i], vma); |
1145 | pte = pte_offset_map(&_pmd, haddr); | 1202 | pte = pte_offset_map(&_pmd, haddr); |
1146 | VM_BUG_ON(!pte_none(*pte)); | 1203 | VM_BUG_ON(!pte_none(*pte)); |
@@ -1151,7 +1208,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, | |||
1151 | 1208 | ||
1152 | smp_wmb(); /* make pte visible before pmd */ | 1209 | smp_wmb(); /* make pte visible before pmd */ |
1153 | pmd_populate(mm, pmd, pgtable); | 1210 | pmd_populate(mm, pmd, pgtable); |
1154 | page_remove_rmap(page); | 1211 | page_remove_rmap(page, true); |
1155 | spin_unlock(ptl); | 1212 | spin_unlock(ptl); |
1156 | 1213 | ||
1157 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1214 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
@@ -1168,7 +1225,7 @@ out_free_pages: | |||
1168 | for (i = 0; i < HPAGE_PMD_NR; i++) { | 1225 | for (i = 0; i < HPAGE_PMD_NR; i++) { |
1169 | memcg = (void *)page_private(pages[i]); | 1226 | memcg = (void *)page_private(pages[i]); |
1170 | set_page_private(pages[i], 0); | 1227 | set_page_private(pages[i], 0); |
1171 | mem_cgroup_cancel_charge(pages[i], memcg); | 1228 | mem_cgroup_cancel_charge(pages[i], memcg, false); |
1172 | put_page(pages[i]); | 1229 | put_page(pages[i]); |
1173 | } | 1230 | } |
1174 | kfree(pages); | 1231 | kfree(pages); |
@@ -1198,7 +1255,17 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1198 | 1255 | ||
1199 | page = pmd_page(orig_pmd); | 1256 | page = pmd_page(orig_pmd); |
1200 | VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); | 1257 | VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); |
1201 | if (page_mapcount(page) == 1) { | 1258 | /* |
1259 | * We can only reuse the page if nobody else maps the huge page or it's | ||
1260 | * part. We can do it by checking page_mapcount() on each sub-page, but | ||
1261 | * it's expensive. | ||
1262 | * The cheaper way is to check page_count() to be equal 1: every | ||
1263 | * mapcount takes a page reference, so this way we can | ||
1264 | * guarantee, that the PMD is the only mapping. | ||
1265 | * This can give false negative if somebody pinned the page, but that's | ||
1266 | * fine. | ||
1267 | */ | ||
1268 | if (page_mapcount(page) == 1 && page_count(page) == 1) { | ||
1202 | pmd_t entry; | 1269 | pmd_t entry; |
1203 | entry = pmd_mkyoung(orig_pmd); | 1270 | entry = pmd_mkyoung(orig_pmd); |
1204 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1271 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
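The new reuse test reads as counting: under the reworked refcounting every mapping contributes one page reference, so a huge page that is mapped exactly once and has page_count() == 1 cannot be reachable through anything else, and an extra pin only turns reuse into a copy, which is safe. A small model of that argument, with made-up field names rather than real kernel state.

#include <stdbool.h>
#include <stdio.h>

struct counts {
	int mapcount;                         /* number of mappings      */
	int extra_refs;                       /* pins, page cache, ...   */
};

static int page_count_model(const struct counts *c)
{
	return c->mapcount + c->extra_refs;   /* each mapping holds a reference */
}

static bool can_reuse(const struct counts *c)
{
	return c->mapcount == 1 && page_count_model(c) == 1;
}

int main(void)
{
	struct counts exclusive = { .mapcount = 1, .extra_refs = 0 };
	struct counts pinned    = { .mapcount = 1, .extra_refs = 1 };

	printf("exclusive: %d\n", can_reuse(&exclusive));  /* 1: reuse in place      */
	printf("pinned:    %d\n", can_reuse(&pinned));     /* 0: false negative, COW */
	return 0;
}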
@@ -1207,7 +1274,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1207 | ret |= VM_FAULT_WRITE; | 1274 | ret |= VM_FAULT_WRITE; |
1208 | goto out_unlock; | 1275 | goto out_unlock; |
1209 | } | 1276 | } |
1210 | get_user_huge_page(page); | 1277 | get_page(page); |
1211 | spin_unlock(ptl); | 1278 | spin_unlock(ptl); |
1212 | alloc: | 1279 | alloc: |
1213 | if (transparent_hugepage_enabled(vma) && | 1280 | if (transparent_hugepage_enabled(vma) && |
@@ -1217,30 +1284,33 @@ alloc: | |||
1217 | } else | 1284 | } else |
1218 | new_page = NULL; | 1285 | new_page = NULL; |
1219 | 1286 | ||
1220 | if (unlikely(!new_page)) { | 1287 | if (likely(new_page)) { |
1288 | prep_transhuge_page(new_page); | ||
1289 | } else { | ||
1221 | if (!page) { | 1290 | if (!page) { |
1222 | split_huge_page_pmd(vma, address, pmd); | 1291 | split_huge_pmd(vma, pmd, address); |
1223 | ret |= VM_FAULT_FALLBACK; | 1292 | ret |= VM_FAULT_FALLBACK; |
1224 | } else { | 1293 | } else { |
1225 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, | 1294 | ret = do_huge_pmd_wp_page_fallback(mm, vma, address, |
1226 | pmd, orig_pmd, page, haddr); | 1295 | pmd, orig_pmd, page, haddr); |
1227 | if (ret & VM_FAULT_OOM) { | 1296 | if (ret & VM_FAULT_OOM) { |
1228 | split_huge_page(page); | 1297 | split_huge_pmd(vma, pmd, address); |
1229 | ret |= VM_FAULT_FALLBACK; | 1298 | ret |= VM_FAULT_FALLBACK; |
1230 | } | 1299 | } |
1231 | put_user_huge_page(page); | 1300 | put_page(page); |
1232 | } | 1301 | } |
1233 | count_vm_event(THP_FAULT_FALLBACK); | 1302 | count_vm_event(THP_FAULT_FALLBACK); |
1234 | goto out; | 1303 | goto out; |
1235 | } | 1304 | } |
1236 | 1305 | ||
1237 | if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg))) { | 1306 | if (unlikely(mem_cgroup_try_charge(new_page, mm, huge_gfp, &memcg, |
1307 | true))) { | ||
1238 | put_page(new_page); | 1308 | put_page(new_page); |
1239 | if (page) { | 1309 | if (page) { |
1240 | split_huge_page(page); | 1310 | split_huge_pmd(vma, pmd, address); |
1241 | put_user_huge_page(page); | 1311 | put_page(page); |
1242 | } else | 1312 | } else |
1243 | split_huge_page_pmd(vma, address, pmd); | 1313 | split_huge_pmd(vma, pmd, address); |
1244 | ret |= VM_FAULT_FALLBACK; | 1314 | ret |= VM_FAULT_FALLBACK; |
1245 | count_vm_event(THP_FAULT_FALLBACK); | 1315 | count_vm_event(THP_FAULT_FALLBACK); |
1246 | goto out; | 1316 | goto out; |
@@ -1260,10 +1330,10 @@ alloc: | |||
1260 | 1330 | ||
1261 | spin_lock(ptl); | 1331 | spin_lock(ptl); |
1262 | if (page) | 1332 | if (page) |
1263 | put_user_huge_page(page); | 1333 | put_page(page); |
1264 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { | 1334 | if (unlikely(!pmd_same(*pmd, orig_pmd))) { |
1265 | spin_unlock(ptl); | 1335 | spin_unlock(ptl); |
1266 | mem_cgroup_cancel_charge(new_page, memcg); | 1336 | mem_cgroup_cancel_charge(new_page, memcg, true); |
1267 | put_page(new_page); | 1337 | put_page(new_page); |
1268 | goto out_mn; | 1338 | goto out_mn; |
1269 | } else { | 1339 | } else { |
@@ -1271,8 +1341,8 @@ alloc: | |||
1271 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); | 1341 | entry = mk_huge_pmd(new_page, vma->vm_page_prot); |
1272 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); | 1342 | entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); |
1273 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 1343 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); |
1274 | page_add_new_anon_rmap(new_page, vma, haddr); | 1344 | page_add_new_anon_rmap(new_page, vma, haddr, true); |
1275 | mem_cgroup_commit_charge(new_page, memcg, false); | 1345 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
1276 | lru_cache_add_active_or_unevictable(new_page, vma); | 1346 | lru_cache_add_active_or_unevictable(new_page, vma); |
1277 | set_pmd_at(mm, haddr, pmd, entry); | 1347 | set_pmd_at(mm, haddr, pmd, entry); |
1278 | update_mmu_cache_pmd(vma, address, pmd); | 1348 | update_mmu_cache_pmd(vma, address, pmd); |
@@ -1281,7 +1351,7 @@ alloc: | |||
1281 | put_huge_zero_page(); | 1351 | put_huge_zero_page(); |
1282 | } else { | 1352 | } else { |
1283 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1353 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1284 | page_remove_rmap(page); | 1354 | page_remove_rmap(page, true); |
1285 | put_page(page); | 1355 | put_page(page); |
1286 | } | 1356 | } |
1287 | ret |= VM_FAULT_WRITE; | 1357 | ret |= VM_FAULT_WRITE; |
@@ -1319,23 +1389,23 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1319 | 1389 | ||
1320 | page = pmd_page(*pmd); | 1390 | page = pmd_page(*pmd); |
1321 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1391 | VM_BUG_ON_PAGE(!PageHead(page), page); |
1322 | if (flags & FOLL_TOUCH) { | 1392 | if (flags & FOLL_TOUCH) |
1323 | pmd_t _pmd; | 1393 | touch_pmd(vma, addr, pmd); |
1394 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1324 | /* | 1395 | /* |
1325 | * We should set the dirty bit only for FOLL_WRITE but | 1396 | * We don't mlock() pte-mapped THPs. This way we can avoid |
1326 | * for now the dirty bit in the pmd is meaningless. | 1397 | * leaking mlocked pages into non-VM_LOCKED VMAs. |
1327 | * And if the dirty bit will become meaningful and | 1398 | * |
1328 | * we'll only set it with FOLL_WRITE, an atomic | 1399 | * In most cases the pmd is the only mapping of the page as we |
1329 | * set_bit will be required on the pmd to set the | 1400 | * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for |
1330 | * young bit, instead of the current set_pmd_at. | 1401 | * writable private mappings in populate_vma_page_range(). |
1402 | * | ||
1403 | * The only scenario when we have the page shared here is if we are | ||
1404 | * mlocking a read-only mapping shared over fork(). We skip | ||
1405 | * mlocking such pages. | ||
1331 | */ | 1406 | */ |
1332 | _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); | 1407 | if (compound_mapcount(page) == 1 && !PageDoubleMap(page) && |
1333 | if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, | 1408 | page->mapping && trylock_page(page)) { |
1334 | pmd, _pmd, 1)) | ||
1335 | update_mmu_cache_pmd(vma, addr, pmd); | ||
1336 | } | ||
1337 | if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { | ||
1338 | if (page->mapping && trylock_page(page)) { | ||
1339 | lru_add_drain(); | 1409 | lru_add_drain(); |
1340 | if (page->mapping) | 1410 | if (page->mapping) |
1341 | mlock_vma_page(page); | 1411 | mlock_vma_page(page); |
@@ -1345,7 +1415,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, | |||
1345 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; | 1415 | page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; |
1346 | VM_BUG_ON_PAGE(!PageCompound(page), page); | 1416 | VM_BUG_ON_PAGE(!PageCompound(page), page); |
1347 | if (flags & FOLL_GET) | 1417 | if (flags & FOLL_GET) |
1348 | get_page_foll(page); | 1418 | get_page(page); |
1349 | 1419 | ||
1350 | out: | 1420 | out: |
1351 | return page; | 1421 | return page; |
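The FOLL_MLOCK hunk above boils down to one predicate: only mlock the huge page when the pmd is its sole mapping and it has not been pte-mapped elsewhere, so mlocked pages never leak into VMAs that are not VM_LOCKED. Expressed as a pure function over invented fields:

#include <stdbool.h>
#include <stdio.h>

struct thp_state {
	int compound_mapcount;                /* pmd-level mappings of the THP    */
	bool double_map;                      /* also pte-mapped somewhere        */
	bool has_mapping;                     /* still belongs to an anon mapping */
};

static bool should_mlock(const struct thp_state *s)
{
	return s->compound_mapcount == 1 && !s->double_map && s->has_mapping;
}

int main(void)
{
	struct thp_state exclusive  = { 1, false, true };
	struct thp_state pte_mapped = { 1, true,  true };

	printf("exclusive pmd mapping: %d\n", should_mlock(&exclusive));   /* 1 */
	printf("double-mapped THP:     %d\n", should_mlock(&pte_mapped));  /* 0 */
	return 0;
}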
@@ -1480,13 +1550,84 @@ out: | |||
1480 | return 0; | 1550 | return 0; |
1481 | } | 1551 | } |
1482 | 1552 | ||
1553 | int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | ||
1554 | pmd_t *pmd, unsigned long addr, unsigned long next) | ||
1555 | |||
1556 | { | ||
1557 | spinlock_t *ptl; | ||
1558 | pmd_t orig_pmd; | ||
1559 | struct page *page; | ||
1560 | struct mm_struct *mm = tlb->mm; | ||
1561 | int ret = 0; | ||
1562 | |||
1563 | if (!pmd_trans_huge_lock(pmd, vma, &ptl)) | ||
1564 | goto out; | ||
1565 | |||
1566 | orig_pmd = *pmd; | ||
1567 | if (is_huge_zero_pmd(orig_pmd)) { | ||
1568 | ret = 1; | ||
1569 | goto out; | ||
1570 | } | ||
1571 | |||
1572 | page = pmd_page(orig_pmd); | ||
1573 | /* | ||
1574 | * If other processes are mapping this page, we can't discard | ||
1575 | * the page unless they all do MADV_FREE so let's skip the page. | ||
1576 | */ | ||
1577 | if (page_mapcount(page) != 1) | ||
1578 | goto out; | ||
1579 | |||
1580 | if (!trylock_page(page)) | ||
1581 | goto out; | ||
1582 | |||
1583 | /* | ||
1584 | * If the user wants to discard part of the THP, split it so MADV_FREE | ||
1585 | * will deactivate only them. | ||
1586 | */ | ||
1587 | if (next - addr != HPAGE_PMD_SIZE) { | ||
1588 | get_page(page); | ||
1589 | spin_unlock(ptl); | ||
1590 | if (split_huge_page(page)) { | ||
1591 | put_page(page); | ||
1592 | unlock_page(page); | ||
1593 | goto out_unlocked; | ||
1594 | } | ||
1595 | put_page(page); | ||
1596 | unlock_page(page); | ||
1597 | ret = 1; | ||
1598 | goto out_unlocked; | ||
1599 | } | ||
1600 | |||
1601 | if (PageDirty(page)) | ||
1602 | ClearPageDirty(page); | ||
1603 | unlock_page(page); | ||
1604 | |||
1605 | if (PageActive(page)) | ||
1606 | deactivate_page(page); | ||
1607 | |||
1608 | if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { | ||
1609 | orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, | ||
1610 | tlb->fullmm); | ||
1611 | orig_pmd = pmd_mkold(orig_pmd); | ||
1612 | orig_pmd = pmd_mkclean(orig_pmd); | ||
1613 | |||
1614 | set_pmd_at(mm, addr, pmd, orig_pmd); | ||
1615 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); | ||
1616 | } | ||
1617 | ret = 1; | ||
1618 | out: | ||
1619 | spin_unlock(ptl); | ||
1620 | out_unlocked: | ||
1621 | return ret; | ||
1622 | } | ||
1623 | |||
1483 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | 1624 | int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, |
1484 | pmd_t *pmd, unsigned long addr) | 1625 | pmd_t *pmd, unsigned long addr) |
1485 | { | 1626 | { |
1486 | pmd_t orig_pmd; | 1627 | pmd_t orig_pmd; |
1487 | spinlock_t *ptl; | 1628 | spinlock_t *ptl; |
1488 | 1629 | ||
1489 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) != 1) | 1630 | if (!__pmd_trans_huge_lock(pmd, vma, &ptl)) |
1490 | return 0; | 1631 | return 0; |
1491 | /* | 1632 | /* |
1492 | * For architectures like ppc64 we look at deposited pgtable | 1633 | * For architectures like ppc64 we look at deposited pgtable |
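From userspace, the path above is reached through madvise(MADV_FREE) on anonymous memory: clean pages can then be dropped lazily under memory pressure instead of being swapped. A hedged usage sketch follows; MADV_FREE is 8 in the asm-generic uapi headers, and the fallback define covers libc headers that predate it. Whether the range is handled pmd-at-a-time by madvise_free_huge_pmd() or split first depends on alignment and length, as the hunk shows.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8                       /* value in asm-generic/mman-common.h */
#endif

int main(void)
{
	size_t len = 4UL << 20;               /* 4 MiB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	memset(buf, 0xaa, len);               /* fault in and dirty the pages */

	/*
	 * Mark the range disposable. Whole, aligned PMDs can be handled
	 * without splitting; a partial range forces split_huge_page() first.
	 */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	munmap(buf, len);
	return 0;
}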
@@ -1508,7 +1649,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1508 | put_huge_zero_page(); | 1649 | put_huge_zero_page(); |
1509 | } else { | 1650 | } else { |
1510 | struct page *page = pmd_page(orig_pmd); | 1651 | struct page *page = pmd_page(orig_pmd); |
1511 | page_remove_rmap(page); | 1652 | page_remove_rmap(page, true); |
1512 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); | 1653 | VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); |
1513 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); | 1654 | add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); |
1514 | VM_BUG_ON_PAGE(!PageHead(page), page); | 1655 | VM_BUG_ON_PAGE(!PageHead(page), page); |
@@ -1520,13 +1661,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
1520 | return 1; | 1661 | return 1; |
1521 | } | 1662 | } |
1522 | 1663 | ||
1523 | int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | 1664 | bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, |
1524 | unsigned long old_addr, | 1665 | unsigned long old_addr, |
1525 | unsigned long new_addr, unsigned long old_end, | 1666 | unsigned long new_addr, unsigned long old_end, |
1526 | pmd_t *old_pmd, pmd_t *new_pmd) | 1667 | pmd_t *old_pmd, pmd_t *new_pmd) |
1527 | { | 1668 | { |
1528 | spinlock_t *old_ptl, *new_ptl; | 1669 | spinlock_t *old_ptl, *new_ptl; |
1529 | int ret = 0; | ||
1530 | pmd_t pmd; | 1670 | pmd_t pmd; |
1531 | 1671 | ||
1532 | struct mm_struct *mm = vma->vm_mm; | 1672 | struct mm_struct *mm = vma->vm_mm; |
@@ -1535,7 +1675,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1535 | (new_addr & ~HPAGE_PMD_MASK) || | 1675 | (new_addr & ~HPAGE_PMD_MASK) || |
1536 | old_end - old_addr < HPAGE_PMD_SIZE || | 1676 | old_end - old_addr < HPAGE_PMD_SIZE || |
1537 | (new_vma->vm_flags & VM_NOHUGEPAGE)) | 1677 | (new_vma->vm_flags & VM_NOHUGEPAGE)) |
1538 | goto out; | 1678 | return false; |
1539 | 1679 | ||
1540 | /* | 1680 | /* |
1541 | * The destination pmd shouldn't be established, free_pgtables() | 1681 | * The destination pmd shouldn't be established, free_pgtables() |
@@ -1543,15 +1683,14 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1543 | */ | 1683 | */ |
1544 | if (WARN_ON(!pmd_none(*new_pmd))) { | 1684 | if (WARN_ON(!pmd_none(*new_pmd))) { |
1545 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); | 1685 | VM_BUG_ON(pmd_trans_huge(*new_pmd)); |
1546 | goto out; | 1686 | return false; |
1547 | } | 1687 | } |
1548 | 1688 | ||
1549 | /* | 1689 | /* |
1550 | * We don't have to worry about the ordering of src and dst | 1690 | * We don't have to worry about the ordering of src and dst |
1551 | * ptlocks because exclusive mmap_sem prevents deadlock. | 1691 | * ptlocks because exclusive mmap_sem prevents deadlock. |
1552 | */ | 1692 | */ |
1553 | ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); | 1693 | if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) { |
1554 | if (ret == 1) { | ||
1555 | new_ptl = pmd_lockptr(mm, new_pmd); | 1694 | new_ptl = pmd_lockptr(mm, new_pmd); |
1556 | if (new_ptl != old_ptl) | 1695 | if (new_ptl != old_ptl) |
1557 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); | 1696 | spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); |
@@ -1567,9 +1706,9 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, | |||
1567 | if (new_ptl != old_ptl) | 1706 | if (new_ptl != old_ptl) |
1568 | spin_unlock(new_ptl); | 1707 | spin_unlock(new_ptl); |
1569 | spin_unlock(old_ptl); | 1708 | spin_unlock(old_ptl); |
1709 | return true; | ||
1570 | } | 1710 | } |
1571 | out: | 1711 | return false; |
1572 | return ret; | ||
1573 | } | 1712 | } |
1574 | 1713 | ||
1575 | /* | 1714 | /* |
@@ -1585,7 +1724,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1585 | spinlock_t *ptl; | 1724 | spinlock_t *ptl; |
1586 | int ret = 0; | 1725 | int ret = 0; |
1587 | 1726 | ||
1588 | if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 1727 | if (__pmd_trans_huge_lock(pmd, vma, &ptl)) { |
1589 | pmd_t entry; | 1728 | pmd_t entry; |
1590 | bool preserve_write = prot_numa && pmd_write(*pmd); | 1729 | bool preserve_write = prot_numa && pmd_write(*pmd); |
1591 | ret = 1; | 1730 | ret = 1; |
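With pmd splitting-in-progress gone, the old -1/0/+1 protocol of __pmd_trans_huge_lock() has nothing left to report, so move_huge_pmd(), change_huge_pmd() and the other callers now just test a bool and remember to unlock on the true path. The caller shape, as a standalone sketch with the locking reduced to prints:

#include <stdbool.h>
#include <stdio.h>

static bool pmd_is_huge = true;           /* pretend state of *pmd */

static bool pmd_trans_huge_lock_model(void)
{
	printf("spin_lock(ptl)\n");
	if (pmd_is_huge)
		return true;                  /* return with the lock held */
	printf("spin_unlock(ptl)\n");
	return false;
}

int main(void)
{
	if (pmd_trans_huge_lock_model()) {
		printf("operate on the huge pmd\n");
		printf("spin_unlock(ptl)\n"); /* caller drops the lock on success */
	}
	return 0;
}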
@@ -1616,405 +1755,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, | |||
1616 | } | 1755 | } |
1617 | 1756 | ||
1618 | /* | 1757 | /* |
1619 | * Returns 1 if a given pmd maps a stable (not under splitting) thp. | 1758 | * Returns true if a given pmd maps a thp, false otherwise. |
1620 | * Returns -1 if it maps a thp under splitting. Returns 0 otherwise. | ||
1621 | * | 1759 | * |
1622 | * Note that if it returns 1, this routine returns without unlocking page | 1760 | * Note that if it returns true, this routine returns without unlocking page |
1623 | * table locks. So callers must unlock them. | 1761 | * table lock. So callers must unlock it. |
1624 | */ | 1762 | */ |
1625 | int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, | 1763 | bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, |
1626 | spinlock_t **ptl) | 1764 | spinlock_t **ptl) |
1627 | { | 1765 | { |
1628 | *ptl = pmd_lock(vma->vm_mm, pmd); | 1766 | *ptl = pmd_lock(vma->vm_mm, pmd); |
1629 | if (likely(pmd_trans_huge(*pmd))) { | 1767 | if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) |
1630 | if (unlikely(pmd_trans_splitting(*pmd))) { | 1768 | return true; |
1631 | spin_unlock(*ptl); | ||
1632 | wait_split_huge_page(vma->anon_vma, pmd); | ||
1633 | return -1; | ||
1634 | } else { | ||
1635 | /* Thp mapped by 'pmd' is stable, so we can | ||
1636 | * handle it as it is. */ | ||
1637 | return 1; | ||
1638 | } | ||
1639 | } | ||
1640 | spin_unlock(*ptl); | ||
1641 | return 0; | ||
1642 | } | ||
1643 | |||
1644 | /* | ||
1645 | * This function returns whether a given @page is mapped onto the @address | ||
1646 | * in the virtual space of @mm. | ||
1647 | * | ||
1648 | * When it's true, this function returns *pmd with holding the page table lock | ||
1649 | * and passing it back to the caller via @ptl. | ||
1650 | * If it's false, returns NULL without holding the page table lock. | ||
1651 | */ | ||
1652 | pmd_t *page_check_address_pmd(struct page *page, | ||
1653 | struct mm_struct *mm, | ||
1654 | unsigned long address, | ||
1655 | enum page_check_address_pmd_flag flag, | ||
1656 | spinlock_t **ptl) | ||
1657 | { | ||
1658 | pgd_t *pgd; | ||
1659 | pud_t *pud; | ||
1660 | pmd_t *pmd; | ||
1661 | |||
1662 | if (address & ~HPAGE_PMD_MASK) | ||
1663 | return NULL; | ||
1664 | |||
1665 | pgd = pgd_offset(mm, address); | ||
1666 | if (!pgd_present(*pgd)) | ||
1667 | return NULL; | ||
1668 | pud = pud_offset(pgd, address); | ||
1669 | if (!pud_present(*pud)) | ||
1670 | return NULL; | ||
1671 | pmd = pmd_offset(pud, address); | ||
1672 | |||
1673 | *ptl = pmd_lock(mm, pmd); | ||
1674 | if (!pmd_present(*pmd)) | ||
1675 | goto unlock; | ||
1676 | if (pmd_page(*pmd) != page) | ||
1677 | goto unlock; | ||
1678 | /* | ||
1679 | * split_vma() may create temporary aliased mappings. There is | ||
1680 | * no risk as long as all huge pmd are found and have their | ||
1681 | * splitting bit set before __split_huge_page_refcount | ||
1682 | * runs. Finding the same huge pmd more than once during the | ||
1683 | * same rmap walk is not a problem. | ||
1684 | */ | ||
1685 | if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && | ||
1686 | pmd_trans_splitting(*pmd)) | ||
1687 | goto unlock; | ||
1688 | if (pmd_trans_huge(*pmd)) { | ||
1689 | VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && | ||
1690 | !pmd_trans_splitting(*pmd)); | ||
1691 | return pmd; | ||
1692 | } | ||
1693 | unlock: | ||
1694 | spin_unlock(*ptl); | 1769 | spin_unlock(*ptl); |
1695 | return NULL; | 1770 | return false; |
1696 | } | ||
1697 | |||
1698 | static int __split_huge_page_splitting(struct page *page, | ||
1699 | struct vm_area_struct *vma, | ||
1700 | unsigned long address) | ||
1701 | { | ||
1702 | struct mm_struct *mm = vma->vm_mm; | ||
1703 | spinlock_t *ptl; | ||
1704 | pmd_t *pmd; | ||
1705 | int ret = 0; | ||
1706 | /* For mmu_notifiers */ | ||
1707 | const unsigned long mmun_start = address; | ||
1708 | const unsigned long mmun_end = address + HPAGE_PMD_SIZE; | ||
1709 | |||
1710 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
1711 | pmd = page_check_address_pmd(page, mm, address, | ||
1712 | PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); | ||
1713 | if (pmd) { | ||
1714 | /* | ||
1715 | * We can't temporarily set the pmd to null in order | ||
1716 | * to split it, the pmd must remain marked huge at all | ||
1717 | * times or the VM won't take the pmd_trans_huge paths | ||
1718 | * and it won't wait on the anon_vma->root->rwsem to | ||
1719 | * serialize against split_huge_page*. | ||
1720 | */ | ||
1721 | pmdp_splitting_flush(vma, address, pmd); | ||
1722 | |||
1723 | ret = 1; | ||
1724 | spin_unlock(ptl); | ||
1725 | } | ||
1726 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
1727 | |||
1728 | return ret; | ||
1729 | } | ||
1730 | |||
1731 | static void __split_huge_page_refcount(struct page *page, | ||
1732 | struct list_head *list) | ||
1733 | { | ||
1734 | int i; | ||
1735 | struct zone *zone = page_zone(page); | ||
1736 | struct lruvec *lruvec; | ||
1737 | int tail_count = 0; | ||
1738 | |||
1739 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | ||
1740 | spin_lock_irq(&zone->lru_lock); | ||
1741 | lruvec = mem_cgroup_page_lruvec(page, zone); | ||
1742 | |||
1743 | compound_lock(page); | ||
1744 | /* complete memcg works before add pages to LRU */ | ||
1745 | mem_cgroup_split_huge_fixup(page); | ||
1746 | |||
1747 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) { | ||
1748 | struct page *page_tail = page + i; | ||
1749 | |||
1750 | /* tail_page->_mapcount cannot change */ | ||
1751 | BUG_ON(page_mapcount(page_tail) < 0); | ||
1752 | tail_count += page_mapcount(page_tail); | ||
1753 | /* check for overflow */ | ||
1754 | BUG_ON(tail_count < 0); | ||
1755 | BUG_ON(atomic_read(&page_tail->_count) != 0); | ||
1756 | /* | ||
1757 | * tail_page->_count is zero and not changing from | ||
1758 | * under us. But get_page_unless_zero() may be running | ||
1759 | * from under us on the tail_page. If we used | ||
1760 | * atomic_set() below instead of atomic_add(), we | ||
1761 | * would then run atomic_set() concurrently with | ||
1762 | * get_page_unless_zero(), and atomic_set() is | ||
1763 | * implemented in C not using locked ops. spin_unlock | ||
1764 | * on x86 sometime uses locked ops because of PPro | ||
1765 | * errata 66, 92, so unless somebody can guarantee | ||
1766 | * atomic_set() here would be safe on all archs (and | ||
1767 | * not only on x86), it's safer to use atomic_add(). | ||
1768 | */ | ||
1769 | atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, | ||
1770 | &page_tail->_count); | ||
1771 | |||
1772 | /* after clearing PageTail the gup refcount can be released */ | ||
1773 | smp_mb__after_atomic(); | ||
1774 | |||
1775 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | ||
1776 | page_tail->flags |= (page->flags & | ||
1777 | ((1L << PG_referenced) | | ||
1778 | (1L << PG_swapbacked) | | ||
1779 | (1L << PG_mlocked) | | ||
1780 | (1L << PG_uptodate) | | ||
1781 | (1L << PG_active) | | ||
1782 | (1L << PG_unevictable))); | ||
1783 | page_tail->flags |= (1L << PG_dirty); | ||
1784 | |||
1785 | clear_compound_head(page_tail); | ||
1786 | |||
1787 | if (page_is_young(page)) | ||
1788 | set_page_young(page_tail); | ||
1789 | if (page_is_idle(page)) | ||
1790 | set_page_idle(page_tail); | ||
1791 | |||
1792 | /* | ||
1793 | * __split_huge_page_splitting() already set the | ||
1794 | * splitting bit in all pmd that could map this | ||
1795 | * hugepage, that will ensure no CPU can alter the | ||
1796 | * mapcount on the head page. The mapcount is only | ||
1797 | * accounted in the head page and it has to be | ||
1798 | * transferred to all tail pages in the below code. So | ||
1799 | * for this code to be safe, the split the mapcount | ||
1800 | * can't change. But that doesn't mean userland can't | ||
1801 | * keep changing and reading the page contents while | ||
1802 | * we transfer the mapcount, so the pmd splitting | ||
1803 | * status is achieved setting a reserved bit in the | ||
1804 | * pmd, not by clearing the present bit. | ||
1805 | */ | ||
1806 | page_tail->_mapcount = page->_mapcount; | ||
1807 | |||
1808 | BUG_ON(page_tail->mapping); | ||
1809 | page_tail->mapping = page->mapping; | ||
1810 | |||
1811 | page_tail->index = page->index + i; | ||
1812 | page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); | ||
1813 | |||
1814 | BUG_ON(!PageAnon(page_tail)); | ||
1815 | BUG_ON(!PageUptodate(page_tail)); | ||
1816 | BUG_ON(!PageDirty(page_tail)); | ||
1817 | BUG_ON(!PageSwapBacked(page_tail)); | ||
1818 | |||
1819 | lru_add_page_tail(page, page_tail, lruvec, list); | ||
1820 | } | ||
1821 | atomic_sub(tail_count, &page->_count); | ||
1822 | BUG_ON(atomic_read(&page->_count) <= 0); | ||
1823 | |||
1824 | __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1); | ||
1825 | |||
1826 | ClearPageCompound(page); | ||
1827 | compound_unlock(page); | ||
1828 | spin_unlock_irq(&zone->lru_lock); | ||
1829 | |||
1830 | for (i = 1; i < HPAGE_PMD_NR; i++) { | ||
1831 | struct page *page_tail = page + i; | ||
1832 | BUG_ON(page_count(page_tail) <= 0); | ||
1833 | /* | ||
1834 | * Tail pages may be freed if there wasn't any mapping | ||
1835 | * like if add_to_swap() is running on a lru page that | ||
1836 | * had its mapping zapped. And freeing these pages | ||
1837 | * requires taking the lru_lock so we do the put_page | ||
1838 | * of the tail pages after the split is complete. | ||
1839 | */ | ||
1840 | put_page(page_tail); | ||
1841 | } | ||
1842 | |||
1843 | /* | ||
1844 | * Only the head page (now become a regular page) is required | ||
1845 | * to be pinned by the caller. | ||
1846 | */ | ||
1847 | BUG_ON(page_count(page) <= 0); | ||
1848 | } | ||
1849 | |||
1850 | static int __split_huge_page_map(struct page *page, | ||
1851 | struct vm_area_struct *vma, | ||
1852 | unsigned long address) | ||
1853 | { | ||
1854 | struct mm_struct *mm = vma->vm_mm; | ||
1855 | spinlock_t *ptl; | ||
1856 | pmd_t *pmd, _pmd; | ||
1857 | int ret = 0, i; | ||
1858 | pgtable_t pgtable; | ||
1859 | unsigned long haddr; | ||
1860 | |||
1861 | pmd = page_check_address_pmd(page, mm, address, | ||
1862 | PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); | ||
1863 | if (pmd) { | ||
1864 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | ||
1865 | pmd_populate(mm, &_pmd, pgtable); | ||
1866 | if (pmd_write(*pmd)) | ||
1867 | BUG_ON(page_mapcount(page) != 1); | ||
1868 | |||
1869 | haddr = address; | ||
1870 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
1871 | pte_t *pte, entry; | ||
1872 | BUG_ON(PageCompound(page+i)); | ||
1873 | /* | ||
1874 | * Note that NUMA hinting access restrictions are not | ||
1875 | * transferred to avoid any possibility of altering | ||
1876 | * permissions across VMAs. | ||
1877 | */ | ||
1878 | entry = mk_pte(page + i, vma->vm_page_prot); | ||
1879 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1880 | if (!pmd_write(*pmd)) | ||
1881 | entry = pte_wrprotect(entry); | ||
1882 | if (!pmd_young(*pmd)) | ||
1883 | entry = pte_mkold(entry); | ||
1884 | pte = pte_offset_map(&_pmd, haddr); | ||
1885 | BUG_ON(!pte_none(*pte)); | ||
1886 | set_pte_at(mm, haddr, pte, entry); | ||
1887 | pte_unmap(pte); | ||
1888 | } | ||
1889 | |||
1890 | smp_wmb(); /* make pte visible before pmd */ | ||
1891 | /* | ||
1892 | * Up to this point the pmd is present and huge and | ||
1893 | * userland has the whole access to the hugepage | ||
1894 | * during the split (which happens in place). If we | ||
1895 | * overwrite the pmd with the not-huge version | ||
1896 | * pointing to the pte here (which of course we could | ||
1897 | * if all CPUs were bug free), userland could trigger | ||
1898 | * a small page size TLB miss on the small sized TLB | ||
1899 | * while the hugepage TLB entry is still established | ||
1900 | * in the huge TLB. Some CPU doesn't like that. See | ||
1901 | * http://support.amd.com/us/Processor_TechDocs/41322.pdf, | ||
1902 | * Erratum 383 on page 93. Intel should be safe but is | ||
1903 | * also warns that it's only safe if the permission | ||
1904 | * and cache attributes of the two entries loaded in | ||
1905 | * the two TLB is identical (which should be the case | ||
1906 | * here). But it is generally safer to never allow | ||
1907 | * small and huge TLB entries for the same virtual | ||
1908 | * address to be loaded simultaneously. So instead of | ||
1909 | * doing "pmd_populate(); flush_pmd_tlb_range();" we first | ||
1910 | * mark the current pmd notpresent (atomically because | ||
1911 | * here the pmd_trans_huge and pmd_trans_splitting | ||
1912 | * must remain set at all times on the pmd until the | ||
1913 | * split is complete for this pmd), then we flush the | ||
1914 | * SMP TLB and finally we write the non-huge version | ||
1915 | * of the pmd entry with pmd_populate. | ||
1916 | */ | ||
1917 | pmdp_invalidate(vma, address, pmd); | ||
1918 | pmd_populate(mm, pmd, pgtable); | ||
1919 | ret = 1; | ||
1920 | spin_unlock(ptl); | ||
1921 | } | ||
1922 | |||
1923 | return ret; | ||
1924 | } | ||
1925 | |||
1926 | /* must be called with anon_vma->root->rwsem held */ | ||
1927 | static void __split_huge_page(struct page *page, | ||
1928 | struct anon_vma *anon_vma, | ||
1929 | struct list_head *list) | ||
1930 | { | ||
1931 | int mapcount, mapcount2; | ||
1932 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
1933 | struct anon_vma_chain *avc; | ||
1934 | |||
1935 | BUG_ON(!PageHead(page)); | ||
1936 | BUG_ON(PageTail(page)); | ||
1937 | |||
1938 | mapcount = 0; | ||
1939 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1940 | struct vm_area_struct *vma = avc->vma; | ||
1941 | unsigned long addr = vma_address(page, vma); | ||
1942 | BUG_ON(is_vma_temporary_stack(vma)); | ||
1943 | mapcount += __split_huge_page_splitting(page, vma, addr); | ||
1944 | } | ||
1945 | /* | ||
1946 | * It is critical that new vmas are added to the tail of the | ||
1947 | * anon_vma list. This guarantes that if copy_huge_pmd() runs | ||
1948 | * and establishes a child pmd before | ||
1949 | * __split_huge_page_splitting() freezes the parent pmd (so if | ||
1950 | * we fail to prevent copy_huge_pmd() from running until the | ||
1951 | * whole __split_huge_page() is complete), we will still see | ||
1952 | * the newly established pmd of the child later during the | ||
1953 | * walk, to be able to set it as pmd_trans_splitting too. | ||
1954 | */ | ||
1955 | if (mapcount != page_mapcount(page)) { | ||
1956 | pr_err("mapcount %d page_mapcount %d\n", | ||
1957 | mapcount, page_mapcount(page)); | ||
1958 | BUG(); | ||
1959 | } | ||
1960 | |||
1961 | __split_huge_page_refcount(page, list); | ||
1962 | |||
1963 | mapcount2 = 0; | ||
1964 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) { | ||
1965 | struct vm_area_struct *vma = avc->vma; | ||
1966 | unsigned long addr = vma_address(page, vma); | ||
1967 | BUG_ON(is_vma_temporary_stack(vma)); | ||
1968 | mapcount2 += __split_huge_page_map(page, vma, addr); | ||
1969 | } | ||
1970 | if (mapcount != mapcount2) { | ||
1971 | pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", | ||
1972 | mapcount, mapcount2, page_mapcount(page)); | ||
1973 | BUG(); | ||
1974 | } | ||
1975 | } | ||
1976 | |||
1977 | /* | ||
1978 | * Split a hugepage into normal pages. This doesn't change the position of head | ||
1979 | * page. If @list is null, tail pages will be added to LRU list, otherwise, to | ||
1980 | * @list. Both head page and tail pages will inherit mapping, flags, and so on | ||
1981 | * from the hugepage. | ||
1982 | * Return 0 if the hugepage is split successfully otherwise return 1. | ||
1983 | */ | ||
1984 | int split_huge_page_to_list(struct page *page, struct list_head *list) | ||
1985 | { | ||
1986 | struct anon_vma *anon_vma; | ||
1987 | int ret = 1; | ||
1988 | |||
1989 | BUG_ON(is_huge_zero_page(page)); | ||
1990 | BUG_ON(!PageAnon(page)); | ||
1991 | |||
1992 | /* | ||
1993 | * The caller does not necessarily hold an mmap_sem that would prevent | ||
1994 | * the anon_vma disappearing so we first we take a reference to it | ||
1995 | * and then lock the anon_vma for write. This is similar to | ||
1996 | * page_lock_anon_vma_read except the write lock is taken to serialise | ||
1997 | * against parallel split or collapse operations. | ||
1998 | */ | ||
1999 | anon_vma = page_get_anon_vma(page); | ||
2000 | if (!anon_vma) | ||
2001 | goto out; | ||
2002 | anon_vma_lock_write(anon_vma); | ||
2003 | |||
2004 | ret = 0; | ||
2005 | if (!PageCompound(page)) | ||
2006 | goto out_unlock; | ||
2007 | |||
2008 | BUG_ON(!PageSwapBacked(page)); | ||
2009 | __split_huge_page(page, anon_vma, list); | ||
2010 | count_vm_event(THP_SPLIT); | ||
2011 | |||
2012 | BUG_ON(PageCompound(page)); | ||
2013 | out_unlock: | ||
2014 | anon_vma_unlock_write(anon_vma); | ||
2015 | put_anon_vma(anon_vma); | ||
2016 | out: | ||
2017 | return ret; | ||
2018 | } | 1771 | } |
2019 | 1772 | ||
2020 | #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) | 1773 | #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) |
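The tri-state int contract of __pmd_trans_huge_lock() (1 = stable THP, -1 = under splitting, 0 = not huge) collapses to a plain bool now that the splitting state no longer exists: the page table lock is held if and only if the call returns true, and the caller unlocks on that path only, exactly as move_huge_pmd() and change_huge_pmd() above do. Below is a minimal userspace sketch of the same "return true with the lock held" contract, using pthreads; `fake_pmd` and `trylock_if_huge` are hypothetical names for illustration, not kernel API.

```c
/* Userspace sketch of the "return true with lock held" contract that the
 * bool __pmd_trans_huge_lock() follows; all names here are hypothetical. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_pmd {
	pthread_mutex_t lock;
	bool is_huge;
};

static bool trylock_if_huge(struct fake_pmd *pmd)
{
	pthread_mutex_lock(&pmd->lock);
	if (pmd->is_huge)
		return true;		/* lock stays held, caller must unlock */
	pthread_mutex_unlock(&pmd->lock);
	return false;			/* lock already dropped */
}

int main(void)
{
	struct fake_pmd pmd = { PTHREAD_MUTEX_INITIALIZER, true };

	if (trylock_if_huge(&pmd)) {
		printf("huge: work under the lock\n");
		pthread_mutex_unlock(&pmd.lock);	/* unlock-on-true path */
	} else {
		printf("not huge: nothing to unlock\n");
	}
	return 0;
}
```

Compile with `-pthread`; the unlock-on-true pattern is the whole point of the bool conversion.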
@@ -2371,7 +2124,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, | |||
2371 | * superfluous. | 2124 | * superfluous. |
2372 | */ | 2125 | */ |
2373 | pte_clear(vma->vm_mm, address, _pte); | 2126 | pte_clear(vma->vm_mm, address, _pte); |
2374 | page_remove_rmap(src_page); | 2127 | page_remove_rmap(src_page, false); |
2375 | spin_unlock(ptl); | 2128 | spin_unlock(ptl); |
2376 | free_page_and_swap_cache(src_page); | 2129 | free_page_and_swap_cache(src_page); |
2377 | } | 2130 | } |
@@ -2481,6 +2234,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm, | |||
2481 | return NULL; | 2234 | return NULL; |
2482 | } | 2235 | } |
2483 | 2236 | ||
2237 | prep_transhuge_page(*hpage); | ||
2484 | count_vm_event(THP_COLLAPSE_ALLOC); | 2238 | count_vm_event(THP_COLLAPSE_ALLOC); |
2485 | return *hpage; | 2239 | return *hpage; |
2486 | } | 2240 | } |
@@ -2492,8 +2246,12 @@ static int khugepaged_find_target_node(void) | |||
2492 | 2246 | ||
2493 | static inline struct page *alloc_hugepage(int defrag) | 2247 | static inline struct page *alloc_hugepage(int defrag) |
2494 | { | 2248 | { |
2495 | return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), | 2249 | struct page *page; |
2496 | HPAGE_PMD_ORDER); | 2250 | |
2251 | page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER); | ||
2252 | if (page) | ||
2253 | prep_transhuge_page(page); | ||
2254 | return page; | ||
2497 | } | 2255 | } |
2498 | 2256 | ||
2499 | static struct page *khugepaged_alloc_hugepage(bool *wait) | 2257 | static struct page *khugepaged_alloc_hugepage(bool *wait) |
@@ -2543,7 +2301,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma) | |||
2543 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || | 2301 | if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || |
2544 | (vma->vm_flags & VM_NOHUGEPAGE)) | 2302 | (vma->vm_flags & VM_NOHUGEPAGE)) |
2545 | return false; | 2303 | return false; |
2546 | |||
2547 | if (!vma->anon_vma || vma->vm_ops) | 2304 | if (!vma->anon_vma || vma->vm_ops) |
2548 | return false; | 2305 | return false; |
2549 | if (is_vma_temporary_stack(vma)) | 2306 | if (is_vma_temporary_stack(vma)) |
@@ -2583,7 +2340,7 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2583 | goto out_nolock; | 2340 | goto out_nolock; |
2584 | } | 2341 | } |
2585 | 2342 | ||
2586 | if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg))) { | 2343 | if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { |
2587 | result = SCAN_CGROUP_CHARGE_FAIL; | 2344 | result = SCAN_CGROUP_CHARGE_FAIL; |
2588 | goto out_nolock; | 2345 | goto out_nolock; |
2589 | } | 2346 | } |
@@ -2682,8 +2439,8 @@ static void collapse_huge_page(struct mm_struct *mm, | |||
2682 | 2439 | ||
2683 | spin_lock(pmd_ptl); | 2440 | spin_lock(pmd_ptl); |
2684 | BUG_ON(!pmd_none(*pmd)); | 2441 | BUG_ON(!pmd_none(*pmd)); |
2685 | page_add_new_anon_rmap(new_page, vma, address); | 2442 | page_add_new_anon_rmap(new_page, vma, address, true); |
2686 | mem_cgroup_commit_charge(new_page, memcg, false); | 2443 | mem_cgroup_commit_charge(new_page, memcg, false, true); |
2687 | lru_cache_add_active_or_unevictable(new_page, vma); | 2444 | lru_cache_add_active_or_unevictable(new_page, vma); |
2688 | pgtable_trans_huge_deposit(mm, pmd, pgtable); | 2445 | pgtable_trans_huge_deposit(mm, pmd, pgtable); |
2689 | set_pmd_at(mm, address, pmd, _pmd); | 2446 | set_pmd_at(mm, address, pmd, _pmd); |
@@ -2703,7 +2460,7 @@ out_nolock: | |||
2703 | trace_mm_collapse_huge_page(mm, isolated, result); | 2460 | trace_mm_collapse_huge_page(mm, isolated, result); |
2704 | return; | 2461 | return; |
2705 | out: | 2462 | out: |
2706 | mem_cgroup_cancel_charge(new_page, memcg); | 2463 | mem_cgroup_cancel_charge(new_page, memcg, true); |
2707 | goto out_up_write; | 2464 | goto out_up_write; |
2708 | } | 2465 | } |
2709 | 2466 | ||
@@ -2755,6 +2512,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2755 | result = SCAN_PAGE_NULL; | 2512 | result = SCAN_PAGE_NULL; |
2756 | goto out_unmap; | 2513 | goto out_unmap; |
2757 | } | 2514 | } |
2515 | |||
2516 | /* TODO: teach khugepaged to collapse THP mapped with pte */ | ||
2517 | if (PageCompound(page)) { | ||
2518 | result = SCAN_PAGE_COMPOUND; | ||
2519 | goto out_unmap; | ||
2520 | } | ||
2521 | |||
2758 | /* | 2522 | /* |
2759 | * Record which node the original page is from and save this | 2523 | * Record which node the original page is from and save this |
2760 | * information to khugepaged_node_load[]. | 2524 | * information to khugepaged_node_load[]. |
@@ -2767,7 +2531,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, | |||
2767 | goto out_unmap; | 2531 | goto out_unmap; |
2768 | } | 2532 | } |
2769 | khugepaged_node_load[node]++; | 2533 | khugepaged_node_load[node]++; |
2770 | VM_BUG_ON_PAGE(PageCompound(page), page); | ||
2771 | if (!PageLRU(page)) { | 2534 | if (!PageLRU(page)) { |
2772 | result = SCAN_SCAN_ABORT; | 2535 | result = SCAN_SCAN_ABORT; |
2773 | goto out_unmap; | 2536 | goto out_unmap; |
@@ -3040,8 +2803,8 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | |||
3040 | pmd_t _pmd; | 2803 | pmd_t _pmd; |
3041 | int i; | 2804 | int i; |
3042 | 2805 | ||
3043 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | ||
3044 | /* leave pmd empty until pte is filled */ | 2806 | /* leave pmd empty until pte is filled */ |
2807 | pmdp_huge_clear_flush_notify(vma, haddr, pmd); | ||
3045 | 2808 | ||
3046 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); | 2809 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
3047 | pmd_populate(mm, &_pmd, pgtable); | 2810 | pmd_populate(mm, &_pmd, pgtable); |
@@ -3060,66 +2823,153 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, | |||
3060 | put_huge_zero_page(); | 2823 | put_huge_zero_page(); |
3061 | } | 2824 | } |
3062 | 2825 | ||
3063 | void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | 2826 | static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, |
3064 | pmd_t *pmd) | 2827 | unsigned long haddr, bool freeze) |
3065 | { | 2828 | { |
3066 | spinlock_t *ptl; | ||
3067 | struct page *page = NULL; | ||
3068 | struct mm_struct *mm = vma->vm_mm; | 2829 | struct mm_struct *mm = vma->vm_mm; |
3069 | unsigned long haddr = address & HPAGE_PMD_MASK; | 2830 | struct page *page; |
3070 | unsigned long mmun_start; /* For mmu_notifiers */ | 2831 | pgtable_t pgtable; |
3071 | unsigned long mmun_end; /* For mmu_notifiers */ | 2832 | pmd_t _pmd; |
2833 | bool young, write, dirty; | ||
2834 | int i; | ||
3072 | 2835 | ||
3073 | BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE); | 2836 | VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); |
2837 | VM_BUG_ON_VMA(vma->vm_start > haddr, vma); | ||
2838 | VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); | ||
2839 | VM_BUG_ON(!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)); | ||
2840 | |||
2841 | count_vm_event(THP_SPLIT_PMD); | ||
3074 | 2842 | ||
3075 | mmun_start = haddr; | ||
3076 | mmun_end = haddr + HPAGE_PMD_SIZE; | ||
3077 | again: | ||
3078 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
3079 | ptl = pmd_lock(mm, pmd); | ||
3080 | if (unlikely(!pmd_trans_huge(*pmd))) | ||
3081 | goto unlock; | ||
3082 | if (vma_is_dax(vma)) { | 2843 | if (vma_is_dax(vma)) { |
3083 | pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); | 2844 | pmd_t _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); |
3084 | if (is_huge_zero_pmd(_pmd)) | 2845 | if (is_huge_zero_pmd(_pmd)) |
3085 | put_huge_zero_page(); | 2846 | put_huge_zero_page(); |
2847 | return; | ||
3086 | } else if (is_huge_zero_pmd(*pmd)) { | 2848 | } else if (is_huge_zero_pmd(*pmd)) { |
3087 | __split_huge_zero_page_pmd(vma, haddr, pmd); | 2849 | return __split_huge_zero_page_pmd(vma, haddr, pmd); |
3088 | } else { | ||
3089 | page = pmd_page(*pmd); | ||
3090 | VM_BUG_ON_PAGE(!page_count(page), page); | ||
3091 | get_page(page); | ||
3092 | } | 2850 | } |
3093 | unlock: | ||
3094 | spin_unlock(ptl); | ||
3095 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
3096 | 2851 | ||
3097 | if (!page) | 2852 | page = pmd_page(*pmd); |
3098 | return; | 2853 | VM_BUG_ON_PAGE(!page_count(page), page); |
2854 | atomic_add(HPAGE_PMD_NR - 1, &page->_count); | ||
2855 | write = pmd_write(*pmd); | ||
2856 | young = pmd_young(*pmd); | ||
2857 | dirty = pmd_dirty(*pmd); | ||
3099 | 2858 | ||
3100 | split_huge_page(page); | 2859 | pgtable = pgtable_trans_huge_withdraw(mm, pmd); |
3101 | put_page(page); | 2860 | pmd_populate(mm, &_pmd, pgtable); |
3102 | 2861 | ||
2862 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
2863 | pte_t entry, *pte; | ||
2864 | /* | ||
2865 | * Note that NUMA hinting access restrictions are not | ||
2866 | * transferred to avoid any possibility of altering | ||
2867 | * permissions across VMAs. | ||
2868 | */ | ||
2869 | if (freeze) { | ||
2870 | swp_entry_t swp_entry; | ||
2871 | swp_entry = make_migration_entry(page + i, write); | ||
2872 | entry = swp_entry_to_pte(swp_entry); | ||
2873 | } else { | ||
2874 | entry = mk_pte(page + i, vma->vm_page_prot); | ||
2875 | entry = maybe_mkwrite(entry, vma); | ||
2876 | if (!write) | ||
2877 | entry = pte_wrprotect(entry); | ||
2878 | if (!young) | ||
2879 | entry = pte_mkold(entry); | ||
2880 | } | ||
2881 | if (dirty) | ||
2882 | SetPageDirty(page + i); | ||
2883 | pte = pte_offset_map(&_pmd, haddr); | ||
2884 | BUG_ON(!pte_none(*pte)); | ||
2885 | set_pte_at(mm, haddr, pte, entry); | ||
2886 | atomic_inc(&page[i]._mapcount); | ||
2887 | pte_unmap(pte); | ||
2888 | } | ||
2889 | |||
2890 | /* | ||
2891 | * Set PG_double_map before dropping compound_mapcount to avoid | ||
2892 | * false-negative page_mapped(). | ||
2893 | */ | ||
2894 | if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) { | ||
2895 | for (i = 0; i < HPAGE_PMD_NR; i++) | ||
2896 | atomic_inc(&page[i]._mapcount); | ||
2897 | } | ||
2898 | |||
2899 | if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { | ||
2900 | /* Last compound_mapcount is gone. */ | ||
2901 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
2902 | if (TestClearPageDoubleMap(page)) { | ||
2903 | /* No need in mapcount reference anymore */ | ||
2904 | for (i = 0; i < HPAGE_PMD_NR; i++) | ||
2905 | atomic_dec(&page[i]._mapcount); | ||
2906 | } | ||
2907 | } | ||
2908 | |||
2909 | smp_wmb(); /* make pte visible before pmd */ | ||
3103 | /* | 2910 | /* |
3104 | * We don't always have down_write of mmap_sem here: a racing | 2911 | * Up to this point the pmd is present and huge and userland has the |
3105 | * do_huge_pmd_wp_page() might have copied-on-write to another | 2912 | * whole access to the hugepage during the split (which happens in |
3106 | * huge page before our split_huge_page() got the anon_vma lock. | 2913 | * place). If we overwrite the pmd with the not-huge version pointing |
2914 | * to the pte here (which of course we could if all CPUs were bug | ||
2915 | * free), userland could trigger a small page size TLB miss on the | ||
2916 | * small sized TLB while the hugepage TLB entry is still established in | ||
2917 | * the huge TLB. Some CPU doesn't like that. | ||
2918 | * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum | ||
2919 | * 383 on page 93. Intel should be safe but it also warns that it's | ||
2920 | * only safe if the permission and cache attributes of the two entries | ||
2921 | * loaded in the two TLB is identical (which should be the case here). | ||
2922 | * But it is generally safer to never allow small and huge TLB entries | ||
2923 | * for the same virtual address to be loaded simultaneously. So instead | ||
2924 | * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the | ||
2925 | * current pmd notpresent (atomically because here the pmd_trans_huge | ||
2926 | * and pmd_trans_splitting must remain set at all times on the pmd | ||
2927 | * until the split is complete for this pmd), then we flush the SMP TLB | ||
2928 | * and finally we write the non-huge version of the pmd entry with | ||
2929 | * pmd_populate. | ||
3107 | */ | 2930 | */ |
3108 | if (unlikely(pmd_trans_huge(*pmd))) | 2931 | pmdp_invalidate(vma, haddr, pmd); |
3109 | goto again; | 2932 | pmd_populate(mm, pmd, pgtable); |
2933 | |||
2934 | if (freeze) { | ||
2935 | for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { | ||
2936 | page_remove_rmap(page + i, false); | ||
2937 | put_page(page + i); | ||
2938 | } | ||
2939 | } | ||
3110 | } | 2940 | } |
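__split_huge_pmd_locked() adjusts the counters in three steps: the head page's _count grows by HPAGE_PMD_NR - 1 so that each new PTE is backed by its own reference, every subpage's _mapcount gains one for its new PTE mapping, and the compound mapcount is dropped once the PMD mapping is gone (PG_double_map compensates when the page remains PMD-mapped elsewhere). The toy model below covers only the non-freeze, non-double-map case; plain ints stand in for the atomics and HPAGE_PMD_NR = 512 is the x86-64 value, both assumptions of this sketch.

```c
/* Arithmetic model of the counter updates in __split_huge_pmd_locked();
 * plain ints stand in for atomics, HPAGE_PMD_NR = 512 assumed (x86-64). */
#include <stdio.h>

#define HPAGE_PMD_NR 512	/* 2MB PMD / 4KB base pages */

int main(void)
{
	int head_count = 1;		/* illustrative baseline refcount */
	int compound_mapcount = 1;	/* the single PMD mapping */
	int subpage_mapcount[HPAGE_PMD_NR] = { 0 };
	int i;

	/* Step 1: one extra head reference per new PTE. */
	head_count += HPAGE_PMD_NR - 1;

	/* Step 2: each subpage gains a PTE mapping. */
	for (i = 0; i < HPAGE_PMD_NR; i++)
		subpage_mapcount[i]++;

	/* Step 3: the PMD mapping itself is gone. */
	compound_mapcount--;

	printf("head refcount %d, compound mapcount %d, subpage[0] mapcount %d\n",
	       head_count, compound_mapcount, subpage_mapcount[0]);
	return 0;
}
```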
3111 | 2941 | ||
3112 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | 2942 | void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, |
3113 | pmd_t *pmd) | 2943 | unsigned long address) |
3114 | { | 2944 | { |
3115 | struct vm_area_struct *vma; | 2945 | spinlock_t *ptl; |
2946 | struct mm_struct *mm = vma->vm_mm; | ||
2947 | struct page *page = NULL; | ||
2948 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
3116 | 2949 | ||
3117 | vma = find_vma(mm, address); | 2950 | mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE); |
3118 | BUG_ON(vma == NULL); | 2951 | ptl = pmd_lock(mm, pmd); |
3119 | split_huge_page_pmd(vma, address, pmd); | 2952 | if (pmd_trans_huge(*pmd)) { |
2953 | page = pmd_page(*pmd); | ||
2954 | if (PageMlocked(page)) | ||
2955 | get_page(page); | ||
2956 | else | ||
2957 | page = NULL; | ||
2958 | } else if (!pmd_devmap(*pmd)) | ||
2959 | goto out; | ||
2960 | __split_huge_pmd_locked(vma, pmd, haddr, false); | ||
2961 | out: | ||
2962 | spin_unlock(ptl); | ||
2963 | mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); | ||
2964 | if (page) { | ||
2965 | lock_page(page); | ||
2966 | munlock_vma_page(page); | ||
2967 | unlock_page(page); | ||
2968 | put_page(page); | ||
2969 | } | ||
3120 | } | 2970 | } |
3121 | 2971 | ||
3122 | static void split_huge_page_address(struct mm_struct *mm, | 2972 | static void split_huge_pmd_address(struct vm_area_struct *vma, |
3123 | unsigned long address) | 2973 | unsigned long address) |
3124 | { | 2974 | { |
3125 | pgd_t *pgd; | 2975 | pgd_t *pgd; |
@@ -3128,7 +2978,7 @@ static void split_huge_page_address(struct mm_struct *mm, | |||
3128 | 2978 | ||
3129 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); | 2979 | VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); |
3130 | 2980 | ||
3131 | pgd = pgd_offset(mm, address); | 2981 | pgd = pgd_offset(vma->vm_mm, address); |
3132 | if (!pgd_present(*pgd)) | 2982 | if (!pgd_present(*pgd)) |
3133 | return; | 2983 | return; |
3134 | 2984 | ||
@@ -3137,13 +2987,13 @@ static void split_huge_page_address(struct mm_struct *mm, | |||
3137 | return; | 2987 | return; |
3138 | 2988 | ||
3139 | pmd = pmd_offset(pud, address); | 2989 | pmd = pmd_offset(pud, address); |
3140 | if (!pmd_present(*pmd)) | 2990 | if (!pmd_present(*pmd) || (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd))) |
3141 | return; | 2991 | return; |
3142 | /* | 2992 | /* |
3143 | * Caller holds the mmap_sem write mode, so a huge pmd cannot | 2993 | * Caller holds the mmap_sem write mode, so a huge pmd cannot |
3144 | * materialize from under us. | 2994 | * materialize from under us. |
3145 | */ | 2995 | */ |
3146 | split_huge_page_pmd_mm(mm, address, pmd); | 2996 | split_huge_pmd(vma, pmd, address); |
3147 | } | 2997 | } |
3148 | 2998 | ||
3149 | void vma_adjust_trans_huge(struct vm_area_struct *vma, | 2999 | void vma_adjust_trans_huge(struct vm_area_struct *vma, |
@@ -3159,7 +3009,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
3159 | if (start & ~HPAGE_PMD_MASK && | 3009 | if (start & ~HPAGE_PMD_MASK && |
3160 | (start & HPAGE_PMD_MASK) >= vma->vm_start && | 3010 | (start & HPAGE_PMD_MASK) >= vma->vm_start && |
3161 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | 3011 | (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) |
3162 | split_huge_page_address(vma->vm_mm, start); | 3012 | split_huge_pmd_address(vma, start); |
3163 | 3013 | ||
3164 | /* | 3014 | /* |
3165 | * If the new end address isn't hpage aligned and it could | 3015 | * If the new end address isn't hpage aligned and it could |
@@ -3169,7 +3019,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
3169 | if (end & ~HPAGE_PMD_MASK && | 3019 | if (end & ~HPAGE_PMD_MASK && |
3170 | (end & HPAGE_PMD_MASK) >= vma->vm_start && | 3020 | (end & HPAGE_PMD_MASK) >= vma->vm_start && |
3171 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) | 3021 | (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) |
3172 | split_huge_page_address(vma->vm_mm, end); | 3022 | split_huge_pmd_address(vma, end); |
3173 | 3023 | ||
3174 | /* | 3024 | /* |
3175 | * If we're also updating the vma->vm_next->vm_start, if the new | 3025 | * If we're also updating the vma->vm_next->vm_start, if the new |
@@ -3183,6 +3033,540 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, | |||
3183 | if (nstart & ~HPAGE_PMD_MASK && | 3033 | if (nstart & ~HPAGE_PMD_MASK && |
3184 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && | 3034 | (nstart & HPAGE_PMD_MASK) >= next->vm_start && |
3185 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) | 3035 | (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) |
3186 | split_huge_page_address(next->vm_mm, nstart); | 3036 | split_huge_pmd_address(next, nstart); |
3037 | } | ||
3038 | } | ||
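All three call sites in vma_adjust_trans_huge() apply the same test before calling split_huge_pmd_address(): the address is not already PMD-aligned, and the PMD-sized region containing it lies entirely inside the VMA. A runnable check of that arithmetic follows; the 2MB/4KB constants and the sample addresses are assumptions for illustration (x86-64 defaults).

```c
/* Quick check of the alignment tests used by vma_adjust_trans_huge(),
 * assuming x86-64 constants: 4KB base pages, 2MB PMD-sized huge pages. */
#include <stdio.h>

#define HPAGE_PMD_SHIFT	21
#define HPAGE_PMD_SIZE	(1UL << HPAGE_PMD_SHIFT)
#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

static int needs_pmd_split(unsigned long addr, unsigned long vm_start,
			   unsigned long vm_end)
{
	return (addr & ~HPAGE_PMD_MASK) &&			    /* not PMD-aligned */
	       (addr & HPAGE_PMD_MASK) >= vm_start &&		    /* region starts in VMA */
	       (addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vm_end;  /* and ends in it */
}

int main(void)
{
	unsigned long vm_start = 0x400000, vm_end = 0xb00000;

	printf("0x600000 -> %d (PMD-aligned, nothing to split)\n",
	       needs_pmd_split(0x600000, vm_start, vm_end));
	printf("0x523000 -> %d (cuts a huge page wholly inside the VMA)\n",
	       needs_pmd_split(0x523000, vm_start, vm_end));
	printf("0xa01000 -> %d (enclosing 2MB region overruns the VMA)\n",
	       needs_pmd_split(0xa01000, vm_start, vm_end));
	return 0;
}
```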
3039 | |||
3040 | static void freeze_page_vma(struct vm_area_struct *vma, struct page *page, | ||
3041 | unsigned long address) | ||
3042 | { | ||
3043 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
3044 | spinlock_t *ptl; | ||
3045 | pgd_t *pgd; | ||
3046 | pud_t *pud; | ||
3047 | pmd_t *pmd; | ||
3048 | pte_t *pte; | ||
3049 | int i, nr = HPAGE_PMD_NR; | ||
3050 | |||
3051 | /* Skip pages which don't belong to the VMA */ | ||
3052 | if (address < vma->vm_start) { | ||
3053 | int off = (vma->vm_start - address) >> PAGE_SHIFT; | ||
3054 | page += off; | ||
3055 | nr -= off; | ||
3056 | address = vma->vm_start; | ||
3057 | } | ||
3058 | |||
3059 | pgd = pgd_offset(vma->vm_mm, address); | ||
3060 | if (!pgd_present(*pgd)) | ||
3061 | return; | ||
3062 | pud = pud_offset(pgd, address); | ||
3063 | if (!pud_present(*pud)) | ||
3064 | return; | ||
3065 | pmd = pmd_offset(pud, address); | ||
3066 | ptl = pmd_lock(vma->vm_mm, pmd); | ||
3067 | if (!pmd_present(*pmd)) { | ||
3068 | spin_unlock(ptl); | ||
3069 | return; | ||
3070 | } | ||
3071 | if (pmd_trans_huge(*pmd)) { | ||
3072 | if (page == pmd_page(*pmd)) | ||
3073 | __split_huge_pmd_locked(vma, pmd, haddr, true); | ||
3074 | spin_unlock(ptl); | ||
3075 | return; | ||
3076 | } | ||
3077 | spin_unlock(ptl); | ||
3078 | |||
3079 | pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); | ||
3080 | for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { | ||
3081 | pte_t entry, swp_pte; | ||
3082 | swp_entry_t swp_entry; | ||
3083 | |||
3084 | /* | ||
3085 | * We've just crossed a page table boundary: need to map the next one. | ||
3086 | * It can happen if the THP was mremapped to a non-PMD-aligned address. | ||
3087 | */ | ||
3088 | if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { | ||
3089 | pte_unmap_unlock(pte - 1, ptl); | ||
3090 | pmd = mm_find_pmd(vma->vm_mm, address); | ||
3091 | if (!pmd) | ||
3092 | return; | ||
3093 | pte = pte_offset_map_lock(vma->vm_mm, pmd, | ||
3094 | address, &ptl); | ||
3095 | } | ||
3096 | |||
3097 | if (!pte_present(*pte)) | ||
3098 | continue; | ||
3099 | if (page_to_pfn(page) != pte_pfn(*pte)) | ||
3100 | continue; | ||
3101 | flush_cache_page(vma, address, page_to_pfn(page)); | ||
3102 | entry = ptep_clear_flush(vma, address, pte); | ||
3103 | if (pte_dirty(entry)) | ||
3104 | SetPageDirty(page); | ||
3105 | swp_entry = make_migration_entry(page, pte_write(entry)); | ||
3106 | swp_pte = swp_entry_to_pte(swp_entry); | ||
3107 | if (pte_soft_dirty(entry)) | ||
3108 | swp_pte = pte_swp_mksoft_dirty(swp_pte); | ||
3109 | set_pte_at(vma->vm_mm, address, pte, swp_pte); | ||
3110 | page_remove_rmap(page, false); | ||
3111 | put_page(page); | ||
3112 | } | ||
3113 | pte_unmap_unlock(pte - 1, ptl); | ||
3114 | } | ||
3115 | |||
3116 | static void freeze_page(struct anon_vma *anon_vma, struct page *page) | ||
3117 | { | ||
3118 | struct anon_vma_chain *avc; | ||
3119 | pgoff_t pgoff = page_to_pgoff(page); | ||
3120 | |||
3121 | VM_BUG_ON_PAGE(!PageHead(page), page); | ||
3122 | |||
3123 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, | ||
3124 | pgoff + HPAGE_PMD_NR - 1) { | ||
3125 | unsigned long address = __vma_address(page, avc->vma); | ||
3126 | |||
3127 | mmu_notifier_invalidate_range_start(avc->vma->vm_mm, | ||
3128 | address, address + HPAGE_PMD_SIZE); | ||
3129 | freeze_page_vma(avc->vma, page, address); | ||
3130 | mmu_notifier_invalidate_range_end(avc->vma->vm_mm, | ||
3131 | address, address + HPAGE_PMD_SIZE); | ||
3132 | } | ||
3133 | } | ||
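The "skip pages outside the VMA" adjustment at the top of freeze_page_vma() (and its unfreeze twin) matters when a THP was mremapped so that its head sits below vma->vm_start: the walk then starts at the first subpage that actually belongs to the VMA. A quick model of that offset arithmetic, assuming 4KB pages and HPAGE_PMD_NR = 512; the sample addresses are made up.

```c
/* Model of the "skip pages which don't belong to the VMA" adjustment in
 * freeze_page_vma()/unfreeze_page_vma(); 4KB pages and HPAGE_PMD_NR = 512
 * are assumed (x86-64). */
#include <stdio.h>

#define PAGE_SHIFT	12
#define HPAGE_PMD_NR	512

int main(void)
{
	unsigned long vm_start = 0x700000;	/* the VMA begins here */
	unsigned long address  = 0x600000;	/* the THP would start 1MB earlier */
	int nr = HPAGE_PMD_NR;
	int off = 0;

	if (address < vm_start) {
		off = (vm_start - address) >> PAGE_SHIFT;
		nr -= off;
		address = vm_start;
	}

	/* 1MB / 4KB = 256 subpages are skipped, 256 remain to be frozen. */
	printf("skip %d subpages, freeze %d starting at 0x%lx\n",
	       off, nr, address);
	return 0;
}
```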
3134 | |||
3135 | static void unfreeze_page_vma(struct vm_area_struct *vma, struct page *page, | ||
3136 | unsigned long address) | ||
3137 | { | ||
3138 | spinlock_t *ptl; | ||
3139 | pmd_t *pmd; | ||
3140 | pte_t *pte, entry; | ||
3141 | swp_entry_t swp_entry; | ||
3142 | unsigned long haddr = address & HPAGE_PMD_MASK; | ||
3143 | int i, nr = HPAGE_PMD_NR; | ||
3144 | |||
3145 | /* Skip pages which don't belong to the VMA */ | ||
3146 | if (address < vma->vm_start) { | ||
3147 | int off = (vma->vm_start - address) >> PAGE_SHIFT; | ||
3148 | page += off; | ||
3149 | nr -= off; | ||
3150 | address = vma->vm_start; | ||
3151 | } | ||
3152 | |||
3153 | pmd = mm_find_pmd(vma->vm_mm, address); | ||
3154 | if (!pmd) | ||
3155 | return; | ||
3156 | |||
3157 | pte = pte_offset_map_lock(vma->vm_mm, pmd, address, &ptl); | ||
3158 | for (i = 0; i < nr; i++, address += PAGE_SIZE, page++, pte++) { | ||
3159 | /* | ||
3160 | * We've just crossed a page table boundary: need to map the next one. | ||
3161 | * It can happen if the THP was mremapped to a non-PMD-aligned address. | ||
3162 | */ | ||
3163 | if (unlikely(address == haddr + HPAGE_PMD_SIZE)) { | ||
3164 | pte_unmap_unlock(pte - 1, ptl); | ||
3165 | pmd = mm_find_pmd(vma->vm_mm, address); | ||
3166 | if (!pmd) | ||
3167 | return; | ||
3168 | pte = pte_offset_map_lock(vma->vm_mm, pmd, | ||
3169 | address, &ptl); | ||
3170 | } | ||
3171 | |||
3172 | if (!is_swap_pte(*pte)) | ||
3173 | continue; | ||
3174 | |||
3175 | swp_entry = pte_to_swp_entry(*pte); | ||
3176 | if (!is_migration_entry(swp_entry)) | ||
3177 | continue; | ||
3178 | if (migration_entry_to_page(swp_entry) != page) | ||
3179 | continue; | ||
3180 | |||
3181 | get_page(page); | ||
3182 | page_add_anon_rmap(page, vma, address, false); | ||
3183 | |||
3184 | entry = pte_mkold(mk_pte(page, vma->vm_page_prot)); | ||
3185 | if (PageDirty(page)) | ||
3186 | entry = pte_mkdirty(entry); | ||
3187 | if (is_write_migration_entry(swp_entry)) | ||
3188 | entry = maybe_mkwrite(entry, vma); | ||
3189 | |||
3190 | flush_dcache_page(page); | ||
3191 | set_pte_at(vma->vm_mm, address, pte, entry); | ||
3192 | |||
3193 | /* No need to invalidate - it was non-present before */ | ||
3194 | update_mmu_cache(vma, address, pte); | ||
3195 | } | ||
3196 | pte_unmap_unlock(pte - 1, ptl); | ||
3197 | } | ||
3198 | |||
3199 | static void unfreeze_page(struct anon_vma *anon_vma, struct page *page) | ||
3200 | { | ||
3201 | struct anon_vma_chain *avc; | ||
3202 | pgoff_t pgoff = page_to_pgoff(page); | ||
3203 | |||
3204 | anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, | ||
3205 | pgoff, pgoff + HPAGE_PMD_NR - 1) { | ||
3206 | unsigned long address = __vma_address(page, avc->vma); | ||
3207 | |||
3208 | mmu_notifier_invalidate_range_start(avc->vma->vm_mm, | ||
3209 | address, address + HPAGE_PMD_SIZE); | ||
3210 | unfreeze_page_vma(avc->vma, page, address); | ||
3211 | mmu_notifier_invalidate_range_end(avc->vma->vm_mm, | ||
3212 | address, address + HPAGE_PMD_SIZE); | ||
3213 | } | ||
3214 | } | ||
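freeze_page() parks each mapping as a migration entry that remembers only the pfn and the write permission, while dirtiness is moved onto the page itself; unfreeze_page() later rebuilds a present pte from those two sources. Below is a toy round-trip model of what survives the freeze/unfreeze cycle; the struct layout and encoding are purely illustrative and are not the kernel's pte or swp_entry_t formats.

```c
/* Toy model of the freeze/unfreeze round-trip: a present PTE is replaced
 * by a migration entry that keeps only the pfn and write permission, and
 * dirtiness is parked on the page.  Purely illustrative encoding. */
#include <stdbool.h>
#include <stdio.h>

struct toy_page { unsigned long pfn; bool dirty; };
struct toy_pte  { unsigned long pfn; bool present, write, dirty; };

static struct toy_pte freeze(struct toy_pte pte, struct toy_page *page)
{
	if (pte.dirty)
		page->dirty = true;			/* SetPageDirty() */
	return (struct toy_pte){ .pfn = pte.pfn,	/* "migration entry" */
				 .present = false, .write = pte.write };
}

static struct toy_pte unfreeze(struct toy_pte swp, struct toy_page *page)
{
	return (struct toy_pte){ .pfn = swp.pfn, .present = true,
				 .write = swp.write,	/* maybe_mkwrite()  */
				 .dirty = page->dirty };/* pte_mkdirty()    */
}

int main(void)
{
	struct toy_page page = { .pfn = 0x1234 };
	struct toy_pte pte = { .pfn = 0x1234, .present = true,
			       .write = true, .dirty = true };

	pte = unfreeze(freeze(pte, &page), &page);
	printf("present=%d write=%d dirty=%d\n", pte.present, pte.write, pte.dirty);
	return 0;
}
```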
3215 | |||
3216 | static int __split_huge_page_tail(struct page *head, int tail, | ||
3217 | struct lruvec *lruvec, struct list_head *list) | ||
3218 | { | ||
3219 | int mapcount; | ||
3220 | struct page *page_tail = head + tail; | ||
3221 | |||
3222 | mapcount = atomic_read(&page_tail->_mapcount) + 1; | ||
3223 | VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail); | ||
3224 | |||
3225 | /* | ||
3226 | * tail_page->_count is zero and not changing from under us. But | ||
3227 | * get_page_unless_zero() may be running from under us on the | ||
3228 | * tail_page. If we used atomic_set() below instead of atomic_add(), we | ||
3229 | * would then run atomic_set() concurrently with | ||
3230 | * get_page_unless_zero(), and atomic_set() is implemented in C not | ||
3231 | * using locked ops. spin_unlock on x86 sometime uses locked ops | ||
3232 | * because of PPro errata 66, 92, so unless somebody can guarantee | ||
3233 | * atomic_set() here would be safe on all archs (and not only on x86), | ||
3234 | * it's safer to use atomic_add(). | ||
3235 | */ | ||
3236 | atomic_add(mapcount + 1, &page_tail->_count); | ||
3237 | |||
3238 | |||
3239 | page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | ||
3240 | page_tail->flags |= (head->flags & | ||
3241 | ((1L << PG_referenced) | | ||
3242 | (1L << PG_swapbacked) | | ||
3243 | (1L << PG_mlocked) | | ||
3244 | (1L << PG_uptodate) | | ||
3245 | (1L << PG_active) | | ||
3246 | (1L << PG_locked) | | ||
3247 | (1L << PG_unevictable) | | ||
3248 | (1L << PG_dirty))); | ||
3249 | |||
3250 | /* | ||
3251 | * After clearing PageTail the gup refcount can be released. | ||
3252 | * Page flags also must be visible before we make the page non-compound. | ||
3253 | */ | ||
3254 | smp_wmb(); | ||
3255 | |||
3256 | clear_compound_head(page_tail); | ||
3257 | |||
3258 | if (page_is_young(head)) | ||
3259 | set_page_young(page_tail); | ||
3260 | if (page_is_idle(head)) | ||
3261 | set_page_idle(page_tail); | ||
3262 | |||
3263 | /* ->mapping in first tail page is compound_mapcount */ | ||
3264 | VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, | ||
3265 | page_tail); | ||
3266 | page_tail->mapping = head->mapping; | ||
3267 | |||
3268 | page_tail->index = head->index + tail; | ||
3269 | page_cpupid_xchg_last(page_tail, page_cpupid_last(head)); | ||
3270 | lru_add_page_tail(head, page_tail, lruvec, list); | ||
3271 | |||
3272 | return mapcount; | ||
3273 | } | ||
3274 | |||
3275 | static void __split_huge_page(struct page *page, struct list_head *list) | ||
3276 | { | ||
3277 | struct page *head = compound_head(page); | ||
3278 | struct zone *zone = page_zone(head); | ||
3279 | struct lruvec *lruvec; | ||
3280 | int i, tail_mapcount; | ||
3281 | |||
3282 | /* prevent PageLRU to go away from under us, and freeze lru stats */ | ||
3283 | spin_lock_irq(&zone->lru_lock); | ||
3284 | lruvec = mem_cgroup_page_lruvec(head, zone); | ||
3285 | |||
3286 | /* complete memcg works before add pages to LRU */ | ||
3287 | mem_cgroup_split_huge_fixup(head); | ||
3288 | |||
3289 | tail_mapcount = 0; | ||
3290 | for (i = HPAGE_PMD_NR - 1; i >= 1; i--) | ||
3291 | tail_mapcount += __split_huge_page_tail(head, i, lruvec, list); | ||
3292 | atomic_sub(tail_mapcount, &head->_count); | ||
3293 | |||
3294 | ClearPageCompound(head); | ||
3295 | spin_unlock_irq(&zone->lru_lock); | ||
3296 | |||
3297 | unfreeze_page(page_anon_vma(head), head); | ||
3298 | |||
3299 | for (i = 0; i < HPAGE_PMD_NR; i++) { | ||
3300 | struct page *subpage = head + i; | ||
3301 | if (subpage == page) | ||
3302 | continue; | ||
3303 | unlock_page(subpage); | ||
3304 | |||
3305 | /* | ||
3306 | * Subpages may be freed if there wasn't any mapping | ||
3307 | * like if add_to_swap() is running on a lru page that | ||
3308 | * had its mapping zapped. And freeing these pages | ||
3309 | * requires taking the lru_lock so we do the put_page | ||
3310 | * of the tail pages after the split is complete. | ||
3311 | */ | ||
3312 | put_page(subpage); | ||
3187 | } | 3313 | } |
3188 | } | 3314 | } |
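The refcount hand-off in __split_huge_page_tail() follows the formula in the code: a tail with N PTE mappings receives N + 1 references, and the head gives up N; the extra reference per tail is what the put_page() loop at the end of __split_huge_page() drops (or, for the subpage the caller asked about, what becomes the caller's pin). In the current call path the mapcounts are normally already zero, since freeze_page() has replaced the PTEs with migration entries, so the transfer usually degenerates to a single reference per tail. The model below just exercises the general arithmetic with made-up mapcounts; plain ints stand in for the atomics.

```c
/* Model of the refcount transfer in __split_huge_page_tail(): a tail
 * mapped by N PTEs receives N + 1 references and the head gives up N.
 * Three tails and the starting refcount are made-up numbers. */
#include <stdio.h>

#define NR_TAILS 3

int main(void)
{
	int tail_maps[NR_TAILS] = { 2, 0, 1 };	/* PTE mappings per tail */
	int tail_count[NR_TAILS];
	int head_count = 100;			/* arbitrary starting refcount */
	int transferred = 0, i;

	for (i = 0; i < NR_TAILS; i++) {
		int mapcount = tail_maps[i];
		tail_count[i] = mapcount + 1;	/* atomic_add(mapcount + 1, ...) */
		transferred += mapcount;	/* summed into tail_mapcount */
	}
	head_count -= transferred;		/* atomic_sub(tail_mapcount, ...) */

	for (i = 0; i < NR_TAILS; i++)
		printf("tail %d: mapcount %d, refcount %d\n",
		       i, tail_maps[i], tail_count[i]);
	printf("head refcount: %d\n", head_count);
	return 0;
}
```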
3315 | |||
3316 | int total_mapcount(struct page *page) | ||
3317 | { | ||
3318 | int i, ret; | ||
3319 | |||
3320 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
3321 | |||
3322 | if (likely(!PageCompound(page))) | ||
3323 | return atomic_read(&page->_mapcount) + 1; | ||
3324 | |||
3325 | ret = compound_mapcount(page); | ||
3326 | if (PageHuge(page)) | ||
3327 | return ret; | ||
3328 | for (i = 0; i < HPAGE_PMD_NR; i++) | ||
3329 | ret += atomic_read(&page[i]._mapcount) + 1; | ||
3330 | if (PageDoubleMap(page)) | ||
3331 | ret -= HPAGE_PMD_NR; | ||
3332 | return ret; | ||
3333 | } | ||
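total_mapcount() adds the PMD-level (compound) mapcount to the per-subpage PTE mapcounts, where each raw _mapcount is stored as "mappings minus one"; if PG_double_map is set, every subpage carries one extra bookkeeping increment, which is subtracted back out in a single HPAGE_PMD_NR-sized step. A userspace model of that arithmetic; HPAGE_PMD_NR = 512 and the two sample states are assumptions of the sketch.

```c
/* Same arithmetic as total_mapcount(); raw _mapcount is "mappings - 1"
 * and HPAGE_PMD_NR = 512 is the assumed x86-64 value. */
#include <stdbool.h>
#include <stdio.h>

#define HPAGE_PMD_NR 512

static int total_mapcount_model(int compound_mapcount,
				const int *raw_mapcount, bool double_map)
{
	int i, ret = compound_mapcount;

	for (i = 0; i < HPAGE_PMD_NR; i++)
		ret += raw_mapcount[i] + 1;
	if (double_map)
		ret -= HPAGE_PMD_NR;
	return ret;
}

int main(void)
{
	int raw[HPAGE_PMD_NR];
	int i;

	/* PMD-mapped only: no PTE mappings, raw _mapcount is -1 everywhere. */
	for (i = 0; i < HPAGE_PMD_NR; i++)
		raw[i] = -1;
	printf("pmd only: %d\n", total_mapcount_model(1, raw, false));

	/* PMD mapping plus one PTE mapping of every subpage: PG_double_map is
	 * set, each raw _mapcount reads 1 (one real mapping + the bump), and
	 * the total comes out as 1 + 512 = 513. */
	for (i = 0; i < HPAGE_PMD_NR; i++)
		raw[i] = 1;
	printf("pmd + ptes: %d\n", total_mapcount_model(1, raw, true));
	return 0;
}
```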
3334 | |||
3335 | /* | ||
3336 | * This function splits a huge page into normal pages. @page can point to any | ||
3337 | * subpage of the huge page to split. The split doesn't change the position of @page. | ||
3338 | * | ||
3339 | * The caller must hold the only pin on @page, otherwise the split fails with -EBUSY. | ||
3340 | * The huge page must be locked. | ||
3341 | * | ||
3342 | * If @list is null, tail pages will be added to LRU list, otherwise, to @list. | ||
3343 | * | ||
3344 | * Both head page and tail pages will inherit mapping, flags, and so on from | ||
3345 | * the hugepage. | ||
3346 | * | ||
3347 | * The GUP pin and PG_locked are transferred to @page. The remaining subpages | ||
3348 | * can be freed if they are not mapped. | ||
3349 | * | ||
3350 | * Returns 0 if the hugepage is split successfully. | ||
3351 | * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under | ||
3352 | * us. | ||
3353 | */ | ||
3354 | int split_huge_page_to_list(struct page *page, struct list_head *list) | ||
3355 | { | ||
3356 | struct page *head = compound_head(page); | ||
3357 | struct anon_vma *anon_vma; | ||
3358 | int count, mapcount, ret; | ||
3359 | bool mlocked; | ||
3360 | |||
3361 | VM_BUG_ON_PAGE(is_huge_zero_page(page), page); | ||
3362 | VM_BUG_ON_PAGE(!PageAnon(page), page); | ||
3363 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
3364 | VM_BUG_ON_PAGE(!PageSwapBacked(page), page); | ||
3365 | VM_BUG_ON_PAGE(!PageCompound(page), page); | ||
3366 | |||
3367 | /* | ||
3368 | * The caller does not necessarily hold an mmap_sem that would prevent | ||
3369 | * the anon_vma disappearing, so we first take a reference to it | ||
3370 | * and then lock the anon_vma for write. This is similar to | ||
3371 | * page_lock_anon_vma_read except the write lock is taken to serialise | ||
3372 | * against parallel split or collapse operations. | ||
3373 | */ | ||
3374 | anon_vma = page_get_anon_vma(head); | ||
3375 | if (!anon_vma) { | ||
3376 | ret = -EBUSY; | ||
3377 | goto out; | ||
3378 | } | ||
3379 | anon_vma_lock_write(anon_vma); | ||
3380 | |||
3381 | /* | ||
3382 | * Racy check if we can split the page, before freeze_page() will | ||
3383 | * split PMDs | ||
3384 | */ | ||
3385 | if (total_mapcount(head) != page_count(head) - 1) { | ||
3386 | ret = -EBUSY; | ||
3387 | goto out_unlock; | ||
3388 | } | ||
3389 | |||
3390 | mlocked = PageMlocked(page); | ||
3391 | freeze_page(anon_vma, head); | ||
3392 | VM_BUG_ON_PAGE(compound_mapcount(head), head); | ||
3393 | |||
3394 | /* Make sure the page is not on per-CPU pagevec as it takes pin */ | ||
3395 | if (mlocked) | ||
3396 | lru_add_drain(); | ||
3397 | |||
3398 | /* Prevent deferred_split_scan() touching ->_count */ | ||
3399 | spin_lock(&split_queue_lock); | ||
3400 | count = page_count(head); | ||
3401 | mapcount = total_mapcount(head); | ||
3402 | if (!mapcount && count == 1) { | ||
3403 | if (!list_empty(page_deferred_list(head))) { | ||
3404 | split_queue_len--; | ||
3405 | list_del(page_deferred_list(head)); | ||
3406 | } | ||
3407 | spin_unlock(&split_queue_lock); | ||
3408 | __split_huge_page(page, list); | ||
3409 | ret = 0; | ||
3410 | } else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { | ||
3411 | spin_unlock(&split_queue_lock); | ||
3412 | pr_alert("total_mapcount: %u, page_count(): %u\n", | ||
3413 | mapcount, count); | ||
3414 | if (PageTail(page)) | ||
3415 | dump_page(head, NULL); | ||
3416 | dump_page(page, "total_mapcount(head) > 0"); | ||
3417 | BUG(); | ||
3418 | } else { | ||
3419 | spin_unlock(&split_queue_lock); | ||
3420 | unfreeze_page(anon_vma, head); | ||
3421 | ret = -EBUSY; | ||
3422 | } | ||
3423 | |||
3424 | out_unlock: | ||
3425 | anon_vma_unlock_write(anon_vma); | ||
3426 | put_anon_vma(anon_vma); | ||
3427 | out: | ||
3428 | count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); | ||
3429 | return ret; | ||
3430 | } | ||
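The racy pre-check and the final test under split_queue_lock both encode the same accounting: every mapping of the THP holds one page reference and the caller holds one pin, so a splittable page satisfies page_count == total_mapcount + 1; any surplus means someone else has the page pinned (GUP, a per-CPU pagevec, ...) and the split backs off with -EBUSY. A tiny illustration of that arithmetic with made-up numbers:

```c
/* The "can we split?" test in split_huge_page_to_list() is reference
 * accounting: each mapping holds one reference and the caller holds one
 * pin; anything beyond that means an extra pin.  Numbers are illustrative. */
#include <stdio.h>

static int can_split(int page_count, int total_mapcount)
{
	return total_mapcount == page_count - 1;
}

int main(void)
{
	/* 3 mappings + the caller's pin: 4 references -> splittable. */
	printf("count=4 mapcount=3 -> %s\n", can_split(4, 3) ? "split" : "-EBUSY");

	/* Same mappings but one extra GUP pin -> refuse to split. */
	printf("count=5 mapcount=3 -> %s\n", can_split(5, 3) ? "split" : "-EBUSY");
	return 0;
}
```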
3431 | |||
3432 | void free_transhuge_page(struct page *page) | ||
3433 | { | ||
3434 | unsigned long flags; | ||
3435 | |||
3436 | spin_lock_irqsave(&split_queue_lock, flags); | ||
3437 | if (!list_empty(page_deferred_list(page))) { | ||
3438 | split_queue_len--; | ||
3439 | list_del(page_deferred_list(page)); | ||
3440 | } | ||
3441 | spin_unlock_irqrestore(&split_queue_lock, flags); | ||
3442 | free_compound_page(page); | ||
3443 | } | ||
3444 | |||
3445 | void deferred_split_huge_page(struct page *page) | ||
3446 | { | ||
3447 | unsigned long flags; | ||
3448 | |||
3449 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
3450 | |||
3451 | spin_lock_irqsave(&split_queue_lock, flags); | ||
3452 | if (list_empty(page_deferred_list(page))) { | ||
3453 | list_add_tail(page_deferred_list(page), &split_queue); | ||
3454 | split_queue_len++; | ||
3455 | } | ||
3456 | spin_unlock_irqrestore(&split_queue_lock, flags); | ||
3457 | } | ||
3458 | |||
3459 | static unsigned long deferred_split_count(struct shrinker *shrink, | ||
3460 | struct shrink_control *sc) | ||
3461 | { | ||
3462 | /* | ||
3463 | * Splitting a page from split_queue will free up at least one page, | ||
3464 | * at most HPAGE_PMD_NR - 1. We don't track the exact number, | ||
3465 | * so let's use HPAGE_PMD_NR / 2 as a ballpark. | ||
3466 | */ | ||
3467 | return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2; | ||
3468 | } | ||
3469 | |||
3470 | static unsigned long deferred_split_scan(struct shrinker *shrink, | ||
3471 | struct shrink_control *sc) | ||
3472 | { | ||
3473 | unsigned long flags; | ||
3474 | LIST_HEAD(list), *pos, *next; | ||
3475 | struct page *page; | ||
3476 | int split = 0; | ||
3477 | |||
3478 | spin_lock_irqsave(&split_queue_lock, flags); | ||
3479 | list_splice_init(&split_queue, &list); | ||
3480 | |||
3481 | /* Take pin on all head pages to avoid freeing them under us */ | ||
3482 | list_for_each_safe(pos, next, &list) { | ||
3483 | page = list_entry((void *)pos, struct page, mapping); | ||
3484 | page = compound_head(page); | ||
3485 | /* race with put_compound_page() */ | ||
3486 | if (!get_page_unless_zero(page)) { | ||
3487 | list_del_init(page_deferred_list(page)); | ||
3488 | split_queue_len--; | ||
3489 | } | ||
3490 | } | ||
3491 | spin_unlock_irqrestore(&split_queue_lock, flags); | ||
3492 | |||
3493 | list_for_each_safe(pos, next, &list) { | ||
3494 | page = list_entry((void *)pos, struct page, mapping); | ||
3495 | lock_page(page); | ||
3496 | /* split_huge_page() removes page from list on success */ | ||
3497 | if (!split_huge_page(page)) | ||
3498 | split++; | ||
3499 | unlock_page(page); | ||
3500 | put_page(page); | ||
3501 | } | ||
3502 | |||
3503 | spin_lock_irqsave(&split_queue_lock, flags); | ||
3504 | list_splice_tail(&list, &split_queue); | ||
3505 | spin_unlock_irqrestore(&split_queue_lock, flags); | ||
3506 | |||
3507 | return split * HPAGE_PMD_NR / 2; | ||
3508 | } | ||
3509 | |||
3510 | static struct shrinker deferred_split_shrinker = { | ||
3511 | .count_objects = deferred_split_count, | ||
3512 | .scan_objects = deferred_split_scan, | ||
3513 | .seeks = DEFAULT_SEEKS, | ||
3514 | }; | ||
3515 | |||
3516 | #ifdef CONFIG_DEBUG_FS | ||
3517 | static int split_huge_pages_set(void *data, u64 val) | ||
3518 | { | ||
3519 | struct zone *zone; | ||
3520 | struct page *page; | ||
3521 | unsigned long pfn, max_zone_pfn; | ||
3522 | unsigned long total = 0, split = 0; | ||
3523 | |||
3524 | if (val != 1) | ||
3525 | return -EINVAL; | ||
3526 | |||
3527 | for_each_populated_zone(zone) { | ||
3528 | max_zone_pfn = zone_end_pfn(zone); | ||
3529 | for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) { | ||
3530 | if (!pfn_valid(pfn)) | ||
3531 | continue; | ||
3532 | |||
3533 | page = pfn_to_page(pfn); | ||
3534 | if (!get_page_unless_zero(page)) | ||
3535 | continue; | ||
3536 | |||
3537 | if (zone != page_zone(page)) | ||
3538 | goto next; | ||
3539 | |||
3540 | if (!PageHead(page) || !PageAnon(page) || | ||
3541 | PageHuge(page)) | ||
3542 | goto next; | ||
3543 | |||
3544 | total++; | ||
3545 | lock_page(page); | ||
3546 | if (!split_huge_page(page)) | ||
3547 | split++; | ||
3548 | unlock_page(page); | ||
3549 | next: | ||
3550 | put_page(page); | ||
3551 | } | ||
3552 | } | ||
3553 | |||
3554 | pr_info("%lu of %lu THP split", split, total); | ||
3555 | |||
3556 | return 0; | ||
3557 | } | ||
3558 | DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set, | ||
3559 | "%llu\n"); | ||
3560 | |||
3561 | static int __init split_huge_pages_debugfs(void) | ||
3562 | { | ||
3563 | void *ret; | ||
3564 | |||
3565 | ret = debugfs_create_file("split_huge_pages", 0644, NULL, NULL, | ||
3566 | &split_huge_pages_fops); | ||
3567 | if (!ret) | ||
3568 | pr_warn("Failed to create split_huge_pages in debugfs"); | ||
3569 | return 0; | ||
3570 | } | ||
3571 | late_initcall(split_huge_pages_debugfs); | ||
3572 | #endif | ||
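The new debugfs knob accepts only the value 1 and then walks every populated zone trying to split each anonymous THP head page, reporting the totals via pr_info(). Below is a minimal userspace trigger; it assumes debugfs is mounted at the conventional /sys/kernel/debug and that the process has enough privilege to write there.

```c
/* Minimal trigger for the split_huge_pages debugfs file created above;
 * assumes debugfs is mounted at /sys/kernel/debug and sufficient privilege. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

	if (!f) {
		perror("split_huge_pages");
		return 1;
	}
	/* Only the value 1 is accepted; anything else yields -EINVAL. */
	fputs("1\n", f);
	if (fclose(f) != 0)
		perror("write failed");
	return 0;
}
```

This is equivalent to running `echo 1 > /sys/kernel/debug/split_huge_pages` from a root shell; the per-zone results land in the kernel log.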
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index be934df69b85..12908dcf5831 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -1267,8 +1267,8 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order) | |||
1267 | 1267 | ||
1268 | /* we rely on prep_new_huge_page to set the destructor */ | 1268 | /* we rely on prep_new_huge_page to set the destructor */ |
1269 | set_compound_order(page, order); | 1269 | set_compound_order(page, order); |
1270 | __SetPageHead(page); | ||
1271 | __ClearPageReserved(page); | 1270 | __ClearPageReserved(page); |
1271 | __SetPageHead(page); | ||
1272 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | 1272 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
1273 | /* | 1273 | /* |
1274 | * For gigantic hugepages allocated through bootmem at | 1274 | * For gigantic hugepages allocated through bootmem at |
@@ -3102,7 +3102,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, | |||
3102 | entry = huge_ptep_get(src_pte); | 3102 | entry = huge_ptep_get(src_pte); |
3103 | ptepage = pte_page(entry); | 3103 | ptepage = pte_page(entry); |
3104 | get_page(ptepage); | 3104 | get_page(ptepage); |
3105 | page_dup_rmap(ptepage); | 3105 | page_dup_rmap(ptepage, true); |
3106 | set_huge_pte_at(dst, addr, dst_pte, entry); | 3106 | set_huge_pte_at(dst, addr, dst_pte, entry); |
3107 | hugetlb_count_add(pages_per_huge_page(h), dst); | 3107 | hugetlb_count_add(pages_per_huge_page(h), dst); |
3108 | } | 3108 | } |
@@ -3186,7 +3186,7 @@ again: | |||
3186 | set_page_dirty(page); | 3186 | set_page_dirty(page); |
3187 | 3187 | ||
3188 | hugetlb_count_sub(pages_per_huge_page(h), mm); | 3188 | hugetlb_count_sub(pages_per_huge_page(h), mm); |
3189 | page_remove_rmap(page); | 3189 | page_remove_rmap(page, true); |
3190 | force_flush = !__tlb_remove_page(tlb, page); | 3190 | force_flush = !__tlb_remove_page(tlb, page); |
3191 | if (force_flush) { | 3191 | if (force_flush) { |
3192 | address += sz; | 3192 | address += sz; |
@@ -3415,7 +3415,7 @@ retry_avoidcopy: | |||
3415 | mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); | 3415 | mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); |
3416 | set_huge_pte_at(mm, address, ptep, | 3416 | set_huge_pte_at(mm, address, ptep, |
3417 | make_huge_pte(vma, new_page, 1)); | 3417 | make_huge_pte(vma, new_page, 1)); |
3418 | page_remove_rmap(old_page); | 3418 | page_remove_rmap(old_page, true); |
3419 | hugepage_add_new_anon_rmap(new_page, vma, address); | 3419 | hugepage_add_new_anon_rmap(new_page, vma, address); |
3420 | /* Make the old page be freed below */ | 3420 | /* Make the old page be freed below */ |
3421 | new_page = old_page; | 3421 | new_page = old_page; |
@@ -3585,7 +3585,7 @@ retry: | |||
3585 | ClearPagePrivate(page); | 3585 | ClearPagePrivate(page); |
3586 | hugepage_add_new_anon_rmap(page, vma, address); | 3586 | hugepage_add_new_anon_rmap(page, vma, address); |
3587 | } else | 3587 | } else |
3588 | page_dup_rmap(page); | 3588 | page_dup_rmap(page, true); |
3589 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) | 3589 | new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) |
3590 | && (vma->vm_flags & VM_SHARED))); | 3590 | && (vma->vm_flags & VM_SHARED))); |
3591 | set_huge_pte_at(mm, address, ptep, new_pte); | 3591 | set_huge_pte_at(mm, address, ptep, new_pte); |
@@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3865 | same_page: | 3865 | same_page: |
3866 | if (pages) { | 3866 | if (pages) { |
3867 | pages[i] = mem_map_offset(page, pfn_offset); | 3867 | pages[i] = mem_map_offset(page, pfn_offset); |
3868 | get_page_foll(pages[i]); | 3868 | get_page(pages[i]); |
3869 | } | 3869 | } |
3870 | 3870 | ||
3871 | if (vmas) | 3871 | if (vmas) |
diff --git a/mm/internal.h b/mm/internal.h index 38e24b89e4c4..ed8b5ffcf9b1 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -13,6 +13,7 @@ | |||
13 | 13 | ||
14 | #include <linux/fs.h> | 14 | #include <linux/fs.h> |
15 | #include <linux/mm.h> | 15 | #include <linux/mm.h> |
16 | #include <linux/pagemap.h> | ||
16 | 17 | ||
17 | /* | 18 | /* |
18 | * The set of flags that only affect watermark checking and reclaim | 19 | * The set of flags that only affect watermark checking and reclaim |
@@ -66,50 +67,6 @@ static inline void set_page_refcounted(struct page *page) | |||
66 | set_page_count(page, 1); | 67 | set_page_count(page, 1); |
67 | } | 68 | } |
68 | 69 | ||
69 | static inline void __get_page_tail_foll(struct page *page, | ||
70 | bool get_page_head) | ||
71 | { | ||
72 | /* | ||
73 | * If we're getting a tail page, the elevated page->_count is | ||
74 | * required only in the head page and we will elevate the head | ||
75 | * page->_count and tail page->_mapcount. | ||
76 | * | ||
77 | * We elevate page_tail->_mapcount for tail pages to force | ||
78 | * page_tail->_count to be zero at all times to avoid getting | ||
79 | * false positives from get_page_unless_zero() with | ||
80 | * speculative page access (like in | ||
81 | * page_cache_get_speculative()) on tail pages. | ||
82 | */ | ||
83 | VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); | ||
84 | if (get_page_head) | ||
85 | atomic_inc(&compound_head(page)->_count); | ||
86 | get_huge_page_tail(page); | ||
87 | } | ||
88 | |||
89 | /* | ||
90 | * This is meant to be called as the FOLL_GET operation of | ||
91 | * follow_page() and it must be called while holding the proper PT | ||
92 | * lock while the pte (or pmd_trans_huge) is still mapping the page. | ||
93 | */ | ||
94 | static inline void get_page_foll(struct page *page) | ||
95 | { | ||
96 | if (unlikely(PageTail(page))) | ||
97 | /* | ||
98 | * This is safe only because | ||
99 | * __split_huge_page_refcount() can't run under | ||
100 | * get_page_foll() because we hold the proper PT lock. | ||
101 | */ | ||
102 | __get_page_tail_foll(page, true); | ||
103 | else { | ||
104 | /* | ||
105 | * Getting a normal page or the head of a compound page | ||
106 | * requires to already have an elevated page->_count. | ||
107 | */ | ||
108 | VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); | ||
109 | atomic_inc(&page->_count); | ||
110 | } | ||
111 | } | ||
112 | |||
113 | extern unsigned long highest_memmap_pfn; | 70 | extern unsigned long highest_memmap_pfn; |
114 | 71 | ||
115 | /* | 72 | /* |
@@ -309,10 +266,27 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) | |||
309 | 266 | ||
310 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); | 267 | extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); |
311 | 268 | ||
312 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 269 | /* |
313 | extern unsigned long vma_address(struct page *page, | 270 | * At what user virtual address is page expected in @vma? |
314 | struct vm_area_struct *vma); | 271 | */ |
315 | #endif | 272 | static inline unsigned long |
273 | __vma_address(struct page *page, struct vm_area_struct *vma) | ||
274 | { | ||
275 | pgoff_t pgoff = page_to_pgoff(page); | ||
276 | return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
277 | } | ||
278 | |||
279 | static inline unsigned long | ||
280 | vma_address(struct page *page, struct vm_area_struct *vma) | ||
281 | { | ||
282 | unsigned long address = __vma_address(page, vma); | ||
283 | |||
284 | /* page should be within @vma mapping range */ | ||
285 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); | ||
286 | |||
287 | return address; | ||
288 | } | ||
289 | |||
316 | #else /* !CONFIG_MMU */ | 290 | #else /* !CONFIG_MMU */ |
317 | static inline void clear_page_mlock(struct page *page) { } | 291 | static inline void clear_page_mlock(struct page *page) { } |
318 | static inline void mlock_vma_page(struct page *page) { } | 292 | static inline void mlock_vma_page(struct page *page) { } |
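The __vma_address()/vma_address() helpers that replace the old CONFIG_TRANSPARENT_HUGEPAGE-only declaration are pure offset arithmetic. Below is a self-contained userspace rendering of that formula; it assumes 4 KiB pages and uses simplified stand-in types (the real helpers take struct page and struct vm_area_struct).

#include <stdio.h>

#define TOY_PAGE_SHIFT 12	/* assume 4 KiB pages */

/* Only the two fields the kernel formula actually reads. */
struct toy_vma {
	unsigned long vm_start;	/* first user virtual address of the mapping */
	unsigned long vm_pgoff;	/* page offset within the object at vm_start */
};

/* Mirrors: vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT) */
static unsigned long toy_vma_address(unsigned long pgoff,
				     const struct toy_vma *vma)
{
	return vma->vm_start + ((pgoff - vma->vm_pgoff) << TOY_PAGE_SHIFT);
}

int main(void)
{
	struct toy_vma vma = { .vm_start = 0x700000000000UL, .vm_pgoff = 16 };

	/* Page 20 of the backing object lands 4 pages into the mapping. */
	printf("expected address: %#lx\n", toy_vma_address(20, &vma));
	return 0;
}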
diff --git a/mm/ksm.c b/mm/ksm.c --- a/mm/ksm.c +++ b/mm/ksm.c | |||
@@ -441,20 +441,6 @@ static void break_cow(struct rmap_item *rmap_item) | |||
441 | up_read(&mm->mmap_sem); | 441 | up_read(&mm->mmap_sem); |
442 | } | 442 | } |
443 | 443 | ||
444 | static struct page *page_trans_compound_anon(struct page *page) | ||
445 | { | ||
446 | if (PageTransCompound(page)) { | ||
447 | struct page *head = compound_head(page); | ||
448 | /* | ||
449 | * head may actually be splitted and freed from under | ||
450 | * us but it's ok here. | ||
451 | */ | ||
452 | if (PageAnon(head)) | ||
453 | return head; | ||
454 | } | ||
455 | return NULL; | ||
456 | } | ||
457 | |||
458 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) | 444 | static struct page *get_mergeable_page(struct rmap_item *rmap_item) |
459 | { | 445 | { |
460 | struct mm_struct *mm = rmap_item->mm; | 446 | struct mm_struct *mm = rmap_item->mm; |
@@ -470,7 +456,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
470 | page = follow_page(vma, addr, FOLL_GET); | 456 | page = follow_page(vma, addr, FOLL_GET); |
471 | if (IS_ERR_OR_NULL(page)) | 457 | if (IS_ERR_OR_NULL(page)) |
472 | goto out; | 458 | goto out; |
473 | if (PageAnon(page) || page_trans_compound_anon(page)) { | 459 | if (PageAnon(page)) { |
474 | flush_anon_page(vma, page, addr); | 460 | flush_anon_page(vma, page, addr); |
475 | flush_dcache_page(page); | 461 | flush_dcache_page(page); |
476 | } else { | 462 | } else { |
@@ -956,13 +942,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, | |||
956 | } | 942 | } |
957 | 943 | ||
958 | get_page(kpage); | 944 | get_page(kpage); |
959 | page_add_anon_rmap(kpage, vma, addr); | 945 | page_add_anon_rmap(kpage, vma, addr, false); |
960 | 946 | ||
961 | flush_cache_page(vma, addr, pte_pfn(*ptep)); | 947 | flush_cache_page(vma, addr, pte_pfn(*ptep)); |
962 | ptep_clear_flush_notify(vma, addr, ptep); | 948 | ptep_clear_flush_notify(vma, addr, ptep); |
963 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); | 949 | set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); |
964 | 950 | ||
965 | page_remove_rmap(page); | 951 | page_remove_rmap(page, false); |
966 | if (!page_mapped(page)) | 952 | if (!page_mapped(page)) |
967 | try_to_free_swap(page); | 953 | try_to_free_swap(page); |
968 | put_page(page); | 954 | put_page(page); |
@@ -975,33 +961,6 @@ out: | |||
975 | return err; | 961 | return err; |
976 | } | 962 | } |
977 | 963 | ||
978 | static int page_trans_compound_anon_split(struct page *page) | ||
979 | { | ||
980 | int ret = 0; | ||
981 | struct page *transhuge_head = page_trans_compound_anon(page); | ||
982 | if (transhuge_head) { | ||
983 | /* Get the reference on the head to split it. */ | ||
984 | if (get_page_unless_zero(transhuge_head)) { | ||
985 | /* | ||
986 | * Recheck we got the reference while the head | ||
987 | * was still anonymous. | ||
988 | */ | ||
989 | if (PageAnon(transhuge_head)) | ||
990 | ret = split_huge_page(transhuge_head); | ||
991 | else | ||
992 | /* | ||
993 | * Retry later if split_huge_page run | ||
994 | * from under us. | ||
995 | */ | ||
996 | ret = 1; | ||
997 | put_page(transhuge_head); | ||
998 | } else | ||
999 | /* Retry later if split_huge_page run from under us. */ | ||
1000 | ret = 1; | ||
1001 | } | ||
1002 | return ret; | ||
1003 | } | ||
1004 | |||
1005 | /* | 964 | /* |
1006 | * try_to_merge_one_page - take two pages and merge them into one | 965 | * try_to_merge_one_page - take two pages and merge them into one |
1007 | * @vma: the vma that holds the pte pointing to page | 966 | * @vma: the vma that holds the pte pointing to page |
@@ -1020,9 +979,6 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
1020 | if (page == kpage) /* ksm page forked */ | 979 | if (page == kpage) /* ksm page forked */ |
1021 | return 0; | 980 | return 0; |
1022 | 981 | ||
1023 | if (PageTransCompound(page) && page_trans_compound_anon_split(page)) | ||
1024 | goto out; | ||
1025 | BUG_ON(PageTransCompound(page)); | ||
1026 | if (!PageAnon(page)) | 982 | if (!PageAnon(page)) |
1027 | goto out; | 983 | goto out; |
1028 | 984 | ||
@@ -1035,6 +991,13 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
1035 | */ | 991 | */ |
1036 | if (!trylock_page(page)) | 992 | if (!trylock_page(page)) |
1037 | goto out; | 993 | goto out; |
994 | |||
995 | if (PageTransCompound(page)) { | ||
996 | err = split_huge_page(page); | ||
997 | if (err) | ||
998 | goto out_unlock; | ||
999 | } | ||
1000 | |||
1038 | /* | 1001 | /* |
1039 | * If this anonymous page is mapped only here, its pte may need | 1002 | * If this anonymous page is mapped only here, its pte may need |
1040 | * to be write-protected. If it's mapped elsewhere, all of its | 1003 | * to be write-protected. If it's mapped elsewhere, all of its |
@@ -1050,6 +1013,12 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
1050 | */ | 1013 | */ |
1051 | set_page_stable_node(page, NULL); | 1014 | set_page_stable_node(page, NULL); |
1052 | mark_page_accessed(page); | 1015 | mark_page_accessed(page); |
1016 | /* | ||
1017 | * Page reclaim just frees a clean page with no dirty | ||
1018 | * ptes: make sure the KSM page gets swapped out instead. | ||
1019 | */ | ||
1020 | if (!PageDirty(page)) | ||
1021 | SetPageDirty(page); | ||
1053 | err = 0; | 1022 | err = 0; |
1054 | } else if (pages_identical(page, kpage)) | 1023 | } else if (pages_identical(page, kpage)) |
1055 | err = replace_page(vma, page, kpage, orig_pte); | 1024 | err = replace_page(vma, page, kpage, orig_pte); |
@@ -1065,6 +1034,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, | |||
1065 | } | 1034 | } |
1066 | } | 1035 | } |
1067 | 1036 | ||
1037 | out_unlock: | ||
1068 | unlock_page(page); | 1038 | unlock_page(page); |
1069 | out: | 1039 | out: |
1070 | return err; | 1040 | return err; |
@@ -1635,8 +1605,7 @@ next_mm: | |||
1635 | cond_resched(); | 1605 | cond_resched(); |
1636 | continue; | 1606 | continue; |
1637 | } | 1607 | } |
1638 | if (PageAnon(*page) || | 1608 | if (PageAnon(*page)) { |
1639 | page_trans_compound_anon(*page)) { | ||
1640 | flush_anon_page(vma, *page, ksm_scan.address); | 1609 | flush_anon_page(vma, *page, ksm_scan.address); |
1641 | flush_dcache_page(*page); | 1610 | flush_dcache_page(*page); |
1642 | rmap_item = get_next_rmap_item(slot, | 1611 | rmap_item = get_next_rmap_item(slot, |
@@ -1899,7 +1868,7 @@ struct page *ksm_might_need_to_copy(struct page *page, | |||
1899 | 1868 | ||
1900 | SetPageDirty(new_page); | 1869 | SetPageDirty(new_page); |
1901 | __SetPageUptodate(new_page); | 1870 | __SetPageUptodate(new_page); |
1902 | __set_page_locked(new_page); | 1871 | __SetPageLocked(new_page); |
1903 | } | 1872 | } |
1904 | 1873 | ||
1905 | return new_page; | 1874 | return new_page; |
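With page_trans_compound_anon() and page_trans_compound_anon_split() gone, try_to_merge_one_page() simply calls split_huge_page() under the page lock when it meets a THP. The userspace side is unchanged; the sketch below only marks an anonymous region as a merge candidate and assumes ksmd has been enabled through /sys/kernel/mm/ksm/run and that the libc headers define MADV_MERGEABLE.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64 * 4096;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Fill the pages with identical data so ksmd can merge them. */
	memset(buf, 0x5a, len);

	/* Register the range with KSM; merging happens asynchronously. */
	if (madvise(buf, len, MADV_MERGEABLE))
		perror("madvise(MADV_MERGEABLE)");

	pause();	/* keep the mapping alive while ksmd scans it */
	return 0;
}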
diff --git a/mm/madvise.c b/mm/madvise.c index c889fcbb530e..f56825b6d2e1 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -20,6 +20,9 @@ | |||
20 | #include <linux/backing-dev.h> | 20 | #include <linux/backing-dev.h> |
21 | #include <linux/swap.h> | 21 | #include <linux/swap.h> |
22 | #include <linux/swapops.h> | 22 | #include <linux/swapops.h> |
23 | #include <linux/mmu_notifier.h> | ||
24 | |||
25 | #include <asm/tlb.h> | ||
23 | 26 | ||
24 | /* | 27 | /* |
25 | * Any behaviour which results in changes to the vma->vm_flags needs to | 28 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -32,6 +35,7 @@ static int madvise_need_mmap_write(int behavior) | |||
32 | case MADV_REMOVE: | 35 | case MADV_REMOVE: |
33 | case MADV_WILLNEED: | 36 | case MADV_WILLNEED: |
34 | case MADV_DONTNEED: | 37 | case MADV_DONTNEED: |
38 | case MADV_FREE: | ||
35 | return 0; | 39 | return 0; |
36 | default: | 40 | default: |
37 | /* be safe, default to 1. list exceptions explicitly */ | 41 | /* be safe, default to 1. list exceptions explicitly */ |
@@ -256,6 +260,194 @@ static long madvise_willneed(struct vm_area_struct *vma, | |||
256 | return 0; | 260 | return 0; |
257 | } | 261 | } |
258 | 262 | ||
263 | static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, | ||
264 | unsigned long end, struct mm_walk *walk) | ||
265 | |||
266 | { | ||
267 | struct mmu_gather *tlb = walk->private; | ||
268 | struct mm_struct *mm = tlb->mm; | ||
269 | struct vm_area_struct *vma = walk->vma; | ||
270 | spinlock_t *ptl; | ||
271 | pte_t *orig_pte, *pte, ptent; | ||
272 | struct page *page; | ||
273 | int nr_swap = 0; | ||
274 | unsigned long next; | ||
275 | |||
276 | next = pmd_addr_end(addr, end); | ||
277 | if (pmd_trans_huge(*pmd)) | ||
278 | if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) | ||
279 | goto next; | ||
280 | |||
281 | if (pmd_trans_unstable(pmd)) | ||
282 | return 0; | ||
283 | |||
284 | orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | ||
285 | arch_enter_lazy_mmu_mode(); | ||
286 | for (; addr != end; pte++, addr += PAGE_SIZE) { | ||
287 | ptent = *pte; | ||
288 | |||
289 | if (pte_none(ptent)) | ||
290 | continue; | ||
291 | /* | ||
292 | * If the pte holds a swap entry, just clear the page table | ||
293 | * entry: a later swap-in would be more expensive than | ||
294 | * (page allocation + zeroing). | ||
295 | */ | ||
296 | if (!pte_present(ptent)) { | ||
297 | swp_entry_t entry; | ||
298 | |||
299 | entry = pte_to_swp_entry(ptent); | ||
300 | if (non_swap_entry(entry)) | ||
301 | continue; | ||
302 | nr_swap--; | ||
303 | free_swap_and_cache(entry); | ||
304 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | ||
305 | continue; | ||
306 | } | ||
307 | |||
308 | page = vm_normal_page(vma, addr, ptent); | ||
309 | if (!page) | ||
310 | continue; | ||
311 | |||
312 | /* | ||
313 | * If pmd isn't transhuge but the page is THP and | ||
314 | * is owned by only this process, split it and | ||
315 | * deactivate all pages. | ||
316 | */ | ||
317 | if (PageTransCompound(page)) { | ||
318 | if (page_mapcount(page) != 1) | ||
319 | goto out; | ||
320 | get_page(page); | ||
321 | if (!trylock_page(page)) { | ||
322 | put_page(page); | ||
323 | goto out; | ||
324 | } | ||
325 | pte_unmap_unlock(orig_pte, ptl); | ||
326 | if (split_huge_page(page)) { | ||
327 | unlock_page(page); | ||
328 | put_page(page); | ||
329 | pte_offset_map_lock(mm, pmd, addr, &ptl); | ||
330 | goto out; | ||
331 | } | ||
332 | put_page(page); | ||
333 | unlock_page(page); | ||
334 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | ||
335 | pte--; | ||
336 | addr -= PAGE_SIZE; | ||
337 | continue; | ||
338 | } | ||
339 | |||
340 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | ||
341 | |||
342 | if (PageSwapCache(page) || PageDirty(page)) { | ||
343 | if (!trylock_page(page)) | ||
344 | continue; | ||
345 | /* | ||
346 | * If the page is shared with others, we cannot clear | ||
347 | * its PG_dirty bit. | ||
348 | */ | ||
349 | if (page_mapcount(page) != 1) { | ||
350 | unlock_page(page); | ||
351 | continue; | ||
352 | } | ||
353 | |||
354 | if (PageSwapCache(page) && !try_to_free_swap(page)) { | ||
355 | unlock_page(page); | ||
356 | continue; | ||
357 | } | ||
358 | |||
359 | ClearPageDirty(page); | ||
360 | unlock_page(page); | ||
361 | } | ||
362 | |||
363 | if (pte_young(ptent) || pte_dirty(ptent)) { | ||
364 | /* | ||
365 | * Some architectures (e.g. PPC) don't update the TLB | ||
366 | * in set_pte_at() and tlb_remove_tlb_entry(), so for | ||
367 | * portability remap the pte as old and clean after | ||
368 | * clearing it. | ||
369 | */ | ||
370 | ptent = ptep_get_and_clear_full(mm, addr, pte, | ||
371 | tlb->fullmm); | ||
372 | |||
373 | ptent = pte_mkold(ptent); | ||
374 | ptent = pte_mkclean(ptent); | ||
375 | set_pte_at(mm, addr, pte, ptent); | ||
376 | if (PageActive(page)) | ||
377 | deactivate_page(page); | ||
378 | tlb_remove_tlb_entry(tlb, pte, addr); | ||
379 | } | ||
380 | } | ||
381 | out: | ||
382 | if (nr_swap) { | ||
383 | if (current->mm == mm) | ||
384 | sync_mm_rss(mm); | ||
385 | |||
386 | add_mm_counter(mm, MM_SWAPENTS, nr_swap); | ||
387 | } | ||
388 | arch_leave_lazy_mmu_mode(); | ||
389 | pte_unmap_unlock(orig_pte, ptl); | ||
390 | cond_resched(); | ||
391 | next: | ||
392 | return 0; | ||
393 | } | ||
394 | |||
395 | static void madvise_free_page_range(struct mmu_gather *tlb, | ||
396 | struct vm_area_struct *vma, | ||
397 | unsigned long addr, unsigned long end) | ||
398 | { | ||
399 | struct mm_walk free_walk = { | ||
400 | .pmd_entry = madvise_free_pte_range, | ||
401 | .mm = vma->vm_mm, | ||
402 | .private = tlb, | ||
403 | }; | ||
404 | |||
405 | tlb_start_vma(tlb, vma); | ||
406 | walk_page_range(addr, end, &free_walk); | ||
407 | tlb_end_vma(tlb, vma); | ||
408 | } | ||
409 | |||
410 | static int madvise_free_single_vma(struct vm_area_struct *vma, | ||
411 | unsigned long start_addr, unsigned long end_addr) | ||
412 | { | ||
413 | unsigned long start, end; | ||
414 | struct mm_struct *mm = vma->vm_mm; | ||
415 | struct mmu_gather tlb; | ||
416 | |||
417 | if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) | ||
418 | return -EINVAL; | ||
419 | |||
420 | /* MADV_FREE works only for anonymous VMAs at the moment */ | ||
421 | if (!vma_is_anonymous(vma)) | ||
422 | return -EINVAL; | ||
423 | |||
424 | start = max(vma->vm_start, start_addr); | ||
425 | if (start >= vma->vm_end) | ||
426 | return -EINVAL; | ||
427 | end = min(vma->vm_end, end_addr); | ||
428 | if (end <= vma->vm_start) | ||
429 | return -EINVAL; | ||
430 | |||
431 | lru_add_drain(); | ||
432 | tlb_gather_mmu(&tlb, mm, start, end); | ||
433 | update_hiwater_rss(mm); | ||
434 | |||
435 | mmu_notifier_invalidate_range_start(mm, start, end); | ||
436 | madvise_free_page_range(&tlb, vma, start, end); | ||
437 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
438 | tlb_finish_mmu(&tlb, start, end); | ||
439 | |||
440 | return 0; | ||
441 | } | ||
442 | |||
443 | static long madvise_free(struct vm_area_struct *vma, | ||
444 | struct vm_area_struct **prev, | ||
445 | unsigned long start, unsigned long end) | ||
446 | { | ||
447 | *prev = vma; | ||
448 | return madvise_free_single_vma(vma, start, end); | ||
449 | } | ||
450 | |||
259 | /* | 451 | /* |
260 | * Application no longer needs these pages. If the pages are dirty, | 452 | * Application no longer needs these pages. If the pages are dirty, |
261 | * it's OK to just throw them away. The app will be more careful about | 453 | * it's OK to just throw them away. The app will be more careful about |
@@ -379,6 +571,14 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, | |||
379 | return madvise_remove(vma, prev, start, end); | 571 | return madvise_remove(vma, prev, start, end); |
380 | case MADV_WILLNEED: | 572 | case MADV_WILLNEED: |
381 | return madvise_willneed(vma, prev, start, end); | 573 | return madvise_willneed(vma, prev, start, end); |
574 | case MADV_FREE: | ||
575 | /* | ||
576 | * XXX: In this implementation, MADV_FREE works like | ||
577 | * MADV_DONTNEED on swapless system or full swap. | ||
578 | */ | ||
579 | if (get_nr_swap_pages() > 0) | ||
580 | return madvise_free(vma, prev, start, end); | ||
581 | /* passthrough */ | ||
382 | case MADV_DONTNEED: | 582 | case MADV_DONTNEED: |
383 | return madvise_dontneed(vma, prev, start, end); | 583 | return madvise_dontneed(vma, prev, start, end); |
384 | default: | 584 | default: |
@@ -398,6 +598,7 @@ madvise_behavior_valid(int behavior) | |||
398 | case MADV_REMOVE: | 598 | case MADV_REMOVE: |
399 | case MADV_WILLNEED: | 599 | case MADV_WILLNEED: |
400 | case MADV_DONTNEED: | 600 | case MADV_DONTNEED: |
601 | case MADV_FREE: | ||
401 | #ifdef CONFIG_KSM | 602 | #ifdef CONFIG_KSM |
402 | case MADV_MERGEABLE: | 603 | case MADV_MERGEABLE: |
403 | case MADV_UNMERGEABLE: | 604 | case MADV_UNMERGEABLE: |
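MADV_FREE is reached through the ordinary madvise(2) switch above and, as the XXX comment notes, degrades to MADV_DONTNEED when no free swap is available. A minimal caller, assuming kernel and libc headers new enough to define MADV_FREE:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 1024 * 1024;
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(buf, 1, len);	/* dirty the anonymous pages */

	/*
	 * Declare the contents disposable: the pages stay mapped but may
	 * be reclaimed lazily instead of being written to swap. Later
	 * reads may return zeroes if reclaim freed the pages in between.
	 */
	if (madvise(buf, len, MADV_FREE))
		perror("madvise(MADV_FREE)");

	munmap(buf, len);
	return 0;
}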
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 54eae4f19d80..0eda67376df4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -382,14 +382,11 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) | |||
382 | { | 382 | { |
383 | struct mem_cgroup *memcg; | 383 | struct mem_cgroup *memcg; |
384 | 384 | ||
385 | rcu_read_lock(); | ||
386 | |||
387 | memcg = page->mem_cgroup; | 385 | memcg = page->mem_cgroup; |
388 | 386 | ||
389 | if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) | 387 | if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) |
390 | memcg = root_mem_cgroup; | 388 | memcg = root_mem_cgroup; |
391 | 389 | ||
392 | rcu_read_unlock(); | ||
393 | return &memcg->css; | 390 | return &memcg->css; |
394 | } | 391 | } |
395 | 392 | ||
@@ -647,7 +644,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
647 | 644 | ||
648 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | 645 | static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, |
649 | struct page *page, | 646 | struct page *page, |
650 | int nr_pages) | 647 | bool compound, int nr_pages) |
651 | { | 648 | { |
652 | /* | 649 | /* |
653 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is | 650 | * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is |
@@ -660,9 +657,11 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, | |||
660 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], | 657 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], |
661 | nr_pages); | 658 | nr_pages); |
662 | 659 | ||
663 | if (PageTransHuge(page)) | 660 | if (compound) { |
661 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
664 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], | 662 | __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], |
665 | nr_pages); | 663 | nr_pages); |
664 | } | ||
666 | 665 | ||
667 | /* pagein of a big page is an event. So, ignore page size */ | 666 | /* pagein of a big page is an event. So, ignore page size */ |
668 | if (nr_pages > 0) | 667 | if (nr_pages > 0) |
@@ -2431,9 +2430,7 @@ void __memcg_kmem_uncharge(struct page *page, int order) | |||
2431 | 2430 | ||
2432 | /* | 2431 | /* |
2433 | * Because tail pages are not marked as "used", set it. We're under | 2432 | * Because tail pages are not marked as "used", set it. We're under |
2434 | * zone->lru_lock, 'splitting on pmd' and compound_lock. | 2433 | * zone->lru_lock and migration entries setup in all page mappings. |
2435 | * charge/uncharge will be never happen and move_account() is done under | ||
2436 | * compound_lock(), so we don't have to take care of races. | ||
2437 | */ | 2434 | */ |
2438 | void mem_cgroup_split_huge_fixup(struct page *head) | 2435 | void mem_cgroup_split_huge_fixup(struct page *head) |
2439 | { | 2436 | { |
@@ -3494,16 +3491,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, | |||
3494 | swap_buffers: | 3491 | swap_buffers: |
3495 | /* Swap primary and spare array */ | 3492 | /* Swap primary and spare array */ |
3496 | thresholds->spare = thresholds->primary; | 3493 | thresholds->spare = thresholds->primary; |
3497 | /* If all events are unregistered, free the spare array */ | ||
3498 | if (!new) { | ||
3499 | kfree(thresholds->spare); | ||
3500 | thresholds->spare = NULL; | ||
3501 | } | ||
3502 | 3494 | ||
3503 | rcu_assign_pointer(thresholds->primary, new); | 3495 | rcu_assign_pointer(thresholds->primary, new); |
3504 | 3496 | ||
3505 | /* To be sure that nobody uses thresholds */ | 3497 | /* To be sure that nobody uses thresholds */ |
3506 | synchronize_rcu(); | 3498 | synchronize_rcu(); |
3499 | |||
3500 | /* If all events are unregistered, free the spare array */ | ||
3501 | if (!new) { | ||
3502 | kfree(thresholds->spare); | ||
3503 | thresholds->spare = NULL; | ||
3504 | } | ||
3507 | unlock: | 3505 | unlock: |
3508 | mutex_unlock(&memcg->thresholds_lock); | 3506 | mutex_unlock(&memcg->thresholds_lock); |
3509 | } | 3507 | } |
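The hunk above defers freeing the spare thresholds array until after synchronize_rcu(), so no reader still traversing the old primary array can be left holding a dangling pointer. The userspace sketch below models only that ordering with liburcu (link with -lurcu); the struct layout and function names are invented for illustration.

#define _LGPL_SOURCE
#include <stdlib.h>
#include <urcu.h>	/* userspace RCU: rcu_assign_pointer, synchronize_rcu */

struct toy_thresholds {
	int count;
	int entries[8];
};

static struct toy_thresholds *primary;	/* readers: rcu_dereference(primary) */
static struct toy_thresholds *spare;	/* old array kept around for reuse */

/* Update side, mirroring the reordered unregister path. */
static void toy_swap_and_maybe_free(struct toy_thresholds *replacement)
{
	/* The old primary becomes the new spare. */
	spare = primary;

	/* Publish the replacement (NULL when the last event goes away). */
	rcu_assign_pointer(primary, replacement);

	/* Wait for every pre-existing reader to finish with the old array. */
	synchronize_rcu();

	/* Only now is it safe to drop the spare if nobody needs it. */
	if (!replacement) {
		free(spare);
		spare = NULL;
	}
}

int main(void)
{
	rcu_register_thread();
	primary = calloc(1, sizeof(*primary));
	toy_swap_and_maybe_free(NULL);	/* single-threaded exercise */
	rcu_unregister_thread();
	return 0;
}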
@@ -4505,38 +4503,30 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | |||
4505 | * @from: mem_cgroup which the page is moved from. | 4503 | * @from: mem_cgroup which the page is moved from. |
4506 | * @to: mem_cgroup which the page is moved to. @from != @to. | 4504 | * @to: mem_cgroup which the page is moved to. @from != @to. |
4507 | * | 4505 | * |
4508 | * The caller must confirm following. | 4506 | * The caller must make sure the page is not on LRU (isolate_page() is useful.) |
4509 | * - page is not on LRU (isolate_page() is useful.) | ||
4510 | * - compound_lock is held when nr_pages > 1 | ||
4511 | * | 4507 | * |
4512 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" | 4508 | * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" |
4513 | * from old cgroup. | 4509 | * from old cgroup. |
4514 | */ | 4510 | */ |
4515 | static int mem_cgroup_move_account(struct page *page, | 4511 | static int mem_cgroup_move_account(struct page *page, |
4516 | unsigned int nr_pages, | 4512 | bool compound, |
4517 | struct mem_cgroup *from, | 4513 | struct mem_cgroup *from, |
4518 | struct mem_cgroup *to) | 4514 | struct mem_cgroup *to) |
4519 | { | 4515 | { |
4520 | unsigned long flags; | 4516 | unsigned long flags; |
4517 | unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; | ||
4521 | int ret; | 4518 | int ret; |
4522 | bool anon; | 4519 | bool anon; |
4523 | 4520 | ||
4524 | VM_BUG_ON(from == to); | 4521 | VM_BUG_ON(from == to); |
4525 | VM_BUG_ON_PAGE(PageLRU(page), page); | 4522 | VM_BUG_ON_PAGE(PageLRU(page), page); |
4526 | /* | 4523 | VM_BUG_ON(compound && !PageTransHuge(page)); |
4527 | * The page is isolated from LRU. So, collapse function | ||
4528 | * will not handle this page. But page splitting can happen. | ||
4529 | * Do this check under compound_page_lock(). The caller should | ||
4530 | * hold it. | ||
4531 | */ | ||
4532 | ret = -EBUSY; | ||
4533 | if (nr_pages > 1 && !PageTransHuge(page)) | ||
4534 | goto out; | ||
4535 | 4524 | ||
4536 | /* | 4525 | /* |
4537 | * Prevent mem_cgroup_replace_page() from looking at | 4526 | * Prevent mem_cgroup_replace_page() from looking at |
4538 | * page->mem_cgroup of its source page while we change it. | 4527 | * page->mem_cgroup of its source page while we change it. |
4539 | */ | 4528 | */ |
4529 | ret = -EBUSY; | ||
4540 | if (!trylock_page(page)) | 4530 | if (!trylock_page(page)) |
4541 | goto out; | 4531 | goto out; |
4542 | 4532 | ||
@@ -4591,9 +4581,9 @@ static int mem_cgroup_move_account(struct page *page, | |||
4591 | ret = 0; | 4581 | ret = 0; |
4592 | 4582 | ||
4593 | local_irq_disable(); | 4583 | local_irq_disable(); |
4594 | mem_cgroup_charge_statistics(to, page, nr_pages); | 4584 | mem_cgroup_charge_statistics(to, page, compound, nr_pages); |
4595 | memcg_check_events(to, page); | 4585 | memcg_check_events(to, page); |
4596 | mem_cgroup_charge_statistics(from, page, -nr_pages); | 4586 | mem_cgroup_charge_statistics(from, page, compound, -nr_pages); |
4597 | memcg_check_events(from, page); | 4587 | memcg_check_events(from, page); |
4598 | local_irq_enable(); | 4588 | local_irq_enable(); |
4599 | out_unlock: | 4589 | out_unlock: |
@@ -4683,7 +4673,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | |||
4683 | pte_t *pte; | 4673 | pte_t *pte; |
4684 | spinlock_t *ptl; | 4674 | spinlock_t *ptl; |
4685 | 4675 | ||
4686 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 4676 | if (pmd_trans_huge_lock(pmd, vma, &ptl)) { |
4687 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) | 4677 | if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) |
4688 | mc.precharge += HPAGE_PMD_NR; | 4678 | mc.precharge += HPAGE_PMD_NR; |
4689 | spin_unlock(ptl); | 4679 | spin_unlock(ptl); |
@@ -4871,17 +4861,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
4871 | union mc_target target; | 4861 | union mc_target target; |
4872 | struct page *page; | 4862 | struct page *page; |
4873 | 4863 | ||
4874 | /* | 4864 | if (pmd_trans_huge_lock(pmd, vma, &ptl)) { |
4875 | * We don't take compound_lock() here but no race with splitting thp | ||
4876 | * happens because: | ||
4877 | * - if pmd_trans_huge_lock() returns 1, the relevant thp is not | ||
4878 | * under splitting, which means there's no concurrent thp split, | ||
4879 | * - if another thread runs into split_huge_page() just after we | ||
4880 | * entered this if-block, the thread must wait for page table lock | ||
4881 | * to be unlocked in __split_huge_page_splitting(), where the main | ||
4882 | * part of thp split is not executed yet. | ||
4883 | */ | ||
4884 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | ||
4885 | if (mc.precharge < HPAGE_PMD_NR) { | 4865 | if (mc.precharge < HPAGE_PMD_NR) { |
4886 | spin_unlock(ptl); | 4866 | spin_unlock(ptl); |
4887 | return 0; | 4867 | return 0; |
@@ -4890,7 +4870,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | |||
4890 | if (target_type == MC_TARGET_PAGE) { | 4870 | if (target_type == MC_TARGET_PAGE) { |
4891 | page = target.page; | 4871 | page = target.page; |
4892 | if (!isolate_lru_page(page)) { | 4872 | if (!isolate_lru_page(page)) { |
4893 | if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, | 4873 | if (!mem_cgroup_move_account(page, true, |
4894 | mc.from, mc.to)) { | 4874 | mc.from, mc.to)) { |
4895 | mc.precharge -= HPAGE_PMD_NR; | 4875 | mc.precharge -= HPAGE_PMD_NR; |
4896 | mc.moved_charge += HPAGE_PMD_NR; | 4876 | mc.moved_charge += HPAGE_PMD_NR; |
@@ -4917,9 +4897,18 @@ retry: | |||
4917 | switch (get_mctgt_type(vma, addr, ptent, &target)) { | 4897 | switch (get_mctgt_type(vma, addr, ptent, &target)) { |
4918 | case MC_TARGET_PAGE: | 4898 | case MC_TARGET_PAGE: |
4919 | page = target.page; | 4899 | page = target.page; |
4900 | /* | ||
4901 | * We can have part of a split pmd here. Moving it could | ||
4902 | * be done, but it would be too convoluted, so simply | ||
4903 | * ignore such a partial THP and keep it in the original | ||
4904 | * memcg. There should be somebody mapping the head. | ||
4905 | */ | ||
4906 | if (PageTransCompound(page)) | ||
4907 | goto put; | ||
4920 | if (isolate_lru_page(page)) | 4908 | if (isolate_lru_page(page)) |
4921 | goto put; | 4909 | goto put; |
4922 | if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) { | 4910 | if (!mem_cgroup_move_account(page, false, |
4911 | mc.from, mc.to)) { | ||
4923 | mc.precharge--; | 4912 | mc.precharge--; |
4924 | /* we uncharge from mc.from later. */ | 4913 | /* we uncharge from mc.from later. */ |
4925 | mc.moved_charge++; | 4914 | mc.moved_charge++; |
@@ -5258,10 +5247,11 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) | |||
5258 | * with mem_cgroup_cancel_charge() in case page instantiation fails. | 5247 | * with mem_cgroup_cancel_charge() in case page instantiation fails. |
5259 | */ | 5248 | */ |
5260 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | 5249 | int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, |
5261 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | 5250 | gfp_t gfp_mask, struct mem_cgroup **memcgp, |
5251 | bool compound) | ||
5262 | { | 5252 | { |
5263 | struct mem_cgroup *memcg = NULL; | 5253 | struct mem_cgroup *memcg = NULL; |
5264 | unsigned int nr_pages = 1; | 5254 | unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; |
5265 | int ret = 0; | 5255 | int ret = 0; |
5266 | 5256 | ||
5267 | if (mem_cgroup_disabled()) | 5257 | if (mem_cgroup_disabled()) |
@@ -5291,11 +5281,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, | |||
5291 | } | 5281 | } |
5292 | } | 5282 | } |
5293 | 5283 | ||
5294 | if (PageTransHuge(page)) { | ||
5295 | nr_pages <<= compound_order(page); | ||
5296 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
5297 | } | ||
5298 | |||
5299 | if (!memcg) | 5284 | if (!memcg) |
5300 | memcg = get_mem_cgroup_from_mm(mm); | 5285 | memcg = get_mem_cgroup_from_mm(mm); |
5301 | 5286 | ||
@@ -5324,9 +5309,9 @@ out: | |||
5324 | * Use mem_cgroup_cancel_charge() to cancel the transaction instead. | 5309 | * Use mem_cgroup_cancel_charge() to cancel the transaction instead. |
5325 | */ | 5310 | */ |
5326 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | 5311 | void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, |
5327 | bool lrucare) | 5312 | bool lrucare, bool compound) |
5328 | { | 5313 | { |
5329 | unsigned int nr_pages = 1; | 5314 | unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; |
5330 | 5315 | ||
5331 | VM_BUG_ON_PAGE(!page->mapping, page); | 5316 | VM_BUG_ON_PAGE(!page->mapping, page); |
5332 | VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); | 5317 | VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); |
@@ -5343,13 +5328,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
5343 | 5328 | ||
5344 | commit_charge(page, memcg, lrucare); | 5329 | commit_charge(page, memcg, lrucare); |
5345 | 5330 | ||
5346 | if (PageTransHuge(page)) { | ||
5347 | nr_pages <<= compound_order(page); | ||
5348 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
5349 | } | ||
5350 | |||
5351 | local_irq_disable(); | 5331 | local_irq_disable(); |
5352 | mem_cgroup_charge_statistics(memcg, page, nr_pages); | 5332 | mem_cgroup_charge_statistics(memcg, page, compound, nr_pages); |
5353 | memcg_check_events(memcg, page); | 5333 | memcg_check_events(memcg, page); |
5354 | local_irq_enable(); | 5334 | local_irq_enable(); |
5355 | 5335 | ||
@@ -5371,9 +5351,10 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, | |||
5371 | * | 5351 | * |
5372 | * Cancel a charge transaction started by mem_cgroup_try_charge(). | 5352 | * Cancel a charge transaction started by mem_cgroup_try_charge(). |
5373 | */ | 5353 | */ |
5374 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) | 5354 | void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg, |
5355 | bool compound) | ||
5375 | { | 5356 | { |
5376 | unsigned int nr_pages = 1; | 5357 | unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1; |
5377 | 5358 | ||
5378 | if (mem_cgroup_disabled()) | 5359 | if (mem_cgroup_disabled()) |
5379 | return; | 5360 | return; |
@@ -5385,11 +5366,6 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) | |||
5385 | if (!memcg) | 5366 | if (!memcg) |
5386 | return; | 5367 | return; |
5387 | 5368 | ||
5388 | if (PageTransHuge(page)) { | ||
5389 | nr_pages <<= compound_order(page); | ||
5390 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
5391 | } | ||
5392 | |||
5393 | cancel_charge(memcg, nr_pages); | 5369 | cancel_charge(memcg, nr_pages); |
5394 | } | 5370 | } |
5395 | 5371 | ||
@@ -5750,7 +5726,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) | |||
5750 | * only synchronisation we have for updating the per-CPU variables. | 5726 | * only synchronisation we have for updating the per-CPU variables. |
5751 | */ | 5727 | */ |
5752 | VM_BUG_ON(!irqs_disabled()); | 5728 | VM_BUG_ON(!irqs_disabled()); |
5753 | mem_cgroup_charge_statistics(memcg, page, -1); | 5729 | mem_cgroup_charge_statistics(memcg, page, false, -1); |
5754 | memcg_check_events(memcg, page); | 5730 | memcg_check_events(memcg, page); |
5755 | } | 5731 | } |
5756 | 5732 | ||
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8424b64711ac..ac595e7a3a95 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -882,15 +882,7 @@ int get_hwpoison_page(struct page *page) | |||
882 | { | 882 | { |
883 | struct page *head = compound_head(page); | 883 | struct page *head = compound_head(page); |
884 | 884 | ||
885 | if (PageHuge(head)) | 885 | if (!PageHuge(head) && PageTransHuge(head)) { |
886 | return get_page_unless_zero(head); | ||
887 | |||
888 | /* | ||
889 | * Thp tail page has special refcounting rule (refcount of tail pages | ||
890 | * is stored in ->_mapcount,) so we can't call get_page_unless_zero() | ||
891 | * directly for tail pages. | ||
892 | */ | ||
893 | if (PageTransHuge(head)) { | ||
894 | /* | 886 | /* |
895 | * Non anonymous thp exists only in allocation/free time. We | 887 | * Non anonymous thp exists only in allocation/free time. We |
896 | * can't handle such a case correctly, so let's give it up. | 888 | * can't handle such a case correctly, so let's give it up. |
@@ -902,41 +894,12 @@ int get_hwpoison_page(struct page *page) | |||
902 | page_to_pfn(page)); | 894 | page_to_pfn(page)); |
903 | return 0; | 895 | return 0; |
904 | } | 896 | } |
905 | |||
906 | if (get_page_unless_zero(head)) { | ||
907 | if (PageTail(page)) | ||
908 | get_page(page); | ||
909 | return 1; | ||
910 | } else { | ||
911 | return 0; | ||
912 | } | ||
913 | } | 897 | } |
914 | 898 | ||
915 | return get_page_unless_zero(page); | 899 | return get_page_unless_zero(head); |
916 | } | 900 | } |
917 | EXPORT_SYMBOL_GPL(get_hwpoison_page); | 901 | EXPORT_SYMBOL_GPL(get_hwpoison_page); |
918 | 902 | ||
919 | /** | ||
920 | * put_hwpoison_page() - Put refcount for memory error handling: | ||
921 | * @page: raw error page (hit by memory error) | ||
922 | */ | ||
923 | void put_hwpoison_page(struct page *page) | ||
924 | { | ||
925 | struct page *head = compound_head(page); | ||
926 | |||
927 | if (PageHuge(head)) { | ||
928 | put_page(head); | ||
929 | return; | ||
930 | } | ||
931 | |||
932 | if (PageTransHuge(head)) | ||
933 | if (page != head) | ||
934 | put_page(head); | ||
935 | |||
936 | put_page(page); | ||
937 | } | ||
938 | EXPORT_SYMBOL_GPL(put_hwpoison_page); | ||
939 | |||
940 | /* | 903 | /* |
941 | * Do all that is necessary to remove user space mappings. Unmap | 904 | * Do all that is necessary to remove user space mappings. Unmap |
942 | * the pages and send SIGBUS to the processes if the data was dirty. | 905 | * the pages and send SIGBUS to the processes if the data was dirty. |
@@ -1149,7 +1112,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1149 | } | 1112 | } |
1150 | 1113 | ||
1151 | if (!PageHuge(p) && PageTransHuge(hpage)) { | 1114 | if (!PageHuge(p) && PageTransHuge(hpage)) { |
1115 | lock_page(hpage); | ||
1152 | if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { | 1116 | if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { |
1117 | unlock_page(hpage); | ||
1153 | if (!PageAnon(hpage)) | 1118 | if (!PageAnon(hpage)) |
1154 | pr_err("MCE: %#lx: non anonymous thp\n", pfn); | 1119 | pr_err("MCE: %#lx: non anonymous thp\n", pfn); |
1155 | else | 1120 | else |
@@ -1159,6 +1124,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1159 | put_hwpoison_page(p); | 1124 | put_hwpoison_page(p); |
1160 | return -EBUSY; | 1125 | return -EBUSY; |
1161 | } | 1126 | } |
1127 | unlock_page(hpage); | ||
1128 | get_hwpoison_page(p); | ||
1129 | put_hwpoison_page(hpage); | ||
1162 | VM_BUG_ON_PAGE(!page_count(p), p); | 1130 | VM_BUG_ON_PAGE(!page_count(p), p); |
1163 | hpage = compound_head(p); | 1131 | hpage = compound_head(p); |
1164 | } | 1132 | } |
@@ -1166,7 +1134,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) | |||
1166 | /* | 1134 | /* |
1167 | * We ignore non-LRU pages for good reasons. | 1135 | * We ignore non-LRU pages for good reasons. |
1168 | * - PG_locked is only well defined for LRU pages and a few others | 1136 | * - PG_locked is only well defined for LRU pages and a few others |
1169 | * - to avoid races with __set_page_locked() | 1137 | * - to avoid races with __SetPageLocked() |
1170 | * - to avoid races with __SetPageSlab*() (and more non-atomic ops) | 1138 | * - to avoid races with __SetPageSlab*() (and more non-atomic ops) |
1171 | * The check (unnecessarily) ignores LRU pages being isolated and | 1139 | * The check (unnecessarily) ignores LRU pages being isolated and |
1172 | * walked by the page reclaim code, however that's not a big loss. | 1140 | * walked by the page reclaim code, however that's not a big loss. |
@@ -1572,7 +1540,7 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags) | |||
1572 | * Did it turn free? | 1540 | * Did it turn free? |
1573 | */ | 1541 | */ |
1574 | ret = __get_any_page(page, pfn, 0); | 1542 | ret = __get_any_page(page, pfn, 0); |
1575 | if (!PageLRU(page)) { | 1543 | if (ret == 1 && !PageLRU(page)) { |
1576 | /* Drop page reference which is from __get_any_page() */ | 1544 | /* Drop page reference which is from __get_any_page() */ |
1577 | put_hwpoison_page(page); | 1545 | put_hwpoison_page(page); |
1578 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", | 1546 | pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", |
@@ -1716,6 +1684,49 @@ static int __soft_offline_page(struct page *page, int flags) | |||
1716 | return ret; | 1684 | return ret; |
1717 | } | 1685 | } |
1718 | 1686 | ||
1687 | static int soft_offline_in_use_page(struct page *page, int flags) | ||
1688 | { | ||
1689 | int ret; | ||
1690 | struct page *hpage = compound_head(page); | ||
1691 | |||
1692 | if (!PageHuge(page) && PageTransHuge(hpage)) { | ||
1693 | lock_page(hpage); | ||
1694 | if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { | ||
1695 | unlock_page(hpage); | ||
1696 | if (!PageAnon(hpage)) | ||
1697 | pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page)); | ||
1698 | else | ||
1699 | pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page)); | ||
1700 | put_hwpoison_page(hpage); | ||
1701 | return -EBUSY; | ||
1702 | } | ||
1703 | unlock_page(hpage); | ||
1704 | get_hwpoison_page(page); | ||
1705 | put_hwpoison_page(hpage); | ||
1706 | } | ||
1707 | |||
1708 | if (PageHuge(page)) | ||
1709 | ret = soft_offline_huge_page(page, flags); | ||
1710 | else | ||
1711 | ret = __soft_offline_page(page, flags); | ||
1712 | |||
1713 | return ret; | ||
1714 | } | ||
1715 | |||
1716 | static void soft_offline_free_page(struct page *page) | ||
1717 | { | ||
1718 | if (PageHuge(page)) { | ||
1719 | struct page *hpage = compound_head(page); | ||
1720 | |||
1721 | set_page_hwpoison_huge_page(hpage); | ||
1722 | if (!dequeue_hwpoisoned_huge_page(hpage)) | ||
1723 | num_poisoned_pages_add(1 << compound_order(hpage)); | ||
1724 | } else { | ||
1725 | if (!TestSetPageHWPoison(page)) | ||
1726 | num_poisoned_pages_inc(); | ||
1727 | } | ||
1728 | } | ||
1729 | |||
1719 | /** | 1730 | /** |
1720 | * soft_offline_page - Soft offline a page. | 1731 | * soft_offline_page - Soft offline a page. |
1721 | * @page: page to offline | 1732 | * @page: page to offline |
@@ -1742,7 +1753,6 @@ int soft_offline_page(struct page *page, int flags) | |||
1742 | { | 1753 | { |
1743 | int ret; | 1754 | int ret; |
1744 | unsigned long pfn = page_to_pfn(page); | 1755 | unsigned long pfn = page_to_pfn(page); |
1745 | struct page *hpage = compound_head(page); | ||
1746 | 1756 | ||
1747 | if (PageHWPoison(page)) { | 1757 | if (PageHWPoison(page)) { |
1748 | pr_info("soft offline: %#lx page already poisoned\n", pfn); | 1758 | pr_info("soft offline: %#lx page already poisoned\n", pfn); |
@@ -1750,34 +1760,15 @@ int soft_offline_page(struct page *page, int flags) | |||
1750 | put_hwpoison_page(page); | 1760 | put_hwpoison_page(page); |
1751 | return -EBUSY; | 1761 | return -EBUSY; |
1752 | } | 1762 | } |
1753 | if (!PageHuge(page) && PageTransHuge(hpage)) { | ||
1754 | if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { | ||
1755 | pr_info("soft offline: %#lx: failed to split THP\n", | ||
1756 | pfn); | ||
1757 | if (flags & MF_COUNT_INCREASED) | ||
1758 | put_hwpoison_page(page); | ||
1759 | return -EBUSY; | ||
1760 | } | ||
1761 | } | ||
1762 | 1763 | ||
1763 | get_online_mems(); | 1764 | get_online_mems(); |
1764 | |||
1765 | ret = get_any_page(page, pfn, flags); | 1765 | ret = get_any_page(page, pfn, flags); |
1766 | put_online_mems(); | 1766 | put_online_mems(); |
1767 | if (ret > 0) { /* for in-use pages */ | 1767 | |
1768 | if (PageHuge(page)) | 1768 | if (ret > 0) |
1769 | ret = soft_offline_huge_page(page, flags); | 1769 | ret = soft_offline_in_use_page(page, flags); |
1770 | else | 1770 | else if (ret == 0) |
1771 | ret = __soft_offline_page(page, flags); | 1771 | soft_offline_free_page(page); |
1772 | } else if (ret == 0) { /* for free pages */ | 1772 | |
1773 | if (PageHuge(page)) { | ||
1774 | set_page_hwpoison_huge_page(hpage); | ||
1775 | if (!dequeue_hwpoisoned_huge_page(hpage)) | ||
1776 | num_poisoned_pages_add(1 << compound_order(hpage)); | ||
1777 | } else { | ||
1778 | if (!TestSetPageHWPoison(page)) | ||
1779 | num_poisoned_pages_inc(); | ||
1780 | } | ||
1781 | } | ||
1782 | return ret; | 1773 | return ret; |
1783 | } | 1774 | } |
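soft_offline_page() is now split into soft_offline_in_use_page() and soft_offline_free_page(), with the THP split done under the page lock. From userspace the usual way to exercise this path is madvise(MADV_SOFT_OFFLINE), sketched below; it needs CONFIG_MEMORY_FAILURE and sufficient privileges, and the fallback #define is only for older libc headers that don't expose the constant.

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* from asm-generic/mman-common.h */
#endif

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	buf[0] = 1;	/* make sure the page is actually populated */

	/* Ask the kernel to migrate the data and retire the backing frame. */
	if (madvise(buf, page, MADV_SOFT_OFFLINE))
		perror("madvise(MADV_SOFT_OFFLINE)");

	munmap(buf, page);
	return 0;
}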
diff --git a/mm/memory.c b/mm/memory.c index d4e4d37c1989..ff17850a52d9 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/export.h> | 50 | #include <linux/export.h> |
51 | #include <linux/delayacct.h> | 51 | #include <linux/delayacct.h> |
52 | #include <linux/init.h> | 52 | #include <linux/init.h> |
53 | #include <linux/pfn_t.h> | ||
53 | #include <linux/writeback.h> | 54 | #include <linux/writeback.h> |
54 | #include <linux/memcontrol.h> | 55 | #include <linux/memcontrol.h> |
55 | #include <linux/mmu_notifier.h> | 56 | #include <linux/mmu_notifier.h> |
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, | |||
566 | { | 567 | { |
567 | spinlock_t *ptl; | 568 | spinlock_t *ptl; |
568 | pgtable_t new = pte_alloc_one(mm, address); | 569 | pgtable_t new = pte_alloc_one(mm, address); |
569 | int wait_split_huge_page; | ||
570 | if (!new) | 570 | if (!new) |
571 | return -ENOMEM; | 571 | return -ENOMEM; |
572 | 572 | ||
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, | |||
586 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ | 586 | smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ |
587 | 587 | ||
588 | ptl = pmd_lock(mm, pmd); | 588 | ptl = pmd_lock(mm, pmd); |
589 | wait_split_huge_page = 0; | ||
590 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | 589 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
591 | atomic_long_inc(&mm->nr_ptes); | 590 | atomic_long_inc(&mm->nr_ptes); |
592 | pmd_populate(mm, pmd, new); | 591 | pmd_populate(mm, pmd, new); |
593 | new = NULL; | 592 | new = NULL; |
594 | } else if (unlikely(pmd_trans_splitting(*pmd))) | 593 | } |
595 | wait_split_huge_page = 1; | ||
596 | spin_unlock(ptl); | 594 | spin_unlock(ptl); |
597 | if (new) | 595 | if (new) |
598 | pte_free(mm, new); | 596 | pte_free(mm, new); |
599 | if (wait_split_huge_page) | ||
600 | wait_split_huge_page(vma->anon_vma, pmd); | ||
601 | return 0; | 597 | return 0; |
602 | } | 598 | } |
603 | 599 | ||
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
613 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ | 609 | if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ |
614 | pmd_populate_kernel(&init_mm, pmd, new); | 610 | pmd_populate_kernel(&init_mm, pmd, new); |
615 | new = NULL; | 611 | new = NULL; |
616 | } else | 612 | } |
617 | VM_BUG_ON(pmd_trans_splitting(*pmd)); | ||
618 | spin_unlock(&init_mm.page_table_lock); | 613 | spin_unlock(&init_mm.page_table_lock); |
619 | if (new) | 614 | if (new) |
620 | pte_free_kernel(&init_mm, new); | 615 | pte_free_kernel(&init_mm, new); |
@@ -870,7 +865,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
870 | page = vm_normal_page(vma, addr, pte); | 865 | page = vm_normal_page(vma, addr, pte); |
871 | if (page) { | 866 | if (page) { |
872 | get_page(page); | 867 | get_page(page); |
873 | page_dup_rmap(page); | 868 | page_dup_rmap(page, false); |
874 | rss[mm_counter(page)]++; | 869 | rss[mm_counter(page)]++; |
875 | } | 870 | } |
876 | 871 | ||
@@ -955,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src | |||
955 | src_pmd = pmd_offset(src_pud, addr); | 950 | src_pmd = pmd_offset(src_pud, addr); |
956 | do { | 951 | do { |
957 | next = pmd_addr_end(addr, end); | 952 | next = pmd_addr_end(addr, end); |
958 | if (pmd_trans_huge(*src_pmd)) { | 953 | if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { |
959 | int err; | 954 | int err; |
960 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); | 955 | VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); |
961 | err = copy_huge_pmd(dst_mm, src_mm, | 956 | err = copy_huge_pmd(dst_mm, src_mm, |
@@ -1118,7 +1113,7 @@ again: | |||
1118 | mark_page_accessed(page); | 1113 | mark_page_accessed(page); |
1119 | } | 1114 | } |
1120 | rss[mm_counter(page)]--; | 1115 | rss[mm_counter(page)]--; |
1121 | page_remove_rmap(page); | 1116 | page_remove_rmap(page, false); |
1122 | if (unlikely(page_mapcount(page) < 0)) | 1117 | if (unlikely(page_mapcount(page) < 0)) |
1123 | print_bad_pte(vma, addr, ptent, page); | 1118 | print_bad_pte(vma, addr, ptent, page); |
1124 | if (unlikely(!__tlb_remove_page(tlb, page))) { | 1119 | if (unlikely(!__tlb_remove_page(tlb, page))) { |
@@ -1182,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1182 | pmd = pmd_offset(pud, addr); | 1177 | pmd = pmd_offset(pud, addr); |
1183 | do { | 1178 | do { |
1184 | next = pmd_addr_end(addr, end); | 1179 | next = pmd_addr_end(addr, end); |
1185 | if (pmd_trans_huge(*pmd)) { | 1180 | if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { |
1186 | if (next - addr != HPAGE_PMD_SIZE) { | 1181 | if (next - addr != HPAGE_PMD_SIZE) { |
1187 | #ifdef CONFIG_DEBUG_VM | 1182 | #ifdef CONFIG_DEBUG_VM |
1188 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { | 1183 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { |
@@ -1193,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1193 | BUG(); | 1188 | BUG(); |
1194 | } | 1189 | } |
1195 | #endif | 1190 | #endif |
1196 | split_huge_page_pmd(vma, addr, pmd); | 1191 | split_huge_pmd(vma, pmd, addr); |
1197 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1192 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1198 | goto next; | 1193 | goto next; |
1199 | /* fall through */ | 1194 | /* fall through */ |
@@ -1506,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1506 | EXPORT_SYMBOL(vm_insert_page); | 1501 | EXPORT_SYMBOL(vm_insert_page); |
1507 | 1502 | ||
1508 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | 1503 | static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, |
1509 | unsigned long pfn, pgprot_t prot) | 1504 | pfn_t pfn, pgprot_t prot) |
1510 | { | 1505 | { |
1511 | struct mm_struct *mm = vma->vm_mm; | 1506 | struct mm_struct *mm = vma->vm_mm; |
1512 | int retval; | 1507 | int retval; |
@@ -1522,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1522 | goto out_unlock; | 1517 | goto out_unlock; |
1523 | 1518 | ||
1524 | /* Ok, finally just insert the thing.. */ | 1519 | /* Ok, finally just insert the thing.. */ |
1525 | entry = pte_mkspecial(pfn_pte(pfn, prot)); | 1520 | if (pfn_t_devmap(pfn)) |
1521 | entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); | ||
1522 | else | ||
1523 | entry = pte_mkspecial(pfn_t_pte(pfn, prot)); | ||
1526 | set_pte_at(mm, addr, pte, entry); | 1524 | set_pte_at(mm, addr, pte, entry); |
1527 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ | 1525 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
1528 | 1526 | ||
@@ -1569,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1569 | 1567 | ||
1570 | if (addr < vma->vm_start || addr >= vma->vm_end) | 1568 | if (addr < vma->vm_start || addr >= vma->vm_end) |
1571 | return -EFAULT; | 1569 | return -EFAULT; |
1572 | if (track_pfn_insert(vma, &pgprot, pfn)) | 1570 | if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) |
1573 | return -EINVAL; | 1571 | return -EINVAL; |
1574 | 1572 | ||
1575 | ret = insert_pfn(vma, addr, pfn, pgprot); | 1573 | ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); |
1576 | 1574 | ||
1577 | return ret; | 1575 | return ret; |
1578 | } | 1576 | } |
1579 | EXPORT_SYMBOL(vm_insert_pfn); | 1577 | EXPORT_SYMBOL(vm_insert_pfn); |
1580 | 1578 | ||
1581 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | 1579 | int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, |
1582 | unsigned long pfn) | 1580 | pfn_t pfn) |
1583 | { | 1581 | { |
1584 | BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); | 1582 | BUG_ON(!(vma->vm_flags & VM_MIXEDMAP)); |
1585 | 1583 | ||
@@ -1593,10 +1591,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, | |||
1593 | * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP | 1591 | * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP |
1594 | * without pte special, it would there be refcounted as a normal page. | 1592 | * without pte special, it would there be refcounted as a normal page. |
1595 | */ | 1593 | */ |
1596 | if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { | 1594 | if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) { |
1597 | struct page *page; | 1595 | struct page *page; |
1598 | 1596 | ||
1599 | page = pfn_to_page(pfn); | 1597 | page = pfn_t_to_page(pfn); |
1600 | return insert_page(vma, addr, page, vma->vm_page_prot); | 1598 | return insert_page(vma, addr, page, vma->vm_page_prot); |
1601 | } | 1599 | } |
1602 | return insert_pfn(vma, addr, pfn, vma->vm_page_prot); | 1600 | return insert_pfn(vma, addr, pfn, vma->vm_page_prot); |
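insert_pfn() and vm_insert_mixed() now take a pfn_t, which carries flag bits such as PFN_DEV alongside the frame number so that insert_pfn() can choose pte_mkdevmap() over pte_mkspecial(). The toy encoding below only illustrates the idea of packing flags into unused high bits of a 64-bit value; the TOY_* bit positions and helper names are made up and do not match include/linux/pfn_t.h.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Invented flag bits; the kernel reserves different positions. */
#define TOY_PFN_DEV	(1ULL << 62)	/* backed by device memory */
#define TOY_PFN_MAP	(1ULL << 63)	/* has a struct-page-like map */
#define TOY_PFN_FLAGS	(TOY_PFN_DEV | TOY_PFN_MAP)

typedef struct { uint64_t val; } toy_pfn_t;	/* wrapper, like pfn_t */

static toy_pfn_t toy_pfn_to_pfn_t(uint64_t pfn, uint64_t flags)
{
	toy_pfn_t p = { .val = pfn | flags };
	return p;
}

static uint64_t toy_pfn_t_to_pfn(toy_pfn_t p)
{
	return p.val & ~TOY_PFN_FLAGS;
}

static bool toy_pfn_t_devmap(toy_pfn_t p)
{
	return (p.val & TOY_PFN_FLAGS) == TOY_PFN_FLAGS;
}

int main(void)
{
	toy_pfn_t p = toy_pfn_to_pfn_t(0x12345, TOY_PFN_DEV | TOY_PFN_MAP);

	printf("pfn=%#llx devmap=%d\n",
	       (unsigned long long)toy_pfn_t_to_pfn(p), toy_pfn_t_devmap(p));
	return 0;
}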
@@ -2087,7 +2085,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2087 | cow_user_page(new_page, old_page, address, vma); | 2085 | cow_user_page(new_page, old_page, address, vma); |
2088 | } | 2086 | } |
2089 | 2087 | ||
2090 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) | 2088 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) |
2091 | goto oom_free_new; | 2089 | goto oom_free_new; |
2092 | 2090 | ||
2093 | __SetPageUptodate(new_page); | 2091 | __SetPageUptodate(new_page); |
@@ -2118,8 +2116,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2118 | * thread doing COW. | 2116 | * thread doing COW. |
2119 | */ | 2117 | */ |
2120 | ptep_clear_flush_notify(vma, address, page_table); | 2118 | ptep_clear_flush_notify(vma, address, page_table); |
2121 | page_add_new_anon_rmap(new_page, vma, address); | 2119 | page_add_new_anon_rmap(new_page, vma, address, false); |
2122 | mem_cgroup_commit_charge(new_page, memcg, false); | 2120 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
2123 | lru_cache_add_active_or_unevictable(new_page, vma); | 2121 | lru_cache_add_active_or_unevictable(new_page, vma); |
2124 | /* | 2122 | /* |
2125 | * We call the notify macro here because, when using secondary | 2123 | * We call the notify macro here because, when using secondary |
@@ -2151,14 +2149,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2151 | * mapcount is visible. So transitively, TLBs to | 2149 | * mapcount is visible. So transitively, TLBs to |
2152 | * old page will be flushed before it can be reused. | 2150 | * old page will be flushed before it can be reused. |
2153 | */ | 2151 | */ |
2154 | page_remove_rmap(old_page); | 2152 | page_remove_rmap(old_page, false); |
2155 | } | 2153 | } |
2156 | 2154 | ||
2157 | /* Free the old page.. */ | 2155 | /* Free the old page.. */ |
2158 | new_page = old_page; | 2156 | new_page = old_page; |
2159 | page_copied = 1; | 2157 | page_copied = 1; |
2160 | } else { | 2158 | } else { |
2161 | mem_cgroup_cancel_charge(new_page, memcg); | 2159 | mem_cgroup_cancel_charge(new_page, memcg, false); |
2162 | } | 2160 | } |
2163 | 2161 | ||
2164 | if (new_page) | 2162 | if (new_page) |
@@ -2173,7 +2171,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2173 | */ | 2171 | */ |
2174 | if (page_copied && (vma->vm_flags & VM_LOCKED)) { | 2172 | if (page_copied && (vma->vm_flags & VM_LOCKED)) { |
2175 | lock_page(old_page); /* LRU manipulation */ | 2173 | lock_page(old_page); /* LRU manipulation */ |
2176 | munlock_vma_page(old_page); | 2174 | if (PageMlocked(old_page)) |
2175 | munlock_vma_page(old_page); | ||
2177 | unlock_page(old_page); | 2176 | unlock_page(old_page); |
2178 | } | 2177 | } |
2179 | page_cache_release(old_page); | 2178 | page_cache_release(old_page); |
@@ -2533,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2533 | goto out_page; | 2532 | goto out_page; |
2534 | } | 2533 | } |
2535 | 2534 | ||
2536 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { | 2535 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) { |
2537 | ret = VM_FAULT_OOM; | 2536 | ret = VM_FAULT_OOM; |
2538 | goto out_page; | 2537 | goto out_page; |
2539 | } | 2538 | } |
@@ -2567,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2567 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2566 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
2568 | flags &= ~FAULT_FLAG_WRITE; | 2567 | flags &= ~FAULT_FLAG_WRITE; |
2569 | ret |= VM_FAULT_WRITE; | 2568 | ret |= VM_FAULT_WRITE; |
2570 | exclusive = 1; | 2569 | exclusive = RMAP_EXCLUSIVE; |
2571 | } | 2570 | } |
2572 | flush_icache_page(vma, page); | 2571 | flush_icache_page(vma, page); |
2573 | if (pte_swp_soft_dirty(orig_pte)) | 2572 | if (pte_swp_soft_dirty(orig_pte)) |
@@ -2575,10 +2574,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2575 | set_pte_at(mm, address, page_table, pte); | 2574 | set_pte_at(mm, address, page_table, pte); |
2576 | if (page == swapcache) { | 2575 | if (page == swapcache) { |
2577 | do_page_add_anon_rmap(page, vma, address, exclusive); | 2576 | do_page_add_anon_rmap(page, vma, address, exclusive); |
2578 | mem_cgroup_commit_charge(page, memcg, true); | 2577 | mem_cgroup_commit_charge(page, memcg, true, false); |
2579 | } else { /* ksm created a completely new copy */ | 2578 | } else { /* ksm created a completely new copy */ |
2580 | page_add_new_anon_rmap(page, vma, address); | 2579 | page_add_new_anon_rmap(page, vma, address, false); |
2581 | mem_cgroup_commit_charge(page, memcg, false); | 2580 | mem_cgroup_commit_charge(page, memcg, false, false); |
2582 | lru_cache_add_active_or_unevictable(page, vma); | 2581 | lru_cache_add_active_or_unevictable(page, vma); |
2583 | } | 2582 | } |
2584 | 2583 | ||
@@ -2613,7 +2612,7 @@ unlock: | |||
2613 | out: | 2612 | out: |
2614 | return ret; | 2613 | return ret; |
2615 | out_nomap: | 2614 | out_nomap: |
2616 | mem_cgroup_cancel_charge(page, memcg); | 2615 | mem_cgroup_cancel_charge(page, memcg, false); |
2617 | pte_unmap_unlock(page_table, ptl); | 2616 | pte_unmap_unlock(page_table, ptl); |
2618 | out_page: | 2617 | out_page: |
2619 | unlock_page(page); | 2618 | unlock_page(page); |
@@ -2707,7 +2706,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2707 | if (!page) | 2706 | if (!page) |
2708 | goto oom; | 2707 | goto oom; |
2709 | 2708 | ||
2710 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) | 2709 | if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) |
2711 | goto oom_free_page; | 2710 | goto oom_free_page; |
2712 | 2711 | ||
2713 | /* | 2712 | /* |
@@ -2728,15 +2727,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2728 | /* Deliver the page fault to userland, check inside PT lock */ | 2727 | /* Deliver the page fault to userland, check inside PT lock */ |
2729 | if (userfaultfd_missing(vma)) { | 2728 | if (userfaultfd_missing(vma)) { |
2730 | pte_unmap_unlock(page_table, ptl); | 2729 | pte_unmap_unlock(page_table, ptl); |
2731 | mem_cgroup_cancel_charge(page, memcg); | 2730 | mem_cgroup_cancel_charge(page, memcg, false); |
2732 | page_cache_release(page); | 2731 | page_cache_release(page); |
2733 | return handle_userfault(vma, address, flags, | 2732 | return handle_userfault(vma, address, flags, |
2734 | VM_UFFD_MISSING); | 2733 | VM_UFFD_MISSING); |
2735 | } | 2734 | } |
2736 | 2735 | ||
2737 | inc_mm_counter_fast(mm, MM_ANONPAGES); | 2736 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2738 | page_add_new_anon_rmap(page, vma, address); | 2737 | page_add_new_anon_rmap(page, vma, address, false); |
2739 | mem_cgroup_commit_charge(page, memcg, false); | 2738 | mem_cgroup_commit_charge(page, memcg, false, false); |
2740 | lru_cache_add_active_or_unevictable(page, vma); | 2739 | lru_cache_add_active_or_unevictable(page, vma); |
2741 | setpte: | 2740 | setpte: |
2742 | set_pte_at(mm, address, page_table, entry); | 2741 | set_pte_at(mm, address, page_table, entry); |
@@ -2747,7 +2746,7 @@ unlock: | |||
2747 | pte_unmap_unlock(page_table, ptl); | 2746 | pte_unmap_unlock(page_table, ptl); |
2748 | return 0; | 2747 | return 0; |
2749 | release: | 2748 | release: |
2750 | mem_cgroup_cancel_charge(page, memcg); | 2749 | mem_cgroup_cancel_charge(page, memcg, false); |
2751 | page_cache_release(page); | 2750 | page_cache_release(page); |
2752 | goto unlock; | 2751 | goto unlock; |
2753 | oom_free_page: | 2752 | oom_free_page: |
@@ -2824,7 +2823,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, | |||
2824 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2823 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2825 | if (anon) { | 2824 | if (anon) { |
2826 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); | 2825 | inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); |
2827 | page_add_new_anon_rmap(page, vma, address); | 2826 | page_add_new_anon_rmap(page, vma, address, false); |
2828 | } else { | 2827 | } else { |
2829 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); | 2828 | inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); |
2830 | page_add_file_rmap(page); | 2829 | page_add_file_rmap(page); |
@@ -3000,7 +2999,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3000 | if (!new_page) | 2999 | if (!new_page) |
3001 | return VM_FAULT_OOM; | 3000 | return VM_FAULT_OOM; |
3002 | 3001 | ||
3003 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { | 3002 | if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) { |
3004 | page_cache_release(new_page); | 3003 | page_cache_release(new_page); |
3005 | return VM_FAULT_OOM; | 3004 | return VM_FAULT_OOM; |
3006 | } | 3005 | } |
@@ -3029,7 +3028,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3029 | goto uncharge_out; | 3028 | goto uncharge_out; |
3030 | } | 3029 | } |
3031 | do_set_pte(vma, address, new_page, pte, true, true); | 3030 | do_set_pte(vma, address, new_page, pte, true, true); |
3032 | mem_cgroup_commit_charge(new_page, memcg, false); | 3031 | mem_cgroup_commit_charge(new_page, memcg, false, false); |
3033 | lru_cache_add_active_or_unevictable(new_page, vma); | 3032 | lru_cache_add_active_or_unevictable(new_page, vma); |
3034 | pte_unmap_unlock(pte, ptl); | 3033 | pte_unmap_unlock(pte, ptl); |
3035 | if (fault_page) { | 3034 | if (fault_page) { |
@@ -3044,7 +3043,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3044 | } | 3043 | } |
3045 | return ret; | 3044 | return ret; |
3046 | uncharge_out: | 3045 | uncharge_out: |
3047 | mem_cgroup_cancel_charge(new_page, memcg); | 3046 | mem_cgroup_cancel_charge(new_page, memcg, false); |
3048 | page_cache_release(new_page); | 3047 | page_cache_release(new_page); |
3049 | return ret; | 3048 | return ret; |
3050 | } | 3049 | } |
@@ -3096,7 +3095,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3096 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s | 3095 | * pinned by vma->vm_file's reference. We rely on unlock_page()'s |
3097 | * release semantics to prevent the compiler from undoing this copying. | 3096 | * release semantics to prevent the compiler from undoing this copying. |
3098 | */ | 3097 | */ |
3099 | mapping = fault_page->mapping; | 3098 | mapping = page_rmapping(fault_page); |
3100 | unlock_page(fault_page); | 3099 | unlock_page(fault_page); |
3101 | if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { | 3100 | if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { |
3102 | /* | 3101 | /* |
@@ -3198,6 +3197,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3198 | return 0; | 3197 | return 0; |
3199 | } | 3198 | } |
3200 | 3199 | ||
3200 | /* TODO: handle PTE-mapped THP */ | ||
3201 | if (PageCompound(page)) { | ||
3202 | pte_unmap_unlock(ptep, ptl); | ||
3203 | return 0; | ||
3204 | } | ||
3205 | |||
3201 | /* | 3206 | /* |
3202 | * Avoid grouping on RO pages in general. RO pages shouldn't hurt as | 3207 | * Avoid grouping on RO pages in general. RO pages shouldn't hurt as |
3203 | * much anyway since they can be in shared cache state. This misses | 3208 | * much anyway since they can be in shared cache state. This misses |
@@ -3370,17 +3375,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3370 | int ret; | 3375 | int ret; |
3371 | 3376 | ||
3372 | barrier(); | 3377 | barrier(); |
3373 | if (pmd_trans_huge(orig_pmd)) { | 3378 | if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { |
3374 | unsigned int dirty = flags & FAULT_FLAG_WRITE; | 3379 | unsigned int dirty = flags & FAULT_FLAG_WRITE; |
3375 | 3380 | ||
3376 | /* | ||
3377 | * If the pmd is splitting, return and retry the | ||
3378 | * the fault. Alternative: wait until the split | ||
3379 | * is done, and goto retry. | ||
3380 | */ | ||
3381 | if (pmd_trans_splitting(orig_pmd)) | ||
3382 | return 0; | ||
3383 | |||
3384 | if (pmd_protnone(orig_pmd)) | 3381 | if (pmd_protnone(orig_pmd)) |
3385 | return do_huge_pmd_numa_page(mm, vma, address, | 3382 | return do_huge_pmd_numa_page(mm, vma, address, |
3386 | orig_pmd, pmd); | 3383 | orig_pmd, pmd); |
@@ -3407,7 +3404,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3407 | unlikely(__pte_alloc(mm, vma, pmd, address))) | 3404 | unlikely(__pte_alloc(mm, vma, pmd, address))) |
3408 | return VM_FAULT_OOM; | 3405 | return VM_FAULT_OOM; |
3409 | /* if an huge pmd materialized from under us just retry later */ | 3406 | /* if an huge pmd materialized from under us just retry later */ |
3410 | if (unlikely(pmd_trans_huge(*pmd))) | 3407 | if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) |
3411 | return 0; | 3408 | return 0; |
3412 | /* | 3409 | /* |
3413 | * A regular pmd is established and it can't morph into a huge pmd | 3410 | * A regular pmd is established and it can't morph into a huge pmd |
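
The mm/memory.c hunks above all make the same calling-convention change: mem_cgroup_try_charge(), mem_cgroup_commit_charge(), mem_cgroup_cancel_charge(), page_add_new_anon_rmap() and page_remove_rmap() grow a trailing compound/bool argument, and these PTE-level fault paths pass false because they install a single small page. A minimal sketch of that charge-and-map sequence, with the pte manipulation and locking left out; the helper name is invented for illustration:

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/memcontrol.h>
#include <linux/rmap.h>
#include <linux/swap.h>

/*
 * Illustrative sketch only: charge and rmap a freshly allocated small
 * (non-compound) anonymous page the way the fault paths above do it.
 * The pte lock, pte installation and error unwinding are omitted.
 */
static int charge_and_map_small_anon_page(struct mm_struct *mm,
					  struct vm_area_struct *vma,
					  unsigned long address,
					  struct page *page)
{
	struct mem_cgroup *memcg;

	/* last argument: false = account as a small page, not a THP */
	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
		return -ENOMEM;

	/* false = small page; rmap updates the per-page _mapcount */
	page_add_new_anon_rmap(page, vma, address, false);
	mem_cgroup_commit_charge(page, memcg, false, false);
	lru_cache_add_active_or_unevictable(page, vma);
	return 0;
}
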
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 92f95952692b..4af58a3a8ffa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/sysctl.h> | 17 | #include <linux/sysctl.h> |
18 | #include <linux/cpu.h> | 18 | #include <linux/cpu.h> |
19 | #include <linux/memory.h> | 19 | #include <linux/memory.h> |
20 | #include <linux/memremap.h> | ||
20 | #include <linux/memory_hotplug.h> | 21 | #include <linux/memory_hotplug.h> |
21 | #include <linux/highmem.h> | 22 | #include <linux/highmem.h> |
22 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
@@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, | |||
506 | unsigned long i; | 507 | unsigned long i; |
507 | int err = 0; | 508 | int err = 0; |
508 | int start_sec, end_sec; | 509 | int start_sec, end_sec; |
510 | struct vmem_altmap *altmap; | ||
511 | |||
509 | /* during initialize mem_map, align hot-added range to section */ | 512 | /* during initialize mem_map, align hot-added range to section */ |
510 | start_sec = pfn_to_section_nr(phys_start_pfn); | 513 | start_sec = pfn_to_section_nr(phys_start_pfn); |
511 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); | 514 | end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); |
512 | 515 | ||
516 | altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); | ||
517 | if (altmap) { | ||
518 | /* | ||
519 | * Validate altmap is within bounds of the total request | ||
520 | */ | ||
521 | if (altmap->base_pfn != phys_start_pfn | ||
522 | || vmem_altmap_offset(altmap) > nr_pages) { | ||
523 | pr_warn_once("memory add fail, invalid altmap\n"); | ||
524 | return -EINVAL; | ||
525 | } | ||
526 | altmap->alloc = 0; | ||
527 | } | ||
528 | |||
513 | for (i = start_sec; i <= end_sec; i++) { | 529 | for (i = start_sec; i <= end_sec; i++) { |
514 | err = __add_section(nid, zone, section_nr_to_pfn(i)); | 530 | err = __add_section(nid, zone, section_nr_to_pfn(i)); |
515 | 531 | ||
@@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) | |||
731 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | 747 | pgdat_resize_unlock(zone->zone_pgdat, &flags); |
732 | } | 748 | } |
733 | 749 | ||
734 | static int __remove_section(struct zone *zone, struct mem_section *ms) | 750 | static int __remove_section(struct zone *zone, struct mem_section *ms, |
751 | unsigned long map_offset) | ||
735 | { | 752 | { |
736 | unsigned long start_pfn; | 753 | unsigned long start_pfn; |
737 | int scn_nr; | 754 | int scn_nr; |
@@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) | |||
748 | start_pfn = section_nr_to_pfn(scn_nr); | 765 | start_pfn = section_nr_to_pfn(scn_nr); |
749 | __remove_zone(zone, start_pfn); | 766 | __remove_zone(zone, start_pfn); |
750 | 767 | ||
751 | sparse_remove_one_section(zone, ms); | 768 | sparse_remove_one_section(zone, ms, map_offset); |
752 | return 0; | 769 | return 0; |
753 | } | 770 | } |
754 | 771 | ||
@@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
767 | unsigned long nr_pages) | 784 | unsigned long nr_pages) |
768 | { | 785 | { |
769 | unsigned long i; | 786 | unsigned long i; |
770 | int sections_to_remove; | 787 | unsigned long map_offset = 0; |
771 | resource_size_t start, size; | 788 | int sections_to_remove, ret = 0; |
772 | int ret = 0; | 789 | |
790 | /* In the ZONE_DEVICE case device driver owns the memory region */ | ||
791 | if (is_dev_zone(zone)) { | ||
792 | struct page *page = pfn_to_page(phys_start_pfn); | ||
793 | struct vmem_altmap *altmap; | ||
794 | |||
795 | altmap = to_vmem_altmap((unsigned long) page); | ||
796 | if (altmap) | ||
797 | map_offset = vmem_altmap_offset(altmap); | ||
798 | } else { | ||
799 | resource_size_t start, size; | ||
800 | |||
801 | start = phys_start_pfn << PAGE_SHIFT; | ||
802 | size = nr_pages * PAGE_SIZE; | ||
803 | |||
804 | ret = release_mem_region_adjustable(&iomem_resource, start, | ||
805 | size); | ||
806 | if (ret) { | ||
807 | resource_size_t endres = start + size - 1; | ||
808 | |||
809 | pr_warn("Unable to release resource <%pa-%pa> (%d)\n", | ||
810 | &start, &endres, ret); | ||
811 | } | ||
812 | } | ||
773 | 813 | ||
774 | /* | 814 | /* |
775 | * We can only remove entire sections | 815 | * We can only remove entire sections |
@@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, | |||
777 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); | 817 | BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); |
778 | BUG_ON(nr_pages % PAGES_PER_SECTION); | 818 | BUG_ON(nr_pages % PAGES_PER_SECTION); |
779 | 819 | ||
780 | start = phys_start_pfn << PAGE_SHIFT; | ||
781 | size = nr_pages * PAGE_SIZE; | ||
782 | |||
783 | /* in the ZONE_DEVICE case device driver owns the memory region */ | ||
784 | if (!is_dev_zone(zone)) | ||
785 | ret = release_mem_region_adjustable(&iomem_resource, start, size); | ||
786 | if (ret) { | ||
787 | resource_size_t endres = start + size - 1; | ||
788 | |||
789 | pr_warn("Unable to release resource <%pa-%pa> (%d)\n", | ||
790 | &start, &endres, ret); | ||
791 | } | ||
792 | |||
793 | sections_to_remove = nr_pages / PAGES_PER_SECTION; | 820 | sections_to_remove = nr_pages / PAGES_PER_SECTION; |
794 | for (i = 0; i < sections_to_remove; i++) { | 821 | for (i = 0; i < sections_to_remove; i++) { |
795 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; | 822 | unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; |
796 | ret = __remove_section(zone, __pfn_to_section(pfn)); | 823 | |
824 | ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); | ||
825 | map_offset = 0; | ||
797 | if (ret) | 826 | if (ret) |
798 | break; | 827 | break; |
799 | } | 828 | } |
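
The __add_pages() hunk above rejects a hot-add request whose ZONE_DEVICE altmap does not line up with the range being added. The same bounds check, pulled out into a standalone form for readability (the helper name is invented; vmem_altmap_offset() is the interface used in the hunk):

#include <linux/mm.h>
#include <linux/memremap.h>

/*
 * Sketch of the altmap sanity check done in __add_pages() above: the
 * altmap must start at the first pfn of the hot-added range, and its
 * reserved + free area must fit inside that range.
 */
static int check_altmap_bounds(struct vmem_altmap *altmap,
			       unsigned long phys_start_pfn,
			       unsigned long nr_pages)
{
	if (!altmap)
		return 0;	/* no driver-owned memmap reservation */

	if (altmap->base_pfn != phys_start_pfn ||
	    vmem_altmap_offset(altmap) > nr_pages)
		return -EINVAL;

	altmap->alloc = 0;	/* restart allocation accounting */
	return 0;
}
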
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8caff071a30..27d135408a22 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -489,14 +489,33 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, | |||
489 | struct page *page; | 489 | struct page *page; |
490 | struct queue_pages *qp = walk->private; | 490 | struct queue_pages *qp = walk->private; |
491 | unsigned long flags = qp->flags; | 491 | unsigned long flags = qp->flags; |
492 | int nid; | 492 | int nid, ret; |
493 | pte_t *pte; | 493 | pte_t *pte; |
494 | spinlock_t *ptl; | 494 | spinlock_t *ptl; |
495 | 495 | ||
496 | split_huge_page_pmd(vma, addr, pmd); | 496 | if (pmd_trans_huge(*pmd)) { |
497 | if (pmd_trans_unstable(pmd)) | 497 | ptl = pmd_lock(walk->mm, pmd); |
498 | return 0; | 498 | if (pmd_trans_huge(*pmd)) { |
499 | page = pmd_page(*pmd); | ||
500 | if (is_huge_zero_page(page)) { | ||
501 | spin_unlock(ptl); | ||
502 | split_huge_pmd(vma, pmd, addr); | ||
503 | } else { | ||
504 | get_page(page); | ||
505 | spin_unlock(ptl); | ||
506 | lock_page(page); | ||
507 | ret = split_huge_page(page); | ||
508 | unlock_page(page); | ||
509 | put_page(page); | ||
510 | if (ret) | ||
511 | return 0; | ||
512 | } | ||
513 | } else { | ||
514 | spin_unlock(ptl); | ||
515 | } | ||
516 | } | ||
499 | 517 | ||
518 | retry: | ||
500 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); | 519 | pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); |
501 | for (; addr != end; pte++, addr += PAGE_SIZE) { | 520 | for (; addr != end; pte++, addr += PAGE_SIZE) { |
502 | if (!pte_present(*pte)) | 521 | if (!pte_present(*pte)) |
@@ -513,6 +532,21 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, | |||
513 | nid = page_to_nid(page); | 532 | nid = page_to_nid(page); |
514 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) | 533 | if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT)) |
515 | continue; | 534 | continue; |
535 | if (PageTail(page) && PageAnon(page)) { | ||
536 | get_page(page); | ||
537 | pte_unmap_unlock(pte, ptl); | ||
538 | lock_page(page); | ||
539 | ret = split_huge_page(page); | ||
540 | unlock_page(page); | ||
541 | put_page(page); | ||
542 | /* Failed to split -- skip. */ | ||
543 | if (ret) { | ||
544 | pte = pte_offset_map_lock(walk->mm, pmd, | ||
545 | addr, &ptl); | ||
546 | continue; | ||
547 | } | ||
548 | goto retry; | ||
549 | } | ||
516 | 550 | ||
517 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | 551 | if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) |
518 | migrate_page_add(page, qp->pagelist, flags); | 552 | migrate_page_add(page, qp->pagelist, flags); |
@@ -610,7 +644,8 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, | |||
610 | 644 | ||
611 | if (flags & MPOL_MF_LAZY) { | 645 | if (flags & MPOL_MF_LAZY) { |
612 | /* Similar to task_numa_work, skip inaccessible VMAs */ | 646 | /* Similar to task_numa_work, skip inaccessible VMAs */ |
613 | if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | 647 | if (vma_migratable(vma) && |
648 | vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) | ||
614 | change_prot_numa(vma, start, endvma); | 649 | change_prot_numa(vma, start, endvma); |
615 | return 1; | 650 | return 1; |
616 | } | 651 | } |
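
With the splitting-PMD state gone, queue_pages_pte_range() above no longer calls split_huge_page_pmd(); it either splits the PMD in place (huge zero page) or splits the compound page itself, which needs an extra reference and the page lock. The lock-split-retry idiom in isolation, as a hedged sketch (the helper name and return convention are invented):

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/huge_mm.h>

/*
 * Sketch of the "split a THP met during a page-table walk" idiom used
 * in queue_pages_pte_range() above. Returns 0 when the caller may
 * retry the walk at PTE granularity, -EBUSY when the split failed.
 */
static int split_thp_for_walk(struct page *page)
{
	int ret;

	get_page(page);		/* keep the page alive across the lock */
	lock_page(page);	/* split_huge_page() requires the page lock */
	ret = split_huge_page(page);
	unlock_page(page);
	put_page(page);

	return ret ? -EBUSY : 0;
}
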
diff --git a/mm/migrate.c b/mm/migrate.c index 7890d0bb5e23..b1034f9c77e7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -165,9 +165,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
165 | if (PageAnon(new)) | 165 | if (PageAnon(new)) |
166 | hugepage_add_anon_rmap(new, vma, addr); | 166 | hugepage_add_anon_rmap(new, vma, addr); |
167 | else | 167 | else |
168 | page_dup_rmap(new); | 168 | page_dup_rmap(new, true); |
169 | } else if (PageAnon(new)) | 169 | } else if (PageAnon(new)) |
170 | page_add_anon_rmap(new, vma, addr); | 170 | page_add_anon_rmap(new, vma, addr, false); |
171 | else | 171 | else |
172 | page_add_file_rmap(new); | 172 | page_add_file_rmap(new); |
173 | 173 | ||
@@ -943,9 +943,13 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page, | |||
943 | goto out; | 943 | goto out; |
944 | } | 944 | } |
945 | 945 | ||
946 | if (unlikely(PageTransHuge(page))) | 946 | if (unlikely(PageTransHuge(page))) { |
947 | if (unlikely(split_huge_page(page))) | 947 | lock_page(page); |
948 | rc = split_huge_page(page); | ||
949 | unlock_page(page); | ||
950 | if (rc) | ||
948 | goto out; | 951 | goto out; |
952 | } | ||
949 | 953 | ||
950 | rc = __unmap_and_move(page, newpage, force, mode); | 954 | rc = __unmap_and_move(page, newpage, force, mode); |
951 | if (rc == MIGRATEPAGE_SUCCESS) | 955 | if (rc == MIGRATEPAGE_SUCCESS) |
@@ -1756,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1756 | HPAGE_PMD_ORDER); | 1760 | HPAGE_PMD_ORDER); |
1757 | if (!new_page) | 1761 | if (!new_page) |
1758 | goto out_fail; | 1762 | goto out_fail; |
1763 | prep_transhuge_page(new_page); | ||
1759 | 1764 | ||
1760 | isolated = numamigrate_isolate_page(pgdat, page); | 1765 | isolated = numamigrate_isolate_page(pgdat, page); |
1761 | if (!isolated) { | 1766 | if (!isolated) { |
@@ -1767,7 +1772,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, | |||
1767 | flush_tlb_range(vma, mmun_start, mmun_end); | 1772 | flush_tlb_range(vma, mmun_start, mmun_end); |
1768 | 1773 | ||
1769 | /* Prepare a page as a migration target */ | 1774 | /* Prepare a page as a migration target */ |
1770 | __set_page_locked(new_page); | 1775 | __SetPageLocked(new_page); |
1771 | SetPageSwapBacked(new_page); | 1776 | SetPageSwapBacked(new_page); |
1772 | 1777 | ||
1773 | /* anon mapping, we can simply copy page->mapping to the new page: */ | 1778 | /* anon mapping, we can simply copy page->mapping to the new page: */ |
@@ -1815,7 +1820,7 @@ fail_putback: | |||
1815 | * guarantee the copy is visible before the pagetable update. | 1820 | * guarantee the copy is visible before the pagetable update. |
1816 | */ | 1821 | */ |
1817 | flush_cache_range(vma, mmun_start, mmun_end); | 1822 | flush_cache_range(vma, mmun_start, mmun_end); |
1818 | page_add_anon_rmap(new_page, vma, mmun_start); | 1823 | page_add_anon_rmap(new_page, vma, mmun_start, true); |
1819 | pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); | 1824 | pmdp_huge_clear_flush_notify(vma, mmun_start, pmd); |
1820 | set_pmd_at(mm, mmun_start, pmd, entry); | 1825 | set_pmd_at(mm, mmun_start, pmd, entry); |
1821 | flush_tlb_range(vma, mmun_start, mmun_end); | 1826 | flush_tlb_range(vma, mmun_start, mmun_end); |
@@ -1826,14 +1831,14 @@ fail_putback: | |||
1826 | flush_tlb_range(vma, mmun_start, mmun_end); | 1831 | flush_tlb_range(vma, mmun_start, mmun_end); |
1827 | mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); | 1832 | mmu_notifier_invalidate_range(mm, mmun_start, mmun_end); |
1828 | update_mmu_cache_pmd(vma, address, &entry); | 1833 | update_mmu_cache_pmd(vma, address, &entry); |
1829 | page_remove_rmap(new_page); | 1834 | page_remove_rmap(new_page, true); |
1830 | goto fail_putback; | 1835 | goto fail_putback; |
1831 | } | 1836 | } |
1832 | 1837 | ||
1833 | mlock_migrate_page(new_page, page); | 1838 | mlock_migrate_page(new_page, page); |
1834 | set_page_memcg(new_page, page_memcg(page)); | 1839 | set_page_memcg(new_page, page_memcg(page)); |
1835 | set_page_memcg(page, NULL); | 1840 | set_page_memcg(page, NULL); |
1836 | page_remove_rmap(page); | 1841 | page_remove_rmap(page, true); |
1837 | 1842 | ||
1838 | spin_unlock(ptl); | 1843 | spin_unlock(ptl); |
1839 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | 1844 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); |
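
Two details stand out in the mm/migrate.c hunks: split_huge_page() must now be called with the page locked, and a newly allocated huge page has to go through prep_transhuge_page() before it can serve as a migration target (the flag helper is also renamed from __set_page_locked() to __SetPageLocked()). A hedged sketch of the allocation side only; the helper name is invented and the gfp mask is abbreviated (the real caller also masks off the direct-reclaim bits):

#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/huge_mm.h>
#include <linux/page-flags.h>

/*
 * Sketch: allocate a THP migration target on a given node, roughly as
 * migrate_misplaced_transhuge_page() does above. prep_transhuge_page()
 * initializes the compound destructor and the deferred-split list.
 * Assumes CONFIG_TRANSPARENT_HUGEPAGE.
 */
static struct page *alloc_thp_migration_target(int nid)
{
	struct page *new_page;

	new_page = alloc_pages_node(nid, GFP_TRANSHUGE | __GFP_THISNODE,
				    HPAGE_PMD_ORDER);
	if (!new_page)
		return NULL;

	prep_transhuge_page(new_page);
	__SetPageLocked(new_page);	/* was __set_page_locked() */
	SetPageSwapBacked(new_page);
	return new_page;
}
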
diff --git a/mm/mincore.c b/mm/mincore.c index 14bb9fb37f0c..2a565ed8bb49 100644 --- a/mm/mincore.c +++ b/mm/mincore.c | |||
@@ -117,7 +117,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, | |||
117 | unsigned char *vec = walk->private; | 117 | unsigned char *vec = walk->private; |
118 | int nr = (end - addr) >> PAGE_SHIFT; | 118 | int nr = (end - addr) >> PAGE_SHIFT; |
119 | 119 | ||
120 | if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { | 120 | if (pmd_trans_huge_lock(pmd, vma, &ptl)) { |
121 | memset(vec, 1, nr); | 121 | memset(vec, 1, nr); |
122 | spin_unlock(ptl); | 122 | spin_unlock(ptl); |
123 | goto out; | 123 | goto out; |
diff --git a/mm/mlock.c b/mm/mlock.c index 9cb87cbc4071..e1e2b1207bf2 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -24,13 +24,13 @@ | |||
24 | 24 | ||
25 | #include "internal.h" | 25 | #include "internal.h" |
26 | 26 | ||
27 | int can_do_mlock(void) | 27 | bool can_do_mlock(void) |
28 | { | 28 | { |
29 | if (rlimit(RLIMIT_MEMLOCK) != 0) | 29 | if (rlimit(RLIMIT_MEMLOCK) != 0) |
30 | return 1; | 30 | return true; |
31 | if (capable(CAP_IPC_LOCK)) | 31 | if (capable(CAP_IPC_LOCK)) |
32 | return 1; | 32 | return true; |
33 | return 0; | 33 | return false; |
34 | } | 34 | } |
35 | EXPORT_SYMBOL(can_do_mlock); | 35 | EXPORT_SYMBOL(can_do_mlock); |
36 | 36 | ||
@@ -82,6 +82,9 @@ void mlock_vma_page(struct page *page) | |||
82 | /* Serialize with page migration */ | 82 | /* Serialize with page migration */ |
83 | BUG_ON(!PageLocked(page)); | 83 | BUG_ON(!PageLocked(page)); |
84 | 84 | ||
85 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
86 | VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); | ||
87 | |||
85 | if (!TestSetPageMlocked(page)) { | 88 | if (!TestSetPageMlocked(page)) { |
86 | mod_zone_page_state(page_zone(page), NR_MLOCK, | 89 | mod_zone_page_state(page_zone(page), NR_MLOCK, |
87 | hpage_nr_pages(page)); | 90 | hpage_nr_pages(page)); |
@@ -178,6 +181,8 @@ unsigned int munlock_vma_page(struct page *page) | |||
178 | /* For try_to_munlock() and to serialize with page migration */ | 181 | /* For try_to_munlock() and to serialize with page migration */ |
179 | BUG_ON(!PageLocked(page)); | 182 | BUG_ON(!PageLocked(page)); |
180 | 183 | ||
184 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
185 | |||
181 | /* | 186 | /* |
182 | * Serialize with any parallel __split_huge_page_refcount() which | 187 | * Serialize with any parallel __split_huge_page_refcount() which |
183 | * might otherwise copy PageMlocked to part of the tail pages before | 188 | * might otherwise copy PageMlocked to part of the tail pages before |
@@ -388,6 +393,13 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, | |||
388 | if (!page || page_zone_id(page) != zoneid) | 393 | if (!page || page_zone_id(page) != zoneid) |
389 | break; | 394 | break; |
390 | 395 | ||
396 | /* | ||
397 | * Do not use pagevec for PTE-mapped THP, | ||
398 | * munlock_vma_pages_range() will handle them. | ||
399 | */ | ||
400 | if (PageTransCompound(page)) | ||
401 | break; | ||
402 | |||
391 | get_page(page); | 403 | get_page(page); |
392 | /* | 404 | /* |
393 | * Increase the address that will be returned *before* the | 405 | * Increase the address that will be returned *before* the |
@@ -444,7 +456,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
444 | &page_mask); | 456 | &page_mask); |
445 | 457 | ||
446 | if (page && !IS_ERR(page)) { | 458 | if (page && !IS_ERR(page)) { |
447 | if (PageTransHuge(page)) { | 459 | if (PageTransTail(page)) { |
460 | VM_BUG_ON_PAGE(PageMlocked(page), page); | ||
461 | put_page(page); /* follow_page_mask() */ | ||
462 | } else if (PageTransHuge(page)) { | ||
448 | lock_page(page); | 463 | lock_page(page); |
449 | /* | 464 | /* |
450 | * Any THP page found by follow_page_mask() may | 465 | * Any THP page found by follow_page_mask() may |
@@ -477,8 +492,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, | |||
477 | goto next; | 492 | goto next; |
478 | } | 493 | } |
479 | } | 494 | } |
480 | /* It's a bug to munlock in the middle of a THP page */ | ||
481 | VM_BUG_ON((start >> PAGE_SHIFT) & page_mask); | ||
482 | page_increm = 1 + page_mask; | 495 | page_increm = 1 + page_mask; |
483 | start += page_increm * PAGE_SIZE; | 496 | start += page_increm * PAGE_SIZE; |
484 | next: | 497 | next: |
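
The mlock.c hunks encode a new invariant: mlock/munlock only ever operate on head pages, a PTE-mapped THP is never batched through the munlock pagevec, and a tail page returned by follow_page_mask() is simply dropped. Restated as a hedged sketch (both helper names are invented):

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Sketch of the debug checks added to mlock_vma_page() above. */
static void assert_mlockable(struct page *page)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
}

/*
 * Sketch of the batching rule in __munlock_pagevec_fill() above: only
 * small pages go into the pagevec; any compound page is left for
 * munlock_vma_pages_range() to handle one THP at a time.
 */
static bool munlock_can_batch(struct page *page)
{
	return !PageTransCompound(page);
}
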
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
3184 | * mapping->flags avoid to take the same lock twice, if more than one | 3184 | * mapping->flags avoid to take the same lock twice, if more than one |
3185 | * vma in this mm is backed by the same anon_vma or address_space. | 3185 | * vma in this mm is backed by the same anon_vma or address_space. |
3186 | * | 3186 | * |
3187 | * We can take all the locks in random order because the VM code | 3187 | * We take locks in following order, accordingly to comment at beginning |
3188 | * taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never | 3188 | * of mm/rmap.c: |
3189 | * takes more than one of them in a row. Secondly we're protected | 3189 | * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for |
3190 | * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. | 3190 | * hugetlb mapping); |
3191 | * - all i_mmap_rwsem locks; | ||
3192 | * - all anon_vma->rwseml | ||
3193 | * | ||
3194 | * We can take all locks within these types randomly because the VM code | ||
3195 | * doesn't nest them and we protected from parallel mm_take_all_locks() by | ||
3196 | * mm_all_locks_mutex. | ||
3191 | * | 3197 | * |
3192 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations | 3198 | * mm_take_all_locks() and mm_drop_all_locks are expensive operations |
3193 | * that may have to take thousand of locks. | 3199 | * that may have to take thousand of locks. |
@@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
3206 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 3212 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
3207 | if (signal_pending(current)) | 3213 | if (signal_pending(current)) |
3208 | goto out_unlock; | 3214 | goto out_unlock; |
3209 | if (vma->vm_file && vma->vm_file->f_mapping) | 3215 | if (vma->vm_file && vma->vm_file->f_mapping && |
3216 | is_vm_hugetlb_page(vma)) | ||
3217 | vm_lock_mapping(mm, vma->vm_file->f_mapping); | ||
3218 | } | ||
3219 | |||
3220 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
3221 | if (signal_pending(current)) | ||
3222 | goto out_unlock; | ||
3223 | if (vma->vm_file && vma->vm_file->f_mapping && | ||
3224 | !is_vm_hugetlb_page(vma)) | ||
3210 | vm_lock_mapping(mm, vma->vm_file->f_mapping); | 3225 | vm_lock_mapping(mm, vma->vm_file->f_mapping); |
3211 | } | 3226 | } |
3212 | 3227 | ||
diff --git a/mm/mprotect.c b/mm/mprotect.c index c764402c464f..8eb7bb40dc40 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -149,7 +149,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
149 | unsigned long this_pages; | 149 | unsigned long this_pages; |
150 | 150 | ||
151 | next = pmd_addr_end(addr, end); | 151 | next = pmd_addr_end(addr, end); |
152 | if (!pmd_trans_huge(*pmd) && pmd_none_or_clear_bad(pmd)) | 152 | if (!pmd_trans_huge(*pmd) && !pmd_devmap(*pmd) |
153 | && pmd_none_or_clear_bad(pmd)) | ||
153 | continue; | 154 | continue; |
154 | 155 | ||
155 | /* invoke the mmu notifier if the pmd is populated */ | 156 | /* invoke the mmu notifier if the pmd is populated */ |
@@ -158,9 +159,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, | |||
158 | mmu_notifier_invalidate_range_start(mm, mni_start, end); | 159 | mmu_notifier_invalidate_range_start(mm, mni_start, end); |
159 | } | 160 | } |
160 | 161 | ||
161 | if (pmd_trans_huge(*pmd)) { | 162 | if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { |
162 | if (next - addr != HPAGE_PMD_SIZE) | 163 | if (next - addr != HPAGE_PMD_SIZE) |
163 | split_huge_page_pmd(vma, addr, pmd); | 164 | split_huge_pmd(vma, pmd, addr); |
164 | else { | 165 | else { |
165 | int nr_ptes = change_huge_pmd(vma, pmd, addr, | 166 | int nr_ptes = change_huge_pmd(vma, pmd, addr, |
166 | newprot, prot_numa); | 167 | newprot, prot_numa); |
diff --git a/mm/mremap.c b/mm/mremap.c index e55b157865d5..d77946a997f7 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -192,25 +192,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma, | |||
192 | if (!new_pmd) | 192 | if (!new_pmd) |
193 | break; | 193 | break; |
194 | if (pmd_trans_huge(*old_pmd)) { | 194 | if (pmd_trans_huge(*old_pmd)) { |
195 | int err = 0; | ||
196 | if (extent == HPAGE_PMD_SIZE) { | 195 | if (extent == HPAGE_PMD_SIZE) { |
196 | bool moved; | ||
197 | VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, | 197 | VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, |
198 | vma); | 198 | vma); |
199 | /* See comment in move_ptes() */ | 199 | /* See comment in move_ptes() */ |
200 | if (need_rmap_locks) | 200 | if (need_rmap_locks) |
201 | anon_vma_lock_write(vma->anon_vma); | 201 | anon_vma_lock_write(vma->anon_vma); |
202 | err = move_huge_pmd(vma, new_vma, old_addr, | 202 | moved = move_huge_pmd(vma, new_vma, old_addr, |
203 | new_addr, old_end, | 203 | new_addr, old_end, |
204 | old_pmd, new_pmd); | 204 | old_pmd, new_pmd); |
205 | if (need_rmap_locks) | 205 | if (need_rmap_locks) |
206 | anon_vma_unlock_write(vma->anon_vma); | 206 | anon_vma_unlock_write(vma->anon_vma); |
207 | if (moved) { | ||
208 | need_flush = true; | ||
209 | continue; | ||
210 | } | ||
207 | } | 211 | } |
208 | if (err > 0) { | 212 | split_huge_pmd(vma, old_pmd, old_addr); |
209 | need_flush = true; | ||
210 | continue; | ||
211 | } else if (!err) { | ||
212 | split_huge_page_pmd(vma, old_addr, old_pmd); | ||
213 | } | ||
214 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); | 213 | VM_BUG_ON(pmd_trans_huge(*old_pmd)); |
215 | } | 214 | } |
216 | if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, | 215 | if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, |
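
move_huge_pmd() now reports success as a bool instead of the old tri-state int, and the fallback when a huge PMD cannot be moved as a unit is simply split_huge_pmd() followed by the ordinary PTE copy. The decision in the hunk, wrapped into a standalone sketch (the wrapper name and return convention are invented):

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/huge_mm.h>

/*
 * Sketch of the new fallback rule in move_page_tables() above: try to
 * move the whole huge pmd; if that is not possible, split it so the
 * regular PTE-level move can proceed. Returns true when the pmd was
 * moved and the caller only needs to flush TLBs.
 */
static bool move_or_split_huge_pmd(struct vm_area_struct *vma,
				   struct vm_area_struct *new_vma,
				   unsigned long old_addr,
				   unsigned long new_addr,
				   unsigned long old_end,
				   pmd_t *old_pmd, pmd_t *new_pmd,
				   unsigned long extent,
				   bool need_rmap_locks)
{
	bool moved = false;

	if (extent == HPAGE_PMD_SIZE) {
		if (need_rmap_locks)
			anon_vma_lock_write(vma->anon_vma);
		moved = move_huge_pmd(vma, new_vma, old_addr, new_addr,
				      old_end, old_pmd, new_pmd);
		if (need_rmap_locks)
			anon_vma_unlock_write(vma->anon_vma);
	}
	if (!moved)
		split_huge_pmd(vma, old_pmd, old_addr);

	return moved;
}
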
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce63d603820f..63358d9f9aa9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -43,6 +43,7 @@ | |||
43 | #include <linux/vmalloc.h> | 43 | #include <linux/vmalloc.h> |
44 | #include <linux/vmstat.h> | 44 | #include <linux/vmstat.h> |
45 | #include <linux/mempolicy.h> | 45 | #include <linux/mempolicy.h> |
46 | #include <linux/memremap.h> | ||
46 | #include <linux/stop_machine.h> | 47 | #include <linux/stop_machine.h> |
47 | #include <linux/sort.h> | 48 | #include <linux/sort.h> |
48 | #include <linux/pfn.h> | 49 | #include <linux/pfn.h> |
@@ -222,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = { | |||
222 | #endif | 223 | #endif |
223 | }; | 224 | }; |
224 | 225 | ||
225 | static void free_compound_page(struct page *page); | ||
226 | compound_page_dtor * const compound_page_dtors[] = { | 226 | compound_page_dtor * const compound_page_dtors[] = { |
227 | NULL, | 227 | NULL, |
228 | free_compound_page, | 228 | free_compound_page, |
229 | #ifdef CONFIG_HUGETLB_PAGE | 229 | #ifdef CONFIG_HUGETLB_PAGE |
230 | free_huge_page, | 230 | free_huge_page, |
231 | #endif | 231 | #endif |
232 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
233 | free_transhuge_page, | ||
234 | #endif | ||
232 | }; | 235 | }; |
233 | 236 | ||
234 | int min_free_kbytes = 1024; | 237 | int min_free_kbytes = 1024; |
@@ -450,7 +453,7 @@ out: | |||
450 | * This usage means that zero-order pages may not be compound. | 453 | * This usage means that zero-order pages may not be compound. |
451 | */ | 454 | */ |
452 | 455 | ||
453 | static void free_compound_page(struct page *page) | 456 | void free_compound_page(struct page *page) |
454 | { | 457 | { |
455 | __free_pages_ok(page, compound_order(page)); | 458 | __free_pages_ok(page, compound_order(page)); |
456 | } | 459 | } |
@@ -466,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order) | |||
466 | for (i = 1; i < nr_pages; i++) { | 469 | for (i = 1; i < nr_pages; i++) { |
467 | struct page *p = page + i; | 470 | struct page *p = page + i; |
468 | set_page_count(p, 0); | 471 | set_page_count(p, 0); |
472 | p->mapping = TAIL_MAPPING; | ||
469 | set_compound_head(p, page); | 473 | set_compound_head(p, page); |
470 | } | 474 | } |
475 | atomic_set(compound_mapcount_ptr(page), -1); | ||
471 | } | 476 | } |
472 | 477 | ||
473 | #ifdef CONFIG_DEBUG_PAGEALLOC | 478 | #ifdef CONFIG_DEBUG_PAGEALLOC |
@@ -732,7 +737,7 @@ static inline int free_pages_check(struct page *page) | |||
732 | const char *bad_reason = NULL; | 737 | const char *bad_reason = NULL; |
733 | unsigned long bad_flags = 0; | 738 | unsigned long bad_flags = 0; |
734 | 739 | ||
735 | if (unlikely(page_mapcount(page))) | 740 | if (unlikely(atomic_read(&page->_mapcount) != -1)) |
736 | bad_reason = "nonzero mapcount"; | 741 | bad_reason = "nonzero mapcount"; |
737 | if (unlikely(page->mapping != NULL)) | 742 | if (unlikely(page->mapping != NULL)) |
738 | bad_reason = "non-NULL mapping"; | 743 | bad_reason = "non-NULL mapping"; |
@@ -856,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) | |||
856 | ret = 0; | 861 | ret = 0; |
857 | goto out; | 862 | goto out; |
858 | } | 863 | } |
864 | switch (page - head_page) { | ||
865 | case 1: | ||
866 | /* the first tail page: ->mapping is compound_mapcount() */ | ||
867 | if (unlikely(compound_mapcount(page))) { | ||
868 | bad_page(page, "nonzero compound_mapcount", 0); | ||
869 | goto out; | ||
870 | } | ||
871 | break; | ||
872 | case 2: | ||
873 | /* | ||
874 | * the second tail page: ->mapping is | ||
875 | * page_deferred_list().next -- ignore value. | ||
876 | */ | ||
877 | break; | ||
878 | default: | ||
879 | if (page->mapping != TAIL_MAPPING) { | ||
880 | bad_page(page, "corrupted mapping in tail page", 0); | ||
881 | goto out; | ||
882 | } | ||
883 | break; | ||
884 | } | ||
859 | if (unlikely(!PageTail(page))) { | 885 | if (unlikely(!PageTail(page))) { |
860 | bad_page(page, "PageTail not set", 0); | 886 | bad_page(page, "PageTail not set", 0); |
861 | goto out; | 887 | goto out; |
@@ -866,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) | |||
866 | } | 892 | } |
867 | ret = 0; | 893 | ret = 0; |
868 | out: | 894 | out: |
895 | page->mapping = NULL; | ||
869 | clear_compound_head(page); | 896 | clear_compound_head(page); |
870 | return ret; | 897 | return ret; |
871 | } | 898 | } |
@@ -1329,7 +1356,7 @@ static inline int check_new_page(struct page *page) | |||
1329 | const char *bad_reason = NULL; | 1356 | const char *bad_reason = NULL; |
1330 | unsigned long bad_flags = 0; | 1357 | unsigned long bad_flags = 0; |
1331 | 1358 | ||
1332 | if (unlikely(page_mapcount(page))) | 1359 | if (unlikely(atomic_read(&page->_mapcount) != -1)) |
1333 | bad_reason = "nonzero mapcount"; | 1360 | bad_reason = "nonzero mapcount"; |
1334 | if (unlikely(page->mapping != NULL)) | 1361 | if (unlikely(page->mapping != NULL)) |
1335 | bad_reason = "non-NULL mapping"; | 1362 | bad_reason = "non-NULL mapping"; |
@@ -4459,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
4459 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | 4486 | void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, |
4460 | unsigned long start_pfn, enum memmap_context context) | 4487 | unsigned long start_pfn, enum memmap_context context) |
4461 | { | 4488 | { |
4462 | pg_data_t *pgdat = NODE_DATA(nid); | 4489 | struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); |
4463 | unsigned long end_pfn = start_pfn + size; | 4490 | unsigned long end_pfn = start_pfn + size; |
4491 | pg_data_t *pgdat = NODE_DATA(nid); | ||
4464 | unsigned long pfn; | 4492 | unsigned long pfn; |
4465 | struct zone *z; | ||
4466 | unsigned long nr_initialised = 0; | 4493 | unsigned long nr_initialised = 0; |
4467 | 4494 | ||
4468 | if (highest_memmap_pfn < end_pfn - 1) | 4495 | if (highest_memmap_pfn < end_pfn - 1) |
4469 | highest_memmap_pfn = end_pfn - 1; | 4496 | highest_memmap_pfn = end_pfn - 1; |
4470 | 4497 | ||
4471 | z = &pgdat->node_zones[zone]; | 4498 | /* |
4499 | * Honor reservation requested by the driver for this ZONE_DEVICE | ||
4500 | * memory | ||
4501 | */ | ||
4502 | if (altmap && start_pfn == altmap->base_pfn) | ||
4503 | start_pfn += altmap->reserve; | ||
4504 | |||
4472 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { | 4505 | for (pfn = start_pfn; pfn < end_pfn; pfn++) { |
4473 | /* | 4506 | /* |
4474 | * There can be holes in boot-time mem_map[]s | 4507 | * There can be holes in boot-time mem_map[]s |
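
prep_compound_page() now poisons every tail page's ->mapping with TAIL_MAPPING and initializes the head's compound_mapcount, and free_tail_pages_check() verifies that layout on free: the first tail aliases the compound mapcount, the second aliases the deferred-split list, and every other tail must still hold the poison value. The per-tail expectation as a hedged sketch (the helper name and bool return are invented):

#include <linux/mm.h>
#include <linux/poison.h>

/*
 * Sketch of what free_tail_pages_check() above expects to find in each
 * tail page's ->mapping when a compound page is freed.
 */
static bool tail_mapping_ok(struct page *head, struct page *tail)
{
	switch (tail - head) {
	case 1:
		/* first tail: ->mapping aliases compound_mapcount */
		return compound_mapcount(tail) == 0;
	case 2:
		/* second tail: ->mapping aliases the deferred-split list */
		return true;
	default:
		/* all other tails must still carry the poison value */
		return tail->mapping == TAIL_MAPPING;
	}
}
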
diff --git a/mm/page_idle.c b/mm/page_idle.c index d5dd79041484..4ea9c4ef5146 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c | |||
@@ -55,25 +55,26 @@ static int page_idle_clear_pte_refs_one(struct page *page, | |||
55 | unsigned long addr, void *arg) | 55 | unsigned long addr, void *arg) |
56 | { | 56 | { |
57 | struct mm_struct *mm = vma->vm_mm; | 57 | struct mm_struct *mm = vma->vm_mm; |
58 | spinlock_t *ptl; | ||
59 | pmd_t *pmd; | 58 | pmd_t *pmd; |
60 | pte_t *pte; | 59 | pte_t *pte; |
60 | spinlock_t *ptl; | ||
61 | bool referenced = false; | 61 | bool referenced = false; |
62 | 62 | ||
63 | if (unlikely(PageTransHuge(page))) { | 63 | if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl)) |
64 | pmd = page_check_address_pmd(page, mm, addr, | 64 | return SWAP_AGAIN; |
65 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | 65 | |
66 | if (pmd) { | 66 | if (pte) { |
67 | referenced = pmdp_clear_young_notify(vma, addr, pmd); | 67 | referenced = ptep_clear_young_notify(vma, addr, pte); |
68 | spin_unlock(ptl); | 68 | pte_unmap(pte); |
69 | } | 69 | } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { |
70 | referenced = pmdp_clear_young_notify(vma, addr, pmd); | ||
70 | } else { | 71 | } else { |
71 | pte = page_check_address(page, mm, addr, &ptl, 0); | 72 | /* unexpected pmd-mapped page? */ |
72 | if (pte) { | 73 | WARN_ON_ONCE(1); |
73 | referenced = ptep_clear_young_notify(vma, addr, pte); | ||
74 | pte_unmap_unlock(pte, ptl); | ||
75 | } | ||
76 | } | 74 | } |
75 | |||
76 | spin_unlock(ptl); | ||
77 | |||
77 | if (referenced) { | 78 | if (referenced) { |
78 | clear_page_idle(page); | 79 | clear_page_idle(page); |
79 | /* | 80 | /* |
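
page_idle_clear_pte_refs_one() above is the first user of page_check_address_transhuge(), added further down in this diff in mm/rmap.c: it locks whichever level actually maps the page and returns either a pte or, for a PMD-mapped THP, a NULL pte plus the pmd, with the lock held in both cases. The calling convention in isolation, as a hedged sketch (the wrapper name is invented):

#include <linux/mm.h>
#include <linux/rmap.h>
#include <linux/mmu_notifier.h>

/*
 * Sketch of the page_check_address_transhuge() calling convention used
 * by page_idle_clear_pte_refs_one() above and page_referenced_one()
 * later in this diff: on success the returned lock is held, and *pte
 * is NULL when the page is mapped by a huge pmd.
 */
static bool page_was_young(struct page *page, struct vm_area_struct *vma,
			   struct mm_struct *mm, unsigned long addr)
{
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	bool young = false;

	if (!page_check_address_transhuge(page, mm, addr, &pmd, &pte, &ptl))
		return false;

	if (pte) {
		young = ptep_clear_young_notify(vma, addr, pte);
		pte_unmap(pte);
	} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
		young = pmdp_clear_young_notify(vma, addr, pmd);
	}

	spin_unlock(ptl);
	return young;
}
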
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e139fec6c6c..92c4c36501e7 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -196,8 +196,10 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, | |||
196 | { | 196 | { |
197 | unsigned long pfn; | 197 | unsigned long pfn; |
198 | struct page *page; | 198 | struct page *page; |
199 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | 199 | |
200 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | 200 | BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages)); |
201 | BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages)); | ||
202 | |||
201 | for (pfn = start_pfn; | 203 | for (pfn = start_pfn; |
202 | pfn < end_pfn; | 204 | pfn < end_pfn; |
203 | pfn += pageblock_nr_pages) { | 205 | pfn += pageblock_nr_pages) { |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 29f2f8b853ae..207244489a68 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -58,7 +58,7 @@ again: | |||
58 | if (!walk->pte_entry) | 58 | if (!walk->pte_entry) |
59 | continue; | 59 | continue; |
60 | 60 | ||
61 | split_huge_page_pmd_mm(walk->mm, addr, pmd); | 61 | split_huge_pmd(walk->vma, pmd, addr); |
62 | if (pmd_trans_unstable(pmd)) | 62 | if (pmd_trans_unstable(pmd)) |
63 | goto again; | 63 | goto again; |
64 | err = walk_pte_range(pmd, addr, next, walk); | 64 | err = walk_pte_range(pmd, addr, next, walk); |
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4c681baff363..9d4767698a1c 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c | |||
@@ -132,25 +132,13 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, | |||
132 | { | 132 | { |
133 | pmd_t pmd; | 133 | pmd_t pmd; |
134 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | 134 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
135 | VM_BUG_ON(!pmd_trans_huge(*pmdp)); | 135 | VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); |
136 | pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); | 136 | pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); |
137 | flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | 137 | flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); |
138 | return pmd; | 138 | return pmd; |
139 | } | 139 | } |
140 | #endif | 140 | #endif |
141 | 141 | ||
142 | #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH | ||
143 | void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, | ||
144 | pmd_t *pmdp) | ||
145 | { | ||
146 | pmd_t pmd = pmd_mksplitting(*pmdp); | ||
147 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); | ||
148 | set_pmd_at(vma->vm_mm, address, pmdp, pmd); | ||
149 | /* tlb flush only to serialize against gup-fast */ | ||
150 | flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); | ||
151 | } | ||
152 | #endif | ||
153 | |||
154 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT | 142 | #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT |
155 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, | 143 | void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
156 | pgtable_t pgtable) | 144 | pgtable_t pgtable) |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -23,21 +23,22 @@ | |||
23 | * inode->i_mutex (while writing or truncating, not reading or faulting) | 23 | * inode->i_mutex (while writing or truncating, not reading or faulting) |
24 | * mm->mmap_sem | 24 | * mm->mmap_sem |
25 | * page->flags PG_locked (lock_page) | 25 | * page->flags PG_locked (lock_page) |
26 | * mapping->i_mmap_rwsem | 26 | * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) |
27 | * anon_vma->rwsem | 27 | * mapping->i_mmap_rwsem |
28 | * mm->page_table_lock or pte_lock | 28 | * anon_vma->rwsem |
29 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) | 29 | * mm->page_table_lock or pte_lock |
30 | * swap_lock (in swap_duplicate, swap_info_get) | 30 | * zone->lru_lock (in mark_page_accessed, isolate_lru_page) |
31 | * mmlist_lock (in mmput, drain_mmlist and others) | 31 | * swap_lock (in swap_duplicate, swap_info_get) |
32 | * mapping->private_lock (in __set_page_dirty_buffers) | 32 | * mmlist_lock (in mmput, drain_mmlist and others) |
33 | * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) | 33 | * mapping->private_lock (in __set_page_dirty_buffers) |
34 | * mapping->tree_lock (widely used) | 34 | * mem_cgroup_{begin,end}_page_stat (memcg->move_lock) |
35 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) | 35 | * mapping->tree_lock (widely used) |
36 | * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) | 36 | * inode->i_lock (in set_page_dirty's __mark_inode_dirty) |
37 | * sb_lock (within inode_lock in fs/fs-writeback.c) | 37 | * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) |
38 | * mapping->tree_lock (widely used, in set_page_dirty, | 38 | * sb_lock (within inode_lock in fs/fs-writeback.c) |
39 | * in arch-dependent flush_dcache_mmap_lock, | 39 | * mapping->tree_lock (widely used, in set_page_dirty, |
40 | * within bdi.wb->list_lock in __sync_single_inode) | 40 | * in arch-dependent flush_dcache_mmap_lock, |
41 | * within bdi.wb->list_lock in __sync_single_inode) | ||
41 | * | 42 | * |
42 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) | 43 | * anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon) |
43 | * ->tasklist_lock | 44 | * ->tasklist_lock |
@@ -567,27 +568,6 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) | |||
567 | anon_vma_unlock_read(anon_vma); | 568 | anon_vma_unlock_read(anon_vma); |
568 | } | 569 | } |
569 | 570 | ||
570 | /* | ||
571 | * At what user virtual address is page expected in @vma? | ||
572 | */ | ||
573 | static inline unsigned long | ||
574 | __vma_address(struct page *page, struct vm_area_struct *vma) | ||
575 | { | ||
576 | pgoff_t pgoff = page_to_pgoff(page); | ||
577 | return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); | ||
578 | } | ||
579 | |||
580 | inline unsigned long | ||
581 | vma_address(struct page *page, struct vm_area_struct *vma) | ||
582 | { | ||
583 | unsigned long address = __vma_address(page, vma); | ||
584 | |||
585 | /* page should be within @vma mapping range */ | ||
586 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); | ||
587 | |||
588 | return address; | ||
589 | } | ||
590 | |||
591 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH | 571 | #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH |
592 | static void percpu_flush_tlb_batch_pages(void *data) | 572 | static void percpu_flush_tlb_batch_pages(void *data) |
593 | { | 573 | { |
@@ -819,6 +799,96 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) | |||
819 | return 1; | 799 | return 1; |
820 | } | 800 | } |
821 | 801 | ||
802 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | ||
803 | /* | ||
804 | * Check that @page is mapped at @address into @mm. In contrast to | ||
805 | * page_check_address(), this function can handle transparent huge pages. | ||
806 | * | ||
807 | * On success returns true with pte mapped and locked. For PMD-mapped | ||
808 | * transparent huge pages *@ptep is set to NULL. | ||
809 | */ | ||
810 | bool page_check_address_transhuge(struct page *page, struct mm_struct *mm, | ||
811 | unsigned long address, pmd_t **pmdp, | ||
812 | pte_t **ptep, spinlock_t **ptlp) | ||
813 | { | ||
814 | pgd_t *pgd; | ||
815 | pud_t *pud; | ||
816 | pmd_t *pmd; | ||
817 | pte_t *pte; | ||
818 | spinlock_t *ptl; | ||
819 | |||
820 | if (unlikely(PageHuge(page))) { | ||
821 | /* when pud is not present, pte will be NULL */ | ||
822 | pte = huge_pte_offset(mm, address); | ||
823 | if (!pte) | ||
824 | return false; | ||
825 | |||
826 | ptl = huge_pte_lockptr(page_hstate(page), mm, pte); | ||
827 | pmd = NULL; | ||
828 | goto check_pte; | ||
829 | } | ||
830 | |||
831 | pgd = pgd_offset(mm, address); | ||
832 | if (!pgd_present(*pgd)) | ||
833 | return false; | ||
834 | pud = pud_offset(pgd, address); | ||
835 | if (!pud_present(*pud)) | ||
836 | return false; | ||
837 | pmd = pmd_offset(pud, address); | ||
838 | |||
839 | if (pmd_trans_huge(*pmd)) { | ||
840 | ptl = pmd_lock(mm, pmd); | ||
841 | if (!pmd_present(*pmd)) | ||
842 | goto unlock_pmd; | ||
843 | if (unlikely(!pmd_trans_huge(*pmd))) { | ||
844 | spin_unlock(ptl); | ||
845 | goto map_pte; | ||
846 | } | ||
847 | |||
848 | if (pmd_page(*pmd) != page) | ||
849 | goto unlock_pmd; | ||
850 | |||
851 | pte = NULL; | ||
852 | goto found; | ||
853 | unlock_pmd: | ||
854 | spin_unlock(ptl); | ||
855 | return false; | ||
856 | } else { | ||
857 | pmd_t pmde = *pmd; | ||
858 | |||
859 | barrier(); | ||
860 | if (!pmd_present(pmde) || pmd_trans_huge(pmde)) | ||
861 | return false; | ||
862 | } | ||
863 | map_pte: | ||
864 | pte = pte_offset_map(pmd, address); | ||
865 | if (!pte_present(*pte)) { | ||
866 | pte_unmap(pte); | ||
867 | return false; | ||
868 | } | ||
869 | |||
870 | ptl = pte_lockptr(mm, pmd); | ||
871 | check_pte: | ||
872 | spin_lock(ptl); | ||
873 | |||
874 | if (!pte_present(*pte)) { | ||
875 | pte_unmap_unlock(pte, ptl); | ||
876 | return false; | ||
877 | } | ||
878 | |||
879 | /* THP can be referenced by any subpage */ | ||
880 | if (pte_pfn(*pte) - page_to_pfn(page) >= hpage_nr_pages(page)) { | ||
881 | pte_unmap_unlock(pte, ptl); | ||
882 | return false; | ||
883 | } | ||
884 | found: | ||
885 | *ptep = pte; | ||
886 | *pmdp = pmd; | ||
887 | *ptlp = ptl; | ||
888 | return true; | ||
889 | } | ||
890 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ | ||
891 | |||
822 | struct page_referenced_arg { | 892 | struct page_referenced_arg { |
823 | int mapcount; | 893 | int mapcount; |
824 | int referenced; | 894 | int referenced; |
@@ -832,49 +902,24 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
832 | unsigned long address, void *arg) | 902 | unsigned long address, void *arg) |
833 | { | 903 | { |
834 | struct mm_struct *mm = vma->vm_mm; | 904 | struct mm_struct *mm = vma->vm_mm; |
905 | struct page_referenced_arg *pra = arg; | ||
906 | pmd_t *pmd; | ||
907 | pte_t *pte; | ||
835 | spinlock_t *ptl; | 908 | spinlock_t *ptl; |
836 | int referenced = 0; | 909 | int referenced = 0; |
837 | struct page_referenced_arg *pra = arg; | ||
838 | |||
839 | if (unlikely(PageTransHuge(page))) { | ||
840 | pmd_t *pmd; | ||
841 | 910 | ||
842 | /* | 911 | if (!page_check_address_transhuge(page, mm, address, &pmd, &pte, &ptl)) |
843 | * rmap might return false positives; we must filter | 912 | return SWAP_AGAIN; |
844 | * these out using page_check_address_pmd(). | ||
845 | */ | ||
846 | pmd = page_check_address_pmd(page, mm, address, | ||
847 | PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); | ||
848 | if (!pmd) | ||
849 | return SWAP_AGAIN; | ||
850 | |||
851 | if (vma->vm_flags & VM_LOCKED) { | ||
852 | spin_unlock(ptl); | ||
853 | pra->vm_flags |= VM_LOCKED; | ||
854 | return SWAP_FAIL; /* To break the loop */ | ||
855 | } | ||
856 | 913 | ||
857 | /* go ahead even if the pmd is pmd_trans_splitting() */ | 914 | if (vma->vm_flags & VM_LOCKED) { |
858 | if (pmdp_clear_flush_young_notify(vma, address, pmd)) | 915 | if (pte) |
859 | referenced++; | 916 | pte_unmap(pte); |
860 | spin_unlock(ptl); | 917 | spin_unlock(ptl); |
861 | } else { | 918 | pra->vm_flags |= VM_LOCKED; |
862 | pte_t *pte; | 919 | return SWAP_FAIL; /* To break the loop */ |
863 | 920 | } | |
864 | /* | ||
865 | * rmap might return false positives; we must filter | ||
866 | * these out using page_check_address(). | ||
867 | */ | ||
868 | pte = page_check_address(page, mm, address, &ptl, 0); | ||
869 | if (!pte) | ||
870 | return SWAP_AGAIN; | ||
871 | |||
872 | if (vma->vm_flags & VM_LOCKED) { | ||
873 | pte_unmap_unlock(pte, ptl); | ||
874 | pra->vm_flags |= VM_LOCKED; | ||
875 | return SWAP_FAIL; /* To break the loop */ | ||
876 | } | ||
877 | 921 | ||
922 | if (pte) { | ||
878 | if (ptep_clear_flush_young_notify(vma, address, pte)) { | 923 | if (ptep_clear_flush_young_notify(vma, address, pte)) { |
879 | /* | 924 | /* |
880 | * Don't treat a reference through a sequentially read | 925 | * Don't treat a reference through a sequentially read |
@@ -886,8 +931,15 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, | |||
886 | if (likely(!(vma->vm_flags & VM_SEQ_READ))) | 931 | if (likely(!(vma->vm_flags & VM_SEQ_READ))) |
887 | referenced++; | 932 | referenced++; |
888 | } | 933 | } |
889 | pte_unmap_unlock(pte, ptl); | 934 | pte_unmap(pte); |
935 | } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { | ||
936 | if (pmdp_clear_flush_young_notify(vma, address, pmd)) | ||
937 | referenced++; | ||
938 | } else { | ||
939 | /* unexpected pmd-mapped page? */ | ||
940 | WARN_ON_ONCE(1); | ||
890 | } | 941 | } |
942 | spin_unlock(ptl); | ||
891 | 943 | ||
892 | if (referenced) | 944 | if (referenced) |
893 | clear_page_idle(page); | 945 | clear_page_idle(page); |
@@ -935,7 +987,7 @@ int page_referenced(struct page *page, | |||
935 | int ret; | 987 | int ret; |
936 | int we_locked = 0; | 988 | int we_locked = 0; |
937 | struct page_referenced_arg pra = { | 989 | struct page_referenced_arg pra = { |
938 | .mapcount = page_mapcount(page), | 990 | .mapcount = total_mapcount(page), |
939 | .memcg = memcg, | 991 | .memcg = memcg, |
940 | }; | 992 | }; |
941 | struct rmap_walk_control rwc = { | 993 | struct rmap_walk_control rwc = { |
@@ -1124,7 +1176,7 @@ static void __page_check_anon_rmap(struct page *page, | |||
1124 | * over the call to page_add_new_anon_rmap. | 1176 | * over the call to page_add_new_anon_rmap. |
1125 | */ | 1177 | */ |
1126 | BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); | 1178 | BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); |
1127 | BUG_ON(page->index != linear_page_index(vma, address)); | 1179 | BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address)); |
1128 | #endif | 1180 | #endif |
1129 | } | 1181 | } |
1130 | 1182 | ||
@@ -1133,6 +1185,7 @@ static void __page_check_anon_rmap(struct page *page, | |||
1133 | * @page: the page to add the mapping to | 1185 | * @page: the page to add the mapping to |
1134 | * @vma: the vm area in which the mapping is added | 1186 | * @vma: the vm area in which the mapping is added |
1135 | * @address: the user virtual address mapped | 1187 | * @address: the user virtual address mapped |
1188 | * @compound: charge the page as compound or small page | ||
1136 | * | 1189 | * |
1137 | * The caller needs to hold the pte lock, and the page must be locked in | 1190 | * The caller needs to hold the pte lock, and the page must be locked in |
1138 | * the anon_vma case: to serialize mapping,index checking after setting, | 1191 | * the anon_vma case: to serialize mapping,index checking after setting, |
@@ -1140,9 +1193,9 @@ static void __page_check_anon_rmap(struct page *page, | |||
1140 | * (but PageKsm is never downgraded to PageAnon). | 1193 | * (but PageKsm is never downgraded to PageAnon). |
1141 | */ | 1194 | */ |
1142 | void page_add_anon_rmap(struct page *page, | 1195 | void page_add_anon_rmap(struct page *page, |
1143 | struct vm_area_struct *vma, unsigned long address) | 1196 | struct vm_area_struct *vma, unsigned long address, bool compound) |
1144 | { | 1197 | { |
1145 | do_page_add_anon_rmap(page, vma, address, 0); | 1198 | do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0); |
1146 | } | 1199 | } |
1147 | 1200 | ||
1148 | /* | 1201 | /* |
@@ -1151,29 +1204,44 @@ void page_add_anon_rmap(struct page *page, | |||
1151 | * Everybody else should continue to use page_add_anon_rmap above. | 1204 | * Everybody else should continue to use page_add_anon_rmap above. |
1152 | */ | 1205 | */ |
1153 | void do_page_add_anon_rmap(struct page *page, | 1206 | void do_page_add_anon_rmap(struct page *page, |
1154 | struct vm_area_struct *vma, unsigned long address, int exclusive) | 1207 | struct vm_area_struct *vma, unsigned long address, int flags) |
1155 | { | 1208 | { |
1156 | int first = atomic_inc_and_test(&page->_mapcount); | 1209 | bool compound = flags & RMAP_COMPOUND; |
1210 | bool first; | ||
1211 | |||
1212 | if (compound) { | ||
1213 | atomic_t *mapcount; | ||
1214 | VM_BUG_ON_PAGE(!PageLocked(page), page); | ||
1215 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); | ||
1216 | mapcount = compound_mapcount_ptr(page); | ||
1217 | first = atomic_inc_and_test(mapcount); | ||
1218 | } else { | ||
1219 | first = atomic_inc_and_test(&page->_mapcount); | ||
1220 | } | ||
1221 | |||
1157 | if (first) { | 1222 | if (first) { |
1223 | int nr = compound ? hpage_nr_pages(page) : 1; | ||
1158 | /* | 1224 | /* |
1159 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | 1225 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because |
1160 | * these counters are not modified in interrupt context, and | 1226 | * these counters are not modified in interrupt context, and |
1161 | * pte lock(a spinlock) is held, which implies preemption | 1227 | * pte lock(a spinlock) is held, which implies preemption |
1162 | * disabled. | 1228 | * disabled. |
1163 | */ | 1229 | */ |
1164 | if (PageTransHuge(page)) | 1230 | if (compound) { |
1165 | __inc_zone_page_state(page, | 1231 | __inc_zone_page_state(page, |
1166 | NR_ANON_TRANSPARENT_HUGEPAGES); | 1232 | NR_ANON_TRANSPARENT_HUGEPAGES); |
1167 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | 1233 | } |
1168 | hpage_nr_pages(page)); | 1234 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); |
1169 | } | 1235 | } |
1170 | if (unlikely(PageKsm(page))) | 1236 | if (unlikely(PageKsm(page))) |
1171 | return; | 1237 | return; |
1172 | 1238 | ||
1173 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 1239 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
1240 | |||
1174 | /* address might be in next vma when migration races vma_adjust */ | 1241 | /* address might be in next vma when migration races vma_adjust */ |
1175 | if (first) | 1242 | if (first) |
1176 | __page_set_anon_rmap(page, vma, address, exclusive); | 1243 | __page_set_anon_rmap(page, vma, address, |
1244 | flags & RMAP_EXCLUSIVE); | ||
1177 | else | 1245 | else |
1178 | __page_check_anon_rmap(page, vma, address); | 1246 | __page_check_anon_rmap(page, vma, address); |
1179 | } | 1247 | } |
@@ -1183,21 +1251,31 @@ void do_page_add_anon_rmap(struct page *page, | |||
1183 | * @page: the page to add the mapping to | 1251 | * @page: the page to add the mapping to |
1184 | * @vma: the vm area in which the mapping is added | 1252 | * @vma: the vm area in which the mapping is added |
1185 | * @address: the user virtual address mapped | 1253 | * @address: the user virtual address mapped |
1254 | * @compound: charge the page as compound or small page | ||
1186 | * | 1255 | * |
1187 | * Same as page_add_anon_rmap but must only be called on *new* pages. | 1256 | * Same as page_add_anon_rmap but must only be called on *new* pages. |
1188 | * This means the inc-and-test can be bypassed. | 1257 | * This means the inc-and-test can be bypassed. |
1189 | * Page does not have to be locked. | 1258 | * Page does not have to be locked. |
1190 | */ | 1259 | */ |
1191 | void page_add_new_anon_rmap(struct page *page, | 1260 | void page_add_new_anon_rmap(struct page *page, |
1192 | struct vm_area_struct *vma, unsigned long address) | 1261 | struct vm_area_struct *vma, unsigned long address, bool compound) |
1193 | { | 1262 | { |
1263 | int nr = compound ? hpage_nr_pages(page) : 1; | ||
1264 | |||
1194 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); | 1265 | VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); |
1195 | SetPageSwapBacked(page); | 1266 | SetPageSwapBacked(page); |
1196 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 1267 | if (compound) { |
1197 | if (PageTransHuge(page)) | 1268 | VM_BUG_ON_PAGE(!PageTransHuge(page), page); |
1269 | /* increment count (starts at -1) */ | ||
1270 | atomic_set(compound_mapcount_ptr(page), 0); | ||
1198 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | 1271 | __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); |
1199 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | 1272 | } else { |
1200 | hpage_nr_pages(page)); | 1273 | /* Anon THP always mapped first with PMD */ |
1274 | VM_BUG_ON_PAGE(PageTransCompound(page), page); | ||
1275 | /* increment count (starts at -1) */ | ||
1276 | atomic_set(&page->_mapcount, 0); | ||
1277 | } | ||
1278 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, nr); | ||
1201 | __page_set_anon_rmap(page, vma, address, 1); | 1279 | __page_set_anon_rmap(page, vma, address, 1); |
1202 | } | 1280 | } |
1203 | 1281 | ||
@@ -1225,12 +1303,15 @@ static void page_remove_file_rmap(struct page *page) | |||
1225 | 1303 | ||
1226 | memcg = mem_cgroup_begin_page_stat(page); | 1304 | memcg = mem_cgroup_begin_page_stat(page); |
1227 | 1305 | ||
1228 | /* page still mapped by someone else? */ | 1306 | /* Hugepages are not counted in NR_FILE_MAPPED for now. */ |
1229 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1307 | if (unlikely(PageHuge(page))) { |
1308 | /* hugetlb pages are always mapped with pmds */ | ||
1309 | atomic_dec(compound_mapcount_ptr(page)); | ||
1230 | goto out; | 1310 | goto out; |
1311 | } | ||
1231 | 1312 | ||
1232 | /* Hugepages are not counted in NR_FILE_MAPPED for now. */ | 1313 | /* page still mapped by someone else? */ |
1233 | if (unlikely(PageHuge(page))) | 1314 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1234 | goto out; | 1315 | goto out; |
1235 | 1316 | ||
1236 | /* | 1317 | /* |
@@ -1247,41 +1328,79 @@ out: | |||
1247 | mem_cgroup_end_page_stat(memcg); | 1328 | mem_cgroup_end_page_stat(memcg); |
1248 | } | 1329 | } |
1249 | 1330 | ||
1331 | static void page_remove_anon_compound_rmap(struct page *page) | ||
1332 | { | ||
1333 | int i, nr; | ||
1334 | |||
1335 | if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) | ||
1336 | return; | ||
1337 | |||
1338 | /* Hugepages are not counted in NR_ANON_PAGES for now. */ | ||
1339 | if (unlikely(PageHuge(page))) | ||
1340 | return; | ||
1341 | |||
1342 | if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) | ||
1343 | return; | ||
1344 | |||
1345 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
1346 | |||
1347 | if (TestClearPageDoubleMap(page)) { | ||
1348 | /* | ||
1349 | * Subpages can be mapped with PTEs too. Check how many of | ||
350 | * them are still mapped. | ||
1351 | */ | ||
1352 | for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) { | ||
1353 | if (atomic_add_negative(-1, &page[i]._mapcount)) | ||
1354 | nr++; | ||
1355 | } | ||
1356 | } else { | ||
1357 | nr = HPAGE_PMD_NR; | ||
1358 | } | ||
1359 | |||
1360 | if (unlikely(PageMlocked(page))) | ||
1361 | clear_page_mlock(page); | ||
1362 | |||
1363 | if (nr) { | ||
1364 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr); | ||
1365 | deferred_split_huge_page(page); | ||
1366 | } | ||
1367 | } | ||
1368 | |||
1250 | /** | 1369 | /** |
1251 | * page_remove_rmap - take down pte mapping from a page | 1370 | * page_remove_rmap - take down pte mapping from a page |
1252 | * @page: page to remove mapping from | 1371 | * @page: page to remove mapping from |
1372 | * @compound: uncharge the page as compound or small page | ||
1253 | * | 1373 | * |
1254 | * The caller needs to hold the pte lock. | 1374 | * The caller needs to hold the pte lock. |
1255 | */ | 1375 | */ |
1256 | void page_remove_rmap(struct page *page) | 1376 | void page_remove_rmap(struct page *page, bool compound) |
1257 | { | 1377 | { |
1258 | if (!PageAnon(page)) { | 1378 | if (!PageAnon(page)) { |
1379 | VM_BUG_ON_PAGE(compound && !PageHuge(page), page); | ||
1259 | page_remove_file_rmap(page); | 1380 | page_remove_file_rmap(page); |
1260 | return; | 1381 | return; |
1261 | } | 1382 | } |
1262 | 1383 | ||
1384 | if (compound) | ||
1385 | return page_remove_anon_compound_rmap(page); | ||
1386 | |||
1263 | /* page still mapped by someone else? */ | 1387 | /* page still mapped by someone else? */ |
1264 | if (!atomic_add_negative(-1, &page->_mapcount)) | 1388 | if (!atomic_add_negative(-1, &page->_mapcount)) |
1265 | return; | 1389 | return; |
1266 | 1390 | ||
1267 | /* Hugepages are not counted in NR_ANON_PAGES for now. */ | ||
1268 | if (unlikely(PageHuge(page))) | ||
1269 | return; | ||
1270 | |||
1271 | /* | 1391 | /* |
1272 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because | 1392 | * We use the irq-unsafe __{inc|mod}_zone_page_stat because |
1273 | * these counters are not modified in interrupt context, and | 1393 | * these counters are not modified in interrupt context, and |
1274 | * pte lock(a spinlock) is held, which implies preemption disabled. | 1394 | * pte lock(a spinlock) is held, which implies preemption disabled. |
1275 | */ | 1395 | */ |
1276 | if (PageTransHuge(page)) | 1396 | __dec_zone_page_state(page, NR_ANON_PAGES); |
1277 | __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); | ||
1278 | |||
1279 | __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, | ||
1280 | -hpage_nr_pages(page)); | ||
1281 | 1397 | ||
1282 | if (unlikely(PageMlocked(page))) | 1398 | if (unlikely(PageMlocked(page))) |
1283 | clear_page_mlock(page); | 1399 | clear_page_mlock(page); |
1284 | 1400 | ||
1401 | if (PageTransCompound(page)) | ||
1402 | deferred_split_huge_page(compound_head(page)); | ||
1403 | |||
1285 | /* | 1404 | /* |
1286 | * It would be tidy to reset the PageAnon mapping here, | 1405 | * It would be tidy to reset the PageAnon mapping here, |
1287 | * but that might overwrite a racing page_add_anon_rmap | 1406 | * but that might overwrite a racing page_add_anon_rmap |
@@ -1293,6 +1412,11 @@ void page_remove_rmap(struct page *page) | |||
1293 | */ | 1412 | */ |
1294 | } | 1413 | } |
1295 | 1414 | ||
1415 | struct rmap_private { | ||
1416 | enum ttu_flags flags; | ||
1417 | int lazyfreed; | ||
1418 | }; | ||
1419 | |||
1296 | /* | 1420 | /* |
1297 | * @arg: enum ttu_flags will be passed to this argument | 1421 | * @arg: enum ttu_flags will be passed to this argument |
1298 | */ | 1422 | */ |
@@ -1304,7 +1428,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1304 | pte_t pteval; | 1428 | pte_t pteval; |
1305 | spinlock_t *ptl; | 1429 | spinlock_t *ptl; |
1306 | int ret = SWAP_AGAIN; | 1430 | int ret = SWAP_AGAIN; |
1307 | enum ttu_flags flags = (enum ttu_flags)arg; | 1431 | struct rmap_private *rp = arg; |
1432 | enum ttu_flags flags = rp->flags; | ||
1308 | 1433 | ||
1309 | /* munlock has nothing to gain from examining un-locked vmas */ | 1434 | /* munlock has nothing to gain from examining un-locked vmas */ |
1310 | if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) | 1435 | if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) |
@@ -1396,6 +1521,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1396 | * See handle_pte_fault() ... | 1521 | * See handle_pte_fault() ... |
1397 | */ | 1522 | */ |
1398 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); | 1523 | VM_BUG_ON_PAGE(!PageSwapCache(page), page); |
1524 | |||
1525 | if (!PageDirty(page) && (flags & TTU_LZFREE)) { | ||
1526 | /* It's a freeable page by MADV_FREE */ | ||
1527 | dec_mm_counter(mm, MM_ANONPAGES); | ||
1528 | rp->lazyfreed++; | ||
1529 | goto discard; | ||
1530 | } | ||
1531 | |||
1399 | if (swap_duplicate(entry) < 0) { | 1532 | if (swap_duplicate(entry) < 0) { |
1400 | set_pte_at(mm, address, pte, pteval); | 1533 | set_pte_at(mm, address, pte, pteval); |
1401 | ret = SWAP_FAIL; | 1534 | ret = SWAP_FAIL; |
@@ -1416,7 +1549,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
1416 | } else | 1549 | } else |
1417 | dec_mm_counter(mm, mm_counter_file(page)); | 1550 | dec_mm_counter(mm, mm_counter_file(page)); |
1418 | 1551 | ||
1419 | page_remove_rmap(page); | 1552 | discard: |
1553 | page_remove_rmap(page, PageHuge(page)); | ||
1420 | page_cache_release(page); | 1554 | page_cache_release(page); |
1421 | 1555 | ||
1422 | out_unmap: | 1556 | out_unmap: |
@@ -1468,9 +1602,14 @@ static int page_not_mapped(struct page *page) | |||
1468 | int try_to_unmap(struct page *page, enum ttu_flags flags) | 1602 | int try_to_unmap(struct page *page, enum ttu_flags flags) |
1469 | { | 1603 | { |
1470 | int ret; | 1604 | int ret; |
1605 | struct rmap_private rp = { | ||
1606 | .flags = flags, | ||
1607 | .lazyfreed = 0, | ||
1608 | }; | ||
1609 | |||
1471 | struct rmap_walk_control rwc = { | 1610 | struct rmap_walk_control rwc = { |
1472 | .rmap_one = try_to_unmap_one, | 1611 | .rmap_one = try_to_unmap_one, |
1473 | .arg = (void *)flags, | 1612 | .arg = &rp, |
1474 | .done = page_not_mapped, | 1613 | .done = page_not_mapped, |
1475 | .anon_lock = page_lock_anon_vma_read, | 1614 | .anon_lock = page_lock_anon_vma_read, |
1476 | }; | 1615 | }; |
@@ -1490,8 +1629,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1490 | 1629 | ||
1491 | ret = rmap_walk(page, &rwc); | 1630 | ret = rmap_walk(page, &rwc); |
1492 | 1631 | ||
1493 | if (ret != SWAP_MLOCK && !page_mapped(page)) | 1632 | if (ret != SWAP_MLOCK && !page_mapped(page)) { |
1494 | ret = SWAP_SUCCESS; | 1633 | ret = SWAP_SUCCESS; |
1634 | if (rp.lazyfreed && !PageDirty(page)) | ||
1635 | ret = SWAP_LZFREE; | ||
1636 | } | ||
1495 | return ret; | 1637 | return ret; |
1496 | } | 1638 | } |
1497 | 1639 | ||
@@ -1513,9 +1655,14 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) | |||
1513 | int try_to_munlock(struct page *page) | 1655 | int try_to_munlock(struct page *page) |
1514 | { | 1656 | { |
1515 | int ret; | 1657 | int ret; |
1658 | struct rmap_private rp = { | ||
1659 | .flags = TTU_MUNLOCK, | ||
1660 | .lazyfreed = 0, | ||
1661 | }; | ||
1662 | |||
1516 | struct rmap_walk_control rwc = { | 1663 | struct rmap_walk_control rwc = { |
1517 | .rmap_one = try_to_unmap_one, | 1664 | .rmap_one = try_to_unmap_one, |
1518 | .arg = (void *)TTU_MUNLOCK, | 1665 | .arg = &rp, |
1519 | .done = page_not_mapped, | 1666 | .done = page_not_mapped, |
1520 | .anon_lock = page_lock_anon_vma_read, | 1667 | .anon_lock = page_lock_anon_vma_read, |
1521 | 1668 | ||
@@ -1698,7 +1845,7 @@ void hugepage_add_anon_rmap(struct page *page, | |||
1698 | BUG_ON(!PageLocked(page)); | 1845 | BUG_ON(!PageLocked(page)); |
1699 | BUG_ON(!anon_vma); | 1846 | BUG_ON(!anon_vma); |
1700 | /* address might be in next vma when migration races vma_adjust */ | 1847 | /* address might be in next vma when migration races vma_adjust */ |
1701 | first = atomic_inc_and_test(&page->_mapcount); | 1848 | first = atomic_inc_and_test(compound_mapcount_ptr(page)); |
1702 | if (first) | 1849 | if (first) |
1703 | __hugepage_set_anon_rmap(page, vma, address, 0); | 1850 | __hugepage_set_anon_rmap(page, vma, address, 0); |
1704 | } | 1851 | } |
@@ -1707,7 +1854,7 @@ void hugepage_add_new_anon_rmap(struct page *page, | |||
1707 | struct vm_area_struct *vma, unsigned long address) | 1854 | struct vm_area_struct *vma, unsigned long address) |
1708 | { | 1855 | { |
1709 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 1856 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
1710 | atomic_set(&page->_mapcount, 0); | 1857 | atomic_set(compound_mapcount_ptr(page), 0); |
1711 | __hugepage_set_anon_rmap(page, vma, address, 1); | 1858 | __hugepage_set_anon_rmap(page, vma, address, 1); |
1712 | } | 1859 | } |
1713 | #endif /* CONFIG_HUGETLB_PAGE */ | 1860 | #endif /* CONFIG_HUGETLB_PAGE */ |
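The net effect of the rmap.c hunks above is that the anon rmap API now carries an explicit compound argument: true accounts a whole PMD-mapped THP through the new compound mapcount, false accounts a single small (or PTE-mapped sub-) page. A minimal caller-side sketch of the new calling convention, using only the signatures visible in this diff; the wrapper names and the omitted fault-path plumbing (allocation, pte/pmd installation, locking) are illustrative, not taken from the patch:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Illustrative wrappers only: how callers are expected to use the
 * reworked rmap API. Locking and pte/pmd setup are omitted.
 */
static void account_new_anon_page(struct page *page,
                                  struct vm_area_struct *vma,
                                  unsigned long address, bool is_thp)
{
        /* brand-new page: the inc-and-test is bypassed, see above */
        page_add_new_anon_rmap(page, vma, address, is_thp);
}

static void unaccount_anon_page(struct page *page, bool was_pmd_mapped)
{
        /* true drops the compound mapcount, false the per-subpage one */
        page_remove_rmap(page, was_pmd_mapped);
}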
diff --git a/mm/shmem.c b/mm/shmem.c index 970ff5b80853..b98e1011858c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -810,7 +810,8 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
810 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). | 810 | * the shmem_swaplist_mutex which might hold up shmem_writepage(). |
811 | * Charged back to the user (not to caller) when swap account is used. | 811 | * Charged back to the user (not to caller) when swap account is used. |
812 | */ | 812 | */ |
813 | error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); | 813 | error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg, |
814 | false); | ||
814 | if (error) | 815 | if (error) |
815 | goto out; | 816 | goto out; |
816 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ | 817 | /* No radix_tree_preload: swap entry keeps a place for page in tree */ |
@@ -833,9 +834,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
833 | if (error) { | 834 | if (error) { |
834 | if (error != -ENOMEM) | 835 | if (error != -ENOMEM) |
835 | error = 0; | 836 | error = 0; |
836 | mem_cgroup_cancel_charge(page, memcg); | 837 | mem_cgroup_cancel_charge(page, memcg, false); |
837 | } else | 838 | } else |
838 | mem_cgroup_commit_charge(page, memcg, true); | 839 | mem_cgroup_commit_charge(page, memcg, true, false); |
839 | out: | 840 | out: |
840 | unlock_page(page); | 841 | unlock_page(page); |
841 | page_cache_release(page); | 842 | page_cache_release(page); |
@@ -1085,7 +1086,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1085 | copy_highpage(newpage, oldpage); | 1086 | copy_highpage(newpage, oldpage); |
1086 | flush_dcache_page(newpage); | 1087 | flush_dcache_page(newpage); |
1087 | 1088 | ||
1088 | __set_page_locked(newpage); | 1089 | __SetPageLocked(newpage); |
1089 | SetPageUptodate(newpage); | 1090 | SetPageUptodate(newpage); |
1090 | SetPageSwapBacked(newpage); | 1091 | SetPageSwapBacked(newpage); |
1091 | set_page_private(newpage, swap_index); | 1092 | set_page_private(newpage, swap_index); |
@@ -1218,7 +1219,8 @@ repeat: | |||
1218 | goto failed; | 1219 | goto failed; |
1219 | } | 1220 | } |
1220 | 1221 | ||
1221 | error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); | 1222 | error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, |
1223 | false); | ||
1222 | if (!error) { | 1224 | if (!error) { |
1223 | error = shmem_add_to_page_cache(page, mapping, index, | 1225 | error = shmem_add_to_page_cache(page, mapping, index, |
1224 | swp_to_radix_entry(swap)); | 1226 | swp_to_radix_entry(swap)); |
@@ -1235,14 +1237,14 @@ repeat: | |||
1235 | * "repeat": reading a hole and writing should succeed. | 1237 | * "repeat": reading a hole and writing should succeed. |
1236 | */ | 1238 | */ |
1237 | if (error) { | 1239 | if (error) { |
1238 | mem_cgroup_cancel_charge(page, memcg); | 1240 | mem_cgroup_cancel_charge(page, memcg, false); |
1239 | delete_from_swap_cache(page); | 1241 | delete_from_swap_cache(page); |
1240 | } | 1242 | } |
1241 | } | 1243 | } |
1242 | if (error) | 1244 | if (error) |
1243 | goto failed; | 1245 | goto failed; |
1244 | 1246 | ||
1245 | mem_cgroup_commit_charge(page, memcg, true); | 1247 | mem_cgroup_commit_charge(page, memcg, true, false); |
1246 | 1248 | ||
1247 | spin_lock(&info->lock); | 1249 | spin_lock(&info->lock); |
1248 | info->swapped--; | 1250 | info->swapped--; |
@@ -1277,11 +1279,12 @@ repeat: | |||
1277 | } | 1279 | } |
1278 | 1280 | ||
1279 | __SetPageSwapBacked(page); | 1281 | __SetPageSwapBacked(page); |
1280 | __set_page_locked(page); | 1282 | __SetPageLocked(page); |
1281 | if (sgp == SGP_WRITE) | 1283 | if (sgp == SGP_WRITE) |
1282 | __SetPageReferenced(page); | 1284 | __SetPageReferenced(page); |
1283 | 1285 | ||
1284 | error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); | 1286 | error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, |
1287 | false); | ||
1285 | if (error) | 1288 | if (error) |
1286 | goto decused; | 1289 | goto decused; |
1287 | error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); | 1290 | error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); |
@@ -1291,10 +1294,10 @@ repeat: | |||
1291 | radix_tree_preload_end(); | 1294 | radix_tree_preload_end(); |
1292 | } | 1295 | } |
1293 | if (error) { | 1296 | if (error) { |
1294 | mem_cgroup_cancel_charge(page, memcg); | 1297 | mem_cgroup_cancel_charge(page, memcg, false); |
1295 | goto decused; | 1298 | goto decused; |
1296 | } | 1299 | } |
1297 | mem_cgroup_commit_charge(page, memcg, false); | 1300 | mem_cgroup_commit_charge(page, memcg, false, false); |
1298 | lru_cache_add_anon(page); | 1301 | lru_cache_add_anon(page); |
1299 | 1302 | ||
1300 | spin_lock(&info->lock); | 1303 | spin_lock(&info->lock); |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -338,11 +338,13 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
338 | */ | 338 | */ |
339 | static __always_inline void slab_lock(struct page *page) | 339 | static __always_inline void slab_lock(struct page *page) |
340 | { | 340 | { |
341 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
341 | bit_spin_lock(PG_locked, &page->flags); | 342 | bit_spin_lock(PG_locked, &page->flags); |
342 | } | 343 | } |
343 | 344 | ||
344 | static __always_inline void slab_unlock(struct page *page) | 345 | static __always_inline void slab_unlock(struct page *page) |
345 | { | 346 | { |
347 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
346 | __bit_spin_unlock(PG_locked, &page->flags); | 348 | __bit_spin_unlock(PG_locked, &page->flags); |
347 | } | 349 | } |
348 | 350 | ||
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4cba9c2783a1..b60802b3e5ea 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/mm.h> | 20 | #include <linux/mm.h> |
21 | #include <linux/mmzone.h> | 21 | #include <linux/mmzone.h> |
22 | #include <linux/bootmem.h> | 22 | #include <linux/bootmem.h> |
23 | #include <linux/memremap.h> | ||
23 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
24 | #include <linux/slab.h> | 25 | #include <linux/slab.h> |
25 | #include <linux/spinlock.h> | 26 | #include <linux/spinlock.h> |
@@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
70 | } | 71 | } |
71 | 72 | ||
72 | /* need to make sure size is all the same during early stage */ | 73 | /* need to make sure size is all the same during early stage */ |
73 | void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) | 74 | static void * __meminit alloc_block_buf(unsigned long size, int node) |
74 | { | 75 | { |
75 | void *ptr; | 76 | void *ptr; |
76 | 77 | ||
@@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) | |||
87 | return ptr; | 88 | return ptr; |
88 | } | 89 | } |
89 | 90 | ||
91 | static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) | ||
92 | { | ||
93 | return altmap->base_pfn + altmap->reserve + altmap->alloc | ||
94 | + altmap->align; | ||
95 | } | ||
96 | |||
97 | static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap) | ||
98 | { | ||
99 | unsigned long allocated = altmap->alloc + altmap->align; | ||
100 | |||
101 | if (altmap->free > allocated) | ||
102 | return altmap->free - allocated; | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | /** | ||
107 | * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation | ||
108 | * @altmap - reserved page pool for the allocation | ||
109 | * @nr_pfns - size (in pages) of the allocation | ||
110 | * | ||
111 | * Allocations are aligned to the size of the request | ||
112 | */ | ||
113 | static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap, | ||
114 | unsigned long nr_pfns) | ||
115 | { | ||
116 | unsigned long pfn = vmem_altmap_next_pfn(altmap); | ||
117 | unsigned long nr_align; | ||
118 | |||
119 | nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG); | ||
120 | nr_align = ALIGN(pfn, nr_align) - pfn; | ||
121 | |||
122 | if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) | ||
123 | return ULONG_MAX; | ||
124 | altmap->alloc += nr_pfns; | ||
125 | altmap->align += nr_align; | ||
126 | return pfn + nr_align; | ||
127 | } | ||
128 | |||
129 | static void * __meminit altmap_alloc_block_buf(unsigned long size, | ||
130 | struct vmem_altmap *altmap) | ||
131 | { | ||
132 | unsigned long pfn, nr_pfns; | ||
133 | void *ptr; | ||
134 | |||
135 | if (size & ~PAGE_MASK) { | ||
136 | pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n", | ||
137 | __func__, size); | ||
138 | return NULL; | ||
139 | } | ||
140 | |||
141 | nr_pfns = size >> PAGE_SHIFT; | ||
142 | pfn = vmem_altmap_alloc(altmap, nr_pfns); | ||
143 | if (pfn < ULONG_MAX) | ||
144 | ptr = __va(__pfn_to_phys(pfn)); | ||
145 | else | ||
146 | ptr = NULL; | ||
147 | pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n", | ||
148 | __func__, pfn, altmap->alloc, altmap->align, nr_pfns); | ||
149 | |||
150 | return ptr; | ||
151 | } | ||
152 | |||
153 | /* need to make sure size is all the same during early stage */ | ||
154 | void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node, | ||
155 | struct vmem_altmap *altmap) | ||
156 | { | ||
157 | if (altmap) | ||
158 | return altmap_alloc_block_buf(size, altmap); | ||
159 | return alloc_block_buf(size, node); | ||
160 | } | ||
161 | |||
90 | void __meminit vmemmap_verify(pte_t *pte, int node, | 162 | void __meminit vmemmap_verify(pte_t *pte, int node, |
91 | unsigned long start, unsigned long end) | 163 | unsigned long start, unsigned long end) |
92 | { | 164 | { |
@@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | |||
103 | pte_t *pte = pte_offset_kernel(pmd, addr); | 175 | pte_t *pte = pte_offset_kernel(pmd, addr); |
104 | if (pte_none(*pte)) { | 176 | if (pte_none(*pte)) { |
105 | pte_t entry; | 177 | pte_t entry; |
106 | void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); | 178 | void *p = alloc_block_buf(PAGE_SIZE, node); |
107 | if (!p) | 179 | if (!p) |
108 | return NULL; | 180 | return NULL; |
109 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | 181 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
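vmem_altmap_alloc() above aligns each request to the largest power of two that divides its size: find_first_bit() on nr_pfns returns the lowest set bit, and the altmap cursor is padded up to that boundary before the allocation is recorded in alloc/align. A small standalone illustration of the same arithmetic, in plain user-space C with made-up numbers (__builtin_ctzl stands in for find_first_bit):

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
        unsigned long next_free_pfn = 0x12345; /* hypothetical altmap cursor */
        unsigned long nr_pfns = 0x200;         /* a 2MB request in 4K pages */

        /* largest power-of-two factor of nr_pfns (its lowest set bit) */
        unsigned long nr_align = 1UL << __builtin_ctzl(nr_pfns);

        /* pad up to that boundary, as vmem_altmap_alloc() does */
        unsigned long pad = ALIGN(next_free_pfn, nr_align) - next_free_pfn;

        printf("allocation starts at pfn %#lx after %lu pfns of padding\n",
               next_free_pfn + pad, pad);
        return 0;
}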
diff --git a/mm/sparse.c b/mm/sparse.c index d1b48b691ac8..3717ceed4177 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) | |||
748 | if (!memmap) | 748 | if (!memmap) |
749 | return; | 749 | return; |
750 | 750 | ||
751 | for (i = 0; i < PAGES_PER_SECTION; i++) { | 751 | for (i = 0; i < nr_pages; i++) { |
752 | if (PageHWPoison(&memmap[i])) { | 752 | if (PageHWPoison(&memmap[i])) { |
753 | atomic_long_sub(1, &num_poisoned_pages); | 753 | atomic_long_sub(1, &num_poisoned_pages); |
754 | ClearPageHWPoison(&memmap[i]); | 754 | ClearPageHWPoison(&memmap[i]); |
@@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) | |||
788 | free_map_bootmem(memmap); | 788 | free_map_bootmem(memmap); |
789 | } | 789 | } |
790 | 790 | ||
791 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | 791 | void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, |
792 | unsigned long map_offset) | ||
792 | { | 793 | { |
793 | struct page *memmap = NULL; | 794 | struct page *memmap = NULL; |
794 | unsigned long *usemap = NULL, flags; | 795 | unsigned long *usemap = NULL, flags; |
@@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) | |||
804 | } | 805 | } |
805 | pgdat_resize_unlock(pgdat, &flags); | 806 | pgdat_resize_unlock(pgdat, &flags); |
806 | 807 | ||
807 | clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); | 808 | clear_hwpoisoned_pages(memmap + map_offset, |
809 | PAGES_PER_SECTION - map_offset); | ||
808 | free_section_usemap(memmap, usemap); | 810 | free_section_usemap(memmap, usemap); |
809 | } | 811 | } |
810 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 812 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -24,6 +24,7 @@ | |||
24 | #include <linux/export.h> | 24 | #include <linux/export.h> |
25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
26 | #include <linux/percpu_counter.h> | 26 | #include <linux/percpu_counter.h> |
27 | #include <linux/memremap.h> | ||
27 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
28 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
29 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
@@ -45,6 +46,7 @@ int page_cluster; | |||
45 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); | 46 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); |
46 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); | 47 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); |
47 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); | 48 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); |
49 | static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); | ||
48 | 50 | ||
49 | /* | 51 | /* |
50 | * This path almost never happens for VM activity - pages are normally | 52 | * This path almost never happens for VM activity - pages are normally |
@@ -89,260 +91,14 @@ static void __put_compound_page(struct page *page) | |||
89 | (*dtor)(page); | 91 | (*dtor)(page); |
90 | } | 92 | } |
91 | 93 | ||
92 | /** | 94 | void __put_page(struct page *page) |
93 | * Two special cases here: we could avoid taking compound_lock_irqsave | ||
94 | * and could skip the tail refcounting(in _mapcount). | ||
95 | * | ||
96 | * 1. Hugetlbfs page: | ||
97 | * | ||
98 | * PageHeadHuge will remain true until the compound page | ||
99 | * is released and enters the buddy allocator, and it could | ||
100 | * not be split by __split_huge_page_refcount(). | ||
101 | * | ||
102 | * So if we see PageHeadHuge set, and we have the tail page pin, | ||
103 | * then we could safely put head page. | ||
104 | * | ||
105 | * 2. Slab THP page: | ||
106 | * | ||
107 | * PG_slab is cleared before the slab frees the head page, and | ||
108 | * tail pin cannot be the last reference left on the head page, | ||
109 | * because the slab code is free to reuse the compound page | ||
110 | * after a kfree/kmem_cache_free without having to check if | ||
111 | * there's any tail pin left. In turn all tail pins must always be | ||
112 | * released while the head is still pinned by the slab code | ||
113 | * and so we know PG_slab will be still set too. | ||
114 | * | ||
115 | * So if we see PageSlab set, and we have the tail page pin, | ||
116 | * then we could safely put head page. | ||
117 | */ | ||
118 | static __always_inline | ||
119 | void put_unrefcounted_compound_page(struct page *page_head, struct page *page) | ||
120 | { | ||
121 | /* | ||
122 | * If @page is a THP tail, we must read the tail page | ||
123 | * flags after the head page flags. The | ||
124 | * __split_huge_page_refcount side enforces write memory barriers | ||
125 | * between clearing PageTail and before the head page | ||
126 | * can be freed and reallocated. | ||
127 | */ | ||
128 | smp_rmb(); | ||
129 | if (likely(PageTail(page))) { | ||
130 | /* | ||
131 | * __split_huge_page_refcount cannot race | ||
132 | * here, see the comment above this function. | ||
133 | */ | ||
134 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); | ||
135 | if (put_page_testzero(page_head)) { | ||
136 | /* | ||
137 | * If this is the tail of a slab THP page, | ||
138 | * the tail pin must not be the last reference | ||
139 | * held on the page, because the PG_slab cannot | ||
140 | * be cleared before all tail pins (which skips | ||
141 | * the _mapcount tail refcounting) have been | ||
142 | * released. | ||
143 | * | ||
144 | * If this is the tail of a hugetlbfs page, | ||
145 | * the tail pin may be the last reference on | ||
146 | * the page instead, because PageHeadHuge will | ||
147 | * not go away until the compound page enters | ||
148 | * the buddy allocator. | ||
149 | */ | ||
150 | VM_BUG_ON_PAGE(PageSlab(page_head), page_head); | ||
151 | __put_compound_page(page_head); | ||
152 | } | ||
153 | } else | ||
154 | /* | ||
155 | * __split_huge_page_refcount run before us, | ||
156 | * @page was a THP tail. The split @page_head | ||
157 | * has been freed and reallocated as slab or | ||
158 | * hugetlbfs page of smaller order (only | ||
159 | * possible if reallocated as slab on x86). | ||
160 | */ | ||
161 | if (put_page_testzero(page)) | ||
162 | __put_single_page(page); | ||
163 | } | ||
164 | |||
165 | static __always_inline | ||
166 | void put_refcounted_compound_page(struct page *page_head, struct page *page) | ||
167 | { | ||
168 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
169 | unsigned long flags; | ||
170 | |||
171 | /* | ||
172 | * @page_head wasn't a dangling pointer but it may not | ||
173 | * be a head page anymore by the time we obtain the | ||
174 | * lock. That is ok as long as it can't be freed from | ||
175 | * under us. | ||
176 | */ | ||
177 | flags = compound_lock_irqsave(page_head); | ||
178 | if (unlikely(!PageTail(page))) { | ||
179 | /* __split_huge_page_refcount run before us */ | ||
180 | compound_unlock_irqrestore(page_head, flags); | ||
181 | if (put_page_testzero(page_head)) { | ||
182 | /* | ||
183 | * The @page_head may have been freed | ||
184 | * and reallocated as a compound page | ||
185 | * of smaller order and then freed | ||
186 | * again. All we know is that it | ||
187 | * cannot have become: a THP page, a | ||
188 | * compound page of higher order, a | ||
189 | * tail page. That is because we | ||
190 | * still hold the refcount of the | ||
191 | * split THP tail and page_head was | ||
192 | * the THP head before the split. | ||
193 | */ | ||
194 | if (PageHead(page_head)) | ||
195 | __put_compound_page(page_head); | ||
196 | else | ||
197 | __put_single_page(page_head); | ||
198 | } | ||
199 | out_put_single: | ||
200 | if (put_page_testzero(page)) | ||
201 | __put_single_page(page); | ||
202 | return; | ||
203 | } | ||
204 | VM_BUG_ON_PAGE(page_head != compound_head(page), page); | ||
205 | /* | ||
206 | * We can release the refcount taken by | ||
207 | * get_page_unless_zero() now that | ||
208 | * __split_huge_page_refcount() is blocked on the | ||
209 | * compound_lock. | ||
210 | */ | ||
211 | if (put_page_testzero(page_head)) | ||
212 | VM_BUG_ON_PAGE(1, page_head); | ||
213 | /* __split_huge_page_refcount will wait now */ | ||
214 | VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); | ||
215 | atomic_dec(&page->_mapcount); | ||
216 | VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); | ||
217 | VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); | ||
218 | compound_unlock_irqrestore(page_head, flags); | ||
219 | |||
220 | if (put_page_testzero(page_head)) { | ||
221 | if (PageHead(page_head)) | ||
222 | __put_compound_page(page_head); | ||
223 | else | ||
224 | __put_single_page(page_head); | ||
225 | } | ||
226 | } else { | ||
227 | /* @page_head is a dangling pointer */ | ||
228 | VM_BUG_ON_PAGE(PageTail(page), page); | ||
229 | goto out_put_single; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | static void put_compound_page(struct page *page) | ||
234 | { | ||
235 | struct page *page_head; | ||
236 | |||
237 | /* | ||
238 | * We see the PageCompound set and PageTail not set, so @page maybe: | ||
239 | * 1. hugetlbfs head page, or | ||
240 | * 2. THP head page. | ||
241 | */ | ||
242 | if (likely(!PageTail(page))) { | ||
243 | if (put_page_testzero(page)) { | ||
244 | /* | ||
245 | * By the time all refcounts have been released | ||
246 | * split_huge_page cannot run anymore from under us. | ||
247 | */ | ||
248 | if (PageHead(page)) | ||
249 | __put_compound_page(page); | ||
250 | else | ||
251 | __put_single_page(page); | ||
252 | } | ||
253 | return; | ||
254 | } | ||
255 | |||
256 | /* | ||
257 | * We see the PageCompound set and PageTail set, so @page maybe: | ||
258 | * 1. a tail hugetlbfs page, or | ||
259 | * 2. a tail THP page, or | ||
260 | * 3. a split THP page. | ||
261 | * | ||
262 | * Case 3 is possible, as we may race with | ||
263 | * __split_huge_page_refcount tearing down a THP page. | ||
264 | */ | ||
265 | page_head = compound_head(page); | ||
266 | if (!__compound_tail_refcounted(page_head)) | ||
267 | put_unrefcounted_compound_page(page_head, page); | ||
268 | else | ||
269 | put_refcounted_compound_page(page_head, page); | ||
270 | } | ||
271 | |||
272 | void put_page(struct page *page) | ||
273 | { | 95 | { |
274 | if (unlikely(PageCompound(page))) | 96 | if (unlikely(PageCompound(page))) |
275 | put_compound_page(page); | 97 | __put_compound_page(page); |
276 | else if (put_page_testzero(page)) | 98 | else |
277 | __put_single_page(page); | 99 | __put_single_page(page); |
278 | } | 100 | } |
279 | EXPORT_SYMBOL(put_page); | 101 | EXPORT_SYMBOL(__put_page); |
280 | |||
281 | /* | ||
282 | * This function is exported but must not be called by anything other | ||
283 | * than get_page(). It implements the slow path of get_page(). | ||
284 | */ | ||
285 | bool __get_page_tail(struct page *page) | ||
286 | { | ||
287 | /* | ||
288 | * This takes care of get_page() if run on a tail page | ||
289 | * returned by one of the get_user_pages/follow_page variants. | ||
290 | * get_user_pages/follow_page itself doesn't need the compound | ||
291 | * lock because it runs __get_page_tail_foll() under the | ||
292 | * proper PT lock that already serializes against | ||
293 | * split_huge_page(). | ||
294 | */ | ||
295 | unsigned long flags; | ||
296 | bool got; | ||
297 | struct page *page_head = compound_head(page); | ||
298 | |||
299 | /* Ref to put_compound_page() comment. */ | ||
300 | if (!__compound_tail_refcounted(page_head)) { | ||
301 | smp_rmb(); | ||
302 | if (likely(PageTail(page))) { | ||
303 | /* | ||
304 | * This is a hugetlbfs page or a slab | ||
305 | * page. __split_huge_page_refcount | ||
306 | * cannot race here. | ||
307 | */ | ||
308 | VM_BUG_ON_PAGE(!PageHead(page_head), page_head); | ||
309 | __get_page_tail_foll(page, true); | ||
310 | return true; | ||
311 | } else { | ||
312 | /* | ||
313 | * __split_huge_page_refcount run | ||
314 | * before us, "page" was a THP | ||
315 | * tail. The split page_head has been | ||
316 | * freed and reallocated as slab or | ||
317 | * hugetlbfs page of smaller order | ||
318 | * (only possible if reallocated as | ||
319 | * slab on x86). | ||
320 | */ | ||
321 | return false; | ||
322 | } | ||
323 | } | ||
324 | |||
325 | got = false; | ||
326 | if (likely(page != page_head && get_page_unless_zero(page_head))) { | ||
327 | /* | ||
328 | * page_head wasn't a dangling pointer but it | ||
329 | * may not be a head page anymore by the time | ||
330 | * we obtain the lock. That is ok as long as it | ||
331 | * can't be freed from under us. | ||
332 | */ | ||
333 | flags = compound_lock_irqsave(page_head); | ||
334 | /* here __split_huge_page_refcount won't run anymore */ | ||
335 | if (likely(PageTail(page))) { | ||
336 | __get_page_tail_foll(page, false); | ||
337 | got = true; | ||
338 | } | ||
339 | compound_unlock_irqrestore(page_head, flags); | ||
340 | if (unlikely(!got)) | ||
341 | put_page(page_head); | ||
342 | } | ||
343 | return got; | ||
344 | } | ||
345 | EXPORT_SYMBOL(__get_page_tail); | ||
346 | 102 | ||
347 | /** | 103 | /** |
348 | * put_pages_list() - release a list of pages | 104 | * put_pages_list() - release a list of pages |
@@ -604,6 +360,7 @@ static void __lru_cache_activate_page(struct page *page) | |||
604 | */ | 360 | */ |
605 | void mark_page_accessed(struct page *page) | 361 | void mark_page_accessed(struct page *page) |
606 | { | 362 | { |
363 | page = compound_head(page); | ||
607 | if (!PageActive(page) && !PageUnevictable(page) && | 364 | if (!PageActive(page) && !PageUnevictable(page) && |
608 | PageReferenced(page)) { | 365 | PageReferenced(page)) { |
609 | 366 | ||
@@ -799,6 +556,24 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, | |||
799 | update_page_reclaim_stat(lruvec, file, 0); | 556 | update_page_reclaim_stat(lruvec, file, 0); |
800 | } | 557 | } |
801 | 558 | ||
559 | |||
560 | static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, | ||
561 | void *arg) | ||
562 | { | ||
563 | if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { | ||
564 | int file = page_is_file_cache(page); | ||
565 | int lru = page_lru_base_type(page); | ||
566 | |||
567 | del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); | ||
568 | ClearPageActive(page); | ||
569 | ClearPageReferenced(page); | ||
570 | add_page_to_lru_list(page, lruvec, lru); | ||
571 | |||
572 | __count_vm_event(PGDEACTIVATE); | ||
573 | update_page_reclaim_stat(lruvec, file, 0); | ||
574 | } | ||
575 | } | ||
576 | |||
802 | /* | 577 | /* |
803 | * Drain pages out of the cpu's pagevecs. | 578 | * Drain pages out of the cpu's pagevecs. |
804 | * Either "cpu" is the current CPU, and preemption has already been | 579 | * Either "cpu" is the current CPU, and preemption has already been |
@@ -825,6 +600,10 @@ void lru_add_drain_cpu(int cpu) | |||
825 | if (pagevec_count(pvec)) | 600 | if (pagevec_count(pvec)) |
826 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); | 601 | pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); |
827 | 602 | ||
603 | pvec = &per_cpu(lru_deactivate_pvecs, cpu); | ||
604 | if (pagevec_count(pvec)) | ||
605 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | ||
606 | |||
828 | activate_page_drain(cpu); | 607 | activate_page_drain(cpu); |
829 | } | 608 | } |
830 | 609 | ||
@@ -854,6 +633,26 @@ void deactivate_file_page(struct page *page) | |||
854 | } | 633 | } |
855 | } | 634 | } |
856 | 635 | ||
636 | /** | ||
637 | * deactivate_page - deactivate a page | ||
638 | * @page: page to deactivate | ||
639 | * | ||
640 | * deactivate_page() moves @page to the inactive list if @page was on the active | ||
641 | * list and was not an unevictable page. This is done to accelerate the reclaim | ||
642 | * of @page. | ||
643 | */ | ||
644 | void deactivate_page(struct page *page) | ||
645 | { | ||
646 | if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { | ||
647 | struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); | ||
648 | |||
649 | page_cache_get(page); | ||
650 | if (!pagevec_add(pvec, page)) | ||
651 | pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); | ||
652 | put_cpu_var(lru_deactivate_pvecs); | ||
653 | } | ||
654 | } | ||
655 | |||
857 | void lru_add_drain(void) | 656 | void lru_add_drain(void) |
858 | { | 657 | { |
859 | lru_add_drain_cpu(get_cpu()); | 658 | lru_add_drain_cpu(get_cpu()); |
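deactivate_page() and its per-cpu lru_deactivate_pvecs give callers a cheap way to push a still-mapped page straight onto the inactive list; in this series that is what the MADV_FREE path is expected to use so that lazily freed pages are reclaimed (or simply dropped) ahead of the working set. A hedged sketch of such a caller; the function name is invented, and the real madvise path does more locking and pte work than shown:

#include <linux/mm.h>
#include <linux/swap.h>

/* Illustrative only: mark one mapped anon page as lazily freeable. */
static void lazyfree_one_page(struct page *page)
{
        ClearPageDirty(page);   /* clean pages can be discarded, not swapped */
        deactivate_page(page);  /* batches onto lru_deactivate_pvecs, above */
}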
@@ -883,6 +682,7 @@ void lru_add_drain_all(void) | |||
883 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || | 682 | if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || |
884 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || | 683 | pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || |
885 | pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || | 684 | pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || |
685 | pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || | ||
886 | need_activate_page_drain(cpu)) { | 686 | need_activate_page_drain(cpu)) { |
887 | INIT_WORK(work, lru_add_drain_per_cpu); | 687 | INIT_WORK(work, lru_add_drain_per_cpu); |
888 | schedule_work_on(cpu, work); | 688 | schedule_work_on(cpu, work); |
@@ -918,15 +718,6 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
918 | for (i = 0; i < nr; i++) { | 718 | for (i = 0; i < nr; i++) { |
919 | struct page *page = pages[i]; | 719 | struct page *page = pages[i]; |
920 | 720 | ||
921 | if (unlikely(PageCompound(page))) { | ||
922 | if (zone) { | ||
923 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
924 | zone = NULL; | ||
925 | } | ||
926 | put_compound_page(page); | ||
927 | continue; | ||
928 | } | ||
929 | |||
930 | /* | 721 | /* |
931 | * Make sure the IRQ-safe lock-holding time does not get | 722 | * Make sure the IRQ-safe lock-holding time does not get |
932 | * excessive with a continuous string of pages from the | 723 | * excessive with a continuous string of pages from the |
@@ -937,9 +728,19 @@ void release_pages(struct page **pages, int nr, bool cold) | |||
937 | zone = NULL; | 728 | zone = NULL; |
938 | } | 729 | } |
939 | 730 | ||
731 | page = compound_head(page); | ||
940 | if (!put_page_testzero(page)) | 732 | if (!put_page_testzero(page)) |
941 | continue; | 733 | continue; |
942 | 734 | ||
735 | if (PageCompound(page)) { | ||
736 | if (zone) { | ||
737 | spin_unlock_irqrestore(&zone->lru_lock, flags); | ||
738 | zone = NULL; | ||
739 | } | ||
740 | __put_compound_page(page); | ||
741 | continue; | ||
742 | } | ||
743 | |||
943 | if (PageLRU(page)) { | 744 | if (PageLRU(page)) { |
944 | struct zone *pagezone = page_zone(page); | 745 | struct zone *pagezone = page_zone(page); |
945 | 746 | ||
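With tail-page refcounting gone, every get and put lands on the head page, so the old put_compound_page()/__get_page_tail() machinery is simply deleted and only __put_page() remains as the exported slow path. The inline fast path lives in include/linux/mm.h and is not part of this hunk, but given the new export it presumably reduces to something close to the following sketch (an assumption, not quoted from the patch):

/* Assumed shape of the inline fast path after this rework: all
 * references are taken and dropped on the head page.
 */
static inline void put_page(struct page *page)
{
        page = compound_head(page);
        if (put_page_testzero(page))
                __put_page(page);
}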
diff --git a/mm/swap_state.c b/mm/swap_state.c index d504adb7fa5f..676ff2991380 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -185,13 +185,12 @@ int add_to_swap(struct page *page, struct list_head *list) | |||
185 | * deadlock in the swap out path. | 185 | * deadlock in the swap out path. |
186 | */ | 186 | */ |
187 | /* | 187 | /* |
188 | * Add it to the swap cache and mark it dirty | 188 | * Add it to the swap cache. |
189 | */ | 189 | */ |
190 | err = add_to_swap_cache(page, entry, | 190 | err = add_to_swap_cache(page, entry, |
191 | __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); | 191 | __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); |
192 | 192 | ||
193 | if (!err) { /* Success */ | 193 | if (!err) { |
194 | SetPageDirty(page); | ||
195 | return 1; | 194 | return 1; |
196 | } else { /* -ENOMEM radix-tree allocation failure */ | 195 | } else { /* -ENOMEM radix-tree allocation failure */ |
197 | /* | 196 | /* |
@@ -353,7 +352,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
353 | } | 352 | } |
354 | 353 | ||
355 | /* May fail (-ENOMEM) if radix-tree node allocation failed. */ | 354 | /* May fail (-ENOMEM) if radix-tree node allocation failed. */ |
356 | __set_page_locked(new_page); | 355 | __SetPageLocked(new_page); |
357 | SetPageSwapBacked(new_page); | 356 | SetPageSwapBacked(new_page); |
358 | err = __add_to_swap_cache(new_page, entry); | 357 | err = __add_to_swap_cache(new_page, entry); |
359 | if (likely(!err)) { | 358 | if (likely(!err)) { |
@@ -367,7 +366,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, | |||
367 | } | 366 | } |
368 | radix_tree_preload_end(); | 367 | radix_tree_preload_end(); |
369 | ClearPageSwapBacked(new_page); | 368 | ClearPageSwapBacked(new_page); |
370 | __clear_page_locked(new_page); | 369 | __ClearPageLocked(new_page); |
371 | /* | 370 | /* |
372 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely | 371 | * add_to_swap_cache() doesn't return -EEXIST, so we can safely |
373 | * clear SWAP_HAS_CACHE flag. | 372 | * clear SWAP_HAS_CACHE flag. |
diff --git a/mm/swapfile.c b/mm/swapfile.c index e6b8591a3ed2..2bb30aa3a412 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -926,6 +926,9 @@ int reuse_swap_page(struct page *page) | |||
926 | VM_BUG_ON_PAGE(!PageLocked(page), page); | 926 | VM_BUG_ON_PAGE(!PageLocked(page), page); |
927 | if (unlikely(PageKsm(page))) | 927 | if (unlikely(PageKsm(page))) |
928 | return 0; | 928 | return 0; |
929 | /* The page is part of THP and cannot be reused */ | ||
930 | if (PageTransCompound(page)) | ||
931 | return 0; | ||
929 | count = page_mapcount(page); | 932 | count = page_mapcount(page); |
930 | if (count <= 1 && PageSwapCache(page)) { | 933 | if (count <= 1 && PageSwapCache(page)) { |
931 | count += page_swapcount(page); | 934 | count += page_swapcount(page); |
@@ -1108,19 +1111,9 @@ unsigned int count_swap_pages(int type, int free) | |||
1108 | } | 1111 | } |
1109 | #endif /* CONFIG_HIBERNATION */ | 1112 | #endif /* CONFIG_HIBERNATION */ |
1110 | 1113 | ||
1111 | static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) | 1114 | static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) |
1112 | { | 1115 | { |
1113 | #ifdef CONFIG_MEM_SOFT_DIRTY | 1116 | return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); |
1114 | /* | ||
1115 | * When pte keeps soft dirty bit the pte generated | ||
1116 | * from swap entry does not has it, still it's same | ||
1117 | * pte from logical point of view. | ||
1118 | */ | ||
1119 | pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); | ||
1120 | return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); | ||
1121 | #else | ||
1122 | return pte_same(pte, swp_pte); | ||
1123 | #endif | ||
1124 | } | 1117 | } |
1125 | 1118 | ||
1126 | /* | 1119 | /* |
@@ -1142,14 +1135,15 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
1142 | if (unlikely(!page)) | 1135 | if (unlikely(!page)) |
1143 | return -ENOMEM; | 1136 | return -ENOMEM; |
1144 | 1137 | ||
1145 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { | 1138 | if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, |
1139 | &memcg, false)) { | ||
1146 | ret = -ENOMEM; | 1140 | ret = -ENOMEM; |
1147 | goto out_nolock; | 1141 | goto out_nolock; |
1148 | } | 1142 | } |
1149 | 1143 | ||
1150 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 1144 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
1151 | if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { | 1145 | if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) { |
1152 | mem_cgroup_cancel_charge(page, memcg); | 1146 | mem_cgroup_cancel_charge(page, memcg, false); |
1153 | ret = 0; | 1147 | ret = 0; |
1154 | goto out; | 1148 | goto out; |
1155 | } | 1149 | } |
@@ -1160,11 +1154,11 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
1160 | set_pte_at(vma->vm_mm, addr, pte, | 1154 | set_pte_at(vma->vm_mm, addr, pte, |
1161 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 1155 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
1162 | if (page == swapcache) { | 1156 | if (page == swapcache) { |
1163 | page_add_anon_rmap(page, vma, addr); | 1157 | page_add_anon_rmap(page, vma, addr, false); |
1164 | mem_cgroup_commit_charge(page, memcg, true); | 1158 | mem_cgroup_commit_charge(page, memcg, true, false); |
1165 | } else { /* ksm created a completely new copy */ | 1159 | } else { /* ksm created a completely new copy */ |
1166 | page_add_new_anon_rmap(page, vma, addr); | 1160 | page_add_new_anon_rmap(page, vma, addr, false); |
1167 | mem_cgroup_commit_charge(page, memcg, false); | 1161 | mem_cgroup_commit_charge(page, memcg, false, false); |
1168 | lru_cache_add_active_or_unevictable(page, vma); | 1162 | lru_cache_add_active_or_unevictable(page, vma); |
1169 | } | 1163 | } |
1170 | swap_free(entry); | 1164 | swap_free(entry); |
@@ -1206,7 +1200,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
1206 | * swapoff spends a _lot_ of time in this loop! | 1200 | * swapoff spends a _lot_ of time in this loop! |
1207 | * Test inline before going to call unuse_pte. | 1201 | * Test inline before going to call unuse_pte. |
1208 | */ | 1202 | */ |
1209 | if (unlikely(maybe_same_pte(*pte, swp_pte))) { | 1203 | if (unlikely(pte_same_as_swp(*pte, swp_pte))) { |
1210 | pte_unmap(pte); | 1204 | pte_unmap(pte); |
1211 | ret = unuse_pte(vma, pmd, addr, entry, page); | 1205 | ret = unuse_pte(vma, pmd, addr, entry, page); |
1212 | if (ret) | 1206 | if (ret) |
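The pte_same_as_swp() rename works because the only legitimate difference between the pte found in the page table and the swap pte built from the entry is the soft-dirty bit, and the generic headers are assumed to provide a no-op pte_swp_clear_soft_dirty() when soft-dirty is not configured. The bit-level idea, reduced to a runnable user-space toy with a hypothetical bit position:

#include <stdio.h>

#define SWP_SOFT_DIRTY (1UL << 5)   /* hypothetical bit, for illustration */

static unsigned long clear_soft_dirty(unsigned long pte)
{
        return pte & ~SWP_SOFT_DIRTY;
}

int main(void)
{
        unsigned long swp_pte = 0x1234UL;   /* canonical swap pte */
        unsigned long seen[] = { 0x1234UL, 0x1234UL | SWP_SOFT_DIRTY };

        /* both the clean and the soft-dirty form match after normalizing */
        for (int i = 0; i < 2; i++)
                printf("pte %#lx matches: %d\n", seen[i],
                       clear_soft_dirty(seen[i]) == swp_pte);
        return 0;
}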
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 77fee9325a57..806b0c758c5b 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c | |||
@@ -63,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, | |||
63 | __SetPageUptodate(page); | 63 | __SetPageUptodate(page); |
64 | 64 | ||
65 | ret = -ENOMEM; | 65 | ret = -ENOMEM; |
66 | if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg)) | 66 | if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) |
67 | goto out_release; | 67 | goto out_release; |
68 | 68 | ||
69 | _dst_pte = mk_pte(page, dst_vma->vm_page_prot); | 69 | _dst_pte = mk_pte(page, dst_vma->vm_page_prot); |
@@ -76,8 +76,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, | |||
76 | goto out_release_uncharge_unlock; | 76 | goto out_release_uncharge_unlock; |
77 | 77 | ||
78 | inc_mm_counter(dst_mm, MM_ANONPAGES); | 78 | inc_mm_counter(dst_mm, MM_ANONPAGES); |
79 | page_add_new_anon_rmap(page, dst_vma, dst_addr); | 79 | page_add_new_anon_rmap(page, dst_vma, dst_addr, false); |
80 | mem_cgroup_commit_charge(page, memcg, false); | 80 | mem_cgroup_commit_charge(page, memcg, false, false); |
81 | lru_cache_add_active_or_unevictable(page, dst_vma); | 81 | lru_cache_add_active_or_unevictable(page, dst_vma); |
82 | 82 | ||
83 | set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); | 83 | set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); |
@@ -91,7 +91,7 @@ out: | |||
91 | return ret; | 91 | return ret; |
92 | out_release_uncharge_unlock: | 92 | out_release_uncharge_unlock: |
93 | pte_unmap_unlock(dst_pte, ptl); | 93 | pte_unmap_unlock(dst_pte, ptl); |
94 | mem_cgroup_cancel_charge(page, memcg); | 94 | mem_cgroup_cancel_charge(page, memcg, false); |
95 | out_release: | 95 | out_release: |
96 | page_cache_release(page); | 96 | page_cache_release(page); |
97 | goto out; | 97 | goto out; |
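The memcg charge API gains the same compound flag as the rmap API: false charges one small page, true would charge a whole THP (HPAGE_PMD_NR pages) in one go. A hedged sketch of the try/commit pattern the callers above follow, with error handling trimmed and the wrapper name invented for illustration:

#include <linux/memcontrol.h>
#include <linux/rmap.h>
#include <linux/swap.h>

/* Illustrative small-page charge pattern; a THP path would pass true. */
static int charge_small_anon_page(struct page *page, struct mm_struct *mm,
                                  gfp_t gfp, struct vm_area_struct *vma,
                                  unsigned long addr)
{
        struct mem_cgroup *memcg;

        if (mem_cgroup_try_charge(page, mm, gfp, &memcg, false))
                return -ENOMEM;

        page_add_new_anon_rmap(page, vma, addr, false);
        mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, vma);
        return 0;
}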
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
@@ -386,7 +386,9 @@ struct anon_vma *page_anon_vma(struct page *page) | |||
386 | 386 | ||
387 | struct address_space *page_mapping(struct page *page) | 387 | struct address_space *page_mapping(struct page *page) |
388 | { | 388 | { |
389 | unsigned long mapping; | 389 | struct address_space *mapping; |
390 | |||
391 | page = compound_head(page); | ||
390 | 392 | ||
391 | /* This happens if someone calls flush_dcache_page on slab page */ | 393 | /* This happens if someone calls flush_dcache_page on slab page */ |
392 | if (unlikely(PageSlab(page))) | 394 | if (unlikely(PageSlab(page))) |
@@ -399,11 +401,25 @@ struct address_space *page_mapping(struct page *page) | |||
399 | return swap_address_space(entry); | 401 | return swap_address_space(entry); |
400 | } | 402 | } |
401 | 403 | ||
402 | mapping = (unsigned long)page->mapping; | 404 | mapping = page->mapping; |
403 | if (mapping & PAGE_MAPPING_FLAGS) | 405 | if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) |
404 | return NULL; | 406 | return NULL; |
405 | return page->mapping; | 407 | return mapping; |
408 | } | ||
409 | |||
410 | /* Slow path of page_mapcount() for compound pages */ | ||
411 | int __page_mapcount(struct page *page) | ||
412 | { | ||
413 | int ret; | ||
414 | |||
415 | ret = atomic_read(&page->_mapcount) + 1; | ||
416 | page = compound_head(page); | ||
417 | ret += atomic_read(compound_mapcount_ptr(page)) + 1; | ||
418 | if (PageDoubleMap(page)) | ||
419 | ret--; | ||
420 | return ret; | ||
406 | } | 421 | } |
422 | EXPORT_SYMBOL_GPL(__page_mapcount); | ||
407 | 423 | ||
408 | int overcommit_ratio_handler(struct ctl_table *table, int write, | 424 | int overcommit_ratio_handler(struct ctl_table *table, int write, |
409 | void __user *buffer, size_t *lenp, | 425 | void __user *buffer, size_t *lenp, |
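__page_mapcount() folds the per-subpage _mapcount and the head page's compound mapcount together (both stored biased by -1), and subtracts one when PageDoubleMap indicates the subpage counters already include the PMD mapping. A worked example of the arithmetic with made-up counter values, as plain C:

#include <stdio.h>

/* Hypothetical THP subpage: PTE-mapped once, while the whole huge page
 * is also PMD-mapped once, so PageDoubleMap is set and the subpage
 * counter carries an extra reference for the PMD mapping.
 */
int main(void)
{
        int subpage_raw  = 1;   /* -1 start + 1 PTE map + 1 DoubleMap bump */
        int compound_raw = 0;   /* -1 start + 1 PMD map */
        int double_map   = 1;

        int mapcount = (subpage_raw + 1) + (compound_raw + 1);
        if (double_map)
                mapcount--;     /* undo the double-counted PMD mapping */

        printf("reported mapcount: %d\n", mapcount);   /* prints 2 */
        return 0;
}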
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 58ceeb107960..fb42a5bffe47 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -455,7 +455,7 @@ found: | |||
455 | free_vmap_cache = &va->rb_node; | 455 | free_vmap_cache = &va->rb_node; |
456 | spin_unlock(&vmap_area_lock); | 456 | spin_unlock(&vmap_area_lock); |
457 | 457 | ||
458 | BUG_ON(va->va_start & (align-1)); | 458 | BUG_ON(!IS_ALIGNED(va->va_start, align)); |
459 | BUG_ON(va->va_start < vstart); | 459 | BUG_ON(va->va_start < vstart); |
460 | BUG_ON(va->va_end > vend); | 460 | BUG_ON(va->va_end > vend); |
461 | 461 | ||
@@ -1086,7 +1086,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) | |||
1086 | BUG_ON(!addr); | 1086 | BUG_ON(!addr); |
1087 | BUG_ON(addr < VMALLOC_START); | 1087 | BUG_ON(addr < VMALLOC_START); |
1088 | BUG_ON(addr > VMALLOC_END); | 1088 | BUG_ON(addr > VMALLOC_END); |
1089 | BUG_ON(addr & (PAGE_SIZE-1)); | 1089 | BUG_ON(!IS_ALIGNED(addr, PAGE_SIZE)); |
1090 | 1090 | ||
1091 | debug_check_no_locks_freed(mem, size); | 1091 | debug_check_no_locks_freed(mem, size); |
1092 | vmap_debug_free_range(addr, addr+size); | 1092 | vmap_debug_free_range(addr, addr+size); |
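The vmalloc.c changes are purely cosmetic: for a power-of-two alignment, IS_ALIGNED(x, a) performs the same mask test as (x & (a - 1)) == 0, it just states the intent. A standalone check of that equivalence in user-space C, with the macro copied in for illustration:

#include <assert.h>
#include <stdio.h>

/* same shape as the kernel macro in <linux/kernel.h> */
#define IS_ALIGNED(x, a) (((x) & ((__typeof__(x))(a) - 1)) == 0)
#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long addrs[] = { 0x1000, 0x1ff8, 0x20000 };

        for (int i = 0; i < 3; i++)
                assert(IS_ALIGNED(addrs[i], PAGE_SIZE) ==
                       !(addrs[i] & (PAGE_SIZE - 1)));
        puts("mask test and IS_ALIGNED agree");
        return 0;
}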
diff --git a/mm/vmscan.c b/mm/vmscan.c index 108bd119f2f6..5ac86956ff9d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -906,6 +906,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
906 | int may_enter_fs; | 906 | int may_enter_fs; |
907 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | 907 | enum page_references references = PAGEREF_RECLAIM_CLEAN; |
908 | bool dirty, writeback; | 908 | bool dirty, writeback; |
909 | bool lazyfree = false; | ||
910 | int ret = SWAP_SUCCESS; | ||
909 | 911 | ||
910 | cond_resched(); | 912 | cond_resched(); |
911 | 913 | ||
@@ -1049,6 +1051,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1049 | goto keep_locked; | 1051 | goto keep_locked; |
1050 | if (!add_to_swap(page, page_list)) | 1052 | if (!add_to_swap(page, page_list)) |
1051 | goto activate_locked; | 1053 | goto activate_locked; |
1054 | lazyfree = true; | ||
1052 | may_enter_fs = 1; | 1055 | may_enter_fs = 1; |
1053 | 1056 | ||
1054 | /* Adding to swap updated mapping */ | 1057 | /* Adding to swap updated mapping */ |
@@ -1060,14 +1063,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1060 | * processes. Try to unmap it here. | 1063 | * processes. Try to unmap it here. |
1061 | */ | 1064 | */ |
1062 | if (page_mapped(page) && mapping) { | 1065 | if (page_mapped(page) && mapping) { |
1063 | switch (try_to_unmap(page, | 1066 | switch (ret = try_to_unmap(page, lazyfree ? |
1064 | ttu_flags|TTU_BATCH_FLUSH)) { | 1067 | (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : |
1068 | (ttu_flags | TTU_BATCH_FLUSH))) { | ||
1065 | case SWAP_FAIL: | 1069 | case SWAP_FAIL: |
1066 | goto activate_locked; | 1070 | goto activate_locked; |
1067 | case SWAP_AGAIN: | 1071 | case SWAP_AGAIN: |
1068 | goto keep_locked; | 1072 | goto keep_locked; |
1069 | case SWAP_MLOCK: | 1073 | case SWAP_MLOCK: |
1070 | goto cull_mlocked; | 1074 | goto cull_mlocked; |
1075 | case SWAP_LZFREE: | ||
1076 | goto lazyfree; | ||
1071 | case SWAP_SUCCESS: | 1077 | case SWAP_SUCCESS: |
1072 | ; /* try to free the page below */ | 1078 | ; /* try to free the page below */ |
1073 | } | 1079 | } |
@@ -1174,6 +1180,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1174 | } | 1180 | } |
1175 | } | 1181 | } |
1176 | 1182 | ||
1183 | lazyfree: | ||
1177 | if (!mapping || !__remove_mapping(mapping, page, true)) | 1184 | if (!mapping || !__remove_mapping(mapping, page, true)) |
1178 | goto keep_locked; | 1185 | goto keep_locked; |
1179 | 1186 | ||
@@ -1184,8 +1191,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1184 | * we obviously don't have to worry about waking up a process | 1191 | * we obviously don't have to worry about waking up a process |
1185 | * waiting on the page lock, because there are no references. | 1192 | * waiting on the page lock, because there are no references. |
1186 | */ | 1193 | */ |
1187 | __clear_page_locked(page); | 1194 | __ClearPageLocked(page); |
1188 | free_it: | 1195 | free_it: |
1196 | if (ret == SWAP_LZFREE) | ||
1197 | count_vm_event(PGLAZYFREED); | ||
1198 | |||
1189 | nr_reclaimed++; | 1199 | nr_reclaimed++; |
1190 | 1200 | ||
1191 | /* | 1201 | /* |
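The vmscan.c hunk wires MADV_FREE into reclaim: an anonymous page that goes through add_to_swap() sets `lazyfree`, try_to_unmap() is then called with TTU_LZFREE, and if the rmap walk reports SWAP_LZFREE (the page was never dirtied after MADV_FREE) the page jumps to the new `lazyfree:` label, skips writeback, is dropped from its mapping, and is counted as PGLAZYFREED at free_it. A hedged control-flow sketch of just that decision; the flag values, toy_try_to_unmap() and reclaim_one() are simplified stand-ins, not the kernel's shrink_page_list():

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's ttu_flags and return codes. */
#define TTU_BATCH_FLUSH  (1u << 0)
#define TTU_LZFREE       (1u << 1)

enum unmap_result { SWAP_SUCCESS, SWAP_AGAIN, SWAP_FAIL, SWAP_MLOCK, SWAP_LZFREE };

static unsigned long pglazyfreed;   /* models the new PGLAZYFREED counter */

/* Toy unmap: a still-clean MADV_FREE page reports SWAP_LZFREE when the
 * caller asked for lazy freeing; everything else just "succeeds". */
static enum unmap_result toy_try_to_unmap(unsigned int flags, bool page_clean)
{
	if ((flags & TTU_LZFREE) && page_clean)
		return SWAP_LZFREE;
	return SWAP_SUCCESS;
}

/* Models the branch added to shrink_page_list() above. */
static const char *reclaim_one(bool lazyfree, bool page_clean)
{
	unsigned int flags = TTU_BATCH_FLUSH | (lazyfree ? TTU_LZFREE : 0);

	switch (toy_try_to_unmap(flags, page_clean)) {
	case SWAP_FAIL:   return "activate_locked";
	case SWAP_AGAIN:  return "keep_locked";
	case SWAP_MLOCK:  return "cull_mlocked";
	case SWAP_LZFREE:
		pglazyfreed++;            /* counted at free_it in the kernel */
		return "lazyfree: skip pageout, drop mapping, free";
	case SWAP_SUCCESS:
		break;                    /* try to free the page below */
	}
	return "normal path: pageout if dirty, then drop mapping";
}

int main(void)
{
	printf("%s\n", reclaim_one(true, true));
	printf("%s\n", reclaim_one(false, true));
	printf("pglazyfreed = %lu\n", pglazyfreed);
	return 0;
}

The point of the short-circuit is that a clean MADV_FREE page carries no data anyone promised to keep, so reclaim can free it without allocating swap I/O for it.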
diff --git a/mm/vmstat.c b/mm/vmstat.c index 83a003bc3cae..64bd0aa13f75 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -783,6 +783,7 @@ const char * const vmstat_text[] = { | |||
783 | 783 | ||
784 | "pgfault", | 784 | "pgfault", |
785 | "pgmajfault", | 785 | "pgmajfault", |
786 | "pglazyfreed", | ||
786 | 787 | ||
787 | TEXTS_FOR_ZONES("pgrefill") | 788 | TEXTS_FOR_ZONES("pgrefill") |
788 | TEXTS_FOR_ZONES("pgsteal_kswapd") | 789 | TEXTS_FOR_ZONES("pgsteal_kswapd") |
@@ -844,7 +845,9 @@ const char * const vmstat_text[] = { | |||
844 | "thp_fault_fallback", | 845 | "thp_fault_fallback", |
845 | "thp_collapse_alloc", | 846 | "thp_collapse_alloc", |
846 | "thp_collapse_alloc_failed", | 847 | "thp_collapse_alloc_failed", |
847 | "thp_split", | 848 | "thp_split_page", |
849 | "thp_split_page_failed", | ||
850 | "thp_split_pmd", | ||
848 | "thp_zero_page_alloc", | 851 | "thp_zero_page_alloc", |
849 | "thp_zero_page_alloc_failed", | 852 | "thp_zero_page_alloc_failed", |
850 | #endif | 853 | #endif |
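The vmstat.c hunk exposes the new accounting: a "pglazyfreed" event for the reclaim path above, and the old "thp_split" counter replaced by "thp_split_page", "thp_split_page_failed" and "thp_split_pmd" to match the reworked THP splitting. Entries in vmstat_text[] surface as lines in /proc/vmstat, so they can be read from userspace; a small hedged reader for one of the new names (read_vmstat() is an invented helper, and the counter only exists on kernels carrying this hunk):

#include <stdio.h>
#include <string.h>

/* Reads a single named event counter from /proc/vmstat ("name value" per
 * line). Returns 0 on success, -1 if the file or the name is missing. */
static int read_vmstat(const char *name, unsigned long long *value)
{
	char key[64];
	unsigned long long val;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (strcmp(key, name) == 0) {
			*value = val;
			fclose(f);
			return 0;
		}
	}
	fclose(f);
	return -1;
}

int main(void)
{
	unsigned long long lazyfreed;

	if (read_vmstat("pglazyfreed", &lazyfreed) == 0)
		printf("pglazyfreed %llu\n", lazyfreed);
	else
		printf("pglazyfreed not exported by this kernel\n");
	return 0;
}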
diff --git a/scripts/tags.sh b/scripts/tags.sh index 262889046703..76f131ebc192 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh | |||
@@ -193,7 +193,6 @@ exuberant() | |||
193 | --regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/' \ | 193 | --regex-c++='/CLEARPAGEFLAG_NOOP\(([^,)]*).*/ClearPage\1/' \ |
194 | --regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/' \ | 194 | --regex-c++='/__CLEARPAGEFLAG_NOOP\(([^,)]*).*/__ClearPage\1/' \ |
195 | --regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \ | 195 | --regex-c++='/TESTCLEARFLAG_FALSE\(([^,)]*).*/TestClearPage\1/' \ |
196 | --regex-c++='/__TESTCLEARFLAG_FALSE\(([^,)]*).*/__TestClearPage\1/' \ | ||
197 | --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' \ | 196 | --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/' \ |
198 | --regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ | 197 | --regex-c++='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ |
199 | --regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ | 198 | --regex-c++='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ |
@@ -260,7 +259,6 @@ emacs() | |||
260 | --regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \ | 259 | --regex='/CLEARPAGEFLAG_NOOP(\([^,)]*\).*/ClearPage\1/' \ |
261 | --regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \ | 260 | --regex='/__CLEARPAGEFLAG_NOOP(\([^,)]*\).*/__ClearPage\1/' \ |
262 | --regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \ | 261 | --regex='/TESTCLEARFLAG_FALSE(\([^,)]*\).*/TestClearPage\1/' \ |
263 | --regex='/__TESTCLEARFLAG_FALSE(\([^,)]*\).*/__TestClearPage\1/' \ | ||
264 | --regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ | 262 | --regex='/TASK_PFA_TEST\([^,]*,\s*([^)]*)\)/task_\1/' \ |
265 | --regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ | 263 | --regex='/TASK_PFA_SET\([^,]*,\s*([^)]*)\)/task_set_\1/' \ |
266 | --regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/' \ | 264 | --regex='/TASK_PFA_CLEAR\([^,]*,\s*([^)]*)\)/task_clear_\1/' \ |
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 314c7774652e..a11cfd20a6a0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c | |||
@@ -111,7 +111,7 @@ static void hardware_disable_all(void); | |||
111 | 111 | ||
112 | static void kvm_io_bus_destroy(struct kvm_io_bus *bus); | 112 | static void kvm_io_bus_destroy(struct kvm_io_bus *bus); |
113 | 113 | ||
114 | static void kvm_release_pfn_dirty(pfn_t pfn); | 114 | static void kvm_release_pfn_dirty(kvm_pfn_t pfn); |
115 | static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); | 115 | static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); |
116 | 116 | ||
117 | __visible bool kvm_rebooting; | 117 | __visible bool kvm_rebooting; |
@@ -119,7 +119,7 @@ EXPORT_SYMBOL_GPL(kvm_rebooting); | |||
119 | 119 | ||
120 | static bool largepages_enabled = true; | 120 | static bool largepages_enabled = true; |
121 | 121 | ||
122 | bool kvm_is_reserved_pfn(pfn_t pfn) | 122 | bool kvm_is_reserved_pfn(kvm_pfn_t pfn) |
123 | { | 123 | { |
124 | if (pfn_valid(pfn)) | 124 | if (pfn_valid(pfn)) |
125 | return PageReserved(pfn_to_page(pfn)); | 125 | return PageReserved(pfn_to_page(pfn)); |
@@ -1289,7 +1289,7 @@ static inline int check_user_page_hwpoison(unsigned long addr) | |||
1289 | * true indicates success, otherwise false is returned. | 1289 | * true indicates success, otherwise false is returned. |
1290 | */ | 1290 | */ |
1291 | static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, | 1291 | static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, |
1292 | bool write_fault, bool *writable, pfn_t *pfn) | 1292 | bool write_fault, bool *writable, kvm_pfn_t *pfn) |
1293 | { | 1293 | { |
1294 | struct page *page[1]; | 1294 | struct page *page[1]; |
1295 | int npages; | 1295 | int npages; |
@@ -1322,7 +1322,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, | |||
1322 | * 1 indicates success, -errno is returned if error is detected. | 1322 | * 1 indicates success, -errno is returned if error is detected. |
1323 | */ | 1323 | */ |
1324 | static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, | 1324 | static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, |
1325 | bool *writable, pfn_t *pfn) | 1325 | bool *writable, kvm_pfn_t *pfn) |
1326 | { | 1326 | { |
1327 | struct page *page[1]; | 1327 | struct page *page[1]; |
1328 | int npages = 0; | 1328 | int npages = 0; |
@@ -1386,11 +1386,11 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) | |||
1386 | * 2): @write_fault = false && @writable, @writable will tell the caller | 1386 | * 2): @write_fault = false && @writable, @writable will tell the caller |
1387 | * whether the mapping is writable. | 1387 | * whether the mapping is writable. |
1388 | */ | 1388 | */ |
1389 | static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, | 1389 | static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, |
1390 | bool write_fault, bool *writable) | 1390 | bool write_fault, bool *writable) |
1391 | { | 1391 | { |
1392 | struct vm_area_struct *vma; | 1392 | struct vm_area_struct *vma; |
1393 | pfn_t pfn = 0; | 1393 | kvm_pfn_t pfn = 0; |
1394 | int npages; | 1394 | int npages; |
1395 | 1395 | ||
1396 | /* we can do it either atomically or asynchronously, not both */ | 1396 | /* we can do it either atomically or asynchronously, not both */ |
@@ -1431,8 +1431,9 @@ exit: | |||
1431 | return pfn; | 1431 | return pfn; |
1432 | } | 1432 | } |
1433 | 1433 | ||
1434 | pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, | 1434 | kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, |
1435 | bool *async, bool write_fault, bool *writable) | 1435 | bool atomic, bool *async, bool write_fault, |
1436 | bool *writable) | ||
1436 | { | 1437 | { |
1437 | unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); | 1438 | unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); |
1438 | 1439 | ||
@@ -1453,7 +1454,7 @@ pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, | |||
1453 | } | 1454 | } |
1454 | EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); | 1455 | EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); |
1455 | 1456 | ||
1456 | pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | 1457 | kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, |
1457 | bool *writable) | 1458 | bool *writable) |
1458 | { | 1459 | { |
1459 | return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, | 1460 | return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, |
@@ -1461,37 +1462,37 @@ pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, | |||
1461 | } | 1462 | } |
1462 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); | 1463 | EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); |
1463 | 1464 | ||
1464 | pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) | 1465 | kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) |
1465 | { | 1466 | { |
1466 | return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); | 1467 | return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); |
1467 | } | 1468 | } |
1468 | EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); | 1469 | EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); |
1469 | 1470 | ||
1470 | pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) | 1471 | kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) |
1471 | { | 1472 | { |
1472 | return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); | 1473 | return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); |
1473 | } | 1474 | } |
1474 | EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); | 1475 | EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); |
1475 | 1476 | ||
1476 | pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) | 1477 | kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) |
1477 | { | 1478 | { |
1478 | return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); | 1479 | return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); |
1479 | } | 1480 | } |
1480 | EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); | 1481 | EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); |
1481 | 1482 | ||
1482 | pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) | 1483 | kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) |
1483 | { | 1484 | { |
1484 | return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); | 1485 | return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); |
1485 | } | 1486 | } |
1486 | EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); | 1487 | EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); |
1487 | 1488 | ||
1488 | pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) | 1489 | kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) |
1489 | { | 1490 | { |
1490 | return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); | 1491 | return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); |
1491 | } | 1492 | } |
1492 | EXPORT_SYMBOL_GPL(gfn_to_pfn); | 1493 | EXPORT_SYMBOL_GPL(gfn_to_pfn); |
1493 | 1494 | ||
1494 | pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) | 1495 | kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) |
1495 | { | 1496 | { |
1496 | return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); | 1497 | return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); |
1497 | } | 1498 | } |
@@ -1514,7 +1515,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, | |||
1514 | } | 1515 | } |
1515 | EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); | 1516 | EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); |
1516 | 1517 | ||
1517 | static struct page *kvm_pfn_to_page(pfn_t pfn) | 1518 | static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) |
1518 | { | 1519 | { |
1519 | if (is_error_noslot_pfn(pfn)) | 1520 | if (is_error_noslot_pfn(pfn)) |
1520 | return KVM_ERR_PTR_BAD_PAGE; | 1521 | return KVM_ERR_PTR_BAD_PAGE; |
@@ -1529,7 +1530,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn) | |||
1529 | 1530 | ||
1530 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) | 1531 | struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) |
1531 | { | 1532 | { |
1532 | pfn_t pfn; | 1533 | kvm_pfn_t pfn; |
1533 | 1534 | ||
1534 | pfn = gfn_to_pfn(kvm, gfn); | 1535 | pfn = gfn_to_pfn(kvm, gfn); |
1535 | 1536 | ||
@@ -1539,7 +1540,7 @@ EXPORT_SYMBOL_GPL(gfn_to_page); | |||
1539 | 1540 | ||
1540 | struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) | 1541 | struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) |
1541 | { | 1542 | { |
1542 | pfn_t pfn; | 1543 | kvm_pfn_t pfn; |
1543 | 1544 | ||
1544 | pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); | 1545 | pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); |
1545 | 1546 | ||
@@ -1555,7 +1556,7 @@ void kvm_release_page_clean(struct page *page) | |||
1555 | } | 1556 | } |
1556 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); | 1557 | EXPORT_SYMBOL_GPL(kvm_release_page_clean); |
1557 | 1558 | ||
1558 | void kvm_release_pfn_clean(pfn_t pfn) | 1559 | void kvm_release_pfn_clean(kvm_pfn_t pfn) |
1559 | { | 1560 | { |
1560 | if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) | 1561 | if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) |
1561 | put_page(pfn_to_page(pfn)); | 1562 | put_page(pfn_to_page(pfn)); |
@@ -1570,13 +1571,13 @@ void kvm_release_page_dirty(struct page *page) | |||
1570 | } | 1571 | } |
1571 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); | 1572 | EXPORT_SYMBOL_GPL(kvm_release_page_dirty); |
1572 | 1573 | ||
1573 | static void kvm_release_pfn_dirty(pfn_t pfn) | 1574 | static void kvm_release_pfn_dirty(kvm_pfn_t pfn) |
1574 | { | 1575 | { |
1575 | kvm_set_pfn_dirty(pfn); | 1576 | kvm_set_pfn_dirty(pfn); |
1576 | kvm_release_pfn_clean(pfn); | 1577 | kvm_release_pfn_clean(pfn); |
1577 | } | 1578 | } |
1578 | 1579 | ||
1579 | void kvm_set_pfn_dirty(pfn_t pfn) | 1580 | void kvm_set_pfn_dirty(kvm_pfn_t pfn) |
1580 | { | 1581 | { |
1581 | if (!kvm_is_reserved_pfn(pfn)) { | 1582 | if (!kvm_is_reserved_pfn(pfn)) { |
1582 | struct page *page = pfn_to_page(pfn); | 1583 | struct page *page = pfn_to_page(pfn); |
@@ -1587,14 +1588,14 @@ void kvm_set_pfn_dirty(pfn_t pfn) | |||
1587 | } | 1588 | } |
1588 | EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); | 1589 | EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); |
1589 | 1590 | ||
1590 | void kvm_set_pfn_accessed(pfn_t pfn) | 1591 | void kvm_set_pfn_accessed(kvm_pfn_t pfn) |
1591 | { | 1592 | { |
1592 | if (!kvm_is_reserved_pfn(pfn)) | 1593 | if (!kvm_is_reserved_pfn(pfn)) |
1593 | mark_page_accessed(pfn_to_page(pfn)); | 1594 | mark_page_accessed(pfn_to_page(pfn)); |
1594 | } | 1595 | } |
1595 | EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); | 1596 | EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); |
1596 | 1597 | ||
1597 | void kvm_get_pfn(pfn_t pfn) | 1598 | void kvm_get_pfn(kvm_pfn_t pfn) |
1598 | { | 1599 | { |
1599 | if (!kvm_is_reserved_pfn(pfn)) | 1600 | if (!kvm_is_reserved_pfn(pfn)) |
1600 | get_page(pfn_to_page(pfn)); | 1601 | get_page(pfn_to_page(pfn)); |
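The kvm_main.c hunk is a mechanical rename of KVM's page-frame-number type from pfn_t to kvm_pfn_t; only signatures (and some line wrapping, as in __gfn_to_pfn_memslot()) change, not behavior, apparently so the bare pfn_t name is free for the new core-MM pfn_t used by the DAX work in this same series. A hedged sketch of why such a rename stays mechanical, with invented names (demo_pfn_t, demo_release_pfn): when a subsystem routes every signature through its own typedef, renaming or re-pointing that typedef touches no function bodies.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t demo_pfn_t;          /* the subsystem-local pfn type */

#define DEMO_PAGE_SHIFT 12

static void demo_release_pfn(demo_pfn_t pfn)
{
	/* body is unaffected by renaming the typedef; only the spelling of
	 * the parameter type in the prototype changes */
	printf("releasing pfn %llu (phys 0x%llx)\n",
	       (unsigned long long)pfn,
	       (unsigned long long)pfn << DEMO_PAGE_SHIFT);
}

int main(void)
{
	demo_pfn_t pfn = 0x1234;

	demo_release_pfn(pfn);
	return 0;
}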