Diffstat (limited to 'include/linux/mm.h')

 -rw-r--r--   include/linux/mm.h | 136
 1 files changed, 98 insertions, 38 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80fc92a49649..a4d24f3c5430 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
-#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_ARCH_2	0x02000000
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
@@ -206,21 +205,19 @@ extern unsigned int kobjsize(const void *objp);
 extern pgprot_t protection_map[16];
 
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
-#define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
-#define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY	0x08	/* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT	0x10	/* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE	0x20	/* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED	0x40	/* second try */
-#define FAULT_FLAG_USER		0x80	/* The fault originated in userspace */
+#define FAULT_FLAG_MKWRITE	0x02	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_ALLOW_RETRY	0x04	/* Retry fault if blocking */
+#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* Don't drop mmap_sem and wait when retrying */
+#define FAULT_FLAG_KILLABLE	0x10	/* The fault task is in SIGKILL killable region */
+#define FAULT_FLAG_TRIED	0x20	/* Second try */
+#define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
  * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
- * pgoff should be used in favour of virtual_address, if possible. If pgoff
- * is used, one may implement ->remap_pages to get nonlinear mapping support.
+ * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
 	unsigned int flags;		/* FAULT_FLAG_xxx flags */
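Annotation (not part of the patch): with FAULT_FLAG_NONLINEAR gone, the remaining flag values are renumbered, which is transparent to handlers that test the bits by mask. A minimal sketch of a ->fault handler in the style the updated comment recommends (key off vmf->pgoff, not virtual_address); the example_* names and the pages array hung off vm_private_data are hypothetical.

#include <linux/mm.h>
#include <linux/printk.h>

struct example_buf {
	unsigned long nr_pages;
	struct page **pages;		/* pinned by the hypothetical driver */
};

static int example_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct example_buf *buf = vma->vm_private_data;

	if (vmf->pgoff >= buf->nr_pages)
		return VM_FAULT_SIGBUS;

	/* Flags are tested by mask, so the renumbering above does not matter. */
	if (vmf->flags & FAULT_FLAG_WRITE)
		pr_debug("write fault at pgoff %lu\n", vmf->pgoff);

	get_page(buf->pages[vmf->pgoff]);
	vmf->page = buf->pages[vmf->pgoff];	/* caller maps this page */
	return 0;
}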
@@ -287,9 +284,13 @@ struct vm_operations_struct {
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
 					unsigned long addr);
 #endif
-	/* called by sys_remap_file_pages() to populate non-linear mapping */
-	int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
-			   unsigned long size, pgoff_t pgoff);
+	/*
+	 * Called by vm_normal_page() for special PTEs to find the
+	 * page for @addr. This is useful if the default behavior
+	 * (using pte_page()) would not find the correct page.
+	 */
+	struct page *(*find_special_page)(struct vm_area_struct *vma,
+					  unsigned long addr);
 };
 
 struct mmu_gather;
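Annotation (not part of the patch): ->remap_pages() disappears with the non-linear mapping code, and the new ->find_special_page() hook covers mappings whose special PTEs cannot be resolved via pte_page(). A minimal sketch of how a driver might implement it, assuming a hypothetical example_mapping structure kept in vm_private_data.

#include <linux/mm.h>

struct example_mapping {
	struct page **pages;	/* one page per PAGE_SIZE slot of the VMA */
};

static struct page *example_find_special_page(struct vm_area_struct *vma,
					      unsigned long addr)
{
	struct example_mapping *m = vma->vm_private_data;

	/* Translate the address back to the page backing that slot. */
	return m->pages[(addr - vma->vm_start) >> PAGE_SHIFT];
}

static const struct vm_operations_struct example_vm_ops = {
	.find_special_page = example_find_special_page,
};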
@@ -446,6 +447,12 @@ static inline struct page *compound_head_by_tail(struct page *tail)
 	return tail;
 }
 
+/*
+ * Since either compound page could be dismantled asynchronously in THP
+ * or we access asynchronously arbitrary positioned struct page, there
+ * would be tail flag race. To handle this race, we should call
+ * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
+ */
 static inline struct page *compound_head(struct page *page)
 {
 	if (unlikely(PageTail(page)))
@@ -454,6 +461,18 @@ static inline struct page *compound_head(struct page *page)
 }
 
 /*
+ * If we access compound page synchronously such as access to
+ * allocated page, there is no need to handle tail flag race, so we can
+ * check tail flag directly without any synchronization primitive.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+	if (unlikely(PageTail(page)))
+		return page->first_page;
+	return page;
+}
+
+/*
  * The atomic page->_mapcount, starts from -1: so that transitions
  * both from it and to it can be tracked, using atomic_inc_and_test
  * and atomic_add_negative(-1).
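Annotation (not part of the patch): compound_head_fast() skips the smp_rmb() that compound_head() needs (via compound_head_by_tail()) when a concurrent THP split could be clearing the tail flag. A sketch of when each variant applies; example_compound_demo() and its allocation are purely illustrative.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void example_compound_demo(struct page *arbitrary_page)
{
	struct page *huge;

	/*
	 * Asynchronous access to an arbitrary struct page: a THP split
	 * may race with us, so the barrier variant is required.
	 */
	pr_debug("head pfn %lu\n", page_to_pfn(compound_head(arbitrary_page)));

	/*
	 * Synchronous access to a compound page we just allocated (and
	 * therefore pinned): no split can race, the fast variant suffices.
	 */
	huge = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);
	if (huge) {
		WARN_ON(compound_head_fast(huge + 1) != huge);
		__free_pages(huge, 2);
	}
}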
@@ -465,7 +484,8 @@ static inline void page_mapcount_reset(struct page *page)
 
 static inline int page_mapcount(struct page *page)
 {
-	return atomic_read(&(page)->_mapcount) + 1;
+	VM_BUG_ON_PAGE(PageSlab(page), page);
+	return atomic_read(&page->_mapcount) + 1;
 }
 
 static inline int page_count(struct page *page)
@@ -531,7 +551,14 @@ static inline void get_page(struct page *page)
 static inline struct page *virt_to_head_page(const void *x)
 {
 	struct page *page = virt_to_page(x);
-	return compound_head(page);
+
+	/*
+	 * We don't need to worry about synchronization of tail flag
+	 * when we call virt_to_head_page() since it is only called for
+	 * already allocated page and this page won't be freed until
+	 * this virt_to_head_page() is finished. So use _fast variant.
+	 */
+	return compound_head_fast(page);
 }
 
 /*
@@ -601,29 +628,28 @@ int split_free_page(struct page *page);
  * prototype for that function and accessor functions.
  * These are _only_ valid on the head of a PG_compound page.
  */
-typedef void compound_page_dtor(struct page *);
 
 static inline void set_compound_page_dtor(struct page *page,
 						compound_page_dtor *dtor)
 {
-	page[1].lru.next = (void *)dtor;
+	page[1].compound_dtor = dtor;
 }
 
 static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
 {
-	return (compound_page_dtor *)page[1].lru.next;
+	return page[1].compound_dtor;
 }
 
 static inline int compound_order(struct page *page)
 {
 	if (!PageHead(page))
 		return 0;
-	return (unsigned long)page[1].lru.prev;
+	return page[1].compound_order;
 }
 
 static inline void set_compound_order(struct page *page, unsigned long order)
 {
-	page[1].lru.prev = (void *)order;
+	page[1].compound_order = order;
 }
 
 #ifdef CONFIG_MMU
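Annotation (not part of the patch): the destructor and order stop being smuggled through page[1].lru.next/.prev and now live in dedicated compound_dtor/compound_order fields of the first tail page; the compound_page_dtor typedef dropped here presumably moves next to those new struct page fields. The accessors keep the same interface, as in this illustrative sketch (example_compound_report() is hypothetical; the claim that a __GFP_COMP allocation already records a default destructor and the order is an assumption about prep_compound_page()).

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void example_compound_report(void)
{
	struct page *head = alloc_pages(GFP_KERNEL | __GFP_COMP, 2);

	if (!head)
		return;

	/* Both reads now hit named fields in page[1]. */
	pr_info("order=%d dtor=%pF\n",
		compound_order(head), get_compound_page_dtor(head));
	__free_pages(head, 2);
}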
@@ -1070,6 +1096,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_WRITE	0x0008	/* Special case for get_user_pages */
 #define VM_FAULT_HWPOISON 0x0010	/* Hit poisoned small page */
 #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
+#define VM_FAULT_SIGSEGV 0x0040
 
 #define VM_FAULT_NOPAGE	0x0100	/* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
@@ -1078,8 +1105,9 @@ static inline int page_mapped(struct page *page)
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
-#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
-			 VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)
+#define VM_FAULT_ERROR	(VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
+			 VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
+			 VM_FAULT_FALLBACK)
 
 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
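Annotation (not part of the patch): VM_FAULT_SIGSEGV gives fault handlers a code distinct from VM_FAULT_SIGBUS, and folding it into VM_FAULT_ERROR means generic callers reject it without naming it. A sketch of a caller-side check in the style of the gup/fault paths; example_fixup() and its errno mapping are illustrative only, and the caller is assumed to hold mmap_sem.

#include <linux/errno.h>
#include <linux/mm.h>

static int example_fixup(struct mm_struct *mm, struct vm_area_struct *vma,
			 unsigned long address, unsigned int fault_flags)
{
	int ret = handle_mm_fault(mm, vma, address, fault_flags);

	if (ret & VM_FAULT_ERROR) {
		if (ret & VM_FAULT_OOM)
			return -ENOMEM;
		if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
			return -EHWPOISON;
		/* VM_FAULT_SIGSEGV is caught here along with SIGBUS. */
		return -EFAULT;
	}
	return 0;
}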
@@ -1119,7 +1147,6 @@ extern void user_shm_unlock(size_t, struct user_struct *);
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
-	struct vm_area_struct *nonlinear_vma;	/* Check page->index if set */
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
@@ -1137,8 +1164,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 
 /**
  * mm_walk - callbacks for walk_page_range
- * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
- * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
  * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
  *	       this handler is required to be able to handle
  *	       pmd_trans_huge() pmds.  They may simply choose to
@@ -1146,16 +1171,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
  * @pte_hole: if set, called for each hole at all levels
  * @hugetlb_entry: if set, called for each hugetlb entry
- *		   *Caution*: The caller must hold mmap_sem() if @hugetlb_entry
- *		   is used.
+ * @test_walk: caller specific callback function to determine whether
+ *             we walk over the current vma or not. A positive returned
+ *             value means "do page table walk over the current vma,"
+ *             and a negative one means "abort current page table walk
+ *             right now." 0 means "skip the current vma."
+ * @mm:        mm_struct representing the target process of page table walk
+ * @vma:       vma currently walked (NULL if walking outside vmas)
+ * @private:   private data for callbacks' usage
  *
- * (see walk_page_range for more details)
+ * (see the comment on walk_page_range() for more details)
  */
 struct mm_walk {
-	int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
-			 unsigned long next, struct mm_walk *walk);
-	int (*pud_entry)(pud_t *pud, unsigned long addr,
-			 unsigned long next, struct mm_walk *walk);
 	int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
 			 unsigned long next, struct mm_walk *walk);
 	int (*pte_entry)(pte_t *pte, unsigned long addr,
@@ -1165,12 +1192,16 @@ struct mm_walk {
 	int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
 			     unsigned long addr, unsigned long next,
 			     struct mm_walk *walk);
+	int (*test_walk)(unsigned long addr, unsigned long next,
+			struct mm_walk *walk);
 	struct mm_struct *mm;
+	struct vm_area_struct *vma;
 	void *private;
 };
 
 int walk_page_range(unsigned long addr, unsigned long end,
 		struct mm_walk *walk);
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
 		unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
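Annotation (not part of the patch): struct mm_walk loses the unused pgd/pud hooks, gains a ->test_walk() predicate for accepting or rejecting whole VMAs, records the mapping being walked in ->vma, and walk_page_vma() adds a single-VMA entry point. A minimal sketch of that entry point with a pte_entry callback; example_count_present() is a hypothetical helper, and the caller is assumed to hold mmap_sem for read.

#include <linux/mm.h>

static int example_pte_entry(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	unsigned long *present = walk->private;

	/* walk->vma now identifies the mapping this PTE belongs to. */
	if (pte_present(*pte))
		(*present)++;
	return 0;		/* a non-zero return would stop the walk */
}

static unsigned long example_count_present(struct vm_area_struct *vma)
{
	unsigned long present = 0;
	struct mm_walk walk = {
		.pte_entry	= example_pte_entry,
		.mm		= vma->vm_mm,
		.private	= &present,
	};

	walk_page_vma(vma, &walk);
	return present;
}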
@@ -1234,6 +1265,17 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		    unsigned long start, unsigned long nr_pages,
 		    int write, int force, struct page **pages,
 		    struct vm_area_struct **vmas);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+		    unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages,
+		    int *locked);
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+		    unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages,
+		    unsigned int gup_flags);
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+		    unsigned long start, unsigned long nr_pages,
+		    int write, int force, struct page **pages);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
 			struct page **pages);
 struct kvec;
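Annotation (not part of the patch): the *_locked/*_unlocked variants let callers benefit from the fault-retry path (which can drop mmap_sem while waiting on I/O) without open-coding it. A sketch of the intended calling convention for get_user_pages_locked(); example_pin_user_buffer() is hypothetical. get_user_pages_unlocked() instead takes and releases the semaphore itself, for callers that hold no other locks.

#include <linux/mm.h>
#include <linux/sched.h>

static long example_pin_user_buffer(unsigned long start,
				    unsigned long nr_pages,
				    struct page **pages)
{
	struct mm_struct *mm = current->mm;
	int locked = 1;
	long pinned;

	down_read(&mm->mmap_sem);
	pinned = get_user_pages_locked(current, mm, start, nr_pages,
				       1 /* write */, 0 /* force */,
				       pages, &locked);
	/* The retry path may have dropped mmap_sem; "locked" tells us. */
	if (locked)
		up_read(&mm->mmap_sem);
	return pinned;
}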
@@ -1411,8 +1453,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 {
 	return 0;
 }
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
+
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_pmds);
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_pmds);
+}
+
+static inline void mm_dec_nr_pmds(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_pmds);
+}
 #endif
 
 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
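Annotation (not part of the patch): nr_pmds becomes a per-mm count of allocated PMD pages, with no-op stubs on the same configurations that stub out __pmd_alloc() above (folded PMD level or no MMU). An illustrative consumer in the spirit of an mm-teardown sanity check; example_report_pgtable_leak() is hypothetical.

#include <linux/mm.h>
#include <linux/printk.h>

static void example_report_pgtable_leak(struct mm_struct *mm)
{
	/* Any PMD pages still accounted here would be a page-table leak. */
	if (unlikely(mm_nr_pmds(mm)))
		pr_alert("BUG: non-zero nr_pmds on freeing mm: %lu\n",
			 mm_nr_pmds(mm));
}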
@@ -1775,12 +1841,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 	for (vma = vma_interval_tree_iter_first(root, start, last);	\
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
 
-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
-					struct list_head *list)
-{
-	list_add_tail(&vma->shared.nonlinear, list);
-}
-
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
 				   struct rb_root *root);
 void anon_vma_interval_tree_remove(struct anon_vma_chain *node,