Diffstat (limited to 'include/linux/mm.h')
-rw-r--r--  include/linux/mm.h  136
1 file changed, 98 insertions, 38 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80fc92a49649..a4d24f3c5430 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -138,7 +138,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ACCOUNT      0x00100000      /* Is a VM accounted object */
 #define VM_NORESERVE    0x00200000      /* should the VM suppress accounting */
 #define VM_HUGETLB      0x00400000      /* Huge TLB Page VM */
-#define VM_NONLINEAR    0x00800000      /* Is non-linear (remap_file_pages) */
 #define VM_ARCH_1       0x01000000      /* Architecture-specific flag */
 #define VM_ARCH_2       0x02000000
 #define VM_DONTDUMP     0x04000000      /* Do not include in the core dump */
@@ -206,21 +205,19 @@ extern unsigned int kobjsize(const void *objp);
 extern pgprot_t protection_map[16];

 #define FAULT_FLAG_WRITE        0x01    /* Fault was a write access */
-#define FAULT_FLAG_NONLINEAR    0x02    /* Fault was via a nonlinear mapping */
-#define FAULT_FLAG_MKWRITE      0x04    /* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY  0x08    /* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT 0x10    /* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE     0x20    /* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED        0x40    /* second try */
-#define FAULT_FLAG_USER         0x80    /* The fault originated in userspace */
+#define FAULT_FLAG_MKWRITE      0x02    /* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_ALLOW_RETRY  0x04    /* Retry fault if blocking */
+#define FAULT_FLAG_RETRY_NOWAIT 0x08    /* Don't drop mmap_sem and wait when retrying */
+#define FAULT_FLAG_KILLABLE     0x10    /* The fault task is in SIGKILL killable region */
+#define FAULT_FLAG_TRIED        0x20    /* Second try */
+#define FAULT_FLAG_USER         0x40    /* The fault originated in userspace */

 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
  * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
- * pgoff should be used in favour of virtual_address, if possible. If pgoff
- * is used, one may implement ->remap_pages to get nonlinear mapping support.
+ * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
         unsigned int flags;             /* FAULT_FLAG_xxx flags */
@@ -287,9 +284,13 @@ struct vm_operations_struct {
         struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
                                         unsigned long addr);
 #endif
-        /* called by sys_remap_file_pages() to populate non-linear mapping */
-        int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
-                           unsigned long size, pgoff_t pgoff);
+        /*
+         * Called by vm_normal_page() for special PTEs to find the
+         * page for @addr. This is useful if the default behavior
+         * (using pte_page()) would not find the correct page.
+         */
+        struct page *(*find_special_page)(struct vm_area_struct *vma,
+                                          unsigned long addr);
 };

 struct mmu_gather;
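
As an illustration of the new ->find_special_page hook (a hedged sketch, not part of this patch; my_priv, my_find_special_page and my_vm_ops are hypothetical names), a driver that tracks its pages in a private array could wire it up so vm_normal_page() can resolve its special PTEs:

struct my_priv {
        struct page **pages;            /* one struct page per page of the VMA */
};

/* Resolve the page backing a special PTE at @addr from driver state. */
static struct page *my_find_special_page(struct vm_area_struct *vma,
                                         unsigned long addr)
{
        struct my_priv *priv = vma->vm_private_data;
        unsigned long idx = (addr - vma->vm_start) >> PAGE_SHIFT;

        return priv->pages[idx];
}

static const struct vm_operations_struct my_vm_ops = {
        .find_special_page = my_find_special_page,
};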
@@ -446,6 +447,12 @@ static inline struct page *compound_head_by_tail(struct page *tail)
         return tail;
 }

+/*
+ * Since either compound page could be dismantled asynchronously in THP
+ * or we access asynchronously arbitrary positioned struct page, there
+ * would be tail flag race. To handle this race, we should call
+ * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
+ */
 static inline struct page *compound_head(struct page *page)
 {
         if (unlikely(PageTail(page)))
@@ -454,6 +461,18 @@ static inline struct page *compound_head(struct page *page)
 }

 /*
+ * If we access compound page synchronously such as access to
+ * allocated page, there is no need to handle tail flag race, so we can
+ * check tail flag directly without any synchronization primitive.
+ */
+static inline struct page *compound_head_fast(struct page *page)
+{
+        if (unlikely(PageTail(page)))
+                return page->first_page;
+        return page;
+}
+
+/*
  * The atomic page->_mapcount, starts from -1: so that transitions
  * both from it and to it can be tracked, using atomic_inc_and_test
  * and atomic_add_negative(-1).
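
For contrast with compound_head() above, a hypothetical caller that already holds a reference to an allocated page can use the _fast variant and skip the smp_rmb() ordering; my_owned_page_head() is an illustrative name only:

static inline struct page *my_owned_page_head(struct page *page)
{
        /*
         * The caller pins this page, so it cannot be freed or split
         * underneath us; no tail-flag race, no barrier needed.
         */
        return compound_head_fast(page);
}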
@@ -465,7 +484,8 @@ static inline void page_mapcount_reset(struct page *page)

 static inline int page_mapcount(struct page *page)
 {
-        return atomic_read(&(page)->_mapcount) + 1;
+        VM_BUG_ON_PAGE(PageSlab(page), page);
+        return atomic_read(&page->_mapcount) + 1;
 }

 static inline int page_count(struct page *page)
@@ -531,7 +551,14 @@ static inline void get_page(struct page *page)
 static inline struct page *virt_to_head_page(const void *x)
 {
         struct page *page = virt_to_page(x);
-        return compound_head(page);
+
+        /*
+         * We don't need to worry about synchronization of tail flag
+         * when we call virt_to_head_page() since it is only called for
+         * already allocated page and this page won't be freed until
+         * this virt_to_head_page() is finished. So use _fast variant.
+         */
+        return compound_head_fast(page);
 }

 /*
@@ -601,29 +628,28 @@ int split_free_page(struct page *page);
  * prototype for that function and accessor functions.
  * These are _only_ valid on the head of a PG_compound page.
  */
-typedef void compound_page_dtor(struct page *);

 static inline void set_compound_page_dtor(struct page *page,
                                                 compound_page_dtor *dtor)
 {
-        page[1].lru.next = (void *)dtor;
+        page[1].compound_dtor = dtor;
 }

 static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
 {
-        return (compound_page_dtor *)page[1].lru.next;
+        return page[1].compound_dtor;
 }

 static inline int compound_order(struct page *page)
 {
         if (!PageHead(page))
                 return 0;
-        return (unsigned long)page[1].lru.prev;
+        return page[1].compound_order;
 }

 static inline void set_compound_order(struct page *page, unsigned long order)
 {
-        page[1].lru.prev = (void *)order;
+        page[1].compound_order = order;
 }

 #ifdef CONFIG_MMU
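
Accessor usage is unchanged by the move from the overloaded page[1].lru fields to the dedicated page[1].compound_dtor / page[1].compound_order fields; a hypothetical setup helper (my_* names are illustrative, not from this patch) still reads:

static void my_compound_dtor(struct page *page)
{
        /* hypothetical teardown for a driver-owned compound page */
}

static void my_init_compound(struct page *head, unsigned long order)
{
        set_compound_order(head, order);
        set_compound_page_dtor(head, my_compound_dtor);
}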
@@ -1070,6 +1096,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_WRITE  0x0008  /* Special case for get_user_pages */
 #define VM_FAULT_HWPOISON 0x0010        /* Hit poisoned small page */
 #define VM_FAULT_HWPOISON_LARGE 0x0020  /* Hit poisoned large page. Index encoded in upper bits */
+#define VM_FAULT_SIGSEGV 0x0040

 #define VM_FAULT_NOPAGE 0x0100  /* ->fault installed the pte, not return page */
 #define VM_FAULT_LOCKED 0x0200  /* ->fault locked the returned page */
@@ -1078,8 +1105,9 @@ static inline int page_mapped(struct page *page)

 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */

-#define VM_FAULT_ERROR  (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
-                         VM_FAULT_FALLBACK | VM_FAULT_HWPOISON_LARGE)
+#define VM_FAULT_ERROR  (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
+                         VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
+                         VM_FAULT_FALLBACK)

 /* Encode hstate index for a hwpoisoned large page */
 #define VM_FAULT_SET_HINDEX(x) ((x) << 12)
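
A hedged sketch of how a ->fault handler could use the new code: an access beyond the backing object can be reported as VM_FAULT_SIGSEGV rather than VM_FAULT_SIGBUS, and callers that test VM_FAULT_ERROR still catch it since the mask above now includes it. my_fault, my_nr_pages and my_lookup_page are hypothetical names, not from this patch:

/* hypothetical helpers provided by the driver */
static unsigned long my_nr_pages(struct vm_area_struct *vma);
static struct page *my_lookup_page(struct vm_area_struct *vma, pgoff_t pgoff);

static int my_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct page *page;

        if (vmf->pgoff >= my_nr_pages(vma))
                return VM_FAULT_SIGSEGV;        /* out-of-range access */

        page = my_lookup_page(vma, vmf->pgoff);
        if (!page)
                return VM_FAULT_SIGBUS;

        get_page(page);
        vmf->page = page;
        return 0;
}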
@@ -1119,7 +1147,6 @@ extern void user_shm_unlock(size_t, struct user_struct *);
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
-        struct vm_area_struct *nonlinear_vma;   /* Check page->index if set */
         struct address_space *check_mapping;    /* Check page->mapping if set */
         pgoff_t first_index;                    /* Lowest page->index to unmap */
         pgoff_t last_index;                     /* Highest page->index to unmap */
@@ -1137,8 +1164,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,

 /**
  * mm_walk - callbacks for walk_page_range
- * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
- * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry
  * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry
  *             this handler is required to be able to handle
  *             pmd_trans_huge() pmds. They may simply choose to
@@ -1146,16 +1171,18 @@ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
  * @pte_entry: if set, called for each non-empty PTE (4th-level) entry
  * @pte_hole: if set, called for each hole at all levels
  * @hugetlb_entry: if set, called for each hugetlb entry
- *                 *Caution*: The caller must hold mmap_sem() if @hugetlb_entry
- *                 is used.
+ * @test_walk: caller specific callback function to determine whether
+ *             we walk over the current vma or not. A positive returned
+ *             value means "do page table walk over the current vma,"
+ *             and a negative one means "abort current page table walk
+ *             right now." 0 means "skip the current vma."
+ * @mm: mm_struct representing the target process of page table walk
+ * @vma: vma currently walked (NULL if walking outside vmas)
+ * @private: private data for callbacks' usage
  *
- * (see walk_page_range for more details)
+ * (see the comment on walk_page_range() for more details)
  */
 struct mm_walk {
-        int (*pgd_entry)(pgd_t *pgd, unsigned long addr,
-                         unsigned long next, struct mm_walk *walk);
-        int (*pud_entry)(pud_t *pud, unsigned long addr,
-                         unsigned long next, struct mm_walk *walk);
         int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
                          unsigned long next, struct mm_walk *walk);
         int (*pte_entry)(pte_t *pte, unsigned long addr,
@@ -1165,12 +1192,16 @@ struct mm_walk {
         int (*hugetlb_entry)(pte_t *pte, unsigned long hmask,
                              unsigned long addr, unsigned long next,
                              struct mm_walk *walk);
+        int (*test_walk)(unsigned long addr, unsigned long next,
+                         struct mm_walk *walk);
         struct mm_struct *mm;
+        struct vm_area_struct *vma;
         void *private;
 };

 int walk_page_range(unsigned long addr, unsigned long end,
                 struct mm_walk *walk);
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk);
 void free_pgd_range(struct mmu_gather *tlb, unsigned long addr,
                 unsigned long end, unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
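
A sketch of a walker built on the reworked API, following the semantics documented above (a positive return from ->test_walk walks the vma, 0 skips it, a negative value aborts). The my_* names are illustrative, and mmap_sem is assumed to be held by the caller:

/* Skip VMAs we cannot meaningfully inspect. */
static int my_test_walk(unsigned long addr, unsigned long next,
                        struct mm_walk *walk)
{
        if (walk->vma->vm_flags & VM_PFNMAP)
                return 0;       /* skip this vma */
        return 1;               /* walk it */
}

/* Count present PTEs into the caller-supplied counter. */
static int my_pte_entry(pte_t *pte, unsigned long addr,
                        unsigned long next, struct mm_walk *walk)
{
        unsigned long *count = walk->private;

        if (pte_present(*pte))
                (*count)++;
        return 0;
}

static unsigned long my_count_present(struct mm_struct *mm,
                                      unsigned long start, unsigned long end)
{
        unsigned long count = 0;
        struct mm_walk my_walk = {
                .pte_entry = my_pte_entry,
                .test_walk = my_test_walk,
                .mm        = mm,
                .private   = &count,
        };

        walk_page_range(start, end, &my_walk);
        return count;
}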
@@ -1234,6 +1265,17 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long start, unsigned long nr_pages,
                     int write, int force, struct page **pages,
                     struct vm_area_struct **vmas);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+                    unsigned long start, unsigned long nr_pages,
+                    int write, int force, struct page **pages,
+                    int *locked);
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+                    unsigned long start, unsigned long nr_pages,
+                    int write, int force, struct page **pages,
+                    unsigned int gup_flags);
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+                    unsigned long start, unsigned long nr_pages,
+                    int write, int force, struct page **pages);
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                         struct page **pages);
 struct kvec;
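
A hedged caller-side example of the new variants (my_pin_user_page is a hypothetical helper): the classic get_user_pages() requires the caller to hold mmap_sem, while get_user_pages_unlocked() takes and releases it internally and may drop it while faulting pages in:

/* Pin a single user page without the caller holding mmap_sem. */
static struct page *my_pin_user_page(struct task_struct *tsk,
                                     struct mm_struct *mm,
                                     unsigned long uaddr, int write)
{
        struct page *page;
        long ret;

        ret = get_user_pages_unlocked(tsk, mm, uaddr, 1, write, 0, &page);
        if (ret != 1)
                return NULL;

        return page;            /* caller releases with put_page() */
}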
@@ -1411,8 +1453,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 {
         return 0;
 }
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+        return 0;
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
+
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+        return atomic_long_read(&mm->nr_pmds);
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm)
+{
+        atomic_long_inc(&mm->nr_pmds);
+}
+
+static inline void mm_dec_nr_pmds(struct mm_struct *mm)
+{
+        atomic_long_dec(&mm->nr_pmds);
+}
 #endif

 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -1775,12 +1841,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
         for (vma = vma_interval_tree_iter_first(root, start, last);    \
              vma; vma = vma_interval_tree_iter_next(vma, start, last))

-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
-                                        struct list_head *list)
-{
-        list_add_tail(&vma->shared.nonlinear, list);
-}
-
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
                                    struct rb_root *root);
 void anon_vma_interval_tree_remove(struct anon_vma_chain *node,