summaryrefslogtreecommitdiffstats
path: root/fs/hugetlbfs/inode.c
diff options
context:
space:
mode:
authorMike Kravetz <mike.kravetz@oracle.com>2015-09-08 18:01:54 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2015-09-08 18:35:28 -0400
commit70c3547e36f5c9fbc4caecfeca98f0effa6932c5 (patch)
tree418e7d852a326516291553a41fdc965ece2290b2 /fs/hugetlbfs/inode.c
parentab76ad540a50191308e5bb6b5e2d9e26c78616d3 (diff)
hugetlbfs: add hugetlbfs_fallocate()
This is based on the shmem version, but it has diverged quite a bit. We have no swap to worry about, nor the new file sealing. Add synchronization via the fault mutex table to coordinate page faults, fallocate allocation and fallocate hole punch. What this allows us to do is move physical memory in and out of a hugetlbfs file without having it mapped. This also gives us the ability to support MADV_REMOVE since it is currently implemented using fallocate(). MADV_REMOVE lets madvise() remove pages from the middle of a hugetlbfs file, which wasn't possible before. hugetlbfs fallocate only operates on whole huge pages. Based on code by Dave Hansen. Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: David Rientjes <rientjes@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Michal Hocko <mhocko@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/hugetlbfs/inode.c')
-rw-r--r--fs/hugetlbfs/inode.c179
1 files changed, 178 insertions, 1 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1ef630f81c99..316adb968b65 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -12,6 +12,7 @@
12#include <linux/thread_info.h> 12#include <linux/thread_info.h>
13#include <asm/current.h> 13#include <asm/current.h>
14#include <linux/sched.h> /* remove ASAP */ 14#include <linux/sched.h> /* remove ASAP */
15#include <linux/falloc.h>
15#include <linux/fs.h> 16#include <linux/fs.h>
16#include <linux/mount.h> 17#include <linux/mount.h>
17#include <linux/file.h> 18#include <linux/file.h>
@@ -84,6 +85,29 @@ static const match_table_t tokens = {
84 {Opt_err, NULL}, 85 {Opt_err, NULL},
85}; 86};
86 87
#ifdef CONFIG_NUMA
/*
 * Attach a NUMA allocation policy to a pseudo vma so the huge page
 * allocator can honor the file's shared mempolicy.  The page index is
 * the lookup key into the per-inode shared policy tree.  Must be paired
 * with hugetlb_drop_vma_policy() once the allocation is done, since the
 * lookup may take a reference on the returned policy.
 */
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
					struct inode *inode, pgoff_t index)
{
	vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy,
							index);
}

/* Drop the policy reference (if any) taken by hugetlb_set_vma_policy(). */
static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
	mpol_cond_put(vma->vm_policy);
}
#else
/* !CONFIG_NUMA: there are no mempolicies, so these are no-ops. */
static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma,
					struct inode *inode, pgoff_t index)
{
}

static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
{
}
#endif
110
87static void huge_pagevec_release(struct pagevec *pvec) 111static void huge_pagevec_release(struct pagevec *pvec)
88{ 112{
89 int i; 113 int i;
@@ -479,6 +503,158 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
479 return 0; 503 return 0;
480} 504}
481 505
/*
 * Punch a hole in a hugetlbfs file: unmap and remove all whole huge
 * pages fully contained in [offset, offset + len).  Partial huge pages
 * at either end are left intact, since hugetlbfs can only operate on
 * whole huge pages.  Returns 0 (partial-page requests are a no-op, not
 * an error).
 */
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
	struct hstate *h = hstate_inode(inode);
	loff_t hpage_size = huge_page_size(h);
	loff_t hole_start, hole_end;

	/*
	 * For hole punch round up the beginning offset of the hole and
	 * round down the end.
	 */
	hole_start = round_up(offset, hpage_size);
	hole_end = round_down(offset + len, hpage_size);

	if (hole_end > hole_start) {
		struct address_space *mapping = inode->i_mapping;

		/*
		 * i_mutex serializes against fallocate allocation;
		 * i_mmap_lock_write protects the rmap tree while existing
		 * mappings of the range are torn down.
		 */
		mutex_lock(&inode->i_mutex);
		i_mmap_lock_write(mapping);
		if (!RB_EMPTY_ROOT(&mapping->i_mmap))
			hugetlb_vmdelete_list(&mapping->i_mmap,
						hole_start >> PAGE_SHIFT,
						hole_end  >> PAGE_SHIFT);
		i_mmap_unlock_write(mapping);
		/* Now drop the pages themselves from the page cache. */
		remove_inode_hugepages(inode, hole_start, hole_end);
		mutex_unlock(&inode->i_mutex);
	}

	return 0;
}
535
/*
 * fallocate() for hugetlbfs: preallocate huge pages backing the byte
 * range [offset, offset + len), or punch a hole in that range when
 * FALLOC_FL_PUNCH_HOLE is set.  Only whole huge pages are operated on;
 * for preallocation the range is expanded outward to huge page
 * boundaries.  Returns 0 on success or a negative errno.
 */
static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
				loff_t len)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct hstate *h = hstate_inode(inode);
	/* Stand-in vma: the huge page allocator requires a vma argument. */
	struct vm_area_struct pseudo_vma;
	struct mm_struct *mm = current->mm;
	loff_t hpage_size = huge_page_size(h);
	unsigned long hpage_shift = huge_page_shift(h);
	pgoff_t start, index, end;
	int error;
	u32 hash;

	/* Only hole punch and (optionally KEEP_SIZE) preallocation. */
	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (mode & FALLOC_FL_PUNCH_HOLE)
		return hugetlbfs_punch_hole(inode, offset, len);

	/*
	 * Default preallocate case.
	 * For this range, start is rounded down and end is rounded up
	 * as well as being converted to page offsets.
	 */
	start = offset >> hpage_shift;
	end = (offset + len + hpage_size - 1) >> hpage_shift;

	mutex_lock(&inode->i_mutex);

	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
	error = inode_newsize_ok(inode, offset + len);
	if (error)
		goto out;

	/*
	 * Initialize a pseudo vma as this is required by the huge page
	 * allocation routines.  If NUMA is configured, use page index
	 * as input to create an allocation policy.
	 */
	memset(&pseudo_vma, 0, sizeof(struct vm_area_struct));
	pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
	pseudo_vma.vm_file = file;

	for (index = start; index < end; index++) {
		/*
		 * This is supposed to be the vaddr where the page is being
		 * faulted in, but we have no vaddr here.
		 */
		struct page *page;
		unsigned long addr;
		int avoid_reserve = 0;

		/* Preallocation can be long-running; be preemption friendly. */
		cond_resched();

		/*
		 * fallocate(2) manpage permits EINTR; we may have been
		 * interrupted because we are using up too much memory.
		 */
		if (signal_pending(current)) {
			error = -EINTR;
			break;
		}

		/* Set numa allocation policy based on index */
		hugetlb_set_vma_policy(&pseudo_vma, inode, index);

		/* addr is the offset within the file (zero based) */
		addr = index * hpage_size;

		/*
		 * mutex taken here, fault path and hole punch: serializes
		 * against concurrent faults/punches on the same page index
		 * via the shared fault mutex hash table.
		 */
		hash = hugetlb_fault_mutex_hash(h, mm, &pseudo_vma, mapping,
						index, addr);
		mutex_lock(&hugetlb_fault_mutex_table[hash]);

		/* See if already present in mapping to avoid alloc/free */
		page = find_get_page(mapping, index);
		if (page) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			hugetlb_drop_vma_policy(&pseudo_vma);
			continue;
		}

		/* Allocate page and add to page cache */
		page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
		hugetlb_drop_vma_policy(&pseudo_vma);
		if (IS_ERR(page)) {
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			error = PTR_ERR(page);
			goto out;
		}
		/* Zero the page before it becomes visible via the cache. */
		clear_huge_page(page, addr, pages_per_huge_page(h));
		__SetPageUptodate(page);
		error = huge_add_to_page_cache(page, mapping, index);
		if (unlikely(error)) {
			put_page(page);
			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
			goto out;
		}

		mutex_unlock(&hugetlb_fault_mutex_table[hash]);

		/*
		 * put_page() drops the reference from alloc_huge_page();
		 * unlock_page() because the page was locked when added to
		 * the page cache above (which retains its own reference).
		 */
		put_page(page);
		unlock_page(page);
	}

	/* Grow i_size unless the caller asked us to keep it unchanged. */
	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
		i_size_write(inode, offset + len);
	inode->i_ctime = CURRENT_TIME;
	spin_lock(&inode->i_lock);
	inode->i_private = NULL;
	spin_unlock(&inode->i_lock);
out:
	mutex_unlock(&inode->i_mutex);
	return error;
}
657
482static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) 658static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
483{ 659{
484 struct inode *inode = d_inode(dentry); 660 struct inode *inode = d_inode(dentry);
@@ -790,7 +966,8 @@ const struct file_operations hugetlbfs_file_operations = {
790 .mmap = hugetlbfs_file_mmap, 966 .mmap = hugetlbfs_file_mmap,
791 .fsync = noop_fsync, 967 .fsync = noop_fsync,
792 .get_unmapped_area = hugetlb_get_unmapped_area, 968 .get_unmapped_area = hugetlb_get_unmapped_area,
793 .llseek = default_llseek, 969 .llseek = default_llseek,
970 .fallocate = hugetlbfs_fallocate,
794}; 971};
795 972
796static const struct inode_operations hugetlbfs_dir_inode_operations = { 973static const struct inode_operations hugetlbfs_dir_inode_operations = {