aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/aio.c44
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/raid56.c1
-rw-r--r--fs/btrfs/sysfs.c8
-rw-r--r--fs/btrfs/transaction.c20
-rw-r--r--fs/dcache.c21
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/hugetlbfs/inode.c17
-rw-r--r--fs/namei.c5
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/pnfs.c13
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/write.c83
-rw-r--r--fs/nfsd/nfs4state.c62
-rw-r--r--fs/overlayfs/Kconfig14
-rw-r--r--fs/overlayfs/export.c216
-rw-r--r--fs/overlayfs/inode.c58
-rw-r--r--fs/overlayfs/namei.c6
-rw-r--r--fs/overlayfs/overlayfs.h1
-rw-r--r--fs/overlayfs/super.c1
-rw-r--r--fs/sysfs/symlink.c1
-rw-r--r--fs/xfs/xfs_iomap.c42
22 files changed, 436 insertions, 185 deletions
diff --git a/fs/aio.c b/fs/aio.c
index a062d75109cb..6bcd3fb5265a 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,9 +68,9 @@ struct aio_ring {
68#define AIO_RING_PAGES 8 68#define AIO_RING_PAGES 8
69 69
70struct kioctx_table { 70struct kioctx_table {
71 struct rcu_head rcu; 71 struct rcu_head rcu;
72 unsigned nr; 72 unsigned nr;
73 struct kioctx *table[]; 73 struct kioctx __rcu *table[];
74}; 74};
75 75
76struct kioctx_cpu { 76struct kioctx_cpu {
@@ -115,7 +115,8 @@ struct kioctx {
115 struct page **ring_pages; 115 struct page **ring_pages;
116 long nr_pages; 116 long nr_pages;
117 117
118 struct work_struct free_work; 118 struct rcu_head free_rcu;
119 struct work_struct free_work; /* see free_ioctx() */
119 120
120 /* 121 /*
121 * signals when all in-flight requests are done 122 * signals when all in-flight requests are done
@@ -329,7 +330,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
329 for (i = 0; i < table->nr; i++) { 330 for (i = 0; i < table->nr; i++) {
330 struct kioctx *ctx; 331 struct kioctx *ctx;
331 332
332 ctx = table->table[i]; 333 ctx = rcu_dereference(table->table[i]);
333 if (ctx && ctx->aio_ring_file == file) { 334 if (ctx && ctx->aio_ring_file == file) {
334 if (!atomic_read(&ctx->dead)) { 335 if (!atomic_read(&ctx->dead)) {
335 ctx->user_id = ctx->mmap_base = vma->vm_start; 336 ctx->user_id = ctx->mmap_base = vma->vm_start;
@@ -588,6 +589,12 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
588 return cancel(&kiocb->common); 589 return cancel(&kiocb->common);
589} 590}
590 591
592/*
593 * free_ioctx() should be RCU delayed to synchronize against the RCU
594 * protected lookup_ioctx() and also needs process context to call
595 * aio_free_ring(), so the double bouncing through kioctx->free_rcu and
596 * ->free_work.
597 */
591static void free_ioctx(struct work_struct *work) 598static void free_ioctx(struct work_struct *work)
592{ 599{
593 struct kioctx *ctx = container_of(work, struct kioctx, free_work); 600 struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -601,6 +608,14 @@ static void free_ioctx(struct work_struct *work)
601 kmem_cache_free(kioctx_cachep, ctx); 608 kmem_cache_free(kioctx_cachep, ctx);
602} 609}
603 610
611static void free_ioctx_rcufn(struct rcu_head *head)
612{
613 struct kioctx *ctx = container_of(head, struct kioctx, free_rcu);
614
615 INIT_WORK(&ctx->free_work, free_ioctx);
616 schedule_work(&ctx->free_work);
617}
618
604static void free_ioctx_reqs(struct percpu_ref *ref) 619static void free_ioctx_reqs(struct percpu_ref *ref)
605{ 620{
606 struct kioctx *ctx = container_of(ref, struct kioctx, reqs); 621 struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
@@ -609,8 +624,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
609 if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) 624 if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
610 complete(&ctx->rq_wait->comp); 625 complete(&ctx->rq_wait->comp);
611 626
612 INIT_WORK(&ctx->free_work, free_ioctx); 627 /* Synchronize against RCU protected table->table[] dereferences */
613 schedule_work(&ctx->free_work); 628 call_rcu(&ctx->free_rcu, free_ioctx_rcufn);
614} 629}
615 630
616/* 631/*
@@ -651,9 +666,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
651 while (1) { 666 while (1) {
652 if (table) 667 if (table)
653 for (i = 0; i < table->nr; i++) 668 for (i = 0; i < table->nr; i++)
654 if (!table->table[i]) { 669 if (!rcu_access_pointer(table->table[i])) {
655 ctx->id = i; 670 ctx->id = i;
656 table->table[i] = ctx; 671 rcu_assign_pointer(table->table[i], ctx);
657 spin_unlock(&mm->ioctx_lock); 672 spin_unlock(&mm->ioctx_lock);
658 673
659 /* While kioctx setup is in progress, 674 /* While kioctx setup is in progress,
@@ -834,11 +849,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
834 } 849 }
835 850
836 table = rcu_dereference_raw(mm->ioctx_table); 851 table = rcu_dereference_raw(mm->ioctx_table);
837 WARN_ON(ctx != table->table[ctx->id]); 852 WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
838 table->table[ctx->id] = NULL; 853 RCU_INIT_POINTER(table->table[ctx->id], NULL);
839 spin_unlock(&mm->ioctx_lock); 854 spin_unlock(&mm->ioctx_lock);
840 855
841 /* percpu_ref_kill() will do the necessary call_rcu() */ 856 /* free_ioctx_reqs() will do the necessary RCU synchronization */
842 wake_up_all(&ctx->wait); 857 wake_up_all(&ctx->wait);
843 858
844 /* 859 /*
@@ -880,7 +895,8 @@ void exit_aio(struct mm_struct *mm)
880 895
881 skipped = 0; 896 skipped = 0;
882 for (i = 0; i < table->nr; ++i) { 897 for (i = 0; i < table->nr; ++i) {
883 struct kioctx *ctx = table->table[i]; 898 struct kioctx *ctx =
899 rcu_dereference_protected(table->table[i], true);
884 900
885 if (!ctx) { 901 if (!ctx) {
886 skipped++; 902 skipped++;
@@ -1069,7 +1085,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
1069 if (!table || id >= table->nr) 1085 if (!table || id >= table->nr)
1070 goto out; 1086 goto out;
1071 1087
1072 ctx = table->table[id]; 1088 ctx = rcu_dereference(table->table[id]);
1073 if (ctx && ctx->user_id == ctx_id) { 1089 if (ctx && ctx->user_id == ctx_id) {
1074 percpu_ref_get(&ctx->users); 1090 percpu_ref_get(&ctx->users);
1075 ret = ctx; 1091 ret = ctx;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f94b2d8c744a..26484648d090 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1519,6 +1519,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
1519 if (!node) 1519 if (!node)
1520 break; 1520 break;
1521 bytenr = node->val; 1521 bytenr = node->val;
1522 shared.share_count = 0;
1522 cond_resched(); 1523 cond_resched();
1523 } 1524 }
1524 1525
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index dec0907dfb8a..fcfc20de2df3 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1370,6 +1370,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1370 stripe_start = stripe->physical; 1370 stripe_start = stripe->physical;
1371 if (physical >= stripe_start && 1371 if (physical >= stripe_start &&
1372 physical < stripe_start + rbio->stripe_len && 1372 physical < stripe_start + rbio->stripe_len &&
1373 stripe->dev->bdev &&
1373 bio->bi_disk == stripe->dev->bdev->bd_disk && 1374 bio->bi_disk == stripe->dev->bdev->bd_disk &&
1374 bio->bi_partno == stripe->dev->bdev->bd_partno) { 1375 bio->bi_partno == stripe->dev->bdev->bd_partno) {
1375 return i; 1376 return i;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index d11c70bff5a9..a8bafed931f4 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -423,7 +423,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj,
423{ 423{
424 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 424 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
425 425
426 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->nodesize); 426 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize);
427} 427}
428 428
429BTRFS_ATTR(, nodesize, btrfs_nodesize_show); 429BTRFS_ATTR(, nodesize, btrfs_nodesize_show);
@@ -433,7 +433,8 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj,
433{ 433{
434 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 434 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
435 435
436 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->sectorsize); 436 return snprintf(buf, PAGE_SIZE, "%u\n",
437 fs_info->super_copy->sectorsize);
437} 438}
438 439
439BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); 440BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show);
@@ -443,7 +444,8 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj,
443{ 444{
444 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 445 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
445 446
446 return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->sectorsize); 447 return snprintf(buf, PAGE_SIZE, "%u\n",
448 fs_info->super_copy->sectorsize);
447} 449}
448 450
449BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); 451BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 9220f004001c..04f07144b45c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1722,23 +1722,19 @@ static void update_super_roots(struct btrfs_fs_info *fs_info)
1722 1722
1723 super = fs_info->super_copy; 1723 super = fs_info->super_copy;
1724 1724
1725 /* update latest btrfs_super_block::chunk_root refs */
1726 root_item = &fs_info->chunk_root->root_item; 1725 root_item = &fs_info->chunk_root->root_item;
1727 btrfs_set_super_chunk_root(super, root_item->bytenr); 1726 super->chunk_root = root_item->bytenr;
1728 btrfs_set_super_chunk_root_generation(super, root_item->generation); 1727 super->chunk_root_generation = root_item->generation;
1729 btrfs_set_super_chunk_root_level(super, root_item->level); 1728 super->chunk_root_level = root_item->level;
1730 1729
1731 /* update latest btrfs_super_block::root refs */
1732 root_item = &fs_info->tree_root->root_item; 1730 root_item = &fs_info->tree_root->root_item;
1733 btrfs_set_super_root(super, root_item->bytenr); 1731 super->root = root_item->bytenr;
1734 btrfs_set_super_generation(super, root_item->generation); 1732 super->generation = root_item->generation;
1735 btrfs_set_super_root_level(super, root_item->level); 1733 super->root_level = root_item->level;
1736
1737 if (btrfs_test_opt(fs_info, SPACE_CACHE)) 1734 if (btrfs_test_opt(fs_info, SPACE_CACHE))
1738 btrfs_set_super_cache_generation(super, root_item->generation); 1735 super->cache_generation = root_item->generation;
1739 if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) 1736 if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))
1740 btrfs_set_super_uuid_tree_generation(super, 1737 super->uuid_tree_generation = root_item->generation;
1741 root_item->generation);
1742} 1738}
1743 1739
1744int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1740int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
diff --git a/fs/dcache.c b/fs/dcache.c
index 7c38f39958bc..8945e6cabd93 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -647,11 +647,16 @@ again:
647 spin_unlock(&parent->d_lock); 647 spin_unlock(&parent->d_lock);
648 goto again; 648 goto again;
649 } 649 }
650 rcu_read_unlock(); 650 if (parent != dentry) {
651 if (parent != dentry)
652 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 651 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
653 else 652 if (unlikely(dentry->d_lockref.count < 0)) {
653 spin_unlock(&parent->d_lock);
654 parent = NULL;
655 }
656 } else {
654 parent = NULL; 657 parent = NULL;
658 }
659 rcu_read_unlock();
655 return parent; 660 return parent;
656} 661}
657 662
@@ -2474,7 +2479,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
2474 2479
2475retry: 2480retry:
2476 rcu_read_lock(); 2481 rcu_read_lock();
2477 seq = smp_load_acquire(&parent->d_inode->i_dir_seq) & ~1; 2482 seq = smp_load_acquire(&parent->d_inode->i_dir_seq);
2478 r_seq = read_seqbegin(&rename_lock); 2483 r_seq = read_seqbegin(&rename_lock);
2479 dentry = __d_lookup_rcu(parent, name, &d_seq); 2484 dentry = __d_lookup_rcu(parent, name, &d_seq);
2480 if (unlikely(dentry)) { 2485 if (unlikely(dentry)) {
@@ -2495,8 +2500,14 @@ retry:
2495 rcu_read_unlock(); 2500 rcu_read_unlock();
2496 goto retry; 2501 goto retry;
2497 } 2502 }
2503
2504 if (unlikely(seq & 1)) {
2505 rcu_read_unlock();
2506 goto retry;
2507 }
2508
2498 hlist_bl_lock(b); 2509 hlist_bl_lock(b);
2499 if (unlikely(parent->d_inode->i_dir_seq != seq)) { 2510 if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) {
2500 hlist_bl_unlock(b); 2511 hlist_bl_unlock(b);
2501 rcu_read_unlock(); 2512 rcu_read_unlock();
2502 goto retry; 2513 goto retry;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 86d6a4435c87..51f940e76c5e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -807,9 +807,6 @@ do_alloc:
807 iomap->length = hole_size(inode, lblock, &mp); 807 iomap->length = hole_size(inode, lblock, &mp);
808 else 808 else
809 iomap->length = size - pos; 809 iomap->length = size - pos;
810 } else {
811 if (height <= ip->i_height)
812 iomap->length = hole_size(inode, lblock, &mp);
813 } 810 }
814 goto out_release; 811 goto out_release;
815} 812}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8fe1b0aa2896..b9a254dcc0e7 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec)
108 pagevec_reinit(pvec); 108 pagevec_reinit(pvec);
109} 109}
110 110
111/*
112 * Mask used when checking the page offset value passed in via system
113 * calls. This value will be converted to a loff_t which is signed.
114 * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the
115 * value. The extra bit (- 1 in the shift value) is to take the sign
116 * bit into account.
117 */
118#define PGOFF_LOFFT_MAX \
119 (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
120
111static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) 121static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
112{ 122{
113 struct inode *inode = file_inode(file); 123 struct inode *inode = file_inode(file);
@@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
127 vma->vm_ops = &hugetlb_vm_ops; 137 vma->vm_ops = &hugetlb_vm_ops;
128 138
129 /* 139 /*
130 * Offset passed to mmap (before page shift) could have been 140 * page based offset in vm_pgoff could be sufficiently large to
131 * negative when represented as a (l)off_t. 141 * overflow a (l)off_t when converted to byte offset.
132 */ 142 */
133 if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) 143 if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
134 return -EINVAL; 144 return -EINVAL;
135 145
146 /* must be huge page aligned */
136 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) 147 if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
137 return -EINVAL; 148 return -EINVAL;
138 149
diff --git a/fs/namei.c b/fs/namei.c
index 921ae32dbc80..cafa365eeb70 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd)
559static bool path_connected(const struct path *path) 559static bool path_connected(const struct path *path)
560{ 560{
561 struct vfsmount *mnt = path->mnt; 561 struct vfsmount *mnt = path->mnt;
562 struct super_block *sb = mnt->mnt_sb;
562 563
563 /* Only bind mounts can have disconnected paths */ 564 /* Bind mounts and multi-root filesystems can have disconnected paths */
564 if (mnt->mnt_root == mnt->mnt_sb->s_root) 565 if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root))
565 return true; 566 return true;
566 567
567 return is_subdir(path->dentry, mnt->mnt_root); 568 return is_subdir(path->dentry, mnt->mnt_root);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 8c10b0562e75..621c517b325c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -86,10 +86,10 @@ struct nfs_direct_req {
86 struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; 86 struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
87 int mirror_count; 87 int mirror_count;
88 88
89 loff_t io_start; /* Start offset for I/O */
89 ssize_t count, /* bytes actually processed */ 90 ssize_t count, /* bytes actually processed */
90 max_count, /* max expected count */ 91 max_count, /* max expected count */
91 bytes_left, /* bytes left to be sent */ 92 bytes_left, /* bytes left to be sent */
92 io_start, /* start of IO */
93 error; /* any reported error */ 93 error; /* any reported error */
94 struct completion completion; /* wait for i/o completion */ 94 struct completion completion; /* wait for i/o completion */
95 95
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index c13e826614b5..ee723aa153a3 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -292,8 +292,11 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
292void 292void
293pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) 293pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
294{ 294{
295 struct inode *inode = lo->plh_inode; 295 struct inode *inode;
296 296
297 if (!lo)
298 return;
299 inode = lo->plh_inode;
297 pnfs_layoutreturn_before_put_layout_hdr(lo); 300 pnfs_layoutreturn_before_put_layout_hdr(lo);
298 301
299 if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 302 if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
@@ -1241,10 +1244,12 @@ retry:
1241 spin_lock(&ino->i_lock); 1244 spin_lock(&ino->i_lock);
1242 lo = nfsi->layout; 1245 lo = nfsi->layout;
1243 if (!lo || !pnfs_layout_is_valid(lo) || 1246 if (!lo || !pnfs_layout_is_valid(lo) ||
1244 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1247 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
1248 lo = NULL;
1245 goto out_noroc; 1249 goto out_noroc;
1250 }
1251 pnfs_get_layout_hdr(lo);
1246 if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { 1252 if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
1247 pnfs_get_layout_hdr(lo);
1248 spin_unlock(&ino->i_lock); 1253 spin_unlock(&ino->i_lock);
1249 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, 1254 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
1250 TASK_UNINTERRUPTIBLE); 1255 TASK_UNINTERRUPTIBLE);
@@ -1312,10 +1317,12 @@ out_noroc:
1312 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; 1317 struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
1313 if (ld->prepare_layoutreturn) 1318 if (ld->prepare_layoutreturn)
1314 ld->prepare_layoutreturn(args); 1319 ld->prepare_layoutreturn(args);
1320 pnfs_put_layout_hdr(lo);
1315 return true; 1321 return true;
1316 } 1322 }
1317 if (layoutreturn) 1323 if (layoutreturn)
1318 pnfs_send_layoutreturn(lo, &stateid, iomode, true); 1324 pnfs_send_layoutreturn(lo, &stateid, iomode, true);
1325 pnfs_put_layout_hdr(lo);
1319 return false; 1326 return false;
1320} 1327}
1321 1328
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 29bacdc56f6a..5e470e233c83 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
2631 /* initial superblock/root creation */ 2631 /* initial superblock/root creation */
2632 mount_info->fill_super(s, mount_info); 2632 mount_info->fill_super(s, mount_info);
2633 nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); 2633 nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned);
2634 if (!(server->flags & NFS_MOUNT_UNSHARED))
2635 s->s_iflags |= SB_I_MULTIROOT;
2634 } 2636 }
2635 2637
2636 mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); 2638 mntroot = nfs_get_root(s, mount_info->mntfh, dev_name);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7428a669d7a7..e7d8ceae8f26 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1876,40 +1876,43 @@ int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
1876 return status; 1876 return status;
1877} 1877}
1878 1878
1879int nfs_commit_inode(struct inode *inode, int how) 1879static int __nfs_commit_inode(struct inode *inode, int how,
1880 struct writeback_control *wbc)
1880{ 1881{
1881 LIST_HEAD(head); 1882 LIST_HEAD(head);
1882 struct nfs_commit_info cinfo; 1883 struct nfs_commit_info cinfo;
1883 int may_wait = how & FLUSH_SYNC; 1884 int may_wait = how & FLUSH_SYNC;
1884 int error = 0; 1885 int ret, nscan;
1885 int res;
1886 1886
1887 nfs_init_cinfo_from_inode(&cinfo, inode); 1887 nfs_init_cinfo_from_inode(&cinfo, inode);
1888 nfs_commit_begin(cinfo.mds); 1888 nfs_commit_begin(cinfo.mds);
1889 res = nfs_scan_commit(inode, &head, &cinfo); 1889 for (;;) {
1890 if (res) 1890 ret = nscan = nfs_scan_commit(inode, &head, &cinfo);
1891 error = nfs_generic_commit_list(inode, &head, how, &cinfo); 1891 if (ret <= 0)
1892 break;
1893 ret = nfs_generic_commit_list(inode, &head, how, &cinfo);
1894 if (ret < 0)
1895 break;
1896 ret = 0;
1897 if (wbc && wbc->sync_mode == WB_SYNC_NONE) {
1898 if (nscan < wbc->nr_to_write)
1899 wbc->nr_to_write -= nscan;
1900 else
1901 wbc->nr_to_write = 0;
1902 }
1903 if (nscan < INT_MAX)
1904 break;
1905 cond_resched();
1906 }
1892 nfs_commit_end(cinfo.mds); 1907 nfs_commit_end(cinfo.mds);
1893 if (res == 0) 1908 if (ret || !may_wait)
1894 return res; 1909 return ret;
1895 if (error < 0) 1910 return wait_on_commit(cinfo.mds);
1896 goto out_error; 1911}
1897 if (!may_wait) 1912
1898 goto out_mark_dirty; 1913int nfs_commit_inode(struct inode *inode, int how)
1899 error = wait_on_commit(cinfo.mds); 1914{
1900 if (error < 0) 1915 return __nfs_commit_inode(inode, how, NULL);
1901 return error;
1902 return res;
1903out_error:
1904 res = error;
1905 /* Note: If we exit without ensuring that the commit is complete,
1906 * we must mark the inode as dirty. Otherwise, future calls to
1907 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1908 * that the data is on the disk.
1909 */
1910out_mark_dirty:
1911 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1912 return res;
1913} 1916}
1914EXPORT_SYMBOL_GPL(nfs_commit_inode); 1917EXPORT_SYMBOL_GPL(nfs_commit_inode);
1915 1918
@@ -1919,11 +1922,11 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1919 int flags = FLUSH_SYNC; 1922 int flags = FLUSH_SYNC;
1920 int ret = 0; 1923 int ret = 0;
1921 1924
1922 /* no commits means nothing needs to be done */
1923 if (!atomic_long_read(&nfsi->commit_info.ncommit))
1924 return ret;
1925
1926 if (wbc->sync_mode == WB_SYNC_NONE) { 1925 if (wbc->sync_mode == WB_SYNC_NONE) {
1926 /* no commits means nothing needs to be done */
1927 if (!atomic_long_read(&nfsi->commit_info.ncommit))
1928 goto check_requests_outstanding;
1929
1927 /* Don't commit yet if this is a non-blocking flush and there 1930 /* Don't commit yet if this is a non-blocking flush and there
1928 * are a lot of outstanding writes for this mapping. 1931 * are a lot of outstanding writes for this mapping.
1929 */ 1932 */
@@ -1934,16 +1937,16 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
1934 flags = 0; 1937 flags = 0;
1935 } 1938 }
1936 1939
1937 ret = nfs_commit_inode(inode, flags); 1940 ret = __nfs_commit_inode(inode, flags, wbc);
1938 if (ret >= 0) { 1941 if (!ret) {
1939 if (wbc->sync_mode == WB_SYNC_NONE) { 1942 if (flags & FLUSH_SYNC)
1940 if (ret < wbc->nr_to_write) 1943 return 0;
1941 wbc->nr_to_write -= ret; 1944 } else if (atomic_long_read(&nfsi->commit_info.ncommit))
1942 else 1945 goto out_mark_dirty;
1943 wbc->nr_to_write = 0; 1946
1944 } 1947check_requests_outstanding:
1945 return 0; 1948 if (!atomic_read(&nfsi->commit_info.rpcs_out))
1946 } 1949 return ret;
1947out_mark_dirty: 1950out_mark_dirty:
1948 __mark_inode_dirty(inode, I_DIRTY_DATASYNC); 1951 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1949 return ret; 1952 return ret;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 150521c9671b..61b770e39809 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -268,6 +268,35 @@ free_blocked_lock(struct nfsd4_blocked_lock *nbl)
268 kfree(nbl); 268 kfree(nbl);
269} 269}
270 270
271static void
272remove_blocked_locks(struct nfs4_lockowner *lo)
273{
274 struct nfs4_client *clp = lo->lo_owner.so_client;
275 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
276 struct nfsd4_blocked_lock *nbl;
277 LIST_HEAD(reaplist);
278
279 /* Dequeue all blocked locks */
280 spin_lock(&nn->blocked_locks_lock);
281 while (!list_empty(&lo->lo_blocked)) {
282 nbl = list_first_entry(&lo->lo_blocked,
283 struct nfsd4_blocked_lock,
284 nbl_list);
285 list_del_init(&nbl->nbl_list);
286 list_move(&nbl->nbl_lru, &reaplist);
287 }
288 spin_unlock(&nn->blocked_locks_lock);
289
290 /* Now free them */
291 while (!list_empty(&reaplist)) {
292 nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock,
293 nbl_lru);
294 list_del_init(&nbl->nbl_lru);
295 posix_unblock_lock(&nbl->nbl_lock);
296 free_blocked_lock(nbl);
297 }
298}
299
271static int 300static int
272nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) 301nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
273{ 302{
@@ -1866,6 +1895,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp)
1866static void 1895static void
1867__destroy_client(struct nfs4_client *clp) 1896__destroy_client(struct nfs4_client *clp)
1868{ 1897{
1898 int i;
1869 struct nfs4_openowner *oo; 1899 struct nfs4_openowner *oo;
1870 struct nfs4_delegation *dp; 1900 struct nfs4_delegation *dp;
1871 struct list_head reaplist; 1901 struct list_head reaplist;
@@ -1895,6 +1925,16 @@ __destroy_client(struct nfs4_client *clp)
1895 nfs4_get_stateowner(&oo->oo_owner); 1925 nfs4_get_stateowner(&oo->oo_owner);
1896 release_openowner(oo); 1926 release_openowner(oo);
1897 } 1927 }
1928 for (i = 0; i < OWNER_HASH_SIZE; i++) {
1929 struct nfs4_stateowner *so, *tmp;
1930
1931 list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i],
1932 so_strhash) {
1933 /* Should be no openowners at this point */
1934 WARN_ON_ONCE(so->so_is_open_owner);
1935 remove_blocked_locks(lockowner(so));
1936 }
1937 }
1898 nfsd4_return_all_client_layouts(clp); 1938 nfsd4_return_all_client_layouts(clp);
1899 nfsd4_shutdown_callback(clp); 1939 nfsd4_shutdown_callback(clp);
1900 if (clp->cl_cb_conn.cb_xprt) 1940 if (clp->cl_cb_conn.cb_xprt)
@@ -6355,6 +6395,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
6355 } 6395 }
6356 spin_unlock(&clp->cl_lock); 6396 spin_unlock(&clp->cl_lock);
6357 free_ol_stateid_reaplist(&reaplist); 6397 free_ol_stateid_reaplist(&reaplist);
6398 remove_blocked_locks(lo);
6358 nfs4_put_stateowner(&lo->lo_owner); 6399 nfs4_put_stateowner(&lo->lo_owner);
6359 6400
6360 return status; 6401 return status;
@@ -7140,6 +7181,8 @@ nfs4_state_destroy_net(struct net *net)
7140 } 7181 }
7141 } 7182 }
7142 7183
7184 WARN_ON(!list_empty(&nn->blocked_locks_lru));
7185
7143 for (i = 0; i < CLIENT_HASH_SIZE; i++) { 7186 for (i = 0; i < CLIENT_HASH_SIZE; i++) {
7144 while (!list_empty(&nn->unconf_id_hashtbl[i])) { 7187 while (!list_empty(&nn->unconf_id_hashtbl[i])) {
7145 clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); 7188 clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash);
@@ -7206,7 +7249,6 @@ nfs4_state_shutdown_net(struct net *net)
7206 struct nfs4_delegation *dp = NULL; 7249 struct nfs4_delegation *dp = NULL;
7207 struct list_head *pos, *next, reaplist; 7250 struct list_head *pos, *next, reaplist;
7208 struct nfsd_net *nn = net_generic(net, nfsd_net_id); 7251 struct nfsd_net *nn = net_generic(net, nfsd_net_id);
7209 struct nfsd4_blocked_lock *nbl;
7210 7252
7211 cancel_delayed_work_sync(&nn->laundromat_work); 7253 cancel_delayed_work_sync(&nn->laundromat_work);
7212 locks_end_grace(&nn->nfsd4_manager); 7254 locks_end_grace(&nn->nfsd4_manager);
@@ -7227,24 +7269,6 @@ nfs4_state_shutdown_net(struct net *net)
7227 nfs4_put_stid(&dp->dl_stid); 7269 nfs4_put_stid(&dp->dl_stid);
7228 } 7270 }
7229 7271
7230 BUG_ON(!list_empty(&reaplist));
7231 spin_lock(&nn->blocked_locks_lock);
7232 while (!list_empty(&nn->blocked_locks_lru)) {
7233 nbl = list_first_entry(&nn->blocked_locks_lru,
7234 struct nfsd4_blocked_lock, nbl_lru);
7235 list_move(&nbl->nbl_lru, &reaplist);
7236 list_del_init(&nbl->nbl_list);
7237 }
7238 spin_unlock(&nn->blocked_locks_lock);
7239
7240 while (!list_empty(&reaplist)) {
7241 nbl = list_first_entry(&reaplist,
7242 struct nfsd4_blocked_lock, nbl_lru);
7243 list_del_init(&nbl->nbl_lru);
7244 posix_unblock_lock(&nbl->nbl_lock);
7245 free_blocked_lock(nbl);
7246 }
7247
7248 nfsd4_client_tracking_exit(net); 7272 nfsd4_client_tracking_exit(net);
7249 nfs4_state_destroy_net(net); 7273 nfs4_state_destroy_net(net);
7250} 7274}
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index 406e72de88f6..ce6ff5a0a6e4 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -24,6 +24,8 @@ config OVERLAY_FS_REDIRECT_DIR
24 an overlay which has redirects on a kernel that doesn't support this 24 an overlay which has redirects on a kernel that doesn't support this
25 feature will have unexpected results. 25 feature will have unexpected results.
26 26
27 If unsure, say N.
28
27config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW 29config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
28 bool "Overlayfs: follow redirects even if redirects are turned off" 30 bool "Overlayfs: follow redirects even if redirects are turned off"
29 default y 31 default y
@@ -32,8 +34,13 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW
32 Disable this to get a possibly more secure configuration, but that 34 Disable this to get a possibly more secure configuration, but that
33 might not be backward compatible with previous kernels. 35 might not be backward compatible with previous kernels.
34 36
37 If backward compatibility is not an issue, then it is safe and
38 recommended to say N here.
39
35 For more information, see Documentation/filesystems/overlayfs.txt 40 For more information, see Documentation/filesystems/overlayfs.txt
36 41
42 If unsure, say Y.
43
37config OVERLAY_FS_INDEX 44config OVERLAY_FS_INDEX
38 bool "Overlayfs: turn on inodes index feature by default" 45 bool "Overlayfs: turn on inodes index feature by default"
39 depends on OVERLAY_FS 46 depends on OVERLAY_FS
@@ -51,6 +58,8 @@ config OVERLAY_FS_INDEX
51 That is, mounting an overlay which has an inodes index on a kernel 58 That is, mounting an overlay which has an inodes index on a kernel
52 that doesn't support this feature will have unexpected results. 59 that doesn't support this feature will have unexpected results.
53 60
61 If unsure, say N.
62
54config OVERLAY_FS_NFS_EXPORT 63config OVERLAY_FS_NFS_EXPORT
55 bool "Overlayfs: turn on NFS export feature by default" 64 bool "Overlayfs: turn on NFS export feature by default"
56 depends on OVERLAY_FS 65 depends on OVERLAY_FS
@@ -72,3 +81,8 @@ config OVERLAY_FS_NFS_EXPORT
72 Note, that the NFS export feature is not backward compatible. 81 Note, that the NFS export feature is not backward compatible.
73 That is, mounting an overlay which has a full index on a kernel 82 That is, mounting an overlay which has a full index on a kernel
74 that doesn't support this feature will have unexpected results. 83 that doesn't support this feature will have unexpected results.
84
85 Most users should say N here and enable this feature on a case-by-
86 case basis with the "nfs_export=on" mount option.
87
88 Say N unless you fully understand the consequences.
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index bb94ce9da5c8..87bd4148f4fb 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -19,6 +19,142 @@
19#include <linux/ratelimit.h> 19#include <linux/ratelimit.h>
20#include "overlayfs.h" 20#include "overlayfs.h"
21 21
22static int ovl_encode_maybe_copy_up(struct dentry *dentry)
23{
24 int err;
25
26 if (ovl_dentry_upper(dentry))
27 return 0;
28
29 err = ovl_want_write(dentry);
30 if (!err) {
31 err = ovl_copy_up(dentry);
32 ovl_drop_write(dentry);
33 }
34
35 if (err) {
36 pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
37 dentry, err);
38 }
39
40 return err;
41}
42
43/*
44 * Before encoding a non-upper directory file handle from real layer N, we need
45 * to check if it will be possible to reconnect an overlay dentry from the real
46 * lower decoded dentry. This is done by following the overlay ancestry up to a
47 * "layer N connected" ancestor and verifying that all parents along the way are
48 * "layer N connectable". If an ancestor that is NOT "layer N connectable" is
49 * found, we need to copy up an ancestor, which is "layer N connectable", thus
50 * making that ancestor "layer N connected". For example:
51 *
52 * layer 1: /a
53 * layer 2: /a/b/c
54 *
55 * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
56 * copied up and renamed, upper dir /a will be indexed by lower dir /a from
57 * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
58 * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
59 * dentry from the connected lower dentry /a/b/c.
60 *
61 * To avoid this problem on decode time, we need to copy up an ancestor of
62 * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
63 * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
64 * and when the time comes to decode the file handle from lower dentry /a/b/c,
65 * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
66 * a connected overlay dentry will be accomplished.
67 *
68 * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
69 * entry /a in the lower layers above layer N and find the indexed dir /a from
70 * layer 1. If that improvement is made, then the check for "layer N connected"
71 * will need to verify there are no redirects in lower layers above N. In the
72 * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
73 * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
74 *
75 * layer 1: /A (redirect = /a)
76 * layer 2: /a/b/c
77 */
78
79/* Return the lowest layer for encoding a connectable file handle */
80static int ovl_connectable_layer(struct dentry *dentry)
81{
82 struct ovl_entry *oe = OVL_E(dentry);
83
84 /* We can get overlay root from root of any layer */
85 if (dentry == dentry->d_sb->s_root)
86 return oe->numlower;
87
88 /*
89 * If it's an unindexed merge dir, then it's not connectable with any
90 * lower layer
91 */
92 if (ovl_dentry_upper(dentry) &&
93 !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
94 return 0;
95
96 /* We can get upper/overlay path from indexed/lower dentry */
97 return oe->lowerstack[0].layer->idx;
98}
99
100/*
101 * @dentry is "connected" if all ancestors up to root or a "connected" ancestor
102 * have the same uppermost lower layer as the origin's layer. We may need to
103 * copy up a "connectable" ancestor to make it "connected". A "connected" dentry
104 * cannot become non "connected", so cache positive result in dentry flags.
105 *
106 * Return the connected origin layer or < 0 on error.
107 */
108static int ovl_connect_layer(struct dentry *dentry)
109{
110 struct dentry *next, *parent = NULL;
111 int origin_layer;
112 int err = 0;
113
114 if (WARN_ON(dentry == dentry->d_sb->s_root) ||
115 WARN_ON(!ovl_dentry_lower(dentry)))
116 return -EIO;
117
118 origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
119 if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
120 return origin_layer;
121
122 /* Find the topmost origin layer connectable ancestor of @dentry */
123 next = dget(dentry);
124 for (;;) {
125 parent = dget_parent(next);
126 if (WARN_ON(parent == next)) {
127 err = -EIO;
128 break;
129 }
130
131 /*
132 * If @parent is not origin layer connectable, then copy up
133 * @next which is origin layer connectable and we are done.
134 */
135 if (ovl_connectable_layer(parent) < origin_layer) {
136 err = ovl_encode_maybe_copy_up(next);
137 break;
138 }
139
140 /* If @parent is connected or indexed we are done */
141 if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
142 ovl_test_flag(OVL_INDEX, d_inode(parent)))
143 break;
144
145 dput(next);
146 next = parent;
147 }
148
149 dput(parent);
150 dput(next);
151
152 if (!err)
153 ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);
154
155 return err ?: origin_layer;
156}
157
22/* 158/*
23 * We only need to encode origin if there is a chance that the same object was 159 * We only need to encode origin if there is a chance that the same object was
24 * encoded pre copy up and then we need to stay consistent with the same 160 * encoded pre copy up and then we need to stay consistent with the same
@@ -41,73 +177,59 @@
41 * L = lower file handle 177 * L = lower file handle
42 * 178 *
43 * (*) Connecting an overlay dir from real lower dentry is not always 179 * (*) Connecting an overlay dir from real lower dentry is not always
44 * possible when there are redirects in lower layers. To mitigate this case, 180 * possible when there are redirects in lower layers and non-indexed merge dirs.
45 * we copy up the lower dir first and then encode an upper dir file handle. 181 * To mitigate those case, we may copy up the lower dir ancestor before encode
182 * a lower dir file handle.
183 *
184 * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
46 */ 185 */
47static bool ovl_should_encode_origin(struct dentry *dentry) 186static int ovl_check_encode_origin(struct dentry *dentry)
48{ 187{
49 struct ovl_fs *ofs = dentry->d_sb->s_fs_info; 188 struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
50 189
190 /* Upper file handle for pure upper */
51 if (!ovl_dentry_lower(dentry)) 191 if (!ovl_dentry_lower(dentry))
52 return false; 192 return 0;
53 193
54 /* 194 /*
55 * Decoding a merge dir, whose origin's parent is under a redirected 195 * Upper file handle for non-indexed upper.
56 * lower dir is not always possible. As a simple approximation, we do
57 * not encode lower dir file handles when overlay has multiple lower
58 * layers and origin is below the topmost lower layer.
59 * 196 *
60 * TODO: copy up only the parent that is under redirected lower. 197 * Root is never indexed, so if there's an upper layer, encode upper for
198 * root.
61 */ 199 */
62 if (d_is_dir(dentry) && ofs->upper_mnt &&
63 OVL_E(dentry)->lowerstack[0].layer->idx > 1)
64 return false;
65
66 /* Decoding a non-indexed upper from origin is not implemented */
67 if (ovl_dentry_upper(dentry) && 200 if (ovl_dentry_upper(dentry) &&
68 !ovl_test_flag(OVL_INDEX, d_inode(dentry))) 201 !ovl_test_flag(OVL_INDEX, d_inode(dentry)))
69 return false;
70
71 return true;
72}
73
74static int ovl_encode_maybe_copy_up(struct dentry *dentry)
75{
76 int err;
77
78 if (ovl_dentry_upper(dentry))
79 return 0; 202 return 0;
80 203
81 err = ovl_want_write(dentry); 204 /*
82 if (err) 205 * Decoding a merge dir, whose origin's ancestor is under a redirected
83 return err; 206 * lower dir or under a non-indexed upper is not always possible.
84 207 * ovl_connect_layer() will try to make origin's layer "connected" by
85 err = ovl_copy_up(dentry); 208 * copying up a "connectable" ancestor.
209 */
210 if (d_is_dir(dentry) && ofs->upper_mnt)
211 return ovl_connect_layer(dentry);
86 212
87 ovl_drop_write(dentry); 213 /* Lower file handle for indexed and non-upper dir/non-dir */
88 return err; 214 return 1;
89} 215}
90 216
91static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) 217static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
92{ 218{
93 struct dentry *origin = ovl_dentry_lower(dentry);
94 struct ovl_fh *fh = NULL; 219 struct ovl_fh *fh = NULL;
95 int err; 220 int err, enc_lower;
96 221
97 /* 222 /*
98 * If we should not encode a lower dir file handle, copy up and encode 223 * Check if we should encode a lower or upper file handle and maybe
99 * an upper dir file handle. 224 * copy up an ancestor to make lower file handle connectable.
100 */ 225 */
101 if (!ovl_should_encode_origin(dentry)) { 226 err = enc_lower = ovl_check_encode_origin(dentry);
102 err = ovl_encode_maybe_copy_up(dentry); 227 if (enc_lower < 0)
103 if (err) 228 goto fail;
104 goto fail;
105
106 origin = NULL;
107 }
108 229
109 /* Encode an upper or origin file handle */ 230 /* Encode an upper or lower file handle */
110 fh = ovl_encode_fh(origin ?: ovl_dentry_upper(dentry), !origin); 231 fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) :
232 ovl_dentry_upper(dentry), !enc_lower);
111 err = PTR_ERR(fh); 233 err = PTR_ERR(fh);
112 if (IS_ERR(fh)) 234 if (IS_ERR(fh))
113 goto fail; 235 goto fail;
@@ -355,8 +477,8 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
355 dput(upper); 477 dput(upper);
356 } 478 }
357 479
358 if (!this) 480 if (IS_ERR_OR_NULL(this))
359 return NULL; 481 return this;
360 482
361 if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) { 483 if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
362 dput(this); 484 dput(this);
@@ -498,7 +620,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb,
498 if (err == -ECHILD) { 620 if (err == -ECHILD) {
499 this = ovl_lookup_real_ancestor(sb, real, 621 this = ovl_lookup_real_ancestor(sb, real,
500 layer); 622 layer);
501 err = IS_ERR(this) ? PTR_ERR(this) : 0; 623 err = PTR_ERR_OR_ZERO(this);
502 } 624 }
503 if (!err) { 625 if (!err) {
504 dput(connected); 626 dput(connected);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index fcd97b783fa1..3b1bd469accd 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -669,38 +669,59 @@ struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
669 return inode; 669 return inode;
670} 670}
671 671
672/*
673 * Does overlay inode need to be hashed by lower inode?
674 */
675static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
676 struct dentry *lower, struct dentry *index)
677{
678 struct ovl_fs *ofs = sb->s_fs_info;
679
680 /* No, if pure upper */
681 if (!lower)
682 return false;
683
684 /* Yes, if already indexed */
685 if (index)
686 return true;
687
688 /* Yes, if won't be copied up */
689 if (!ofs->upper_mnt)
690 return true;
691
692 /* No, if lower hardlink is or will be broken on copy up */
693 if ((upper || !ovl_indexdir(sb)) &&
694 !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
695 return false;
696
697 /* No, if non-indexed upper with NFS export */
698 if (sb->s_export_op && upper)
699 return false;
700
701 /* Otherwise, hash by lower inode for fsnotify */
702 return true;
703}
704
672struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, 705struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
673 struct dentry *lowerdentry, struct dentry *index, 706 struct dentry *lowerdentry, struct dentry *index,
674 unsigned int numlower) 707 unsigned int numlower)
675{ 708{
676 struct ovl_fs *ofs = sb->s_fs_info;
677 struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; 709 struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
678 struct inode *inode; 710 struct inode *inode;
679 /* Already indexed or could be indexed on copy up? */ 711 bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index);
680 bool indexed = (index || (ovl_indexdir(sb) && !upperdentry));
681 struct dentry *origin = indexed ? lowerdentry : NULL;
682 bool is_dir; 712 bool is_dir;
683 713
684 if (WARN_ON(upperdentry && indexed && !lowerdentry))
685 return ERR_PTR(-EIO);
686
687 if (!realinode) 714 if (!realinode)
688 realinode = d_inode(lowerdentry); 715 realinode = d_inode(lowerdentry);
689 716
690 /* 717 /*
691 * Copy up origin (lower) may exist for non-indexed non-dir upper, but 718 * Copy up origin (lower) may exist for non-indexed upper, but we must
692 * we must not use lower as hash key in that case. 719 * not use lower as hash key if this is a broken hardlink.
693 * Hash non-dir that is or could be indexed by origin inode.
694 * Hash dir that is or could be merged by origin inode.
695 * Hash pure upper and non-indexed non-dir by upper inode.
696 * Hash non-indexed dir by upper inode for NFS export.
697 */ 720 */
698 is_dir = S_ISDIR(realinode->i_mode); 721 is_dir = S_ISDIR(realinode->i_mode);
699 if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt)) 722 if (upperdentry || bylower) {
700 origin = lowerdentry; 723 struct inode *key = d_inode(bylower ? lowerdentry :
701 724 upperdentry);
702 if (upperdentry || origin) {
703 struct inode *key = d_inode(origin ?: upperdentry);
704 unsigned int nlink = is_dir ? 1 : realinode->i_nlink; 725 unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
705 726
706 inode = iget5_locked(sb, (unsigned long) key, 727 inode = iget5_locked(sb, (unsigned long) key,
@@ -728,6 +749,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry,
728 nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); 749 nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
729 set_nlink(inode, nlink); 750 set_nlink(inode, nlink);
730 } else { 751 } else {
752 /* Lower hardlink that will be broken on copy up */
731 inode = new_inode(sb); 753 inode = new_inode(sb);
732 if (!inode) 754 if (!inode)
733 goto out_nomem; 755 goto out_nomem;
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index de3e6da1d5a5..70fcfcc684cc 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -913,9 +913,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
913 stack[ctr].layer = lower.layer; 913 stack[ctr].layer = lower.layer;
914 ctr++; 914 ctr++;
915 915
916 if (d.stop)
917 break;
918
919 /* 916 /*
920 * Following redirects can have security consequences: it's like 917 * Following redirects can have security consequences: it's like
921 * a symlink into the lower layer without the permission checks. 918 * a symlink into the lower layer without the permission checks.
@@ -933,6 +930,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
933 goto out_put; 930 goto out_put;
934 } 931 }
935 932
933 if (d.stop)
934 break;
935
936 if (d.redirect && d.redirect[0] == '/' && poe != roe) { 936 if (d.redirect && d.redirect[0] == '/' && poe != roe) {
937 poe = roe; 937 poe = roe;
938 /* Find the current layer on the root dentry */ 938 /* Find the current layer on the root dentry */
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0df25a9c94bd..225ff1171147 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -40,6 +40,7 @@ enum ovl_inode_flag {
40enum ovl_entry_flag { 40enum ovl_entry_flag {
41 OVL_E_UPPER_ALIAS, 41 OVL_E_UPPER_ALIAS,
42 OVL_E_OPAQUE, 42 OVL_E_OPAQUE,
43 OVL_E_CONNECTED,
43}; 44};
44 45
45/* 46/*
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 9ee37c76091d..7c24619ae7fc 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1359,6 +1359,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
1359 1359
1360 /* Root is always merge -> can have whiteouts */ 1360 /* Root is always merge -> can have whiteouts */
1361 ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); 1361 ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry));
1362 ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry);
1362 ovl_inode_init(d_inode(root_dentry), upperpath.dentry, 1363 ovl_inode_init(d_inode(root_dentry), upperpath.dentry,
1363 ovl_dentry_lower(root_dentry)); 1364 ovl_dentry_lower(root_dentry));
1364 1365
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 8664db25a9a6..215c225b2ca1 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -106,6 +106,7 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target,
106{ 106{
107 return sysfs_do_create_link(kobj, target, name, 0); 107 return sysfs_do_create_link(kobj, target, name, 0);
108} 108}
109EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn);
109 110
110/** 111/**
111 * sysfs_delete_link - remove symlink in object's directory. 112 * sysfs_delete_link - remove symlink in object's directory.
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 66e1edbfb2b2..046469fcc1b8 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -955,15 +955,29 @@ static inline bool imap_needs_alloc(struct inode *inode,
955 (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); 955 (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
956} 956}
957 957
958static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps)
959{
960 return nimaps &&
961 imap->br_startblock != HOLESTARTBLOCK &&
962 imap->br_state != XFS_EXT_UNWRITTEN;
963}
964
958static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) 965static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
959{ 966{
960 /* 967 /*
961 * COW writes will allocate delalloc space, so we need to make sure 968 * COW writes may allocate delalloc space or convert unwritten COW
962 * to take the lock exclusively here. 969 * extents, so we need to make sure to take the lock exclusively here.
963 */ 970 */
964 if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) 971 if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO)))
965 return true; 972 return true;
966 if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) 973
974 /*
975 * Extents not yet cached require exclusive access, don't block.
976 * This is an opencoded xfs_ilock_data_map_shared() to cater for the
977 * non-blocking behaviour.
978 */
979 if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
980 !(ip->i_df.if_flags & XFS_IFEXTENTS))
967 return true; 981 return true;
968 return false; 982 return false;
969} 983}
@@ -993,16 +1007,18 @@ xfs_file_iomap_begin(
993 return xfs_file_iomap_begin_delay(inode, offset, length, iomap); 1007 return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
994 } 1008 }
995 1009
996 if (need_excl_ilock(ip, flags)) { 1010 if (need_excl_ilock(ip, flags))
997 lockmode = XFS_ILOCK_EXCL; 1011 lockmode = XFS_ILOCK_EXCL;
998 xfs_ilock(ip, XFS_ILOCK_EXCL); 1012 else
999 } else { 1013 lockmode = XFS_ILOCK_SHARED;
1000 lockmode = xfs_ilock_data_map_shared(ip);
1001 }
1002 1014
1003 if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { 1015 if (flags & IOMAP_NOWAIT) {
1004 error = -EAGAIN; 1016 if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
1005 goto out_unlock; 1017 return -EAGAIN;
1018 if (!xfs_ilock_nowait(ip, lockmode))
1019 return -EAGAIN;
1020 } else {
1021 xfs_ilock(ip, lockmode);
1006 } 1022 }
1007 1023
1008 ASSERT(offset <= mp->m_super->s_maxbytes); 1024 ASSERT(offset <= mp->m_super->s_maxbytes);
@@ -1024,7 +1040,9 @@ xfs_file_iomap_begin(
1024 goto out_unlock; 1040 goto out_unlock;
1025 } 1041 }
1026 1042
1027 if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { 1043 if (xfs_is_reflink_inode(ip) &&
1044 ((flags & IOMAP_WRITE) ||
1045 ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) {
1028 if (flags & IOMAP_DIRECT) { 1046 if (flags & IOMAP_DIRECT) {
1029 /* 1047 /*
1030 * A reflinked inode will result in CoW alloc. 1048 * A reflinked inode will result in CoW alloc.