author     Ingo Molnar <mingo@kernel.org>  2013-12-17 09:27:08 -0500
committer  Ingo Molnar <mingo@kernel.org>  2013-12-17 09:27:08 -0500
commit     bb799d3b980eb803ca2da4a4eefbd9308f8d988a (patch)
tree       69fbe0cd6d47b23a50f5e1d87bf7489532fae149 /fs
parent     919fc6e34831d1c2b58bfb5ae261dc3facc9b269 (diff)
parent     319e2e3f63c348a9b66db4667efa73178e18b17d (diff)
Merge tag 'v3.13-rc4' into core/locking
Merge Linux 3.13-rc4, to refresh this rather old tree with the latest fixes.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/vfs_dentry.c | 19
-rw-r--r--  fs/affs/Changes | 2
-rw-r--r--  fs/aio.c | 138
-rw-r--r--  fs/bio.c | 2
-rw-r--r--  fs/btrfs/Kconfig | 15
-rw-r--r--  fs/btrfs/async-thread.c | 1
-rw-r--r--  fs/btrfs/check-integrity.c | 57
-rw-r--r--  fs/btrfs/check-integrity.h | 2
-rw-r--r--  fs/btrfs/ctree.h | 6
-rw-r--r--  fs/btrfs/dev-replace.c | 2
-rw-r--r--  fs/btrfs/disk-io.c | 21
-rw-r--r--  fs/btrfs/extent-tree.c | 22
-rw-r--r--  fs/btrfs/extent_io.c | 23
-rw-r--r--  fs/btrfs/inode.c | 6
-rw-r--r--  fs/btrfs/ioctl.c | 3
-rw-r--r--  fs/btrfs/ordered-data.c | 3
-rw-r--r--  fs/btrfs/relocation.c | 81
-rw-r--r--  fs/btrfs/scrub.c | 39
-rw-r--r--  fs/btrfs/send.c | 4
-rw-r--r--  fs/btrfs/super.c | 5
-rw-r--r--  fs/btrfs/transaction.c | 4
-rw-r--r--  fs/btrfs/tree-log.c | 5
-rw-r--r--  fs/btrfs/volumes.c | 2
-rw-r--r--  fs/ceph/addr.c | 2
-rw-r--r--  fs/ceph/cache.c | 3
-rw-r--r--  fs/ceph/caps.c | 27
-rw-r--r--  fs/ceph/dir.c | 11
-rw-r--r--  fs/ceph/inode.c | 49
-rw-r--r--  fs/ceph/mds_client.c | 61
-rw-r--r--  fs/ceph/mds_client.h | 1
-rw-r--r--  fs/ceph/super.h | 8
-rw-r--r--  fs/cifs/cifsglob.h | 1
-rw-r--r--  fs/cifs/ioctl.c | 6
-rw-r--r--  fs/cifs/smb2ops.c | 99
-rw-r--r--  fs/cifs/smb2pdu.c | 92
-rw-r--r--  fs/cifs/smb2pdu.h | 12
-rw-r--r--  fs/cifs/smb2proto.h | 1
-rw-r--r--  fs/cifs/smbfsctl.h | 2
-rw-r--r--  fs/configfs/dir.c | 28
-rw-r--r--  fs/coredump.c | 6
-rw-r--r--  fs/dcache.c | 86
-rw-r--r--  fs/ecryptfs/file.c | 8
-rw-r--r--  fs/efivarfs/super.c | 11
-rw-r--r--  fs/eventpoll.c | 3
-rw-r--r--  fs/exec.c | 5
-rw-r--r--  fs/gfs2/glock.c | 3
-rw-r--r--  fs/gfs2/inode.c | 5
-rw-r--r--  fs/gfs2/lock_dlm.c | 8
-rw-r--r--  fs/gfs2/quota.c | 23
-rw-r--r--  fs/gfs2/rgrp.c | 4
-rw-r--r--  fs/hfsplus/wrapper.c | 17
-rw-r--r--  fs/hostfs/hostfs_kern.c | 11
-rw-r--r--  fs/libfs.c | 12
-rw-r--r--  fs/logfs/dev_bdev.c | 13
-rw-r--r--  fs/namei.c | 11
-rw-r--r--  fs/nfs/blocklayout/blocklayout.h | 1
-rw-r--r--  fs/nfs/blocklayout/extents.c | 2
-rw-r--r--  fs/nfs/dns_resolve.c | 2
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/internal.h | 15
-rw-r--r--  fs/nfs/nfs4_fs.h | 8
-rw-r--r--  fs/nfs/nfs4proc.c | 30
-rw-r--r--  fs/nfsd/nfs4xdr.c | 3
-rw-r--r--  fs/nfsd/nfscache.c | 9
-rw-r--r--  fs/nfsd/vfs.c | 173
-rw-r--r--  fs/pipe.c | 39
-rw-r--r--  fs/proc/base.c | 14
-rw-r--r--  fs/proc/generic.c | 18
-rw-r--r--  fs/proc/inode.c | 14
-rw-r--r--  fs/proc/namespaces.c | 8
-rw-r--r--  fs/squashfs/Kconfig | 72
-rw-r--r--  fs/squashfs/Makefile | 5
-rw-r--r--  fs/squashfs/block.c | 36
-rw-r--r--  fs/squashfs/cache.c | 28
-rw-r--r--  fs/squashfs/decompressor.c | 59
-rw-r--r--  fs/squashfs/decompressor.h | 24
-rw-r--r--  fs/squashfs/decompressor_multi.c | 198
-rw-r--r--  fs/squashfs/decompressor_multi_percpu.c | 97
-rw-r--r--  fs/squashfs/decompressor_single.c | 85
-rw-r--r--  fs/squashfs/file.c | 142
-rw-r--r--  fs/squashfs/file_cache.c | 38
-rw-r--r--  fs/squashfs/file_direct.c | 176
-rw-r--r--  fs/squashfs/lzo_wrapper.c | 47
-rw-r--r--  fs/squashfs/page_actor.c | 100
-rw-r--r--  fs/squashfs/page_actor.h | 81
-rw-r--r--  fs/squashfs/squashfs.h | 20
-rw-r--r--  fs/squashfs/squashfs_fs_sb.h | 4
-rw-r--r--  fs/squashfs/super.c | 10
-rw-r--r--  fs/squashfs/xz_wrapper.c | 105
-rw-r--r--  fs/squashfs/zlib_wrapper.c | 64
-rw-r--r--  fs/sysfs/file.c | 22
-rw-r--r--  fs/xfs/xfs_bmap.c | 38
-rw-r--r--  fs/xfs/xfs_discard.c | 5
-rw-r--r--  fs/xfs/xfs_fsops.c | 6
-rw-r--r--  fs/xfs/xfs_ioctl.c | 3
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 3
-rw-r--r--  fs/xfs/xfs_mount.c | 15
-rw-r--r--  fs/xfs/xfs_mount.h | 2
-rw-r--r--  fs/xfs/xfs_trans_inode.c | 8
-rw-r--r--  fs/xfs/xfs_trans_resv.c | 3
100 files changed, 2016 insertions(+), 909 deletions(-)
diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c
index f039b104a98e..b03dd23feda8 100644
--- a/fs/9p/vfs_dentry.c
+++ b/fs/9p/vfs_dentry.c
@@ -43,23 +43,6 @@
43#include "fid.h" 43#include "fid.h"
44 44
45/** 45/**
46 * v9fs_dentry_delete - called when dentry refcount equals 0
47 * @dentry: dentry in question
48 *
49 * By returning 1 here we should remove cacheing of unused
50 * dentry components.
51 *
52 */
53
54static int v9fs_dentry_delete(const struct dentry *dentry)
55{
56 p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n",
57 dentry->d_name.name, dentry);
58
59 return 1;
60}
61
62/**
63 * v9fs_cached_dentry_delete - called when dentry refcount equals 0 46 * v9fs_cached_dentry_delete - called when dentry refcount equals 0
64 * @dentry: dentry in question 47 * @dentry: dentry in question
65 * 48 *
@@ -134,6 +117,6 @@ const struct dentry_operations v9fs_cached_dentry_operations = {
134}; 117};
135 118
136const struct dentry_operations v9fs_dentry_operations = { 119const struct dentry_operations v9fs_dentry_operations = {
137 .d_delete = v9fs_dentry_delete, 120 .d_delete = always_delete_dentry,
138 .d_release = v9fs_dentry_release, 121 .d_release = v9fs_dentry_release,
139}; 122};
diff --git a/fs/affs/Changes b/fs/affs/Changes
index a29409c1ffe0..b41c2c9792ff 100644
--- a/fs/affs/Changes
+++ b/fs/affs/Changes
@@ -91,7 +91,7 @@ more 2.4 fixes: [Roman Zippel]
91Version 3.11 91Version 3.11
92------------ 92------------
93 93
94- Converted to use 2.3.x page cache [Dave Jones <dave@powertweak.com>] 94- Converted to use 2.3.x page cache [Dave Jones]
95- Corruption in truncate() bugfix [Ken Tyler <kent@werple.net.au>] 95- Corruption in truncate() bugfix [Ken Tyler <kent@werple.net.au>]
96 96
97Version 3.10 97Version 3.10
diff --git a/fs/aio.c b/fs/aio.c
index 823efcbb6ccd..6efb7f6cb22e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -80,6 +80,8 @@ struct kioctx {
80 struct percpu_ref users; 80 struct percpu_ref users;
81 atomic_t dead; 81 atomic_t dead;
82 82
83 struct percpu_ref reqs;
84
83 unsigned long user_id; 85 unsigned long user_id;
84 86
85 struct __percpu kioctx_cpu *cpu; 87 struct __percpu kioctx_cpu *cpu;
@@ -107,7 +109,6 @@ struct kioctx {
107 struct page **ring_pages; 109 struct page **ring_pages;
108 long nr_pages; 110 long nr_pages;
109 111
110 struct rcu_head rcu_head;
111 struct work_struct free_work; 112 struct work_struct free_work;
112 113
113 struct { 114 struct {
@@ -250,8 +251,10 @@ static void aio_free_ring(struct kioctx *ctx)
250 251
251 put_aio_ring_file(ctx); 252 put_aio_ring_file(ctx);
252 253
253 if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) 254 if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) {
254 kfree(ctx->ring_pages); 255 kfree(ctx->ring_pages);
256 ctx->ring_pages = NULL;
257 }
255} 258}
256 259
257static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) 260static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
@@ -364,8 +367,10 @@ static int aio_setup_ring(struct kioctx *ctx)
364 if (nr_pages > AIO_RING_PAGES) { 367 if (nr_pages > AIO_RING_PAGES) {
365 ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), 368 ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
366 GFP_KERNEL); 369 GFP_KERNEL);
367 if (!ctx->ring_pages) 370 if (!ctx->ring_pages) {
371 put_aio_ring_file(ctx);
368 return -ENOMEM; 372 return -ENOMEM;
373 }
369 } 374 }
370 375
371 ctx->mmap_size = nr_pages * PAGE_SIZE; 376 ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -463,26 +468,34 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
463 return cancel(kiocb); 468 return cancel(kiocb);
464} 469}
465 470
466static void free_ioctx_rcu(struct rcu_head *head) 471static void free_ioctx(struct work_struct *work)
467{ 472{
468 struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); 473 struct kioctx *ctx = container_of(work, struct kioctx, free_work);
474
475 pr_debug("freeing %p\n", ctx);
469 476
477 aio_free_ring(ctx);
470 free_percpu(ctx->cpu); 478 free_percpu(ctx->cpu);
471 kmem_cache_free(kioctx_cachep, ctx); 479 kmem_cache_free(kioctx_cachep, ctx);
472} 480}
473 481
482static void free_ioctx_reqs(struct percpu_ref *ref)
483{
484 struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
485
486 INIT_WORK(&ctx->free_work, free_ioctx);
487 schedule_work(&ctx->free_work);
488}
489
474/* 490/*
475 * When this function runs, the kioctx has been removed from the "hash table" 491 * When this function runs, the kioctx has been removed from the "hash table"
476 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - 492 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
477 * now it's safe to cancel any that need to be. 493 * now it's safe to cancel any that need to be.
478 */ 494 */
479static void free_ioctx(struct work_struct *work) 495static void free_ioctx_users(struct percpu_ref *ref)
480{ 496{
481 struct kioctx *ctx = container_of(work, struct kioctx, free_work); 497 struct kioctx *ctx = container_of(ref, struct kioctx, users);
482 struct aio_ring *ring;
483 struct kiocb *req; 498 struct kiocb *req;
484 unsigned cpu, avail;
485 DEFINE_WAIT(wait);
486 499
487 spin_lock_irq(&ctx->ctx_lock); 500 spin_lock_irq(&ctx->ctx_lock);
488 501
@@ -496,54 +509,8 @@ static void free_ioctx(struct work_struct *work)
496 509
497 spin_unlock_irq(&ctx->ctx_lock); 510 spin_unlock_irq(&ctx->ctx_lock);
498 511
499 for_each_possible_cpu(cpu) { 512 percpu_ref_kill(&ctx->reqs);
500 struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); 513 percpu_ref_put(&ctx->reqs);
501
502 atomic_add(kcpu->reqs_available, &ctx->reqs_available);
503 kcpu->reqs_available = 0;
504 }
505
506 while (1) {
507 prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
508
509 ring = kmap_atomic(ctx->ring_pages[0]);
510 avail = (ring->head <= ring->tail)
511 ? ring->tail - ring->head
512 : ctx->nr_events - ring->head + ring->tail;
513
514 atomic_add(avail, &ctx->reqs_available);
515 ring->head = ring->tail;
516 kunmap_atomic(ring);
517
518 if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
519 break;
520
521 schedule();
522 }
523 finish_wait(&ctx->wait, &wait);
524
525 WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
526
527 aio_free_ring(ctx);
528
529 pr_debug("freeing %p\n", ctx);
530
531 /*
532 * Here the call_rcu() is between the wait_event() for reqs_active to
533 * hit 0, and freeing the ioctx.
534 *
535 * aio_complete() decrements reqs_active, but it has to touch the ioctx
536 * after to issue a wakeup so we use rcu.
537 */
538 call_rcu(&ctx->rcu_head, free_ioctx_rcu);
539}
540
541static void free_ioctx_ref(struct percpu_ref *ref)
542{
543 struct kioctx *ctx = container_of(ref, struct kioctx, users);
544
545 INIT_WORK(&ctx->free_work, free_ioctx);
546 schedule_work(&ctx->free_work);
547} 514}
548 515
549static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) 516static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
@@ -602,6 +569,16 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
602 } 569 }
603} 570}
604 571
572static void aio_nr_sub(unsigned nr)
573{
574 spin_lock(&aio_nr_lock);
575 if (WARN_ON(aio_nr - nr > aio_nr))
576 aio_nr = 0;
577 else
578 aio_nr -= nr;
579 spin_unlock(&aio_nr_lock);
580}
581
605/* ioctx_alloc 582/* ioctx_alloc
606 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. 583 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
607 */ 584 */
@@ -639,8 +616,11 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
639 616
640 ctx->max_reqs = nr_events; 617 ctx->max_reqs = nr_events;
641 618
642 if (percpu_ref_init(&ctx->users, free_ioctx_ref)) 619 if (percpu_ref_init(&ctx->users, free_ioctx_users))
643 goto out_freectx; 620 goto err;
621
622 if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
623 goto err;
644 624
645 spin_lock_init(&ctx->ctx_lock); 625 spin_lock_init(&ctx->ctx_lock);
646 spin_lock_init(&ctx->completion_lock); 626 spin_lock_init(&ctx->completion_lock);
@@ -651,10 +631,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
651 631
652 ctx->cpu = alloc_percpu(struct kioctx_cpu); 632 ctx->cpu = alloc_percpu(struct kioctx_cpu);
653 if (!ctx->cpu) 633 if (!ctx->cpu)
654 goto out_freeref; 634 goto err;
655 635
656 if (aio_setup_ring(ctx) < 0) 636 if (aio_setup_ring(ctx) < 0)
657 goto out_freepcpu; 637 goto err;
658 638
659 atomic_set(&ctx->reqs_available, ctx->nr_events - 1); 639 atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
660 ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); 640 ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
@@ -666,7 +646,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
666 if (aio_nr + nr_events > (aio_max_nr * 2UL) || 646 if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
667 aio_nr + nr_events < aio_nr) { 647 aio_nr + nr_events < aio_nr) {
668 spin_unlock(&aio_nr_lock); 648 spin_unlock(&aio_nr_lock);
669 goto out_cleanup; 649 err = -EAGAIN;
650 goto err_ctx;
670 } 651 }
671 aio_nr += ctx->max_reqs; 652 aio_nr += ctx->max_reqs;
672 spin_unlock(&aio_nr_lock); 653 spin_unlock(&aio_nr_lock);
@@ -675,23 +656,20 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
675 656
676 err = ioctx_add_table(ctx, mm); 657 err = ioctx_add_table(ctx, mm);
677 if (err) 658 if (err)
678 goto out_cleanup_put; 659 goto err_cleanup;
679 660
680 pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", 661 pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
681 ctx, ctx->user_id, mm, ctx->nr_events); 662 ctx, ctx->user_id, mm, ctx->nr_events);
682 return ctx; 663 return ctx;
683 664
684out_cleanup_put: 665err_cleanup:
685 percpu_ref_put(&ctx->users); 666 aio_nr_sub(ctx->max_reqs);
686out_cleanup: 667err_ctx:
687 err = -EAGAIN;
688 aio_free_ring(ctx); 668 aio_free_ring(ctx);
689out_freepcpu: 669err:
690 free_percpu(ctx->cpu); 670 free_percpu(ctx->cpu);
691out_freeref: 671 free_percpu(ctx->reqs.pcpu_count);
692 free_percpu(ctx->users.pcpu_count); 672 free_percpu(ctx->users.pcpu_count);
693out_freectx:
694 put_aio_ring_file(ctx);
695 kmem_cache_free(kioctx_cachep, ctx); 673 kmem_cache_free(kioctx_cachep, ctx);
696 pr_debug("error allocating ioctx %d\n", err); 674 pr_debug("error allocating ioctx %d\n", err);
697 return ERR_PTR(err); 675 return ERR_PTR(err);
@@ -726,10 +704,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
726 * -EAGAIN with no ioctxs actually in use (as far as userspace 704 * -EAGAIN with no ioctxs actually in use (as far as userspace
727 * could tell). 705 * could tell).
728 */ 706 */
729 spin_lock(&aio_nr_lock); 707 aio_nr_sub(ctx->max_reqs);
730 BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
731 aio_nr -= ctx->max_reqs;
732 spin_unlock(&aio_nr_lock);
733 708
734 if (ctx->mmap_size) 709 if (ctx->mmap_size)
735 vm_munmap(ctx->mmap_base, ctx->mmap_size); 710 vm_munmap(ctx->mmap_base, ctx->mmap_size);
@@ -861,6 +836,8 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx)
861 if (unlikely(!req)) 836 if (unlikely(!req))
862 goto out_put; 837 goto out_put;
863 838
839 percpu_ref_get(&ctx->reqs);
840
864 req->ki_ctx = ctx; 841 req->ki_ctx = ctx;
865 return req; 842 return req;
866out_put: 843out_put:
@@ -930,12 +907,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
930 return; 907 return;
931 } 908 }
932 909
933 /*
934 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
935 * need to issue a wakeup after incrementing reqs_available.
936 */
937 rcu_read_lock();
938
939 if (iocb->ki_list.next) { 910 if (iocb->ki_list.next) {
940 unsigned long flags; 911 unsigned long flags;
941 912
@@ -1010,7 +981,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
1010 if (waitqueue_active(&ctx->wait)) 981 if (waitqueue_active(&ctx->wait))
1011 wake_up(&ctx->wait); 982 wake_up(&ctx->wait);
1012 983
1013 rcu_read_unlock(); 984 percpu_ref_put(&ctx->reqs);
1014} 985}
1015EXPORT_SYMBOL(aio_complete); 986EXPORT_SYMBOL(aio_complete);
1016 987
@@ -1421,6 +1392,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
1421 return 0; 1392 return 0;
1422out_put_req: 1393out_put_req:
1423 put_reqs_available(ctx, 1); 1394 put_reqs_available(ctx, 1);
1395 percpu_ref_put(&ctx->reqs);
1424 kiocb_free(req); 1396 kiocb_free(req);
1425 return ret; 1397 return ret;
1426} 1398}
diff --git a/fs/bio.c b/fs/bio.c
index 2bdb4e25ee77..33d79a4eb92d 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -601,7 +601,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs);
601 601
602static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page 602static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
603 *page, unsigned int len, unsigned int offset, 603 *page, unsigned int len, unsigned int offset,
604 unsigned short max_sectors) 604 unsigned int max_sectors)
605{ 605{
606 int retried_segments = 0; 606 int retried_segments = 0;
607 struct bio_vec *bvec; 607 struct bio_vec *bvec;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index f9d5094e1029..aa976eced2d2 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -9,12 +9,17 @@ config BTRFS_FS
9 select XOR_BLOCKS 9 select XOR_BLOCKS
10 10
11 help 11 help
12 Btrfs is a new filesystem with extents, writable snapshotting, 12 Btrfs is a general purpose copy-on-write filesystem with extents,
13 support for multiple devices and many more features. 13 writable snapshotting, support for multiple devices and many more
14 features focused on fault tolerance, repair and easy administration.
14 15
15 Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET 16 The filesystem disk format is no longer unstable, and it's not
16 FINALIZED. You should say N here unless you are interested in 17 expected to change unless there are strong reasons to do so. If there
17 testing Btrfs with non-critical data. 18 is a format change, file systems with a unchanged format will
19 continue to be mountable and usable by newer kernels.
20
21 For more information, please see the web pages at
22 http://btrfs.wiki.kernel.org.
18 23
19 To compile this file system support as a module, choose M here. The 24 To compile this file system support as a module, choose M here. The
20 module will be called btrfs. 25 module will be called btrfs.
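The new help text boils down to a single Kconfig symbol; as a minimal illustration of the module route it mentions (the symbol name comes from this Kconfig file, the rest is the generic kernel module workflow and not part of this commit):

    CONFIG_BTRFS_FS=m
    # after building and installing modules, load it with:
    #   modprobe btrfs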
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 8aec751fa464..c1e0b0caf9cc 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -495,6 +495,7 @@ static int __btrfs_start_workers(struct btrfs_workers *workers)
495 spin_lock_irq(&workers->lock); 495 spin_lock_irq(&workers->lock);
496 if (workers->stopping) { 496 if (workers->stopping) {
497 spin_unlock_irq(&workers->lock); 497 spin_unlock_irq(&workers->lock);
498 ret = -EINVAL;
498 goto fail_kthread; 499 goto fail_kthread;
499 } 500 }
500 list_add_tail(&worker->worker_list, &workers->idle_list); 501 list_add_tail(&worker->worker_list, &workers->idle_list);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index e0aab4456974..131d82800b3a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -77,6 +77,15 @@
77 * the integrity of (super)-block write requests, do not 77 * the integrity of (super)-block write requests, do not
78 * enable the config option BTRFS_FS_CHECK_INTEGRITY to 78 * enable the config option BTRFS_FS_CHECK_INTEGRITY to
79 * include and compile the integrity check tool. 79 * include and compile the integrity check tool.
80 *
81 * Expect millions of lines of information in the kernel log with an
82 * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the
83 * kernel config to at least 26 (which is 64MB). Usually the value is
84 * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be
85 * changed like this before LOG_BUF_SHIFT can be set to a high value:
86 * config LOG_BUF_SHIFT
87 * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)"
88 * range 12 30
80 */ 89 */
81 90
82#include <linux/sched.h> 91#include <linux/sched.h>
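The advice in the comment above maps to one config setting once the Kconfig range has been widened; an illustrative .config line (value taken from the comment, not part of this diff):

    CONFIG_LOG_BUF_SHIFT=26
    # 2^26 bytes = 64MB kernel log buffer, as suggested when a verbose
    # check_int_print_mask is enabled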
@@ -124,6 +133,7 @@
124#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 133#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400
125#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 134#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800
126#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 135#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000
136#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000
127 137
128struct btrfsic_dev_state; 138struct btrfsic_dev_state;
129struct btrfsic_state; 139struct btrfsic_state;
@@ -323,7 +333,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
323static int btrfsic_read_block(struct btrfsic_state *state, 333static int btrfsic_read_block(struct btrfsic_state *state,
324 struct btrfsic_block_data_ctx *block_ctx); 334 struct btrfsic_block_data_ctx *block_ctx);
325static void btrfsic_dump_database(struct btrfsic_state *state); 335static void btrfsic_dump_database(struct btrfsic_state *state);
326static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
327static int btrfsic_test_for_metadata(struct btrfsic_state *state, 336static int btrfsic_test_for_metadata(struct btrfsic_state *state,
328 char **datav, unsigned int num_pages); 337 char **datav, unsigned int num_pages);
329static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, 338static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
@@ -1677,7 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
1677 for (i = 0; i < num_pages;) { 1686 for (i = 0; i < num_pages;) {
1678 struct bio *bio; 1687 struct bio *bio;
1679 unsigned int j; 1688 unsigned int j;
1680 DECLARE_COMPLETION_ONSTACK(complete);
1681 1689
1682 bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i); 1690 bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
1683 if (!bio) { 1691 if (!bio) {
@@ -1688,8 +1696,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
1688 } 1696 }
1689 bio->bi_bdev = block_ctx->dev->bdev; 1697 bio->bi_bdev = block_ctx->dev->bdev;
1690 bio->bi_sector = dev_bytenr >> 9; 1698 bio->bi_sector = dev_bytenr >> 9;
1691 bio->bi_end_io = btrfsic_complete_bio_end_io;
1692 bio->bi_private = &complete;
1693 1699
1694 for (j = i; j < num_pages; j++) { 1700 for (j = i; j < num_pages; j++) {
1695 ret = bio_add_page(bio, block_ctx->pagev[j], 1701 ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1702,12 +1708,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
1702 "btrfsic: error, failed to add a single page!\n"); 1708 "btrfsic: error, failed to add a single page!\n");
1703 return -1; 1709 return -1;
1704 } 1710 }
1705 submit_bio(READ, bio); 1711 if (submit_bio_wait(READ, bio)) {
1706
1707 /* this will also unplug the queue */
1708 wait_for_completion(&complete);
1709
1710 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1711 printk(KERN_INFO 1712 printk(KERN_INFO
1712 "btrfsic: read error at logical %llu dev %s!\n", 1713 "btrfsic: read error at logical %llu dev %s!\n",
1713 block_ctx->start, block_ctx->dev->name); 1714 block_ctx->start, block_ctx->dev->name);
@@ -1730,11 +1731,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
1730 return block_ctx->len; 1731 return block_ctx->len;
1731} 1732}
1732 1733
1733static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
1734{
1735 complete((struct completion *)bio->bi_private);
1736}
1737
1738static void btrfsic_dump_database(struct btrfsic_state *state) 1734static void btrfsic_dump_database(struct btrfsic_state *state)
1739{ 1735{
1740 struct list_head *elem_all; 1736 struct list_head *elem_all;
@@ -2998,14 +2994,12 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
2998 return submit_bh(rw, bh); 2994 return submit_bh(rw, bh);
2999} 2995}
3000 2996
3001void btrfsic_submit_bio(int rw, struct bio *bio) 2997static void __btrfsic_submit_bio(int rw, struct bio *bio)
3002{ 2998{
3003 struct btrfsic_dev_state *dev_state; 2999 struct btrfsic_dev_state *dev_state;
3004 3000
3005 if (!btrfsic_is_initialized) { 3001 if (!btrfsic_is_initialized)
3006 submit_bio(rw, bio);
3007 return; 3002 return;
3008 }
3009 3003
3010 mutex_lock(&btrfsic_mutex); 3004 mutex_lock(&btrfsic_mutex);
3011 /* since btrfsic_submit_bio() is also called before 3005 /* since btrfsic_submit_bio() is also called before
@@ -3015,6 +3009,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
3015 (rw & WRITE) && NULL != bio->bi_io_vec) { 3009 (rw & WRITE) && NULL != bio->bi_io_vec) {
3016 unsigned int i; 3010 unsigned int i;
3017 u64 dev_bytenr; 3011 u64 dev_bytenr;
3012 u64 cur_bytenr;
3018 int bio_is_patched; 3013 int bio_is_patched;
3019 char **mapped_datav; 3014 char **mapped_datav;
3020 3015
@@ -3033,6 +3028,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
3033 GFP_NOFS); 3028 GFP_NOFS);
3034 if (!mapped_datav) 3029 if (!mapped_datav)
3035 goto leave; 3030 goto leave;
3031 cur_bytenr = dev_bytenr;
3036 for (i = 0; i < bio->bi_vcnt; i++) { 3032 for (i = 0; i < bio->bi_vcnt; i++) {
3037 BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE); 3033 BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
3038 mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); 3034 mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
@@ -3044,16 +3040,13 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
3044 kfree(mapped_datav); 3040 kfree(mapped_datav);
3045 goto leave; 3041 goto leave;
3046 } 3042 }
3047 if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | 3043 if (dev_state->state->print_mask &
3048 BTRFSIC_PRINT_MASK_VERBOSE) == 3044 BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
3049 (dev_state->state->print_mask &
3050 (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
3051 BTRFSIC_PRINT_MASK_VERBOSE)))
3052 printk(KERN_INFO 3045 printk(KERN_INFO
3053 "#%u: page=%p, len=%u, offset=%u\n", 3046 "#%u: bytenr=%llu, len=%u, offset=%u\n",
3054 i, bio->bi_io_vec[i].bv_page, 3047 i, cur_bytenr, bio->bi_io_vec[i].bv_len,
3055 bio->bi_io_vec[i].bv_len,
3056 bio->bi_io_vec[i].bv_offset); 3048 bio->bi_io_vec[i].bv_offset);
3049 cur_bytenr += bio->bi_io_vec[i].bv_len;
3057 } 3050 }
3058 btrfsic_process_written_block(dev_state, dev_bytenr, 3051 btrfsic_process_written_block(dev_state, dev_bytenr,
3059 mapped_datav, bio->bi_vcnt, 3052 mapped_datav, bio->bi_vcnt,
@@ -3097,10 +3090,20 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
3097 } 3090 }
3098leave: 3091leave:
3099 mutex_unlock(&btrfsic_mutex); 3092 mutex_unlock(&btrfsic_mutex);
3093}
3100 3094
3095void btrfsic_submit_bio(int rw, struct bio *bio)
3096{
3097 __btrfsic_submit_bio(rw, bio);
3101 submit_bio(rw, bio); 3098 submit_bio(rw, bio);
3102} 3099}
3103 3100
3101int btrfsic_submit_bio_wait(int rw, struct bio *bio)
3102{
3103 __btrfsic_submit_bio(rw, bio);
3104 return submit_bio_wait(rw, bio);
3105}
3106
3104int btrfsic_mount(struct btrfs_root *root, 3107int btrfsic_mount(struct btrfs_root *root,
3105 struct btrfs_fs_devices *fs_devices, 3108 struct btrfs_fs_devices *fs_devices,
3106 int including_extent_data, u32 print_mask) 3109 int including_extent_data, u32 print_mask)
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 8b59175cc502..13b8566c97ab 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -22,9 +22,11 @@
22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY 22#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
23int btrfsic_submit_bh(int rw, struct buffer_head *bh); 23int btrfsic_submit_bh(int rw, struct buffer_head *bh);
24void btrfsic_submit_bio(int rw, struct bio *bio); 24void btrfsic_submit_bio(int rw, struct bio *bio);
25int btrfsic_submit_bio_wait(int rw, struct bio *bio);
25#else 26#else
26#define btrfsic_submit_bh submit_bh 27#define btrfsic_submit_bh submit_bh
27#define btrfsic_submit_bio submit_bio 28#define btrfsic_submit_bio submit_bio
29#define btrfsic_submit_bio_wait submit_bio_wait
28#endif 30#endif
29 31
30int btrfsic_mount(struct btrfs_root *root, 32int btrfsic_mount(struct btrfs_root *root,
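With btrfsic_submit_bio_wait() available (falling back to plain submit_bio_wait() when the integrity checker is compiled out), callers elsewhere in this series can drop their hand-rolled completion and BIO_UPTODATE checks. A minimal C sketch of the resulting pattern, using 3.13-era bio fields; the helper name and surrounding error handling are illustrative only:

    /* Sketch only; assumes <linux/bio.h> et al. as in the files above. */
    static int btrfsic_read_page_sync(struct block_device *bdev, sector_t sector,
                                      struct page *page)
    {
            struct bio *bio = bio_alloc(GFP_NOFS, 1);

            if (!bio)
                    return -ENOMEM;
            bio->bi_bdev = bdev;
            bio->bi_sector = sector;        /* bi_iter.bi_sector in later kernels */
            bio_add_page(bio, page, PAGE_SIZE, 0);

            /* submit_bio_wait() returns non-zero on I/O error, so no private
             * completion or BIO_UPTODATE test is needed here. */
            if (btrfsic_submit_bio_wait(READ, bio)) {
                    bio_put(bio);
                    return -EIO;
            }
            bio_put(bio);
            return 0;
    }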
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f9aeb2759a64..54ab86127f7a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3613,9 +3613,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
3613 struct btrfs_ordered_sum *sums); 3613 struct btrfs_ordered_sum *sums);
3614int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 3614int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
3615 struct bio *bio, u64 file_start, int contig); 3615 struct bio *bio, u64 file_start, int contig);
3616int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
3617 struct btrfs_root *root, struct btrfs_path *path,
3618 u64 isize);
3619int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 3616int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
3620 struct list_head *list, int search_commit); 3617 struct list_head *list, int search_commit);
3621/* inode.c */ 3618/* inode.c */
@@ -3744,9 +3741,6 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
3744int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); 3741int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
3745void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 3742void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
3746 int skip_pinned); 3743 int skip_pinned);
3747int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
3748 u64 start, u64 end, int skip_pinned,
3749 int modified);
3750extern const struct file_operations btrfs_file_operations; 3744extern const struct file_operations btrfs_file_operations;
3751int __btrfs_drop_extents(struct btrfs_trans_handle *trans, 3745int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
3752 struct btrfs_root *root, struct inode *inode, 3746 struct btrfs_root *root, struct inode *inode,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 342f9fd411e3..2cfc3dfff64f 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -366,7 +366,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
366 dev_replace->tgtdev = tgt_device; 366 dev_replace->tgtdev = tgt_device;
367 367
368 printk_in_rcu(KERN_INFO 368 printk_in_rcu(KERN_INFO
369 "btrfs: dev_replace from %s (devid %llu) to %s) started\n", 369 "btrfs: dev_replace from %s (devid %llu) to %s started\n",
370 src_device->missing ? "<missing disk>" : 370 src_device->missing ? "<missing disk>" :
371 rcu_str_deref(src_device->name), 371 rcu_str_deref(src_device->name),
372 src_device->devid, 372 src_device->devid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4c4ed0bb3da1..8072cfa8a3b1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3517,7 +3517,6 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
3517int btrfs_commit_super(struct btrfs_root *root) 3517int btrfs_commit_super(struct btrfs_root *root)
3518{ 3518{
3519 struct btrfs_trans_handle *trans; 3519 struct btrfs_trans_handle *trans;
3520 int ret;
3521 3520
3522 mutex_lock(&root->fs_info->cleaner_mutex); 3521 mutex_lock(&root->fs_info->cleaner_mutex);
3523 btrfs_run_delayed_iputs(root); 3522 btrfs_run_delayed_iputs(root);
@@ -3531,25 +3530,7 @@ int btrfs_commit_super(struct btrfs_root *root)
3531 trans = btrfs_join_transaction(root); 3530 trans = btrfs_join_transaction(root);
3532 if (IS_ERR(trans)) 3531 if (IS_ERR(trans))
3533 return PTR_ERR(trans); 3532 return PTR_ERR(trans);
3534 ret = btrfs_commit_transaction(trans, root); 3533 return btrfs_commit_transaction(trans, root);
3535 if (ret)
3536 return ret;
3537 /* run commit again to drop the original snapshot */
3538 trans = btrfs_join_transaction(root);
3539 if (IS_ERR(trans))
3540 return PTR_ERR(trans);
3541 ret = btrfs_commit_transaction(trans, root);
3542 if (ret)
3543 return ret;
3544 ret = btrfs_write_and_wait_transaction(NULL, root);
3545 if (ret) {
3546 btrfs_error(root->fs_info, ret,
3547 "Failed to sync btree inode to disk.");
3548 return ret;
3549 }
3550
3551 ret = write_ctree_super(NULL, root, 0);
3552 return ret;
3553} 3534}
3554 3535
3555int close_ctree(struct btrfs_root *root) 3536int close_ctree(struct btrfs_root *root)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 45d98d01028f..9c01509dd8ab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -767,20 +767,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
767 if (!path) 767 if (!path)
768 return -ENOMEM; 768 return -ENOMEM;
769 769
770 if (metadata) {
771 key.objectid = bytenr;
772 key.type = BTRFS_METADATA_ITEM_KEY;
773 key.offset = offset;
774 } else {
775 key.objectid = bytenr;
776 key.type = BTRFS_EXTENT_ITEM_KEY;
777 key.offset = offset;
778 }
779
780 if (!trans) { 770 if (!trans) {
781 path->skip_locking = 1; 771 path->skip_locking = 1;
782 path->search_commit_root = 1; 772 path->search_commit_root = 1;
783 } 773 }
774
775search_again:
776 key.objectid = bytenr;
777 key.offset = offset;
778 if (metadata)
779 key.type = BTRFS_METADATA_ITEM_KEY;
780 else
781 key.type = BTRFS_EXTENT_ITEM_KEY;
782
784again: 783again:
785 ret = btrfs_search_slot(trans, root->fs_info->extent_root, 784 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
786 &key, path, 0, 0); 785 &key, path, 0, 0);
@@ -788,7 +787,6 @@ again:
788 goto out_free; 787 goto out_free;
789 788
790 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) { 789 if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
791 metadata = 0;
792 if (path->slots[0]) { 790 if (path->slots[0]) {
793 path->slots[0]--; 791 path->slots[0]--;
794 btrfs_item_key_to_cpu(path->nodes[0], &key, 792 btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -855,7 +853,7 @@ again:
855 mutex_lock(&head->mutex); 853 mutex_lock(&head->mutex);
856 mutex_unlock(&head->mutex); 854 mutex_unlock(&head->mutex);
857 btrfs_put_delayed_ref(&head->node); 855 btrfs_put_delayed_ref(&head->node);
858 goto again; 856 goto search_again;
859 } 857 }
860 if (head->extent_op && head->extent_op->update_flags) 858 if (head->extent_op && head->extent_op->update_flags)
861 extent_flags |= head->extent_op->flags_to_set; 859 extent_flags |= head->extent_op->flags_to_set;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 856bc2b2192c..ff43802a7c88 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1952,11 +1952,6 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1952 return err; 1952 return err;
1953} 1953}
1954 1954
1955static void repair_io_failure_callback(struct bio *bio, int err)
1956{
1957 complete(bio->bi_private);
1958}
1959
1960/* 1955/*
1961 * this bypasses the standard btrfs submit functions deliberately, as 1956 * this bypasses the standard btrfs submit functions deliberately, as
1962 * the standard behavior is to write all copies in a raid setup. here we only 1957 * the standard behavior is to write all copies in a raid setup. here we only
@@ -1973,13 +1968,13 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1973{ 1968{
1974 struct bio *bio; 1969 struct bio *bio;
1975 struct btrfs_device *dev; 1970 struct btrfs_device *dev;
1976 DECLARE_COMPLETION_ONSTACK(compl);
1977 u64 map_length = 0; 1971 u64 map_length = 0;
1978 u64 sector; 1972 u64 sector;
1979 struct btrfs_bio *bbio = NULL; 1973 struct btrfs_bio *bbio = NULL;
1980 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 1974 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
1981 int ret; 1975 int ret;
1982 1976
1977 ASSERT(!(fs_info->sb->s_flags & MS_RDONLY));
1983 BUG_ON(!mirror_num); 1978 BUG_ON(!mirror_num);
1984 1979
1985 /* we can't repair anything in raid56 yet */ 1980 /* we can't repair anything in raid56 yet */
@@ -1989,8 +1984,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1989 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 1984 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1990 if (!bio) 1985 if (!bio)
1991 return -EIO; 1986 return -EIO;
1992 bio->bi_private = &compl;
1993 bio->bi_end_io = repair_io_failure_callback;
1994 bio->bi_size = 0; 1987 bio->bi_size = 0;
1995 map_length = length; 1988 map_length = length;
1996 1989
@@ -2011,10 +2004,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
2011 } 2004 }
2012 bio->bi_bdev = dev->bdev; 2005 bio->bi_bdev = dev->bdev;
2013 bio_add_page(bio, page, length, start - page_offset(page)); 2006 bio_add_page(bio, page, length, start - page_offset(page));
2014 btrfsic_submit_bio(WRITE_SYNC, bio);
2015 wait_for_completion(&compl);
2016 2007
2017 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2008 if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
2018 /* try to remap that extent elsewhere? */ 2009 /* try to remap that extent elsewhere? */
2019 bio_put(bio); 2010 bio_put(bio);
2020 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 2011 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
@@ -2036,6 +2027,9 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2036 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); 2027 unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
2037 int ret = 0; 2028 int ret = 0;
2038 2029
2030 if (root->fs_info->sb->s_flags & MS_RDONLY)
2031 return -EROFS;
2032
2039 for (i = 0; i < num_pages; i++) { 2033 for (i = 0; i < num_pages; i++) {
2040 struct page *p = extent_buffer_page(eb, i); 2034 struct page *p = extent_buffer_page(eb, i);
2041 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, 2035 ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
@@ -2057,12 +2051,12 @@ static int clean_io_failure(u64 start, struct page *page)
2057 u64 private; 2051 u64 private;
2058 u64 private_failure; 2052 u64 private_failure;
2059 struct io_failure_record *failrec; 2053 struct io_failure_record *failrec;
2060 struct btrfs_fs_info *fs_info; 2054 struct inode *inode = page->mapping->host;
2055 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2061 struct extent_state *state; 2056 struct extent_state *state;
2062 int num_copies; 2057 int num_copies;
2063 int did_repair = 0; 2058 int did_repair = 0;
2064 int ret; 2059 int ret;
2065 struct inode *inode = page->mapping->host;
2066 2060
2067 private = 0; 2061 private = 0;
2068 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 2062 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
@@ -2085,6 +2079,8 @@ static int clean_io_failure(u64 start, struct page *page)
2085 did_repair = 1; 2079 did_repair = 1;
2086 goto out; 2080 goto out;
2087 } 2081 }
2082 if (fs_info->sb->s_flags & MS_RDONLY)
2083 goto out;
2088 2084
2089 spin_lock(&BTRFS_I(inode)->io_tree.lock); 2085 spin_lock(&BTRFS_I(inode)->io_tree.lock);
2090 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, 2086 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
@@ -2094,7 +2090,6 @@ static int clean_io_failure(u64 start, struct page *page)
2094 2090
2095 if (state && state->start <= failrec->start && 2091 if (state && state->start <= failrec->start &&
2096 state->end >= failrec->start + failrec->len - 1) { 2092 state->end >= failrec->start + failrec->len - 1) {
2097 fs_info = BTRFS_I(inode)->root->fs_info;
2098 num_copies = btrfs_num_copies(fs_info, failrec->logical, 2093 num_copies = btrfs_num_copies(fs_info, failrec->logical,
2099 failrec->len); 2094 failrec->len);
2100 if (num_copies > 1) { 2095 if (num_copies > 1) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index da8d2f696ac5..f1a77449d032 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2129,7 +2129,8 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
2129 old->extent_offset, fs_info, 2129 old->extent_offset, fs_info,
2130 path, record_one_backref, 2130 path, record_one_backref,
2131 old); 2131 old);
2132 BUG_ON(ret < 0 && ret != -ENOENT); 2132 if (ret < 0 && ret != -ENOENT)
2133 return false;
2133 2134
2134 /* no backref to be processed for this extent */ 2135 /* no backref to be processed for this extent */
2135 if (!old->count) { 2136 if (!old->count) {
@@ -6186,8 +6187,7 @@ insert:
6186 write_unlock(&em_tree->lock); 6187 write_unlock(&em_tree->lock);
6187out: 6188out:
6188 6189
6189 if (em) 6190 trace_btrfs_get_extent(root, em);
6190 trace_btrfs_get_extent(root, em);
6191 6191
6192 if (path) 6192 if (path)
6193 btrfs_free_path(path); 6193 btrfs_free_path(path);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a111622598b0..21da5762b0b1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2121,7 +2121,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2121 2121
2122 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); 2122 err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
2123 if (err == -EINTR) 2123 if (err == -EINTR)
2124 goto out; 2124 goto out_drop_write;
2125 dentry = lookup_one_len(vol_args->name, parent, namelen); 2125 dentry = lookup_one_len(vol_args->name, parent, namelen);
2126 if (IS_ERR(dentry)) { 2126 if (IS_ERR(dentry)) {
2127 err = PTR_ERR(dentry); 2127 err = PTR_ERR(dentry);
@@ -2284,6 +2284,7 @@ out_dput:
2284 dput(dentry); 2284 dput(dentry);
2285out_unlock_dir: 2285out_unlock_dir:
2286 mutex_unlock(&dir->i_mutex); 2286 mutex_unlock(&dir->i_mutex);
2287out_drop_write:
2287 mnt_drop_write_file(file); 2288 mnt_drop_write_file(file);
2288out: 2289out:
2289 kfree(vol_args); 2290 kfree(vol_args);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 25a8f3812f14..69582d5b69d1 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -638,6 +638,7 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
638 WARN_ON(nr < 0); 638 WARN_ON(nr < 0);
639 } 639 }
640 } 640 }
641 list_splice_tail(&splice, &fs_info->ordered_roots);
641 spin_unlock(&fs_info->ordered_root_lock); 642 spin_unlock(&fs_info->ordered_root_lock);
642} 643}
643 644
@@ -803,7 +804,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
803 btrfs_put_ordered_extent(ordered); 804 btrfs_put_ordered_extent(ordered);
804 break; 805 break;
805 } 806 }
806 if (ordered->file_offset + ordered->len < start) { 807 if (ordered->file_offset + ordered->len <= start) {
807 btrfs_put_ordered_extent(ordered); 808 btrfs_put_ordered_extent(ordered);
808 break; 809 break;
809 } 810 }
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ce459a7cb16d..429c73c374b8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -571,7 +571,9 @@ static int is_cowonly_root(u64 root_objectid)
571 root_objectid == BTRFS_CHUNK_TREE_OBJECTID || 571 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
572 root_objectid == BTRFS_DEV_TREE_OBJECTID || 572 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
573 root_objectid == BTRFS_TREE_LOG_OBJECTID || 573 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
574 root_objectid == BTRFS_CSUM_TREE_OBJECTID) 574 root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
575 root_objectid == BTRFS_UUID_TREE_OBJECTID ||
576 root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
575 return 1; 577 return 1;
576 return 0; 578 return 0;
577} 579}
@@ -1264,10 +1266,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
1264} 1266}
1265 1267
1266/* 1268/*
1267 * helper to update/delete the 'address of tree root -> reloc tree' 1269 * helper to delete the 'address of tree root -> reloc tree'
1268 * mapping 1270 * mapping
1269 */ 1271 */
1270static int __update_reloc_root(struct btrfs_root *root, int del) 1272static void __del_reloc_root(struct btrfs_root *root)
1271{ 1273{
1272 struct rb_node *rb_node; 1274 struct rb_node *rb_node;
1273 struct mapping_node *node = NULL; 1275 struct mapping_node *node = NULL;
@@ -1275,7 +1277,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
1275 1277
1276 spin_lock(&rc->reloc_root_tree.lock); 1278 spin_lock(&rc->reloc_root_tree.lock);
1277 rb_node = tree_search(&rc->reloc_root_tree.rb_root, 1279 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1278 root->commit_root->start); 1280 root->node->start);
1279 if (rb_node) { 1281 if (rb_node) {
1280 node = rb_entry(rb_node, struct mapping_node, rb_node); 1282 node = rb_entry(rb_node, struct mapping_node, rb_node);
1281 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root); 1283 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1283,23 +1285,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
1283 spin_unlock(&rc->reloc_root_tree.lock); 1285 spin_unlock(&rc->reloc_root_tree.lock);
1284 1286
1285 if (!node) 1287 if (!node)
1286 return 0; 1288 return;
1287 BUG_ON((struct btrfs_root *)node->data != root); 1289 BUG_ON((struct btrfs_root *)node->data != root);
1288 1290
1289 if (!del) { 1291 spin_lock(&root->fs_info->trans_lock);
1290 spin_lock(&rc->reloc_root_tree.lock); 1292 list_del_init(&root->root_list);
1291 node->bytenr = root->node->start; 1293 spin_unlock(&root->fs_info->trans_lock);
1292 rb_node = tree_insert(&rc->reloc_root_tree.rb_root, 1294 kfree(node);
1293 node->bytenr, &node->rb_node); 1295}
1294 spin_unlock(&rc->reloc_root_tree.lock); 1296
1295 if (rb_node) 1297/*
1296 backref_tree_panic(rb_node, -EEXIST, node->bytenr); 1298 * helper to update the 'address of tree root -> reloc tree'
1297 } else { 1299 * mapping
1298 spin_lock(&root->fs_info->trans_lock); 1300 */
1299 list_del_init(&root->root_list); 1301static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
1300 spin_unlock(&root->fs_info->trans_lock); 1302{
1301 kfree(node); 1303 struct rb_node *rb_node;
1304 struct mapping_node *node = NULL;
1305 struct reloc_control *rc = root->fs_info->reloc_ctl;
1306
1307 spin_lock(&rc->reloc_root_tree.lock);
1308 rb_node = tree_search(&rc->reloc_root_tree.rb_root,
1309 root->node->start);
1310 if (rb_node) {
1311 node = rb_entry(rb_node, struct mapping_node, rb_node);
1312 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
1302 } 1313 }
1314 spin_unlock(&rc->reloc_root_tree.lock);
1315
1316 if (!node)
1317 return 0;
1318 BUG_ON((struct btrfs_root *)node->data != root);
1319
1320 spin_lock(&rc->reloc_root_tree.lock);
1321 node->bytenr = new_bytenr;
1322 rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
1323 node->bytenr, &node->rb_node);
1324 spin_unlock(&rc->reloc_root_tree.lock);
1325 if (rb_node)
1326 backref_tree_panic(rb_node, -EEXIST, node->bytenr);
1303 return 0; 1327 return 0;
1304} 1328}
1305 1329
@@ -1420,7 +1444,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1420{ 1444{
1421 struct btrfs_root *reloc_root; 1445 struct btrfs_root *reloc_root;
1422 struct btrfs_root_item *root_item; 1446 struct btrfs_root_item *root_item;
1423 int del = 0;
1424 int ret; 1447 int ret;
1425 1448
1426 if (!root->reloc_root) 1449 if (!root->reloc_root)
@@ -1432,11 +1455,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1432 if (root->fs_info->reloc_ctl->merge_reloc_tree && 1455 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1433 btrfs_root_refs(root_item) == 0) { 1456 btrfs_root_refs(root_item) == 0) {
1434 root->reloc_root = NULL; 1457 root->reloc_root = NULL;
1435 del = 1; 1458 __del_reloc_root(reloc_root);
1436 } 1459 }
1437 1460
1438 __update_reloc_root(reloc_root, del);
1439
1440 if (reloc_root->commit_root != reloc_root->node) { 1461 if (reloc_root->commit_root != reloc_root->node) {
1441 btrfs_set_root_node(root_item, reloc_root->node); 1462 btrfs_set_root_node(root_item, reloc_root->node);
1442 free_extent_buffer(reloc_root->commit_root); 1463 free_extent_buffer(reloc_root->commit_root);
@@ -2287,7 +2308,7 @@ void free_reloc_roots(struct list_head *list)
2287 while (!list_empty(list)) { 2308 while (!list_empty(list)) {
2288 reloc_root = list_entry(list->next, struct btrfs_root, 2309 reloc_root = list_entry(list->next, struct btrfs_root,
2289 root_list); 2310 root_list);
2290 __update_reloc_root(reloc_root, 1); 2311 __del_reloc_root(reloc_root);
2291 free_extent_buffer(reloc_root->node); 2312 free_extent_buffer(reloc_root->node);
2292 free_extent_buffer(reloc_root->commit_root); 2313 free_extent_buffer(reloc_root->commit_root);
2293 kfree(reloc_root); 2314 kfree(reloc_root);
@@ -2332,7 +2353,7 @@ again:
2332 2353
2333 ret = merge_reloc_root(rc, root); 2354 ret = merge_reloc_root(rc, root);
2334 if (ret) { 2355 if (ret) {
2335 __update_reloc_root(reloc_root, 1); 2356 __del_reloc_root(reloc_root);
2336 free_extent_buffer(reloc_root->node); 2357 free_extent_buffer(reloc_root->node);
2337 free_extent_buffer(reloc_root->commit_root); 2358 free_extent_buffer(reloc_root->commit_root);
2338 kfree(reloc_root); 2359 kfree(reloc_root);
@@ -2388,6 +2409,13 @@ out:
2388 btrfs_std_error(root->fs_info, ret); 2409 btrfs_std_error(root->fs_info, ret);
2389 if (!list_empty(&reloc_roots)) 2410 if (!list_empty(&reloc_roots))
2390 free_reloc_roots(&reloc_roots); 2411 free_reloc_roots(&reloc_roots);
2412
2413 /* new reloc root may be added */
2414 mutex_lock(&root->fs_info->reloc_mutex);
2415 list_splice_init(&rc->reloc_roots, &reloc_roots);
2416 mutex_unlock(&root->fs_info->reloc_mutex);
2417 if (!list_empty(&reloc_roots))
2418 free_reloc_roots(&reloc_roots);
2391 } 2419 }
2392 2420
2393 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2421 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
@@ -4522,6 +4550,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4522 BUG_ON(rc->stage == UPDATE_DATA_PTRS && 4550 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4523 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); 4551 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4524 4552
4553 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
4554 if (buf == root->node)
4555 __update_reloc_root(root, cow->start);
4556 }
4557
4525 level = btrfs_header_level(buf); 4558 level = btrfs_header_level(buf);
4526 if (btrfs_header_generation(buf) <= 4559 if (btrfs_header_generation(buf) <=
4527 btrfs_root_last_snapshot(&root->root_item)) 4560 btrfs_root_last_snapshot(&root->root_item))
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 2544805544f0..1fd3f33c330a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -208,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
208 int is_metadata, int have_csum, 208 int is_metadata, int have_csum,
209 const u8 *csum, u64 generation, 209 const u8 *csum, u64 generation,
210 u16 csum_size); 210 u16 csum_size);
211static void scrub_complete_bio_end_io(struct bio *bio, int err);
212static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 211static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
213 struct scrub_block *sblock_good, 212 struct scrub_block *sblock_good,
214 int force_write); 213 int force_write);
@@ -938,8 +937,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
938 BTRFS_DEV_STAT_CORRUPTION_ERRS); 937 BTRFS_DEV_STAT_CORRUPTION_ERRS);
939 } 938 }
940 939
941 if (sctx->readonly && !sctx->is_dev_replace) 940 if (sctx->readonly) {
942 goto did_not_correct_error; 941 ASSERT(!sctx->is_dev_replace);
942 goto out;
943 }
943 944
944 if (!is_metadata && !have_csum) { 945 if (!is_metadata && !have_csum) {
945 struct scrub_fixup_nodatasum *fixup_nodatasum; 946 struct scrub_fixup_nodatasum *fixup_nodatasum;
@@ -1292,7 +1293,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1292 for (page_num = 0; page_num < sblock->page_count; page_num++) { 1293 for (page_num = 0; page_num < sblock->page_count; page_num++) {
1293 struct bio *bio; 1294 struct bio *bio;
1294 struct scrub_page *page = sblock->pagev[page_num]; 1295 struct scrub_page *page = sblock->pagev[page_num];
1295 DECLARE_COMPLETION_ONSTACK(complete);
1296 1296
1297 if (page->dev->bdev == NULL) { 1297 if (page->dev->bdev == NULL) {
1298 page->io_error = 1; 1298 page->io_error = 1;
@@ -1309,18 +1309,11 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1309 } 1309 }
1310 bio->bi_bdev = page->dev->bdev; 1310 bio->bi_bdev = page->dev->bdev;
1311 bio->bi_sector = page->physical >> 9; 1311 bio->bi_sector = page->physical >> 9;
1312 bio->bi_end_io = scrub_complete_bio_end_io;
1313 bio->bi_private = &complete;
1314 1312
1315 bio_add_page(bio, page->page, PAGE_SIZE, 0); 1313 bio_add_page(bio, page->page, PAGE_SIZE, 0);
1316 btrfsic_submit_bio(READ, bio); 1314 if (btrfsic_submit_bio_wait(READ, bio))
1317
1318 /* this will also unplug the queue */
1319 wait_for_completion(&complete);
1320
1321 page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1322 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1323 sblock->no_io_error_seen = 0; 1315 sblock->no_io_error_seen = 0;
1316
1324 bio_put(bio); 1317 bio_put(bio);
1325 } 1318 }
1326 1319
@@ -1389,11 +1382,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1389 sblock->checksum_error = 1; 1382 sblock->checksum_error = 1;
1390} 1383}
1391 1384
1392static void scrub_complete_bio_end_io(struct bio *bio, int err)
1393{
1394 complete((struct completion *)bio->bi_private);
1395}
1396
1397static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1385static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1398 struct scrub_block *sblock_good, 1386 struct scrub_block *sblock_good,
1399 int force_write) 1387 int force_write)
@@ -1428,7 +1416,6 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1428 sblock_bad->checksum_error || page_bad->io_error) { 1416 sblock_bad->checksum_error || page_bad->io_error) {
1429 struct bio *bio; 1417 struct bio *bio;
1430 int ret; 1418 int ret;
1431 DECLARE_COMPLETION_ONSTACK(complete);
1432 1419
1433 if (!page_bad->dev->bdev) { 1420 if (!page_bad->dev->bdev) {
1434 printk_ratelimited(KERN_WARNING 1421 printk_ratelimited(KERN_WARNING
@@ -1441,19 +1428,14 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1441 return -EIO; 1428 return -EIO;
1442 bio->bi_bdev = page_bad->dev->bdev; 1429 bio->bi_bdev = page_bad->dev->bdev;
1443 bio->bi_sector = page_bad->physical >> 9; 1430 bio->bi_sector = page_bad->physical >> 9;
1444 bio->bi_end_io = scrub_complete_bio_end_io;
1445 bio->bi_private = &complete;
1446 1431
1447 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0); 1432 ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1448 if (PAGE_SIZE != ret) { 1433 if (PAGE_SIZE != ret) {
1449 bio_put(bio); 1434 bio_put(bio);
1450 return -EIO; 1435 return -EIO;
1451 } 1436 }
1452 btrfsic_submit_bio(WRITE, bio);
1453 1437
1454 /* this will also unplug the queue */ 1438 if (btrfsic_submit_bio_wait(WRITE, bio)) {
1455 wait_for_completion(&complete);
1456 if (!bio_flagged(bio, BIO_UPTODATE)) {
1457 btrfs_dev_stat_inc_and_print(page_bad->dev, 1439 btrfs_dev_stat_inc_and_print(page_bad->dev,
1458 BTRFS_DEV_STAT_WRITE_ERRS); 1440 BTRFS_DEV_STAT_WRITE_ERRS);
1459 btrfs_dev_replace_stats_inc( 1441 btrfs_dev_replace_stats_inc(
@@ -3373,7 +3355,6 @@ static int write_page_nocow(struct scrub_ctx *sctx,
3373 struct bio *bio; 3355 struct bio *bio;
3374 struct btrfs_device *dev; 3356 struct btrfs_device *dev;
3375 int ret; 3357 int ret;
3376 DECLARE_COMPLETION_ONSTACK(compl);
3377 3358
3378 dev = sctx->wr_ctx.tgtdev; 3359 dev = sctx->wr_ctx.tgtdev;
3379 if (!dev) 3360 if (!dev)
@@ -3390,8 +3371,6 @@ static int write_page_nocow(struct scrub_ctx *sctx,
3390 spin_unlock(&sctx->stat_lock); 3371 spin_unlock(&sctx->stat_lock);
3391 return -ENOMEM; 3372 return -ENOMEM;
3392 } 3373 }
3393 bio->bi_private = &compl;
3394 bio->bi_end_io = scrub_complete_bio_end_io;
3395 bio->bi_size = 0; 3374 bio->bi_size = 0;
3396 bio->bi_sector = physical_for_dev_replace >> 9; 3375 bio->bi_sector = physical_for_dev_replace >> 9;
3397 bio->bi_bdev = dev->bdev; 3376 bio->bi_bdev = dev->bdev;
@@ -3402,10 +3381,8 @@ leave_with_eio:
3402 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 3381 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3403 return -EIO; 3382 return -EIO;
3404 } 3383 }
3405 btrfsic_submit_bio(WRITE_SYNC, bio);
3406 wait_for_completion(&compl);
3407 3384
3408 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 3385 if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
3409 goto leave_with_eio; 3386 goto leave_with_eio;
3410 3387
3411 bio_put(bio); 3388 bio_put(bio);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6837fe87f3a6..945d1db98f26 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4723,8 +4723,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
4723 } 4723 }
4724 4724
4725 if (!access_ok(VERIFY_READ, arg->clone_sources, 4725 if (!access_ok(VERIFY_READ, arg->clone_sources,
4726 sizeof(*arg->clone_sources * 4726 sizeof(*arg->clone_sources) *
4727 arg->clone_sources_count))) { 4727 arg->clone_sources_count)) {
4728 ret = -EFAULT; 4728 ret = -EFAULT;
4729 goto out; 4729 goto out;
4730 } 4730 }
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2d8ac1bf0cf9..d71a11d13dfa 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -432,7 +432,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
432 } else { 432 } else {
433 printk(KERN_INFO "btrfs: setting nodatacow\n"); 433 printk(KERN_INFO "btrfs: setting nodatacow\n");
434 } 434 }
435 info->compress_type = BTRFS_COMPRESS_NONE;
436 btrfs_clear_opt(info->mount_opt, COMPRESS); 435 btrfs_clear_opt(info->mount_opt, COMPRESS);
437 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); 436 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
438 btrfs_set_opt(info->mount_opt, NODATACOW); 437 btrfs_set_opt(info->mount_opt, NODATACOW);
@@ -461,7 +460,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
461 btrfs_set_fs_incompat(info, COMPRESS_LZO); 460 btrfs_set_fs_incompat(info, COMPRESS_LZO);
462 } else if (strncmp(args[0].from, "no", 2) == 0) { 461 } else if (strncmp(args[0].from, "no", 2) == 0) {
463 compress_type = "no"; 462 compress_type = "no";
464 info->compress_type = BTRFS_COMPRESS_NONE;
465 btrfs_clear_opt(info->mount_opt, COMPRESS); 463 btrfs_clear_opt(info->mount_opt, COMPRESS);
466 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); 464 btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
467 compress_force = false; 465 compress_force = false;
@@ -474,9 +472,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
474 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 472 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
475 pr_info("btrfs: force %s compression\n", 473 pr_info("btrfs: force %s compression\n",
476 compress_type); 474 compress_type);
477 } else 475 } else if (btrfs_test_opt(root, COMPRESS)) {
478 pr_info("btrfs: use %s compression\n", 476 pr_info("btrfs: use %s compression\n",
479 compress_type); 477 compress_type);
478 }
480 break; 479 break;
481 case Opt_ssd: 480 case Opt_ssd:
482 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 481 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 57c16b46afbd..c6a872a8a468 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1480,7 +1480,7 @@ static void do_async_commit(struct work_struct *work)
1480 * We've got freeze protection passed with the transaction. 1480 * We've got freeze protection passed with the transaction.
1481 * Tell lockdep about it. 1481 * Tell lockdep about it.
1482 */ 1482 */
1483 if (ac->newtrans->type < TRANS_JOIN_NOLOCK) 1483 if (ac->newtrans->type & __TRANS_FREEZABLE)
1484 rwsem_acquire_read( 1484 rwsem_acquire_read(
1485 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1485 &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1486 0, 1, _THIS_IP_); 1486 0, 1, _THIS_IP_);
@@ -1521,7 +1521,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1521 * Tell lockdep we've released the freeze rwsem, since the 1521 * Tell lockdep we've released the freeze rwsem, since the
1522 * async commit thread will be the one to unlock it. 1522 * async commit thread will be the one to unlock it.
1523 */ 1523 */
1524 if (trans->type < TRANS_JOIN_NOLOCK) 1524 if (ac->newtrans->type & __TRANS_FREEZABLE)
1525 rwsem_release( 1525 rwsem_release(
1526 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], 1526 &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
1527 1, _THIS_IP_); 1527 1, _THIS_IP_);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 744553c83fe2..9f7fc51ca334 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3697,7 +3697,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3697 ret = btrfs_truncate_inode_items(trans, log, 3697 ret = btrfs_truncate_inode_items(trans, log,
3698 inode, 0, 0); 3698 inode, 0, 0);
3699 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 3699 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
3700 &BTRFS_I(inode)->runtime_flags)) { 3700 &BTRFS_I(inode)->runtime_flags) ||
3701 inode_only == LOG_INODE_EXISTS) {
3701 if (inode_only == LOG_INODE_ALL) 3702 if (inode_only == LOG_INODE_ALL)
3702 fast_search = true; 3703 fast_search = true;
3703 max_key.type = BTRFS_XATTR_ITEM_KEY; 3704 max_key.type = BTRFS_XATTR_ITEM_KEY;
@@ -3801,7 +3802,7 @@ log_extents:
3801 err = ret; 3802 err = ret;
3802 goto out_unlock; 3803 goto out_unlock;
3803 } 3804 }
3804 } else { 3805 } else if (inode_only == LOG_INODE_ALL) {
3805 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; 3806 struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
3806 struct extent_map *em, *n; 3807 struct extent_map *em, *n;
3807 3808
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0db637097862..92303f42baaa 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5394,7 +5394,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5394{ 5394{
5395 struct bio_vec *prev; 5395 struct bio_vec *prev;
5396 struct request_queue *q = bdev_get_queue(bdev); 5396 struct request_queue *q = bdev_get_queue(bdev);
5397 unsigned short max_sectors = queue_max_sectors(q); 5397 unsigned int max_sectors = queue_max_sectors(q);
5398 struct bvec_merge_data bvm = { 5398 struct bvec_merge_data bvm = {
5399 .bi_bdev = bdev, 5399 .bi_bdev = bdev,
5400 .bi_sector = sector, 5400 .bi_sector = sector,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd481425..1e561c059539 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -216,7 +216,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
216 } 216 }
217 SetPageUptodate(page); 217 SetPageUptodate(page);
218 218
219 if (err == 0) 219 if (err >= 0)
220 ceph_readpage_to_fscache(inode, page); 220 ceph_readpage_to_fscache(inode, page);
221 221
222out: 222out:
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 7db2e6ca4b8f..8c44fdd4e1c3 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -324,6 +324,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
324{ 324{
325 struct ceph_inode_info *ci = ceph_inode(inode); 325 struct ceph_inode_info *ci = ceph_inode(inode);
326 326
327 if (!PageFsCache(page))
328 return;
329
327 fscache_wait_on_page_write(ci->fscache, page); 330 fscache_wait_on_page_write(ci->fscache, page);
328 fscache_uncache_page(ci->fscache, page); 331 fscache_uncache_page(ci->fscache, page);
329} 332}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 13976c33332e..3c0a4bd74996 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -897,7 +897,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
897 * caller should hold i_ceph_lock. 897 * caller should hold i_ceph_lock.
898 * caller will not hold session s_mutex if called from destroy_inode. 898 * caller will not hold session s_mutex if called from destroy_inode.
899 */ 899 */
900void __ceph_remove_cap(struct ceph_cap *cap) 900void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
901{ 901{
902 struct ceph_mds_session *session = cap->session; 902 struct ceph_mds_session *session = cap->session;
903 struct ceph_inode_info *ci = cap->ci; 903 struct ceph_inode_info *ci = cap->ci;
@@ -909,6 +909,16 @@ void __ceph_remove_cap(struct ceph_cap *cap)
909 909
910 /* remove from session list */ 910 /* remove from session list */
911 spin_lock(&session->s_cap_lock); 911 spin_lock(&session->s_cap_lock);
912 /*
913 * s_cap_reconnect is protected by s_cap_lock. no one changes
914 * s_cap_gen while session is in the reconnect state.
915 */
916 if (queue_release &&
917 (!session->s_cap_reconnect ||
918 cap->cap_gen == session->s_cap_gen))
919 __queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
920 cap->mseq, cap->issue_seq);
921
912 if (session->s_cap_iterator == cap) { 922 if (session->s_cap_iterator == cap) {
913 /* not yet, we are iterating over this very cap */ 923 /* not yet, we are iterating over this very cap */
914 dout("__ceph_remove_cap delaying %p removal from session %p\n", 924 dout("__ceph_remove_cap delaying %p removal from session %p\n",
@@ -1023,7 +1033,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
1023 struct ceph_mds_cap_release *head; 1033 struct ceph_mds_cap_release *head;
1024 struct ceph_mds_cap_item *item; 1034 struct ceph_mds_cap_item *item;
1025 1035
1026 spin_lock(&session->s_cap_lock);
1027 BUG_ON(!session->s_num_cap_releases); 1036 BUG_ON(!session->s_num_cap_releases);
1028 msg = list_first_entry(&session->s_cap_releases, 1037 msg = list_first_entry(&session->s_cap_releases,
1029 struct ceph_msg, list_head); 1038 struct ceph_msg, list_head);
@@ -1052,7 +1061,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
1052 (int)CEPH_CAPS_PER_RELEASE, 1061 (int)CEPH_CAPS_PER_RELEASE,
1053 (int)msg->front.iov_len); 1062 (int)msg->front.iov_len);
1054 } 1063 }
1055 spin_unlock(&session->s_cap_lock);
1056} 1064}
1057 1065
1058/* 1066/*
@@ -1067,12 +1075,8 @@ void ceph_queue_caps_release(struct inode *inode)
1067 p = rb_first(&ci->i_caps); 1075 p = rb_first(&ci->i_caps);
1068 while (p) { 1076 while (p) {
1069 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1077 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
1070 struct ceph_mds_session *session = cap->session;
1071
1072 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1073 cap->mseq, cap->issue_seq);
1074 p = rb_next(p); 1078 p = rb_next(p);
1075 __ceph_remove_cap(cap); 1079 __ceph_remove_cap(cap, true);
1076 } 1080 }
1077} 1081}
1078 1082
@@ -2791,7 +2795,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2791 } 2795 }
2792 spin_unlock(&mdsc->cap_dirty_lock); 2796 spin_unlock(&mdsc->cap_dirty_lock);
2793 } 2797 }
2794 __ceph_remove_cap(cap); 2798 __ceph_remove_cap(cap, false);
2795 } 2799 }
2796 /* else, we already released it */ 2800 /* else, we already released it */
2797 2801
@@ -2931,9 +2935,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2931 if (!inode) { 2935 if (!inode) {
2932 dout(" i don't have ino %llx\n", vino.ino); 2936 dout(" i don't have ino %llx\n", vino.ino);
2933 2937
2934 if (op == CEPH_CAP_OP_IMPORT) 2938 if (op == CEPH_CAP_OP_IMPORT) {
2939 spin_lock(&session->s_cap_lock);
2935 __queue_cap_release(session, vino.ino, cap_id, 2940 __queue_cap_release(session, vino.ino, cap_id,
2936 mseq, seq); 2941 mseq, seq);
2942 spin_unlock(&session->s_cap_lock);
2943 }
2937 goto flush_cap_releases; 2944 goto flush_cap_releases;
2938 } 2945 }
2939 2946
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 868b61d56cac..2a0bcaeb189a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -352,8 +352,18 @@ more:
352 } 352 }
353 353
354 /* note next offset and last dentry name */ 354 /* note next offset and last dentry name */
355 rinfo = &req->r_reply_info;
356 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
357 frag = le32_to_cpu(rinfo->dir_dir->frag);
358 if (ceph_frag_is_leftmost(frag))
359 fi->next_offset = 2;
360 else
361 fi->next_offset = 0;
362 off = fi->next_offset;
363 }
355 fi->offset = fi->next_offset; 364 fi->offset = fi->next_offset;
356 fi->last_readdir = req; 365 fi->last_readdir = req;
366 fi->frag = frag;
357 367
358 if (req->r_reply_info.dir_end) { 368 if (req->r_reply_info.dir_end) {
359 kfree(fi->last_name); 369 kfree(fi->last_name);
@@ -363,7 +373,6 @@ more:
363 else 373 else
364 fi->next_offset = 0; 374 fi->next_offset = 0;
365 } else { 375 } else {
366 rinfo = &req->r_reply_info;
367 err = note_last_dentry(fi, 376 err = note_last_dentry(fi,
368 rinfo->dir_dname[rinfo->dir_nr-1], 377 rinfo->dir_dname[rinfo->dir_nr-1],
369 rinfo->dir_dname_len[rinfo->dir_nr-1]); 378 rinfo->dir_dname_len[rinfo->dir_nr-1]);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8549a48115f7..9a8e396aed89 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -577,6 +577,8 @@ static int fill_inode(struct inode *inode,
577 int issued = 0, implemented; 577 int issued = 0, implemented;
578 struct timespec mtime, atime, ctime; 578 struct timespec mtime, atime, ctime;
579 u32 nsplits; 579 u32 nsplits;
580 struct ceph_inode_frag *frag;
581 struct rb_node *rb_node;
580 struct ceph_buffer *xattr_blob = NULL; 582 struct ceph_buffer *xattr_blob = NULL;
581 int err = 0; 583 int err = 0;
582 int queue_trunc = 0; 584 int queue_trunc = 0;
@@ -751,15 +753,38 @@ no_change:
751 /* FIXME: move me up, if/when version reflects fragtree changes */ 753 /* FIXME: move me up, if/when version reflects fragtree changes */
752 nsplits = le32_to_cpu(info->fragtree.nsplits); 754 nsplits = le32_to_cpu(info->fragtree.nsplits);
753 mutex_lock(&ci->i_fragtree_mutex); 755 mutex_lock(&ci->i_fragtree_mutex);
756 rb_node = rb_first(&ci->i_fragtree);
754 for (i = 0; i < nsplits; i++) { 757 for (i = 0; i < nsplits; i++) {
755 u32 id = le32_to_cpu(info->fragtree.splits[i].frag); 758 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
756 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id); 759 frag = NULL;
757 760 while (rb_node) {
758 if (IS_ERR(frag)) 761 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
759 continue; 762 if (ceph_frag_compare(frag->frag, id) >= 0) {
763 if (frag->frag != id)
764 frag = NULL;
765 else
766 rb_node = rb_next(rb_node);
767 break;
768 }
769 rb_node = rb_next(rb_node);
770 rb_erase(&frag->node, &ci->i_fragtree);
771 kfree(frag);
772 frag = NULL;
773 }
774 if (!frag) {
775 frag = __get_or_create_frag(ci, id);
776 if (IS_ERR(frag))
777 continue;
778 }
760 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); 779 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
761 dout(" frag %x split by %d\n", frag->frag, frag->split_by); 780 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
762 } 781 }
782 while (rb_node) {
783 frag = rb_entry(rb_node, struct ceph_inode_frag, node);
784 rb_node = rb_next(rb_node);
785 rb_erase(&frag->node, &ci->i_fragtree);
786 kfree(frag);
787 }
763 mutex_unlock(&ci->i_fragtree_mutex); 788 mutex_unlock(&ci->i_fragtree_mutex);
764 789
765 /* were we issued a capability? */ 790 /* were we issued a capability? */
@@ -1250,8 +1275,20 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1250 int err = 0, i; 1275 int err = 0, i;
1251 struct inode *snapdir = NULL; 1276 struct inode *snapdir = NULL;
1252 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1277 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1253 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1254 struct ceph_dentry_info *di; 1278 struct ceph_dentry_info *di;
1279 u64 r_readdir_offset = req->r_readdir_offset;
1280 u32 frag = le32_to_cpu(rhead->args.readdir.frag);
1281
1282 if (rinfo->dir_dir &&
1283 le32_to_cpu(rinfo->dir_dir->frag) != frag) {
1284 dout("readdir_prepopulate got new frag %x -> %x\n",
1285 frag, le32_to_cpu(rinfo->dir_dir->frag));
1286 frag = le32_to_cpu(rinfo->dir_dir->frag);
1287 if (ceph_frag_is_leftmost(frag))
1288 r_readdir_offset = 2;
1289 else
1290 r_readdir_offset = 0;
1291 }
1255 1292
1256 if (req->r_aborted) 1293 if (req->r_aborted)
1257 return readdir_prepopulate_inodes_only(req, session); 1294 return readdir_prepopulate_inodes_only(req, session);
@@ -1315,7 +1352,7 @@ retry_lookup:
1315 } 1352 }
1316 1353
1317 di = dn->d_fsdata; 1354 di = dn->d_fsdata;
1318 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); 1355 di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
1319 1356
1320 /* inode */ 1357 /* inode */
1321 if (dn->d_inode) { 1358 if (dn->d_inode) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b7bda5d9611d..d90861f45210 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -43,6 +43,7 @@
43 */ 43 */
44 44
45struct ceph_reconnect_state { 45struct ceph_reconnect_state {
46 int nr_caps;
46 struct ceph_pagelist *pagelist; 47 struct ceph_pagelist *pagelist;
47 bool flock; 48 bool flock;
48}; 49};
@@ -443,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
443 INIT_LIST_HEAD(&s->s_waiting); 444 INIT_LIST_HEAD(&s->s_waiting);
444 INIT_LIST_HEAD(&s->s_unsafe); 445 INIT_LIST_HEAD(&s->s_unsafe);
445 s->s_num_cap_releases = 0; 446 s->s_num_cap_releases = 0;
447 s->s_cap_reconnect = 0;
446 s->s_cap_iterator = NULL; 448 s->s_cap_iterator = NULL;
447 INIT_LIST_HEAD(&s->s_cap_releases); 449 INIT_LIST_HEAD(&s->s_cap_releases);
448 INIT_LIST_HEAD(&s->s_cap_releases_done); 450 INIT_LIST_HEAD(&s->s_cap_releases_done);
@@ -642,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
642 req->r_unsafe_dir = NULL; 644 req->r_unsafe_dir = NULL;
643 } 645 }
644 646
647 complete_all(&req->r_safe_completion);
648
645 ceph_mdsc_put_request(req); 649 ceph_mdsc_put_request(req);
646} 650}
647 651
@@ -986,7 +990,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
986 dout("removing cap %p, ci is %p, inode is %p\n", 990 dout("removing cap %p, ci is %p, inode is %p\n",
987 cap, ci, &ci->vfs_inode); 991 cap, ci, &ci->vfs_inode);
988 spin_lock(&ci->i_ceph_lock); 992 spin_lock(&ci->i_ceph_lock);
989 __ceph_remove_cap(cap); 993 __ceph_remove_cap(cap, false);
990 if (!__ceph_is_any_real_caps(ci)) { 994 if (!__ceph_is_any_real_caps(ci)) {
991 struct ceph_mds_client *mdsc = 995 struct ceph_mds_client *mdsc =
992 ceph_sb_to_client(inode->i_sb)->mdsc; 996 ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1231,9 +1235,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1231 session->s_trim_caps--; 1235 session->s_trim_caps--;
1232 if (oissued) { 1236 if (oissued) {
1233 /* we aren't the only cap.. just remove us */ 1237 /* we aren't the only cap.. just remove us */
1234 __queue_cap_release(session, ceph_ino(inode), cap->cap_id, 1238 __ceph_remove_cap(cap, true);
1235 cap->mseq, cap->issue_seq);
1236 __ceph_remove_cap(cap);
1237 } else { 1239 } else {
1238 /* try to drop referring dentries */ 1240 /* try to drop referring dentries */
1239 spin_unlock(&ci->i_ceph_lock); 1241 spin_unlock(&ci->i_ceph_lock);
@@ -1416,7 +1418,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1416 unsigned num; 1418 unsigned num;
1417 1419
1418 dout("discard_cap_releases mds%d\n", session->s_mds); 1420 dout("discard_cap_releases mds%d\n", session->s_mds);
1419 spin_lock(&session->s_cap_lock);
1420 1421
1421 /* zero out the in-progress message */ 1422 /* zero out the in-progress message */
1422 msg = list_first_entry(&session->s_cap_releases, 1423 msg = list_first_entry(&session->s_cap_releases,
@@ -1443,8 +1444,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
1443 msg->front.iov_len = sizeof(*head); 1444 msg->front.iov_len = sizeof(*head);
1444 list_add(&msg->list_head, &session->s_cap_releases); 1445 list_add(&msg->list_head, &session->s_cap_releases);
1445 } 1446 }
1446
1447 spin_unlock(&session->s_cap_lock);
1448} 1447}
1449 1448
1450/* 1449/*
@@ -1875,8 +1874,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
1875 int mds = -1; 1874 int mds = -1;
1876 int err = -EAGAIN; 1875 int err = -EAGAIN;
1877 1876
1878 if (req->r_err || req->r_got_result) 1877 if (req->r_err || req->r_got_result) {
1878 if (req->r_aborted)
1879 __unregister_request(mdsc, req);
1879 goto out; 1880 goto out;
1881 }
1880 1882
1881 if (req->r_timeout && 1883 if (req->r_timeout &&
1882 time_after_eq(jiffies, req->r_started + req->r_timeout)) { 1884 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
@@ -2186,7 +2188,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2186 if (head->safe) { 2188 if (head->safe) {
2187 req->r_got_safe = true; 2189 req->r_got_safe = true;
2188 __unregister_request(mdsc, req); 2190 __unregister_request(mdsc, req);
2189 complete_all(&req->r_safe_completion);
2190 2191
2191 if (req->r_got_unsafe) { 2192 if (req->r_got_unsafe) {
2192 /* 2193 /*
@@ -2238,8 +2239,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2238 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2239 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2239 if (err == 0) { 2240 if (err == 0) {
2240 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || 2241 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
2241 req->r_op == CEPH_MDS_OP_LSSNAP) && 2242 req->r_op == CEPH_MDS_OP_LSSNAP))
2242 rinfo->dir_nr)
2243 ceph_readdir_prepopulate(req, req->r_session); 2243 ceph_readdir_prepopulate(req, req->r_session);
2244 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2244 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
2245 } 2245 }
@@ -2490,6 +2490,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2490 cap->seq = 0; /* reset cap seq */ 2490 cap->seq = 0; /* reset cap seq */
2491 cap->issue_seq = 0; /* and issue_seq */ 2491 cap->issue_seq = 0; /* and issue_seq */
2492 cap->mseq = 0; /* and migrate_seq */ 2492 cap->mseq = 0; /* and migrate_seq */
2493 cap->cap_gen = cap->session->s_cap_gen;
2493 2494
2494 if (recon_state->flock) { 2495 if (recon_state->flock) {
2495 rec.v2.cap_id = cpu_to_le64(cap->cap_id); 2496 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2552,6 +2553,8 @@ encode_again:
2552 } else { 2553 } else {
2553 err = ceph_pagelist_append(pagelist, &rec, reclen); 2554 err = ceph_pagelist_append(pagelist, &rec, reclen);
2554 } 2555 }
2556
2557 recon_state->nr_caps++;
2555out_free: 2558out_free:
2556 kfree(path); 2559 kfree(path);
2557out_dput: 2560out_dput:
@@ -2579,6 +2582,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2579 struct rb_node *p; 2582 struct rb_node *p;
2580 int mds = session->s_mds; 2583 int mds = session->s_mds;
2581 int err = -ENOMEM; 2584 int err = -ENOMEM;
2585 int s_nr_caps;
2582 struct ceph_pagelist *pagelist; 2586 struct ceph_pagelist *pagelist;
2583 struct ceph_reconnect_state recon_state; 2587 struct ceph_reconnect_state recon_state;
2584 2588
@@ -2610,20 +2614,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2610 dout("session %p state %s\n", session, 2614 dout("session %p state %s\n", session,
2611 session_state_name(session->s_state)); 2615 session_state_name(session->s_state));
2612 2616
2617 spin_lock(&session->s_gen_ttl_lock);
2618 session->s_cap_gen++;
2619 spin_unlock(&session->s_gen_ttl_lock);
2620
2621 spin_lock(&session->s_cap_lock);
2622 /*
2623 * notify __ceph_remove_cap() that we are composing cap reconnect.
2624 * If a cap get released before being added to the cap reconnect,
2625 * __ceph_remove_cap() should skip queuing cap release.
2626 */
2627 session->s_cap_reconnect = 1;
2613 /* drop old cap expires; we're about to reestablish that state */ 2628 /* drop old cap expires; we're about to reestablish that state */
2614 discard_cap_releases(mdsc, session); 2629 discard_cap_releases(mdsc, session);
2630 spin_unlock(&session->s_cap_lock);
2615 2631
2616 /* traverse this session's caps */ 2632 /* traverse this session's caps */
2617 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2633 s_nr_caps = session->s_nr_caps;
2634 err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
2618 if (err) 2635 if (err)
2619 goto fail; 2636 goto fail;
2620 2637
2638 recon_state.nr_caps = 0;
2621 recon_state.pagelist = pagelist; 2639 recon_state.pagelist = pagelist;
2622 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK; 2640 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2623 err = iterate_session_caps(session, encode_caps_cb, &recon_state); 2641 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2624 if (err < 0) 2642 if (err < 0)
2625 goto fail; 2643 goto fail;
2626 2644
2645 spin_lock(&session->s_cap_lock);
2646 session->s_cap_reconnect = 0;
2647 spin_unlock(&session->s_cap_lock);
2648
2627 /* 2649 /*
2628 * snaprealms. we provide mds with the ino, seq (version), and 2650 * snaprealms. we provide mds with the ino, seq (version), and
2629 * parent for all of our realms. If the mds has any newer info, 2651 * parent for all of our realms. If the mds has any newer info,
@@ -2646,11 +2668,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2646 2668
2647 if (recon_state.flock) 2669 if (recon_state.flock)
2648 reply->hdr.version = cpu_to_le16(2); 2670 reply->hdr.version = cpu_to_le16(2);
2649 if (pagelist->length) { 2671
2650 /* set up outbound data if we have any */ 2672 /* raced with cap release? */
2651 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2673 if (s_nr_caps != recon_state.nr_caps) {
2652 ceph_msg_data_add_pagelist(reply, pagelist); 2674 struct page *page = list_first_entry(&pagelist->head,
2675 struct page, lru);
2676 __le32 *addr = kmap_atomic(page);
2677 *addr = cpu_to_le32(recon_state.nr_caps);
2678 kunmap_atomic(addr);
2653 } 2679 }
2680
2681 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2682 ceph_msg_data_add_pagelist(reply, pagelist);
2654 ceph_con_send(&session->s_con, reply); 2683 ceph_con_send(&session->s_con, reply);
2655 2684
2656 mutex_unlock(&session->s_mutex); 2685 mutex_unlock(&session->s_mutex);
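
In the send_mds_reconnect() hunk above, the cap count is encoded into the pagelist before the caps are iterated, so a cap released while the message is being built leaves the count stale; the fix tallies the real number in recon_state.nr_caps and, on a mismatch, kmaps the first page of the pagelist and overwrites the leading __le32 in place. The same "reserve a placeholder length, then backpatch it" pattern in plain C (buffer layout and record format below are invented for illustration; the kernel additionally stores the value little-endian):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	unsigned char msg[256];
	size_t off = 0;
	uint32_t expected = 3, actual = 0;

	/* Reserve space for the count up front, as the reconnect path does. */
	memcpy(msg + off, &expected, sizeof(expected));
	off += sizeof(expected);

	/* Append records; pretend one entry raced away and was skipped. */
	for (uint32_t i = 0; i < expected - 1; i++) {
		memcpy(msg + off, &i, sizeof(i));
		off += sizeof(i);
		actual++;
	}

	/* Backpatch the header if the real count differs from the estimate. */
	if (actual != expected)
		memcpy(msg, &actual, sizeof(actual));

	memcpy(&expected, msg, sizeof(expected));
	printf("count in message: %u, payload bytes: %zu\n",
	       expected, off - sizeof(expected));
	return 0;
}
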
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c2a19fbbe517..4c053d099ae4 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -132,6 +132,7 @@ struct ceph_mds_session {
132 struct list_head s_caps; /* all caps issued by this session */ 132 struct list_head s_caps; /* all caps issued by this session */
133 int s_nr_caps, s_trim_caps; 133 int s_nr_caps, s_trim_caps;
134 int s_num_cap_releases; 134 int s_num_cap_releases;
135 int s_cap_reconnect;
135 struct list_head s_cap_releases; /* waiting cap_release messages */ 136 struct list_head s_cap_releases; /* waiting cap_release messages */
136 struct list_head s_cap_releases_done; /* ready to send */ 137 struct list_head s_cap_releases_done; /* ready to send */
137 struct ceph_cap *s_cap_iterator; 138 struct ceph_cap *s_cap_iterator;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6014b0a3c405..ef4ac38bb614 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -741,13 +741,7 @@ extern int ceph_add_cap(struct inode *inode,
741 int fmode, unsigned issued, unsigned wanted, 741 int fmode, unsigned issued, unsigned wanted,
742 unsigned cap, unsigned seq, u64 realmino, int flags, 742 unsigned cap, unsigned seq, u64 realmino, int flags,
743 struct ceph_cap_reservation *caps_reservation); 743 struct ceph_cap_reservation *caps_reservation);
744extern void __ceph_remove_cap(struct ceph_cap *cap); 744extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
745static inline void ceph_remove_cap(struct ceph_cap *cap)
746{
747 spin_lock(&cap->ci->i_ceph_lock);
748 __ceph_remove_cap(cap);
749 spin_unlock(&cap->ci->i_ceph_lock);
750}
751extern void ceph_put_cap(struct ceph_mds_client *mdsc, 745extern void ceph_put_cap(struct ceph_mds_client *mdsc,
752 struct ceph_cap *cap); 746 struct ceph_cap *cap);
753 747
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d9ea7ada1378..f918a998a087 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -384,6 +384,7 @@ struct smb_version_operations {
384 int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, 384 int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
385 struct cifsFileInfo *target_file, u64 src_off, u64 len, 385 struct cifsFileInfo *target_file, u64 src_off, u64 len,
386 u64 dest_off); 386 u64 dest_off);
387 int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
387}; 388};
388 389
389struct smb_version_values { 390struct smb_version_values {
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 409b45eefe70..77492301cc2b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -26,13 +26,15 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/pagemap.h> 28#include <linux/pagemap.h>
29#include <linux/btrfs.h>
30#include "cifspdu.h" 29#include "cifspdu.h"
31#include "cifsglob.h" 30#include "cifsglob.h"
32#include "cifsproto.h" 31#include "cifsproto.h"
33#include "cifs_debug.h" 32#include "cifs_debug.h"
34#include "cifsfs.h" 33#include "cifsfs.h"
35 34
35#define CIFS_IOCTL_MAGIC 0xCF
36#define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int)
37
36static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, 38static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
37 unsigned long srcfd, u64 off, u64 len, u64 destoff) 39 unsigned long srcfd, u64 off, u64 len, u64 destoff)
38{ 40{
@@ -213,7 +215,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
213 cifs_dbg(FYI, "set compress flag rc %d\n", rc); 215 cifs_dbg(FYI, "set compress flag rc %d\n", rc);
214 } 216 }
215 break; 217 break;
216 case BTRFS_IOC_CLONE: 218 case CIFS_IOC_COPYCHUNK_FILE:
217 rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); 219 rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
218 break; 220 break;
219 default: 221 default:
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 11dde4b24f8a..757da3e54d3d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -532,7 +532,10 @@ smb2_clone_range(const unsigned int xid,
532 int rc; 532 int rc;
533 unsigned int ret_data_len; 533 unsigned int ret_data_len;
534 struct copychunk_ioctl *pcchunk; 534 struct copychunk_ioctl *pcchunk;
535 char *retbuf = NULL; 535 struct copychunk_ioctl_rsp *retbuf = NULL;
536 struct cifs_tcon *tcon;
537 int chunks_copied = 0;
538 bool chunk_sizes_updated = false;
536 539
537 pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); 540 pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
538 541
@@ -547,27 +550,96 @@ smb2_clone_range(const unsigned int xid,
547 550
548 /* Note: request_res_key sets res_key null only if rc !=0 */ 551 /* Note: request_res_key sets res_key null only if rc !=0 */
549 if (rc) 552 if (rc)
550 return rc; 553 goto cchunk_out;
551 554
552 /* For now array only one chunk long, will make more flexible later */ 555 /* For now array only one chunk long, will make more flexible later */
553 pcchunk->ChunkCount = __constant_cpu_to_le32(1); 556 pcchunk->ChunkCount = __constant_cpu_to_le32(1);
554 pcchunk->Reserved = 0; 557 pcchunk->Reserved = 0;
555 pcchunk->SourceOffset = cpu_to_le64(src_off);
556 pcchunk->TargetOffset = cpu_to_le64(dest_off);
557 pcchunk->Length = cpu_to_le32(len);
558 pcchunk->Reserved2 = 0; 558 pcchunk->Reserved2 = 0;
559 559
560 /* Request that server copy to target from src file identified by key */ 560 tcon = tlink_tcon(trgtfile->tlink);
561 rc = SMB2_ioctl(xid, tlink_tcon(trgtfile->tlink),
562 trgtfile->fid.persistent_fid,
563 trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
564 true /* is_fsctl */, (char *)pcchunk,
565 sizeof(struct copychunk_ioctl), &retbuf, &ret_data_len);
566 561
567 /* BB need to special case rc = EINVAL to alter chunk size */ 562 while (len > 0) {
563 pcchunk->SourceOffset = cpu_to_le64(src_off);
564 pcchunk->TargetOffset = cpu_to_le64(dest_off);
565 pcchunk->Length =
566 cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
568 567
569 cifs_dbg(FYI, "rc %d data length out %d\n", rc, ret_data_len); 568 /* Request server copy to target from src identified by key */
569 rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
570 trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
571 true /* is_fsctl */, (char *)pcchunk,
572 sizeof(struct copychunk_ioctl), (char **)&retbuf,
573 &ret_data_len);
574 if (rc == 0) {
575 if (ret_data_len !=
576 sizeof(struct copychunk_ioctl_rsp)) {
577 cifs_dbg(VFS, "invalid cchunk response size\n");
578 rc = -EIO;
579 goto cchunk_out;
580 }
581 if (retbuf->TotalBytesWritten == 0) {
582 cifs_dbg(FYI, "no bytes copied\n");
583 rc = -EIO;
584 goto cchunk_out;
585 }
586 /*
587 * Check if server claimed to write more than we asked
588 */
589 if (le32_to_cpu(retbuf->TotalBytesWritten) >
590 le32_to_cpu(pcchunk->Length)) {
591 cifs_dbg(VFS, "invalid copy chunk response\n");
592 rc = -EIO;
593 goto cchunk_out;
594 }
595 if (le32_to_cpu(retbuf->ChunksWritten) != 1) {
596 cifs_dbg(VFS, "invalid num chunks written\n");
597 rc = -EIO;
598 goto cchunk_out;
599 }
600 chunks_copied++;
601
602 src_off += le32_to_cpu(retbuf->TotalBytesWritten);
603 dest_off += le32_to_cpu(retbuf->TotalBytesWritten);
604 len -= le32_to_cpu(retbuf->TotalBytesWritten);
605
606 cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n",
607 le32_to_cpu(retbuf->ChunksWritten),
608 le32_to_cpu(retbuf->ChunkBytesWritten),
609 le32_to_cpu(retbuf->TotalBytesWritten));
610 } else if (rc == -EINVAL) {
611 if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
612 goto cchunk_out;
613
614 cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n",
615 le32_to_cpu(retbuf->ChunksWritten),
616 le32_to_cpu(retbuf->ChunkBytesWritten),
617 le32_to_cpu(retbuf->TotalBytesWritten));
618
619 /*
620 * Check if this is the first request using these sizes,
621 * (ie check if copy succeed once with original sizes
622 * and check if the server gave us different sizes after
623 * we already updated max sizes on previous request).
624 * if not then why is the server returning an error now
625 */
626 if ((chunks_copied != 0) || chunk_sizes_updated)
627 goto cchunk_out;
628
629 /* Check that server is not asking us to grow size */
630 if (le32_to_cpu(retbuf->ChunkBytesWritten) <
631 tcon->max_bytes_chunk)
632 tcon->max_bytes_chunk =
633 le32_to_cpu(retbuf->ChunkBytesWritten);
634 else
635 goto cchunk_out; /* server gave us bogus size */
636
637 /* No need to change MaxChunks since already set to 1 */
638 chunk_sizes_updated = true;
639 }
640 }
570 641
642cchunk_out:
571 kfree(pcchunk); 643 kfree(pcchunk);
572 return rc; 644 return rc;
573} 645}
@@ -1247,6 +1319,7 @@ struct smb_version_operations smb30_operations = {
1247 .create_lease_buf = smb3_create_lease_buf, 1319 .create_lease_buf = smb3_create_lease_buf,
1248 .parse_lease_buf = smb3_parse_lease_buf, 1320 .parse_lease_buf = smb3_parse_lease_buf,
1249 .clone_range = smb2_clone_range, 1321 .clone_range = smb2_clone_range,
1322 .validate_negotiate = smb3_validate_negotiate,
1250}; 1323};
1251 1324
1252struct smb_version_values smb20_values = { 1325struct smb_version_values smb20_values = {
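
The rewritten smb2_clone_range() above turns the single COPYCHUNK request into a loop: each pass asks the server to copy min(len, tcon->max_bytes_chunk) bytes, advances the offsets by the TotalBytesWritten it reports, and on -EINVAL adopts the smaller chunk size the server advertises (only once, and only downwards) before retrying. The sketch below is a userspace stand-in for that loop structure; do_copy_chunk() is a hypothetical helper standing in for the SMB2_ioctl round trip and the limits are made up:

#include <stdio.h>
#include <stdint.h>

static uint32_t server_limit = 64 * 1024;	/* what the "server" will accept */

static int do_copy_chunk(uint64_t src, uint64_t dst, uint32_t want,
			 uint32_t *written, uint32_t *srv_max)
{
	(void)src; (void)dst;
	if (want > server_limit) {		/* mimics the -EINVAL path */
		*srv_max = server_limit;
		return -1;
	}
	*written = want;			/* "copy" succeeded */
	return 0;
}

int main(void)
{
	uint64_t src_off = 0, dest_off = 0, len = 1024 * 1024;
	uint32_t max_bytes_chunk = 1024 * 1024;	/* client's initial guess */
	int sizes_updated = 0;

	while (len > 0) {
		uint32_t want = len < max_bytes_chunk ? (uint32_t)len : max_bytes_chunk;
		uint32_t written = 0, srv_max = 0;

		if (do_copy_chunk(src_off, dest_off, want, &written, &srv_max) == 0) {
			src_off  += written;
			dest_off += written;
			len      -= written;
		} else {
			/* Shrink at most once, and never accept a larger size. */
			if (sizes_updated || srv_max >= max_bytes_chunk)
				return 1;
			max_bytes_chunk = srv_max;
			sizes_updated = 1;
		}
	}
	printf("copied %llu bytes\n", (unsigned long long)dest_off);
	return 0;
}
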
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index d65270c290a1..2013234b73ad 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -454,6 +454,81 @@ neg_exit:
454 return rc; 454 return rc;
455} 455}
456 456
457int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
458{
459 int rc = 0;
460 struct validate_negotiate_info_req vneg_inbuf;
461 struct validate_negotiate_info_rsp *pneg_rsp;
462 u32 rsplen;
463
464 cifs_dbg(FYI, "validate negotiate\n");
465
466 /*
467 * validation ioctl must be signed, so no point sending this if we
468 * can not sign it. We could eventually change this to selectively
469 * sign just this, the first and only signed request on a connection.
470 * This is good enough for now since a user who wants better security
471 * would also enable signing on the mount. Having validation of
472 * negotiate info for signed connections helps reduce attack vectors
473 */
474 if (tcon->ses->server->sign == false)
475 return 0; /* validation requires signing */
476
477 vneg_inbuf.Capabilities =
478 cpu_to_le32(tcon->ses->server->vals->req_capabilities);
479 memcpy(vneg_inbuf.Guid, cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
480
481 if (tcon->ses->sign)
482 vneg_inbuf.SecurityMode =
483 cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
484 else if (global_secflags & CIFSSEC_MAY_SIGN)
485 vneg_inbuf.SecurityMode =
486 cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
487 else
488 vneg_inbuf.SecurityMode = 0;
489
490 vneg_inbuf.DialectCount = cpu_to_le16(1);
491 vneg_inbuf.Dialects[0] =
492 cpu_to_le16(tcon->ses->server->vals->protocol_id);
493
494 rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
495 FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
496 (char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
497 (char **)&pneg_rsp, &rsplen);
498
499 if (rc != 0) {
500 cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
501 return -EIO;
502 }
503
504 if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
505 cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
506 return -EIO;
507 }
508
509 /* check validate negotiate info response matches what we got earlier */
510 if (pneg_rsp->Dialect !=
511 cpu_to_le16(tcon->ses->server->vals->protocol_id))
512 goto vneg_out;
513
514 if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
515 goto vneg_out;
516
517 /* do not validate server guid because not saved at negprot time yet */
518
519 if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND |
520 SMB2_LARGE_FILES) != tcon->ses->server->capabilities)
521 goto vneg_out;
522
523 /* validate negotiate successful */
524 cifs_dbg(FYI, "validate negotiate info successful\n");
525 return 0;
526
527vneg_out:
528 cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
529 return -EIO;
530}
531
457int 532int
458SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, 533SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
459 const struct nls_table *nls_cp) 534 const struct nls_table *nls_cp)
@@ -829,6 +904,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
829 ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0)) 904 ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
830 cifs_dbg(VFS, "DFS capability contradicts DFS flag\n"); 905 cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
831 init_copy_chunk_defaults(tcon); 906 init_copy_chunk_defaults(tcon);
907 if (tcon->ses->server->ops->validate_negotiate)
908 rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
832tcon_exit: 909tcon_exit:
833 free_rsp_buf(resp_buftype, rsp); 910 free_rsp_buf(resp_buftype, rsp);
834 kfree(unc_path); 911 kfree(unc_path);
@@ -1214,10 +1291,17 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
1214 rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0); 1291 rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
1215 rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base; 1292 rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
1216 1293
1217 if (rc != 0) { 1294 if ((rc != 0) && (rc != -EINVAL)) {
1218 if (tcon) 1295 if (tcon)
1219 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE); 1296 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
1220 goto ioctl_exit; 1297 goto ioctl_exit;
1298 } else if (rc == -EINVAL) {
1299 if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
1300 (opcode != FSCTL_SRV_COPYCHUNK)) {
1301 if (tcon)
1302 cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
1303 goto ioctl_exit;
1304 }
1221 } 1305 }
1222 1306
1223 /* check if caller wants to look at return data or just return rc */ 1307 /* check if caller wants to look at return data or just return rc */
@@ -2154,11 +2238,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
2154 rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0); 2238 rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
2155 rsp = (struct smb2_set_info_rsp *)iov[0].iov_base; 2239 rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
2156 2240
2157 if (rc != 0) { 2241 if (rc != 0)
2158 cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE); 2242 cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
2159 goto out; 2243
2160 }
2161out:
2162 free_rsp_buf(resp_buftype, rsp); 2244 free_rsp_buf(resp_buftype, rsp);
2163 kfree(iov); 2245 kfree(iov);
2164 return rc; 2246 return rc;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f88320bbb477..2022c542ea3a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -577,13 +577,19 @@ struct copychunk_ioctl_rsp {
577 __le32 TotalBytesWritten; 577 __le32 TotalBytesWritten;
578} __packed; 578} __packed;
579 579
580/* Response and Request are the same format */ 580struct validate_negotiate_info_req {
581struct validate_negotiate_info {
582 __le32 Capabilities; 581 __le32 Capabilities;
583 __u8 Guid[SMB2_CLIENT_GUID_SIZE]; 582 __u8 Guid[SMB2_CLIENT_GUID_SIZE];
584 __le16 SecurityMode; 583 __le16 SecurityMode;
585 __le16 DialectCount; 584 __le16 DialectCount;
586 __le16 Dialect[1]; 585 __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
586} __packed;
587
588struct validate_negotiate_info_rsp {
589 __le32 Capabilities;
590 __u8 Guid[SMB2_CLIENT_GUID_SIZE];
591 __le16 SecurityMode;
592 __le16 Dialect; /* Dialect in use for the connection */
587} __packed; 593} __packed;
588 594
589#define RSS_CAPABLE 0x00000001 595#define RSS_CAPABLE 0x00000001
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index b4eea105b08c..93adc64666f3 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -162,5 +162,6 @@ extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
162 struct smb2_lock_element *buf); 162 struct smb2_lock_element *buf);
163extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, 163extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
164 __u8 *lease_key, const __le32 lease_state); 164 __u8 *lease_key, const __le32 lease_state);
165extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
165 166
166#endif /* _SMB2PROTO_H */ 167#endif /* _SMB2PROTO_H */
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a4b2391fe66e..0e538b5c9622 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
90#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */ 90#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
91#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ 91#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
92#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ 92#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
93#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */ 93#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
94/* Perform server-side data movement */ 94/* Perform server-side data movement */
95#define FSCTL_SRV_COPYCHUNK 0x001440F2 95#define FSCTL_SRV_COPYCHUNK 0x001440F2
96#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2 96#define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 277bd1be21fd..e081acbac2e7 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -56,29 +56,28 @@ static void configfs_d_iput(struct dentry * dentry,
56 struct configfs_dirent *sd = dentry->d_fsdata; 56 struct configfs_dirent *sd = dentry->d_fsdata;
57 57
58 if (sd) { 58 if (sd) {
59 BUG_ON(sd->s_dentry != dentry);
60 /* Coordinate with configfs_readdir */ 59 /* Coordinate with configfs_readdir */
61 spin_lock(&configfs_dirent_lock); 60 spin_lock(&configfs_dirent_lock);
62 sd->s_dentry = NULL; 61 /* Coordinate with configfs_attach_attr where will increase
62 * sd->s_count and update sd->s_dentry to new allocated one.
63 * Only set sd->dentry to null when this dentry is the only
64 * sd owner.
65 * If not do so, configfs_d_iput may run just after
66 * configfs_attach_attr and set sd->s_dentry to null
67 * even it's still in use.
68 */
69 if (atomic_read(&sd->s_count) <= 2)
70 sd->s_dentry = NULL;
71
63 spin_unlock(&configfs_dirent_lock); 72 spin_unlock(&configfs_dirent_lock);
64 configfs_put(sd); 73 configfs_put(sd);
65 } 74 }
66 iput(inode); 75 iput(inode);
67} 76}
68 77
69/*
70 * We _must_ delete our dentries on last dput, as the chain-to-parent
71 * behavior is required to clear the parents of default_groups.
72 */
73static int configfs_d_delete(const struct dentry *dentry)
74{
75 return 1;
76}
77
78const struct dentry_operations configfs_dentry_ops = { 78const struct dentry_operations configfs_dentry_ops = {
79 .d_iput = configfs_d_iput, 79 .d_iput = configfs_d_iput,
80 /* simple_delete_dentry() isn't exported */ 80 .d_delete = always_delete_dentry,
81 .d_delete = configfs_d_delete,
82}; 81};
83 82
84#ifdef CONFIG_LOCKDEP 83#ifdef CONFIG_LOCKDEP
@@ -426,8 +425,11 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
426 struct configfs_attribute * attr = sd->s_element; 425 struct configfs_attribute * attr = sd->s_element;
427 int error; 426 int error;
428 427
428 spin_lock(&configfs_dirent_lock);
429 dentry->d_fsdata = configfs_get(sd); 429 dentry->d_fsdata = configfs_get(sd);
430 sd->s_dentry = dentry; 430 sd->s_dentry = dentry;
431 spin_unlock(&configfs_dirent_lock);
432
431 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, 433 error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
432 configfs_init_file); 434 configfs_init_file);
433 if (error) { 435 if (error) {
diff --git a/fs/coredump.c b/fs/coredump.c
index 62406b6959b6..bc3fbcd32558 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -695,7 +695,7 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
695 while (nr) { 695 while (nr) {
696 if (dump_interrupted()) 696 if (dump_interrupted())
697 return 0; 697 return 0;
698 n = vfs_write(file, addr, nr, &pos); 698 n = __kernel_write(file, addr, nr, &pos);
699 if (n <= 0) 699 if (n <= 0)
700 return 0; 700 return 0;
701 file->f_pos = pos; 701 file->f_pos = pos;
@@ -733,7 +733,7 @@ int dump_align(struct coredump_params *cprm, int align)
733{ 733{
734 unsigned mod = cprm->written & (align - 1); 734 unsigned mod = cprm->written & (align - 1);
735 if (align & (align - 1)) 735 if (align & (align - 1))
736 return -EINVAL; 736 return 0;
737 return mod ? dump_skip(cprm, align - mod) : 0; 737 return mod ? dump_skip(cprm, align - mod) : 1;
738} 738}
739EXPORT_SYMBOL(dump_align); 739EXPORT_SYMBOL(dump_align);
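
Two things change in the coredump.c hunks: dump_emit() now writes with __kernel_write(), which takes a kernel-space buffer, instead of vfs_write(), and dump_align() switches to the same "nonzero on success, 0 on failure" convention used by dump_emit()/dump_skip() (a non-power-of-two alignment now yields 0 rather than -EINVAL, and an already-aligned position yields 1 rather than 0). The padding math itself assumes align is a power of two; a tiny standalone example of that calculation:

#include <stdio.h>

int main(void)
{
	unsigned long long written = 4100;	/* bytes emitted so far */
	int align = 1024;			/* must be a power of two */

	/* For power-of-two align, (align - 1) is an all-ones mask of the low
	 * bits, so this computes written % align without a division. */
	unsigned mod = written & (align - 1);
	unsigned pad = mod ? align - mod : 0;

	printf("written=%llu mod=%u pad=%u -> next offset %llu\n",
	       written, mod, pad, written + pad);	/* pad=1020 -> 5120 */
	return 0;
}
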
diff --git a/fs/dcache.c b/fs/dcache.c
index 0a38ef8d7f00..6055d61811d3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -88,35 +88,6 @@ EXPORT_SYMBOL(rename_lock);
88 88
89static struct kmem_cache *dentry_cache __read_mostly; 89static struct kmem_cache *dentry_cache __read_mostly;
90 90
91/**
92 * read_seqbegin_or_lock - begin a sequence number check or locking block
93 * @lock: sequence lock
94 * @seq : sequence number to be checked
95 *
96 * First try it once optimistically without taking the lock. If that fails,
97 * take the lock. The sequence number is also used as a marker for deciding
98 * whether to be a reader (even) or writer (odd).
99 * N.B. seq must be initialized to an even number to begin with.
100 */
101static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
102{
103 if (!(*seq & 1)) /* Even */
104 *seq = read_seqbegin(lock);
105 else /* Odd */
106 read_seqlock_excl(lock);
107}
108
109static inline int need_seqretry(seqlock_t *lock, int seq)
110{
111 return !(seq & 1) && read_seqretry(lock, seq);
112}
113
114static inline void done_seqretry(seqlock_t *lock, int seq)
115{
116 if (seq & 1)
117 read_sequnlock_excl(lock);
118}
119
120/* 91/*
121 * This is the single most critical data structure when it comes 92 * This is the single most critical data structure when it comes
122 * to the dcache: the hashtable for lookups. Somebody should try 93 * to the dcache: the hashtable for lookups. Somebody should try
@@ -125,8 +96,6 @@ static inline void done_seqretry(seqlock_t *lock, int seq)
125 * This hash-function tries to avoid losing too many bits of hash 96 * This hash-function tries to avoid losing too many bits of hash
126 * information, yet avoid using a prime hash-size or similar. 97 * information, yet avoid using a prime hash-size or similar.
127 */ 98 */
128#define D_HASHBITS d_hash_shift
129#define D_HASHMASK d_hash_mask
130 99
131static unsigned int d_hash_mask __read_mostly; 100static unsigned int d_hash_mask __read_mostly;
132static unsigned int d_hash_shift __read_mostly; 101static unsigned int d_hash_shift __read_mostly;
@@ -137,8 +106,8 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
137 unsigned int hash) 106 unsigned int hash)
138{ 107{
139 hash += (unsigned long) parent / L1_CACHE_BYTES; 108 hash += (unsigned long) parent / L1_CACHE_BYTES;
140 hash = hash + (hash >> D_HASHBITS); 109 hash = hash + (hash >> d_hash_shift);
141 return dentry_hashtable + (hash & D_HASHMASK); 110 return dentry_hashtable + (hash & d_hash_mask);
142} 111}
143 112
144/* Statistics gathering. */ 113/* Statistics gathering. */
@@ -223,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
223 if (!tcount) 192 if (!tcount)
224 return 0; 193 return 0;
225 } 194 }
226 mask = ~(~0ul << tcount*8); 195 mask = bytemask_from_count(tcount);
227 return unlikely(!!((a ^ b) & mask)); 196 return unlikely(!!((a ^ b) & mask));
228} 197}
229 198
@@ -469,7 +438,7 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
469{ 438{
470 list_del(&dentry->d_u.d_child); 439 list_del(&dentry->d_u.d_child);
471 /* 440 /*
472 * Inform try_to_ascend() that we are no longer attached to the 441 * Inform d_walk() that we are no longer attached to the
473 * dentry tree 442 * dentry tree
474 */ 443 */
475 dentry->d_flags |= DCACHE_DENTRY_KILLED; 444 dentry->d_flags |= DCACHE_DENTRY_KILLED;
@@ -1069,34 +1038,6 @@ void shrink_dcache_sb(struct super_block *sb)
1069} 1038}
1070EXPORT_SYMBOL(shrink_dcache_sb); 1039EXPORT_SYMBOL(shrink_dcache_sb);
1071 1040
1072/*
1073 * This tries to ascend one level of parenthood, but
1074 * we can race with renaming, so we need to re-check
1075 * the parenthood after dropping the lock and check
1076 * that the sequence number still matches.
1077 */
1078static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
1079{
1080 struct dentry *new = old->d_parent;
1081
1082 rcu_read_lock();
1083 spin_unlock(&old->d_lock);
1084 spin_lock(&new->d_lock);
1085
1086 /*
1087 * might go back up the wrong parent if we have had a rename
1088 * or deletion
1089 */
1090 if (new != old->d_parent ||
1091 (old->d_flags & DCACHE_DENTRY_KILLED) ||
1092 need_seqretry(&rename_lock, seq)) {
1093 spin_unlock(&new->d_lock);
1094 new = NULL;
1095 }
1096 rcu_read_unlock();
1097 return new;
1098}
1099
1100/** 1041/**
1101 * enum d_walk_ret - action to talke during tree walk 1042 * enum d_walk_ret - action to talke during tree walk
1102 * @D_WALK_CONTINUE: contrinue walk 1043 * @D_WALK_CONTINUE: contrinue walk
@@ -1185,9 +1126,24 @@ resume:
1185 */ 1126 */
1186 if (this_parent != parent) { 1127 if (this_parent != parent) {
1187 struct dentry *child = this_parent; 1128 struct dentry *child = this_parent;
1188 this_parent = try_to_ascend(this_parent, seq); 1129 this_parent = child->d_parent;
1189 if (!this_parent) 1130
1131 rcu_read_lock();
1132 spin_unlock(&child->d_lock);
1133 spin_lock(&this_parent->d_lock);
1134
1135 /*
1136 * might go back up the wrong parent if we have had a rename
1137 * or deletion
1138 */
1139 if (this_parent != child->d_parent ||
1140 (child->d_flags & DCACHE_DENTRY_KILLED) ||
1141 need_seqretry(&rename_lock, seq)) {
1142 spin_unlock(&this_parent->d_lock);
1143 rcu_read_unlock();
1190 goto rename_retry; 1144 goto rename_retry;
1145 }
1146 rcu_read_unlock();
1191 next = child->d_u.d_child.next; 1147 next = child->d_u.d_child.next;
1192 goto resume; 1148 goto resume;
1193 } 1149 }
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2229a74aeeed..b1eaa7a1f82c 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -313,11 +313,9 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
313static long 313static long
314ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 314ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
315{ 315{
316 struct file *lower_file = NULL; 316 struct file *lower_file = ecryptfs_file_to_lower(file);
317 long rc = -ENOTTY; 317 long rc = -ENOTTY;
318 318
319 if (ecryptfs_file_to_private(file))
320 lower_file = ecryptfs_file_to_lower(file);
321 if (lower_file->f_op->unlocked_ioctl) 319 if (lower_file->f_op->unlocked_ioctl)
322 rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg); 320 rc = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
323 return rc; 321 return rc;
@@ -327,11 +325,9 @@ ecryptfs_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
327static long 325static long
328ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 326ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
329{ 327{
330 struct file *lower_file = NULL; 328 struct file *lower_file = ecryptfs_file_to_lower(file);
331 long rc = -ENOIOCTLCMD; 329 long rc = -ENOIOCTLCMD;
332 330
333 if (ecryptfs_file_to_private(file))
334 lower_file = ecryptfs_file_to_lower(file);
335 if (lower_file->f_op && lower_file->f_op->compat_ioctl) 331 if (lower_file->f_op && lower_file->f_op->compat_ioctl)
336 rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); 332 rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
337 return rc; 333 return rc;
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index a8766b880c07..becc725a1953 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -83,19 +83,10 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
83 return 0; 83 return 0;
84} 84}
85 85
86/*
87 * Retaining negative dentries for an in-memory filesystem just wastes
88 * memory and lookup time: arrange for them to be deleted immediately.
89 */
90static int efivarfs_delete_dentry(const struct dentry *dentry)
91{
92 return 1;
93}
94
95static struct dentry_operations efivarfs_d_ops = { 86static struct dentry_operations efivarfs_d_ops = {
96 .d_compare = efivarfs_d_compare, 87 .d_compare = efivarfs_d_compare,
97 .d_hash = efivarfs_d_hash, 88 .d_hash = efivarfs_d_hash,
98 .d_delete = efivarfs_delete_dentry, 89 .d_delete = always_delete_dentry,
99}; 90};
100 91
101static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name) 92static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 79b65c3b9e87..8b5e2584c840 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,8 +1852,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1852 goto error_tgt_fput; 1852 goto error_tgt_fput;
1853 1853
1854 /* Check if EPOLLWAKEUP is allowed */ 1854 /* Check if EPOLLWAKEUP is allowed */
1855 if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) 1855 ep_take_care_of_epollwakeup(&epds);
1856 epds.events &= ~EPOLLWAKEUP;
1857 1856
1858 /* 1857 /*
1859 * We have to check that the file structure underneath the file descriptor 1858 * We have to check that the file structure underneath the file descriptor
diff --git a/fs/exec.c b/fs/exec.c
index 977319fd77f3..7ea097f6b341 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1380,10 +1380,6 @@ int search_binary_handler(struct linux_binprm *bprm)
1380 if (retval) 1380 if (retval)
1381 return retval; 1381 return retval;
1382 1382
1383 retval = audit_bprm(bprm);
1384 if (retval)
1385 return retval;
1386
1387 retval = -ENOENT; 1383 retval = -ENOENT;
1388 retry: 1384 retry:
1389 read_lock(&binfmt_lock); 1385 read_lock(&binfmt_lock);
@@ -1431,6 +1427,7 @@ static int exec_binprm(struct linux_binprm *bprm)
1431 1427
1432 ret = search_binary_handler(bprm); 1428 ret = search_binary_handler(bprm);
1433 if (ret >= 0) { 1429 if (ret >= 0) {
1430 audit_bprm(bprm);
1434 trace_sched_process_exec(current, old_pid, bprm); 1431 trace_sched_process_exec(current, old_pid, bprm);
1435 ptrace_event(PTRACE_EVENT_EXEC, old_vpid); 1432 ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
1436 current->did_exec = 1; 1433 current->did_exec = 1;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e66a8009aff1..c8420f7e4db6 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1899,7 +1899,8 @@ static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi)
1899 gi->nhash = 0; 1899 gi->nhash = 0;
1900 } 1900 }
1901 /* Skip entries for other sb and dead entries */ 1901 /* Skip entries for other sb and dead entries */
1902 } while (gi->sdp != gi->gl->gl_sbd || __lockref_is_dead(&gl->gl_lockref)); 1902 } while (gi->sdp != gi->gl->gl_sbd ||
1903 __lockref_is_dead(&gi->gl->gl_lockref));
1903 1904
1904 return 0; 1905 return 0;
1905} 1906}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 1615df16cf4e..7119504159f1 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1171,8 +1171,11 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
1171 if (d != NULL) 1171 if (d != NULL)
1172 dentry = d; 1172 dentry = d;
1173 if (dentry->d_inode) { 1173 if (dentry->d_inode) {
1174 if (!(*opened & FILE_OPENED)) 1174 if (!(*opened & FILE_OPENED)) {
1175 if (d == NULL)
1176 dget(dentry);
1175 return finish_no_open(file, dentry); 1177 return finish_no_open(file, dentry);
1178 }
1176 dput(d); 1179 dput(d);
1177 return 0; 1180 return 0;
1178 } 1181 }
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index c8423d6de6c3..2a6ba06bee6f 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -466,19 +466,19 @@ static void gdlm_cancel(struct gfs2_glock *gl)
466static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen, 466static void control_lvb_read(struct lm_lockstruct *ls, uint32_t *lvb_gen,
467 char *lvb_bits) 467 char *lvb_bits)
468{ 468{
469 uint32_t gen; 469 __le32 gen;
470 memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE); 470 memcpy(lvb_bits, ls->ls_control_lvb, GDLM_LVB_SIZE);
471 memcpy(&gen, lvb_bits, sizeof(uint32_t)); 471 memcpy(&gen, lvb_bits, sizeof(__le32));
472 *lvb_gen = le32_to_cpu(gen); 472 *lvb_gen = le32_to_cpu(gen);
473} 473}
474 474
475static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen, 475static void control_lvb_write(struct lm_lockstruct *ls, uint32_t lvb_gen,
476 char *lvb_bits) 476 char *lvb_bits)
477{ 477{
478 uint32_t gen; 478 __le32 gen;
479 memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE); 479 memcpy(ls->ls_control_lvb, lvb_bits, GDLM_LVB_SIZE);
480 gen = cpu_to_le32(lvb_gen); 480 gen = cpu_to_le32(lvb_gen);
481 memcpy(ls->ls_control_lvb, &gen, sizeof(uint32_t)); 481 memcpy(ls->ls_control_lvb, &gen, sizeof(__le32));
482} 482}
483 483
484static int all_jid_bits_clear(char *lvb) 484static int all_jid_bits_clear(char *lvb)
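The gen variable always held a little-endian value; retyping it from uint32_t to __le32 does not change the generated code, it only lets sparse's endian checking confirm that cpu_to_le32()/le32_to_cpu() are applied in the right direction. A small illustration with hypothetical variables:

	__le32 gen;
	u32 host_gen = 42;

	gen = cpu_to_le32(host_gen);	/* ok: host order -> LVB (little-endian) order */
	host_gen = le32_to_cpu(gen);	/* ok: back to host order */
	host_gen = gen;			/* sparse warning: restricted __le32 assigned to u32 */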
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 453b50eaddec..98236d0df3ca 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -667,7 +667,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
667 struct buffer_head *bh; 667 struct buffer_head *bh;
668 struct page *page; 668 struct page *page;
669 void *kaddr, *ptr; 669 void *kaddr, *ptr;
670 struct gfs2_quota q, *qp; 670 struct gfs2_quota q;
671 int err, nbytes; 671 int err, nbytes;
672 u64 size; 672 u64 size;
673 673
@@ -683,28 +683,25 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
683 return err; 683 return err;
684 684
685 err = -EIO; 685 err = -EIO;
686 qp = &q; 686 be64_add_cpu(&q.qu_value, change);
687 qp->qu_value = be64_to_cpu(qp->qu_value); 687 qd->qd_qb.qb_value = q.qu_value;
688 qp->qu_value += change;
689 qp->qu_value = cpu_to_be64(qp->qu_value);
690 qd->qd_qb.qb_value = qp->qu_value;
691 if (fdq) { 688 if (fdq) {
692 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 689 if (fdq->d_fieldmask & FS_DQ_BSOFT) {
693 qp->qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); 690 q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
694 qd->qd_qb.qb_warn = qp->qu_warn; 691 qd->qd_qb.qb_warn = q.qu_warn;
695 } 692 }
696 if (fdq->d_fieldmask & FS_DQ_BHARD) { 693 if (fdq->d_fieldmask & FS_DQ_BHARD) {
697 qp->qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); 694 q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
698 qd->qd_qb.qb_limit = qp->qu_limit; 695 qd->qd_qb.qb_limit = q.qu_limit;
699 } 696 }
700 if (fdq->d_fieldmask & FS_DQ_BCOUNT) { 697 if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
701 qp->qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); 698 q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
702 qd->qd_qb.qb_value = qp->qu_value; 699 qd->qd_qb.qb_value = q.qu_value;
703 } 700 }
704 } 701 }
705 702
706 /* Write the quota into the quota file on disk */ 703 /* Write the quota into the quota file on disk */
707 ptr = qp; 704 ptr = &q;
708 nbytes = sizeof(struct gfs2_quota); 705 nbytes = sizeof(struct gfs2_quota);
709get_a_page: 706get_a_page:
710 page = find_or_create_page(mapping, index, GFP_NOFS); 707 page = find_or_create_page(mapping, index, GFP_NOFS);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 4d83abdd5635..c8d6161bd682 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1127,7 +1127,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
1127 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); 1127 rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
1128 rgd->rd_free_clone = rgd->rd_free; 1128 rgd->rd_free_clone = rgd->rd_free;
1129 } 1129 }
1130 if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { 1130 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
1131 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); 1131 rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
1132 gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, 1132 gfs2_rgrp_ondisk2lvb(rgd->rd_rgl,
1133 rgd->rd_bits[0].bi_bh->b_data); 1133 rgd->rd_bits[0].bi_bh->b_data);
@@ -1161,7 +1161,7 @@ int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
1161 if (rgd->rd_flags & GFS2_RDF_UPTODATE) 1161 if (rgd->rd_flags & GFS2_RDF_UPTODATE)
1162 return 0; 1162 return 0;
1163 1163
1164 if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) 1164 if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
1165 return gfs2_rgrp_bh_get(rgd); 1165 return gfs2_rgrp_bh_get(rgd);
1166 1166
1167 rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); 1167 rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
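Both rgrp.c hunks reverse the direction of the constant conversion. The comparison result is the same either way, since both helpers reduce to the same conditional byte swap, but rl_magic is a big-endian on-disk field, so converting the host constant GFS2_MAGIC with cpu_to_be32() keeps the __be32 types consistent for sparse and lets the swap be folded into the constant at compile time:

	/* old: applies be32_to_cpu() to a host-order constant (sparse complains) */
	if (be32_to_cpu(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
		return gfs2_rgrp_bh_get(rgd);

	/* new: constant folded to on-disk byte order, both sides are __be32 */
	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic)
		return gfs2_rgrp_bh_get(rgd);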
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b51a6079108d..e9a97a0d4314 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,13 +24,6 @@ struct hfsplus_wd {
24 u16 embed_count; 24 u16 embed_count;
25}; 25};
26 26
27static void hfsplus_end_io_sync(struct bio *bio, int err)
28{
29 if (err)
30 clear_bit(BIO_UPTODATE, &bio->bi_flags);
31 complete(bio->bi_private);
32}
33
34/* 27/*
35 * hfsplus_submit_bio - Perform block I/O 28 * hfsplus_submit_bio - Perform block I/O
36 * @sb: super block of volume for I/O 29 * @sb: super block of volume for I/O
@@ -53,7 +46,6 @@ static void hfsplus_end_io_sync(struct bio *bio, int err)
53int hfsplus_submit_bio(struct super_block *sb, sector_t sector, 46int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
54 void *buf, void **data, int rw) 47 void *buf, void **data, int rw)
55{ 48{
56 DECLARE_COMPLETION_ONSTACK(wait);
57 struct bio *bio; 49 struct bio *bio;
58 int ret = 0; 50 int ret = 0;
59 u64 io_size; 51 u64 io_size;
@@ -73,8 +65,6 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
73 bio = bio_alloc(GFP_NOIO, 1); 65 bio = bio_alloc(GFP_NOIO, 1);
74 bio->bi_sector = sector; 66 bio->bi_sector = sector;
75 bio->bi_bdev = sb->s_bdev; 67 bio->bi_bdev = sb->s_bdev;
76 bio->bi_end_io = hfsplus_end_io_sync;
77 bio->bi_private = &wait;
78 68
79 if (!(rw & WRITE) && data) 69 if (!(rw & WRITE) && data)
80 *data = (u8 *)buf + offset; 70 *data = (u8 *)buf + offset;
@@ -93,12 +83,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
93 buf = (u8 *)buf + len; 83 buf = (u8 *)buf + len;
94 } 84 }
95 85
96 submit_bio(rw, bio); 86 ret = submit_bio_wait(rw, bio);
97 wait_for_completion(&wait);
98
99 if (!bio_flagged(bio, BIO_UPTODATE))
100 ret = -EIO;
101
102out: 87out:
103 bio_put(bio); 88 bio_put(bio);
104 return ret < 0 ? ret : 0; 89 return ret < 0 ? ret : 0;
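This hunk (like the logfs/dev_bdev.c hunk further down) is a mechanical conversion to submit_bio_wait(), which performs the submit-and-wait sequence the filesystem used to open-code. Sketched for a generic caller, with my_end_io_sync standing in for the per-filesystem completion callback that is being deleted:

	/* before: private completion plus end_io callback */
	DECLARE_COMPLETION_ONSTACK(wait);
	bio->bi_end_io = my_end_io_sync;	/* completes &wait, clears BIO_UPTODATE on error */
	bio->bi_private = &wait;
	submit_bio(rw, bio);
	wait_for_completion(&wait);
	if (!bio_flagged(bio, BIO_UPTODATE))
		ret = -EIO;

	/* after: the helper waits internally and returns 0 or a negative errno */
	ret = submit_bio_wait(rw, bio);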
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 25437280a207..db23ce1bd903 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -33,15 +33,6 @@ static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
33 33
34#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file)) 34#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
35 35
36static int hostfs_d_delete(const struct dentry *dentry)
37{
38 return 1;
39}
40
41static const struct dentry_operations hostfs_dentry_ops = {
42 .d_delete = hostfs_d_delete,
43};
44
45/* Changed in hostfs_args before the kernel starts running */ 36/* Changed in hostfs_args before the kernel starts running */
46static char *root_ino = ""; 37static char *root_ino = "";
47static int append = 0; 38static int append = 0;
@@ -925,7 +916,7 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
925 sb->s_blocksize_bits = 10; 916 sb->s_blocksize_bits = 10;
926 sb->s_magic = HOSTFS_SUPER_MAGIC; 917 sb->s_magic = HOSTFS_SUPER_MAGIC;
927 sb->s_op = &hostfs_sbops; 918 sb->s_op = &hostfs_sbops;
928 sb->s_d_op = &hostfs_dentry_ops; 919 sb->s_d_op = &simple_dentry_operations;
929 sb->s_maxbytes = MAX_LFS_FILESIZE; 920 sb->s_maxbytes = MAX_LFS_FILESIZE;
930 921
931 /* NULL is printed as <NULL> by sprintf: avoid that. */ 922 /* NULL is printed as <NULL> by sprintf: avoid that. */
diff --git a/fs/libfs.c b/fs/libfs.c
index 5de06947ba5e..a1844244246f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -47,10 +47,16 @@ EXPORT_SYMBOL(simple_statfs);
47 * Retaining negative dentries for an in-memory filesystem just wastes 47 * Retaining negative dentries for an in-memory filesystem just wastes
48 * memory and lookup time: arrange for them to be deleted immediately. 48 * memory and lookup time: arrange for them to be deleted immediately.
49 */ 49 */
50static int simple_delete_dentry(const struct dentry *dentry) 50int always_delete_dentry(const struct dentry *dentry)
51{ 51{
52 return 1; 52 return 1;
53} 53}
54EXPORT_SYMBOL(always_delete_dentry);
55
56const struct dentry_operations simple_dentry_operations = {
57 .d_delete = always_delete_dentry,
58};
59EXPORT_SYMBOL(simple_dentry_operations);
54 60
55/* 61/*
56 * Lookup the data. This is trivial - if the dentry didn't already 62 * Lookup the data. This is trivial - if the dentry didn't already
@@ -58,10 +64,6 @@ static int simple_delete_dentry(const struct dentry *dentry)
58 */ 64 */
59struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 65struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
60{ 66{
61 static const struct dentry_operations simple_dentry_operations = {
62 .d_delete = simple_delete_dentry,
63 };
64
65 if (dentry->d_name.len > NAME_MAX) 67 if (dentry->d_name.len > NAME_MAX)
66 return ERR_PTR(-ENAMETOOLONG); 68 return ERR_PTR(-ENAMETOOLONG);
67 if (!dentry->d_sb->s_d_op) 69 if (!dentry->d_sb->s_d_op)
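With always_delete_dentry() and simple_dentry_operations exported, filesystems that never want to cache dentries no longer each define a trivial ->d_delete; the hostfs, procfs and proc-namespace hunks below all switch to the shared copy. A hypothetical filesystem would simply point its superblock at it:

	/* examplefs_fill_super() is illustrative only; real users follow below. */
	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		/* ... usual superblock setup ... */
		sb->s_d_op = &simple_dentry_operations;	/* ->d_delete == always_delete_dentry */
		return 0;
	}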
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 550475ca6a0e..0f95f0d0b313 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -14,16 +14,10 @@
14 14
15#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) 15#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
16 16
17static void request_complete(struct bio *bio, int err)
18{
19 complete((struct completion *)bio->bi_private);
20}
21
22static int sync_request(struct page *page, struct block_device *bdev, int rw) 17static int sync_request(struct page *page, struct block_device *bdev, int rw)
23{ 18{
24 struct bio bio; 19 struct bio bio;
25 struct bio_vec bio_vec; 20 struct bio_vec bio_vec;
26 struct completion complete;
27 21
28 bio_init(&bio); 22 bio_init(&bio);
29 bio.bi_max_vecs = 1; 23 bio.bi_max_vecs = 1;
@@ -35,13 +29,8 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
35 bio.bi_size = PAGE_SIZE; 29 bio.bi_size = PAGE_SIZE;
36 bio.bi_bdev = bdev; 30 bio.bi_bdev = bdev;
37 bio.bi_sector = page->index * (PAGE_SIZE >> 9); 31 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
38 init_completion(&complete);
39 bio.bi_private = &complete;
40 bio.bi_end_io = request_complete;
41 32
42 submit_bio(rw, &bio); 33 return submit_bio_wait(rw, &bio);
43 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45} 34}
46 35
47static int bdev_readpage(void *_sb, struct page *page) 36static int bdev_readpage(void *_sb, struct page *page)
diff --git a/fs/namei.c b/fs/namei.c
index e029a4cbff7d..3531deebad30 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -513,8 +513,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
513 513
514 if (!lockref_get_not_dead(&parent->d_lockref)) { 514 if (!lockref_get_not_dead(&parent->d_lockref)) {
515 nd->path.dentry = NULL; 515 nd->path.dentry = NULL;
516 rcu_read_unlock(); 516 goto out;
517 return -ECHILD;
518 } 517 }
519 518
520 /* 519 /*
@@ -1599,11 +1598,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
1599 * do a "get_unaligned()" if this helps and is sufficiently 1598 * do a "get_unaligned()" if this helps and is sufficiently
1600 * fast. 1599 * fast.
1601 * 1600 *
1602 * - Little-endian machines (so that we can generate the mask
1603 * of low bytes efficiently). Again, we *could* do a byte
1604 * swapping load on big-endian architectures if that is not
1605 * expensive enough to make the optimization worthless.
1606 *
1607 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we 1601 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
1608 * do not trap on the (extremely unlikely) case of a page 1602 * do not trap on the (extremely unlikely) case of a page
1609 * crossing operation. 1603 * crossing operation.
@@ -1647,7 +1641,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len)
1647 if (!len) 1641 if (!len)
1648 goto done; 1642 goto done;
1649 } 1643 }
1650 mask = ~(~0ul << len*8); 1644 mask = bytemask_from_count(len);
1651 hash += mask & a; 1645 hash += mask & a;
1652done: 1646done:
1653 return fold_hash(hash); 1647 return fold_hash(hash);
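bytemask_from_count() is a new helper that packages exactly the expression on the removed line: a mask covering the low 'len' bytes of an unsigned long. Assuming it is defined as that same expression, it amounts to:

	/* e.g. bytemask_from_count(3) == 0x0000000000ffffff on a 64-bit machine */
	#define bytemask_from_count(cnt)	(~(~0ul << (cnt)*8))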
@@ -2435,6 +2429,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
2435 */ 2429 */
2436static inline int may_create(struct inode *dir, struct dentry *child) 2430static inline int may_create(struct inode *dir, struct dentry *child)
2437{ 2431{
2432 audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
2438 if (child->d_inode) 2433 if (child->d_inode)
2439 return -EEXIST; 2434 return -EEXIST;
2440 if (IS_DEADDIR(dir)) 2435 if (IS_DEADDIR(dir))
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 8485978993e8..9838fb020473 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -36,6 +36,7 @@
36#include <linux/nfs_fs.h> 36#include <linux/nfs_fs.h>
37#include <linux/sunrpc/rpc_pipe_fs.h> 37#include <linux/sunrpc/rpc_pipe_fs.h>
38 38
39#include "../nfs4_fs.h"
39#include "../pnfs.h" 40#include "../pnfs.h"
40#include "../netns.h" 41#include "../netns.h"
41 42
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 9c3e117c3ed1..4d0161442565 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -44,7 +44,7 @@
44static inline sector_t normalize(sector_t s, int base) 44static inline sector_t normalize(sector_t s, int base)
45{ 45{
46 sector_t tmp = s; /* Since do_div modifies its argument */ 46 sector_t tmp = s; /* Since do_div modifies its argument */
47 return s - do_div(tmp, base); 47 return s - sector_div(tmp, base);
48} 48}
49 49
50static inline sector_t normalize_up(sector_t s, int base) 50static inline sector_t normalize_up(sector_t s, int base)
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index fc0f95ec7358..d25f10fb4926 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -46,7 +46,9 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
46#include <linux/sunrpc/cache.h> 46#include <linux/sunrpc/cache.h>
47#include <linux/sunrpc/svcauth.h> 47#include <linux/sunrpc/svcauth.h>
48#include <linux/sunrpc/rpc_pipe_fs.h> 48#include <linux/sunrpc/rpc_pipe_fs.h>
49#include <linux/nfs_fs.h>
49 50
51#include "nfs4_fs.h"
50#include "dns_resolve.h" 52#include "dns_resolve.h"
51#include "cache_lib.h" 53#include "cache_lib.h"
52#include "netns.h" 54#include "netns.h"
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 18ab2da4eeb6..00ad1c2b217d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -312,7 +312,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
312} 312}
313EXPORT_SYMBOL_GPL(nfs4_label_alloc); 313EXPORT_SYMBOL_GPL(nfs4_label_alloc);
314#else 314#else
315void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, 315void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
316 struct nfs4_label *label) 316 struct nfs4_label *label)
317{ 317{
318} 318}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bca6a3e3c49c..8b5cc04a8611 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -269,6 +269,21 @@ extern const u32 nfs41_maxgetdevinfo_overhead;
269extern struct rpc_procinfo nfs4_procedures[]; 269extern struct rpc_procinfo nfs4_procedures[];
270#endif 270#endif
271 271
272#ifdef CONFIG_NFS_V4_SECURITY_LABEL
273extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
274static inline void nfs4_label_free(struct nfs4_label *label)
275{
276 if (label) {
277 kfree(label->label);
278 kfree(label);
279 }
280 return;
281}
282#else
283static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
284static inline void nfs4_label_free(void *label) {}
285#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
286
272/* proc.c */ 287/* proc.c */
273void nfs_close_context(struct nfs_open_context *ctx, int is_sync); 288void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
274extern struct nfs_client *nfs_init_client(struct nfs_client *clp, 289extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3ce79b04522e..5609edc742a0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -9,6 +9,14 @@
9#ifndef __LINUX_FS_NFS_NFS4_FS_H 9#ifndef __LINUX_FS_NFS_NFS4_FS_H
10#define __LINUX_FS_NFS_NFS4_FS_H 10#define __LINUX_FS_NFS_NFS4_FS_H
11 11
12#if defined(CONFIG_NFS_V4_2)
13#define NFS4_MAX_MINOR_VERSION 2
14#elif defined(CONFIG_NFS_V4_1)
15#define NFS4_MAX_MINOR_VERSION 1
16#else
17#define NFS4_MAX_MINOR_VERSION 0
18#endif
19
12#if IS_ENABLED(CONFIG_NFS_V4) 20#if IS_ENABLED(CONFIG_NFS_V4)
13 21
14#define NFS4_MAX_LOOP_ON_RECOVER (10) 22#define NFS4_MAX_LOOP_ON_RECOVER (10)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 659990c0109e..15052b81df42 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2518,9 +2518,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2518 calldata->roc_barrier); 2518 calldata->roc_barrier);
2519 nfs_set_open_stateid(state, &calldata->res.stateid, 0); 2519 nfs_set_open_stateid(state, &calldata->res.stateid, 0);
2520 renew_lease(server, calldata->timestamp); 2520 renew_lease(server, calldata->timestamp);
2521 nfs4_close_clear_stateid_flags(state,
2522 calldata->arg.fmode);
2523 break; 2521 break;
2522 case -NFS4ERR_ADMIN_REVOKED:
2524 case -NFS4ERR_STALE_STATEID: 2523 case -NFS4ERR_STALE_STATEID:
2525 case -NFS4ERR_OLD_STATEID: 2524 case -NFS4ERR_OLD_STATEID:
2526 case -NFS4ERR_BAD_STATEID: 2525 case -NFS4ERR_BAD_STATEID:
@@ -2528,9 +2527,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2528 if (calldata->arg.fmode == 0) 2527 if (calldata->arg.fmode == 0)
2529 break; 2528 break;
2530 default: 2529 default:
2531 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) 2530 if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
2532 rpc_restart_call_prepare(task); 2531 rpc_restart_call_prepare(task);
2532 goto out_release;
2533 }
2533 } 2534 }
2535 nfs4_close_clear_stateid_flags(state, calldata->arg.fmode);
2536out_release:
2534 nfs_release_seqid(calldata->arg.seqid); 2537 nfs_release_seqid(calldata->arg.seqid);
2535 nfs_refresh_inode(calldata->inode, calldata->res.fattr); 2538 nfs_refresh_inode(calldata->inode, calldata->res.fattr);
2536 dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); 2539 dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
@@ -4802,7 +4805,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
4802 dprintk("%s ERROR %d, Reset session\n", __func__, 4805 dprintk("%s ERROR %d, Reset session\n", __func__,
4803 task->tk_status); 4806 task->tk_status);
4804 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); 4807 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
4805 goto restart_call; 4808 goto wait_on_recovery;
4806#endif /* CONFIG_NFS_V4_1 */ 4809#endif /* CONFIG_NFS_V4_1 */
4807 case -NFS4ERR_DELAY: 4810 case -NFS4ERR_DELAY:
4808 nfs_inc_server_stats(server, NFSIOS_DELAY); 4811 nfs_inc_server_stats(server, NFSIOS_DELAY);
@@ -4987,11 +4990,17 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
4987 4990
4988 trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); 4991 trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
4989 switch (task->tk_status) { 4992 switch (task->tk_status) {
4990 case -NFS4ERR_STALE_STATEID:
4991 case -NFS4ERR_EXPIRED:
4992 case 0: 4993 case 0:
4993 renew_lease(data->res.server, data->timestamp); 4994 renew_lease(data->res.server, data->timestamp);
4994 break; 4995 break;
4996 case -NFS4ERR_ADMIN_REVOKED:
4997 case -NFS4ERR_DELEG_REVOKED:
4998 case -NFS4ERR_BAD_STATEID:
4999 case -NFS4ERR_OLD_STATEID:
5000 case -NFS4ERR_STALE_STATEID:
5001 case -NFS4ERR_EXPIRED:
5002 task->tk_status = 0;
5003 break;
4995 default: 5004 default:
4996 if (nfs4_async_handle_error(task, data->res.server, NULL) == 5005 if (nfs4_async_handle_error(task, data->res.server, NULL) ==
4997 -EAGAIN) { 5006 -EAGAIN) {
@@ -7589,7 +7598,14 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
7589 return; 7598 return;
7590 7599
7591 server = NFS_SERVER(lrp->args.inode); 7600 server = NFS_SERVER(lrp->args.inode);
7592 if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { 7601 switch (task->tk_status) {
7602 default:
7603 task->tk_status = 0;
7604 case 0:
7605 break;
7606 case -NFS4ERR_DELAY:
7607 if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
7608 break;
7593 rpc_restart_call_prepare(task); 7609 rpc_restart_call_prepare(task);
7594 return; 7610 return;
7595 } 7611 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 088de1355e93..ee7237f99f54 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -141,8 +141,8 @@ xdr_error: \
141 141
142static void next_decode_page(struct nfsd4_compoundargs *argp) 142static void next_decode_page(struct nfsd4_compoundargs *argp)
143{ 143{
144 argp->pagelist++;
145 argp->p = page_address(argp->pagelist[0]); 144 argp->p = page_address(argp->pagelist[0]);
145 argp->pagelist++;
146 if (argp->pagelen < PAGE_SIZE) { 146 if (argp->pagelen < PAGE_SIZE) {
147 argp->end = argp->p + (argp->pagelen>>2); 147 argp->end = argp->p + (argp->pagelen>>2);
148 argp->pagelen = 0; 148 argp->pagelen = 0;
@@ -1229,6 +1229,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
1229 len -= pages * PAGE_SIZE; 1229 len -= pages * PAGE_SIZE;
1230 1230
1231 argp->p = (__be32 *)page_address(argp->pagelist[0]); 1231 argp->p = (__be32 *)page_address(argp->pagelist[0]);
1232 argp->pagelist++;
1232 argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); 1233 argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE);
1233 } 1234 }
1234 argp->p += XDR_QUADLEN(len); 1235 argp->p += XDR_QUADLEN(len);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 9186c7ce0b14..b6af150c96b8 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -132,6 +132,13 @@ nfsd_reply_cache_alloc(void)
132} 132}
133 133
134static void 134static void
135nfsd_reply_cache_unhash(struct svc_cacherep *rp)
136{
137 hlist_del_init(&rp->c_hash);
138 list_del_init(&rp->c_lru);
139}
140
141static void
135nfsd_reply_cache_free_locked(struct svc_cacherep *rp) 142nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
136{ 143{
137 if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) { 144 if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
@@ -417,7 +424,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
417 rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru); 424 rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
418 if (nfsd_cache_entry_expired(rp) || 425 if (nfsd_cache_entry_expired(rp) ||
419 num_drc_entries >= max_drc_entries) { 426 num_drc_entries >= max_drc_entries) {
420 lru_put_end(rp); 427 nfsd_reply_cache_unhash(rp);
421 prune_cache_entries(); 428 prune_cache_entries();
422 goto search_cache; 429 goto search_cache;
423 } 430 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 94b5f5d2bfed..7eea63cada1d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -298,41 +298,12 @@ commit_metadata(struct svc_fh *fhp)
298} 298}
299 299
300/* 300/*
301 * Set various file attributes. 301 * Go over the attributes and take care of the small differences between
302 * N.B. After this call fhp needs an fh_put 302 * NFS semantics and what Linux expects.
303 */ 303 */
304__be32 304static void
305nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, 305nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
306 int check_guard, time_t guardtime)
307{ 306{
308 struct dentry *dentry;
309 struct inode *inode;
310 int accmode = NFSD_MAY_SATTR;
311 umode_t ftype = 0;
312 __be32 err;
313 int host_err;
314 int size_change = 0;
315
316 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
317 accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
318 if (iap->ia_valid & ATTR_SIZE)
319 ftype = S_IFREG;
320
321 /* Get inode */
322 err = fh_verify(rqstp, fhp, ftype, accmode);
323 if (err)
324 goto out;
325
326 dentry = fhp->fh_dentry;
327 inode = dentry->d_inode;
328
329 /* Ignore any mode updates on symlinks */
330 if (S_ISLNK(inode->i_mode))
331 iap->ia_valid &= ~ATTR_MODE;
332
333 if (!iap->ia_valid)
334 goto out;
335
336 /* 307 /*
337 * NFSv2 does not differentiate between "set-[ac]time-to-now" 308 * NFSv2 does not differentiate between "set-[ac]time-to-now"
338 * which only requires access, and "set-[ac]time-to-X" which 309 * which only requires access, and "set-[ac]time-to-X" which
@@ -342,8 +313,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
342 * convert to "set to now" instead of "set to explicit time" 313 * convert to "set to now" instead of "set to explicit time"
343 * 314 *
344 * We only call inode_change_ok as the last test as technically 315 * We only call inode_change_ok as the last test as technically
345 * it is not an interface that we should be using. It is only 316 * it is not an interface that we should be using.
346 * valid if the filesystem does not define it's own i_op->setattr.
347 */ 317 */
348#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) 318#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET)
349#define MAX_TOUCH_TIME_ERROR (30*60) 319#define MAX_TOUCH_TIME_ERROR (30*60)
@@ -369,30 +339,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
369 iap->ia_valid &= ~BOTH_TIME_SET; 339 iap->ia_valid &= ~BOTH_TIME_SET;
370 } 340 }
371 } 341 }
372
373 /*
374 * The size case is special.
375 * It changes the file as well as the attributes.
376 */
377 if (iap->ia_valid & ATTR_SIZE) {
378 if (iap->ia_size < inode->i_size) {
379 err = nfsd_permission(rqstp, fhp->fh_export, dentry,
380 NFSD_MAY_TRUNC|NFSD_MAY_OWNER_OVERRIDE);
381 if (err)
382 goto out;
383 }
384
385 host_err = get_write_access(inode);
386 if (host_err)
387 goto out_nfserr;
388
389 size_change = 1;
390 host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
391 if (host_err) {
392 put_write_access(inode);
393 goto out_nfserr;
394 }
395 }
396 342
397 /* sanitize the mode change */ 343 /* sanitize the mode change */
398 if (iap->ia_valid & ATTR_MODE) { 344 if (iap->ia_valid & ATTR_MODE) {
@@ -415,32 +361,111 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
415 iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); 361 iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID);
416 } 362 }
417 } 363 }
364}
418 365
419 /* Change the attributes. */ 366static __be32
367nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
368 struct iattr *iap)
369{
370 struct inode *inode = fhp->fh_dentry->d_inode;
371 int host_err;
420 372
421 iap->ia_valid |= ATTR_CTIME; 373 if (iap->ia_size < inode->i_size) {
374 __be32 err;
422 375
423 err = nfserr_notsync; 376 err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
424 if (!check_guard || guardtime == inode->i_ctime.tv_sec) { 377 NFSD_MAY_TRUNC | NFSD_MAY_OWNER_OVERRIDE);
425 host_err = nfsd_break_lease(inode); 378 if (err)
426 if (host_err) 379 return err;
427 goto out_nfserr; 380 }
428 fh_lock(fhp);
429 381
430 host_err = notify_change(dentry, iap, NULL); 382 host_err = get_write_access(inode);
431 err = nfserrno(host_err); 383 if (host_err)
432 fh_unlock(fhp); 384 goto out_nfserrno;
385
386 host_err = locks_verify_truncate(inode, NULL, iap->ia_size);
387 if (host_err)
388 goto out_put_write_access;
389 return 0;
390
391out_put_write_access:
392 put_write_access(inode);
393out_nfserrno:
394 return nfserrno(host_err);
395}
396
397/*
398 * Set various file attributes. After this call fhp needs an fh_put.
399 */
400__be32
401nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
402 int check_guard, time_t guardtime)
403{
404 struct dentry *dentry;
405 struct inode *inode;
406 int accmode = NFSD_MAY_SATTR;
407 umode_t ftype = 0;
408 __be32 err;
409 int host_err;
410 int size_change = 0;
411
412 if (iap->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_SIZE))
413 accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
414 if (iap->ia_valid & ATTR_SIZE)
415 ftype = S_IFREG;
416
417 /* Get inode */
418 err = fh_verify(rqstp, fhp, ftype, accmode);
419 if (err)
420 goto out;
421
422 dentry = fhp->fh_dentry;
423 inode = dentry->d_inode;
424
425 /* Ignore any mode updates on symlinks */
426 if (S_ISLNK(inode->i_mode))
427 iap->ia_valid &= ~ATTR_MODE;
428
429 if (!iap->ia_valid)
430 goto out;
431
432 nfsd_sanitize_attrs(inode, iap);
433
434 /*
435 * The size case is special, it changes the file in addition to the
436 * attributes.
437 */
438 if (iap->ia_valid & ATTR_SIZE) {
439 err = nfsd_get_write_access(rqstp, fhp, iap);
440 if (err)
441 goto out;
442 size_change = 1;
433 } 443 }
444
445 iap->ia_valid |= ATTR_CTIME;
446
447 if (check_guard && guardtime != inode->i_ctime.tv_sec) {
448 err = nfserr_notsync;
449 goto out_put_write_access;
450 }
451
452 host_err = nfsd_break_lease(inode);
453 if (host_err)
454 goto out_put_write_access_nfserror;
455
456 fh_lock(fhp);
457 host_err = notify_change(dentry, iap, NULL);
458 fh_unlock(fhp);
459
460out_put_write_access_nfserror:
461 err = nfserrno(host_err);
462out_put_write_access:
434 if (size_change) 463 if (size_change)
435 put_write_access(inode); 464 put_write_access(inode);
436 if (!err) 465 if (!err)
437 commit_metadata(fhp); 466 commit_metadata(fhp);
438out: 467out:
439 return err; 468 return err;
440
441out_nfserr:
442 err = nfserrno(host_err);
443 goto out;
444} 469}
445 470
446#if defined(CONFIG_NFSD_V2_ACL) || \ 471#if defined(CONFIG_NFSD_V2_ACL) || \
diff --git a/fs/pipe.c b/fs/pipe.c
index d2c45e14e6d8..0e0752ef2715 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -726,11 +726,25 @@ pipe_poll(struct file *filp, poll_table *wait)
726 return mask; 726 return mask;
727} 727}
728 728
729static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
730{
731 int kill = 0;
732
733 spin_lock(&inode->i_lock);
734 if (!--pipe->files) {
735 inode->i_pipe = NULL;
736 kill = 1;
737 }
738 spin_unlock(&inode->i_lock);
739
740 if (kill)
741 free_pipe_info(pipe);
742}
743
729static int 744static int
730pipe_release(struct inode *inode, struct file *file) 745pipe_release(struct inode *inode, struct file *file)
731{ 746{
732 struct pipe_inode_info *pipe = inode->i_pipe; 747 struct pipe_inode_info *pipe = file->private_data;
733 int kill = 0;
734 748
735 __pipe_lock(pipe); 749 __pipe_lock(pipe);
736 if (file->f_mode & FMODE_READ) 750 if (file->f_mode & FMODE_READ)
@@ -743,17 +757,9 @@ pipe_release(struct inode *inode, struct file *file)
743 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 757 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
744 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 758 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
745 } 759 }
746 spin_lock(&inode->i_lock);
747 if (!--pipe->files) {
748 inode->i_pipe = NULL;
749 kill = 1;
750 }
751 spin_unlock(&inode->i_lock);
752 __pipe_unlock(pipe); 760 __pipe_unlock(pipe);
753 761
754 if (kill) 762 put_pipe_info(inode, pipe);
755 free_pipe_info(pipe);
756
757 return 0; 763 return 0;
758} 764}
759 765
@@ -1014,7 +1020,6 @@ static int fifo_open(struct inode *inode, struct file *filp)
1014{ 1020{
1015 struct pipe_inode_info *pipe; 1021 struct pipe_inode_info *pipe;
1016 bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; 1022 bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
1017 int kill = 0;
1018 int ret; 1023 int ret;
1019 1024
1020 filp->f_version = 0; 1025 filp->f_version = 0;
@@ -1130,15 +1135,9 @@ err_wr:
1130 goto err; 1135 goto err;
1131 1136
1132err: 1137err:
1133 spin_lock(&inode->i_lock);
1134 if (!--pipe->files) {
1135 inode->i_pipe = NULL;
1136 kill = 1;
1137 }
1138 spin_unlock(&inode->i_lock);
1139 __pipe_unlock(pipe); 1138 __pipe_unlock(pipe);
1140 if (kill) 1139
1141 free_pipe_info(pipe); 1140 put_pipe_info(inode, pipe);
1142 return ret; 1141 return ret;
1143} 1142}
1144 1143
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1485e38daaa3..03c8d747be48 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1151,10 +1151,16 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1151 goto out_free_page; 1151 goto out_free_page;
1152 1152
1153 } 1153 }
1154 kloginuid = make_kuid(file->f_cred->user_ns, loginuid); 1154
1155 if (!uid_valid(kloginuid)) { 1155 /* is userspace trying to explicitly UNSET the loginuid? */
1156 length = -EINVAL; 1156 if (loginuid == AUDIT_UID_UNSET) {
1157 goto out_free_page; 1157 kloginuid = INVALID_UID;
1158 } else {
1159 kloginuid = make_kuid(file->f_cred->user_ns, loginuid);
1160 if (!uid_valid(kloginuid)) {
1161 length = -EINVAL;
1162 goto out_free_page;
1163 }
1158 } 1164 }
1159 1165
1160 length = audit_set_loginuid(kloginuid); 1166 length = audit_set_loginuid(kloginuid);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 737e15615b04..cca93b6fb9a9 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -175,22 +175,6 @@ static const struct inode_operations proc_link_inode_operations = {
175}; 175};
176 176
177/* 177/*
178 * As some entries in /proc are volatile, we want to
179 * get rid of unused dentries. This could be made
180 * smarter: we could keep a "volatile" flag in the
181 * inode to indicate which ones to keep.
182 */
183static int proc_delete_dentry(const struct dentry * dentry)
184{
185 return 1;
186}
187
188static const struct dentry_operations proc_dentry_operations =
189{
190 .d_delete = proc_delete_dentry,
191};
192
193/*
194 * Don't create negative dentries here, return -ENOENT by hand 178 * Don't create negative dentries here, return -ENOENT by hand
195 * instead. 179 * instead.
196 */ 180 */
@@ -209,7 +193,7 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
209 inode = proc_get_inode(dir->i_sb, de); 193 inode = proc_get_inode(dir->i_sb, de);
210 if (!inode) 194 if (!inode)
211 return ERR_PTR(-ENOMEM); 195 return ERR_PTR(-ENOMEM);
212 d_set_d_op(dentry, &proc_dentry_operations); 196 d_set_d_op(dentry, &simple_dentry_operations);
213 d_add(dentry, inode); 197 d_add(dentry, inode);
214 return NULL; 198 return NULL;
215 } 199 }
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 28955d4b7218..124fc43c7090 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -292,16 +292,20 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
292{ 292{
293 struct proc_dir_entry *pde = PDE(file_inode(file)); 293 struct proc_dir_entry *pde = PDE(file_inode(file));
294 unsigned long rv = -EIO; 294 unsigned long rv = -EIO;
295 unsigned long (*get_area)(struct file *, unsigned long, unsigned long, 295
296 unsigned long, unsigned long) = NULL;
297 if (use_pde(pde)) { 296 if (use_pde(pde)) {
297 typeof(proc_reg_get_unmapped_area) *get_area;
298
299 get_area = pde->proc_fops->get_unmapped_area;
298#ifdef CONFIG_MMU 300#ifdef CONFIG_MMU
299 get_area = current->mm->get_unmapped_area; 301 if (!get_area)
302 get_area = current->mm->get_unmapped_area;
300#endif 303#endif
301 if (pde->proc_fops->get_unmapped_area) 304
302 get_area = pde->proc_fops->get_unmapped_area;
303 if (get_area) 305 if (get_area)
304 rv = get_area(file, orig_addr, len, pgoff, flags); 306 rv = get_area(file, orig_addr, len, pgoff, flags);
307 else
308 rv = orig_addr;
305 unuse_pde(pde); 309 unuse_pde(pde);
306 } 310 }
307 return rv; 311 return rv;
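The rewritten proc_reg_get_unmapped_area() declares get_area as typeof(proc_reg_get_unmapped_area) * rather than spelling out the five-argument function-pointer type, so the pointer can never drift out of sync with the prototype it mirrors. The idiom in isolation, with my_get_area as a hypothetical function:

	unsigned long my_get_area(struct file *file, unsigned long addr,
				  unsigned long len, unsigned long pgoff,
				  unsigned long flags);

	typeof(my_get_area) *get_area = my_get_area;	/* same as writing out the full pointer type */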
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 49a7fff2e83a..9ae46b87470d 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -42,12 +42,6 @@ static const struct inode_operations ns_inode_operations = {
42 .setattr = proc_setattr, 42 .setattr = proc_setattr,
43}; 43};
44 44
45static int ns_delete_dentry(const struct dentry *dentry)
46{
47 /* Don't cache namespace inodes when not in use */
48 return 1;
49}
50
51static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) 45static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
52{ 46{
53 struct inode *inode = dentry->d_inode; 47 struct inode *inode = dentry->d_inode;
@@ -59,7 +53,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
59 53
60const struct dentry_operations ns_dentry_operations = 54const struct dentry_operations ns_dentry_operations =
61{ 55{
62 .d_delete = ns_delete_dentry, 56 .d_delete = always_delete_dentry,
63 .d_dname = ns_dname, 57 .d_dname = ns_dname,
64}; 58};
65 59
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index c70111ebefd4..b6fa8657dcbc 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -25,6 +25,78 @@ config SQUASHFS
25 25
26 If unsure, say N. 26 If unsure, say N.
27 27
28choice
29 prompt "File decompression options"
30 depends on SQUASHFS
31 help
32 Squashfs now supports two options for decompressing file
33 data. Traditionally Squashfs has decompressed into an
34 intermediate buffer and then memcopied it into the page cache.
35 Squashfs now supports the ability to decompress directly into
36 the page cache.
37
38 If unsure, select "Decompress file data into an intermediate buffer"
39
40config SQUASHFS_FILE_CACHE
41 bool "Decompress file data into an intermediate buffer"
42 help
43 Decompress file data into an intermediate buffer and then
44 memcopy it into the page cache.
45
46config SQUASHFS_FILE_DIRECT
47 bool "Decompress files directly into the page cache"
48 help
49 Directly decompress file data into the page cache.
50 Doing so can significantly improve performance because
51 it eliminates a memcpy and it also removes the lock contention
52 on the single buffer.
53
54endchoice
55
56choice
57 prompt "Decompressor parallelisation options"
58 depends on SQUASHFS
59 help
60 Squashfs now supports three parallelisation options for
61 decompression. Each one exhibits various trade-offs between
62 decompression performance and CPU and memory usage.
63
64 If in doubt, select "Single threaded decompression"
65
66config SQUASHFS_DECOMP_SINGLE
67 bool "Single threaded compression"
68 help
69 Traditionally Squashfs has used single-threaded decompression.
70 Only one block (data or metadata) can be decompressed at any
71 one time. This limits CPU and memory usage to a minimum.
72
73config SQUASHFS_DECOMP_MULTI
74 bool "Use multiple decompressors for parallel I/O"
75 help
76 By default Squashfs uses a single decompressor, which gives poor
77 performance on parallel I/O workloads on multi-CPU machines because
78 requests end up waiting on decompressor availability.
79
80 If you have a parallel I/O workload and your system has enough memory,
81 using this option may improve overall I/O performance.
82
83 This decompressor implementation uses up to two parallel
84 decompressors per core. It dynamically allocates decompressors
85 on a demand basis.
86
87config SQUASHFS_DECOMP_MULTI_PERCPU
88 bool "Use percpu multiple decompressors for parallel I/O"
89 help
90 By default Squashfs uses a single decompressor, which gives poor
91 performance on parallel I/O workloads on multi-CPU machines because
92 requests end up waiting on decompressor availability.
93
94 This decompressor implementation uses a maximum of one
95 decompressor per core. It uses percpu variables to ensure
96 decompression is load-balanced across the cores.
97
98endchoice
99
28config SQUASHFS_XATTR 100config SQUASHFS_XATTR
29 bool "Squashfs XATTR support" 101 bool "Squashfs XATTR support"
30 depends on SQUASHFS 102 depends on SQUASHFS
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index 110b0476f3b4..4132520b4ff2 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,6 +5,11 @@
5obj-$(CONFIG_SQUASHFS) += squashfs.o 5obj-$(CONFIG_SQUASHFS) += squashfs.o
6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o 6squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
7squashfs-y += namei.o super.o symlink.o decompressor.o 7squashfs-y += namei.o super.o symlink.o decompressor.o
8squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
9squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
10squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
11squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
12squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
8squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o 13squashfs-$(CONFIG_SQUASHFS_XATTR) += xattr.o xattr_id.o
9squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o 14squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o
10squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o 15squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c
index 41d108ecc9be..0cea9b9236d0 100644
--- a/fs/squashfs/block.c
+++ b/fs/squashfs/block.c
@@ -36,6 +36,7 @@
36#include "squashfs_fs_sb.h" 36#include "squashfs_fs_sb.h"
37#include "squashfs.h" 37#include "squashfs.h"
38#include "decompressor.h" 38#include "decompressor.h"
39#include "page_actor.h"
39 40
40/* 41/*
41 * Read the metadata block length, this is stored in the first two 42 * Read the metadata block length, this is stored in the first two
@@ -86,16 +87,16 @@ static struct buffer_head *get_block_length(struct super_block *sb,
86 * generated a larger block - this does occasionally happen with compression 87 * generated a larger block - this does occasionally happen with compression
87 * algorithms). 88 * algorithms).
88 */ 89 */
89int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, 90int squashfs_read_data(struct super_block *sb, u64 index, int length,
90 int length, u64 *next_index, int srclength, int pages) 91 u64 *next_index, struct squashfs_page_actor *output)
91{ 92{
92 struct squashfs_sb_info *msblk = sb->s_fs_info; 93 struct squashfs_sb_info *msblk = sb->s_fs_info;
93 struct buffer_head **bh; 94 struct buffer_head **bh;
94 int offset = index & ((1 << msblk->devblksize_log2) - 1); 95 int offset = index & ((1 << msblk->devblksize_log2) - 1);
95 u64 cur_index = index >> msblk->devblksize_log2; 96 u64 cur_index = index >> msblk->devblksize_log2;
96 int bytes, compressed, b = 0, k = 0, page = 0, avail; 97 int bytes, compressed, b = 0, k = 0, avail, i;
97 98
98 bh = kcalloc(((srclength + msblk->devblksize - 1) 99 bh = kcalloc(((output->length + msblk->devblksize - 1)
99 >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); 100 >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
100 if (bh == NULL) 101 if (bh == NULL)
101 return -ENOMEM; 102 return -ENOMEM;
@@ -111,9 +112,9 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
111 *next_index = index + length; 112 *next_index = index + length;
112 113
113 TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", 114 TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
114 index, compressed ? "" : "un", length, srclength); 115 index, compressed ? "" : "un", length, output->length);
115 116
116 if (length < 0 || length > srclength || 117 if (length < 0 || length > output->length ||
117 (index + length) > msblk->bytes_used) 118 (index + length) > msblk->bytes_used)
118 goto read_failure; 119 goto read_failure;
119 120
@@ -145,7 +146,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
145 TRACE("Block @ 0x%llx, %scompressed size %d\n", index, 146 TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
146 compressed ? "" : "un", length); 147 compressed ? "" : "un", length);
147 148
148 if (length < 0 || length > srclength || 149 if (length < 0 || length > output->length ||
149 (index + length) > msblk->bytes_used) 150 (index + length) > msblk->bytes_used)
150 goto block_release; 151 goto block_release;
151 152
@@ -158,9 +159,15 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
158 ll_rw_block(READ, b - 1, bh + 1); 159 ll_rw_block(READ, b - 1, bh + 1);
159 } 160 }
160 161
162 for (i = 0; i < b; i++) {
163 wait_on_buffer(bh[i]);
164 if (!buffer_uptodate(bh[i]))
165 goto block_release;
166 }
167
161 if (compressed) { 168 if (compressed) {
162 length = squashfs_decompress(msblk, buffer, bh, b, offset, 169 length = squashfs_decompress(msblk, bh, b, offset, length,
163 length, srclength, pages); 170 output);
164 if (length < 0) 171 if (length < 0)
165 goto read_failure; 172 goto read_failure;
166 } else { 173 } else {
@@ -168,22 +175,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
168 * Block is uncompressed. 175 * Block is uncompressed.
169 */ 176 */
170 int in, pg_offset = 0; 177 int in, pg_offset = 0;
178 void *data = squashfs_first_page(output);
171 179
172 for (bytes = length; k < b; k++) { 180 for (bytes = length; k < b; k++) {
173 in = min(bytes, msblk->devblksize - offset); 181 in = min(bytes, msblk->devblksize - offset);
174 bytes -= in; 182 bytes -= in;
175 wait_on_buffer(bh[k]);
176 if (!buffer_uptodate(bh[k]))
177 goto block_release;
178 while (in) { 183 while (in) {
179 if (pg_offset == PAGE_CACHE_SIZE) { 184 if (pg_offset == PAGE_CACHE_SIZE) {
180 page++; 185 data = squashfs_next_page(output);
181 pg_offset = 0; 186 pg_offset = 0;
182 } 187 }
183 avail = min_t(int, in, PAGE_CACHE_SIZE - 188 avail = min_t(int, in, PAGE_CACHE_SIZE -
184 pg_offset); 189 pg_offset);
185 memcpy(buffer[page] + pg_offset, 190 memcpy(data + pg_offset, bh[k]->b_data + offset,
186 bh[k]->b_data + offset, avail); 191 avail);
187 in -= avail; 192 in -= avail;
188 pg_offset += avail; 193 pg_offset += avail;
189 offset += avail; 194 offset += avail;
@@ -191,6 +196,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index,
191 offset = 0; 196 offset = 0;
192 put_bh(bh[k]); 197 put_bh(bh[k]);
193 } 198 }
199 squashfs_finish_page(output);
194 } 200 }
195 201
196 kfree(bh); 202 kfree(bh);
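squashfs_read_data() no longer takes a raw page array plus page/length counts; the destination is now described by a struct squashfs_page_actor, implemented in the new page_actor.[ch] files (not shown in this hunk). From the calls used above and in cache.c below, the actor API is roughly the following; the exact declarations are inferred from those call sites:

	/* build an actor over an array of page buffers */
	struct squashfs_page_actor *squashfs_page_actor_init(void **page, int pages, int length);

	void *squashfs_first_page(struct squashfs_page_actor *actor);	/* start iterating, return first page */
	void *squashfs_next_page(struct squashfs_page_actor *actor);	/* advance to the next output page */
	void squashfs_finish_page(struct squashfs_page_actor *actor);	/* finish iterating over the pages */

The point of the abstraction is that the direct path (file_direct.c, CONFIG_SQUASHFS_FILE_DIRECT) can hand the decompressors page-cache pages directly, while the traditional path keeps using an intermediate buffer.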
diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c
index af0b73802592..1cb70a0b2168 100644
--- a/fs/squashfs/cache.c
+++ b/fs/squashfs/cache.c
@@ -56,6 +56,7 @@
56#include "squashfs_fs.h" 56#include "squashfs_fs.h"
57#include "squashfs_fs_sb.h" 57#include "squashfs_fs_sb.h"
58#include "squashfs.h" 58#include "squashfs.h"
59#include "page_actor.h"
59 60
60/* 61/*
61 * Look-up block in cache, and increment usage count. If not in cache, read 62 * Look-up block in cache, and increment usage count. If not in cache, read
@@ -119,9 +120,8 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb,
119 entry->error = 0; 120 entry->error = 0;
120 spin_unlock(&cache->lock); 121 spin_unlock(&cache->lock);
121 122
122 entry->length = squashfs_read_data(sb, entry->data, 123 entry->length = squashfs_read_data(sb, block, length,
123 block, length, &entry->next_index, 124 &entry->next_index, entry->actor);
124 cache->block_size, cache->pages);
125 125
126 spin_lock(&cache->lock); 126 spin_lock(&cache->lock);
127 127
@@ -220,6 +220,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
220 kfree(cache->entry[i].data[j]); 220 kfree(cache->entry[i].data[j]);
221 kfree(cache->entry[i].data); 221 kfree(cache->entry[i].data);
222 } 222 }
223 kfree(cache->entry[i].actor);
223 } 224 }
224 225
225 kfree(cache->entry); 226 kfree(cache->entry);
@@ -280,6 +281,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
280 goto cleanup; 281 goto cleanup;
281 } 282 }
282 } 283 }
284
285 entry->actor = squashfs_page_actor_init(entry->data,
286 cache->pages, 0);
287 if (entry->actor == NULL) {
288 ERROR("Failed to allocate %s cache entry\n", name);
289 goto cleanup;
290 }
283 } 291 }
284 292
285 return cache; 293 return cache;
@@ -410,6 +418,7 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
410 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 418 int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
411 int i, res; 419 int i, res;
412 void *table, *buffer, **data; 420 void *table, *buffer, **data;
421 struct squashfs_page_actor *actor;
413 422
414 table = buffer = kmalloc(length, GFP_KERNEL); 423 table = buffer = kmalloc(length, GFP_KERNEL);
415 if (table == NULL) 424 if (table == NULL)
@@ -421,19 +430,28 @@ void *squashfs_read_table(struct super_block *sb, u64 block, int length)
421 goto failed; 430 goto failed;
422 } 431 }
423 432
433 actor = squashfs_page_actor_init(data, pages, length);
434 if (actor == NULL) {
435 res = -ENOMEM;
436 goto failed2;
437 }
438
424 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) 439 for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE)
425 data[i] = buffer; 440 data[i] = buffer;
426 441
427 res = squashfs_read_data(sb, data, block, length | 442 res = squashfs_read_data(sb, block, length |
428 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); 443 SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
429 444
430 kfree(data); 445 kfree(data);
446 kfree(actor);
431 447
432 if (res < 0) 448 if (res < 0)
433 goto failed; 449 goto failed;
434 450
435 return table; 451 return table;
436 452
453failed2:
454 kfree(data);
437failed: 455failed:
438 kfree(table); 456 kfree(table);
439 return ERR_PTR(res); 457 return ERR_PTR(res);
diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c
index 3f6271d86abc..ac22fe73b0ad 100644
--- a/fs/squashfs/decompressor.c
+++ b/fs/squashfs/decompressor.c
@@ -30,6 +30,7 @@
30#include "squashfs_fs_sb.h" 30#include "squashfs_fs_sb.h"
31#include "decompressor.h" 31#include "decompressor.h"
32#include "squashfs.h" 32#include "squashfs.h"
33#include "page_actor.h"
33 34
34/* 35/*
35 * This file (and decompressor.h) implements a decompressor framework for 36 * This file (and decompressor.h) implements a decompressor framework for
@@ -37,29 +38,29 @@
37 */ 38 */
38 39
39static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = { 40static const struct squashfs_decompressor squashfs_lzma_unsupported_comp_ops = {
40 NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0 41 NULL, NULL, NULL, NULL, LZMA_COMPRESSION, "lzma", 0
41}; 42};
42 43
43#ifndef CONFIG_SQUASHFS_LZO 44#ifndef CONFIG_SQUASHFS_LZO
44static const struct squashfs_decompressor squashfs_lzo_comp_ops = { 45static const struct squashfs_decompressor squashfs_lzo_comp_ops = {
45 NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0 46 NULL, NULL, NULL, NULL, LZO_COMPRESSION, "lzo", 0
46}; 47};
47#endif 48#endif
48 49
49#ifndef CONFIG_SQUASHFS_XZ 50#ifndef CONFIG_SQUASHFS_XZ
50static const struct squashfs_decompressor squashfs_xz_comp_ops = { 51static const struct squashfs_decompressor squashfs_xz_comp_ops = {
51 NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0 52 NULL, NULL, NULL, NULL, XZ_COMPRESSION, "xz", 0
52}; 53};
53#endif 54#endif
54 55
55#ifndef CONFIG_SQUASHFS_ZLIB 56#ifndef CONFIG_SQUASHFS_ZLIB
56static const struct squashfs_decompressor squashfs_zlib_comp_ops = { 57static const struct squashfs_decompressor squashfs_zlib_comp_ops = {
57 NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0 58 NULL, NULL, NULL, NULL, ZLIB_COMPRESSION, "zlib", 0
58}; 59};
59#endif 60#endif
60 61
61static const struct squashfs_decompressor squashfs_unknown_comp_ops = { 62static const struct squashfs_decompressor squashfs_unknown_comp_ops = {
62 NULL, NULL, NULL, 0, "unknown", 0 63 NULL, NULL, NULL, NULL, 0, "unknown", 0
63}; 64};
64 65
65static const struct squashfs_decompressor *decompressor[] = { 66static const struct squashfs_decompressor *decompressor[] = {
@@ -83,10 +84,11 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
83} 84}
84 85
85 86
86void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags) 87static void *get_comp_opts(struct super_block *sb, unsigned short flags)
87{ 88{
88 struct squashfs_sb_info *msblk = sb->s_fs_info; 89 struct squashfs_sb_info *msblk = sb->s_fs_info;
89 void *strm, *buffer = NULL; 90 void *buffer = NULL, *comp_opts;
91 struct squashfs_page_actor *actor = NULL;
90 int length = 0; 92 int length = 0;
91 93
92 /* 94 /*
@@ -94,23 +96,46 @@ void *squashfs_decompressor_init(struct super_block *sb, unsigned short flags)
94 */ 96 */
95 if (SQUASHFS_COMP_OPTS(flags)) { 97 if (SQUASHFS_COMP_OPTS(flags)) {
96 buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); 98 buffer = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL);
97 if (buffer == NULL) 99 if (buffer == NULL) {
98 return ERR_PTR(-ENOMEM); 100 comp_opts = ERR_PTR(-ENOMEM);
101 goto out;
102 }
103
104 actor = squashfs_page_actor_init(&buffer, 1, 0);
105 if (actor == NULL) {
106 comp_opts = ERR_PTR(-ENOMEM);
107 goto out;
108 }
99 109
100 length = squashfs_read_data(sb, &buffer, 110 length = squashfs_read_data(sb,
101 sizeof(struct squashfs_super_block), 0, NULL, 111 sizeof(struct squashfs_super_block), 0, NULL, actor);
102 PAGE_CACHE_SIZE, 1);
103 112
104 if (length < 0) { 113 if (length < 0) {
105 strm = ERR_PTR(length); 114 comp_opts = ERR_PTR(length);
106 goto finished; 115 goto out;
107 } 116 }
108 } 117 }
109 118
110 strm = msblk->decompressor->init(msblk, buffer, length); 119 comp_opts = squashfs_comp_opts(msblk, buffer, length);
111 120
112finished: 121out:
122 kfree(actor);
113 kfree(buffer); 123 kfree(buffer);
124 return comp_opts;
125}
126
127
128void *squashfs_decompressor_setup(struct super_block *sb, unsigned short flags)
129{
130 struct squashfs_sb_info *msblk = sb->s_fs_info;
131 void *stream, *comp_opts = get_comp_opts(sb, flags);
132
133 if (IS_ERR(comp_opts))
134 return comp_opts;
135
136 stream = squashfs_decompressor_create(msblk, comp_opts);
137 if (IS_ERR(stream))
138 kfree(comp_opts);
114 139
115 return strm; 140 return stream;
116} 141}
diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h
index 330073e29029..af0985321808 100644
--- a/fs/squashfs/decompressor.h
+++ b/fs/squashfs/decompressor.h
@@ -24,28 +24,22 @@
24 */ 24 */
25 25
26struct squashfs_decompressor { 26struct squashfs_decompressor {
27 void *(*init)(struct squashfs_sb_info *, void *, int); 27 void *(*init)(struct squashfs_sb_info *, void *);
28 void *(*comp_opts)(struct squashfs_sb_info *, void *, int);
28 void (*free)(void *); 29 void (*free)(void *);
29 int (*decompress)(struct squashfs_sb_info *, void **, 30 int (*decompress)(struct squashfs_sb_info *, void *,
30 struct buffer_head **, int, int, int, int, int); 31 struct buffer_head **, int, int, int,
32 struct squashfs_page_actor *);
31 int id; 33 int id;
32 char *name; 34 char *name;
33 int supported; 35 int supported;
34}; 36};
35 37
36static inline void squashfs_decompressor_free(struct squashfs_sb_info *msblk, 38static inline void *squashfs_comp_opts(struct squashfs_sb_info *msblk,
37 void *s) 39 void *buff, int length)
38{ 40{
39 if (msblk->decompressor) 41 return msblk->decompressor->comp_opts ?
40 msblk->decompressor->free(s); 42 msblk->decompressor->comp_opts(msblk, buff, length) : NULL;
41}
42
43static inline int squashfs_decompress(struct squashfs_sb_info *msblk,
44 void **buffer, struct buffer_head **bh, int b, int offset, int length,
45 int srclength, int pages)
46{
47 return msblk->decompressor->decompress(msblk, buffer, bh, b, offset,
48 length, srclength, pages);
49} 43}
50 44
51#ifdef CONFIG_SQUASHFS_XZ 45#ifdef CONFIG_SQUASHFS_XZ
diff --git a/fs/squashfs/decompressor_multi.c b/fs/squashfs/decompressor_multi.c
new file mode 100644
index 000000000000..d6008a636479
--- /dev/null
+++ b/fs/squashfs/decompressor_multi.c
@@ -0,0 +1,198 @@
1/*
2 * Copyright (c) 2013
3 * Minchan Kim <minchan@kernel.org>
4 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8#include <linux/types.h>
9#include <linux/mutex.h>
10#include <linux/slab.h>
11#include <linux/buffer_head.h>
12#include <linux/sched.h>
13#include <linux/wait.h>
14#include <linux/cpumask.h>
15
16#include "squashfs_fs.h"
17#include "squashfs_fs_sb.h"
18#include "decompressor.h"
19#include "squashfs.h"
20
21/*
22 * This file implements multi-threaded decompression in the
23 * decompressor framework
24 */
25
26
27/*
28 * The reason that multiply two is that a CPU can request new I/O
29 * while it is waiting previous request.
30 */
31#define MAX_DECOMPRESSOR (num_online_cpus() * 2)
32
33
34int squashfs_max_decompressors(void)
35{
36 return MAX_DECOMPRESSOR;
37}
38
39
40struct squashfs_stream {
41 void *comp_opts;
42 struct list_head strm_list;
43 struct mutex mutex;
44 int avail_decomp;
45 wait_queue_head_t wait;
46};
47
48
49struct decomp_stream {
50 void *stream;
51 struct list_head list;
52};
53
54
55static void put_decomp_stream(struct decomp_stream *decomp_strm,
56 struct squashfs_stream *stream)
57{
58 mutex_lock(&stream->mutex);
59 list_add(&decomp_strm->list, &stream->strm_list);
60 mutex_unlock(&stream->mutex);
61 wake_up(&stream->wait);
62}
63
64void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
65 void *comp_opts)
66{
67 struct squashfs_stream *stream;
68 struct decomp_stream *decomp_strm = NULL;
69 int err = -ENOMEM;
70
71 stream = kzalloc(sizeof(*stream), GFP_KERNEL);
72 if (!stream)
73 goto out;
74
75 stream->comp_opts = comp_opts;
76 mutex_init(&stream->mutex);
77 INIT_LIST_HEAD(&stream->strm_list);
78 init_waitqueue_head(&stream->wait);
79
80 /*
81 * We should always have at least one default decompressor,
82 * so that if we fail to allocate a new decompressor
83 * dynamically we can fall back to the default one and
84 * the filesystem keeps working.
85 */
86 decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
87 if (!decomp_strm)
88 goto out;
89
90 decomp_strm->stream = msblk->decompressor->init(msblk,
91 stream->comp_opts);
92 if (IS_ERR(decomp_strm->stream)) {
93 err = PTR_ERR(decomp_strm->stream);
94 goto out;
95 }
96
97 list_add(&decomp_strm->list, &stream->strm_list);
98 stream->avail_decomp = 1;
99 return stream;
100
101out:
102 kfree(decomp_strm);
103 kfree(stream);
104 return ERR_PTR(err);
105}
106
107
108void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
109{
110 struct squashfs_stream *stream = msblk->stream;
111 if (stream) {
112 struct decomp_stream *decomp_strm;
113
114 while (!list_empty(&stream->strm_list)) {
115 decomp_strm = list_entry(stream->strm_list.prev,
116 struct decomp_stream, list);
117 list_del(&decomp_strm->list);
118 msblk->decompressor->free(decomp_strm->stream);
119 kfree(decomp_strm);
120 stream->avail_decomp--;
121 }
122 WARN_ON(stream->avail_decomp);
123 kfree(stream->comp_opts);
124 kfree(stream);
125 }
126}
127
128
129static struct decomp_stream *get_decomp_stream(struct squashfs_sb_info *msblk,
130 struct squashfs_stream *stream)
131{
132 struct decomp_stream *decomp_strm;
133
134 while (1) {
135 mutex_lock(&stream->mutex);
136
 137 /* There is an available decomp_stream */
138 if (!list_empty(&stream->strm_list)) {
139 decomp_strm = list_entry(stream->strm_list.prev,
140 struct decomp_stream, list);
141 list_del(&decomp_strm->list);
142 mutex_unlock(&stream->mutex);
143 break;
144 }
145
146 /*
 147 * If no decomp_stream is available and the pool is already
 148 * full, wait for another user to release one.
149 */
150 if (stream->avail_decomp >= MAX_DECOMPRESSOR)
151 goto wait;
152
 153 /* Allocate a new decomp_stream */
154 decomp_strm = kmalloc(sizeof(*decomp_strm), GFP_KERNEL);
155 if (!decomp_strm)
156 goto wait;
157
158 decomp_strm->stream = msblk->decompressor->init(msblk,
159 stream->comp_opts);
160 if (IS_ERR(decomp_strm->stream)) {
161 kfree(decomp_strm);
162 goto wait;
163 }
164
165 stream->avail_decomp++;
166 WARN_ON(stream->avail_decomp > MAX_DECOMPRESSOR);
167
168 mutex_unlock(&stream->mutex);
169 break;
170wait:
171 /*
 172 * If system memory is tight, wait for another user to
 173 * release a stream instead of pressuring the VM, which
 174 * could cause page cache thrashing.
175 */
176 mutex_unlock(&stream->mutex);
177 wait_event(stream->wait,
178 !list_empty(&stream->strm_list));
179 }
180
181 return decomp_strm;
182}
183
184
185int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
186 int b, int offset, int length, struct squashfs_page_actor *output)
187{
188 int res;
189 struct squashfs_stream *stream = msblk->stream;
190 struct decomp_stream *decomp_stream = get_decomp_stream(msblk, stream);
191 res = msblk->decompressor->decompress(msblk, decomp_stream->stream,
192 bh, b, offset, length, output);
193 put_decomp_stream(decomp_stream, stream);
194 if (res < 0)
195 ERROR("%s decompression failed, data probably corrupt\n",
196 msblk->decompressor->name);
197 return res;
198}
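
To put a number on the MAX_DECOMPRESSOR bound defined near the top of this file, here is a worked sizing example; the CPU count is assumed, everything else follows from the code above:

/*
 * Example (assumed 4 online CPUs):
 *   MAX_DECOMPRESSOR        = 4 * 2 = 8 streams at most
 *   allocated at mount time = 1 (the guaranteed fallback stream)
 *   allocated on demand     = up to 7 more, in get_decomp_stream()
 *   freed                   = all of them, in squashfs_decompressor_destroy()
 *                             at unmount
 */
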
diff --git a/fs/squashfs/decompressor_multi_percpu.c b/fs/squashfs/decompressor_multi_percpu.c
new file mode 100644
index 000000000000..23a9c28ad8ea
--- /dev/null
+++ b/fs/squashfs/decompressor_multi_percpu.c
@@ -0,0 +1,97 @@
1/*
2 * Copyright (c) 2013
3 * Phillip Lougher <phillip@squashfs.org.uk>
4 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/slab.h>
11#include <linux/percpu.h>
12#include <linux/buffer_head.h>
13
14#include "squashfs_fs.h"
15#include "squashfs_fs_sb.h"
16#include "decompressor.h"
17#include "squashfs.h"
18
19/*
20 * This file implements multi-threaded decompression using percpu
21 * variables, one thread per cpu core.
22 */
23
24struct squashfs_stream {
25 void *stream;
26};
27
28void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
29 void *comp_opts)
30{
31 struct squashfs_stream *stream;
32 struct squashfs_stream __percpu *percpu;
33 int err, cpu;
34
35 percpu = alloc_percpu(struct squashfs_stream);
36 if (percpu == NULL)
37 return ERR_PTR(-ENOMEM);
38
39 for_each_possible_cpu(cpu) {
40 stream = per_cpu_ptr(percpu, cpu);
41 stream->stream = msblk->decompressor->init(msblk, comp_opts);
42 if (IS_ERR(stream->stream)) {
43 err = PTR_ERR(stream->stream);
44 goto out;
45 }
46 }
47
48 kfree(comp_opts);
49 return (__force void *) percpu;
50
51out:
52 for_each_possible_cpu(cpu) {
53 stream = per_cpu_ptr(percpu, cpu);
54 if (!IS_ERR_OR_NULL(stream->stream))
55 msblk->decompressor->free(stream->stream);
56 }
57 free_percpu(percpu);
58 return ERR_PTR(err);
59}
60
61void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
62{
63 struct squashfs_stream __percpu *percpu =
64 (struct squashfs_stream __percpu *) msblk->stream;
65 struct squashfs_stream *stream;
66 int cpu;
67
68 if (msblk->stream) {
69 for_each_possible_cpu(cpu) {
70 stream = per_cpu_ptr(percpu, cpu);
71 msblk->decompressor->free(stream->stream);
72 }
73 free_percpu(percpu);
74 }
75}
76
77int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
78 int b, int offset, int length, struct squashfs_page_actor *output)
79{
80 struct squashfs_stream __percpu *percpu =
81 (struct squashfs_stream __percpu *) msblk->stream;
82 struct squashfs_stream *stream = get_cpu_ptr(percpu);
83 int res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
84 offset, length, output);
85 put_cpu_ptr(stream);
86
87 if (res < 0)
88 ERROR("%s decompression failed, data probably corrupt\n",
89 msblk->decompressor->name);
90
91 return res;
92}
93
94int squashfs_max_decompressors(void)
95{
96 return num_possible_cpus();
97}
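
The per-CPU variant leans on the usual get_cpu_ptr()/put_cpu_ptr() pattern: taking the pointer disables preemption, so the decompressor must not sleep while it holds its per-CPU stream (the buffer_head waits were removed from the wrappers below, presumably handled in block.c, which is changed elsewhere in this merge). A minimal sketch of the pattern, with hypothetical names:

#include <linux/percpu.h>

struct example_ctx {
	void *workspace;		/* stand-in for decompressor state */
};

static DEFINE_PER_CPU(struct example_ctx, example_ctx);

static int example_run(void)
{
	/* get_cpu_ptr() disables preemption and pins us to this CPU's context */
	struct example_ctx *ctx = get_cpu_ptr(&example_ctx);
	int ret = 0;

	/* ... use ctx->workspace here; sleeping is not allowed ... */

	put_cpu_ptr(&example_ctx);	/* re-enables preemption */
	return ret;
}
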
diff --git a/fs/squashfs/decompressor_single.c b/fs/squashfs/decompressor_single.c
new file mode 100644
index 000000000000..a6c75929a00e
--- /dev/null
+++ b/fs/squashfs/decompressor_single.c
@@ -0,0 +1,85 @@
1/*
2 * Copyright (c) 2013
3 * Phillip Lougher <phillip@squashfs.org.uk>
4 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/types.h>
10#include <linux/mutex.h>
11#include <linux/slab.h>
12#include <linux/buffer_head.h>
13
14#include "squashfs_fs.h"
15#include "squashfs_fs_sb.h"
16#include "decompressor.h"
17#include "squashfs.h"
18
19/*
20 * This file implements single-threaded decompression in the
21 * decompressor framework
22 */
23
24struct squashfs_stream {
25 void *stream;
26 struct mutex mutex;
27};
28
29void *squashfs_decompressor_create(struct squashfs_sb_info *msblk,
30 void *comp_opts)
31{
32 struct squashfs_stream *stream;
33 int err = -ENOMEM;
34
35 stream = kmalloc(sizeof(*stream), GFP_KERNEL);
36 if (stream == NULL)
37 goto out;
38
39 stream->stream = msblk->decompressor->init(msblk, comp_opts);
40 if (IS_ERR(stream->stream)) {
41 err = PTR_ERR(stream->stream);
42 goto out;
43 }
44
45 kfree(comp_opts);
46 mutex_init(&stream->mutex);
47 return stream;
48
49out:
50 kfree(stream);
51 return ERR_PTR(err);
52}
53
54void squashfs_decompressor_destroy(struct squashfs_sb_info *msblk)
55{
56 struct squashfs_stream *stream = msblk->stream;
57
58 if (stream) {
59 msblk->decompressor->free(stream->stream);
60 kfree(stream);
61 }
62}
63
64int squashfs_decompress(struct squashfs_sb_info *msblk, struct buffer_head **bh,
65 int b, int offset, int length, struct squashfs_page_actor *output)
66{
67 int res;
68 struct squashfs_stream *stream = msblk->stream;
69
70 mutex_lock(&stream->mutex);
71 res = msblk->decompressor->decompress(msblk, stream->stream, bh, b,
72 offset, length, output);
73 mutex_unlock(&stream->mutex);
74
75 if (res < 0)
76 ERROR("%s decompression failed, data probably corrupt\n",
77 msblk->decompressor->name);
78
79 return res;
80}
81
82int squashfs_max_decompressors(void)
83{
84 return 1;
85}
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 8ca62c28fe12..e5c9689062ba 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -370,77 +370,15 @@ static int read_blocklist(struct inode *inode, int index, u64 *block)
370 return le32_to_cpu(size); 370 return le32_to_cpu(size);
371} 371}
372 372
373 373/* Copy data into page cache */
374static int squashfs_readpage(struct file *file, struct page *page) 374void squashfs_copy_cache(struct page *page, struct squashfs_cache_entry *buffer,
375 int bytes, int offset)
375{ 376{
376 struct inode *inode = page->mapping->host; 377 struct inode *inode = page->mapping->host;
377 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; 378 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
378 int bytes, i, offset = 0, sparse = 0;
379 struct squashfs_cache_entry *buffer = NULL;
380 void *pageaddr; 379 void *pageaddr;
381 380 int i, mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
382 int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; 381 int start_index = page->index & ~mask, end_index = start_index | mask;
383 int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
384 int start_index = page->index & ~mask;
385 int end_index = start_index | mask;
386 int file_end = i_size_read(inode) >> msblk->block_log;
387
388 TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
389 page->index, squashfs_i(inode)->start);
390
391 if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
392 PAGE_CACHE_SHIFT))
393 goto out;
394
395 if (index < file_end || squashfs_i(inode)->fragment_block ==
396 SQUASHFS_INVALID_BLK) {
397 /*
398 * Reading a datablock from disk. Need to read block list
399 * to get location and block size.
400 */
401 u64 block = 0;
402 int bsize = read_blocklist(inode, index, &block);
403 if (bsize < 0)
404 goto error_out;
405
406 if (bsize == 0) { /* hole */
407 bytes = index == file_end ?
408 (i_size_read(inode) & (msblk->block_size - 1)) :
409 msblk->block_size;
410 sparse = 1;
411 } else {
412 /*
413 * Read and decompress datablock.
414 */
415 buffer = squashfs_get_datablock(inode->i_sb,
416 block, bsize);
417 if (buffer->error) {
418 ERROR("Unable to read page, block %llx, size %x"
419 "\n", block, bsize);
420 squashfs_cache_put(buffer);
421 goto error_out;
422 }
423 bytes = buffer->length;
424 }
425 } else {
426 /*
427 * Datablock is stored inside a fragment (tail-end packed
428 * block).
429 */
430 buffer = squashfs_get_fragment(inode->i_sb,
431 squashfs_i(inode)->fragment_block,
432 squashfs_i(inode)->fragment_size);
433
434 if (buffer->error) {
435 ERROR("Unable to read page, block %llx, size %x\n",
436 squashfs_i(inode)->fragment_block,
437 squashfs_i(inode)->fragment_size);
438 squashfs_cache_put(buffer);
439 goto error_out;
440 }
441 bytes = i_size_read(inode) & (msblk->block_size - 1);
442 offset = squashfs_i(inode)->fragment_offset;
443 }
444 382
445 /* 383 /*
446 * Loop copying datablock into pages. As the datablock likely covers 384 * Loop copying datablock into pages. As the datablock likely covers
@@ -451,7 +389,7 @@ static int squashfs_readpage(struct file *file, struct page *page)
451 for (i = start_index; i <= end_index && bytes > 0; i++, 389 for (i = start_index; i <= end_index && bytes > 0; i++,
452 bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { 390 bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
453 struct page *push_page; 391 struct page *push_page;
454 int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); 392 int avail = buffer ? min_t(int, bytes, PAGE_CACHE_SIZE) : 0;
455 393
456 TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); 394 TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
457 395
@@ -475,11 +413,75 @@ skip_page:
475 if (i != page->index) 413 if (i != page->index)
476 page_cache_release(push_page); 414 page_cache_release(push_page);
477 } 415 }
416}
417
418/* Read datablock stored packed inside a fragment (tail-end packed block) */
419static int squashfs_readpage_fragment(struct page *page)
420{
421 struct inode *inode = page->mapping->host;
422 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
423 struct squashfs_cache_entry *buffer = squashfs_get_fragment(inode->i_sb,
424 squashfs_i(inode)->fragment_block,
425 squashfs_i(inode)->fragment_size);
426 int res = buffer->error;
427
428 if (res)
429 ERROR("Unable to read page, block %llx, size %x\n",
430 squashfs_i(inode)->fragment_block,
431 squashfs_i(inode)->fragment_size);
432 else
433 squashfs_copy_cache(page, buffer, i_size_read(inode) &
434 (msblk->block_size - 1),
435 squashfs_i(inode)->fragment_offset);
436
437 squashfs_cache_put(buffer);
438 return res;
439}
478 440
479 if (!sparse) 441static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
480 squashfs_cache_put(buffer); 442{
443 struct inode *inode = page->mapping->host;
444 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
445 int bytes = index == file_end ?
446 (i_size_read(inode) & (msblk->block_size - 1)) :
447 msblk->block_size;
481 448
449 squashfs_copy_cache(page, NULL, bytes, 0);
482 return 0; 450 return 0;
451}
452
453static int squashfs_readpage(struct file *file, struct page *page)
454{
455 struct inode *inode = page->mapping->host;
456 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
457 int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
458 int file_end = i_size_read(inode) >> msblk->block_log;
459 int res;
460 void *pageaddr;
461
462 TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
463 page->index, squashfs_i(inode)->start);
464
465 if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
466 PAGE_CACHE_SHIFT))
467 goto out;
468
469 if (index < file_end || squashfs_i(inode)->fragment_block ==
470 SQUASHFS_INVALID_BLK) {
471 u64 block = 0;
472 int bsize = read_blocklist(inode, index, &block);
473 if (bsize < 0)
474 goto error_out;
475
476 if (bsize == 0)
477 res = squashfs_readpage_sparse(page, index, file_end);
478 else
479 res = squashfs_readpage_block(page, block, bsize);
480 } else
481 res = squashfs_readpage_fragment(page);
482
483 if (!res)
484 return 0;
483 485
484error_out: 486error_out:
485 SetPageError(page); 487 SetPageError(page);
diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c
new file mode 100644
index 000000000000..f2310d2a2019
--- /dev/null
+++ b/fs/squashfs/file_cache.c
@@ -0,0 +1,38 @@
1/*
2 * Copyright (c) 2013
3 * Phillip Lougher <phillip@squashfs.org.uk>
4 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/fs.h>
10#include <linux/vfs.h>
11#include <linux/kernel.h>
12#include <linux/slab.h>
13#include <linux/string.h>
14#include <linux/pagemap.h>
15#include <linux/mutex.h>
16
17#include "squashfs_fs.h"
18#include "squashfs_fs_sb.h"
19#include "squashfs_fs_i.h"
20#include "squashfs.h"
21
22/* Read separately compressed datablock and memcopy into page cache */
23int squashfs_readpage_block(struct page *page, u64 block, int bsize)
24{
25 struct inode *i = page->mapping->host;
26 struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
27 block, bsize);
28 int res = buffer->error;
29
30 if (res)
31 ERROR("Unable to read page, block %llx, size %x\n", block,
32 bsize);
33 else
34 squashfs_copy_cache(page, buffer, buffer->length, 0);
35
36 squashfs_cache_put(buffer);
37 return res;
38}
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
new file mode 100644
index 000000000000..62a0de6632e1
--- /dev/null
+++ b/fs/squashfs/file_direct.c
@@ -0,0 +1,176 @@
1/*
2 * Copyright (c) 2013
3 * Phillip Lougher <phillip@squashfs.org.uk>
4 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/fs.h>
10#include <linux/vfs.h>
11#include <linux/kernel.h>
12#include <linux/slab.h>
13#include <linux/string.h>
14#include <linux/pagemap.h>
15#include <linux/mutex.h>
16
17#include "squashfs_fs.h"
18#include "squashfs_fs_sb.h"
19#include "squashfs_fs_i.h"
20#include "squashfs.h"
21#include "page_actor.h"
22
23static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
24 int pages, struct page **page);
25
26/* Read separately compressed datablock directly into page cache */
27int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
28
29{
30 struct inode *inode = target_page->mapping->host;
31 struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
32
33 int file_end = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
34 int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
35 int start_index = target_page->index & ~mask;
36 int end_index = start_index | mask;
37 int i, n, pages, missing_pages, bytes, res = -ENOMEM;
38 struct page **page;
39 struct squashfs_page_actor *actor;
40 void *pageaddr;
41
42 if (end_index > file_end)
43 end_index = file_end;
44
45 pages = end_index - start_index + 1;
46
47 page = kmalloc(sizeof(void *) * pages, GFP_KERNEL);
48 if (page == NULL)
49 return res;
50
51 /*
52 * Create a "page actor" which will kmap and kunmap the
53 * page cache pages appropriately within the decompressor
54 */
55 actor = squashfs_page_actor_init_special(page, pages, 0);
56 if (actor == NULL)
57 goto out;
58
59 /* Try to grab all the pages covered by the Squashfs block */
60 for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
61 page[i] = (n == target_page->index) ? target_page :
62 grab_cache_page_nowait(target_page->mapping, n);
63
64 if (page[i] == NULL) {
65 missing_pages++;
66 continue;
67 }
68
69 if (PageUptodate(page[i])) {
70 unlock_page(page[i]);
71 page_cache_release(page[i]);
72 page[i] = NULL;
73 missing_pages++;
74 }
75 }
76
77 if (missing_pages) {
78 /*
 79 * Couldn't get one or more pages: either this page had been
 80 * reclaimed by the VM while the others are still in the page
 81 * cache and uptodate, or we're racing with another thread in
 82 * squashfs_readpage also trying to grab them. Fall back to
 83 * using an intermediate buffer.
84 */
85 res = squashfs_read_cache(target_page, block, bsize, pages,
86 page);
87 if (res < 0)
88 goto mark_errored;
89
90 goto out;
91 }
92
93 /* Decompress directly into the page cache buffers */
94 res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
95 if (res < 0)
96 goto mark_errored;
97
98 /* Last page may have trailing bytes not filled */
99 bytes = res % PAGE_CACHE_SIZE;
100 if (bytes) {
101 pageaddr = kmap_atomic(page[pages - 1]);
102 memset(pageaddr + bytes, 0, PAGE_CACHE_SIZE - bytes);
103 kunmap_atomic(pageaddr);
104 }
105
106 /* Mark pages as uptodate, unlock and release */
107 for (i = 0; i < pages; i++) {
108 flush_dcache_page(page[i]);
109 SetPageUptodate(page[i]);
110 unlock_page(page[i]);
111 if (page[i] != target_page)
112 page_cache_release(page[i]);
113 }
114
115 kfree(actor);
116 kfree(page);
117
118 return 0;
119
120mark_errored:
 121 /* Decompression failed; mark the pages as errored. The
 122 * target page is dealt with by the caller.
123 */
124 for (i = 0; i < pages; i++) {
125 if (page[i] == NULL || page[i] == target_page)
126 continue;
127 flush_dcache_page(page[i]);
128 SetPageError(page[i]);
129 unlock_page(page[i]);
130 page_cache_release(page[i]);
131 }
132
133out:
134 kfree(actor);
135 kfree(page);
136 return res;
137}
138
139
140static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
141 int pages, struct page **page)
142{
143 struct inode *i = target_page->mapping->host;
144 struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
145 block, bsize);
146 int bytes = buffer->length, res = buffer->error, n, offset = 0;
147 void *pageaddr;
148
149 if (res) {
150 ERROR("Unable to read page, block %llx, size %x\n", block,
151 bsize);
152 goto out;
153 }
154
155 for (n = 0; n < pages && bytes > 0; n++,
156 bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
157 int avail = min_t(int, bytes, PAGE_CACHE_SIZE);
158
159 if (page[n] == NULL)
160 continue;
161
162 pageaddr = kmap_atomic(page[n]);
163 squashfs_copy_data(pageaddr, buffer, offset, avail);
164 memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
165 kunmap_atomic(pageaddr);
166 flush_dcache_page(page[n]);
167 SetPageUptodate(page[n]);
168 unlock_page(page[n]);
169 if (page[n] != target_page)
170 page_cache_release(page[n]);
171 }
172
173out:
174 squashfs_cache_put(buffer);
175 return res;
176}
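
For the trailing-byte zeroing in squashfs_readpage_block() above, a quick worked example with assumed sizes:

/*
 * Assume PAGE_CACHE_SIZE = 4096 and a block spanning pages = 4 page cache
 * pages that decompresses to res = 13000 bytes.  Then
 *   bytes = 13000 % 4096 = 712
 * so the final 4096 - 712 = 3384 bytes of page[3] are zeroed before the
 * pages are flushed, marked uptodate and unlocked.
 */
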
diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c
index 00f4dfc5f088..244b9fbfff7b 100644
--- a/fs/squashfs/lzo_wrapper.c
+++ b/fs/squashfs/lzo_wrapper.c
@@ -31,13 +31,14 @@
31#include "squashfs_fs_sb.h" 31#include "squashfs_fs_sb.h"
32#include "squashfs.h" 32#include "squashfs.h"
33#include "decompressor.h" 33#include "decompressor.h"
34#include "page_actor.h"
34 35
35struct squashfs_lzo { 36struct squashfs_lzo {
36 void *input; 37 void *input;
37 void *output; 38 void *output;
38}; 39};
39 40
40static void *lzo_init(struct squashfs_sb_info *msblk, void *buff, int len) 41static void *lzo_init(struct squashfs_sb_info *msblk, void *buff)
41{ 42{
42 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); 43 int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE);
43 44
@@ -74,22 +75,16 @@ static void lzo_free(void *strm)
74} 75}
75 76
76 77
77static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer, 78static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
78 struct buffer_head **bh, int b, int offset, int length, int srclength, 79 struct buffer_head **bh, int b, int offset, int length,
79 int pages) 80 struct squashfs_page_actor *output)
80{ 81{
81 struct squashfs_lzo *stream = msblk->stream; 82 struct squashfs_lzo *stream = strm;
82 void *buff = stream->input; 83 void *buff = stream->input, *data;
83 int avail, i, bytes = length, res; 84 int avail, i, bytes = length, res;
84 size_t out_len = srclength; 85 size_t out_len = output->length;
85
86 mutex_lock(&msblk->read_data_mutex);
87 86
88 for (i = 0; i < b; i++) { 87 for (i = 0; i < b; i++) {
89 wait_on_buffer(bh[i]);
90 if (!buffer_uptodate(bh[i]))
91 goto block_release;
92
93 avail = min(bytes, msblk->devblksize - offset); 88 avail = min(bytes, msblk->devblksize - offset);
94 memcpy(buff, bh[i]->b_data + offset, avail); 89 memcpy(buff, bh[i]->b_data + offset, avail);
95 buff += avail; 90 buff += avail;
@@ -104,24 +99,24 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void **buffer,
104 goto failed; 99 goto failed;
105 100
106 res = bytes = (int)out_len; 101 res = bytes = (int)out_len;
107 for (i = 0, buff = stream->output; bytes && i < pages; i++) { 102 data = squashfs_first_page(output);
108 avail = min_t(int, bytes, PAGE_CACHE_SIZE); 103 buff = stream->output;
109 memcpy(buffer[i], buff, avail); 104 while (data) {
110 buff += avail; 105 if (bytes <= PAGE_CACHE_SIZE) {
111 bytes -= avail; 106 memcpy(data, buff, bytes);
107 break;
108 } else {
109 memcpy(data, buff, PAGE_CACHE_SIZE);
110 buff += PAGE_CACHE_SIZE;
111 bytes -= PAGE_CACHE_SIZE;
112 data = squashfs_next_page(output);
113 }
112 } 114 }
115 squashfs_finish_page(output);
113 116
114 mutex_unlock(&msblk->read_data_mutex);
115 return res; 117 return res;
116 118
117block_release:
118 for (; i < b; i++)
119 put_bh(bh[i]);
120
121failed: 119failed:
122 mutex_unlock(&msblk->read_data_mutex);
123
124 ERROR("lzo decompression failed, data probably corrupt\n");
125 return -EIO; 120 return -EIO;
126} 121}
127 122
diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c
new file mode 100644
index 000000000000..5a1c11f56441
--- /dev/null
+++ b/fs/squashfs/page_actor.c
@@ -0,0 +1,100 @@
1/*
2 * Copyright (c) 2013
3 * Phillip Lougher <phillip@squashfs.org.uk>
4 *
5 * This work is licensed under the terms of the GNU GPL, version 2. See
6 * the COPYING file in the top-level directory.
7 */
8
9#include <linux/kernel.h>
10#include <linux/slab.h>
11#include <linux/pagemap.h>
12#include "page_actor.h"
13
14/*
15 * This file contains implementations of page_actor for decompressing into
16 * an intermediate buffer, and for decompressing directly into the
17 * page cache.
18 *
19 * Calling code should avoid sleeping between calls to squashfs_first_page()
20 * and squashfs_finish_page().
21 */
22
23/* Implementation of page_actor for decompressing into intermediate buffer */
24static void *cache_first_page(struct squashfs_page_actor *actor)
25{
26 actor->next_page = 1;
27 return actor->buffer[0];
28}
29
30static void *cache_next_page(struct squashfs_page_actor *actor)
31{
32 if (actor->next_page == actor->pages)
33 return NULL;
34
35 return actor->buffer[actor->next_page++];
36}
37
38static void cache_finish_page(struct squashfs_page_actor *actor)
39{
40 /* empty */
41}
42
43struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
44 int pages, int length)
45{
46 struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
47
48 if (actor == NULL)
49 return NULL;
50
51 actor->length = length ? : pages * PAGE_CACHE_SIZE;
52 actor->buffer = buffer;
53 actor->pages = pages;
54 actor->next_page = 0;
55 actor->squashfs_first_page = cache_first_page;
56 actor->squashfs_next_page = cache_next_page;
57 actor->squashfs_finish_page = cache_finish_page;
58 return actor;
59}
60
61/* Implementation of page_actor for decompressing directly into page cache. */
62static void *direct_first_page(struct squashfs_page_actor *actor)
63{
64 actor->next_page = 1;
65 return actor->pageaddr = kmap_atomic(actor->page[0]);
66}
67
68static void *direct_next_page(struct squashfs_page_actor *actor)
69{
70 if (actor->pageaddr)
71 kunmap_atomic(actor->pageaddr);
72
73 return actor->pageaddr = actor->next_page == actor->pages ? NULL :
74 kmap_atomic(actor->page[actor->next_page++]);
75}
76
77static void direct_finish_page(struct squashfs_page_actor *actor)
78{
79 if (actor->pageaddr)
80 kunmap_atomic(actor->pageaddr);
81}
82
83struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
84 int pages, int length)
85{
86 struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
87
88 if (actor == NULL)
89 return NULL;
90
91 actor->length = length ? : pages * PAGE_CACHE_SIZE;
92 actor->page = page;
93 actor->pages = pages;
94 actor->next_page = 0;
95 actor->pageaddr = NULL;
96 actor->squashfs_first_page = direct_first_page;
97 actor->squashfs_next_page = direct_next_page;
98 actor->squashfs_finish_page = direct_finish_page;
99 return actor;
100}
diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h
new file mode 100644
index 000000000000..26dd82008b82
--- /dev/null
+++ b/fs/squashfs/page_actor.h
@@ -0,0 +1,81 @@
1#ifndef PAGE_ACTOR_H
2#define PAGE_ACTOR_H
3/*
4 * Copyright (c) 2013
5 * Phillip Lougher <phillip@squashfs.org.uk>
6 *
7 * This work is licensed under the terms of the GNU GPL, version 2. See
8 * the COPYING file in the top-level directory.
9 */
10
11#ifndef CONFIG_SQUASHFS_FILE_DIRECT
12struct squashfs_page_actor {
13 void **page;
14 int pages;
15 int length;
16 int next_page;
17};
18
19static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
20 int pages, int length)
21{
22 struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
23
24 if (actor == NULL)
25 return NULL;
26
27 actor->length = length ? : pages * PAGE_CACHE_SIZE;
28 actor->page = page;
29 actor->pages = pages;
30 actor->next_page = 0;
31 return actor;
32}
33
34static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
35{
36 actor->next_page = 1;
37 return actor->page[0];
38}
39
40static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
41{
42 return actor->next_page == actor->pages ? NULL :
43 actor->page[actor->next_page++];
44}
45
46static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
47{
48 /* empty */
49}
50#else
51struct squashfs_page_actor {
52 union {
53 void **buffer;
54 struct page **page;
55 };
56 void *pageaddr;
57 void *(*squashfs_first_page)(struct squashfs_page_actor *);
58 void *(*squashfs_next_page)(struct squashfs_page_actor *);
59 void (*squashfs_finish_page)(struct squashfs_page_actor *);
60 int pages;
61 int length;
62 int next_page;
63};
64
65extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
66extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
67 **, int, int);
68static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
69{
70 return actor->squashfs_first_page(actor);
71}
72static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
73{
74 return actor->squashfs_next_page(actor);
75}
76static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
77{
78 actor->squashfs_finish_page(actor);
79}
80#endif
81#endif
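
Taken together, the two page_actor variants give decompressors one loop shape regardless of whether the output is an intermediate buffer or kmapped page cache pages. A hedged sketch of that loop with a hypothetical helper name (not part of the patch); compare the lzo hunk above and the xz/zlib hunks below:

/* Sketch only: how a decompressor walks an actor's output pages. */
static int example_fill_actor(struct squashfs_page_actor *actor,
			      const char *src, int bytes)
{
	int total = 0;
	void *dst = squashfs_first_page(actor);

	while (dst != NULL && bytes > 0) {
		int avail = min_t(int, bytes, PAGE_CACHE_SIZE);

		memcpy(dst, src + total, avail);
		total += avail;
		bytes -= avail;
		dst = squashfs_next_page(actor);
	}
	squashfs_finish_page(actor);	/* kunmaps the last page in the direct variant */
	return total;
}
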
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index d1266516ed08..9e1bb79f7e6f 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -28,8 +28,8 @@
28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) 28#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args)
29 29
30/* block.c */ 30/* block.c */
31extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, 31extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
32 int, int); 32 struct squashfs_page_actor *);
33 33
34/* cache.c */ 34/* cache.c */
35extern struct squashfs_cache *squashfs_cache_init(char *, int, int); 35extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
@@ -48,7 +48,14 @@ extern void *squashfs_read_table(struct super_block *, u64, int);
48 48
49/* decompressor.c */ 49/* decompressor.c */
50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int); 50extern const struct squashfs_decompressor *squashfs_lookup_decompressor(int);
51extern void *squashfs_decompressor_init(struct super_block *, unsigned short); 51extern void *squashfs_decompressor_setup(struct super_block *, unsigned short);
52
53/* decompressor_xxx.c */
54extern void *squashfs_decompressor_create(struct squashfs_sb_info *, void *);
55extern void squashfs_decompressor_destroy(struct squashfs_sb_info *);
56extern int squashfs_decompress(struct squashfs_sb_info *, struct buffer_head **,
57 int, int, int, struct squashfs_page_actor *);
58extern int squashfs_max_decompressors(void);
52 59
53/* export.c */ 60/* export.c */
54extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64, 61extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, u64,
@@ -59,6 +66,13 @@ extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *);
59extern __le64 *squashfs_read_fragment_index_table(struct super_block *, 66extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
60 u64, u64, unsigned int); 67 u64, u64, unsigned int);
61 68
69/* file.c */
70void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
71 int);
72
73/* file_xxx.c */
74extern int squashfs_readpage_block(struct page *, u64, int);
75
62/* id.c */ 76/* id.c */
63extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); 77extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
64extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64, 78extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, u64,
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 52934a22f296..1da565cb50c3 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -50,6 +50,7 @@ struct squashfs_cache_entry {
50 wait_queue_head_t wait_queue; 50 wait_queue_head_t wait_queue;
51 struct squashfs_cache *cache; 51 struct squashfs_cache *cache;
52 void **data; 52 void **data;
53 struct squashfs_page_actor *actor;
53}; 54};
54 55
55struct squashfs_sb_info { 56struct squashfs_sb_info {
@@ -63,10 +64,9 @@ struct squashfs_sb_info {
63 __le64 *id_table; 64 __le64 *id_table;
64 __le64 *fragment_index; 65 __le64 *fragment_index;
65 __le64 *xattr_id_table; 66 __le64 *xattr_id_table;
66 struct mutex read_data_mutex;
67 struct mutex meta_index_mutex; 67 struct mutex meta_index_mutex;
68 struct meta_index *meta_index; 68 struct meta_index *meta_index;
69 void *stream; 69 struct squashfs_stream *stream;
70 __le64 *inode_lookup_table; 70 __le64 *inode_lookup_table;
71 u64 inode_table; 71 u64 inode_table;
72 u64 directory_table; 72 u64 directory_table;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 60553a9053ca..202df6312d4e 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -98,7 +98,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); 98 msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE);
99 msblk->devblksize_log2 = ffz(~msblk->devblksize); 99 msblk->devblksize_log2 = ffz(~msblk->devblksize);
100 100
101 mutex_init(&msblk->read_data_mutex);
102 mutex_init(&msblk->meta_index_mutex); 101 mutex_init(&msblk->meta_index_mutex);
103 102
104 /* 103 /*
@@ -206,13 +205,14 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
206 goto failed_mount; 205 goto failed_mount;
207 206
208 /* Allocate read_page block */ 207 /* Allocate read_page block */
209 msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); 208 msblk->read_page = squashfs_cache_init("data",
209 squashfs_max_decompressors(), msblk->block_size);
210 if (msblk->read_page == NULL) { 210 if (msblk->read_page == NULL) {
211 ERROR("Failed to allocate read_page block\n"); 211 ERROR("Failed to allocate read_page block\n");
212 goto failed_mount; 212 goto failed_mount;
213 } 213 }
214 214
215 msblk->stream = squashfs_decompressor_init(sb, flags); 215 msblk->stream = squashfs_decompressor_setup(sb, flags);
216 if (IS_ERR(msblk->stream)) { 216 if (IS_ERR(msblk->stream)) {
217 err = PTR_ERR(msblk->stream); 217 err = PTR_ERR(msblk->stream);
218 msblk->stream = NULL; 218 msblk->stream = NULL;
@@ -336,7 +336,7 @@ failed_mount:
336 squashfs_cache_delete(msblk->block_cache); 336 squashfs_cache_delete(msblk->block_cache);
337 squashfs_cache_delete(msblk->fragment_cache); 337 squashfs_cache_delete(msblk->fragment_cache);
338 squashfs_cache_delete(msblk->read_page); 338 squashfs_cache_delete(msblk->read_page);
339 squashfs_decompressor_free(msblk, msblk->stream); 339 squashfs_decompressor_destroy(msblk);
340 kfree(msblk->inode_lookup_table); 340 kfree(msblk->inode_lookup_table);
341 kfree(msblk->fragment_index); 341 kfree(msblk->fragment_index);
342 kfree(msblk->id_table); 342 kfree(msblk->id_table);
@@ -383,7 +383,7 @@ static void squashfs_put_super(struct super_block *sb)
383 squashfs_cache_delete(sbi->block_cache); 383 squashfs_cache_delete(sbi->block_cache);
384 squashfs_cache_delete(sbi->fragment_cache); 384 squashfs_cache_delete(sbi->fragment_cache);
385 squashfs_cache_delete(sbi->read_page); 385 squashfs_cache_delete(sbi->read_page);
386 squashfs_decompressor_free(sbi, sbi->stream); 386 squashfs_decompressor_destroy(sbi);
387 kfree(sbi->id_table); 387 kfree(sbi->id_table);
388 kfree(sbi->fragment_index); 388 kfree(sbi->fragment_index);
389 kfree(sbi->meta_index); 389 kfree(sbi->meta_index);
diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c
index 1760b7d108f6..c609624e4b8a 100644
--- a/fs/squashfs/xz_wrapper.c
+++ b/fs/squashfs/xz_wrapper.c
@@ -32,44 +32,70 @@
32#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
33#include "squashfs.h" 33#include "squashfs.h"
34#include "decompressor.h" 34#include "decompressor.h"
35#include "page_actor.h"
35 36
36struct squashfs_xz { 37struct squashfs_xz {
37 struct xz_dec *state; 38 struct xz_dec *state;
38 struct xz_buf buf; 39 struct xz_buf buf;
39}; 40};
40 41
41struct comp_opts { 42struct disk_comp_opts {
42 __le32 dictionary_size; 43 __le32 dictionary_size;
43 __le32 flags; 44 __le32 flags;
44}; 45};
45 46
46static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff, 47struct comp_opts {
47 int len) 48 int dict_size;
49};
50
51static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk,
52 void *buff, int len)
48{ 53{
49 struct comp_opts *comp_opts = buff; 54 struct disk_comp_opts *comp_opts = buff;
50 struct squashfs_xz *stream; 55 struct comp_opts *opts;
51 int dict_size = msblk->block_size; 56 int err = 0, n;
52 int err, n; 57
58 opts = kmalloc(sizeof(*opts), GFP_KERNEL);
59 if (opts == NULL) {
60 err = -ENOMEM;
61 goto out2;
62 }
53 63
54 if (comp_opts) { 64 if (comp_opts) {
55 /* check compressor options are the expected length */ 65 /* check compressor options are the expected length */
56 if (len < sizeof(*comp_opts)) { 66 if (len < sizeof(*comp_opts)) {
57 err = -EIO; 67 err = -EIO;
58 goto failed; 68 goto out;
59 } 69 }
60 70
61 dict_size = le32_to_cpu(comp_opts->dictionary_size); 71 opts->dict_size = le32_to_cpu(comp_opts->dictionary_size);
62 72
63 /* the dictionary size should be 2^n or 2^n+2^(n+1) */ 73 /* the dictionary size should be 2^n or 2^n+2^(n+1) */
64 n = ffs(dict_size) - 1; 74 n = ffs(opts->dict_size) - 1;
65 if (dict_size != (1 << n) && dict_size != (1 << n) + 75 if (opts->dict_size != (1 << n) && opts->dict_size != (1 << n) +
66 (1 << (n + 1))) { 76 (1 << (n + 1))) {
67 err = -EIO; 77 err = -EIO;
68 goto failed; 78 goto out;
69 } 79 }
70 } 80 } else
81 /* use defaults */
82 opts->dict_size = max_t(int, msblk->block_size,
83 SQUASHFS_METADATA_SIZE);
84
85 return opts;
86
87out:
88 kfree(opts);
89out2:
90 return ERR_PTR(err);
91}
92
71 93
72 dict_size = max_t(int, dict_size, SQUASHFS_METADATA_SIZE); 94static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff)
95{
96 struct comp_opts *comp_opts = buff;
97 struct squashfs_xz *stream;
98 int err;
73 99
74 stream = kmalloc(sizeof(*stream), GFP_KERNEL); 100 stream = kmalloc(sizeof(*stream), GFP_KERNEL);
75 if (stream == NULL) { 101 if (stream == NULL) {
@@ -77,7 +103,7 @@ static void *squashfs_xz_init(struct squashfs_sb_info *msblk, void *buff,
77 goto failed; 103 goto failed;
78 } 104 }
79 105
80 stream->state = xz_dec_init(XZ_PREALLOC, dict_size); 106 stream->state = xz_dec_init(XZ_PREALLOC, comp_opts->dict_size);
81 if (stream->state == NULL) { 107 if (stream->state == NULL) {
82 kfree(stream); 108 kfree(stream);
83 err = -ENOMEM; 109 err = -ENOMEM;
@@ -103,42 +129,37 @@ static void squashfs_xz_free(void *strm)
103} 129}
104 130
105 131
106static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer, 132static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
107 struct buffer_head **bh, int b, int offset, int length, int srclength, 133 struct buffer_head **bh, int b, int offset, int length,
108 int pages) 134 struct squashfs_page_actor *output)
109{ 135{
110 enum xz_ret xz_err; 136 enum xz_ret xz_err;
111 int avail, total = 0, k = 0, page = 0; 137 int avail, total = 0, k = 0;
112 struct squashfs_xz *stream = msblk->stream; 138 struct squashfs_xz *stream = strm;
113
114 mutex_lock(&msblk->read_data_mutex);
115 139
116 xz_dec_reset(stream->state); 140 xz_dec_reset(stream->state);
117 stream->buf.in_pos = 0; 141 stream->buf.in_pos = 0;
118 stream->buf.in_size = 0; 142 stream->buf.in_size = 0;
119 stream->buf.out_pos = 0; 143 stream->buf.out_pos = 0;
120 stream->buf.out_size = PAGE_CACHE_SIZE; 144 stream->buf.out_size = PAGE_CACHE_SIZE;
121 stream->buf.out = buffer[page++]; 145 stream->buf.out = squashfs_first_page(output);
122 146
123 do { 147 do {
124 if (stream->buf.in_pos == stream->buf.in_size && k < b) { 148 if (stream->buf.in_pos == stream->buf.in_size && k < b) {
125 avail = min(length, msblk->devblksize - offset); 149 avail = min(length, msblk->devblksize - offset);
126 length -= avail; 150 length -= avail;
127 wait_on_buffer(bh[k]);
128 if (!buffer_uptodate(bh[k]))
129 goto release_mutex;
130
131 stream->buf.in = bh[k]->b_data + offset; 151 stream->buf.in = bh[k]->b_data + offset;
132 stream->buf.in_size = avail; 152 stream->buf.in_size = avail;
133 stream->buf.in_pos = 0; 153 stream->buf.in_pos = 0;
134 offset = 0; 154 offset = 0;
135 } 155 }
136 156
137 if (stream->buf.out_pos == stream->buf.out_size 157 if (stream->buf.out_pos == stream->buf.out_size) {
138 && page < pages) { 158 stream->buf.out = squashfs_next_page(output);
139 stream->buf.out = buffer[page++]; 159 if (stream->buf.out != NULL) {
140 stream->buf.out_pos = 0; 160 stream->buf.out_pos = 0;
141 total += PAGE_CACHE_SIZE; 161 total += PAGE_CACHE_SIZE;
162 }
142 } 163 }
143 164
144 xz_err = xz_dec_run(stream->state, &stream->buf); 165 xz_err = xz_dec_run(stream->state, &stream->buf);
@@ -147,23 +168,14 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void **buffer,
147 put_bh(bh[k++]); 168 put_bh(bh[k++]);
148 } while (xz_err == XZ_OK); 169 } while (xz_err == XZ_OK);
149 170
150 if (xz_err != XZ_STREAM_END) { 171 squashfs_finish_page(output);
151 ERROR("xz_dec_run error, data probably corrupt\n");
152 goto release_mutex;
153 }
154
155 if (k < b) {
156 ERROR("xz_uncompress error, input remaining\n");
157 goto release_mutex;
158 }
159 172
160 total += stream->buf.out_pos; 173 if (xz_err != XZ_STREAM_END || k < b)
161 mutex_unlock(&msblk->read_data_mutex); 174 goto out;
162 return total;
163 175
164release_mutex: 176 return total + stream->buf.out_pos;
165 mutex_unlock(&msblk->read_data_mutex);
166 177
178out:
167 for (; k < b; k++) 179 for (; k < b; k++)
168 put_bh(bh[k]); 180 put_bh(bh[k]);
169 181
@@ -172,6 +184,7 @@ release_mutex:
172 184
173const struct squashfs_decompressor squashfs_xz_comp_ops = { 185const struct squashfs_decompressor squashfs_xz_comp_ops = {
174 .init = squashfs_xz_init, 186 .init = squashfs_xz_init,
187 .comp_opts = squashfs_xz_comp_opts,
175 .free = squashfs_xz_free, 188 .free = squashfs_xz_free,
176 .decompress = squashfs_xz_uncompress, 189 .decompress = squashfs_xz_uncompress,
177 .id = XZ_COMPRESSION, 190 .id = XZ_COMPRESSION,
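
The dictionary-size validation above accepts only sizes that are a power of two or three times a power of two (2^n or 2^n + 2^(n+1)). The same test, pulled out into a standalone sketch for clarity (hypothetical helper, not part of the patch):

/* Returns non-zero if size is 2^n or 2^n + 2^(n+1), e.g. 8192 or 12288. */
static int dict_size_valid(int size)
{
	int n = ffs(size) - 1;		/* index of the lowest set bit */

	if (size <= 0)
		return 0;
	return size == (1 << n) || size == (1 << n) + (1 << (n + 1));
}
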
diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c
index 55d918fd2d86..8727caba6882 100644
--- a/fs/squashfs/zlib_wrapper.c
+++ b/fs/squashfs/zlib_wrapper.c
@@ -32,8 +32,9 @@
32#include "squashfs_fs_sb.h" 32#include "squashfs_fs_sb.h"
33#include "squashfs.h" 33#include "squashfs.h"
34#include "decompressor.h" 34#include "decompressor.h"
35#include "page_actor.h"
35 36
36static void *zlib_init(struct squashfs_sb_info *dummy, void *buff, int len) 37static void *zlib_init(struct squashfs_sb_info *dummy, void *buff)
37{ 38{
38 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL); 39 z_stream *stream = kmalloc(sizeof(z_stream), GFP_KERNEL);
39 if (stream == NULL) 40 if (stream == NULL)
@@ -61,44 +62,37 @@ static void zlib_free(void *strm)
61} 62}
62 63
63 64
64static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, 65static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
65 struct buffer_head **bh, int b, int offset, int length, int srclength, 66 struct buffer_head **bh, int b, int offset, int length,
66 int pages) 67 struct squashfs_page_actor *output)
67{ 68{
68 int zlib_err, zlib_init = 0; 69 int zlib_err, zlib_init = 0, k = 0;
69 int k = 0, page = 0; 70 z_stream *stream = strm;
70 z_stream *stream = msblk->stream;
71
72 mutex_lock(&msblk->read_data_mutex);
73 71
74 stream->avail_out = 0; 72 stream->avail_out = PAGE_CACHE_SIZE;
73 stream->next_out = squashfs_first_page(output);
75 stream->avail_in = 0; 74 stream->avail_in = 0;
76 75
77 do { 76 do {
78 if (stream->avail_in == 0 && k < b) { 77 if (stream->avail_in == 0 && k < b) {
79 int avail = min(length, msblk->devblksize - offset); 78 int avail = min(length, msblk->devblksize - offset);
80 length -= avail; 79 length -= avail;
81 wait_on_buffer(bh[k]);
82 if (!buffer_uptodate(bh[k]))
83 goto release_mutex;
84
85 stream->next_in = bh[k]->b_data + offset; 80 stream->next_in = bh[k]->b_data + offset;
86 stream->avail_in = avail; 81 stream->avail_in = avail;
87 offset = 0; 82 offset = 0;
88 } 83 }
89 84
90 if (stream->avail_out == 0 && page < pages) { 85 if (stream->avail_out == 0) {
91 stream->next_out = buffer[page++]; 86 stream->next_out = squashfs_next_page(output);
92 stream->avail_out = PAGE_CACHE_SIZE; 87 if (stream->next_out != NULL)
88 stream->avail_out = PAGE_CACHE_SIZE;
93 } 89 }
94 90
95 if (!zlib_init) { 91 if (!zlib_init) {
96 zlib_err = zlib_inflateInit(stream); 92 zlib_err = zlib_inflateInit(stream);
97 if (zlib_err != Z_OK) { 93 if (zlib_err != Z_OK) {
98 ERROR("zlib_inflateInit returned unexpected " 94 squashfs_finish_page(output);
99 "result 0x%x, srclength %d\n", 95 goto out;
100 zlib_err, srclength);
101 goto release_mutex;
102 } 96 }
103 zlib_init = 1; 97 zlib_init = 1;
104 } 98 }
@@ -109,29 +103,21 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer,
109 put_bh(bh[k++]); 103 put_bh(bh[k++]);
110 } while (zlib_err == Z_OK); 104 } while (zlib_err == Z_OK);
111 105
112 if (zlib_err != Z_STREAM_END) { 106 squashfs_finish_page(output);
113 ERROR("zlib_inflate error, data probably corrupt\n");
114 goto release_mutex;
115 }
116 107
117 zlib_err = zlib_inflateEnd(stream); 108 if (zlib_err != Z_STREAM_END)
118 if (zlib_err != Z_OK) { 109 goto out;
119 ERROR("zlib_inflate error, data probably corrupt\n");
120 goto release_mutex;
121 }
122 110
123 if (k < b) { 111 zlib_err = zlib_inflateEnd(stream);
124 ERROR("zlib_uncompress error, data remaining\n"); 112 if (zlib_err != Z_OK)
125 goto release_mutex; 113 goto out;
126 }
127 114
128 length = stream->total_out; 115 if (k < b)
129 mutex_unlock(&msblk->read_data_mutex); 116 goto out;
130 return length;
131 117
132release_mutex: 118 return stream->total_out;
133 mutex_unlock(&msblk->read_data_mutex);
134 119
120out:
135 for (; k < b; k++) 121 for (; k < b; k++)
136 put_bh(bh[k]); 122 put_bh(bh[k]);
137 123
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 79b5da2acbe1..b94f93685093 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -609,7 +609,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
609 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata; 609 struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
610 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj; 610 struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
611 struct sysfs_open_file *of; 611 struct sysfs_open_file *of;
612 bool has_read, has_write; 612 bool has_read, has_write, has_mmap;
613 int error = -EACCES; 613 int error = -EACCES;
614 614
615 /* need attr_sd for attr and ops, its parent for kobj */ 615 /* need attr_sd for attr and ops, its parent for kobj */
@@ -621,6 +621,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
621 621
622 has_read = battr->read || battr->mmap; 622 has_read = battr->read || battr->mmap;
623 has_write = battr->write || battr->mmap; 623 has_write = battr->write || battr->mmap;
624 has_mmap = battr->mmap;
624 } else { 625 } else {
625 const struct sysfs_ops *ops = sysfs_file_ops(attr_sd); 626 const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
626 627
@@ -632,6 +633,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
632 633
633 has_read = ops->show; 634 has_read = ops->show;
634 has_write = ops->store; 635 has_write = ops->store;
636 has_mmap = false;
635 } 637 }
636 638
637 /* check perms and supported operations */ 639 /* check perms and supported operations */
@@ -649,7 +651,23 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
649 if (!of) 651 if (!of)
650 goto err_out; 652 goto err_out;
651 653
652 mutex_init(&of->mutex); 654 /*
655 * The following is done to give a different lockdep key to
656 * @of->mutex for files which implement mmap. This is a rather
657 * crude way to avoid false positive lockdep warning around
658 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
659 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
660 * which mm->mmap_sem nests, while holding @of->mutex. As each
661 * open file has a separate mutex, it's okay as long as those don't
662 * happen on the same file. At this point, we can't easily give
663 * each file a separate locking class. Let's differentiate on
664 * whether the file has mmap or not for now.
665 */
666 if (has_mmap)
667 mutex_init(&of->mutex);
668 else
669 mutex_init(&of->mutex);
670
653 of->sd = attr_sd; 671 of->sd = attr_sd;
654 of->file = file; 672 of->file = file;
655 673
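
The two textually identical mutex_init() calls above are not a mistake: mutex_init() is a macro that declares a static lock_class_key at each expansion site, so each call site gets its own lockdep class. Roughly the shape of the definition (paraphrased from include/linux/mutex.h, shown here only for context):

#define mutex_init(mutex)						\
do {									\
	static struct lock_class_key __key;				\
									\
	__mutex_init((mutex), #mutex, &__key);				\
} while (0)
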
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 1c02da8bb7df..3ef11b22e750 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1137,6 +1137,7 @@ xfs_bmap_add_attrfork(
1137 int committed; /* xaction was committed */ 1137 int committed; /* xaction was committed */
1138 int logflags; /* logging flags */ 1138 int logflags; /* logging flags */
1139 int error; /* error return value */ 1139 int error; /* error return value */
1140 int cancel_flags = 0;
1140 1141
1141 ASSERT(XFS_IFORK_Q(ip) == 0); 1142 ASSERT(XFS_IFORK_Q(ip) == 0);
1142 1143
@@ -1147,19 +1148,20 @@ xfs_bmap_add_attrfork(
1147 if (rsvd) 1148 if (rsvd)
1148 tp->t_flags |= XFS_TRANS_RESERVE; 1149 tp->t_flags |= XFS_TRANS_RESERVE;
1149 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); 1150 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0);
1150 if (error) 1151 if (error) {
1151 goto error0; 1152 xfs_trans_cancel(tp, 0);
1153 return error;
1154 }
1155 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1152 xfs_ilock(ip, XFS_ILOCK_EXCL); 1156 xfs_ilock(ip, XFS_ILOCK_EXCL);
1153 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 1157 error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ?
1154 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : 1158 XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
1155 XFS_QMOPT_RES_REGBLKS); 1159 XFS_QMOPT_RES_REGBLKS);
1156 if (error) { 1160 if (error)
1157 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1161 goto trans_cancel;
1158 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); 1162 cancel_flags |= XFS_TRANS_ABORT;
1159 return error;
1160 }
1161 if (XFS_IFORK_Q(ip)) 1163 if (XFS_IFORK_Q(ip))
1162 goto error1; 1164 goto trans_cancel;
1163 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { 1165 if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) {
1164 /* 1166 /*
1165 * For inodes coming from pre-6.2 filesystems. 1167 * For inodes coming from pre-6.2 filesystems.
@@ -1169,7 +1171,7 @@ xfs_bmap_add_attrfork(
1169 } 1171 }
1170 ASSERT(ip->i_d.di_anextents == 0); 1172 ASSERT(ip->i_d.di_anextents == 0);
1171 1173
1172 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); 1174 xfs_trans_ijoin(tp, ip, 0);
1173 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1175 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1174 1176
1175 switch (ip->i_d.di_format) { 1177 switch (ip->i_d.di_format) {
@@ -1191,7 +1193,7 @@ xfs_bmap_add_attrfork(
1191 default: 1193 default:
1192 ASSERT(0); 1194 ASSERT(0);
1193 error = XFS_ERROR(EINVAL); 1195 error = XFS_ERROR(EINVAL);
1194 goto error1; 1196 goto trans_cancel;
1195 } 1197 }
1196 1198
1197 ASSERT(ip->i_afp == NULL); 1199 ASSERT(ip->i_afp == NULL);
@@ -1219,7 +1221,7 @@ xfs_bmap_add_attrfork(
1219 if (logflags) 1221 if (logflags)
1220 xfs_trans_log_inode(tp, ip, logflags); 1222 xfs_trans_log_inode(tp, ip, logflags);
1221 if (error) 1223 if (error)
1222 goto error2; 1224 goto bmap_cancel;
1223 if (!xfs_sb_version_hasattr(&mp->m_sb) || 1225 if (!xfs_sb_version_hasattr(&mp->m_sb) ||
1224 (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { 1226 (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
1225 __int64_t sbfields = 0; 1227 __int64_t sbfields = 0;
@@ -1242,14 +1244,16 @@ xfs_bmap_add_attrfork(
1242 1244
1243 error = xfs_bmap_finish(&tp, &flist, &committed); 1245 error = xfs_bmap_finish(&tp, &flist, &committed);
1244 if (error) 1246 if (error)
1245 goto error2; 1247 goto bmap_cancel;
1246 return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1248 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1247error2: 1249 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1250 return error;
1251
1252bmap_cancel:
1248 xfs_bmap_cancel(&flist); 1253 xfs_bmap_cancel(&flist);
1249error1: 1254trans_cancel:
1255 xfs_trans_cancel(tp, cancel_flags);
1250 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1256 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1251error0:
1252 xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1253 return error; 1257 return error;
1254} 1258}
1255 1259
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8367d6dc18c9..4f11ef011139 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -157,7 +157,7 @@ xfs_ioc_trim(
157 struct xfs_mount *mp, 157 struct xfs_mount *mp,
158 struct fstrim_range __user *urange) 158 struct fstrim_range __user *urange)
159{ 159{
160 struct request_queue *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue; 160 struct request_queue *q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
161 unsigned int granularity = q->limits.discard_granularity; 161 unsigned int granularity = q->limits.discard_granularity;
162 struct fstrim_range range; 162 struct fstrim_range range;
163 xfs_daddr_t start, end, minlen; 163 xfs_daddr_t start, end, minlen;
@@ -180,7 +180,8 @@ xfs_ioc_trim(
180 * matter as trimming blocks is an advisory interface. 180 * matter as trimming blocks is an advisory interface.
181 */ 181 */
182 if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) || 182 if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
183 range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp))) 183 range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
184 range.len < mp->m_sb.sb_blocksize)
184 return -XFS_ERROR(EINVAL); 185 return -XFS_ERROR(EINVAL);
185 186
186 start = BTOBB(range.start); 187 start = BTOBB(range.start);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a6e54b3319bd..02fb943cbf22 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -220,6 +220,8 @@ xfs_growfs_data_private(
220 */ 220 */
221 nfree = 0; 221 nfree = 0;
222 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { 222 for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
223 __be32 *agfl_bno;
224
223 /* 225 /*
224 * AG freespace header block 226 * AG freespace header block
225 */ 227 */
@@ -279,8 +281,10 @@ xfs_growfs_data_private(
279 agfl->agfl_seqno = cpu_to_be32(agno); 281 agfl->agfl_seqno = cpu_to_be32(agno);
280 uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid); 282 uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
281 } 283 }
284
285 agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
282 for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) 286 for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
283 agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); 287 agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
284 288
285 error = xfs_bwrite(bp); 289 error = xfs_bwrite(bp);
286 xfs_buf_relse(bp); 290 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 4d613401a5e0..33ad9a77791f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -442,7 +442,8 @@ xfs_attrlist_by_handle(
442 return -XFS_ERROR(EPERM); 442 return -XFS_ERROR(EPERM);
443 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) 443 if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
444 return -XFS_ERROR(EFAULT); 444 return -XFS_ERROR(EFAULT);
445 if (al_hreq.buflen > XATTR_LIST_MAX) 445 if (al_hreq.buflen < sizeof(struct attrlist) ||
446 al_hreq.buflen > XATTR_LIST_MAX)
446 return -XFS_ERROR(EINVAL); 447 return -XFS_ERROR(EINVAL);
447 448
448 /* 449 /*
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index e8fb1231db81..a7992f8de9d3 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,8 @@ xfs_compat_attrlist_by_handle(
356 if (copy_from_user(&al_hreq, arg, 356 if (copy_from_user(&al_hreq, arg,
357 sizeof(compat_xfs_fsop_attrlist_handlereq_t))) 357 sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
358 return -XFS_ERROR(EFAULT); 358 return -XFS_ERROR(EFAULT);
359 if (al_hreq.buflen > XATTR_LIST_MAX) 359 if (al_hreq.buflen < sizeof(struct attrlist) ||
360 al_hreq.buflen > XATTR_LIST_MAX)
360 return -XFS_ERROR(EINVAL); 361 return -XFS_ERROR(EINVAL);
361 362
362 /* 363 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index da88f167af78..02df7b408a26 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -41,6 +41,7 @@
 #include "xfs_fsops.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
+#include "xfs_dinode.h"
 
 
 #ifdef HAVE_PERCPU_SB
@@ -718,8 +719,22 @@ xfs_mountfs(
 	 * Set the inode cluster size.
 	 * This may still be overridden by the file system
 	 * block size if it is larger than the chosen cluster size.
+	 *
+	 * For v5 filesystems, scale the cluster size with the inode size to
+	 * keep a constant ratio of inode per cluster buffer, but only if mkfs
+	 * has set the inode alignment value appropriately for larger cluster
+	 * sizes.
 	 */
 	mp->m_inode_cluster_size = XFS_INODE_BIG_CLUSTER_SIZE;
+	if (xfs_sb_version_hascrc(&mp->m_sb)) {
+		int	new_size = mp->m_inode_cluster_size;
+
+		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
+		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
+			mp->m_inode_cluster_size = new_size;
+		xfs_info(mp, "Using inode cluster size of %d bytes",
+			 mp->m_inode_cluster_size);
+	}
 
 	/*
 	 * Set inode alignment fields
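A rough worked example of the scaling above, as a stand-alone sketch. The two constants are meant to mirror XFS_INODE_BIG_CLUSTER_SIZE (8192 bytes) and XFS_DINODE_MIN_SIZE (256 bytes); the inode size, block size and alignment are hypothetical mkfs output, and XFS_B_TO_FSBT() is approximated by a truncating division:

#include <stdio.h>

int main(void)
{
	unsigned int big_cluster = 8192;	/* XFS_INODE_BIG_CLUSTER_SIZE */
	unsigned int dinode_min  = 256;		/* XFS_DINODE_MIN_SIZE */
	unsigned int inodesize   = 512;		/* sb_inodesize, e.g. 512-byte inodes */
	unsigned int blocksize   = 4096;	/* sb_blocksize */
	unsigned int inoalignmt  = 4;		/* sb_inoalignmt, in fs blocks */

	unsigned int cluster  = big_cluster;
	unsigned int new_size = cluster * (inodesize / dinode_min);	/* 16384 */

	/* Only accept the larger cluster if mkfs aligned inodes accordingly. */
	if (inoalignmt >= new_size / blocksize)		/* 4 >= 16384/4096 */
		cluster = new_size;

	printf("inode cluster size: %u bytes\n", cluster);	/* 16384 */
	return 0;
}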
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d8101a10d8e..a466c5e5826e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -112,7 +112,7 @@ typedef struct xfs_mount {
 	__uint8_t		m_blkbb_log;	/* blocklog - BBSHIFT */
 	__uint8_t		m_agno_log;	/* log #ag's */
 	__uint8_t		m_agino_log;	/* #bits for agino in inum */
-	__uint16_t		m_inode_cluster_size;/* min inode buf size */
+	uint			m_inode_cluster_size;/* min inode buf size */
 	uint			m_blockmask;	/* sb_blocksize-1 */
 	uint			m_blockwsize;	/* sb_blocksize in words */
 	uint			m_blockwmask;	/* blockwsize-1 */
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 1bba7f60d94c..50c3f5614288 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -111,12 +111,14 @@ xfs_trans_log_inode(
 
 	/*
 	 * First time we log the inode in a transaction, bump the inode change
-	 * counter if it is configured for this to occur.
+	 * counter if it is configured for this to occur. We don't use
+	 * inode_inc_version() because there is no need for extra locking around
+	 * i_version as we already hold the inode locked exclusively for
+	 * metadata modification.
 	 */
 	if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) &&
 	    IS_I_VERSION(VFS_I(ip))) {
-		inode_inc_iversion(VFS_I(ip));
-		ip->i_d.di_changecount = VFS_I(ip)->i_version;
+		ip->i_d.di_changecount = ++VFS_I(ip)->i_version;
 		flags |= XFS_ILOG_CORE;
 	}
 
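Because the transaction path already holds the inode locked exclusively for the metadata change, an open-coded pre-increment of i_version is enough, and it avoids the extra locking the generic helper performs. A tiny stand-alone sketch of the same idea; the structure and function names are illustrative, not the VFS ones:

#include <stdint.h>
#include <stdio.h>

struct demo_inode {
	uint64_t	i_version;	/* plays the role of VFS_I(ip)->i_version */
};

/* Caller is assumed to hold the inode exclusively (ILOCK_EXCL in XFS terms),
 * so the bump needs no additional lock around i_version. */
static uint64_t log_inode_first_time(struct demo_inode *inode)
{
	return ++inode->i_version;	/* stored as di_changecount in the patch */
}

int main(void)
{
	struct demo_inode inode = { .i_version = 41 };

	printf("di_changecount = %llu\n",
	       (unsigned long long)log_inode_first_time(&inode));	/* 42 */
	return 0;
}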
diff --git a/fs/xfs/xfs_trans_resv.c b/fs/xfs/xfs_trans_resv.c
index d53d9f0627a7..2fd59c0dae66 100644
--- a/fs/xfs/xfs_trans_resv.c
+++ b/fs/xfs/xfs_trans_resv.c
@@ -385,8 +385,7 @@ xfs_calc_ifree_reservation(
 		xfs_calc_inode_res(mp, 1) +
 		xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) +
 		xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) +
-		MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
-		    XFS_INODE_CLUSTER_SIZE(mp)) +
+		max_t(uint, XFS_FSB_TO_B(mp, 1), XFS_INODE_CLUSTER_SIZE(mp)) +
 		xfs_calc_buf_res(1, 0) +
 		xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) +
 				 mp->m_in_maxlevels, 0) +
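With m_inode_cluster_size widened to uint in xfs_mount.h above, keeping one operand cast down to __uint16_t risks silently truncating values that no longer fit in 16 bits, which is what the switch to max_t(uint, ...) avoids. A small sketch of that hazard; the macro names and sizes below are hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Emulates the old MAX((__uint16_t)a, b) form versus a max_t(uint, a, b) form. */
#define DEMO_MAX16(a, b)	((uint16_t)(a) > (b) ? (uint16_t)(a) : (b))
#define DEMO_MAX_UINT(a, b)	((unsigned int)(a) > (unsigned int)(b) ? \
				 (unsigned int)(a) : (unsigned int)(b))

int main(void)
{
	unsigned int fsb_bytes = 65536;		/* hypothetical per-block byte count */
	unsigned int cluster   = 16384;		/* hypothetical inode cluster size */

	/* (uint16_t)65536 wraps to 0, so the 16-bit form picks the wrong value. */
	printf("16-bit cast:  %u\n", DEMO_MAX16(fsb_bytes, cluster));	/* 16384 */
	printf("max_t(uint):  %u\n", DEMO_MAX_UINT(fsb_bytes, cluster));/* 65536 */
	return 0;
}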