Merge commit '85082fd7cbe3173198aac0eb5e85ab1edcc6352c' into test-build

Manual fixup of: arch/powerpc/Kconfig
author: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2008-07-15 01:44:51 -0400
committer: Benjamin Herrenschmidt <benh@kernel.crashing.org> 2008-07-15 01:44:51 -0400
commit: 43d2548bb2ef7e6d753f91468a746784041e522d (patch)
tree: 77d13fcd48fd998393abb825ec36e2b732684a73 /fs
parent: 585583d95c5660973bc0cf64add517b040acd8a4 (diff)
parent: 85082fd7cbe3173198aac0eb5e85ab1edcc6352c (diff)
29 files changed, 938 insertions, 144 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 2694648cbd1b..313b2e06ded5 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -930,7 +930,7 @@ config PROC_KCORE
 config PROC_VMCORE
        bool "/proc/vmcore support (EXPERIMENTAL)"
-        depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+        depends on PROC_FS && CRASH_DUMP
        default y
        help
        Exports the dump image of crashed kernel in ELF format.
diff --git a/fs/Makefile b/fs/Makefile
index 1e7a11bd4da1..277b079dec9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -19,6 +19,7 @@ else
 obj-y +=        no-block.o
 endif
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-$(CONFIG_INOTIFY)           += inotify.o
 obj-$(CONFIG_INOTIFY_USER)      += inotify_user.o
 obj-$(CONFIG_EPOLL)             += eventpoll.o
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
new file mode 100644
index 000000000000..63e2ee63058d
--- /dev/null
+++ b/fs/bio-integrity.c
@@ -0,0 +1,719 @@
+/*
+ * bio-integrity.c - bio data integrity extensions
+ *
+ * Copyright (C) 2007, 2008 Oracle Corporation
+ * Written by: Martin K. Petersen <martin.petersen@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING.  If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
+ * USA.
+ *
+ */
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+static struct kmem_cache *bio_integrity_slab __read_mostly;
+static struct workqueue_struct *kintegrityd_wq;
+/**
+ * bio_integrity_alloc_bioset - Allocate integrity payload and attach it to bio
+ * @bio:        bio that will own the integrity payload
+ * @gfp_mask:   allocation mask used for the payload and for its vector
+ * @nr_vecs:    number of integrity scatter-gather elements to reserve
+ * @bs:         bio_set whose pools the payload and vector are taken from
+ *
+ * Description: Takes a payload from @bs->bio_integrity_pool, zeroes
+ * it, obtains a bio_vec array through bvec_alloc_bs() and links the
+ * payload and @bio to each other.  Returns the payload, or NULL when
+ * either allocation fails; on the second failure the payload is
+ * handed back to its pool first.
+ */
+struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio,
+                                                         gfp_t gfp_mask,
+                                                         unsigned int nr_vecs,
+                                                         struct bio_set *bs)
+{
+        struct bio_integrity_payload *payload;
+        struct bio_vec *vecs;
+        unsigned long pool_idx;
+        BUG_ON(!bio);
+        payload = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
+        if (unlikely(!payload)) {
+                printk(KERN_ERR "%s: could not alloc bip\n", __func__);
+                return NULL;
+        }
+        memset(payload, 0, sizeof(*payload));
+        vecs = bvec_alloc_bs(gfp_mask, nr_vecs, &pool_idx, bs);
+        if (unlikely(!vecs)) {
+                printk(KERN_ERR "%s: could not alloc bip_vec\n", __func__);
+                mempool_free(payload, bs->bio_integrity_pool);
+                return NULL;
+        }
+        /* Remember which bvec pool the vector came from so it can be freed */
+        payload->bip_vec = vecs;
+        payload->bip_pool = pool_idx;
+        payload->bip_bio = bio;
+        bio->bi_integrity = payload;
+        return payload;
+}
+EXPORT_SYMBOL(bio_integrity_alloc_bioset);
+/**
+ * bio_integrity_alloc - Allocate integrity payload and attach it to bio
+ * @bio:        bio to attach integrity metadata to
+ * @gfp_mask:   Memory allocation mask
+ * @nr_vecs:    Number of integrity metadata scatter-gather elements
+ *
+ * Description: This function prepares a bio for attaching integrity
+ * metadata.  nr_vecs specifies the maximum number of pages containing
+ * integrity metadata that can be attached.
+ *
+ * This is a wrapper around bio_integrity_alloc_bioset() that always
+ * allocates from fs_bio_set and returns that function's result
+ * unchanged.
+ */
+struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
+                                                  gfp_t gfp_mask,
+                                                  unsigned int nr_vecs)
+{
+        return bio_integrity_alloc_bioset(bio, gfp_mask, nr_vecs, fs_bio_set);
+}
+EXPORT_SYMBOL(bio_integrity_alloc);
+/**
+ * bio_integrity_free - Free bio integrity payload
+ * @bio:        bio containing bip to be freed
+ * @bs:         bio_set this bio was allocated from
+ *
+ * Description: Used to free the integrity portion of a bio. Usually
+ * called from bio_free().  The protection buffer is released only
+ * when @bio is not flagged BIO_CLONED; the integrity vector and the
+ * payload itself are always returned to the pools of @bs, and
+ * bio->bi_integrity is reset to NULL.
+ */
+void bio_integrity_free(struct bio *bio, struct bio_set *bs)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        BUG_ON(bip == NULL);
+        /* A cloned bio doesn't own the integrity metadata */
+        if (!bio_flagged(bio, BIO_CLONED))
+                kfree(bip->bip_buf);    /* kfree(NULL) is a no-op */
+        mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+        mempool_free(bip, bs->bio_integrity_pool);
+        bio->bi_integrity = NULL;
+}
+EXPORT_SYMBOL(bio_integrity_free);
+/**
+ * bio_integrity_add_page - Attach integrity metadata
+ * @bio:        bio whose integrity payload receives the page
+ * @page:       page holding the integrity metadata
+ * @len:        number of metadata bytes within @page
+ * @offset:     byte offset of the metadata inside @page
+ *
+ * Description: Records @page in the next free slot of the bio's
+ * integrity vector.  Returns @len on success, or 0 when the vector
+ * is already full.
+ */
+int bio_integrity_add_page(struct bio *bio, struct page *page,
+                           unsigned int len, unsigned int offset)
+{
+        struct bio_integrity_payload *payload = bio->bi_integrity;
+        struct bio_vec *slot;
+        if (payload->bip_vcnt < bvec_nr_vecs(payload->bip_pool)) {
+                slot = bip_vec_idx(payload, payload->bip_vcnt);
+                BUG_ON(slot == NULL);
+                /* The slot must not already be in use */
+                BUG_ON(slot->bv_page != NULL);
+                slot->bv_page = page;
+                slot->bv_len = len;
+                slot->bv_offset = offset;
+                payload->bip_vcnt++;
+                return len;
+        }
+        printk(KERN_ERR "%s: bip_vec full\n", __func__);
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_add_page);
+/**
+ * bio_integrity_enabled - Check whether integrity can be passed
+ * @bio:        bio to check
+ *
+ * Description: Determines whether bio_integrity_prep() can be called
+ * on this bio or not.  bio data direction and target device must be
+ * set prior to calling.  The function honors the write_generate and
+ * read_verify flags in sysfs.
+ *
+ * Returns 0 if the bio already carries integrity metadata; otherwise
+ * returns the result of bdev_integrity_enabled() for the bio's
+ * device and data direction.
+ */
+int bio_integrity_enabled(struct bio *bio)
+{
+        /* Already protected? */
+        if (bio_integrity(bio))
+                return 0;
+        return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
+}
+EXPORT_SYMBOL(bio_integrity_enabled);
+/**
+ * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
+ * @bi:         blk_integrity profile for device
+ * @sectors:    Number of 512 sectors to convert
+ *
+ * Description: The block layer calculates everything in 512 byte
+ * sectors but integrity metadata is done in terms of the hardware
+ * sector size of the storage device.  Convert the block layer sectors
+ * to physical sectors.
+ *
+ * Returns @sectors divided by 8 when the profile's sector size is
+ * 4096, and @sectors unchanged otherwise.
+ */
+static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
+                                                    unsigned int sectors)
+{
+        /* At this point there are only 512b or 4096b DIF/EPP devices */
+        if (bi->sector_size == 4096)
+                return sectors >> 3;    /* 4096 / 512 == 8 */
+        return sectors;
+}
+/**
+ * bio_integrity_tag_size - Retrieve integrity tag space
+ * @bio:        bio to inspect
+ *
+ * Description: Returns the maximum number of tag bytes that can be
+ * attached to this bio. Filesystems can use this to determine how
+ * much metadata to attach to an I/O.
+ *
+ * The value is the profile's tag_size multiplied by the number of
+ * whole profile-sized sectors in bio->bi_size.  An empty bio
+ * (bi_size == 0) triggers BUG_ON().
+ */
+unsigned int bio_integrity_tag_size(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        BUG_ON(bio->bi_size == 0);
+        return bi->tag_size * (bio->bi_size / bi->sector_size);
+}
+EXPORT_SYMBOL(bio_integrity_tag_size);
+/**
+ * bio_integrity_tag - Copy tag data to or from a bio's protection buffer
+ * @bio:        bio whose integrity payload is accessed
+ * @tag_buf:    caller buffer holding (set) or receiving (get) the tag data
+ * @len:        length of @tag_buf in bytes
+ * @set:        non-zero to call the profile's set_tag_fn, zero for get_tag_fn
+ *
+ * Description: Common helper behind bio_integrity_set_tag() and
+ * bio_integrity_get_tag().  Returns 0 on success and -1 if the
+ * profile has no tag space (tag_size == 0) or if the tuples needed
+ * for @len bytes would exceed the payload's bip_size.  The payload
+ * must already have a protection buffer (BUG_ON otherwise).
+ */
+int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len, int set)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        unsigned int nr_sectors;
+        BUG_ON(bip->bip_buf == NULL);
+        if (bi->tag_size == 0)
+                return -1;
+        /* Number of tuples needed to hold len tag bytes, rounded up */
+        nr_sectors = bio_integrity_hw_sectors(bi,
+                                        DIV_ROUND_UP(len, bi->tag_size));
+        if (nr_sectors * bi->tuple_size > bip->bip_size) {
+                printk(KERN_ERR "%s: tag too big for bio: %u > %u\n",
+                       __func__, nr_sectors * bi->tuple_size, bip->bip_size);
+                return -1;
+        }
+        if (set)
+                bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+        else
+                bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
+        return 0;
+}
+/**
+ * bio_integrity_set_tag - Attach a tag buffer to a bio
+ * @bio:        bio to attach buffer to
+ * @tag_buf:    Pointer to a buffer containing tag data
+ * @len:        Length of the included buffer
+ *
+ * Description: Use this function to tag a bio by leveraging the extra
+ * space provided by devices formatted with integrity protection.  The
+ * size of the integrity buffer must be <= to the size reported by
+ * bio_integrity_tag_size().
+ *
+ * The bio must be a WRITE (BUG_ON otherwise).  Returns the result of
+ * bio_integrity_tag(): 0 on success, -1 on failure.
+ */
+int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+        BUG_ON(bio_data_dir(bio) != WRITE);
+        return bio_integrity_tag(bio, tag_buf, len, 1);
+}
+EXPORT_SYMBOL(bio_integrity_set_tag);
+/**
+ * bio_integrity_get_tag - Retrieve a tag buffer from a bio
+ * @bio:        bio to retrieve buffer from
+ * @tag_buf:    Pointer to a buffer for the tag data
+ * @len:        Length of the target buffer
+ *
+ * Description: Use this function to retrieve the tag buffer from a
+ * completed I/O. The size of the integrity buffer must be <= to the
+ * size reported by bio_integrity_tag_size().
+ *
+ * The bio must be a READ (BUG_ON otherwise).  Returns the result of
+ * bio_integrity_tag(): 0 on success, -1 on failure.
+ */
+int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
+{
+        BUG_ON(bio_data_dir(bio) != READ);
+        return bio_integrity_tag(bio, tag_buf, len, 0);
+}
+EXPORT_SYMBOL(bio_integrity_get_tag);
+/**
+ * bio_integrity_generate - Generate integrity metadata for a bio
+ * @bio:        bio to generate integrity metadata for
+ *
+ * Description: Generates integrity metadata for a bio by calling the
+ * block device's generation callback function.  The bio must have a
+ * bip attached with enough room to accommodate the generated
+ * integrity metadata.
+ *
+ * Each data segment is mapped with kmap_atomic(), handed to the
+ * profile's generate_fn together with the current position in the
+ * protection buffer, and unmapped again.  The sector counter and the
+ * protection buffer position advance by the number of profile-sized
+ * sectors in the segment.
+ */
+static void bio_integrity_generate(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        struct blk_integrity_exchg bix;
+        struct bio_vec *bv;
+        sector_t sector = bio->bi_sector;
+        unsigned int i, sectors, total;
+        void *prot_buf = bio->bi_integrity->bip_buf;
+        total = 0;
+        bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+        bix.sector_size = bi->sector_size;
+        bio_for_each_segment(bv, bio, i) {
+                void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+                bix.data_buf = kaddr + bv->bv_offset;
+                bix.data_size = bv->bv_len;
+                bix.prot_buf = prot_buf;
+                bix.sector = sector;
+                bi->generate_fn(&bix);
+                /* Step past the tuples produced for this segment */
+                sectors = bv->bv_len / bi->sector_size;
+                sector += sectors;
+                prot_buf += sectors * bi->tuple_size;
+                total += sectors * bi->tuple_size;
+                /* Generated metadata must never exceed the attached buffer */
+                BUG_ON(total > bio->bi_integrity->bip_size);
+                kunmap_atomic(kaddr, KM_USER0);
+        }
+}
+/**
+ * bio_integrity_prep - Prepare bio for integrity I/O
+ * @bio:        bio to prepare
+ *
+ * Description: Allocates a buffer for integrity metadata, maps the
+ * pages and attaches them to a bio.  The bio must have data
+ * direction, target device and start sector set prior to calling.  In
+ * the WRITE case, integrity metadata will be generated using the
+ * block device's integrity function.  In the READ case, the buffer
+ * will be prepared for DMA and a suitable end_io handler set up.
+ *
+ * Returns 0 on success.  Returns -EIO if the protection buffer or
+ * the integrity payload cannot be allocated, or if a page of the
+ * buffer cannot be attached to the payload.
+ */
+int bio_integrity_prep(struct bio *bio)
+{
+        struct bio_integrity_payload *bip;
+        struct blk_integrity *bi;
+        struct request_queue *q;
+        void *buf;
+        unsigned long start, end;
+        unsigned int len, nr_pages;
+        unsigned int bytes, offset, i;
+        unsigned int sectors;
+        bi = bdev_get_integrity(bio->bi_bdev);
+        q = bdev_get_queue(bio->bi_bdev);
+        BUG_ON(bi == NULL);
+        BUG_ON(bio_integrity(bio));
+        sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
+        /* Allocate kernel buffer for protection data */
+        len = sectors * blk_integrity_tuple_size(bi);
+        buf = kmalloc(len, GFP_NOIO | __GFP_NOFAIL | q->bounce_gfp);
+        if (unlikely(buf == NULL)) {
+                printk(KERN_ERR "could not allocate integrity buffer\n");
+                return -EIO;
+        }
+        /* Number of pages the buffer touches, including partial ones */
+        end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        start = ((unsigned long) buf) >> PAGE_SHIFT;
+        nr_pages = end - start;
+        /* Allocate bio integrity payload and integrity vectors */
+        bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
+        if (unlikely(bip == NULL)) {
+                printk(KERN_ERR "could not allocate data integrity bioset\n");
+                kfree(buf);
+                return -EIO;
+        }
+        /* From here on the payload holds the buffer's start address */
+        bip->bip_buf = buf;
+        bip->bip_size = len;
+        bip->bip_sector = bio->bi_sector;
+        /* Map it */
+        offset = offset_in_page(buf);
+        for (i = 0 ; i < nr_pages ; i++) {
+                int ret;
+                bytes = PAGE_SIZE - offset;
+                if (len == 0)
+                        break;
+                if (bytes > len)
+                        bytes = len;
+                ret = bio_integrity_add_page(bio, virt_to_page(buf),
+                                             bytes, offset);
+                /*
+                 * A page could not be attached: report the failure
+                 * instead of claiming success with a partially mapped
+                 * buffer.  buf is not freed here because it has been
+                 * advanced and bip->bip_buf still records the start
+                 * of the allocation.
+                 */
+                if (ret == 0)
+                        return -EIO;
+                if (ret < bytes)
+                        break;
+                buf += bytes;
+                len -= bytes;
+                offset = 0;
+        }
+        /* Install custom I/O completion handler if read verify is enabled */
+        if (bio_data_dir(bio) == READ) {
+                bip->bip_end_io = bio->bi_end_io;
+                bio->bi_end_io = bio_integrity_endio;
+        }
+        /* Auto-generate integrity metadata if this is a write */
+        if (bio_data_dir(bio) == WRITE)
+                bio_integrity_generate(bio);
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_prep);
+/**
+ * bio_integrity_verify - Verify integrity metadata for a bio
+ * @bio:        bio to verify
+ *
+ * Description: This function is called to verify the integrity of a
+ * bio.  The data in the bio io_vec is compared to the integrity
+ * metadata returned by the HBA.
+ *
+ * Segments are checked one at a time through the profile's
+ * verify_fn, starting at the payload's bip_sector.  Returns 0 when
+ * every segment verifies; otherwise stops at the first failing
+ * segment and returns the value verify_fn produced for it.
+ */
+static int bio_integrity_verify(struct bio *bio)
+{
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        struct blk_integrity_exchg bix;
+        struct bio_vec *bv;
+        sector_t sector = bio->bi_integrity->bip_sector;
+        unsigned int i, sectors, total, ret;
+        void *prot_buf = bio->bi_integrity->bip_buf;
+        ret = total = 0;
+        bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
+        bix.sector_size = bi->sector_size;
+        bio_for_each_segment(bv, bio, i) {
+                void *kaddr = kmap_atomic(bv->bv_page, KM_USER0);
+                bix.data_buf = kaddr + bv->bv_offset;
+                bix.data_size = bv->bv_len;
+                bix.prot_buf = prot_buf;
+                bix.sector = sector;
+                ret = bi->verify_fn(&bix);
+                if (ret) {
+                        /* Drop the mapping before bailing out */
+                        kunmap_atomic(kaddr, KM_USER0);
+                        break;
+                }
+                /* Step past the tuples consumed by this segment */
+                sectors = bv->bv_len / bi->sector_size;
+                sector += sectors;
+                prot_buf += sectors * bi->tuple_size;
+                total += sectors * bi->tuple_size;
+                BUG_ON(total > bio->bi_integrity->bip_size);
+                kunmap_atomic(kaddr, KM_USER0);
+        }
+        return ret;
+}
+/**
+ * bio_integrity_verify_fn - Integrity I/O completion worker
+ * @work:       work item embedded in the payload of the bio to verify
+ *
+ * Description: Workqueue callback that finishes a READ.  It checks
+ * the integrity metadata of the bio; on a mismatch the bio loses its
+ * BIO_UPTODATE flag and the completion status becomes -EIO.  The
+ * original end_io handler is then put back and, if there is one,
+ * invoked with the resulting status.
+ */
+static void bio_integrity_verify_fn(struct work_struct *work)
+{
+        struct bio_integrity_payload *payload;
+        struct bio *bio;
+        int status;
+        payload = container_of(work, struct bio_integrity_payload, bip_work);
+        bio = payload->bip_bio;
+        status = payload->bip_error;
+        if (bio_integrity_verify(bio) != 0) {
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+                status = -EIO;
+        }
+        /* Hand completion back to whoever owned it before the prep */
+        bio->bi_end_io = payload->bip_end_io;
+        if (bio->bi_end_io != NULL)
+                bio->bi_end_io(bio, status);
+}
+/**
+ * bio_integrity_endio - Integrity I/O completion function
+ * @bio:        Protected bio
+ * @error:      Completion status, saved in the payload for the worker
+ *
+ * Description: Completion for integrity I/O
+ *
+ * Normally I/O completion is done in interrupt context.  However,
+ * verifying I/O integrity is a time-consuming task which must be run
+ * in process context.  This function postpones completion
+ * accordingly by queueing bio_integrity_verify_fn() on the
+ * kintegrityd workqueue.
+ */
+void bio_integrity_endio(struct bio *bio, int error)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        BUG_ON(bip->bip_bio != bio);
+        bip->bip_error = error;
+        INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
+        queue_work(kintegrityd_wq, &bip->bip_work);
+}
+EXPORT_SYMBOL(bio_integrity_endio);
+/**
+ * bio_integrity_mark_head - Advance bip_vec skip bytes
+ * @bip:        integrity payload whose vector is advanced
+ * @skip:       number of integrity bytes to step over
+ *
+ * Description: Walks the vector from the current index, consuming
+ * whole entries while @skip covers them.  The entry where the
+ * remaining skip falls becomes the new bip_idx; if the skip ends
+ * inside it, its offset and length are adjusted.  If @skip covers
+ * every remaining entry, bip_idx is left unchanged.
+ */
+void bio_integrity_mark_head(struct bio_integrity_payload *bip,
+                             unsigned int skip)
+{
+        struct bio_vec *vec;
+        unsigned int n;
+        bip_for_each_vec(vec, bip, n) {
+                /* Entry lies entirely within the skipped region */
+                if (skip != 0 && skip >= vec->bv_len) {
+                        skip -= vec->bv_len;
+                        continue;
+                }
+                /* Skip ends inside this entry: shave off its front */
+                if (skip != 0) {
+                        vec->bv_offset += skip;
+                        vec->bv_len -= skip;
+                }
+                bip->bip_idx = n;
+                return;
+        }
+}
+/**
+ * bio_integrity_mark_tail - Truncate bip_vec to be len bytes long
+ * @bip:        integrity payload whose vector is truncated
+ * @len:        number of integrity bytes to keep
+ *
+ * Description: Walks the vector from the current index, keeping
+ * entries until @len bytes are accounted for.  An entry that
+ * straddles the limit is shortened, and bip_vcnt is set to the index
+ * of the first entry past the limit, if there is one.
+ */
+void bio_integrity_mark_tail(struct bio_integrity_payload *bip,
+                             unsigned int len)
+{
+        struct bio_vec *vec;
+        unsigned int n;
+        bip_for_each_vec(vec, bip, n) {
+                if (len == 0) {
+                        bip->bip_vcnt = n;
+                        return;
+                }
+                /* Clamp the entry straddling the limit, then account for it */
+                if (len < vec->bv_len)
+                        vec->bv_len = len;
+                len -= vec->bv_len;
+        }
+}
+/**
+ * bio_integrity_advance - Advance integrity vector
+ * @bio:        bio whose integrity vector to update
+ * @bytes_done: number of data bytes that have been completed
+ *
+ * Description: This function calculates how many integrity bytes the
+ * number of completed data bytes correspond to and advances the
+ * integrity vector accordingly.
+ *
+ * @bytes_done is turned into 512-byte sectors, converted to hardware
+ * sectors and multiplied by the profile's tuple size.  The bio must
+ * have both a payload and an integrity profile (BUG_ON otherwise).
+ */
+void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        unsigned int nr_sectors;
+        BUG_ON(bip == NULL);
+        BUG_ON(bi == NULL);
+        nr_sectors = bio_integrity_hw_sectors(bi, bytes_done >> 9);
+        bio_integrity_mark_head(bip, nr_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_advance);
+/**
+ * bio_integrity_trim - Trim integrity vector
+ * @bio:        bio whose integrity vector to update
+ * @offset:     offset to first data sector
+ * @sectors:    number of data sectors
+ *
+ * Description: Used to trim the integrity vector in a cloned bio.
+ * The ivec will be advanced corresponding to 'offset' data sectors
+ * and the length will be truncated corresponding to 'sectors' data
+ * sectors.
+ *
+ * Both counts are 512-byte block layer sectors and are converted to
+ * hardware sectors before being scaled by the tuple size, matching
+ * bio_integrity_advance(): there is one integrity tuple per hardware
+ * sector, not per 512 bytes.  The bio must be flagged BIO_CLONED and
+ * have a payload and an integrity profile (BUG_ON otherwise).
+ */
+void bio_integrity_trim(struct bio *bio, unsigned int offset,
+                        unsigned int sectors)
+{
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
+        unsigned int skip_sectors, keep_sectors;
+        BUG_ON(bip == NULL);
+        BUG_ON(bi == NULL);
+        BUG_ON(!bio_flagged(bio, BIO_CLONED));
+        skip_sectors = bio_integrity_hw_sectors(bi, offset);
+        keep_sectors = bio_integrity_hw_sectors(bi, sectors);
+        bip->bip_sector = bip->bip_sector + offset;
+        bio_integrity_mark_head(bip, skip_sectors * bi->tuple_size);
+        bio_integrity_mark_tail(bip, keep_sectors * bi->tuple_size);
+}
+EXPORT_SYMBOL(bio_integrity_trim);
+/**
+ * bio_integrity_split - Split integrity metadata
+ * @bio:        Protected bio
+ * @bp:         Resulting bio_pair
+ * @sectors:    Number of 512-byte data sectors going to the first half
+ *
+ * Description: Splits an integrity page into a bio_pair.  Does
+ * nothing when @bio carries no integrity metadata.  The payload must
+ * consist of exactly one vector entry (BUG_ON otherwise); both
+ * halves get a copy of it, the first shortened to the integrity
+ * bytes covering @sectors and the second starting right after them.
+ *
+ * @sectors is converted to hardware sectors before being scaled by
+ * the tuple size, since there is one tuple per hardware sector.
+ */
+void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
+{
+        struct blk_integrity *bi;
+        struct bio_integrity_payload *bip = bio->bi_integrity;
+        unsigned int nr_sectors;
+        unsigned int split_bytes;
+        if (bio_integrity(bio) == 0)
+                return;
+        bi = bdev_get_integrity(bio->bi_bdev);
+        BUG_ON(bi == NULL);
+        BUG_ON(bip->bip_vcnt != 1);
+        nr_sectors = bio_integrity_hw_sectors(bi, sectors);
+        /* Integrity bytes that belong to the first half */
+        split_bytes = nr_sectors * bi->tuple_size;
+        bp->bio1.bi_integrity = &bp->bip1;
+        bp->bio2.bi_integrity = &bp->bip2;
+        bp->iv1 = bip->bip_vec[0];
+        bp->iv2 = bip->bip_vec[0];
+        bp->bip1.bip_vec = &bp->iv1;
+        bp->bip2.bip_vec = &bp->iv2;
+        bp->iv1.bv_len = split_bytes;
+        bp->iv2.bv_offset += split_bytes;
+        bp->iv2.bv_len -= split_bytes;
+        bp->bip1.bip_sector = bio->bi_integrity->bip_sector;
+        bp->bip2.bip_sector = bio->bi_integrity->bip_sector + nr_sectors;
+        bp->bip1.bip_vcnt = bp->bip2.bip_vcnt = 1;
+        bp->bip1.bip_idx = bp->bip2.bip_idx = 0;
+}
+EXPORT_SYMBOL(bio_integrity_split);
+/**
+ * bio_integrity_clone - Callback for cloning bios with integrity metadata
+ * @bio:        New bio
+ * @bio_src:    Original bio
+ * @bs:         bio_set to allocate bip from
+ *
+ * Description: Called to allocate a bip when cloning a bio
+ *
+ * The new payload is allocated with GFP_NOIO and sized for the
+ * source's bip_vcnt entries.  The vector entries, bip_sector,
+ * bip_vcnt and bip_idx are copied from the source; bip_buf is not.
+ * Returns 0 on success or -EIO if the payload cannot be allocated.
+ * @bio_src must carry a payload (BUG_ON otherwise).
+ */
+int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
+                        struct bio_set *bs)
+{
+        struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
+        struct bio_integrity_payload *bip;
+        BUG_ON(bip_src == NULL);
+        bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs);
+        if (bip == NULL)
+                return -EIO;
+        memcpy(bip->bip_vec, bip_src->bip_vec,
+               bip_src->bip_vcnt * sizeof(struct bio_vec));
+        bip->bip_sector = bip_src->bip_sector;
+        bip->bip_vcnt = bip_src->bip_vcnt;
+        bip->bip_idx = bip_src->bip_idx;
+        return 0;
+}
+EXPORT_SYMBOL(bio_integrity_clone);
+/*
+ * Create the integrity payload mempool of a bio_set, backed by
+ * bio_integrity_slab and holding pool_size reserved elements.
+ * Returns 0 on success, -1 if the pool cannot be created.
+ */
+int bioset_integrity_create(struct bio_set *bs, int pool_size)
+{
+        mempool_t *pool;
+        pool = mempool_create_slab_pool(pool_size, bio_integrity_slab);
+        bs->bio_integrity_pool = pool;
+        return pool ? 0 : -1;
+}
+EXPORT_SYMBOL(bioset_integrity_create);
+/* Destroy the integrity payload mempool of a bio_set, if it has one */
+void bioset_integrity_free(struct bio_set *bs)
+{
+        if (!bs->bio_integrity_pool)
+                return;
+        mempool_destroy(bs->bio_integrity_pool);
+}
+EXPORT_SYMBOL(bioset_integrity_free);
+/*
+ * Create the slab cache that backs every bio_set's integrity payload
+ * pool.  SLAB_PANIC is passed, so no failure check follows.
+ * NOTE(review): this __init function is also passed to EXPORT_SYMBOL
+ * right below - confirm that exporting it is intended.
+ */
+void __init bio_integrity_init_slab(void)
+{
+        bio_integrity_slab = KMEM_CACHE(bio_integrity_payload,
+                                        SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+}
+EXPORT_SYMBOL(bio_integrity_init_slab);
+/*
+ * Create the kintegrityd workqueue that bio_integrity_endio() queues
+ * read verification work on.  Registered as a subsys initcall;
+ * panics if the workqueue cannot be created, otherwise returns 0.
+ */
+static int __init integrity_init(void)
+{
+        kintegrityd_wq = create_workqueue("kintegrityd");
+        if (!kintegrityd_wq)
+                panic("Failed to create kintegrityd\n");
+        return 0;
+}
+subsys_initcall(integrity_init);
diff --git a/fs/bio.c b/fs/bio.c
index 78562574cb52..88322b066acb 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -28,25 +28,10 @@
 #include <linux/blktrace_api.h>
 #include <scsi/sg.h>            /* for struct sg_iovec */
-#define BIO_POOL_SIZE 2
 static struct kmem_cache *bio_slab __read_mostly;
-#define BIOVEC_NR_POOLS 6
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
 mempool_t *bio_split_pool __read_mostly;
-struct biovec_slab {
-        int nr_vecs;
-        char *name; 
-        struct kmem_cache *slab;
-};
 /*
 * if you change this list, also change bvec_alloc or things will
 * break badly! cannot be bigger than what you can fit into an
@@ -60,23 +45,17 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
 #undef BV
 /*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-struct bio_set {
-        mempool_t *bio_pool;
-        mempool_t *bvec_pools[BIOVEC_NR_POOLS];
-};
-/*
 * fs_bio_set is the bio_set containing bio and iovec memory pools used by
 * IO code that does not need private memory pools.
 */
-static struct bio_set *fs_bio_set;
+struct bio_set *fs_bio_set;
+unsigned int bvec_nr_vecs(unsigned short idx)
+{
+        return bvec_slabs[idx].nr_vecs;
+}
-static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
        struct bio_vec *bvl;
@@ -117,6 +96,9 @@ void bio_free(struct bio *bio, struct bio_set *bio_set)
                mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
        }
+        if (bio_integrity(bio))
+                bio_integrity_free(bio, bio_set);
        mempool_free(bio, bio_set->bio_pool);
 }
@@ -275,9 +257,19 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
        struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
-        if (b) {
+        if (!b)
-                b->bi_destructor = bio_fs_destructor;
+                return NULL;
-                __bio_clone(b, bio);
+        b->bi_destructor = bio_fs_destructor;
+        __bio_clone(b, bio);
+        if (bio_integrity(bio)) {
+                int ret;
+                ret = bio_integrity_clone(b, bio, fs_bio_set);
+                if (ret < 0)
+                        return NULL;
        }
        return b;
@@ -333,10 +325,19 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
                if (page == prev->bv_page &&
                    offset == prev->bv_offset + prev->bv_len) {
                        prev->bv_len += len;
-                        if (q->merge_bvec_fn &&
-                            q->merge_bvec_fn(q, bio, prev) < len) {
+                        if (q->merge_bvec_fn) {
-                                prev->bv_len -= len;
+                                struct bvec_merge_data bvm = {
-                                return 0;
+                                        .bi_bdev = bio->bi_bdev,
+                                        .bi_sector = bio->bi_sector,
+                                        .bi_size = bio->bi_size,
+                                        .bi_rw = bio->bi_rw,
+                                };
+                                if (q->merge_bvec_fn(q, &bvm, prev) < len) {
+                                        prev->bv_len -= len;
+                                        return 0;
+                                }
                        }
                        goto done;
@@ -377,11 +378,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
         * queue to get further control
         */
        if (q->merge_bvec_fn) {
+                struct bvec_merge_data bvm = {
+                        .bi_bdev = bio->bi_bdev,
+                        .bi_sector = bio->bi_sector,
+                        .bi_size = bio->bi_size,
+                        .bi_rw = bio->bi_rw,
+                };
                /*
                 * merge_bvec_fn() returns number of bytes it can accept
                 * at this offset
                 */
-                if (q->merge_bvec_fn(q, bio, bvec) < len) {
+                if (q->merge_bvec_fn(q, &bvm, bvec) < len) {
                        bvec->bv_page = NULL;
                        bvec->bv_len = 0;
                        bvec->bv_offset = 0;
@@ -1249,6 +1257,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
        bp->bio1.bi_private = bi;
        bp->bio2.bi_private = pool;
+        if (bio_integrity(bi))
+                bio_integrity_split(bi, bp, first_sectors);
        return bp;
 }
@@ -1290,6 +1301,7 @@ void bioset_free(struct bio_set *bs)
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
+        bioset_integrity_free(bs);
        biovec_free_pools(bs);
        kfree(bs);
@@ -1306,6 +1318,9 @@ struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
        if (!bs->bio_pool)
                goto bad;
+        if (bioset_integrity_create(bs, bio_pool_size))
+                goto bad;
        if (!biovec_create_pools(bs, bvec_pool_size))
                return bs;
@@ -1332,6 +1347,7 @@ static int __init init_bio(void)
 {
        bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+        bio_integrity_init_slab();
        biovec_init_slabs();
        fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 68e510b88457..3cb7cda3d780 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -373,6 +373,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
                        return -ENXIO;
                new = container_of(kobj, struct cdev, kobj);
                spin_lock(&cdev_lock);
+                /* Check i_cdev again in case somebody beat us to it while
+                   we dropped the lock. */
                p = inode->i_cdev;
                if (!p) {
                        inode->i_cdev = p = new;
@@ -392,11 +394,8 @@ static int chrdev_open(struct inode *inode, struct file *filp)
                cdev_put(p);
                return -ENXIO;
        }
-        if (filp->f_op->open) {
+        if (filp->f_op->open)
-                lock_kernel();
                ret = filp->f_op->open(inode,filp);
-                unlock_kernel();
-        }
        if (ret)
                cdev_put(p);
        return ret;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 86b4d5f405ae..22857c639df5 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -612,7 +612,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
                if (retval < 0)
                        return (loff_t)retval;
        }
-        return remote_llseek(file, offset, origin);
+        return generic_file_llseek_unlocked(file, offset, origin);
 }
 struct file_system_type cifs_fs_type = {
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebbcf38fd33b..f976f303c196 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -15,6 +15,7 @@
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/spinlock.h>
+#include <linux/smp_lock.h>
 #include <linux/dlm.h>
 #include <linux/dlm_device.h>
@@ -618,13 +619,17 @@ static int device_open(struct inode *inode, struct file *file)
        struct dlm_user_proc *proc;
        struct dlm_ls *ls;
+        lock_kernel();
        ls = dlm_find_lockspace_device(iminor(inode));
-        if (!ls)
+        if (!ls) {
+                unlock_kernel();
                return -ENOENT;
+        }
        proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL);
        if (!proc) {
                dlm_put_lockspace(ls);
+                unlock_kernel();
                return -ENOMEM;
        }
@@ -636,6 +641,7 @@ static int device_open(struct inode *inode, struct file *file)
        spin_lock_init(&proc->locks_spin);
        init_waitqueue_head(&proc->wait);
        file->private_data = proc;
+        unlock_kernel();
        return 0;
 }
@@ -870,6 +876,7 @@ static unsigned int device_poll(struct file *file, poll_table *wait)
 static int ctl_device_open(struct inode *inode, struct file *file)
 {
+        cycle_kernel_lock();
        file->private_data = NULL;
        return 0;
 }
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 2258b8f654a6..24749bf0668f 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/compat.h>
 #include <linux/fs_stack.h>
+#include <linux/smp_lock.h>
 #include "ecryptfs_kernel.h"
 /**
@@ -277,9 +278,11 @@ static int ecryptfs_fasync(int fd, struct file *file, int flag)
        int rc = 0;
        struct file *lower_file = NULL;
+        lock_kernel();
        lower_file = ecryptfs_file_to_lower(file);
        if (lower_file->f_op && lower_file->f_op->fasync)
                rc = lower_file->f_op->fasync(fd, lower_file, flag);
+        unlock_kernel();
        return rc;
 }
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index fda25479af26..3a9ecac8d61f 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -61,7 +61,7 @@ void fat_cache_destroy(void)
 static inline struct fat_cache *fat_cache_alloc(struct inode *inode)
 {
-        return kmem_cache_alloc(fat_cache_cachep, GFP_KERNEL);
+        return kmem_cache_alloc(fat_cache_cachep, GFP_NOFS);
 }
 static inline void fat_cache_free(struct fat_cache *cache)
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 486725ee99ae..34541d06e626 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -472,7 +472,7 @@ static int __fat_readdir(struct inode *inode, struct file *filp, void *dirent,
        loff_t cpos;
        int ret = 0;
-        lock_kernel();
+        lock_super(sb);
        cpos = filp->f_pos;
        /* Fake . and .. for the root directory. */
@@ -654,7 +654,7 @@ FillFailed:
        if (unicode)
                __putname(unicode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return ret;
 }
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 771326b8047e..c672df4036e9 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -11,7 +11,6 @@
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/msdos_fs.h>
-#include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
@@ -242,9 +241,7 @@ void fat_truncate(struct inode *inode)
        nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
-        lock_kernel();
        fat_free(inode, nr_clusters);
-        unlock_kernel();
        fat_flush_inodes(inode->i_sb, inode, NULL);
 }
@@ -310,8 +307,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
        int error = 0;
        unsigned int ia_valid;
-        lock_kernel();
        /*
         * Expand the file. Since inode_setattr() updates ->i_size
         * before calling the ->truncate(), but FAT needs to fill the
@@ -366,7 +361,6 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
        error = inode_setattr(inode, attr);
 out:
-        unlock_kernel();
        return error;
 }
 EXPORT_SYMBOL_GPL(fat_setattr);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4e0a3dd9d677..46a4508ffd2e 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -440,14 +440,13 @@ static void fat_delete_inode(struct inode *inode)
 static void fat_clear_inode(struct inode *inode)
 {
-        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
+        struct super_block *sb = inode->i_sb;
+        struct msdos_sb_info *sbi = MSDOS_SB(sb);
-        lock_kernel();
        spin_lock(&sbi->inode_hash_lock);
        fat_cache_inval_inode(inode);
        hlist_del_init(&MSDOS_I(inode)->i_fat_hash);
        spin_unlock(&sbi->inode_hash_lock);
-        unlock_kernel();
 }
 static void fat_write_super(struct super_block *sb)
@@ -485,7 +484,7 @@ static struct kmem_cache *fat_inode_cachep;
 static struct inode *fat_alloc_inode(struct super_block *sb)
 {
        struct msdos_inode_info *ei;
-        ei = kmem_cache_alloc(fat_inode_cachep, GFP_KERNEL);
+        ei = kmem_cache_alloc(fat_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
        return &ei->vfs_inode;
@@ -567,7 +566,7 @@ retry:
        if (inode->i_ino == MSDOS_ROOT_INO || !i_pos)
                return 0;
-        lock_kernel();
+        lock_super(sb);
        bh = sb_bread(sb, i_pos >> sbi->dir_per_block_bits);
        if (!bh) {
                printk(KERN_ERR "FAT: unable to read inode block "
@@ -579,7 +578,7 @@ retry:
        if (i_pos != MSDOS_I(inode)->i_pos) {
                spin_unlock(&sbi->inode_hash_lock);
                brelse(bh);
-                unlock_kernel();
+                unlock_super(sb);
                goto retry;
        }
@@ -606,7 +605,7 @@ retry:
                err = sync_dirty_buffer(bh);
        brelse(bh);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -736,6 +735,7 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
 static struct dentry *fat_get_parent(struct dentry *child)
 {
+        struct super_block *sb = child->d_sb;
        struct buffer_head *bh;
        struct msdos_dir_entry *de;
        loff_t i_pos;
@@ -743,14 +743,14 @@ static struct dentry *fat_get_parent(struct dentry *child)
        struct inode *inode;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = fat_get_dotdot_entry(child->d_inode, &bh, &de, &i_pos);
        if (err) {
                parent = ERR_PTR(err);
                goto out;
        }
-        inode = fat_build_inode(child->d_sb, de, i_pos);
+        inode = fat_build_inode(sb, de, i_pos);
        brelse(bh);
        if (IS_ERR(inode)) {
                parent = ERR_CAST(inode);
@@ -762,7 +762,7 @@ static struct dentry *fat_get_parent(struct dentry *child)
                parent = ERR_PTR(-ENOMEM);
        }
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return parent;
 }
@@ -1172,6 +1172,12 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        long error;
        char buf[50];
+        /*
+         * GFP_KERNEL is ok here, because while we do hold the
+         * superblock lock, memory pressure can't call back into
+         * the filesystem, since we're only just about to mount
+         * it and have no inodes etc active!
+         */
        sbi = kzalloc(sizeof(struct msdos_sb_info), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index bfd776509a72..330a7d782591 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
 #include <linux/fdtable.h>
 #include <linux/capability.h>
 #include <linux/dnotify.h>
-#include <linux/smp_lock.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/security.h>
@@ -227,7 +226,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
        if (error)
                return error;
-        lock_kernel();
        if ((arg ^ filp->f_flags) & FASYNC) {
                if (filp->f_op && filp->f_op->fasync) {
                        error = filp->f_op->fasync(fd, filp, (arg & FASYNC) != 0);
@@ -238,7 +236,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
        filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
 out:
-        unlock_kernel();
        return error;
 }
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index e1b7d525a066..24dd59450088 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -62,11 +62,11 @@ static loff_t gfs2_llseek(struct file *file, loff_t offset, int origin)
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
                                           &i_gh);
                if (!error) {
-                        error = remote_llseek(file, offset, origin);
+                        error = generic_file_llseek_unlocked(file, offset, origin);
                        gfs2_glock_dq_uninit(&i_gh);
                }
        } else
-                error = remote_llseek(file, offset, origin);
+                error = generic_file_llseek_unlocked(file, offset, origin);
        return error;
 }
diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c
index 05ff4f1d7026..1f7f2956412a 100644
--- a/fs/msdos/namei.c
+++ b/fs/msdos/namei.c
@@ -214,7 +214,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry,
        dentry->d_op = &msdos_dentry_operations;
-        lock_kernel();
+        lock_super(sb);
        res = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
        if (res == -ENOENT)
                goto add;
@@ -232,7 +232,7 @@ add:
        if (dentry)
                dentry->d_op = &msdos_dentry_operations;
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!res)
                return dentry;
        return ERR_PTR(res);
@@ -286,7 +286,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
        unsigned char msdos_name[MSDOS_NAME];
        int err, is_hid;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
                                msdos_name, &MSDOS_SB(sb)->options);
@@ -315,7 +315,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, int mode,
        d_instantiate(dentry, inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
                err = fat_flush_inodes(sb, dir, inode);
        return err;
@@ -324,11 +324,12 @@ out:
 /***** Remove a directory */
 static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
 {
+        struct super_block *sb = dir->i_sb;
        struct inode *inode = dentry->d_inode;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        /*
         * Check whether the directory is not in use, then check
         * whether it is empty.
@@ -349,9 +350,9 @@ static int msdos_rmdir(struct inode *dir, struct dentry *dentry)
        inode->i_ctime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(inode->i_sb, dir, inode);
+                err = fat_flush_inodes(sb, dir, inode);
        return err;
 }
@@ -366,7 +367,7 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct timespec ts;
        int err, is_hid, cluster;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(dentry->d_name.name, dentry->d_name.len,
                                msdos_name, &MSDOS_SB(sb)->options);
@@ -404,14 +405,14 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        d_instantiate(dentry, inode);
-        unlock_kernel();
+        unlock_super(sb);
        fat_flush_inodes(sb, dir, inode);
        return 0;
 out_free:
        fat_free_clusters(dir, cluster);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -419,10 +420,11 @@ out:
 static int msdos_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb= inode->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_find(dir, dentry->d_name.name, dentry->d_name.len, &sinfo);
        if (err)
                goto out;
@@ -434,9 +436,9 @@ static int msdos_unlink(struct inode *dir, struct dentry *dentry)
        inode->i_ctime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(inode->i_sb, dir, inode);
+                err = fat_flush_inodes(sb, dir, inode);
        return err;
 }
@@ -618,10 +620,11 @@ error_inode:
 static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
                        struct inode *new_dir, struct dentry *new_dentry)
 {
+        struct super_block *sb = old_dir->i_sb;
        unsigned char old_msdos_name[MSDOS_NAME], new_msdos_name[MSDOS_NAME];
        int err, is_hid;
-        lock_kernel();
+        lock_super(sb);
        err = msdos_format_name(old_dentry->d_name.name,
                                old_dentry->d_name.len, old_msdos_name,
@@ -640,9 +643,9 @@ static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry,
        err = do_msdos_rename(old_dir, old_msdos_name, old_dentry,
                              new_dir, new_msdos_name, new_dentry, is_hid);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        if (!err)
-                err = fat_flush_inodes(old_dir->i_sb, old_dir, new_dir);
+                err = fat_flush_inodes(sb, old_dir, new_dir);
        return err;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 4fc302c2a0e0..4f6f7635b59c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -750,7 +750,7 @@ struct proc_fs_info {
        const char *str;
 };
-static void show_sb_opts(struct seq_file *m, struct super_block *sb)
+static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 {
        static const struct proc_fs_info fs_info[] = {
                { MS_SYNCHRONOUS, ",sync" },
@@ -764,6 +764,8 @@ static void show_sb_opts(struct seq_file *m, struct super_block *sb)
                if (sb->s_flags & fs_infop->flag)
                        seq_puts(m, fs_infop->str);
        }
+        return security_sb_show_options(m, sb);
 }
 static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
@@ -806,11 +808,14 @@ static int show_vfsmnt(struct seq_file *m, void *v)
        seq_putc(m, ' ');
        show_type(m, mnt->mnt_sb);
        seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
-        show_sb_opts(m, mnt->mnt_sb);
+        err = show_sb_opts(m, mnt->mnt_sb);
+        if (err)
+                goto out;
        show_mnt_opts(m, mnt);
        if (mnt->mnt_sb->s_op->show_options)
                err = mnt->mnt_sb->s_op->show_options(m, mnt);
        seq_puts(m, " 0 0\n");
+out:
        return err;
 }
@@ -865,10 +870,13 @@ static int show_mountinfo(struct seq_file *m, void *v)
        seq_putc(m, ' ');
        mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
        seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
-        show_sb_opts(m, sb);
+        err = show_sb_opts(m, sb);
+        if (err)
+                goto out;
        if (sb->s_op->show_options)
                err = sb->s_op->show_options(m, mnt);
        seq_putc(m, '\n');
+out:
        return err;
 }
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 2b145de45b39..6a7d901f1936 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/ncp_fs.h>
 #include "ncplib_kernel.h"
@@ -281,9 +282,18 @@ static int ncp_release(struct inode *inode, struct file *file) {
        return 0;
 }
+static loff_t ncp_remote_llseek(struct file *file, loff_t offset, int origin)
+{       /* ncpfs .llseek handler: runs the generic unlocked seek under lock_kernel() */
+        loff_t ret;     /* result of generic_file_llseek_unlocked(), returned unchanged */
+        lock_kernel();
+        ret = generic_file_llseek_unlocked(file, offset, origin);
+        unlock_kernel();        /* dropped before returning; no early-return paths */
+        return ret;
+}
 const struct file_operations ncp_file_operations =
 {
-        .llseek         = remote_llseek,
+        .llseek         = ncp_remote_llseek,
        .read           = ncp_file_read,
        .write          = ncp_file_write,
        .ioctl          = ncp_ioctl,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d84a3d8f32af..4e98a56a1777 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,6 +170,7 @@ force_reval:
 static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
 {
+        loff_t loff;
        /* origin == SEEK_END => we must revalidate the cached file length */
        if (origin == SEEK_END) {
                struct inode *inode = filp->f_mapping->host;
@@ -177,7 +178,10 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin)
                if (retval < 0)
                        return (loff_t)retval;
        }
-        return remote_llseek(filp, offset, origin);
+        lock_kernel();  /* BKL needed? */
+        loff = generic_file_llseek_unlocked(filp, offset, origin);
+        unlock_kernel();
+        return loff;
 }
 /*
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index c021280dd462..bd7e0f3acfc7 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
+#include <linux/smp_lock.h>
 #include <linux/reboot.h>
 #include <asm/uaccess.h>
@@ -619,10 +620,12 @@ static int ocfs2_control_open(struct inode *inode, struct file *file)
                return -ENOMEM;
        p->op_this_node = -1;
+        lock_kernel();
        mutex_lock(&ocfs2_control_lock);
        file->private_data = p;
        list_add(&p->op_list, &ocfs2_control_private_list);
        mutex_unlock(&ocfs2_control_lock);
+        unlock_kernel();
        return 0;
 }
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 3b455371e7ff..58c3e6a8e15e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -233,7 +233,7 @@ static int check_mem_permission(struct task_struct *task)
         */
        if (task->parent == current && (task->ptrace & PT_PTRACED) &&
            task_is_stopped_or_traced(task) &&
-            ptrace_may_attach(task))
+            ptrace_may_access(task, PTRACE_MODE_ATTACH))
                return 0;
        /*
@@ -251,7 +251,8 @@ struct mm_struct *mm_for_maps(struct task_struct *task)
        task_lock(task);
        if (task->mm != mm)
                goto out;
-        if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
+        if (task->mm != current->mm &&
+            __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
                goto out;
        task_unlock(task);
        return mm;
@@ -518,7 +519,7 @@ static int proc_fd_access_allowed(struct inode *inode)
         */
        task = get_proc_task(inode);
        if (task) {
-                allowed = ptrace_may_attach(task);
+                allowed = ptrace_may_access(task, PTRACE_MODE_READ);
                put_task_struct(task);
        }
        return allowed;
@@ -904,7 +905,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
        if (!task)
                goto out_no_task;
-        if (!ptrace_may_attach(task))
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out;
        ret = -ENOMEM;
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 7e277f2ad466..c652d469dc08 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -123,6 +123,11 @@ static int uptime_read_proc(char *page, char **start, off_t off,
        return proc_calc_metrics(page, start, off, count, eof, len);
 }
+int __attribute__((weak)) arch_report_meminfo(char *page)
+{       /* weak default; a non-weak definition elsewhere would take precedence at link time */
+        return 0;       /* meminfo_read_proc() adds this return value to its running len */
+}
 static int meminfo_read_proc(char *page, char **start, off_t off,
                                 int count, int *eof, void *data)
 {
@@ -221,6 +226,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
                len += hugetlb_report_meminfo(page + len);
+        len += arch_report_meminfo(page + len);
        return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }
@@ -472,6 +479,13 @@ static const struct file_operations proc_vmalloc_operations = {
 };
 #endif
+#ifndef arch_irq_stat_cpu
+#define arch_irq_stat_cpu(cpu) 0
+#endif
+#ifndef arch_irq_stat
+#define arch_irq_stat() 0
+#endif
 static int show_stat(struct seq_file *p, void *v)
 {
        int i;
@@ -509,7 +523,9 @@ static int show_stat(struct seq_file *p, void *v)
                        sum += temp;
                        per_irq_sum[j] += temp;
                }
+                sum += arch_irq_stat_cpu(i);
        }
+        sum += arch_irq_stat();
        seq_printf(p, "cpu  %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
                (unsigned long long)cputime64_to_clock_t(user),
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c492449f3b45..164bd9f9ede3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -210,7 +210,7 @@ static int show_map(struct seq_file *m, void *v)
        dev_t dev = 0;
        int len;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        if (file) {
@@ -646,7 +646,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
                goto out;
        ret = -EACCES;
-        if (!ptrace_may_attach(task))
+        if (!ptrace_may_access(task, PTRACE_MODE_READ))
                goto out_task;
        ret = -EINVAL;
@@ -747,7 +747,7 @@ static int show_numa_map_checked(struct seq_file *m, void *v)
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        return show_numa_map(m, v);
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 4b4f9cc2f186..5d84e7121df8 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -113,7 +113,7 @@ static int show_map(struct seq_file *m, void *_vml)
        struct proc_maps_private *priv = m->private;
        struct task_struct *task = priv->task;
-        if (maps_protect && !ptrace_may_attach(task))
+        if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ))
                return -EACCES;
        return nommu_vma_show(m, vml->vma);
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 9590b9024300..78f613cb9c76 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -45,6 +45,7 @@ const struct file_operations ramfs_file_operations = {
        .mmap           = generic_file_mmap,
        .fsync          = simple_sync_file,
        .splice_read    = generic_file_splice_read,
+        .splice_write   = generic_file_splice_write,
        .llseek         = generic_file_llseek,
 };
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 0989bc2c2f69..52312ec93ff4 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,6 +43,7 @@ const struct file_operations ramfs_file_operations = {
        .aio_write              = generic_file_aio_write,
        .fsync                  = simple_sync_file,
        .splice_read            = generic_file_splice_read,
+        .splice_write           = generic_file_splice_write,
        .llseek                 = generic_file_llseek,
 };
diff --git a/fs/read_write.c b/fs/read_write.c
index f0d1240a5c69..9ba495d5a29b 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -31,12 +31,12 @@ const struct file_operations generic_ro_fops = {
 EXPORT_SYMBOL(generic_ro_fops);
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
+loff_t
+generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
 {
        loff_t retval;
        struct inode *inode = file->f_mapping->host;
-        mutex_lock(&inode->i_mutex);
        switch (origin) {
                case SEEK_END:
                        offset += inode->i_size;
@@ -46,42 +46,26 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
        }
        retval = -EINVAL;
        if (offset>=0 && offset<=inode->i_sb->s_maxbytes) {
+                /* Special lock needed here? */
                if (offset != file->f_pos) {
                        file->f_pos = offset;
                        file->f_version = 0;
                }
                retval = offset;
        }
-        mutex_unlock(&inode->i_mutex);
        return retval;
 }
+EXPORT_SYMBOL(generic_file_llseek_unlocked);
-EXPORT_SYMBOL(generic_file_llseek);
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
-loff_t remote_llseek(struct file *file, loff_t offset, int origin)
 {
-        loff_t retval;
+        loff_t n;
+        mutex_lock(&file->f_dentry->d_inode->i_mutex);
-        lock_kernel();
+        n = generic_file_llseek_unlocked(file, offset, origin);
-        switch (origin) {
+        mutex_unlock(&file->f_dentry->d_inode->i_mutex);
-                case SEEK_END:
+        return n;
-                        offset += i_size_read(file->f_path.dentry->d_inode);
-                        break;
-                case SEEK_CUR:
-                        offset += file->f_pos;
-        }
-        retval = -EINVAL;
-        if (offset>=0 && offset<=file->f_path.dentry->d_inode->i_sb->s_maxbytes) {
-                if (offset != file->f_pos) {
-                        file->f_pos = offset;
-                        file->f_version = 0;
-                }
-                retval = offset;
-        }
-        unlock_kernel();
-        return retval;
 }
-EXPORT_SYMBOL(remote_llseek);
+EXPORT_SYMBOL(generic_file_llseek);
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index efbe29af3d7a..2294783320cb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -422,9 +422,18 @@ smb_file_permission(struct inode *inode, int mask, struct nameidata *nd)
        return error;
 }
+static loff_t smb_remote_llseek(struct file *file, loff_t offset, int origin)
+{       /* smbfs .llseek handler: runs the generic unlocked seek under lock_kernel() */
+        loff_t ret;     /* result of generic_file_llseek_unlocked(), returned unchanged */
+        lock_kernel();
+        ret = generic_file_llseek_unlocked(file, offset, origin);
+        unlock_kernel();        /* dropped before returning; no early-return paths */
+        return ret;
+}
 const struct file_operations smb_file_operations =
 {
-        .llseek         = remote_llseek,
+        .llseek         = smb_remote_llseek,
        .read           = do_sync_read,
        .aio_read       = smb_file_aio_read,
        .write          = do_sync_write,
diff --git a/fs/splice.c b/fs/splice.c
index aa5f6f60b305..399442179d89 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -379,13 +379,22 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
                                lock_page(page);
                        /*
-                         * page was truncated, stop here. if this isn't the
+                         * Page was truncated, or invalidated by the
-                         * first page, we'll just complete what we already
+                         * filesystem.  Redo the find/create, but this time the
-                         * added
+                         * page is kept locked, so there's no chance of another
+                         * race with truncate/invalidate.
                         */
                        if (!page->mapping) {
                                unlock_page(page);
-                                break;
+                                page = find_or_create_page(mapping, index,
+                                                mapping_gfp_mask(mapping));
+                                if (!page) {
+                                        error = -ENOMEM;
+                                        break;
+                                }
+                                page_cache_release(pages[page_nr]);
+                                pages[page_nr] = page;
                        }
                        /*
                         * page was already under io and is now done, great
diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c
index a3522727ea5b..b546ba69be82 100644
--- a/fs/vfat/namei.c
+++ b/fs/vfat/namei.c
@@ -645,7 +645,7 @@ static int vfat_add_entry(struct inode *dir, struct qstr *qname, int is_dir,
        if (len == 0)
                return -ENOENT;
-        slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_KERNEL);
+        slots = kmalloc(sizeof(*slots) * MSDOS_SLOTS, GFP_NOFS);
        if (slots == NULL)
                return -ENOMEM;
@@ -687,7 +687,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
        struct dentry *alias;
        int err, table;
-        lock_kernel();
+        lock_super(sb);
        table = (MSDOS_SB(sb)->options.name_check == 's') ? 2 : 0;
        dentry->d_op = &vfat_dentry_ops[table];
@@ -699,7 +699,7 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
        inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos);
        brelse(sinfo.bh);
        if (IS_ERR(inode)) {
-                unlock_kernel();
+                unlock_super(sb);
                return ERR_CAST(inode);
        }
        alias = d_find_alias(inode);
@@ -708,13 +708,13 @@ static struct dentry *vfat_lookup(struct inode *dir, struct dentry *dentry,
                        dput(alias);
                else {
                        iput(inode);
-                        unlock_kernel();
+                        unlock_super(sb);
                        return alias;
                }
        }
 error:
-        unlock_kernel();
+        unlock_super(sb);
        dentry->d_op = &vfat_dentry_ops[table];
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        dentry = d_splice_alias(inode, dentry);
@@ -734,7 +734,7 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
        struct timespec ts;
        int err;
-        lock_kernel();
+        lock_super(sb);
        ts = CURRENT_TIME_SEC;
        err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo);
@@ -755,17 +755,18 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, int mode,
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        d_instantiate(dentry, inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
 static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = fat_dir_empty(inode);
        if (err)
@@ -783,7 +784,7 @@ static int vfat_rmdir(struct inode *dir, struct dentry *dentry)
        inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -791,10 +792,11 @@ out:
 static int vfat_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
+        struct super_block *sb = dir->i_sb;
        struct fat_slot_info sinfo;
        int err;
-        lock_kernel();
+        lock_super(sb);
        err = vfat_find(dir, &dentry->d_name, &sinfo);
        if (err)
@@ -807,7 +809,7 @@ static int vfat_unlink(struct inode *dir, struct dentry *dentry)
        inode->i_mtime = inode->i_atime = CURRENT_TIME_SEC;
        fat_detach(inode);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -820,7 +822,7 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct timespec ts;
        int err, cluster;
-        lock_kernel();
+        lock_super(sb);
        ts = CURRENT_TIME_SEC;
        cluster = fat_alloc_new_dir(dir, &ts);
@@ -849,13 +851,13 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        dentry->d_time = dentry->d_parent->d_inode->i_version;
        d_instantiate(dentry, inode);
-        unlock_kernel();
+        unlock_super(sb);
        return 0;
 out_free:
        fat_free_clusters(dir, cluster);
 out:
-        unlock_kernel();
+        unlock_super(sb);
        return err;
 }
@@ -869,11 +871,12 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct timespec ts;
        loff_t dotdot_i_pos, new_i_pos;
        int err, is_dir, update_dotdot, corrupt = 0;
+        struct super_block *sb = old_dir->i_sb;
        old_sinfo.bh = sinfo.bh = dotdot_bh = NULL;
        old_inode = old_dentry->d_inode;
        new_inode = new_dentry->d_inode;
-        lock_kernel();
+        lock_super(sb);
        err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo);
        if (err)
                goto out;
@@ -951,7 +954,7 @@ out:
        brelse(sinfo.bh);
        brelse(dotdot_bh);
        brelse(old_sinfo.bh);
-        unlock_kernel();
+        unlock_super(sb);
        return err;
author	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2008-07-15 01:44:51 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2008-07-15 01:44:51 -0400
commit	43d2548bb2ef7e6d753f91468a746784041e522d (patch)
tree	77d13fcd48fd998393abb825ec36e2b732684a73 /fs
parent	585583d95c5660973bc0cf64add517b040acd8a4 (diff)
parent	85082fd7cbe3173198aac0eb5e85ab1edcc6352c (diff)