Merge /spare/repo/linux-2.6/

author: Jeff Garzik <jgarzik@pobox.com> 2005-09-08 05:39:55 -0400
committer: Jeff Garzik <jgarzik@pobox.com> 2005-09-08 05:39:55 -0400
commit: c324b44c34050cf2a9b58830e11c974806bd85d8 (patch)
tree: 3ac45a783221283925cd698334a8f5e7dd4c1df8 /fs
parent: 2fcf522509cceea524b6e7ece8fd6759b682175a (diff)
parent: caf39e87cc1182f7dae84eefc43ca14d54c78ef9 (diff)
119 files changed, 4024 insertions, 1794 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index e54be7058359..5e817902cb3b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -382,10 +382,8 @@ config QUOTA
          usage (also called disk quotas). Currently, it works for the
          ext2, ext3, and reiserfs file systems. ext3 also supports journalled
          quotas for which you don't need to run quotacheck(8) after an unclean
-          shutdown. You need additional software in order to use quota support
+          shutdown.
-          (you can download sources from
+          For further details, read the Quota mini-HOWTO, available from
-          <http://www.sf.net/projects/linuxquota/>). For further details, read
-          the Quota mini-HOWTO, available from
          <http://www.tldp.org/docs.html#howto>, or the documentation provided
          with the quota tools. Probably the quota support is only useful for
          multi user systems. If unsure, say N.
@@ -403,8 +401,7 @@ config QFMT_V2
        depends on QUOTA
        help
          This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-          need this functionality say Y here. Note that you will need recent
+          need this functionality say Y here.
-          quota utilities (>= 3.01) for new quota format with this kernel.
 config QUOTACTL
        bool
@@ -783,28 +780,6 @@ config SYSFS
        Designers of embedded systems may wish to say N here to conserve space.
-config DEVPTS_FS_XATTR
-        bool "/dev/pts Extended Attributes"
-        depends on UNIX98_PTYS
-        help
-          Extended attributes are name:value pairs associated with inodes by
-          the kernel or by users (see the attr(5) manual page, or visit
-          <http://acl.bestbits.at/> for details).
-          If unsure, say N.
-config DEVPTS_FS_SECURITY
-        bool "/dev/pts Security Labels"
-        depends on DEVPTS_FS_XATTR
-        help
-          Security labels support alternative access control models
-          implemented by security modules like SELinux.  This option
-          enables an extended attribute handler for file security
-          labels in the /dev/pts filesystem.
-          If you are not using a security module that requires using
-          extended attributes for file security labels, say N.
 config TMPFS
        bool "Virtual memory file system support (former shm fs)"
        help
@@ -817,27 +792,6 @@ config TMPFS
          See <file:Documentation/filesystems/tmpfs.txt> for details.
-config TMPFS_XATTR
-        bool "tmpfs Extended Attributes"
-        depends on TMPFS
-        help
-          Extended attributes are name:value pairs associated with inodes by
-          the kernel or by users (see the attr(5) manual page, or visit
-          <http://acl.bestbits.at/> for details).
-          If unsure, say N.
-config TMPFS_SECURITY
-        bool "tmpfs Security Labels"
-        depends on TMPFS_XATTR
-        help
-          Security labels support alternative access control models
-          implemented by security modules like SELinux.  This option
-          enables an extended attribute handler for file security
-          labels in the tmpfs filesystem.
-          If you are not using a security module that requires using
-          extended attributes for file security labels, say N.
 config HUGETLBFS
        bool "HugeTLB file system support"
        depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || X86_64 || BROKEN
@@ -859,6 +813,18 @@ config RAMFS
          To compile this as a module, choose M here: the module will be called
          ramfs.
+config RELAYFS_FS
+        tristate "Relayfs file system support"
+        ---help---
+          Relayfs is a high-speed data relay filesystem designed to provide
+          an efficient mechanism for tools and facilities to relay large
+          amounts of data from kernel space to user space.
+          To compile this code as a module, choose M here: the module will be
+          called relayfs.
+          If unsure, say N.
 endmenu
 menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index cf95eb894fd5..15158309dee4 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -90,6 +90,7 @@ obj-$(CONFIG_AUTOFS_FS)		+= autofs/
 obj-$(CONFIG_AUTOFS4_FS)        += autofs4/
 obj-$(CONFIG_ADFS_FS)           += adfs/
 obj-$(CONFIG_UDF_FS)            += udf/
+obj-$(CONFIG_RELAYFS_FS)        += relayfs/
 obj-$(CONFIG_SUN_OPENPROMFS)    += openpromfs/
 obj-$(CONFIG_JFS_FS)            += jfs/
 obj-$(CONFIG_XFS_FS)            += xfs/
diff --git a/fs/aio.c b/fs/aio.c
index 06d7d4390fe7..4f641abac3c0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -567,6 +567,10 @@ static void use_mm(struct mm_struct *mm)
        atomic_inc(&mm->mm_count);
        tsk->mm = mm;
        tsk->active_mm = mm;
+        /*
+         * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise
+         * it won't work. Update it accordingly if you change it here
+         */
        activate_mm(active_mm, mm);
        task_unlock(tsk);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index c8998dc66882..7974efa107bc 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -520,7 +520,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
                down_write(&current->mm->mmap_sem);
-                textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_SHARED, 0);
+                textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE, 0);
                up_write(&current->mm->mmap_sem);
                if (!textpos  || textpos >= (unsigned long) -4096) {
                        if (!textpos)
diff --git a/fs/bio.c b/fs/bio.c
index 1f2d4649b188..a7d4fd3a3299 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <scsi/sg.h>            /* for struct sg_iovec */
 #define BIO_POOL_SIZE 256
@@ -104,18 +105,22 @@ static inline struct bio_vec *bvec_alloc_bs(unsigned int __nocast gfp_mask, int
        return bvl;
 }
-/*
+void bio_free(struct bio *bio, struct bio_set *bio_set)
- * default destructor for a bio allocated with bio_alloc_bioset()
- */
-static void bio_destructor(struct bio *bio)
 {
        const int pool_idx = BIO_POOL_IDX(bio);
-        struct bio_set *bs = bio->bi_set;
        BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
-        mempool_free(bio->bi_io_vec, bs->bvec_pools[pool_idx]);
+        mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
-        mempool_free(bio, bs->bio_pool);
+        mempool_free(bio, bio_set->bio_pool);
+}
+/*
+ * default destructor for a bio allocated with bio_alloc_bioset()
+ */
+static void bio_fs_destructor(struct bio *bio)
+{
+        bio_free(bio, fs_bio_set);
 }
 inline void bio_init(struct bio *bio)
@@ -171,8 +176,6 @@ struct bio *bio_alloc_bioset(unsigned int __nocast gfp_mask, int nr_iovecs, stru
                        bio->bi_max_vecs = bvec_slabs[idx].nr_vecs;
                }
                bio->bi_io_vec = bvl;
-                bio->bi_destructor = bio_destructor;
-                bio->bi_set = bs;
        }
 out:
        return bio;
@@ -180,7 +183,12 @@ out:
 struct bio *bio_alloc(unsigned int __nocast gfp_mask, int nr_iovecs)
 {
-        return bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+        struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
+        if (bio)
+                bio->bi_destructor = bio_fs_destructor;
+        return bio;
 }
 void zero_fill_bio(struct bio *bio)
@@ -273,8 +281,10 @@ struct bio *bio_clone(struct bio *bio, unsigned int __nocast gfp_mask)
 {
        struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
-        if (b)
+        if (b) {
+                b->bi_destructor = bio_fs_destructor;
                __bio_clone(b, bio);
+        }
        return b;
 }
@@ -546,22 +556,34 @@ out_bmd:
        return ERR_PTR(ret);
 }
-static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
+static struct bio *__bio_map_user_iov(request_queue_t *q,
-                                  unsigned long uaddr, unsigned int len,
+                                      struct block_device *bdev,
-                                  int write_to_vm)
+                                      struct sg_iovec *iov, int iov_count,
+                                      int write_to_vm)
 {
-        unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        int i, j;
-        unsigned long start = uaddr >> PAGE_SHIFT;
+        int nr_pages = 0;
-        const int nr_pages = end - start;
-        int ret, offset, i;
        struct page **pages;
        struct bio *bio;
+        int cur_page = 0;
+        int ret, offset;
-        /*
+        for (i = 0; i < iov_count; i++) {
-         * transfer and buffer must be aligned to at least hardsector
+                unsigned long uaddr = (unsigned long)iov[i].iov_base;
-         * size for now, in the future we can relax this restriction
+                unsigned long len = iov[i].iov_len;
-         */
+                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-        if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
+                unsigned long start = uaddr >> PAGE_SHIFT;
+                nr_pages += end - start;
+                /*
+                 * transfer and buffer must be aligned to at least hardsector
+                 * size for now, in the future we can relax this restriction
+                 */
+                if ((uaddr & queue_dma_alignment(q)) || (len & queue_dma_alignment(q)))
+                        return ERR_PTR(-EINVAL);
+        }
+        if (!nr_pages)
                return ERR_PTR(-EINVAL);
        bio = bio_alloc(GFP_KERNEL, nr_pages);
@@ -573,42 +595,54 @@ static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
        if (!pages)
                goto out;
-        down_read(&current->mm->mmap_sem);
+        memset(pages, 0, nr_pages * sizeof(struct page *));
-        ret = get_user_pages(current, current->mm, uaddr, nr_pages,
-                                                write_to_vm, 0, pages, NULL);
+        for (i = 0; i < iov_count; i++) {
-        up_read(&current->mm->mmap_sem);
+                unsigned long uaddr = (unsigned long)iov[i].iov_base;
+                unsigned long len = iov[i].iov_len;
-        if (ret < nr_pages)
+                unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-                goto out;
+                unsigned long start = uaddr >> PAGE_SHIFT;
+                const int local_nr_pages = end - start;
-        bio->bi_bdev = bdev;
+                const int page_limit = cur_page + local_nr_pages;
+                
-        offset = uaddr & ~PAGE_MASK;
+                down_read(&current->mm->mmap_sem);
-        for (i = 0; i < nr_pages; i++) {
+                ret = get_user_pages(current, current->mm, uaddr,
-                unsigned int bytes = PAGE_SIZE - offset;
+                                     local_nr_pages,
+                                     write_to_vm, 0, &pages[cur_page], NULL);
-                if (len <= 0)
+                up_read(&current->mm->mmap_sem);
-                        break;
+                if (ret < local_nr_pages)
-                if (bytes > len)
+                        goto out_unmap;
-                        bytes = len;
+                offset = uaddr & ~PAGE_MASK;
+                for (j = cur_page; j < page_limit; j++) {
+                        unsigned int bytes = PAGE_SIZE - offset;
+                        if (len <= 0)
+                                break;
+                        
+                        if (bytes > len)
+                                bytes = len;
+                        /*
+                         * sorry...
+                         */
+                        if (__bio_add_page(q, bio, pages[j], bytes, offset) < bytes)
+                                break;
+                        len -= bytes;
+                        offset = 0;
+                }
+                cur_page = j;
                /*
-                 * sorry...
+                 * release the pages we didn't map into the bio, if any
                 */
-                if (__bio_add_page(q, bio, pages[i], bytes, offset) < bytes)
+                while (j < page_limit)
-                        break;
+                        page_cache_release(pages[j++]);
-                len -= bytes;
-                offset = 0;
        }
-        /*
-         * release the pages we didn't map into the bio, if any
-         */
-        while (i < nr_pages)
-                page_cache_release(pages[i++]);
        kfree(pages);
        /*
@@ -617,9 +651,17 @@ static struct bio *__bio_map_user(request_queue_t *q, struct block_device *bdev,
        if (!write_to_vm)
                bio->bi_rw |= (1 << BIO_RW);
+        bio->bi_bdev = bdev;
        bio->bi_flags |= (1 << BIO_USER_MAPPED);
        return bio;
-out:
+ out_unmap:
+        for (i = 0; i < nr_pages; i++) {
+                if(!pages[i])
+                        break;
+                page_cache_release(pages[i]);
+        }
+ out:
        kfree(pages);
        bio_put(bio);
        return ERR_PTR(ret);
@@ -639,9 +681,33 @@ out:
 struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
                         unsigned long uaddr, unsigned int len, int write_to_vm)
 {
+        struct sg_iovec iov;
+        iov.iov_base = (__user void *)uaddr;
+        iov.iov_len = len;
+        return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm);
+}
+/**
+ *      bio_map_user_iov - map user sg_iovec table into bio
+ *      @q: the request_queue_t for the bio
+ *      @bdev: destination block device
+ *      @iov:   the iovec.
+ *      @iov_count: number of elements in the iovec
+ *      @write_to_vm: bool indicating writing to pages or not
+ *
+ *      Map the user space buffers described by @iov into a bio suitable for
+ *      io to a block device. Returns an error pointer in case of error.
+ */
+struct bio *bio_map_user_iov(request_queue_t *q, struct block_device *bdev,
+                             struct sg_iovec *iov, int iov_count,
+                             int write_to_vm)
+{
        struct bio *bio;
+        int len = 0, i;
-        bio = __bio_map_user(q, bdev, uaddr, len, write_to_vm);
+        bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm);
        if (IS_ERR(bio))
                return bio;
@@ -654,6 +720,9 @@ struct bio *bio_map_user(request_queue_t *q, struct block_device *bdev,
         */
        bio_get(bio);
+        for (i = 0; i < iov_count; i++)
+                len += iov[i].iov_len;
        if (bio->bi_size == len)
                return bio;
@@ -698,6 +767,82 @@ void bio_unmap_user(struct bio *bio)
        bio_put(bio);
 }
+static int bio_map_kern_endio(struct bio *bio, unsigned int bytes_done, int err)
+{
+        if (bio->bi_size)
+                return 1;
+        bio_put(bio);
+        return 0;
+}
+static struct bio *__bio_map_kern(request_queue_t *q, void *data,
+                                  unsigned int len, unsigned int gfp_mask)
+{
+        unsigned long kaddr = (unsigned long)data;
+        unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+        unsigned long start = kaddr >> PAGE_SHIFT;
+        const int nr_pages = end - start;
+        int offset, i;
+        struct bio *bio;
+        bio = bio_alloc(gfp_mask, nr_pages);
+        if (!bio)
+                return ERR_PTR(-ENOMEM);
+        offset = offset_in_page(kaddr);
+        for (i = 0; i < nr_pages; i++) {
+                unsigned int bytes = PAGE_SIZE - offset;
+                if (len <= 0)
+                        break;
+                if (bytes > len)
+                        bytes = len;
+                if (__bio_add_page(q, bio, virt_to_page(data), bytes,
+                                   offset) < bytes)
+                        break;
+                data += bytes;
+                len -= bytes;
+                offset = 0;
+        }
+        bio->bi_end_io = bio_map_kern_endio;
+        return bio;
+}
+/**
+ *      bio_map_kern    -       map kernel address into bio
+ *      @q: the request_queue_t for the bio
+ *      @data: pointer to buffer to map
+ *      @len: length in bytes
+ *      @gfp_mask: allocation flags for bio allocation
+ *
+ *      Map the kernel address into a bio suitable for io to a block
+ *      device. Returns an error pointer in case of error.
+ */
+struct bio *bio_map_kern(request_queue_t *q, void *data, unsigned int len,
+                         unsigned int gfp_mask)
+{
+        struct bio *bio;
+        bio = __bio_map_kern(q, data, len, gfp_mask);
+        if (IS_ERR(bio))
+                return bio;
+        if (bio->bi_size == len)
+                return bio;
+        /*
+         * Don't support partial mappings.
+         */
+        bio_put(bio);
+        return ERR_PTR(-EINVAL);
+}
 /*
 * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
 * for performing direct-IO in BIOs.
@@ -1075,6 +1220,7 @@ subsys_initcall(init_bio);
 EXPORT_SYMBOL(bio_alloc);
 EXPORT_SYMBOL(bio_put);
+EXPORT_SYMBOL(bio_free);
 EXPORT_SYMBOL(bio_endio);
 EXPORT_SYMBOL(bio_init);
 EXPORT_SYMBOL(__bio_clone);
@@ -1085,6 +1231,7 @@ EXPORT_SYMBOL(bio_add_page);
 EXPORT_SYMBOL(bio_get_nr_vecs);
 EXPORT_SYMBOL(bio_map_user);
 EXPORT_SYMBOL(bio_unmap_user);
+EXPORT_SYMBOL(bio_map_kern);
 EXPORT_SYMBOL(bio_pair_release);
 EXPORT_SYMBOL(bio_split);
 EXPORT_SYMBOL(bio_split_pool);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6a25d7df89b1..1c62203a4906 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -917,8 +917,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
                                 * contents - it is a noop if I/O is still in
                                 * flight on potentially older contents.
                                 */
-                                wait_on_buffer(bh);
+                                ll_rw_block(SWRITE, 1, &bh);
-                                ll_rw_block(WRITE, 1, &bh);
                                brelse(bh);
                                spin_lock(lock);
                        }
@@ -2793,21 +2792,22 @@ int submit_bh(int rw, struct buffer_head * bh)
 /**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
- * ll_rw_block() takes an array of pointers to &struct buffer_heads,
+ * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
- * and requests an I/O operation on them, either a %READ or a %WRITE.
+ * requests an I/O operation on them, either a %READ or a %WRITE.  The third
- * The third %READA option is described in the documentation for
+ * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
- * generic_make_request() which ll_rw_block() calls.
+ * are sent to disk. The fourth %READA option is described in the documentation
+ * for generic_make_request() which ll_rw_block() calls.
 *
 * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit), any buffer that appears to be clean when doing a
+ * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
- * write request, and any buffer that appears to be up-to-date when doing
+ * clean when doing a write request, and any buffer that appears to be
- * read request.  Further it marks as clean buffers that are processed for
+ * up-to-date when doing a read request.  Further it marks as clean buffers that
- * writing (the buffer cache won't assume that they are actually clean until
+ * are processed for writing (the buffer cache won't assume that they are
- * the buffer gets unlocked).
+ * actually clean until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
@@ -2823,11 +2823,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
        for (i = 0; i < nr; i++) {
                struct buffer_head *bh = bhs[i];
-                if (test_set_buffer_locked(bh))
+                if (rw == SWRITE)
+                        lock_buffer(bh);
+                else if (test_set_buffer_locked(bh))
                        continue;
                get_bh(bh);
-                if (rw == WRITE) {
+                if (rw == WRITE || rw == SWRITE) {
                        if (test_clear_buffer_dirty(bh)) {
                                bh->b_end_io = end_buffer_write_sync;
                                submit_bh(WRITE, bh);
@@ -3046,10 +3048,9 @@ struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
 {
        struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
        if (ret) {
-                preempt_disable();
+                get_cpu_var(bh_accounting).nr++;
-                __get_cpu_var(bh_accounting).nr++;
                recalc_bh_state();
-                preempt_enable();
+                put_cpu_var(bh_accounting);
        }
        return ret;
 }
@@ -3059,10 +3060,9 @@ void free_buffer_head(struct buffer_head *bh)
 {
        BUG_ON(!list_empty(&bh->b_assoc_buffers));
        kmem_cache_free(bh_cachep, bh);
-        preempt_disable();
+        get_cpu_var(bh_accounting).nr--;
-        __get_cpu_var(bh_accounting).nr--;
        recalc_bh_state();
-        preempt_enable();
+        put_cpu_var(bh_accounting);
 }
 EXPORT_SYMBOL(free_buffer_head);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index e568cc47a7f9..3217ac5f6bd7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -836,7 +836,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
                                /* go from value to value + temp_len condensing 
                                double commas to singles. Note that this ends up
                                allocating a few bytes too many, which is ok */
-                                vol->password = kcalloc(1, temp_len, GFP_KERNEL);
+                                vol->password = kzalloc(temp_len, GFP_KERNEL);
                                if(vol->password == NULL) {
                                        printk("CIFS: no memory for pass\n");
                                        return 1;
@@ -851,7 +851,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol)
                                }
                                vol->password[j] = 0;
                        } else {
-                                vol->password = kcalloc(1, temp_len+1, GFP_KERNEL);
+                                vol->password = kzalloc(temp_len+1, GFP_KERNEL);
                                if(vol->password == NULL) {
                                        printk("CIFS: no memory for pass\n");
                                        return 1;
@@ -1317,7 +1317,7 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket,
                sessinit is sent but no second negprot */
                struct rfc1002_session_packet * ses_init_buf;
                struct smb_hdr * smb_buf;
-                ses_init_buf = kcalloc(1, sizeof(struct rfc1002_session_packet), GFP_KERNEL);
+                ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), GFP_KERNEL);
                if(ses_init_buf) {
                        ses_init_buf->trailer.session_req.called_len = 32;
                        rfc1002mangle(ses_init_buf->trailer.session_req.called_name,
@@ -1964,7 +1964,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
 /* We look for obvious messed up bcc or strings in response so we do not go off
   the end since (at least) WIN2K and Windows XP have a major bug in not null
   terminating last Unicode string in response  */
-                                ses->serverOS = kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+                                ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL);
                                if(ses->serverOS == NULL)
                                        goto sesssetup_nomem;
                                cifs_strfromUCS_le(ses->serverOS,
@@ -1976,7 +1976,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                if (remaining_words > 0) {
                                        len = UniStrnlen((wchar_t *)bcc_ptr,
                                                         remaining_words-1);
-                                        ses->serverNOS = kcalloc(1, 2 * (len + 1),GFP_KERNEL);
+                                        ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL);
                                        if(ses->serverNOS == NULL)
                                                goto sesssetup_nomem;
                                        cifs_strfromUCS_le(ses->serverNOS,
@@ -1994,7 +1994,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words);
          /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
                                                ses->serverDomain =
-                                                    kcalloc(1, 2*(len+1),GFP_KERNEL);
+                                                    kzalloc(2*(len+1),GFP_KERNEL);
                                                if(ses->serverDomain == NULL)
                                                        goto sesssetup_nomem;
                                                cifs_strfromUCS_le(ses->serverDomain,
@@ -2005,22 +2005,22 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                        } /* else no more room so create dummy domain string */
                                        else
                                                ses->serverDomain = 
-                                                        kcalloc(1, 2, GFP_KERNEL);
+                                                        kzalloc(2, GFP_KERNEL);
                                } else {        /* no room so create dummy domain and NOS string */
                                        /* if these allocations fail not much we
                                           can do, but better to not fail the
                                           sesssetup itself */
                                        ses->serverDomain =
-                                            kcalloc(1, 2, GFP_KERNEL);
+                                            kzalloc(2, GFP_KERNEL);
                                        ses->serverNOS =
-                                            kcalloc(1, 2, GFP_KERNEL);
+                                            kzalloc(2, GFP_KERNEL);
                                }
                        } else {        /* ASCII */
                                len = strnlen(bcc_ptr, 1024);
                                if (((long) bcc_ptr + len) - (long)
                                    pByteArea(smb_buffer_response)
                                            <= BCC(smb_buffer_response)) {
-                                        ses->serverOS = kcalloc(1, len + 1,GFP_KERNEL);
+                                        ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
                                        if(ses->serverOS == NULL)
                                                goto sesssetup_nomem;
                                        strncpy(ses->serverOS,bcc_ptr, len);
@@ -2030,7 +2030,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                        bcc_ptr++;
                                        len = strnlen(bcc_ptr, 1024);
-                                        ses->serverNOS = kcalloc(1, len + 1,GFP_KERNEL);
+                                        ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
                                        if(ses->serverNOS == NULL)
                                                goto sesssetup_nomem;
                                        strncpy(ses->serverNOS, bcc_ptr, len);
@@ -2039,7 +2039,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                        bcc_ptr++;
                                        len = strnlen(bcc_ptr, 1024);
-                                        ses->serverDomain = kcalloc(1, len + 1,GFP_KERNEL);
+                                        ses->serverDomain = kzalloc(len + 1,GFP_KERNEL);
                                        if(ses->serverDomain == NULL)
                                                goto sesssetup_nomem;
                                        strncpy(ses->serverDomain, bcc_ptr, len);
@@ -2240,7 +2240,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
   the end since (at least) WIN2K and Windows XP have a major bug in not null
   terminating last Unicode string in response  */
                                        ses->serverOS =
-                                            kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+                                            kzalloc(2 * (len + 1), GFP_KERNEL);
                                        cifs_strfromUCS_le(ses->serverOS,
                                                           (wchar_t *)
                                                           bcc_ptr, len,
@@ -2254,7 +2254,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                                 remaining_words
                                                                 - 1);
                                                ses->serverNOS =
-                                                    kcalloc(1, 2 * (len + 1),
+                                                    kzalloc(2 * (len + 1),
                                                            GFP_KERNEL);
                                                cifs_strfromUCS_le(ses->serverNOS,
                                                                   (wchar_t *)bcc_ptr,
@@ -2267,7 +2267,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                if (remaining_words > 0) {
                                                        len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); 
                            /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
-                                                        ses->serverDomain = kcalloc(1, 2*(len+1),GFP_KERNEL);
+                                                        ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL);
                                                        cifs_strfromUCS_le(ses->serverDomain,
                                                             (wchar_t *)bcc_ptr, 
                                 len,
@@ -2278,10 +2278,10 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                } /* else no more room so create dummy domain string */
                                                else
                                                        ses->serverDomain =
-                                                            kcalloc(1, 2,GFP_KERNEL);
+                                                            kzalloc(2,GFP_KERNEL);
                                        } else {        /* no room so create dummy domain and NOS string */
-                                                ses->serverDomain = kcalloc(1, 2, GFP_KERNEL);
+                                                ses->serverDomain = kzalloc(2, GFP_KERNEL);
-                                                ses->serverNOS = kcalloc(1, 2, GFP_KERNEL);
+                                                ses->serverNOS = kzalloc(2, GFP_KERNEL);
                                        }
                                } else {        /* ASCII */
@@ -2289,7 +2289,7 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                        if (((long) bcc_ptr + len) - (long)
                                            pByteArea(smb_buffer_response)
                                            <= BCC(smb_buffer_response)) {
-                                                ses->serverOS = kcalloc(1, len + 1, GFP_KERNEL);
+                                                ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
                                                strncpy(ses->serverOS, bcc_ptr, len);
                                                bcc_ptr += len;
@@ -2297,14 +2297,14 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                bcc_ptr++;
                                                len = strnlen(bcc_ptr, 1024);
-                                                ses->serverNOS = kcalloc(1, len + 1,GFP_KERNEL);
+                                                ses->serverNOS = kzalloc(len + 1,GFP_KERNEL);
                                                strncpy(ses->serverNOS, bcc_ptr, len);
                                                bcc_ptr += len;
                                                bcc_ptr[0] = 0;
                                                bcc_ptr++;
                                                len = strnlen(bcc_ptr, 1024);
-                                                ses->serverDomain = kcalloc(1, len + 1, GFP_KERNEL);
+                                                ses->serverDomain = kzalloc(len + 1, GFP_KERNEL);
                                                strncpy(ses->serverDomain, bcc_ptr, len);
                                                bcc_ptr += len;
                                                bcc_ptr[0] = 0;
@@ -2554,7 +2554,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
   the end since (at least) WIN2K and Windows XP have a major bug in not null
   terminating last Unicode string in response  */
                                        ses->serverOS =
-                                            kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+                                            kzalloc(2 * (len + 1), GFP_KERNEL);
                                        cifs_strfromUCS_le(ses->serverOS,
                                                           (wchar_t *)
                                                           bcc_ptr, len,
@@ -2569,7 +2569,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                                 remaining_words
                                                                 - 1);
                                                ses->serverNOS =
-                                                    kcalloc(1, 2 * (len + 1),
+                                                    kzalloc(2 * (len + 1),
                                                            GFP_KERNEL);
                                                cifs_strfromUCS_le(ses->
                                                                   serverNOS,
@@ -2586,7 +2586,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                        len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); 
           /* last string is not always null terminated (for e.g. for Windows XP & 2000) */
                                                        ses->serverDomain =
-                                                            kcalloc(1, 2 *
+                                                            kzalloc(2 *
                                                                    (len +
                                                                     1),
                                                                    GFP_KERNEL);
@@ -2612,13 +2612,13 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                } /* else no more room so create dummy domain string */
                                                else
                                                        ses->serverDomain =
-                                                            kcalloc(1, 2,
+                                                            kzalloc(2,
                                                                    GFP_KERNEL);
                                        } else {        /* no room so create dummy domain and NOS string */
                                                ses->serverDomain =
-                                                    kcalloc(1, 2, GFP_KERNEL);
+                                                    kzalloc(2, GFP_KERNEL);
                                                ses->serverNOS =
-                                                    kcalloc(1, 2, GFP_KERNEL);
+                                                    kzalloc(2, GFP_KERNEL);
                                        }
                                } else {        /* ASCII */
                                        len = strnlen(bcc_ptr, 1024);
@@ -2626,7 +2626,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                            pByteArea(smb_buffer_response)
                                            <= BCC(smb_buffer_response)) {
                                                ses->serverOS =
-                                                    kcalloc(1, len + 1,
+                                                    kzalloc(len + 1,
                                                            GFP_KERNEL);
                                                strncpy(ses->serverOS,
                                                        bcc_ptr, len);
@@ -2637,7 +2637,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                len = strnlen(bcc_ptr, 1024);
                                                ses->serverNOS =
-                                                    kcalloc(1, len + 1,
+                                                    kzalloc(len + 1,
                                                            GFP_KERNEL);
                                                strncpy(ses->serverNOS, bcc_ptr, len);
                                                bcc_ptr += len;
@@ -2646,7 +2646,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid,
                                                len = strnlen(bcc_ptr, 1024);
                                                ses->serverDomain =
-                                                    kcalloc(1, len + 1,
+                                                    kzalloc(len + 1,
                                                            GFP_KERNEL);
                                                strncpy(ses->serverDomain, bcc_ptr, len);       
                                                bcc_ptr += len;
@@ -2948,7 +2948,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
  the end since (at least) WIN2K and Windows XP have a major bug in not null
  terminating last Unicode string in response  */
                                        ses->serverOS =
-                                            kcalloc(1, 2 * (len + 1), GFP_KERNEL);
+                                            kzalloc(2 * (len + 1), GFP_KERNEL);
                                        cifs_strfromUCS_le(ses->serverOS,
                                                           (wchar_t *)
                                                           bcc_ptr, len,
@@ -2963,7 +2963,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                                 remaining_words
                                                                 - 1);
                                                ses->serverNOS =
-                                                    kcalloc(1, 2 * (len + 1),
+                                                    kzalloc(2 * (len + 1),
                                                            GFP_KERNEL);
                                                cifs_strfromUCS_le(ses->
                                                                   serverNOS,
@@ -2979,7 +2979,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                        len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); 
     /* last string not always null terminated (e.g. for Windows XP & 2000) */
                                                        ses->serverDomain =
-                                                            kcalloc(1, 2 *
+                                                            kzalloc(2 *
                                                                    (len +
                                                                     1),
                                                                    GFP_KERNEL);
@@ -3004,17 +3004,17 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                            = 0;
                                                } /* else no more room so create dummy domain string */
                                                else
-                                                        ses->serverDomain = kcalloc(1, 2,GFP_KERNEL);
+                                                        ses->serverDomain = kzalloc(2,GFP_KERNEL);
                                        } else {  /* no room so create dummy domain and NOS string */
-                                                ses->serverDomain = kcalloc(1, 2, GFP_KERNEL);
+                                                ses->serverDomain = kzalloc(2, GFP_KERNEL);
-                                                ses->serverNOS = kcalloc(1, 2, GFP_KERNEL);
+                                                ses->serverNOS = kzalloc(2, GFP_KERNEL);
                                        }
                                } else {        /* ASCII */
                                        len = strnlen(bcc_ptr, 1024);
                                        if (((long) bcc_ptr + len) - 
                        (long) pByteArea(smb_buffer_response) 
                            <= BCC(smb_buffer_response)) {
-                                                ses->serverOS = kcalloc(1, len + 1,GFP_KERNEL);
+                                                ses->serverOS = kzalloc(len + 1,GFP_KERNEL);
                                                strncpy(ses->serverOS,bcc_ptr, len);
                                                bcc_ptr += len;
@@ -3022,14 +3022,14 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses,
                                                bcc_ptr++;
                                                len = strnlen(bcc_ptr, 1024);
-                                                ses->serverNOS = kcalloc(1, len+1,GFP_KERNEL);
+                                                ses->serverNOS = kzalloc(len+1,GFP_KERNEL);
                                                strncpy(ses->serverNOS, bcc_ptr, len);  
                                                bcc_ptr += len;
                                                bcc_ptr[0] = 0;
                                                bcc_ptr++;
                                                len = strnlen(bcc_ptr, 1024);
-                                                ses->serverDomain = kcalloc(1, len+1,GFP_KERNEL);
+                                                ses->serverDomain = kzalloc(len+1,GFP_KERNEL);
                                                strncpy(ses->serverDomain, bcc_ptr, len);
                                                bcc_ptr += len;
                                                bcc_ptr[0] = 0;
@@ -3141,7 +3141,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                                if(tcon->nativeFileSystem)
                                        kfree(tcon->nativeFileSystem);
                                tcon->nativeFileSystem =
-                                    kcalloc(1, length + 2, GFP_KERNEL);
+                                    kzalloc(length + 2, GFP_KERNEL);
                                cifs_strfromUCS_le(tcon->nativeFileSystem,
                                                   (wchar_t *) bcc_ptr,
                                                   length, nls_codepage);
@@ -3159,7 +3159,7 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses,
                                if(tcon->nativeFileSystem)
                                        kfree(tcon->nativeFileSystem);
                                tcon->nativeFileSystem =
-                                    kcalloc(1, length + 1, GFP_KERNEL);
+                                    kzalloc(length + 1, GFP_KERNEL);
                                strncpy(tcon->nativeFileSystem, bcc_ptr,
                                        length);
                        }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3f3538d4a1fa..d335269bd91c 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -145,24 +145,23 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
                return -ENOMEM;
        }
-        if(nd) {
+        if(nd && (nd->flags & LOOKUP_OPEN)) {
-                if ((nd->intent.open.flags & O_ACCMODE) == O_RDONLY)
+                int oflags = nd->intent.open.flags;
-                        desiredAccess = GENERIC_READ;
-                else if ((nd->intent.open.flags & O_ACCMODE) == O_WRONLY) {
+                desiredAccess = 0;
-                        desiredAccess = GENERIC_WRITE;
+                if (oflags & FMODE_READ)
-                        write_only = TRUE;
+                        desiredAccess |= GENERIC_READ;
-                } else if ((nd->intent.open.flags & O_ACCMODE) == O_RDWR) {
+                if (oflags & FMODE_WRITE) {
-                        /* GENERIC_ALL is too much permission to request */
+                        desiredAccess |= GENERIC_WRITE;
-                        /* can cause unnecessary access denied on create */
+                        if (!(oflags & FMODE_READ))
-                        /* desiredAccess = GENERIC_ALL; */
+                                write_only = TRUE;
-                        desiredAccess = GENERIC_READ | GENERIC_WRITE;
                }
-                if((nd->intent.open.flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+                if((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
                        disposition = FILE_CREATE;
-                else if((nd->intent.open.flags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
+                else if((oflags & (O_CREAT | O_TRUNC)) == (O_CREAT | O_TRUNC))
                        disposition = FILE_OVERWRITE_IF;
-                else if((nd->intent.open.flags & O_CREAT) == O_CREAT)
+                else if((oflags & O_CREAT) == O_CREAT)
                        disposition = FILE_OPEN_IF;
                else {
                        cFYI(1,("Create flag not set in create function"));
diff --git a/fs/compat.c b/fs/compat.c
index 6b06b6bae35e..8c665705c6a0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -310,96 +310,6 @@ static int __init init_sys32_ioctl(void)
 __initcall(init_sys32_ioctl);
-int register_ioctl32_conversion(unsigned int cmd,
-                                ioctl_trans_handler_t handler)
-{
-        struct ioctl_trans *t;
-        struct ioctl_trans *new_t;
-        unsigned long hash = ioctl32_hash(cmd);
-        new_t = kmalloc(sizeof(*new_t), GFP_KERNEL);
-        if (!new_t)
-                return -ENOMEM;
-        down_write(&ioctl32_sem);
-        for (t = ioctl32_hash_table[hash]; t; t = t->next) {
-                if (t->cmd == cmd) {
-                        printk(KERN_ERR "Trying to register duplicated ioctl32 "
-                                        "handler %x\n", cmd);
-                        up_write(&ioctl32_sem);
-                        kfree(new_t);
-                        return -EINVAL; 
-                }
-        }
-        new_t->next = NULL;
-        new_t->cmd = cmd;
-        new_t->handler = handler;
-        ioctl32_insert_translation(new_t);
-        up_write(&ioctl32_sem);
-        return 0;
-}
-EXPORT_SYMBOL(register_ioctl32_conversion);
-static inline int builtin_ioctl(struct ioctl_trans *t)
-{ 
-        return t >= ioctl_start && t < (ioctl_start + ioctl_table_size);
-} 
-/* Problem: 
-   This function cannot unregister duplicate ioctls, because they are not
-   unique.
-   When they happen we need to extend the prototype to pass the handler too. */
-int unregister_ioctl32_conversion(unsigned int cmd)
-{
-        unsigned long hash = ioctl32_hash(cmd);
-        struct ioctl_trans *t, *t1;
-        down_write(&ioctl32_sem);
-        t = ioctl32_hash_table[hash];
-        if (!t) { 
-                up_write(&ioctl32_sem);
-                return -EINVAL;
-        } 
-        if (t->cmd == cmd) { 
-                if (builtin_ioctl(t)) {
-                        printk("%p tried to unregister builtin ioctl %x\n",
-                               __builtin_return_address(0), cmd);
-                } else { 
-                        ioctl32_hash_table[hash] = t->next;
-                        up_write(&ioctl32_sem);
-                        kfree(t);
-                        return 0;
-                }
-        } 
-        while (t->next) {
-                t1 = t->next;
-                if (t1->cmd == cmd) { 
-                        if (builtin_ioctl(t1)) {
-                                printk("%p tried to unregister builtin "
-                                        "ioctl %x\n",
-                                        __builtin_return_address(0), cmd);
-                                goto out;
-                        } else { 
-                                t->next = t1->next;
-                                up_write(&ioctl32_sem);
-                                kfree(t1);
-                                return 0;
-                        }
-                }
-                t = t1;
-        }
-        printk(KERN_ERR "Trying to free unknown 32bit ioctl handler %x\n",
-                                cmd);
-out:
-        up_write(&ioctl32_sem);
-        return -EINVAL;
-}
-EXPORT_SYMBOL(unregister_ioctl32_conversion); 
 static void compat_ioctl_error(struct file *filp, unsigned int fd,
                unsigned int cmd, unsigned long arg)
 {
@@ -720,14 +630,14 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
 struct compat_ncp_mount_data {
        compat_int_t version;
        compat_uint_t ncp_fd;
-        compat_uid_t mounted_uid;
+        __compat_uid_t mounted_uid;
        compat_pid_t wdog_pid;
        unsigned char mounted_vol[NCP_VOLNAME_LEN + 1];
        compat_uint_t time_out;
        compat_uint_t retry_count;
        compat_uint_t flags;
-        compat_uid_t uid;
+        __compat_uid_t uid;
-        compat_gid_t gid;
+        __compat_gid_t gid;
        compat_mode_t file_mode;
        compat_mode_t dir_mode;
 };
@@ -784,9 +694,9 @@ static void *do_ncp_super_data_conv(void *raw_data)
 struct compat_smb_mount_data {
        compat_int_t version;
-        compat_uid_t mounted_uid;
+        __compat_uid_t mounted_uid;
-        compat_uid_t uid;
+        __compat_uid_t uid;
-        compat_gid_t gid;
+        __compat_gid_t gid;
        compat_mode_t file_mode;
        compat_mode_t dir_mode;
 };
@@ -1365,6 +1275,16 @@ out:
 }
 /*
+ * Exactly like fs/open.c:sys_open(), except that it doesn't set the
+ * O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open(const char __user *filename, int flags, int mode)
+{
+        return do_sys_open(filename, flags, mode);
+}
+/*
 * compat_count() counts the number of arguments/envelopes. It is basically
 * a copy of count() from fs/exec.c, except that it works with 32 bit argv
 * and envp pointers.
@@ -1808,8 +1728,8 @@ struct compat_nfsctl_export {
        compat_dev_t    ex32_dev;
        compat_ino_t    ex32_ino;
        compat_int_t    ex32_flags;
-        compat_uid_t    ex32_anon_uid;
+        __compat_uid_t  ex32_anon_uid;
-        compat_gid_t    ex32_anon_gid;
+        __compat_gid_t  ex32_anon_gid;
 };
 struct compat_nfsctl_fdparm {
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 6c285efa2004..7fe85415ae7c 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -39,12 +39,47 @@ static DECLARE_MUTEX(read_mutex);
 #define CRAMINO(x)      ((x)->offset?(x)->offset<<2:1)
 #define OFFSET(x)       ((x)->i_ino)
-static struct inode *get_cramfs_inode(struct super_block *sb, struct cramfs_inode * cramfs_inode)
+static int cramfs_iget5_test(struct inode *inode, void *opaque)
+{
+        struct cramfs_inode *cramfs_inode = opaque;
+        if (inode->i_ino != CRAMINO(cramfs_inode))
+                return 0; /* does not match */
+        if (inode->i_ino != 1)
+                return 1;
+        /* all empty directories, char, block, pipe, and sock, share inode #1 */
+        if ((inode->i_mode != cramfs_inode->mode) ||
+            (inode->i_gid != cramfs_inode->gid) ||
+            (inode->i_uid != cramfs_inode->uid))
+                return 0; /* does not match */
+        if ((S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) &&
+            (inode->i_rdev != old_decode_dev(cramfs_inode->size)))
+                return 0; /* does not match */
+        return 1; /* matches */
+}
+static int cramfs_iget5_set(struct inode *inode, void *opaque)
+{
+        struct cramfs_inode *cramfs_inode = opaque;
+        inode->i_ino = CRAMINO(cramfs_inode);
+        return 0;
+}
+static struct inode *get_cramfs_inode(struct super_block *sb,
+                                struct cramfs_inode * cramfs_inode)
 {
-        struct inode * inode = new_inode(sb);
+        struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode),
+                                            cramfs_iget5_test, cramfs_iget5_set,
+                                            cramfs_inode);
        static struct timespec zerotime;
-        if (inode) {
+        if (inode && (inode->i_state & I_NEW)) {
                inode->i_mode = cramfs_inode->mode;
                inode->i_uid = cramfs_inode->uid;
                inode->i_size = cramfs_inode->size;
@@ -58,7 +93,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct cramfs_inod
                   but it's the best we can do without reading the directory
                   contents.  1 yields the right result in GNU find, even
                   without -noleaf option. */
-                insert_inode_hash(inode);
                if (S_ISREG(inode->i_mode)) {
                        inode->i_fop = &generic_ro_fops;
                        inode->i_data.a_ops = &cramfs_aops;
@@ -74,6 +108,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct cramfs_inod
                        init_special_inode(inode, inode->i_mode,
                                old_decode_dev(cramfs_inode->size));
                }
+                unlock_new_inode(inode);
        }
        return inode;
 }
diff --git a/fs/devpts/Makefile b/fs/devpts/Makefile
index 5800df2e50c8..236696efcbac 100644
--- a/fs/devpts/Makefile
+++ b/fs/devpts/Makefile
@@ -5,4 +5,3 @@
 obj-$(CONFIG_UNIX98_PTYS)               += devpts.o
 devpts-$(CONFIG_UNIX98_PTYS)            := inode.o
-devpts-$(CONFIG_DEVPTS_FS_SECURITY)     += xattr_security.o
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 1571c8d6c232..f2be44d4491f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -18,28 +18,9 @@
 #include <linux/mount.h>
 #include <linux/tty.h>
 #include <linux/devpts_fs.h>
-#include <linux/xattr.h>
 #define DEVPTS_SUPER_MAGIC 0x1cd1
-extern struct xattr_handler devpts_xattr_security_handler;
-static struct xattr_handler *devpts_xattr_handlers[] = {
-#ifdef CONFIG_DEVPTS_FS_SECURITY
-        &devpts_xattr_security_handler,
-#endif
-        NULL
-};
-static struct inode_operations devpts_file_inode_operations = {
-#ifdef CONFIG_DEVPTS_FS_XATTR
-        .setxattr       = generic_setxattr,
-        .getxattr       = generic_getxattr,
-        .listxattr      = generic_listxattr,
-        .removexattr    = generic_removexattr,
-#endif
-};
 static struct vfsmount *devpts_mnt;
 static struct dentry *devpts_root;
@@ -102,7 +83,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
        s->s_blocksize_bits = 10;
        s->s_magic = DEVPTS_SUPER_MAGIC;
        s->s_op = &devpts_sops;
-        s->s_xattr = devpts_xattr_handlers;
        s->s_time_gran = 1;
        inode = new_inode(s);
@@ -175,7 +155,6 @@ int devpts_pty_new(struct tty_struct *tty)
        inode->i_gid = config.setgid ? config.gid : current->fsgid;
        inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
        init_special_inode(inode, S_IFCHR|config.mode, device);
-        inode->i_op = &devpts_file_inode_operations;
        inode->u.generic_ip = tty;
        dentry = get_node(number);
diff --git a/fs/devpts/xattr_security.c b/fs/devpts/xattr_security.c
deleted file mode 100644
index 864cb5c79baa..000000000000
--- a/fs/devpts/xattr_security.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Security xattr support for devpts.
- *
- * Author: Stephen Smalley <sds@epoch.ncsc.mil>
- * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
- * any later version.
- */
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/security.h>
-#include <linux/xattr.h>
-static size_t
-devpts_xattr_security_list(struct inode *inode, char *list, size_t list_len,
-                           const char *name, size_t name_len)
-{
-        return security_inode_listsecurity(inode, list, list_len);
-}
-static int
-devpts_xattr_security_get(struct inode *inode, const char *name,
-                          void *buffer, size_t size)
-{
-        if (strcmp(name, "") == 0)
-                return -EINVAL;
-        return security_inode_getsecurity(inode, name, buffer, size);
-}
-static int
-devpts_xattr_security_set(struct inode *inode, const char *name,
-                          const void *value, size_t size, int flags)
-{
-        if (strcmp(name, "") == 0)
-                return -EINVAL;
-        return security_inode_setsecurity(inode, name, value, size, flags);
-}
-struct xattr_handler devpts_xattr_security_handler = {
-        .prefix = XATTR_SECURITY_PREFIX,
-        .list   = devpts_xattr_security_list,
-        .get    = devpts_xattr_security_get,
-        .set    = devpts_xattr_security_set,
-};
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index dcfe331dc4c4..3c0c7c6a5b44 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -19,6 +19,7 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
@@ -27,6 +28,8 @@
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
 #include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include "ext2.h"
 #include "xattr.h"
@@ -201,6 +204,26 @@ static void ext2_clear_inode(struct inode *inode)
 #endif
 }
+static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+        struct ext2_sb_info *sbi = EXT2_SB(vfs->mnt_sb);
+        if (sbi->s_mount_opt & EXT2_MOUNT_GRPID)
+                seq_puts(seq, ",grpid");
+        else
+                seq_puts(seq, ",nogrpid");
+#if defined(CONFIG_QUOTA)
+        if (sbi->s_mount_opt & EXT2_MOUNT_USRQUOTA)
+                seq_puts(seq, ",usrquota");
+        if (sbi->s_mount_opt & EXT2_MOUNT_GRPQUOTA)
+                seq_puts(seq, ",grpquota");
+#endif
+        return 0;
+}
 #ifdef CONFIG_QUOTA
 static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off);
 static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
@@ -218,6 +241,7 @@ static struct super_operations ext2_sops = {
        .statfs         = ext2_statfs,
        .remount_fs     = ext2_remount,
        .clear_inode    = ext2_clear_inode,
+        .show_options   = ext2_show_options,
 #ifdef CONFIG_QUOTA
        .quota_read     = ext2_quota_read,
        .quota_write    = ext2_quota_write,
@@ -256,10 +280,11 @@ static unsigned long get_sb_block(void **data)
 enum {
        Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
-        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
+        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
-        Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_nobh,
+        Opt_err_ro, Opt_nouid32, Opt_check, Opt_nocheck, Opt_debug,
-        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_xip,
+        Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
-        Opt_ignore, Opt_err,
+        Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota,
+        Opt_usrquota, Opt_grpquota
 };
 static match_table_t tokens = {
@@ -288,10 +313,10 @@ static match_table_t tokens = {
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
        {Opt_xip, "xip"},
-        {Opt_ignore, "grpquota"},
+        {Opt_grpquota, "grpquota"},
        {Opt_ignore, "noquota"},
-        {Opt_ignore, "quota"},
+        {Opt_quota, "quota"},
-        {Opt_ignore, "usrquota"},
+        {Opt_usrquota, "usrquota"},
        {Opt_err, NULL}
 };
@@ -406,6 +431,26 @@ static int parse_options (char * options,
                        printk("EXT2 xip option not supported\n");
 #endif
                        break;
+#if defined(CONFIG_QUOTA)
+                case Opt_quota:
+                case Opt_usrquota:
+                        set_opt(sbi->s_mount_opt, USRQUOTA);
+                        break;
+                case Opt_grpquota:
+                        set_opt(sbi->s_mount_opt, GRPQUOTA);
+                        break;
+#else
+                case Opt_quota:
+                case Opt_usrquota:
+                case Opt_grpquota:
+                        printk(KERN_ERR
+                                "EXT2-fs: quota operations not supported.\n");
+                        break;
+#endif
                case Opt_ignore:
                        break;
                default:
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c3c6e399fb3..a93c3609025d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -35,6 +35,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/quotaops.h>
+#include <linux/seq_file.h>
 #include <asm/uaccess.h>
 #include "xattr.h"
 #include "acl.h"
@@ -509,8 +510,41 @@ static void ext3_clear_inode(struct inode *inode)
        kfree(rsv);
 }
-#ifdef CONFIG_QUOTA
+/*
+ * ext3_show_options - append the ext3-specific mount options to @seq.
+ * @seq: seq_file receiving the ",name[=value]" option fragments
+ * @vfs: mount whose superblock options are reported
+ *
+ * Reports the data journalling mode and, when CONFIG_QUOTA is enabled, the
+ * journalled quota format, the quota file names and the usrquota/grpquota
+ * flags recorded in the ext3_sb_info.  Always returns 0.
+ */
+static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+        struct ext3_sb_info *sbi = EXT3_SB(vfs->mnt_sb);
+        /*
+         * The data mode is one multi-bit field, not three independent flags
+         * (see ext3_fs.h: EXT3_MOUNT_WRITEBACK_DATA has every bit of
+         * EXT3_MOUNT_DATA_FLAGS set).  Testing each constant with a plain
+         * '&' would print "data=writeback" for journal and ordered mounts
+         * too, so compare the masked value instead.
+         */
+        switch (sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) {
+        case EXT3_MOUNT_JOURNAL_DATA:
+                seq_puts(seq, ",data=journal");
+                break;
+        case EXT3_MOUNT_ORDERED_DATA:
+                seq_puts(seq, ",data=ordered");
+                break;
+        case EXT3_MOUNT_WRITEBACK_DATA:
+                seq_puts(seq, ",data=writeback");
+                break;
+        }
+#if defined(CONFIG_QUOTA)
+        /* Any non-zero format other than QFMT_VFS_OLD is shown as vfsv0 */
+        if (sbi->s_jquota_fmt)
+                seq_printf(seq, ",jqfmt=%s",
+                (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0");
+        if (sbi->s_qf_names[USRQUOTA])
+                seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+        if (sbi->s_qf_names[GRPQUOTA])
+                seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+        if (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA)
+                seq_puts(seq, ",usrquota");
+        if (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)
+                seq_puts(seq, ",grpquota");
+#endif
+        return 0;
+}
+#ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -569,6 +603,7 @@ static struct super_operations ext3_sops = {
        .statfs         = ext3_statfs,
        .remount_fs     = ext3_remount,
        .clear_inode    = ext3_clear_inode,
+        .show_options   = ext3_show_options,
 #ifdef CONFIG_QUOTA
        .quota_read     = ext3_quota_read,
        .quota_write    = ext3_quota_write,
@@ -590,7 +625,8 @@ enum {
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-        Opt_ignore, Opt_barrier, Opt_err, Opt_resize,
+        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+        Opt_grpquota
 };
 static match_table_t tokens = {
@@ -634,10 +670,10 @@ static match_table_t tokens = {
        {Opt_grpjquota, "grpjquota=%s"},
        {Opt_jqfmt_vfsold, "jqfmt=vfsold"},
        {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"},
-        {Opt_quota, "grpquota"},
+        {Opt_grpquota, "grpquota"},
        {Opt_noquota, "noquota"},
        {Opt_quota, "quota"},
-        {Opt_quota, "usrquota"},
+        {Opt_usrquota, "usrquota"},
        {Opt_barrier, "barrier=%u"},
        {Opt_err, NULL},
        {Opt_resize, "resize"},
@@ -903,7 +939,13 @@ clear_qf_name:
                        sbi->s_jquota_fmt = QFMT_VFS_V0;
                        break;
                case Opt_quota:
+                case Opt_usrquota:
                        set_opt(sbi->s_mount_opt, QUOTA);
+                        set_opt(sbi->s_mount_opt, USRQUOTA);
+                        break;
+                case Opt_grpquota:
+                        set_opt(sbi->s_mount_opt, QUOTA);
+                        set_opt(sbi->s_mount_opt, GRPQUOTA);
                        break;
                case Opt_noquota:
                        if (sb_any_quota_enabled(sb)) {
@@ -912,8 +954,13 @@ clear_qf_name:
                                return 0;
                        }
                        clear_opt(sbi->s_mount_opt, QUOTA);
+                        clear_opt(sbi->s_mount_opt, USRQUOTA);
+                        clear_opt(sbi->s_mount_opt, GRPQUOTA);
                        break;
 #else
+                case Opt_quota:
+                case Opt_usrquota:
+                case Opt_grpquota:
                case Opt_usrjquota:
                case Opt_grpjquota:
                case Opt_offusrjquota:
@@ -924,7 +971,6 @@ clear_qf_name:
                                "EXT3-fs: journalled quota options not "
                                "supported.\n");
                        break;
-                case Opt_quota:
                case Opt_noquota:
                        break;
 #endif
@@ -962,14 +1008,38 @@ clear_qf_name:
                }
        }
 #ifdef CONFIG_QUOTA
-        if (!sbi->s_jquota_fmt && (sbi->s_qf_names[USRQUOTA] ||
+        if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
-            sbi->s_qf_names[GRPQUOTA])) {
+                if ((sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA) &&
-                printk(KERN_ERR
+                     sbi->s_qf_names[USRQUOTA])
-                        "EXT3-fs: journalled quota format not specified.\n");
+                        clear_opt(sbi->s_mount_opt, USRQUOTA);
-                return 0;
+                if ((sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA) &&
+                     sbi->s_qf_names[GRPQUOTA])
+                        clear_opt(sbi->s_mount_opt, GRPQUOTA);
+                if ((sbi->s_qf_names[USRQUOTA] &&
+                                (sbi->s_mount_opt & EXT3_MOUNT_GRPQUOTA)) ||
+                    (sbi->s_qf_names[GRPQUOTA] &&
+                                (sbi->s_mount_opt & EXT3_MOUNT_USRQUOTA))) {
+                        printk(KERN_ERR "EXT3-fs: old and new quota "
+                                        "format mixing.\n");
+                        return 0;
+                }
+                if (!sbi->s_jquota_fmt) {
+                        printk(KERN_ERR "EXT3-fs: journalled quota format "
+                                        "not specified.\n");
+                        return 0;
+                }
+        } else {
+                if (sbi->s_jquota_fmt) {
+                        printk(KERN_ERR "EXT3-fs: journalled quota format "
+                                        "specified with no journalling "
+                                        "enabled.\n");
+                        return 0;
+                }
        }
 #endif
        return 1;
 }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index e5ae1b720dde..895049b2ac9c 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -30,6 +30,29 @@ static inline loff_t fat_make_i_pos(struct super_block *sb,
                | (de - (struct msdos_dir_entry *)bh->b_data);
 }
+/*
+ * Start read-ahead for every sector of the cluster that begins at @phys,
+ * where @iblock is the directory-relative block being looked up in @dir.
+ * Does nothing unless @iblock is the first sector of a multi-sector
+ * cluster, and never for the root directory of a non-FAT32 volume.
+ */
+static inline void fat_dir_readahead(struct inode *dir, sector_t iblock,
+                                     sector_t phys)
+{
+        struct super_block *sb = dir->i_sb;
+        struct msdos_sb_info *sbi = MSDOS_SB(sb);
+        struct buffer_head *first;
+        int off = 0;
+        /* One-sector clusters, or not the first sector of its cluster */
+        if (sbi->sec_per_clus == 1 || (iblock & (sbi->sec_per_clus - 1)))
+                return;
+        /* root dir of FAT12/FAT16 */
+        if (dir->i_ino == MSDOS_ROOT_INO && sbi->fat_bits != 32)
+                return;
+        /* Only issue read-ahead when the first block is not already valid */
+        first = sb_getblk(sb, phys);
+        if (first && !buffer_uptodate(first)) {
+                while (off < sbi->sec_per_clus) {
+                        sb_breadahead(sb, phys + off);
+                        off++;
+                }
+        }
+        brelse(first);
+}
 /* Returns the inode number of the directory entry at offset pos. If bh is
   non-NULL, it is brelse'd before. Pos is incremented. The buffer header is
   returned in bh.
@@ -58,6 +81,8 @@ next:
        if (err || !phys)
                return -1;      /* beyond EOF or error */
+        fat_dir_readahead(dir, iblock, phys);
        *bh = sb_bread(sb, phys);
        if (*bh == NULL) {
                printk(KERN_ERR "FAT: Directory bread(block %llu) failed\n",
@@ -635,8 +660,7 @@ RecEnd:
 EODir:
        filp->f_pos = cpos;
 FillFailed:
-        if (bh)
+        brelse(bh);
-                brelse(bh);
        if (unicode)
                free_page((unsigned long)unicode);
 out:
diff --git a/fs/file_table.c b/fs/file_table.c
index 1d3de78e6bc9..43e9e1737de2 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -89,7 +89,6 @@ struct file *get_empty_filp(void)
        rwlock_init(&f->f_owner.lock);
        /* f->f_version: 0 */
        INIT_LIST_HEAD(&f->f_list);
-        f->f_maxcount = INT_MAX;
        return f;
 over:
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 27f66d3e8a04..6aa6fbe4f8ee 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -155,7 +155,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
        sbp->s_flags |= MS_RDONLY;
-        infp = kcalloc(1, sizeof(*infp), GFP_KERNEL);
+        infp = kzalloc(sizeof(*infp), GFP_KERNEL);
        if (!infp) {
                printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n");
                return -ENOMEM;
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index a096c5a56664..3d5cdc6847c0 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -13,8 +13,6 @@
 #include "btree.h"
-#define REF_PAGES       0
 void hfs_bnode_read(struct hfs_bnode *node, void *buf,
                int off, int len)
 {
@@ -289,9 +287,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-#if !REF_PAGES
                page_cache_release(page);
-#endif
                node->page[i] = page;
        }
@@ -449,13 +445,6 @@ void hfs_bnode_get(struct hfs_bnode *node)
 {
        if (node) {
                atomic_inc(&node->refcnt);
-#if REF_PAGES
-                {
-                int i;
-                for (i = 0; i < node->tree->pages_per_bnode; i++)
-                        get_page(node->page[i]);
-                }
-#endif
                dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
        }
@@ -472,20 +461,12 @@ void hfs_bnode_put(struct hfs_bnode *node)
                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
                if (!atomic_read(&node->refcnt))
                        BUG();
-                if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) {
+                if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
-#if REF_PAGES
-                        for (i = 0; i < tree->pages_per_bnode; i++)
-                                put_page(node->page[i]);
-#endif
                        return;
-                }
                for (i = 0; i < tree->pages_per_bnode; i++) {
                        if (!node->page[i])
                                continue;
                        mark_page_accessed(node->page[i]);
-#if REF_PAGES
-                        put_page(node->page[i]);
-#endif
                }
                if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index 65dedefcabfc..2fcd679f0238 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -20,12 +20,12 @@
 *
 * Given the ID of the parent and the name build a search key.
 */
-void hfs_cat_build_key(btree_key *key, u32 parent, struct qstr *name)
+void hfs_cat_build_key(struct super_block *sb, btree_key *key, u32 parent, struct qstr *name)
 {
        key->cat.reserved = 0;
        key->cat.ParID = cpu_to_be32(parent);
        if (name) {
-                hfs_triv2mac(&key->cat.CName, name);
+                hfs_asc2mac(sb, &key->cat.CName, name);
                key->key_len = 6 + key->cat.CName.len;
        } else {
                memset(&key->cat.CName, 0, sizeof(struct hfs_name));
@@ -62,13 +62,14 @@ static int hfs_cat_build_record(hfs_cat_rec *rec, u32 cnid, struct inode *inode)
        }
 }
+/*
+ * Fill @rec as a catalog thread record of the given @type: clear the
+ * reserved area, store @parentid via cpu_to_be32() and convert @name into
+ * rec->thread.CName with hfs_asc2mac() for @sb.  Returns the size of
+ * struct hfs_cat_thread.
+ */
-static int hfs_cat_build_thread(hfs_cat_rec *rec, int type,
+static int hfs_cat_build_thread(struct super_block *sb,
+                                hfs_cat_rec *rec, int type,
                                u32 parentid, struct qstr *name)
 {
        rec->type = type;
        memset(rec->thread.reserved, 0, sizeof(rec->thread.reserved));
        rec->thread.ParID = cpu_to_be32(parentid);
-        hfs_triv2mac(&rec->thread.CName, name);
+        hfs_asc2mac(sb, &rec->thread.CName, name);
        return sizeof(struct hfs_cat_thread);
 }
@@ -93,8 +94,8 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
        sb = dir->i_sb;
        hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
-        hfs_cat_build_key(fd.search_key, cnid, NULL);
+        hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
-        entry_size = hfs_cat_build_thread(&entry, S_ISDIR(inode->i_mode) ?
+        entry_size = hfs_cat_build_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
                        HFS_CDR_THD : HFS_CDR_FTH,
                        dir->i_ino, str);
        err = hfs_brec_find(&fd);
@@ -107,7 +108,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
        if (err)
                goto err2;
-        hfs_cat_build_key(fd.search_key, dir->i_ino, str);
+        hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
        entry_size = hfs_cat_build_record(&entry, cnid, inode);
        err = hfs_brec_find(&fd);
        if (err != -ENOENT) {
@@ -127,7 +128,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, struct qstr *str, struct inode *
        return 0;
 err1:
-        hfs_cat_build_key(fd.search_key, cnid, NULL);
+        hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
        if (!hfs_brec_find(&fd))
                hfs_brec_remove(&fd);
 err2:
@@ -176,7 +177,7 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid,
        hfs_cat_rec rec;
        int res, len, type;
-        hfs_cat_build_key(fd->search_key, cnid, NULL);
+        hfs_cat_build_key(sb, fd->search_key, cnid, NULL);
        res = hfs_brec_read(fd, &rec, sizeof(rec));
        if (res)
                return res;
@@ -211,7 +212,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
        sb = dir->i_sb;
        hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
-        hfs_cat_build_key(fd.search_key, dir->i_ino, str);
+        hfs_cat_build_key(sb, fd.search_key, dir->i_ino, str);
        res = hfs_brec_find(&fd);
        if (res)
                goto out;
@@ -239,7 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
        if (res)
                goto out;
-        hfs_cat_build_key(fd.search_key, cnid, NULL);
+        hfs_cat_build_key(sb, fd.search_key, cnid, NULL);
        res = hfs_brec_find(&fd);
        if (!res) {
                res = hfs_brec_remove(&fd);
@@ -280,7 +281,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
        dst_fd = src_fd;
        /* find the old dir entry and read the data */
-        hfs_cat_build_key(src_fd.search_key, src_dir->i_ino, src_name);
+        hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
        err = hfs_brec_find(&src_fd);
        if (err)
                goto out;
@@ -289,7 +290,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
                            src_fd.entrylength);
        /* create new dir entry with the data from the old entry */
-        hfs_cat_build_key(dst_fd.search_key, dst_dir->i_ino, dst_name);
+        hfs_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name);
        err = hfs_brec_find(&dst_fd);
        if (err != -ENOENT) {
                if (!err)
@@ -305,7 +306,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
        mark_inode_dirty(dst_dir);
        /* finally remove the old entry */
-        hfs_cat_build_key(src_fd.search_key, src_dir->i_ino, src_name);
+        hfs_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name);
        err = hfs_brec_find(&src_fd);
        if (err)
                goto out;
@@ -321,7 +322,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
                goto out;
        /* remove old thread entry */
-        hfs_cat_build_key(src_fd.search_key, cnid, NULL);
+        hfs_cat_build_key(sb, src_fd.search_key, cnid, NULL);
        err = hfs_brec_find(&src_fd);
        if (err)
                goto out;
@@ -330,8 +331,8 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, struct qstr *src_name,
                goto out;
        /* create new thread entry */
-        hfs_cat_build_key(dst_fd.search_key, cnid, NULL);
+        hfs_cat_build_key(sb, dst_fd.search_key, cnid, NULL);
-        entry_size = hfs_cat_build_thread(&entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD,
+        entry_size = hfs_cat_build_thread(sb, &entry, type == HFS_CDR_FIL ? HFS_CDR_FTH : HFS_CDR_THD,
                                        dst_dir->i_ino, dst_name);
        err = hfs_brec_find(&dst_fd);
        if (err != -ENOENT) {
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index c55998262aed..e1f24befba58 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -28,7 +28,7 @@ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry,
        dentry->d_op = &hfs_dentry_operations;
        hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd);
-        hfs_cat_build_key(fd.search_key, dir->i_ino, &dentry->d_name);
+        hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name);
        res = hfs_brec_read(&fd, &rec, sizeof(rec));
        if (res) {
                hfs_find_exit(&fd);
@@ -56,7 +56,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
        struct inode *inode = filp->f_dentry->d_inode;
        struct super_block *sb = inode->i_sb;
        int len, err;
-        char strbuf[HFS_NAMELEN + 1];
+        char strbuf[HFS_MAX_NAMELEN];
        union hfs_cat_rec entry;
        struct hfs_find_data fd;
        struct hfs_readdir_data *rd;
@@ -66,7 +66,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                return 0;
        hfs_find_init(HFS_SB(sb)->cat_tree, &fd);
-        hfs_cat_build_key(fd.search_key, inode->i_ino, NULL);
+        hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
        err = hfs_brec_find(&fd);
        if (err)
                goto out;
@@ -111,7 +111,7 @@ static int hfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                }
                hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength);
                type = entry.type;
-                len = hfs_mac2triv(strbuf, &fd.key->cat.CName);
+                len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName);
                if (type == HFS_CDR_DIR) {
                        if (fd.entrylength < sizeof(struct hfs_cat_dir)) {
                                printk("HFS: small dir entry\n");
@@ -307,7 +307,8 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                           old_dir, &old_dentry->d_name,
                           new_dir, &new_dentry->d_name);
        if (!res)
-                hfs_cat_build_key((btree_key *)&HFS_I(old_dentry->d_inode)->cat_key,
+                hfs_cat_build_key(old_dir->i_sb,
+                                  (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key,
                                  new_dir->i_ino, &new_dentry->d_name);
        return res;
 }
diff --git a/fs/hfs/hfs.h b/fs/hfs/hfs.h
index df6b33adee3b..88099ab1a180 100644
--- a/fs/hfs/hfs.h
+++ b/fs/hfs/hfs.h
@@ -25,6 +25,7 @@
 #define HFS_SECTOR_SIZE         512    /* size of an HFS sector */
 #define HFS_SECTOR_SIZE_BITS    9      /* log_2(HFS_SECTOR_SIZE) */
 #define HFS_NAMELEN             31     /* maximum length of an HFS filename */
+#define HFS_MAX_NAMELEN         128
 #define HFS_MAX_VALENCE         32767U
 /* Meanings of the drAtrb field of the MDB,
diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h
index 0dc8ef8e14de..aae019aadf88 100644
--- a/fs/hfs/hfs_fs.h
+++ b/fs/hfs/hfs_fs.h
@@ -141,6 +141,8 @@ struct hfs_sb_info {
        int session, part;
+        struct nls_table *nls_io, *nls_disk;
        struct semaphore bitmap_lock;
        unsigned long flags;
@@ -168,7 +170,7 @@ extern int hfs_cat_create(u32, struct inode *, struct qstr *, struct inode *);
 extern int hfs_cat_delete(u32, struct inode *, struct qstr *);
 extern int hfs_cat_move(u32, struct inode *, struct qstr *,
                        struct inode *, struct qstr *);
-extern void hfs_cat_build_key(btree_key *, u32, struct qstr *);
+extern void hfs_cat_build_key(struct super_block *, btree_key *, u32, struct qstr *);
 /* dir.c */
 extern struct file_operations hfs_dir_operations;
@@ -222,8 +224,8 @@ extern int hfs_strcmp(const unsigned char *, unsigned int,
 extern int hfs_compare_dentry(struct dentry *, struct qstr *, struct qstr *);
 /* trans.c */
-extern void hfs_triv2mac(struct hfs_name *, struct qstr *);
+extern void hfs_asc2mac(struct super_block *, struct hfs_name *, struct qstr *);
-extern int hfs_mac2triv(char *, const struct hfs_name *);
+extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *);
 extern struct timezone sys_tz;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 751912326094..f1570b9f9de3 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -160,7 +160,7 @@ struct inode *hfs_new_inode(struct inode *dir, struct qstr *name, int mode)
        init_MUTEX(&HFS_I(inode)->extents_lock);
        INIT_LIST_HEAD(&HFS_I(inode)->open_dir_list);
-        hfs_cat_build_key((btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
+        hfs_cat_build_key(sb, (btree_key *)&HFS_I(inode)->cat_key, dir->i_ino, name);
        inode->i_ino = HFS_SB(sb)->next_id++;
        inode->i_mode = mode;
        inode->i_uid = current->fsuid;
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index 217e32f37e0b..0a473f79c89f 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -10,6 +10,7 @@
 #include <linux/cdrom.h>
 #include <linux/genhd.h>
+#include <linux/nls.h>
 #include "hfs_fs.h"
 #include "btree.h"
@@ -343,6 +344,11 @@ void hfs_mdb_put(struct super_block *sb)
        brelse(HFS_SB(sb)->mdb_bh);
        brelse(HFS_SB(sb)->alt_mdb_bh);
+        if (HFS_SB(sb)->nls_io)
+                unload_nls(HFS_SB(sb)->nls_io);
+        if (HFS_SB(sb)->nls_disk)
+                unload_nls(HFS_SB(sb)->nls_disk);
        kfree(HFS_SB(sb));
        sb->s_fs_info = NULL;
 }
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index ab783f6afa3b..c5074aeafcae 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -15,8 +15,11 @@
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/mount.h>
 #include <linux/init.h>
+#include <linux/nls.h>
 #include <linux/parser.h>
+#include <linux/seq_file.h>
 #include <linux/vfs.h>
 #include "hfs_fs.h"
@@ -111,6 +114,32 @@ static int hfs_remount(struct super_block *sb, int *flags, char *data)
        return 0;
 }
+/*
+ * Append the HFS-specific mount options of @mnt's superblock to @seq as
+ * ",name=value" fragments.  uid and gid are always printed; every other
+ * option is printed only when it differs from the value tested below
+ * (presumably that option's default - confirm against parse_options()).
+ * Always returns 0.
+ */
+static int hfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+        struct hfs_sb_info *hsb = HFS_SB(mnt->mnt_sb);
+        /* 0x3f3f3f3f is the four ASCII bytes "????" */
+        if (cpu_to_be32(0x3f3f3f3f) != hsb->s_creator)
+                seq_printf(seq, ",creator=%.4s", (char *)&hsb->s_creator);
+        if (cpu_to_be32(0x3f3f3f3f) != hsb->s_type)
+                seq_printf(seq, ",type=%.4s", (char *)&hsb->s_type);
+        seq_printf(seq, ",uid=%u,gid=%u", hsb->s_uid, hsb->s_gid);
+        if (hsb->s_file_umask != 0133)
+                seq_printf(seq, ",file_umask=%o", hsb->s_file_umask);
+        if (hsb->s_dir_umask != 0022)
+                seq_printf(seq, ",dir_umask=%o", hsb->s_dir_umask);
+        /* Negative part/session values are not reported */
+        if (!(hsb->part < 0))
+                seq_printf(seq, ",part=%u", hsb->part);
+        if (!(hsb->session < 0))
+                seq_printf(seq, ",session=%u", hsb->session);
+        /* Character-set tables are shown only when one is loaded */
+        if (hsb->nls_disk != NULL)
+                seq_printf(seq, ",codepage=%s", hsb->nls_disk->charset);
+        if (hsb->nls_io != NULL)
+                seq_printf(seq, ",iocharset=%s", hsb->nls_io->charset);
+        if (hsb->s_quiet)
+                seq_puts(seq, ",quiet");
+        return 0;
+}
 static struct inode *hfs_alloc_inode(struct super_block *sb)
 {
        struct hfs_inode_info *i;
@@ -133,11 +162,13 @@ static struct super_operations hfs_super_operations = {
        .write_super    = hfs_write_super,
        .statfs         = hfs_statfs,
        .remount_fs     = hfs_remount,
+        .show_options   = hfs_show_options,
 };
 enum {
        opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask,
        opt_part, opt_session, opt_type, opt_creator, opt_quiet,
+        opt_codepage, opt_iocharset,
        opt_err
 };
@@ -152,6 +183,8 @@ static match_table_t tokens = {
        { opt_type, "type=%s" },
        { opt_creator, "creator=%s" },
        { opt_quiet, "quiet" },
+        { opt_codepage, "codepage=%s" },
+        { opt_iocharset, "iocharset=%s" },
        { opt_err, NULL }
 };
@@ -257,11 +290,46 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
                case opt_quiet:
                        hsb->s_quiet = 1;
                        break;
+                case opt_codepage:
+                        if (hsb->nls_disk) {
+                                printk("HFS+-fs: unable to change codepage\n");
+                                return 0;
+                        }
+                        p = match_strdup(&args[0]);
+                        hsb->nls_disk = load_nls(p);
+                        if (!hsb->nls_disk) {
+                                printk("HFS+-fs: unable to load codepage \"%s\"\n", p);
+                                kfree(p);
+                                return 0;
+                        }
+                        kfree(p);
+                        break;
+                case opt_iocharset:
+                        if (hsb->nls_io) {
+                                printk("HFS: unable to change iocharset\n");
+                                return 0;
+                        }
+                        p = match_strdup(&args[0]);
+                        hsb->nls_io = load_nls(p);
+                        if (!hsb->nls_io) {
+                                printk("HFS: unable to load iocharset \"%s\"\n", p);
+                                kfree(p);
+                                return 0;
+                        }
+                        kfree(p);
+                        break;
                default:
                        return 0;
                }
        }
+        if (hsb->nls_disk && !hsb->nls_io) {
+                hsb->nls_io = load_nls_default();
+                if (!hsb->nls_io) {
+                        printk("HFS: unable to load default iocharset\n");
+                        return 0;
+                }
+        }
        hsb->s_dir_umask &= 0777;
        hsb->s_file_umask &= 0577;
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index fb9720abbadd..e673a88b8ae7 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -9,12 +9,15 @@
 * with ':' vs. '/' as the path-element separator.
 */
+#include <linux/types.h>
+#include <linux/nls.h>
 #include "hfs_fs.h"
 /*================ Global functions ================*/
 /*
- * hfs_mac2triv()
+ * hfs_mac2asc()
 *
 * Given a 'Pascal String' (a string preceded by a length byte) in
 * the Macintosh character set produce the corresponding filename using
@@ -27,23 +30,58 @@
 * by ':' which never appears in HFS filenames.  All other characters
 * are passed unchanged from input to output.
 */
-int hfs_mac2triv(char *out, const struct hfs_name *in)
+int hfs_mac2asc(struct super_block *sb, char *out, const struct hfs_name *in)
 {
-        const char *p;
+        struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
-        char c;
+        struct nls_table *nls_io = HFS_SB(sb)->nls_io;
-        int i, len;
+        const char *src;
+        char *dst;
+        int srclen, dstlen, size;
+        src = in->name;
+        srclen = in->len;
+        dst = out;
+        dstlen = HFS_MAX_NAMELEN;
+        if (nls_io) {
+                wchar_t ch;
-        len = in->len;
+                while (srclen > 0) {
-        p = in->name;
+                        if (nls_disk) {
-        for (i = 0; i < len; i++) {
+                                size = nls_disk->char2uni(src, srclen, &ch);
-                c = *p++;
+                                if (size <= 0) {
-                *out++ = c == '/' ? ':' : c;
+                                        ch = '?';
+                                        size = 1;
+                                }
+                                src += size;
+                                srclen -= size;
+                        } else {
+                                ch = *src++;
+                                srclen--;
+                        }
+                        if (ch == '/')
+                                ch = ':';
+                        size = nls_io->uni2char(ch, dst, dstlen);
+                        if (size < 0) {
+                                if (size == -ENAMETOOLONG)
+                                        goto out;
+                                *dst = '?';
+                                size = 1;
+                        }
+                        dst += size;
+                        dstlen -= size;
+                }
+        } else {
+                char ch;
+                while (--srclen >= 0)
+                        *dst++ = (ch = *src++) == '/' ? ':' : ch;
        }
-        return i;
+out:
+        return dst - out;
 }
 /*
- * hfs_triv2mac()
+ * hfs_asc2mac()
 *
 * Given an ASCII string (not null-terminated) and its length,
 * generate the corresponding filename in the Macintosh character set
@@ -54,19 +92,57 @@ int hfs_mac2triv(char *out, const struct hfs_name *in)
 * This routine is the inverse of hfs_mac2asc().
 * A ':' is replaced by a '/'.
 */
-void hfs_triv2mac(struct hfs_name *out, struct qstr *in)
+void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, struct qstr *in)
 {
+        struct nls_table *nls_disk = HFS_SB(sb)->nls_disk;
+        struct nls_table *nls_io = HFS_SB(sb)->nls_io;
        const char *src;
-        char *dst, c;
+        char *dst;
-        int i, len;
+        int srclen, dstlen, size;
-        out->len = len = min((unsigned int)HFS_NAMELEN, in->len);
        src = in->name;
+        srclen = in->len;
        dst = out->name;
-        for (i = 0; i < len; i++) {
+        dstlen = HFS_NAMELEN;
-                c = *src++;
+        if (nls_io) {
-                *dst++ = c == ':' ? '/' : c;
+                wchar_t ch;
+                while (srclen > 0) {
+                        size = nls_io->char2uni(src, srclen, &ch);
+                        if (size < 0) {
+                                ch = '?';
+                                size = 1;
+                        }
+                        src += size;
+                        srclen -= size;
+                        if (ch == ':')
+                                ch = '/';
+                        if (nls_disk) {
+                                size = nls_disk->uni2char(ch, dst, dstlen);
+                                if (size < 0) {
+                                        if (size == -ENAMETOOLONG)
+                                                goto out;
+                                        *dst = '?';
+                                        size = 1;
+                                }
+                                dst += size;
+                                dstlen -= size;
+                        } else {
+                                *dst++ = ch > 0xff ? '?' : ch;
+                                dstlen--;
+                        }
+                }
+        } else {
+                char ch;
+                if (dstlen > srclen)
+                        dstlen = srclen;
+                while (--dstlen >= 0)
+                        *dst++ = (ch = *src++) == ':' ? '/' : ch;
        }
-        for (; i < HFS_NAMELEN; i++)
+out:
+        out->len = dst - (char *)out->name;
+        dstlen = HFS_NAMELEN - out->len;
+        while (--dstlen >= 0)
                *dst++ = 0;
 }
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 8868d3b766fd..b85abc6e6f83 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -18,8 +18,6 @@
 #include "hfsplus_fs.h"
 #include "hfsplus_raw.h"
-#define REF_PAGES       0
 /* Copy a specified range of bytes from the raw data of a node */
 void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
 {
@@ -450,9 +448,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
                        page_cache_release(page);
                        goto fail;
                }
-#if !REF_PAGES
                page_cache_release(page);
-#endif
                node->page[i] = page;
        }
@@ -612,13 +608,6 @@ void hfs_bnode_get(struct hfs_bnode *node)
 {
        if (node) {
                atomic_inc(&node->refcnt);
-#if REF_PAGES
-                {
-                int i;
-                for (i = 0; i < node->tree->pages_per_bnode; i++)
-                        get_page(node->page[i]);
-                }
-#endif
                dprint(DBG_BNODE_REFS, "get_node(%d:%d): %d\n",
                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
        }
@@ -635,20 +624,12 @@ void hfs_bnode_put(struct hfs_bnode *node)
                       node->tree->cnid, node->this, atomic_read(&node->refcnt));
                if (!atomic_read(&node->refcnt))
                        BUG();
-                if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) {
+                if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock))
-#if REF_PAGES
-                        for (i = 0; i < tree->pages_per_bnode; i++)
-                                put_page(node->page[i]);
-#endif
                        return;
-                }
                for (i = 0; i < tree->pages_per_bnode; i++) {
                        if (!node->page[i])
                                continue;
                        mark_page_accessed(node->page[i]);
-#if REF_PAGES
-                        put_page(node->page[i]);
-#endif
                }
                if (test_bit(HFS_BNODE_DELETED, &node->flags)) {
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 533094a570df..2bc0cdd30e56 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -343,8 +343,9 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size);
 /* options.c */
-int parse_options(char *, struct hfsplus_sb_info *);
+int hfsplus_parse_options(char *, struct hfsplus_sb_info *);
-void fill_defaults(struct hfsplus_sb_info *);
+void hfsplus_fill_defaults(struct hfsplus_sb_info *);
+int hfsplus_show_options(struct seq_file *, struct vfsmount *);
 /* tables.c */
 extern u16 hfsplus_case_fold_table[];
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 1cca0102c98d..cca0818aa4ca 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -13,6 +13,8 @@
 #include <linux/sched.h>
 #include <linux/parser.h>
 #include <linux/nls.h>
+#include <linux/mount.h>
+#include <linux/seq_file.h>
 #include "hfsplus_fs.h"
 enum {
@@ -38,7 +40,7 @@ static match_table_t tokens = {
 };
 /* Initialize an options object to reasonable defaults */
-void fill_defaults(struct hfsplus_sb_info *opts)
+void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
 {
        if (!opts)
                return;
@@ -63,7 +65,7 @@ static inline int match_fourchar(substring_t *arg, u32 *result)
 /* Parse options from mount. Returns 0 on failure */
 /* input is the options passed to mount() as a string */
-int parse_options(char *input, struct hfsplus_sb_info *sbi)
+int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
 {
        char *p;
        substring_t args[MAX_OPT_ARGS];
@@ -160,3 +162,23 @@ done:
        return 1;
 }
+/*
+ * Write the HFS+ specific mount options of @mnt into @seq, each one
+ * prefixed with a comma.  Always returns 0.
+ */
+int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
+{
+        struct hfsplus_sb_info *opts = &HFSPLUS_SB(mnt->mnt_sb);
+        /* creator and type are listed only when they differ from the default */
+        if (opts->creator != HFSPLUS_DEF_CR_TYPE) {
+                seq_printf(seq, ",creator=%.4s", (char *)&opts->creator);
+        }
+        if (opts->type != HFSPLUS_DEF_CR_TYPE) {
+                seq_printf(seq, ",type=%.4s", (char *)&opts->type);
+        }
+        /* umask, uid and gid are listed unconditionally */
+        seq_printf(seq, ",umask=%o,uid=%u,gid=%u",
+                   opts->umask, opts->uid, opts->gid);
+        /* part and session are listed only when non-negative */
+        if (opts->part >= 0) {
+                seq_printf(seq, ",part=%u", opts->part);
+        }
+        if (opts->session >= 0) {
+                seq_printf(seq, ",session=%u", opts->session);
+        }
+        if (opts->nls) {
+                seq_printf(seq, ",nls=%s", opts->nls->charset);
+        }
+        if (opts->flags & HFSPLUS_SB_NODECOMPOSE) {
+                seq_puts(seq, ",nodecompose");
+        }
+        return 0;
+}
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index d55ad67b8e42..fd0f0f050e1d 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -217,8 +217,7 @@ static void hfsplus_put_super(struct super_block *sb)
                vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
                vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
                mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-                ll_rw_block(WRITE, 1, &HFSPLUS_SB(sb).s_vhbh);
+                sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
-                wait_on_buffer(HFSPLUS_SB(sb).s_vhbh);
        }
        hfs_btree_close(HFSPLUS_SB(sb).cat_tree);
@@ -277,6 +276,7 @@ static struct super_operations hfsplus_sops = {
        .write_super    = hfsplus_write_super,
        .statfs         = hfsplus_statfs,
        .remount_fs     = hfsplus_remount,
+        .show_options   = hfsplus_show_options,
 };
 static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
@@ -297,8 +297,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
        memset(sbi, 0, sizeof(HFSPLUS_SB(sb)));
        sb->s_fs_info = sbi;
        INIT_HLIST_HEAD(&sbi->rsrc_inodes);
-        fill_defaults(sbi);
+        hfsplus_fill_defaults(sbi);
-        if (!parse_options(data, sbi)) {
+        if (!hfsplus_parse_options(data, sbi)) {
                if (!silent)
                        printk("HFS+-fs: unable to parse mount options\n");
                err = -EINVAL;
@@ -415,8 +415,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
        vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
        vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
        mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh);
-        ll_rw_block(WRITE, 1, &HFSPLUS_SB(sb).s_vhbh);
+        sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh);
-        wait_on_buffer(HFSPLUS_SB(sb).s_vhbh);
        if (!HFSPLUS_SB(sb).hidden_dir) {
                printk("HFS+: create hidden dir...\n");
diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h
index 67bca0d4a33b..cca3fb693f99 100644
--- a/fs/hostfs/hostfs.h
+++ b/fs/hostfs/hostfs.h
@@ -49,7 +49,6 @@ struct hostfs_iattr {
        struct timespec ia_atime;
        struct timespec ia_mtime;
        struct timespec ia_ctime;
-        unsigned int    ia_attr_flags;
 };
 extern int stat_file(const char *path, unsigned long long *inode_out,
diff --git a/fs/inode.c b/fs/inode.c
index e57f1724db3e..71df1b1e8f75 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1195,9 +1195,6 @@ void update_atime(struct inode *inode)
        if (!timespec_equal(&inode->i_atime, &now)) {
                inode->i_atime = now;
                mark_inode_dirty_sync(inode);
-        } else {
-                if (!timespec_equal(&inode->i_atime, &now))
-                        inode->i_atime = now;
        }
 }
diff --git a/fs/inotify.c b/fs/inotify.c
index 2e4e2a57708c..a37e9fb1da58 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -37,6 +37,7 @@
 #include <asm/ioctls.h>
 static atomic_t inotify_cookie;
+static atomic_t inotify_watches;
 static kmem_cache_t *watch_cachep;
 static kmem_cache_t *event_cachep;
@@ -422,6 +423,7 @@ static struct inotify_watch *create_watch(struct inotify_device *dev,
        get_inotify_watch(watch);
        atomic_inc(&dev->user->inotify_watches);
+        atomic_inc(&inotify_watches);
        return watch;
 }
@@ -454,6 +456,7 @@ static void remove_watch_no_event(struct inotify_watch *watch,
        list_del(&watch->d_list);
        atomic_dec(&dev->user->inotify_watches);
+        atomic_dec(&inotify_watches);
        idr_remove(&dev->idr, watch->wd);
        put_inotify_watch(watch);
 }
@@ -532,6 +535,9 @@ void inotify_dentry_parent_queue_event(struct dentry *dentry, u32 mask,
        struct dentry *parent;
        struct inode *inode;
+        if (!atomic_read (&inotify_watches))
+                return;
        spin_lock(&dentry->d_lock);
        parent = dentry->d_parent;
        inode = parent->d_inode;
@@ -925,6 +931,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
        struct nameidata nd;
        struct file *filp;
        int ret, fput_needed;
+        int mask_add = 0;
        filp = fget_light(fd, &fput_needed);
        if (unlikely(!filp))
@@ -947,6 +954,9 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
        down(&inode->inotify_sem);
        down(&dev->sem);
+        if (mask & IN_MASK_ADD)
+                mask_add = 1;
        /* don't let user-space set invalid bits: we don't want flags set */
        mask &= IN_ALL_EVENTS;
        if (unlikely(!mask)) {
@@ -960,7 +970,10 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask)
         */
        old = inode_find_dev(inode, dev);
        if (unlikely(old)) {
-                old->mask = mask;
+                if (mask_add)
+                        old->mask |= mask;
+                else
+                        old->mask = mask;
                ret = old->wd;
                goto out;
        }
@@ -1043,6 +1056,7 @@ static int __init inotify_setup(void)
        inotify_max_user_watches = 8192;
        atomic_set(&inotify_cookie, 0);
+        atomic_set(&inotify_watches, 0);
        watch_cachep = kmem_cache_create("inotify_watch_cache",
                                         sizeof(struct inotify_watch),
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index 5a97e346bd95..014a51fd00d7 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -204,7 +204,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
        int i;
        spin_unlock(&journal->j_list_lock);
-        ll_rw_block(WRITE, *batch_count, bhs);
+        ll_rw_block(SWRITE, *batch_count, bhs);
        spin_lock(&journal->j_list_lock);
        for (i = 0; i < *batch_count; i++) {
                struct buffer_head *bh = bhs[i];
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index dac720c837ab..2a3e310f79ef 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -358,7 +358,7 @@ write_out_data:
                                        jbd_debug(2, "submit %d writes\n",
                                                        bufs);
                                        spin_unlock(&journal->j_list_lock);
-                                        ll_rw_block(WRITE, bufs, wbuf);
+                                        ll_rw_block(SWRITE, bufs, wbuf);
                                        journal_brelse_array(wbuf, bufs);
                                        bufs = 0;
                                        goto write_out_data;
@@ -381,7 +381,7 @@ write_out_data:
        if (bufs) {
                spin_unlock(&journal->j_list_lock);
-                ll_rw_block(WRITE, bufs, wbuf);
+                ll_rw_block(SWRITE, bufs, wbuf);
                journal_brelse_array(wbuf, bufs);
                spin_lock(&journal->j_list_lock);
        }
@@ -720,11 +720,17 @@ wait_for_iobuf:
        J_ASSERT(commit_transaction->t_log_list == NULL);
 restart_loop:
+        /*
+         * As there are other places (journal_unmap_buffer()) adding buffers
+         * to this list we have to be careful and hold the j_list_lock.
+         */
+        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                jh = commit_transaction->t_forget;
+                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
@@ -792,9 +798,25 @@ restart_loop:
                        journal_remove_journal_head(bh);  /* needs a brelse */
                        release_buffer_page(bh);
                }
+                cond_resched_lock(&journal->j_list_lock);
+        }
+        spin_unlock(&journal->j_list_lock);
+        /*
+         * This is a bit sleazy.  We borrow j_list_lock to protect
+         * journal->j_committing_transaction in __journal_remove_checkpoint.
+         * Really, __journal_remove_checkpoint should be using j_state_lock but
+         * it's a bit hassle to hold that across __journal_remove_checkpoint
+         */
+        spin_lock(&journal->j_state_lock);
+        spin_lock(&journal->j_list_lock);
+        /*
+         * Now recheck if some buffers did not get attached to the transaction
+         * while the lock was dropped...
+         */
+        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
-                if (cond_resched())
+                spin_unlock(&journal->j_state_lock);
-                        goto restart_loop;
+                goto restart_loop;
        }
        /* Done with this transaction! */
@@ -803,14 +825,6 @@ restart_loop:
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
-        /*
-         * This is a bit sleazy.  We borrow j_list_lock to protect
-         * journal->j_committing_transaction in __journal_remove_checkpoint.
-         * Really, __jornal_remove_checkpoint should be using j_state_lock but
-         * it's a bit hassle to hold that across __journal_remove_checkpoint
-         */
-        spin_lock(&journal->j_state_lock);
-        spin_lock(&journal->j_list_lock);
        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 5e7b43949517..7ae2c4fe506b 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -65,7 +65,6 @@ EXPORT_SYMBOL(journal_set_features);
 EXPORT_SYMBOL(journal_create);
 EXPORT_SYMBOL(journal_load);
 EXPORT_SYMBOL(journal_destroy);
-EXPORT_SYMBOL(journal_recover);
 EXPORT_SYMBOL(journal_update_superblock);
 EXPORT_SYMBOL(journal_abort);
 EXPORT_SYMBOL(journal_errno);
@@ -81,6 +80,7 @@ EXPORT_SYMBOL(journal_try_to_free_buffers);
 EXPORT_SYMBOL(journal_force_commit);
 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
+static void __journal_abort_soft (journal_t *journal, int errno);
 /*
 * Helper function used to manage commit timeouts
@@ -93,16 +93,6 @@ static void commit_timeout(unsigned long __data)
        wake_up_process(p);
 }
-/* Static check for data structure consistency.  There's no code
- * invoked --- we'll just get a linker failure if things aren't right.
- */
-void __journal_internal_check(void)
-{
-        extern void journal_bad_superblock_size(void);
-        if (sizeof(struct journal_superblock_s) != 1024)
-                journal_bad_superblock_size();
-}
 /*
 * kjournald: The main thread function used to manage a logging device
 * journal.
@@ -119,16 +109,12 @@ void __journal_internal_check(void)
 *    known as checkpointing, and this thread is responsible for that job.
 */
-journal_t *current_journal;             // AKPM: debug
+static int kjournald(void *arg)
-int kjournald(void *arg)
 {
        journal_t *journal = (journal_t *) arg;
        transaction_t *transaction;
        struct timer_list timer;
-        current_journal = journal;
        daemonize("kjournald");
        /* Set up an interval timer which can be used to trigger a
@@ -193,6 +179,8 @@ loop:
                if (transaction && time_after_eq(jiffies,
                                                transaction->t_expires))
                        should_sleep = 0;
+                if (journal->j_flags & JFS_UNMOUNT)
+                        should_sleep = 0;
                if (should_sleep) {
                        spin_unlock(&journal->j_state_lock);
                        schedule();
@@ -969,7 +957,7 @@ void journal_update_superblock(journal_t *journal, int wait)
        if (wait)
                sync_dirty_buffer(bh);
        else
-                ll_rw_block(WRITE, 1, &bh);
+                ll_rw_block(SWRITE, 1, &bh);
 out:
        /* If we have just flushed the log (by marking s_start==0), then
@@ -1439,7 +1427,7 @@ int journal_wipe(journal_t *journal, int write)
 * device this journal is present.
 */
-const char *journal_dev_name(journal_t *journal, char *buffer)
+static const char *journal_dev_name(journal_t *journal, char *buffer)
 {
        struct block_device *bdev;
@@ -1485,7 +1473,7 @@ void __journal_abort_hard(journal_t *journal)
 /* Soft abort: record the abort error status in the journal superblock,
 * but don't do any other IO. */
-void __journal_abort_soft (journal_t *journal, int errno)
+static void __journal_abort_soft (journal_t *journal, int errno)
 {
        if (journal->j_flags & JFS_ABORT)
                return;
@@ -1880,7 +1868,7 @@ EXPORT_SYMBOL(journal_enable_debug);
 static struct proc_dir_entry *proc_jbd_debug;
-int read_jbd_debug(char *page, char **start, off_t off,
+static int read_jbd_debug(char *page, char **start, off_t off,
                          int count, int *eof, void *data)
 {
        int ret;
@@ -1890,7 +1878,7 @@ int read_jbd_debug(char *page, char **start, off_t off,
        return ret;
 }
-int write_jbd_debug(struct file *file, const char __user *buffer,
+static int write_jbd_debug(struct file *file, const char __user *buffer,
                           unsigned long count, void *data)
 {
        char buf[32];
@@ -1979,6 +1967,14 @@ static int __init journal_init(void)
 {
        int ret;
+/* Static check for data structure consistency.  There's no code
+ * invoked --- we'll just get a linker failure if things aren't right.
+ */
+        extern void journal_bad_superblock_size(void);
+        if (sizeof(struct journal_superblock_s) != 1024)
+                journal_bad_superblock_size();
        ret = journal_init_caches();
        if (ret != 0)
                journal_destroy_caches();
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index d327a598f861..a56144183462 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -116,7 +116,8 @@ static inline int hash(journal_t *journal, unsigned long block)
                (block << (hash_shift - 12))) & (table->hash_size - 1);
 }
-int insert_revoke_hash(journal_t *journal, unsigned long blocknr, tid_t seq)
+static int insert_revoke_hash(journal_t *journal, unsigned long blocknr,
+                              tid_t seq)
 {
        struct list_head *hash_list;
        struct jbd_revoke_record_s *record;
@@ -613,7 +614,7 @@ static void flush_descriptor(journal_t *journal,
        set_buffer_jwrite(bh);
        BUFFER_TRACE(bh, "write");
        set_buffer_dirty(bh);
-        ll_rw_block(WRITE, 1, &bh);
+        ll_rw_block(SWRITE, 1, &bh);
 }
 #endif
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 77b7662b840b..c6ec66fd8766 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -490,23 +490,21 @@ void journal_unlock_updates (journal_t *journal)
 */
 static void jbd_unexpected_dirty_buffer(struct journal_head *jh)
 {
-        struct buffer_head *bh = jh2bh(jh);
        int jlist;
-        if (buffer_dirty(bh)) {
+        /* If this buffer is one which might reasonably be dirty
-                /* If this buffer is one which might reasonably be dirty
+         * --- ie. data, or not part of this journal --- then
-                 * --- ie. data, or not part of this journal --- then
+         * we're OK to leave it alone, but otherwise we need to
-                 * we're OK to leave it alone, but otherwise we need to
+         * move the dirty bit to the journal's own internal
-                 * move the dirty bit to the journal's own internal
+         * JBDDirty bit. */
-                 * JBDDirty bit. */
+        jlist = jh->b_jlist;
-                jlist = jh->b_jlist;
+        if (jlist == BJ_Metadata || jlist == BJ_Reserved ||
-                if (jlist == BJ_Metadata || jlist == BJ_Reserved || 
+            jlist == BJ_Shadow || jlist == BJ_Forget) {
-                    jlist == BJ_Shadow || jlist == BJ_Forget) {
+                struct buffer_head *bh = jh2bh(jh);
-                        if (test_clear_buffer_dirty(jh2bh(jh))) {
-                                set_bit(BH_JBDDirty, &jh2bh(jh)->b_state);
+                if (test_clear_buffer_dirty(bh))
-                        }
+                        set_buffer_jbddirty(bh);
-                }
        }
 }
@@ -574,9 +572,14 @@ repeat:
                        if (jh->b_next_transaction)
                                J_ASSERT_JH(jh, jh->b_next_transaction ==
                                                        transaction);
-                        JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                }
-                        jbd_unexpected_dirty_buffer(jh);
+                /*
-                }
+                 * In any case we need to clean the dirty flag and we must
+                 * do it under the buffer lock to be sure we don't race
+                 * with running write-out.
+                 */
+                JBUFFER_TRACE(jh, "Unexpected dirty buffer");
+                jbd_unexpected_dirty_buffer(jh);
        }
        unlock_buffer(bh);
diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c
index bfbeb4c86e03..777b90057b89 100644
--- a/fs/jffs/inode-v23.c
+++ b/fs/jffs/inode-v23.c
@@ -1629,9 +1629,6 @@ static int jffs_fsync(struct file *f, struct dentry *d, int datasync)
 }
-extern int generic_file_open(struct inode *, struct file *) __attribute__((weak));
-extern loff_t generic_file_llseek(struct file *, loff_t, int) __attribute__((weak));
 static struct file_operations jffs_file_operations =
 {
        .open           = generic_file_open,
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index bd9ed9b0247b..8279bf0133ff 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -21,9 +21,6 @@
 #include <linux/jffs2.h>
 #include "nodelist.h"
-extern int generic_file_open(struct inode *, struct file *) __attribute__((weak));
-extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) __attribute__((weak));
 static int jffs2_commit_write (struct file *filp, struct page *pg,
                               unsigned start, unsigned end);
 static int jffs2_prepare_write (struct file *filp, struct page *pg,
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 86ccac80f0ab..72a5588faeca 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -37,6 +37,9 @@
 #define JFS_ERR_CONTINUE   0x00000004   /* continue */
 #define JFS_ERR_PANIC      0x00000008   /* panic */
+#define JFS_USRQUOTA    0x00000010
+#define JFS_GRPQUOTA    0x00000020
 /* platform option (conditional compilation) */
 #define JFS_AIX         0x80000000      /* AIX support */
 /*      POSIX name/directory  support */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 1cae14e741eb..49ccde3937f9 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1390,6 +1390,8 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
        jfs_info("jfs_lookup: name = %s", name);
+        if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
+                dentry->d_op = &jfs_ci_dentry_operations;
        if ((name[0] == '.') && (len == 1))
                inum = dip->i_ino;
@@ -1417,9 +1419,6 @@ static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry, struc
                return ERR_PTR(-EACCES);
        }
-        if (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2)
-                dentry->d_op = &jfs_ci_dentry_operations;
        dentry = d_splice_alias(ip, dentry);
        if (dentry && (JFS_SBI(dip->i_sb)->mntflag & JFS_OS2))
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 9ff89720f93b..71bc34b96b2b 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -23,9 +23,11 @@
 #include <linux/parser.h>
 #include <linux/completion.h>
 #include <linux/vfs.h>
+#include <linux/mount.h>
 #include <linux/moduleparam.h>
 #include <linux/posix_acl.h>
 #include <asm/uaccess.h>
+#include <linux/seq_file.h>
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -192,7 +194,8 @@ static void jfs_put_super(struct super_block *sb)
 enum {
        Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize,
-        Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err,
+        Opt_resize_nosize, Opt_errors, Opt_ignore, Opt_err, Opt_quota,
+        Opt_usrquota, Opt_grpquota
 };
 static match_table_t tokens = {
@@ -204,8 +207,8 @@ static match_table_t tokens = {
        {Opt_errors, "errors=%s"},
        {Opt_ignore, "noquota"},
        {Opt_ignore, "quota"},
-        {Opt_ignore, "usrquota"},
+        {Opt_usrquota, "usrquota"},
-        {Opt_ignore, "grpquota"},
+        {Opt_grpquota, "grpquota"},
        {Opt_err, NULL}
 };
@@ -293,6 +296,24 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
                        }
                        break;
                }
+#if defined(CONFIG_QUOTA)
+                case Opt_quota:
+                case Opt_usrquota:
+                        *flag |= JFS_USRQUOTA;
+                        break;
+                case Opt_grpquota:
+                        *flag |= JFS_GRPQUOTA;
+                        break;
+#else
+                case Opt_usrquota:
+                case Opt_grpquota:
+                case Opt_quota:
+                        printk(KERN_ERR
+                               "JFS: quota operations not supported\n");
+                        break;
+#endif
                default:
                        printk("jfs: Unrecognized mount option \"%s\" "
                                        " or missing value\n", p);
@@ -539,6 +560,26 @@ static int jfs_sync_fs(struct super_block *sb, int wait)
        return 0;
 }
+/*
+ * Write the JFS specific mount options of @vfs into @seq, each one
+ * prefixed with a comma.  Always returns 0.
+ */
+static int jfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+        struct jfs_sb_info *jfs_sb = JFS_SBI(vfs->mnt_sb);
+        /* exactly one of the two integrity settings is always reported */
+        seq_puts(seq, (jfs_sb->flag & JFS_NOINTEGRITY) ?
+                      ",nointegrity" : ",integrity");
+#if defined(CONFIG_QUOTA)
+        /* quota options are reported only when CONFIG_QUOTA is defined */
+        if (jfs_sb->flag & JFS_USRQUOTA) {
+                seq_puts(seq, ",usrquota");
+        }
+        if (jfs_sb->flag & JFS_GRPQUOTA) {
+                seq_puts(seq, ",grpquota");
+        }
+#endif
+        return 0;
+}
 static struct super_operations jfs_super_operations = {
        .alloc_inode    = jfs_alloc_inode,
        .destroy_inode  = jfs_destroy_inode,
@@ -552,6 +593,7 @@ static struct super_operations jfs_super_operations = {
        .unlockfs       = jfs_unlockfs,
        .statfs         = jfs_statfs,
        .remount_fs     = jfs_remount,
+        .show_options   = jfs_show_options
 };
 static struct export_operations jfs_export_operations = {
diff --git a/fs/namei.c b/fs/namei.c
index 6ec1f0fefc5b..145e852c4bd0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -525,6 +525,22 @@ static inline int __do_follow_link(struct path *path, struct nameidata *nd)
        return error;
 }
+/*
+ * Call dput() on @path's dentry and, when @path's vfsmount is not the one
+ * @nd currently uses, mntput() on @path's vfsmount as well.
+ */
+static inline void dput_path(struct path *path, struct nameidata *nd)
+{
+        struct vfsmount *path_mnt = path->mnt;
+        dput(path->dentry);
+        if (nd->mnt != path_mnt) {
+                mntput(path_mnt);
+        }
+}
+/*
+ * Switch @nd over to the location described by @path: dput() the dentry
+ * @nd currently refers to, mntput() its vfsmount when it differs from
+ * @path's, then copy @path's dentry and vfsmount into @nd.
+ */
+static inline void path_to_nameidata(struct path *path, struct nameidata *nd)
+{
+        struct vfsmount *new_mnt = path->mnt;
+        dput(nd->dentry);
+        if (new_mnt != nd->mnt) {
+                mntput(nd->mnt);
+        }
+        nd->dentry = path->dentry;
+        nd->mnt = new_mnt;
+}
 /*
 * This limits recursive symlink follows to 8, while
 * limiting consecutive symlinks to 40.
@@ -552,9 +568,7 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
        nd->depth--;
        return err;
 loop:
-        dput(path->dentry);
+        dput_path(path, nd);
-        if (path->mnt != nd->mnt)
-                mntput(path->mnt);
        path_release(nd);
        return err;
 }
@@ -813,13 +827,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd)
                        err = -ENOTDIR; 
                        if (!inode->i_op)
                                break;
-                } else {
+                } else
-                        dput(nd->dentry);
+                        path_to_nameidata(&next, nd);
-                        if (nd->mnt != next.mnt)
-                                mntput(nd->mnt);
-                        nd->mnt = next.mnt;
-                        nd->dentry = next.dentry;
-                }
                err = -ENOTDIR; 
                if (!inode->i_op->lookup)
                        break;
@@ -859,13 +868,8 @@ last_component:
                        if (err)
                                goto return_err;
                        inode = nd->dentry->d_inode;
-                } else {
+                } else
-                        dput(nd->dentry);
+                        path_to_nameidata(&next, nd);
-                        if (nd->mnt != next.mnt)
-                                mntput(nd->mnt);
-                        nd->mnt = next.mnt;
-                        nd->dentry = next.dentry;
-                }
                err = -ENOENT;
                if (!inode)
                        break;
@@ -901,9 +905,7 @@ return_reval:
 return_base:
                return 0;
 out_dput:
-                dput(next.dentry);
+                dput_path(&next, nd);
-                if (nd->mnt != next.mnt)
-                        mntput(next.mnt);
                break;
        }
        path_release(nd);
@@ -1507,11 +1509,7 @@ do_last:
        if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
                goto do_link;
-        dput(nd->dentry);
+        path_to_nameidata(&path, nd);
-        nd->dentry = path.dentry;
-        if (nd->mnt != path.mnt)
-                mntput(nd->mnt);
-        nd->mnt = path.mnt;
        error = -EISDIR;
        if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
                goto exit;
@@ -1522,9 +1520,7 @@ ok:
        return 0;
 exit_dput:
-        dput(path.dentry);
+        dput_path(&path, nd);
-        if (nd->mnt != path.mnt)
-                mntput(path.mnt);
 exit:
        path_release(nd);
        return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 79bd8a46e1e7..34156260c9b6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -40,7 +40,7 @@ static inline int sysfs_init(void)
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 static struct list_head *mount_hashtable;
-static int hash_mask, hash_bits;
+static int hash_mask __read_mostly, hash_bits __read_mostly;
 static kmem_cache_t *mnt_cache; 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
@@ -1334,8 +1334,12 @@ asmlinkage long sys_pivot_root(const char __user *new_root, const char __user *p
        error = -EINVAL;
        if (user_nd.mnt->mnt_root != user_nd.dentry)
                goto out2; /* not a mountpoint */
+        if (user_nd.mnt->mnt_parent == user_nd.mnt)
+                goto out2; /* not attached */
        if (new_nd.mnt->mnt_root != new_nd.dentry)
                goto out2; /* not a mountpoint */
+        if (new_nd.mnt->mnt_parent == new_nd.mnt)
+                goto out2; /* not attached */
        tmp = old_nd.mnt; /* make sure we can reach put_old from new_root */
        spin_lock(&vfsmount_lock);
        if (tmp != new_nd.mnt) {
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 9a11aa39e2e4..057aff745506 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -26,6 +26,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/hash.h>
+#include <linux/module.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -221,6 +222,7 @@ static int expkey_show(struct seq_file *m,
 }
        
 struct cache_detail svc_expkey_cache = {
+        .owner          = THIS_MODULE,
        .hash_size      = EXPKEY_HASHMAX,
        .hash_table     = expkey_table,
        .name           = "nfsd.fh",
@@ -456,6 +458,7 @@ static int svc_export_show(struct seq_file *m,
        return 0;
 }
 struct cache_detail svc_export_cache = {
+        .owner          = THIS_MODULE,
        .hash_size      = EXPORT_HASHMAX,
        .hash_table     = export_table,
        .name           = "nfsd.export",
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
index 5605a26efc57..13369650cdf9 100644
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -187,6 +187,7 @@ static int         idtoname_parse(struct cache_detail *, char *, int);
 static struct ent *idtoname_lookup(struct ent *, int);
 static struct cache_detail idtoname_cache = {
+        .owner          = THIS_MODULE,
        .hash_size      = ENT_HASHMAX,
        .hash_table     = idtoname_table,
        .name           = "nfs4.idtoname",
@@ -320,6 +321,7 @@ static struct ent *nametoid_lookup(struct ent *, int);
 static int         nametoid_parse(struct cache_detail *, char *, int);
 static struct cache_detail nametoid_cache = {
+        .owner          = THIS_MODULE,
        .hash_size      = ENT_HASHMAX,
        .hash_table     = nametoid_table,
        .name           = "nfs4.nametoid",
@@ -404,8 +406,10 @@ nfsd_idmap_init(void)
 void
 nfsd_idmap_shutdown(void)
 {
-        cache_unregister(&idtoname_cache);
+        if (cache_unregister(&idtoname_cache))
-        cache_unregister(&nametoid_cache);
+                printk(KERN_ERR "nfsd: failed to unregister idtoname cache\n");
+        if (cache_unregister(&nametoid_cache))
+                printk(KERN_ERR "nfsd: failed to unregister nametoid cache\n");
 }
 /*
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 57ed50fe7f85..954cf893d50c 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -93,7 +93,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
        dprintk("NFSD: nfs4_make_rec_clidname for %.*s\n",
                        clname->len, clname->data);
-        tfm = crypto_alloc_tfm("md5", 0);
+        tfm = crypto_alloc_tfm("md5", CRYPTO_TFM_REQ_MAY_SLEEP);
        if (tfm == NULL)
                goto out;
        cksum.len = crypto_tfm_alg_digestsize(tfm);
@@ -114,8 +114,7 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname)
        kfree(cksum.data);
        status = nfs_ok;
 out:
-        if (tfm)
+        crypto_free_tfm(tfm);
-                crypto_free_tfm(tfm);
        return status;
 }
diff --git a/fs/open.c b/fs/open.c
index 32bf05e2996d..4ee2dcc31c28 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -933,16 +933,11 @@ void fastcall fd_install(unsigned int fd, struct file * file)
 EXPORT_SYMBOL(fd_install);
-asmlinkage long sys_open(const char __user * filename, int flags, int mode)
+long do_sys_open(const char __user *filename, int flags, int mode)
 {
-        char * tmp;
+        char *tmp = getname(filename);
-        int fd;
+        int fd = PTR_ERR(tmp);
-        if (force_o_largefile())
-                flags |= O_LARGEFILE;
-        tmp = getname(filename);
-        fd = PTR_ERR(tmp);
        if (!IS_ERR(tmp)) {
                fd = get_unused_fd();
                if (fd >= 0) {
@@ -959,6 +954,14 @@ asmlinkage long sys_open(const char __user * filename, int flags, int mode)
        }
        return fd;
 }
+asmlinkage long sys_open(const char __user *filename, int flags, int mode)
+{
+        if (force_o_largefile())
+                flags |= O_LARGEFILE;
+        return do_sys_open(filename, flags, mode);
+}
 EXPORT_SYMBOL_GPL(sys_open);
 #ifndef __alpha__
diff --git a/fs/pipe.c b/fs/pipe.c
index 25aa09f9d09d..2c7a23dde2d8 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -415,6 +415,10 @@ pipe_poll(struct file *filp, poll_table *wait)
        if (filp->f_mode & FMODE_WRITE) {
                mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+                /*
+                 * Most Unices do not set POLLERR for FIFOs but on Linux they
+                 * behave exactly like pipes for poll().
+                 */
                if (!PIPE_READERS(*inode))
                        mask |= POLLERR;
        }
@@ -422,9 +426,6 @@ pipe_poll(struct file *filp, poll_table *wait)
        return mask;
 }
-/* FIXME: most Unices do not set POLLERR for fifos */
-#define fifo_poll pipe_poll
 static int
 pipe_release(struct inode *inode, int decr, int decw)
 {
@@ -568,7 +569,7 @@ struct file_operations read_fifo_fops = {
        .read           = pipe_read,
        .readv          = pipe_readv,
        .write          = bad_pipe_w,
-        .poll           = fifo_poll,
+        .poll           = pipe_poll,
        .ioctl          = pipe_ioctl,
        .open           = pipe_read_open,
        .release        = pipe_read_release,
@@ -580,7 +581,7 @@ struct file_operations write_fifo_fops = {
        .read           = bad_pipe_r,
        .write          = pipe_write,
        .writev         = pipe_writev,
-        .poll           = fifo_poll,
+        .poll           = pipe_poll,
        .ioctl          = pipe_ioctl,
        .open           = pipe_write_open,
        .release        = pipe_write_release,
@@ -593,7 +594,7 @@ struct file_operations rdwr_fifo_fops = {
        .readv          = pipe_readv,
        .write          = pipe_write,
        .writev         = pipe_writev,
-        .poll           = fifo_poll,
+        .poll           = pipe_poll,
        .ioctl          = pipe_ioctl,
        .open           = pipe_rdwr_open,
        .release        = pipe_rdwr_release,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 491f2d9f89ac..84751f3f52d5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -11,6 +11,40 @@
 *  go into icache. We cache the reference to task_struct upon lookup too.
 *  Eventually it should become a filesystem in its own. We don't use the
 *  rest of procfs anymore.
+ *
+ *
+ *  Changelog:
+ *  17-Jan-2005
+ *  Allan Bezerra
+ *  Bruna Moreira <bruna.moreira@indt.org.br>
+ *  Edjard Mota <edjard.mota@indt.org.br>
+ *  Ilias Biris <ilias.biris@indt.org.br>
+ *  Mauricio Lin <mauricio.lin@indt.org.br>
+ *
+ *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *
+ *  A new process specific entry (smaps) included in /proc. It shows the
+ *  size of rss for each memory area. The maps entry lacks information
+ *  about physical memory size (rss) for each mapped file, i.e.,
+ *  rss information for executables and library files.
+ *  This additional information is useful for any tools that need to know
+ *  about physical memory consumption for a process specific library.
+ *
+ *  Changelog:
+ *  21-Feb-2005
+ *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
+ *  Include the pud level in the page table walk.
+ *
+ *  ChangeLog:
+ *  10-Mar-2005
+ *  10LE Instituto Nokia de Tecnologia - INdT:
+ *  A better way to walk through the page table, as suggested by Hugh Dickins.
+ *
+ *  Simo Piiroinen <simo.piiroinen@nokia.com>:
+ *  Smaps information related to shared, private, clean and dirty pages.
+ *
+ *  Paul Mundt <paul.mundt@nokia.com>:
+ *  Overall revision of smaps.
 */
 #include <asm/uaccess.h>
@@ -65,8 +99,10 @@ enum pid_directory_inos {
        PROC_TGID_STAT,
        PROC_TGID_STATM,
        PROC_TGID_MAPS,
+        PROC_TGID_NUMA_MAPS,
        PROC_TGID_MOUNTS,
        PROC_TGID_WCHAN,
+        PROC_TGID_SMAPS,
 #ifdef CONFIG_SCHEDSTATS
        PROC_TGID_SCHEDSTAT,
 #endif
@@ -83,7 +119,6 @@ enum pid_directory_inos {
 #ifdef CONFIG_AUDITSYSCALL
        PROC_TGID_LOGINUID,
 #endif
-        PROC_TGID_FD_DIR,
        PROC_TGID_OOM_SCORE,
        PROC_TGID_OOM_ADJUST,
        PROC_TID_INO,
@@ -102,8 +137,10 @@ enum pid_directory_inos {
        PROC_TID_STAT,
        PROC_TID_STATM,
        PROC_TID_MAPS,
+        PROC_TID_NUMA_MAPS,
        PROC_TID_MOUNTS,
        PROC_TID_WCHAN,
+        PROC_TID_SMAPS,
 #ifdef CONFIG_SCHEDSTATS
        PROC_TID_SCHEDSTAT,
 #endif
@@ -120,9 +157,11 @@ enum pid_directory_inos {
 #ifdef CONFIG_AUDITSYSCALL
        PROC_TID_LOGINUID,
 #endif
-        PROC_TID_FD_DIR = 0x8000,       /* 0x8000-0xffff */
        PROC_TID_OOM_SCORE,
        PROC_TID_OOM_ADJUST,
+        /* Add new entries before this */
+        PROC_TID_FD_DIR = 0x8000,       /* 0x8000-0xffff */
 };
 struct pid_entry {
@@ -144,6 +183,9 @@ static struct pid_entry tgid_base_stuff[] = {
        E(PROC_TGID_STAT,      "stat",    S_IFREG|S_IRUGO),
        E(PROC_TGID_STATM,     "statm",   S_IFREG|S_IRUGO),
        E(PROC_TGID_MAPS,      "maps",    S_IFREG|S_IRUGO),
+#ifdef CONFIG_NUMA
+        E(PROC_TGID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO),
+#endif
        E(PROC_TGID_MEM,       "mem",     S_IFREG|S_IRUSR|S_IWUSR),
 #ifdef CONFIG_SECCOMP
        E(PROC_TGID_SECCOMP,   "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
@@ -152,6 +194,7 @@ static struct pid_entry tgid_base_stuff[] = {
        E(PROC_TGID_ROOT,      "root",    S_IFLNK|S_IRWXUGO),
        E(PROC_TGID_EXE,       "exe",     S_IFLNK|S_IRWXUGO),
        E(PROC_TGID_MOUNTS,    "mounts",  S_IFREG|S_IRUGO),
+        E(PROC_TGID_SMAPS,     "smaps",   S_IFREG|S_IRUGO),
 #ifdef CONFIG_SECURITY
        E(PROC_TGID_ATTR,      "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
@@ -180,6 +223,9 @@ static struct pid_entry tid_base_stuff[] = {
        E(PROC_TID_STAT,       "stat",    S_IFREG|S_IRUGO),
        E(PROC_TID_STATM,      "statm",   S_IFREG|S_IRUGO),
        E(PROC_TID_MAPS,       "maps",    S_IFREG|S_IRUGO),
+#ifdef CONFIG_NUMA
+        E(PROC_TID_NUMA_MAPS,  "numa_maps",    S_IFREG|S_IRUGO),
+#endif
        E(PROC_TID_MEM,        "mem",     S_IFREG|S_IRUSR|S_IWUSR),
 #ifdef CONFIG_SECCOMP
        E(PROC_TID_SECCOMP,    "seccomp", S_IFREG|S_IRUSR|S_IWUSR),
@@ -188,6 +234,7 @@ static struct pid_entry tid_base_stuff[] = {
        E(PROC_TID_ROOT,       "root",    S_IFLNK|S_IRWXUGO),
        E(PROC_TID_EXE,        "exe",     S_IFLNK|S_IRWXUGO),
        E(PROC_TID_MOUNTS,     "mounts",  S_IFREG|S_IRUGO),
+        E(PROC_TID_SMAPS,      "smaps",   S_IFREG|S_IRUGO),
 #ifdef CONFIG_SECURITY
        E(PROC_TID_ATTR,       "attr",    S_IFDIR|S_IRUGO|S_IXUGO),
 #endif
@@ -251,15 +298,21 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm
        return -ENOENT;
 }
-static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+static struct fs_struct *get_fs_struct(struct task_struct *task)
 {
        struct fs_struct *fs;
-        int result = -ENOENT;
+        task_lock(task);
-        task_lock(proc_task(inode));
+        fs = task->fs;
-        fs = proc_task(inode)->fs;
        if(fs)
                atomic_inc(&fs->count);
-        task_unlock(proc_task(inode));
+        task_unlock(task);
+        return fs;
+}
+static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+{
+        struct fs_struct *fs = get_fs_struct(proc_task(inode));
+        int result = -ENOENT;
        if (fs) {
                read_lock(&fs->lock);
                *mnt = mntget(fs->pwdmnt);
@@ -273,13 +326,8 @@ static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfs
 static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
 {
-        struct fs_struct *fs;
+        struct fs_struct *fs = get_fs_struct(proc_task(inode));
        int result = -ENOENT;
-        task_lock(proc_task(inode));
-        fs = proc_task(inode)->fs;
-        if(fs)
-                atomic_inc(&fs->count);
-        task_unlock(proc_task(inode));
        if (fs) {
                read_lock(&fs->lock);
                *mnt = mntget(fs->rootmnt);
@@ -298,33 +346,6 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
         (task->state == TASK_STOPPED || task->state == TASK_TRACED) && \
         security_ptrace(current,task) == 0))
-static int may_ptrace_attach(struct task_struct *task)
-{
-        int retval = 0;
-        task_lock(task);
-        if (!task->mm)
-                goto out;
-        if (((current->uid != task->euid) ||
-             (current->uid != task->suid) ||
-             (current->uid != task->uid) ||
-             (current->gid != task->egid) ||
-             (current->gid != task->sgid) ||
-             (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
-                goto out;
-        rmb();
-        if (task->mm->dumpable != 1 && !capable(CAP_SYS_PTRACE))
-                goto out;
-        if (security_ptrace(current, task))
-                goto out;
-        retval = 1;
-out:
-        task_unlock(task);
-        return retval;
-}
 static int proc_pid_environ(struct task_struct *task, char * buffer)
 {
        int res = 0;
@@ -334,7 +355,7 @@ static int proc_pid_environ(struct task_struct *task, char * buffer)
                if (len > PAGE_SIZE)
                        len = PAGE_SIZE;
                res = access_process_vm(task, mm->env_start, buffer, len, 0);
-                if (!may_ptrace_attach(task))
+                if (!ptrace_may_attach(task))
                        res = -ESRCH;
                mmput(mm);
        }
@@ -515,6 +536,46 @@ static struct file_operations proc_maps_operations = {
        .release        = seq_release,
 };
+#ifdef CONFIG_NUMA
+extern struct seq_operations proc_pid_numa_maps_op;
+static int numa_maps_open(struct inode *inode, struct file *file)
+{
+        struct task_struct *task = proc_task(inode);
+        int ret = seq_open(file, &proc_pid_numa_maps_op);
+        if (!ret) {
+                struct seq_file *m = file->private_data;
+                m->private = task;
+        }
+        return ret;
+}
+static struct file_operations proc_numa_maps_operations = {
+        .open           = numa_maps_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
+#endif
+extern struct seq_operations proc_pid_smaps_op;
+static int smaps_open(struct inode *inode, struct file *file)
+{
+        struct task_struct *task = proc_task(inode);
+        int ret = seq_open(file, &proc_pid_smaps_op);
+        if (!ret) {
+                struct seq_file *m = file->private_data;
+                m->private = task;
+        }
+        return ret;
+}
+static struct file_operations proc_smaps_operations = {
+        .open           = smaps_open,
+        .read           = seq_read,
+        .llseek         = seq_lseek,
+        .release        = seq_release,
+};
 extern struct seq_operations mounts_op;
 static int mounts_open(struct inode *inode, struct file *file)
 {
@@ -597,7 +658,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
        int ret = -ESRCH;
        struct mm_struct *mm;
-        if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+        if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
                goto out;
        ret = -ENOMEM;
@@ -623,7 +684,7 @@ static ssize_t mem_read(struct file * file, char __user * buf,
                this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
                retval = access_process_vm(task, src, page, this_len, 0);
-                if (!retval || !MAY_PTRACE(task) || !may_ptrace_attach(task)) {
+                if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) {
                        if (!ret)
                                ret = -EIO;
                        break;
@@ -661,7 +722,7 @@ static ssize_t mem_write(struct file * file, const char * buf,
        struct task_struct *task = proc_task(file->f_dentry->d_inode);
        unsigned long dst = *ppos;
-        if (!MAY_PTRACE(task) || !may_ptrace_attach(task))
+        if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
                return -ESRCH;
        page = (char *)__get_free_page(GFP_USER);
@@ -1524,6 +1585,12 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
                case PROC_TGID_MAPS:
                        inode->i_fop = &proc_maps_operations;
                        break;
+#ifdef CONFIG_NUMA
+                case PROC_TID_NUMA_MAPS:
+                case PROC_TGID_NUMA_MAPS:
+                        inode->i_fop = &proc_numa_maps_operations;
+                        break;
+#endif
                case PROC_TID_MEM:
                case PROC_TGID_MEM:
                        inode->i_op = &proc_mem_inode_operations;
@@ -1539,6 +1606,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
                case PROC_TGID_MOUNTS:
                        inode->i_fop = &proc_mounts_operations;
                        break;
+                case PROC_TID_SMAPS:
+                case PROC_TGID_SMAPS:
+                        inode->i_fop = &proc_smaps_operations;
+                        break;
 #ifdef CONFIG_SECURITY
                case PROC_TID_ATTR:
                        inode->i_nlink = 2;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index abe8920313fb..8a8c34461d48 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -249,6 +249,18 @@ out:
        return error;
 }
+static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
+                        struct kstat *stat)
+{
+        struct inode *inode = dentry->d_inode;
+        struct proc_dir_entry *de = PROC_I(inode)->pde;
+        if (de && de->nlink)
+                inode->i_nlink = de->nlink;
+        generic_fillattr(inode, stat);
+        return 0;
+}
 static struct inode_operations proc_file_inode_operations = {
        .setattr        = proc_notify_change,
 };
@@ -475,6 +487,7 @@ static struct file_operations proc_dir_operations = {
 */
 static struct inode_operations proc_dir_inode_operations = {
        .lookup         = proc_lookup,
+        .getattr        = proc_getattr,
        .setattr        = proc_notify_change,
 };
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 28b4a0253a92..c7ef3e48e35b 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2,8 +2,13 @@
 #include <linux/hugetlb.h>
 #include <linux/mount.h>
 #include <linux/seq_file.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/mempolicy.h>
 #include <asm/elf.h>
 #include <asm/uaccess.h>
+#include <asm/tlbflush.h>
 #include "internal.h"
 char *task_mem(struct mm_struct *mm, char *buffer)
@@ -87,49 +92,58 @@ static void pad_len_spaces(struct seq_file *m, int len)
        seq_printf(m, "%*c", len, ' ');
 }
-static int show_map(struct seq_file *m, void *v)
+struct mem_size_stats
+{
+        unsigned long resident;
+        unsigned long shared_clean;
+        unsigned long shared_dirty;
+        unsigned long private_clean;
+        unsigned long private_dirty;
+};
+static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
 {
        struct task_struct *task = m->private;
-        struct vm_area_struct *map = v;
+        struct vm_area_struct *vma = v;
-        struct mm_struct *mm = map->vm_mm;
+        struct mm_struct *mm = vma->vm_mm;
-        struct file *file = map->vm_file;
+        struct file *file = vma->vm_file;
-        int flags = map->vm_flags;
+        int flags = vma->vm_flags;
        unsigned long ino = 0;
        dev_t dev = 0;
        int len;
        if (file) {
-                struct inode *inode = map->vm_file->f_dentry->d_inode;
+                struct inode *inode = vma->vm_file->f_dentry->d_inode;
                dev = inode->i_sb->s_dev;
                ino = inode->i_ino;
        }
        seq_printf(m, "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
-                        map->vm_start,
+                        vma->vm_start,
-                        map->vm_end,
+                        vma->vm_end,
                        flags & VM_READ ? 'r' : '-',
                        flags & VM_WRITE ? 'w' : '-',
                        flags & VM_EXEC ? 'x' : '-',
                        flags & VM_MAYSHARE ? 's' : 'p',
-                        map->vm_pgoff << PAGE_SHIFT,
+                        vma->vm_pgoff << PAGE_SHIFT,
                        MAJOR(dev), MINOR(dev), ino, &len);
        /*
         * Print the dentry name for named mappings, and a
         * special [heap] marker for the heap:
         */
-        if (map->vm_file) {
+        if (file) {
                pad_len_spaces(m, len);
-                seq_path(m, file->f_vfsmnt, file->f_dentry, "");
+                seq_path(m, file->f_vfsmnt, file->f_dentry, "\n");
        } else {
                if (mm) {
-                        if (map->vm_start <= mm->start_brk &&
+                        if (vma->vm_start <= mm->start_brk &&
-                                                map->vm_end >= mm->brk) {
+                                                vma->vm_end >= mm->brk) {
                                pad_len_spaces(m, len);
                                seq_puts(m, "[heap]");
                        } else {
-                                if (map->vm_start <= mm->start_stack &&
+                                if (vma->vm_start <= mm->start_stack &&
-                                        map->vm_end >= mm->start_stack) {
+                                        vma->vm_end >= mm->start_stack) {
                                        pad_len_spaces(m, len);
                                        seq_puts(m, "[stack]");
@@ -141,24 +155,146 @@ static int show_map(struct seq_file *m, void *v)
                }
        }
        seq_putc(m, '\n');
-        if (m->count < m->size)  /* map is copied successfully */
-                m->version = (map != get_gate_vma(task))? map->vm_start: 0;
+        if (mss)
+                seq_printf(m,
+                           "Size:          %8lu kB\n"
+                           "Rss:           %8lu kB\n"
+                           "Shared_Clean:  %8lu kB\n"
+                           "Shared_Dirty:  %8lu kB\n"
+                           "Private_Clean: %8lu kB\n"
+                           "Private_Dirty: %8lu kB\n",
+                           (vma->vm_end - vma->vm_start) >> 10,
+                           mss->resident >> 10,
+                           mss->shared_clean  >> 10,
+                           mss->shared_dirty  >> 10,
+                           mss->private_clean >> 10,
+                           mss->private_dirty >> 10);
+        if (m->count < m->size)  /* vma is copied successfully */
+                m->version = (vma != get_gate_vma(task))? vma->vm_start: 0;
        return 0;
 }
+static int show_map(struct seq_file *m, void *v)
+{
+        return show_map_internal(m, v, 0);
+}
+static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+                                unsigned long addr, unsigned long end,
+                                struct mem_size_stats *mss)
+{
+        pte_t *pte, ptent;
+        unsigned long pfn;
+        struct page *page;
+        pte = pte_offset_map(pmd, addr);
+        do {
+                ptent = *pte;
+                if (pte_none(ptent) || !pte_present(ptent))
+                        continue;
+                mss->resident += PAGE_SIZE;
+                pfn = pte_pfn(ptent);
+                if (!pfn_valid(pfn))
+                        continue;
+                page = pfn_to_page(pfn);
+                if (page_count(page) >= 2) {
+                        if (pte_dirty(ptent))
+                                mss->shared_dirty += PAGE_SIZE;
+                        else
+                                mss->shared_clean += PAGE_SIZE;
+                } else {
+                        if (pte_dirty(ptent))
+                                mss->private_dirty += PAGE_SIZE;
+                        else
+                                mss->private_clean += PAGE_SIZE;
+                }
+        } while (pte++, addr += PAGE_SIZE, addr != end);
+        pte_unmap(pte - 1);
+        cond_resched_lock(&vma->vm_mm->page_table_lock);
+}
+static inline void smaps_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+                                unsigned long addr, unsigned long end,
+                                struct mem_size_stats *mss)
+{
+        pmd_t *pmd;
+        unsigned long next;
+        pmd = pmd_offset(pud, addr);
+        do {
+                next = pmd_addr_end(addr, end);
+                if (pmd_none_or_clear_bad(pmd))
+                        continue;
+                smaps_pte_range(vma, pmd, addr, next, mss);
+        } while (pmd++, addr = next, addr != end);
+}
+static inline void smaps_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+                                unsigned long addr, unsigned long end,
+                                struct mem_size_stats *mss)
+{
+        pud_t *pud;
+        unsigned long next;
+        pud = pud_offset(pgd, addr);
+        do {
+                next = pud_addr_end(addr, end);
+                if (pud_none_or_clear_bad(pud))
+                        continue;
+                smaps_pmd_range(vma, pud, addr, next, mss);
+        } while (pud++, addr = next, addr != end);
+}
+static inline void smaps_pgd_range(struct vm_area_struct *vma,
+                                unsigned long addr, unsigned long end,
+                                struct mem_size_stats *mss)
+{
+        pgd_t *pgd;
+        unsigned long next;
+        pgd = pgd_offset(vma->vm_mm, addr);
+        do {
+                next = pgd_addr_end(addr, end);
+                if (pgd_none_or_clear_bad(pgd))
+                        continue;
+                smaps_pud_range(vma, pgd, addr, next, mss);
+        } while (pgd++, addr = next, addr != end);
+}
+static int show_smap(struct seq_file *m, void *v)
+{
+        struct vm_area_struct *vma = v;
+        struct mm_struct *mm = vma->vm_mm;
+        struct mem_size_stats mss;
+        memset(&mss, 0, sizeof mss);
+        if (mm) {
+                spin_lock(&mm->page_table_lock);
+                smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss);
+                spin_unlock(&mm->page_table_lock);
+        }
+        return show_map_internal(m, v, &mss);
+}
 static void *m_start(struct seq_file *m, loff_t *pos)
 {
        struct task_struct *task = m->private;
        unsigned long last_addr = m->version;
        struct mm_struct *mm;
-        struct vm_area_struct *map, *tail_map;
+        struct vm_area_struct *vma, *tail_vma;
        loff_t l = *pos;
        /*
         * We remember last_addr rather than next_addr to hit with
         * mmap_cache most of the time. We have zero last_addr at
-         * the begining and also after lseek. We will have -1 last_addr
+         * the beginning and also after lseek. We will have -1 last_addr
-         * after the end of the maps.
+         * after the end of the vmas.
         */
        if (last_addr == -1UL)
@@ -168,47 +304,47 @@ static void *m_start(struct seq_file *m, loff_t *pos)
        if (!mm)
                return NULL;
-        tail_map = get_gate_vma(task);
+        tail_vma = get_gate_vma(task);
        down_read(&mm->mmap_sem);
        /* Start with last addr hint */
-        if (last_addr && (map = find_vma(mm, last_addr))) {
+        if (last_addr && (vma = find_vma(mm, last_addr))) {
-                map = map->vm_next;
+                vma = vma->vm_next;
                goto out;
        }
        /*
-         * Check the map index is within the range and do
+         * Check the vma index is within the range and do
         * sequential scan until m_index.
         */
-        map = NULL;
+        vma = NULL;
        if ((unsigned long)l < mm->map_count) {
-                map = mm->mmap;
+                vma = mm->mmap;
-                while (l-- && map)
+                while (l-- && vma)
-                        map = map->vm_next;
+                        vma = vma->vm_next;
                goto out;
        }
        if (l != mm->map_count)
-                tail_map = NULL; /* After gate map */
+                tail_vma = NULL; /* After gate vma */
 out:
-        if (map)
+        if (vma)
-                return map;
+                return vma;
-        /* End of maps has reached */
+        /* End of vmas has been reached */
-        m->version = (tail_map != NULL)? 0: -1UL;
+        m->version = (tail_vma != NULL)? 0: -1UL;
        up_read(&mm->mmap_sem);
        mmput(mm);
-        return tail_map;
+        return tail_vma;
 }
 static void m_stop(struct seq_file *m, void *v)
 {
        struct task_struct *task = m->private;
-        struct vm_area_struct *map = v;
+        struct vm_area_struct *vma = v;
-        if (map && map != get_gate_vma(task)) {
+        if (vma && vma != get_gate_vma(task)) {
-                struct mm_struct *mm = map->vm_mm;
+                struct mm_struct *mm = vma->vm_mm;
                up_read(&mm->mmap_sem);
                mmput(mm);
        }
@@ -217,14 +353,14 @@ static void m_stop(struct seq_file *m, void *v)
 static void *m_next(struct seq_file *m, void *v, loff_t *pos)
 {
        struct task_struct *task = m->private;
-        struct vm_area_struct *map = v;
+        struct vm_area_struct *vma = v;
-        struct vm_area_struct *tail_map = get_gate_vma(task);
+        struct vm_area_struct *tail_vma = get_gate_vma(task);
        (*pos)++;
-        if (map && (map != tail_map) && map->vm_next)
+        if (vma && (vma != tail_vma) && vma->vm_next)
-                return map->vm_next;
+                return vma->vm_next;
        m_stop(m, v);
-        return (map != tail_map)? tail_map: NULL;
+        return (vma != tail_vma)? tail_vma: NULL;
 }
 struct seq_operations proc_pid_maps_op = {
@@ -233,3 +369,140 @@ struct seq_operations proc_pid_maps_op = {
        .stop   = m_stop,
        .show   = show_map
 };
+struct seq_operations proc_pid_smaps_op = {
+        .start  = m_start,
+        .next   = m_next,
+        .stop   = m_stop,
+        .show   = show_smap
+};
+#ifdef CONFIG_NUMA
+struct numa_maps {
+        unsigned long pages;
+        unsigned long anon;
+        unsigned long mapped;
+        unsigned long mapcount_max;
+        unsigned long node[MAX_NUMNODES];
+};
+/*
+ * Calculate numa node maps for a vma
+ */
+static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
+{
+        struct page *page;
+        unsigned long vaddr;
+        struct mm_struct *mm = vma->vm_mm;
+        int i;
+        struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL);
+        if (!md)
+                return NULL;
+        md->pages = 0;
+        md->anon = 0;
+        md->mapped = 0;
+        md->mapcount_max = 0;
+        for_each_node(i)
+                md->node[i] =0;
+        spin_lock(&mm->page_table_lock);
+        for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
+                page = follow_page(mm, vaddr, 0);
+                if (page) {
+                        int count = page_mapcount(page);
+                        if (count)
+                                md->mapped++;
+                        if (count > md->mapcount_max)
+                                md->mapcount_max = count;
+                        md->pages++;
+                        if (PageAnon(page))
+                                md->anon++;
+                        md->node[page_to_nid(page)]++;
+                }
+        }
+        spin_unlock(&mm->page_table_lock);
+        return md;
+}
+static int show_numa_map(struct seq_file *m, void *v)
+{
+        struct task_struct *task = m->private;
+        struct vm_area_struct *vma = v;
+        struct mempolicy *pol;
+        struct numa_maps *md;
+        struct zone **z;
+        int n;
+        int first;
+        if (!vma->vm_mm)
+                return 0;
+        md = get_numa_maps(vma);
+        if (!md)
+                return 0;
+        seq_printf(m, "%08lx", vma->vm_start);
+        pol = get_vma_policy(task, vma, vma->vm_start);
+        /* Print policy */
+        switch (pol->policy) {
+        case MPOL_PREFERRED:
+                seq_printf(m, " prefer=%d", pol->v.preferred_node);
+                break;
+        case MPOL_BIND:
+                seq_printf(m, " bind={");
+                first = 1;
+                for (z = pol->v.zonelist->zones; *z; z++) {
+                        if (!first)
+                                seq_putc(m, ',');
+                        else
+                                first = 0;
+                        seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id,
+                                        (*z)->name);
+                }
+                seq_putc(m, '}');
+                break;
+        case MPOL_INTERLEAVE:
+                seq_printf(m, " interleave={");
+                first = 1;
+                for_each_node(n) {
+                        if (test_bit(n, pol->v.nodes)) {
+                                if (!first)
+                                        seq_putc(m,',');
+                                else
+                                        first = 0;
+                                seq_printf(m, "%d",n);
+                        }
+                }
+                seq_putc(m, '}');
+                break;
+        default:
+                seq_printf(m," default");
+                break;
+        }
+        seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu",
+                        md->mapcount_max, md->pages, md->mapped);
+        if (md->anon)
+                seq_printf(m," Anon=%lu",md->anon);
+        for_each_online_node(n) {
+                if (md->node[n])
+                        seq_printf(m, " N%d=%lu", n, md->node[n]);
+        }
+        seq_putc(m, '\n');
+        kfree(md);
+        if (m->count < m->size)  /* vma is copied successfully */
+                m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+        return 0;
+}
+struct seq_operations proc_pid_numa_maps_op = {
+        .start  = m_start,
+        .next   = m_next,
+        .stop   = m_stop,
+        .show   = show_numa_map
+};
+#endif
diff --git a/fs/read_write.c b/fs/read_write.c
index 563abd09b5c8..b60324aaa2b6 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -188,7 +188,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
        struct inode *inode;
        loff_t pos;
-        if (unlikely(count > file->f_maxcount))
+        if (unlikely(count > INT_MAX))
                goto Einval;
        pos = *ppos;
        if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ca7989b04be3..a8e29e9bbbd0 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1034,7 +1034,7 @@ static int flush_commit_list(struct super_block *s,
                    SB_ONDISK_JOURNAL_SIZE(s);
                tbh = journal_find_get_block(s, bn);
                if (buffer_dirty(tbh))  /* redundant, ll_rw_block() checks */
-                        ll_rw_block(WRITE, 1, &tbh);
+                        ll_rw_block(SWRITE, 1, &tbh);
                put_bh(tbh);
        }
        atomic_dec(&journal->j_async_throttle);
@@ -2172,7 +2172,7 @@ static int journal_read_transaction(struct super_block *p_s_sb,
        /* flush out the real blocks */
        for (i = 0; i < get_desc_trans_len(desc); i++) {
                set_buffer_dirty(real_blocks[i]);
-                ll_rw_block(WRITE, 1, real_blocks + i);
+                ll_rw_block(SWRITE, 1, real_blocks + i);
        }
        for (i = 0; i < get_desc_trans_len(desc); i++) {
                wait_on_buffer(real_blocks[i]);
diff --git a/fs/relayfs/Makefile b/fs/relayfs/Makefile
new file mode 100644
index 000000000000..e76e182cdb38
--- /dev/null
+++ b/fs/relayfs/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_RELAYFS_FS) += relayfs.o
+relayfs-y := relay.o inode.o buffers.o
diff --git a/fs/relayfs/buffers.c b/fs/relayfs/buffers.c
new file mode 100644
index 000000000000..2aa8e2719999
--- /dev/null
+++ b/fs/relayfs/buffers.c
@@ -0,0 +1,189 @@
+/*
+ * RelayFS buffer management code.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+/*
+ * vm_ops close() handler for relayfs file mappings.  Notifies the channel's
+ * client, through its buf_unmapped() callback, that the buffer stored in
+ * vm_private_data is being unmapped from this vma's file.
+ */
+static void relay_file_mmap_close(struct vm_area_struct *vma)
+{
+        struct rchan_buf *mapped_buf = vma->vm_private_data;
+        struct rchan *owner = mapped_buf->chan;
+        owner->cb->buf_unmapped(mapped_buf, vma->vm_file);
+}
+/*
+ * vm_ops nopage() handler for relayfs file mappings.  Translates the
+ * faulting address into the page backing the same offset of the channel
+ * buffer, takes a reference on that page and hands it back.
+ */
+static struct page *relay_buf_nopage(struct vm_area_struct *vma,
+                                     unsigned long address,
+                                     int *type)
+{
+        struct rchan_buf *buf = vma->vm_private_data;
+        struct page *backing;
+        if (address > vma->vm_end)
+                return NOPAGE_SIGBUS; /* Disallow mremap */
+        if (!buf)
+                return NOPAGE_OOM;
+        /* The buffer is vmap'ed, so look the page up by kernel address */
+        backing = vmalloc_to_page(buf->start + (address - vma->vm_start));
+        if (!backing)
+                return NOPAGE_OOM;
+        get_page(backing);
+        if (type)
+                *type = VM_FAULT_MINOR;
+        return backing;
+}
+/*
+ * vm_ops for relay file mappings: page faults are served by
+ * relay_buf_nopage() and unmap notification by relay_file_mmap_close().
+ * Installed on the vma by relay_mmap_buf().
+ */
+static struct vm_operations_struct relay_file_mmap_ops = {
+        .nopage = relay_buf_nopage,
+        .close = relay_file_mmap_close,
+};
+/**
+ *      relay_mmap_buf: - mmap channel buffer to process address space
+ *      @buf: relay channel buffer
+ *      @vma: vm_area_struct describing memory to be mapped
+ *
+ *      Only a mapping that covers exactly the channel's alloc_size is
+ *      accepted.  On success the vma is wired to the relay vm_ops and the
+ *      client's buf_mapped() callback is invoked.
+ *
+ *      Returns 0 if ok, -EBADF if @buf is NULL, -EINVAL on a size mismatch.
+ *
+ *      Caller should already have grabbed mmap_sem.
+ */
+int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
+{
+        unsigned long map_len;
+        if (!buf)
+                return -EBADF;
+        map_len = vma->vm_end - vma->vm_start;
+        if (map_len != (unsigned long)buf->chan->alloc_size)
+                return -EINVAL;
+        vma->vm_private_data = buf;
+        vma->vm_ops = &relay_file_mmap_ops;
+        buf->chan->cb->buf_mapped(buf, vma->vm_file);
+        return 0;
+}
+/**
+ *      relay_alloc_buf - allocate a channel buffer
+ *      @buf: the buffer struct
+ *      @size: total size of the buffer
+ *
+ *      Rounds @size up to whole pages, allocates each page individually,
+ *      records them in @buf->page_array and maps them into one contiguous,
+ *      zero-filled kernel virtual range.  On success @buf->page_count is
+ *      set to the number of pages; on failure everything allocated here is
+ *      released and @buf->page_array is reset to NULL.
+ *
+ *      Returns a pointer to the resulting buffer, NULL if unsuccessful
+ */
+static void *relay_alloc_buf(struct rchan_buf *buf, unsigned long size)
+{
+        void *mem;
+        unsigned int i, j, n_pages;
+        size = PAGE_ALIGN(size);
+        n_pages = size >> PAGE_SHIFT;
+        buf->page_array = kcalloc(n_pages, sizeof(struct page *), GFP_KERNEL);
+        if (!buf->page_array)
+                return NULL;
+        for (i = 0; i < n_pages; i++) {
+                buf->page_array[i] = alloc_page(GFP_KERNEL);
+                if (unlikely(!buf->page_array[i]))
+                        goto depopulate;
+        }
+        /* vmap()'s third argument is a VM_* area flag, not a GFP mask */
+        mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL);
+        if (!mem)
+                goto depopulate;
+        memset(mem, 0, size);
+        buf->page_count = n_pages;
+        return mem;
+depopulate:
+        /* i counts the pages successfully allocated before the failure */
+        for (j = 0; j < i; j++)
+                __free_page(buf->page_array[j]);
+        kfree(buf->page_array);
+        buf->page_array = NULL; /* do not leave a dangling pointer behind */
+        return NULL;
+}
+/**
+ *      relay_create_buf - allocate and initialize a channel buffer
+ *      @chan: the relay channel; supplies n_subbufs and alloc_size
+ *
+ *      Allocates the rchan_buf, its per-sub-buffer padding array and the
+ *      buffer memory itself, then takes a reference on @chan that is
+ *      dropped again by relay_destroy_buf().
+ *
+ *      Returns channel buffer if successful, NULL otherwise
+ */
+struct rchan_buf *relay_create_buf(struct rchan *chan)
+{
+        struct rchan_buf *buf = kcalloc(1, sizeof(*buf), GFP_KERNEL);
+        if (!buf)
+                return NULL;
+        /* One padding entry per sub-buffer; size by element, not by pointer */
+        buf->padding = kmalloc(chan->n_subbufs * sizeof(*buf->padding), GFP_KERNEL);
+        if (!buf->padding)
+                goto free_buf;
+        buf->start = relay_alloc_buf(buf, chan->alloc_size);
+        if (!buf->start)
+                goto free_buf;
+        buf->chan = chan;
+        kref_get(&buf->chan->kref);
+        return buf;
+free_buf:
+        /* buf was zero-allocated, so padding is NULL if its kmalloc failed */
+        kfree(buf->padding);
+        kfree(buf);
+        return NULL;
+}
+/**
+ *      relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ *      @buf: the buffer struct
+ *
+ *      Unmaps and frees the buffer pages (if any were mapped), frees the
+ *      padding array and the struct itself, then drops the channel
+ *      reference held by the buffer.
+ */
+void relay_destroy_buf(struct rchan_buf *buf)
+{
+        /* Remember the channel: buf is gone by the time we drop the ref */
+        struct rchan *owner = buf->chan;
+        unsigned int pg;
+        if (likely(buf->start)) {
+                vunmap(buf->start);
+                for (pg = 0; pg < buf->page_count; pg++)
+                        __free_page(buf->page_array[pg]);
+                kfree(buf->page_array);
+        }
+        kfree(buf->padding);
+        kfree(buf);
+        kref_put(&owner->kref, relay_destroy_channel);
+}
+/**
+ *      relay_remove_buf - remove a channel buffer
+ *      @kref: the kref embedded in the rchan_buf being released
+ *
+ *      Removes the buffer's file from the relayfs filesystem via
+ *      relayfs_remove().  Should only be called from kref_put().
+ */
+void relay_remove_buf(struct kref *kref)
+{
+        relayfs_remove(container_of(kref, struct rchan_buf, kref)->dentry);
+}
diff --git a/fs/relayfs/buffers.h b/fs/relayfs/buffers.h
new file mode 100644
index 000000000000..37a12493f641
--- /dev/null
+++ b/fs/relayfs/buffers.h
@@ -0,0 +1,12 @@
+#ifndef _BUFFERS_H
+#define _BUFFERS_H
+/*
+ * This inspired by rtai/shmem.
+ * Rounds x up to a multiple of PAGE_SIZE (a value that is already a
+ * multiple is returned unchanged).  The whole expansion is parenthesized
+ * so the macro can be used safely inside larger expressions.
+ */
+#define FIX_SIZE(x) ((((x) - 1) & PAGE_MASK) + PAGE_SIZE)
+extern int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma);
+extern struct rchan_buf *relay_create_buf(struct rchan *chan);
+extern void relay_destroy_buf(struct rchan_buf *buf);
+extern void relay_remove_buf(struct kref *kref);
+#endif/* _BUFFERS_H */
diff --git a/fs/relayfs/inode.c b/fs/relayfs/inode.c
new file mode 100644
index 000000000000..0f7f88d067ad
--- /dev/null
+++ b/fs/relayfs/inode.c
@@ -0,0 +1,609 @@
+/*
+ * VFS-related code for RelayFS, a high-speed data relay filesystem.
+ *
+ * Copyright (C) 2003-2005 - Tom Zanussi <zanussi@us.ibm.com>, IBM Corp
+ * Copyright (C) 2003-2005 - Karim Yaghmour <karim@opersys.com>
+ *
+ * Based on ramfs, Copyright (C) 2002 - Linus Torvalds
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/backing-dev.h>
+#include <linux/namei.h>
+#include <linux/poll.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+/* Magic number stored in the relayfs superblock's s_magic */
+#define RELAYFS_MAGIC                   0xF0B4A981
+/* Internal mount and its use count, managed by simple_pin_fs()/simple_release_fs() */
+static struct vfsmount *                relayfs_mount;
+static int                              relayfs_mount_count;
+/* Slab cache that struct relayfs_inode_info objects are allocated from */
+static kmem_cache_t *                   relayfs_inode_cachep;
+/* Backing device info attached to the mapping of every relayfs inode */
+static struct backing_dev_info          relayfs_backing_dev_info = {
+        .ra_pages       = 0,    /* No readahead */
+        .capabilities   = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
+};
+/*
+ * relayfs_get_inode - allocate and initialize a relayfs inode
+ * @sb: superblock the inode is created on
+ * @mode: file type and permission bits
+ * @chan: channel to create a buffer for; must be non-NULL for regular files
+ *
+ * For a regular file a channel buffer is created first and attached to the
+ * inode's relayfs_inode_info; directories get the simple_dir operations.
+ *
+ * Returns the new inode, or NULL if the buffer or the inode could not be
+ * allocated.
+ */
+static struct inode *relayfs_get_inode(struct super_block *sb, int mode,
+                                       struct rchan *chan)
+{
+        struct rchan_buf *buf = NULL;
+        struct inode *inode;
+        if (S_ISREG(mode)) {
+                BUG_ON(!chan);
+                buf = relay_create_buf(chan);
+                if (!buf)
+                        return NULL;
+        }
+        inode = new_inode(sb);
+        if (!inode) {
+                /*
+                 * buf is only set for regular files; relay_destroy_buf()
+                 * dereferences its argument, so never pass it NULL.
+                 */
+                if (buf)
+                        relay_destroy_buf(buf);
+                return NULL;
+        }
+        inode->i_mode = mode;
+        inode->i_uid = 0;
+        inode->i_gid = 0;
+        inode->i_blksize = PAGE_CACHE_SIZE;
+        inode->i_blocks = 0;
+        inode->i_mapping->backing_dev_info = &relayfs_backing_dev_info;
+        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        switch (mode & S_IFMT) {
+        case S_IFREG:
+                inode->i_fop = &relayfs_file_operations;
+                RELAYFS_I(inode)->buf = buf;
+                break;
+        case S_IFDIR:
+                inode->i_op = &simple_dir_inode_operations;
+                inode->i_fop = &simple_dir_operations;
+                /* directory inodes start off with i_nlink == 2 (for "." entry) */
+                inode->i_nlink++;
+                break;
+        default:
+                break;
+        }
+        return inode;
+}
+/**
+ *      relayfs_create_entry - create a relayfs directory or file
+ *      @name: the name of the file to create
+ *      @parent: parent directory, NULL to use the root of the relayfs mount
+ *      @mode: mode
+ *      @chan: relay channel associated with the file
+ *
+ *      Returns the new dentry, NULL on failure
+ *
+ *      Creates a file or directory with the specified permissions.  The
+ *      filesystem is pinned for the lifetime of the entry; the pin is
+ *      released again on every failure path.  Fails if @name already
+ *      exists in @parent.
+ */
+static struct dentry *relayfs_create_entry(const char *name,
+                                           struct dentry *parent,
+                                           int mode,
+                                           struct rchan *chan)
+{
+        struct dentry *d;
+        struct inode *inode;
+        int error;
+        BUG_ON(!name || !(S_ISREG(mode) || S_ISDIR(mode)));
+        error = simple_pin_fs("relayfs", &relayfs_mount, &relayfs_mount_count);
+        if (error) {
+                printk(KERN_ERR "Couldn't mount relayfs: errcode %d\n", error);
+                return NULL;
+        }
+        if (!parent && relayfs_mount && relayfs_mount->mnt_sb)
+                parent = relayfs_mount->mnt_sb->s_root;
+        if (!parent) {
+                simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+                return NULL;
+        }
+        parent = dget(parent);
+        down(&parent->d_inode->i_sem);
+        d = lookup_one_len(name, parent, strlen(name));
+        if (IS_ERR(d)) {
+                d = NULL;
+                goto release_mount;
+        }
+        if (d->d_inode)         /* an entry with this name already exists */
+                goto put_dentry;
+        inode = relayfs_get_inode(parent->d_inode->i_sb, mode, chan);
+        if (!inode)
+                goto put_dentry;
+        d_instantiate(d, inode);
+        dget(d);        /* Extra count - pin the dentry in core */
+        if (S_ISDIR(mode))
+                parent->d_inode->i_nlink++;
+        goto exit;
+put_dentry:
+        /* Drop the reference lookup_one_len() gave us; it was leaked before */
+        dput(d);
+        d = NULL;
+release_mount:
+        simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+exit:
+        up(&parent->d_inode->i_sem);
+        dput(parent);
+        return d;
+}
+/**
+ *      relayfs_create_file - create a file in the relay filesystem
+ *      @name: the name of the file to create
+ *      @parent: parent directory
+ *      @mode: permission bits; if 0, the default of S_IRUSR is used
+ *      @chan: channel associated with the file
+ *
+ *      Returns file dentry if successful, NULL otherwise.
+ *
+ *      Only the permission bits of @mode are honoured; the file type is
+ *      always forced to a regular file.
+ */
+struct dentry *relayfs_create_file(const char *name, struct dentry *parent,
+                                   int mode, struct rchan *chan)
+{
+        int perms = mode ? mode : S_IRUSR;
+        return relayfs_create_entry(name, parent,
+                                    (perms & S_IALLUGO) | S_IFREG, chan);
+}
+/**
+ *      relayfs_create_dir - create a directory in the relay filesystem
+ *      @name: the name of the directory to create
+ *      @parent: parent directory, NULL if parent should be fs root
+ *
+ *      Returns directory dentry if successful, NULL otherwise.
+ *
+ *      The directory is created rwx for the owner and r-x for group and
+ *      others (S_IRWXU | S_IRUGO | S_IXUGO).
+ */
+struct dentry *relayfs_create_dir(const char *name, struct dentry *parent)
+{
+        return relayfs_create_entry(name, parent,
+                                    S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
+                                    NULL);
+}
+/**
+ *      relayfs_remove - remove a file or directory in the relay filesystem
+ *      @dentry: file or directory dentry
+ *
+ *      Unlinks the entry under the parent's i_sem and, on success, drops
+ *      one dentry reference and one filesystem pin.
+ *
+ *      Returns 0 if successful, -EINVAL for a NULL dentry or parent,
+ *      otherwise the error from simple_rmdir()/simple_unlink().
+ */
+int relayfs_remove(struct dentry *dentry)
+{
+        struct dentry *parent;
+        int error = 0;
+        if (!dentry)
+                return -EINVAL;
+        parent = dentry->d_parent;
+        if (!parent)
+                return -EINVAL;
+        /* Hold the parent while we work under its i_sem */
+        parent = dget(parent);
+        down(&parent->d_inode->i_sem);
+        if (dentry->d_inode) {
+                if (S_ISDIR(dentry->d_inode->i_mode))
+                        error = simple_rmdir(parent->d_inode, dentry);
+                else
+                        error = simple_unlink(parent->d_inode, dentry);
+                if (!error)
+                        d_delete(dentry);
+        }
+        /* NOTE(review): presumably balances the extra dget() taken at creation -- confirm */
+        if (!error)
+                dput(dentry);
+        up(&parent->d_inode->i_sem);
+        dput(parent);
+        /* Release the filesystem pin only when the entry really went away */
+        if (!error)
+                simple_release_fs(&relayfs_mount, &relayfs_mount_count);
+        return error;
+}
+/**
+ *      relayfs_remove_dir - remove a directory in the relay filesystem
+ *      @dentry: directory dentry
+ *
+ *      Thin wrapper that forwards to relayfs_remove().
+ *
+ *      Returns 0 if successful, negative otherwise.
+ */
+int relayfs_remove_dir(struct dentry *dentry)
+{
+        return relayfs_remove(dentry);
+}
+/**
+ *      relayfs_open - open file op for relayfs files
+ *      @inode: the inode
+ *      @filp: the file
+ *
+ *      Takes a reference on the channel buffer attached to @inode.
+ *      Always returns 0.
+ */
+static int relayfs_open(struct inode *inode, struct file *filp)
+{
+        kref_get(&RELAYFS_I(inode)->buf->kref);
+        return 0;
+}
+/**
+ *      relayfs_mmap - mmap file op for relayfs files
+ *      @filp: the file
+ *      @vma: the vma describing what to map
+ *
+ *      Looks up the channel buffer behind @filp and lets relay_mmap_buf()
+ *      map it into user space; returns whatever relay_mmap_buf() returns.
+ */
+static int relayfs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+        struct rchan_buf *buf = RELAYFS_I(filp->f_dentry->d_inode)->buf;
+        return relay_mmap_buf(buf, vma);
+}
+/**
+ *      relayfs_poll - poll file op for relayfs files
+ *      @filp: the file
+ *      @wait: poll table
+ *
+ *      Poll implementation.  Reports POLLERR once the buffer is finalized;
+ *      otherwise, for files opened for reading, registers on the buffer's
+ *      read_wait queue and reports POLLIN | POLLRDNORM when the buffer is
+ *      not empty.
+ */
+static unsigned int relayfs_poll(struct file *filp, poll_table *wait)
+{
+        struct rchan_buf *buf = RELAYFS_I(filp->f_dentry->d_inode)->buf;
+        if (buf->finalized)
+                return POLLERR;
+        if (!(filp->f_mode & FMODE_READ))
+                return 0;
+        poll_wait(filp, &buf->read_wait, wait);
+        if (relay_buf_empty(buf))
+                return 0;
+        return POLLIN | POLLRDNORM;
+}
+/**
+ *      relayfs_release - release file op for relayfs files
+ *      @inode: the inode
+ *      @filp: the file
+ *
+ *      Drops the channel buffer reference taken in relayfs_open(); when the
+ *      last reference goes away relay_remove_buf() is run.  Always
+ *      returns 0.
+ */
+static int relayfs_release(struct inode *inode, struct file *filp)
+{
+        kref_put(&RELAYFS_I(inode)->buf->kref, relay_remove_buf);
+        return 0;
+}
+/**
+ *      relayfs_read_consume - update the consumed count for the buffer
+ *      @buf: channel buffer being read
+ *      @read_pos: position the read started from
+ *      @bytes_consumed: number of bytes just handed to the reader
+ *
+ *      Adds @bytes_consumed to the per-buffer byte counter and, whenever a
+ *      whole sub-buffer (data plus its padding) has been consumed, reports
+ *      one consumed sub-buffer to the channel and resets the counter.
+ */
+static void relayfs_read_consume(struct rchan_buf *buf,
+                                 size_t read_pos,
+                                 size_t bytes_consumed)
+{
+        size_t subbuf_size = buf->chan->subbuf_size;
+        size_t n_subbufs = buf->chan->n_subbufs;
+        size_t read_subbuf;
+        /* Counter would spill past one sub-buffer: retire it first */
+        if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
+                relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+                buf->bytes_consumed = 0;
+        }
+        buf->bytes_consumed += bytes_consumed;
+        read_subbuf = read_pos / buf->chan->subbuf_size;
+        /* Consumed bytes plus padding fill the sub-buffer exactly */
+        if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) {
+                /*
+                 * Leave the sub-buffer unretired when it is the one at the
+                 * producer's position and the write offset sits at its end.
+                 */
+                if ((read_subbuf == buf->subbufs_produced % n_subbufs) &&
+                    (buf->offset == subbuf_size))
+                        return;
+                relay_subbufs_consumed(buf->chan, buf->cpu, 1);
+                buf->bytes_consumed = 0;
+        }
+}
+/**
+ *      relayfs_read_avail - boolean, are there unconsumed bytes available?
+ *      @buf: channel buffer being read
+ *      @read_pos: current read position
+ *
+ *      Compares the number of bytes produced with the number consumed,
+ *      both expressed relative to the sub-buffer ring.  Returns 0 when
+ *      they are equal; otherwise calls relayfs_read_consume() with a zero
+ *      byte count (which may retire a fully consumed sub-buffer) and
+ *      returns 1.
+ */
+static int relayfs_read_avail(struct rchan_buf *buf, size_t read_pos)
+{
+        size_t bytes_produced, bytes_consumed, write_offset;
+        size_t subbuf_size = buf->chan->subbuf_size;
+        size_t n_subbufs = buf->chan->n_subbufs;
+        size_t produced = buf->subbufs_produced % n_subbufs;
+        size_t consumed = buf->subbufs_consumed % n_subbufs;
+        /* Clamp the write offset to the end of the sub-buffer */
+        write_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+        if (consumed > produced) {
+                /*
+                 * NOTE(review): produced was reduced modulo n_subbufs above,
+                 * so "produced > n_subbufs" looks like it can never hold --
+                 * confirm whether this branch is reachable.
+                 */
+                if ((produced > n_subbufs) &&
+                    (produced + n_subbufs - consumed <= n_subbufs))
+                        produced += n_subbufs;
+        } else if (consumed == produced) {
+                if (buf->offset > subbuf_size) {
+                        produced += n_subbufs;
+                        if (buf->subbufs_produced == buf->subbufs_consumed)
+                                consumed += n_subbufs;
+                }
+        }
+        if (buf->offset > subbuf_size)
+                bytes_produced = (produced - 1) * subbuf_size + write_offset;
+        else
+                bytes_produced = produced * subbuf_size + write_offset;
+        bytes_consumed = consumed * subbuf_size + buf->bytes_consumed;
+        if (bytes_produced == bytes_consumed)
+                return 0;
+        relayfs_read_consume(buf, read_pos, 0);
+        return 1;
+}
+/**
+ *      relayfs_read_subbuf_avail - return bytes available in sub-buffer
+ *      @read_pos: position to read from
+ *      @buf: channel buffer being read
+ *
+ *      When reading the sub-buffer currently being written, only the bytes
+ *      below the (clamped) write offset are available; for any other
+ *      sub-buffer everything up to its padding is.
+ */
+static size_t relayfs_read_subbuf_avail(size_t read_pos,
+                                        struct rchan_buf *buf)
+{
+        size_t subbuf_size = buf->chan->subbuf_size;
+        size_t wr_subbuf = (buf->data - buf->start) / subbuf_size;
+        size_t wr_offset = buf->offset > subbuf_size ? subbuf_size : buf->offset;
+        size_t rd_subbuf = read_pos / subbuf_size;
+        size_t rd_offset = read_pos % subbuf_size;
+        size_t pad = buf->padding[rd_subbuf];
+        if (rd_subbuf != wr_subbuf)
+                return (subbuf_size - pad) - rd_offset;
+        if (rd_offset + pad < wr_offset)
+                return wr_offset - (rd_offset + pad);
+        return 0;
+}
+/**
+ *      relayfs_read_start_pos - find the first available byte to read
+ *      @read_pos: requested read position
+ *      @buf: channel buffer being read
+ *
+ *      If the read_pos is in the middle of padding, return the
+ *      position of the first actually available byte (the start of the
+ *      next sub-buffer, wrapping around the ring), otherwise
+ *      return the original value.
+ */
+static size_t relayfs_read_start_pos(size_t read_pos,
+                                     struct rchan_buf *buf)
+{
+        size_t subbuf_size = buf->chan->subbuf_size;
+        size_t n_subbufs = buf->chan->n_subbufs;
+        size_t cur_subbuf = read_pos / subbuf_size;
+        size_t pad_end = (cur_subbuf + 1) * subbuf_size;
+        size_t pad_start = pad_end - buf->padding[cur_subbuf];
+        if (read_pos < pad_start || read_pos >= pad_end)
+                return read_pos;
+        return ((cur_subbuf + 1) % n_subbufs) * subbuf_size;
+}
+/**
+ *      relayfs_read_end_pos - return the new read position
+ *      @buf: channel buffer being read
+ *      @read_pos: position the read started from
+ *      @count: number of bytes that were read
+ *
+ *      Skips over the padding when the read reached the end of the data in
+ *      a sub-buffer, and wraps to 0 at the end of the whole buffer.
+ */
+static size_t relayfs_read_end_pos(struct rchan_buf *buf,
+                                   size_t read_pos,
+                                   size_t count)
+{
+        size_t subbuf_size = buf->chan->subbuf_size;
+        size_t n_subbufs = buf->chan->n_subbufs;
+        size_t cur_subbuf = read_pos / subbuf_size;
+        size_t end_pos = read_pos + count;
+        if (read_pos % subbuf_size + count + buf->padding[cur_subbuf] == subbuf_size)
+                end_pos = (cur_subbuf + 1) * subbuf_size;
+        return end_pos >= subbuf_size * n_subbufs ? 0 : end_pos;
+}
+/**
+ *      relayfs_read - read file op for relayfs files
+ *      @filp: the file
+ *      @buffer: the userspace buffer
+ *      @count: number of bytes to read
+ *      @ppos: position to read from
+ *
+ *      Reads count bytes or the number of bytes available in the
+ *      current sub-buffer being read, whichever is smaller.
+ *
+ *      Returns the number of bytes copied, 0 if nothing is available, or
+ *      -EFAULT if the copy to user space fails.  The whole operation runs
+ *      under the inode's i_sem.
+ */
+static ssize_t relayfs_read(struct file *filp,
+                            char __user *buffer,
+                            size_t count,
+                            loff_t *ppos)
+{
+        struct inode *inode = filp->f_dentry->d_inode;
+        struct rchan_buf *buf = RELAYFS_I(inode)->buf;
+        size_t read_start, avail;
+        ssize_t ret = 0;
+        void *from;
+        down(&inode->i_sem);
+        if(!relayfs_read_avail(buf, *ppos))
+                goto out;
+        /* Step over padding if *ppos landed inside it */
+        read_start = relayfs_read_start_pos(*ppos, buf);
+        avail = relayfs_read_subbuf_avail(read_start, buf);
+        if (!avail)
+                goto out;
+        from = buf->start + read_start;
+        /* Never read across a sub-buffer boundary in one call */
+        ret = count = min(count, avail);
+        if (copy_to_user(buffer, from, count)) {
+                ret = -EFAULT;
+                goto out;
+        }
+        /* Only account for the data once it has reached user space */
+        relayfs_read_consume(buf, read_start, count);
+        *ppos = relayfs_read_end_pos(buf, read_start, count);
+out:
+        up(&inode->i_sem);
+        return ret;
+}
+/**
+ *      relayfs alloc_inode() implementation
+ *      @sb: the superblock (unused here)
+ *
+ *      Allocates a relayfs_inode_info from the relayfs inode cache with no
+ *      buffer attached and returns its embedded VFS inode, or NULL if the
+ *      allocation fails.
+ */
+static struct inode *relayfs_alloc_inode(struct super_block *sb)
+{
+        struct relayfs_inode_info *p = kmem_cache_alloc(relayfs_inode_cachep, SLAB_KERNEL);
+        if (!p)
+                return NULL;
+        /* Slab objects are recycled, so clear any stale buffer pointer */
+        p->buf = NULL;
+        return &p->vfs_inode;
+}
+/**
+ *      relayfs destroy_inode() implementation
+ *      @inode: the inode being destroyed
+ *
+ *      Destroys the channel buffer attached to the inode, if there is one,
+ *      and returns the relayfs_inode_info to the inode cache.
+ */
+static void relayfs_destroy_inode(struct inode *inode)
+{
+        struct relayfs_inode_info *info = RELAYFS_I(inode);
+        if (info->buf)
+                relay_destroy_buf(info->buf);
+        kmem_cache_free(relayfs_inode_cachep, info);
+}
+/*
+ * Slab constructor for the relayfs inode cache.  Runs inode_init_once() on
+ * the embedded VFS inode, but only for a genuine construction call
+ * (SLAB_CTOR_CONSTRUCTOR set without SLAB_CTOR_VERIFY).
+ */
+static void init_once(void *p, kmem_cache_t *cachep, unsigned long flags)
+{
+        struct relayfs_inode_info *i = p;
+        if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == SLAB_CTOR_CONSTRUCTOR)
+                inode_init_once(&i->vfs_inode);
+}
+/*
+ * File operations for relayfs channel-buffer files.  Seeking is not
+ * supported (no_llseek); the table is exported for use by other modules.
+ */
+struct file_operations relayfs_file_operations = {
+        .open           = relayfs_open,
+        .poll           = relayfs_poll,
+        .mmap           = relayfs_mmap,
+        .read           = relayfs_read,
+        .llseek         = no_llseek,
+        .release        = relayfs_release,
+};
+/*
+ * Superblock operations: generic helpers for statfs and inode dropping,
+ * relayfs-specific inode allocation and destruction so that each inode
+ * carries a relayfs_inode_info.
+ */
+static struct super_operations relayfs_ops = {
+        .statfs         = simple_statfs,
+        .drop_inode     = generic_delete_inode,
+        .alloc_inode    = relayfs_alloc_inode,
+        .destroy_inode  = relayfs_destroy_inode,
+};
+/*
+ * Fill in a new relayfs superblock: set block size, magic and operations,
+ * then create the root directory inode and dentry.
+ *
+ * Returns 0 on success, -ENOMEM if the root inode or dentry cannot be
+ * allocated.  @data and @silent are not used.
+ */
+static int relayfs_fill_super(struct super_block * sb, void * data, int silent)
+{
+        struct inode *inode;
+        struct dentry *root;
+        int mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+        sb->s_blocksize = PAGE_CACHE_SIZE;
+        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+        sb->s_magic = RELAYFS_MAGIC;
+        sb->s_op = &relayfs_ops;
+        /* Directories need no channel, hence the NULL chan argument */
+        inode = relayfs_get_inode(sb, mode, NULL);
+        if (!inode)
+                return -ENOMEM;
+        root = d_alloc_root(inode);
+        if (!root) {
+                /* d_alloc_root() failed, so the inode is still ours to drop */
+                iput(inode);
+                return -ENOMEM;
+        }
+        sb->s_root = root;
+        return 0;
+}
+/*
+ * get_sb() implementation: obtains the superblock through
+ * get_sb_single(), with relayfs_fill_super() as the fill callback.
+ * @dev_name is not used.
+ */
+static struct super_block * relayfs_get_sb(struct file_system_type *fs_type,
+                                           int flags, const char *dev_name,
+                                           void *data)
+{
+        return get_sb_single(fs_type, flags, data, relayfs_fill_super);
+}
+/*
+ * Filesystem type registered under the name "relayfs"; superblocks are
+ * obtained through relayfs_get_sb() and torn down with kill_litter_super().
+ */
+static struct file_system_type relayfs_fs_type = {
+        .owner          = THIS_MODULE,
+        .name           = "relayfs",
+        .get_sb         = relayfs_get_sb,
+        .kill_sb        = kill_litter_super,
+};
+/*
+ * Module init: create the relayfs inode cache, then register the
+ * filesystem type.  Returns 0 on success, -ENOMEM if the cache cannot be
+ * created, or the error from register_filesystem().
+ */
+static int __init init_relayfs_fs(void)
+{
+        int err;
+        relayfs_inode_cachep = kmem_cache_create("relayfs_inode_cache",
+                                sizeof(struct relayfs_inode_info), 0,
+                                0, init_once, NULL);
+        if (!relayfs_inode_cachep)
+                return -ENOMEM;
+        err = register_filesystem(&relayfs_fs_type);
+        /* Registration failed: undo the cache creation before bailing out */
+        if (err)
+                kmem_cache_destroy(relayfs_inode_cachep);
+        return err;
+}
+/*
+ * Module exit: undo init_relayfs_fs() in reverse order -- unregister the
+ * filesystem type first, then destroy the inode cache.
+ */
+static void __exit exit_relayfs_fs(void)
+{
+        unregister_filesystem(&relayfs_fs_type);
+        kmem_cache_destroy(relayfs_inode_cachep);
+}
+module_init(init_relayfs_fs)
+module_exit(exit_relayfs_fs)
+EXPORT_SYMBOL_GPL(relayfs_file_operations);
+EXPORT_SYMBOL_GPL(relayfs_create_dir);
+EXPORT_SYMBOL_GPL(relayfs_remove_dir);
+MODULE_AUTHOR("Tom Zanussi <zanussi@us.ibm.com> and Karim Yaghmour <karim@opersys.com>");
+MODULE_DESCRIPTION("Relay Filesystem");
+MODULE_LICENSE("GPL");
diff --git a/fs/relayfs/relay.c b/fs/relayfs/relay.c
new file mode 100644
index 000000000000..16446a15c96d
--- /dev/null
+++ b/fs/relayfs/relay.c
@@ -0,0 +1,431 @@
+/*
+ * Public API and common code for RelayFS.
+ *
+ * See Documentation/filesystems/relayfs.txt for an overview of relayfs.
+ *
+ * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp
+ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com)
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/errno.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/relayfs_fs.h>
+#include "relay.h"
+#include "buffers.h"
+/**
+ *      relay_buf_empty - test whether a channel buffer has nothing to read
+ *      @buf: channel buffer
+ *
+ *      Returns 1 when the produced and consumed sub-buffer counts are
+ *      equal, 0 otherwise.
+ */
+int relay_buf_empty(struct rchan_buf *buf)
+{
+        if (buf->subbufs_produced - buf->subbufs_consumed)
+                return 0;
+        return 1;
+}
+/**
+ *      relay_buf_full - test whether a channel buffer has no free sub-buffer
+ *      @buf: channel buffer
+ *
+ *      Returns 1 when the number of produced-but-unconsumed sub-buffers
+ *      has reached the channel's n_subbufs, 0 otherwise.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+        size_t pending = buf->subbufs_produced - buf->subbufs_consumed;
+        if (pending < buf->chan->n_subbufs)
+                return 0;
+        return 1;
+}
+/*
+ * High-level relayfs kernel API and associated functions.
+ */
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+/*
+ * subbuf_start() default callback.  Returns 0 when the buffer is full
+ * (relay_switch_subbuf() then abandons the switch) and 1 otherwise.
+ * The sub-buffer and padding arguments are ignored.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+                                          void *subbuf,
+                                          void *prev_subbuf,
+                                          size_t prev_padding)
+{
+        return relay_buf_full(buf) ? 0 : 1;
+}
+/*
+ * buf_mapped() default callback.  Intentionally empty; setup_callbacks()
+ * installs it when the client leaves its buf_mapped pointer NULL.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+                                        struct file *filp)
+{
+}
+/*
+ * buf_unmapped() default callback.  Intentionally empty; setup_callbacks()
+ * installs it when the client leaves its buf_unmapped pointer NULL.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+                                          struct file *filp)
+{
+}
+/*
+ * relay channel default callbacks.  Used as the whole callback table when
+ * a client passes no callbacks to relay_open(), and re-installed on the
+ * channel by relay_close_buf().
+ */
+static struct rchan_callbacks default_channel_callbacks = {
+        .subbuf_start = subbuf_start_default_callback,
+        .buf_mapped = buf_mapped_default_callback,
+        .buf_unmapped = buf_unmapped_default_callback,
+};
+/**
+ *      wakeup_readers - wake up readers sleeping on a channel buffer
+ *      @private: the channel buffer, passed as the work item's data
+ *
+ *      Work function scheduled from relay_switch_subbuf().  Waking is
+ *      deferred to a work item because waking directly from the write
+ *      path causes problems when the writer is, say, the scheduler.
+ */
+static void wakeup_readers(void *private)
+{
+        struct rchan_buf *rbuf = private;
+        wake_up_interruptible(&rbuf->read_wait);
+}
+/**
+ *      __relay_reset - put a channel buffer into its initial state
+ *      @buf: the channel buffer
+ *      @init: non-zero for first-time initialization, 0 for a later reset
+ *
+ *      On first-time initialization the wait queue, refcount and
+ *      deferred-wakeup work item are set up; on a later reset any
+ *      pending wakeup work is cancelled and the workqueue flushed
+ *      instead.  In both cases the counters, write position and
+ *      per-sub-buffer padding are cleared and the subbuf_start()
+ *      callback is invoked for the first sub-buffer.
+ *
+ *      See relay_reset for description of effect.
+ */
+static inline void __relay_reset(struct rchan_buf *buf, unsigned int init)
+{
+        size_t n;
+        if (!init) {
+                cancel_delayed_work(&buf->wake_readers);
+                flush_scheduled_work();
+        } else {
+                init_waitqueue_head(&buf->read_wait);
+                kref_init(&buf->kref);
+                INIT_WORK(&buf->wake_readers, NULL, NULL);
+        }
+        buf->subbufs_produced = 0;
+        buf->subbufs_consumed = 0;
+        buf->bytes_consumed = 0;
+        buf->finalized = 0;
+        buf->data = buf->start;
+        buf->offset = 0;
+        for (n = 0; n < buf->chan->n_subbufs; n++)
+                buf->padding[n] = 0;
+        /* Announce the first sub-buffer; there is no previous one yet. */
+        buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0);
+}
+/**
+ *      relay_reset - reset the channel
+ *      @chan: the channel, may be NULL (then nothing happens)
+ *
+ *      Erases all data from every existing channel buffer and restarts
+ *      the channel in its initial state.  The buffers themselves are
+ *      not freed, so any mappings are still in effect.
+ *
+ *      NOTE: Care should be taken that the channel isn't actually
+ *      being used by anything when this call is made.
+ */
+void relay_reset(struct rchan *chan)
+{
+        unsigned int cpu;
+        if (chan == NULL)
+                return;
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (chan->buf[cpu])
+                        __relay_reset(chan->buf[cpu], 0);
+        }
+}
+/**
+ *      relay_open_buf - create a new channel buffer in relayfs
+ *      @chan: the channel the buffer belongs to
+ *      @filename: name of the relayfs file to create
+ *      @parent: dentry of the parent directory
+ *
+ *      Creates the file with S_IRUSR permissions, takes the channel
+ *      buffer attached to its inode and initializes it.  Returns the
+ *      buffer, or NULL if the file could not be created.
+ *
+ *      Internal - used by relay_open().
+ */
+static struct rchan_buf *relay_open_buf(struct rchan *chan,
+                                        const char *filename,
+                                        struct dentry *parent)
+{
+        struct dentry *file;
+        struct rchan_buf *rbuf;
+        /* Create file in fs */
+        file = relayfs_create_file(filename, parent, S_IRUSR, chan);
+        if (file == NULL)
+                return NULL;
+        rbuf = RELAYFS_I(file->d_inode)->buf;
+        rbuf->dentry = file;
+        __relay_reset(rbuf, 1);
+        return rbuf;
+}
+/**
+ *      relay_close_buf - close a channel buffer
+ *      @buf: channel buffer
+ *
+ *      Marks the buffer finalized and restores the default callbacks
+ *      on the buffer's channel, cancels and flushes any pending reader
+ *      wakeup work, then drops a reference on the buffer.
+ *      The channel buffer and channel buffer data structure are then freed
+ *      automatically when the last reference is given up.
+ */
+static inline void relay_close_buf(struct rchan_buf *buf)
+{
+        buf->finalized = 1;
+        buf->chan->cb = &default_channel_callbacks;     /* note: replaces the callbacks of the whole channel */
+        cancel_delayed_work(&buf->wake_readers);
+        flush_scheduled_work();                         /* let an already-queued wakeup finish before the put */
+        kref_put(&buf->kref, relay_remove_buf);
+}
+/*
+ * Install the client callbacks on a channel.  With no callback struct the
+ * default table is used as a whole; otherwise every NULL entry in the
+ * client's struct is filled in (in place) with the matching default before
+ * the struct is attached to the channel.
+ */
+static inline void setup_callbacks(struct rchan *chan,
+                                   struct rchan_callbacks *cb)
+{
+        if (cb) {
+                if (cb->subbuf_start == NULL)
+                        cb->subbuf_start = subbuf_start_default_callback;
+                if (cb->buf_mapped == NULL)
+                        cb->buf_mapped = buf_mapped_default_callback;
+                if (cb->buf_unmapped == NULL)
+                        cb->buf_unmapped = buf_unmapped_default_callback;
+                chan->cb = cb;
+        } else {
+                chan->cb = &default_channel_callbacks;
+        }
+}
+/**
+ *      relay_open - create a new relayfs channel
+ *      @base_filename: base name of files to create
+ *      @parent: dentry of parent directory, NULL for root directory
+ *      @subbuf_size: size of sub-buffers
+ *      @n_subbufs: number of sub-buffers
+ *      @cb: client callback functions, may be NULL for the defaults
+ *
+ *      Returns channel pointer if successful, NULL otherwise (missing
+ *      base name, zero sizes, allocation failure or failure to create
+ *      any of the buffer files).
+ *
+ *      Creates a channel buffer for each online cpu using the sizes and
+ *      attributes specified.  The created channel buffer files
+ *      will be named base_filename0...base_filenameN-1.  File
+ *      permissions will be S_IRUSR.  Names that do not fit in
+ *      NAME_MAX characters are truncated.
+ */
+struct rchan *relay_open(const char *base_filename,
+                         struct dentry *parent,
+                         size_t subbuf_size,
+                         size_t n_subbufs,
+                         struct rchan_callbacks *cb)
+{
+        unsigned int i;
+        struct rchan *chan;
+        char *tmpname;
+        if (!base_filename)
+                return NULL;
+        if (!(subbuf_size && n_subbufs))
+                return NULL;
+        chan = kcalloc(1, sizeof(struct rchan), GFP_KERNEL);
+        if (!chan)
+                return NULL;
+        chan->version = RELAYFS_CHANNEL_VERSION;
+        chan->n_subbufs = n_subbufs;
+        chan->subbuf_size = subbuf_size;
+        chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+        setup_callbacks(chan, cb);
+        kref_init(&chan->kref);
+        tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+        if (!tmpname)
+                goto free_chan;
+        for_each_online_cpu(i) {
+                /* base_filename is caller-supplied: never write past tmpname */
+                snprintf(tmpname, NAME_MAX + 1, "%s%d", base_filename, i);
+                chan->buf[i] = relay_open_buf(chan, tmpname, parent);
+                /* relay_open_buf() may fail: check before touching the buffer */
+                if (!chan->buf[i])
+                        goto free_bufs;
+                chan->buf[i]->cpu = i;
+        }
+        kfree(tmpname);
+        return chan;
+free_bufs:
+        /*
+         * Online cpu numbers need not be contiguous, so skip empty slots
+         * rather than stopping at the first one.
+         */
+        for (i = 0; i < NR_CPUS; i++) {
+                if (!chan->buf[i])
+                        continue;
+                relay_close_buf(chan->buf[i]);
+        }
+        kfree(tmpname);
+free_chan:
+        kref_put(&chan->kref, relay_destroy_channel);
+        return NULL;
+}
+/**
+ *      relay_switch_subbuf - switch to a new sub-buffer
+ *      @buf: channel buffer
+ *      @length: size of current event
+ *
+ *      Returns either the length passed in or 0 if full.  0 is also
+ *      returned, with a warning, when @length is larger than a
+ *      sub-buffer.
+ *      Performs sub-buffer-switch tasks such as invoking callbacks,
+ *      updating padding counts, waking up readers, etc.
+ *
+ *      An offset of subbuf_size + 1 is used as a marker that the
+ *      previous switch was refused by the subbuf_start() callback; in
+ *      that state the current sub-buffer is not accounted a second time.
+ */
+size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
+{
+        void *old, *new;
+        size_t old_subbuf, new_subbuf;
+        if (unlikely(length > buf->chan->subbuf_size))
+                goto toobig;
+        if (buf->offset != buf->chan->subbuf_size + 1) {        /* not already marked as refused */
+                buf->prev_padding = buf->chan->subbuf_size - buf->offset;       /* unused tail of the finished sub-buffer */
+                old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+                buf->padding[old_subbuf] = buf->prev_padding;
+                buf->subbufs_produced++;
+                if (waitqueue_active(&buf->read_wait)) {
+                        PREPARE_WORK(&buf->wake_readers, wakeup_readers, buf);
+                        schedule_delayed_work(&buf->wake_readers, 1);  /* deferred: see wakeup_readers() */
+                }
+        }
+        old = buf->data;
+        new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
+        new = buf->start + new_subbuf * buf->chan->subbuf_size;
+        buf->offset = 0;
+        if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) {
+                buf->offset = buf->chan->subbuf_size + 1;       /* mark the refused switch */
+                return 0;
+        }
+        buf->data = new;
+        buf->padding[new_subbuf] = 0;
+        if (unlikely(length + buf->offset > buf->chan->subbuf_size))   /* offset may have been advanced by the callback - TODO confirm */
+                goto toobig;
+        return length;
+toobig:
+        printk(KERN_WARNING "relayfs: event too large (%Zd)\n", length);
+        WARN_ON(1);
+        return 0;
+}
+/**
+ *      relay_subbufs_consumed - add to a buffer's sub-buffers-consumed count
+ *      @chan: the channel, may be NULL (then nothing happens)
+ *      @cpu: the cpu whose channel buffer is updated
+ *      @subbufs_consumed: number of sub-buffers newly consumed
+ *
+ *      Adds @subbufs_consumed to the consumed count of the given cpu's
+ *      channel buffer.  Pass the number of sub-buffers newly consumed,
+ *      not the running total.  The count is clamped so that it never
+ *      exceeds the number of sub-buffers produced.  Out-of-range cpus
+ *      and cpus without a buffer are ignored.
+ *
+ *      NOTE: kernel clients don't need to call this function if the channel
+ *      mode is 'overwrite'.
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+                            unsigned int cpu,
+                            size_t subbufs_consumed)
+{
+        struct rchan_buf *rbuf;
+        if (chan == NULL || cpu >= NR_CPUS)
+                return;
+        rbuf = chan->buf[cpu];
+        if (rbuf == NULL)
+                return;
+        rbuf->subbufs_consumed += subbufs_consumed;
+        if (rbuf->subbufs_consumed > rbuf->subbufs_produced)
+                rbuf->subbufs_consumed = rbuf->subbufs_produced;
+}
+/**
+ *      relay_destroy_channel - free the channel struct
+ *      @kref: the refcount embedded in the channel being released
+ *
+ *      Release function for the channel's kref; should only be called
+ *      from kref_put().
+ */
+void relay_destroy_channel(struct kref *kref)
+{
+        kfree(container_of(kref, struct rchan, kref));
+}
+/**
+ *      relay_close - close the channel
+ *      @chan: the channel, may be NULL (then nothing happens)
+ *
+ *      Closes every existing channel buffer and drops the channel's
+ *      reference, which frees the channel once no references remain.
+ */
+void relay_close(struct rchan *chan)
+{
+        unsigned int cpu;
+        if (chan == NULL)
+                return;
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (chan->buf[cpu])
+                        relay_close_buf(chan->buf[cpu]);
+        }
+        kref_put(&chan->kref, relay_destroy_channel);
+}
+/**
+ *      relay_flush - flush the channel
+ *      @chan: the channel, may be NULL (then nothing happens)
+ *
+ *      Flushes all channel buffers, i.e. forces a sub-buffer switch on
+ *      each existing buffer by requesting a zero-length event.
+ */
+void relay_flush(struct rchan *chan)
+{
+        unsigned int cpu;
+        if (chan == NULL)
+                return;
+        for (cpu = 0; cpu < NR_CPUS; cpu++) {
+                if (chan->buf[cpu])
+                        relay_switch_subbuf(chan->buf[cpu], 0);
+        }
+}
+EXPORT_SYMBOL_GPL(relay_open);
+EXPORT_SYMBOL_GPL(relay_close);
+EXPORT_SYMBOL_GPL(relay_flush);
+EXPORT_SYMBOL_GPL(relay_reset);
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+EXPORT_SYMBOL_GPL(relay_switch_subbuf);
+EXPORT_SYMBOL_GPL(relay_buf_full);
diff --git a/fs/relayfs/relay.h b/fs/relayfs/relay.h
new file mode 100644
index 000000000000..703503fa22b6
--- /dev/null
+++ b/fs/relayfs/relay.h
@@ -0,0 +1,12 @@
+#ifndef _RELAY_H
+#define _RELAY_H
+struct dentry *relayfs_create_file(const char *name,
+                                   struct dentry *parent,
+                                   int mode,
+                                   struct rchan *chan);
+extern int relayfs_remove(struct dentry *dentry);
+extern int relay_buf_empty(struct rchan_buf *buf);
+extern void relay_destroy_channel(struct kref *kref);
+#endif /* _RELAY_H */
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 997640c99c7d..faf1512173eb 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,8 +114,7 @@ void ufs_free_fragments (struct inode * inode, unsigned fragment, unsigned count
        ubh_mark_buffer_dirty (USPI_UBH);
        ubh_mark_buffer_dirty (UCPI_UBH);
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-                ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
                ubh_wait_on_buffer (UCPI_UBH);
        }
        sb->s_dirt = 1;
@@ -200,8 +199,7 @@ do_more:
        ubh_mark_buffer_dirty (USPI_UBH);
        ubh_mark_buffer_dirty (UCPI_UBH);
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-                ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
                ubh_wait_on_buffer (UCPI_UBH);
        }
@@ -459,8 +457,7 @@ ufs_add_fragments (struct inode * inode, unsigned fragment,
        ubh_mark_buffer_dirty (USPI_UBH);
        ubh_mark_buffer_dirty (UCPI_UBH);
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-                ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
                ubh_wait_on_buffer (UCPI_UBH);
        }
        sb->s_dirt = 1;
@@ -585,8 +582,7 @@ succed:
        ubh_mark_buffer_dirty (USPI_UBH);
        ubh_mark_buffer_dirty (UCPI_UBH);
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **)&ucpi);
-                ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **)&ucpi);
                ubh_wait_on_buffer (UCPI_UBH);
        }
        sb->s_dirt = 1;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 61a6b1542fc5..0938945b9cbc 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -124,8 +124,7 @@ void ufs_free_inode (struct inode * inode)
        ubh_mark_buffer_dirty (USPI_UBH);
        ubh_mark_buffer_dirty (UCPI_UBH);
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-                ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **) &ucpi);
                ubh_wait_on_buffer (UCPI_UBH);
        }
        
@@ -249,8 +248,7 @@ cg_found:
        ubh_mark_buffer_dirty (USPI_UBH);
        ubh_mark_buffer_dirty (UCPI_UBH);
        if (sb->s_flags & MS_SYNCHRONOUS) {
-                ubh_wait_on_buffer (UCPI_UBH);
+                ubh_ll_rw_block (SWRITE, 1, (struct ufs_buffer_head **) &ucpi);
-                ubh_ll_rw_block (WRITE, 1, (struct ufs_buffer_head **) &ucpi);
                ubh_wait_on_buffer (UCPI_UBH);
        }
        sb->s_dirt = 1;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index e312bf8bad9f..61d2e35012a4 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -285,8 +285,7 @@ next:;
                }
        }
        if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
-                ubh_wait_on_buffer (ind_ubh);
+                ubh_ll_rw_block (SWRITE, 1, &ind_ubh);
-                ubh_ll_rw_block (WRITE, 1, &ind_ubh);
                ubh_wait_on_buffer (ind_ubh);
        }
        ubh_brelse (ind_ubh);
@@ -353,8 +352,7 @@ static int ufs_trunc_dindirect (struct inode *inode, unsigned offset, __fs32 *p)
                }
        }
        if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
-                ubh_wait_on_buffer (dind_bh);
+                ubh_ll_rw_block (SWRITE, 1, &dind_bh);
-                ubh_ll_rw_block (WRITE, 1, &dind_bh);
                ubh_wait_on_buffer (dind_bh);
        }
        ubh_brelse (dind_bh);
@@ -418,8 +416,7 @@ static int ufs_trunc_tindirect (struct inode * inode)
                }
        }
        if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
-                ubh_wait_on_buffer (tind_bh);
+                ubh_ll_rw_block (SWRITE, 1, &tind_bh);
-                ubh_ll_rw_block (WRITE, 1, &tind_bh);
                ubh_wait_on_buffer (tind_bh);
        }
        ubh_brelse (tind_bh);
diff --git a/fs/umsdos/notes b/fs/umsdos/notes
deleted file mode 100644
index 3c47d1f4fc47..000000000000
--- a/fs/umsdos/notes
+++ /dev/null
@@ -1,17 +0,0 @@
-This file contain idea and things I don't want to forget
-Possible bug in fs/read_write.c
-Function sys_readdir()
-        There is a call the verify_area that does not take in account
-        the count parameter. I guess it should read
-        error = verify_area(VERIFY_WRITE, dirent, count*sizeof (*dirent));
-        instead of
-        error = verify_area(VERIFY_WRITE, dirent, sizeof (*dirent));
-        Of course, now , count is always 1
diff --git a/fs/xattr.c b/fs/xattr.c
index 6acd5c63da91..3f9c64bea151 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -51,20 +51,29 @@ setxattr(struct dentry *d, char __user *name, void __user *value,
                }
        }
+        down(&d->d_inode->i_sem);
+        error = security_inode_setxattr(d, kname, kvalue, size, flags);
+        if (error)
+                goto out;
        error = -EOPNOTSUPP;
        if (d->d_inode->i_op && d->d_inode->i_op->setxattr) {
-                down(&d->d_inode->i_sem);
+                error = d->d_inode->i_op->setxattr(d, kname, kvalue,
-                error = security_inode_setxattr(d, kname, kvalue, size, flags);
+                                                   size, flags);
-                if (error)
-                        goto out;
-                error = d->d_inode->i_op->setxattr(d, kname, kvalue, size, flags);
                if (!error) {
                        fsnotify_xattr(d);
-                        security_inode_post_setxattr(d, kname, kvalue, size, flags);
+                        security_inode_post_setxattr(d, kname, kvalue,
+                                                     size, flags);
                }
-out:
+        } else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
-                up(&d->d_inode->i_sem);
+                            sizeof XATTR_SECURITY_PREFIX - 1)) {
+                const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
+                error = security_inode_setsecurity(d->d_inode, suffix, kvalue,
+                                                   size, flags);
+                if (!error)
+                        fsnotify_xattr(d);
        }
+out:
+        up(&d->d_inode->i_sem);
        if (kvalue)
                kfree(kvalue);
        return error;
@@ -139,20 +148,25 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size)
                        return -ENOMEM;
        }
+        error = security_inode_getxattr(d, kname);
+        if (error)
+                goto out;
        error = -EOPNOTSUPP;
-        if (d->d_inode->i_op && d->d_inode->i_op->getxattr) {
+        if (d->d_inode->i_op && d->d_inode->i_op->getxattr)
-                error = security_inode_getxattr(d, kname);
-                if (error)
-                        goto out;
                error = d->d_inode->i_op->getxattr(d, kname, kvalue, size);
-                if (error > 0) {
+        else if (!strncmp(kname, XATTR_SECURITY_PREFIX,
-                        if (size && copy_to_user(value, kvalue, error))
+                          sizeof XATTR_SECURITY_PREFIX - 1)) {
-                                error = -EFAULT;
+                const char *suffix = kname + sizeof XATTR_SECURITY_PREFIX - 1;
-                } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
+                error = security_inode_getsecurity(d->d_inode, suffix, kvalue,
-                        /* The file system tried to returned a value bigger
+                                                   size);
-                           than XATTR_SIZE_MAX bytes. Not possible. */
+        }
-                        error = -E2BIG;
+        if (error > 0) {
-                }
+                if (size && copy_to_user(value, kvalue, error))
+                        error = -EFAULT;
+        } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) {
+                /* The file system tried to return a value bigger
+                   than XATTR_SIZE_MAX bytes. Not possible. */
+                error = -E2BIG;
        }
 out:
        if (kvalue)
@@ -221,20 +235,24 @@ listxattr(struct dentry *d, char __user *list, size_t size)
                        return -ENOMEM;
        }
+        error = security_inode_listxattr(d);
+        if (error)
+                goto out;
        error = -EOPNOTSUPP;
        if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
-                error = security_inode_listxattr(d);
-                if (error)
-                        goto out;
                error = d->d_inode->i_op->listxattr(d, klist, size);
-                if (error > 0) {
+        } else {
-                        if (size && copy_to_user(list, klist, error))
+                error = security_inode_listsecurity(d->d_inode, klist, size);
-                                error = -EFAULT;
+                if (size && error >= size)
-                } else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
+                        error = -ERANGE;
-                        /* The file system tried to returned a list bigger
+        }
-                           than XATTR_LIST_MAX bytes. Not possible. */
+        if (error > 0) {
-                        error = -E2BIG;
+                if (size && copy_to_user(list, klist, error))
-                }
+                        error = -EFAULT;
+        } else if (error == -ERANGE && size >= XATTR_LIST_MAX) {
+                /* The file system tried to return a list bigger
+                   than XATTR_LIST_MAX bytes. Not possible. */
+                error = -E2BIG;
        }
 out:
        if (klist)
@@ -307,6 +325,8 @@ removexattr(struct dentry *d, char __user *name)
                down(&d->d_inode->i_sem);
                error = d->d_inode->i_op->removexattr(d, kname);
                up(&d->d_inode->i_sem);
+                if (!error)
+                        fsnotify_xattr(d);
        }
 out:
        return error;
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d3ff78354638..49e3e7e5e3dc 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -1,150 +1 @@
-#
+include $(TOPDIR)/fs/xfs/Makefile-linux-$(VERSION).$(PATCHLEVEL)
-# Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms of version 2 of the GNU General Public License as
-# published by the Free Software Foundation.
-#
-# This program is distributed in the hope that it would be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-#
-# Further, this software is distributed without any warranty that it is
-# free of the rightful claim of any third person regarding infringement
-# or the like.  Any license provided herein, whether implied or
-# otherwise, applies only to this software file.  Patent licenses, if
-# any, provided herein do not apply to combinations of this program with
-# other software, or any other product whatsoever.
-#
-# You should have received a copy of the GNU General Public License along
-# with this program; if not, write the Free Software Foundation, Inc., 59
-# Temple Place - Suite 330, Boston MA 02111-1307, USA.
-#
-# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
-# Mountain View, CA  94043, or:
-#
-# http://www.sgi.com
-#
-# For further information regarding this notice, see:
-#
-# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
-#
-EXTRA_CFLAGS +=  -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char
-ifeq ($(CONFIG_XFS_DEBUG),y)
-        EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
-        EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
-endif
-ifeq ($(CONFIG_XFS_TRACE),y)
-        EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
-        EXTRA_CFLAGS += -DXFS_ATTR_TRACE
-        EXTRA_CFLAGS += -DXFS_BLI_TRACE
-        EXTRA_CFLAGS += -DXFS_BMAP_TRACE
-        EXTRA_CFLAGS += -DXFS_BMBT_TRACE
-        EXTRA_CFLAGS += -DXFS_DIR_TRACE
-        EXTRA_CFLAGS += -DXFS_DIR2_TRACE
-        EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
-        EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
-        EXTRA_CFLAGS += -DXFS_LOG_TRACE
-        EXTRA_CFLAGS += -DXFS_RW_TRACE
-        EXTRA_CFLAGS += -DPAGEBUF_TRACE
-        EXTRA_CFLAGS += -DXFS_VNODE_TRACE
-endif
-obj-$(CONFIG_XFS_FS)            += xfs.o
-xfs-$(CONFIG_XFS_QUOTA)         += $(addprefix quota/, \
-                                   xfs_dquot.o \
-                                   xfs_dquot_item.o \
-                                   xfs_trans_dquot.o \
-                                   xfs_qm_syscalls.o \
-                                   xfs_qm_bhv.o \
-                                   xfs_qm.o)
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)           += quota/xfs_qm_stats.o
-endif
-xfs-$(CONFIG_XFS_RT)            += xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL)     += xfs_acl.o
-xfs-$(CONFIG_PROC_FS)           += linux-2.6/xfs_stats.o
-xfs-$(CONFIG_SYSCTL)            += linux-2.6/xfs_sysctl.o
-xfs-$(CONFIG_COMPAT)            += linux-2.6/xfs_ioctl32.o
-xfs-$(CONFIG_XFS_EXPORT)        += linux-2.6/xfs_export.o
-xfs-y                           += xfs_alloc.o \
-                                   xfs_alloc_btree.o \
-                                   xfs_attr.o \
-                                   xfs_attr_leaf.o \
-                                   xfs_behavior.o \
-                                   xfs_bit.o \
-                                   xfs_bmap.o \
-                                   xfs_bmap_btree.o \
-                                   xfs_btree.o \
-                                   xfs_buf_item.o \
-                                   xfs_da_btree.o \
-                                   xfs_dir.o \
-                                   xfs_dir2.o \
-                                   xfs_dir2_block.o \
-                                   xfs_dir2_data.o \
-                                   xfs_dir2_leaf.o \
-                                   xfs_dir2_node.o \
-                                   xfs_dir2_sf.o \
-                                   xfs_dir_leaf.o \
-                                   xfs_error.o \
-                                   xfs_extfree_item.o \
-                                   xfs_fsops.o \
-                                   xfs_ialloc.o \
-                                   xfs_ialloc_btree.o \
-                                   xfs_iget.o \
-                                   xfs_inode.o \
-                                   xfs_inode_item.o \
-                                   xfs_iocore.o \
-                                   xfs_iomap.o \
-                                   xfs_itable.o \
-                                   xfs_dfrag.o \
-                                   xfs_log.o \
-                                   xfs_log_recover.o \
-                                   xfs_macros.o \
-                                   xfs_mount.o \
-                                   xfs_rename.o \
-                                   xfs_trans.o \
-                                   xfs_trans_ail.o \
-                                   xfs_trans_buf.o \
-                                   xfs_trans_extfree.o \
-                                   xfs_trans_inode.o \
-                                   xfs_trans_item.o \
-                                   xfs_utils.o \
-                                   xfs_vfsops.o \
-                                   xfs_vnodeops.o \
-                                   xfs_rw.o \
-                                   xfs_dmops.o \
-                                   xfs_qmops.o
-xfs-$(CONFIG_XFS_TRACE)         += xfs_dir2_trace.o
-# Objects in linux-2.6/
-xfs-y                           += $(addprefix linux-2.6/, \
-                                   kmem.o \
-                                   xfs_aops.o \
-                                   xfs_buf.o \
-                                   xfs_file.o \
-                                   xfs_fs_subr.o \
-                                   xfs_globals.o \
-                                   xfs_ioctl.o \
-                                   xfs_iops.o \
-                                   xfs_lrw.o \
-                                   xfs_super.o \
-                                   xfs_vfs.o \
-                                   xfs_vnode.o)
-# Objects in support/
-xfs-y                           += $(addprefix support/, \
-                                   debug.o \
-                                   move.o \
-                                   qsort.o \
-                                   uuid.o)
-xfs-$(CONFIG_XFS_TRACE)         += support/ktrace.o
diff --git a/fs/xfs/Makefile-linux-2.6 b/fs/xfs/Makefile-linux-2.6
new file mode 100644
index 000000000000..fbfcbe5a7cda
--- /dev/null
+++ b/fs/xfs/Makefile-linux-2.6
@@ -0,0 +1,141 @@
+#
+# Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.  Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+EXTRA_CFLAGS +=  -Ifs/xfs -Ifs/xfs/linux-2.6 -funsigned-char
+XFS_LINUX := linux-2.6
+ifeq ($(CONFIG_XFS_DEBUG),y)
+        EXTRA_CFLAGS += -g -DSTATIC="" -DDEBUG
+        EXTRA_CFLAGS += -DPAGEBUF_LOCK_TRACKING
+endif
+ifeq ($(CONFIG_XFS_TRACE),y)
+        EXTRA_CFLAGS += -DXFS_ALLOC_TRACE
+        EXTRA_CFLAGS += -DXFS_ATTR_TRACE
+        EXTRA_CFLAGS += -DXFS_BLI_TRACE
+        EXTRA_CFLAGS += -DXFS_BMAP_TRACE
+        EXTRA_CFLAGS += -DXFS_BMBT_TRACE
+        EXTRA_CFLAGS += -DXFS_DIR_TRACE
+        EXTRA_CFLAGS += -DXFS_DIR2_TRACE
+        EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
+        EXTRA_CFLAGS += -DXFS_ILOCK_TRACE
+        EXTRA_CFLAGS += -DXFS_LOG_TRACE
+        EXTRA_CFLAGS += -DXFS_RW_TRACE
+        EXTRA_CFLAGS += -DPAGEBUF_TRACE
+        EXTRA_CFLAGS += -DXFS_VNODE_TRACE
+endif
+obj-$(CONFIG_XFS_FS)            += xfs.o
+obj-$(CONFIG_XFS_QUOTA)         += quota/
+xfs-$(CONFIG_XFS_RT)            += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_POSIX_ACL)     += xfs_acl.o
+xfs-$(CONFIG_PROC_FS)           += $(XFS_LINUX)/xfs_stats.o
+xfs-$(CONFIG_SYSCTL)            += $(XFS_LINUX)/xfs_sysctl.o
+xfs-$(CONFIG_COMPAT)            += $(XFS_LINUX)/xfs_ioctl32.o
+xfs-$(CONFIG_XFS_EXPORT)        += $(XFS_LINUX)/xfs_export.o
+xfs-y                           += xfs_alloc.o \
+                                   xfs_alloc_btree.o \
+                                   xfs_attr.o \
+                                   xfs_attr_leaf.o \
+                                   xfs_behavior.o \
+                                   xfs_bit.o \
+                                   xfs_bmap.o \
+                                   xfs_bmap_btree.o \
+                                   xfs_btree.o \
+                                   xfs_buf_item.o \
+                                   xfs_da_btree.o \
+                                   xfs_dir.o \
+                                   xfs_dir2.o \
+                                   xfs_dir2_block.o \
+                                   xfs_dir2_data.o \
+                                   xfs_dir2_leaf.o \
+                                   xfs_dir2_node.o \
+                                   xfs_dir2_sf.o \
+                                   xfs_dir_leaf.o \
+                                   xfs_error.o \
+                                   xfs_extfree_item.o \
+                                   xfs_fsops.o \
+                                   xfs_ialloc.o \
+                                   xfs_ialloc_btree.o \
+                                   xfs_iget.o \
+                                   xfs_inode.o \
+                                   xfs_inode_item.o \
+                                   xfs_iocore.o \
+                                   xfs_iomap.o \
+                                   xfs_itable.o \
+                                   xfs_dfrag.o \
+                                   xfs_log.o \
+                                   xfs_log_recover.o \
+                                   xfs_macros.o \
+                                   xfs_mount.o \
+                                   xfs_rename.o \
+                                   xfs_trans.o \
+                                   xfs_trans_ail.o \
+                                   xfs_trans_buf.o \
+                                   xfs_trans_extfree.o \
+                                   xfs_trans_inode.o \
+                                   xfs_trans_item.o \
+                                   xfs_utils.o \
+                                   xfs_vfsops.o \
+                                   xfs_vnodeops.o \
+                                   xfs_rw.o \
+                                   xfs_dmops.o \
+                                   xfs_qmops.o
+xfs-$(CONFIG_XFS_TRACE)         += xfs_dir2_trace.o
+# Objects in $(XFS_LINUX)/
+xfs-y                           += $(addprefix $(XFS_LINUX)/, \
+                                   kmem.o \
+                                   xfs_aops.o \
+                                   xfs_buf.o \
+                                   xfs_file.o \
+                                   xfs_fs_subr.o \
+                                   xfs_globals.o \
+                                   xfs_ioctl.o \
+                                   xfs_iops.o \
+                                   xfs_lrw.o \
+                                   xfs_super.o \
+                                   xfs_vfs.o \
+                                   xfs_vnode.o)
+# Objects in support/
+xfs-y                           += $(addprefix support/, \
+                                   debug.o \
+                                   move.o \
+                                   uuid.o)
+xfs-$(CONFIG_XFS_TRACE)         += support/ktrace.o
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 364ea8c386b1..4b184559f231 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -45,11 +45,11 @@
 void *
-kmem_alloc(size_t size, int flags)
+kmem_alloc(size_t size, unsigned int __nocast flags)
 {
-        int     retries = 0;
+        int             retries = 0;
-        int     lflags = kmem_flags_convert(flags);
+        unsigned int    lflags = kmem_flags_convert(flags);
-        void    *ptr;
+        void            *ptr;
        do {
                if (size < MAX_SLAB_SIZE || retries > MAX_VMALLOCS)
@@ -67,7 +67,7 @@ kmem_alloc(size_t size, int flags)
 }
 void *
-kmem_zalloc(size_t size, int flags)
+kmem_zalloc(size_t size, unsigned int __nocast flags)
 {
        void    *ptr;
@@ -89,7 +89,8 @@ kmem_free(void *ptr, size_t size)
 }
 void *
-kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
+kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
+             unsigned int __nocast flags)
 {
        void    *new;
@@ -104,11 +105,11 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize, int flags)
 }
 void *
-kmem_zone_alloc(kmem_zone_t *zone, int flags)
+kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 {
-        int     retries = 0;
+        int             retries = 0;
-        int     lflags = kmem_flags_convert(flags);
+        unsigned int    lflags = kmem_flags_convert(flags);
-        void    *ptr;
+        void            *ptr;
        do {
                ptr = kmem_cache_alloc(zone, lflags);
@@ -123,7 +124,7 @@ kmem_zone_alloc(kmem_zone_t *zone, int flags)
 }
 void *
-kmem_zone_zalloc(kmem_zone_t *zone, int flags)
+kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
 {
        void    *ptr;
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 1397b669b059..109fcf27e256 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -39,10 +39,10 @@
 /*
 * memory management routines
 */
-#define KM_SLEEP        0x0001
+#define KM_SLEEP        0x0001u
-#define KM_NOSLEEP      0x0002
+#define KM_NOSLEEP      0x0002u
-#define KM_NOFS         0x0004
+#define KM_NOFS         0x0004u
-#define KM_MAYFAIL      0x0008
+#define KM_MAYFAIL      0x0008u
 #define kmem_zone       kmem_cache_s
 #define kmem_zone_t     kmem_cache_t
@@ -81,9 +81,9 @@ typedef unsigned long xfs_pflags_t;
        *(NSTATEP) = *(OSTATEP);        \
 } while (0)
-static __inline unsigned int kmem_flags_convert(int flags)
+static __inline unsigned int kmem_flags_convert(unsigned int __nocast flags)
 {
-        int     lflags = __GFP_NOWARN;  /* we'll report problems, if need be */
+        unsigned int    lflags = __GFP_NOWARN;  /* we'll report problems, if need be */
 #ifdef DEBUG
        if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
@@ -125,12 +125,13 @@ kmem_zone_destroy(kmem_zone_t *zone)
                BUG();
 }
-extern void         *kmem_zone_zalloc(kmem_zone_t *, int);
+extern void         *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-extern void         *kmem_zone_alloc(kmem_zone_t *, int);
+extern void         *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-extern void         *kmem_alloc(size_t, int);
+extern void         *kmem_alloc(size_t, unsigned int __nocast);
-extern void         *kmem_realloc(void *, size_t, size_t, int);
+extern void         *kmem_realloc(void *, size_t, size_t,
-extern void         *kmem_zalloc(size_t, int);
+                                  unsigned int __nocast);
+extern void         *kmem_zalloc(size_t, unsigned int __nocast);
 extern void         kmem_free(void *, size_t);
 typedef struct shrinker *kmem_shaker_t;
diff --git a/fs/xfs/linux-2.6/spin.h b/fs/xfs/linux-2.6/spin.h
index bcf60a0b8df0..0039504069a5 100644
--- a/fs/xfs/linux-2.6/spin.h
+++ b/fs/xfs/linux-2.6/spin.h
@@ -45,6 +45,9 @@
 typedef spinlock_t lock_t;
 #define SPLDECL(s)                      unsigned long s
+#ifndef DEFINE_SPINLOCK
+#define DEFINE_SPINLOCK(s)              spinlock_t s = SPIN_LOCK_UNLOCKED
+#endif
 #define spinlock_init(lock, name)       spin_lock_init(lock)
 #define spinlock_destroy(lock)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a3a4b5aaf5d9..c6c077978fe3 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -104,66 +104,114 @@ xfs_page_trace(
 #define xfs_page_trace(tag, inode, page, mask)
 #endif
-void
+/*
-linvfs_unwritten_done(
+ * Schedule IO completion handling on a xfsdatad if this was
-        struct buffer_head      *bh,
+ * the final hold on this ioend.
-        int                     uptodate)
+ */
+STATIC void
+xfs_finish_ioend(
+        xfs_ioend_t             *ioend)
 {
-        xfs_buf_t               *pb = (xfs_buf_t *)bh->b_private;
+        if (atomic_dec_and_test(&ioend->io_remaining))
+                queue_work(xfsdatad_workqueue, &ioend->io_work);
+}
-        ASSERT(buffer_unwritten(bh));
+STATIC void
-        bh->b_end_io = NULL;
+xfs_destroy_ioend(
-        clear_buffer_unwritten(bh);
+        xfs_ioend_t             *ioend)
-        if (!uptodate)
+{
-                pagebuf_ioerror(pb, EIO);
+        vn_iowake(ioend->io_vnode);
-        if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
+        mempool_free(ioend, xfs_ioend_pool);
-                pagebuf_iodone(pb, 1, 1);
-        }
-        end_buffer_async_write(bh, uptodate);
 }
 /*
 * Issue transactions to convert a buffer range from unwritten
- * to written extents (buffered IO).
+ * to written extents.
 */
 STATIC void
-linvfs_unwritten_convert(
+xfs_end_bio_unwritten(
-        xfs_buf_t       *bp)
+        void                    *data)
 {
-        vnode_t         *vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
+        xfs_ioend_t             *ioend = data;
-        int             error;
+        vnode_t                 *vp = ioend->io_vnode;
+        xfs_off_t               offset = ioend->io_offset;
+        size_t                  size = ioend->io_size;
+        struct buffer_head      *bh, *next;
+        int                     error;
+        if (ioend->io_uptodate)
+                VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+        /* ioend->io_buffer_head is only non-NULL for buffered I/O */
+        for (bh = ioend->io_buffer_head; bh; bh = next) {
+                next = bh->b_private;
+                bh->b_end_io = NULL;
+                clear_buffer_unwritten(bh);
+                end_buffer_async_write(bh, ioend->io_uptodate);
+        }
-        BUG_ON(atomic_read(&bp->pb_hold) < 1);
+        xfs_destroy_ioend(ioend);
-        VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
-                        BMAPI_UNWRITTEN, NULL, NULL, error);
-        XFS_BUF_SET_FSPRIVATE(bp, NULL);
-        XFS_BUF_CLR_IODONE_FUNC(bp);
-        XFS_BUF_UNDATAIO(bp);
-        iput(LINVFS_GET_IP(vp));
-        pagebuf_iodone(bp, 0, 0);
 }
 /*
- * Issue transactions to convert a buffer range from unwritten
+ * Allocate and initialise an IO completion structure.
- * to written extents (direct IO).
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
 */
-STATIC void
+STATIC xfs_ioend_t *
-linvfs_unwritten_convert_direct(
+xfs_alloc_ioend(
-        struct kiocb    *iocb,
+        struct inode            *inode)
-        loff_t          offset,
-        ssize_t         size,
-        void            *private)
 {
-        struct inode    *inode = iocb->ki_filp->f_dentry->d_inode;
+        xfs_ioend_t             *ioend;
-        ASSERT(!private || inode == (struct inode *)private);
-        /* private indicates an unwritten extent lay beneath this IO */
+        ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-        if (private && size > 0) {
-                vnode_t *vp = LINVFS_GET_VP(inode);
-                int     error;
-                VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+        /*
-        }
+         * Set the count to 1 initially, which will prevent an I/O
+         * completion callback that happens before we have started
+         * all the I/O from calling the completion routine too early.
+         */
+        atomic_set(&ioend->io_remaining, 1);
+        ioend->io_uptodate = 1; /* cleared if any I/O fails */
+        ioend->io_vnode = LINVFS_GET_VP(inode);
+        ioend->io_buffer_head = NULL;
+        atomic_inc(&ioend->io_vnode->v_iocount);
+        ioend->io_offset = 0;
+        ioend->io_size = 0;
+        INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
+        return ioend;
+}
+void
+linvfs_unwritten_done(
+        struct buffer_head      *bh,
+        int                     uptodate)
+{
+        xfs_ioend_t             *ioend = bh->b_private;
+        static spinlock_t       unwritten_done_lock = SPIN_LOCK_UNLOCKED;
+        unsigned long           flags;
+        ASSERT(buffer_unwritten(bh));
+        bh->b_end_io = NULL;
+        if (!uptodate)
+                ioend->io_uptodate = 0;
+        /*
+         * Deep magic here.  We reuse b_private in the buffer_heads to build
+         * a chain for completing the I/O from user context after we've issued
+         * a transaction to convert the unwritten extent.
+         */
+        spin_lock_irqsave(&unwritten_done_lock, flags);
+        bh->b_private = ioend->io_buffer_head;
+        ioend->io_buffer_head = bh;
+        spin_unlock_irqrestore(&unwritten_done_lock, flags);
+        xfs_finish_ioend(ioend);
 }
 STATIC int
@@ -255,7 +303,7 @@ xfs_probe_unwritten_page(
        struct address_space    *mapping,
        pgoff_t                 index,
        xfs_iomap_t             *iomapp,
-        xfs_buf_t               *pb,
+        xfs_ioend_t             *ioend,
        unsigned long           max_offset,
        unsigned long           *fsbs,
        unsigned int            bbits)
@@ -283,7 +331,7 @@ xfs_probe_unwritten_page(
                                break;
                        xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
                        set_buffer_unwritten_io(bh);
-                        bh->b_private = pb;
+                        bh->b_private = ioend;
                        p_offset += bh->b_size;
                        (*fsbs)++;
                } while ((bh = bh->b_this_page) != head);
@@ -434,34 +482,15 @@ xfs_map_unwritten(
 {
        struct buffer_head      *bh = curr;
        xfs_iomap_t             *tmp;
-        xfs_buf_t               *pb;
+        xfs_ioend_t             *ioend;
-        loff_t                  offset, size;
+        loff_t                  offset;
        unsigned long           nblocks = 0;
        offset = start_page->index;
        offset <<= PAGE_CACHE_SHIFT;
        offset += p_offset;
-        /* get an "empty" pagebuf to manage IO completion
+        ioend = xfs_alloc_ioend(inode);
-         * Proper values will be set before returning */
-        pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
-        if (!pb)
-                return -EAGAIN;
-        /* Take a reference to the inode to prevent it from
-         * being reclaimed while we have outstanding unwritten
-         * extent IO on it.
-         */
-        if ((igrab(inode)) != inode) {
-                pagebuf_free(pb);
-                return -EAGAIN;
-        }
-        /* Set the count to 1 initially, this will stop an I/O
-         * completion callout which happens before we have started
-         * all the I/O from calling pagebuf_iodone too early.
-         */
-        atomic_set(&pb->pb_io_remaining, 1);
        /* First map forwards in the page consecutive buffers
         * covering this unwritten extent
@@ -474,12 +503,12 @@ xfs_map_unwritten(
                        break;
                xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
                set_buffer_unwritten_io(bh);
-                bh->b_private = pb;
+                bh->b_private = ioend;
                p_offset += bh->b_size;
                nblocks++;
        } while ((bh = bh->b_this_page) != head);
-        atomic_add(nblocks, &pb->pb_io_remaining);
+        atomic_add(nblocks, &ioend->io_remaining);
        /* If we reached the end of the page, map forwards in any
         * following pages which are also covered by this extent.
@@ -496,13 +525,13 @@ xfs_map_unwritten(
                tloff = min(tlast, tloff);
                for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
                        page = xfs_probe_unwritten_page(mapping,
-                                                tindex, iomapp, pb,
+                                                tindex, iomapp, ioend,
                                                PAGE_CACHE_SIZE, &bs, bbits);
                        if (!page)
                                break;
                        nblocks += bs;
-                        atomic_add(bs, &pb->pb_io_remaining);
+                        atomic_add(bs, &ioend->io_remaining);
-                        xfs_convert_page(inode, page, iomapp, wbc, pb,
+                        xfs_convert_page(inode, page, iomapp, wbc, ioend,
                                                        startio, all_bh);
                        /* stop if converting the next page might add
                         * enough blocks that the corresponding byte
@@ -514,12 +543,12 @@ xfs_map_unwritten(
                if (tindex == tlast &&
                    (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
                        page = xfs_probe_unwritten_page(mapping,
-                                                        tindex, iomapp, pb,
+                                                        tindex, iomapp, ioend,
                                                        pg_offset, &bs, bbits);
                        if (page) {
                                nblocks += bs;
-                                atomic_add(bs, &pb->pb_io_remaining);
+                                atomic_add(bs, &ioend->io_remaining);
-                                xfs_convert_page(inode, page, iomapp, wbc, pb,
+                                xfs_convert_page(inode, page, iomapp, wbc, ioend,
                                                        startio, all_bh);
                                if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
                                        goto enough;
@@ -528,21 +557,9 @@ xfs_map_unwritten(
        }
 enough:
-        size = nblocks;         /* NB: using 64bit number here */
+        ioend->io_size = (xfs_off_t)nblocks << block_bits;
-        size <<= block_bits;    /* convert fsb's to byte range */
+        ioend->io_offset = offset;
+        xfs_finish_ioend(ioend);
-        XFS_BUF_DATAIO(pb);
-        XFS_BUF_ASYNC(pb);
-        XFS_BUF_SET_SIZE(pb, size);
-        XFS_BUF_SET_COUNT(pb, size);
-        XFS_BUF_SET_OFFSET(pb, offset);
-        XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
-        XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
-        if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-                pagebuf_iodone(pb, 1, 1);
-        }
        return 0;
 }
@@ -787,7 +804,7 @@ xfs_page_state_convert(
                                continue;
                        if (!iomp) {
                                err = xfs_map_blocks(inode, offset, len, &iomap,
-                                                BMAPI_READ|BMAPI_IGNSTATE);
+                                                BMAPI_WRITE|BMAPI_IGNSTATE);
                                if (err) {
                                        goto error;
                                }
@@ -1028,6 +1045,44 @@ linvfs_get_blocks_direct(
                                        create, 1, BMAPI_WRITE|BMAPI_DIRECT);
 }
+STATIC void
+linvfs_end_io_direct(
+        struct kiocb    *iocb,
+        loff_t          offset,
+        ssize_t         size,
+        void            *private)
+{
+        xfs_ioend_t     *ioend = iocb->private;
+        /*
+         * Non-NULL private data means we need to issue a transaction to
+         * convert a range from unwritten to written extents.  This needs
+         * to happen from process context but aio+dio I/O completion
+         * happens from irq context so we need to defer it to a workqueue.
+         * This is not necessary for synchronous direct I/O, but we do
+         * it anyway to keep the code uniform and simpler.
+         *
+         * The core direct I/O code might be changed to always call the
+         * completion handler in the future, in which case all this can
+         * go away.
+         */
+        if (private && size > 0) {
+                ioend->io_offset = offset;
+                ioend->io_size = size;
+                xfs_finish_ioend(ioend);
+        } else {
+                ASSERT(size >= 0);
+                xfs_destroy_ioend(ioend);
+        }
+        /*
+         * blockdev_direct_IO can return an error even after the I/O
+         * completion handler was called.  Thus we need to protect
+         * against double-freeing.
+         */
+        iocb->private = NULL;
+}
 STATIC ssize_t
 linvfs_direct_IO(
        int                     rw,
@@ -1042,16 +1097,23 @@ linvfs_direct_IO(
        xfs_iomap_t     iomap;
        int             maps = 1;
        int             error;
+        ssize_t         ret;
        VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
        if (error)
                return -error;
-        return blockdev_direct_IO_own_locking(rw, iocb, inode,
+        iocb->private = xfs_alloc_ioend(inode);
+        ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
                iomap.iomap_target->pbr_bdev,
                iov, offset, nr_segs,
                linvfs_get_blocks_direct,
-                linvfs_unwritten_convert_direct);
+                linvfs_end_io_direct);
+        if (unlikely(ret <= 0 && iocb->private))
+                xfs_destroy_ioend(iocb->private);
+        return ret;
 }
@@ -1202,6 +1264,16 @@ out_unlock:
        return error;
 }
+STATIC int
+linvfs_invalidate_page(
+        struct page             *page,
+        unsigned long           offset)
+{
+        xfs_page_trace(XFS_INVALIDPAGE_ENTER,
+                        page->mapping->host, page, offset);
+        return block_invalidatepage(page, offset);
+}
 /*
 * Called to move a page into cleanable state - and from there
 * to be released. Possibly the page is already clean. We always
@@ -1279,6 +1351,7 @@ struct address_space_operations linvfs_aops = {
        .writepage              = linvfs_writepage,
        .sync_page              = block_sync_page,
        .releasepage            = linvfs_release_page,
+        .invalidatepage         = linvfs_invalidate_page,
        .prepare_write          = linvfs_prepare_write,
        .commit_write           = generic_commit_write,
        .bmap                   = linvfs_bmap,
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
new file mode 100644
index 000000000000..2fa62974a04d
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2005 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+#ifndef __XFS_AOPS_H__
+#define __XFS_AOPS_H__
+extern struct workqueue_struct *xfsdatad_workqueue;
+extern mempool_t *xfs_ioend_pool;
+typedef void (*xfs_ioend_func_t)(void *);
+typedef struct xfs_ioend {
+        unsigned int            io_uptodate;    /* I/O status register */
+        atomic_t                io_remaining;   /* hold count */
+        struct vnode            *io_vnode;      /* file being written to */
+        struct buffer_head      *io_buffer_head;/* buffer linked list head */
+        size_t                  io_size;        /* size of the extent */
+        xfs_off_t               io_offset;      /* offset in the file */
+        struct work_struct      io_work;        /* xfsdatad work queue */
+} xfs_ioend_t;
+#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index df0cba239dd5..655bf4a78afe 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -54,6 +54,7 @@
 #include <linux/percpu.h>
 #include <linux/blkdev.h>
 #include <linux/hash.h>
+#include <linux/kthread.h>
 #include "xfs_linux.h"
@@ -67,7 +68,7 @@ STATIC int xfsbufd_wakeup(int, unsigned int);
 STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
 STATIC struct workqueue_struct *xfslogd_workqueue;
-STATIC struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsdatad_workqueue;
 /*
 * Pagebuf debugging
@@ -590,8 +591,10 @@ found:
                PB_SET_OWNER(pb);
        }
-        if (pb->pb_flags & PBF_STALE)
+        if (pb->pb_flags & PBF_STALE) {
+                ASSERT((pb->pb_flags & _PBF_DELWRI_Q) == 0);
                pb->pb_flags &= PBF_MAPPED;
+        }
        PB_TRACE(pb, "got_lock", 0);
        XFS_STATS_INC(pb_get_locked);
        return (pb);
@@ -700,25 +703,6 @@ xfs_buf_read_flags(
 }
 /*
- * Create a skeletal pagebuf (no pages associated with it).
- */
-xfs_buf_t *
-pagebuf_lookup(
-        xfs_buftarg_t           *target,
-        loff_t                  ioff,
-        size_t                  isize,
-        page_buf_flags_t        flags)
-{
-        xfs_buf_t               *pb;
-        pb = pagebuf_allocate(flags);
-        if (pb) {
-                _pagebuf_initialize(pb, target, ioff, isize, flags);
-        }
-        return pb;
-}
-/*
 * If we are not low on memory then do the readahead in a deadlock
 * safe manner.
 */
@@ -913,22 +897,23 @@ pagebuf_rele(
                        do_free = 0;
                }
-                if (pb->pb_flags & PBF_DELWRI) {
+                if (pb->pb_flags & PBF_FS_MANAGED) {
-                        pb->pb_flags |= PBF_ASYNC;
-                        atomic_inc(&pb->pb_hold);
-                        pagebuf_delwri_queue(pb, 0);
-                        do_free = 0;
-                } else if (pb->pb_flags & PBF_FS_MANAGED) {
                        do_free = 0;
                }
                if (do_free) {
+                        ASSERT((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == 0);
                        list_del_init(&pb->pb_hash_list);
                        spin_unlock(&hash->bh_lock);
                        pagebuf_free(pb);
                } else {
                        spin_unlock(&hash->bh_lock);
                }
+        } else {
+                /*
+                 * Catch reference count leaks
+                 */
+                ASSERT(atomic_read(&pb->pb_hold) >= 0);
        }
 }
@@ -1006,13 +991,24 @@ pagebuf_lock(
 *      pagebuf_unlock
 *
 *      pagebuf_unlock releases the lock on the buffer object created by
- *      pagebuf_lock or pagebuf_cond_lock (not any
+ *      pagebuf_lock or pagebuf_cond_lock (not any pinning of underlying pages
- *      pinning of underlying pages created by pagebuf_pin).
+ *      created by pagebuf_pin).
+ *
+ *      If the buffer is marked delwri but is not queued, do so before we
+ *      unlock the buffer as we need to set flags correctly. We also need to
+ *      take a reference for the delwri queue because the unlocker is going to
+ *      drop theirs and they don't know we just queued it.
 */
 void
 pagebuf_unlock(                         /* unlock buffer                */
        xfs_buf_t               *pb)    /* buffer to unlock             */
 {
+        if ((pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q)) == PBF_DELWRI) {
+                atomic_inc(&pb->pb_hold);
+                pb->pb_flags |= PBF_ASYNC;
+                pagebuf_delwri_queue(pb, 0);
+        }
        PB_CLEAR_OWNER(pb);
        up(&pb->pb_sema);
        PB_TRACE(pb, "unlock", 0);
@@ -1249,8 +1245,8 @@ bio_end_io_pagebuf(
        int                     error)
 {
        xfs_buf_t               *pb = (xfs_buf_t *)bio->bi_private;
-        unsigned int            i, blocksize = pb->pb_target->pbr_bsize;
+        unsigned int            blocksize = pb->pb_target->pbr_bsize;
-        struct bio_vec          *bvec = bio->bi_io_vec;
+        struct bio_vec          *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
        if (bio->bi_size)
                return 1;
@@ -1258,10 +1254,12 @@ bio_end_io_pagebuf(
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                pb->pb_error = EIO;
-        for (i = 0; i < bio->bi_vcnt; i++, bvec++) {
+        do {
                struct page     *page = bvec->bv_page;
-                if (pb->pb_error) {
+                if (unlikely(pb->pb_error)) {
+                        if (pb->pb_flags & PBF_READ)
+                                ClearPageUptodate(page);
                        SetPageError(page);
                } else if (blocksize == PAGE_CACHE_SIZE) {
                        SetPageUptodate(page);
@@ -1270,10 +1268,13 @@ bio_end_io_pagebuf(
                        set_page_region(page, bvec->bv_offset, bvec->bv_len);
                }
+                if (--bvec >= bio->bi_io_vec)
+                        prefetchw(&bvec->bv_page->flags);
                if (_pagebuf_iolocked(pb)) {
                        unlock_page(page);
                }
-        }
+        } while (bvec >= bio->bi_io_vec);
        _pagebuf_iodone(pb, 1);
        bio_put(bio);
@@ -1511,6 +1512,11 @@ again:
                        ASSERT(btp == bp->pb_target);
                        if (!(bp->pb_flags & PBF_FS_MANAGED)) {
                                spin_unlock(&hash->bh_lock);
+                                /*
+                                 * Catch superblock reference count leaks
+                                 * immediately
+                                 */
+                                BUG_ON(bp->pb_bn == 0);
                                delay(100);
                                goto again;
                        }
@@ -1686,17 +1692,20 @@ pagebuf_delwri_queue(
        int                     unlock)
 {
        PB_TRACE(pb, "delwri_q", (long)unlock);
-        ASSERT(pb->pb_flags & PBF_DELWRI);
+        ASSERT((pb->pb_flags & (PBF_DELWRI|PBF_ASYNC)) ==
+                                        (PBF_DELWRI|PBF_ASYNC));
        spin_lock(&pbd_delwrite_lock);
        /* If already in the queue, dequeue and place at tail */
        if (!list_empty(&pb->pb_list)) {
+                ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
                if (unlock) {
                        atomic_dec(&pb->pb_hold);
                }
                list_del(&pb->pb_list);
        }
+        pb->pb_flags |= _PBF_DELWRI_Q;
        list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
        pb->pb_queuetime = jiffies;
        spin_unlock(&pbd_delwrite_lock);
@@ -1713,10 +1722,11 @@ pagebuf_delwri_dequeue(
        spin_lock(&pbd_delwrite_lock);
        if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
+                ASSERT(pb->pb_flags & _PBF_DELWRI_Q);
                list_del_init(&pb->pb_list);
                dequeued = 1;
        }
-        pb->pb_flags &= ~PBF_DELWRI;
+        pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
        spin_unlock(&pbd_delwrite_lock);
        if (dequeued)
@@ -1733,9 +1743,7 @@ pagebuf_runall_queues(
 }
 /* Defines for pagebuf daemon */
-STATIC DECLARE_COMPLETION(xfsbufd_done);
 STATIC struct task_struct *xfsbufd_task;
-STATIC int xfsbufd_active;
 STATIC int xfsbufd_force_flush;
 STATIC int xfsbufd_force_sleep;
@@ -1761,14 +1769,8 @@ xfsbufd(
        xfs_buftarg_t           *target;
        xfs_buf_t               *pb, *n;
-        /*  Set up the thread  */
-        daemonize("xfsbufd");
        current->flags |= PF_MEMALLOC;
-        xfsbufd_task = current;
-        xfsbufd_active = 1;
-        barrier();
        INIT_LIST_HEAD(&tmp);
        do {
                if (unlikely(freezing(current))) {
@@ -1795,7 +1797,7 @@ xfsbufd(
                                        break;
                                }
-                                pb->pb_flags &= ~PBF_DELWRI;
+                                pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
                                pb->pb_flags |= PBF_WRITE;
                                list_move(&pb->pb_list, &tmp);
                        }
@@ -1816,9 +1818,9 @@ xfsbufd(
                        purge_addresses();
                xfsbufd_force_flush = 0;
-        } while (xfsbufd_active);
+        } while (!kthread_should_stop());
-        complete_and_exit(&xfsbufd_done, 0);
+        return 0;
 }
 /*
@@ -1845,15 +1847,13 @@ xfs_flush_buftarg(
                if (pb->pb_target != target)
                        continue;
-                ASSERT(pb->pb_flags & PBF_DELWRI);
+                ASSERT(pb->pb_flags & (PBF_DELWRI|_PBF_DELWRI_Q));
                PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
                if (pagebuf_ispin(pb)) {
                        pincount++;
                        continue;
                }
-                pb->pb_flags &= ~PBF_DELWRI;
-                pb->pb_flags |= PBF_WRITE;
                list_move(&pb->pb_list, &tmp);
        }
        spin_unlock(&pbd_delwrite_lock);
@@ -1862,12 +1862,14 @@ xfs_flush_buftarg(
         * Dropped the delayed write list lock, now walk the temporary list
         */
        list_for_each_entry_safe(pb, n, &tmp, pb_list) {
+                pagebuf_lock(pb);
+                pb->pb_flags &= ~(PBF_DELWRI|_PBF_DELWRI_Q);
+                pb->pb_flags |= PBF_WRITE;
                if (wait)
                        pb->pb_flags &= ~PBF_ASYNC;
                else
                        list_del_init(&pb->pb_list);
-                pagebuf_lock(pb);
                pagebuf_iostrategy(pb);
        }
@@ -1901,9 +1903,11 @@ xfs_buf_daemons_start(void)
        if (!xfsdatad_workqueue)
                goto out_destroy_xfslogd_workqueue;
-        error = kernel_thread(xfsbufd, NULL, CLONE_FS|CLONE_FILES);
+        xfsbufd_task = kthread_run(xfsbufd, NULL, "xfsbufd");
-        if (error < 0)
+        if (IS_ERR(xfsbufd_task)) {
+                error = PTR_ERR(xfsbufd_task);
                goto out_destroy_xfsdatad_workqueue;
+        }
        return 0;
 out_destroy_xfsdatad_workqueue:
@@ -1920,10 +1924,7 @@ xfs_buf_daemons_start(void)
 STATIC void
 xfs_buf_daemons_stop(void)
 {
-        xfsbufd_active = 0;
+        kthread_stop(xfsbufd_task);
-        barrier();
-        wait_for_completion(&xfsbufd_done);
        destroy_workqueue(xfslogd_workqueue);
        destroy_workqueue(xfsdatad_workqueue);
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 3f8f69a66aea..67c19f799232 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -89,6 +89,7 @@ typedef enum page_buf_flags_e {		/* pb_flags values */
        _PBF_PAGE_CACHE = (1 << 17),/* backed by pagecache                 */
        _PBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc()              */
        _PBF_RUN_QUEUES = (1 << 19),/* run block device task queue         */
+        _PBF_DELWRI_Q = (1 << 21),   /* buffer on delwri queue             */
 } page_buf_flags_t;
 #define PBF_UPDATE (PBF_READ | PBF_WRITE)
@@ -206,13 +207,6 @@ extern xfs_buf_t *xfs_buf_read_flags(	/* allocate and read a buffer	*/
 #define xfs_buf_read(target, blkno, len, flags) \
        xfs_buf_read_flags((target), (blkno), (len), PBF_LOCK | PBF_MAPPED)
-extern xfs_buf_t *pagebuf_lookup(
-                xfs_buftarg_t *,
-                loff_t,                 /* starting offset of range     */
-                size_t,                 /* length of range              */
-                page_buf_flags_t);      /* PBF_READ, PBF_WRITE,         */
-                                        /* PBF_FORCEIO,                 */
 extern xfs_buf_t *pagebuf_get_empty(    /* allocate pagebuf struct with */
                                        /*  no memory or disk address   */
                size_t len,
@@ -344,8 +338,6 @@ extern void pagebuf_trace(
 /* These are just for xfs_syncsub... it sets an internal variable
 * then passes it to VOP_FLUSH_PAGES or adds the flags to a newly gotten buf_t
 */
@@ -452,7 +444,7 @@ extern void pagebuf_trace(
 #define XFS_BUF_PTR(bp)         (xfs_caddr_t)((bp)->pb_addr)
-extern inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
+static inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
 {
        if (bp->pb_flags & PBF_MAPPED)
                return XFS_BUF_PTR(bp) + offset;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index f1ce4323f56e..3881622bcf08 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -311,6 +311,31 @@ linvfs_fsync(
 #define nextdp(dp)      ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen))
+#ifdef CONFIG_XFS_DMAPI
+STATIC struct page *
+linvfs_filemap_nopage(
+        struct vm_area_struct   *area,
+        unsigned long           address,
+        int                     *type)
+{
+        struct inode    *inode = area->vm_file->f_dentry->d_inode;
+        vnode_t         *vp = LINVFS_GET_VP(inode);
+        xfs_mount_t     *mp = XFS_VFSTOM(vp->v_vfsp);
+        int             error;
+        ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI);
+        error = XFS_SEND_MMAP(mp, area, 0);
+        if (error)
+                return NULL;
+        return filemap_nopage(area, address, type);
+}
+#endif /* CONFIG_XFS_DMAPI */
 STATIC int
 linvfs_readdir(
        struct file     *filp,
@@ -390,14 +415,6 @@ done:
        return -error;
 }
-#ifdef CONFIG_XFS_DMAPI
-STATIC void
-linvfs_mmap_close(
-        struct vm_area_struct   *vma)
-{
-        xfs_dm_mm_put(vma);
-}
-#endif /* CONFIG_XFS_DMAPI */
 STATIC int
 linvfs_file_mmap(
@@ -411,16 +428,11 @@ linvfs_file_mmap(
        vma->vm_ops = &linvfs_file_vm_ops;
-        if (vp->v_vfsp->vfs_flag & VFS_DMI) {
-                xfs_mount_t     *mp = XFS_VFSTOM(vp->v_vfsp);
-                error = -XFS_SEND_MMAP(mp, vma, 0);
-                if (error)
-                        return error;
 #ifdef CONFIG_XFS_DMAPI
+        if (vp->v_vfsp->vfs_flag & VFS_DMI) {
                vma->vm_ops = &linvfs_dmapi_file_vm_ops;
-#endif
        }
+#endif /* CONFIG_XFS_DMAPI */
        VOP_SETATTR(vp, &va, XFS_AT_UPDATIME, NULL, error);
        if (!error)
@@ -474,6 +486,7 @@ linvfs_ioctl_invis(
        return error;
 }
+#ifdef CONFIG_XFS_DMAPI
 #ifdef HAVE_VMOP_MPROTECT
 STATIC int
 linvfs_mprotect(
@@ -494,6 +507,7 @@ linvfs_mprotect(
        return error;
 }
 #endif /* HAVE_VMOP_MPROTECT */
+#endif /* CONFIG_XFS_DMAPI */
 #ifdef HAVE_FOP_OPEN_EXEC
 /* If the user is attempting to execute a file that is offline then
@@ -528,49 +542,10 @@ open_exec_out:
 }
 #endif /* HAVE_FOP_OPEN_EXEC */
-/*
- * Temporary workaround to the AIO direct IO write problem.
- * This code can go and we can revert to do_sync_write once
- * the writepage(s) rework is merged.
- */
-STATIC ssize_t
-linvfs_write(
-        struct file     *filp,
-        const char      __user *buf,
-        size_t          len,
-        loff_t          *ppos)
-{
-        struct kiocb    kiocb;
-        ssize_t         ret;
-        init_sync_kiocb(&kiocb, filp);
-        kiocb.ki_pos = *ppos;
-        ret = __linvfs_write(&kiocb, buf, 0, len, kiocb.ki_pos);
-        *ppos = kiocb.ki_pos;
-        return ret;
-}
-STATIC ssize_t
-linvfs_write_invis(
-        struct file     *filp,
-        const char      __user *buf,
-        size_t          len,
-        loff_t          *ppos)
-{
-        struct kiocb    kiocb;
-        ssize_t         ret;
-        init_sync_kiocb(&kiocb, filp);
-        kiocb.ki_pos = *ppos;
-        ret = __linvfs_write(&kiocb, buf, IO_INVIS, len, kiocb.ki_pos);
-        *ppos = kiocb.ki_pos;
-        return ret;
-}
 struct file_operations linvfs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
-        .write          = linvfs_write,
+        .write          = do_sync_write,
        .readv          = linvfs_readv,
        .writev         = linvfs_writev,
        .aio_read       = linvfs_aio_read,
@@ -592,7 +567,7 @@ struct file_operations linvfs_file_operations = {
 struct file_operations linvfs_invis_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
-        .write          = linvfs_write_invis,
+        .write          = do_sync_write,
        .readv          = linvfs_readv_invis,
        .writev         = linvfs_writev_invis,
        .aio_read       = linvfs_aio_read_invis,
@@ -626,8 +601,7 @@ static struct vm_operations_struct linvfs_file_vm_ops = {
 #ifdef CONFIG_XFS_DMAPI
 static struct vm_operations_struct linvfs_dmapi_file_vm_ops = {
-        .close          = linvfs_mmap_close,
+        .nopage         = linvfs_filemap_nopage,
-        .nopage         = filemap_nopage,
        .populate       = filemap_populate,
 #ifdef HAVE_VMOP_MPROTECT
        .mprotect       = linvfs_mprotect,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 05a447e51cc0..6a3326bcd8d0 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -141,13 +141,19 @@ xfs_find_handle(
                return -XFS_ERROR(EINVAL);
        }
-        /* we need the vnode */
+        switch (inode->i_mode & S_IFMT) {
-        vp = LINVFS_GET_VP(inode);
+        case S_IFREG:
-        if (vp->v_type != VREG && vp->v_type != VDIR && vp->v_type != VLNK) {
+        case S_IFDIR:
+        case S_IFLNK:
+                break;
+        default:
                iput(inode);
                return -XFS_ERROR(EBADF);
        }
+        /* we need the vnode */
+        vp = LINVFS_GET_VP(inode);
        /* now we can grab the fsid */
        memcpy(&handle.ha_fsid, vp->v_vfsp->vfs_altfsid, sizeof(xfs_fsid_t));
        hsize = sizeof(xfs_fsid_t);
@@ -386,7 +392,7 @@ xfs_readlink_by_handle(
                return -error;
        /* Restrict this handle operation to symlinks only. */
-        if (vp->v_type != VLNK) {
+        if (!S_ISLNK(inode->i_mode)) {
                VN_RELE(vp);
                return -XFS_ERROR(EINVAL);
        }
@@ -982,10 +988,10 @@ xfs_ioc_space(
        if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
                return -XFS_ERROR(EPERM);
-        if (!(filp->f_flags & FMODE_WRITE))
+        if (!(filp->f_mode & FMODE_WRITE))
                return -XFS_ERROR(EBADF);
-        if (vp->v_type != VREG)
+        if (!VN_ISREG(vp))
                return -XFS_ERROR(EINVAL);
        if (copy_from_user(&bf, arg, sizeof(bf)))
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index 0f8f1384eb36..4636b7f86f1f 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -47,8 +47,52 @@
 #include "xfs_vnode.h"
 #include "xfs_dfrag.h"
+#define  _NATIVE_IOC(cmd, type) \
+          _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
 #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
 #define BROKEN_X86_ALIGNMENT
+/* on ia32 l_start is on a 32-bit boundary */
+typedef struct xfs_flock64_32 {
+        __s16           l_type;
+        __s16           l_whence;
+        __s64           l_start __attribute__((packed));
+                        /* len == 0 means until end of file */
+        __s64           l_len __attribute__((packed));
+        __s32           l_sysid;
+        __u32           l_pid;
+        __s32           l_pad[4];       /* reserve area */
+} xfs_flock64_32_t;
+#define XFS_IOC_ALLOCSP_32      _IOW ('X', 10, struct xfs_flock64_32)
+#define XFS_IOC_FREESP_32       _IOW ('X', 11, struct xfs_flock64_32)
+#define XFS_IOC_ALLOCSP64_32    _IOW ('X', 36, struct xfs_flock64_32)
+#define XFS_IOC_FREESP64_32     _IOW ('X', 37, struct xfs_flock64_32)
+#define XFS_IOC_RESVSP_32       _IOW ('X', 40, struct xfs_flock64_32)
+#define XFS_IOC_UNRESVSP_32     _IOW ('X', 41, struct xfs_flock64_32)
+#define XFS_IOC_RESVSP64_32     _IOW ('X', 42, struct xfs_flock64_32)
+#define XFS_IOC_UNRESVSP64_32   _IOW ('X', 43, struct xfs_flock64_32)
+/* just account for different alignment */
+STATIC unsigned long
+xfs_ioctl32_flock(
+        unsigned long           arg)
+{
+        xfs_flock64_32_t        __user *p32 = (void __user *)arg;
+        xfs_flock64_t           __user *p = compat_alloc_user_space(sizeof(*p));
+        if (copy_in_user(&p->l_type,    &p32->l_type,   sizeof(s16)) ||
+            copy_in_user(&p->l_whence,  &p32->l_whence, sizeof(s16)) ||
+            copy_in_user(&p->l_start,   &p32->l_start,  sizeof(s64)) ||
+            copy_in_user(&p->l_len,     &p32->l_len,    sizeof(s64)) ||
+            copy_in_user(&p->l_sysid,   &p32->l_sysid,  sizeof(s32)) ||
+            copy_in_user(&p->l_pid,     &p32->l_pid,    sizeof(u32)) ||
+            copy_in_user(&p->l_pad,     &p32->l_pad,    4*sizeof(u32)))
+                return -EFAULT;
+        
+        return (unsigned long)p;
+}
 #else
 typedef struct xfs_fsop_bulkreq32 {
@@ -103,7 +147,6 @@ __linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
 /* not handled
        case XFS_IOC_FD_TO_HANDLE:
        case XFS_IOC_PATH_TO_HANDLE:
-        case XFS_IOC_PATH_TO_HANDLE:
        case XFS_IOC_PATH_TO_FSHANDLE:
        case XFS_IOC_OPEN_BY_HANDLE:
        case XFS_IOC_FSSETDM_BY_HANDLE:
@@ -124,8 +167,21 @@ __linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
        case XFS_IOC_ERROR_CLEARALL:
                break;
-#ifndef BROKEN_X86_ALIGNMENT
+#ifdef BROKEN_X86_ALIGNMENT
-        /* xfs_flock_t and xfs_bstat_t have wrong u32 vs u64 alignment */
+        /* xfs_flock_t has wrong u32 vs u64 alignment */
+        case XFS_IOC_ALLOCSP_32:
+        case XFS_IOC_FREESP_32:
+        case XFS_IOC_ALLOCSP64_32:
+        case XFS_IOC_FREESP64_32:
+        case XFS_IOC_RESVSP_32:
+        case XFS_IOC_UNRESVSP_32:
+        case XFS_IOC_RESVSP64_32:
+        case XFS_IOC_UNRESVSP64_32:
+                arg = xfs_ioctl32_flock(arg);
+                cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
+                break;
+#else /* These are handled fine if no alignment issues */
        case XFS_IOC_ALLOCSP:
        case XFS_IOC_FREESP:
        case XFS_IOC_RESVSP:
@@ -134,6 +190,9 @@ __linvfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg)
        case XFS_IOC_FREESP64:
        case XFS_IOC_RESVSP64:
        case XFS_IOC_UNRESVSP64:
+                break;
+        /* xfs_bstat_t still has wrong u32 vs u64 alignment */
        case XFS_IOC_SWAPEXT:
                break;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index f252605514eb..77708a8c9f87 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -140,7 +140,6 @@ linvfs_mknod(
        memset(&va, 0, sizeof(va));
        va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
-        va.va_type = IFTOVT(mode);
        va.va_mode = mode;
        switch (mode & S_IFMT) {
@@ -308,14 +307,13 @@ linvfs_symlink(
        cvp = NULL;
        memset(&va, 0, sizeof(va));
-        va.va_type = VLNK;
+        va.va_mode = S_IFLNK |
-        va.va_mode = irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO;
+                (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
        va.va_mask = XFS_AT_TYPE|XFS_AT_MODE;
        error = 0;
        VOP_SYMLINK(dvp, dentry, &va, (char *)symname, &cvp, NULL, error);
        if (!error && cvp) {
-                ASSERT(cvp->v_type == VLNK);
                ip = LINVFS_GET_IP(cvp);
                d_instantiate(dentry, ip);
                validate_fields(dir);
@@ -425,9 +423,14 @@ linvfs_follow_link(
        return NULL;
 }
-static void linvfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p)
+STATIC void
+linvfs_put_link(
+        struct dentry   *dentry,
+        struct nameidata *nd,
+        void            *p)
 {
-        char *s = nd_get_link(nd);
+        char            *s = nd_get_link(nd);
        if (!IS_ERR(s))
                kfree(s);
 }
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 42dc5e4662ed..68c5d885ed9c 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -64,7 +64,6 @@
 #include <sema.h>
 #include <time.h>
-#include <support/qsort.h>
 #include <support/ktrace.h>
 #include <support/debug.h>
 #include <support/move.h>
@@ -104,6 +103,7 @@
 #include <xfs_stats.h>
 #include <xfs_sysctl.h>
 #include <xfs_iops.h>
+#include <xfs_aops.h>
 #include <xfs_super.h>
 #include <xfs_globals.h>
 #include <xfs_fs_subr.h>
@@ -254,11 +254,18 @@ static inline void set_buffer_unwritten_io(struct buffer_head *bh)
 #define MAX(a,b)        (max(a,b))
 #define howmany(x, y)   (((x)+((y)-1))/(y))
 #define roundup(x, y)   ((((x)+((y)-1))/(y))*(y))
+#define qsort(a,n,s,fn) sort(a,n,s,fn,NULL)
+/*
+ * Various platform dependent calls that don't fit anywhere else
+ */
 #define xfs_stack_trace()       dump_stack()
 #define xfs_itruncate_data(ip, off)     \
        (-vmtruncate(LINVFS_GET_IP(XFS_ITOV(ip)), (off)))
+#define xfs_statvfs_fsid(statp, mp)     \
+        ({ u64 id = huge_encode_dev((mp)->m_dev);       \
+           __kernel_fsid_t *fsid = &(statp)->f_fsid;    \
+        (fsid->val[0] = (u32)id, fsid->val[1] = (u32)(id >> 32)); })
 /* Move the kernel do_div definition off to one side */
@@ -371,6 +378,4 @@ static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
        return(x * y);
 }
-#define qsort(a, n, s, cmp) sort(a, n, s, cmp, NULL)
 #endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index acab58c48043..3b5fabe8dae9 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -660,9 +660,6 @@ xfs_write(
                        (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
-                if (ioflags & IO_ISAIO)
-                        return XFS_ERROR(-ENOSYS);
                if ((pos & target->pbr_smask) || (count & target->pbr_smask))
                        return XFS_ERROR(-EINVAL);
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index f197a720e394..6294dcdb797c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -70,9 +70,10 @@ struct xfs_iomap;
 #define XFS_SENDFILE_ENTER      21
 #define XFS_WRITEPAGE_ENTER     22
 #define XFS_RELEASEPAGE_ENTER   23
-#define XFS_IOMAP_ALLOC_ENTER   24
+#define XFS_INVALIDPAGE_ENTER   24
-#define XFS_IOMAP_ALLOC_MAP     25
+#define XFS_IOMAP_ALLOC_ENTER   25
-#define XFS_IOMAP_UNWRITTEN     26
+#define XFS_IOMAP_ALLOC_MAP     26
+#define XFS_IOMAP_UNWRITTEN     27
 extern void xfs_rw_enter_trace(int, struct xfs_iocore *,
                                void *, size_t, loff_t, int);
 extern void xfs_inval_cached_trace(struct xfs_iocore *,
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f6dd7de25927..0da87bfc9999 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -70,11 +70,15 @@
 #include <linux/namei.h>
 #include <linux/init.h>
 #include <linux/mount.h>
+#include <linux/mempool.h>
 #include <linux/writeback.h>
+#include <linux/kthread.h>
 STATIC struct quotactl_ops linvfs_qops;
 STATIC struct super_operations linvfs_sops;
-STATIC kmem_zone_t *linvfs_inode_zone;
+STATIC kmem_zone_t *xfs_vnode_zone;
+STATIC kmem_zone_t *xfs_ioend_zone;
+mempool_t *xfs_ioend_pool;
 STATIC struct xfs_mount_args *
 xfs_args_allocate(
@@ -138,24 +142,25 @@ STATIC __inline__ void
 xfs_set_inodeops(
        struct inode            *inode)
 {
-        vnode_t                 *vp = LINVFS_GET_VP(inode);
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFREG:
-        if (vp->v_type == VNON) {
-                vn_mark_bad(vp);
-        } else if (S_ISREG(inode->i_mode)) {
                inode->i_op = &linvfs_file_inode_operations;
                inode->i_fop = &linvfs_file_operations;
                inode->i_mapping->a_ops = &linvfs_aops;
-        } else if (S_ISDIR(inode->i_mode)) {
+                break;
+        case S_IFDIR:
                inode->i_op = &linvfs_dir_inode_operations;
                inode->i_fop = &linvfs_dir_operations;
-        } else if (S_ISLNK(inode->i_mode)) {
+                break;
+        case S_IFLNK:
                inode->i_op = &linvfs_symlink_inode_operations;
                if (inode->i_blocks)
                        inode->i_mapping->a_ops = &linvfs_aops;
-        } else {
+                break;
+        default:
                inode->i_op = &linvfs_file_inode_operations;
                init_special_inode(inode, inode->i_mode, inode->i_rdev);
+                break;
        }
 }
@@ -167,16 +172,23 @@ xfs_revalidate_inode(
 {
        struct inode            *inode = LINVFS_GET_IP(vp);
-        inode->i_mode   = (ip->i_d.di_mode & MODEMASK) | VTTOIF(vp->v_type);
+        inode->i_mode   = ip->i_d.di_mode;
        inode->i_nlink  = ip->i_d.di_nlink;
        inode->i_uid    = ip->i_d.di_uid;
        inode->i_gid    = ip->i_d.di_gid;
-        if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
+        switch (inode->i_mode & S_IFMT) {
+        case S_IFBLK:
+        case S_IFCHR:
+                inode->i_rdev =
+                        MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+                              sysv_minor(ip->i_df.if_u2.if_rdev));
+                break;
+        default:
                inode->i_rdev = 0;
-        } else {
+                break;
-                xfs_dev_t dev = ip->i_df.if_u2.if_rdev;
-                inode->i_rdev = MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
        }
        inode->i_blksize = PAGE_CACHE_SIZE;
        inode->i_generation = ip->i_d.di_gen;
        i_size_write(inode, ip->i_d.di_size);
@@ -231,7 +243,6 @@ xfs_initialize_vnode(
         * finish our work.
         */
        if (ip->i_d.di_mode != 0 && unlock && (inode->i_state & I_NEW)) {
-                vp->v_type = IFTOVT(ip->i_d.di_mode);
                xfs_revalidate_inode(XFS_BHVTOM(bdp), vp, ip);
                xfs_set_inodeops(inode);
        
@@ -274,8 +285,7 @@ linvfs_alloc_inode(
 {
        vnode_t                 *vp;
-        vp = (vnode_t *)kmem_cache_alloc(linvfs_inode_zone, 
+        vp = kmem_cache_alloc(xfs_vnode_zone, kmem_flags_convert(KM_SLEEP));
-                kmem_flags_convert(KM_SLEEP));
        if (!vp)
                return NULL;
        return LINVFS_GET_IP(vp);
@@ -285,11 +295,11 @@ STATIC void
 linvfs_destroy_inode(
        struct inode            *inode)
 {
-        kmem_cache_free(linvfs_inode_zone, LINVFS_GET_VP(inode));
+        kmem_zone_free(xfs_vnode_zone, LINVFS_GET_VP(inode));
 }
 STATIC void
-init_once(
+linvfs_inode_init_once(
        void                    *data,
        kmem_cache_t            *cachep,
        unsigned long           flags)
@@ -302,21 +312,41 @@ init_once(
 }
 STATIC int
-init_inodecache( void )
+linvfs_init_zones(void)
 {
-        linvfs_inode_zone = kmem_cache_create("linvfs_icache",
+        xfs_vnode_zone = kmem_cache_create("xfs_vnode",
                                sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT,
-                                init_once, NULL);
+                                linvfs_inode_init_once, NULL);
-        if (linvfs_inode_zone == NULL)
+        if (!xfs_vnode_zone)
-                return -ENOMEM;
+                goto out;
+        xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+        if (!xfs_ioend_zone)
+                goto out_destroy_vnode_zone;
+        xfs_ioend_pool = mempool_create(4 * MAX_BUF_PER_PAGE,
+                        mempool_alloc_slab, mempool_free_slab,
+                        xfs_ioend_zone);
+        if (!xfs_ioend_pool)
+                goto out_free_ioend_zone;
        return 0;
+ out_free_ioend_zone:
+        kmem_zone_destroy(xfs_ioend_zone);
+ out_destroy_vnode_zone:
+        kmem_zone_destroy(xfs_vnode_zone);
+ out:
+        return -ENOMEM;
 }
 STATIC void
-destroy_inodecache( void )
+linvfs_destroy_zones(void)
 {
-        if (kmem_cache_destroy(linvfs_inode_zone))
+        mempool_destroy(xfs_ioend_pool);
-                printk(KERN_WARNING "%s: cache still in use!\n", __FUNCTION__);
+        kmem_zone_destroy(xfs_vnode_zone);
+        kmem_zone_destroy(xfs_ioend_zone);
 }
 /*
@@ -354,17 +384,38 @@ linvfs_clear_inode(
        struct inode            *inode)
 {
        vnode_t                 *vp = LINVFS_GET_VP(inode);
+        int                     error, cache;
-        if (vp) {
+        vn_trace_entry(vp, "clear_inode", (inst_t *)__return_address);
-                vn_rele(vp);
-                vn_trace_entry(vp, __FUNCTION__, (inst_t *)__return_address);
+        XFS_STATS_INC(vn_rele);
-                /*
+        XFS_STATS_INC(vn_remove);
-                 * Do all our cleanup, and remove this vnode.
+        XFS_STATS_INC(vn_reclaim);
-                 */
+        XFS_STATS_DEC(vn_active);
-                vn_remove(vp);
+        /*
+         * This can happen because xfs_iget_core calls xfs_idestroy if we
+         * find an inode with di_mode == 0 but without IGET_CREATE set.
+         */
+        if (vp->v_fbhv)
+                VOP_INACTIVE(vp, NULL, cache);
+        VN_LOCK(vp);
+        vp->v_flag &= ~VMODIFIED;
+        VN_UNLOCK(vp, 0);
+        if (vp->v_fbhv) {
+                VOP_RECLAIM(vp, error);
+                if (error)
+                        panic("vn_purge: cannot reclaim");
        }
-}
+        ASSERT(vp->v_fbhv == NULL);
+#ifdef XFS_VNODE_TRACE
+        ktrace_free(vp->v_trace);
+#endif
+}
 /*
 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
@@ -466,25 +517,16 @@ xfssyncd(
 {
        long                    timeleft;
        vfs_t                   *vfsp = (vfs_t *) arg;
-        struct list_head        tmp;
        struct vfs_sync_work    *work, *n;
+        LIST_HEAD               (tmp);
-        daemonize("xfssyncd");
-        vfsp->vfs_sync_work.w_vfs = vfsp;
-        vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
-        vfsp->vfs_sync_task = current;
-        wmb();
-        wake_up(&vfsp->vfs_wait_sync_task);
-        INIT_LIST_HEAD(&tmp);
        timeleft = (xfs_syncd_centisecs * HZ) / 100;
        for (;;) {
                set_current_state(TASK_INTERRUPTIBLE);
                timeleft = schedule_timeout(timeleft);
                /* swsusp */
                try_to_freeze();
-                if (vfsp->vfs_flag & VFS_UMOUNT)
+                if (kthread_should_stop())
                        break;
                spin_lock(&vfsp->vfs_sync_lock);
@@ -513,10 +555,6 @@ xfssyncd(
                }
        }
-        vfsp->vfs_sync_task = NULL;
-        wmb();
-        wake_up(&vfsp->vfs_wait_sync_task);
        return 0;
 }
@@ -524,13 +562,11 @@ STATIC int
 linvfs_start_syncd(
        vfs_t                   *vfsp)
 {
-        int                     pid;
+        vfsp->vfs_sync_work.w_syncer = vfs_sync_worker;
+        vfsp->vfs_sync_work.w_vfs = vfsp;
-        pid = kernel_thread(xfssyncd, (void *) vfsp,
+        vfsp->vfs_sync_task = kthread_run(xfssyncd, vfsp, "xfssyncd");
-                        CLONE_VM | CLONE_FS | CLONE_FILES);
+        if (IS_ERR(vfsp->vfs_sync_task))
-        if (pid < 0)
+                return -PTR_ERR(vfsp->vfs_sync_task);
-                return -pid;
-        wait_event(vfsp->vfs_wait_sync_task, vfsp->vfs_sync_task);
        return 0;
 }
@@ -538,11 +574,7 @@ STATIC void
 linvfs_stop_syncd(
        vfs_t                   *vfsp)
 {
-        vfsp->vfs_flag |= VFS_UMOUNT;
+        kthread_stop(vfsp->vfs_sync_task);
-        wmb();
-        wake_up_process(vfsp->vfs_sync_task);
-        wait_event(vfsp->vfs_wait_sync_task, !vfsp->vfs_sync_task);
 }
 STATIC void
@@ -866,9 +898,9 @@ init_xfs_fs( void )
        ktrace_init(64);
-        error = init_inodecache();
+        error = linvfs_init_zones();
        if (error < 0)
-                goto undo_inodecache;
+                goto undo_zones;
        error = pagebuf_init();
        if (error < 0)
@@ -889,9 +921,9 @@ undo_register:
        pagebuf_terminate();
 undo_pagebuf:
-        destroy_inodecache();
+        linvfs_destroy_zones();
-undo_inodecache:
+undo_zones:
        return error;
 }
@@ -903,7 +935,7 @@ exit_xfs_fs( void )
        unregister_filesystem(&xfs_fs_type);
        xfs_cleanup();
        pagebuf_terminate();
-        destroy_inodecache();
+        linvfs_destroy_zones();
        ktrace_uninit();
 }
diff --git a/fs/xfs/linux-2.6/xfs_vfs.c b/fs/xfs/linux-2.6/xfs_vfs.c
index 669c61644959..34cc902ec119 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.c
+++ b/fs/xfs/linux-2.6/xfs_vfs.c
@@ -251,7 +251,6 @@ vfs_allocate( void )
        bhv_head_init(VFS_BHVHEAD(vfsp), "vfs");
        INIT_LIST_HEAD(&vfsp->vfs_sync_list);
        spin_lock_init(&vfsp->vfs_sync_lock);
-        init_waitqueue_head(&vfsp->vfs_wait_sync_task);
        init_waitqueue_head(&vfsp->vfs_wait_single_sync_task);
        return vfsp;
 }
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 7ee1f714e9ba..f0ab574fb47a 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -65,7 +65,6 @@ typedef struct vfs {
        spinlock_t              vfs_sync_lock;  /* work item list lock */
        int                     vfs_sync_seq;   /* sync thread generation no. */
        wait_queue_head_t       vfs_wait_single_sync_task;
-        wait_queue_head_t       vfs_wait_sync_task;
 } vfs_t;
 #define vfs_fbhv                vfs_bh.bh_first /* 1st on vfs behavior chain */
@@ -96,7 +95,6 @@ typedef enum {
 #define VFS_RDONLY              0x0001  /* read-only vfs */
 #define VFS_GRPID               0x0002  /* group-ID assigned from directory */
 #define VFS_DMI                 0x0004  /* filesystem has the DMI enabled */
-#define VFS_UMOUNT              0x0008  /* unmount in progress */
 #define VFS_END                 0x0008  /* max flag */
 #define SYNC_ATTR               0x0001  /* sync attributes */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
index 250cad54e892..268f45bf6a9a 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ b/fs/xfs/linux-2.6/xfs_vnode.c
@@ -42,93 +42,33 @@ DEFINE_SPINLOCK(vnumber_lock);
 */
 #define NVSYNC                  37
 #define vptosync(v)             (&vsync[((unsigned long)v) % NVSYNC])
-sv_t vsync[NVSYNC];
+STATIC wait_queue_head_t vsync[NVSYNC];
-/*
- * Translate stat(2) file types to vnode types and vice versa.
- * Aware of numeric order of S_IFMT and vnode type values.
- */
-enum vtype iftovt_tab[] = {
-        VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
-        VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
-};
-u_short vttoif_tab[] = {
-        0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO, 0, S_IFSOCK
-};
 void
 vn_init(void)
 {
-        register sv_t *svp;
+        int i;
-        register int i;
-        for (svp = vsync, i = 0; i < NVSYNC; i++, svp++)
+        for (i = 0; i < NVSYNC; i++)
-                init_sv(svp, SV_DEFAULT, "vsy", i);
+                init_waitqueue_head(&vsync[i]);
 }
-/*
+void
- * Clean a vnode of filesystem-specific data and prepare it for reuse.
+vn_iowait(
- */
-STATIC int
-vn_reclaim(
        struct vnode    *vp)
 {
-        int             error;
+        wait_queue_head_t *wq = vptosync(vp);
-        XFS_STATS_INC(vn_reclaim);
+        wait_event(*wq, (atomic_read(&vp->v_iocount) == 0));
-        vn_trace_entry(vp, "vn_reclaim", (inst_t *)__return_address);
-        /*
-         * Only make the VOP_RECLAIM call if there are behaviors
-         * to call.
-         */
-        if (vp->v_fbhv) {
-                VOP_RECLAIM(vp, error);
-                if (error)
-                        return -error;
-        }
-        ASSERT(vp->v_fbhv == NULL);
-        VN_LOCK(vp);
-        vp->v_flag &= (VRECLM|VWAIT);
-        VN_UNLOCK(vp, 0);
-        vp->v_type = VNON;
-        vp->v_fbhv = NULL;
-#ifdef XFS_VNODE_TRACE
-        ktrace_free(vp->v_trace);
-        vp->v_trace = NULL;
-#endif
-        return 0;
-}
-STATIC void
-vn_wakeup(
-        struct vnode    *vp)
-{
-        VN_LOCK(vp);
-        if (vp->v_flag & VWAIT)
-                sv_broadcast(vptosync(vp));
-        vp->v_flag &= ~(VRECLM|VWAIT|VMODIFIED);
-        VN_UNLOCK(vp, 0);
 }
-int
+void
-vn_wait(
+vn_iowake(
        struct vnode    *vp)
 {
-        VN_LOCK(vp);
+        if (atomic_dec_and_test(&vp->v_iocount))
-        if (vp->v_flag & (VINACT | VRECLM)) {
+                wake_up(vptosync(vp));
-                vp->v_flag |= VWAIT;
-                sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
-                return 1;
-        }
-        VN_UNLOCK(vp, 0);
-        return 0;
 }
 struct vnode *
@@ -154,6 +94,8 @@ vn_initialize(
        /* Initialize the first behavior and the behavior chain head. */
        vn_bhv_head_init(VN_BHV_HEAD(vp), "vnode");
+        atomic_set(&vp->v_iocount, 0);
 #ifdef  XFS_VNODE_TRACE
        vp->v_trace = ktrace_alloc(VNODE_TRACE_SIZE, KM_SLEEP);
 #endif  /* XFS_VNODE_TRACE */
@@ -163,30 +105,6 @@ vn_initialize(
 }
 /*
- * Get a reference on a vnode.
- */
-vnode_t *
-vn_get(
-        struct vnode    *vp,
-        vmap_t          *vmap)
-{
-        struct inode    *inode;
-        XFS_STATS_INC(vn_get);
-        inode = LINVFS_GET_IP(vp);
-        if (inode->i_state & I_FREEING)
-                return NULL;
-        inode = ilookup(vmap->v_vfsp->vfs_super, vmap->v_ino);
-        if (!inode)     /* Inode not present */
-                return NULL;
-        vn_trace_exit(vp, "vn_get", (inst_t *)__return_address);
-        return vp;
-}
-/*
 * Revalidate the Linux inode from the vattr.
 * Note: i_size _not_ updated; we must hold the inode
 * semaphore when doing that - callers responsibility.
@@ -198,7 +116,7 @@ vn_revalidate_core(
 {
        struct inode    *inode = LINVFS_GET_IP(vp);
-        inode->i_mode       = VTTOIF(vap->va_type) | vap->va_mode;
+        inode->i_mode       = vap->va_mode;
        inode->i_nlink      = vap->va_nlink;
        inode->i_uid        = vap->va_uid;
        inode->i_gid        = vap->va_gid;
@@ -247,71 +165,6 @@ vn_revalidate(
 }
 /*
- * purge a vnode from the cache
- * At this point the vnode is guaranteed to have no references (vn_count == 0)
- * The caller has to make sure that there are no ways someone could
- * get a handle (via vn_get) on the vnode (usually done via a mount/vfs lock).
- */
-void
-vn_purge(
-        struct vnode    *vp,
-        vmap_t          *vmap)
-{
-        vn_trace_entry(vp, "vn_purge", (inst_t *)__return_address);
-again:
-        /*
-         * Check whether vp has already been reclaimed since our caller
-         * sampled its version while holding a filesystem cache lock that
-         * its VOP_RECLAIM function acquires.
-         */
-        VN_LOCK(vp);
-        if (vp->v_number != vmap->v_number) {
-                VN_UNLOCK(vp, 0);
-                return;
-        }
-        /*
-         * If vp is being reclaimed or inactivated, wait until it is inert,
-         * then proceed.  Can't assume that vnode is actually reclaimed
-         * just because the reclaimed flag is asserted -- a vn_alloc
-         * reclaim can fail.
-         */
-        if (vp->v_flag & (VINACT | VRECLM)) {
-                ASSERT(vn_count(vp) == 0);
-                vp->v_flag |= VWAIT;
-                sv_wait(vptosync(vp), PINOD, &vp->v_lock, 0);
-                goto again;
-        }
-        /*
-         * Another process could have raced in and gotten this vnode...
-         */
-        if (vn_count(vp) > 0) {
-                VN_UNLOCK(vp, 0);
-                return;
-        }
-        XFS_STATS_DEC(vn_active);
-        vp->v_flag |= VRECLM;
-        VN_UNLOCK(vp, 0);
-        /*
-         * Call VOP_RECLAIM and clean vp. The FSYNC_INVAL flag tells
-         * vp's filesystem to flush and invalidate all cached resources.
-         * When vn_reclaim returns, vp should have no private data,
-         * either in a system cache or attached to v_data.
-         */
-        if (vn_reclaim(vp) != 0)
-                panic("vn_purge: cannot reclaim");
-        /*
-         * Wakeup anyone waiting for vp to be reclaimed.
-         */
-        vn_wakeup(vp);
-}
-/*
 * Add a reference to a referenced vnode.
 */
 struct vnode *
@@ -330,80 +183,6 @@ vn_hold(
        return vp;
 }
-/*
- *  Call VOP_INACTIVE on last reference.
- */
-void
-vn_rele(
-        struct vnode    *vp)
-{
-        int             vcnt;
-        int             cache;
-        XFS_STATS_INC(vn_rele);
-        VN_LOCK(vp);
-        vn_trace_entry(vp, "vn_rele", (inst_t *)__return_address);
-        vcnt = vn_count(vp);
-        /*
-         * Since we always get called from put_inode we know
-         * that i_count won't be decremented after we
-         * return.
-         */
-        if (!vcnt) {
-                /*
-                 * As soon as we turn this on, noone can find us in vn_get
-                 * until we turn off VINACT or VRECLM
-                 */
-                vp->v_flag |= VINACT;
-                VN_UNLOCK(vp, 0);
-                /*
-                 * Do not make the VOP_INACTIVE call if there
-                 * are no behaviors attached to the vnode to call.
-                 */
-                if (vp->v_fbhv)
-                        VOP_INACTIVE(vp, NULL, cache);
-                VN_LOCK(vp);
-                if (vp->v_flag & VWAIT)
-                        sv_broadcast(vptosync(vp));
-                vp->v_flag &= ~(VINACT|VWAIT|VRECLM|VMODIFIED);
-        }
-        VN_UNLOCK(vp, 0);
-        vn_trace_exit(vp, "vn_rele", (inst_t *)__return_address);
-}
-/*
- * Finish the removal of a vnode.
- */
-void
-vn_remove(
-        struct vnode    *vp)
-{
-        vmap_t          vmap;
-        /* Make sure we don't do this to the same vnode twice */
-        if (!(vp->v_fbhv))
-                return;
-        XFS_STATS_INC(vn_remove);
-        vn_trace_exit(vp, "vn_remove", (inst_t *)__return_address);
-        /*
-         * After the following purge the vnode
-         * will no longer exist.
-         */
-        VMAP(vp, vmap);
-        vn_purge(vp, &vmap);
-}
 #ifdef  XFS_VNODE_TRACE
 #define KTRACE_ENTER(vp, vk, s, line, ra)                       \
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index a6e57c647be4..35f306cebb87 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -65,10 +65,6 @@ struct vattr;
 struct xfs_iomap;
 struct attrlist_cursor_kern;
-/*
- * Vnode types.  VNON means no type.
- */
-enum vtype      { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VFIFO, VBAD, VSOCK };
 typedef xfs_ino_t vnumber_t;
 typedef struct dentry vname_t;
@@ -77,15 +73,14 @@ typedef bhv_head_t vn_bhv_head_t;
 /*
 * MP locking protocols:
 *      v_flag, v_vfsp                          VN_LOCK/VN_UNLOCK
- *      v_type                                  read-only or fs-dependent
 */
 typedef struct vnode {
        __u32           v_flag;                 /* vnode flags (see below) */
-        enum vtype      v_type;                 /* vnode type */
        struct vfs      *v_vfsp;                /* ptr to containing VFS */
        vnumber_t       v_number;               /* in-core vnode number */
        vn_bhv_head_t   v_bh;                   /* behavior head */
        spinlock_t      v_lock;                 /* VN_LOCK/VN_UNLOCK */
+        atomic_t        v_iocount;              /* outstanding I/O count */
 #ifdef XFS_VNODE_TRACE
        struct ktrace   *v_trace;               /* trace header structure    */
 #endif
@@ -93,6 +88,12 @@ typedef struct vnode {
        /* inode MUST be last */
 } vnode_t;
+#define VN_ISLNK(vp)    S_ISLNK((vp)->v_inode.i_mode)
+#define VN_ISREG(vp)    S_ISREG((vp)->v_inode.i_mode)
+#define VN_ISDIR(vp)    S_ISDIR((vp)->v_inode.i_mode)
+#define VN_ISCHR(vp)    S_ISCHR((vp)->v_inode.i_mode)
+#define VN_ISBLK(vp)    S_ISBLK((vp)->v_inode.i_mode)
 #define v_fbhv                  v_bh.bh_first          /* first behavior */
 #define v_fops                  v_bh.bh_first->bd_ops  /* first behavior ops */
@@ -133,22 +134,8 @@ typedef enum {
 #define LINVFS_GET_IP(vp)       (&(vp)->v_inode)
 /*
- * Convert between vnode types and inode formats (since POSIX.1
- * defines mode word of stat structure in terms of inode formats).
- */
-extern enum vtype       iftovt_tab[];
-extern u_short          vttoif_tab[];
-#define IFTOVT(mode)    (iftovt_tab[((mode) & S_IFMT) >> 12])
-#define VTTOIF(indx)    (vttoif_tab[(int)(indx)])
-#define MAKEIMODE(indx, mode)   (int)(VTTOIF(indx) | (mode))
-/*
 * Vnode flags.
 */
-#define VINACT                 0x1      /* vnode is being inactivated   */
-#define VRECLM                 0x2      /* vnode is being reclaimed     */
-#define VWAIT                  0x4      /* waiting for VINACT/VRECLM to end */
 #define VMODIFIED              0x8      /* XFS inode state possibly differs */
                                        /* to the Linux inode state.    */
@@ -408,7 +395,6 @@ typedef struct vnodeops {
 */
 typedef struct vattr {
        int             va_mask;        /* bit-mask of attributes present */
-        enum vtype      va_type;        /* vnode type (for create) */
        mode_t          va_mode;        /* file access mode and type */
        xfs_nlink_t     va_nlink;       /* number of references to file */
        uid_t           va_uid;         /* owner user id */
@@ -498,27 +484,12 @@ typedef struct vattr {
 * Check whether mandatory file locking is enabled.
 */
 #define MANDLOCK(vp, mode)      \
-        ((vp)->v_type == VREG && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
+        (VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID)
 extern void     vn_init(void);
-extern int      vn_wait(struct vnode *);
 extern vnode_t  *vn_initialize(struct inode *);
 /*
- * Acquiring and invalidating vnodes:
- *
- *      if (vn_get(vp, version, 0))
- *              ...;
- *      vn_purge(vp, version);
- *
- * vn_get and vn_purge must be called with vmap_t arguments, sampled
- * while a lock that the vnode's VOP_RECLAIM function acquires is
- * held, to ensure that the vnode sampled with the lock held isn't
- * recycled (VOP_RECLAIMed) or deallocated between the release of the lock
- * and the subsequent vn_get or vn_purge.
- */
-/*
 * vnode_map structures _must_ match vn_epoch and vnode structure sizes.
 */
 typedef struct vnode_map {
@@ -531,11 +502,11 @@ typedef struct vnode_map {
                         (vmap).v_number = (vp)->v_number,      \
                         (vmap).v_ino    = (vp)->v_inode.i_ino; }
-extern void     vn_purge(struct vnode *, vmap_t *);
-extern vnode_t  *vn_get(struct vnode *, vmap_t *);
 extern int      vn_revalidate(struct vnode *);
 extern void     vn_revalidate_core(struct vnode *, vattr_t *);
-extern void     vn_remove(struct vnode *);
+extern void     vn_iowait(struct vnode *vp);
+extern void     vn_iowake(struct vnode *vp);
 static inline int vn_count(struct vnode *vp)
 {
@@ -546,7 +517,6 @@ static inline int vn_count(struct vnode *vp)
 * Vnode reference counting functions (and macros for compatibility).
 */
 extern vnode_t  *vn_hold(struct vnode *);
-extern void     vn_rele(struct vnode *);
 #if defined(XFS_VNODE_TRACE)
 #define VN_HOLD(vp)             \
@@ -560,6 +530,12 @@ extern void	vn_rele(struct vnode *);
 #define VN_RELE(vp)             (iput(LINVFS_GET_IP(vp)))
 #endif
+static inline struct vnode *vn_grab(struct vnode *vp)
+{
+        struct inode *inode = igrab(LINVFS_GET_IP(vp));
+        return inode ? LINVFS_GET_VP(inode) : NULL;
+}
 /*
 * Vname handling macros.
 */
diff --git a/fs/xfs/quota/Makefile b/fs/xfs/quota/Makefile
new file mode 100644
index 000000000000..7a4f725b2824
--- /dev/null
+++ b/fs/xfs/quota/Makefile
@@ -0,0 +1 @@
+include $(TOPDIR)/fs/xfs/quota/Makefile-linux-$(VERSION).$(PATCHLEVEL)
diff --git a/fs/xfs/quota/Makefile-linux-2.6 b/fs/xfs/quota/Makefile-linux-2.6
new file mode 100644
index 000000000000..8b7b676718b9
--- /dev/null
+++ b/fs/xfs/quota/Makefile-linux-2.6
@@ -0,0 +1,53 @@
+#
+# Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of version 2 of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# Further, this software is distributed without any warranty that it is
+# free of the rightful claim of any third person regarding infringement
+# or the like.  Any license provided herein, whether implied or
+# otherwise, applies only to this software file.  Patent licenses, if
+# any, provided herein do not apply to combinations of this program with
+# other software, or any other product whatsoever.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write the Free Software Foundation, Inc., 59
+# Temple Place - Suite 330, Boston MA 02111-1307, USA.
+#
+# Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+# Mountain View, CA  94043, or:
+#
+# http://www.sgi.com
+#
+# For further information regarding this notice, see:
+#
+# http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+#
+EXTRA_CFLAGS += -I $(TOPDIR)/fs/xfs -I $(TOPDIR)/fs/xfs/linux-2.6
+ifeq ($(CONFIG_XFS_DEBUG),y)
+        EXTRA_CFLAGS += -g -DDEBUG
+        #EXTRA_CFLAGS += -DQUOTADEBUG
+endif
+ifeq ($(CONFIG_XFS_TRACE),y)
+        EXTRA_CFLAGS += -DXFS_DQUOT_TRACE
+        EXTRA_CFLAGS += -DXFS_VNODE_TRACE
+endif
+obj-$(CONFIG_XFS_QUOTA)         += xfs_quota.o
+xfs_quota-y                     += xfs_dquot.o \
+                                   xfs_dquot_item.o \
+                                   xfs_trans_dquot.o \
+                                   xfs_qm_syscalls.o \
+                                   xfs_qm_bhv.o \
+                                   xfs_qm.o
+xfs_quota-$(CONFIG_PROC_FS)     += xfs_qm_stats.o
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 46ce1e3ce1d6..e2e8d35fa4d0 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -421,7 +421,7 @@ xfs_qm_init_dquot_blk(
 */
 STATIC int
 xfs_qm_dqalloc(
-        xfs_trans_t     *tp,
+        xfs_trans_t     **tpp,
        xfs_mount_t     *mp,
        xfs_dquot_t     *dqp,
        xfs_inode_t     *quotip,
@@ -433,6 +433,7 @@ xfs_qm_dqalloc(
        xfs_bmbt_irec_t map;
        int             nmaps, error, committed;
        xfs_buf_t       *bp;
+        xfs_trans_t     *tp = *tpp;
        ASSERT(tp != NULL);
        xfs_dqtrace_entry(dqp, "DQALLOC");
@@ -492,10 +493,32 @@ xfs_qm_dqalloc(
        xfs_qm_init_dquot_blk(tp, mp, INT_GET(dqp->q_core.d_id, ARCH_CONVERT),
                              dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
-        if ((error = xfs_bmap_finish(&tp, &flist, firstblock, &committed))) {
+        /*
+         * xfs_bmap_finish() may commit the current transaction and
+         * start a second transaction if the freelist is not empty.
+         *
+         * Since we still want to modify this buffer, we need to
+         * ensure that the buffer is not released on commit of
+         * the first transaction and ensure the buffer is added to the
+         * second transaction.
+         *
+         * If there is only one transaction then don't stop the buffer
+         * from being released when it commits later on.
+         */
+        xfs_trans_bhold(tp, bp);
+        if ((error = xfs_bmap_finish(tpp, &flist, firstblock, &committed))) {
                goto error1;
        }
+        if (committed) {
+                tp = *tpp;
+                xfs_trans_bjoin(tp, bp);
+        } else {
+                xfs_trans_bhold_release(tp, bp);
+        }
        *O_bpp = bp;
        return 0;
@@ -514,7 +537,7 @@ xfs_qm_dqalloc(
 */
 STATIC int
 xfs_qm_dqtobp(
-        xfs_trans_t             *tp,
+        xfs_trans_t             **tpp,
        xfs_dquot_t             *dqp,
        xfs_disk_dquot_t        **O_ddpp,
        xfs_buf_t               **O_bpp,
@@ -528,6 +551,7 @@ xfs_qm_dqtobp(
        xfs_disk_dquot_t *ddq;
        xfs_dqid_t      id;
        boolean_t       newdquot;
+        xfs_trans_t     *tp = (tpp ? *tpp : NULL);
        mp = dqp->q_mount;
        id = INT_GET(dqp->q_core.d_id, ARCH_CONVERT);
@@ -579,9 +603,10 @@ xfs_qm_dqtobp(
                                return (ENOENT);
                        ASSERT(tp);
-                        if ((error = xfs_qm_dqalloc(tp, mp, dqp, quotip,
+                        if ((error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
                                                dqp->q_fileoffset, &bp)))
                                return (error);
+                        tp = *tpp;
                        newdquot = B_TRUE;
                } else {
                        /*
@@ -645,7 +670,7 @@ xfs_qm_dqtobp(
 /* ARGSUSED */
 STATIC int
 xfs_qm_dqread(
-        xfs_trans_t     *tp,
+        xfs_trans_t     **tpp,
        xfs_dqid_t      id,
        xfs_dquot_t     *dqp,   /* dquot to get filled in */
        uint            flags)
@@ -653,15 +678,19 @@ xfs_qm_dqread(
        xfs_disk_dquot_t *ddqp;
        xfs_buf_t        *bp;
        int              error;
+        xfs_trans_t      *tp;
+        ASSERT(tpp);
        /*
         * get a pointer to the on-disk dquot and the buffer containing it
         * dqp already knows its own type (GROUP/USER).
         */
        xfs_dqtrace_entry(dqp, "DQREAD");
-        if ((error = xfs_qm_dqtobp(tp, dqp, &ddqp, &bp, flags))) {
+        if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
                return (error);
        }
+        tp = *tpp;
        /* copy everything from disk dquot to the incore dquot */
        memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
@@ -740,7 +769,7 @@ xfs_qm_idtodq(
         * Read it from disk; xfs_dqread() takes care of
         * all the necessary initialization of dquot's fields (locks, etc)
         */
-        if ((error = xfs_qm_dqread(tp, id, dqp, flags))) {
+        if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
                /*
                 * This can happen if quotas got turned off (ESRCH),
                 * or if the dquot didn't exist on disk and we ask to
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 39175103c8e0..8ebc87176c78 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -113,20 +113,6 @@ typedef struct xfs_dquot {
 #define XFS_DQHOLD(dqp)         ((dqp)->q_nrefs++)
-/*
- * Quota Accounting/Enforcement flags
- */
-#define XFS_ALL_QUOTA_ACCT      \
-                (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
-#define XFS_ALL_QUOTA_ENFD      (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
-#define XFS_ALL_QUOTA_CHKD      (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
-#define XFS_IS_QUOTA_RUNNING(mp)        ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
-#define XFS_IS_QUOTA_ENFORCED(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
-#define XFS_IS_UQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_UQUOTA_ACCT)
-#define XFS_IS_PQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_PQUOTA_ACCT)
-#define XFS_IS_GQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_GQUOTA_ACCT)
 #ifdef DEBUG
 static inline int
 XFS_DQ_IS_LOCKED(xfs_dquot_t *dqp)
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index f5271b7b1e84..e74eaa7dd1bc 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -509,6 +509,7 @@ xfs_qm_qoff_logitem_format(xfs_qoff_logitem_t	*qf,
        log_vector->i_addr = (xfs_caddr_t)&(qf->qql_format);
        log_vector->i_len = sizeof(xfs_qoff_logitem_t);
+        XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_QUOTAOFF);
        qf->qql_format.qf_size = 1;
 }
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index f665ca8f9e96..efde16e0a913 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -365,16 +365,6 @@ xfs_qm_mount_quotas(
        int             error = 0;
        uint            sbf;
-        /*
-         * If a file system had quotas running earlier, but decided to
-         * mount without -o uquota/pquota/gquota options, revoke the
-         * quotachecked license, and bail out.
-         */
-        if (! XFS_IS_QUOTA_ON(mp) &&
-            (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT)) {
-                mp->m_qflags = 0;
-                goto write_changes;
-        }
        /*
         * If quotas on realtime volumes is not supported, we disable
@@ -388,11 +378,8 @@ xfs_qm_mount_quotas(
                goto write_changes;
        }
-#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY)
-        cmn_err(CE_NOTE, "Attempting to turn on disk quotas.");
-#endif
        ASSERT(XFS_IS_QUOTA_RUNNING(mp));
        /*
         * Allocate the quotainfo structure inside the mount struct, and
         * create quotainode(s), and change/rev superblock if necessary.
@@ -410,19 +397,14 @@ xfs_qm_mount_quotas(
         */
        if (XFS_QM_NEED_QUOTACHECK(mp) &&
                !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
-#ifdef DEBUG
-                cmn_err(CE_NOTE, "Doing a quotacheck. Please wait.");
-#endif
                if ((error = xfs_qm_quotacheck(mp))) {
                        /* Quotacheck has failed and quotas have
                         * been disabled.
                         */
                        return XFS_ERROR(error);
                }
-#ifdef DEBUG
-                cmn_err(CE_NOTE, "Done quotacheck.");
-#endif
        }
 write_changes:
        /*
         * We actually don't have to acquire the SB_LOCK at all.
@@ -2010,7 +1992,7 @@ xfs_qm_quotacheck(
                ASSERT(mp->m_quotainfo != NULL);
                ASSERT(xfs_Gqm != NULL);
                xfs_qm_destroy_quotainfo(mp);
-                xfs_mount_reset_sbqflags(mp);
+                (void)xfs_mount_reset_sbqflags(mp);
        } else {
                cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
        }
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index b03eecf3b6cb..0b00b3c67015 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -184,8 +184,6 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_HOLD(xqm)        ((xqm)->qm_nrefs++)
 #define XFS_QM_RELE(xqm)        ((xqm)->qm_nrefs--)
-extern void             xfs_mount_reset_sbqflags(xfs_mount_t *);
 extern void             xfs_qm_destroy_quotainfo(xfs_mount_t *);
 extern int              xfs_qm_mount_quotas(xfs_mount_t *, int);
 extern void             xfs_qm_mount_quotainit(xfs_mount_t *, uint);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index dc3c37a1e158..8890a18a99d8 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -229,48 +229,6 @@ xfs_qm_syncall(
        return error;
 }
-/*
- * Clear the quotaflags in memory and in the superblock.
- */
-void
-xfs_mount_reset_sbqflags(
-        xfs_mount_t             *mp)
-{
-        xfs_trans_t             *tp;
-        unsigned long           s;
-        mp->m_qflags = 0;
-        /*
-         * It is OK to look at sb_qflags here in mount path,
-         * without SB_LOCK.
-         */
-        if (mp->m_sb.sb_qflags == 0)
-                return;
-        s = XFS_SB_LOCK(mp);
-        mp->m_sb.sb_qflags = 0;
-        XFS_SB_UNLOCK(mp, s);
-        /*
-         * if the fs is readonly, let the incore superblock run
-         * with quotas off but don't flush the update out to disk
-         */
-        if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
-                return;
-#ifdef QUOTADEBUG
-        xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
-#endif
-        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-        if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
-                                      XFS_DEFAULT_LOG_COUNT)) {
-                xfs_trans_cancel(tp, 0);
-                xfs_fs_cmn_err(CE_ALERT, mp,
-                        "xfs_mount_reset_sbqflags: Superblock update failed!");
-                return;
-        }
-        xfs_mod_sb(tp, XFS_SB_QFLAGS);
-        xfs_trans_commit(tp, 0, NULL);
-}
 STATIC int
 xfs_qm_newmount(
        xfs_mount_t     *mp,
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 68e98962dbef..15e02e8a9d4f 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -1053,7 +1053,6 @@ xfs_qm_dqrele_all_inodes(
        struct xfs_mount *mp,
        uint             flags)
 {
-        vmap_t          vmap;
        xfs_inode_t     *ip, *topino;
        uint            ireclaims;
        vnode_t         *vp;
@@ -1061,8 +1060,8 @@ xfs_qm_dqrele_all_inodes(
        ASSERT(mp->m_quotainfo);
-again:
        XFS_MOUNT_ILOCK(mp);
+again:
        ip = mp->m_inodes;
        if (ip == NULL) {
                XFS_MOUNT_IUNLOCK(mp);
@@ -1090,18 +1089,14 @@ again:
                }
                vnode_refd = B_FALSE;
                if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
-                        /*
-                         * Sample vp mapping while holding the mplock, lest
-                         * we come across a non-existent vnode.
-                         */
-                        VMAP(vp, vmap);
                        ireclaims = mp->m_ireclaims;
                        topino = mp->m_inodes;
-                        XFS_MOUNT_IUNLOCK(mp);
+                        vp = vn_grab(vp);
+                        if (!vp)
+                                goto again;
+                        XFS_MOUNT_IUNLOCK(mp);
                        /* XXX restart limit ? */
-                        if ( ! (vp = vn_get(vp, &vmap)))
-                                goto again;
                        xfs_ilock(ip, XFS_ILOCK_EXCL);
                        vnode_refd = B_TRUE;
                } else {
@@ -1137,7 +1132,6 @@ again:
                 */
                if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
                        /* XXX use a sentinel */
-                        XFS_MOUNT_IUNLOCK(mp);
                        goto again;
                }
                ip = ip->i_mnext;
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index 4ed7b6928cd7..4e1a5ec22fa3 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -31,6 +31,7 @@
 */
 #include "debug.h"
+#include "spin.h"
 #include <asm/page.h>
 #include <linux/sched.h>
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 8d01dce8c532..92fd1d67f878 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -85,7 +85,7 @@ xfs_acl_vhasacl_default(
 {
        int             error;
-        if (vp->v_type != VDIR)
+        if (!VN_ISDIR(vp))
                return 0;
        xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error);
        return (error == 0);
@@ -389,7 +389,7 @@ xfs_acl_allow_set(
        if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
                return EPERM;
-        if (kind == _ACL_TYPE_DEFAULT && vp->v_type != VDIR)
+        if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp))
                return ENOTDIR;
        if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
                return EROFS;
@@ -750,7 +750,7 @@ xfs_acl_inherit(
         * If the new file is a directory, its default ACL is a copy of
         * the containing directory's default ACL.
         */
-        if (vp->v_type == VDIR)
+        if (VN_ISDIR(vp))
                xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
        if (!error && !basicperms)
                xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 6f5d283888aa..3e76def1283d 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4754,10 +4754,20 @@ xfs_bmapi(
                                        error = xfs_mod_incore_sb(mp,
                                                        XFS_SBS_FDBLOCKS,
                                                        -(alen), rsvd);
-                                if (!error)
+                                if (!error) {
                                        error = xfs_mod_incore_sb(mp,
                                                        XFS_SBS_FDBLOCKS,
                                                        -(indlen), rsvd);
+                                        if (error && rt) {
+                                                xfs_mod_incore_sb(ip->i_mount,
+                                                        XFS_SBS_FREXTENTS,
+                                                        extsz, rsvd);
+                                        } else if (error) {
+                                                xfs_mod_incore_sb(ip->i_mount,
+                                                        XFS_SBS_FDBLOCKS,
+                                                        alen, rsvd);
+                                        }
+                                }
                                if (error) {
                                        if (XFS_IS_QUOTA_ON(ip->i_mount))
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 30b8285ad476..a264657acfd9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -274,6 +274,7 @@ xfs_buf_item_format(
                       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
        vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
        vecp->i_len = base_size;
+        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
        vecp++;
        nvecs = 1;
@@ -320,12 +321,14 @@ xfs_buf_item_format(
                        buffer_offset = first_bit * XFS_BLI_CHUNK;
                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
                        vecp->i_len = nbits * XFS_BLI_CHUNK;
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
                        nvecs++;
                        break;
                } else if (next_bit != last_bit + 1) {
                        buffer_offset = first_bit * XFS_BLI_CHUNK;
                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
                        vecp->i_len = nbits * XFS_BLI_CHUNK;
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
                        nvecs++;
                        vecp++;
                        first_bit = next_bit;
@@ -337,6 +340,7 @@ xfs_buf_item_format(
                        buffer_offset = first_bit * XFS_BLI_CHUNK;
                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
                        vecp->i_len = nbits * XFS_BLI_CHUNK;
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
 /* You would think we need to bump the nvecs here too, but we do not
 * this number is used by recovery, and it gets confused by the boundary
 * split here
diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h
index 55c17adaaa37..19e872856f6b 100644
--- a/fs/xfs/xfs_dmapi.h
+++ b/fs/xfs/xfs_dmapi.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index db7cbd1bc857..cc7d1494a45d 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -107,6 +107,7 @@ xfs_efi_item_format(xfs_efi_log_item_t	*efip,
        log_vector->i_addr = (xfs_caddr_t)&(efip->efi_format);
        log_vector->i_len = size;
+        XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFI_FORMAT);
        ASSERT(size >= sizeof(xfs_efi_log_format_t));
 }
@@ -426,6 +427,7 @@ xfs_efd_item_format(xfs_efd_log_item_t	*efdp,
        log_vector->i_addr = (xfs_caddr_t)&(efdp->efd_format);
        log_vector->i_len = size;
+        XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_EFD_FORMAT);
        ASSERT(size >= sizeof(xfs_efd_log_format_t));
 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index d3da00045f26..0d9ae8fb4138 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -30,6 +30,8 @@
 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
 */
+#include <linux/delay.h>
 #include "xfs.h"
 #include "xfs_macros.h"
@@ -505,17 +507,15 @@ xfs_iget(
        vnode_t         *vp = NULL;
        int             error;
-retry:
        XFS_STATS_INC(xs_ig_attempts);
+retry:
        if ((inode = iget_locked(XFS_MTOVFS(mp)->vfs_super, ino))) {
                bhv_desc_t      *bdp;
                xfs_inode_t     *ip;
-                int             newnode;
                vp = LINVFS_GET_VP(inode);
                if (inode->i_state & I_NEW) {
-inode_allocate:
                        vn_initialize(inode);
                        error = xfs_iget_core(vp, mp, tp, ino, flags,
                                        lock_flags, ipp, bno);
@@ -526,32 +526,25 @@ inode_allocate:
                                iput(inode);
                        }
                } else {
-                        /* These are true if the inode is in inactive or
+                        /*
-                         * reclaim. The linux inode is about to go away,
+                         * If the inode is not fully constructed due to
-                         * wait for that path to finish, and try again.
+                         * filehandle mismatches, wait for the inode to go
+                         * away and try again.
+                         *
+                         * iget_locked will call __wait_on_freeing_inode
+                         * to wait for the inode to go away.
                         */
-                        if (vp->v_flag & (VINACT | VRECLM)) {
+                        if (is_bad_inode(inode) ||
-                                vn_wait(vp);
+                            ((bdp = vn_bhv_lookup(VN_BHV_HEAD(vp),
+                                                  &xfs_vnodeops)) == NULL)) {
                                iput(inode);
+                                delay(1);
                                goto retry;
                        }
-                        if (is_bad_inode(inode)) {
-                                iput(inode);
-                                return EIO;
-                        }
-                        bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
-                        if (bdp == NULL) {
-                                XFS_STATS_INC(xs_ig_dup);
-                                goto inode_allocate;
-                        }
                        ip = XFS_BHVTOI(bdp);
                        if (lock_flags != 0)
                                xfs_ilock(ip, lock_flags);
-                        newnode = (ip->i_d.di_mode == 0);
-                        if (newnode)
-                                xfs_iocore_inode_reinit(ip);
                        XFS_STATS_INC(xs_ig_found);
                        *ipp = ip;
                        error = 0;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 34bdf5909687..db43308aae93 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1128,7 +1128,6 @@ xfs_ialloc(
        ASSERT(ip != NULL);
        vp = XFS_ITOV(ip);
-        vp->v_type = IFTOVT(mode);
        ip->i_d.di_mode = (__uint16_t)mode;
        ip->i_d.di_onlink = 0;
        ip->i_d.di_nlink = nlink;
@@ -1250,7 +1249,7 @@ xfs_ialloc(
         */
        xfs_trans_log_inode(tp, ip, flags);
-        /* now that we have a v_type we can set Linux inode ops (& unlock) */
+        /* now that we have an i_mode  we can set Linux inode ops (& unlock) */
        VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
        *ipp = ip;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0eed30f5cb19..276ec70eb7f9 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -248,6 +248,7 @@ xfs_inode_item_format(
        vecp->i_addr = (xfs_caddr_t)&iip->ili_format;
        vecp->i_len  = sizeof(xfs_inode_log_format_t);
+        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IFORMAT);
        vecp++;
        nvecs        = 1;
@@ -292,6 +293,7 @@ xfs_inode_item_format(
        vecp->i_addr = (xfs_caddr_t)&ip->i_d;
        vecp->i_len  = sizeof(xfs_dinode_core_t);
+        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
        vecp++;
        nvecs++;
        iip->ili_format.ilf_fields |= XFS_ILOG_CORE;
@@ -349,6 +351,7 @@ xfs_inode_item_format(
                                vecp->i_addr =
                                        (char *)(ip->i_df.if_u1.if_extents);
                                vecp->i_len = ip->i_df.if_bytes;
+                                XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
                        } else
 #endif
                        {
@@ -367,6 +370,7 @@ xfs_inode_item_format(
                                vecp->i_addr = (xfs_caddr_t)ext_buffer;
                                vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
                                                XFS_DATA_FORK);
+                                XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IEXT);
                        }
                        ASSERT(vecp->i_len <= ip->i_df.if_bytes);
                        iip->ili_format.ilf_dsize = vecp->i_len;
@@ -384,6 +388,7 @@ xfs_inode_item_format(
                        ASSERT(ip->i_df.if_broot != NULL);
                        vecp->i_addr = (xfs_caddr_t)ip->i_df.if_broot;
                        vecp->i_len = ip->i_df.if_broot_bytes;
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IBROOT);
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_dsize = ip->i_df.if_broot_bytes;
@@ -409,6 +414,7 @@ xfs_inode_item_format(
                        ASSERT((ip->i_df.if_real_bytes == 0) ||
                               (ip->i_df.if_real_bytes == data_bytes));
                        vecp->i_len = (int)data_bytes;
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ILOCAL);
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_dsize = (unsigned)data_bytes;
@@ -486,6 +492,7 @@ xfs_inode_item_format(
                        vecp->i_len = xfs_iextents_copy(ip, ext_buffer,
                                        XFS_ATTR_FORK);
 #endif
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_EXT);
                        iip->ili_format.ilf_asize = vecp->i_len;
                        vecp++;
                        nvecs++;
@@ -500,6 +507,7 @@ xfs_inode_item_format(
                        ASSERT(ip->i_afp->if_broot != NULL);
                        vecp->i_addr = (xfs_caddr_t)ip->i_afp->if_broot;
                        vecp->i_len = ip->i_afp->if_broot_bytes;
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_BROOT);
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_asize = ip->i_afp->if_broot_bytes;
@@ -523,6 +531,7 @@ xfs_inode_item_format(
                        ASSERT((ip->i_afp->if_real_bytes == 0) ||
                               (ip->i_afp->if_real_bytes == data_bytes));
                        vecp->i_len = (int)data_bytes;
+                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_IATTR_LOCAL);
                        vecp++;
                        nvecs++;
                        iip->ili_format.ilf_asize = (unsigned)data_bytes;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 2edd6769e5d3..d0f5be63cddb 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -226,13 +226,12 @@ xfs_iomap(
                xfs_iomap_enter_trace(XFS_IOMAP_READ_ENTER, io, offset, count);
                lockmode = XFS_LCK_MAP_SHARED(mp, io);
                bmapi_flags = XFS_BMAPI_ENTIRE;
-                if (flags & BMAPI_IGNSTATE)
-                        bmapi_flags |= XFS_BMAPI_IGSTATE;
                break;
        case BMAPI_WRITE:
                xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, io, offset, count);
                lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR;
-                bmapi_flags = 0;
+                if (flags & BMAPI_IGNSTATE)
+                        bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE;
                XFS_ILOCK(mp, io, lockmode);
                break;
        case BMAPI_ALLOCATE:
@@ -391,9 +390,9 @@ xfs_iomap_write_direct(
        xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS], *imapp;
        xfs_bmap_free_t free_list;
        int             aeof;
-        xfs_filblks_t   datablocks, qblocks, resblks;
+        xfs_filblks_t   qblocks, resblks;
        int             committed;
-        int             numrtextents;
+        int             resrtextents;
        /*
         * Make sure that the dquots are there. This doesn't hold
@@ -434,14 +433,14 @@ xfs_iomap_write_direct(
                if (!(extsz = ip->i_d.di_extsize))
                        extsz = mp->m_sb.sb_rextsize;
-                numrtextents = qblocks = (count_fsb + extsz - 1);
+                resrtextents = qblocks = (count_fsb + extsz - 1);
-                do_div(numrtextents, mp->m_sb.sb_rextsize);
+                do_div(resrtextents, mp->m_sb.sb_rextsize);
+                resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
                quota_flag = XFS_QMOPT_RES_RTBLKS;
-                datablocks = 0;
        } else {
-                datablocks = qblocks = count_fsb;
+                resrtextents = 0;
+                resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, count_fsb);
                quota_flag = XFS_QMOPT_RES_REGBLKS;
-                numrtextents = 0;
        }
        /*
@@ -449,9 +448,8 @@ xfs_iomap_write_direct(
         */
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
        tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
-        resblks = XFS_DIOSTRAT_SPACE_RES(mp, datablocks);
        error = xfs_trans_reserve(tp, resblks,
-                        XFS_WRITE_LOG_RES(mp), numrtextents,
+                        XFS_WRITE_LOG_RES(mp), resrtextents,
                        XFS_TRANS_PERM_LOG_RES,
                        XFS_WRITE_LOG_COUNT);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1cd2ac163877..54a6f1142403 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -159,11 +159,15 @@ xfs_buftarg_t *xlog_target;
 void
 xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
 {
-        if (! log->l_grant_trace) {
+        unsigned long cnts;
-                log->l_grant_trace = ktrace_alloc(1024, KM_NOSLEEP);
-                if (! log->l_grant_trace)
+        if (!log->l_grant_trace) {
+                log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP);
+                if (!log->l_grant_trace)
                        return;
        }
+        /* ticket counts are 1 byte each */
+        cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8;
        ktrace_enter(log->l_grant_trace,
                     (void *)tic,
@@ -178,10 +182,10 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string)
                     (void *)((unsigned long)CYCLE_LSN(log->l_tail_lsn)),
                     (void *)((unsigned long)BLOCK_LSN(log->l_tail_lsn)),
                     (void *)string,
-                     (void *)((unsigned long)13),
+                     (void *)((unsigned long)tic->t_trans_type),
-                     (void *)((unsigned long)14),
+                     (void *)cnts,
-                     (void *)((unsigned long)15),
+                     (void *)((unsigned long)tic->t_curr_res),
-                     (void *)((unsigned long)16));
+                     (void *)((unsigned long)tic->t_unit_res));
 }
 void
@@ -274,9 +278,11 @@ xfs_log_done(xfs_mount_t	*mp,
                 * Release ticket if not permanent reservation or a specifc
                 * request has been made to release a permanent reservation.
                 */
+                xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
                xlog_ungrant_log_space(log, ticket);
                xlog_state_put_ticket(log, ticket);
        } else {
+                xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
                xlog_regrant_reserve_log_space(log, ticket);
        }
@@ -399,7 +405,8 @@ xfs_log_reserve(xfs_mount_t	 *mp,
                int              cnt,
                xfs_log_ticket_t *ticket,
                __uint8_t        client,
-                uint             flags)
+                uint             flags,
+                uint             t_type)
 {
        xlog_t          *log = mp->m_log;
        xlog_ticket_t   *internal_ticket;
@@ -421,13 +428,19 @@ xfs_log_reserve(xfs_mount_t	 *mp,
        if (*ticket != NULL) {
                ASSERT(flags & XFS_LOG_PERM_RESERV);
                internal_ticket = (xlog_ticket_t *)*ticket;
+                xlog_trace_loggrant(log, internal_ticket, "xfs_log_reserve: existing ticket (permanent trans)");
                xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
                retval = xlog_regrant_write_log_space(log, internal_ticket);
        } else {
                /* may sleep if need to allocate more tickets */
                internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
                                                  client, flags);
+                internal_ticket->t_trans_type = t_type;
                *ticket = internal_ticket;
+                xlog_trace_loggrant(log, internal_ticket, 
+                        (internal_ticket->t_flags & XLOG_TIC_PERM_RESERV) ?
+                        "xfs_log_reserve: create new ticket (permanent trans)" :
+                        "xfs_log_reserve: create new ticket");
                xlog_grant_push_ail(mp,
                                    (internal_ticket->t_unit_res *
                                     internal_ticket->t_cnt));
@@ -601,8 +614,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
        if (! (XLOG_FORCED_SHUTDOWN(log))) {
                reg[0].i_addr = (void*)&magic;
                reg[0].i_len  = sizeof(magic);
+                XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_UNMOUNT);
-                error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0);
+                error = xfs_log_reserve(mp, 600, 1, &tic, XFS_LOG, 0, 0);
                if (!error) {
                        /* remove inited flag */
                        ((xlog_ticket_t *)tic)->t_flags = 0;
@@ -1272,6 +1286,7 @@ xlog_commit_record(xfs_mount_t  *mp,
        reg[0].i_addr = NULL;
        reg[0].i_len = 0;
+        XLOG_VEC_SET_TYPE(&reg[0], XLOG_REG_TYPE_COMMIT);
        ASSERT_ALWAYS(iclog);
        if ((error = xlog_write(mp, reg, 1, ticket, commitlsnp,
@@ -1605,6 +1620,117 @@ xlog_state_finish_copy(xlog_t		*log,
 /*
+ * print out info relating to regions written which consume
+ * the reservation
+ */
+#if defined(XFS_LOG_RES_DEBUG)
+STATIC void
+xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
+{
+        uint i;
+        uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
+        /* match with XLOG_REG_TYPE_* in xfs_log.h */
+        static char *res_type_str[XLOG_REG_TYPE_MAX] = {
+            "bformat",
+            "bchunk",
+            "efi_format",
+            "efd_format",
+            "iformat",
+            "icore",
+            "iext",
+            "ibroot",
+            "ilocal",
+            "iattr_ext",
+            "iattr_broot",
+            "iattr_local",
+            "qformat",
+            "dquot",
+            "quotaoff",
+            "LR header",
+            "unmount",
+            "commit",
+            "trans header"
+        };
+        static char *trans_type_str[XFS_TRANS_TYPE_MAX] = {
+            "SETATTR_NOT_SIZE",
+            "SETATTR_SIZE",
+            "INACTIVE",
+            "CREATE",
+            "CREATE_TRUNC",
+            "TRUNCATE_FILE",
+            "REMOVE",
+            "LINK",
+            "RENAME",
+            "MKDIR",
+            "RMDIR",
+            "SYMLINK",
+            "SET_DMATTRS",
+            "GROWFS",
+            "STRAT_WRITE",
+            "DIOSTRAT",
+            "WRITE_SYNC",
+            "WRITEID",
+            "ADDAFORK",
+            "ATTRINVAL",
+            "ATRUNCATE",
+            "ATTR_SET",
+            "ATTR_RM",
+            "ATTR_FLAG",
+            "CLEAR_AGI_BUCKET",
+            "QM_SBCHANGE",
+            "DUMMY1",
+            "DUMMY2",
+            "QM_QUOTAOFF",
+            "QM_DQALLOC",
+            "QM_SETQLIM",
+            "QM_DQCLUSTER",
+            "QM_QINOCREATE",
+            "QM_QUOTAOFF_END",
+            "SB_UNIT",
+            "FSYNC_TS",
+            "GROWFSRT_ALLOC",
+            "GROWFSRT_ZERO",
+            "GROWFSRT_FREE",
+            "SWAPEXT"
+        };
+        xfs_fs_cmn_err(CE_WARN, mp,
+                        "xfs_log_write: reservation summary:\n"
+                        "  trans type  = %s (%u)\n"
+                        "  unit res    = %d bytes\n"
+                        "  current res = %d bytes\n"
+                        "  total reg   = %u bytes (o/flow = %u bytes)\n"
+                        "  ophdrs      = %u (ophdr space = %u bytes)\n"
+                        "  ophdr + reg = %u bytes\n"
+                        "  num regions = %u\n",
+                        ((ticket->t_trans_type <= 0 ||
+                          ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
+                          "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
+                        ticket->t_trans_type,
+                        ticket->t_unit_res,
+                        ticket->t_curr_res,
+                        ticket->t_res_arr_sum, ticket->t_res_o_flow,
+                        ticket->t_res_num_ophdrs, ophdr_spc,
+                        ticket->t_res_arr_sum + 
+                          ticket->t_res_o_flow + ophdr_spc,
+                        ticket->t_res_num);
+        for (i = 0; i < ticket->t_res_num; i++) {
+                uint r_type = ticket->t_res_arr[i].r_type; 
+                cmn_err(CE_WARN,
+                            "region[%u]: %s - %u bytes\n",
+                            i, 
+                            ((r_type <= 0 || r_type > XLOG_REG_TYPE_MAX) ?
+                            "bad-rtype" : res_type_str[r_type-1]),
+                            ticket->t_res_arr[i].r_len);
+        }
+}
+#else
+#define xlog_print_tic_res(mp, ticket)
+#endif
+/*
 * Write some region out to in-core log
 *
 * This will be called when writing externally provided regions or when
@@ -1677,16 +1803,21 @@ xlog_write(xfs_mount_t *	mp,
     * xlog_op_header_t and may need to be double word aligned.
     */
    len = 0;
-    if (ticket->t_flags & XLOG_TIC_INITED)     /* acct for start rec of xact */
+    if (ticket->t_flags & XLOG_TIC_INITED) {    /* acct for start rec of xact */
        len += sizeof(xlog_op_header_t);
+        XLOG_TIC_ADD_OPHDR(ticket);
+    }
    for (index = 0; index < nentries; index++) {
        len += sizeof(xlog_op_header_t);            /* each region gets >= 1 */
+        XLOG_TIC_ADD_OPHDR(ticket);
        len += reg[index].i_len;
+        XLOG_TIC_ADD_REGION(ticket, reg[index].i_len, reg[index].i_type);
    }
    contwr = *start_lsn = 0;
    if (ticket->t_curr_res < len) {
+        xlog_print_tic_res(mp, ticket);
 #ifdef DEBUG
        xlog_panic(
                "xfs_log_write: reservation ran out. Need to up reservation");
@@ -1790,6 +1921,7 @@ xlog_write(xfs_mount_t *	mp,
                len += sizeof(xlog_op_header_t); /* from splitting of region */
                /* account for new log op header */
                ticket->t_curr_res -= sizeof(xlog_op_header_t);
+                XLOG_TIC_ADD_OPHDR(ticket);
            }
            xlog_verify_dest_ptr(log, ptr);
@@ -2282,6 +2414,9 @@ restart:
         */
        if (log_offset == 0) {
                ticket->t_curr_res -= log->l_iclog_hsize;
+                XLOG_TIC_ADD_REGION(ticket,
+                                    log->l_iclog_hsize,
+                                    XLOG_REG_TYPE_LRHEADER);
                INT_SET(head->h_cycle, ARCH_CONVERT, log->l_curr_cycle);
                ASSIGN_LSN(head->h_lsn, log);
                ASSERT(log->l_curr_block >= 0);
@@ -2468,6 +2603,7 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 #endif
        tic->t_curr_res = tic->t_unit_res;
+        XLOG_TIC_RESET_RES(tic);
        if (tic->t_cnt > 0)
                return (0);
@@ -2608,6 +2744,7 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
        XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'w');
        XLOG_GRANT_SUB_SPACE(log, ticket->t_curr_res, 'r');
        ticket->t_curr_res = ticket->t_unit_res;
+        XLOG_TIC_RESET_RES(ticket);
        xlog_trace_loggrant(log, ticket,
                            "xlog_regrant_reserve_log_space: sub current res");
        xlog_verify_grant_head(log, 1);
@@ -2624,6 +2761,7 @@ xlog_regrant_reserve_log_space(xlog_t	     *log,
        xlog_verify_grant_head(log, 0);
        GRANT_UNLOCK(log, s);
        ticket->t_curr_res = ticket->t_unit_res;
+        XLOG_TIC_RESET_RES(ticket);
 }       /* xlog_regrant_reserve_log_space */
@@ -3179,29 +3317,57 @@ xlog_ticket_get(xlog_t		*log,
         * and their unit amount is the total amount of space required.
         *
         * The following lines of code account for non-transaction data
-         * which occupy space in the on-disk log. 
+         * which occupy space in the on-disk log.
+         *
+         * Normal form of a transaction is:
+         * <oph><trans-hdr><start-oph><reg1-oph><reg1><reg2-oph>...<commit-oph>
+         * and then there are LR hdrs, split-recs and roundoff at end of syncs.
+         *
+         * We need to account for all the leadup data and trailer data
+         * around the transaction data.
+         * And then we need to account for the worst case in terms of using
+         * more space.
+         * The worst case will happen if:
+         * - the placement of the transaction happens to be such that the
+         *   roundoff is at its maximum
+         * - the transaction data is synced before the commit record is synced
+         *   i.e. <transaction-data><roundoff> | <commit-rec><roundoff>
+         *   Therefore the commit record is in its own Log Record.
+         *   This can happen as the commit record is called with its
+         *   own region to xlog_write().
+         *   This then means that in the worst case, roundoff can happen for
+         *   the commit-rec as well.
+         *   The commit-rec is smaller than padding in this scenario and so it is
+         *   not added separately.
         */
+        /* for trans header */
+        unit_bytes += sizeof(xlog_op_header_t);
+        unit_bytes += sizeof(xfs_trans_header_t);
        /* for start-rec */
-        unit_bytes += sizeof(xlog_op_header_t); 
+        unit_bytes += sizeof(xlog_op_header_t);
+        /* for LR headers */
+        num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
+        unit_bytes += log->l_iclog_hsize * num_headers;
+        /* for commit-rec LR header - note: padding will subsume the ophdr */
+        unit_bytes += log->l_iclog_hsize;
+        /* for split-recs - ophdrs added when data split over LRs */
+        unit_bytes += sizeof(xlog_op_header_t) * num_headers;
-        /* for padding */
+        /* for roundoff padding for transaction data and one for commit record */
        if (XFS_SB_VERSION_HASLOGV2(&log->l_mp->m_sb) &&
-                log->l_mp->m_sb.sb_logsunit > 1) {
+            log->l_mp->m_sb.sb_logsunit > 1) {
                /* log su roundoff */
-                unit_bytes += log->l_mp->m_sb.sb_logsunit;  
+                unit_bytes += 2*log->l_mp->m_sb.sb_logsunit;
        } else {
                /* BB roundoff */
-                unit_bytes += BBSIZE;
+                unit_bytes += 2*BBSIZE;
        }
-        /* for commit-rec */
-        unit_bytes += sizeof(xlog_op_header_t);
- 
-        /* for LR headers */
-        num_headers = ((unit_bytes + log->l_iclog_size-1) >> log->l_iclog_size_log);
-        unit_bytes += log->l_iclog_hsize * num_headers;
        tic->t_unit_res         = unit_bytes;
        tic->t_curr_res         = unit_bytes;
        tic->t_cnt              = cnt;
@@ -3209,10 +3375,13 @@ xlog_ticket_get(xlog_t		*log,
        tic->t_tid              = (xlog_tid_t)((__psint_t)tic & 0xffffffff);
        tic->t_clientid         = client;
        tic->t_flags            = XLOG_TIC_INITED;
+        tic->t_trans_type       = 0;
        if (xflags & XFS_LOG_PERM_RESERV)
                tic->t_flags |= XLOG_TIC_PERM_RESERV;
        sv_init(&(tic->t_sema), SV_DEFAULT, "logtick");
+        XLOG_TIC_RESET_RES(tic);
        return tic;
 }       /* xlog_ticket_get */
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 0db122ddda3f..18961119fc65 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -114,9 +114,44 @@ xfs_lsn_t	_lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2)
 #define XFS_VOLUME              0x2
 #define XFS_LOG                 0xaa
+/* Region types for iovec's i_type */
+#if defined(XFS_LOG_RES_DEBUG)
+#define XLOG_REG_TYPE_BFORMAT           1
+#define XLOG_REG_TYPE_BCHUNK            2
+#define XLOG_REG_TYPE_EFI_FORMAT        3
+#define XLOG_REG_TYPE_EFD_FORMAT        4
+#define XLOG_REG_TYPE_IFORMAT           5
+#define XLOG_REG_TYPE_ICORE             6
+#define XLOG_REG_TYPE_IEXT              7
+#define XLOG_REG_TYPE_IBROOT            8
+#define XLOG_REG_TYPE_ILOCAL            9
+#define XLOG_REG_TYPE_IATTR_EXT         10
+#define XLOG_REG_TYPE_IATTR_BROOT       11
+#define XLOG_REG_TYPE_IATTR_LOCAL       12
+#define XLOG_REG_TYPE_QFORMAT           13
+#define XLOG_REG_TYPE_DQUOT             14
+#define XLOG_REG_TYPE_QUOTAOFF          15
+#define XLOG_REG_TYPE_LRHEADER          16
+#define XLOG_REG_TYPE_UNMOUNT           17
+#define XLOG_REG_TYPE_COMMIT            18
+#define XLOG_REG_TYPE_TRANSHDR          19
+#define XLOG_REG_TYPE_MAX               19
+#endif
+#if defined(XFS_LOG_RES_DEBUG)
+#define XLOG_VEC_SET_TYPE(vecp, t) ((vecp)->i_type = (t))
+#else
+#define XLOG_VEC_SET_TYPE(vecp, t)
+#endif
 typedef struct xfs_log_iovec {
        xfs_caddr_t             i_addr;         /* beginning address of region */
        int             i_len;          /* length in bytes of region */
+#if defined(XFS_LOG_RES_DEBUG)
+        uint            i_type;         /* type of region */
+#endif
 } xfs_log_iovec_t;
 typedef void* xfs_log_ticket_t;
@@ -159,7 +194,8 @@ int	  xfs_log_reserve(struct xfs_mount *mp,
                          int              count,
                          xfs_log_ticket_t *ticket,
                          __uint8_t        clientid,
-                          uint             flags);
+                          uint             flags,
+                          uint             t_type);
 int       xfs_log_write(struct xfs_mount *mp,
                        xfs_log_iovec_t  region[],
                        int              nentries,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1a1d452f15f9..eb7fdc6ebc32 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -335,18 +335,66 @@ typedef __uint32_t xlog_tid_t;
 #define XLOG_COVER_OPS          5
+/*
+ * Ticket reservation region accounting (XFS_LOG_RES_DEBUG builds only;
+ * otherwise the three XLOG_TIC_* macros below expand to nothing).
+ *
+ * A ticket records the length and type of up to XLOG_TIC_LEN_MAX regions
+ * in t_res_arr[] and keeps the sum of those lengths in t_res_arr_sum.
+ * When the array is full, the running sum is folded into t_res_o_flow and
+ * recording restarts at slot 0.  t_res_num_ophdrs counts op headers.
+ */
+#if defined(XFS_LOG_RES_DEBUG)
+#define XLOG_TIC_LEN_MAX        15
+/*
+ * Zero every accounting field.  t_res_o_flow is cleared as well: it is
+ * only ever added to by XLOG_TIC_ADD_REGION, so leaving it out would let
+ * a stale overflow total survive a reset.
+ */
+#define XLOG_TIC_RESET_RES(t) ((t)->t_res_num = \
+                                (t)->t_res_arr_sum = \
+                                (t)->t_res_o_flow = \
+                                (t)->t_res_num_ophdrs = 0)
+/* Count one more op header against the ticket. */
+#define XLOG_TIC_ADD_OPHDR(t) ((t)->t_res_num_ophdrs++)
+/*
+ * Record one region of the given length and type.  Note that (t) and
+ * (len) are evaluated more than once, so pass side-effect-free arguments.
+ */
+#define XLOG_TIC_ADD_REGION(t, len, type)                               \
+        do {                                                            \
+                if ((t)->t_res_num == XLOG_TIC_LEN_MAX) {               \
+                        /* add to overflow and start again */           \
+                        (t)->t_res_o_flow += (t)->t_res_arr_sum;        \
+                        (t)->t_res_num = 0;                             \
+                        (t)->t_res_arr_sum = 0;                         \
+                }                                                       \
+                (t)->t_res_arr[(t)->t_res_num].r_len = (len);           \
+                (t)->t_res_arr[(t)->t_res_num].r_type = (type);         \
+                (t)->t_res_arr_sum += (len);                            \
+                (t)->t_res_num++;                                       \
+        } while (0)
+/*
+ * Reservation region
+ * As would be stored in xfs_log_iovec but without the i_addr which
+ * we don't care about.
+ */
+typedef struct xlog_res {
+        uint    r_len;          /* length of the region */
+        uint    r_type;         /* type code passed to XLOG_TIC_ADD_REGION */
+} xlog_res_t;
+#else
+#define XLOG_TIC_RESET_RES(t)
+#define XLOG_TIC_ADD_OPHDR(t)
+#define XLOG_TIC_ADD_REGION(t, len, type)
+#endif
 typedef struct xlog_ticket {
-        sv_t               t_sema;       /* sleep on this semaphore      :20 */
+        sv_t               t_sema;       /* sleep on this semaphore      : 20 */
-        struct xlog_ticket *t_next;      /*                              : 4 */
+        struct xlog_ticket *t_next;      /*                              :4|8 */
-        struct xlog_ticket *t_prev;      /*                              : 4 */
+        struct xlog_ticket *t_prev;      /*                              :4|8 */
-        xlog_tid_t         t_tid;        /* transaction identifier       : 4 */
+        xlog_tid_t         t_tid;        /* transaction identifier       : 4  */
-        int                t_curr_res;   /* current reservation in bytes : 4 */
+        int                t_curr_res;   /* current reservation in bytes : 4  */
-        int                t_unit_res;   /* unit reservation in bytes    : 4 */
+        int                t_unit_res;   /* unit reservation in bytes    : 4  */
-        __uint8_t          t_ocnt;       /* original count               : 1 */
+        char               t_ocnt;       /* original count               : 1  */
-        __uint8_t          t_cnt;        /* current count                : 1 */
+        char               t_cnt;        /* current count                : 1  */
-        __uint8_t          t_clientid;   /* who does this belong to;     : 1 */
+        char               t_clientid;   /* who does this belong to;     : 1
+                                          * NOTE(review): plain char has
+                                          * implementation-defined signedness,
+                                          * while xfs_log_reserve() takes a
+                                          * __uint8_t clientid and XFS_LOG is
+                                          * 0xaa -- confirm nothing compares
+                                          * this field against such values. */
-        __uint8_t          t_flags;      /* properties of reservation    : 1 */
+        char               t_flags;      /* properties of reservation    : 1  */
+        uint               t_trans_type; /* transaction type             : 4  */
+#if defined (XFS_LOG_RES_DEBUG)
+        /*
+         * reservation array fields; these are maintained by the
+         * XLOG_TIC_RESET_RES/ADD_OPHDR/ADD_REGION macros
+         */
+        uint               t_res_num;                    /* num in array : 4 */
+        xlog_res_t         t_res_arr[XLOG_TIC_LEN_MAX];  /* array of res : X */
+        uint               t_res_num_ophdrs;             /* num op hdrs  : 4 */
+        uint               t_res_arr_sum;                /* array sum    : 4 */
+        uint               t_res_o_flow;                 /* sum overflow : 4 */
+#endif
 } xlog_ticket_t;
 #endif
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0aac28ddb81c..14faabaabf29 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1387,7 +1387,7 @@ xlog_recover_add_to_cont_trans(
        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
        old_len = item->ri_buf[item->ri_cnt-1].i_len;
-        ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0);
+        ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
        memcpy(&ptr[old_len], dp, len); /* d, s, l */
        item->ri_buf[item->ri_cnt-1].i_len += len;
        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index 4f40c92863d5..a6cd6324e946 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -42,7 +42,8 @@
 #include "xfs_dir2.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_error.h"
 STATIC struct xfs_dquot *
 xfs_dqvopchown_default(
@@ -54,8 +55,79 @@ xfs_dqvopchown_default(
        return NULL;
 }
+/*
+ * Turn quotas off for this mount: zero the in-core quota flags and the
+ * superblock's sb_qflags, then, unless the filesystem is read-only, run
+ * an XFS_TRANS_QM_SBCHANGE transaction that logs the XFS_SB_QFLAGS change.
+ *
+ * Returns 0 when there was nothing to clear or the mount is read-only,
+ * otherwise the result of reserving/committing the transaction.
+ */
+int
+xfs_mount_reset_sbqflags(xfs_mount_t *mp)
+{
+        xfs_trans_t             *tp;
+        unsigned long           s;
+        int                     error;
+        mp->m_qflags = 0;
+        /*
+         * We are in the mount path, so sb_qflags may be inspected
+         * without taking SB_LOCK.
+         */
+        if (!mp->m_sb.sb_qflags)
+                return 0;
+        s = XFS_SB_LOCK(mp);
+        mp->m_sb.sb_qflags = 0;
+        XFS_SB_UNLOCK(mp, s);
+        /*
+         * On a read-only filesystem the incore superblock simply runs
+         * with quotas off; the update is not flushed out to disk.
+         */
+        if (XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY)
+                return 0;
+#ifdef QUOTADEBUG
+        xfs_fs_cmn_err(CE_NOTE, mp, "Writing superblock quota changes");
+#endif
+        tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+        error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+                                  XFS_DEFAULT_LOG_COUNT);
+        if (error) {
+                xfs_trans_cancel(tp, 0);
+                xfs_fs_cmn_err(CE_ALERT, mp,
+                        "xfs_mount_reset_sbqflags: Superblock update failed!");
+                return error;
+        }
+        xfs_mod_sb(tp, XFS_SB_QFLAGS);
+        return xfs_trans_commit(tp, 0, NULL);
+}
+/*
+ * qminit implementation installed in xfs_qmcore_stub.  Reports no quota
+ * flags and no quota mount needed; if the superblock still carries quota
+ * accounting flags they are cleared via xfs_mount_reset_sbqflags().
+ * Returns 0, or the error from that reset.
+ */
+STATIC int
+xfs_noquota_init(
+        xfs_mount_t     *mp,
+        uint            *needquotamount,
+        uint            *quotaflags)
+{
+        *quotaflags = 0;
+        *needquotamount = B_FALSE;
+        ASSERT(!XFS_IS_QUOTA_ON(mp));
+        /*
+         * Nothing more to do unless the file system had quotas running
+         * earlier and is now being mounted without the
+         * -o uquota/pquota/gquota options; in that case revoke the
+         * quotachecked license.
+         */
+        if (!(mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT))
+                return 0;
+        cmn_err(CE_NOTE,
+                "XFS resetting qflags for filesystem %s",
+                mp->m_fsname);
+        return xfs_mount_reset_sbqflags(mp);
+}
 xfs_qmops_t     xfs_qmcore_stub = {
-        .xfs_qminit             = (xfs_qminit_t) fs_noerr,
+        .xfs_qminit             = (xfs_qminit_t) xfs_noquota_init,
        .xfs_qmdone             = (xfs_qmdone_t) fs_noerr,
        .xfs_qmmount            = (xfs_qmmount_t) fs_noerr,
        .xfs_qmunmount          = (xfs_qmunmount_t) fs_noerr,
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 7134576ae7fa..32cb79752d5d 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.  All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of version 2 of the GNU General Public License as
@@ -160,6 +160,20 @@ typedef struct xfs_qoff_logformat {
 #define XFS_GQUOTA_ACCT 0x0040  /* group quota accounting ON */
 /*
+ * Quota Accounting/Enforcement flags
+ *
+ * XFS_ALL_QUOTA_{ACCT,ENFD,CHKD} OR together the individual flag bits.
+ * The XFS_IS_*_RUNNING / XFS_IS_QUOTA_ENFORCED tests mask (mp)->m_qflags
+ * and evaluate to the matching bits (non-zero when any is set), not to a
+ * normalised 0/1.
+ */
+#define XFS_ALL_QUOTA_ACCT      \
+                (XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT | XFS_PQUOTA_ACCT)
+#define XFS_ALL_QUOTA_ENFD      (XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD)
+#define XFS_ALL_QUOTA_CHKD      (XFS_UQUOTA_CHKD | XFS_OQUOTA_CHKD)
+#define XFS_IS_QUOTA_RUNNING(mp)        ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT)
+#define XFS_IS_QUOTA_ENFORCED(mp)       ((mp)->m_qflags & XFS_ALL_QUOTA_ENFD)
+#define XFS_IS_UQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_UQUOTA_ACCT)
+#define XFS_IS_PQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_PQUOTA_ACCT)
+#define XFS_IS_GQUOTA_RUNNING(mp)       ((mp)->m_qflags & XFS_GQUOTA_ACCT)
+/*
 * Incore only flags for quotaoff - these bits get cleared when quota(s)
 * are in the process of getting turned off. These flags are in m_qflags but
 * never in sb_qflags.
@@ -362,6 +376,7 @@ typedef struct xfs_dqtrxops {
                                f | XFS_QMOPT_RES_REGBLKS)
 extern int xfs_qm_dqcheck(xfs_disk_dquot_t *, xfs_dqid_t, uint, uint, char *);
+extern int xfs_mount_reset_sbqflags(struct xfs_mount *);
 extern struct bhv_vfsops xfs_qmops;
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 06dfca531f79..92efe272b83d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -276,7 +276,7 @@ xfs_trans_reserve(
                error = xfs_log_reserve(tp->t_mountp, logspace, logcount,
                                        &tp->t_ticket,
-                                        XFS_TRANSACTION, log_flags);
+                                        XFS_TRANSACTION, log_flags, tp->t_type);
                if (error) {
                        goto undo_blocks;
                }
@@ -1032,6 +1032,7 @@ xfs_trans_fill_vecs(
        tp->t_header.th_num_items = nitems;
        log_vector->i_addr = (xfs_caddr_t)&tp->t_header;
        log_vector->i_len = sizeof(xfs_trans_header_t);
+        XLOG_VEC_SET_TYPE(log_vector, XLOG_REG_TYPE_TRANSHDR);
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ec541d66fa2a..a263aec8b3a6 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -112,6 +112,7 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFSRT_ZERO         38
 #define XFS_TRANS_GROWFSRT_FREE         39
 #define XFS_TRANS_SWAPEXT               40
+#define XFS_TRANS_TYPE_MAX              40
 /* new transaction types need to be reflected in xfs_logprint(8) */
@@ -998,6 +999,7 @@ struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
 void            xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
 void            xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
 void            xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
+void            xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
 void            xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
 void            xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
 void            xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 7bc5eab4c2c1..2a71b4f91bfa 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -379,8 +379,8 @@ xfs_trans_delete_ail(
                else {
                        xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
                                "xfs_trans_delete_ail: attempting to delete a log item that is not in the AIL");
-                        xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
                        AIL_UNLOCK(mp, s);
+                        xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
                }
        }
 }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 144da7a85466..e733293dd7f4 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -714,6 +714,29 @@ xfs_trans_bhold(xfs_trans_t	*tp,
 }
 /*
+ * Cancel the previous buffer hold request made on this buffer
+ * for this transaction.
+ *
+ * Clears XFS_BLI_HOLD in the buffer's log item.  The ASSERTs spell out
+ * the expected state on entry: bp is busy, its FSPRIVATE2 pointer is tp,
+ * it has a log item that is neither stale nor cancelled and has a
+ * positive reference count, and XFS_BLI_HOLD is currently set.
+ */
+void
+xfs_trans_bhold_release(xfs_trans_t     *tp,
+                        xfs_buf_t       *bp)
+{
+        xfs_buf_log_item_t      *bip;
+        ASSERT(XFS_BUF_ISBUSY(bp));
+        ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
+        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
+        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
+        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+        ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+        ASSERT(atomic_read(&bip->bli_refcount) > 0);
+        ASSERT(bip->bli_flags & XFS_BLI_HOLD);
+        bip->bli_flags &= ~XFS_BLI_HOLD;
+        xfs_buf_item_trace("BHOLD RELEASE", bip);
+}
+/*
 * This is called to mark bytes first through last inclusive of the given
 * buffer as needing to be logged when the transaction is committed.
 * The buffer must already be associated with the given transaction.
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 42bcc0215203..f1a904e23ade 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -795,7 +795,6 @@ xfs_statvfs(
        xfs_mount_t     *mp;
        xfs_sb_t        *sbp;
        unsigned long   s;
-        u64 id;
        mp = XFS_BHVTOM(bdp);
        sbp = &(mp->m_sb);
@@ -823,9 +822,7 @@ xfs_statvfs(
        statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
        XFS_SB_UNLOCK(mp, s);
-        id = huge_encode_dev(mp->m_dev);
+        xfs_statvfs_fsid(statp, mp);
-        statp->f_fsid.val[0] = (u32)id;
-        statp->f_fsid.val[1] = (u32)(id >> 32);
        statp->f_namelen = MAXNAMELEN - 1;
        return 0;
@@ -906,7 +903,6 @@ xfs_sync_inodes(
        xfs_inode_t     *ip_next;
        xfs_buf_t       *bp;
        vnode_t         *vp = NULL;
-        vmap_t          vmap;
        int             error;
        int             last_error;
        uint64_t        fflag;
@@ -1101,48 +1097,21 @@ xfs_sync_inodes(
                 * lock in xfs_ireclaim() after the inode is pulled from
                 * the mount list will sleep until we release it here.
                 * This keeps the vnode from being freed while we reference
-                 * it.  It is also cheaper and simpler than actually doing
+                 * it.
-                 * a vn_get() for every inode we touch here.
                 */
                if (xfs_ilock_nowait(ip, lock_flags) == 0) {
                        if ((flags & SYNC_BDFLUSH) || (vp == NULL)) {
                                ip = ip->i_mnext;
                                continue;
                        }
-                        /*
+                        vp = vn_grab(vp);
-                         * We need to unlock the inode list lock in order
-                         * to lock the inode. Insert a marker record into
-                         * the inode list to remember our position, dropping
-                         * the lock is now done inside the IPOINTER_INSERT
-                         * macro.
-                         *
-                         * We also use the inode list lock to protect us
-                         * in taking a snapshot of the vnode version number
-                         * for use in calling vn_get().
-                         */
-                        VMAP(vp, vmap);
-                        IPOINTER_INSERT(ip, mp);
-                        vp = vn_get(vp, &vmap);
                        if (vp == NULL) {
-                                /*
+                                ip = ip->i_mnext;
-                                 * The vnode was reclaimed once we let go
-                                 * of the inode list lock.  Skip to the
-                                 * next list entry. Remove the marker.
-                                 */
-                                XFS_MOUNT_ILOCK(mp);
-                                mount_locked = B_TRUE;
-                                vnode_refed  = B_FALSE;
-                                IPOINTER_REMOVE(ip, mp);
                                continue;
                        }
+                        IPOINTER_INSERT(ip, mp);
                        xfs_ilock(ip, lock_flags);
                        ASSERT(vp == XFS_ITOV(ip));
@@ -1533,7 +1502,10 @@ xfs_syncsub(
         * eventually kicked out of the cache.
         */
        if (flags & SYNC_REFCACHE) {
-                xfs_refcache_purge_some(mp);
+                if (flags & SYNC_WAIT)
+                        xfs_refcache_purge_mp(mp);
+                else
+                        xfs_refcache_purge_some(mp);
        }
        /*
@@ -1649,6 +1621,10 @@ xfs_vget(
 #define MNTOPT_SWIDTH   "swidth"        /* data volume stripe width */
 #define MNTOPT_NOUUID   "nouuid"        /* ignore filesystem UUID */
 #define MNTOPT_MTPT     "mtpt"          /* filesystem mount point */
+#define MNTOPT_GRPID    "grpid"         /* group-ID from parent directory */
+#define MNTOPT_NOGRPID  "nogrpid"       /* group-ID from current process */
+#define MNTOPT_BSDGROUPS    "bsdgroups"    /* group-ID from parent directory */
+#define MNTOPT_SYSVGROUPS   "sysvgroups"   /* group-ID from current process */
 #define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
 #define MNTOPT_IHASHSIZE    "ihashsize"    /* size of inode hash table */
 #define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
@@ -1769,6 +1745,12 @@ xfs_parseargs(
                        }
                        args->flags |= XFSMNT_IHASHSIZE;
                        args->ihashsize = simple_strtoul(value, &eov, 10);
+                } else if (!strcmp(this_char, MNTOPT_GRPID) ||
+                           !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+                        vfsp->vfs_flag |= VFS_GRPID;
+                } else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
+                           !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+                        vfsp->vfs_flag &= ~VFS_GRPID;
                } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
                        args->flags |= XFSMNT_WSYNC;
                } else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
@@ -1890,6 +1872,7 @@ xfs_showargs(
        };
        struct proc_xfs_info    *xfs_infop;
        struct xfs_mount        *mp = XFS_BHVTOM(bhv);
+        struct vfs              *vfsp = XFS_MTOVFS(mp);
        for (xfs_infop = xfs_info; xfs_infop->flag; xfs_infop++) {
                if (mp->m_flags & xfs_infop->flag)
@@ -1926,7 +1909,10 @@ xfs_showargs(
        if (!(mp->m_flags & XFS_MOUNT_32BITINOOPT))
                seq_printf(m, "," MNTOPT_64BITINODE);
-        
+        if (vfsp->vfs_flag & VFS_GRPID)
+                seq_printf(m, "," MNTOPT_GRPID);
        return 0;
 }
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 1377c868f3f4..58bfe629b933 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -104,7 +104,7 @@ xfs_open(
         * If it's a directory with any blocks, read-ahead block 0
         * as we're almost certain to have the next operation be a read there.
         */
-        if (vp->v_type == VDIR && ip->i_d.di_nextents > 0) {
+        if (VN_ISDIR(vp) && ip->i_d.di_nextents > 0) {
                mode = xfs_ilock_map_shared(ip);
                if (ip->i_d.di_nextents > 0)
                        (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
@@ -163,18 +163,21 @@ xfs_getattr(
        /*
         * Copy from in-core inode.
         */
-        vap->va_type = vp->v_type;
+        vap->va_mode = ip->i_d.di_mode;
-        vap->va_mode = ip->i_d.di_mode & MODEMASK;
        vap->va_uid = ip->i_d.di_uid;
        vap->va_gid = ip->i_d.di_gid;
        vap->va_projid = ip->i_d.di_projid;
        /*
         * Check vnode type block/char vs. everything else.
-         * Do it with bitmask because that's faster than looking
-         * for multiple values individually.
         */
-        if (((1 << vp->v_type) & ((1<<VBLK) | (1<<VCHR))) == 0) {
+        switch (ip->i_d.di_mode & S_IFMT) {
+        case S_IFBLK:
+        case S_IFCHR:
+                vap->va_rdev = ip->i_df.if_u2.if_rdev;
+                vap->va_blocksize = BLKDEV_IOSIZE;
+                break;
+        default:
                vap->va_rdev = 0;
                if (!(ip->i_d.di_flags & XFS_DIFLAG_REALTIME)) {
@@ -224,9 +227,7 @@ xfs_getattr(
                                (ip->i_d.di_extsize << mp->m_sb.sb_blocklog) :
                                (mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog);
                }
-        } else {
+                break;
-                vap->va_rdev = ip->i_df.if_u2.if_rdev;
-                vap->va_blocksize = BLKDEV_IOSIZE;
        }
        vap->va_atime.tv_sec = ip->i_d.di_atime.t_sec;
@@ -468,7 +469,7 @@ xfs_setattr(
                                m |= S_ISGID;
 #if 0
                        /* Linux allows this, Irix doesn't. */
-                        if ((vap->va_mode & S_ISVTX) && vp->v_type != VDIR)
+                        if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp))
                                m |= S_ISVTX;
 #endif
                        if (m && !capable(CAP_FSETID))
@@ -546,10 +547,10 @@ xfs_setattr(
                        goto error_return;
                }
-                if (vp->v_type == VDIR) {
+                if (VN_ISDIR(vp)) {
                        code = XFS_ERROR(EISDIR);
                        goto error_return;
-                } else if (vp->v_type != VREG) {
+                } else if (!VN_ISREG(vp)) {
                        code = XFS_ERROR(EINVAL);
                        goto error_return;
                }
@@ -1567,7 +1568,7 @@ xfs_release(
        vp = BHV_TO_VNODE(bdp);
        ip = XFS_BHVTOI(bdp);
-        if ((vp->v_type != VREG) || (ip->i_d.di_mode == 0)) {
+        if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) {
                return 0;
        }
@@ -1895,7 +1896,7 @@ xfs_create(
        dp = XFS_BHVTOI(dir_bdp);
        mp = dp->i_mount;
-        dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+        dm_di_mode = vap->va_mode;
        namelen = VNAMELEN(dentry);
        if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
@@ -1973,8 +1974,7 @@ xfs_create(
            (error = XFS_DIR_CANENTER(mp, tp, dp, name, namelen)))
                goto error_return;
        rdev = (vap->va_mask & XFS_AT_RDEV) ? vap->va_rdev : 0;
-        error = xfs_dir_ialloc(&tp, dp,
+        error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 1,
-                        MAKEIMODE(vap->va_type,vap->va_mode), 1,
                        rdev, credp, prid, resblks > 0,
                        &ip, &committed);
        if (error) {
@@ -2620,7 +2620,7 @@ xfs_link(
        vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address);
        target_namelen = VNAMELEN(dentry);
-        if (src_vp->v_type == VDIR)
+        if (VN_ISDIR(src_vp))
                return XFS_ERROR(EPERM);
        src_bdp = vn_bhv_lookup_unlocked(VN_BHV_HEAD(src_vp), &xfs_vnodeops);
@@ -2805,7 +2805,7 @@ xfs_mkdir(
        tp = NULL;
        dp_joined_to_trans = B_FALSE;
-        dm_di_mode = vap->va_mode|VTTOIF(vap->va_type);
+        dm_di_mode = vap->va_mode;
        if (DM_EVENT_ENABLED(dir_vp->v_vfsp, dp, DM_EVENT_CREATE)) {
                error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
@@ -2879,8 +2879,7 @@ xfs_mkdir(
        /*
         * create the directory inode.
         */
-        error = xfs_dir_ialloc(&tp, dp,
+        error = xfs_dir_ialloc(&tp, dp, vap->va_mode, 2,
-                        MAKEIMODE(vap->va_type,vap->va_mode), 2,
                        0, credp, prid, resblks > 0,
                &cdp, NULL);
        if (error) {
@@ -3650,7 +3649,7 @@ xfs_rwlock(
        vnode_t         *vp;
        vp = BHV_TO_VNODE(bdp);
-        if (vp->v_type == VDIR)
+        if (VN_ISDIR(vp))
                return 1;
        ip = XFS_BHVTOI(bdp);
        if (locktype == VRWLOCK_WRITE) {
@@ -3681,7 +3680,7 @@ xfs_rwunlock(
        vnode_t         *vp;
        vp = BHV_TO_VNODE(bdp);
-        if (vp->v_type == VDIR)
+        if (VN_ISDIR(vp))
                return;
        ip = XFS_BHVTOI(bdp);
        if (locktype == VRWLOCK_WRITE) {
@@ -3847,51 +3846,10 @@ xfs_reclaim(
                return 0;
        }
-        if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
+        vn_iowait(vp);
-                if (ip->i_d.di_size > 0) {
-                        /*
-                         * Flush and invalidate any data left around that is
-                         * a part of this file.
-                         *
-                         * Get the inode's i/o lock so that buffers are pushed
-                         * out while holding the proper lock.  We can't hold
-                         * the inode lock here since flushing out buffers may
-                         * cause us to try to get the lock in xfs_strategy().
-                         *
-                         * We don't have to call remapf() here, because there
-                         * cannot be any mapped file references to this vnode
-                         * since it is being reclaimed.
-                         */
-                        xfs_ilock(ip, XFS_IOLOCK_EXCL);
-                        /*
-                         * If we hit an IO error, we need to make sure that the
-                         * buffer and page caches of file data for
-                         * the file are tossed away. We don't want to use
-                         * VOP_FLUSHINVAL_PAGES here because we don't want dirty
-                         * pages to stay attached to the vnode, but be
-                         * marked P_BAD. pdflush/vnode_pagebad
-                         * hates that.
-                         */
-                        if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                                VOP_FLUSHINVAL_PAGES(vp, 0, -1, FI_NONE);
-                        } else {
-                                VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
-                        }
-                        ASSERT(VN_CACHED(vp) == 0);
+        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-                        ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) ||
+        ASSERT(VN_CACHED(vp) == 0);
-                               ip->i_delayed_blks == 0);
-                        xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-                } else if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                        /*
-                         * di_size field may not be quite accurate if we're
-                         * shutting down.
-                         */
-                        VOP_TOSS_PAGES(vp, 0, -1, FI_NONE);
-                        ASSERT(VN_CACHED(vp) == 0);
-                }
-        }
        /* If we have nothing to flush with this inode then complete the
         * teardown now, otherwise break the link between the xfs inode
@@ -4567,7 +4525,7 @@ xfs_change_file_space(
        /*
         * must be a regular file and have write permission
         */
-        if (vp->v_type != VREG)
+        if (!VN_ISREG(vp))
                return XFS_ERROR(EINVAL);
        xfs_ilock(ip, XFS_ILOCK_SHARED);
author	Jeff Garzik <jgarzik@pobox.com>	2005-09-08 05:39:55 -0400
committer	Jeff Garzik <jgarzik@pobox.com>	2005-09-08 05:39:55 -0400
commit	c324b44c34050cf2a9b58830e11c974806bd85d8 (patch)
tree	3ac45a783221283925cd698334a8f5e7dd4c1df8 /fs
parent	2fcf522509cceea524b6e7ece8fd6759b682175a (diff)
parent	caf39e87cc1182f7dae84eefc43ca14d54c78ef9 (diff)